Question

Can you create an example of a decision tree model with Python that features data processing,...

Can you create an example of a decision tree model with Python that features data processing, data mining, and result playing? You have to implement it from scratch, and no library databases can be used.
0 0
Add a comment Improve this question Transcribed image text
Answer #1

This is an implementation of a decision tree that uses only modules from the Python standard library.

from random import seed
from random import randrange
from csv import reader

#load a csv
def load_csv(fName):
    """Read a CSV file and return its rows as a list of lists of strings.

    Args:
        fName: Path of the CSV file to read.

    Returns:
        A list with one entry per non-blank line of the file; each entry is
        the list of string fields parsed by ``csv.reader``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # csv.reader needs a text-mode file on Python 3; newline="" is the mode
    # recommended by the csv module so quoted line breaks are parsed correctly.
    # The ``with`` block guarantees the handle is closed.
    with open(fName, "r", newline="") as handle:
        # Blank lines parse to empty lists, which would break later per-column
        # indexing, so they are dropped here.
        return [row for row in reader(handle) if row]

# Convert string column to float

def str_column_to_float(dataSet, column):
    """Convert one column of every row from string to float, in place.

    Args:
        dataSet: List of rows (mutable sequences) whose ``column`` entry is a
            string.
        column: Index of the field to convert.

    Surrounding whitespace is stripped before conversion. Nothing is
    returned; the rows themselves are modified.
    """
    for record in dataSet:
        raw_text = record[column]
        record[column] = float(raw_text.strip())

# Split a dataset into k folds

def cross_validation_split(dataset, folds):
    """Randomly partition the rows of ``dataset`` into ``folds`` equal folds.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        folds: Number of folds to build.

    Returns:
        A list of ``folds`` lists, each holding ``int(len(dataset) / folds)``
        rows drawn without replacement. Rows left over when the size is not
        evenly divisible are not placed in any fold.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / folds)
    partitions = []
    for _ in range(folds):
        # Draw rows one at a time from the shrinking pool.
        chosen = [
            remaining.pop(randrange(len(remaining)))
            for _ in range(rows_per_fold)
        ]
        partitions.append(chosen)
    return partitions

# Calculate accuracy percentage

def accuracy_metric(actual, predicted):
    """Return the percentage of positions where the two sequences agree.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed in parallel with
            ``actual``.

    Returns:
        A float between 0.0 and 100.0.
    """
    hits = sum(
        1 for pos in range(len(actual)) if actual[pos] == predicted[pos]
    )
    return hits / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split

def evaluate_algorithm(dataset, algorithm, fold_cnt, *args):
    """Score ``algorithm`` with k-fold cross validation.

    Args:
        dataset: Rows whose last element is the class label.
        algorithm: Callable invoked as ``algorithm(train, test, *args)`` that
            returns one prediction per test row.
        fold_cnt: Number of cross-validation folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    partitions = cross_validation_split(dataset, fold_cnt)
    results = []
    for held_out in partitions:
        # Training data is every fold except the one being held out.
        others = list(partitions)
        others.remove(held_out)
        training_rows = sum(others, [])
        # The algorithm gets copies of the held-out rows with the label
        # blanked, so it cannot read the answer.
        masked_rows = []
        for record in held_out:
            masked = list(record)
            masked[-1] = None
            masked_rows.append(masked)
        guesses = algorithm(training_rows, masked_rows, *args)
        truth = [record[-1] for record in held_out]
        results.append(accuracy_metric(truth, guesses))
    return results

# Split a dataset based on an attribute and an attribute value

def test_split(index, value, dataSet):
   left, right = list(), list()
   for row in dataSet:
       if row[index] < value:
           left.append(row)
       else:
           right.append(row)
   return left, right

# Calculate the Gini index for a split dataset

def gini_index(groups, classes):
    """Compute the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Iterable of row lists (the partitions produced by a split).
            The last element of each row is its class label.
        classes: Iterable of the class labels to score.

    Returns:
        The sum over groups of ``(1 - sum(p_c ** 2)) * (group size / total
        rows)``, where ``p_c`` is the fraction of the group's rows labelled
        ``c``. Empty groups contribute nothing; 0.0 means every non-empty
        group is pure.
    """
    # Total number of rows across all groups at this split point.
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # Skip empty groups to avoid dividing by zero.
        if size == 0:
            continue
        # The labels do not depend on the class being scored, so extract them
        # once per group instead of rebuilding the list for every class.
        labels = [row[-1] for row in group]
        score = 0.0
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # Weight the group's impurity by its share of the rows.
        gini += (1.0 - score) * (size / n_instances)
    return gini

# Select the best split point for a dataset

def get_split(dataSet):
    """Find the attribute/threshold pair with the lowest Gini index.

    Every value of every attribute column (all columns except the last,
    which holds the class label) is tried as a threshold.

    Args:
        dataSet: Non-empty list of rows.

    Returns:
        A dict with keys ``'index'`` (attribute position), ``'value'``
        (threshold) and ``'groups'`` (the ``(left, right)`` partition for
        that split). The first candidate reaching the lowest score wins.
    """
    labels = list(set(row[-1] for row in dataSet))
    # Placeholder result, returned only if no candidate scores below 999.
    best = {'index': 999, 'value': 999, 'groups': None}
    lowest = 999
    feature_count = len(dataSet[0]) - 1
    for feature in range(feature_count):
        for candidate in dataSet:
            threshold = candidate[feature]
            parts = test_split(feature, threshold, dataSet)
            impurity = gini_index(parts, labels)
            if impurity < lowest:
                lowest = impurity
                best = {'index': feature, 'value': threshold, 'groups': parts}
    return best

# Create a terminal node value

def to_terminal(grp):
    """Return the most frequent class label among the rows of ``grp``.

    Args:
        grp: Non-empty list of rows whose last element is the class label.

    Returns:
        The label that occurs most often in the group.
    """
    labels = [record[-1] for record in grp]
    distinct = set(labels)
    return max(distinct, key=labels.count)

# Create child splits for a node or make terminal

def split(node, max_depth, min_size, depth):
    """Recursively grow the subtree below ``node``, in place.

    Args:
        node: Dict produced by ``get_split``; its ``'groups'`` entry is
            consumed and replaced by ``'left'`` and ``'right'`` children.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A child with this many rows or fewer becomes a leaf.
        depth: Depth of ``node`` (the root is built with depth 1).

    Each child is either a class label (leaf) or another node dict.
    """
    left, right = node['groups']
    del node['groups']
    # One side is empty: no real split happened, so both children share
    # the majority label of all rows.
    if not (left and right):
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # Depth limit reached: both children become leaves.
    if depth >= max_depth:
        left_leaf, right_leaf = to_terminal(left), to_terminal(right)
        node['left'], node['right'] = left_leaf, right_leaf
        return
    # Otherwise handle the left child, then the right child.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
            continue
        node[side] = get_split(rows)
        split(node[side], max_depth, min_size, depth + 1)

# Build a decision tree

def build_tree(train, max_depth, min_size):
    """Build a decision tree from training rows and return its root node.

    Args:
        train: Training rows whose last element is the class label.
        max_depth: Maximum tree depth passed to ``split``.
        min_size: Minimum group size passed to ``split``.

    Returns:
        The root node dict, fully expanded.
    """
    tree = get_split(train)
    # Growth starts at depth 1 and expands the root in place.
    split(tree, max_depth, min_size, 1)
    return tree

# Make a prediction with a decision tree

def predict(node, row):
    """Walk the tree from ``node`` and return the predicted label for ``row``.

    Args:
        node: Node dict with ``'index'``, ``'value'``, ``'left'`` and
            ``'right'`` keys.
        row: Sequence of attribute values.

    Returns:
        The first non-dict child reached. At each node the walk goes left
        when ``row[index] < value`` and right otherwise.
    """
    current = node
    while True:
        goes_left = row[current['index']] < current['value']
        child = current['left'] if goes_left else current['right']
        if not isinstance(child, dict):
            return child
        current = child

# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
    """Train a tree on ``train`` and return predictions for ``test``.

    Args:
        train: Training rows whose last element is the class label.
        test: Rows to classify.
        max_depth: Maximum tree depth.
        min_size: Minimum group size below which a node becomes a leaf.

    Returns:
        A list with one predicted label per row of ``test``, in order.
    """
    model = build_tree(train, max_depth, min_size)
    return [predict(model, sample) for sample in test]

# Test
# Seed the random module so the random fold assignment is repeatable.
seed(1)
# load and prepare data
# NOTE: placeholder path; point this at the CSV file to evaluate.
filename = 'your_file_name.csv'
dataset = load_csv(filename)
# convert every column, including the last (class label) column, from string to float
for i in range(len(dataset[0])):
   str_column_to_float(dataset, i)
# evaluate algorithm
n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(dataset, decision_tree, n_folds, max_depth, min_size)
# scores holds one accuracy percentage per fold; the mean is printed last
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Add a comment
Know the answer?
Add Answer to:
Can you create an example of a decision tree model with Python that features data processing,...
Your Answer:

Post as a guest

Your Name:

What's your source?

Earn Coins

Coins can be redeemed for fabulous gifts.

Not the answer you're looking for? Ask your own homework help question. Our experts will answer your question WITHIN MINUTES for Free.
Similar Homework Help Questions
  • Question: By providing an example, discuss the roles of decision tree in Big Data Analytics. Requirements:...

    Question: By providing an example, discuss the roles of decision tree in Big Data Analytics. Requirements: Define Decision Tree. In which scenario can Decision Tree be used in Data Analysis. Provide examples.

  • Can you give me a poster for Science Writing TOPIC: DECISION TREE Decision Tree Algorithm Pseudocode:-...

    Can you give me a poster for Science Writing TOPIC: DECISION TREE Decision Tree Algorithm Pseudocode:- 1) Place the best attribute of the dataset at the root node of the tree. 2) Split the training set into subsets. Subsets should be made in such a way that each subset contains data with the same value for an attribute. 3) Repeat steps 1 and 2 on each subset until you find leaf nodes in all the branches of the tree. Two...

  • in python 11.1 Binary Search Tree In this assignment, you will implement a Binary Search Tree...

    in python 11.1 Binary Search Tree In this assignment, you will implement a Binary Search Tree You will also need to implement a Node class. This class will not be tested, but is needed to implement the BST. Your BST must implement the following methods. You are free to implement additional helper methods. It is recommended you create your own helper methods Constructor: Creates an Empty Tree String Method: Returns the string "Empty Tree" for an empty tree. Otherwise, returns...

  • Hi team, can you create a python programs for 1. Create DataFrame shape of 300x5 from...

    Hi team, can you create a python programs for 1. Create DataFrame shape of 300x5 from scratch a. Column_1: Generate random 300 floating numbers between 1 and 0 b. Column_2: Generate random 300 integers between 10 and 1000 c. Column_3: multiply column_2 with column_1 d. Column_4: Generate random 300 Ordinal categorical variable with three unique values e. Column_5: Generate random 300 Nominal categorical variable with two unique values 2. Get head, tail of the created data 3. Show mean, max,...

  • For this assignment you will build a decision tree (sometimes referred to as a flowchart) that...

    For this assignment you will build a decision tree (sometimes referred to as a flowchart) that would help to determine whether a nerve fibers were part of the sympathetic or parasympathetic division of the ANS. At the end of this assignment you should have produced a decision tree that you or one of your peers could use to properly distinguish between the two divisions of the ANS. You can use morphological or functional characteristics to narrow down identification, however, characteristics...

  • ***Please no handwriting** a. Is the transportation model an example of decision making under certainty or...

    ***Please no handwriting** a. Is the transportation model an example of decision making under certainty or decision making under uncertainty? Why? a. What is a balanced transportation model? Describe the approach you would use to solve an unbalanced model? 2.What is the minimal-spanning tree model? Give several examples of problems that can be solved using this type of model. 3.What is the maximal-flow model? What types of problems can be solved using this type of model? 4.Describe a problem that...

  • 1. Can a Decision Tree be used to perform clustering? If so, explain how. If not, provide a counterexample showing how it is not suitable for the function. python- database

    1. Can a Decision Tree be used to perform clustering? If so, explain how. If not, provide a counterexample showing how it is not suitable for the function. python- database

  • How is decision tree analysis used in business analytics? Give an example of how it is...

    How is decision tree analysis used in business analytics? Give an example of how it is used to solve a business problem. Mention the references.link from where you got the information

  • Topic: The Need for Data Mining Overview: Over the last decade, advances in processing power and...

    Topic: The Need for Data Mining Overview: Over the last decade, advances in processing power and speed have enabled us to move beyond manual, tedious, and time consuming data mining practices to quick, easy, and automated collection for data analysis. The more complex the data sets, the more potential there is to uncover relevant insights. Retailers, banks, manufacturers, telecommunications providers, and insurers are using data mining to discover relationships among everything from price optimization, promotions, and demographics to how the...

  • I need help defining a class in C++ that acts like a python dictionary. I need...

    I need help defining a class in C++ that acts like a python dictionary. I need to be able to add keys-values to an already existing dictionary. For example id= {'sam':75, 'robert':09907, 'timmmy',95453, 'samuel',5333)}. i want to create a class in which i can do id.add("samuel', 8439922) and it adds it to the dictionary. i want to implement this with using vectors to store the pairs. I need to do this without using any features like maps, every thing has...

ADVERTISEMENT
Free Homework Help App
Download From Google Play
Scan Your Homework
to Get Instant Free Answers
Need Online Homework Help?
Ask a Question
Get Answers For Free
Most questions answered within 3 hours.
ADVERTISEMENT
ADVERTISEMENT