I've been working on random forest algorithm for classification with roulette selection to find best splits. I came up with this homebrew based on this article https://machinelearningmastery.com/implement-random-forest-scratch-python/.
It works well enough, but I have a major problem with execution time. It's reasonably fast for 1-6 trees, but it completely chokes on 7 and more. I'm looking for ways to make it more efficient, so it doesn't take a month to finish basic tests. I'm a beginner, and I don't know much about optimizing code, so I'd appreciate any suggestions on how I could improve. Here's the link to Github repo: https://github.com/pdabrow11/Random-Forest-Roulette. It contains all .py files and datasets used for classification. Here's the raw code:
- data_load.py
import csv
def read_file(file):
with open(file, 'r') as myfile:
data = myfile.readlines()
return data
def load_data(data, var):
dataset = list()
for row in data:
row = row.rstrip("\n")
if var == 1:
dataset.append(row.split(","))
if var == 2:
dataset.append(row.split(" "))
if var == 3:
dataset.append(row.split(";"))
return dataset
def process_dat_data(data):
dataset = list()
for row in data:
row_temp = list()
for i in range(1, len(row)):
res = row[i].find(':')
row_temp.append(row[i][res+1:])
row_temp.append(row[0])
dataset.append(row_temp)
return dataset
def str2int(dataset):
for i in range(0, len(dataset[0])):
class_values = [row[i] for row in dataset]
unique = set(class_values)
lookup = dict()
for j, value in enumerate(unique):
lookup[value] = float(j)
for row in dataset:
row[i] = lookup[row[i]]
def str2float(dataset):
for row in dataset:
for i in range(0, len(row)):
row[i] = float(row[i])
return dataset
def write_data(data):
with open("tests_res.csv", 'a') as file:
f = csv.writer(file, delimiter =',', lineterminator='\n')
f.writerows(data)
- random_forest.py
from math import sqrt
from random import seed
from random import randrange
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
import numpy as np
import numpy.random as npr
def cross_validation_split(dataset, folds_num):
dataset_split = list()
dataset_copy = list(dataset)
fold_size = int(len(dataset) / folds_num)
for i in range(folds_num):
fold = list()
while len(fold) < fold_size:
index = randrange(len(dataset_copy))
fold.append(dataset_copy.pop(index))
dataset_split.append(fold)
return dataset_split
def calc_accuracy(actual, predicted):
correct = 0
for i in range(len(actual)):
if actual[i] == predicted[i]:
correct += 1
return correct / float(len(actual)) * 100.0
def evaluate_algorithm(dataset, folds_num, trees_num, *args):
attributes_num = int(sqrt(len(dataset[0])-1))
folds = cross_validation_split(dataset, folds_num)
scores = list()
scores_sklearn = list()
res = RandomForestClassifier(n_estimators=trees_num)
for fold in folds:
train_set = list(folds)
train_set.remove(fold)
train_set = sum(train_set, [])
test_set = list()
y_test_set = list()
X, y = list(), list()
for row in fold:
#row_copy = list(row)
#row_copy[-1] = None
test_set.append(row[:-1])
y_test_set.append(row[-1])
predicted = random_forest(train_set, test_set, trees_num, attributes_num, *args)
actual = [row[-1] for row in fold]
accuracy = calc_accuracy(actual, predicted)
scores.append(accuracy)
# Sklearn
for row in train_set:
X.append(row[:-1])
y.append(row[-1])
#res = clone(res)
res.fit(X, y)
accuracy_sklearn = res.score(test_set, y_test_set)*100
scores_sklearn.append(accuracy_sklearn)
return scores, scores_sklearn
def split_attribute(dataset, index, value):
left, right = list(), list()
for row in dataset:
if row[index] < value:
left.append(row)
else:
right.append(row)
return left, right
def subset(dataset, ratio):
sample = list()
sample_num = round(len(dataset) * ratio)
while len(sample) < sample_num:
index = randrange(len(dataset))
sample.append(dataset[index])
return sample
def gini_index(groups, classes):
examples_num = int(sum([len(group) for group in groups]))
gini = 0.0
for group in groups:
size = int(len(group))
if size == 0:
continue
P, E = 0.0, 1.0
for single_class in classes:
P = [row[-1] for row in group].count(single_class) / size
E -= P ** 2
gini += (size / examples_num) * E
return gini
def roulette_split(dataset, attributes_num, threshold):
classes = list(set(row[-1] for row in dataset))
index_list, val_list, group_list, fit = [], [], [], []
attributes = list()
while len(attributes) < attributes_num:
index = randrange(len(dataset[0])-1)
if index not in attributes:
attributes.append(index)
counter = 0
for index in attributes:
for row in dataset:
groups = split_attribute(dataset, index, row[index])
gini = gini_index(groups, classes)
index_list.append(index)
val_list.append(row[index])
group_list.append(groups)
fit.append(1-gini)
counter += 1
wheel_size = 0
fit_args_sorted = np.argsort(fit)
for i in range (0, int(threshold*counter)):
fit[fit_args_sorted[i]] = 0
for i in range (0, counter):
wheel_size += fit[i]
selection_probs = [fit[i]/wheel_size for i in range (0, counter)]
winner = int(npr.choice(np.arange(counter), 1, p=selection_probs))
return {'val':val_list[winner], 'groups':group_list[winner], 'index':index_list[winner]}
def terminal(group):
results = [row[-1] for row in group]
return max(results, key=results.count)
def one_class(node):
res = True
for i in range(0, len(node)):
if node[0][-1] != node[i][-1]:
res = False
return res
return res
def are_the_same(node):
res = True
for i in range(0, len(node[0])-1):
for j in range(0, len(node)):
for k in range(0, len(node)):
if node[j][i] != node[k][i]:
res = False
return res
return res
def split(node, attributes_num, depth, threshold):
left, right = node['groups']
del(node['groups'])
if not left or not right:
node['left'] = node['right'] = terminal(left + right)
return
if one_class(left) or are_the_same(left):
node['left'] = terminal(left)
else:
node['left'] = roulette_split(left, attributes_num, threshold)
split(node['left'], attributes_num, depth+1, threshold)
if one_class(right) or are_the_same(right):
node['right'] = terminal(right)
else:
node['right'] = roulette_split(right, attributes_num, threshold)
split(node['right'], attributes_num, depth+1, threshold)
def build_tree(train, attributes_num, threshold):
root = roulette_split(train, attributes_num, threshold)
split(root, attributes_num, 1, threshold)
return root
def predict(node, row):
if row[node['index']] < node['val']:
if isinstance(node['left'], dict):
return predict(node['left'], row)
else:
return node['left']
else:
if isinstance(node['right'], dict):
return predict(node['right'], row)
else:
return node['right']
def bagging_predict(trees, row):
predictions = [predict(tree, row) for tree in trees]
return max(set(predictions), key=predictions.count)
def random_forest(train, test, attributes_num, trees_num, sample_size, threshold):
trees = list()
for i in range(trees_num):
sample = subset(train, sample_size)
tree = build_tree(sample, attributes_num, threshold)
trees.append(tree)
predictions = [bagging_predict(trees, row) for row in test]
return(predictions)
- launch.py
from asyncore import write
from random_forest import *
from data_load import *
from random_forest import *
from data_load import *
def main():
data_absence = read_file("datasets_v2/Absenteeism_at_work.csv")
dataset_absence = load_data(data_absence, 3)
dataset_absence = dataset_absence[1:]
str2float(dataset_absence)
data_car = read_file("datasets_v2/car.data")
dataset_car = load_data(data_car, 1)
str2int(dataset_car)
data_opt = read_file("datasets_v2/batch_optdigits.tra")
dataset_opt = load_data(data_opt, 1)
str2float(dataset_opt)
data_gas = read_file("datasets_v2/GasSensorDataset/batch1.dat")
dataset_gas = load_data(data_gas, 2)
dataset_gas = process_dat_data(dataset_gas)
str2float(dataset_gas)
sk_scores = []
for_scores = []
col_1 = []
col_2 = []
col_3 = []
col_4 = []
col_5 = []
col_6 = []
col_7 = []
col_8 = []
for dataset in [dataset_car, dataset_absence, dataset_gas]:
sample_size = 0.05
folds_num = 5
for threshold in [0.2 , 0.5, 0.9]:
print('Dla mojej informacji - threshold: ', threshold, '\n')
for trees_num in range(1,10):
sk_scores.clear()
for_scores.clear()
for i in [1,2,3,5]:
seed(i)
scores = evaluate_algorithm(dataset, folds_num, trees_num, sample_size, threshold)
score = sum(scores[0])/float(len(scores[0]))
for_scores.append(score)
sk_score = sum(scores[1])/float(len(scores[1]))
sk_scores.append(sk_score)
if dataset == dataset_car:
col_1.append('SECOM Dataset')
col_1.append('SECOM Dataset')
elif dataset == dataset_gas:
col_1.append('Gas Sensor Dataset')
col_1.append('Gas Sensor Dataset')
else:
col_1.append('Absenteeism Dataset')
col_1.append('Absenteeism Dataset')
for_mean = round(np.mean(for_scores),2)
for_stdev = round(np.std(for_scores),2)
for_best = round(np.amax(for_scores),2)
for_worst = round(np.amax(for_scores),2)
sk_mean = round(np.mean(sk_scores),2)
sk_stdev = round(np.std(sk_scores),2)
sk_best = round(np.amax(sk_scores),2)
sk_worst = round(np.amax(sk_scores),2)
col_2.append('Własna implementacja')
col_2.append('Sklearn')
col_3.append(trees_num)
col_3.append(trees_num)
col_4.append(threshold)
col_4.append(threshold)
col_5.append(for_mean)
col_6.append(for_stdev)
col_7.append(for_best)
col_8.append(for_worst)
col_5.append(sk_mean)
col_6.append(sk_stdev)
col_7.append(sk_best)
col_8.append(sk_worst)
print(trees_num)
header = ['Zbiór danych', 'Implementacja', 'Ilość drzew', 'Próg selekcji', 'Średnia jakość', 'Odchylenie standardowe', 'Najlepsza jakość', 'Najgorsza jakość']
file_data = np.column_stack((col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8))
file_data = np.vstack((header, file_data))
open('tests_res.csv', 'w').close()
write_data(file_data)
if __name__ == "__main__":
main()