Initial commit

Aladdin Persson
2021-01-30 21:49:15 +01:00
commit 65b8c80495
432 changed files with 1290844 additions and 0 deletions

example_data/data.txt

@@ -0,0 +1,90 @@
1.1107, -2.1079, 1
-0.5498, 0.0943, 1
-0.0382, 1.8829,1
0.0555, -0.6139,1
0.5870, -1.2067,1
0.5453, 0.2509,1
-0.3927, -0.6220,1
-1.1905, -1.8785,1
-0.4240, 0.7772,1
-0.7139, 1.5846,1
-0.8883, 2.1408,1
-0.6922, 0.0993,1
1.4350, 1.2334,1
-0.7576, 0.7386,1
-1.1144, -1.7059,1
0.6612, -1.7296,1
-2.1381, -0.0600,1
1.3857, 1.2178,1
-1.4951, 0.0373,1
0.8029, 0.9739,1
1.5607, 1.5862,1
0.8563, -1.4245,1
0.0397, -1.3799,1
1.2331, 1.7421,1
-2.0015, 0.8355,1
-0.3428, -0.4780,1
-0.8891, 1.2634,1
0.3832, -0.1189,1
0.4172, 1.0132,1
-0.8695, -0.7947,1
2.9737, 3.6438,2
3.7680, 1.8649,2
0.1166, 0.9435,2
0.6896, 3.9160,2
1.2234, 2.9899,2
2.3009, 0.4150,2
3.7693, 3.8027,2
1.9450, 3.4208,2
0.9290, 3.3611,2
5.0027, 2.7870,2
1.0101, 1.8737,2
2.0751, 2.2628,2
1.9113, 3.6777,2
2.3127, 3.9130,2
1.9392, 2.3976,2
3.1218, 2.5495,2
1.7032, 1.1509,2
0.4212, 3.5322,2
2.7686, 0.9402,2
2.1696, 2.9285,2
0.3380, 2.0947,2
3.6886, 0.4054,2
2.6315, 3.1962,2
-0.5332, 3.1421,2
0.3380, 3.0801,2
1.4030, 1.1841,2
2.8739, 2.7777,2
1.1254, 3.2404,2
0.0988, 1.9522,2
0.3688, 2.8904,2
1.4758, -1.6387,3
1.9289, -1.8191,3
2.5741, -1.3213,3
2.1917, -1.2852,3
0.8358, -2.3349,3
2.6863, -1.8834,3
3.1102, -0.4854,3
3.7073, -0.6466,3
3.6394, -0.4097,3
0.5365, -3.6555,3
2.9295, -0.3819,3
0.8168, -3.1133,3
1.3432, -1.7717,3
1.1039, -2.2261,3
1.3754, -2.2236,3
0.6757, -2.5379,3
-0.2029, -3.8420,3
2.4210, -1.9788,3
1.0335, -2.6042,3
0.9638, -2.9449,3
-0.8198, -5.4449,3
1.9552, -1.5530,3
0.3505, -3.1887,3
2.4943, -1.8116,3
1.9761, -1.0664,3
0.5994, -3.0513,3
2.2076, -1.6728,3
1.9941, -1.8826,3
1.7487, -2.9644,3
1.4160, -2.4234,3

example_data/mock_data.csv

@@ -0,0 +1,100 @@
701,478,227,863,963,2
96,147,210,493,586,2
798,143,431,541,94,1
233,146,667,886,771,1
668,815,628,429,387,3
718,456,883,281,840,1
182,837,144,664,460,2
882,533,203,776,56,3
648,715,288,619,293,1
178,951,965,164,1,3
270,432,457,978,794,1
335,219,596,763,231,1
47,477,78,423,616,3
324,969,514,55,722,2
824,571,159,516,594,2
837,667,957,150,508,3
833,945,311,12,859,1
536,280,21,292,518,1
943,55,709,269,425,1
593,178,861,130,26,3
54,165,3,638,816,2
637,861,423,855,98,1
222,502,427,944,732,1
8,465,403,376,761,2
184,602,673,825,741,1
639,677,204,385,236,2
176,843,479,952,898,2
125,626,553,74,1000,3
302,495,294,362,169,2
131,912,803,232,852,1
117,609,290,133,357,2
207,812,788,182,494,1
954,76,257,620,844,1
287,266,195,30,344,3
440,590,324,868,969,3
831,290,228,586,971,1
567,734,460,429,689,1
864,939,191,620,431,1
905,337,200,400,77,2
304,997,141,208,615,3
19,280,187,44,639,1
280,279,275,305,123,1
866,519,331,241,972,1
27,77,860,458,643,3
486,713,917,324,855,2
466,16,897,222,731,1
712,230,215,805,341,1
300,100,292,978,115,3
938,800,911,345,49,3
98,593,43,583,684,1
348,479,406,605,595,2
892,877,592,339,615,3
203,53,995,704,927,2
991,968,886,43,883,1
733,939,71,388,56,1
249,376,830,628,812,2
4,877,743,242,266,1
95,537,106,490,518,2
870,704,430,270,327,2
402,97,283,569,638,3
537,979,966,729,8,3
399,51,285,973,509,1
662,951,947,923,112,3
71,573,9,305,351,2
240,837,836,277,177,1
513,318,709,435,367,2
553,253,980,868,26,1
848,543,171,420,73,1
449,538,720,347,500,2
42,319,830,447,727,2
165,968,151,672,452,3
1,781,142,137,157,2
907,364,776,490,502,2
146,512,87,344,233,3
478,62,55,815,283,3
751,789,112,277,483,1
189,597,866,73,397,3
607,210,327,538,68,2
337,401,557,667,642,1
249,894,84,81,643,1
896,858,568,345,157,1
362,886,558,531,735,1
865,418,866,824,370,3
14,517,514,257,129,2
845,833,998,211,684,2
289,302,416,364,920,2
383,173,991,815,368,3
652,325,903,471,224,3
757,580,974,667,620,1
946,247,684,191,332,2
63,330,199,280,608,2
752,298,95,143,134,2
987,105,747,931,413,3
510,23,385,711,701,1
326,195,651,727,85,3
214,978,396,428,14,1
646,133,388,896,971,1
849,817,294,491,397,2
854,973,274,315,897,3
666,530,683,234,439,1

example_data/targets.txt

@@ -0,0 +1,90 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3

View File

@@ -0,0 +1,170 @@
"""
Author: Philip Andreadis
e-mail: philip_andreadis@hotmail.com
Implementation of Random Forest model from scratch.
The DecisionTree class from this project is used for generating the trees of the random forest.
This class remains with no changes as the dataset is split into a number of folds with a random subset of features on which each tree is trained on.
As a result each tree is trained on a different group of the dataset in order to avoid correlation between them.
The predicted class value of each instance is chosen by voting from each single tree's outcome.
Parameters of the model:
MAX_DEPTH (int): Maximum depth of the decision tree
MIN_NODE (int): Minimum number of instances a node can have. If this threshold is exceeded the node is terminated
FOLD_SIZE (int): Value between 1-10 representing the percentage of the original dataset size each fold should be.
N_TREES (int):The toral number of trees that will be trained.
Input dataset to train() function must be a numpy array containing both feature and label values.
"""

from random import randint, randrange

import numpy as np

from decision_tree import DecisionTree

# fold size in tenths of the dataset size, e.g. 3 means 30%
FOLD_SIZE = 10
# number of trees
N_TREES = 20
# max tree depth
MAX_DEPTH = 30
# min size of a tree node
MIN_NODE = 1

class RandomForest:
    def __init__(self, n_trees, fold_size):
        self.n_trees = n_trees
        self.fold_size = fold_size
        self.trees = list()
"""
This function splits the given dataset into n-folds with replacement. The number of folds is equal to the number of the trees that will be trained.
Each tree will have one fold as input. The size of the folds is a percentage (p) of the size of the original dataset.
Parameters:
dataset: np array of the given dataset
n_folds (int): number of folds in which the dataset should be split. Must be equal to the number of trees the user wants to train
p (int): suggests the percentage of the dataset's size the size of a single fold should be.
Returns list of np arrays: list with the k-folds
"""
def cross_validation_split(self,dataset, n_folds, p):
dataset_split = list()
fold_size = int(len(dataset)*p/10)
for i in range(n_folds):
fold = list()
while len(fold) < fold_size:
index = randrange(len(dataset))
fold.append(dataset[index])
set = np.array(fold)
dataset_split.append(set)
return dataset_split
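
    # Worked example (illustrative numbers): for a 100-row dataset with
    # n_folds=20 and p=10, this yields 20 bootstrap samples of
    # int(100 * 10 / 10) = 100 rows each; with p=3 each sample would hold
    # int(100 * 3 / 10) = 30 rows.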
"""
This function randomizes the selection of the features each tree will be trained on.
Parameters:
splits list of np arrays: list of folds
Returns list of np arrays: list with the k-folds with some features randomly removed
"""
def randomize_features(self,splits):
dataset_split = list()
l = len(splits[0][0])
n_features = int((l-1)*5/10)
for split in splits:
for i in range(n_features):
rng = list(range(len(split[0]) - 1))
selected = rng.pop(randint(0,len(rng)-1))
split = np.delete(split, selected, 1)
set = np.array(split)
dataset_split.append(set)
return dataset_split
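
    # Worked example: mock_data.csv has 6 columns (5 features + label), so
    # n_features = int((6 - 1) * 5 / 10) = 2 columns are deleted from each
    # fold, leaving each tree 3 of the original 5 features plus the label.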
"""
Prints out all the decision trees of the random forest.
BUG: The feature number is not representative of its initial enumeration in the original dataset due to the randomization.
This means that we do not know on which features each tree is trained on.
"""
def print_trees(self):
i = 1
for t in self.trees:
print("Tree#",i)
temp = t.final_tree
t.print_dt(temp)
print("\n")
i = i+1
"""
Iteratively train each decision tree.
Parameters:
X (np.array): Training data
"""
def train(self,X):
train_x = self.cross_validation_split(X,self.n_trees,self.fold_size)
train_x = self.randomize_features(train_x)
for fold in train_x:
dt = DecisionTree(MAX_DEPTH, MIN_NODE)
dt.train(fold)
self.trees.append(dt)
"""
This function outputs the class value for each instance of the given dataset as predicted by the random forest algorithm.
Parameters:
X (np.array): Dataset with labels
Returns y (np.array): array with the predicted class values of the dataset
"""
def predict(self,X):
predicts = list()
final_predicts = list()
for tree in self.trees:
predicts.append(tree.predict(X))
# iterate through each tree's class prediction and find the most frequent for each instance
for i in range(len(predicts[0])):
values = list()
for j in range(len(predicts)):
values.append(predicts[j][i])
final_predicts.append(max(set(values), key=values.count))
return final_predicts,predicts
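
    # Majority-vote example: if three trees predict [1, 3, 1] for an
    # instance, max(set(values), key=values.count) returns 1. Ties are
    # broken by set iteration order, which is arbitrary.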

if __name__ == "__main__":
    # Training data (loaded for reference; the demo below uses the mock set)
    train_data = np.loadtxt("example_data/data.txt", delimiter=",")
    train_y = np.loadtxt("example_data/targets.txt")
    mock_train = np.loadtxt("example_data/mock_data.csv", delimiter=",")
    mock_y = mock_train[:, -1]

    # Build and train model
    rf = RandomForest(N_TREES, FOLD_SIZE)
    rf.train(mock_train)

    # Evaluate model on training data
    y_pred, y_pred_ind = rf.predict(mock_train)
    print(f"Accuracy of random forest: {sum(y_pred == mock_y) / mock_y.shape[0]}")
    print("\nAccuracy for each individual tree:")
    for c, tree_pred in enumerate(y_pred_ind, start=1):
        print("\nTree", c)
        print(f"Accuracy: {sum(tree_pred == mock_y) / mock_y.shape[0]}")
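
    # A minimal sketch (not run above) of the same evaluation on the first
    # dataset; this assumes data.txt's last column duplicates targets.txt,
    # as the files above suggest:
    # rf2 = RandomForest(N_TREES, FOLD_SIZE)
    # rf2.train(train_data)
    # y2, _ = rf2.predict(train_data)
    # print(f"Accuracy on data.txt: {sum(y2 == train_y) / train_y.shape[0]}")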