ajout du cas discret

2019-04-29 21:35:45 -04:00 · 2019-04-29 21:35:45 -04:00 · 54badfdff4
commit 54badfdff4
parent ee8c7cbb2a
2 changed files with 89 additions and 44 deletions
--- a/Code/DecisionTree.py
+++ b/Code/DecisionTree.py
@@ -10,7 +10,8 @@ je vais avoir besoin de tester les méthodes train, predict et test de votre cod
 """

 import numpy as np
-
+import metrics
+import time

 # le nom de votre classe
 # NeuralNet pour le modèle Réseaux de Neurones
@@ -18,13 +19,14 @@ import numpy as np

 class DecisionTree: #nom de la class à changer

-    def __init__(self, **kwargs):
+    def __init__(self, attribute_type, **kwargs):
        """
        c'est un Initializer. 
        Vous pouvez passer d'autre paramètres au besoin,
        c'est à vous d'utiliser vos propres notations
        """
        self.tree=[]
+        self.attribute_type=attribute_type
        
    def plurality_value(self,train_labels):
        values,_,counts = np.unique(train_labels,return_index=True, return_counts=True)
@@ -36,33 +38,39 @@ class DecisionTree: #nom de la class à changer
        total_count = sum(counts)
        entropie_total = -sum(counts/total_count * np.log2(counts/total_count))
        #print("Entropie Total:"+str(entropie_total))
-        #Trouver split
-        attribute_sort_order = np.argsort(train[:,attribute])
-        sorted_labels = train_labels[attribute_sort_order]
-        lags = np.hstack((np.array([False]),sorted_labels[:-1] != sorted_labels[1:]))
-        lags2 = np.hstack((np.array([False]),lags))[:-1]
-        potential_splits = 0.5*train[attribute_sort_order,attribute][lags]+0.5*train[attribute_sort_order,attribute][lags2]
-        if (len(potential_splits)==0):
-            potential_splits = np.array([np.median(train[attribute_sort_order,attribute])])
-        #print("Potential Split:"+str(potential_splits))
-        split_gain = []
-        for v in potential_splits:
-            split_labels_1 = train_labels[train[:,attribute] <= v]
-            split_labels_2 = train_labels[train[:,attribute] > v]
-            _,_,counts1 = np.unique(split_labels_1,return_index=True, return_counts=True)
+        
+        if (self.attribute_type=="continuous"):
+            #Trouver split
+            attribute_sort_order = np.argsort(train[:,attribute])
+            sorted_labels = train_labels[attribute_sort_order]
+            lags = np.hstack((np.array([False]),sorted_labels[:-1] != sorted_labels[1:]))
+            lags2 = np.hstack((np.array([False]),lags))[:-1]
+            potential_splits = 0.5*train[attribute_sort_order,attribute][lags]+0.5*train[attribute_sort_order,attribute][lags2]
+            if (len(potential_splits)==0):
+                potential_splits = np.array([np.median(train[attribute_sort_order,attribute])])
+            #print("Potential Split:"+str(potential_splits))
+            split_gain = []
+            for v in potential_splits:
+                split_labels_1 = train_labels[train[:,attribute] <= v]
+                split_labels_2 = train_labels[train[:,attribute] > v]
+                _,_,counts1 = np.unique(split_labels_1,return_index=True, return_counts=True)
+                total_count1 = sum(counts1)
+                entropie_total1 = -sum(counts1/total_count1 * np.log2(counts1/total_count1))
+                _,_,counts2 = np.unique(split_labels_2,return_index=True, return_counts=True)
+                total_count2 = sum(counts2)
+                entropie_total2 = -sum(counts2/total_count2 * np.log2(counts2/total_count2))
+                split_gain.append(entropie_total-(total_count1/total_count*entropie_total1+total_count2/total_count*entropie_total2))
+            #Valeur unique attribut
+            #print("Split Gain:"+str(split_gain))
+            best_split = potential_splits[np.argmax(split_gain)]
+            best_gain = max(split_gain)
+            return best_gain,best_split
+        if (self.attribute_type=="discrete"):
+            _,counts1 = np.unique(train[:,attribute], return_counts=True)
            total_count1 = sum(counts1)
            entropie_total1 = -sum(counts1/total_count1 * np.log2(counts1/total_count1))
-            _,_,counts2 = np.unique(split_labels_2,return_index=True, return_counts=True)
-            total_count2 = sum(counts2)
-            entropie_total2 = -sum(counts2/total_count2 * np.log2(counts2/total_count2))
-            split_gain.append(entropie_total-(total_count1/total_count*entropie_total1+total_count2/total_count*entropie_total2))
-        #Valeur unique attribut
-        #print("Split Gain:"+str(split_gain))
-        best_split = potential_splits[np.argmax(split_gain)]
-        best_gain = max(split_gain)
-        
-        return best_gain,best_split
-        
+            gain = entropie_total - entropie_total1
+            return gain,None
        
    
    def decision_tree_learning(self,train, train_labels, attributes, parent_examples):
@@ -92,22 +100,35 @@ class DecisionTree: #nom de la class à changer
                gain, split = self.importance(train, train_labels, a)
                a_gain.append(gain)
                a_split.append(split)
+                
            # Calcul du meilleur attribut
+            
            pos_gain_max = np.argmax(a_gain)
            a_max = attr[pos_gain_max]
            a_max_split = a_split[pos_gain_max]
-            tree = []
            attributes[a_max]=0
+            
+            # Nouvel arbre
+            
+            tree = []
+            
            # pour chaque valeur de l'attribut, faire un sous-arbre
-            for v in [True,False]:
-                print("Nouvelle branche: l'attribut "+str(a_max)+"<="+str(a_max_split)+" est: "+str(v))
-                train_pos = np.where((train[:,a_max] <= a_max_split) == v)
-                subtree = self.decision_tree_learning(train[train_pos],train_labels[train_pos],attributes,train_labels)
-                tree.append(("Branche",a_max,a_max_split,v,subtree))
+            if (self.attribute_type=="continuous"):
+                for v in [True,False]:
+                    print("Nouvelle branche: l'attribut "+str(a_max)+"<="+str(a_max_split)+" est: "+str(v))
+                    train_pos = np.where((train[:,a_max] <= a_max_split) == v)
+                    subtree = self.decision_tree_learning(train[train_pos],train_labels[train_pos],attributes,train_labels)
+                    tree.append(("Branche",a_max,a_max_split,v,subtree))
+            if (self.attribute_type=="discrete"):
+                for v in np.unique(train[:,a_max]):
+                    print("Nouvelle branche: l'attribut "+str(a_max)+" est: "+str(v))
+                    train_pos = np.where(train[:,a_max] == v)
+                    subtree = self.decision_tree_learning(train[train_pos],train_labels[train_pos],attributes,train_labels)
+                    tree.append(("Branche",a_max,v,subtree))
        return tree
                
        
-    def train(self, train, train_labels): #vous pouvez rajouter d'autres attribus au besoin
+    def train(self, train, train_labels,verbose=True): #vous pouvez rajouter d'autres attribus au besoin
        """
        c'est la méthode qui va entrainer votre modèle,
        train est une matrice de taille nxm, avec 
@@ -115,6 +136,7 @@ class DecisionTree: #nom de la class à changer
        m : le mobre d'attribus (le nombre de caractéristiques)
        
        train_labels : est une matrice de taille nx1
+        verbose: afficher les métriques calculées
        
        vous pouvez rajouter d'autres arguments, il suffit juste de
        les expliquer en commentaire
@@ -131,6 +153,8 @@ class DecisionTree: #nom de la class à changer
        
        self.tree = self.decision_tree_learning(train, train_labels, attributes, None)
        
+        return self.test(train, train_labels,verbose)
+        
        
    def extract_tree(self,myTree,exemple):
        for b in myTree:
@@ -138,8 +162,12 @@ class DecisionTree: #nom de la class à changer
            if b[0] == 'Feuille':
                return b[1]
            # On est dans une branche, on teste le split
-            if ((exemple[b[1]] <= b[2]) == b[3]):
-                return self.extract_tree(b[4],exemple)
+            if self.attribute_type=="continuous":
+                if ((exemple[b[1]] <= b[2]) == b[3]):
+                    return self.extract_tree(b[4],exemple)
+            if self.attribute_type=="discrete":
+                if (exemple[b[1]] == b[2]):
+                    return self.extract_tree(b[3],exemple)
        return None
            
        
@@ -155,15 +183,13 @@ class DecisionTree: #nom de la class à changer
        
        return self.extract_tree(self.tree,exemple)
        
-        
-        
-
-    def test(self, test, test_labels):
+    def test(self, test, test_labels, verbose=True):
        """
        c'est la méthode qui va tester votre modèle sur les données de test
        l'argument test est une matrice de taille nxm, avec 
        n : le nombre d'exemple de test dans le dataset
        m : le mobre d'attribus (le nombre de caractéristiques)
+        verbose: afficher les métriques calculées
        
        test_labels : est une matrice taille nx1
        
@@ -177,6 +203,15 @@ class DecisionTree: #nom de la class à changer
        Bien entendu ces tests doivent etre faits sur les données de test seulement
        
        """
+        start_time = time.time()
+        prediction_test = [self.predict(exemple,label) for exemple,label in zip(test,test_labels)]
+        cm = metrics.confusion_matrix(test_labels,prediction_test)
+        accuracy, precision, recall = metrics.prediction_metrics(cm,test_labels,prediction_test)
+        compute_time = time.time() - start_time
+        if (verbose):
+            metrics.print_prediction_metrics(cm,accuracy,precision,recall,compute_time)
+
+        return cm,accuracy,precision,recall,compute_time
    
    
    # Vous pouvez rajouter d'autres méthodes et fonctions,
--- a/Code/main.py
+++ b/Code/main.py
@@ -8,10 +8,20 @@ import DecisionTree  # importer la classe de l'Arbre de Décision
 # importer d'autres fichiers et classes si vous en avez développés
 # importer d'autres bibliothèques au besoin, sauf celles qui font du machine learning

-dt = DecisionTree.DecisionTree()
+train1, train_labels1, test1, test_labels1 = ld.load_iris_dataset(train_ratio = 0.7)
+train2, train_labels2, test2, test_labels2 = ld.load_monks_dataset(1)
+train3, train_labels3, test3, test_labels3 = ld.load_monks_dataset(2)
+train4, train_labels4, test4, test_labels4 = ld.load_monks_dataset(3)
+train5, train_labels5, test5, test_labels5 = ld.load_congressional_dataset(train_ratio = 0.7)

-dt.train(train,train_labels)

-dt.tree
+dt1 = DecisionTree.DecisionTree(attribute_type="continuous")
+dt1.train(train1, train_labels1)
+dt1.predict(test1[0],test_labels1[0])
+dt1.test(test1, test_labels1)

-[(dt.predict(exemple,label),label) for exemple,label in zip(train,train_labels)]
+
+dt5 = DecisionTree.DecisionTree(attribute_type="discrete")
+dt5.train(train5, train_labels5)
+dt5.predict(test5[0],test_labels5[0])
+dt5.test(test5, test_labels5)