#!/usr/bin/env python3
"""Train a random forest classifier and write a few monitoring plots.

The script reads 'fsu_ml_hwk_data.csv' (columns 'var1', 'var2', 'var3' and
'label'), trains a RandomForestClassifier on the shuffled data and stores:

* RF_output.png            - classifier output for events labeled with 1
* RF_correlation_plots.png - variable 2 vs. variable 1, true vs. predicted label 1
* RF_ROC_species2.png      - ROC curve for label 1
* RF_confusion_matrix.png  - row-normalized confusion matrix
"""
import numpy as np  #--> Handles numpy arrays
import matplotlib
import matplotlib.pyplot as plt  #--> Handles the plotting
import pandas as pd  #--> All about pandas, the DataFrame, not the animal...
from matplotlib.colors import LogNorm  #--> Enable log scale to your 2D histogram
from sklearn.utils import shuffle  #--> Shuffle data
from sklearn.ensemble import RandomForestClassifier  #--> Get the random forest classifier
from sklearn.metrics import roc_curve  #--> All about ROC
from sklearn.metrics import confusion_matrix  #--> All about confusion

#Note: Most of the parts seen here have been taken from:
#a) https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#b) https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
#c) https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py

#In case of comments, questions, or complaints, please contact:
#Daniel Lersch - dlersch@jlab.org

print(" ")
print("RUN TRAINING OF A RANDOM FOREST CLASSIFIER")
print(" ")

#1.) Load the data and have a look at it:
#----------------------------------------
print("Load DataFrame...")
data = 'fsu_ml_hwk_data.csv'  #--> Full path to where the data is stored
data_df = pd.read_csv(data)  #--> Read the data in
print("...done!")
print(" ")

print("Show a few first entries of the DataFrame...")
print(" ")
print(data_df.head(10))
print(" ")
print("...done!")
print(" ")
#----------------------------------------

#2.) Define features and target values:
#----------------------------------------
print("Prepare data for classifier training...")
X = data_df[['var1', 'var2', 'var3']].values
Y = data_df['label'].values
#----------------------------------------

#3.) Shuffle the data, in order to avoid bias:
#----------------------------------------
x_train, y_train = shuffle(X, Y, random_state=0)
print("...done!")
print(" ")
#----------------------------------------

#4.) Setup the random forest classifier:
#----------------------------------------
print("Setup random forest classifier...")
my_rf = RandomForestClassifier(
    n_estimators=10,  #--> Number of trees in your forest
    warm_start=True,  #--> Reuse the already fitted trees on a repeated fit() call
    max_depth=5,  #--> Maximum depth of tree
    random_state=0,
)
print("...done!")
print(" ")
#----------------------------------------

#5.) Now train the random forest:
#----------------------------------------
print("Train the random forest classifier...")
my_rf.fit(x_train, y_train)
print("...done!")
print(" ")
#----------------------------------------

#6.) Get predictions and add them to the data frame:
#Note: The classifier is evaluated on the very same events it has been trained
#on, so all plots below show the performance on the training data.
#----------------------------------------
print("Add classifier predictions to the DataFrame...")
data_df['prediction'] = my_rf.predict(X)
probabilities = my_rf.predict_proba(X)
#The columns of predict_proba() follow the order of my_rf.classes_, so look up
#the column that belongs to label 1 instead of assuming it is column 1:
species2_column = list(my_rf.classes_).index(1)
data_df['probability2'] = probabilities[:, species2_column]

print(" ")
print(data_df.head(10))
print(" ")
print("...done!")
print(" ")
#----------------------------------------

n_bins = 100
print("Create a few monitroing plots...")

#Select the events once and reuse the selection for all plots:
is_species2 = data_df['label'] == 1  #--> True species2, i.e. labeled with 1
true_species2 = data_df[is_species2]
predicted_species2 = data_df[data_df['prediction'] == 1]  #--> Predicted as 1

#7.) Plot the random forest output for species 2 only:
#----------------------------------------
plt.rcParams.update({'font.size': 18})
plt.subplots_adjust(bottom=0.15, top=0.9)

plt.hist(true_species2['probability2'], bins=n_bins, log=True, facecolor='r', alpha=0.5)
plt.xlabel('Random Forest Output for Species2')
plt.ylabel('Entries [a.u.]')
#plt.show()
plt.savefig('RF_output.png')
plt.close()
#----------------------------------------

#8.) Plot variable 2 vs. variable 1 before and after classification of species2:
#----------------------------------------
fig, ax = plt.subplots(1, 2, sharex=True, sharey=True)
plt.subplots_adjust(bottom=0.15, top=0.9)

#hist2d() takes x first and y second, so var1 goes on the x-axis and var2 on
#the y-axis, in agreement with the axis labels set below.

#Before classification:
#Note: we look at events that JUST contain species2, i.e. labeled with 1:
ax[0].hist2d(true_species2['var1'], true_species2['var2'], bins=n_bins, norm=LogNorm())

#After classification:
#Note: we look at events containing ALL species, but predicted as 1:
ax[1].hist2d(predicted_species2['var1'], predicted_species2['var2'], bins=n_bins, norm=LogNorm())

#Make some axis-labels:
ax[0].set_xlabel('Variable 1')
ax[0].set_ylabel('Variable 2')
ax[1].set_xlabel('Variable 1')
#plt.show()
plt.savefig('RF_correlation_plots.png')
plt.close()

print("...done!")
print(" ")
#----------------------------------------

#9.) Get the roc-curve:
#----------------------------------------
print("Get and plot the ROC-Curve for Species 2...")

#This is ROC-curve for species 2, which is labeled with 1:
#fpr = false positive rate
#tpr = true positive rate
fpr_s2, tpr_s2, th_s2 = roc_curve(
    data_df['label'].values,
    data_df['probability2'].values,
    pos_label=1,
)

plt.subplots_adjust(bottom=0.15, top=0.9)
plt.plot(fpr_s2, tpr_s2, 'rd', label='ROC: Species 2')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
#plt.show()
plt.savefig('RF_ROC_species2.png')
plt.close()

print("...done!")
print(" ")
#----------------------------------------

#10.) Get the confusion matrix:
#----------------------------------------
print("Calculate and show the confusion matrix...")

#Apparently, not all scikit versions support the plot_confusion_matrix() function
#so we have to do a little work-around. I am sorry for that
my_labels = [0, 1, 2]

#Calculate the confusion matrix:
my_confusion_matrix = confusion_matrix(
    data_df['label'].values,
    data_df['prediction'].values,
    labels=my_labels,
)
#and normalize it, so that every row (= true label) sums up to 1.
#Rows without any true events are left at 0 instead of dividing by zero:
row_sums = my_confusion_matrix.sum(axis=1, keepdims=True)
my_confusion_matrix = np.divide(
    my_confusion_matrix,
    row_sums,
    out=np.zeros(my_confusion_matrix.shape, dtype=float),
    where=row_sums != 0,
)

#Note: The following lines have been taken from:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
text_format = '.2f'
matrix_title = 'Normalized Confusion Matrix'

fig, ax = plt.subplots()
plt.rcParams['font.size'] = 20
plt.subplots_adjust(bottom=0.25, top=0.9)

im = ax.imshow(my_confusion_matrix, interpolation='nearest')

n_rows, n_cols = my_confusion_matrix.shape
ax.set_xticks(np.arange(n_cols))
ax.set_yticks(np.arange(n_rows))
ax.set_xticklabels(my_labels)
ax.set_yticklabels(my_labels)
#Minor ticks sit on the cell borders:
ax.set_xticks(np.arange(n_cols + 1) - .5, minor=True)
ax.set_yticks(np.arange(n_rows + 1) - .5, minor=True)
ax.tick_params(axis='both', which='major', labelsize=30)
ax.set_xlabel('Predicted Label', fontsize=30, labelpad=15)
ax.set_ylabel('True Label', fontsize=30, labelpad=25)
ax.set_title(matrix_title, y=1.03)
ax.figure.colorbar(im, ax=ax)

#Write the value into each cell; cells above half of the maximum get black
#text, all others white text:
color_thresh = my_confusion_matrix.max() / 2.
#++++++++++++++++++++++++++++++++
for (i, j), fraction in np.ndenumerate(my_confusion_matrix):
    ax.text(
        j, i, format(fraction, text_format),
        ha='center', va='center',
        color="black" if fraction > color_thresh else "white",
    )
#++++++++++++++++++++++++++++++++

#plt.show()
plt.savefig('RF_confusion_matrix.png')
plt.close()

print("...done! Have a great day!")
print(" ")
#----------------------------------------