"""Train two supervised classifiers (MLP and random forest) on the workshop data.

Steps performed, in order:

1. Download the workshop CSV into a pandas dataframe.
2. Configure an sklearn ``MLPClassifier`` and ``RandomForestClassifier``.
3. Min-max scale the selected feature columns and shuffle the samples.
4. Fit both models, plot the MLP learning curve, and dump the models to disk.
5. Append the per-class probabilities of both models to the dataframe.
6. Plot the classifier outputs and the feature correlations after a cut on
   the first MLP output.
"""
import pandas as pd
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from joblib import dump
from matplotlib.colors import LogNorm

print(" ")
print("TRAIN CLASSIFIER (SUPERVISED)")
print(" ")

# First, we load the dataframe from the FSU cluster
#********************************************
print("Load dataframe...")

data_directory = 'http://hadron.physics.fsu.edu/~dlersch/GlueX_PANDA_EIC_ML_Workshop'
data_name = 'hands_on_data_033_033_033.csv'
dataFrame = pd.read_csv(data_directory + '/' + data_name)

print("...done!")
print(" ")
#********************************************

# Then, we set up two classification algorithms.
# Please feel free to change the internal parameters
# for each algorithm (e.g. number of hidden layers for the neural network)
#********************************************

# 1.) Setting up the neural network:
print(" ")
print("Setting up a multilayer perceptron...")

# Change this flag to train the MLP with / without early stopping
use_mlp_early_stopping = True

my_mlp = MLPClassifier(
    # One hidden layer with 5 neurons. Written as a real tuple: "(5)" is just
    # the integer 5, which only works because sklearn also accepts an int.
    hidden_layer_sizes=(5,),
    activation='tanh',
    solver='sgd',
    shuffle=True,
    validation_fraction=0.25,  # --> Use validation data to avoid overfitting
    # --> Change to disable early stopping and see the difference,
    #     no validation curve available then...
    early_stopping=use_mlp_early_stopping,
    max_iter=10,  # --> Change this number to increase the number of training iterations
    learning_rate_init=0.01,
    warm_start=True,
    tol=1e-6,
)

print("...done!")
print(" ")

# 2.) Setting up the random forest:
print(" ")
print("Setting up a random forest...")

my_rf = RandomForestClassifier(
    n_estimators=10,  # --> Number of trees in your forest
    warm_start=True,
    max_depth=5,  # --> Maximum depth of each tree
    bootstrap=True,  # --> Sample subsets from the training data to train each tree with an individual set
    random_state=0,
)

print("...done!")
print(" ")
#********************************************

# Pre-process the data, i.e. choose features and normalize them
# --> avoid too strong fluctuations in feature ranges
#********************************************
print("Pre-process data...")

# --> Change the elements here, in order to use different features
used_features = ['var1', 'var2', 'var3', 'var4']
X = dataFrame[used_features].values
Y = dataFrame['label'].values

# NOTE: the fitted scaler is not written to disk below; anyone re-using the
# saved models has to reproduce the same scaling on their inputs.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)  # Comment out this line to run without feature normalization

# X itself keeps the dataframe's row order; only the training copies are shuffled.
x_train, y_train = shuffle(X, Y, random_state=0)

print("...done!")
print(" ")
#********************************************

# Train each model and store the learning curve
# for the MLP
#********************************************
print("Train MLP...")

my_mlp.fit(x_train, y_train)

print("...done!")
print(" ")

# Getting the training/validation scores is quite easy:
plt.rcParams.update({'font.size': 25})

# loss_curve_ is the training loss per iteration, whereas validation_scores_
# is (per the sklearn documentation) the accuracy on the held-out validation
# set. They are different quantities, so the legend and the axis say so
# instead of labelling everything as "Loss".
training_curve = my_mlp.loss_curve_
plt.plot(training_curve, label='training loss')

if use_mlp_early_stopping:
    # validation_scores_ is only filled when early stopping is enabled
    validation_curve = my_mlp.validation_scores_
    plt.plot(validation_curve, label='validation accuracy')

plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss / Accuracy')
plt.ylim(0.0, 1.0)
plt.grid(True)
plt.savefig('learning_curve_mlp.png')
plt.close()

print("Train random forest classifier...")

my_rf.fit(x_train, y_train)

print("...done!")
print(" ")
#********************************************

# Store the trained models, as we want to use
# them later on
#********************************************
print("Save trained models...")

dump(my_mlp, 'mlp_classifier.joblib')
dump(my_rf, 'random_forest_classifier.joblib')

print("...done!")
print(" ")
#********************************************

# For convenience, we add the classifier
# predictions to our existing dataframe...
#********************************************
print("Add classifier predictions to the dataframe...")

# Predict on the unshuffled X so that row i of the predictions belongs to
# row i of the dataframe.
mlp_predictions = my_mlp.predict_proba(X)
rf_predictions = my_rf.predict_proba(X)

# Column k of predict_proba becomes '<MLP|RF>_Output<k+1>'; three output
# columns are written per classifier.
n_outputs = 3
for prefix, predictions in (('MLP', mlp_predictions), ('RF', rf_predictions)):
    for output_index in range(n_outputs):
        column_name = '{}_Output{}'.format(prefix, output_index + 1)
        dataFrame[column_name] = predictions[:, output_index]

print("...done!")
print(" ")

print("Just to be sure, check the dataframe:")
print(dataFrame.head(10))
print(" ")
#********************************************

# Check the outputs of each classifier
# --> try to understand what the algorithm is actually doing
#********************************************
print("Plot classifier outputs...")

fig, ax = plt.subplots(1, 2)
fig.set_size_inches(15, 8)

# (label value in the dataframe, histogram colour, legend entry); the n-th
# entry is drawn from the n-th output column of the respective classifier.
species_styles = (
    (0.0, 'g', 'Species 1'),
    (1.0, 'r', 'Species 2'),
    (2.0, 'b', 'Species 3'),
)

# Left panel: MLP outputs, right panel: random forest outputs. For every
# species only the output assigned to that species is shown.
for axis, prefix in zip(ax, ('MLP', 'RF')):
    for output_number, (label_value, colour, legend_entry) in enumerate(species_styles, start=1):
        column_name = '{}_Output{}'.format(prefix, output_number)
        axis.hist(
            dataFrame[dataFrame['label'] == label_value][column_name],
            bins=100,
            facecolor=colour,
            alpha=0.5,
            label=legend_entry,
            log=True,
        )

ax[0].set_xlabel('Neural Network Outputs')
ax[0].set_ylabel('Entries [a.u.]')
ax[0].legend()

ax[1].set_xlabel('Random Forest Outputs')
ax[1].legend()

fig.savefig('classifier_outputs.png')
plt.close(fig)

print("...done!")
print(" ")
#********************************************

# Look at correlation plots after the
# classification
#********************************************
print("Check correlation plots after classification...")

fig_data, ax_data = plt.subplots(1, 2)
fig_data.set_size_inches(25, 8)

threshold = 0.5  # --> Change this threshold and see what happens!

# Apply the cut on the first MLP output once and reuse the selection.
selected_events = dataFrame[dataFrame['MLP_Output1'] > threshold]

ax_data[0].hist2d(selected_events['var1'], selected_events['var2'], bins=100, norm=LogNorm())
ax_data[1].hist2d(selected_events['var3'], selected_events['var4'], bins=100, norm=LogNorm())

ax_data[0].set_xlabel('Variable 1')
ax_data[0].set_ylabel('Variable 2')

# hist2d(x, y): var3 is on the x axis and var4 on the y axis, so the labels
# have to follow that order (they were swapped before).
ax_data[1].set_xlabel('Variable 3')
ax_data[1].set_ylabel('Variable 4')

fig_data.savefig('correlation_plots_post_cl.png')
plt.close(fig_data)

print("...done!")
print(" ")
#********************************************