import pandas as pd
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from joblib import dump
from matplotlib.colors import LogNorm

print("  ", "TRAIN CLASSIFIER (SUPERVISED)", "  ", sep="\n")

#Step one: read the workshop data set (a CSV file hosted on the
#FSU web server) into a pandas dataframe
#********************************************
print("Load dataframe...")

data_directory = 'http://hadron.physics.fsu.edu/~dlersch/GlueX_PANDA_EIC_ML_Workshop'
data_name = 'hands_on_data_033_033_033.csv'
#The full location is simply <directory>/<file name>
dataFrame = pd.read_csv('/'.join((data_directory, data_name)))

print("...done!")
print("  ")
#********************************************

#Then, we set up two classification algorithms.
#Please feel free to change the internal parameters
#of each algorithm (e.g. number of hidden layers for the neural network)
#********************************************
#1.) Setting up the neural network:
print(" ")
print("Setting up a multilayer perceptron...")

use_mlp_early_stopping = True  #Change this flag to train the MLP with / without early stopping
my_mlp = MLPClassifier(
         #One entry per hidden layer, e.g. (5,) = one layer with 5 neurons, (5,3) = two layers.
         #Note the trailing comma: (5) would be the plain integer 5, not a tuple.
         hidden_layer_sizes=(5,),
         activation='tanh',
         solver='sgd',
         shuffle=True,
         validation_fraction=0.25,#--> Use validation data to avoid overfitting
         early_stopping=use_mlp_early_stopping, #--> Set the flag above to False to disable early stopping and see the difference (no validation curve available then)
         max_iter = 10,#--> Change this number to increase the number of training iterations
         learning_rate_init=0.01,
         warm_start=True,
         tol=1e-6
)
print("...done!")
print(" ")

#2.) Setting up the random forest:
print(" ")
print("Setting up a random forest...")
my_rf = RandomForestClassifier(
         n_estimators=10, #--> Number of trees in your forest
         warm_start=True,
         max_depth=5, #--> Maximum depth of each tree
         bootstrap=True, #--> Sample subsets from the training data to train each tree with an individual set
         random_state=0 #--> Fixed seed, so that the forest is reproducible
)
print("...done!")
print(" ")
#********************************************

#Pre-process the data, i.e. pick the input features and normalize them
#--> avoids too strong fluctuations between the feature ranges
#********************************************
print("Pre-process data...")
used_features = ['var1','var2','var3','var4'] #--> Edit this list in order to train on different features
Y = dataFrame['label'].values
X = dataFrame[used_features].values

#Fit the scaler on the selected features first, then apply it
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X) #Comment out this line to run without feature normalization

#Shuffle features and labels together (fixed seed --> same order on every run)
x_train, y_train = shuffle(X, Y, random_state=0)

print("...done!")
print(" ")
#********************************************

#Train each model and store the learning curve
#for the MLP
#********************************************
print("Train MLP...")

my_mlp.fit(x_train,y_train)

print("...done!")
print(" ")

#Getting the training/validation curves is quite easy.
#Careful: the two curves are different quantities!
#  loss_curve_        --> training loss per iteration
#  validation_scores_ --> accuracy score on the held-out validation data
#                         (only available with early stopping enabled)
#Both are therefore labelled explicitly in the legend and on the y-axis.
plt.rcParams.update({'font.size': 25})

training_curve = my_mlp.loss_curve_
plt.plot(training_curve,label='training data (loss)')

if use_mlp_early_stopping:
    validation_curve = my_mlp.validation_scores_
    plt.plot(validation_curve,label='validation data (accuracy)')

plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss / Accuracy')
plt.ylim(0.0,1.0) #Values outside [0,1] are cut off in the plot
plt.grid(True)
plt.savefig('learning_curve_mlp.png')
plt.close()

print("Train random forest classifier...")

my_rf.fit(x_train,y_train)

print("...done!")
print(" ")
#********************************************

#Write both trained classifiers to disk (joblib format),
#so that they can be re-used later on
#NOTE(review): only the two classifiers are written here; the MinMaxScaler
#used for the feature normalization is not stored in this section -
#confirm that later scripts re-create the same normalization.
#********************************************
print("Save trained models...")

for trained_model, output_file in (
    (my_mlp, 'mlp_classifier.joblib'),
    (my_rf, 'random_forest_classifier.joblib'),
):
    dump(trained_model, output_file)

print("...done!")
print(" ")
#********************************************

#For convenience, the class probabilities predicted by each
#classifier are attached to the existing dataframe as new columns
#********************************************
print("Add classifier predictions to the dataframe...")

mlp_predictions = my_mlp.predict_proba(X)
rf_predictions = my_rf.predict_proba(X)

#Column k of each prediction array is stored as <prefix>(k+1),
#i.e. MLP_Output1..3 followed by RF_Output1..3
for column_prefix, class_probabilities in (
    ('MLP_Output', mlp_predictions),
    ('RF_Output', rf_predictions),
):
    for class_index in range(3):
        dataFrame[column_prefix + str(class_index + 1)] = class_probabilities[:, class_index]

print("...done!")
print(" ")

print("Just to be sure, check the dataframe:")
print(dataFrame.head(10))
print(" ")
#********************************************

#Histogram the outputs of each classifier
#-->try to understand what the algorithm is actually doing
#********************************************
print("Plot classifier outputs...")

fig,ax = plt.subplots(1,2)
fig.set_size_inches(15, 8)

#(label value, species / output number, histogram colour)
species_styles = ((0.0,'1','g'),(1.0,'2','r'),(2.0,'3','b'))

#Left panel: neural network, right panel: random forest.
#For every species, plot the output column that belongs to it,
#using only the events that truly carry this label.
for panel, column_prefix in zip(ax, ('MLP_Output','RF_Output')):
    for label_value, species_number, colour in species_styles:
        species_events = dataFrame[dataFrame['label']==label_value]
        panel.hist(
            species_events[column_prefix + species_number],
            bins=100,
            facecolor=colour,
            alpha=0.5,
            label='Species ' + species_number,
            log=True,
        )

ax[0].set_xlabel('Neural Network Outputs')
ax[0].set_ylabel('Entries [a.u.]')
ax[0].legend()

ax[1].set_xlabel('Random Forest Outputs')
ax[1].legend()

fig.savefig('classifier_outputs.png')
plt.close(fig)

print("...done!")
print("  ")
#********************************************

#Look at correlation plots after the
#classification
#********************************************
print("Check correlation plots after classification...")

fig_data,ax_data = plt.subplots(1,2)
fig_data.set_size_inches(25, 8)

threshold = 0.5 #--> Change this threshold and see what happens!

#Keep only events whose first MLP output exceeds the threshold.
#The selection is evaluated once and shared by both panels.
selected_events = dataFrame[dataFrame['MLP_Output1'] > threshold]

#hist2d takes (x, y): left panel var1 vs. var2, right panel var3 vs. var4
ax_data[0].hist2d(selected_events['var1'],selected_events['var2'],bins=100,norm=LogNorm())
ax_data[1].hist2d(selected_events['var3'],selected_events['var4'],bins=100,norm=LogNorm())

ax_data[0].set_xlabel('Variable 1')
ax_data[0].set_ylabel('Variable 2')

#The axis labels have to follow the plotted data: var3 on x, var4 on y
ax_data[1].set_xlabel('Variable 3')
ax_data[1].set_ylabel('Variable 4')

fig_data.savefig('correlation_plots_post_cl.png')
plt.close(fig_data)

print("...done!")
print("  ")
#********************************************