"""Inspect the workshop data set.

The script loads the hands-on CSV file, prints its first rows, prints the
absolute and relative abundance of the label values 0.0, 1.0 and 2.0, and
writes two images to the working directory:

* 'correlation_plots_pre_cl.png' - 2D histograms of feature pairs for the
  rows with label 0.0
* 'correlation_matrix.png' - Spearman correlation matrix of var1..var6
"""
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np

print(" ")
print("INSPECT WORKSHOP DATA")
print(" ")

#First, we load the dataframe from the FSU cluster
#********************************************
print("Load dataframe...")

data_directory = 'http://hadron.physics.fsu.edu/~dlersch/GlueX_PANDA_EIC_ML_Workshop'
data_name = 'hands_on_data_033_033_033.csv'
dataFrame = pd.read_csv(data_directory + '/' + data_name)

print("...done!")
print(" ")
#********************************************

#Now take a look at the first 10 entries of the dataframe,
#just to get a feeling for its structure
#********************************************
print("Check first 10 entries...")
print(" ")

print(dataFrame.head(10))

print(" ")
print("...done!")
print(" ")
#********************************************

#Check the absolute and relative abundance
#of each particle species
#********************************************
print("Number of events with species 1:")
n_species1 = dataFrame[dataFrame['label'] == 0.0]['label'].count()
print(n_species1)
print(" ")

print("Number of events with species 2:")
n_species2 = dataFrame[dataFrame['label'] == 1.0]['label'].count()
print(n_species2)
print(" ")

print("Number of events with species 3:")
n_species3 = dataFrame[dataFrame['label'] == 2.0]['label'].count()
print(n_species3)

#Only rows carrying one of the three labels enter the normalisation
n_all_events = n_species1 + n_species2 + n_species3
r_1 = n_species1 / float(n_all_events)
r_2 = n_species2 / float(n_all_events)
r_3 = n_species3 / float(n_all_events)

print(" ")
print("Relative abundances:")
print("Species1: " + str(r_1))
print("Species2: " + str(r_2))
print("Species3: " + str(r_3))
print(" ")
#********************************************

#Check the correlations between the
#features
#********************************************
print("Produce correlation plots...")

#Select the rows with label 0.0 once and reuse them for every panel
species1_data = dataFrame[dataFrame['label'] == 0.0]

fig_data, ax_data = plt.subplots(1, 3)
fig_data.set_size_inches(15, 8)

#One entry per panel: (x column, y column, histogram range, x label, y label).
#Columns and labels are kept side by side so that the axis titles always
#name the variables that are actually drawn.
panel_specs = [
    ('var1', 'var2', [[0, 15], [0, 15]], 'Variable 1', 'Variable 2'),
    ('var3', 'var4', [[-0.01, 0.03], [-0.01, 0.03]], 'Variable 3', 'Variable 4'),
    ('var2', 'var5', [[0, 15], [0, 15]], 'Variable 2', 'Variable 5'),
]

for panel, (x_name, y_name, hist_range, x_label, y_label) in zip(ax_data, panel_specs):
    panel.hist2d(species1_data[x_name], species1_data[y_name],
                 bins=100, norm=LogNorm(), range=hist_range)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)

fig_data.savefig('correlation_plots_pre_cl.png')
plt.close(fig_data)

print("...done!")
print(" ")
#********************************************

#Finally, determine the correlation matrix,
#using the Spearman correlation function
#********************************************
feature_names = ['var1', 'var2', 'var3', 'var4', 'var5', 'var6']
#--> There are different options on how to calculate the feature correlations
feature_correlations = dataFrame[feature_names].corr('spearman').values

textFormat = '.2f'
matrixTitle = 'Feature Correlations'

plt.rcParams['font.size'] = 20

fig, ax = plt.subplots()
fig.set_size_inches(15.0, 8.0)
#Adjust the margins on the figure that is actually saved. Calling
#plt.subplots_adjust() before the figure exists would act on an implicitly
#created, empty figure instead.
fig.subplots_adjust(bottom=0.25, top=0.9)

im = ax.imshow(feature_correlations, interpolation='nearest')

#Major ticks sit on the cell centres and carry the feature names
ax.set_xticks(np.arange(feature_correlations.shape[1]))
ax.set_yticks(np.arange(feature_correlations.shape[0]))
ax.set_xticklabels(feature_names, rotation='vertical', fontweight='normal')
ax.set_yticklabels(feature_names, rotation='horizontal', fontweight='normal')

#Minor ticks sit on the cell edges
ax.set_xticks(np.arange(feature_correlations.shape[1] + 1) - .5, minor=True)
ax.set_yticks(np.arange(feature_correlations.shape[0] + 1) - .5, minor=True)

ax.tick_params(axis='both', which='major', labelsize=30)
ax.set_title(matrixTitle, y=1.03, fontweight='normal')
ax.figure.colorbar(im, ax=ax)

#Cells above this value get black text, all others white text.
#NOTE(review): half the maximum is the midpoint of the colour scale only if
#the smallest correlation is 0 - confirm this is good enough when features
#are anti-correlated.
colorThresh = feature_correlations.max() / 2.
nDim = len(feature_names)

#Write the numerical value into every cell of the matrix
#++++++++++++++++++++++++++++++++
for i in range(nDim):
    #++++++++++++++++++++++++++++++++
    for j in range(nDim):
        cell_value = feature_correlations[i, j]
        ax.text(j, i, format(cell_value, textFormat),
                fontweight='normal',
                ha='center', va='center',
                color="black" if cell_value > colorThresh else "white")
    #++++++++++++++++++++++++++++++++
#++++++++++++++++++++++++++++++++

fig.savefig('correlation_matrix.png')
plt.close(fig)
#********************************************