V3 [Frontiers revision]

2022-11-08 22:41:36 +05:30
commit 20705b233f
@@ -273,7 +273,6 @@ def hjorthParameters(xV):
    # in the matlab code for hjorth complexity subtraction by mob not division was used 
    return mobility, complexity

-##########
 # false nearest neighbor descriptor
 def falseNearestNeighbor(eegData, fast=True):
    # Average Mutual Information
@@ -293,18 +292,18 @@ def falseNearestNeighbor(eegData, fast=True):
            else:
                cur_eegData = eegData[chan, :, epoch]
                lagidx = 0  # we are looking for the index of the lag that makes the signal maximally uncorrelated to the original
-                # minNMI = 1  # normed_mutual_info is from 1 (perfectly correlated) to 0 (not at all correlated) 
-                # for lag in range(1, max_delay):
-                #     x = cur_eegData[:-lag]
-                #     xlag = cur_eegData[lag:]
-                #     # convert float data into histogram bins
-                #     nbins = int(np.floor(1 + np.log2(len(x)) + 0.5))
-                #     x_discrete = np.histogram(x, bins=nbins)[0]
-                #     xlag_discrete = np.histogram(xlag, bins=nbins)[0]
-                #     cNMI = normed_mutual_info(x_discrete, xlag_discrete)
-                #     if cNMI < minNMI:
-                #         minNMI = cNMI
-                #         lagidx = lag
+                # # minNMI = 1  # normed_mutual_info is from 1 (perfectly correlated) to 0 (not at all correlated) 
+                # # for lag in range(1, max_delay):
+                # #     x = cur_eegData[:-lag]
+                # #     xlag = cur_eegData[lag:]
+                # #     # convert float data into histogram bins
+                # #     nbins = int(np.floor(1 + np.log2(len(x)) + 0.5))
+                # #     x_discrete = np.histogram(x, bins=nbins)[0]
+                # #     xlag_discrete = np.histogram(xlag, bins=nbins)[0]
+                # #     cNMI = normed_mutual_info(x_discrete, xlag_discrete)
+                # #     if cNMI < minNMI:
+                # #         minNMI = cNMI
+                # #         lagidx = lag
                # nearest neighbors part
                knn = int(max(2, 6*lagidx))  # heuristic (number of nearest neighbors to look up)
                m = 1 # lagidx + 1
@@ -40,12 +40,18 @@ import copy
 from sklearn import feature_selection
 import argparse

-import cuml
-from cuml.svm import SVR
-from cuml.ensemble import RandomForestRegressor
-from cuml.svm import SVC
-from cuml.ensemble import RandomForestClassifier
-from cuml.metrics import  accuracy_score
+from sklearn.svm import SVR 
+from sklearn.svm import SVC
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+
+# import cuml
+# from cuml.svm import SVR
+# from cuml.ensemble import RandomForestRegressor
+# from cuml.svm import SVC
+# from cuml.ensemble import RandomForestClassifier
+# from cuml.metrics import  accuracy_score


 # In[ ]:
@@ -1,735 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# In[ ]:
-
-
-from ImportUtils import *
-from sklearn.model_selection import ParameterGrid
-
-from sklearn.model_selection import train_test_split
-
-from sklearn.preprocessing import StandardScaler
-from sklearn.metrics import accuracy_score
-
-from sklearn.feature_selection import chi2
-from sklearn.feature_selection import SelectKBest, f_classif
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler
-from sklearn.metrics import accuracy_score
-
-from sklearn.ensemble import RandomForestRegressor as sklearnrfi
-
-import os
-import glob
-from scipy import io,signal
-import numpy as np
-import pandas as pd
-from sklearn import preprocessing
-import pickle
-from sklearn.metrics import mean_squared_error
-from sklearn.impute import SimpleImputer
-
-
-import matplotlib.pyplot as plt
-# %matplotlib inline
-import seaborn as sns
-import copy
-
-def topElectrodeRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False):
-    '''
-    Ranks electrodes according to the RMSE computed by the regressor passed in clf
-    Plots the top-N electrodes v/s RMSE graph
-    
-    '''
-    # parameters :-
-                # dataset - name of the dataset
-                # window - length of the sliding window in seconds
-                # stride - length of the stride of the sliding window in seconds
-                # sfreq - sampling frequency of the EEG data
-                # clf - regressor instance to be used (its fit/predict methods are called)
-                # label - valence/arousal/dominance/liking label (shape depends upon the dataset) in an enumerated form (0- valence ; 1-arousal ; 2- like; 3-dominance)
-                # scale - scaling of the EEG data if required
-                
-    # returns :-
-                # void
-        
-    pwd = os.getcwd()
-
-    #load extracted features
-    #####################################################################################################################################################
-    featurepath = os.getcwd() + '/' + dataset + '/data_extracted/featuresDict/'
-    ans = np.load((featurepath + "shannonEntropy_{}_{}.npz").format(window,stride), allow_pickle=True)['features']
-    Y_epoch = np.load((featurepath + "shannonEntropy_{}_{}.npz").format(window,stride), allow_pickle=True)['Y']
-    
-    rmseList = []
-    electrodeList = ['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4']
-    fs = sfreq
-    pwd = os.getcwd()
-    featuresDict = loadFeaturesDict(dataset)
-    asm_features = ['dasm_delta', 'dasm_theta', 'dasm_alpha', 'dasm_beta', 'dasm_gamma', 'rasm_delta', 'rasm_theta', 'rasm_alpha', 'rasm_beta', 'rasm_gamma']
-    for asm in asm_features:
-        featuresDict.pop(asm)
-
-    common = []
-    with open('intersection.pkl', 'rb') as f:
-        common = pickle.load(f)
-
-    for k in list(featuresDict.keys()):
-        if k not in common:
-            # drop features that are not in the common set
-            featuresDict.pop(k)
-
-    selectFeatures = list(featuresDict.keys())
-    y = Y_epoch[:,label] #valence
-    #####################################################################################################################################################
-    
-    for electrode in range(14):
-        # Load FeaturesDict from memory
-        
-
-        print("Number of segments are: {}".format(ans.shape[1]))
-        
-        featureMatrix = np.empty((len(selectFeatures),ans.shape[1])) #[14*32 + 1,80640]
-        i=0
-        for key,value in featuresDict.items():
-            featureMatrix[i,:] = value[electrode,:]
-            i = i+1
-
-        print(featureMatrix.T.shape)
-        featureMatrix = featureMatrix.astype(np.float32)
-
-        #Impute NaN values with zero
-        if np.isnan(featureMatrix).any():
-            featureMatrix = np.nan_to_num(featureMatrix,nan=0)
-
-        #Name Feature vector columns
-        feature_channel_index = []
-        for feature in selectFeatures:
-            feature_channel_index.append(feature + str(electrode))
-
-        print("Number of Feature-Columns: {}\n".format(len(feature_channel_index))) #debug
-        
-        #Preparing dataset from feature matrix
-        X = pd.DataFrame(featureMatrix.T)
-        X.columns = feature_channel_index
-        X = X.replace([np.inf, -np.inf], np.nan)
-        X = X.fillna(0)
-        
-
-        print("Features Ready for undergoing selection tests done ...\n")
-
-        # Perform train_test_split to get training and test data 
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-        # Normalise-scale data 
-        # Feature Scaling
-        if(scale == True):
-            sc = StandardScaler()
-            X_train = sc.fit_transform(X_train)
-            X_test = sc.transform(X_test)
-
-        # Apply classifier
-        clf.fit(X_train, y_train)
-        y_predict = clf.predict(X_test)
-        rmse = mean_squared_error(y_test, y_predict,squared=False)
-        print("window: {}, stide: {}, rmse: {}".format(window,stride,rmse))
-        rmseList.append(rmse)
-        
-
-    #rank electrodes based on RMSE computed by the classifier
-    electrode_df = pd.DataFrame(electrodeList)
-    rmse_df = pd.DataFrame(rmseList)
-    #concat two dataframes for better visualization 
-    electrodeRanking = pd.concat([electrode_df, rmse_df],axis=1)
-    electrodeRanking.columns = ['Electrode','RMSE']  #naming the dataframe columns
-    features_result = electrodeRanking.sort_values('RMSE')
-    print(features_result)
-    # return features_result
-    
-    ##################################################################################
-    N =  features_result.shape[0]
-    topRmseList = []
-    topNList = ["{}".format(x) for x in range(1,N+1)]
-
-    
-    for n in range(1,N+1):
-        
-
-        topnelectrodes = features_result.head(n)
-        electrode_index = topnelectrodes.index
-        electrode_index = list(electrode_index)[:n]
-
-        # X-Values
-        featureMatrix = np.empty((len(selectFeatures)*len(electrode_index),ans.shape[1]))
-
-        i = 0
-        for index in electrode_index:
-            for key,value in featuresDict.items():
-                featureMatrix[i,:] = value[index,:]
-                i = i+1
-        
-        featureMatrix = featureMatrix.astype(np.float32)
-        print(featureMatrix.T.shape)
-        
-        # Removing NaN Values
-        if np.isnan(featureMatrix).any():
-            featureMatrix = np.nan_to_num(featureMatrix,nan=0)
-
-        # Name Feature vector columns
-        feature_channel_index = []
-        for index in electrode_index:
-            for feature in selectFeatures:
-                feature_channel_index.append(feature + str(index))
-
-        print("Number of Feature-Columns: {}\n".format(len(feature_channel_index)))
-
-        X = pd.DataFrame(featureMatrix.T)
-        X.columns = feature_channel_index
-        X = X.replace([np.inf, -np.inf], np.nan)
-        X = X.fillna(0)
-        
-
-        print("Features Ready for undergoing selection tests done ...\n")
-
-
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-        # Normalise-scale data 
-        # Feature Scaling
-        if(scale == True):
-            sc = StandardScaler()
-            X_train = sc.fit_transform(X_train)
-            X_test = sc.transform(X_test)
-
-        # Apply classifier
-        
-        search_method = "tpot"
-        best_clf = None
-        if(search_method == "bayes_sk_opt"):
-
-            # BayesCV scikit opt
-            search_space = {"bootstrap": Categorical([True, False]), # values for boostrap can be either True or False
-            "max_depth": Integer(6, 20), # values of max_depth are integers from 6 to 20
-            "max_features": Categorical(['auto', 'sqrt','log2']), 
-            "min_samples_leaf": Integer(2, 10),
-            "min_samples_split": Integer(2, 10),
-            "n_estimators": Integer(100, 500)
-            }
-
-            forest_bayes_search = BayesSearchCV(clf, search_space, n_iter=32, cv=5)
-            print(forest_bayes_search)
-            print(forest_bayes_search.fit(X_train, y_train))
-            print("Best Parameters are: ", forest_bayes_search.best_params_)
-            best_clf = forest_bayes_search.best_estimator_
-
-        elif(search_method =="random_grid_search"):
-            print("Random Search followed by GridSearch initiated!\n");
-            #RandomSearchCV followed by GridSearchCV
-            random_grid = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
-                'max_features': ['auto', 'sqrt','log2'],
-                'max_depth': [int(x) for x in np.linspace(10, 1000,10)],
-                'min_samples_split': [2, 5, 10,14],
-                'min_samples_leaf': [1, 2, 4,6,8],
-                }
-            rf_randomcv=RandomizedSearchCV(estimator=clf,param_distributions=random_grid,n_iter=100,cv=5,verbose=2,random_state=100)        
-            print(rf_randomcv.fit(X_train, y_train))
-            print("Best Parameters for RandomSearchCV are: ", rf_randomcv.best_params_)
-            print("RMSE with RandomSearchCV is :",mean_squared_error(y_test, rf_randomcv.best_estimator_.predict(X_test),squared=False));
-            
-            param_grid = {
-                'max_depth': [rf_randomcv.best_params_['max_depth']],
-                'max_features': [rf_randomcv.best_params_['max_features']],
-                'min_samples_leaf': [rf_randomcv.best_params_['min_samples_leaf'], 
-                                    rf_randomcv.best_params_['min_samples_leaf']+2, 
-                                    rf_randomcv.best_params_['min_samples_leaf'] + 4],
-                'min_samples_split': [rf_randomcv.best_params_['min_samples_split'] - 2,
-                                    rf_randomcv.best_params_['min_samples_split'] - 1,
-                                    rf_randomcv.best_params_['min_samples_split'], 
-                                    rf_randomcv.best_params_['min_samples_split'] +1,
-                                    rf_randomcv.best_params_['min_samples_split'] + 2],
-                'n_estimators': [rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators'] - 100, 
-                                rf_randomcv.best_params_['n_estimators'], 
-                                rf_randomcv.best_params_['n_estimators'] + 100, rf_randomcv.best_params_['n_estimators'] + 200]
-            }
-
-            grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10, verbose=5)
-            grid_search.fit(X_train,y_train)
-            best_clf = rf_randomcv.best_estimator_
-        elif search_method =="manual_search":
-            min_rmse = 1000
-            best_clf = clf
-            min_params = None
-            # 2*3*3*3*3
-            param_grid = {'n_estimators': [50, 100],
-            'max_features': ['auto'],
-            'max_depth': [2, 10, 100],
-            'min_samples_split': [2, 5, 10],
-            'min_samples_leaf': [1, 2, 8],
-            }
-
-            param_grid = ParameterGrid(param_grid)
-            for params in param_grid:
-                print("Current Parameters : ", params)
-                temp_clf = RandomForestRegressor( max_features = params['max_features'], min_samples_leaf = params['min_samples_leaf'], min_samples_split = params['min_samples_split'], n_estimators = params['n_estimators'],max_depth = params['max_depth']);
-                temp_clf.fit(X_train,y_train)
-                y_predict = temp_clf.predict(X_test)
-                rmse = mean_squared_error(y_test, y_predict,squared=False)
-                print("Current RMSE with above params : ", rmse)
-                if(min_rmse > rmse):
-                    min_rmse = rmse;
-                    best_clf = temp_clf;
-                    min_params = params;
-
-            print("Best Params for parameter search are : \n", min_params)
-            print("window: {}, stide: {}, rmse: {}".format(window,stride,min_rmse))
-            topRmseList.append(min_rmse)
-        elif search_method == "tpot":
-            from tpot import TPOTRegressor;
-            # TPOT setup
-            GENERATIONS = 5
-            POP_SIZE = 100
-            CV = 5
-            SEED = 42
-
-            tpot = TPOTRegressor(
-            generations=GENERATIONS,
-            population_size=POP_SIZE,
-            random_state=SEED,
-            config_dict="TPOT cuML",
-            n_jobs=1, # cuML requires n_jobs=1
-            cv=CV,
-            verbosity=2,
-            )
-
-            tpot.fit(X_train, y_train)
-
-            y_predict = tpot.predict(X_test)
-            rmse = mean_squared_error(y_test, y_predict,squared=False)
-            print("window: {}, stide: {}, rmse: {}".format(window,stride,rmse))
-            topRmseList.append(rmse)
-
-
-        else:
-            best_clf = clf
-            best_clf.fit(X_train,y_train)
-
-        
-        if search_method != "manual_search" and search_method != "tpot":
-            y_predict = best_clf.predict(X_test)
-            rmse = mean_squared_error(y_test, y_predict,squared=False)
-            print("window: {}, stide: {}, rmse: {}".format(window,stride,rmse))
-            topRmseList.append(rmse)
-
-
-    topNElectrode_df = pd.DataFrame(topNList)
-    topNRmse_df = pd.DataFrame(topRmseList)
-    #concat two dataframes for better visualization 
-    topNElectrodeRanking = pd.concat([topNElectrode_df, topNRmse_df],axis=1)
-    topNElectrodeRanking.columns = ['Electrode','RMSE']  #naming the dataframe columns
-    print(topNElectrodeRanking)  
-    
-    # Plotting
-    fig = plt.gcf()
-    fig.set_size_inches(20, 10)
-    plt.rcParams.update({'font.size': 30})
-    plt.xlabel('Top N Electrodes')
-    plt.ylabel('RMSE')
-    plt.plot(topNElectrodeRanking.loc[:,"Electrode"], topNElectrodeRanking.loc[:,"RMSE"])
-    plt.tight_layout()
-
-
-# In[ ]:
-
-
-def topFeaturesRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False):
-    '''
-    Ranks features according to the RMSE computed by the regressor passed in clf
-    Plots the top-N features v/s RMSE graph
-    
-    '''
-    # parameters :-
-                # dataset - name of the dataset
-                # window - length of the sliding window in seconds
-                # stride - length of the stride of the sliding window in seconds
-                # sfreq - sampling frequency of the EEG data
-                # clf - regressor instance to be used (its fit/predict methods are called)
-                # label - valence/arousal/dominance/liking label (shape depends upon the dataset)
-                # scale - scaling of the EEG data if required
-                
-    # returns :-
-                # void
-    fs = sfreq
-    pwd = os.getcwd()
-    featurepath = os.getcwd() + '/' + dataset + '/data_extracted/featuresDict/'
-    ans = np.load((featurepath + "shannonEntropy_{}_{}.npz").format(window,stride), allow_pickle=True)['features']
-    Y_epoch = np.load((featurepath + "shannonEntropy_{}_{}.npz").format(window,stride), allow_pickle=True)['Y']
-    print("Number of segments are: {}".format(ans.shape[1]))
-    
-    featuresDict = None
-    featuresDict = loadFeaturesDict(dataset)
-
-    common = []
-    with open('intersection.pkl', 'rb') as f:
-        common = pickle.load(f)
-
-    for k in list(featuresDict.keys()):
-        if k not in common:
-            # drop features that are not in the common set
-            featuresDict.pop(k)
-
-    featuresList = list(featuresDict.keys())
-    
-    y = Y_epoch[:,label] #valence
-
-    
-    rmseList = []
-
-    ####################################################################
-    #modify featuresList
-    featureMatrix = np.empty((0,ans.shape[1])) #[14*32 + 1,80640]
-    for key,value in featuresDict.items():
-        featureMatrix = np.append(featureMatrix,value,axis=0)
-
-
-    if np.isnan(featureMatrix).any():
-        featureMatrix = np.nan_to_num(featureMatrix,nan=0)
-
-    featureMatrix = featureMatrix.astype('float64')
-
-
-    feature_channel_index = []
-    for feature in featuresList:
-        for i in range(featuresDict[feature].shape[0]):
-            if(i>=10):
-                feature_channel_index.append(feature + str(i))
-            else:
-                feature_channel_index.append(feature + '0' + str(i))
-
-    print(len(list(featuresDict.keys())))
-    print("Number of Feature-Columns: {}\n".format(len(feature_channel_index)))
-
-    X = pd.DataFrame(featureMatrix.T)
-    X = X.replace([np.inf, -np.inf], np.nan)
-    X = X.fillna(0)
-    X.columns = feature_channel_index
-
-    #Remove Variance = 0 features     
-    constant_filter = VarianceThreshold(threshold=0)
-    constant_filter.fit(X)
-    constant_columns = [column for column in X.columns
-                    if column not in
-    X.columns[constant_filter.get_support()]]
-    X = constant_filter.transform(X)
-    
-    for column in constant_columns:
-        feature_channel_index.remove(column)
-
-    print(len(feature_channel_index),feature_channel_index )
-
-    X = pd.DataFrame(X)
-    X.columns = feature_channel_index
-
-
-    filtered_featuresList = []
-    print(type(X))
-    for col in X.columns:
-        feature = col[:-2]
-        electrode = int(col[-2:])
-        if(feature not in filtered_featuresList):
-            filtered_featuresList.append(feature)
-        
-    featuresList = filtered_featuresList
-
-    for feature in featuresList:
-        # Load FeaturesDict from memory
-        
-        
-        
-        featureMatrix = featuresDict[feature]
-        featureMatrix = featureMatrix.astype(np.float32)
-
-        if np.isnan(featureMatrix).any():
-            featureMatrix = np.nan_to_num(featureMatrix,nan=0)
-
-        
-
-        feature_channel_index = []
-        
-        for i in range(featuresDict[feature].shape[0]):
-            feature_channel_index.append(feature + str(i))
-
-        print("Number of Feature-Columns: {}\n".format(len(feature_channel_index)))
-
-        X = pd.DataFrame(featureMatrix.T)
-        X = X.replace([np.inf, -np.inf], np.nan)
-        X = X.fillna(0)
-        X.columns = feature_channel_index
-        
-
-        print("Features Ready for undergoing selection tests done ...\n")
-
-
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-        # Normalise-scale data 
-        # Feature Scaling
-        if(scale == True):
-            sc = StandardScaler()
-            X_train = sc.fit_transform(X_train)
-            X_test = sc.transform(X_test)
-
-        # Apply classifier
-        clf.fit(X_train, y_train)
-        y_predict = clf.predict(X_test)
-        rmse = mean_squared_error(y_test, y_predict,squared=False)
-        print("window: {}, stide: {}, rmse: {}".format(window,stride,rmse))
-        rmseList.append(rmse)
-        
-
-    
-    features_df = pd.DataFrame(featuresList)
-    rmse_df = pd.DataFrame(rmseList)
-    #concat two dataframes for better visualization 
-    featureRanking = pd.concat([features_df, rmse_df],axis=1)
-    featureRanking.columns = ['Feature','RMSE']  #naming the dataframe columns
-    features_result = featureRanking.sort_values('RMSE')
-    features_result.to_csv(pwd + "/" + dataset + "/arousal_plots/" + "CommonFeaturesRegressionRanking" + str(window) + str(stride) + ".csv")
-    print(features_result)
-    
-        ###########################################
-    N =  features_result.shape[0]
-    topNRmseList = []
-    topNList = ["{}".format(x) for x in range(1,N+1)]
-
-
-    
-    for n in range(1,N+1):
-        
-
-        topnfeatures = copy.deepcopy(features_result.head(n))
-        topnfeatures = topnfeatures['Feature'].tolist() #list of feature-names
-        
-        # X-Values################################################
-
-        featureMatrix = np.empty((0,ans.shape[1]))
-    
-        for feature in topnfeatures:
-            featureMatrix = np.append(featureMatrix, featuresDict[feature], axis=0)
-        
-        featureMatrix = featureMatrix.astype(np.float32)
-        print(featureMatrix.T.shape)
-
-        feature_channel_index = []
-        for feature in topnfeatures:
-            i=0
-            for i in range(featuresDict[feature].shape[0]):
-                feature_channel_index.append(feature + str(i))
-
-        
-        # Removing NaN Values
-        if np.isnan(featureMatrix).any():
-            featureMatrix = np.nan_to_num(featureMatrix,nan=0)
-
-        
-        print("Number of Feature-Columns: {}\n".format(len(feature_channel_index)))
-
-        X = pd.DataFrame(featureMatrix.T)
-        X.columns = feature_channel_index
-        X = X.replace([np.inf, -np.inf], np.nan)
-        X = X.fillna(0)
-        
-
-        print("Features Ready for undergoing selection tests done ...\n")
-
-
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-        # Normalise-scale data 
-        # Feature Scaling
-        if(scale == True):
-            sc = StandardScaler()
-            X_train = sc.fit_transform(X_train)
-            X_test = sc.transform(X_test)
-
-        clf.fit(X_train, y_train)
-        y_predict = clf.predict(X_test)
-        rmse = mean_squared_error(y_test, y_predict,squared=False)
-        print("window: {}, stide: {}, rmse: {}".format(window,stride,rmse))
-        topNRmseList.append(rmse)
-
-
-
-    topNFeatures_df = pd.DataFrame(topNList)
-    topNRmse_df = pd.DataFrame(topNRmseList)
-
-    #concat two dataframes for better visualization 
-    topNFeaturesRanking = pd.concat([topNFeatures_df, topNRmse_df],axis=1)
-    topNFeaturesRanking.columns = ['Feature','RMSE']  #naming the dataframe columns
-    print(topNFeaturesRanking)
-    topNFeaturesRanking.to_csv(pwd + "/" + dataset + "/arousal_plots/" + "topCommonFeaturesRegressionRanking" + str(window) + str(stride) + ".csv")    
-    
-    # Plotting
-    fig = plt.gcf()
-    fig.set_size_inches(25, 10)
-    plt.rcParams.update({'font.size': 30})
-    plt.xlabel('Top N Features')
-    plt.ylabel('RMSE')
-    plt.plot(topNFeaturesRanking.loc[:,"Feature"], topNFeaturesRanking.loc[:,"RMSE"])
-    plt.tight_layout()
-
-
-# In[ ]:
-
-
-def topFeatureColumnsRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False):
-    
-    # parameters :-
-                # dataset - name of the dataset
-                # window - length of the sliding window in seconds
-                # stride - length of the stride of the sliding window in seconds
-                # sfreq - sampling frequency of the EEG data
-                # clf - regressor instance to be used (its fit/predict methods are called)
-                # label - valence/arousal/dominance/liking label (shape depends upon the dataset)
-                # scale - scaling of the EEG data if required
-                
-    # returns :-
-                # void
-    
-    fs = sfreq
-    pwd = os.getcwd()
-    featurepath = os.getcwd() + '/' + dataset + '/data_extracted/featuresDict/'
-
-    ans = np.load((featurepath + "shannonEntropy_{}_{}.npz").format(window,stride), allow_pickle=True)['features']
-    Y_epoch = np.load((featurepath + "shannonEntropy_{}_{}.npz").format(window,stride), allow_pickle=True)['Y']
-    electrodeList = ['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4']
-
-    
-    print("Number of segments are: {}".format(ans.shape[1]))
-    
-    #X##############################################################################################
-    
-    featuresDict = None
-    featuresDict = loadFeaturesDict(dataset)
-    
-    common = []
-    with open('intersection.pkl', 'rb') as f:
-        common = pickle.load(f)
-
-    for k in list(featuresDict.keys()):
-        if k not in common:
-            # drop features that are not in the common set
-            featuresDict.pop(k)
-
-    
-    featuresList = list(featuresDict.keys())
-    
-    # defining column names
-    feature_channel_index = []
-
-    for feature in featuresList:
-        for i in range(featuresDict[feature].shape[0]):
-            feature_channel_index.append(feature + str(i))
-    
-    #defining feature matrix
-    featureMatrix = np.empty((0,ans.shape[1])) #[14*32 + 1,80640]
-    for key,value in featuresDict.items():
-        featureMatrix = np.append(featureMatrix,value,axis=0)
-
-    
-    print("Shape of FeatureMatrix: {}\n".format(featureMatrix.T.shape))
-    
-    #data-imputation and nan-removal
-    featureMatrix = featureMatrix.astype(np.float32)
-    
-    if np.isnan(featureMatrix).any():
-        featureMatrix = np.nan_to_num(featureMatrix,nan=0)
-
-    X = pd.DataFrame(featureMatrix.T)
-    X = X.replace([np.inf, -np.inf], np.nan)
-    X = X.fillna(0)
-    X.columns = feature_channel_index
-    
-
-    #Y#####################################################################
-
-    y = Y_epoch[:,label] #valence
-
-    ########################################################################
-    rmseList = []
-
-    for col in feature_channel_index:
-        input_df = pd.DataFrame(X[col])
-
-        X_train, X_test, y_train, y_test = train_test_split(input_df, y, test_size=0.2, random_state=42)
-
-        # Normalise-scale data 
-        # Feature Scaling
-        if(scale == True):
-            sc = StandardScaler()
-            X_train = sc.fit_transform(X_train)
-            X_test = sc.transform(X_test)
-
-        # Apply classifier
-        clf.fit(X_train, y_train)
-        y_predict = clf.predict(X_test)
-        rmse = mean_squared_error(y_test, y_predict, squared=False)
-        rmseList.append(rmse)
-
-    
-
-    col_df = pd.DataFrame(feature_channel_index)
-    rmse_df = pd.DataFrame(rmseList)
-    #concat two dataframes for better visualization 
-    colRanking = pd.concat([col_df, rmse_df],axis=1)
-    colRanking.columns = ['Column','RMSE']  #naming the dataframe columns
-    features_result = colRanking.sort_values('RMSE')
-    print(features_result)
-
-
-    N = len(feature_channel_index)
-    topNRmseList = []
-    topNList = ["{}".format(x) for x in range(1,N+1)]
-
-    for n in range(1, N+1):
-        ranking_df = features_result.head(n)
-        topncols = ranking_df['Column'].tolist()
-        
-        X_train, X_test, y_train, y_test = train_test_split(X[topncols], y, test_size=0.2, random_state=42)
-
-        # Normalise-scale data 
-        # Feature Scaling
-        if(scale == True):
-            sc = StandardScaler()
-            X_train = sc.fit_transform(X_train)
-            X_test = sc.transform(X_test)
-
-        # Apply classifier
-        clf.fit(X_train, y_train)
-        y_predict = clf.predict(X_test)
-        rmse = mean_squared_error(y_test, y_predict, squared=False)
-        topNRmseList.append(rmse)
-
-
-    topcol_df = pd.DataFrame(topNList)
-    toprmse_df = pd.DataFrame(topNRmseList)
-    #concat two dataframes for better visualization 
-    topcolRanking = pd.concat([topcol_df, toprmse_df],axis=1)
-    topcolRanking.columns = ['Column','RMSE']  #naming the dataframe columns
-    topfeatures_result = topcolRanking
-    print(topfeatures_result)
-    topfeatures_result.to_csv(pwd + "/" + dataset + "/arousal_plots/" + "ColumnsRegressionRanking" + str(window) + str(stride) + ".csv")
-
-
-    # Plotting
-    fig = plt.gcf()
-    fig.set_size_inches(60, 9)
-    plt.xlabel('Top N Columns')
-    plt.ylabel('RMSE')
-    plt.title("Top N Columns v/s RMSE Plot for Window:{} Stride:{} epoched data by varying N".format(window,stride))
-    plt.plot(topfeatures_result.loc[:,"Column"], topfeatures_result.loc[:,"RMSE"])
-    plt.tight_layout()
-    plt.savefig(pwd + "/" + dataset + "/arousal_plots/" + "topFeatureColumnsRegressionRanking" + str(window) + str(stride) + ".svg", bbox_inches='tight')
-    plt.show()
-    plt.clf()
-
@@ -1,17 +1,19 @@
 #!/usr/bin/env python
 # coding: utf-8
+# DATE - 01/11/2022

-# In[ ]:
+# AUTHOR - ROHIT GARG


 from ImportUtils import *

 from sklearn.ensemble import RandomForestRegressor as sklearnrfi
 from sklearn.feature_selection import VarianceThreshold
+from sklearn.metrics import r2_score
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import explained_variance_score


-# In[ ]:
-

 def topElectrodeFSRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False, mutual_info = False, method='SelectKBest'):
    '''
@@ -129,7 +131,7 @@ def topElectrodeFSRegressionRanking(dataset, window, stride, sfreq, clf, label,
    featureScores.columns = ['Specs','Score']  #naming the dataframe columns
    features_result = featureScores.nlargest(X.shape[1],'Score')
    print(features_result)
-    features_result.to_csv(pwd + "/" + dataset + "/arousal_plots/" + "CommonElectrodeFSRegressionRanking"+ method + str(window) + str(stride) + ".csv")
+    features_result.to_csv(f"output/{dataset}_{label}_electrode_selection.csv")


    ###################################################################
@@ -150,10 +152,14 @@ def topElectrodeFSRegressionRanking(dataset, window, stride, sfreq, clf, label,
    
    N =  len(topelectrodes)
    topRmseList = []
+    topR2List = []
+    topMAEList = []
+    topEVList = []
    topNList = ["{}".format(x) for x in range(1,N+1)]

    
    for n in range(1,N+1):
+
        
        electrode_index = topelectrodes[:n]
        print(topelectrodes)
@@ -207,8 +213,17 @@ def topElectrodeFSRegressionRanking(dataset, window, stride, sfreq, clf, label,
        clf.fit(X_train, y_train)
        y_predict = clf.predict(X_test)
        rmse = mean_squared_error(y_test, y_predict,squared=False)
+        score_r2 = r2_score(y_test, y_predict)
+        score_mae = mean_absolute_error(y_test, y_predict)
+        score_ev = explained_variance_score(y_test, y_predict)
        print("window: {}, stide: {}, rmse: {}".format(window,stride,rmse))
+        print(f"r2: {score_r2}")
+        print(f"mae: {score_mae}")
+        print(f"ev: {score_ev}")
        topRmseList.append(rmse)
+        topR2List.append(score_r2)
+        topMAEList.append(score_mae)
+        topEVList.append(score_ev)



@@ -217,11 +232,19 @@ def topElectrodeFSRegressionRanking(dataset, window, stride, sfreq, clf, label,
    # features_result = features_result.reset_index()
    topNElectrode_df = pd.DataFrame(topNList)
    topNRmse_df = pd.DataFrame(topRmseList)
+    topNR2_df = pd.DataFrame(topR2List)
+    topNMAE_df = pd.DataFrame(topMAEList)
+    topNEV_df = pd.DataFrame(topEVList)
+
    #concat two dataframes for better visualization 
-    topNElectrodeRanking = pd.concat([topNElectrode_df, topNRmse_df],axis=1)
-    topNElectrodeRanking.columns = ['Electrode','RMSE']  #naming the dataframe columns
-    print(topNElectrodeRanking)
-    topNElectrodeRanking.to_csv(pwd + "/" + dataset + "/arousal_plots/" + "topCommonElectrodeFSRegressionRanking"+ method + str(window) + str(stride) + ".csv")
+    topNElectrode = pd.concat([topNElectrode_df, topNRmse_df, topNR2_df, topNMAE_df, topNEV_df],axis=1)
+
+    topNElectrode.columns = ['Electrode','RMSE', 'R2', 'MAE', 'EV']  #naming the dataframe columns
+
+    print(topNElectrode)
+
+    topNElectrode.to_csv(f"output/{dataset}_{label}_electrode.csv")
+
    # return features_result
    
    
@@ -231,8 +254,10 @@ def topElectrodeFSRegressionRanking(dataset, window, stride, sfreq, clf, label,
    plt.rcParams.update({'font.size': 30})
    plt.xlabel('Top N Electrodes')
    plt.ylabel('RMSE')
-    plt.plot(topNElectrodeRanking.loc[:,"Electrode"], topNElectrodeRanking.loc[:,"RMSE"])
+    plt.plot(topNElectrode.loc[:,"Electrode"], topNElectrode.loc[:,"RMSE"])
    plt.tight_layout()
+    plt.savefig(f"output/{dataset}_{label}_electrode_RMSE.svg")
+    plt.clf()


 # In[ ]:
@@ -363,7 +388,7 @@ def topFeatureFSRegressionRanking(dataset, window, stride, sfreq, clf, label, sc
    featureScores.columns = ['Specs','Score']  #naming the dataframe columns
    features_result = featureScores.nlargest(X.shape[1],'Score')
    print(features_result)
-    features_result.to_csv(pwd + "/" + dataset + "/arousal_plots/" + "CommonFeatureFSRegressionRanking"+ method + str(window) + str(stride) + ".csv")
+    features_result.to_csv(f"output/{dataset}_{label}_feature_selection.csv")



@@ -386,8 +411,11 @@ def topFeatureFSRegressionRanking(dataset, window, stride, sfreq, clf, label, sc
    # TOP-N-FEATURE-RANKING
    print(topfeatures)
    print(topelectrodes)
-    N =  len(topfeatures)
+    N = len(topfeatures)
    topNRmseList = []
+    topR2List = []
+    topMAEList = []
+    topEVList = []
    topNList = ["{}".format(x) for x in range(1,N+1)]


@@ -441,8 +469,17 @@ def topFeatureFSRegressionRanking(dataset, window, stride, sfreq, clf, label, sc
        clf.fit(X_train, y_train)
        y_predict = clf.predict(X_test)
        rmse = mean_squared_error(y_test, y_predict,squared=False)
+        score_r2 = r2_score(y_test, y_predict)
+        score_mae = mean_absolute_error(y_test, y_predict)
+        score_ev = explained_variance_score(y_test, y_predict)
        print("window: {}, stide: {}, rmse: {}".format(window,stride,rmse))
+        print(f"r2: {score_r2}")
+        print(f"mae: {score_mae}")
+        print(f"ev: {score_ev}")
+        topEVList.append(score_ev)
        topNRmseList.append(rmse)
+        topR2List.append(score_r2)
+        topMAEList.append(score_mae)



@@ -450,198 +487,35 @@ def topFeatureFSRegressionRanking(dataset, window, stride, sfreq, clf, label, sc
     topNFeatures_df = pd.DataFrame(topNList)

     topNRmse_df = pd.DataFrame(topNRmseList)
+    topNR2_df = pd.DataFrame(topR2List)
+    topNMAE_df = pd.DataFrame(topMAEList)
+    topNEV_df = pd.DataFrame(topEVList)
+

     #concat two dataframes for better visualization 
-    topNFeaturesRanking = pd.concat([topNFeatures_df, topNRmse_df],axis=1)
-    topNFeaturesRanking.columns = ['Feature','RMSE']  #naming the dataframe columns
-    print(topNFeaturesRanking)
-    topNFeaturesRanking.to_csv(pwd + "/" + dataset + "/arousal_plots/" + "topCommonFeatureFSRegressionRanking"+ method + str(window) + str(stride) + ".csv")
-    
+    topNFeatures = pd.concat([topNFeatures_df, topNRmse_df, topNR2_df, topNMAE_df, topNEV_df],axis=1)
+
+    topNFeatures.columns = ['Feature', 'RMSE', 'R2', 'MAE', 'EV']  #naming the dataframe columns
+
+    print(topNFeatures)
+
+    topNFeatures.to_csv(f"output/{dataset}_{label}_features.csv")
+
     # Plotting
     fig = plt.gcf()
     fig.set_size_inches(25, 10)
     plt.rcParams.update({'font.size': 30})
     plt.xlabel('Top N Features')
     plt.ylabel('RMSE')
-    plt.plot(topNFeaturesRanking.loc[:,"Feature"], topNFeaturesRanking.loc[:,"RMSE"])
+    plt.plot(topNFeatures.loc[:,"Feature"], topNFeatures.loc[:,"RMSE"])
     plt.tight_layout()
+    # Save and reset the figure here (as topElectrodeFSRegressionRanking does); the caller no longer saves it.
+    plt.savefig(f"output/{dataset}_{label}_features_RMSE.svg")
+    plt.clf()


 # In[ ]:

-
-def topFSColumnsRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False, mutual_info = False, method='SelectKBest'):
-        # Method C
-        # parameters :-
-                # dataset - name of the dataset
-                # window - length of the sliding window in seconds
-                # stride - length of the stride of the sliding window in seconds
-                # sfreq - sampling frequency of the EEG data
-                # clf - name of the classifier to be used
-                # label - valence/arousal/dominance/liking label (shape depends upon the dataset)
-                # scale - sclaing of the EEG data if required
-                # mutual_info - Mutual ranking between features based on information theory
-                # method - 'RandomForest' 'RFE' 'SelectKBest'
-                
-    # returns :-
-                # void
-    fs = sfreq
-    pwd = os.getcwd()
-
-    featurepath = os.getcwd() + '/' + dataset + '/data_extracted/featuresDict/'
-    ans = np.load((featurepath + "shannonEntropy_{}_{}.npz").format(window,stride), allow_pickle=True)['features']
-    Y_epoch = np.load((featurepath + "shannonEntropy_{}_{}.npz").format(window,stride), allow_pickle=True)['Y']
-
-    print("Number of segments are: {}".format(ans.shape[1]))
-    
-    #X##############################################################################################
-    
-    featuresDict = None
-    featuresDict = loadFeaturesDict(dataset)
-
-    common = []
-    with open('intersection.pkl', 'rb') as f:
-        common = pickle.load(f)
-
-    for k in list(featuresDict.keys()):
-        if k not in common:
-            # pop out common feature
-            featuresDict.pop(k)
-
-    print("Number of Features:",len(list(featuresDict.keys())))
-    featuresList = list(featuresDict.keys())
-
-    feature_channel_index = []
-
-    feature_channel_index = []
-    for feature in featuresList:
-        for i in range(featuresDict[feature].shape[0]):
-            if(i>=10):
-                feature_channel_index.append(feature +'_'+  str(i))
-            else:
-                feature_channel_index.append(feature + '_0' + str(i))
-
-    print(len(list(featuresDict.keys())))
-    print("Number of Feature-Columns: {}\n".format(len(feature_channel_index)))
-
-    #defining feature matrix
-    featureMatrix = np.empty((0,ans.shape[1])) #[14*32 + 1,80640]
-    for key,value in featuresDict.items():
-        featureMatrix = np.append(featureMatrix,value,axis=0)
-        
-
-    
-    print("Shape of FeatureMatrix: {}\n".format(featureMatrix.T.shape))
-    
-    #data-imputation and nan-removal
-    featureMatrix = featureMatrix.astype(np.float32)
-    
-    if np.isnan(featureMatrix).any():
-        featureMatrix = np.nan_to_num(featureMatrix,nan=0)
-
-    X = pd.DataFrame(featureMatrix.T)
-    X = X.replace([np.inf, -np.inf], np.nan)
-    X = X.fillna(0)
-    X.columns = feature_channel_index
-    
-
-    #Y#####################################################################
-
-    y = Y_epoch[:,label] #valence
-    # y = pd.DataFrame(y)
-
-    ########################################################################
-    dfscores = None
-
-    if(method == 'RandomForest'):
-        '''Random Forest Feature Importances'''
-        estimator = sklearnrfi() #RandomForestRegressor()
-        fit = estimator.fit(X,y)
-        dfscores = pd.DataFrame(fit.feature_importances_)
-    elif(method == 'RFE'):
-        ''' RFE'''
-        selector = RFE(clf, n_features_to_select=X.shape[1], step=1)
-        selector = selector.fit(X, y)
-        dfscores = pd.DataFrame(selector.ranking_)
-
-    elif(method == 'SelectKBest'):
-        """SelecKBest"""
-        #apply SelectKBest class to extract top 10 best features
-        func = None
-        if mutual_info == False:
-            func = f_classif
-        else:
-            func = mutual_info_classif
-
-        bestfeatures = SelectKBest(score_func=func, k=X.shape[1])
-        fit = bestfeatures.fit(X,y)
-
-        dfscores = pd.DataFrame(fit.scores_)
-        
-
-
-
-    dfcolumns = pd.DataFrame(X.columns)
-
-    #concat two dataframes for better visualization 
-    featureScores = pd.concat([dfcolumns,dfscores],axis=1)
-    featureScores.columns = ['Column','Score']  #naming the dataframe columns
-    features_result = featureScores.nlargest(X.shape[1],'Score')
-    print(features_result)
-
-    N = len(feature_channel_index)
-    topNRmseList = []
-    topNList = ["{}".format(x) for x in range(1,N+1)]
-
-    for n in range(1, N+1):
-        ranking_df = features_result.head(n)
-        topncols = ranking_df['Column'].tolist()
-        
-        input_df = pd.DataFrame(X[topncols])
-
-        X_train, X_test, y_train, y_test = train_test_split(input_df, y, test_size=0.2, random_state=42)
-
-        # Normalise-scale data 
-        # Feature Scaling
-        if(scale == True):
-            sc = StandardScaler()
-            X_train = sc.fit_transform(X_train)
-            X_test = sc.transform(X_test)
-
-        # Apply classfier       
-        clf.fit(X_train, y_train)
-        y_predict = clf.predict(X_test)
-        rmse = mean_squared_error(y_test, y_predict, squared=False)
-        print(n,rmse)
-        topNRmseList.append(rmse)
-
-
-    topcol_df = pd.DataFrame(topNList)
-    toprmse_df = pd.DataFrame(topNRmseList)
-    #concat two dataframes for better visualization 
-    topcolRanking = pd.concat([topcol_df, toprmse_df],axis=1)
-    topcolRanking.columns = ['Column','RMSE']  #naming the dataframe columns
-    topfeatures_result = topcolRanking
-    print(topfeatures_result)
-    topfeatures_result.to_csv(pwd + "/" + dataset + "/arousal_plots/" + "topFSColumnsRegressionRanking"+method + str(window) + str(stride) + ".csv")
-
-    # Plotting
-    fig = plt.gcf()
-    fig.set_size_inches(60, 9)
-
-    plt.xlabel('Top N Columns')
-    plt.ylabel('RMSE')
-    plt.title("Top N Columns v/s RMSE Plot for Window:{} Stride:{} epoched data by varying N".format(window,stride))
-    plt.plot(topfeatures_result.loc[:,"Column"], topfeatures_result.loc[:,"RMSE"])
-    plt.tight_layout()
-    plt.savefig(pwd + "/" + dataset + "/arousal_plots/" + "topFSColumnsRegressionRanking"+method + str(window) + str(stride) + ".svg", bbox_inches='tight', dpi=500)
-    plt.show()
-    plt.clf()
-
-
-# In[ ]:
-
-
 if __name__ == '__main__':
    pass
    
@@ -5,16 +5,17 @@


 # Script to get the feature ranking and electrode ranking through 
-        # Method A :- Random Forest Regressor
-        # Method B :- F score based Ranking
-        # Method C :- Random Forest Importances approach 
+        
+# Method: F-score based ranking (SelectKBest)
+        
 # Main function

 from ImportUtils import *
 from TopNByFSMethods import *
 from TopNByClassifier import *
 from args_eeg import args as my_args
-
+# Uncomment the following import to enable feature extraction
+# from EpochedFeatures import *
 if __name__ == '__main__':

    # args object to fetch command line inputs
@@ -34,37 +35,12 @@ if __name__ == '__main__':
    fs_method = args.fs_method

    #feature extraction
-    getEpochedFeatures(dataset, window, stride, sfreq, label)
+    # Uncomment the following call to extract features before ranking
+    # getEpochedFeatures(dataset, window, stride, sfreq, label)
    if(top == "e"):
        clf = RandomForestRegressor()
-        topElectrodeRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False)
        topElectrodeFSRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False, mutual_info = False, method='SelectKBest')
-        topElectrodeFSRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False, mutual_info = False, method='RandomForest')
-        plt.legend(["Method A","Method B", "Method C"])
-
-        if(label == 1):
-            plt.savefig(pwd + "/" + dataset + "/arousal_plots/" + "CorrectedElectrodewiseRanking" + str(window) + str(stride) + ".svg", bbox_inches='tight')
-            plt.show()
-            plt.clf()
-        
-        else:
-            plt.savefig(pwd + "/" + dataset + "/plots/" + "CorrectedElectrodewiseRanking" + str(window) + str(stride) + ".svg", bbox_inches='tight')
-            plt.show()
-            plt.clf()    
        
    elif(top == "f"):
        clf = RandomForestRegressor()
-        topFeaturesRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False)
-        topFeatureFSRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False, mutual_info = False, method='SelectKBest')
-        topFeatureFSRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False, mutual_info = False, method='RandomForest')
-        if(label == 1):
-            plt.legend(["Method A","Method B", "Method C"])
-            plt.savefig(pwd + "/" + dataset + "/arousal_plots/" + "CorrectedFeaturewiseRanking" + str(window) + str(stride) + ".svg", bbox_inches='tight')
-            plt.show()
-            plt.clf()
-        else:
-            plt.legend(["Method A","Method B", "Method C"])
-            plt.savefig(pwd + "/" + dataset + "/plots/" + "CorrectedFeaturewiseRanking" + str(window) + str(stride) + ".svg", bbox_inches='tight')
-            plt.show()
-            plt.clf()
-
+        topFeatureFSRegressionRanking(dataset, window, stride, sfreq, clf, label, scale=False, pca=False, mutual_info = False, method='SelectKBest')