diff --git a/exploration/ml_pipeline.py b/exploration/ml_pipeline.py
index a794e66..b6b3bb6 100644
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@@ -20,6 +20,9 @@ import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import recall_score, f1_score
+
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
     sys.path.append(nb_dir)
@@ -34,8 +37,8 @@ index_columns = ["local_segment", "local_segment_label", "local_segment_start_da
 df.set_index(index_columns, inplace=True)
 
 # Create binary target
-# bins = [-1, 0, 4] # bins for stressfulness (0-4) target
-# df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
+bins = [-1, 0, 4] # bins for stressfulness (0-4) target
+df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']
 
 nan_cols = df.columns[df.isna().any()].tolist()
@@ -53,20 +56,38 @@ for split in cv.get_splits():
     pre.one_hot_encode_train_and_test_sets(categorical_columns)
     train_X, train_y, test_X, test_y = pre.get_train_test_sets()
-    # train_X = train_X[train_X.columns[:30]]
+
+    print(train_X.shape, test_X.shape)
+    # Predict before feature selection
+    rfc = RandomForestClassifier(n_estimators=10)
+    rfc.fit(train_X, train_y)
+    predictions = rfc.predict(test_X)
+
+    print("Recall:", recall_score(test_y, predictions))
+    print("F1:", f1_score(test_y, predictions))
 
     # Feature selection on train set
-    # Maybe GroupKfold should be implemented instead of stratifiedKFold? >>
-    # >> That way each pid would appear in either the test or the train set
     train_groups, test_groups = cv.get_groups_sets(split)
     fs = FeatureSelection(train_X, train_y, train_groups)
-    selected_features = fs.select_features(n_min=20, n_max=50, k=60,
-                                           ml_type="classification_multi",
-                                           metric="f1", n_tolerance=20)
+    selected_features = fs.select_features(n_min=20, n_max=29, k=40,
+                                           ml_type="classification_bin",
+                                           metric="recall", n_tolerance=20)
+
+    train_X = train_X[selected_features]
+    test_X = test_X[selected_features]
+
     print(selected_features)
     print(len(selected_features))
 
+    # Predict after feature selection
+    rfc = RandomForestClassifier(n_estimators=500)
+    rfc.fit(train_X, train_y)
+    predictions = rfc.predict(test_X)
+
+    print("Recall:", recall_score(test_y, predictions))
+    print("F1:", f1_score(test_y, predictions))
+
     break
 
 # %%
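
For reference, a minimal, self-contained sketch of the same before/after-feature-selection comparison on synthetic data (not the project's dataset). The target binarization mirrors the pd.cut call above; SelectKBest is a stand-in for the repo's FeatureSelection.select_features helper, and a plain train_test_split is a stand-in for the group-aware split from cv.get_splits(), so the names and numbers here are illustrative only.

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 60)),
                 columns=[f"f{i}" for i in range(60)])
stressfulness = pd.Series(rng.integers(0, 5, size=500))  # synthetic 0-4 self-report scale

# Binarize the 0-4 stressfulness score: (-1, 0] -> 0 (no stress), (0, 4] -> 1 (any stress)
bins = [-1, 0, 4]
y = pd.cut(stressfulness, bins=bins, labels=[0, 1], right=True).astype(int)

# Stand-in for the pipeline's group-aware CV split
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)

def evaluate(clf, Xtr, ytr, Xte, yte):
    # Fit, predict, and report the two metrics printed in the pipeline
    clf.fit(Xtr, ytr)
    pred = clf.predict(Xte)
    print("Recall:", recall_score(yte, pred), "F1:", f1_score(yte, pred))

# Baseline before feature selection (small forest, as in the diff)
evaluate(RandomForestClassifier(n_estimators=10, random_state=0),
         train_X, train_y, test_X, test_y)

# Stand-in feature selection: keep the 29 highest-scoring columns, fitted on the train set only
selector = SelectKBest(f_classif, k=29).fit(train_X, train_y)
selected = train_X.columns[selector.get_support()]

# Re-evaluate on the reduced feature set (larger forest, as in the diff)
evaluate(RandomForestClassifier(n_estimators=500, random_state=0),
         train_X[selected], train_y, test_X[selected], test_y)

In the actual pipeline the split and groups come from cv.get_splits() and cv.get_groups_sets(split), so feature selection only ever sees training rows; the sketch keeps that property by fitting the selector on train_X alone.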