# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, f1_score

# Put the parent of the current working directory on sys.path so that the
# `machine_learning` package imported below can be resolved.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from machine_learning.cross_validation import CrossValidation
from machine_learning.preprocessing import Preprocessing
from machine_learning.feature_selection import FeatureSelection

# %%
df = pd.read_csv(
    "../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv"
)
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
df.set_index(index_columns, inplace=True)

# Create binary target. The bins are right-closed, so values in (-1, 0] get
# label 0 and values in (0, 4] get label 1.
bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
df["target"], edges = pd.cut(
    df.target, bins=bins, labels=[0, 1], retbins=True, right=True
)

# Fill missing values with the column median rounded to 0 decimals.
# `numeric_only=True` restricts the median to numeric columns: pandas >= 2.0
# raises a TypeError for non-numeric columns otherwise, whereas older versions
# silently skipped them. Non-numeric columns are therefore left unfilled here.
nan_cols = df.columns[df.isna().any()].tolist()
df[nan_cols] = df[nan_cols].fillna(
    df[nan_cols].median(numeric_only=True).round(0)
)

# NOTE(review): "logo" presumably selects leave-one-group-out splitting --
# confirm against machine_learning.cross_validation.CrossValidation.
cv = CrossValidation(data=df, cv_method="logo")

categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
interval_feature_list, other_feature_list = [], []


# %%
def fit_and_report_random_forest(n_estimators, train_X, train_y, test_X, test_y):
    """Fit a random forest on the train set and print test recall and F1.

    Args:
        n_estimators: Number of trees passed to ``RandomForestClassifier``.
        train_X: Training features.
        train_y: Training labels.
        test_X: Test features.
        test_y: Test labels.

    Returns:
        A ``(model, predictions)`` tuple with the fitted classifier and its
        predictions for ``test_X``.
    """
    model = RandomForestClassifier(n_estimators=n_estimators)
    model.fit(train_X, train_y)
    predicted = model.predict(test_X)
    print("Recall:", recall_score(test_y, predicted))
    print("F1:", f1_score(test_y, predicted))
    return model, predicted


for split in cv.get_splits():
    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)

    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(categorical_columns)
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()

    print(train_X.shape, test_X.shape)

    # Predict before feature selection
    rfc, predictions = fit_and_report_random_forest(
        10, train_X, train_y, test_X, test_y
    )

    # Feature selection on train set
    train_groups, test_groups = cv.get_groups_sets(split)
    fs = FeatureSelection(train_X, train_y, train_groups)
    selected_features = fs.select_features(
        n_min=20,
        n_max=29,
        k=40,
        ml_type="classification_bin",
        metric="recall",
        n_tolerance=20,
    )

    # Restrict both sets to the features chosen on the train set only.
    train_X = train_X[selected_features]
    test_X = test_X[selected_features]

    print(selected_features)
    print(len(selected_features))

    # Predict after feature selection
    rfc, predictions = fit_and_report_random_forest(
        500, train_X, train_y, test_X, test_y
    )

    # Only the first split is evaluated; remove this `break` to run all splits.
    break

# %%