2023-02-23 10:41:36 +01:00
|
|
|
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---
|
|
|
|
|
|
|
|
# %%
|
|
|
|
import sys, os
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
import pandas as pd
|
|
|
|
|
2023-04-20 13:29:14 +02:00
|
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
|
|
from sklearn.metrics import recall_score, f1_score
|
|
|
|
|
2023-02-23 10:41:36 +01:00
|
|
|
# Put the parent of the current working directory on the import path so the
# project-local `machine_learning` imports below can be resolved when the
# notebook is run from its own folder.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    # Guarded so that re-running the cell does not append duplicates.
    sys.path.append(nb_dir)
|
|
|
|
|
|
|
|
from machine_learning.cross_validation import CrossValidation
|
|
|
|
from machine_learning.preprocessing import Preprocessing
|
2023-04-19 15:56:34 +02:00
|
|
|
from machine_learning.feature_selection import FeatureSelection
|
2023-02-23 10:41:36 +01:00
|
|
|
|
|
|
|
# %%
# Load the feature table and index it by the segment-identifying columns.
df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
df.set_index(index_columns, inplace=True)

# Create binary target
# With right-closed bins, (-1, 0] maps to label 0 and (0, 4] maps to label 1;
# values outside (-1, 4] become NaN.
bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
df["target"], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True)

# Impute missing values with the column median rounded to a whole number.
# `numeric_only=True` keeps the median from raising on non-numeric columns
# (pandas >= 2.0 no longer drops them silently); columns without a computed
# median are simply left unfilled by `fillna`.
# NOTE(review): the medians are computed on the full data set, i.e. before the
# cross-validation split below — confirm this is intended.
nan_cols = df.columns[df.isna().any()].tolist()
df[nan_cols] = df[nan_cols].fillna(df[nan_cols].median(numeric_only=True).round(0))

cv = CrossValidation(data=df, cv_method="logo")

# Columns that get one-hot encoded per split in the loop cell.
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
interval_feature_list, other_feature_list = [], []
|
|
|
|
|
2023-04-19 15:56:34 +02:00
|
|
|
# %%
# Compare a random forest before and after feature selection on one split.
for split in cv.get_splits():
    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)

    # One-hot encode the categorical columns on the train and test sets.
    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(categorical_columns)
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()

    print(train_X.shape, test_X.shape)

    # Predict before feature selection
    # NOTE(review): this baseline uses 10 trees while the model after feature
    # selection uses 500, and neither sets random_state — the two scores are
    # therefore not a like-for-like comparison. Confirm whether intended.
    rfc = RandomForestClassifier(n_estimators=10)
    rfc.fit(train_X, train_y)
    predictions = rfc.predict(test_X)

    print("Recall:", recall_score(test_y, predictions))
    print("F1:", f1_score(test_y, predictions))

    # Feature selection on train set
    train_groups, test_groups = cv.get_groups_sets(split)

    fs = FeatureSelection(train_X, train_y, train_groups)
    selected_features = fs.select_features(
        n_min=20,
        n_max=29,
        k=40,
        ml_type="classification_bin",
        metric="recall",
        n_tolerance=20,
    )

    # Restrict both sets to the columns chosen on the training data.
    train_X = train_X[selected_features]
    test_X = test_X[selected_features]

    print(selected_features)
    print(len(selected_features))

    # Predict after feature selection
    rfc = RandomForestClassifier(n_estimators=500)
    rfc.fit(train_X, train_y)
    predictions = rfc.predict(test_X)

    print("Recall:", recall_score(test_y, predictions))
    print("F1:", f1_score(test_y, predictions))

    # Only the first split is evaluated.
    break
|
|
|
|
|
|
|
|
# %%
|