stress_at_work_analysis/exploration/ml_pipeline.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% 
import sys, os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, f1_score

# Put the parent of the current working directory on the import path
# (presumably so the project-local `machine_learning` package imported below
# resolves when the notebook is run from its own folder).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    # Append only once so re-running the cell does not grow sys.path.
    sys.path.append(nb_dir)

from machine_learning.cross_validation import CrossValidation
from machine_learning.preprocessing import Preprocessing
from machine_learning.feature_selection import FeatureSelection

# %% 
# Load the input feature table (path is relative to the working directory)
# and index it by the segment-identifying columns.
df = pd.read_csv(
    "../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv"
)
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
df.set_index(index_columns, inplace=True)

# Create binary target.
# With right-closed bins (-1, 0] and (0, 4], a raw score of 0 gets label 0 and
# any score in (0, 4] gets label 1; values outside (-1, 4] become NaN.
bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
df["target"], edges = pd.cut(
    df.target, bins=bins, labels=[0, 1], retbins=True, right=True
)

# Fill missing values with the column median rounded to 0 decimals.
# `numeric_only=True` restricts the median to numeric columns, so NaN-bearing
# non-numeric columns (e.g. string features or the categorical `target`) are
# left as they are instead of raising a TypeError on pandas >= 2.0.
# NOTE(review): the medians are computed on the full table before the
# cross-validation split, so test-fold rows influence the imputed values;
# consider moving imputation inside the per-split loop.
nan_cols = df.columns[df.isna().any()].tolist()
df[nan_cols] = df[nan_cols].fillna(
    df[nan_cols].median(numeric_only=True).round(0)
)

# Splitter over the prepared table ("logo" presumably selects
# leave-one-group-out; confirm in machine_learning.cross_validation).
cv = CrossValidation(data=df, cv_method="logo")

# Columns that are one-hot encoded for every split in the loop cell.
categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
interval_feature_list, other_feature_list = [], []

# %%
for split in cv.get_splits():
    # Fold data: raw train/test sets from the splitter, then one-hot encoding
    # of the categorical columns through the Preprocessing helper.
    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(categorical_columns)
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()

    print(train_X.shape, test_X.shape)

    # Baseline: a small (10-tree) random forest on the full feature set.
    rfc = RandomForestClassifier(n_estimators=10).fit(train_X, train_y)
    predictions = rfc.predict(test_X)

    print("Recall:", recall_score(test_y, predictions))
    print("F1:", f1_score(test_y, predictions))

    # Feature selection is given only the training data and its groups.
    train_groups, test_groups = cv.get_groups_sets(split)
    fs = FeatureSelection(train_X, train_y, train_groups)
    selected_features = fs.select_features(
        n_min=20,
        n_max=29,
        k=40,
        ml_type="classification_bin",
        metric="recall",
        n_tolerance=20,
    )

    # Keep only the selected columns in both sets.
    train_X, test_X = train_X[selected_features], test_X[selected_features]

    print(selected_features, len(selected_features), sep="\n")

    # Re-evaluate with a larger (500-tree) forest on the reduced feature set.
    rfc = RandomForestClassifier(n_estimators=500).fit(train_X, train_y)
    predictions = rfc.predict(test_X)

    print("Recall:", recall_score(test_y, predictions))
    print("F1:", f1_score(test_y, predictions))

    # Only the first split is processed; the loop exits after one iteration.
    break

# %%
Add a ML pipeline script to develop a whole pipeline. 2023-02-23 10:41:36 +01:00			`# ---`
			`# jupyter:`
			`# jupytext:`
			`# formats: ipynb,py:percent`
			`# text_representation:`
			`# extension: .py`
			`# format_name: percent`
			`# format_version: '1.3'`
			`# jupytext_version: 1.13.0`
			`# kernelspec:`
			`# display_name: straw2analysis`
			`# language: python`
			`# name: straw2analysis`
			`# ---`

			`# %%`
			`import sys, os`

			`import numpy as np`
			`import matplotlib.pyplot as plt`
			`import pandas as pd`

Added testing section after feature selection. 2023-04-20 13:29:14 +02:00			`from sklearn.ensemble import RandomForestClassifier`
			`from sklearn.metrics import recall_score, f1_score`

Add a ML pipeline script to develop a whole pipeline. 2023-02-23 10:41:36 +01:00			`nb_dir = os.path.split(os.getcwd())[0]`
			`if nb_dir not in sys.path:`
			`sys.path.append(nb_dir)`

			`from machine_learning.cross_validation import CrossValidation`
			`from machine_learning.preprocessing import Preprocessing`
Implement feature selection method which is used in ML pipeline. 2023-04-19 15:56:34 +02:00			`from machine_learning.feature_selection import FeatureSelection`
Add a ML pipeline script to develop a whole pipeline. 2023-02-23 10:41:36 +01:00
			`# %%`
			`df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")`
			`index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]`
			`df.set_index(index_columns, inplace=True)`

Implement feature selection method which is used in ML pipeline. 2023-04-19 15:56:34 +02:00			`# Create binary target`
Added testing section after feature selection. 2023-04-20 13:29:14 +02:00			`bins = [-1, 0, 4] # bins for stressfulness (0-4) target`
			`df['target'], edges = pd.cut(df.target, bins=bins, labels=[0, 1], retbins=True, right=True) #['low', 'medium', 'high']`
Implement feature selection method which is used in ML pipeline. 2023-04-19 15:56:34 +02:00

			`nan_cols = df.columns[df.isna().any()].tolist()`
			`df[nan_cols] = df[nan_cols].fillna(round(df[nan_cols].median(), 0))`

Add a ML pipeline script to develop a whole pipeline. 2023-02-23 10:41:36 +01:00			`cv = CrossValidation(data=df, cv_method="logo")`

			`categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]`
			`interval_feature_list, other_feature_list = [], []`

Implement feature selection method which is used in ML pipeline. 2023-04-19 15:56:34 +02:00			`# %%`
Add a ML pipeline script to develop a whole pipeline. 2023-02-23 10:41:36 +01:00			`for split in cv.get_splits():`
			`train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)`
			`pre = Preprocessing(train_X, train_y, test_X, test_y)`
			`pre.one_hot_encode_train_and_test_sets(categorical_columns)`
			`train_X, train_y, test_X, test_y = pre.get_train_test_sets()`
Implement feature selection method which is used in ML pipeline. 2023-04-19 15:56:34 +02:00
Added testing section after feature selection. 2023-04-20 13:29:14 +02:00
			`print(train_X.shape, test_X.shape)`
			`# Predict before feature selection`
			`rfc = RandomForestClassifier(n_estimators=10)`
			`rfc.fit(train_X, train_y)`
			`predictions = rfc.predict(test_X)`

			`print("Recall:", recall_score(test_y, predictions))`
			`print("F1:", f1_score(test_y, predictions))`
Implement feature selection method which is used in ML pipeline. 2023-04-19 15:56:34 +02:00
			`# Feature selection on train set`
Add GroupKFold to feature selection CV. Start with generic metric calculation procedure. 2023-04-20 11:20:26 +02:00			`train_groups, test_groups = cv.get_groups_sets(split)`

			`fs = FeatureSelection(train_X, train_y, train_groups)`
Added testing section after feature selection. 2023-04-20 13:29:14 +02:00			`selected_features = fs.select_features(n_min=20, n_max=29, k=40,`
			`ml_type="classification_bin",`
			`metric="recall", n_tolerance=20)`

			`train_X = train_X[selected_features]`
			`test_X = test_X[selected_features]`

Implement feature selection method which is used in ML pipeline. 2023-04-19 15:56:34 +02:00			`print(selected_features)`
			`print(len(selected_features))`

Added testing section after feature selection. 2023-04-20 13:29:14 +02:00			`# Predict after feature selection`
			`rfc = RandomForestClassifier(n_estimators=500)`
			`rfc.fit(train_X, train_y)`
			`predictions = rfc.predict(test_X)`

			`print("Recall:", recall_score(test_y, predictions))`
			`print("F1:", f1_score(test_y, predictions))`

Add a ML pipeline script to develop a whole pipeline. 2023-02-23 10:41:36 +01:00			`break`

			`# %%`