Merge branch 'ml_pipeline'

Clean up categories.
Explore saved categories.
2023-04-18 15:55:03 +02:00 · 2023-04-18 15:49:33 +02:00 · 2023-04-18 15:34:06 +02:00 · 2023-04-18 14:57:59 +02:00 · 2023-04-18 14:54:35 +02:00 · 2023-04-14 17:20:22 +02:00
49 changed files with 4789 additions and 954 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,7 +5,19 @@ __pycache__/
 /exploration/*.ipynb
 /config/*.ipynb
 /statistical_analysis/*.ipynb
 /presentation/*.ipynb
 /machine_learning/intermediate_results/
 /data/features/
 /data/baseline/
 /data/*input*.csv
 /data/daily*
 /data/intradaily*
 /data/stressfulness_event*
 /data/30min*
 /presentation/*scores.csv
 /presentation/Results.ods
 .Rproj.user
 .Rhistory
 /presentation/*.nb.html
 presentation/event_stressful_detection_half_loso.csv
 presentation/event_stressful_detection_loso.csv
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,6 +1,9 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (straw2analysis)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="straw2analysis" project-jdk-type="Python SDK" />
  <component name="PyCharmDSProjectLayout">
    <option name="id" value="JupyterRightHiddenStructureLayout" />
  </component>
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
--- a/.idea/straw2analysis.iml
+++ b/.idea/straw2analysis.iml
@ -5,7 +5,7 @@
      <excludeFolder url="file://$MODULE_DIR$/config/.ipynb_checkpoints" />
      <excludeFolder url="file://$MODULE_DIR$/exploration/.ipynb_checkpoints" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.9 (straw2analysis)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="straw2analysis" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -3,5 +3,6 @@
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/rapids" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/rapids/calculatingfeatures" vcs="Git" />
  </component>
 </project>
--- a/config/environment.yml
+++ b/config/environment.yml
@ -7,8 +7,10 @@ dependencies:
  - black
  - isort
  - flake8
  - imbalanced-learn=0.10.0
  - jupyterlab
  - jupytext
  - lightgbm
  - mypy
  - nodejs
  - pandas
--- a/data/input_PANAS_negative_affect_mean.csv
+++ b/data/input_PANAS_negative_affect_mean.csv
--- a/exploration/all_sensors_sequential_addition_scores.xlsx
+++ b/exploration/all_sensors_sequential_addition_scores.xlsx
--- a/exploration/ex_all_feat_ml_pipeline.py
+++ b/exploration/ex_all_feat_ml_pipeline.py
@ -1,473 +0,0 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
 import datetime
 import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 import yaml
 from pyprojroot import here
 from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
 from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
 from sklearn.metrics import mean_squared_error, r2_score
 from sklearn.impute import SimpleImputer
 from xgboost import XGBRegressor
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.features_sensor
 import machine_learning.labels
 import machine_learning.model
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # ## PANAS negative affect
 # %% jupyter={"source_hidden": true}
 # model_input = pd.read_csv("../data/input_PANAS_NA.csv") # Nestandardizirani podatki
 model_input = pd.read_csv("../data/z_input_PANAS_NA.csv") # Standardizirani podatki
 # %% [markdown]
 # ### NaNs before dropping cols and rows
 # %% jupyter={"source_hidden": true}
 sns.set(rc={"figure.figsize":(16, 8)})
 sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
 # %% jupyter={"source_hidden": true}
 nan_cols = list(model_input.loc[:, model_input.isna().all()].columns)
 nan_cols
 # %% jupyter={"source_hidden": true}
 model_input.dropna(axis=1, how="all", inplace=True)
 model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
 # %% [markdown]
 # ### NaNs after dropping NaN cols and rows where target is NaN
 # %% jupyter={"source_hidden": true}
 sns.set(rc={"figure.figsize":(16, 8)})
 sns.heatmap(model_input.sort_values('pid').set_index('pid').isna(), cbar=False)
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 #if "pid" in model_input.columns:
 #    index_columns.append("pid")
 model_input.set_index(index_columns, inplace=True)
 data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
 # %% jupyter={"source_hidden": true}
 categorical_feature_colnames = ["gender", "startlanguage"]
 # %% jupyter={"source_hidden": true}
 categorical_features = data_x[categorical_feature_colnames].copy()
 # %% jupyter={"source_hidden": true}
 mode_categorical_features = categorical_features.mode().iloc[0]
 # %% jupyter={"source_hidden": true}
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # %% jupyter={"source_hidden": true}
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 # %% jupyter={"source_hidden": true}
 numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
 # %% jupyter={"source_hidden": true}
 train_x = pd.concat([numerical_features, categorical_features], axis=1)
 # %% jupyter={"source_hidden": true}
 train_x.dtypes
 # %% jupyter={"source_hidden": true}
 logo = LeaveOneGroupOut()
 logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
 )
 # %% jupyter={"source_hidden": true}
 sum(data_y.isna())
 # %% [markdown]
 # ### Linear Regression
 # %% jupyter={"source_hidden": true}
 lin_reg_rapids = linear_model.LinearRegression()
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 # %% jupyter={"source_hidden": true}
 lin_reg_scores = cross_val_score(
    lin_reg_rapids,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring='r2'
 )
 lin_reg_scores
 np.median(lin_reg_scores)
 # %% [markdown]
 # ### Ridge regression
 # %% jupyter={"source_hidden": true}
 ridge_reg = linear_model.Ridge(alpha=.5)
 # %% tags=[] jupyter={"source_hidden": true}
 ridge_reg_scores = cross_val_score(
    ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2"
 )
 np.median(ridge_reg_scores)
 # %% [markdown]
 # ### Lasso
 # %% jupyter={"source_hidden": true}
 lasso_reg = linear_model.Lasso(alpha=0.1)
 # %% jupyter={"source_hidden": true}
 lasso_reg_score = cross_val_score(
    lasso_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2"
 )
 np.median(lasso_reg_score)
 # %% [markdown]
 # ### Bayesian Ridge
 # %% jupyter={"source_hidden": true}
 bayesian_ridge_reg = linear_model.BayesianRidge()
 # %% jupyter={"source_hidden": true}
 bayesian_ridge_reg_score = cross_val_score(
    bayesian_ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2"
 )
 np.median(bayesian_ridge_reg_score)
 # %% [markdown]
 # ### RANSAC (outlier robust regression)
 # %% jupyter={"source_hidden": true}
 ransac_reg = linear_model.RANSACRegressor()
 # %% jupyter={"source_hidden": true}
 np.median(
    cross_val_score(
    ransac_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2"
    )
 )
 # %% [markdown]
 # ### Support vector regression
 # %% jupyter={"source_hidden": true}
 svr = svm.SVR()
 # %% jupyter={"source_hidden": true}
 np.median(
    cross_val_score(
    svr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring="r2"
    )
 )
 # %% [markdown]
 # ### Kernel Ridge regression
 # %% jupyter={"source_hidden": true}
 kridge = kernel_ridge.KernelRidge()
 # %% jupyter={"source_hidden": true}
 np.median(
    cross_val_score(
        kridge,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2"
    )
 )
 # %% [markdown]
 # ### Gaussian Process Regression
 # %% jupyter={"source_hidden": true}
 gpr = gaussian_process.GaussianProcessRegressor()
 # %% jupyter={"source_hidden": true}
 np.median(
    cross_val_score(
        gpr,
        X=imputer.fit_transform(train_x),
        y=data_y,
        groups=data_groups,
        cv=logo,
        n_jobs=-1,
        scoring="r2"
    )
 )
 # %%
 def insert_row(df, row):
     """
     Return a new frame made of ``df`` followed by ``row`` as its last record.

     ``row`` is a sequence of values ordered like ``df.columns``. The input
     frame is left untouched and the result is re-indexed from zero.
     """
     new_record = pd.DataFrame([row], columns=df.columns)
     pieces = [df, new_record]
     return pd.concat(pieces, ignore_index=True)
 # %%
 def run_all_models(input_csv):
     """
     Cross-validate a fixed set of regressors on one model-input CSV.

     The CSV is read, all-NaN columns and rows without a ``target`` are
     dropped, the categorical columns (``gender``, ``startlanguage``) are
     mode-filled and one-hot encoded, and remaining NaNs are mean-imputed.
     Every regressor is then scored with leave-one-group-out cross-validation
     (groups taken from the ``pid`` column) using the R^2 metric.

     For each regressor its name and the median fold score are printed.

     Parameters:
         input_csv: path (or buffer) accepted by ``pd.read_csv``.

     Returns:
         A DataFrame with one row per regressor and the columns
         ``method``, ``median`` and ``max`` (median / maximum fold score).
     """
     index_columns = [
         "local_segment",
         "local_segment_label",
         "local_segment_start_datetime",
         "local_segment_end_datetime",
     ]
     categorical_feature_colnames = ["gender", "startlanguage"]

     # Prepare data
     model_input = pd.read_csv(input_csv)
     model_input.dropna(axis=1, how="all", inplace=True)
     model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
     model_input.set_index(index_columns, inplace=True)
     data_x = model_input.drop(["target", "pid"], axis=1)
     data_y = model_input["target"]
     data_groups = model_input["pid"]

     # Categorical features: fill gaps with the mode, then one-hot encode
     categorical_features = data_x[categorical_feature_colnames].copy()
     mode_categorical_features = categorical_features.mode().iloc[0]
     categorical_features = categorical_features.fillna(mode_categorical_features)
     categorical_features = categorical_features.apply(lambda col: col.astype("category"))
     if not categorical_features.empty:
         categorical_features = pd.get_dummies(categorical_features)
     numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
     train_x = pd.concat([numerical_features, categorical_features], axis=1)

     # NOTE(review): the imputer is fitted on all rows before cross-validation,
     # so the held-out group contributes to the imputed means.
     imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
     train_x_imputed = imputer.fit_transform(train_x)

     # Prepare cross validation
     logo = LeaveOneGroupOut()

     # (label printed to stdout, label stored in the result, estimator)
     models = [
         ("Linear regression:", "Linear regression", linear_model.LinearRegression()),
         ("Ridge regression", "Ridge regression", linear_model.Ridge(alpha=.5)),
         ("Lasso regression", "Lasso regression", linear_model.Lasso(alpha=0.1)),
         ("Bayesian Ridge", "Bayesian Ridge", linear_model.BayesianRidge()),
         ("RANSAC (outlier robust regression)", "RANSAC", linear_model.RANSACRegressor()),
         ("Support vector regression", "Support vector regression", svm.SVR()),
         ("Kernel Ridge regression", "Kernel Ridge regression", kernel_ridge.KernelRidge()),
         (
             "Gaussian Process Regression",
             "Gaussian Process Regression",
             gaussian_process.GaussianProcessRegressor(),
         ),
         (
             "Random Forest Regression",
             "Random Forest Regression",
             ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1),
         ),
         ("XGBoost Regressor", "XGBoost Regressor", XGBRegressor()),
         ("ADA Boost Regressor", "ADA Boost Regressor", ensemble.AdaBoostRegressor()),
     ]

     # Validate models
     rows = []
     for printed_label, method_label, estimator in models:
         fold_scores = cross_val_score(
             estimator,
             X=train_x_imputed,
             y=data_y,
             groups=data_groups,
             cv=logo,
             n_jobs=-1,
             scoring="r2",
         )
         median_score = np.median(fold_scores)
         print(printed_label)
         print(median_score)
         rows.append([method_label, median_score, np.max(fold_scores)])

     # Build the result once instead of concatenating onto an empty frame
     # after every model.
     return pd.DataFrame(rows, columns=["method", "median", "max"])
--- a/exploration/expl_features_analysis.py
+++ b/exploration/expl_features_analysis.py
@ -0,0 +1,318 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # %matplotlib inline
 import os, sys, math
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 from sklearn.tree import DecisionTreeClassifier
 from sklearn import tree
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 def calc_entropy(column):
     """
     Calculate entropy given a pandas series, list, or numpy array.

     The entropy is the Shannon entropy in bits of the empirical distribution
     of the values in ``column``. Values may be any labels that
     ``numpy.unique`` can count (integers, floats, strings, ...), not only
     non-negative integers.

     Returns:
         The entropy as a float; 0.0 for an empty ``column``.
     """
     n_items = len(column)
     if n_items == 0:
         # Nothing to count: define the entropy of an empty sample as zero.
         return 0.0
     # Count occurrences of every distinct value (all counts are > 0).
     _, counts = np.unique(np.asarray(column), return_counts=True)
     probabilities = counts / n_items
     # sum(p * log2(1 / p)) equals -sum(p * log2(p)) and never yields -0.0.
     return float(sum(prob * math.log2(1 / prob) for prob in probabilities))
 def calc_information_gain(data, split_name, target_name):
     """
     Calculate information gain given a data set, column to split on, and target.

     The data set is partitioned by every distinct value of ``split_name``
     (not only the first two), and the weighted entropy of the target inside
     the partitions is subtracted from the entropy of the whole target.
     A column with a single distinct value therefore yields a gain of zero
     instead of raising ``IndexError``.

     Parameters:
         data: pandas DataFrame holding both columns.
         split_name: name of the column to split on.
         target_name: name of the target column.

     Returns:
         Entropy of the target minus the weighted entropy of the partitions.
     """
     # Calculate the original entropy
     original_entropy = calc_entropy(data[target_name])
     split_column = data[split_name]
     total_rows = data.shape[0]
     # Loop through the partitions and accumulate their weighted entropies
     to_subtract = 0
     for value in split_column.unique():
         subset = data[split_column == value]
         if subset.empty:
             # A missing value (NaN) compares unequal to itself, so it
             # selects no rows; such a level contributes nothing.
             continue
         prob = subset.shape[0] / total_rows
         to_subtract += prob * calc_entropy(subset[target_name])
     # Return information gain
     return original_entropy - to_subtract
 def get_information_gains(data, target_name):
     """
     Compute the information gain of every column of ``data``.

     Each column name of ``data`` (the target column included) is mapped to
     the value returned by ``calc_information_gain`` for that column and
     ``target_name``.

     Returns:
         A dict ``{column name: information gain}``.
     """
     return {
         column: calc_information_gain(data, column, target_name)
         for column in data.columns
     }
 def n_features_with_highest_info_gain(info_gain_dict, n=None):
     """
     Select the ``n`` entries of ``info_gain_dict`` with the largest values.

     When ``n`` is omitted every entry is returned. The resulting dict is
     ordered from the highest to the lowest information gain.
     """
     import heapq

     limit = len(info_gain_dict) if n is None else n
     ranked_pairs = heapq.nlargest(
         limit, info_gain_dict.items(), key=lambda pair: pair[1]
     )
     return dict(ranked_pairs)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 categorical_features = model_input[categorical_feature_colnames].copy()
 mode_categorical_features = categorical_features.mode().iloc[0]
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
 model_input = pd.concat([numerical_features, categorical_features], axis=1)
 # Binarize the target
 bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) 
 print(model_input['target'].value_counts(), edges)
 # %%
 info_gains = get_information_gains(model_input, 'target')
 # %% [markdown]
 # Present the feature importance results
 # %%
 print("Total columns:", len(info_gains))
 print(pd.Series(info_gains).value_counts())
 n_features_with_highest_info_gain(info_gains, n=189)
 # %%
 def compute_impurity(feature, impurity_criterion):
     """
     This function calculates impurity of a feature.
     Supported impurity criteria: 'entropy', 'gini'
     input: feature (this needs to be a Pandas series)
     output: feature impurity
     raises: ValueError for any other impurity criterion
     """
     probs = feature.value_counts(normalize=True)
     # value_counts reports unused categories of a categorical series with
     # probability 0; drop them so log2 is never evaluated at zero.
     probs = probs[probs > 0]
     if impurity_criterion == 'entropy':
         impurity = -1 * np.sum(np.log2(probs) * probs)
     elif impurity_criterion == 'gini':
         impurity = 1 - np.sum(np.square(probs))
     else:
         raise ValueError('Unknown impurity criterion')
     return impurity
 def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, print_flag=False):
     """
     This function calculates information gain for splitting on
     a particular descriptive feature for a given dataset
     and a given impurity criteria.
     Supported split criterion: 'entropy', 'gini'

     The gain is computed from the unrounded impurities and weights of the
     partitions; rounding to three decimals is applied only to the values
     printed when ``print_flag`` is true.

     Parameters:
         df: pandas DataFrame holding both columns.
         target: name of the target column.
         descriptive_feature: name of the column to split on.
         split_criterion: impurity criterion passed to ``compute_impurity``.
         print_flag: when true, print the intermediate values.

     Returns:
         Impurity of the target minus the weighted impurity of the partitions.
     """
     if print_flag:
         print('target feature:', target)
         print('descriptive_feature:', descriptive_feature)
         print('split criterion:', split_criterion)
     target_entropy = compute_impurity(df[target], split_criterion)
     # entropy_list stores the impurity of each partition,
     # weight_list the relative number of observations in each partition
     entropy_list = []
     weight_list = []
     # partition the dataset by each level of the descriptive feature
     for level in df[descriptive_feature].unique():
         df_feature_level = df[df[descriptive_feature] == level]
         entropy_list.append(compute_impurity(df_feature_level[target], split_criterion))
         weight_list.append(len(df_feature_level) / len(df))
     feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
     information_gain = target_entropy - feature_remaining_impurity
     if print_flag:
         # round for display only
         print('impurity of partitions:', [round(value, 3) for value in entropy_list])
         print('weights of partitions:', [round(value, 3) for value in weight_list])
         print('remaining impurity:', feature_remaining_impurity)
         print('information gain:', information_gain)
         print('====================')
     return information_gain
 def calc_information_gain_2(data, split_name, target_name, split_criterion):
     """
     Calculate information gain given a data set, column to split on, and target.

     The data set is partitioned by every distinct value of ``split_name``
     (not only the first two), and the weighted impurity of the target inside
     the partitions is subtracted from the impurity of the whole target.
     A column with a single distinct value therefore yields a gain of zero
     instead of raising ``IndexError``.

     Parameters:
         data: pandas DataFrame holding both columns.
         split_name: name of the column to split on.
         target_name: name of the target column.
         split_criterion: impurity criterion passed to ``compute_impurity``.

     Returns:
         Impurity of the target minus the weighted impurity of the partitions.
     """
     # Calculate the original impurity
     original_impurity = compute_impurity(data[target_name], split_criterion)
     split_column = data[split_name]
     total_rows = data.shape[0]
     # Loop through the partitions and accumulate their weighted impurities
     to_subtract = 0
     for value in split_column.unique():
         subset = data[split_column == value]
         if subset.empty:
             # A missing value (NaN) compares unequal to itself, so it
             # selects no rows; such a level contributes nothing.
             continue
         prob = subset.shape[0] / total_rows
         to_subtract += prob * compute_impurity(subset[target_name], split_criterion)
     # Return information gain
     return original_impurity - to_subtract
 def get_information_gains_2(data, target_name, split_criterion):
     """
     Compute an information gain for every column of ``data``.

     Each column name of ``data`` (the target column included) is mapped to
     the value returned by ``calc_information_gain_2``. The computation uses
     the ``data`` argument rather than a module-level frame.

     Parameters:
         data: pandas DataFrame holding the features and the target.
         target_name: name of the target column.
         split_criterion: impurity criterion ('entropy' or 'gini').

     Returns:
         A dict ``{column name: information gain}``.
     """
     # Initialize an empty dictionary for information gains
     information_gains = {}
     # Iterate through each column name
     for feature in data.columns:
         # NOTE(review): the target is passed as the column to split on and
         # the feature as the column whose impurity is measured, i.e. the
         # reverse of the parameter names of calc_information_gain_2.
         # Kept as is; confirm this is intended.
         information_gain = calc_information_gain_2(data, target_name, feature, split_criterion)
         # Store the information gain under the column name
         information_gains[feature] = information_gain
     return information_gains
 # %% [markdown]
 # Present the feature importance results from other methods
 # %%
 split_criterion = 'entropy'
 print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
 information_gains = get_information_gains_2(model_input, 'target', split_criterion)
 print(pd.Series(information_gains).value_counts().sort_index(ascending=False))
 n_features_with_highest_info_gain(information_gains)
 # %%
 # Present the feature importance using a decision tree (its impurity measure is the split criterion set below, i.e. entropy)
 split_criterion = 'entropy'
 print("Target impurity:", compute_impurity(model_input['target'], split_criterion))
 X, y  = model_input.drop(columns=['target', 'pid']), model_input['target']
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 X = imputer.fit_transform(X)
 X, _, y, _ =  train_test_split(X, y, random_state=19, test_size=0.25)
 clf = DecisionTreeClassifier(criterion=split_criterion)
 clf.fit(X, y)
 feat_importance = clf.tree_.compute_feature_importances(normalize=False)
 print("feat importance = ", feat_importance)
 print("shape", feat_importance.shape)
 tree_feat_imp = dict(zip(model_input.drop(columns=['target', 'pid']).columns, feat_importance.tolist()))
 info_gains_dict = pd.Series(n_features_with_highest_info_gain(tree_feat_imp))
 info_gains_dict[info_gains_dict > 0]
 # %%
 # Binarize the tree's information-gain values
 bins = [-0.1, 0, 0.1] # bins for target's correlations with features
 cut_info_gains = pd.cut(info_gains_dict, bins=bins, labels=['IG=0', 'IG>0'], right=True) 
 plt.title(f"Tree information gains by value ({split_criterion})")
 cut_info_gains.value_counts().plot(kind='bar', color='purple')
 plt.xticks(rotation=45, ha='right')
 print(cut_info_gains.value_counts())
 pd.Series(n_features_with_highest_info_gain(tree_feat_imp, 20))
 # %%
 # Plot feature importance tree graph 
 plt.figure(figsize=(12,12))
 tree.plot_tree(clf,
               feature_names = list(model_input.drop(columns=['target', 'pid']).columns), 
               class_names=True,
               filled = True, fontsize=5, max_depth=3)
 plt.savefig('tree_high_dpi', dpi=800)
 # %% [markdown]
 # Present the feature importance by correlation with target
 # %%
 # The code below needs its own code-cell marker; without it the lines
 # belong to the markdown cell above and are not executed as code.
 corrs = abs(model_input.drop(columns=["target", 'pid'], axis=1).apply(lambda x: x.corr(model_input.target.astype(int))))
 # corrs.sort_values(ascending=False)
 # Bin the absolute correlation values
 bins = [0, 0.1, 0.2, 0.3] # bins for target's correlations with features
 cut_corrs = pd.cut(corrs, bins=bins, labels=['very week (0-0.1)', 'weak (0.1-0.2)', 'medium (0.2-0.3)'], right=True) 
 plt.title("Target's correlations with features")
 cut_corrs.value_counts().plot(kind='bar')
 plt.xticks(rotation=45, ha='right')
 print(cut_corrs.value_counts())
 print(corrs[corrs > 0.1]) # or corrs < -0.1])
 # %%
 # %%
--- a/exploration/expl_features_groups_analysis.py
+++ b/exploration/expl_features_groups_analysis.py
@ -0,0 +1,328 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # %matplotlib inline
 import os, sys, math
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn.impute import SimpleImputer
 from sklearn.naive_bayes import GaussianNB  
 from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
 from sklearn import metrics 
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 # Load the model input and move the segment-identifying columns into the index,
 # so that only features, 'pid' (if present) and 'target' remain as columns.
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv").set_index(index_columns)
 # Categorical features: two fixed columns plus every column whose name contains
 # "mostcommonactivity" or "homelabel".
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 categorical_features = model_input[categorical_feature_colnames].copy()
 # mode() may return several rows when values tie; only the first row is used.
 mode_categorical_features = categorical_features.mode().iloc[0]
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 # Every non-categorical column (this still includes 'target') is kept as-is and
 # re-joined with the one-hot encoded categorical columns.
 numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
 model_input = pd.concat([numerical_features, categorical_features], axis=1)
 # Binarize the target: values in (-1, 0] become 0 and values in (0, 4] become 1;
 # anything outside (-1, 4] becomes NaN. `edges` is returned by pd.cut but not used afterwards.
 bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True) 
 # Sanity output: which columns are not numeric, and how many columns survive a numeric-only selection.
 print("Non-numeric cols (or target):", list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)))
 print("Shapes of numeric df:", model_input.shape, model_input.select_dtypes(include=np.number).shape)
 # %%
 # Give the demographic columns a common "demo_" prefix so they can be selected
 # by substring in the same way as the "phone_" and "empatica_" columns.
 demo_features = [
    'age',
    'limesurvey_demand',
    'limesurvey_control',
    'limesurvey_demand_control_ratio',
    'limesurvey_demand_control_ratio_quartile',
    'gender_F',
    'gender_M',
    'startlanguage_nl',
    'startlanguage_sl',
 ]
 new_names = [(col, f"demo_{col}") for col in demo_features]
 model_input.rename(columns=dict(new_names), inplace=True)
 # From here on demo_features holds the prefixed names, with 'target' placed
 # between the five numeric columns and the four one-hot encoded ones.
 demo_features = [prefixed for _, prefixed in new_names]
 demo_features.insert(5, 'target')
 # %%
 # Get phone and non-phone columns
 import warnings
 def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True, with_cols=None, print_flag=False):
    """
    Score feature groups with a cross-validated Gaussian naive Bayes model and return the best one.

    For every entry of ``groups_substrings`` the candidate columns are selected by
    substring match (or by its complement), the columns in ``with_cols`` are added,
    80 % of the rows are retained through a stratified split (``random_state=19``),
    missing values are imputed with the column median and a ``GaussianNB`` is
    evaluated with a shuffled 5-fold ``StratifiedKFold`` cross-validation.
    The group with the highest mean recall wins; on ties the earlier group is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the columns 'pid' and 'target'.
    groups_substrings : iterable of (str or None)
        Substrings identifying the groups. ``None`` selects every feature column.
    include_group : bool, default True
        If True use the columns containing the substring, otherwise the columns
        that do not contain it.
    with_cols : list of str, optional
        Columns added to every group (``None`` means no extra columns).
    print_flag : bool, default False
        If True print the scores of every evaluated group.

    Returns
    -------
    tuple
        ``(best_sensor, best_recall_score, best_f1_score, best_recall_score_std,
        best_f1_score_std)``; every element is ``None`` when ``groups_substrings``
        is empty.

    Raises
    ------
    ValueError
        If a group together with ``with_cols`` selects no column at all.
    """
    # A list default would be shared between calls, hence the None sentinel.
    with_cols = [] if with_cols is None else list(with_cols)
    best_sensor = None
    best_recall_score, best_f1_score = None, None
    # Initialised up front so that an empty groups_substrings still returns cleanly.
    best_recall_score_std, best_f1_score_std = None, None
    features = df.drop(columns=['target', 'pid'])
    for fgroup_substr in groups_substrings:
        if fgroup_substr is None:
            feature_group_cols = list(features.columns)
        elif include_group:
            feature_group_cols = [col for col in features.columns if fgroup_substr in col]
        else:
            feature_group_cols = [col for col in features.columns if fgroup_substr not in col]
        # Keep the first occurrence of every column: with_cols may overlap with the group,
        # and selecting a column twice would feed the same feature to the model twice.
        selected_cols = list(dict.fromkeys(feature_group_cols + with_cols))
        if not selected_cols:
            raise ValueError(f"No feature columns selected for group {fgroup_substr!r}")
        X, y = features[selected_cols], df['target']
        # Only the 80 % "train" part of this split is used; the rest is discarded.
        X, _, y, _ = train_test_split(X, y, stratify=y, random_state=19, test_size=0.2)
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        nb = GaussianNB()
        with warnings.catch_warnings():
            # NOTE(review): with n_jobs=-1 the scorers may run in worker processes where
            # this filter is presumably not active - confirm.
            warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
            # NOTE(review): the imputer is fitted on all retained rows before the folds are
            # formed, so the medians also see each fold's test rows - confirm this is acceptable.
            model_cv = cross_validate(
                nb,
                X=imputer.fit_transform(X),
                y=y,
                cv=StratifiedKFold(n_splits=5, shuffle=True),
                n_jobs=-1,
                scoring=('accuracy', 'precision', 'recall', 'f1')
            )
        acc, acc_std = np.mean(model_cv['test_accuracy']), np.std(model_cv['test_accuracy'])
        prec, prec_std = np.mean(model_cv['test_precision']), np.std(model_cv['test_precision'])
        rec, rec_std = np.mean(model_cv['test_recall']), np.std(model_cv['test_recall'])
        f1, f1_std = np.mean(model_cv['test_f1']), np.std(model_cv['test_f1'])
        if print_flag:
            if include_group:
                print("\nPrediction with", fgroup_substr)
            else:
                print("\nPrediction without", fgroup_substr)
            print("************************************************")
            print(f"Accuracy: {acc} (sd={acc_std})")
            print(f"Precison: {prec} (sd={prec_std})")
            print(f"Recall: {rec} (sd={rec_std})")
            print(f"F1: {f1} (sd={f1_std})\n")
        # "No score yet" is tested with `is None` so that a legitimate recall of 0.0 is
        # not mistaken for a missing value; a NaN best score is always replaceable.
        if best_recall_score is None or np.isnan(best_recall_score) or rec > best_recall_score:
            best_sensor = fgroup_substr
            best_recall_score, best_f1_score = rec, f1
            best_recall_score_std, best_f1_score_std = rec_std, f1_std
    return best_sensor, best_recall_score, best_f1_score, best_recall_score_std, best_f1_score_std
 # %% [markdown]
 # ### sensor big feature groups (phone, empatica, demographical)
 # Top-level feature groups, identified by column-name substring.
 big_groups_substr = ["phone_", "empatica_", "demo_"]
 # include_group=False: each run uses the columns that do NOT contain the substring.
 # print_flag is left at its default, so the only output is the returned tuple.
 make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=big_groups_substr, include_group=False)
 # %% [markdown]
 # ### Empatica sensor groups
 # make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
 # e4_sensors = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_"]
 # make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=e4_sensors, include_group=False)
 # %% [markdown]
 # ### Phone sensor groups
 # make_predictions_with_sensor_groups(model_input.copy(), groups_substrings="_", include_group=True)
 # phone_sensors = ["phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery", "phone_calls_", 
 #                 "phone_light_", "phone_location_", "phone_messages", "phone_screen_", "phone_speech_"]
 # make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=phone_sensors, include_group=False)
 # %%
 # Column-name prefixes of all individual sensors (Empatica and phone); the other
 # (demographic) columns are kept separately. "phone_speech_" is deliberately left out.
 # NOTE: "phone_messages" is the only entry without a trailing underscore.
 sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
 # %%
 def find_sensor_group_features_importance(model_input, sensor_groups_strings):
    """
    Rank sensor groups by greedy forward selection.

    In every round each remaining group is evaluated by
    make_predictions_with_sensor_groups together with the columns of all groups
    chosen in earlier rounds (matched by prefix). The winning group is recorded,
    announced on stdout and removed from the candidates. The number of rounds
    equals the number of groups passed in.

    Both arguments are copied, so the caller's objects are left untouched.

    Returns a list of tuples
    (group, recall, f1, recall_std, f1_std) in the order the groups were chosen.
    """
    data = model_input.copy()
    candidates = sensor_groups_strings.copy()
    ranking = []
    for _ in range(len(candidates)):
        chosen_prefixes = [entry[0] for entry in ranking]
        # Columns belonging to any group that has already been selected
        with_cols = [
            column for column in data.columns
            if any(column.startswith(prefix) for prefix in chosen_prefixes)
        ]
        winner, recall, f1, recall_sd, f1_sd = make_predictions_with_sensor_groups(
            data,
            groups_substrings=candidates,
            include_group=True,
            with_cols=with_cols,
        )
        ranking.append((winner, recall, f1, recall_sd, f1_sd))
        print(f"\nAdded sensor: {winner}\n")
        candidates.remove(winner)
    return ranking
 # %%
 # Method for sorting list of tuples into 3 lists
 def sort_tuples_to_lists(list_of_tuples):
    """
    Split a list of score tuples into five parallel lists.

    Each tuple is read positionally as (name, recall, f1, recall_std, f1_std).
    The names are returned unchanged; the four numeric entries are rounded
    to 4 decimal places.

    Returns (xs, y_recall, y_fscore, recall_std, fscore_std).
    """
    rows = list(list_of_tuples)
    xs = [row[0] for row in rows]
    # Positions 1..4 hold recall, F1, recall std and F1 std, in that order
    y_recall, y_fscore, recall_std, fscore_std = (
        [round(row[position], 4) for row in rows] for position in range(1, 5)
    )
    return xs, y_recall, y_fscore, recall_std, fscore_std
 def plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
                                                        title="Sequential addition of features and its F1, and recall scores"):
    """
    Plot recall (top panel) and F1 (bottom panel) against the sequence of added features.

    Each panel shows the mean score as a solid line, the mean +/- one standard
    deviation as dotted lines, and a black dot at the position of the highest mean.
    The two panels share the x axis (labels rotated by 90 degrees), the figure is
    sized 18.5 x 10.5 inches, `title` is used as the figure title, and the figure
    is shown with plt.show().
    """
    fig, axes = plt.subplots(nrows=2, sharex=True)
    # (axis, means, stds, colour of the std band, extra style of the mean line, legend label)
    panels = (
        (axes[0], y_recall, recall_std, 'm', {"color": 'red'}, "Mean Recall"),
        (axes[1], y_fscore, fscore_std, 'c', {}, "Mean F1-score"),
    )
    for axis, means, stds, band_color, mean_style, mean_label in panels:
        axis.plot(xs, np.array(means) + np.array(stds), linestyle=":", color=band_color)  # Upper SD
        axis.plot(xs, means, **mean_style)
        axis.plot(xs, np.array(means) - np.array(stds), linestyle=":", color=band_color)  # Lower SD
        peak = np.argmax(means)
        axis.plot(xs[peak], means[peak], "-o", color='black')
        axis.legend(["Upper std", mean_label, "Lower std"])
    fig.set_size_inches(18.5, 10.5)
    axes[0].title.set_text('Recall scores')
    axes[1].title.set_text('F1-scores')
    plt.suptitle(title, fontsize=14)
    plt.xticks(rotation=90)
    plt.show()
 # %%
 # Same list of sensor prefixes as defined earlier in this script, repeated here
 # so that this cell can be re-run on its own.
 sensors_features_groups = ["empatica_inter_beat_", "empatica_accelerometer_", "empatica_temperature_", "empatica_electrodermal_",
                        "phone_activity_", "phone_applications_", "phone_bluetooth_", "phone_battery_", "phone_calls_", "phone_light_",
                        "phone_locations_", "phone_messages", "phone_screen_"] # , "phone_speech_"]
 # sensors_features_groups = ["phone_", "empatica_", "demo_"]
 # %%
 # sensor_importance_scores = find_sensor_group_features_importance(model_input, big_groups_substr)
 # Greedy forward selection over the sensor groups, then split the result tuples
 # into parallel lists for plotting.
 sensor_groups_importance_scores = find_sensor_group_features_importance(model_input, sensors_features_groups)
 xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(sensor_groups_importance_scores)
 # %% [markdown]
 # ### Visualize sensors groups F1 and recall scores
 print(sensor_groups_importance_scores)
 plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
                                                    title="Sequential addition of sensors and its F1, and recall scores")
 # %%
 # Take the most important feature group and investigate it feature-by-feature
 # The first entry of the ranking is the group chosen in the first selection round.
 best_sensor_group = sensor_groups_importance_scores[0][0] # take the highest rated sensor group
 # Iterating a DataFrame yields its column names.
 best_sensor_features = [col for col in model_input if col.startswith(best_sensor_group)]
 # best_sensor_features_scores = find_sensor_group_features_importance(model_input, best_sensor_features)
 # xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(best_sensor_features_scores)
 # %% [markdown]
 # ### Visualize best sensor's F1 and recall scores
 # print(best_sensor_features_scores)
 # plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
 #                                                     title="Best sensor addition it's features with F1 and recall scores")
 # %%
 # This section iterates over all sensor groups and investigates sequential feature importance feature-by-feature
 # It also saves the sequence of scores for all sensors' features in excel file
 # One output row per sensor group; each cell of the sequence columns holds a whole list.
 # The two *_std columns are declared here as well, because they are written to the file.
 seq_columns = ["sensor_name", "feature_sequence", "recall", "f1_score", "recall_std", "f1_std"]
 sequence_rows = []
 for sensor_group in sensor_groups_importance_scores:
    sensor_name = sensor_group[0]
    # Rank the individual features of this sensor with the same greedy procedure
    current_sensor_features = [col for col in model_input if col.startswith(sensor_name)]
    current_sensor_features_scores = find_sensor_group_features_importance(model_input, current_sensor_features)
    xs, y_recall, y_fscore, recall_std, fscore_std = sort_tuples_to_lists(current_sensor_features_scores)
    # Rows are collected in a list and turned into a DataFrame once, instead of
    # growing a DataFrame with pd.concat on every iteration.
    sequence_rows.append({"sensor_name": sensor_name, "feature_sequence": xs, "recall": y_recall,
                          "f1_score": y_fscore, "recall_std": recall_std, "f1_std": fscore_std})
    plot_sequential_progress_of_feature_addition_scores(xs, y_recall, y_fscore, recall_std, fscore_std,
    title=f"Sequential addition of features for {sensor_name} and its F1, and recall scores")
 feature_sequence = pd.DataFrame(sequence_rows, columns=seq_columns)
 feature_sequence.to_excel("all_sensors_sequential_addition_scores.xlsx", index=False)
 # %%
 # TODO: method that reads data from the excel file, specified above, and then the method,
 # that selects only features that are max a thresh[%] below the max value (best for recall
 # possibly for f1). This method should additionally take threshold parameter.
 # %%
--- a/exploration/expl_stress_event.py
+++ b/exploration/expl_stress_event.py
@ -0,0 +1,166 @@
 # -*- coding: utf-8 -*-
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %%
 import os
 import sys
 import datetime
 import math
 import seaborn as sns
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import participants.query_db
 from features.esm import *
 from features.esm_JCQ import *
 from features.esm_SAM import *
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 # %%
 # Usernames are queried with a collection start date of 2020-08-01.
 # NOTE(review): presumably this selects participants whose data collection began on/after
 # that date - confirm against participants.query_db.get_usernames.
 participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
 )
 # get_esm_data comes from one of the star imports above (features.esm*).
 df_esm_inactive = get_esm_data(participants_inactive_usernames)
 # %%
 df_esm_preprocessed = preprocess_esm(df_esm_inactive)
 # %% [markdown]
 # Investigate stressfulness events
 # %%
 # Build one row per (device_id, esm_session) describing the reported stressful event:
 # session length/start/end, the event time and duration answers and the stressfulness target.
 # Session length is (max - min) timestamp divided by 1000 and rounded up,
 # i.e. seconds if the timestamps are in milliseconds.
 extracted_ers = df_esm_preprocessed.groupby(["device_id", "esm_session"])['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
 extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
 session_start_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].min().to_frame().rename(columns={'timestamp': 'session_start_timestamp'}) # questionnaire start timestamp
 session_end_timestamp = df_esm_preprocessed.groupby(['device_id', 'esm_session'])['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
 # Answers of questionnaire ids 90 and 91 are used as the event time and the event duration.
 se_time = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 90.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
 se_duration = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 91.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})
 # Make se_durations to the appropriate lengths
 # Extracted 3 targets that will be transfered in the csv file to the cleaning script. 
 # Bare expression: only displays the column names of the questionnaire-87 rows in the notebook.
 df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].columns
 # Answers of questionnaire id 87 are the stressfulness appraisal used as the target.
 se_stressfulness_event_tg = df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'appraisal_stressfulness_event'})
 # All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
 # Event time and duration are left-joined, so sessions without these answers keep NaN there.
 extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
                                .join(session_end_timestamp, on=['device_id', 'esm_session'], how='inner') \
                                .join(se_stressfulness_event_tg, on=['device_id', 'esm_session'], how='inner') \
                                .join(se_time, on=['device_id', 'esm_session'], how='left') \
                                .join(se_duration, on=['device_id', 'esm_session'], how='left') \
 # NOTE: the trailing backslash above continues onto this line, so this line must stay a comment (not code).
 # Filter-out the sessions that are not useful. Because of the ambiguity this excludes: 
 # (1) straw event times that are marked as "0 - I don't remember"
 # (2) straw event durations that are marked as "0 - I don't remember" 
 # Durations whose answer starts with "Removed " are excluded as well.
 extracted_ers = extracted_ers[(~extracted_ers.se_time.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("0 - ")) & (~extracted_ers.se_duration.astype(str).str.startswith("Removed "))]
 extracted_ers.reset_index(drop=True, inplace=True)
 # Add default duration in case if participant answered that no stressful event occured
 # Prepare data to fit the data structure in the CSV file ...
 # Add the event time as the start of the questionnaire if no stress event occured
 extracted_ers['se_time'] = extracted_ers['se_time'].fillna(extracted_ers['session_start_timestamp'])
 # Type could be an int (timestamp [ms]) which stays the same, and datetime str which is converted to timestamp in miliseconds 
 # NOTE(review): isinstance(x, int) is False for numpy integers and floats; such values would be
 # parsed by pd.to_datetime instead - confirm the filled-in timestamps really are Python ints here.
 extracted_ers['event_timestamp'] = extracted_ers['se_time'].apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000).astype('int64')
 extracted_ers['shift_direction'] = -1
 """>>>>> begin section (could be optimized) <<<<<"""
 # Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
 # is taken as end time of the segment. Else the user input duration is taken. 
 # The raw answer is preserved in temp_duration before se_duration is overwritten.
 extracted_ers['temp_duration'] = extracted_ers['se_duration']
 extracted_ers['se_duration'] = \
    np.where(
        extracted_ers['se_duration'].astype(str).str.startswith("1 - "),
        extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'], 
        extracted_ers['se_duration']
    )
 # This converts the rows of timestamps in miliseconds and the rows with datetime... to timestamp in seconds.
 # Ints are divided by 1000 and rounded up; every other value is parsed with pd.to_datetime and
 # only its hour and minute are used: (hour * 60 + minute) * 60.
 extracted_ers['se_duration'] = \
    extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else abs(pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60)
 # Check whether min se_duration is at least the same duration as the ioi. Filter-out the rest.
 """>>>>> end section <<<<<"""
 # %% [markdown]
 # Count negative values of duration
 # Row counts for the whole table and for the special duration cases.
 print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
 print("Count stressed:", extracted_ers[(~extracted_ers['se_duration'].isna())][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
 print("Count negative durations (invalid se_time user input):", extracted_ers[extracted_ers['se_duration'] < 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
 print("Count 0 durations:", extracted_ers[extracted_ers['se_duration'] == 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0])
 # Bare expressions below are only displayed in the notebook; they do not change any state.
 extracted_ers[extracted_ers['se_duration'] <= 0][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']].shape[0]
 extracted_ers[(~extracted_ers['se_duration'].isna()) & (extracted_ers['se_duration'] <= 0)][['se_duration', 'temp_duration', 'session_end_timestamp', 'event_timestamp']]
 ax = extracted_ers.hist(column='se_duration', bins='auto', grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)
 hist, bin_edges = np.histogram(extracted_ers['se_duration'].dropna())
 hist
 bin_edges
 # Keep only non-negative durations; rows with a NaN duration fail the comparison and are dropped too.
 extracted_ers = extracted_ers[extracted_ers['se_duration'] >= 0]
 # %%
 # bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000] #'neg', 'zero', '<20min', '2h', 'high_pos'  ..... right=False
 # Bin edges are in seconds (300 = 5 min, ..., 14400 = 4 h); intervals are closed on the right.
 bins = [-100000000, -0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
 extracted_ers['bins'], edges = pd.cut(extracted_ers.se_duration, bins=bins, labels=['neg', 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
 # Distribution of the binned durations; rows containing any NaN are left out of the plot.
 sns.displot(
    data=extracted_ers.dropna(),
    x="bins",
    binwidth=0.1,
 )
 # %% [markdown]
 # Bare expressions: displayed in the notebook only.
 extracted_ers[extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'] >= 0]
 extracted_ers['se_time'].value_counts()
 pd.set_option('display.max_rows', 100)
 # Here we are interested in how far the stressful-event times are from the end of the questionnaire.
 # .copy() so that the column assignments below act on an independent frame, not on a filtered slice.
 extracted_ers = extracted_ers[~extracted_ers['se_duration'].isna()].copy() # Remove no stress events
 # Both timestamps are in milliseconds, while the bin edges below are in seconds
 # (300 = '5min'), so the difference is converted to seconds before binning.
 extracted_ers['diff_se_time_session_end'] = (extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp']) / 1000
 print("Count all:", extracted_ers[['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
 # Print the number of rows, consistent with the other two counts.
 print("Count negative durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] < 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
 print("Count 0 durations:", extracted_ers[extracted_ers['diff_se_time_session_end'] == 0][['se_duration', 'temp_duration', 'session_start_timestamp', 'event_timestamp']].shape[0])
 extracted_ers[extracted_ers['diff_se_time_session_end'] < 0]['diff_se_time_session_end']
 # extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]
 # Bin edges in seconds; intervals are closed on the right, so 'neg_zero' covers (-100000, 0].
 bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000] # 'zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'
 extracted_ers['bins2'], edges = pd.cut(extracted_ers.diff_se_time_session_end, bins=bins2, labels=['neg_zero', '5min', '10min', '20min', '1h', '2h', '4h', 'more'], retbins=True, right=True) #['low', 'medium', 'high']
 extracted_ers['bins2']
 # Distribution of the binned distances; rows containing any NaN are left out of the plot.
 sns.displot(
    data=extracted_ers.dropna(),
    x="bins2",
    binwidth=0.1,
 )
 extracted_ers.shape
 extracted_ers.dropna().shape
 print()
 # %%
 # The numeric target is taken from the first character of the answer string.
 # NOTE(review): assumes every answer starts with a single digit - confirm against the questionnaire.
 extracted_ers['appraisal_stressfulness_event_num'] = extracted_ers['appraisal_stressfulness_event'].str[0].astype(int)
 # Series.corr defaults to the Pearson correlation.
 print("duration-target (corr):", extracted_ers['se_duration'].corr(extracted_ers['appraisal_stressfulness_event_num']))
 # %%
 # Explore groupby participants?
--- a/exploration/ml_pipeline.py
+++ b/exploration/ml_pipeline.py
@ -0,0 +1,49 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% 
 import sys, os
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 from machine_learning.cross_validation import CrossValidation
 from machine_learning.preprocessing import Preprocessing
 # %% 
 # Load the model input and index it by the segment-identifying columns.
 df = pd.read_csv("../data/stressfulness_event_with_speech/input_appraisal_stressfulness_event_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 df.set_index(index_columns, inplace=True)
 # NOTE(review): "logo" presumably stands for leave-one-group-out - confirm in machine_learning.cross_validation.
 cv = CrossValidation(data=df, cv_method="logo")
 categorical_columns = ["gender", "startlanguage", "mostcommonactivity", "homelabel"]
 # These two lists are initialised here but not used anywhere in this cell.
 interval_feature_list, other_feature_list = [], []
 print(df.columns.tolist())
 # Only the first split is processed (note the break): it is one-hot encoded and the
 # resulting train/test sets are left in the notebook namespace for inspection.
 for split in cv.get_splits():
    train_X, train_y, test_X, test_y = cv.get_train_test_sets(split)
    pre = Preprocessing(train_X, train_y, test_X, test_y)
    pre.one_hot_encode_train_and_test_sets(categorical_columns)
    train_X, train_y, test_X, test_y = pre.get_train_test_sets()
    break
 # %%
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@ -0,0 +1,462 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # %matplotlib inline
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
 from sklearn.model_selection import LeaveOneGroupOut, cross_validate, StratifiedKFold
 from sklearn.dummy import DummyClassifier
 from sklearn.impute import SimpleImputer
 from lightgbm import LGBMClassifier
 import xgboost as xg
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.helper
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # ## Set script's parameters
 #
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
 undersampling = False # (bool) If True this will train and test data on balanced dataset (using undersampling method)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 model_input = pd.read_csv("../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
 # model_input = model_input[model_input.columns.drop(list(model_input.filter(regex='empatica_temperature')))]
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)
 model_input['target'].value_counts()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # bins = [-10, 0, 10] # bins for z-scored targets
 bins = [-1, 0, 4] # bins for stressfulness (0-4) target
 model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
 model_input['target'].value_counts(), edges
 # model_input = model_input[model_input['target'] != "medium"]
 model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
 model_input['target'].value_counts()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # UnderSampling
 if undersampling:
    # Balance the two classes by randomly down-sampling whichever one is larger.
    # Sampling only the larger class avoids the ValueError that DataFrame.sample
    # raises when it is asked for more rows than the frame holds (no replacement).
    no_stress = model_input[model_input['target'] == 0]
    stress = model_input[model_input['target'] == 1]
    if len(no_stress) >= len(stress):
        no_stress = no_stress.sample(n=len(stress))
    else:
        stress = stress.sample(n=len(no_stress))
    model_input = pd.concat([stress, no_stress], axis=0)
 #   model_input_new = pd.DataFrame(columns=model_input.columns)
 #   for pid in model_input["pid"].unique():
 #     stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 1)]
 #     no_stress = model_input[(model_input["pid"] == pid) & (model_input['target'] == 0)]
 #     if (len(stress) == 0):
 #       continue
 #     if (len(no_stress) == 0):
 #       continue
 #     model_input_new = pd.concat([model_input_new, stress], axis=0)
 #     no_stress = no_stress.sample(n=min(len(stress), len(no_stress)))
 #     # In case there are more stress samples than no_stress, take all instances of no_stress.
 #     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
 #     model_input = model_input_new   
 #     model_input_new = pd.concat([model_input_new, no_stress], axis=0)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 if cv_method_str == 'half_logo':
    # Split every participant's rows (in their current order) into two groups,
    # "<pid>_1" and "<pid>_2": position / count lies in [0, 1), so adding 1 and
    # rounding yields 1 for the first half of the rows and 2 for the rest.
    model_input['pid_index'] = model_input.groupby('pid').cumcount()
    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
    # pid_count is only a helper for the split. Drop it together with the other
    # helper columns so that it does not end up among the features.
    helper_columns = ["target", "pid", "pid_index", "pid_count", "pid_half"]
    data_x = model_input.drop(helper_columns, axis=1)
    data_y = model_input["target"]
    data_groups = model_input["pid_half"]
 else:
    data_x = model_input.drop(["target", "pid"], axis=1)
    data_y = model_input["target"]
    data_groups = model_input["pid"]
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 categorical_features = data_x[categorical_feature_colnames].copy()
 mode_categorical_features = categorical_features.mode().iloc[0]
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
 train_x = pd.concat([numerical_features, categorical_features], axis=1)
 train_x.dtypes
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 cv_method = StratifiedKFold(n_splits=5, shuffle=True) # Defaults to 5 k-folds in cross_validate method
 if cv_method_str == 'logo' or cv_method_str == 'half_logo':
    cv_method = LeaveOneGroupOut()
    cv_method.get_n_splits(
        train_x,
        data_y,
        groups=data_groups,
    )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 # %% [markdown]
 # ### Baseline: Dummy Classifier (most frequent)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 dummy_class = DummyClassifier(strategy="most_frequent")
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 dummy_classifier = cross_validate(
    dummy_class,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(dummy_classifier['test_accuracy']))
 print("Acc (mean)", np.mean(dummy_classifier['test_accuracy']))
 print("Precision", np.mean(dummy_classifier['test_precision']))
 print("Recall", np.mean(dummy_classifier['test_recall']))
 print("F1", np.mean(dummy_classifier['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dummy_classifier['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dummy_classifier['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown] nteract={"transient": {"deleting": false}}
 # ### All models
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 final_scores = machine_learning.helper.run_all_classification_models(imputer.fit_transform(train_x), data_y, data_groups, cv_method)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
 # %%
 final_scores.index.name = "metric"
 final_scores = final_scores.set_index(["method", final_scores.index])
 final_scores.to_csv(f"../presentation/event_stressful_detection_{cv_method_str}.csv")
 # %% [markdown]
 # ### Logistic Regression
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 logistic_regression = linear_model.LogisticRegression()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 log_reg_scores = cross_validate(
    logistic_regression,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(log_reg_scores['test_accuracy']))
 print("Acc (mean)", np.mean(log_reg_scores['test_accuracy']))
 print("Precision", np.mean(log_reg_scores['test_precision']))
 print("Recall", np.mean(log_reg_scores['test_recall']))
 print("F1", np.mean(log_reg_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-log_reg_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(log_reg_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Support Vector Machine
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 svc = svm.SVC()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 svc_scores = cross_validate(
    svc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(svc_scores['test_accuracy']))
 print("Acc (mean)", np.mean(svc_scores['test_accuracy']))
 print("Precision", np.mean(svc_scores['test_precision']))
 print("Recall", np.mean(svc_scores['test_recall']))
 print("F1", np.mean(svc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-svc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(svc_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Gaussian Naive Bayes
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 gaussian_nb = naive_bayes.GaussianNB()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 gaussian_nb_scores = cross_validate(
    gaussian_nb,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(gaussian_nb_scores['test_accuracy']))
 print("Acc (mean)", np.mean(gaussian_nb_scores['test_accuracy']))
 print("Precision", np.mean(gaussian_nb_scores['test_precision']))
 print("Recall", np.mean(gaussian_nb_scores['test_recall']))
 print("F1", np.mean(gaussian_nb_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gaussian_nb_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Stochastic Gradient Descent Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 sgdc = linear_model.SGDClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 sgdc_scores = cross_validate(
    sgdc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(sgdc_scores['test_accuracy']))
 print("Acc (mean)", np.mean(sgdc_scores['test_accuracy']))
 print("Precision", np.mean(sgdc_scores['test_precision']))
 print("Recall", np.mean(sgdc_scores['test_recall']))
 print("F1", np.mean(sgdc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-sgdc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(sgdc_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### K-nearest neighbors
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 knn = neighbors.KNeighborsClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 knn_scores = cross_validate(
    knn,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(knn_scores['test_accuracy']))
 print("Acc (mean)", np.mean(knn_scores['test_accuracy']))
 print("Precision", np.mean(knn_scores['test_precision']))
 print("Recall", np.mean(knn_scores['test_recall']))
 print("F1", np.mean(knn_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-knn_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(knn_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Decision Tree
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 dtree = tree.DecisionTreeClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 dtree_scores = cross_validate(
    dtree,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(dtree_scores['test_accuracy']))
 print("Acc (mean)", np.mean(dtree_scores['test_accuracy']))
 print("Precision", np.mean(dtree_scores['test_precision']))
 print("Recall", np.mean(dtree_scores['test_recall']))
 print("F1", np.mean(dtree_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-dtree_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(dtree_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Random Forest Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 rfc = ensemble.RandomForestClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 rfc_scores = cross_validate(
    rfc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1'), 
    return_estimator=True
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(rfc_scores['test_accuracy']))
 print("Acc (mean)", np.mean(rfc_scores['test_accuracy']))
 print("Precision", np.mean(rfc_scores['test_precision']))
 print("Recall", np.mean(rfc_scores['test_recall']))
 print("F1", np.mean(rfc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-rfc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(rfc_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### Feature importance (RFC)
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 # Average feature_importances_ uniformly over the estimators returned by
 # cross_validate: one column per fold, one row per feature, then a row-wise mean.
 # (Repeatedly averaging the running mean with the next fold would weight the
 # last folds far more heavily than the first ones.)
 fold_importances = pd.DataFrame(
    {idx: estimator.feature_importances_ for idx, estimator in enumerate(rfc_scores['estimator'])},
    index=list(train_x.columns),
 )
 rfc_es_fimp = fold_importances.mean(axis=1).to_frame('importance')
 pd.set_option('display.max_rows', 100)
 rfc_es_fimp_sorted = rfc_es_fimp.sort_values('importance', ascending=False)
 print(rfc_es_fimp_sorted.head(30))
 # Bar charts of the 30 most and the 30 least important features.
 rfc_es_fimp_sorted.head(30).plot.bar()
 rfc_es_fimp_sorted.tail(30).plot.bar()
 train_x['empatica_temperature_cr_stdDev_X_SO_mean'].value_counts()
 # %% [markdown]
 # ### Gradient Boosting Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 gbc = ensemble.GradientBoostingClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 gbc_scores = cross_validate(
    gbc,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(gbc_scores['test_accuracy']))
 print("Acc (mean)", np.mean(gbc_scores['test_accuracy']))
 print("Precision", np.mean(gbc_scores['test_precision']))
 print("Recall", np.mean(gbc_scores['test_recall']))
 print("F1", np.mean(gbc_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-gbc_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(gbc_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### LGBM Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 lgbm = LGBMClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 lgbm_scores = cross_validate(
    lgbm,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(lgbm_scores['test_accuracy']))
 print("Acc (mean)", np.mean(lgbm_scores['test_accuracy']))
 print("Precision", np.mean(lgbm_scores['test_precision']))
 print("Recall", np.mean(lgbm_scores['test_recall']))
 print("F1", np.mean(lgbm_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-lgbm_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(lgbm_scores['test_accuracy'], n_sl)[:n_sl]))
 # %% [markdown]
 # ### XGBoost Classifier
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 xgb_classifier = xg.sklearn.XGBClassifier()
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 xgb_classifier_scores = cross_validate(
    xgb_classifier,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=cv_method,
    n_jobs=-1,
    error_score='raise',
    scoring=('accuracy', 'precision', 'recall', 'f1')
 )
 # %% jupyter={"source_hidden": false, "outputs_hidden": false}
 print("Acc (median)", np.nanmedian(xgb_classifier_scores['test_accuracy']))
 print("Acc (mean)", np.mean(xgb_classifier_scores['test_accuracy']))
 print("Precision", np.mean(xgb_classifier_scores['test_precision']))
 print("Recall", np.mean(xgb_classifier_scores['test_recall']))
 print("F1", np.mean(xgb_classifier_scores['test_f1']))
 print(f"Largest {n_sl} ACC:", np.sort(-np.partition(-xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl])[::-1])
 print(f"Smallest {n_sl} ACC:", np.sort(np.partition(xgb_classifier_scores['test_accuracy'], n_sl)[:n_sl]))
--- a/exploration/ml_pipeline_classification_with_clustering.py
+++ b/exploration/ml_pipeline_classification_with_clustering.py
@ -0,0 +1,184 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
 import datetime
 import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 from scipy import stats
 from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, cross_validate
 from sklearn.impute import SimpleImputer
 from sklearn.dummy import DummyClassifier
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
 import xgboost as xg 
 from sklearn.cluster import KMeans
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.labels
 import machine_learning.model
 from machine_learning.classification_models import ClassificationModels
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # ## Set script's parameters
 n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
 cv_method_str = 'logo' # logo, half_logo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
 model_input.columns[list(model_input.columns).index('age'):-1]
 lime_cols = [col for col in model_input if col.startswith('limesurvey')]
 lime_cols
 lime_col = 'limesurvey_demand_control_ratio_quartile'
 clust_col = lime_col
 model_input[clust_col].describe()
 # %% jupyter={"source_hidden": true}
 # Filter-out outlier rows by clust_col 
 #model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
 uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
 uniq = uniq.dropna()
 plt.bar(uniq['pid'], uniq[clust_col])
 # %% jupyter={"source_hidden": true}
 # Get clusters by cluster col & and merge the clusters to main df
 km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
 np.unique(km, return_counts=True)
 uniq['cluster'] = km
 uniq
 model_input = model_input.merge(uniq[['pid', 'cluster']])   
 # %% jupyter={"source_hidden": true}
 model_input.set_index(index_columns, inplace=True)
 # %% jupyter={"source_hidden": true}
 # Create dict with classification ml models
 cm = ClassificationModels()
 cmodels = cm.get_cmodels()
 # %% jupyter={"source_hidden": true}
 for k in range(n_clusters):
    # Evaluate every classifier on the rows that belong to cluster k only.
    model_input_subset = model_input[model_input["cluster"] == k].copy()
    bins = [-10, -1, 1, 10] # bins for z-scored targets
    model_input_subset.loc[:, 'target'] = \
        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=['low', 'medium', 'high'], right=False) #['low', 'medium', 'high']
    # Drop the middle bin and binarise what is left: "low" -> 0, anything else -> 1.
    # NOTE(review): a target outside [-10, 10) is NaN after pd.cut and therefore
    # also ends up in class 1 -- confirm that such values cannot occur.
    model_input_subset = model_input_subset[model_input_subset['target'] != "medium"]
    model_input_subset['target'] = model_input_subset['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
    if cv_method_str == 'half_logo':
        # Split every participant's rows into two groups, "<pid>_1" and "<pid>_2",
        # so that each half can be left out as a group of its own.
        model_input_subset['pid_index'] = model_input_subset.groupby('pid').cumcount()
        model_input_subset['pid_count'] = model_input_subset.groupby('pid')['pid'].transform('count')
        model_input_subset["pid_index"] = (model_input_subset['pid_index'] / model_input_subset['pid_count'] + 1).round()
        model_input_subset["pid_half"] = model_input_subset["pid"] + "_" +  model_input_subset["pid_index"].astype(int).astype(str)
        # pid_count is only a helper for the split; drop it with the other helper
        # columns so that it does not end up among the features.
        helper_columns = ["target", "pid", "pid_index", "pid_count", "pid_half"]
        data_x = model_input_subset.drop(helper_columns, axis=1)
        data_y = model_input_subset["target"]
        data_groups = model_input_subset["pid_half"]
    else:
        data_x = model_input_subset.drop(["target", "pid"], axis=1)
        data_y = model_input_subset["target"]
        data_groups = model_input_subset["pid"]
    # Treat categorical features
    categorical_feature_colnames = ["gender", "startlanguage"]
    additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
    categorical_feature_colnames += additional_categorical_features
    categorical_features = data_x[categorical_feature_colnames].copy()
    mode_categorical_features = categorical_features.mode().iloc[0]
    # fillna with mode
    categorical_features = categorical_features.fillna(mode_categorical_features)
    # one-hot encoding
    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)
    numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
    train_x = pd.concat([numerical_features, categorical_features], axis=1)
    # Establish cv method: leave-one-group-out for the (half_)logo settings,
    # shuffled stratified 5-fold otherwise.
    if cv_method_str == 'logo' or cv_method_str == 'half_logo':
        cv_method = LeaveOneGroupOut()
    else:
        cv_method = StratifiedKFold(n_splits=5, shuffle=True)
    # The imputed matrix is the same for every model, so compute it once per cluster.
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    imputed_x = imputer.fit_transform(train_x)
    for model_title, model in cmodels.items():
        classifier = cross_validate(
            model['model'],
            X=imputed_x,
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
            error_score='raise',
            scoring=('accuracy', 'precision', 'recall', 'f1')
        )
        mean_accuracy = np.mean(classifier['test_accuracy'])
        mean_precision = np.mean(classifier['test_precision'])
        mean_recall = np.mean(classifier['test_recall'])
        mean_f1 = np.mean(classifier['test_f1'])
        # Sorting (instead of np.partition) also works when there are no more
        # than n_sl folds; the slices then simply return every fold.
        sorted_accuracies = np.sort(classifier['test_accuracy'])
        print("\n-------------------------------------\n")
        print("Current cluster:", k, end="\n")
        print("Current model:", model_title, end="\n")
        print("Acc", mean_accuracy)
        print("Precision", mean_precision)
        print("Recall", mean_recall)
        print("F1", mean_f1)
        print(f"Largest {n_sl} ACC:", sorted_accuracies[::-1][:n_sl])
        print(f"Smallest {n_sl} ACC:", sorted_accuracies[:n_sl])
        # Accumulate the per-cluster means; they are summarised after the loop.
        cmodels[model_title]['metrics'][0] += mean_accuracy
        cmodels[model_title]['metrics'][1] += mean_precision
        cmodels[model_title]['metrics'][2] += mean_recall
        cmodels[model_title]['metrics'][3] += mean_f1
 # %% jupyter={"source_hidden": true}
 # Get overall results
 cm.get_total_models_scores(n_clusters=n_clusters)
--- a/exploration/ml_pipeline_classification_with_clustering_2_class.py
+++ b/exploration/ml_pipeline_classification_with_clustering_2_class.py
@ -0,0 +1,171 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from scipy import stats
 from sklearn.model_selection import train_test_split
 from sklearn.impute import SimpleImputer
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 from sklearn.cluster import KMeans
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 from machine_learning.classification_models import ClassificationModels
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # # Useful method
 def treat_categorical_features(input_set):
    """Fill and one-hot encode the categorical columns of ``input_set``.

    The categorical columns are ``gender`` and ``startlanguage`` plus every
    column whose name contains ``mostcommonactivity`` or ``homelabel``.
    Missing values in them are replaced by the column mode, then the columns
    are one-hot encoded with ``pd.get_dummies``.

    Parameters
    ----------
    input_set : pandas.DataFrame
        Must contain the columns ``gender`` and ``startlanguage``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``input_set`` followed by the encoded
        categorical columns. If ``input_set`` has no rows there is nothing to
        fill or encode, and the categorical columns are returned un-encoded
        instead of raising ``IndexError``.

    Raises
    ------
    KeyError
        If ``gender`` or ``startlanguage`` is missing from ``input_set``.
    """
    categorical_feature_colnames = ["gender", "startlanguage"]
    categorical_feature_colnames += [
        col for col in input_set.columns
        if "mostcommonactivity" in col or "homelabel" in col
    ]
    categorical_features = input_set[categorical_feature_colnames].copy()
    # mode() has no rows when there is no data at all, so only fill when a
    # mode actually exists.
    modes = categorical_features.mode()
    if not modes.empty:
        categorical_features = categorical_features.fillna(modes.iloc[0])
    # one-hot encoding
    categorical_features = categorical_features.astype("category")
    if not categorical_features.empty:
        categorical_features = pd.get_dummies(categorical_features)
    numerical_features = input_set.drop(categorical_feature_colnames, axis=1)
    return pd.concat([numerical_features, categorical_features], axis=1)
 # %% [markdown]
 # ## Set script's parameters
 n_clusters = 3 # Number of clusters (could be regarded as a hyperparameter)
 n_sl = 3 # Number of largest/smallest accuracies (of particular CV) outputs
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
 model_input.columns[list(model_input.columns).index('age'):-1]
 lime_cols = [col for col in model_input if col.startswith('limesurvey')]
 lime_cols
 lime_col = 'limesurvey_demand_control_ratio'
 clust_col = lime_col
 model_input[clust_col].describe()
 # %% jupyter={"source_hidden": true}
 # Filter-out outlier rows by clust_col 
 # Keep rows whose clust_col value lies within 3 population standard deviations
 # of the mean. The z-score is computed with pandas, which skips missing values;
 # scipy.stats.zscore would propagate a single NaN to every row and the filter
 # would then discard the whole table. Rows with a missing clust_col are dropped.
 clust_values = model_input[clust_col]
 clust_zscores = (clust_values - clust_values.mean()) / clust_values.std(ddof=0)
 model_input = model_input[(clust_zscores.abs() < 3).to_numpy()]
 # One row per distinct (clust_col, pid) pair; this is what gets clustered.
 uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
 plt.bar(uniq['pid'], uniq[clust_col])
 # %% jupyter={"source_hidden": true}
 # Get clusters by cluster col & and merge the clusters to main df
 km = KMeans(n_clusters=n_clusters).fit_predict(uniq.set_index('pid'))
 np.unique(km, return_counts=True)
 uniq['cluster'] = km
 uniq
 model_input = model_input.merge(uniq[['pid', 'cluster']])   
 # %% jupyter={"source_hidden": true}
 model_input.set_index(index_columns, inplace=True)
 # %% jupyter={"source_hidden": true}
 # Create dict with classification ml models
 cm = ClassificationModels()
 cmodels = cm.get_cmodels()
 # %% jupyter={"source_hidden": true}
 for k in range(n_clusters):
    model_input_subset = model_input[model_input["cluster"] == k].copy()
    # Rows strictly between the 15th and the 85th percentile of the numerical
    # target form the training set; the two tails are held out and split in half
    # into a validation and a test set. Only two classes, separated by a z-score of 0.
    model_input_subset['numerical_target'] = model_input_subset['target']
    bins = [-10, 0, 10] # bins for z-scored targets
    model_input_subset.loc[:, 'target'] = \
        pd.cut(model_input_subset.loc[:, 'target'], bins=bins, labels=[0, 1], right=True)
    p15 = np.percentile(model_input_subset['numerical_target'], 15)
    p85 = np.percentile(model_input_subset['numerical_target'], 85)
    # Treat categorical features
    model_input_subset = treat_categorical_features(model_input_subset)
    # Split to train, validate, and test subsets
    numerical_target = model_input_subset['numerical_target']
    train_set = model_input_subset[(numerical_target > p15) & (numerical_target < p85)].drop(['numerical_target'], axis=1)
    test_set = model_input_subset[(numerical_target <= p15) | (numerical_target >= p85)].drop(['numerical_target'], axis=1)
    train_x, train_y = train_set.drop(["target", "pid"], axis=1), train_set["target"]
    validate_x, test_x, validate_y, test_y = \
        train_test_split(test_set.drop(["target", "pid"], axis=1), test_set["target"], test_size=0.50, random_state=42)
    # Impute missing values. The medians are learned on the training set only and
    # re-used for the held-out sets, so no held-out statistics leak into the
    # features and all three matrices keep the same columns.
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    train_x = imputer.fit_transform(train_x)
    validate_x = imputer.transform(validate_x)
    # test_x / test_y are prepared here but not evaluated in this loop.
    test_x = imputer.transform(test_x)
    for model_title, model in cmodels.items():
        model['model'].fit(train_x, train_y)
        y_pred = model['model'].predict(validate_x)
        acc = accuracy_score(validate_y, y_pred)
        prec = precision_score(validate_y, y_pred)
        rec = recall_score(validate_y, y_pred)
        f1 = f1_score(validate_y, y_pred)
        print("\n-------------------------------------\n")
        print("Current cluster:", k, end="\n")
        print("Current model:", model_title, end="\n")
        print("Acc", acc)
        print("Precision", prec)
        print("Recall", rec)
        print("F1", f1)
        # Accumulate the per-cluster scores; they are summarised after the loop.
        cmodels[model_title]['metrics'][0] += acc
        cmodels[model_title]['metrics'][1] += prec
        cmodels[model_title]['metrics'][2] += rec
        cmodels[model_title]['metrics'][3] += f1
 # %% jupyter={"source_hidden": true}
 # Get overall results
 cm.get_total_models_scores(n_clusters=n_clusters)
--- a/exploration/ml_pipeline_regression.py
+++ b/exploration/ml_pipeline_regression.py
@ -0,0 +1,355 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
 import datetime
 import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 import yaml
 from pyprojroot import here
 from sklearn import linear_model, svm, kernel_ridge, gaussian_process
 from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate
 from sklearn.metrics import mean_squared_error, r2_score
 from sklearn.impute import SimpleImputer
 from sklearn.dummy import DummyRegressor
 import xgboost as xg
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.features_sensor
 import machine_learning.labels
 import machine_learning.model
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # ## JCQ job demand (mean)
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 #if "pid" in model_input.columns:
 #    index_columns.append("pid")
 model_input.set_index(index_columns, inplace=True)
 cv_method = 'half_logo' # logo, half_logo, 5kfold
 if cv_method == 'logo':
    # One group per participant.
    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
 else:
    # Split the rows of every participant (in file order) into a first and a second half
    # and use "<pid>_<half>" as the group label.
    model_input['pid_index'] = model_input.groupby('pid').cumcount()
    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
    # cumcount / count lies in [0, 1); adding 1 and rounding yields 1 or 2.
    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
    # Every helper column is removed from the features. 'pid_count' has to go as well:
    # it is constant per participant and would otherwise act as a participant
    # identifier that is shared between the two halves of the same person.
    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
 # %% jupyter={"source_hidden": true}
 # Columns treated as categorical: two fixed demographic columns plus every column
 # whose name contains "mostcommonactivity" or "homelabel".
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 # %% jupyter={"source_hidden": true}
 # Work on a copy so that data_x itself is not modified.
 categorical_features = data_x[categorical_feature_colnames].copy()
 # %% jupyter={"source_hidden": true}
 # Most frequent value of each categorical column (first row of the mode table).
 mode_categorical_features = categorical_features.mode().iloc[0]
 # %% jupyter={"source_hidden": true}
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # %% jupyter={"source_hidden": true}
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 # get_dummies is skipped when the categorical frame is empty.
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 # %% jupyter={"source_hidden": true}
 # Everything that is not categorical is used as a numerical feature as is.
 numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
 # %% jupyter={"source_hidden": true}
 # Final feature matrix: numerical columns followed by the one-hot encoded ones.
 train_x = pd.concat([numerical_features, categorical_features], axis=1)
 # %% jupyter={"source_hidden": true}
 # Displayed for inspection only.
 train_x.dtypes
 # %% jupyter={"source_hidden": true}
 # Leave-one-group-out splitter; the groups are the participants or participant halves
 # prepared above.
 logo = LeaveOneGroupOut()
 # Number of splits for these groups; the value is only shown as cell output.
 logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
 )
 # For any other cv_method the splitter is set to None, so that cross_validate
 # falls back to its default 5-fold cross-validation.
 if cv_method != 'logo' and cv_method != 'half_logo':
    logo = None
 # %% jupyter={"source_hidden": true}
 # Number of rows with a missing target (shown as cell output).
 sum(data_y.isna())
 # %% [markdown]
 # ### Baseline: Dummy Regression (mean)
 # %%
 # Baseline that always predicts the mean of the training targets. This statement needs
 # its own code cell; without the cell marker it belongs to the markdown cell above.
 dummy_regr = DummyRegressor(strategy="mean")
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 # %% jupyter={"source_hidden": true}
 # NOTE(review): the imputer is fitted on the complete feature matrix before the
 # cross-validation splits are made, so the column means also include the test folds.
 dummy_regressor = cross_validate(
    dummy_regr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 # Median over the folds; nanmedian keeps a single undefined (NaN) fold score from
 # turning the whole summary into NaN.
 print("Negative Mean Squared Error", np.nanmedian(dummy_regressor['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(dummy_regressor['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(dummy_regressor['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(dummy_regressor['test_r2']))
 # %% [markdown]
 # ### Linear Regression
 # %% jupyter={"source_hidden": true}
 # Every model below is evaluated in the same way: the mean-imputed feature matrix is
 # passed to cross_validate together with the group labels and the splitter chosen
 # above, four scores are collected per fold, and the median over the folds is printed.
 # nanmedian is used so that a fold with an undefined (NaN) score does not turn the
 # summary into NaN.
 # NOTE(review): the imputer is fitted on the complete feature matrix before the
 # splits are made, so the column means also include the test folds.
 lin_reg_rapids = linear_model.LinearRegression()
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 # %% jupyter={"source_hidden": true}
 lin_reg_scores = cross_validate(
    lin_reg_rapids,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(lin_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(lin_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(lin_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(lin_reg_scores['test_r2']))
 # %% [markdown]
 # ### XGBRegressor Linear Regression
 # %% jupyter={"source_hidden": true}
 # Gradient boosted trees with the squared error objective and 10 boosting rounds.
 xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 # %% jupyter={"source_hidden": true}
 xgb_reg_scores = cross_validate(
    xgb_r,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(xgb_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(xgb_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(xgb_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(xgb_reg_scores['test_r2']))
 # %% [markdown]
 # ### XGBRegressor Pseudo Huber Error Regression
 # %% jupyter={"source_hidden": true}
 # Same booster with the pseudo-Huber objective.
 xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 # %% jupyter={"source_hidden": true}
 xgb_psuedo_huber_reg_scores = cross_validate(
    xgb_psuedo_huber_r,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(xgb_psuedo_huber_reg_scores['test_r2']))
 # %% [markdown]
 # ### Ridge regression
 # %% jupyter={"source_hidden": true}
 # From here on the cells reuse the mean imputer created in the cell above.
 ridge_reg = linear_model.Ridge(alpha=.5)
 # %% tags=[] jupyter={"source_hidden": true}
 ridge_reg_scores = cross_validate(
    ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(ridge_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(ridge_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(ridge_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(ridge_reg_scores['test_r2']))
 # %% [markdown]
 # ### Lasso
 # %% jupyter={"source_hidden": true}
 lasso_reg = linear_model.Lasso(alpha=0.1)
 # %% jupyter={"source_hidden": true}
 lasso_reg_score = cross_validate(
    lasso_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(lasso_reg_score['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(lasso_reg_score['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(lasso_reg_score['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(lasso_reg_score['test_r2']))
 # %% [markdown]
 # ### Bayesian Ridge
 # %% jupyter={"source_hidden": true}
 bayesian_ridge_reg = linear_model.BayesianRidge()
 # %% jupyter={"source_hidden": true}
 bayesian_ridge_reg_score = cross_validate(
    bayesian_ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(bayesian_ridge_reg_score['test_r2']))
 # %% [markdown]
 # ### RANSAC (outlier robust regression)
 # %% jupyter={"source_hidden": true}
 ransac_reg = linear_model.RANSACRegressor()
 # %% jupyter={"source_hidden": true}
 ransac_reg_scores = cross_validate(
    ransac_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(ransac_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(ransac_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(ransac_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(ransac_reg_scores['test_r2']))
 # %% [markdown]
 # ### Support vector regression
 # %% jupyter={"source_hidden": true}
 svr = svm.SVR()
 # %% jupyter={"source_hidden": true}
 svr_scores = cross_validate(
    svr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(svr_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(svr_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(svr_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(svr_scores['test_r2']))
 # %% [markdown]
 # ### Kernel Ridge regression
 # %% jupyter={"source_hidden": true}
 kridge = kernel_ridge.KernelRidge()
 # %% jupyter={"source_hidden": true}
 kridge_scores = cross_validate(
    kridge,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(kridge_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(kridge_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(kridge_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(kridge_scores['test_r2']))
 # %% [markdown]
 # ### Gaussian Process Regression
 # %% jupyter={"source_hidden": true}
 gpr = gaussian_process.GaussianProcessRegressor()
 # %% jupyter={"source_hidden": true}
 gpr_scores = cross_validate(
    gpr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(gpr_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(gpr_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(gpr_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(gpr_scores['test_r2']))
 # %%
--- a/exploration/ml_pipeline_stress_event_cleaned.py
+++ b/exploration/ml_pipeline_stress_event_cleaned.py
@ -0,0 +1,359 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
 import datetime
 import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 import yaml
 from pyprojroot import here
 from sklearn import linear_model, svm, kernel_ridge, gaussian_process
 from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, cross_val_score, cross_validate
 from sklearn.metrics import mean_squared_error, r2_score
 from sklearn.impute import SimpleImputer
 from sklearn.dummy import DummyRegressor
 import xgboost as xg
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.features_sensor
 import machine_learning.labels
 import machine_learning.model
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # ## Appraisal of event stressfulness (mean)
 # %% jupyter={"source_hidden": true}
 model_input = pd.read_csv("../data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)
 cv_method = 'half_logo'
 if cv_method == 'logo':
    # One group per participant.
    data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
 else:
    # Split the rows of every participant (in file order) into a first and a second half
    # and use "<pid>_<half>" as the group label.
    model_input['pid_index'] = model_input.groupby('pid').cumcount()
    model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
    # cumcount / count lies in [0, 1); adding 1 and rounding yields 1 or 2.
    model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
    model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
    # Every helper column is removed from the features. 'pid_count' has to go as well:
    # it is constant per participant and would otherwise act as a participant
    # identifier that is shared between the two halves of the same person.
    data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
 # %% jupyter={"source_hidden": true}
 # Columns treated as categorical: two fixed demographic columns plus every column
 # whose name contains "mostcommonactivity" or "homelabel".
 categorical_feature_colnames = ["gender", "startlanguage"]
 additional_categorical_features = [col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col]
 categorical_feature_colnames += additional_categorical_features
 # %% jupyter={"source_hidden": true}
 # Work on a copy so that data_x itself is not modified.
 categorical_features = data_x[categorical_feature_colnames].copy()
 # %% jupyter={"source_hidden": true}
 # Most frequent value of each categorical column (first row of the mode table).
 mode_categorical_features = categorical_features.mode().iloc[0]
 # %% jupyter={"source_hidden": true}
 # fillna with mode
 categorical_features = categorical_features.fillna(mode_categorical_features)
 # %% jupyter={"source_hidden": true}
 # one-hot encoding
 categorical_features = categorical_features.apply(lambda col: col.astype("category"))
 # get_dummies is skipped when the categorical frame is empty.
 if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)
 # %% jupyter={"source_hidden": true}
 # Everything that is not categorical is used as a numerical feature as is.
 numerical_features = data_x.drop(categorical_feature_colnames, axis=1)
 # %% jupyter={"source_hidden": true}
 # Final feature matrix: numerical columns followed by the one-hot encoded ones.
 train_x = pd.concat([numerical_features, categorical_features], axis=1)
 # %% jupyter={"source_hidden": true}
 # Displayed for inspection only.
 train_x.dtypes
 # %% jupyter={"source_hidden": true}
 # Leave-one-group-out splitter; the groups are the participants or participant halves
 # prepared above.
 logo = LeaveOneGroupOut()
 # Number of splits for these groups; the value is only shown as cell output.
 logo.get_n_splits(
    train_x,
    data_y,
    groups=data_groups,
 )
 # For any other cv_method the splitter is set to None, so that cross_validate
 # falls back to its default 5-fold cross-validation.
 if cv_method != 'logo' and cv_method != 'half_logo':
    logo = None
 # %% jupyter={"source_hidden": true}
 # Number of rows with a missing target (shown as cell output).
 sum(data_y.isna())
 # %% [markdown]
 # ### Baseline: Dummy Regression (mean)
 # %%
 # Baseline that always predicts the mean of the training targets.
 dummy_regr = DummyRegressor(strategy="mean")
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 # %% jupyter={"source_hidden": true}
 # The baseline scores get their own variable, so that they are not overwritten by the
 # linear regression scores computed in the next section.
 # NOTE(review): the imputer is fitted on the complete feature matrix before the
 # cross-validation splits are made, so the column means also include the test folds.
 dummy_regressor = cross_validate(
    dummy_regr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 # Median over the folds, ignoring folds whose score is NaN.
 print("Negative Mean Squared Error", np.nanmedian(dummy_regressor['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(dummy_regressor['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(dummy_regressor['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(dummy_regressor['test_r2']))
 # %% [markdown]
 # ### Linear Regression
 # %% jupyter={"source_hidden": true}
 # Every model below is evaluated in the same way: the mean-imputed feature matrix is
 # passed to cross_validate together with the group labels and the splitter chosen
 # above, four scores are collected per fold, and the median over the folds (ignoring
 # NaN scores) is printed.
 # NOTE(review): the imputer is fitted on the complete feature matrix before the
 # splits are made, so the column means also include the test folds.
 lin_reg_rapids = linear_model.LinearRegression()
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 # %% jupyter={"source_hidden": true}
 lin_reg_scores = cross_validate(
    lin_reg_rapids,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(lin_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(lin_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(lin_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(lin_reg_scores['test_r2']))
 # %% [markdown]
 # ### XGBRegressor Linear Regression
 # %% jupyter={"source_hidden": true}
 # Gradient boosted trees with the squared error objective and 10 boosting rounds.
 xgb_r = xg.XGBRegressor(objective ='reg:squarederror', n_estimators = 10)
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 # %% jupyter={"source_hidden": true}
 xgb_reg_scores = cross_validate(
    xgb_r,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(xgb_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(xgb_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(xgb_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(xgb_reg_scores['test_r2']))
 # %% [markdown]
 # ### XGBRegressor Pseudo Huber Error Regression
 # %% jupyter={"source_hidden": true}
 # Same booster with the pseudo-Huber objective.
 xgb_psuedo_huber_r = xg.XGBRegressor(objective ='reg:pseudohubererror', n_estimators = 10)
 # %% jupyter={"source_hidden": true}
 imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
 # %% jupyter={"source_hidden": true}
 xgb_psuedo_huber_reg_scores = cross_validate(
    xgb_psuedo_huber_r,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(xgb_psuedo_huber_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(xgb_psuedo_huber_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(xgb_psuedo_huber_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(xgb_psuedo_huber_reg_scores['test_r2']))
 # %% [markdown]
 # ### Ridge regression
 # %% jupyter={"source_hidden": true}
 # From here on the cells reuse the mean imputer created in the cell above.
 ridge_reg = linear_model.Ridge(alpha=.5)
 # %% tags=[] jupyter={"source_hidden": true}
 ridge_reg_scores = cross_validate(
    ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(ridge_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(ridge_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(ridge_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(ridge_reg_scores['test_r2']))
 # %% [markdown]
 # ### Lasso
 # %% jupyter={"source_hidden": true}
 lasso_reg = linear_model.Lasso(alpha=0.1)
 # %% jupyter={"source_hidden": true}
 lasso_reg_score = cross_validate(
    lasso_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(lasso_reg_score['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(lasso_reg_score['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(lasso_reg_score['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(lasso_reg_score['test_r2']))
 # %% [markdown]
 # ### Bayesian Ridge
 # %% jupyter={"source_hidden": true}
 bayesian_ridge_reg = linear_model.BayesianRidge()
 # %% jupyter={"source_hidden": true}
 bayesian_ridge_reg_score = cross_validate(
    bayesian_ridge_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(bayesian_ridge_reg_score['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(bayesian_ridge_reg_score['test_r2']))
 # %% [markdown]
 # ### RANSAC (outlier robust regression)
 # %% jupyter={"source_hidden": true}
 ransac_reg = linear_model.RANSACRegressor()
 # %% jupyter={"source_hidden": true}
 ransac_reg_scores = cross_validate(
    ransac_reg,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(ransac_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(ransac_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(ransac_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(ransac_reg_scores['test_r2']))
 # %% [markdown]
 # ### Support vector regression
 # %% jupyter={"source_hidden": true}
 svr = svm.SVR()
 # %% jupyter={"source_hidden": true}
 svr_scores = cross_validate(
    svr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(svr_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(svr_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(svr_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(svr_scores['test_r2']))
 # %% [markdown]
 # ### Kernel Ridge regression
 # %% jupyter={"source_hidden": true}
 kridge = kernel_ridge.KernelRidge()
 # %% jupyter={"source_hidden": true}
 kridge_scores = cross_validate(
    kridge,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(kridge_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(kridge_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(kridge_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(kridge_scores['test_r2']))
 # %% [markdown]
 # ### Gaussian Process Regression
 # %% jupyter={"source_hidden": true}
 gpr = gaussian_process.GaussianProcessRegressor()
 # %% jupyter={"source_hidden": true}
 gpr_scores = cross_validate(
    gpr,
    X=imputer.fit_transform(train_x),
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.nanmedian(gpr_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.nanmedian(gpr_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.nanmedian(gpr_scores['test_neg_root_mean_squared_error']))
 print("R2", np.nanmedian(gpr_scores['test_r2']))
 # %%
--- a/exploration/tree_high_dpi.png
+++ b/exploration/tree_high_dpi.png
--- a/machine_learning/classification_models.py
+++ b/machine_learning/classification_models.py
@ -0,0 +1,71 @@
 from sklearn.dummy import DummyClassifier
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
 from lightgbm import LGBMClassifier
 import xgboost as xg 
 class ClassificationModels():
    """Registry of the classifiers that are compared in the classification pipelines.

    Each registry entry maps a model title to a dictionary holding the estimator under
    'model' and four running totals under 'metrics' (accuracy, precision, recall and F1,
    in that order).
    """

    def __init__(self):
        self.cmodels = self.init_classification_models()

    def get_cmodels(self):
        """Return the registry dictionary held by this instance."""
        return self.cmodels

    def init_classification_models(self):
        """Build a fresh registry with new estimators and zeroed metric totals.

        Returns:
            dict: Model title -> {'model': estimator, 'metrics': [0, 0, 0, 0]}.
        """
        titled_estimators = (
            ('dummy_classifier', DummyClassifier(strategy="most_frequent")),
            ('logistic_regression', linear_model.LogisticRegression(max_iter=1000)),
            ('support_vector_machine', svm.SVC()),
            ('gaussian_naive_bayes', naive_bayes.GaussianNB()),
            ('stochastic_gradient_descent_classifier', linear_model.SGDClassifier()),
            ('knn', neighbors.KNeighborsClassifier()),
            ('decision_tree', tree.DecisionTreeClassifier()),
            ('random_forest_classifier', ensemble.RandomForestClassifier()),
            ('gradient_boosting_classifier', ensemble.GradientBoostingClassifier()),
            ('lgbm_classifier', LGBMClassifier()),
            ('XGBoost_classifier', xg.sklearn.XGBClassifier()),
        )
        # Every entry gets its own metrics list, so the totals are never shared.
        return {
            title: {'model': estimator, 'metrics': [0, 0, 0, 0]}
            for title, estimator in titled_estimators
        }

    def get_total_models_scores(self, n_clusters=1):
        """Print the accumulated metrics of every model divided by n_clusters.

        Args:
            n_clusters (int, optional): Divisor applied to each accumulated total.
                Defaults to 1.
        """
        metric_labels = ("Acc:", "Precision:", "Recall:", "F1:")
        for model_title, entry in self.cmodels.items():
            print("\n************************************\n")
            print("Current model:", model_title, end="\n")
            for position, label in enumerate(metric_labels):
                print(label, entry['metrics'][position]/n_clusters)
--- a/machine_learning/cross_validation.py
+++ b/machine_learning/cross_validation.py
@ -0,0 +1,121 @@
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold
 class CrossValidation():
    """Creates cross-validation splits for a DataFrame with 'target' and 'pid' columns.

    Supported methods are 'logo' (leave one participant out), 'half_logo' (leave one
    half of a participant's rows out) and '5kfold' (stratified, shuffled 5-fold).
    """

    def __init__(self, data=None, cv_method='logo'):
        """Initializes the cv_method argument and optionally prepares the data if supplied.

        Args:
            data (DataFrame, optional): Pandas DataFrame with target, pid columns and other features as columns.
                Defaults to None.
            cv_method (str, optional): String of cross validation method; options are 'logo', 'half_logo' and '5kfold'.
                Defaults to 'logo'.

        Raises:
            ValueError: If cv_method is not one of the available methods.
        """
        # The data attributes always exist, so the getters and get_splits() behave
        # predictably before prepare_data() has been called.
        self.data = None
        self.X, self.y, self.groups = None, None, None
        self.initialize_cv_method(cv_method)
        if data is not None:
            self.prepare_data(data)

    def prepare_data(self, data):
        """Prepares the features, target and groups for the configured cv_method.

        For 'half_logo' the helper columns 'pid_index', 'pid_count' and 'pid_half' are
        added to `data` in place; the rows of every participant are split into a first
        and a second half, and 'pid_half' ("<pid>_<half>") is used as the group label.
        For the other methods the group label is 'pid'.

        Args:
            data (DataFrame): Pandas DataFrame with target, pid columns and other features as columns.
        """
        self.data = data
        if self.cv_method == "half_logo":
            data['pid_index'] = data.groupby('pid').cumcount()
            data['pid_count'] = data.groupby('pid')['pid'].transform('count')
            # cumcount / count lies in [0, 1); adding 1 and rounding yields 1 or 2.
            data["pid_index"] = (data['pid_index'] / data['pid_count'] + 1).round()
            data["pid_half"] = data["pid"] + "_" + data["pid_index"].astype(int).astype(str)
            # 'pid_count' is constant per participant; it must not stay among the
            # features, where it would identify the participant in both halves.
            non_feature_columns = ["target", "pid", "pid_index", "pid_count", "pid_half"]
            groups_column = "pid_half"
        else:
            non_feature_columns = ["target", "pid"]
            groups_column = "pid"
        self.X = data.drop(non_feature_columns, axis=1)
        self.y = data["target"]
        self.groups = data[groups_column]

    def initialize_cv_method(self, cv_method):
        """Validates cv_method and creates the matching splitter.

        Args:
            cv_method (str): The type of cross-validation method to use; options are 'logo', 'half_logo' and '5kfold'.

        Raises:
            ValueError: If cv_method is not in the list of available methods.
        """
        if cv_method not in ["logo", "half_logo", "5kfold"]:
            raise ValueError("Invalid cv_method input. Correct values are: 'logo', 'half_logo', '5kfold'")
        self.cv_method = cv_method
        if cv_method == "5kfold":
            # NOTE(review): shuffle=True without random_state gives different folds on every run.
            self.cv = StratifiedKFold(n_splits=5, shuffle=True)
        else:
            self.cv = LeaveOneGroupOut()

    def get_splits(self):
        """Returns a generator object containing the cross-validation splits.

        Raises:
            ValueError: If no data has been set or the data is empty.
        """
        data = getattr(self, "data", None)
        if data is None or data.empty:
            raise ValueError("No data has been set. Use 'prepare_data(data)' method to set the data.")
        return self.cv.split(self.X, self.y, self.groups)

    def get_data(self):
        """data getter

        Returns:
            Pandas DataFrame: The data passed to prepare_data, or None if no data has been set.
        """
        return self.data

    def get_x_y_groups(self):
        """X, y, and groups data getter

        Returns:
            tuple: The features (X), the target (y) and the group labels, in that order.
        """
        return self.X, self.y, self.groups

    def get_train_test_sets(self, split):
        """Gets the train and test sets that belong to one split.

        Args:
            split (tuple of indices): One iteration of the split generator (see get_splits method);
                split[0] holds the train row positions and split[1] the test row positions.

        Returns:
            tuple: train_X, train_y, test_X, test_y, selected by position from X and y.
        """
        train_rows, test_rows = split[0], split[1]
        return self.X.iloc[train_rows], self.y.iloc[train_rows], self.X.iloc[test_rows], self.y.iloc[test_rows]
--- a/machine_learning/feature_selection.py
+++ b/machine_learning/feature_selection.py
@ -0,0 +1,221 @@
 import os
 import sys
 import warnings
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from sklearn.feature_selection import SequentialFeatureSelector
 from sklearn.linear_model import Lasso
 from sklearn.model_selection import KFold, StratifiedKFold, cross_validate
 from sklearn.naive_bayes import GaussianNB
 """ Feature selection pipeline: a methods that can be used in the wrapper metod alongside other wrapper contents (hyperparameter tuning etc.).
 (1) Establish methods for each of the steps in feature selection protocol.
 (2) Ensure that above methods are given only a part of data and use appropriate random seeds - to later simulate use case in production. 
 (3) Implement a method which gives graphical exploration of (1) (a) and (b) steps of the feature selection.
 (4) Prepare a core method that can be fit into a wrapper (see sklearn wrapper methods) and integrates methods from (1)
 """
 class FeatureSelection:
    """Greedy, cross-validated feature selection helpers (work in progress).
    The selection routines are static methods: they only use the DataFrame passed
    to them, which must contain a 'target' and a 'pid' column next to the
    candidate feature columns. They can be called on the class or on an instance.
    """
    def __init__(self, X_train, X_test, y_train, y_test): # TODO: what about leave-one-subject-out CV?
        pass # TODO....
    @staticmethod
    def select_best_feature(df, features, method="remove", ml_type="classification", metric="recall", stored_features=None):
        """Pick the candidate whose removal/addition gives the best cross-validated score.
        Every feature in `features` is tried in turn with 5-fold shuffled CV. With
        method "remove" the model is fit on all `features` except the candidate; with
        method "add" it is fit on the candidate plus `stored_features`. Classification
        uses Gaussian naive Bayes with stratified folds; regression uses Lasso with
        plain K-fold, because stratified folds require a discrete target.
        Args:
            df (DataFrame): Input data; must contain the 'target' and 'pid' columns.
            features (list): Candidate features to select the best/worst from.
            method (str, optional): "remove" or "add". Defaults to "remove".
            ml_type (str, optional): "classification" or "regression". Defaults to "classification".
            metric (str, optional): Metric used to rank candidates. Defaults to "recall".
            stored_features (list, optional): For method "add", the features that were
                already added. Defaults to None, meaning no stored features.
        Raises:
            ValueError: If the metric is not recognised for the chosen ml_type.
            ValueError: If an unknown ml_type or method is chosen.
        Returns:
            tuple: name of the best feature, its mean score, and the standard deviation of
            its fold scores. All three are None when no candidate could be evaluated
            (empty `features`, or "remove" on a single remaining feature).
        """
        # Validate everything up front so bad arguments fail before any model is fit.
        if ml_type == "classification":
            if metric not in ('accuracy', 'precision', 'recall', 'f1'):
                raise ValueError("Classification metric not recognized. Please choose 'accuracy', 'precision', 'recall' and/or 'f1'")
        elif ml_type == "regression":
            if metric not in ('r2',):
                raise ValueError("Regression metric not recognized. Please choose 'r2'")
        else:
            raise ValueError("ML type not yet implemented!")
        if method not in ("remove", "add"):
            raise ValueError("Invalid method input. Correct values are: 'remove', 'add'")
        stored_features = [] if stored_features is None else list(stored_features)
        candidates_df = df.drop(columns=['target', 'pid'])
        y = df['target']
        best_feature = None
        best_metric_score = None
        best_metric_score_std = None
        for feat in features:
            if method == "remove":
                # All currently considered features but feat
                pred_features = [col for col in features if col != feat]
            else:
                # Feat with stored features
                pred_features = [feat] + stored_features
            if not pred_features:
                # Removing the only remaining feature leaves nothing to fit on.
                continue
            X = candidates_df[pred_features]
            # NOTE(review): folds are shuffled without a fixed seed, so candidates are
            # scored on different splits - consider a shared random_state.
            if ml_type == "classification":
                estimator = GaussianNB()
                cv = StratifiedKFold(n_splits=5, shuffle=True)
                scoring = ('accuracy', 'precision', 'recall', 'f1')
            else:
                estimator = Lasso()
                cv = KFold(n_splits=5, shuffle=True)
                # Must be a tuple: a bare string would produce the key 'test_score'.
                scoring = ('r2',)
            with warnings.catch_warnings():
                # The filter has to be active while the scores are computed.
                # NOTE(review): with n_jobs=-1 the folds may run in worker processes
                # where this filter might not apply - confirm.
                warnings.filterwarnings("ignore", message="Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.")
                model_cv = cross_validate(
                    estimator,
                    X=X,
                    y=y,
                    cv=cv,
                    n_jobs=-1,
                    scoring=scoring
                )
            fold_scores = model_cv['test_' + metric]
            mean_score = np.mean(fold_scores)
            if best_feature is None or mean_score > best_metric_score:
                best_feature = feat
                best_metric_score = mean_score
                best_metric_score_std = np.std(fold_scores)
        return best_feature, best_metric_score, best_metric_score_std
    @staticmethod
    def select_features(df, n_min=20, n_max=50, method="remove", n_not_improve=10):
        """Run backward elimination and locate candidate "tipping points" (work in progress).
        Features are eliminated one at a time (classification, recall) until a single
        feature is left. Each elimination step is recorded as a row
        (i, name, metric, metric_sd), where `i` is the number of features before the
        step and `name` is the feature removed in that step. Rows with
        n_min + 1 <= i <= n_max form the selection area in which three alternative
        tipping-point rules are evaluated. The final selection criterion is still TODO.
        Args:
            df (DataFrame): Input data; must contain the 'target' and 'pid' columns.
            n_min (int, optional): Lower bound of the selection range. Defaults to 20.
            n_max (int, optional): Upper bound of the selection range, capped at the
                number of features. Defaults to 50.
            method (str, optional): Only "remove" is implemented. Defaults to "remove".
            n_not_improve (int, optional): Number of consecutive non-improving steps
                tolerated by the third rule. Defaults to 10.
        Raises:
            ValueError: If there are fewer features than n_min, or n_max < n_min.
        Returns:
            tuple or None: (feature_importance_df, tipping_1, tipping_2, tipping_3) for
            method "remove", where each tipping point is an (i, name) index label or
            None. For any other method nothing is computed and None is returned.
        """
        n_features = df.shape[1] - 2 # -2 because pid and target are not considered
        n_max = min(n_max, n_features)
        if n_min > n_features:
            raise ValueError("The number of features in the dataframe must be at least n_min.")
        if n_max < n_min:
            raise ValueError("n_max parameter needs to be greater than or equal to n_min parameter.")
        if method != "remove":
            # TODO: forward selection ("add") is not implemented yet.
            return None
        features = df.columns.tolist()
        features.remove("pid")
        features.remove("target")
        feature_importance = []
        # Stop at one remaining feature: removing it would leave nothing to evaluate.
        while len(features) > 1:
            best_feature, best_metric_score, best_metric_score_std = \
                FeatureSelection.select_best_feature(df, features, method=method, ml_type="classification", metric="recall")
            if best_feature is None:
                break
            feature_importance.append((len(features), best_feature, best_metric_score, best_metric_score_std))
            features.remove(best_feature)
        feature_importance_df = pd.DataFrame(feature_importance, columns=['i', 'name', 'metric', 'metric_sd'])
        # Feature selection criterion within the n_min..n_max range.
        # E.g. choose the best score in this range, or keep going while the score in this
        # range increases by 0.0X and choose the feature set at which it stops increasing.
        # The feature set will be chosen from i=1 to i=index of the selected feature.
        # The "tipping point" feature must lie within the n_min..n_max range.
        in_range = (feature_importance_df["i"] >= n_min + 1) & (feature_importance_df["i"] <= n_max)
        selection_area = feature_importance_df[in_range].set_index(["i", "name"])
        metric_scores = selection_area["metric"].astype(float)
        differences = metric_scores.diff().dropna()
        # Rule 1: cumulative sum of the differences; take the index with the highest value.
        tipping_feature_indx_1 = None
        if not differences.empty:
            tipping_feature_indx_1 = differences.cumsum().idxmax()
        # Rule 2: very conservative - stop looking for a better alternative at the
        # first step that does not improve the score.
        tipping_feature_indx_2 = None
        for indx, delta in differences.items():
            if delta > 0:
                tipping_feature_indx_2 = indx
            else:
                break
        # Rule 3: give up to n_not_improve consecutive steps the chance to beat the best score so far.
        # TODO: maybe it would make sense to combine the diff, cumsum and scores columns ...
        tipping_feature_indx_3 = None
        best_score = None
        i_worse = 0
        for indx, score in metric_scores.items():
            if best_score is None or score > best_score:
                tipping_feature_indx_3 = indx
                best_score = score
                i_worse = 0
            else:
                i_worse += 1
            if i_worse >= n_not_improve:
                break
        return feature_importance_df, tipping_feature_indx_1, tipping_feature_indx_2, tipping_feature_indx_3
    @staticmethod
    def make_predictions_with_features(df, groups_substrings, include_group=True, with_cols=None, print_flag=False):
        """Not implemented yet."""
        pass
    @staticmethod
    def vizualize_feature_selection_process():
        """Not implemented yet."""
        pass
    @staticmethod
    def execute_feature_selection_step():
        """Not implemented yet."""
        pass
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@ -1,6 +1,15 @@
 from pathlib import Path
 from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble, naive_bayes, neighbors, tree
 from sklearn.model_selection import LeaveOneGroupOut, cross_validate, cross_validate
 from sklearn.metrics import mean_squared_error, r2_score
 from sklearn.impute import SimpleImputer
 from sklearn.dummy import DummyRegressor, DummyClassifier
 from xgboost import XGBRegressor, XGBClassifier
 import xgboost as xg
 import pandas as pd
 import numpy as np
 def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
@ -55,3 +64,396 @@ def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> P
    export_filename = filename_prefix + "_" + data_type + ".csv"
    full_path = folder / export_filename
    return full_path
 def insert_row(df, row):
    """Return a new DataFrame consisting of `df` followed by `row`.
    `row` is interpreted with the columns of `df`; the result gets a fresh
    0..n-1 index and `df` itself is not modified.
    """
    appended = pd.DataFrame([row], columns=df.columns)
    return pd.concat([df, appended], ignore_index=True)
 def prepare_regression_model_input(input_csv):
    """Load a model-input CSV and split it into features, target and groups.
    The four segment columns become the index. 'target' is the target and 'pid'
    the grouping variable. The categorical columns (three fixed names plus every
    column whose name contains "mostcommonactivity" or "homelabel") have their
    missing values filled with the column mode and are then one-hot encoded.
    Args:
        input_csv: Path (or anything accepted by `pd.read_csv`) of the input file.
    Returns:
        tuple: (features with encoded categoricals appended after the numerical
        columns, target Series, group Series).
    """
    segment_index = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
    frame = pd.read_csv(input_csv).set_index(segment_index)
    data_y = frame["target"]
    data_groups = frame["pid"]
    data_x = frame.drop(["target", "pid"], axis=1)
    categorical_feature_colnames = ["gender", "startlanguage", "limesurvey_demand_control_ratio_quartile"]
    categorical_feature_colnames.extend(
        col for col in data_x.columns if "mostcommonactivity" in col or "homelabel" in col
    )
    #TODO: check whether limesurvey_demand_control_ratio_quartile NaNs could be replaced meaningfully
    encoded = data_x[categorical_feature_colnames].copy()
    # fillna with mode (first mode of every column)
    encoded = encoded.fillna(encoded.mode().iloc[0])
    # one-hot encoding
    encoded = encoded.apply(lambda col: col.astype("category"))
    if not encoded.empty:
        encoded = pd.get_dummies(encoded)
    numerical = data_x.drop(categorical_feature_colnames, axis=1)
    return pd.concat([numerical, encoded], axis=1), data_y, data_groups
 def run_all_regression_models(input_csv):
    """Cross-validate a fixed set of regressors with leave-one-group-out CV.
    The data is loaded with `prepare_regression_model_input`. Every model is
    scored on R^2, negative MAE and negative RMSE; for each metric the maximum
    and the NaN-ignoring median over the folds are kept. The name of each model
    is printed once it has been evaluated (plus the median R^2 for the first two).
    Args:
        input_csv: Path of the model-input CSV.
    Returns:
        DataFrame: One row per model and metric (indexed by "test_<metric>"),
        holding the aggregated scores and the model identifier in "method".
    """
    # Prepare data
    data_x, data_y, data_groups = prepare_regression_model_input(input_csv)
    # Prepare cross validation
    logo = LeaveOneGroupOut()
    # NOTE(review): the return value is discarded; presumably a leftover sanity
    # check - confirm before removing.
    logo.get_n_splits(
        data_x,
        data_y,
        groups=data_groups,
    )
    metrics = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
    test_metrics = ["test_" + metric for metric in metrics]
    scores = pd.DataFrame(columns=["method", "max", "nanmedian"])
    # (printed label, value of the "method" column, estimator, print the median R^2?)
    candidates = [
        ("Dummy model:", "dummy", DummyRegressor(strategy="mean"), True),
        ("Linear regression:", "linear_reg", linear_model.LinearRegression(), True),
        ("Ridge regression", "ridge_reg", linear_model.Ridge(alpha=.5), False),
        ("Lasso regression", "lasso_reg", linear_model.Lasso(alpha=0.1), False),
        ("Bayesian Ridge", "bayesian_ridge", linear_model.BayesianRidge(), False),
        ("RANSAC (outlier robust regression)", "RANSAC", linear_model.RANSACRegressor(), False),
        ("Support vector regression", "SVR", svm.SVR(), False),
        ("Kernel Ridge regression", "kernel_ridge", kernel_ridge.KernelRidge(), False),
        ("Gaussian Process Regression", "gaussian_proc", gaussian_process.GaussianProcessRegressor(), False),
        ("Random Forest Regression", "random_forest", ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1), False),
        ("XGBoost Regressor", "XGBoost", XGBRegressor(), False),
        ("ADA Boost Regressor", "ADA_boost", ensemble.AdaBoostRegressor(), False),
    ]
    # Validate models
    for label, method_name, estimator, show_r2 in candidates:
        cv_scores = cross_validate(
            estimator,
            X=data_x,
            y=data_y,
            groups=data_groups,
            cv=logo,
            n_jobs=-1,
            scoring=metrics
        )
        print(label)
        if show_r2:
            print("R^2: ", np.nanmedian(cv_scores['test_r2']))
        summary = pd.DataFrame(cv_scores)[test_metrics]
        summary = summary.agg(['max', np.nanmedian]).transpose()
        summary["method"] = method_name
        scores = pd.concat([scores, summary])
    return scores
 def run_all_classification_models(data_x, data_y, data_groups, cv_method):
    """Cross-validate a fixed set of classifiers and collect their scores.
    Every model is scored on accuracy, average precision, recall and F1; for each
    metric the maximum and the mean over the folds are kept. The name of each
    model is printed once it has been evaluated.
    Args:
        data_x: Feature matrix passed to `cross_validate` as X.
        data_y: Target passed to `cross_validate` as y.
        data_groups: Group labels passed to `cross_validate` as groups.
        cv_method: Cross-validation splitter (or any value accepted as `cv`).
    Returns:
        DataFrame: One row per model and metric (indexed by "test_<metric>"),
        holding the aggregated scores and the model identifier in "method".
    """
    metrics = ['accuracy', 'average_precision', 'recall', 'f1']
    test_metrics = ["test_" + metric for metric in metrics]
    scores = pd.DataFrame(columns=["method", "max", "mean"])
    # (printed label, value of the "method" column, estimator, extra cross_validate options)
    candidates = [
        # Only the dummy baseline is run with error_score='raise'.
        ("Dummy", "Dummy", DummyClassifier(strategy="most_frequent"), {"error_score": 'raise'}),
        ("Logistic regression", "logistic_reg", linear_model.LogisticRegression(), {}),
        ("Support Vector Machine", "svc", svm.SVC(), {}),
        ("Gaussian Naive Bayes", "gaussian_naive_bayes", naive_bayes.GaussianNB(), {}),
        ("Stochastic Gradient Descent", "stochastic_gradient_descent", linear_model.SGDClassifier(), {}),
        ("Random Forest", "random_forest", ensemble.RandomForestClassifier(), {}),
        ("XGBoost", "xgboost", XGBClassifier(), {}),
    ]
    for label, method_name, estimator, extra_options in candidates:
        cv_scores = cross_validate(
            estimator,
            X=data_x,
            y=data_y,
            groups=data_groups,
            cv=cv_method,
            n_jobs=-1,
            scoring=metrics,
            **extra_options
        )
        print(label)
        summary = pd.DataFrame(cv_scores)[test_metrics]
        summary = summary.agg(['max', 'mean']).transpose()
        summary["method"] = method_name
        scores = pd.concat([scores, summary])
    return scores
--- a/machine_learning/preprocessing.py
+++ b/machine_learning/preprocessing.py
@ -0,0 +1,126 @@
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 class Preprocessing:
    """Preprocessing steps usable inside a single CV iteration or on a whole dataset.
    The steps are blind to the test data: the statistics used to fill missing
    values (modes, means, medians) and the set of one-hot columns are derived
    from the train set and then applied to both the train and the test set.
    """
    def __init__(self, train_X, train_y, test_X, test_y):
        self.train_X = train_X
        self.train_y = train_y
        self.test_X = test_X
        self.test_y = test_y
    def one_hot_encoder(self, categorical_features, numerical_features, mode):
        """One-hot encode the categorical columns and append them to the numerical ones.
        Missing categorical values are first filled with `mode`; every column is then
        cast to the category dtype and expanded with `pd.get_dummies`.
        Args:
            categorical_features (DataFrame): DataFrame including only categorical columns.
            numerical_features (DataFrame): DataFrame including only numerical columns.
            mode (Series or scalar): Fill value(s) for missing categorical values, as
                accepted by `DataFrame.fillna` (e.g. the per-column modes). TODO: check mode results
        Returns:
            DataFrame: The numerical columns followed by the one-hot encoded columns.
        """
        # Fill missing values with the given mode
        categorical_features = categorical_features.fillna(mode)
        # one-hot encoding
        categorical_features = categorical_features.apply(lambda col: col.astype("category"))
        if not categorical_features.empty:
            categorical_features = pd.get_dummies(categorical_features)
        return pd.concat([numerical_features, categorical_features], axis=1)
    def one_hot_encode_train_and_test_sets(self, categorical_columns=("gender", "startlanguage", "mostcommonactivity", "homelabel")):
        """One-hot encode the categorical columns of the train and the test set.
        Only names from `categorical_columns` that are present in the train set are
        used (exact name match). Missing values in both sets are filled with the
        train-set mode. After encoding, the test set is aligned to the columns of
        the train set: categories that occur only in the test set are dropped and
        categories missing from it are added as all-zero columns, so that both sets
        expose exactly the same features in the same order.
        Args:
            categorical_columns (iterable, optional): Names of categorical columns in the dataset.
                Defaults to ("gender", "startlanguage", "mostcommonactivity", "homelabel").
        """
        # NOTE(review): this is an exact match; columns that merely contain e.g.
        # "mostcommonactivity" in their name are not picked up - confirm intent.
        categorical_columns = [col for col in self.train_X.columns if col in categorical_columns]
        if not categorical_columns:
            # Nothing to encode; mode().iloc[0] would fail on an empty selection.
            return
        # For train set
        train_X_categorical_features = self.train_X[categorical_columns].copy()
        train_X_numerical_features = self.train_X.drop(categorical_columns, axis=1)
        mode_train_X_categorical_features = train_X_categorical_features.mode().iloc[0]
        self.train_X = self.one_hot_encoder(train_X_categorical_features, train_X_numerical_features, mode_train_X_categorical_features)
        # For test set (filled with the train mode, then aligned to the train columns)
        test_X_categorical_features = self.test_X[categorical_columns].copy()
        test_X_numerical_features = self.test_X.drop(categorical_columns, axis=1)
        encoded_test_X = self.one_hot_encoder(test_X_categorical_features, test_X_numerical_features, mode_train_X_categorical_features)
        self.test_X = encoded_test_X.reindex(columns=self.train_X.columns, fill_value=0)
    def _impute_with_train_statistic(self, columns, statistic, groupby_feature):
        """Fill NaNs in `columns` of both sets with a train-set statistic ("mean" or "median")."""
        columns = [col for col in columns if col != groupby_feature]
        if not columns:
            return
        # Both statistics are computed on the train set before anything is filled.
        overall = self.train_X[columns].agg(statistic)
        per_group = None
        if groupby_feature:
            per_group = self.train_X.groupby(groupby_feature)[columns].agg(statistic)
        for attribute in ("train_X", "test_X"):
            # Work on a copy so the frames handed in by the caller are not modified.
            frame = getattr(self, attribute).copy()
            if per_group is not None:
                # Look up the statistics of every row's group; groups without a
                # train statistic yield NaN and fall through to the overall fill.
                group_fill = per_group.reindex(frame[groupby_feature].to_numpy())
                group_fill.index = frame.index
                frame[columns] = frame[columns].fillna(group_fill)
            frame[columns] = frame[columns].fillna(overall)
            setattr(self, attribute, frame)
    def imputer(self, interval_feature_list, other_feature_list, groupby_feature="pid"):
        """Impute missing values in the train and test sets from train-set statistics.
        Interval features are filled with the mean, other features with the median.
        When `groupby_feature` is given, the statistic of the row's group (computed on
        the train set) is used first; values still missing afterwards - e.g. for
        groups that do not occur in the train set - are filled with the statistic of
        the whole train set. `self.train_X` and `self.test_X` are replaced by
        imputed copies.
        Args:
            interval_feature_list (list): Columns to fill with the mean.
            other_feature_list (list): Columns to fill with the median.
            groupby_feature (str, optional): Column to group by; a falsy value (None, "")
                disables grouping. Defaults to "pid".
        Raises:
            KeyError: If `groupby_feature` is given but is not a column of the train/test set.
        """
        # TODO: TESTING
        # Interval numerical features
        self._impute_with_train_statistic(interval_feature_list, "mean", groupby_feature)
        # Other features
        self._impute_with_train_statistic(other_feature_list, "median", groupby_feature)
    def get_train_test_sets(self):
        """Train and test sets getter
        Returns:
            tuple of Pandas DataFrames: Gets train test sets in traditional sklearn format.
        """
        return self.train_X, self.train_y, self.test_X, self.test_y
--- a/machine_learning/prox_comm_PANAS_nb.ipynb
+++ b/machine_learning/prox_comm_PANAS_nb.ipynb
--- a/presentation/ApplicationCategories.R
+++ b/presentation/ApplicationCategories.R
@ -0,0 +1,51 @@
 # Load the app_categories table from the study database, inspect the
 # Play Store genres and correct a few known mislabelled values.
 library(conflicted)
 library(yaml)
 # NOTE(review): RPostgreSQL is attached here, but the connection below is made
 # through RPostgres:: - confirm which driver package is intended.
 library(RPostgreSQL)
 library(tidyverse)
 # resolve name clashes in favour of the dplyr versions
 conflicts_prefer(
  dplyr::filter,
  dplyr::lag
 )
 library(magrittr)
 # read the password from file
 credentials <- yaml.load_file("../rapids/credentials.yaml")
 pw <- credentials$PSQL_STRAW$password
 # load the PostgreSQL driver
 drv <- RPostgres::Postgres()
 # creates a connection to the postgres database
 # note that "con" will be used later in each connection to the database
 con <- RPostgres::dbConnect(drv,
  dbname = "staw",
  host = "eol.ijs.si", port = 5432,
  user = "staw_db", password = pw
 )
 rm(pw, credentials) # removes the password
 # check that the app_categories table exists
 dbExistsTable(con, "app_categories")
 # pull the whole table into a local data frame
 df_app_categories <- tbl(con, "app_categories") %>%
  collect()
 head(df_app_categories)
 # frequency of each Play Store genre
 table(df_app_categories$play_store_genre)
 # Correct some mistakes: collapse the duplicated "Education" labels and
 # relabel "not_found" as "System"; all other values are kept as they are
 df_app_categories %<>% mutate(
  play_store_genre = {
    function(x) {
      case_when(
        x == "Education,Education" ~ "Education",
        x == "EducationEducation" ~ "Education",
        x == "not_found" ~ "System",
        .default = x
      )
    }
  }(play_store_genre)
 )
 # NOTE(review): the corrected data frame is not written back or saved in this script
 dbDisconnect(con)
--- a/presentation/StressfulEvents.Rmd
+++ b/presentation/StressfulEvents.Rmd
@ -0,0 +1,103 @@
 ---
 title: "Stressful event detection"
 output: html_notebook
 ---
 ```{r chunk_options, include=FALSE}
 knitr::opts_chunk$set(
  comment = "#>", echo = FALSE, fig.width = 6
 )
 ```
 ```{r libraries, include=FALSE}
 library(knitr)
 library(kableExtra)
 library(stringr)
 library(RColorBrewer)
 library(magrittr)
 library(tidyverse)
 ```
 ```{r fig_setup, include=FALSE}
 accent <- RColorBrewer::brewer.pal(7, "Accent")
 ```
 ```{r read_data, include=FALSE}
 podatki <- read_csv("E:/STRAWresults/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv")
 podatki %<>% mutate(pid = as_factor(pid))
 ```
 # Event descriptions
 Participants were asked "Was there a particular event that created tension in you?" with the following options:
 - 0 - No	
 - 1 - Yes, slightly	
 - 2 - Yes, moderately	
 - 3 - Yes, considerably	
 - 4 - Yes, extremely
 If they answered anything but "No", they were also asked about the event's perceived threat (e.g. "Did this event make you feel anxious?") and challenge (e.g. "How eager are you to tackle this event?"). 
 We only consider general "stressfulness" in this presentation.
 Most of the time, nothing stressful happened:
 ```{r target_table}
 kable(table(podatki$target), col.names = c("stressfulness", "frequency")) %>% 
  kable_styling(full_width = FALSE)
 ```
 Most participants had somewhere between 0 and 10 stressful events.
 ```{r target_distribution}
 podatki %>% 
  group_by(pid) %>% 
  summarise(no_of_events = sum(target > 0)) %>% 
  ggplot(aes(no_of_events)) +
  geom_histogram(binwidth = 1, fill = accent[1]) +
  coord_cartesian(expand = FALSE) +
  labs(x = "Number of events per participant") +
  theme_classic()
 ```
 When a stressful event occurred, participants mostly perceived it as slightly to moderately stressful on average.
 ```{r mean_stressfulness_distribution}
 podatki %>% 
  filter(target > 0) %>% 
  group_by(pid) %>% 
  summarise(mean_stressfulness = mean(target)) %>% 
  ggplot(aes(mean_stressfulness)) +
  geom_histogram(binwidth = 0.1, fill = accent[1]) +
  coord_cartesian(expand = FALSE) +
  labs(x = "Mean stressfulness per participant") +
  theme_classic()
 ```
 # Problem description
 We are trying to predict whether a stressful event occurred, i.e. stressfulness > 0, or not (stressfulness == 0).
 First, set up a leave-one-subject-out validation and use original distribution of the class variable.
 For this, the majority classifier has a mean accuracy of 0.85 (and median 0.90), while the F1-score, precision and recall are all 0.
 We also have an option to validate the results differently, such as with "half-loso", i.e. leaving half of the subject's data in the training set and using only the other half for testing, or with k-fold cross-validation.
 Additionally, we can undersample the majority class to balance the dataset.
 # Results
 ## Leave one subject out, original distribution
 ```{r event_detection}
 scores <- read_csv("event_stressful_detection_loso.csv", col_types = "ffdd")
 scores_wide <- scores %>% 
  select(!max) %>% 
  pivot_wider(names_from = metric, 
              names_sep = "_",
              values_from = mean) %>% 
  rename_all(~str_replace(.,"^test_",""))
 kable(scores_wide, digits = 2) %>% 
  column_spec(4, color = 'white', background = 'black') %>% 
  kable_styling(full_width = TRUE)
 ```
--- a/presentation/classification.py
+++ b/presentation/classification.py
@ -0,0 +1,127 @@
 # ---
 # jupyter:
 #   jupytext:
 #     formats: ipynb,py:percent
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %% jupyter={"source_hidden": true}
 # %matplotlib inline
 import datetime
 import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble 
 from sklearn.model_selection import LeaveOneGroupOut, cross_validate
 from sklearn.dummy import DummyClassifier
 from sklearn.impute import SimpleImputer
 import xgboost as xg
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 from pathlib import Path
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.labels
 import machine_learning.model
 from machine_learning.helper import run_all_classification_models
 # %% [markdown]
 # # RAPIDS models
 # %% [markdown]
 # ## Set script's parameters
 #
 # %%
 cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
 # %% jupyter={"source_hidden": true}
 filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
 model_input = pd.read_csv(filename)
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)
 model_input['target'].value_counts()
 # %% jupyter={"source_hidden": true}
 # Discretise the target into three ordered classes and keep only the two extremes.
 bins = [-10, -1, 1, 10] # bins for z-scored targets
 # bins = [0, 1, 4] # bins for stressfulness (1-4) target
 model_input['target'], edges = pd.cut(
     model_input.target,
     bins=bins,
     labels=['low', 'medium', 'high'],
     retbins=True,
     right=True,
 )
 model_input['target'].value_counts(), edges
 # Drop the "medium" class. The explicit copy makes the filtered frame independent
 # of the original, so the column assignment below does not write into a slice
 # (this is what triggers pandas' SettingWithCopyWarning).
 model_input = model_input[model_input['target'] != "medium"].copy()
 # Binary encoding: "low" -> 0, every other remaining label -> 1.
 # NOTE(review): pd.cut yields NaN for values outside `bins`; confirm that no such
 # rows exist, because anything that is not "low" is encoded as 1 here.
 model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
 model_input['target'].value_counts()
 if cv_method_str == 'halflogo':
     # Split every participant's rows into a first and a second half, so that each
     # half forms its own group for the cross-validation.
     model_input['pid_index'] = model_input.groupby('pid').cumcount()
     model_input['pid_count'] = model_input.groupby('pid')['pid'].transform('count')
     # index / count lies in [0, 1); adding 1 and rounding yields 1 or 2.
     model_input["pid_index"] = (model_input['pid_index'] / model_input['pid_count'] + 1).round()
     model_input["pid_half"] = model_input["pid"] + "_" +  model_input["pid_index"].astype(int).astype(str)
     # Every helper column has to be removed from the features, including
     # "pid_count": it is constant per participant and would otherwise be fed
     # to the models as a feature.
     data_x, data_y, data_groups = model_input.drop(["target", "pid", "pid_index", "pid_count", "pid_half"], axis=1), model_input["target"], model_input["pid_half"]
 else:
     # Whole participants are the groups.
     data_x, data_y, data_groups = model_input.drop(["target", "pid"], axis=1), model_input["target"], model_input["pid"]
 # %% jupyter={"source_hidden": true}
 # Categorical inputs: two fixed columns plus every column whose name contains
 # "mostcommonactivity" or "homelabel".
 additional_categorical_features = [
     col
     for col in data_x.columns
     if any(marker in col for marker in ("mostcommonactivity", "homelabel"))
 ]
 categorical_feature_colnames = ["gender", "startlanguage"] + additional_categorical_features
 # Fill gaps with each column's most frequent value, then mark the columns as categorical.
 categorical_features = data_x[categorical_feature_colnames].copy()
 mode_categorical_features = categorical_features.mode().iloc[0]
 categorical_features = categorical_features.fillna(mode_categorical_features).apply(
     lambda column: column.astype("category")
 )
 # One-hot encode (skipped when the frame is empty).
 if not categorical_features.empty:
     categorical_features = pd.get_dummies(categorical_features)
 # All remaining columns are treated as numerical and passed through unchanged.
 numerical_features = data_x.drop(columns=categorical_feature_colnames)
 train_x = pd.concat([numerical_features, categorical_features], axis=1)
 # %% jupyter={"source_hidden": true}
 cv_method = None # Defaults to 5 k-folds in cross_validate method
 # Both grouped schemes use LeaveOneGroupOut; they differ only in the groups that
 # were built for them. The parameter cell spells the second option 'halflogo';
 # 'half_logo' is still accepted so that existing settings keep working.
 if cv_method_str in ('logo', 'halflogo', 'half_logo'):
     cv_method = LeaveOneGroupOut()
     cv_method.get_n_splits(
         train_x,
         data_y,
         groups=data_groups,
     )
 # %% jupyter={"source_hidden": true}
 # Median imputation for the remaining missing values.
 imputer = SimpleImputer(strategy='median', missing_values=np.nan)
 # %%
 # NOTE(review): the imputer is fitted on all rows before the data reach the
 # cross-validation, so the medians also see the held-out rows.
 final_scores = run_all_classification_models(
     imputer.fit_transform(train_x),
     data_y,
     data_groups,
     cv_method,
 )
 # %%
 # Index the scores by (method, metric) and store them next to the notebook.
 final_scores = final_scores.rename_axis("metric")
 final_scores = final_scores.set_index(["method", final_scores.index])
 final_scores.to_csv("event_stressfulness_lmh_lh_scores.csv")
--- a/presentation/event_stressful_detection_5fold.csv
+++ b/presentation/event_stressful_detection_5fold.csv
@ -0,0 +1,29 @@
 method,metric,max,mean
 Dummy,test_accuracy,0.8557046979865772,0.8548446932649828
 Dummy,test_average_precision,0.1457286432160804,0.14515530673501736
 Dummy,test_recall,0.0,0.0
 Dummy,test_f1,0.0,0.0
 logistic_reg,test_accuracy,0.8640939597315436,0.8504895843872606
 logistic_reg,test_average_precision,0.44363425265068757,0.37511495347389834
 logistic_reg,test_recall,0.3023255813953488,0.24266238973536486
 logistic_reg,test_f1,0.3909774436090226,0.318943511424051
 svc,test_accuracy,0.8557046979865772,0.8548446932649828
 svc,test_average_precision,0.44514416839823046,0.4068200938341621
 svc,test_recall,0.0,0.0
 svc,test_f1,0.0,0.0
 gaussian_naive_bayes,test_accuracy,0.7684563758389261,0.7479123806954234
 gaussian_naive_bayes,test_average_precision,0.2534828030085334,0.23379392278901853
 gaussian_naive_bayes,test_recall,0.42528735632183906,0.3924619085805935
 gaussian_naive_bayes,test_f1,0.34285714285714286,0.3107236284017699
 stochastic_gradient_descent,test_accuracy,0.8576214405360134,0.7773610783222601
 stochastic_gradient_descent,test_average_precision,0.3813093757959869,0.3617503752215592
 stochastic_gradient_descent,test_recall,0.686046511627907,0.2822507350975675
 stochastic_gradient_descent,test_f1,0.3652173913043478,0.21849107443075583
 random_forest,test_accuracy,0.9110738255033557,0.9011129472867694
 random_forest,test_average_precision,0.6998372262021191,0.6619275281099584
 random_forest,test_recall,0.4069767441860465,0.35356856455493185
 random_forest,test_f1,0.5691056910569107,0.5078402513053142
 xgboost,test_accuracy,0.9128978224455612,0.9007711937764886
 xgboost,test_average_precision,0.7366643049075349,0.698622165966308
 xgboost,test_recall,0.5287356321839081,0.44346431435445066
 xgboost,test_f1,0.638888888888889,0.5633957169928393
--- a/presentation/event_stressful_detection_logo.csv
+++ b/presentation/event_stressful_detection_logo.csv
@ -0,0 +1,29 @@
 method,metric,max,mean
 Dummy,test_accuracy,1.0,0.8524114578096439
 Dummy,test_average_precision,0.7,0.14758854219035614
 Dummy,test_recall,0.0,0.0
 Dummy,test_f1,0.0,0.0
 logistic_reg,test_accuracy,0.9824561403508771,0.8445351955631311
 logistic_reg,test_average_precision,1.0,0.44605167668563583
 logistic_reg,test_recall,1.0,0.25353566685532386
 logistic_reg,test_f1,0.823529411764706,0.27951926390778625
 svc,test_accuracy,1.0,0.8524114578096439
 svc,test_average_precision,0.9612401707068228,0.44179454944271934
 svc,test_recall,0.0,0.0
 svc,test_f1,0.0,0.0
 gaussian_naive_bayes,test_accuracy,0.9,0.7491301746887129
 gaussian_naive_bayes,test_average_precision,0.9189430193277607,0.2833170327386991
 gaussian_naive_bayes,test_recall,1.0,0.3743761174081108
 gaussian_naive_bayes,test_f1,0.7000000000000001,0.2698456659235668
 stochastic_gradient_descent,test_accuracy,1.0,0.7926428596764739
 stochastic_gradient_descent,test_average_precision,1.0,0.4421948838597582
 stochastic_gradient_descent,test_recall,1.0,0.30156420704502945
 stochastic_gradient_descent,test_f1,0.8148148148148148,0.24088393234361388
 random_forest,test_accuracy,1.0,0.8722158105763481
 random_forest,test_average_precision,1.0,0.49817066323226833
 random_forest,test_recall,1.0,0.18161263127840668
 random_forest,test_f1,1.0,0.2508096532365307
 xgboost,test_accuracy,1.0,0.8812627400277729
 xgboost,test_average_precision,1.0,0.5505695112208401
 xgboost,test_recall,1.0,0.2896161238315027
 xgboost,test_f1,0.9411764705882353,0.36887408735855665
--- a/presentation/event_stressfulness.py
+++ b/presentation/event_stressfulness.py
@ -0,0 +1,60 @@
 # ---
 # jupyter:
 #   jupytext:
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: Python 3.10.8 ('straw2analysis')
 #     language: python
 #     name: python3
 # ---
 # %%
 # %matplotlib inline
 import datetime
 import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 import yaml
 from pyprojroot import here
 from sklearn import linear_model, svm, kernel_ridge, gaussian_process
 from sklearn.model_selection import LeaveOneGroupOut, LeavePGroupsOut, cross_val_score, cross_validate
 from sklearn.metrics import mean_squared_error, r2_score
 from sklearn.impute import SimpleImputer
 from sklearn.dummy import DummyRegressor
 import xgboost as xg
 from pathlib import Path
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.features_sensor
 import machine_learning.labels
 import machine_learning.model
 import machine_learning.helper
 # %% tags=["active-ipynb"]
 # filename = Path("E:/STRAWresults/inputData/stressfulness_event/input_appraisal_stressfulness_event_mean.csv")
 # filename = Path('C:/Users/Primoz/VSCodeProjects/straw2analysis/data/stressfulness_event/input_appraisal_stressfulness_event_mean.csv')
 # %%
 # Run the regression models on the prepared input file.
 # NOTE(review): `filename` is only assigned in the commented-out cell tagged
 # "active-ipynb" above, so it has to be defined before this cell is run.
 final_scores = machine_learning.helper.run_all_regression_models(filename)
 # %%
 # Name the existing index "metric" and combine it with the "method" column into
 # a two-level index.
 final_scores.index.name = "metric"
 final_scores = final_scores.set_index(["method", final_scores.index])
 # %%
 # Store the scores in the current working directory.
 final_scores.to_csv("event_stressfulness_scores.csv")
--- a/presentation/plots/d18NArfr_PCA.pdf
+++ b/presentation/plots/d18NArfr_PCA.pdf
--- a/presentation/plots/d18NArfr_hist.pdf
+++ b/presentation/plots/d18NArfr_hist.pdf
--- a/presentation/plots/d18NArfr_relplot.pdf
+++ b/presentation/plots/d18NArfr_relplot.pdf
--- a/presentation/plots/d18demandBayRidge_PCA.pdf
+++ b/presentation/plots/d18demandBayRidge_PCA.pdf
--- a/presentation/plots/d18demandBayRidge_relplot.pdf
+++ b/presentation/plots/d18demandBayRidge_relplot.pdf
--- a/presentation/plots/d18demandBayridge_hist.pdf
+++ b/presentation/plots/d18demandBayridge_hist.pdf
--- a/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
+++ b/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
--- a/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
+++ b/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
--- a/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
+++ b/presentation/plots/daily_24_hours_JCQ_job_demand_Bayesian
--- a/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
+++ b/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
--- a/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
+++ b/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
--- a/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
+++ b/presentation/plots/daily_24_hours_PANAS_negative_affect_Bayesian
--- a/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
+++ b/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
--- a/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
+++ b/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
--- a/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
+++ b/presentation/plots/intradaily_30_min_JCQ_job_demand_Bayesian
--- a/presentation/presentation.Rproj
+++ b/presentation/presentation.Rproj
@ -0,0 +1,17 @@
 Version: 1.0
 RestoreWorkspace: Default
 SaveWorkspace: Default
 AlwaysSaveHistory: Default
 EnableCodeIndexing: Yes
 UseSpacesForTab: Yes
 NumSpacesForTab: 2
 Encoding: UTF-8
 RnwWeave: Sweave
 LaTeX: pdfLaTeX
 AutoAppendNewline: Yes
 SpellingDictionary: en_GB
--- a/presentation/prox_comm_PANAS_nb.py
+++ b/presentation/prox_comm_PANAS_nb.py
@ -0,0 +1,131 @@
 # ---
 # jupyter:
 #   jupytext:
 #     text_representation:
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
 #       jupytext_version: 1.13.0
 #   kernelspec:
 #     display_name: straw2analysis
 #     language: python
 #     name: straw2analysis
 # ---
 # %%
 # %matplotlib inline
 import yaml
 from sklearn import linear_model
 from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
 import os
 import importlib
 import matplotlib.pyplot as plt
 import sys
 import numpy as np
 import seaborn as sns
 import pandas as pd
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 # %%
 from machine_learning import pipeline, features_sensor, labels, model
 # %%
 importlib.reload(labels)
 # %%
 with open("./config/prox_comm_PANAS_features.yaml", "r") as file:
    sensor_features_params = yaml.safe_load(file)
 sensor_features = features_sensor.SensorFeatures(**sensor_features_params)
 #sensor_features.set_sensor_data()
 sensor_features.calculate_features(cached=True)
 # %%
 all_features = sensor_features.get_features("all","all")
 # %%
 with open("./config/prox_comm_PANAS_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)
 labels_current = labels.Labels(**labels_params)
 #labels_current.set_labels()
 labels_current.aggregate_labels(cached=True)
 # %%
 model_validation = model.ModelValidation(
    sensor_features.get_features("all", "all"),
    labels_current.get_aggregated_labels(),
    group_variable="participant_id",
    cv_name="loso",
 )
 model_validation.model = linear_model.LinearRegression()
 model_validation.set_cv_method()
 # %%
 model_loso_r2 = model_validation.cross_validate()
 # %%
 print(model_loso_r2)
 print(np.mean(model_loso_r2))
 # %%
 model_loso_r2[model_loso_r2 > 0]
 # %%
 logo = LeaveOneGroupOut()
 # %%
 try_X = model_validation.X.reset_index().drop(["participant_id","date_lj"], axis=1)
 try_y = model_validation.y.reset_index().drop(["participant_id","date_lj"], axis=1)
 # %%
 # Mean absolute error for every left-out group. The scorer returns the negated
 # error, hence the sign flip.
 model_loso_mean_absolute_error = -cross_val_score(
     model_validation.model,
     try_X,
     try_y,
     groups=model_validation.groups,
     cv=logo.split(try_X, try_y, model_validation.groups),
     scoring='neg_mean_absolute_error',
 )
 # %%
 model_loso_mean_absolute_error
 # %%
 np.median(model_loso_mean_absolute_error)
 # %%
 model_validation.model.fit(try_X, try_y)
 # %%
 Y_predicted = model_validation.model.predict(try_X)
 # %%
 try_y.rename(columns={"NA": "NA_true"}, inplace=True)
 try_y["NA_predicted"] = Y_predicted
 NA_long = pd.wide_to_long(
    try_y.reset_index(),
    i="index",
    j="value",
    stubnames="NA",
    sep="_",
    suffix=".+",
 )
 # %%
 g1 = sns.displot(NA_long, x="NA", hue="value", binwidth=0.1, height=5, aspect=1.5)
 sns.move_legend(g1, "upper left", bbox_to_anchor=(.55, .45))
 g1.set_axis_labels("Daily mean", "Day count")
 display(g1)
 g1.savefig("prox_comm_PANAS_predictions.pdf")
 # %%
 from sklearn.metrics import mean_absolute_error
 # %%
 mean_absolute_error(try_y["NA_true"], try_y["NA_predicted"])
 # %%
 model_loso_mean_absolute_error
--- a/presentation/results_presentation.py
+++ b/presentation/results_presentation.py
@ -0,0 +1,163 @@
 # %%
 import datetime
 import importlib
 import os
 import sys
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 import yaml
 from pyprojroot import here
 from sklearn import linear_model, svm, kernel_ridge, gaussian_process, ensemble
 from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, cross_validate, cross_val_predict
 from sklearn.metrics import mean_squared_error, r2_score
 from sklearn.impute import SimpleImputer
 from sklearn.dummy import DummyRegressor
 from sklearn.decomposition import PCA
 from IPython.core.interactiveshell import InteractiveShell
 InteractiveShell.ast_node_interactivity = "all"
 nb_dir = os.path.split(os.getcwd())[0]
 if nb_dir not in sys.path:
    sys.path.append(nb_dir)
 import machine_learning.helper
 # %%
 segment = "intradaily_30_min"
 target = "JCQ_job_demand"
 csv_name = "./data/" + segment + "_all_targets/input_" + target + "_mean.csv"
 #csv_name = "./data/daily_18_hours_all_targets/input_JCQ_job_demand_mean.csv"
 # %%
 data_x, data_y, data_groups = machine_learning.helper.prepare_model_input(csv_name)
 # %%
 data_y.head()
 # %%
 scores = machine_learning.helper.run_all_models(csv_name)
 # %% jupyter={"source_hidden": true}
 logo = LeaveOneGroupOut()
 logo.get_n_splits(
    data_x,
    data_y,
    groups=data_groups,
 )
 # %% [markdown]
 # ### Baseline: Dummy Regression (mean)
 # %%
 # The estimator gets its own code cell: in the percent format, everything up to
 # the next cell marker belongs to the markdown cell opened above.
 dummy_regr = DummyRegressor(strategy="mean")
 # %% jupyter={"source_hidden": true}
 # Leave-one-group-out scores of the baseline.
 # NOTE: despite its name, `lin_reg_scores` holds the dummy regressor's results.
 lin_reg_scores = cross_validate(
     dummy_regr,
     X=data_x,
     y=data_y,
     groups=data_groups,
     cv=logo,
     n_jobs=-1,
     scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 # Medians over the left-out groups; the error metrics are reported negated.
 print("Negative Mean Squared Error", np.median(lin_reg_scores['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.median(lin_reg_scores['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.median(lin_reg_scores['test_neg_root_mean_squared_error']))
 print("R2", np.median(lin_reg_scores['test_r2']))
 ##################
 # %%
 chosen_model = "Random Forest"
 rfr = ensemble.RandomForestRegressor(max_features=0.3, n_jobs=-1)
 rfr_score = cross_validate(
    rfr,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Squared Error", np.median(rfr_score['test_neg_mean_squared_error']))
 print("Negative Mean Absolute Error", np.median(rfr_score['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.median(rfr_score['test_neg_root_mean_squared_error']))
 print("R2", np.median(rfr_score['test_r2']))
 # %%
 y_predicted = cross_val_predict(rfr, data_x, data_y, groups=data_groups, cv=logo)
 #########################
 # %%
 chosen_model = "Bayesian Ridge"
 bayesian_ridge_reg = linear_model.BayesianRidge()
 bayesian_ridge_reg_score = cross_validate(
    bayesian_ridge_reg,
    X=data_x,
    y=data_y,
    groups=data_groups,
    cv=logo,
    n_jobs=-1,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
 )
 print("Negative Mean Absolute Error", np.median(bayesian_ridge_reg_score['test_neg_mean_absolute_error']))
 print("Negative Root Mean Squared Error", np.median(bayesian_ridge_reg_score['test_neg_root_mean_squared_error']))
 print("R2", np.median(bayesian_ridge_reg_score['test_r2']))
 # %%
 y_predicted = cross_val_predict(bayesian_ridge_reg, data_x, data_y, groups=data_groups, cv=logo)
 # %%
 data_y = pd.DataFrame(pd.concat([data_y, data_groups], axis=1))
 data_y.rename(columns={"target": "y_true"}, inplace=True)
 data_y["y_predicted"] = y_predicted
 # %%
 data_y.head()
 # %%
 g1 = sns.relplot(data=data_y, x="y_true", y="y_predicted")
 #g1.set_axis_labels("true", "predicted")
 #g1.map(plt.axhline, y=0, color=".7", dashes=(2, 1), zorder=0)
 #g1.map(plt.axline, xy1=(0,0), slope=1)
 g1.set(title=",".join([segment, target, chosen_model]))
 display(g1)
 g1.savefig("_".join([segment, target, chosen_model, "_relplot.pdf"]))
 # %%
 data_y_long = pd.wide_to_long(
    data_y.reset_index(),
    i=["local_segment", "pid"],
    j="value",
    stubnames="y",
    sep="_",
    suffix=".+",
 )
 # %%
 data_y_long.head()
 # %%
 g2 = sns.displot(data_y_long, x="y", hue="value", binwidth=0.1, height=5, aspect=1.5)
 sns.move_legend(g2, "upper left", bbox_to_anchor=(.55, .45))
 g2.set(title=",".join([segment, target, chosen_model]))
 g2.savefig("_".join([segment, target, chosen_model, "hist.pdf"]))
 # %%
 # Project the features onto their first two principal components. Fitting and
 # transforming happen in one step, so the printed variance ratios belong to
 # exactly the projection that is plotted later.
 pca = PCA(n_components=2)
 data_x_pca = pca.fit_transform(data_x)
 print(pca.explained_variance_ratio_)
 # %%
 # The column labels must be an ordered list: a set has no guaranteed order (the
 # two components could end up swapped) and newer pandas versions refuse a set here.
 data_pca = pd.concat(
     [
         data_y.reset_index()["y_true"],
         pd.DataFrame(data_x_pca, columns=["pca_0", "pca_1"]),
     ],
     axis=1,
 )
 # %%
 data_pca
 # %%
 g3 = sns.relplot(data = data_pca, x = "pca_0", y = "pca_1", hue = "y_true", palette = sns.color_palette("Spectral", as_cmap=True))
 g3.set(title=",".join([segment, target, chosen_model]) + "\n variance explained = " + str(round(sum(pca.explained_variance_ratio_), 2)))
 g3.savefig("_".join([segment, target, chosen_model, "_PCA.pdf"]))
 # %%
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit f78aa3e7b3567423b44045766b230cd60d557cb0
+Subproject commit 03687a1ac204f0a4347eb758dada8005f68b0bb1
Author	SHA1	Message	Date
junos	297eb45933	Merge branch 'ml_pipeline'	2023-04-18 15:55:03 +02:00
junos	0b16aa6fe4	Clean up categories.	2023-04-18 15:49:33 +02:00
junos	d092e17e33	Explore saved categories.	2023-04-18 15:34:06 +02:00
junos	d000551884	Ignore only some files in presentation.	2023-04-18 14:57:59 +02:00
junos	5b9a1dba1a	Revert "Ignore presentation." This reverts commit `ec7cd09a09`.	2023-04-18 14:54:35 +02:00
Primoz	10ca47583c	Implement feature selection methods (WIP).	2023-04-14 17:20:22 +02:00
junos	c0904dd681	Update rapids and various PyCharm settings.	2023-04-12 19:38:15 +02:00
junos	f672709ea6	JupyText config.	2023-04-05 20:14:47 +02:00
junos	633d029dc2	Merge remote-tracking branch 'origin/ml_pipeline' into ml_pipeline	2023-03-01 13:37:18 +01:00
junos	ec7cd09a09	Ignore presentation.	2023-03-01 13:37:10 +01:00
Primoz	8a532fa95a	Add a ML pipeline script to develop a whole pipeline.	2023-02-23 10:41:36 +01:00
Primoz	bccc1cd1de	Clean and fix Preprocessing module.	2023-02-23 10:40:58 +01:00
Primoz	9ed863b7a1	Add a CrossValidation module with all the required methods.	2023-02-23 10:40:17 +01:00
Primoz	f69cb25266	Add planning comments.	2023-02-22 18:12:52 +01:00
Primoz	7f6ae9b323	Add imputation and One-Hot Encoding Methods.	2023-02-22 18:05:01 +01:00
Primoz	8f6cb3f444	Add preprocessing class.	2023-02-22 13:44:03 +01:00
Primoz	ef12f64fe5	Add feature selection Class skeleton.	2023-02-20 11:51:34 +01:00
Primoz	63741c3627	Add feature selection pipeline script with initail plan.	2023-02-15 12:27:39 +01:00
Primoz	806ca1b37d	Added excel intermediate scores file	2023-02-06 14:02:24 +01:00
Primoz	d06da18c26	Fix error with method name.	2023-02-06 11:54:54 +01:00
Primoz	93a34986d9	Comment code sections and change to pd.concatinate method.	2023-02-06 11:31:21 +01:00
Primoz	08e81610a9	Fix method params	2023-02-06 11:21:17 +01:00
Primoz	ef78f179cd	Add neccessary parameters to plot method.	2023-02-06 11:19:04 +01:00
Primoz	afeb7b4872	Remove data_yield from features.	2023-02-06 11:16:53 +01:00
Primoz	ea3f805ba7	Change ML model and ddd CV to sequential feat_select. Add std lines to plots.	2023-02-06 11:09:15 +01:00
Primoz	e3aef2dae7	Add save to file code, and todo comment	2023-02-01 15:13:57 +01:00
Primoz	b286753696	Add vizualization sections for sequential addition of sensors' features.	2023-02-01 13:51:56 +01:00
Primoz	07ef72dec5	Implement algorithm for sequential adding of the most important features.	2023-01-25 14:19:29 +01:00
Primoz	85e572fca0	Expand analysis of the features (individualy and by sensor groups).	2023-01-23 16:32:07 +01:00
Primoz	6a98c8cdcf	Prepare scripts for feature importance analysis.	2023-01-19 16:20:43 +01:00
Primoz	d263b32564	Temp: remove stratified logo from ml pipeline.	2023-01-19 09:26:55 +01:00
Primoz	ad2fab133f	Explore features with Entropy and IG.	2023-01-13 17:08:56 +01:00
junos	72fdd9c5ec	Use stratified downsampling. And run all models with a method from machine_learning.helper.	2023-01-04 21:48:37 +01:00
junos	b0b9edccc4	Unhide jupyter code cells and outputs.	2023-01-04 21:25:12 +01:00
junos	61d786b2ca	Describe LOSO results.	2023-01-04 20:38:05 +01:00
junos	37eada4a2e	Present results for stressful events.	2023-01-04 20:00:08 +01:00
junos	af6843634c	Update RAPIDS submodule.	2023-01-04 18:22:53 +01:00
junos	8bbe0b2ba8	Merge branch 'ml_pipeline' of repo.ijs.si:junoslukan/straw2analysis into ml_pipeline # Conflicts: # .gitignore # exploration/ml_pipeline_classification_with_clustering.py	2023-01-04 18:19:43 +01:00
junos	1516d1c000	Cluster by demand_control_ratio_quartile.	2023-01-04 18:16:14 +01:00
Primoz	339142ff31	Add expl stress event script and other changes.	2022-12-21 15:02:25 +01:00
Primoz	adcb823d3f	Add stress event duration exploration script.	2022-12-15 16:43:40 +01:00
Primoz	a61ab9ee51	Add feature importance check.	2022-12-15 16:43:13 +01:00
Primoz	164d12ed2f	Add undersampling method (with on/off parameter).	2022-12-13 17:01:46 +01:00
Primoz	0a45e35164	Remove unused imports prt. 2.	2022-12-09 13:56:42 +01:00
Primoz	78b6e7fa07	Remove unused imports from ML pipeline scripts.	2022-12-09 13:53:16 +01:00
Primoz	6507b053c5	Add StrtifiedKFold with shuffling as a default CV method.	2022-12-09 13:46:13 +01:00
Primoz	ac03b36c0f	Add files to .gitignore and add files path for stressfulness event.	2022-12-09 13:44:20 +01:00
junos	2bb95657d8	Finish classification presentation.	2022-12-08 10:02:13 +01:00
junos	852e17afbe	Define classification models method.	2022-12-08 10:00:14 +01:00
junos	509707855e	Prepare classification presentation.	2022-12-07 21:43:34 +01:00
junos	8ea0c1834c	Present event stressfulness results.	2022-12-07 21:26:35 +01:00
junos	8131626c4a	Include more metrics in regression helper methods.	2022-12-07 21:25:05 +01:00
junos	95ab66fd81	Move to presentation.	2022-12-07 21:24:20 +01:00
junos	525496418f	Configure jupytext.	2022-12-07 16:22:41 +01:00
junos	12f2c927fa	Merge branch 'ml_pipeline' of repo.ijs.si:junoslukan/straw2analysis into ml_pipeline # Conflicts: # exploration/ml_pipeline_daily.py - deleted	2022-12-07 15:36:52 +01:00
junos	71e1fcf8ca	Save results.	2022-12-07 15:33:18 +01:00
Primoz	cf0e4f89be	Test nonstandardized data with regular classification pipeline.	2022-11-29 14:06:06 +01:00
Primoz	7504aa34cf	Add additional categorical features (uncomment).	2022-11-28 13:42:46 +01:00
Primoz	9a218c8e2a	Add a script for two class train test split clustering classification.	2022-11-25 14:44:11 +01:00
Primoz	98f78d72fc	Create a classification models class and use it in the ml pipeline script.	2022-11-25 12:35:45 +01:00
Primoz	218b684514	Automize clustering classification logic and add parameters at the begining of the scripts. General changes and improvements.	2022-11-24 16:12:20 +01:00
Primoz	ddde80b421	Add classification with clustering ml pipeline script.	2022-11-24 09:24:13 +01:00
Primoz	7afef5582f	Add TEMP lime_survey cols	2022-11-22 14:44:33 +01:00
Primoz	183758cd37	Improve general ml classification pipeline script.	2022-11-22 14:31:49 +01:00
Primoz	40029a8205	Add a script for ml classification pipeline.	2022-11-21 14:47:19 +01:00
Primoz	ae0f54ecc2	Combine different segment scripts and set ml pipeline as a regression problem.	2022-11-21 11:41:11 +01:00
Primoz	8defb271c9	Extend ml pipeline scripts with two additional CV methods.	2022-11-21 11:23:47 +01:00
junos	ae2d7a038d	Present results.	2022-11-16 21:36:43 +01:00
junos	389198b17f	Prepare data in a separate step. Change categorical features.	2022-11-16 19:34:18 +01:00
junos	c462d55096	Update function with imputation already handled.	2022-11-16 18:18:12 +01:00
junos	a5c09a292f	Move function to helper.py	2022-11-16 18:13:30 +01:00
junos	848416bf6a	Merge branch 'master' into ml_pipeline # Conflicts: # .gitignore # rapids	2022-11-16 17:46:01 +01:00
Primoz	b59798df26	Add a new file tailored for stressfulness event regression.	2022-11-16 14:49:40 +01:00
Primoz	87ebb9f296	Delete files ... add to gitignore	2022-11-16 11:08:03 +01:00
Primoz	1d8dcf8b21	Add 30 min features data and modify script.	2022-11-02 15:16:19 +01:00
Primoz	9f7fa0c8e0	Add 18 hour daily data and slightly modify jupyter script.	2022-10-18 10:29:59 +02:00
Primoz	cdff4da930	Merge branch 'ml_pipeline' of https://repo.ijs.si/junoslukan/straw2analysis into ml_pipeline	2022-10-17 22:15:17 +02:00
Primoz	ad5f50babe	Correctly imputed data uploaded on STRAW (all targets)	2022-10-12 12:48:10 +02:00
Primoz	466cd3dc23	Processing of a newly cleaned script. Addition of two ML models. And modifications with one hot encoding.	2022-10-10 16:47:00 +02:00
		`@ -1 +1 @@`
			`Subproject commit f78aa3e7b3567423b44045766b230cd60d557cb0`				`Subproject commit 03687a1ac204f0a4347eb758dada8005f68b0bb1`