# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %% jupyter={"source_hidden": false, "outputs_hidden": false}
# %matplotlib inline
import math
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
# Load the model input and index it by the segment identifiers.
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
model_input = pd.read_csv(
    "../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv"
).set_index(index_columns)

# Categorical columns: two fixed ones plus every column whose name mentions
# "mostcommonactivity" or "homelabel".
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [
    col
    for col in model_input.columns
    if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()
mode_categorical_features = categorical_features.mode().iloc[0]

# Fill missing categorical values with the per-column mode.
categorical_features = categorical_features.fillna(mode_categorical_features)

# One-hot encode the categorical columns.
categorical_features = categorical_features.apply(lambda col: col.astype("category"))
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarize the target: the interval (-1, 0] maps to label 0, (0, 4] to label 1.
bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
model_input['target'], edges = pd.cut(
    model_input.target, bins=bins, labels=[0, 1], retbins=True, right=True
)

print(
    "Non-numeric cols (or target):",
    list(model_input.columns.difference(model_input.select_dtypes(include=np.number).columns)),
)
print(
    "Shapes of numeric df:",
    model_input.shape,
    model_input.select_dtypes(include=np.number).shape,
)


# %%
# Evaluate a random forest on feature subsets selected by column-name substring.
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
    """Train and evaluate a random forest once per feature-group substring.

    For every substring, the feature columns are those whose name contains the
    substring (``include_group=True``) or does not contain it
    (``include_group=False``); the ``'pid'`` and ``'target'`` columns are never
    used as features. The rows are split twice: the first split
    (``random_state=19``) keeps 75 % of the rows and discards the rest, the
    second (``random_state=2``) divides the kept rows 80/20 into train and test
    sets. Missing values are imputed with medians learned from the training
    set only, a ``RandomForestClassifier(random_state=0)`` is fitted, and
    accuracy, precision, recall and F1 on the test set are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'target'`` column and the feature columns.
    groups_substrings : str or iterable of str
        Substring(s) identifying feature groups. A single string is treated as
        one group rather than being iterated character by character.
    include_group : bool, default True
        If true, use only the columns containing the substring; otherwise use
        only the columns that do not contain it.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # A bare string would otherwise be iterated one character at a time.
    if isinstance(groups_substrings, str):
        groups_substrings = [groups_substrings]

    non_feature_cols = ('pid', 'target')
    y = df['target']

    for fgroup_substr in groups_substrings:
        if include_group:
            feature_group_cols = [
                col for col in df.columns
                if fgroup_substr in col and col not in non_feature_cols
            ]
        else:
            feature_group_cols = [
                col for col in df.columns
                if fgroup_substr not in col and col not in non_feature_cols
            ]

        if not feature_group_cols:
            # Nothing to train on; report it instead of failing inside scikit-learn.
            print("\nNo feature columns selected for", repr(fgroup_substr), "- skipping")
            continue

        X = df[feature_group_cols]

        # First split: keep 75 % of the rows, the remaining 25 % are not used here.
        X_kept, _, y_kept, _ = train_test_split(X, y, random_state=19, test_size=0.25)
        X_train, X_test, y_train, y_test = train_test_split(
            X_kept, y_kept, random_state=2, test_size=0.2
        )

        # Fit the imputer on the training set only, so that statistics of the
        # test rows do not leak into the training data.
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        rfc = RandomForestClassifier(random_state=0)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)

        if include_group:
            print("\nPrediction with", fgroup_substr)
        else:
            print("\nPrediction without", fgroup_substr)
        print("************************************************")
        print("Accuracy", metrics.accuracy_score(y_test, y_pred))
        print("Precision", metrics.precision_score(y_test, y_pred))
        print("Recall", metrics.recall_score(y_test, y_pred))
        print("F1", metrics.f1_score(y_test, y_pred), "\n")


# %%
model_input
groups_substr = ["_", "phone_", "empatica_"]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=["_"], include_group=True)
groups_substr = [
    "empatica_inter_beat_",
    "empatica_accelerometer_",
    "empatica_temperature_",
    "empatica_electrodermal_",
]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
# NOTE(review): this cell is an exact duplicate of the previous one - confirm
# whether a different set of groups was intended here.
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=["_"], include_group=True)
groups_substr = [
    "empatica_inter_beat_",
    "empatica_accelerometer_",
    "empatica_temperature_",
    "empatica_electrodermal_",
]
make_predictions_with_sensor_groups(model_input.copy(), groups_substrings=groups_substr, include_group=False)

# %%
# Collect every cumulative underscore-delimited prefix of every column name,
# in first-seen order. For example, a column "a_b_c" contributes "a_", "a_b_"
# and "a_b_c_". A dict is used as an insertion-ordered set so that the
# duplicate check is a constant-time lookup instead of a scan of a list.
seen_prefixes = {}
for column in model_input.columns:
    prefix = ''
    for part in column.split('_'):
        prefix += part + '_'
        seen_prefixes.setdefault(prefix, None)

feature_column_groups = list(seen_prefixes)

# Print every collected prefix. No filtering is applied, so prefixes that
# match only a single column are included as well.
print(feature_column_groups)

# %%
# TODO: Write out all the sensors (phone, empatica); also separate the other (demographic) columns.