149 lines
6.2 KiB
Python
149 lines
6.2 KiB
Python
# ---
|
|
# jupyter:
|
|
# jupytext:
|
|
# formats: ipynb,py:percent
|
|
# text_representation:
|
|
# extension: .py
|
|
# format_name: percent
|
|
# format_version: '1.3'
|
|
# jupytext_version: 1.13.0
|
|
# kernelspec:
|
|
# display_name: straw2analysis
|
|
# language: python
|
|
# name: straw2analysis
|
|
# ---
|
|
|
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false}
|
|
# %matplotlib inline
|
|
|
|
import os, sys, math
|
|
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import pandas as pd
|
|
|
|
from sklearn.impute import SimpleImputer
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn import metrics
|
|
|
|
# %% jupyter={"source_hidden": false, "outputs_hidden": false} nteract={"transient": {"deleting": false}}
|
|
# Load the model input table and index it by the time-segment identifier columns.
index_columns = [
    "local_segment",
    "local_segment_label",
    "local_segment_start_datetime",
    "local_segment_end_datetime",
]
model_input = pd.read_csv(
    "../data/stressfulness_event_with_target_0_ver2/input_appraisal_stressfulness_event_mean.csv"
)
model_input = model_input.set_index(index_columns)

# Categorical columns: two fixed ones plus every column whose name mentions
# "mostcommonactivity" or "homelabel".
categorical_feature_colnames = ["gender", "startlanguage"]
additional_categorical_features = [
    col
    for col in model_input.columns
    if "mostcommonactivity" in col or "homelabel" in col
]
categorical_feature_colnames += additional_categorical_features

categorical_features = model_input[categorical_feature_colnames].copy()

# Fill missing categorical values with the per-column mode (first mode on ties).
mode_categorical_features = categorical_features.mode().iloc[0]
categorical_features = categorical_features.fillna(mode_categorical_features)

# One-hot encode the categorical columns (skipped when there are no rows/columns).
categorical_features = categorical_features.astype("category")
if not categorical_features.empty:
    categorical_features = pd.get_dummies(categorical_features)

# Rebuild the input: all remaining columns followed by the encoded categoricals.
numerical_features = model_input.drop(columns=categorical_feature_colnames)
model_input = pd.concat([numerical_features, categorical_features], axis=1)

# Binarize the target: values in (-1, 0] become 0, values in (0, 4] become 1.
bins = [-1, 0, 4]  # bins for stressfulness (0-4) target
model_input["target"], edges = pd.cut(
    model_input["target"], bins=bins, labels=[0, 1], retbins=True, right=True
)

# Sanity check: list the columns that are not numeric and compare the shapes.
numeric_only = model_input.select_dtypes(include=np.number)
print("Non-numeric cols (or target):", list(model_input.columns.difference(numeric_only.columns)))
print("Shapes of numeric df:", model_input.shape, numeric_only.shape)
|
|
|
|
|
|
# %%
|
|
|
|
# Get phone and non-phone columns
|
|
|
|
def make_predictions_with_sensor_groups(df, groups_substrings, include_group=True):
    """
    Fit and evaluate a random forest on feature subsets chosen by column name.

    For each substring in ``groups_substrings`` the feature columns are those
    whose name contains the substring (``include_group=True``) or does not
    contain it (``include_group=False``). The 'pid' and 'target' columns are
    never used as features.

    For every selection, 25 % of the rows are discarded by a first split
    (``random_state=19``) and the remaining rows are split into train and test
    sets (``test_size=0.2``, ``random_state=2``). Missing values are replaced
    with medians learned on the training rows only, a
    ``RandomForestClassifier(random_state=0)`` is fitted, and accuracy,
    precision, recall and F1 on the test rows are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain the 'target' and 'pid' columns.
    groups_substrings : str or iterable of str
        Column-name substrings to evaluate. A single string is treated as one
        substring, not as a sequence of characters.
    include_group : bool, default True
        Keep the matching columns (True) or everything except them (False).

    Returns
    -------
    None
        Results are only printed.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(groups_substrings, str):
        groups_substrings = [groups_substrings]

    keep_matching = bool(include_group)
    heading = "\nPrediction with" if keep_matching else "\nPrediction without"

    # 'pid' and 'target' are never features; dropping them once is loop-invariant.
    candidate_features = df.drop(columns=['target', 'pid'])
    target = df['target']

    for fgroup_substr in groups_substrings:
        feature_group_cols = [
            col for col in candidate_features.columns
            if (fgroup_substr in col) == keep_matching
        ]

        # The imputer and classifier cannot be fitted on zero columns, so
        # report the empty selection and move on to the next group.
        if not feature_group_cols:
            print(heading, fgroup_substr)
            print("Skipped: no feature columns match this selection.\n")
            continue

        X = candidate_features[feature_group_cols]

        # The first split only subsamples the data (its 25 % part is discarded).
        X, _, y, _ = train_test_split(X, target, random_state=19, test_size=0.25)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

        # Learn the medians on the training rows only, so that no information
        # from the test rows leaks into the preprocessing.
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        rfc = RandomForestClassifier(random_state=0)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)

        print(heading, fgroup_substr)
        print("************************************************")
        print("Accuracy", metrics.accuracy_score(y_test, y_pred))
        print("Precision", metrics.precision_score(y_test, y_pred))
        print("Recall", metrics.recall_score(y_test, y_pred))
        print("F1", metrics.f1_score(y_test, y_pred), "\n")
|
|
# %%
|
|
# Bare expression kept from the notebook; it only displays when it is the last
# statement of a cell.
model_input

# Ablation: evaluate the model with each of these column groups left out in turn.
groups_substr = [
    "_",
    "phone_",
    "empatica_",
]
make_predictions_with_sensor_groups(
    model_input.copy(),
    groups_substrings=groups_substr,
    include_group=False,
)
|
|
|
|
# %%
|
|
# Baseline: use every feature column whose name contains "_". The substring is
# passed as a one-element list because the function iterates over its argument;
# a bare string would be iterated character by character.
make_predictions_with_sensor_groups(
    model_input.copy(),
    groups_substrings=["_"],
    include_group=True,
)

# Ablation: leave out one Empatica sensor group at a time.
groups_substr = [
    "empatica_inter_beat_",
    "empatica_accelerometer_",
    "empatica_temperature_",
    "empatica_electrodermal_",
]
make_predictions_with_sensor_groups(
    model_input.copy(),
    groups_substrings=groups_substr,
    include_group=False,
)
|
|
|
|
# %%
|
|
# NOTE(review): this cell repeats the preceding cell verbatim - confirm whether
# the repetition is intentional before removing it.
# Baseline: use every feature column whose name contains "_". The substring is
# passed as a one-element list because the function iterates over its argument;
# a bare string would be iterated character by character.
make_predictions_with_sensor_groups(
    model_input.copy(),
    groups_substrings=["_"],
    include_group=True,
)

# Ablation: leave out one Empatica sensor group at a time.
groups_substr = [
    "empatica_inter_beat_",
    "empatica_accelerometer_",
    "empatica_temperature_",
    "empatica_electrodermal_",
]
make_predictions_with_sensor_groups(
    model_input.copy(),
    groups_substrings=groups_substr,
    include_group=False,
)
|
|
|
|
# %%
|
|
# Collect every cumulative '_'-terminated prefix of every column name, in order
# of first appearance: a column "a_b_c" contributes "a_", "a_b_" and "a_b_c_".
# A dict serves as an insertion-ordered set, so each membership check is O(1)
# instead of a scan over a growing list.
unique_prefixes = {}
for column in model_input.columns:
    prefix = ''
    for part in column.split('_'):
        prefix += part + '_'
        unique_prefixes.setdefault(prefix, None)

feature_column_groups = list(unique_prefixes)

# Print every collected prefix. No filtering is applied, so prefixes that match
# only a single column (the full column name followed by '_') are included too.
print(feature_column_groups)
|
|
# %%
|
|
# Write out all the sensors (phone, empatica); also separate the other (demographic) columns