stress_at_work_analysis/exploration/ex_ml_pipeline.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.12.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

# Put the parent of the current working directory on the import path so that
# the project packages imported below can be resolved.
# NOTE(review): presumably the notebook is started from a subdirectory of the
# repository root -- confirm.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model

# %%
import participants.query_db
from features import esm, helper, proximity

# %% [markdown] tags=[]
# # 1. Get the relevant data

# %%
# Ask the participants database for usernames, passing 2020-08-01 as the
# collection start date.
collection_start_date = datetime.date.fromisoformat("2020-08-01")
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=collection_start_date
)
# Keep just the first two participants so the example stays small.
ptcp_2 = participants_inactive_usernames[:2]

# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]
# ## 1.1 Labels

# %%
# Load the ESM (questionnaire) data for the selected participants and run the
# project's preprocessing step on it.
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)

# %%
# Keep only the rows that belong to questionnaires 8 and 9, then clean them up.
is_questionnaire_8 = df_esm_preprocessed["questionnaire_id"] == 8
is_questionnaire_9 = df_esm_preprocessed["questionnaire_id"] == 9
df_esm_PANAS = df_esm_preprocessed[is_questionnaire_8 | is_questionnaire_9]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)

# %% [markdown]
# ## 1.2 Sensor data

# %%
# Load proximity sensor data for the selected participants, derive the date
# from the timestamp and recode the proximity values.
proximity_raw = proximity.get_proximity_data(ptcp_2)
proximity_dated = helper.get_date_from_timestamp(proximity_raw)
df_proximity = proximity.recode_proximity(proximity_dated)

# %% [markdown]
# ## 1.3 Standardization/personalization

# %% [markdown]
# # 2. Grouping/segmentation

# %%
# Average the numeric answers per participant, day and questionnaire.
panas_grouped = df_esm_PANAS_clean.groupby(
    ["participant_id", "date_lj", "questionnaire_id"]
)
df_esm_PANAS_daily_means = (
    panas_grouped["esm_user_answer_numeric"]
    .mean()
    .reset_index()
    .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)

# %%
# Go from long to wide: one row per participant and day, one column per
# questionnaire. Questionnaire 8 becomes "PA" and questionnaire 9 becomes "NA".
panas_wide = df_esm_PANAS_daily_means.pivot(
    index=["participant_id", "date_lj"],
    columns="questionnaire_id",
    values="esm_numeric_mean",
)
panas_wide = panas_wide.reset_index(col_level=1)
panas_wide = panas_wide.rename(columns={8.0: "PA", 9.0: "NA"})
df_esm_PANAS_daily_means = panas_wide.set_index(["participant_id", "date_lj"])


# %%
# Count proximity events, passing "date_lj" as the grouping column.
proximity_group_by = ["date_lj"]
df_proximity_daily_counts = proximity.count_proximity(
    df_proximity, proximity_group_by
)

# %%
df_proximity_daily_counts

# %% [markdown]
# # 3. Join features (and export to csv?)

# %%
# Attach the proximity counts to the daily PANAS means (index-based left join,
# which is the default for DataFrame.join), then turn the index back into
# ordinary columns.
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
    df_proximity_daily_counts, how="left"
)
df_full_data_daily_means = df_full_data_daily_means.reset_index()

# %% [markdown]
# # 4. Machine learning model and parameters

# %%
# Ordinary least squares model predicting "PA" from the proximity features.
lin_reg_proximity = linear_model.LinearRegression()

# %% [markdown]
# ## 4.1 Validation method

# %%
# Predictors, target and grouping variable shared by the cells below.
proximity_predictors = df_full_data_daily_means[
    ["freq_prox_near", "prop_prox_near"]
]
target_PA = df_full_data_daily_means["PA"]
participant_groups = df_full_data_daily_means["participant_id"]

# Leave-one-group-out: every participant is held out once.
logo = LeaveOneGroupOut()
logo.get_n_splits(proximity_predictors, target_PA, groups=participant_groups)

# %% [markdown]
# ## 4.2 Fit results (export?)

# %%
# R^2 on each held-out participant.
cross_val_score(
    lin_reg_proximity,
    proximity_predictors,
    target_PA,
    groups=participant_groups,
    scoring="r2",
    cv=logo,
    n_jobs=-1,
)

# %%
# Fit on the full data set ...
lin_reg_proximity.fit(proximity_predictors, target_PA)

# %%
# ... and report the in-sample R^2.
lin_reg_proximity.score(proximity_predictors, target_PA)

# %% [markdown]
# # Merging these into a pipeline

# %%
from machine_learning import features_sensor, labels, model, pipeline

# %%
# Re-execute the features_sensor module so that changes made to it while the
# kernel is running are picked up without a restart.
importlib.reload(features_sensor)

# %%
# Read the feature configuration. The encoding is given explicitly so that the
# file is decoded the same way on every platform instead of depending on the
# locale default.
with open(
    "../machine_learning/config/minimal_features.yaml", "r", encoding="utf-8"
) as file:
    sensor_features_params = yaml.safe_load(file)
print(sensor_features_params)

# %%
# The parsed YAML mapping is expanded into keyword arguments of SensorFeatures.
sensor_features = machine_learning.features_sensor.SensorFeatures(
    **sensor_features_params
)
sensor_features.data_types

# %%
# Set the participants label to "nokia_0000003".
# NOTE(review): what the label is used for is decided inside SensorFeatures,
# not here -- presumably it tags the cached output; confirm.
sensor_features.set_participants_label("nokia_0000003")

# %%
# Override the data types on the instance and restrict it to the two
# participants held in ptcp_2.
sensor_features.data_types = ["proximity", "communication"]
sensor_features.participants_usernames = ptcp_2

# %%
# Requested before set_sensor_data() is called.
# NOTE(review): presumably shows the state prior to loading -- confirm.
sensor_features.get_sensor_data("proximity")

# %%
sensor_features.set_sensor_data()

# %%
# Same request as two cells earlier, now after set_sensor_data().
sensor_features.get_sensor_data("proximity")

# %%
# Compute the features from scratch ...
sensor_features.calculate_features(cached=False)
features_all_calculated = sensor_features.get_features("all", "all")

# %%
# ... and obtain them again with cached=True.
sensor_features.calculate_features(cached=True)
features_all_read = sensor_features.get_features("all", "all")

# %%
# When read from csv, date_lj is parsed as a date and represented as a
# Timestamp; when calculated, it is represented as a date. Convert the cached
# version so that both frames carry the same kind of index values.
features_all_read = features_all_read.reset_index()
features_all_read["date_lj"] = features_all_read["date_lj"].dt.date
features_all_read.set_index(["participant_id", "date_lj"], inplace=True)

# %%
# Round-trip check: cached values should match the freshly calculated ones.
# np.isclose compares the underlying arrays position by position (row/column
# labels are not consulted). equal_nan=True makes a NaN that is present in the
# same cell of both frames count as a match; without it any missing value
# would make the whole check report False.
np.isclose(features_all_read, features_all_calculated, equal_nan=True).all()

# %%
# Read the label configuration. The encoding is given explicitly so that the
# file is decoded the same way on every platform instead of depending on the
# locale default.
with open(
    "../machine_learning/config/minimal_labels.yaml", "r", encoding="utf-8"
) as file:
    labels_params = yaml.safe_load(file)

# %%
# The parsed YAML mapping is expanded into keyword arguments of Labels.
# NOTE(review): this rebinds the notebook-level name `labels`, which earlier
# referred to the module imported with `from machine_learning import ...`.
# The class is still reachable through `machine_learning.labels`, but the bare
# name now means this instance.
labels = machine_learning.labels.Labels(**labels_params)
labels.participants_usernames = ptcp_2
labels.set_participants_label("nokia_0000003")
labels.questionnaires

# %%
labels.set_labels()

# %%
labels.get_labels("PANAS")

# %%
# Aggregate the labels from scratch ...
labels.aggregate_labels(cached=False)
labels_calculated = labels.get_aggregated_labels()

# %%
# ... and obtain them again with cached=True.
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
# When read from csv, date_lj is parsed as a date and represented as a
# Timestamp; when calculated, it is represented as a date. Convert the cached
# version so that both frames carry the same kind of index values.
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)

# %%
# Round-trip check: cached labels should match the freshly aggregated ones.
# np.isclose compares the underlying arrays position by position (row/column
# labels are not consulted). equal_nan=True makes a NaN that is present in the
# same cell of both frames count as a match; without it any missing value
# would make the whole check report False.
np.isclose(labels_read, labels_calculated, equal_nan=True).all()

# %%
# Collect the model inputs from the two pipeline objects.
validation_features = sensor_features.get_features("all", "all")
validation_labels = labels.get_aggregated_labels()

# Validate a linear regression, grouping by participant with the "loso"
# cross-validation setting.
model_validation = machine_learning.model.ModelValidation(
    validation_features,
    validation_labels,
    group_variable="participant_id",
    cv_name="loso",
)
model_validation.model = linear_model.LinearRegression()
model_validation.set_cv_method()

# %%
model_validation.cross_validate()

# %%
model_validation.groups

# %%
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# ---`
			`# jupyter:`
			`# jupytext:`
			`# formats: ipynb,py:percent`
			`# text_representation:`
			`# extension: .py`
			`# format_name: percent`
			`# format_version: '1.3'`
Refactor machine_learning/pipeline.py by defining one class by file. 2021-09-13 11:41:57 +02:00			`# jupytext_version: 1.12.0`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# kernelspec:`
			`# display_name: straw2analysis`
			`# language: python`
			`# name: straw2analysis`
			`# ---`

			`# %%`
			`# %matplotlib inline`
			`import datetime`
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00			`import importlib`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`import os`
			`import sys`

Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`import numpy as np`
			`import pandas as pd`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`import seaborn as sns`
Add communication features to pipeline. 2021-08-19 17:05:44 +02:00			`import yaml`
Add an example for linear regression. 2021-08-12 16:54:00 +02:00			`from sklearn import linear_model`
			`from sklearn.model_selection import LeaveOneGroupOut, cross_val_score`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00
			`nb_dir = os.path.split(os.getcwd())[0]`
			`if nb_dir not in sys.path:`
			`sys.path.append(nb_dir)`

Refactor machine_learning/pipeline.py by defining one class by file. 2021-09-13 11:41:57 +02:00			`import machine_learning.features_sensor`
			`import machine_learning.labels`
			`import machine_learning.model`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00
Document the SensorFeatures class and its __init__ method. 2021-09-13 17:43:47 +02:00			`# %%`
			`import participants.query_db`
			`from features import esm, helper, proximity`

Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`# %% [markdown] tags=[]`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# # 1. Get the relevant data`

			`# %%`
			`participants_inactive_usernames = participants.query_db.get_usernames(`
			`collection_start=datetime.date.fromisoformat("2020-08-01")`
			`)`
			`# Consider only two participants to simplify.`
			`ptcp_2 = participants_inactive_usernames[0:2]`

Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# ## 1.1 Labels`

			`# %%`
			`df_esm = esm.get_esm_data(ptcp_2)`
			`df_esm_preprocessed = esm.preprocess_esm(df_esm)`

			`# %%`
			`df_esm_PANAS = df_esm_preprocessed[`
			`(df_esm_preprocessed["questionnaire_id"] == 8)`
			`\| (df_esm_preprocessed["questionnaire_id"] == 9)`
			`]`
			`df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)`

			`# %% [markdown]`
			`# ## 1.2 Sensor data`

			`# %%`
			`df_proximity = proximity.get_proximity_data(ptcp_2)`
Use the same function for ESM and other data. 2021-08-11 17:26:44 +02:00			`df_proximity = helper.get_date_from_timestamp(df_proximity)`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`df_proximity = proximity.recode_proximity(df_proximity)`

Add a parameter for grouping. 2021-08-12 15:07:20 +02:00			`# %% [markdown]`
			`# ## 1.3 Standardization/personalization`

Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# %% [markdown]`
			`# # 2. Grouping/segmentation`

			`# %%`
			`df_esm_PANAS_daily_means = (`
			`df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])`
			`.esm_user_answer_numeric.agg("mean")`
			`.reset_index()`
			`.rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})`
			`)`
Add a parameter for grouping. 2021-08-12 15:07:20 +02:00
Add an example for linear regression. 2021-08-12 16:54:00 +02:00			`# %%`
			`df_esm_PANAS_daily_means = (`
			`df_esm_PANAS_daily_means.pivot(`
			`index=["participant_id", "date_lj"],`
			`columns="questionnaire_id",`
			`values="esm_numeric_mean",`
			`)`
			`.reset_index(col_level=1)`
			`.rename(columns={8.0: "PA", 9.0: "NA"})`
			`.set_index(["participant_id", "date_lj"])`
			`)`


Add a parameter for grouping. 2021-08-12 15:07:20 +02:00			`# %%`
Add a demo of pipeline. 2021-11-17 10:44:49 +01:00			`df_proximity_daily_counts = proximity.count_proximity(df_proximity, ["date_lj"])`
Add a parameter for grouping. 2021-08-12 15:07:20 +02:00
			`# %%`
			`df_proximity_daily_counts`

			`# %% [markdown]`
			`# # 3. Join features (and export to csv?)`

Add an example for linear regression. 2021-08-12 16:54:00 +02:00			`# %%`
			`df_full_data_daily_means = df_esm_PANAS_daily_means.join(`
			`df_proximity_daily_counts`
			`).reset_index()`

Add a parameter for grouping. 2021-08-12 15:07:20 +02:00			`# %% [markdown]`
			`# # 4. Machine learning model and parameters`

			`# %%`
Add an example for linear regression. 2021-08-12 16:54:00 +02:00			`lin_reg_proximity = linear_model.LinearRegression()`

			`# %% [markdown]`
			`# ## 4.1 Validation method`

			`# %%`
			`logo = LeaveOneGroupOut()`
			`logo.get_n_splits(`
			`df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],`
			`df_full_data_daily_means["PA"],`
			`groups=df_full_data_daily_means["participant_id"],`
			`)`

			`# %% [markdown]`
			`# ## 4.2 Fit results (export?)`

			`# %%`
			`cross_val_score(`
			`lin_reg_proximity,`
			`df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],`
			`df_full_data_daily_means["PA"],`
			`groups=df_full_data_daily_means["participant_id"],`
			`cv=logo,`
			`n_jobs=-1,`
			`scoring="r2",`
			`)`

			`# %%`
			`lin_reg_proximity.fit(`
			`df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],`
			`df_full_data_daily_means["PA"],`
			`)`

			`# %%`
			`lin_reg_proximity.score(`
			`df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],`
			`df_full_data_daily_means["PA"],`
			`)`
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00
			`# %% [markdown]`
			`# # Merging these into a pipeline`

			`# %%`
Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`from machine_learning import features_sensor, labels, model, pipeline`
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00			`# %%`
Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`importlib.reload(features_sensor)`
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00			`# %%`
Add communication features to pipeline. 2021-08-19 17:05:44 +02:00			`with open("../machine_learning/config/minimal_features.yaml", "r") as file:`
Accept nested feature configuration. To do this, pass a dict as parameters to SensorFeatures class, rather than actually reading the object from yaml file. 2021-08-19 17:23:23 +02:00			`sensor_features_params = yaml.safe_load(file)`
Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00			`print(sensor_features_params)`
Accept nested feature configuration. To do this, pass a dict as parameters to SensorFeatures class, rather than actually reading the object from yaml file. 2021-08-19 17:23:23 +02:00
			`# %%`
Document the SensorFeatures class and its __init__ method. 2021-09-13 17:43:47 +02:00			`sensor_features = machine_learning.features_sensor.SensorFeatures(`
			`**sensor_features_params`
			`)`
Accept nested feature configuration. To do this, pass a dict as parameters to SensorFeatures class, rather than actually reading the object from yaml file. 2021-08-19 17:23:23 +02:00			`sensor_features.data_types`
[WIP] Methods to get the labels and data plus aggregate them. 2021-08-12 19:06:43 +02:00
Save calculated features to csv files. 2021-08-23 16:36:26 +02:00			`# %%`
			`sensor_features.set_participants_label("nokia_0000003")`

Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00			`# %%`
			`sensor_features.data_types = ["proximity", "communication"]`
[WIP] Add a class for model validation. 2021-08-20 19:44:50 +02:00			`sensor_features.participants_usernames = ptcp_2`
Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00
Add communication features to pipeline. 2021-08-19 17:05:44 +02:00			`# %%`
			`sensor_features.get_sensor_data("proximity")`

[WIP] Methods to get the labels and data plus aggregate them. 2021-08-12 19:06:43 +02:00			`# %%`
Convert the class into a YAML object. Add an example config file and demonstrate its usage in ex_ml_pipeline.ipynb. 2021-08-19 16:31:42 +02:00			`sensor_features.set_sensor_data()`
[WIP] Methods to get the labels and data plus aggregate them. 2021-08-12 19:06:43 +02:00
			`# %%`
Convert the class into a YAML object. Add an example config file and demonstrate its usage in ex_ml_pipeline.ipynb. 2021-08-19 16:31:42 +02:00			`sensor_features.get_sensor_data("proximity")`
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00
			`# %%`
Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`sensor_features.calculate_features(cached=False)`
			`features_all_calculated = sensor_features.get_features("all", "all")`

			`# %%`
			`sensor_features.calculate_features(cached=True)`
			`features_all_read = sensor_features.get_features("all", "all")`

			`# %%`
			`features_all_read = features_all_read.reset_index()`
			`features_all_read["date_lj"] = features_all_read["date_lj"].dt.date`
			`features_all_read.set_index(["participant_id", "date_lj"], inplace=True)`
			`# date_lj column is parsed as a date and represented as Timestamp, when read from csv.`
			`# When calculated, it is represented as date.`
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00
Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00			`# %%`
Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`np.isclose(features_all_read, features_all_calculated).all()`
Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00
Accept nested feature configuration. To do this, pass a dict as parameters to SensorFeatures class, rather than actually reading the object from yaml file. 2021-08-19 17:23:23 +02:00			`# %%`
Add a similar class for labels. 2021-08-19 17:44:04 +02:00			`with open("../machine_learning/config/minimal_labels.yaml", "r") as file:`
			`labels_params = yaml.safe_load(file)`

			`# %%`
Refactor machine_learning/pipeline.py by defining one class by file. 2021-09-13 11:41:57 +02:00			`labels = machine_learning.labels.Labels(**labels_params)`
[WIP] Add a class for model validation. 2021-08-20 19:44:50 +02:00			`labels.participants_usernames = ptcp_2`
Add export capabilities to labels.py. 2021-09-15 15:36:36 +02:00			`labels.set_participants_label("nokia_0000003")`
Add a similar class for labels. 2021-08-19 17:44:04 +02:00			`labels.questionnaires`

			`# %%`
			`labels.set_labels()`

			`# %%`
			`labels.get_labels("PANAS")`

			`# %%`
Add an option to read cached labels from a file. 2021-09-15 15:45:49 +02:00			`labels.aggregate_labels(cached=False)`
			`labels_calculated = labels.get_aggregated_labels()`
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00
			`# %%`
Add an option to read cached labels from a file. 2021-09-15 15:45:49 +02:00			`labels.aggregate_labels(cached=True)`
			`labels_read = labels.get_aggregated_labels()`
			`labels_read = labels_read.reset_index()`
			`labels_read["date_lj"] = labels_read["date_lj"].dt.date`
			`labels_read.set_index(["participant_id", "date_lj"], inplace=True)`
			`# date_lj column is parsed as a date and represented as Timestamp, when read from csv.`
			`# When calculated, it is represented as date.`

			`# %%`
			`np.isclose(labels_read, labels_calculated).all()`
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00
			`# %%`
Refactor machine_learning/pipeline.py by defining one class by file. 2021-09-13 11:41:57 +02:00			`model_validation = machine_learning.model.ModelValidation(`
[WIP] Add a class for model validation. 2021-08-20 19:44:50 +02:00			`sensor_features.get_features("all", "all"),`
			`labels.get_aggregated_labels(),`
			`group_variable="participant_id",`
			`cv_name="loso",`
			`)`
			`model_validation.model = linear_model.LinearRegression()`
			`model_validation.set_cv_method()`

			`# %%`
			`model_validation.cross_validate()`

			`# %%`
			`model_validation.groups`

			`# %%`