stress_at_work_analysis/exploration/ex_ml_pipeline.py

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.13.0
#   kernelspec:
#     display_name: straw2analysis
#     language: python
#     name: straw2analysis
# ---

# %%
# %matplotlib inline
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from pyprojroot import here
from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.impute import SimpleImputer

nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import machine_learning.features_sensor
import machine_learning.labels
import machine_learning.model

# %%
import participants.query_db
from features import esm, helper, proximity

# %% [markdown] tags=[]
# # 1. Get the relevant data

# %%
participants_inactive_usernames = participants.query_db.get_usernames(
    collection_start=datetime.date.fromisoformat("2020-08-01")
)
# Consider only two participants to simplify.
ptcp_2 = participants_inactive_usernames[0:2]

# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]
# ## 1.1 Labels

# %%
df_esm = esm.get_esm_data(ptcp_2)
df_esm_preprocessed = esm.preprocess_esm(df_esm)

# %%
df_esm_PANAS = df_esm_preprocessed[
    (df_esm_preprocessed["questionnaire_id"] == 8)
    | (df_esm_preprocessed["questionnaire_id"] == 9)
]
df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)

# %% [markdown]
# ## 1.2 Sensor data

# %%
df_proximity = proximity.get_proximity_data(ptcp_2)
df_proximity = helper.get_date_from_timestamp(df_proximity)
df_proximity = proximity.recode_proximity(df_proximity)

# %% [markdown]
# ## 1.3 Standardization/personalization

# %% [markdown]
# # 2. Grouping/segmentation

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])
    .esm_user_answer_numeric.agg("mean")
    .reset_index()
    .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
)

# %%
df_esm_PANAS_daily_means = (
    df_esm_PANAS_daily_means.pivot(
        index=["participant_id", "date_lj"],
        columns="questionnaire_id",
        values="esm_numeric_mean",
    )
    .reset_index(col_level=1)
    .rename(columns={8.0: "PA", 9.0: "NA"})
    .set_index(["participant_id", "date_lj"])
)


# %%
df_proximity_daily_counts = proximity.count_proximity(df_proximity, ["date_lj"])

# %%
df_proximity_daily_counts

# %% [markdown]
# # 3. Join features (and export to csv?)

# %%
df_full_data_daily_means = df_esm_PANAS_daily_means.join(
    df_proximity_daily_counts
).reset_index()

# %% [markdown]
# # 4. Machine learning model and parameters

# %%
lin_reg_proximity = linear_model.LinearRegression()

# %% [markdown]
# ## 4.1 Validation method

# %%
logo = LeaveOneGroupOut()
logo.get_n_splits(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
)

# %% [markdown]
# ## 4.2 Fit results (export?)

# %%
cross_val_score(
    lin_reg_proximity,
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
    groups=df_full_data_daily_means["participant_id"],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)

# %%
lin_reg_proximity.fit(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)

# %%
lin_reg_proximity.score(
    df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],
    df_full_data_daily_means["PA"],
)

# %% [markdown]
# # Merging these into a pipeline

# %%
from machine_learning import features_sensor, labels, model, pipeline

# %%
importlib.reload(features_sensor)

# %%
with open("../machine_learning/config/minimal_features.yaml", "r") as file:
    sensor_features_params = yaml.safe_load(file)
print(sensor_features_params)

# %%
sensor_features = machine_learning.features_sensor.SensorFeatures(
    **sensor_features_params
)
sensor_features.data_types

# %%
sensor_features.set_participants_label("nokia_0000003")

# %%
sensor_features.data_types = ["proximity", "communication"]
sensor_features.participants_usernames = ptcp_2

# %%
sensor_features.get_sensor_data("proximity")

# %%
sensor_features.set_sensor_data()

# %%
sensor_features.get_sensor_data("proximity")

# %%
sensor_features.calculate_features(cached=False)
features_all_calculated = sensor_features.get_features("all", "all")

# %%
sensor_features.calculate_features(cached=True)
features_all_read = sensor_features.get_features("all", "all")

# %%
features_all_read = features_all_read.reset_index()
features_all_read["date_lj"] = features_all_read["date_lj"].dt.date
features_all_read.set_index(["participant_id", "date_lj"], inplace=True)
# date_lj column is parsed as a date and represented as Timestamp, when read from csv.
# When calculated, it is represented as date.

# %%
np.isclose(features_all_read, features_all_calculated).all()

# %%
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)

# %%
labels = machine_learning.labels.Labels(**labels_params)
labels.participants_usernames = ptcp_2
labels.set_participants_label("nokia_0000003")
labels.questionnaires

# %%
labels.set_labels()

# %%
labels.get_labels("PANAS")

# %%
labels.aggregate_labels(cached=False)
labels_calculated = labels.get_aggregated_labels()

# %%
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# date_lj column is parsed as a date and represented as Timestamp, when read from csv.
# When calculated, it is represented as date.

# %%
np.isclose(labels_read, labels_calculated).all()

# %%
model_validation = machine_learning.model.ModelValidation(
    sensor_features.get_features("all", "all"),
    labels.get_aggregated_labels(),
    group_variable="participant_id",
    cv_name="loso",
)
model_validation.model = linear_model.LinearRegression()
model_validation.set_cv_method()

# %%
model_validation.cross_validate()

# %%
model_validation.groups

# %% [markdown]
# # Use RAPIDS

# %%
with open(here("rapids/config.yaml"), "r") as file:
    rapids_config = yaml.safe_load(file)

# %%
for key in rapids_config.keys():    
    if isinstance(rapids_config[key], dict):  # Remove top-level configs
        if ("PROVIDERS" in rapids_config[key]):  # Retain features (that have providers)
            if rapids_config[key]["PROVIDERS"]:  # Remove non-implemented features
                for provider in rapids_config[key]["PROVIDERS"]:
                    if rapids_config[key]["PROVIDERS"][provider]["COMPUTE"]:  # Check that the features were actually calculated
                        if "FEATURES" in rapids_config[key]["PROVIDERS"][provider]:
                            print(key)
                            print(provider)
                            print(rapids_config[key]["PROVIDERS"][provider]["FEATURES"])

# %%
features_rapids = pd.read_csv(here("rapids/data/processed/features/all_participants/all_sensor_features.csv"), parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"])

# %%
features_rapids.columns

# %%
features_rapids = features_rapids.assign(date_lj=lambda x: x.local_segment_start_datetime.dt.date)

# %%
features_rapids["participant_id"] = features_rapids["pid"].str.extract("(\d+)")
features_rapids["participant_id"] = pd.to_numeric(features_rapids["participant_id"])
features_rapids.set_index(["participant_id", "date_lj"], inplace=True)

# %%
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
    labels_params = yaml.safe_load(file)

# %%
labels = machine_learning.labels.Labels(**labels_params)
labels.set_participants_label("all")

# %%
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# date_lj column is parsed as a date and represented as Timestamp, when read from csv.
# When calculated, it is represented as date.

# %%
features_rapids.shape

# %%
labels_read.shape

# %%
features_labels = features_rapids.join(labels_read, how="inner").reset_index()

# %%
features_labels.shape

# %%
features_labels.columns

# %%
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# %%
feature_columns = features_labels.columns[6:-3]
label_column = "NA"
group_column = "pid"

# %%
lin_reg_rapids = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
logo.get_n_splits(
    features_labels[feature_columns],
    features_labels[label_column],
    groups=features_labels[group_column],
)

# %%
cross_val_score(
    lin_reg_rapids,
    X=imputer.fit_transform(features_labels[feature_columns]),
    y=features_labels[label_column],
    groups=features_labels[group_column],
    cv=logo,
    n_jobs=-1,
    scoring="r2",
)

# %%
sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(features_labels[feature_columns].isna(), cbar=False)

# %% [markdown] tags=[]
# ```yaml
# ALL_CLEANING_INDIVIDUAL:
#   PROVIDERS:
#     RAPIDS:
#       COMPUTE: True
#       IMPUTE_SELECTED_EVENT_FEATURES: # Fill NAs with 0 only for event-based features, see table below
#         COMPUTE: True
#         MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # Any feature value in a time segment instance with phone data yield > [MIN_DATA_YIELDED_MINUTES_TO_IMPUTE] will be replaced with a zero. 
#       COLS_NAN_THRESHOLD: 0.3 # Discard columns with missing value ratios higher than [COLS_NAN_THRESHOLD]. Set to 1 to disable
#       COLS_VAR_THRESHOLD: True # Set to True to discard columns with zero variance
#       ROWS_NAN_THRESHOLD: 1 # Discard rows with missing value ratios higher than [ROWS_NAN_THRESHOLD]. Set to 1 to disable
#       DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
#       DATA_YIELD_RATIO_THRESHOLD: 0.3 # Discard rows with ratiovalidyieldedhours or ratiovalidyieldedminutes feature less than [DATA_YIELD_RATIO_THRESHOLD]. The feature name is determined by [DATA_YIELD_FEATURE] parameter. Set to 0 to disable
#       DROP_HIGHLY_CORRELATED_FEATURES:
#         COMPUTE: False
#         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
#         CORR_THRESHOLD: 0.95
#       SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
# ```

# %%
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# ---`
			`# jupyter:`
			`# jupytext:`
			`# formats: ipynb,py:percent`
			`# text_representation:`
			`# extension: .py`
			`# format_name: percent`
			`# format_version: '1.3'`
Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00			`# jupytext_version: 1.13.0`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# kernelspec:`
			`# display_name: straw2analysis`
			`# language: python`
			`# name: straw2analysis`
			`# ---`

			`# %%`
			`# %matplotlib inline`
			`import datetime`
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00			`import importlib`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`import os`
			`import sys`

Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`import numpy as np`
			`import pandas as pd`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`import seaborn as sns`
Add communication features to pipeline. 2021-08-19 17:05:44 +02:00			`import yaml`
Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00			`from pyprojroot import here`
Add an example for linear regression. 2021-08-12 16:54:00 +02:00			`from sklearn import linear_model`
			`from sklearn.model_selection import LeaveOneGroupOut, cross_val_score`
ML with RAPIDS and missing values. 2022-01-19 12:53:03 +01:00			`from sklearn.impute import SimpleImputer`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00
			`nb_dir = os.path.split(os.getcwd())[0]`
			`if nb_dir not in sys.path:`
			`sys.path.append(nb_dir)`

Refactor machine_learning/pipeline.py by defining one class by file. 2021-09-13 11:41:57 +02:00			`import machine_learning.features_sensor`
			`import machine_learning.labels`
			`import machine_learning.model`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00
Document the SensorFeatures class and its __init__ method. 2021-09-13 17:43:47 +02:00			`# %%`
			`import participants.query_db`
			`from features import esm, helper, proximity`

Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`# %% [markdown] tags=[]`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# # 1. Get the relevant data`

			`# %%`
			`participants_inactive_usernames = participants.query_db.get_usernames(`
			`collection_start=datetime.date.fromisoformat("2020-08-01")`
			`)`
			`# Consider only two participants to simplify.`
			`ptcp_2 = participants_inactive_usernames[0:2]`

Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# ## 1.1 Labels`

			`# %%`
			`df_esm = esm.get_esm_data(ptcp_2)`
			`df_esm_preprocessed = esm.preprocess_esm(df_esm)`

			`# %%`
			`df_esm_PANAS = df_esm_preprocessed[`
			`(df_esm_preprocessed["questionnaire_id"] == 8)`
			`\| (df_esm_preprocessed["questionnaire_id"] == 9)`
			`]`
			`df_esm_PANAS_clean = esm.clean_up_esm(df_esm_PANAS)`

			`# %% [markdown]`
			`# ## 1.2 Sensor data`

			`# %%`
			`df_proximity = proximity.get_proximity_data(ptcp_2)`
Use the same function for ESM and other data. 2021-08-11 17:26:44 +02:00			`df_proximity = helper.get_date_from_timestamp(df_proximity)`
Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`df_proximity = proximity.recode_proximity(df_proximity)`

Add a parameter for grouping. 2021-08-12 15:07:20 +02:00			`# %% [markdown]`
			`# ## 1.3 Standardization/personalization`

Start machine learning pipeline example. Select data and labels. 2021-08-11 16:42:30 +02:00			`# %% [markdown]`
			`# # 2. Grouping/segmentation`

			`# %%`
			`df_esm_PANAS_daily_means = (`
			`df_esm_PANAS_clean.groupby(["participant_id", "date_lj", "questionnaire_id"])`
			`.esm_user_answer_numeric.agg("mean")`
			`.reset_index()`
			`.rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})`
			`)`
Add a parameter for grouping. 2021-08-12 15:07:20 +02:00
Add an example for linear regression. 2021-08-12 16:54:00 +02:00			`# %%`
			`df_esm_PANAS_daily_means = (`
			`df_esm_PANAS_daily_means.pivot(`
			`index=["participant_id", "date_lj"],`
			`columns="questionnaire_id",`
			`values="esm_numeric_mean",`
			`)`
			`.reset_index(col_level=1)`
			`.rename(columns={8.0: "PA", 9.0: "NA"})`
			`.set_index(["participant_id", "date_lj"])`
			`)`


Add a parameter for grouping. 2021-08-12 15:07:20 +02:00			`# %%`
Add a demo of pipeline. 2021-11-17 10:44:49 +01:00			`df_proximity_daily_counts = proximity.count_proximity(df_proximity, ["date_lj"])`
Add a parameter for grouping. 2021-08-12 15:07:20 +02:00
			`# %%`
			`df_proximity_daily_counts`

			`# %% [markdown]`
			`# # 3. Join features (and export to csv?)`

Add an example for linear regression. 2021-08-12 16:54:00 +02:00			`# %%`
			`df_full_data_daily_means = df_esm_PANAS_daily_means.join(`
			`df_proximity_daily_counts`
			`).reset_index()`

Add a parameter for grouping. 2021-08-12 15:07:20 +02:00			`# %% [markdown]`
			`# # 4. Machine learning model and parameters`

			`# %%`
Add an example for linear regression. 2021-08-12 16:54:00 +02:00			`lin_reg_proximity = linear_model.LinearRegression()`

			`# %% [markdown]`
			`# ## 4.1 Validation method`

			`# %%`
			`logo = LeaveOneGroupOut()`
			`logo.get_n_splits(`
			`df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],`
			`df_full_data_daily_means["PA"],`
			`groups=df_full_data_daily_means["participant_id"],`
			`)`

			`# %% [markdown]`
			`# ## 4.2 Fit results (export?)`

			`# %%`
			`cross_val_score(`
			`lin_reg_proximity,`
			`df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],`
			`df_full_data_daily_means["PA"],`
			`groups=df_full_data_daily_means["participant_id"],`
			`cv=logo,`
			`n_jobs=-1,`
			`scoring="r2",`
			`)`

			`# %%`
			`lin_reg_proximity.fit(`
			`df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],`
			`df_full_data_daily_means["PA"],`
			`)`

			`# %%`
			`lin_reg_proximity.score(`
			`df_full_data_daily_means[["freq_prox_near", "prop_prox_near"]],`
			`df_full_data_daily_means["PA"],`
			`)`
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00
			`# %% [markdown]`
			`# # Merging these into a pipeline`

			`# %%`
Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`from machine_learning import features_sensor, labels, model, pipeline`
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00			`# %%`
Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`importlib.reload(features_sensor)`
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00			`# %%`
Add communication features to pipeline. 2021-08-19 17:05:44 +02:00			`with open("../machine_learning/config/minimal_features.yaml", "r") as file:`
Accept nested feature configuration. To do this, pass a dict as parameters to SensorFeatures class, rather than actually reading the object from yaml file. 2021-08-19 17:23:23 +02:00			`sensor_features_params = yaml.safe_load(file)`
Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00			`print(sensor_features_params)`
Accept nested feature configuration. To do this, pass a dict as parameters to SensorFeatures class, rather than actually reading the object from yaml file. 2021-08-19 17:23:23 +02:00
			`# %%`
Document the SensorFeatures class and its __init__ method. 2021-09-13 17:43:47 +02:00			`sensor_features = machine_learning.features_sensor.SensorFeatures(`
			`**sensor_features_params`
			`)`
Accept nested feature configuration. To do this, pass a dict as parameters to SensorFeatures class, rather than actually reading the object from yaml file. 2021-08-19 17:23:23 +02:00			`sensor_features.data_types`
[WIP] Methods to get the labels and data plus aggregate them. 2021-08-12 19:06:43 +02:00
Save calculated features to csv files. 2021-08-23 16:36:26 +02:00			`# %%`
			`sensor_features.set_participants_label("nokia_0000003")`

Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00			`# %%`
			`sensor_features.data_types = ["proximity", "communication"]`
[WIP] Add a class for model validation. 2021-08-20 19:44:50 +02:00			`sensor_features.participants_usernames = ptcp_2`
Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00
Add communication features to pipeline. 2021-08-19 17:05:44 +02:00			`# %%`
			`sensor_features.get_sensor_data("proximity")`

[WIP] Methods to get the labels and data plus aggregate them. 2021-08-12 19:06:43 +02:00			`# %%`
Convert the class into a YAML object. Add an example config file and demonstrate its usage in ex_ml_pipeline.ipynb. 2021-08-19 16:31:42 +02:00			`sensor_features.set_sensor_data()`
[WIP] Methods to get the labels and data plus aggregate them. 2021-08-12 19:06:43 +02:00
			`# %%`
Convert the class into a YAML object. Add an example config file and demonstrate its usage in ex_ml_pipeline.ipynb. 2021-08-19 16:31:42 +02:00			`sensor_features.get_sensor_data("proximity")`
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00
			`# %%`
Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`sensor_features.calculate_features(cached=False)`
			`features_all_calculated = sensor_features.get_features("all", "all")`

			`# %%`
			`sensor_features.calculate_features(cached=True)`
			`features_all_read = sensor_features.get_features("all", "all")`

			`# %%`
			`features_all_read = features_all_read.reset_index()`
			`features_all_read["date_lj"] = features_all_read["date_lj"].dt.date`
			`features_all_read.set_index(["participant_id", "date_lj"], inplace=True)`
			`# date_lj column is parsed as a date and represented as Timestamp, when read from csv.`
			`# When calculated, it is represented as date.`
[WIP] Start merging steps into a class for a pipeline. 2021-08-12 17:38:08 +02:00
Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00			`# %%`
Enable reading features from csv files. 2021-09-14 17:42:34 +02:00			`np.isclose(features_all_read, features_all_calculated).all()`
Merge features into a common df. But first, group communication by the grouping_variable. 2021-08-20 17:59:00 +02:00
Accept nested feature configuration. To do this, pass a dict as parameters to SensorFeatures class, rather than actually reading the object from yaml file. 2021-08-19 17:23:23 +02:00			`# %%`
Add a similar class for labels. 2021-08-19 17:44:04 +02:00			`with open("../machine_learning/config/minimal_labels.yaml", "r") as file:`
			`labels_params = yaml.safe_load(file)`

			`# %%`
Refactor machine_learning/pipeline.py by defining one class by file. 2021-09-13 11:41:57 +02:00			`labels = machine_learning.labels.Labels(**labels_params)`
[WIP] Add a class for model validation. 2021-08-20 19:44:50 +02:00			`labels.participants_usernames = ptcp_2`
Add export capabilities to labels.py. 2021-09-15 15:36:36 +02:00			`labels.set_participants_label("nokia_0000003")`
Add a similar class for labels. 2021-08-19 17:44:04 +02:00			`labels.questionnaires`

			`# %%`
			`labels.set_labels()`

			`# %%`
			`labels.get_labels("PANAS")`

			`# %%`
Add an option to read cached labels from a file. 2021-09-15 15:45:49 +02:00			`labels.aggregate_labels(cached=False)`
			`labels_calculated = labels.get_aggregated_labels()`
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00
			`# %%`
Add an option to read cached labels from a file. 2021-09-15 15:45:49 +02:00			`labels.aggregate_labels(cached=True)`
			`labels_read = labels.get_aggregated_labels()`
			`labels_read = labels_read.reset_index()`
			`labels_read["date_lj"] = labels_read["date_lj"].dt.date`
			`labels_read.set_index(["participant_id", "date_lj"], inplace=True)`
			`# date_lj column is parsed as a date and represented as Timestamp, when read from csv.`
			`# When calculated, it is represented as date.`

			`# %%`
			`np.isclose(labels_read, labels_calculated).all()`
Aggregate labels using grouping_variable. 2021-08-20 19:17:22 +02:00
			`# %%`
Refactor machine_learning/pipeline.py by defining one class by file. 2021-09-13 11:41:57 +02:00			`model_validation = machine_learning.model.ModelValidation(`
[WIP] Add a class for model validation. 2021-08-20 19:44:50 +02:00			`sensor_features.get_features("all", "all"),`
			`labels.get_aggregated_labels(),`
			`group_variable="participant_id",`
			`cv_name="loso",`
			`)`
			`model_validation.model = linear_model.LinearRegression()`
			`model_validation.set_cv_method()`

			`# %%`
			`model_validation.cross_validate()`

			`# %%`
			`model_validation.groups`

Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00			`# %% [markdown]`
			`# # Use RAPIDS`

			`# %%`
			`with open(here("rapids/config.yaml"), "r") as file:`
			`rapids_config = yaml.safe_load(file)`

			`# %%`
ML with RAPIDS and missing values. 2022-01-19 12:53:03 +01:00			`for key in rapids_config.keys():`
Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00			`if isinstance(rapids_config[key], dict): # Remove top-level configs`
ML with RAPIDS and missing values. 2022-01-19 12:53:03 +01:00			`if ("PROVIDERS" in rapids_config[key]): # Retain features (that have providers)`
Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00			`if rapids_config[key]["PROVIDERS"]: # Remove non-implemented features`
			`for provider in rapids_config[key]["PROVIDERS"]:`
ML with RAPIDS and missing values. 2022-01-19 12:53:03 +01:00			`if rapids_config[key]["PROVIDERS"][provider]["COMPUTE"]: # Check that the features were actually calculated`
Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00			`if "FEATURES" in rapids_config[key]["PROVIDERS"][provider]:`
			`print(key)`
			`print(provider)`
			`print(rapids_config[key]["PROVIDERS"][provider]["FEATURES"])`

			`# %%`
ML with RAPIDS and missing values. 2022-01-19 12:53:03 +01:00			`features_rapids = pd.read_csv(here("rapids/data/processed/features/all_participants/all_sensor_features.csv"), parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"])`
Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00
			`# %%`
			`features_rapids.columns`

			`# %%`
ML with RAPIDS and missing values. 2022-01-19 12:53:03 +01:00			`features_rapids = features_rapids.assign(date_lj=lambda x: x.local_segment_start_datetime.dt.date)`
Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00
[WIP] Add a class for model validation. 2021-08-20 19:44:50 +02:00			`# %%`
Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00			`features_rapids["participant_id"] = features_rapids["pid"].str.extract("(\d+)")`
			`features_rapids["participant_id"] = pd.to_numeric(features_rapids["participant_id"])`
ML with RAPIDS and missing values. 2022-01-19 12:53:03 +01:00			`features_rapids.set_index(["participant_id", "date_lj"], inplace=True)`

			`# %%`
			`with open("../machine_learning/config/minimal_labels.yaml", "r") as file:`
			`labels_params = yaml.safe_load(file)`

			`# %%`
			`labels = machine_learning.labels.Labels(**labels_params)`
			`labels.set_participants_label("all")`

			`# %%`
			`labels.aggregate_labels(cached=True)`
			`labels_read = labels.get_aggregated_labels()`
			`labels_read = labels_read.reset_index()`
			`labels_read["date_lj"] = labels_read["date_lj"].dt.date`
			`labels_read.set_index(["participant_id", "date_lj"], inplace=True)`
			`# date_lj column is parsed as a date and represented as Timestamp, when read from csv.`
			`# When calculated, it is represented as date.`

			`# %%`
			`features_rapids.shape`

			`# %%`
			`labels_read.shape`

			`# %%`
			`features_labels = features_rapids.join(labels_read, how="inner").reset_index()`

			`# %%`
			`features_labels.shape`

			`# %%`
			`features_labels.columns`

			`# %%`
			`imputer = SimpleImputer(missing_values=np.nan, strategy='mean')`

			`# %%`
			`feature_columns = features_labels.columns[6:-3]`
			`label_column = "NA"`
			`group_column = "pid"`

			`# %%`
			`lin_reg_rapids = linear_model.LinearRegression()`
			`logo = LeaveOneGroupOut()`
			`logo.get_n_splits(`
			`features_labels[feature_columns],`
			`features_labels[label_column],`
			`groups=features_labels[group_column],`
			`)`

			`# %%`
			`cross_val_score(`
			`lin_reg_rapids,`
			`X=imputer.fit_transform(features_labels[feature_columns]),`
			`y=features_labels[label_column],`
			`groups=features_labels[group_column],`
			`cv=logo,`
			`n_jobs=-1,`
			`scoring="r2",`
			`)`

			`# %%`
			`sns.set(rc={"figure.figsize":(16, 8)})`
			`sns.heatmap(features_labels[feature_columns].isna(), cbar=False)`

			`# %% [markdown] tags=[]`
			# ```yaml
			`# ALL_CLEANING_INDIVIDUAL:`
			`# PROVIDERS:`
			`# RAPIDS:`
			`# COMPUTE: True`
			`# IMPUTE_SELECTED_EVENT_FEATURES: # Fill NAs with 0 only for event-based features, see table below`
			`# COMPUTE: True`
			`# MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # Any feature value in a time segment instance with phone data yield > [MIN_DATA_YIELDED_MINUTES_TO_IMPUTE] will be replaced with a zero.`
			`# COLS_NAN_THRESHOLD: 0.3 # Discard columns with missing value ratios higher than [COLS_NAN_THRESHOLD]. Set to 1 to disable`
			`# COLS_VAR_THRESHOLD: True # Set to True to discard columns with zero variance`
			`# ROWS_NAN_THRESHOLD: 1 # Discard rows with missing value ratios higher than [ROWS_NAN_THRESHOLD]. Set to 1 to disable`
			`# DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES`
			`# DATA_YIELD_RATIO_THRESHOLD: 0.3 # Discard rows with ratiovalidyieldedhours or ratiovalidyieldedminutes feature less than [DATA_YIELD_RATIO_THRESHOLD]. The feature name is determined by [DATA_YIELD_FEATURE] parameter. Set to 0 to disable`
			`# DROP_HIGHLY_CORRELATED_FEATURES:`
			`# COMPUTE: False`
			`# MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5`
			`# CORR_THRESHOLD: 0.95`
			`# SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R`
			# ```

			`# %%`