Aggregate labels using grouping_variable.

2021-08-20 19:17:22 +02:00 · 2021-08-20 19:17:22 +02:00 · 0b98d59aad
parent 08fdec34f1
commit 0b98d59aad
3 changed files with 35 additions and 2 deletions
--- a/exploration/ex_ml_pipeline.py
+++ b/exploration/ex_ml_pipeline.py
@ -16,6 +16,7 @@
 # %%
 # %matplotlib inline
 import datetime
+import importlib
 import os
 import sys

@ -156,6 +157,9 @@ lin_reg_proximity.score(
 # %%
 from machine_learning import pipeline

+# %%
+importlib.reload(pipeline)
+
 # %%
 with open("../machine_learning/config/minimal_features.yaml", "r") as file:
    sensor_features_params = yaml.safe_load(file)
@ -204,3 +208,9 @@ labels.set_labels()
 labels.get_labels("PANAS")

 # %%
+labels.aggregate_labels()
+
+# %%
+labels.get_aggregated_labels()
+
+# %%
--- a/machine_learning/config/minimal_labels.yaml
+++ b/machine_learning/config/minimal_labels.yaml
@ -1,4 +1,4 @@
-grouping_variable: date_lj
+grouping_variable: [date_lj]
 labels:
  PANAS:
    - PA
--- a/machine_learning/pipeline.py
+++ b/machine_learning/pipeline.py
@ -94,7 +94,7 @@ class SensorFeatures:
 class Labels:
    def __init__(
        self,
-        grouping_variable: str,
+        grouping_variable: list,
        labels: dict,
        participants_usernames: Collection = None,
    ):
@ -113,6 +113,8 @@ class Labels:
        self.df_esm_interest = pd.DataFrame()
        self.df_esm_clean = pd.DataFrame()

+        self.df_esm_means = pd.DataFrame()
+
    def set_labels(self):
        self.df_esm = esm.get_esm_data(self.participants_usernames)
        self.df_esm_preprocessed = esm.preprocess_esm(self.df_esm)
@ -135,6 +137,27 @@ class Labels:
        else:
            raise KeyError("This questionnaire has not been implemented as a label.")

+    def aggregate_labels(self):
+        """Average the numeric ESM answers and store them in wide format.
+
+        Reads ``self.df_esm_clean`` and writes the result to
+        ``self.df_esm_means``; nothing is returned. Use
+        ``get_aggregated_labels()`` to retrieve the result.
+
+        ``self.grouping_variable`` must be a list of column names, because it
+        is concatenated onto other lists of column names below.
+
+        NOTE(review): ``self.df_esm_clean`` is initialised as an empty
+        DataFrame in ``__init__``; presumably ``set_labels()`` has to be
+        called first to populate it -- confirm.
+        """
+        # Step 1: mean of esm_user_answer_numeric per participant,
+        # questionnaire and grouping variable(s), in long format.
+        self.df_esm_means = (
+            self.df_esm_clean.groupby(["participant_id", "questionnaire_id"] + self.grouping_variable)
+            .esm_user_answer_numeric.agg("mean")
+            .reset_index()
+            .rename(columns={"esm_user_answer_numeric": "esm_numeric_mean"})
+        )
+        # Step 2: reshape to wide format, one column per questionnaire_id and
+        # one row per participant and grouping variable(s).
+        self.df_esm_means = (
+            self.df_esm_means.pivot(
+                index=["participant_id"] + self.grouping_variable,
+                columns="questionnaire_id",
+                values="esm_numeric_mean",
+            )
+            # The index is turned into columns, the columns are renamed with
+            # QUESTIONNAIRE_IDS_RENAME (defined outside this hunk; presumably
+            # maps questionnaire ids to readable names -- confirm), and the
+            # same index is then restored.
+            .reset_index(col_level=1)
+            .rename(columns=QUESTIONNAIRE_IDS_RENAME)
+            .set_index(["participant_id"] + self.grouping_variable)
+        )
+
+    def get_aggregated_labels(self):
+        """Return ``self.df_esm_means``.
+
+        This is the DataFrame written by ``aggregate_labels()``; it is still
+        the empty DataFrame set in ``__init__`` if that method has not run.
+        """
+        return self.df_esm_means
+

 def safe_outer_merge_on_index(left, right):
    if left.empty: