Enable reading features from csv files.

2021-09-14 17:42:34 +02:00 · 2021-09-14 17:42:34 +02:00 · 28699a0fdf
parent af9e81fe40
commit 28699a0fdf
2 changed files with 102 additions and 33 deletions
--- a/exploration/ex_ml_pipeline.py
+++ b/exploration/ex_ml_pipeline.py
@ -20,6 +20,8 @@ import importlib
 import os
 import sys

+import numpy as np
+import pandas as pd
 import seaborn as sns
 import yaml
 from sklearn import linear_model
@ -37,7 +39,7 @@ import machine_learning.model
 import participants.query_db
 from features import esm, helper, proximity

-# %% [markdown]
+# %% [markdown] tags=[]
 # # 1. Get the relevant data

 # %%
@ -47,7 +49,7 @@ participants_inactive_usernames = participants.query_db.get_usernames(
 # Consider only two participants to simplify.
 ptcp_2 = participants_inactive_usernames[0:2]

-# %% [markdown]
+# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]
 # ## 1.1 Labels

 # %%
@ -98,7 +100,7 @@ df_esm_PANAS_daily_means = (

 # %%
 df_proximity_daily_counts = proximity.count_proximity(
-    df_proximity, ["participant_id", "date_lj"]
+    df_proximity, ["date_lj"]
 )

 # %%
@ -159,10 +161,10 @@ lin_reg_proximity.score(
 # # Merging these into a pipeline

 # %%
-from machine_learning import pipeline
+from machine_learning import features_sensor, labels, model, pipeline

 # %%
-importlib.reload(pipeline)
+importlib.reload(features_sensor)

 # %%
 with open("../machine_learning/config/minimal_features.yaml", "r") as file:
@ -192,10 +194,22 @@ sensor_features.set_sensor_data()
 sensor_features.get_sensor_data("proximity")

 # %%
-sensor_features.calculate_features()
+sensor_features.calculate_features(cached=False)
+features_all_calculated = sensor_features.get_features("all", "all")

 # %%
-sensor_features.get_features("all", "all")
+sensor_features.calculate_features(cached=True)
+features_all_read = sensor_features.get_features("all", "all")
+
+# %%
+features_all_read = features_all_read.reset_index()
+features_all_read["date_lj"] = features_all_read["date_lj"].dt.date
+features_all_read.set_index(["participant_id", "date_lj"], inplace=True)
+# When read from csv, date_lj is parsed as a date and represented as a Timestamp;
+# when calculated, it is a plain date, so convert it before comparing the two.
+
+# %%
+np.isclose(features_all_read, features_all_calculated).all()

 # %%
 with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
--- a/machine_learning/features_sensor.py
+++ b/machine_learning/features_sensor.py
@ -128,34 +128,68 @@ class SensorFeatures:
        else:
            raise KeyError("This data type has not been implemented.")

-    def calculate_features(self):
+    def calculate_features(self, cached=True):
        print("Calculating features ...")
        if not self.participants_label:
            raise ValueError(WARNING_PARTICIPANTS_LABEL)
+        self.df_features_all = pd.DataFrame()
+
        if "proximity" in self.data_types:
+            try:
+                if not cached:  # Do not use the file, even if it exists.
+                    raise FileNotFoundError
+                self.df_proximity_counts = read_csv_with_settings(
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="prox",
+                    grouping_variable=self.grouping_variable,
+                )
+                print("Read proximity features from the file.")
+            except FileNotFoundError:
+                # No cached file was used: recalculate the features and export them.
                self.df_proximity_counts = proximity.count_proximity(
                    self.df_proximity, self.grouping_variable
                )
+                print("Calculated proximity features.")
+                to_csv_with_settings(
+                    self.df_proximity_counts,
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="prox",
+                )
+            finally:
                self.df_features_all = safe_outer_merge_on_index(
                    self.df_features_all, self.df_proximity_counts
                )
-            print("Calculated proximity features.")
-        to_csv_with_settings(
-            self.df_proximity, self.folder, self.filename_prefix, data_type="prox"
-        )

        if "communication" in self.data_types:
+            try:
+                if not cached:  # Do not use the file, even if it exists.
+                    raise FileNotFoundError
+                self.df_calls_sms = read_csv_with_settings(
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="comm",
+                    grouping_variable=self.grouping_variable,
+                )
+                print("Read communication features from the file.")
+            except FileNotFoundError:
+                # No cached file was used: recalculate the features and export them.
                self.df_calls_sms = communication.calls_sms_features(
                    df_calls=self.df_calls,
                    df_sms=self.df_sms,
                    group_by=self.grouping_variable,
                )
-            self.df_features_all = safe_outer_merge_on_index(
-                self.df_features_all, self.df_calls_sms
-            )
                print("Calculated communication features.")
                to_csv_with_settings(
-            self.df_calls_sms, self.folder, self.filename_prefix, data_type="comm"
+                    self.df_calls_sms,
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="comm",
+                )
+            finally:
+                self.df_features_all = safe_outer_merge_on_index(
+                    self.df_features_all, self.df_calls_sms
                )

        self.df_features_all.fillna(
@ -211,14 +245,35 @@ def safe_outer_merge_on_index(left, right):
 def to_csv_with_settings(
    df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
 ) -> None:
-    export_filename = filename_prefix + "_" + data_type + ".csv"
-    full_path = folder / export_filename
+    full_path = construct_full_path(folder, filename_prefix, data_type)
    df.to_csv(
        path_or_buf=full_path,
        sep=",",
        na_rep="NA",
        header=True,
-        index=False,
+        index=True,
        encoding="utf-8",
    )
    print("Exported the dataframe to " + str(full_path))
+
+
+def read_csv_with_settings(
+    folder: Path, filename_prefix: str, data_type: str, grouping_variable: list
+) -> pd.DataFrame:
+    full_path = construct_full_path(folder, filename_prefix, data_type)
+    return pd.read_csv(
+        filepath_or_buffer=full_path,
+        sep=",",
+        header=0,
+        na_values="NA",
+        encoding="utf-8",
+        index_col=(["participant_id"] + grouping_variable),
+        parse_dates=True,
+        infer_datetime_format=True,
+    )
+
+
+def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
+    export_filename = filename_prefix + "_" + data_type + ".csv"
+    full_path = folder / export_filename
+    return full_path