Enable reading features from CSV files.

rapids
junos 2021-09-14 17:42:34 +02:00
parent af9e81fe40
commit 28699a0fdf
2 changed files with 102 additions and 33 deletions
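
In short, calculate_features() now takes a cached flag: it first tries to read previously exported features from CSV and only recomputes (and re-exports) them on a FileNotFoundError, with the merge into df_features_all happening in finally either way. A minimal standalone sketch of the same idiom, using hypothetical names (compute_features, get_or_compute, freq_prox) that are not part of this commit:

from pathlib import Path

import pandas as pd


def compute_features() -> pd.DataFrame:
    # Stand-in for an expensive step such as proximity.count_proximity().
    index = pd.Index(["p01", "p02"], name="participant_id")
    return pd.DataFrame({"freq_prox": [2, 5]}, index=index)


def get_or_compute(folder: Path, prefix: str, data_type: str, cached: bool = True) -> pd.DataFrame:
    full_path = folder / (prefix + "_" + data_type + ".csv")
    try:
        if not cached:  # Do not use the file, even if it exists.
            raise FileNotFoundError
        df = pd.read_csv(full_path, index_col="participant_id", na_values="NA")
        print("Read features from the file.")
    except FileNotFoundError:
        # Recalculate the features and cache them for the next call.
        df = compute_features()
        df.to_csv(full_path, na_rep="NA", index=True)
        print("Calculated features.")
    return df


df_calculated = get_or_compute(Path("."), "demo", "prox", cached=False)  # Calculates.
df_read = get_or_compute(Path("."), "demo", "prox")  # Reads the cached file.

Raising FileNotFoundError to signal "skip the cache" routes both the missing-file case and the cached=False case through a single except branch, which is why the commit uses try/except/finally rather than an if/else.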

View File

@@ -20,6 +20,8 @@ import importlib
 import os
 import sys
+import numpy as np
+import pandas as pd
 import seaborn as sns
 import yaml
 from sklearn import linear_model
@@ -37,7 +39,7 @@ import machine_learning.model
 import participants.query_db
 from features import esm, helper, proximity

-# %% [markdown]
+# %% [markdown] tags=[]
 # # 1. Get the relevant data

 # %%
@@ -47,7 +49,7 @@ participants_inactive_usernames = participants.query_db.get_usernames(
 # Consider only two participants to simplify.
 ptcp_2 = participants_inactive_usernames[0:2]

-# %% [markdown]
+# %% [markdown] jp-MarkdownHeadingCollapsed=true tags=[]
 # ## 1.1 Labels

 # %%
@@ -98,7 +100,7 @@ df_esm_PANAS_daily_means = (
 # %%
 df_proximity_daily_counts = proximity.count_proximity(
-    df_proximity, ["participant_id", "date_lj"]
+    df_proximity, ["date_lj"]
 )

 # %%
@@ -159,10 +161,10 @@ lin_reg_proximity.score(
 # # Merging these into a pipeline

 # %%
-from machine_learning import pipeline
+from machine_learning import features_sensor, labels, model, pipeline

 # %%
-importlib.reload(pipeline)
+importlib.reload(features_sensor)

 # %%
 with open("../machine_learning/config/minimal_features.yaml", "r") as file:
@@ -192,10 +194,22 @@ sensor_features.set_sensor_data()
 sensor_features.get_sensor_data("proximity")

 # %%
-sensor_features.calculate_features()
+sensor_features.calculate_features(cached=False)
+features_all_calculated = sensor_features.get_features("all", "all")

 # %%
-sensor_features.get_features("all", "all")
+sensor_features.calculate_features(cached=True)
+features_all_read = sensor_features.get_features("all", "all")
+
+# %%
+features_all_read = features_all_read.reset_index()
+features_all_read["date_lj"] = features_all_read["date_lj"].dt.date
+features_all_read.set_index(["participant_id", "date_lj"], inplace=True)
+# date_lj column is parsed as a date and represented as Timestamp, when read from csv.
+# When calculated, it is represented as date.
+
+# %%
+np.isclose(features_all_read, features_all_calculated).all()

 # %%
 with open("../machine_learning/config/minimal_labels.yaml", "r") as file:

View File

@@ -128,35 +128,69 @@ class SensorFeatures:
         else:
             raise KeyError("This data type has not been implemented.")

-    def calculate_features(self):
+    def calculate_features(self, cached=True):
         print("Calculating features ...")
         if not self.participants_label:
             raise ValueError(WARNING_PARTICIPANTS_LABEL)
+        self.df_features_all = pd.DataFrame()

         if "proximity" in self.data_types:
-            self.df_proximity_counts = proximity.count_proximity(
-                self.df_proximity, self.grouping_variable
-            )
-            self.df_features_all = safe_outer_merge_on_index(
-                self.df_features_all, self.df_proximity_counts
-            )
-            print("Calculated proximity features.")
-            to_csv_with_settings(
-                self.df_proximity, self.folder, self.filename_prefix, data_type="prox"
-            )
+            try:
+                if not cached:  # Do not use the file, even if it exists.
+                    raise FileNotFoundError
+                self.df_proximity_counts = read_csv_with_settings(
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="prox",
+                    grouping_variable=self.grouping_variable,
+                )
+                print("Read proximity features from the file.")
+            except FileNotFoundError:
+                # We need to recalculate the features in this case.
+                self.df_proximity_counts = proximity.count_proximity(
+                    self.df_proximity, self.grouping_variable
+                )
+                print("Calculated proximity features.")
+                to_csv_with_settings(
+                    self.df_proximity_counts,
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="prox",
+                )
+            finally:
+                self.df_features_all = safe_outer_merge_on_index(
+                    self.df_features_all, self.df_proximity_counts
+                )

         if "communication" in self.data_types:
-            self.df_calls_sms = communication.calls_sms_features(
-                df_calls=self.df_calls,
-                df_sms=self.df_sms,
-                group_by=self.grouping_variable,
-            )
-            self.df_features_all = safe_outer_merge_on_index(
-                self.df_features_all, self.df_calls_sms
-            )
-            print("Calculated communication features.")
-            to_csv_with_settings(
-                self.df_calls_sms, self.folder, self.filename_prefix, data_type="comm"
-            )
+            try:
+                if not cached:  # Do not use the file, even if it exists.
+                    raise FileNotFoundError
+                self.df_calls_sms = read_csv_with_settings(
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="comm",
+                    grouping_variable=self.grouping_variable,
+                )
+                print("Read communication features from the file.")
+            except FileNotFoundError:
+                # We need to recalculate the features in this case.
+                self.df_calls_sms = communication.calls_sms_features(
+                    df_calls=self.df_calls,
+                    df_sms=self.df_sms,
+                    group_by=self.grouping_variable,
+                )
+                print("Calculated communication features.")
+                to_csv_with_settings(
+                    self.df_calls_sms,
+                    self.folder,
+                    self.filename_prefix,
+                    data_type="comm",
+                )
+            finally:
+                self.df_features_all = safe_outer_merge_on_index(
+                    self.df_features_all, self.df_calls_sms
+                )

         self.df_features_all.fillna(
             value=proximity.FILL_NA_PROXIMITY, inplace=True, downcast="infer",
@@ -211,14 +245,35 @@ def safe_outer_merge_on_index(left, right):
 def to_csv_with_settings(
     df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
 ) -> None:
-    export_filename = filename_prefix + "_" + data_type + ".csv"
-    full_path = folder / export_filename
+    full_path = construct_full_path(folder, filename_prefix, data_type)
     df.to_csv(
         path_or_buf=full_path,
         sep=",",
         na_rep="NA",
         header=True,
-        index=False,
+        index=True,
         encoding="utf-8",
     )
     print("Exported the dataframe to " + str(full_path))
+
+
+def read_csv_with_settings(
+    folder: Path, filename_prefix: str, data_type: str, grouping_variable: list
+) -> pd.DataFrame:
+    full_path = construct_full_path(folder, filename_prefix, data_type)
+    return pd.read_csv(
+        filepath_or_buffer=full_path,
+        sep=",",
+        header=0,
+        na_values="NA",
+        encoding="utf-8",
+        index_col=(["participant_id"] + grouping_variable),
+        parse_dates=True,
+        infer_datetime_format=True,
+    )
+
+
+def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
+    export_filename = filename_prefix + "_" + data_type + ".csv"
+    full_path = folder / export_filename
+    return full_path