Further refactor by moving helper functions.

2021-09-15 15:14:54 +02:00 · 2021-09-15 15:14:54 +02:00 · 20748890a8
parent 28699a0fdf
commit 20748890a8
3 changed files with 69 additions and 59 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ __pycache__/
 /exploration/*.ipynb
 /config/*.ipynb
 /statistical_analysis/*.ipynb
+/machine_learning/intermediate_results/
--- a/machine_learning/features_sensor.py
+++ b/machine_learning/features_sensor.py
@@ -8,6 +8,11 @@ from pyprojroot import here

 import participants.query_db
 from features import communication, helper, proximity
+from machine_learning.helper import (
+    read_csv_with_settings,
+    safe_outer_merge_on_index,
+    to_csv_with_settings,
+)

 WARNING_PARTICIPANTS_LABEL = (
    "Before calculating features, please set participants label using self.set_participants_label() "
@@ -53,7 +58,7 @@ class SensorFeatures:
        grouping_variable: str,
        features: dict,
        participants_usernames: Collection = None,
-    ):
+    ) -> None:
        """
        Specifies the grouping variable and usernames for which to calculate features.
        Sets other (implicit) attributes used in other methods.
@@ -97,12 +102,12 @@ class SensorFeatures:
        self.df_sms = pd.DataFrame()
        self.df_calls_sms = pd.DataFrame()

-        self.folder = None
+        self.folder: Path = Path()
        self.filename_prefix = ""
        self.construct_export_path()
        print("SensorFeatures initialized.")

-    def set_sensor_data(self):
+    def set_sensor_data(self) -> None:
        print("Querying database ...")
        if "proximity" in self.data_types:
            self.df_proximity = proximity.get_proximity_data(
@@ -128,7 +133,7 @@ class SensorFeatures:
        else:
            raise KeyError("This data type has not been implemented.")

-    def calculate_features(self, cached=True):
+    def calculate_features(self, cached=True) -> None:
        print("Calculating features ...")
        if not self.participants_label:
            raise ValueError(WARNING_PARTICIPANTS_LABEL)
@@ -213,7 +218,7 @@ class SensorFeatures:
        else:
            raise KeyError("This data type has not been implemented.")

-    def construct_export_path(self):
+    def construct_export_path(self) -> None:
        if not self.participants_label:
            warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning)
        self.folder = here("machine_learning/intermediate_results/features", warn=True)
@@ -221,59 +226,6 @@ class SensorFeatures:
            self.participants_label + "_" + self.grouping_variable_name
        )

-    def set_participants_label(self, label: str):
+    def set_participants_label(self, label: str) -> None:
        self.participants_label = label
        self.construct_export_path()
-
-
-def safe_outer_merge_on_index(left, right):
-    if left.empty:
-        return right
-    elif right.empty:
-        return left
-    else:
-        return pd.merge(
-            left,
-            right,
-            how="outer",
-            left_index=True,
-            right_index=True,
-            validate="one_to_one",
-        )
-
-
-def to_csv_with_settings(
-    df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
-) -> None:
-    full_path = construct_full_path(folder, filename_prefix, data_type)
-    df.to_csv(
-        path_or_buf=full_path,
-        sep=",",
-        na_rep="NA",
-        header=True,
-        index=True,
-        encoding="utf-8",
-    )
-    print("Exported the dataframe to " + str(full_path))
-
-
-def read_csv_with_settings(
-    folder: Path, filename_prefix: str, data_type: str, grouping_variable: list
-) -> pd.DataFrame:
-    full_path = construct_full_path(folder, filename_prefix, data_type)
-    return pd.read_csv(
-        filepath_or_buffer=full_path,
-        sep=",",
-        header=0,
-        na_values="NA",
-        encoding="utf-8",
-        index_col=(["participant_id"] + grouping_variable),
-        parse_dates=True,
-        infer_datetime_format=True,
-    )
-
-
-def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
-    export_filename = filename_prefix + "_" + data_type + ".csv"
-    full_path = folder / export_filename
-    return full_path
--- a/machine_learning/helper.py
+++ b/machine_learning/helper.py
@@ -0,0 +1,57 @@
+from pathlib import Path
+
+import pandas as pd
+
+
+def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
+    if left.empty:
+        return right
+    elif right.empty:
+        return left
+    else:
+        return pd.merge(
+            left,
+            right,
+            how="outer",
+            left_index=True,
+            right_index=True,
+            validate="one_to_one",
+        )
+
+
+def to_csv_with_settings(
+    df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
+) -> None:
+    full_path = construct_full_path(folder, filename_prefix, data_type)
+    df.to_csv(
+        path_or_buf=full_path,
+        sep=",",
+        na_rep="NA",
+        header=True,
+        index=True,
+        encoding="utf-8",
+    )
+    print("Exported the dataframe to " + str(full_path))
+
+
+def read_csv_with_settings(
+    folder: Path, filename_prefix: str, data_type: str, grouping_variable: list
+) -> pd.DataFrame:
+    full_path = construct_full_path(folder, filename_prefix, data_type)
+    return pd.read_csv(
+        filepath_or_buffer=full_path,
+        sep=",",
+        header=0,
+        na_values="NA",
+        encoding="utf-8",
+        index_col=(["participant_id"] + grouping_variable),
+        parse_dates=True,
+        infer_datetime_format=True,
+        cache_dates=True,
+    )
+
+
+def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
+    export_filename = filename_prefix + "_" + data_type + ".csv"
+    full_path = folder / export_filename
+    return full_path