diff --git a/.gitignore b/.gitignore index 6098804..9f0a3e2 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ __pycache__/ /exploration/*.ipynb /config/*.ipynb /statistical_analysis/*.ipynb +/machine_learning/intermediate_results/ diff --git a/machine_learning/features_sensor.py b/machine_learning/features_sensor.py index 05b97f7..f25e37e 100644 --- a/machine_learning/features_sensor.py +++ b/machine_learning/features_sensor.py @@ -8,6 +8,11 @@ from pyprojroot import here import participants.query_db from features import communication, helper, proximity +from machine_learning.helper import ( + read_csv_with_settings, + safe_outer_merge_on_index, + to_csv_with_settings, +) WARNING_PARTICIPANTS_LABEL = ( "Before calculating features, please set participants label using self.set_participants_label() " @@ -53,7 +58,7 @@ class SensorFeatures: grouping_variable: str, features: dict, participants_usernames: Collection = None, - ): + ) -> None: """ Specifies the grouping variable and usernames for which to calculate features. Sets other (implicit) attributes used in other methods. @@ -97,12 +102,12 @@ class SensorFeatures: self.df_sms = pd.DataFrame() self.df_calls_sms = pd.DataFrame() - self.folder = None + self.folder: Path = Path() self.filename_prefix = "" self.construct_export_path() print("SensorFeatures initialized.") - def set_sensor_data(self): + def set_sensor_data(self) -> None: print("Querying database ...") if "proximity" in self.data_types: self.df_proximity = proximity.get_proximity_data( @@ -128,7 +133,7 @@ class SensorFeatures: else: raise KeyError("This data type has not been implemented.") - def calculate_features(self, cached=True): + def calculate_features(self, cached=True) -> None: print("Calculating features ...") if not self.participants_label: raise ValueError(WARNING_PARTICIPANTS_LABEL) @@ -213,7 +218,7 @@ class SensorFeatures: else: raise KeyError("This data type has not been implemented.") - def construct_export_path(self): + def construct_export_path(self) -> None: if not self.participants_label: warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning) self.folder = here("machine_learning/intermediate_results/features", warn=True) @@ -221,59 +226,6 @@ class SensorFeatures: self.participants_label + "_" + self.grouping_variable_name ) - def set_participants_label(self, label: str): + def set_participants_label(self, label: str) -> None: self.participants_label = label self.construct_export_path() - - -def safe_outer_merge_on_index(left, right): - if left.empty: - return right - elif right.empty: - return left - else: - return pd.merge( - left, - right, - how="outer", - left_index=True, - right_index=True, - validate="one_to_one", - ) - - -def to_csv_with_settings( - df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str -) -> None: - full_path = construct_full_path(folder, filename_prefix, data_type) - df.to_csv( - path_or_buf=full_path, - sep=",", - na_rep="NA", - header=True, - index=True, - encoding="utf-8", - ) - print("Exported the dataframe to " + str(full_path)) - - -def read_csv_with_settings( - folder: Path, filename_prefix: str, data_type: str, grouping_variable: list -) -> pd.DataFrame: - full_path = construct_full_path(folder, filename_prefix, data_type) - return pd.read_csv( - filepath_or_buffer=full_path, - sep=",", - header=0, - na_values="NA", - encoding="utf-8", - index_col=(["participant_id"] + grouping_variable), - parse_dates=True, - infer_datetime_format=True, - ) - - -def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path: - export_filename = filename_prefix + "_" + data_type + ".csv" - full_path = folder / export_filename - return full_path diff --git a/machine_learning/helper.py b/machine_learning/helper.py new file mode 100644 index 0000000..a606633 --- /dev/null +++ b/machine_learning/helper.py @@ -0,0 +1,57 @@ +from pathlib import Path + +import pandas as pd + + +def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame: + if left.empty: + return right + elif right.empty: + return left + else: + return pd.merge( + left, + right, + how="outer", + left_index=True, + right_index=True, + validate="one_to_one", + ) + + +def to_csv_with_settings( + df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str +) -> None: + full_path = construct_full_path(folder, filename_prefix, data_type) + df.to_csv( + path_or_buf=full_path, + sep=",", + na_rep="NA", + header=True, + index=True, + encoding="utf-8", + ) + print("Exported the dataframe to " + str(full_path)) + + +def read_csv_with_settings( + folder: Path, filename_prefix: str, data_type: str, grouping_variable: list +) -> pd.DataFrame: + full_path = construct_full_path(folder, filename_prefix, data_type) + return pd.read_csv( + filepath_or_buffer=full_path, + sep=",", + header=0, + na_values="NA", + encoding="utf-8", + index_col=(["participant_id"] + grouping_variable), + parse_dates=True, + infer_datetime_format=True, + cache_dates=True, + ) + + +def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path: + export_filename = filename_prefix + "_" + data_type + ".csv" + full_path = folder / export_filename + return full_path