Further refactor by moving helper functions.

rapids
junos 2021-09-15 15:14:54 +02:00
parent 28699a0fdf
commit 20748890a8
3 changed files with 69 additions and 59 deletions

1
.gitignore vendored
View File

@ -5,3 +5,4 @@ __pycache__/
/exploration/*.ipynb /exploration/*.ipynb
/config/*.ipynb /config/*.ipynb
/statistical_analysis/*.ipynb /statistical_analysis/*.ipynb
/machine_learning/intermediate_results/

View File

@ -8,6 +8,11 @@ from pyprojroot import here
import participants.query_db import participants.query_db
from features import communication, helper, proximity from features import communication, helper, proximity
from machine_learning.helper import (
read_csv_with_settings,
safe_outer_merge_on_index,
to_csv_with_settings,
)
WARNING_PARTICIPANTS_LABEL = ( WARNING_PARTICIPANTS_LABEL = (
"Before calculating features, please set participants label using self.set_participants_label() " "Before calculating features, please set participants label using self.set_participants_label() "
@ -53,7 +58,7 @@ class SensorFeatures:
grouping_variable: str, grouping_variable: str,
features: dict, features: dict,
participants_usernames: Collection = None, participants_usernames: Collection = None,
): ) -> None:
""" """
Specifies the grouping variable and usernames for which to calculate features. Specifies the grouping variable and usernames for which to calculate features.
Sets other (implicit) attributes used in other methods. Sets other (implicit) attributes used in other methods.
@ -97,12 +102,12 @@ class SensorFeatures:
self.df_sms = pd.DataFrame() self.df_sms = pd.DataFrame()
self.df_calls_sms = pd.DataFrame() self.df_calls_sms = pd.DataFrame()
self.folder = None self.folder: Path = Path()
self.filename_prefix = "" self.filename_prefix = ""
self.construct_export_path() self.construct_export_path()
print("SensorFeatures initialized.") print("SensorFeatures initialized.")
def set_sensor_data(self): def set_sensor_data(self) -> None:
print("Querying database ...") print("Querying database ...")
if "proximity" in self.data_types: if "proximity" in self.data_types:
self.df_proximity = proximity.get_proximity_data( self.df_proximity = proximity.get_proximity_data(
@ -128,7 +133,7 @@ class SensorFeatures:
else: else:
raise KeyError("This data type has not been implemented.") raise KeyError("This data type has not been implemented.")
def calculate_features(self, cached=True): def calculate_features(self, cached=True) -> None:
print("Calculating features ...") print("Calculating features ...")
if not self.participants_label: if not self.participants_label:
raise ValueError(WARNING_PARTICIPANTS_LABEL) raise ValueError(WARNING_PARTICIPANTS_LABEL)
@ -213,7 +218,7 @@ class SensorFeatures:
else: else:
raise KeyError("This data type has not been implemented.") raise KeyError("This data type has not been implemented.")
def construct_export_path(self): def construct_export_path(self) -> None:
if not self.participants_label: if not self.participants_label:
warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning) warnings.warn(WARNING_PARTICIPANTS_LABEL, UserWarning)
self.folder = here("machine_learning/intermediate_results/features", warn=True) self.folder = here("machine_learning/intermediate_results/features", warn=True)
@ -221,59 +226,6 @@ class SensorFeatures:
self.participants_label + "_" + self.grouping_variable_name self.participants_label + "_" + self.grouping_variable_name
) )
def set_participants_label(self, label: str): def set_participants_label(self, label: str) -> None:
self.participants_label = label self.participants_label = label
self.construct_export_path() self.construct_export_path()
def safe_outer_merge_on_index(left, right):
    """
    Outer-join two dataframes on their indexes, tolerating empty inputs.

    An empty `left` yields `right` unchanged; otherwise an empty `right` yields `left` unchanged.
    Only when both sides hold data are they merged, with a one-to-one validation of the merge keys.
    """
    # An empty side has nothing to contribute, so return the other one as is.
    if left.empty:
        return right
    if right.empty:
        return left

    join_settings = {
        "how": "outer",
        "left_index": True,
        "right_index": True,
        "validate": "one_to_one",
    }
    return pd.merge(left, right, **join_settings)
def to_csv_with_settings(
    df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
) -> None:
    """
    Write `df` to a CSV file using a fixed set of export settings.

    The target path comes from construct_full_path(folder, filename_prefix, data_type).
    The file is comma-separated and UTF-8 encoded, keeps the header and the index,
    and represents missing values as "NA". The target path is printed once the export is done.
    """
    target = construct_full_path(folder, filename_prefix, data_type)
    csv_settings = {
        "sep": ",",
        "na_rep": "NA",
        "header": True,
        "index": True,
        "encoding": "utf-8",
    }
    df.to_csv(target, **csv_settings)
    print(f"Exported the dataframe to {target}")
def read_csv_with_settings(
    folder: Path, filename_prefix: str, data_type: str, grouping_variable: list
) -> pd.DataFrame:
    """
    Read a CSV file into a dataframe using a fixed set of import settings.

    The source path comes from construct_full_path(folder, filename_prefix, data_type).
    The first row is used as the header, "NA" is read as a missing value, and the index
    is built from "participant_id" followed by the columns listed in `grouping_variable`.
    """
    source = construct_full_path(folder, filename_prefix, data_type)
    # List concatenation: `grouping_variable` must itself be a list.
    index_columns = ["participant_id"] + grouping_variable
    return pd.read_csv(
        source,
        sep=",",
        header=0,
        na_values="NA",
        encoding="utf-8",
        index_col=index_columns,
        parse_dates=True,  # ask pandas to try parsing the index as dates
        infer_datetime_format=True,
    )
def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
    """
    Build the path "<folder>/<filename_prefix>_<data_type>.csv".
    """
    # str.join keeps the requirement that both name parts are strings.
    return folder / ("_".join((filename_prefix, data_type)) + ".csv")

View File

@ -0,0 +1,57 @@
from pathlib import Path
import pandas as pd
def safe_outer_merge_on_index(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    """
    Outer-join two dataframes on their indexes, tolerating empty inputs.

    If `left` is empty, `right` is returned as is; otherwise, if `right` is empty,
    `left` is returned as is. Only when both hold data are they merged,
    and the merge keys are validated as one-to-one.
    """
    # Guard clauses: an empty side contributes nothing, so hand back the other one untouched.
    if left.empty:
        return right
    if right.empty:
        return left

    merge_options = {
        "how": "outer",
        "left_index": True,
        "right_index": True,
        "validate": "one_to_one",
    }
    return pd.merge(left, right, **merge_options)
def to_csv_with_settings(
    df: pd.DataFrame, folder: Path, filename_prefix: str, data_type: str
) -> None:
    """
    Export `df` as a CSV file using fixed formatting settings.

    The destination is built by construct_full_path() from `folder`, `filename_prefix` and `data_type`.
    The file is comma-separated and UTF-8 encoded, includes the header row and the index,
    and writes missing values as "NA". The destination is printed afterwards.
    """
    destination = construct_full_path(folder, filename_prefix, data_type)
    export_settings = {
        "sep": ",",
        "na_rep": "NA",
        "header": True,
        "index": True,
        "encoding": "utf-8",
    }
    df.to_csv(destination, **export_settings)
    print(f"Exported the dataframe to {destination}")
def read_csv_with_settings(
    folder: Path, filename_prefix: str, data_type: str, grouping_variable: list
) -> pd.DataFrame:
    """
    Load a CSV file into a dataframe using fixed parsing settings.

    The location is built by construct_full_path() from `folder`, `filename_prefix` and `data_type`.
    The first row is taken as the header, "NA" is read as a missing value, and the index
    consists of "participant_id" followed by the columns named in `grouping_variable`.
    """
    location = construct_full_path(folder, filename_prefix, data_type)
    # List concatenation: `grouping_variable` must itself be a list.
    index_columns = ["participant_id"] + grouping_variable
    # NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0;
    # confirm the pinned pandas version before upgrading.
    return pd.read_csv(
        location,
        sep=",",
        header=0,
        na_values="NA",
        encoding="utf-8",
        index_col=index_columns,
        parse_dates=True,  # ask pandas to try parsing the index as dates
        infer_datetime_format=True,
        cache_dates=True,
    )
def construct_full_path(folder: Path, filename_prefix: str, data_type: str) -> Path:
    """
    Return the path of the CSV file for the given prefix and data type.

    The file name has the form "<filename_prefix>_<data_type>.csv" and is placed in `folder`.
    """
    # str.join (rather than an f-string) keeps the requirement that both parts are strings.
    csv_name = "_".join([filename_prefix, data_type]) + ".csv"
    return folder / csv_name