Add an option to read cached labels from a file.

2021-09-15 15:45:49 +02:00 · 2021-09-15 15:45:49 +02:00 · b8c7606664
parent ed062d25ee
commit b8c7606664
2 changed files with 52 additions and 27 deletions
--- a/exploration/ex_ml_pipeline.py
+++ b/exploration/ex_ml_pipeline.py
@ -228,10 +228,20 @@ labels.set_labels()
 labels.get_labels("PANAS")
 # %%
-labels.aggregate_labels()
+labels.aggregate_labels(cached=False)
 labels_calculated = labels.get_aggregated_labels()
 # %%
-labels.get_aggregated_labels()
+labels.aggregate_labels(cached=True)
 labels_read = labels.get_aggregated_labels()
 labels_read = labels_read.reset_index()
 labels_read["date_lj"] = labels_read["date_lj"].dt.date
 labels_read.set_index(["participant_id", "date_lj"], inplace=True)
 # When read from csv, the date_lj column is parsed as a date and represented as a Timestamp.
 # When calculated, it is represented as a date.
 # %%
 np.isclose(labels_read, labels_calculated).all()
 # %%
 model_validation = machine_learning.model.ModelValidation(
--- a/machine_learning/labels.py
+++ b/machine_learning/labels.py
@ -9,7 +9,7 @@ from pyprojroot import here
 import participants.query_db
 from features import esm
 from machine_learning import QUESTIONNAIRE_IDS, QUESTIONNAIRE_IDS_RENAME
-from machine_learning.helper import to_csv_with_settings
+from machine_learning.helper import to_csv_with_settings, read_csv_with_settings
 WARNING_PARTICIPANTS_LABEL = (
    "Before aggregating labels, please set participants label using self.set_participants_label() "
@ -75,8 +75,23 @@ class Labels:
        else:
            raise KeyError("This questionnaire has not been implemented as a label.")
-    def aggregate_labels(self) -> None:
+    def aggregate_labels(self, cached=True) -> None:
        print("Aggregating labels ...")
        if not self.participants_label:
            raise ValueError(WARNING_PARTICIPANTS_LABEL)
        try:
            if not cached:  # Do not use the file, even if it exists.
                raise FileNotFoundError
            self.df_esm_means = read_csv_with_settings(
                self.folder,
                self.filename_prefix,
                data_type="_".join(self.questionnaires),
                grouping_variable=self.grouping_variable
            )
            print("Read labels from the file.")
        except FileNotFoundError:
            # We need to recalculate the labels in this case.
            self.df_esm_means = (
                self.df_esm_clean.groupby(
                    ["participant_id", "questionnaire_id"] + self.grouping_variable