From 1c57320ab3e8f79cb240dc5c08ec21be8e6082f8 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Fri, 25 Jun 2021 19:06:46 -0400
Subject: [PATCH] Update segment labels and fix the bug that occurs when there
 are no labels for event segments

---
 .../phone_locations/barnett/daily_features.R  |  3 +-
 src/features/utils/utils.R                    |  3 ++
 src/features/utils/utils.py                   | 26 +++++++------
 .../heatmap_feature_correlation_matrix.py     |  2 +-
 ..._yield_per_participant_per_time_segment.py |  2 +-
 ...atmap_sensor_row_count_per_time_segment.py | 38 ++++++++++---------
 ...map_sensors_per_minute_per_time_segment.py |  4 +-
 .../histogram_phone_data_yield.py             |  2 +-
 8 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/src/features/phone_locations/barnett/daily_features.R b/src/features/phone_locations/barnett/daily_features.R
index 4a9009c5..86e87718 100644
--- a/src/features/phone_locations/barnett/daily_features.R
+++ b/src/features/phone_locations/barnett/daily_features.R
@@ -26,9 +26,8 @@ barnett_daily_features <- function(snakemake){
   location <- location %>% 
     filter(accuracy < accuracy_limit) %>% 
     mutate(is_daily = str_detect(assigned_segments, paste0(".*#", datetime_start_regex, ",", datetime_end_regex, ".*")))
-    
   
-  if(nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){
+  if(nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){
     warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:",
             "\nLocation data rows within accuracy: ", nrow(location %>% filter(accuracy < accuracy_limit)),
             "\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)),
diff --git a/src/features/utils/utils.R b/src/features/utils/utils.R
index 02931503..9a3dabce 100644
--- a/src/features/utils/utils.R
+++ b/src/features/utils/utils.R
@@ -60,6 +60,9 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d
         source(provider[["SRC_SCRIPT"]])
         features_function <- match.fun(paste0(tolower(provider_key), "_features"))
         time_segments <- time_segments_labels %>% pull(label)
+        if(length(time_segments) == 0){
+          time_segments <- c("")
+        }
         for (time_segment in time_segments){
             print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, time_segment))
 
diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py
index 7183a82d..9288d1b4 100644
--- a/src/features/utils/utils.py
+++ b/src/features/utils/utils.py
@@ -99,19 +99,21 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
 
     if provider["COMPUTE"] == True:
 
-            feature_module = import_path(provider["SRC_SCRIPT"])
-            feature_function = getattr(feature_module,  provider_key.lower() + "_features")
-            
-            for time_segment in time_segments_labels["label"]:
-                    print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment))
-                    features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes)
-                    if not "local_segment" in features.columns:
-                        raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n  The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
-                    features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns]
-                    sensor_features = pd.concat([sensor_features, features], axis=0, sort=False)
+        feature_module = import_path(provider["SRC_SCRIPT"])
+        feature_function = getattr(feature_module,  provider_key.lower() + "_features")
+        
+        if time_segments_labels["label"].empty:
+            time_segments_labels["label"] = [""]
+        for time_segment in time_segments_labels["label"]:
+            print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment))
+            features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes)
+            if not "local_segment" in features.columns:
+                raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n  The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
+            features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns]
+            sensor_features = pd.concat([sensor_features, features], axis=0, sort=False)
     else:
-            for feature in provider["FEATURES"]:
-                    sensor_features[feature] = None
+        for feature in provider["FEATURES"]:
+            sensor_features[feature] = None
     segment_colums = pd.DataFrame()
     sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
     split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
diff --git a/src/visualization/heatmap_feature_correlation_matrix.py b/src/visualization/heatmap_feature_correlation_matrix.py
index 1a42d447..a3474409 100644
--- a/src/visualization/heatmap_feature_correlation_matrix.py
+++ b/src/visualization/heatmap_feature_correlation_matrix.py
@@ -26,7 +26,7 @@ features = pd.read_csv(snakemake.input["all_sensor_features"])
 
 
 if time_segments_type == "FREQUENCY":
-    features["local_segment_label"] = features["local_segment_label"].str.replace(r"[0-9]{4}", "")
+    features["local_segment_label"] = features["local_segment_label"].str[:-4]
 if time_segments_type == "EVENT":
     features["local_segment_label"] = "event"
 
diff --git a/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py b/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py
index 1e6f79fc..423b0051 100644
--- a/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py
+++ b/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py
@@ -58,7 +58,7 @@ time_segments = pd.read_csv(snakemake.input["time_segments_file"])["label"].uniq
 
 phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"]).sort_values(by=["pid", "local_segment_start_datetime"])
 if time_segments_type == "FREQUENCY":
-    phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str.replace(r"[0-9]{4}", "")
+    phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str[:-4]
 
 html_file = open(snakemake.output[0], "w", encoding="utf-8")
 if phone_data_yield.empty:
diff --git a/src/visualization/heatmap_sensor_row_count_per_time_segment.py b/src/visualization/heatmap_sensor_row_count_per_time_segment.py
index df89a5dc..cd34793f 100644
--- a/src/visualization/heatmap_sensor_row_count_per_time_segment.py
+++ b/src/visualization/heatmap_sensor_row_count_per_time_segment.py
@@ -11,6 +11,25 @@ mod = util.module_from_spec(spec)
 spec.loader.exec_module(mod)
 filter_data_by_segment = getattr(mod,  "filter_data_by_segment")
 
+def getRowCount(sensor_paths, sensor_names, time_segments_labels):
+    sensors_row_count = pd.DataFrame()
+    for sensor_path, sensor_name in zip(sensor_paths, sensor_names):
+        sensor_data = pd.read_csv(sensor_path, usecols=["assigned_segments"])
+
+        sensor_row_count = pd.DataFrame()
+        if not sensor_data.empty:
+            for time_segment in time_segments_labels:
+                sensor_data_per_segment = filter_data_by_segment(sensor_data, time_segment)
+
+                if not sensor_data_per_segment.empty:
+                    sensor_row_count = pd.concat([sensor_row_count, sensor_data_per_segment.groupby(["local_segment"])[["local_segment"]].count().rename(columns={"local_segment": sensor_name})], axis=0, sort=False)
+        sensors_row_count = pd.concat([sensors_row_count, sensor_row_count], axis=1, sort=False)
+    
+    sensors_row_count.index.name = "local_segment"
+    sensors_row_count.index = sensors_row_count.index.str.replace(r"_RR\d+SS#", "#")
+    
+    return sensors_row_count
+
 def getRowCountHeatmap(data_for_plot, pid, time_segment, html_file):
 
     fig = px.timeline(data_for_plot,
@@ -18,7 +37,7 @@ def getRowCountHeatmap(data_for_plot, pid, time_segment, html_file):
                         x_end="local_segment_end_datetime",
                         y="sensor",
                         color="scaled_value",
-                        color_continuous_scale="Peach", #"Viridis",
+                        color_continuous_scale="Peach",
                         opacity=0.7,
                         hover_data={"local_segment_start_datetime":False, "local_segment_end_datetime":False, "local_segment":True, "value":True, "scaled_value":True})
 
@@ -48,22 +67,7 @@ phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], index_col=["
 if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns):
     raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].")
 
-# extract row count
-sensors_row_count = pd.DataFrame()
-for sensor_path, sensor_name in zip(snakemake.input["all_sensors"], sensor_names):
-    sensor_data = pd.read_csv(sensor_path, usecols=["assigned_segments"])
-
-    sensor_row_count = pd.DataFrame()
-    if not sensor_data.empty:
-        for time_segment in time_segments_labels:
-            sensor_data_per_segment = filter_data_by_segment(sensor_data, time_segment)
-
-            if not sensor_data_per_segment.empty:
-                sensor_row_count = pd.concat([sensor_row_count, sensor_data_per_segment.groupby(["local_segment"])[["local_segment"]].count().rename(columns={"local_segment": sensor_name})], axis=0, sort=False)
-    sensors_row_count = pd.concat([sensors_row_count, sensor_row_count], axis=1, sort=False)
-
-sensors_row_count.index.name = "local_segment"
-sensors_row_count.index = sensors_row_count.index.str.replace(r"_RR\d+SS", "")
+sensors_row_count = getRowCount(snakemake.input["all_sensors"], sensor_names, time_segments_labels)
 data_for_plot = phone_data_yield.rename(columns={"phone_data_yield_rapids_ratiovalidyieldedminutes": "ratiovalidyieldedminutes","phone_data_yield_rapids_ratiovalidyieldedhours": "ratiovalidyieldedhours"}).merge(sensors_row_count, how="left", left_index=True, right_index=True).reset_index()
 
 
diff --git a/src/visualization/heatmap_sensors_per_minute_per_time_segment.py b/src/visualization/heatmap_sensors_per_minute_per_time_segment.py
index 7c69a756..bc975da2 100644
--- a/src/visualization/heatmap_sensors_per_minute_per_time_segment.py
+++ b/src/visualization/heatmap_sensors_per_minute_per_time_segment.py
@@ -79,10 +79,10 @@ label = participant_file["PHONE"]["LABEL"]
 phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], parse_dates=["local_date_time"])
 if time_segments_type == "FREQUENCY":
     phone_data_yield["assigned_segments"] = phone_data_yield["assigned_segments"].str.replace(r"[0-9]{4}#", "#")
-    time_segments_labels["label"] = time_segments_labels["label"].str.replace(r"[0-9]{4}", "")
+    time_segments_labels["label"] = time_segments_labels["label"].str[:-4]
 if time_segments_type == "PERIODIC":
     phone_data_yield["assigned_segments"] = phone_data_yield["assigned_segments"].str.replace(r"_RR\d+SS#", "#")
-    time_segments_labels["label"] = time_segments_labels["label"].str.replace(r"_RR\d+SS", "")
+    time_segments_labels["label"] = time_segments_labels["label"].str.replace(r"_RR\d+SS$", "")
 
 html_file = open(snakemake.output[0], "a", encoding="utf-8")
 if phone_data_yield.empty:
diff --git a/src/visualization/histogram_phone_data_yield.py b/src/visualization/histogram_phone_data_yield.py
index 998d4fd9..34e24795 100644
--- a/src/visualization/histogram_phone_data_yield.py
+++ b/src/visualization/histogram_phone_data_yield.py
@@ -6,7 +6,7 @@ time_segments_type = snakemake.params["time_segments_type"]
 phone_data_yield = pd.read_csv(snakemake.input[0])
 
 if time_segments_type == "FREQUENCY":
-    phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str.replace(r"[0-9]{4}", "")
+    phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str[:-4]
 if time_segments_type == "EVENT":
     phone_data_yield["local_segment_label"] = "event"