From 1c57320ab3e8f79cb240dc5c08ec21be8e6082f8 Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Fri, 25 Jun 2021 19:06:46 -0400 Subject: [PATCH] Update segment labels and fix the bug when we do not have any labels for event segments --- .../phone_locations/barnett/daily_features.R | 3 +- src/features/utils/utils.R | 3 ++ src/features/utils/utils.py | 26 +++++++------ .../heatmap_feature_correlation_matrix.py | 2 +- ..._yield_per_participant_per_time_segment.py | 2 +- ...atmap_sensor_row_count_per_time_segment.py | 38 ++++++++++--------- ...map_sensors_per_minute_per_time_segment.py | 4 +- .../histogram_phone_data_yield.py | 2 +- 8 files changed, 44 insertions(+), 36 deletions(-) diff --git a/src/features/phone_locations/barnett/daily_features.R b/src/features/phone_locations/barnett/daily_features.R index 4a9009c5..86e87718 100644 --- a/src/features/phone_locations/barnett/daily_features.R +++ b/src/features/phone_locations/barnett/daily_features.R @@ -26,9 +26,8 @@ barnett_daily_features <- function(snakemake){ location <- location %>% filter(accuracy < accuracy_limit) %>% mutate(is_daily = str_detect(assigned_segments, paste0(".*#", datetime_start_regex, ",", datetime_end_regex, ".*"))) - - if(nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){ + if(nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){ warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:", "\nLocation data rows within accuracy: ", nrow(location %>% filter(accuracy < accuracy_limit)), "\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)), diff --git a/src/features/utils/utils.R b/src/features/utils/utils.R index 02931503..9a3dabce 100644 --- a/src/features/utils/utils.R +++ b/src/features/utils/utils.R @@ -60,6 +60,9 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d source(provider[["SRC_SCRIPT"]]) features_function <- match.fun(paste0(tolower(provider_key), "_features")) time_segments <- time_segments_labels %>% pull(label) + if(length(time_segments) == 0){ + time_segments <- c("") + } for (time_segment in time_segments){ print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, time_segment)) diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py index 7183a82d..9288d1b4 100644 --- a/src/features/utils/utils.py +++ b/src/features/utils/utils.py @@ -99,19 +99,21 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file if provider["COMPUTE"] == True: - feature_module = import_path(provider["SRC_SCRIPT"]) - feature_function = getattr(feature_module, provider_key.lower() + "_features") - - for time_segment in time_segments_labels["label"]: - print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment)) - features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes) - if not "local_segment" in features.columns: - raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)") - features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns] - sensor_features = pd.concat([sensor_features, features], axis=0, sort=False) + feature_module = import_path(provider["SRC_SCRIPT"]) + feature_function = getattr(feature_module, provider_key.lower() + "_features") + + if time_segments_labels["label"].empty: + time_segments_labels["label"] = [""] + for time_segment in time_segments_labels["label"]: + print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment)) + features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes) + if not "local_segment" in features.columns: + raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)") + features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns] + sensor_features = pd.concat([sensor_features, features], axis=0, sort=False) else: - for feature in provider["FEATURES"]: - sensor_features[feature] = None + for feature in provider["FEATURES"]: + sensor_features[feature] = None segment_colums = pd.DataFrame() sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '') split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True) diff --git a/src/visualization/heatmap_feature_correlation_matrix.py b/src/visualization/heatmap_feature_correlation_matrix.py index 1a42d447..a3474409 100644 --- a/src/visualization/heatmap_feature_correlation_matrix.py +++ b/src/visualization/heatmap_feature_correlation_matrix.py @@ -26,7 +26,7 @@ features = pd.read_csv(snakemake.input["all_sensor_features"]) if time_segments_type == "FREQUENCY": - features["local_segment_label"] = features["local_segment_label"].str.replace(r"[0-9]{4}", "") + features["local_segment_label"] = features["local_segment_label"].str[:-4] if time_segments_type == "EVENT": features["local_segment_label"] = "event" diff --git a/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py b/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py index 1e6f79fc..423b0051 100644 --- a/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py +++ b/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py @@ -58,7 +58,7 @@ time_segments = pd.read_csv(snakemake.input["time_segments_file"])["label"].uniq phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"]).sort_values(by=["pid", "local_segment_start_datetime"]) if time_segments_type == "FREQUENCY": - phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str.replace(r"[0-9]{4}", "") + phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str[:-4] html_file = open(snakemake.output[0], "w", encoding="utf-8") if phone_data_yield.empty: diff --git a/src/visualization/heatmap_sensor_row_count_per_time_segment.py b/src/visualization/heatmap_sensor_row_count_per_time_segment.py index df89a5dc..cd34793f 100644 --- a/src/visualization/heatmap_sensor_row_count_per_time_segment.py +++ b/src/visualization/heatmap_sensor_row_count_per_time_segment.py @@ -11,6 +11,25 @@ mod = util.module_from_spec(spec) spec.loader.exec_module(mod) filter_data_by_segment = getattr(mod, "filter_data_by_segment") +def getRowCount(sensor_paths, sensor_names, time_segments_labels): + sensors_row_count = pd.DataFrame() + for sensor_path, sensor_name in zip(sensor_paths, sensor_names): + sensor_data = pd.read_csv(sensor_path, usecols=["assigned_segments"]) + + sensor_row_count = pd.DataFrame() + if not sensor_data.empty: + for time_segment in time_segments_labels: + sensor_data_per_segment = filter_data_by_segment(sensor_data, time_segment) + + if not sensor_data_per_segment.empty: + sensor_row_count = pd.concat([sensor_row_count, sensor_data_per_segment.groupby(["local_segment"])[["local_segment"]].count().rename(columns={"local_segment": sensor_name})], axis=0, sort=False) + sensors_row_count = pd.concat([sensors_row_count, sensor_row_count], axis=1, sort=False) + + sensors_row_count.index.name = "local_segment" + sensors_row_count.index = sensors_row_count.index.str.replace(r"_RR\d+SS#", "#") + + return sensors_row_count + def getRowCountHeatmap(data_for_plot, pid, time_segment, html_file): fig = px.timeline(data_for_plot, @@ -18,7 +37,7 @@ def getRowCountHeatmap(data_for_plot, pid, time_segment, html_file): x_end="local_segment_end_datetime", y="sensor", color="scaled_value", - color_continuous_scale="Peach", #"Viridis", + color_continuous_scale="Peach", opacity=0.7, hover_data={"local_segment_start_datetime":False, "local_segment_end_datetime":False, "local_segment":True, "value":True, "scaled_value":True}) @@ -48,22 +67,7 @@ phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], index_col=[" if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns): raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].") -# extract row count -sensors_row_count = pd.DataFrame() -for sensor_path, sensor_name in zip(snakemake.input["all_sensors"], sensor_names): - sensor_data = pd.read_csv(sensor_path, usecols=["assigned_segments"]) - - sensor_row_count = pd.DataFrame() - if not sensor_data.empty: - for time_segment in time_segments_labels: - sensor_data_per_segment = filter_data_by_segment(sensor_data, time_segment) - - if not sensor_data_per_segment.empty: - sensor_row_count = pd.concat([sensor_row_count, sensor_data_per_segment.groupby(["local_segment"])[["local_segment"]].count().rename(columns={"local_segment": sensor_name})], axis=0, sort=False) - sensors_row_count = pd.concat([sensors_row_count, sensor_row_count], axis=1, sort=False) - -sensors_row_count.index.name = "local_segment" -sensors_row_count.index = sensors_row_count.index.str.replace(r"_RR\d+SS", "") +sensors_row_count = getRowCount(snakemake.input["all_sensors"], sensor_names, time_segments_labels) data_for_plot = phone_data_yield.rename(columns={"phone_data_yield_rapids_ratiovalidyieldedminutes": "ratiovalidyieldedminutes","phone_data_yield_rapids_ratiovalidyieldedhours": "ratiovalidyieldedhours"}).merge(sensors_row_count, how="left", left_index=True, right_index=True).reset_index() diff --git a/src/visualization/heatmap_sensors_per_minute_per_time_segment.py b/src/visualization/heatmap_sensors_per_minute_per_time_segment.py index 7c69a756..bc975da2 100644 --- a/src/visualization/heatmap_sensors_per_minute_per_time_segment.py +++ b/src/visualization/heatmap_sensors_per_minute_per_time_segment.py @@ -79,10 +79,10 @@ label = participant_file["PHONE"]["LABEL"] phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], parse_dates=["local_date_time"]) if time_segments_type == "FREQUENCY": phone_data_yield["assigned_segments"] = phone_data_yield["assigned_segments"].str.replace(r"[0-9]{4}#", "#") - time_segments_labels["label"] = time_segments_labels["label"].str.replace(r"[0-9]{4}", "") + time_segments_labels["label"] = time_segments_labels["label"].str[:-4] if time_segments_type == "PERIODIC": phone_data_yield["assigned_segments"] = phone_data_yield["assigned_segments"].str.replace(r"_RR\d+SS#", "#") - time_segments_labels["label"] = time_segments_labels["label"].str.replace(r"_RR\d+SS", "") + time_segments_labels["label"] = time_segments_labels["label"].str.replace(r"_RR\d+SS$", "") html_file = open(snakemake.output[0], "a", encoding="utf-8") if phone_data_yield.empty: diff --git a/src/visualization/histogram_phone_data_yield.py b/src/visualization/histogram_phone_data_yield.py index 998d4fd9..34e24795 100644 --- a/src/visualization/histogram_phone_data_yield.py +++ b/src/visualization/histogram_phone_data_yield.py @@ -6,7 +6,7 @@ time_segments_type = snakemake.params["time_segments_type"] phone_data_yield = pd.read_csv(snakemake.input[0]) if time_segments_type == "FREQUENCY": - phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str.replace(r"[0-9]{4}", "") + phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str[:-4] if time_segments_type == "EVENT": phone_data_yield["local_segment_label"] = "event"