diff --git a/Snakefile b/Snakefile index a73f412e..cc1ed4ba 100644 --- a/Snakefile +++ b/Snakefile @@ -381,6 +381,8 @@ if config["HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT"]["PLOT"]: files_to_compute.append("reports/data_exploration/heatmap_sensor_row_count_per_time_segment.html") if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]: + if not config["PHONE_DATA_YIELD"]["PROVIDERS"]["RAPIDS"]["COMPUTE"]: + raise ValueError("Error: [PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] must be True in config.yaml to get heatmaps of overall data yield.") files_to_compute.append("reports/data_exploration/heatmap_phone_data_yield_per_participant_per_time_segment.html") if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]: diff --git a/docs/img/hm-data-yield-participants-absolute-time.html b/docs/img/hm-data-yield-participants-absolute-time.html index e902c07c..527b5e69 100644 --- a/docs/img/hm-data-yield-participants-absolute-time.html +++ b/docs/img/hm-data-yield-participants-absolute-time.html @@ -1,11 +1,11 @@
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file +
+
+
+
+
+
+
+
+
+
\ No newline at end of file diff --git a/docs/img/hm-data-yield-participants-absolute-time.png b/docs/img/hm-data-yield-participants-absolute-time.png index 4129a8ee..fc122a5d 100644 Binary files a/docs/img/hm-data-yield-participants-absolute-time.png and b/docs/img/hm-data-yield-participants-absolute-time.png differ diff --git a/docs/img/hm-data-yield-participants-relative-time.html b/docs/img/hm-data-yield-participants-relative-time.html index 7c7366f3..712cbee2 100644 --- a/docs/img/hm-data-yield-participants-relative-time.html +++ b/docs/img/hm-data-yield-participants-relative-time.html @@ -1,11 +1,11 @@
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file +
+
+
+
+
+
+
+
+
+
\ No newline at end of file diff --git a/docs/img/hm-data-yield-participants-relative-time.png b/docs/img/hm-data-yield-participants-relative-time.png index 20f8caa3..640653b8 100644 Binary files a/docs/img/hm-data-yield-participants-relative-time.png and b/docs/img/hm-data-yield-participants-relative-time.png differ diff --git a/docs/img/hm-feature-correlations.html b/docs/img/hm-feature-correlations.html index 99e596fb..1d7302f5 100644 --- a/docs/img/hm-feature-correlations.html +++ b/docs/img/hm-feature-correlations.html @@ -1,96 +1,6 @@ -
- - - -
- -
- - - -
- -
- - - -
- -
- - - -
- -
- - - -
- -
\ No newline at end of file +
+
+
+
+
+
\ No newline at end of file diff --git a/docs/img/hm-feature-correlations.png b/docs/img/hm-feature-correlations.png index f6e60e46..befa34c3 100644 Binary files a/docs/img/hm-feature-correlations.png and b/docs/img/hm-feature-correlations.png differ diff --git a/docs/img/hm-sensor-rows.html b/docs/img/hm-sensor-rows.html index 5e90eaf4..b8d26fd8 100644 --- a/docs/img/hm-sensor-rows.html +++ b/docs/img/hm-sensor-rows.html @@ -377,7 +377,7 @@ summary {

Sensor Row Count per Time Segment For All Participants

RAPIDS

-

23 March, 2021

+

28 June, 2021

@@ -386,17 +386,17 @@ summary { .main-container {min-width:800px; max-width:100%;}
-
-
-
-
-
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
diff --git a/docs/img/hm-sensor-rows.png b/docs/img/hm-sensor-rows.png index c8c6cbcd..68e80243 100644 Binary files a/docs/img/hm-sensor-rows.png and b/docs/img/hm-sensor-rows.png differ diff --git a/docs/visualizations/data-quality-visualizations.md b/docs/visualizations/data-quality-visualizations.md index e1beae3d..e748e90e 100644 --- a/docs/visualizations/data-quality-visualizations.md +++ b/docs/visualizations/data-quality-visualizations.md @@ -20,7 +20,7 @@ These plots can be used as a rough indication of the smartphone monitoring cover ## 2. Heatmaps of overall data yield These heatmaps are a break down per time segment and per participant of [Visualization 1](#1-histograms-of-phone-data-yield). Heatmap's rows represent participants, columns represent time segment instances and the cells’ color represent the valid yielded minute or hour ratio for a participant during a time segment instance. -As different participants might join a study on different dates and time segments can be of any length and start on any day, the x-axis can be labelled with the absolute time of the start of each time segment instance or the time delta between the start of each time segment instance minus the start of the first instance. These plots provide a quick study overview of the monitoring coverage per person and per time segment. +As different participants might join a study on different dates and time segments can be of any length and start on any day, the x-axis can be labelled with the absolute time of each time segment instance or the time delta between each time segment instance and the start of the first instance for each participant. These plots provide a quick study overview of the monitoring coverage per person and per time segment. 
The figure below shows the heatmap of the valid yielded minute ratio for participants example01 and example02 on daily segments and, as we inferred from the previous histogram, the lighter (yellow) color on most time segment instances (cells) indicate both phones sensed data without interruptions for most days (except for the first and last ones). @@ -63,7 +63,7 @@ The figure below shows this heatmap for phone sensors collected by participant e ## 4. Heatmap of sensor row count These heatmaps are a per-sensor breakdown of [Visualization 1](#1-histograms-of-phone-data-yield) and [Visualization 2](#2-heatmaps-of-overall-data-yield). Note that the second row (ratio of valid yielded minutes) of this heatmap matches the respective participant (bottom) row the screenshot in Visualization 2. -In these heatmaps rows represent phone or Fitbit sensors, columns represent time segment instances and cell’s color shows the normalized (0 to 1) row count of each sensor within a time segment instance. RAPIDS creates one heatmap per participant and they can be used to judge missing data on a per participant and per sensor basis. +In these heatmaps rows represent phone or Fitbit sensors, columns represent time segment instances and cell’s color shows the normalized (0 to 1) row count of each sensor within a time segment instance. A grey cell represents missing data in that time segment instance. RAPIDS creates one heatmap per participant and they can be used to judge missing data on a per participant and per sensor basis. The figure below shows data for 14 phone sensors (including data yield) of example01’s daily segments. From the top two rows, we can see that the phone was sensing data for most of the monitoring period (as suggested by Figure 3 and Figure 4). 
We can also infer how phone usage influenced the different sensor streams; there are peaks of screen events during the first day (Apr 23rd), peaks of location coordinates on Apr 26th and Apr 30th, and no sent or received SMS except for Apr 23rd, Apr 29th and Apr 30th (unlabeled row between screen and locations). diff --git a/docs/workflow-examples/analysis.md b/docs/workflow-examples/analysis.md index e11ae3e7..c5288e97 100644 --- a/docs/workflow-examples/analysis.md +++ b/docs/workflow-examples/analysis.md @@ -69,7 +69,7 @@ Note you will see a lot of warning messages, you can ignore them since they happ ??? info "6. Feature cleaning." In this stage we perform four steps to clean our sensor feature file. First, we discard days with a data yield hour ratio less than or equal to 0.75, i.e. we include days with at least 18 hours of data. Second, we drop columns (features) with more than 30% of missing rows. Third, we drop columns with zero variance. Fourth, we drop rows (days) with more than 30% of missing columns (features). In this cleaning stage several parameters are created and exposed in `example_profile/example_config.yaml`. - After this step, we kept 158 features over 11 days for the individual model of p01, 101 features over 12 days for the individual model of p02 and 106 features over 20 days for the population model. Note that the difference in the number of features between p01 and p02 is mostly due to iOS restrictions that stops researchers from collecting the same number of sensors than in Android phones. + After this step, we kept 161 features over 11 days for the individual model of p01, 101 features over 12 days for the individual model of p02 and 109 features over 20 days for the population model. Note that the difference in the number of features between p01 and p02 is mostly due to iOS restrictions that stop researchers from collecting the same number of sensors as in Android phones. 
Feature cleaning for the individual models is done in the `clean_sensor_features_for_individual_participants` rule and for the population model in the `clean_sensor_features_for_all_participants` rule in `rules/models.smk`. diff --git a/example_profile/Snakefile b/example_profile/Snakefile index f969fdcb..65cea721 100644 --- a/example_profile/Snakefile +++ b/example_profile/Snakefile @@ -204,15 +204,28 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): else: raise ValueError("Error: Add PHONE_LOCATIONS (and as many PHONE_SENSORS as you have) to [PHONE_DATA_YIELD][SENSORS] in config.yaml. This is necessary to compute phone_yielded_timestamps (time when the smartphone was sensing data) which is used to resample fused location data (ALL_RESAMPLED and RESAMPLED_FUSED)") + if provider == "BARNETT": + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_barnett_daily.csv", pid=config["PIDS"])) + if provider == "DORYAB": + files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_doryab_columns.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_locations_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime.csv", pid=config["PIDS"])) - files_to_compute.extend(expand("data/interim/{pid}/phone_locations_processed_with_datetime_with_home.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower())) files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"])) 
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") +for provider in config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"].keys(): + if config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_intraday_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_calories_intraday_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/fitbit_calories_intraday_features/fitbit_calories_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_calories_intraday.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys(): if config["FITBIT_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"])) @@ -271,6 +284,12 @@ for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys(): for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys(): if config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]: + + if config["FITBIT_STEPS_INTRADAY"]["EXCLUDE_SLEEP"]["TIME_BASED"]["EXCLUDE"] or config["FITBIT_STEPS_INTRADAY"]["EXCLUDE_SLEEP"]["FITBIT_BASED"]["EXCLUDE"]: + if config["FITBIT_STEPS_INTRADAY"]["EXCLUDE_SLEEP"]["FITBIT_BASED"]["EXCLUDE"]: + files_to_compute.extend(expand("data/raw/{pid}/fitbit_sleep_summary_raw.csv", pid=config["PIDS"])) + 
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_with_datetime_exclude_sleep.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_raw.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_with_datetime.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=get_script_language(config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower())) @@ -357,6 +376,8 @@ if config["HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT"]["PLOT"]: files_to_compute.append("reports/data_exploration/heatmap_sensor_row_count_per_time_segment.html") if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]: + if not config["PHONE_DATA_YIELD"]["PROVIDERS"]["RAPIDS"]["COMPUTE"]: + raise ValueError("Error: [PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] must be True in config.yaml to get heatmaps of overall data yield.") files_to_compute.append("reports/data_exploration/heatmap_phone_data_yield_per_participant_per_time_segment.html") if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]: diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index be3deb29..c2f269c7 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -198,7 +198,11 @@ PHONE_DATA_YIELD: # See https://www.rapids.science/latest/features/phone-keyboard/ PHONE_KEYBOARD: CONTAINER: keyboard - PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: 
["sessioncount","averageinterkeydelay","averagesessionlength","changeintextlengthlessthanminusone","changeintextlengthequaltominusone","changeintextlengthequaltoone","changeintextlengthmorethanone","maxtextlength","lastmessagelength","totalkeyboardtouches"] + SRC_SCRIPT: src/features/phone_keyboard/rapids/main.py # See https://www.rapids.science/latest/features/phone-light/ PHONE_LIGHT: diff --git a/rules/reports.smk b/rules/reports.smk index 3a4fa6bd..bd1b448c 100644 --- a/rules/reports.smk +++ b/rules/reports.smk @@ -1,6 +1,8 @@ rule histogram_phone_data_yield: input: "data/processed/features/all_participants/all_sensor_features.csv" + params: + time_segments_type = config["TIME_SEGMENTS"]["TYPE"] output: "reports/data_exploration/histogram_phone_data_yield.html" script: @@ -12,7 +14,8 @@ rule heatmap_sensors_per_minute_per_time_segment: participant_file = "data/external/participant_files/{pid}.yaml", time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: - pid = "{pid}" + pid = "{pid}", + time_segments_type = config["TIME_SEGMENTS"]["TYPE"] output: "reports/interim/{pid}/heatmap_sensors_per_minute_per_time_segment.html" script: @@ -33,7 +36,9 @@ rule heatmap_sensor_row_count_per_time_segment: participant_file = "data/external/participant_files/{pid}.yaml", time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: - pid = "{pid}" + pid = "{pid}", + sensor_names = config["HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT"]["SENSORS"], + time_segments_type = config["TIME_SEGMENTS"]["TYPE"] output: "reports/interim/{pid}/heatmap_sensor_row_count_per_time_segment.html" script: @@ -49,11 +54,13 @@ rule merge_heatmap_sensor_row_count_per_time_segment: rule heatmap_phone_data_yield_per_participant_per_time_segment: input: - phone_data_yield = expand("data/processed/features/{pid}/phone_data_yield.csv", pid=config["PIDS"]), - participant_file = expand("data/external/participant_files/{pid}.yaml", 
pid=config["PIDS"]), - time_segments_labels = expand("data/interim/time_segments/{pid}_time_segments_labels.csv", pid=config["PIDS"]) + participant_files = expand("data/external/participant_files/{pid}.yaml", pid=config["PIDS"]), + time_segments_file = config["TIME_SEGMENTS"]["FILE"], + phone_data_yield = "data/processed/features/all_participants/all_sensor_features.csv", params: - time = config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["TIME"] + pids = config["PIDS"], + time = config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["TIME"], + time_segments_type = config["TIME_SEGMENTS"]["TYPE"] output: "reports/data_exploration/heatmap_phone_data_yield_per_participant_per_time_segment.html" script: @@ -63,6 +70,7 @@ rule heatmap_feature_correlation_matrix: input: all_sensor_features = "data/processed/features/all_participants/all_sensor_features.csv" # before data cleaning params: + time_segments_type = config["TIME_SEGMENTS"]["TYPE"], min_rows_ratio = config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["MIN_ROWS_RATIO"], corr_threshold = config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["CORR_THRESHOLD"], corr_method = config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["CORR_METHOD"] diff --git a/src/features/phone_locations/barnett/daily_features.R b/src/features/phone_locations/barnett/daily_features.R index 4a9009c5..86e87718 100644 --- a/src/features/phone_locations/barnett/daily_features.R +++ b/src/features/phone_locations/barnett/daily_features.R @@ -26,9 +26,8 @@ barnett_daily_features <- function(snakemake){ location <- location %>% filter(accuracy < accuracy_limit) %>% mutate(is_daily = str_detect(assigned_segments, paste0(".*#", datetime_start_regex, ",", datetime_end_regex, ".*"))) - - if(nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - min(location$timestamp) < 86400000)){ + if(nrow(segment_labels) == 0 || nrow(location) == 0 || all(location$is_daily == FALSE) || (max(location$timestamp) - 
min(location$timestamp) < 86400000)){ warning("Barnett's location features cannot be computed for data or time segments that do not span one or more entire days (00:00:00 to 23:59:59). Values below point to the problem:", "\nLocation data rows within accuracy: ", nrow(location %>% filter(accuracy < accuracy_limit)), "\nLocation data rows within a daily time segment: ", nrow(filter(location, is_daily)), diff --git a/src/features/utils/utils.R b/src/features/utils/utils.R index 02931503..9a3dabce 100644 --- a/src/features/utils/utils.R +++ b/src/features/utils/utils.R @@ -60,6 +60,9 @@ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_d source(provider[["SRC_SCRIPT"]]) features_function <- match.fun(paste0(tolower(provider_key), "_features")) time_segments <- time_segments_labels %>% pull(label) + if(length(time_segments) == 0){ + time_segments <- c("") + } for (time_segment in time_segments){ print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, time_segment)) diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py index 7183a82d..9288d1b4 100644 --- a/src/features/utils/utils.py +++ b/src/features/utils/utils.py @@ -99,19 +99,21 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file if provider["COMPUTE"] == True: - feature_module = import_path(provider["SRC_SCRIPT"]) - feature_function = getattr(feature_module, provider_key.lower() + "_features") - - for time_segment in time_segments_labels["label"]: - print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment)) - features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes) - if not "local_segment" in features.columns: - raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. 
Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)") - features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns] - sensor_features = pd.concat([sensor_features, features], axis=0, sort=False) + feature_module = import_path(provider["SRC_SCRIPT"]) + feature_function = getattr(feature_module, provider_key.lower() + "_features") + + if time_segments_labels["label"].empty: + time_segments_labels["label"] = [""] + for time_segment in time_segments_labels["label"]: + print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, time_segment)) + features = feature_function(sensor_data_files, time_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes) + if not "local_segment" in features.columns: + raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + provider["SRC_SCRIPT"] + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different time segment instance (e.g. 
2020-01-01, 2020-01-02, 2020-01-03, etc.)") + features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns] + sensor_features = pd.concat([sensor_features, features], axis=0, sort=False) else: - for feature in provider["FEATURES"]: - sensor_features[feature] = None + for feature in provider["FEATURES"]: + sensor_features[feature] = None segment_colums = pd.DataFrame() sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '') split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True) diff --git a/src/visualization/heatmap_feature_correlation_matrix.py b/src/visualization/heatmap_feature_correlation_matrix.py index 8fb01934..2a75a513 100644 --- a/src/visualization/heatmap_feature_correlation_matrix.py +++ b/src/visualization/heatmap_feature_correlation_matrix.py @@ -10,19 +10,27 @@ def getCorrMatrixHeatmap(corr_matrix, time_segment, html_file): fig = go.Figure(data=go.Heatmap(z=corr_matrix.values.tolist(), x=feature_names, y=feature_names, - colorscale="Viridis")) + colorscale="Viridis", + zmin=-1, zmax=1)) fig.update_layout(title="Correlation matrix of features of " + time_segment + " segments.") html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) - +time_segments_type = snakemake.params["time_segments_type"] min_rows_ratio = snakemake.params["min_rows_ratio"] corr_threshold = snakemake.params["corr_threshold"] corr_method = snakemake.params["corr_method"] features = pd.read_csv(snakemake.input["all_sensor_features"]) + + +if time_segments_type == "FREQUENCY": + features["local_segment_label"] = features["local_segment_label"].str[:-4] +if time_segments_type == "EVENT": + features["local_segment_label"] = "event" + time_segments = set(features["local_segment_label"]) html_file = open(snakemake.output[0], "a", encoding="utf-8") diff --git 
a/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py b/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py index 8f639a62..d3466e24 100644 --- a/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py +++ b/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py @@ -1,93 +1,87 @@ import pandas as pd -import numpy as np -import plotly.graph_objects as go +import plotly.express as px import yaml +def getPidAndLabel(participant_file_paths, pids): + pid2label, y_axis_labels = {}, [] + for participant_file_path, pid in zip(participant_file_paths, pids): + with open(participant_file_path, "r", encoding="utf-8") as f: + participant_file = yaml.safe_load(f) + label = str(participant_file["PHONE"]["LABEL"]) -def getPhoneDataYieldHeatmap(data_for_plot, y_axis_labels, time_segment, type, time, html_file): + pid2label[pid] = label + y_axis_labels.append(pid + "." + label) + return pid2label, y_axis_labels - fig = go.Figure(data=go.Heatmap(z=data_for_plot.values.tolist(), - x=data_for_plot.columns.tolist(), - y=y_axis_labels, - hovertext=data_for_plot.values.tolist(), - hovertemplate="Time since first segment: %{x}
Participant: %{y}
Ratiovalidyielded" + type + ": %{z}" if time == "RELATIVE_TIME" else "Time: %{x}
Participant: %{y}
Ratiovalidyielded" + type + ": %{z}", - zmin=0, zmax=1, - colorscale="Viridis")) +def getPhoneDataYieldHeatmap(phone_data_yield, time, time_segment, html_file): if time == "RELATIVE_TIME": - fig.update_layout(title="Heatmap of valid yielded " + type + " ratio for " + time_segment + " segments.
y-axis shows participant information (format: pid.label).
x-axis shows the time since their first segment.
z-axis (color) shows valid yielded " + type + " ratio during a segment instance.") - else: - fig.update_layout(title="Heatmap of valid yielded " + type + " ratio for " + time_segment + " segments.
y-axis shows participant information (format: pid.label).
x-axis shows the time.
z-axis (color) shows valid yielded " + type + " ratio during a segment instance.") + # Number of minutes after the first start date time of local segments + phone_data_yield["local_segment_end_datetime"] = (phone_data_yield["local_segment_end_datetime"] - phone_data_yield["local_segment_start_datetime"].min()) + pd.Timestamp(2000,1,1) + phone_data_yield["local_segment_start_datetime"] = (phone_data_yield["local_segment_start_datetime"] - phone_data_yield["local_segment_start_datetime"].min()) + pd.Timestamp(2000,1,1) - fig["layout"]["xaxis"].update(side="bottom") - fig["layout"].update(xaxis_title="Time Since First Segment" if time == "RELATIVE_TIME" else "Time") - fig["layout"].update(margin=dict(t=160)) - - html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + for type in ["minutes", "hours"]: + + column_name = "phone_data_yield_rapids_ratiovalidyielded" + type + + fig = px.timeline(phone_data_yield, + x_start="local_segment_start_datetime", + x_end="local_segment_end_datetime", + y="y_axis_label", + color=column_name, + color_continuous_scale="Viridis", + range_color=[0, 1], + opacity=0.7, + hover_data={'local_segment_start_datetime':False, 'local_segment_end_datetime':False, 'local_segment':True}) + + fig.update_layout(title="Heatmap of valid yielded " + type + " ratio for " + time_segment + " segments and " + time.lower().replace("_", " ") + ".
y-axis shows participant information (format: pid.label).
x-axis shows the time" + (" since their first segment" if time == "RELATIVE_TIME" else "") + ".
z-axis (color) shows valid yielded " + type + " ratio during a segment instance.", + xaxis=dict(side="bottom", title="Time Since First Segment" if time == "RELATIVE_TIME" else "Time"), + yaxis=dict(side="left", title="Participant information"), + margin=dict(t=160)) + + if time == "RELATIVE_TIME": + fig.update_layout(xaxis_tickformat="%y years %j days
%X") + + html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + + return html_file +pid2label, y_axis_labels = getPidAndLabel(snakemake.input["participant_files"], snakemake.params["pids"]) +time_segments_type = snakemake.params["time_segments_type"] # FREQUENCY or PERIODIC or EVENT +time = snakemake.params["time"] # ABSOLUTE_TIME or RELATIVE_TIME +time_segments = pd.read_csv(snakemake.input["time_segments_file"])["label"].unique() +phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"]).sort_values(by=["pid", "local_segment_start_datetime"]) +if time_segments_type == "FREQUENCY": + phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str[:-4] -time = snakemake.params["time"] -y_axis_labels, phone_data_yield_minutes, phone_data_yield_hours = [], {}, {} -for phone_data_yield_path, participant_file_path, time_segments_path in zip(snakemake.input["phone_data_yield"], snakemake.input["participant_file"], snakemake.input["time_segments_labels"]): - - # set pid.label as y_axis_label - pid = phone_data_yield_path.split("/")[3] - time_segments = pd.read_csv(time_segments_path, header=0)["label"] - - with open(participant_file_path, "r", encoding="utf-8") as f: - participant_file = yaml.safe_load(f) - label = participant_file["PHONE"]["LABEL"] - - y_axis_label = pid + "." 
+ label - y_axis_labels.append(y_axis_label) - - - phone_data_yield = pd.read_csv(phone_data_yield_path, index_col=["local_segment_start_datetime"], parse_dates=["local_segment_start_datetime"]) - # make sure the phone_data_yield file contains "phone_data_yield_rapids_ratiovalidyieldedminutes" and "phone_data_yield_rapids_ratiovalidyieldedhours" columns +html_file = open(snakemake.output[0], "w", encoding="utf-8") +if phone_data_yield.empty: + html_file.write("There is no sensor data for the sensors in [PHONE_DATA_YIELD][SENSORS].") +else: + # Make sure the phone_data_yield file contains both "phone_data_yield_rapids_ratiovalidyieldedminutes" and "phone_data_yield_rapids_ratiovalidyieldedhours" columns if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns): raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].") - if not phone_data_yield.empty: + phone_data_yield.loc[:, ["phone_data_yield_rapids_ratiovalidyieldedminutes", "phone_data_yield_rapids_ratiovalidyieldedhours"]] = phone_data_yield.loc[:, ["phone_data_yield_rapids_ratiovalidyieldedminutes", "phone_data_yield_rapids_ratiovalidyieldedhours"]].round(3).clip(upper=1) + phone_data_yield["y_axis_label"] = phone_data_yield["pid"].apply(lambda pid: pid + "." 
+ str(pid2label[pid])) + if time_segments_type == "EVENT": + html_file = getPhoneDataYieldHeatmap(phone_data_yield, time, "event", html_file) + else: # FREQUENCY or PERIODIC for time_segment in time_segments: - phone_data_yield_per_segment = phone_data_yield[phone_data_yield["local_segment_label"] == time_segment] + + phone_data_yield_per_segment = phone_data_yield[phone_data_yield["local_segment_label"] == time_segment].copy() if not phone_data_yield_per_segment.empty: - if time == "RELATIVE_TIME": - # set number of minutes after the first start date time of local segments as x_axis_label - phone_data_yield_per_segment.index = phone_data_yield_per_segment.index - phone_data_yield_per_segment.index.min() - elif time == "ABSOLUTE_TIME": - pass - else: - raise ValueError("[HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT][TIME] can only be RELATIVE_TIME or ABSOLUTE_TIME") + html_file = getPhoneDataYieldHeatmap(phone_data_yield_per_segment, time, time_segment, html_file) - phone_data_yield_minutes_per_segment = phone_data_yield_per_segment[["phone_data_yield_rapids_ratiovalidyieldedminutes"]].rename(columns={"phone_data_yield_rapids_ratiovalidyieldedminutes": y_axis_label}) - phone_data_yield_hours_per_segment = phone_data_yield_per_segment[["phone_data_yield_rapids_ratiovalidyieldedhours"]].rename(columns={"phone_data_yield_rapids_ratiovalidyieldedhours": y_axis_label}) - - if time_segment not in phone_data_yield_minutes.keys(): - phone_data_yield_minutes[time_segment] = phone_data_yield_minutes_per_segment - phone_data_yield_hours[time_segment] = phone_data_yield_hours_per_segment - else: - phone_data_yield_minutes[time_segment] = pd.concat([phone_data_yield_minutes[time_segment], phone_data_yield_minutes_per_segment], axis=1, sort=True) - phone_data_yield_hours[time_segment] = pd.concat([phone_data_yield_hours[time_segment], phone_data_yield_hours_per_segment], axis=1, sort=True) - - -html_file = open(snakemake.output[0], "a", encoding="utf-8") -if 
len(phone_data_yield_minutes.keys()) == 0: - html_file.write("There is no sensor data for the sensors in [PHONE_DATA_YIELD][SENSORS].") -for time_segment in phone_data_yield_minutes.keys(): - minutes_data_for_plot = phone_data_yield_minutes[time_segment].transpose().reindex(pd.Index(y_axis_labels)).round(3) - hours_data_for_plot = phone_data_yield_hours[time_segment].transpose().reindex(pd.Index(y_axis_labels)).round(3) - - getPhoneDataYieldHeatmap(minutes_data_for_plot, y_axis_labels, time_segment, "minutes", time, html_file) - getPhoneDataYieldHeatmap(hours_data_for_plot, y_axis_labels, time_segment, "hours", time, html_file) html_file.close() diff --git a/src/visualization/heatmap_sensor_row_count_per_time_segment.py b/src/visualization/heatmap_sensor_row_count_per_time_segment.py index 6b62e6e1..ecea23cf 100644 --- a/src/visualization/heatmap_sensor_row_count_per_time_segment.py +++ b/src/visualization/heatmap_sensor_row_count_per_time_segment.py @@ -1,89 +1,96 @@ import pandas as pd import numpy as np -import plotly.graph_objects as go +import plotly.express as px from importlib import util from pathlib import Path import yaml - -def getRowCountHeatmap(data_for_plot, scaled_data_for_plot, pid, time_segment, html_file): - - fig = go.Figure(data=go.Heatmap(z=scaled_data_for_plot.values.tolist(), - x=data_for_plot.columns, - y=data_for_plot.index, - hovertext=data_for_plot.values.tolist(), - hovertemplate="Segment start: %{x}
Sensor: %{y}
Row count: %{hovertext}", - zmin=0, zmax=1, - colorscale='Viridis')) - - fig.update_layout(title="Heatmap of sensor row count for " + time_segment + " segments. Pid: " + pid +". Label: " + label + "
y-axis shows the included sensors.
x-axis shows the start (date and time) of a time segment.
z-axis (color) shows row count per sensor per segment instance.") - fig["layout"].update(margin=dict(t=160)) - - html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) - - - - # import filter_data_by_segment from src/features/utils/utils.py spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "features" / "utils" / "utils.py")) mod = util.module_from_spec(spec) spec.loader.exec_module(mod) filter_data_by_segment = getattr(mod, "filter_data_by_segment") +def getRowCount(sensor_paths, sensor_names, time_segments_labels): + sensors_row_count = pd.DataFrame() + for sensor_path, sensor_name in zip(sensor_paths, sensor_names): + sensor_data = pd.read_csv(sensor_path, usecols=["assigned_segments"]) + + sensor_row_count = pd.DataFrame() + if not sensor_data.empty: + for time_segment in time_segments_labels: + sensor_data_per_segment = filter_data_by_segment(sensor_data, time_segment) + + if not sensor_data_per_segment.empty: + sensor_row_count = pd.concat([sensor_row_count, sensor_data_per_segment.groupby(["local_segment"])[["local_segment"]].count().rename(columns={"local_segment": sensor_name})], axis=0, sort=False) + sensors_row_count = pd.concat([sensors_row_count, sensor_row_count], axis=1, sort=False) + + sensors_row_count.index.name = "local_segment" + sensors_row_count.index = sensors_row_count.index.str.replace(r"_RR\d+SS#", "#") + + return sensors_row_count + +def getRowCountHeatmap(data_for_plot, pid, time_segment, html_file): + + fig = px.timeline(data_for_plot, + x_start="local_segment_start_datetime", + x_end="local_segment_end_datetime", + y="sensor", + color="scaled_value", + color_continuous_scale="Peach", + range_color=[0, 1], + opacity=0.7, + hover_data={"local_segment_start_datetime":False, "local_segment_end_datetime":False, "local_segment":True, "value":True, "scaled_value":True}) + + fig.update_layout(title="Heatmap of sensor row count for " + time_segment + " segments. Pid: " + pid +". Label: " + label + "
y-axis shows the included sensors.
x-axis shows time segments.
z-axis (color) shows row count per sensor per segment instance.", + xaxis=dict(side="bottom", title="Time Segments"), + yaxis=dict(side="left", title="Sensors"), + margin=dict(t=160)) + + html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + + return html_file - -phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], index_col=["local_segment_start_datetime"], parse_dates=["local_segment_start_datetime"]) -# make sure the phone_data_yield file contains "phone_data_yield_rapids_ratiovalidyieldedminutes" and "phone_data_yield_rapids_ratiovalidyieldedhours" columns -if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns): - raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].") -phone_data_yield = phone_data_yield[["local_segment_label", "phone_data_yield_rapids_ratiovalidyieldedminutes", "phone_data_yield_rapids_ratiovalidyieldedhours"]] - -time_segments = pd.read_csv(snakemake.input["time_segments_labels"], header=0)["label"] -pid = snakemake.params["pid"] - with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f: participant_file = yaml.safe_load(f) label = participant_file["PHONE"]["LABEL"] -sensor_names = [] -sensors_row_count = dict(zip(time_segments, [pd.DataFrame()] * len(time_segments))) +pid = snakemake.params["pid"] +sensor_names = [sensor_name.lower() for sensor_name in snakemake.params["sensor_names"]] +time_segments_type = snakemake.params["time_segments_type"] +time_segments_labels = pd.read_csv(snakemake.input["time_segments_labels"], header=0)["label"] -for sensor_path in snakemake.input["all_sensors"]: - sensor_data = pd.read_csv(sensor_path, usecols=["assigned_segments"]) - sensor_name = sensor_path.split("/")[-1].replace("_with_datetime.csv", "") - 
sensor_names.append(sensor_name) - - if not sensor_data.empty: - for time_segment in time_segments: - sensor_data_per_segment = filter_data_by_segment(sensor_data, time_segment) +phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], index_col=["local_segment"], parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"]) #index_col=["local_segment_start_datetime"], - if not sensor_data_per_segment.empty: - # extract local start datetime of the segment from "local_segment" column - sensor_data_per_segment["local_segment_start_datetime"] = pd.to_datetime(sensor_data_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0])) - sensor_row_count = sensor_data_per_segment.groupby("local_segment_start_datetime")[["local_segment"]].count().rename(columns={"local_segment": sensor_name}) - sensors_row_count[time_segment] = pd.concat([sensors_row_count[time_segment], sensor_row_count], axis=1, sort=False) +# make sure the phone_data_yield file contains "phone_data_yield_rapids_ratiovalidyieldedminutes" and "phone_data_yield_rapids_ratiovalidyieldedhours" columns +if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns): + raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].") +phone_data_yield.loc[:, ["phone_data_yield_rapids_ratiovalidyieldedminutes", "phone_data_yield_rapids_ratiovalidyieldedhours"]] = phone_data_yield.loc[:, ["phone_data_yield_rapids_ratiovalidyieldedminutes", "phone_data_yield_rapids_ratiovalidyieldedhours"]].round(3).clip(upper=1) + +sensors_row_count = getRowCount(snakemake.input["all_sensors"], sensor_names, time_segments_labels) +data_for_plot = phone_data_yield.rename(columns={"phone_data_yield_rapids_ratiovalidyieldedminutes": 
"ratiovalidyieldedminutes","phone_data_yield_rapids_ratiovalidyieldedhours": "ratiovalidyieldedhours"}).merge(sensors_row_count, how="left", left_index=True, right_index=True).reset_index() + + +if time_segments_type == "FREQUENCY": + data_for_plot["local_segment_label"] = data_for_plot["local_segment_label"].str[:-4] +elif time_segments_type == "EVENT": + data_for_plot["local_segment_label"] = "event" -# add phone data yield features and plot heatmap -html_file = open(snakemake.output[0], "a", encoding="utf-8") sensor_names.extend(["ratiovalidyieldedminutes", "ratiovalidyieldedhours"]) -for time_segment in time_segments: - if not phone_data_yield.empty: - phone_data_yield_per_segment = phone_data_yield[phone_data_yield["local_segment_label"] == time_segment].rename(columns={"phone_data_yield_rapids_ratiovalidyieldedminutes": "ratiovalidyieldedminutes","phone_data_yield_rapids_ratiovalidyieldedhours": "ratiovalidyieldedhours"}).round(3) - if not phone_data_yield_per_segment.empty: - sensors_row_count[time_segment] = pd.concat([sensors_row_count[time_segment], phone_data_yield_per_segment], axis=1, sort=True) - - # consider all the sensors - data_for_plot = sensors_row_count[time_segment].transpose().reindex(pd.Index(sensor_names)) - - if data_for_plot.empty: +html_file = open(snakemake.output[0], "a", encoding="utf-8") +for time_segment in set(data_for_plot["local_segment_label"]): + if not data_for_plot.empty: + data_for_plot_per_segment = data_for_plot[data_for_plot["local_segment_label"] == time_segment] + if data_for_plot_per_segment.empty: html_file.write("There are no records of selected sensors in database for " + time_segment + " segments. Pid: " + pid + ". Label: " + label + ".
") else: + data_for_plot_per_segment = data_for_plot_per_segment.reindex(columns=["local_segment", "local_segment_start_datetime", "local_segment_end_datetime"] + sensor_names).set_index(["local_segment", "local_segment_start_datetime", "local_segment_end_datetime"]) # except for phone data yield sensor, scale each sensor (row) to the range of [0, 1] - scaled_data_for_plot = data_for_plot.copy() - scaled_data_for_plot.loc[sensor_names[:-2]] = scaled_data_for_plot.fillna(np.nan).loc[sensor_names[:-2]].apply(lambda x: (x - np.nanmin(x)) / (np.nanmax(x) - np.nanmin(x)) if np.nanmax(x) != np.nanmin(x) else (x / np.nanmin(x)), axis=1) - - getRowCountHeatmap(data_for_plot, scaled_data_for_plot, pid, time_segment, html_file) + scaled_data_for_plot_per_segment = data_for_plot_per_segment.copy() + scaled_data_for_plot_per_segment[sensor_names[:-2]] = scaled_data_for_plot_per_segment.fillna(np.nan)[sensor_names[:-2]].apply(lambda x: (x - np.nanmin(x)) / (np.nanmax(x) - np.nanmin(x)) if np.nanmax(x) != np.nanmin(x) else (x / np.nanmin(x)), axis=0) + data_for_plot_processed = pd.concat([data_for_plot_per_segment.stack(dropna=False).to_frame("value"), scaled_data_for_plot_per_segment.stack(dropna=False).round(3).to_frame("scaled_value")], axis=1).reset_index().rename(columns={"level_3": "sensor"}) + getRowCountHeatmap(data_for_plot_processed, pid, time_segment, html_file) html_file.close() diff --git a/src/visualization/heatmap_sensors_per_minute_per_time_segment.py b/src/visualization/heatmap_sensors_per_minute_per_time_segment.py index dd524322..bc975da2 100644 --- a/src/visualization/heatmap_sensors_per_minute_per_time_segment.py +++ b/src/visualization/heatmap_sensors_per_minute_per_time_segment.py @@ -5,6 +5,11 @@ from importlib import util from pathlib import Path import yaml +# import filter_data_by_segment from src/features/utils/utils.py +spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "features" / "utils" / "utils.py")) +mod = 
util.module_from_spec(spec) +spec.loader.exec_module(mod) +filter_data_by_segment = getattr(mod, "filter_data_by_segment") def colors2colorscale(colors): colorscale = [] @@ -16,85 +21,83 @@ def colors2colorscale(colors): colorscale.append([1, colors[i]]) return colorscale -def getSensorsPerMinPerSegmentHeatmap(phone_data_yield, pid, time_segment, html_file): - - x_axis_labels = [pd.Timedelta(minutes=x) for x in phone_data_yield.columns] +def getDataForPlot(phone_data_yield_per_segment): + # calculate the length (in minute) of per segment instance + phone_data_yield_per_segment["length"] = phone_data_yield_per_segment["timestamps_segment"].str.split(",").apply(lambda x: int((int(x[1])-int(x[0])) / (1000 * 60))) + # calculate the number of sensors logged at least one row of data per minute. + phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(["local_segment", "length", "local_date", "local_hour", "local_minute"])[["sensor", "local_date_time"]].max().reset_index() + # extract local start datetime of the segment from "local_segment" column + phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(phone_data_yield_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0])) + # calculate the number of minutes after local start datetime of the segment + phone_data_yield_per_segment["minutes_after_segment_start"] = ((phone_data_yield_per_segment["local_date_time"] - phone_data_yield_per_segment["local_segment_start_datetimes"]) / pd.Timedelta(minutes=1)).astype("int") - fig = go.Figure(data=go.Heatmap(z=phone_data_yield.values.tolist(), - x=x_axis_labels, - y=phone_data_yield.index, - zmin=0, zmax=16, - colorscale=colors2colorscale(colors), - colorbar=dict(thickness=25, tickvals=[1/2 + x for x in range(16)],ticktext=[x for x in range(16)]))) + # impute missing rows with 0 + columns_for_full_index = phone_data_yield_per_segment[["local_segment_start_datetimes", "length"]].drop_duplicates(keep="first") + 
columns_for_full_index = columns_for_full_index.apply(lambda row: [[row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)], axis=1) + full_index = [] + for columns in columns_for_full_index: + full_index = full_index + columns + full_index = pd.MultiIndex.from_tuples(full_index, names=("local_segment_start_datetimes", "minutes_after_segment_start")) + phone_data_yield_per_segment = phone_data_yield_per_segment.set_index(["local_segment_start_datetimes", "minutes_after_segment_start"]).reindex(full_index).reset_index().fillna(0) - fig.update_layout(title="Number of sensors with any data per minute for " + time_segment + " segments. Pid: "+pid+". Label: " + label + "
y-axis shows the start (date and time) of a time segment.
x-axis shows the time since the start of the time segment.
z-axis (color) shows how many sensors logged at least one row of data per minute.") - fig["layout"].update(margin=dict(t=160)) - - html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) - - - - - -# import filter_data_by_segment from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "features" / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -filter_data_by_segment = getattr(mod, "filter_data_by_segment") - - - - - - + # transpose the dataframe per local start datetime of the segment and discard the useless index layer + phone_data_yield_per_segment = phone_data_yield_per_segment.groupby("local_segment_start_datetimes")[["minutes_after_segment_start", "sensor"]].apply(lambda x: x.set_index("minutes_after_segment_start").transpose()) + phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values("local_segment_start_datetimes") + return phone_data_yield_per_segment +def getSensorsPerMinPerSegmentHeatmap(phone_data_yield, pid, label, time_segment, html_file): + if phone_data_yield.empty: + html_file.write("There is no sensor data of " + time_segment + " segments for " + pid + " (pid) and " + label + " (label).
") + else: + phone_data_yield.sort_index(inplace=True) + x_axis_labels = [pd.Timedelta(minutes=x) for x in phone_data_yield.columns] + + fig = go.Figure(data=go.Heatmap(z=phone_data_yield.values.tolist(), + x=x_axis_labels, + y=phone_data_yield.index, + zmin=0, zmax=16, + colorscale=colors2colorscale(colors), + colorbar=dict(thickness=25, tickvals=[1/2 + x for x in range(16)],ticktext=[x for x in range(16)]))) + + fig.update_layout(title="Number of sensors with any data per minute for " + time_segment + " segments. Pid: "+pid+". Label: " + label + "
y-axis shows the start (date and time) of a time segment.
x-axis shows the time since the start of the time segment.
z-axis (color) shows how many sensors logged at least one row of data per minute.") + fig["layout"].update(margin=dict(t=160)) + + html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + return colors = ["red", "#3D0751", "#423176", "#414381", "#3F5688", "#42678B", "#42768C", "#45868B", "#4A968A", "#53A485", "#5FB57E", "#76C170", "#91CF63", "#B4DA55", "#D9E152", "#F8E755", "#DEE00F"] pid = snakemake.params["pid"] -time_segments_labels = pd.read_csv(snakemake.input["time_segments_labels"], header=0) +time_segments_type = snakemake.params["time_segments_type"] +time_segments_labels = pd.read_csv(snakemake.input["time_segments_labels"]) with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f: participant_file = yaml.safe_load(f) label = participant_file["PHONE"]["LABEL"] phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], parse_dates=["local_date_time"]) +if time_segments_type == "FREQUENCY": + phone_data_yield["assigned_segments"] = phone_data_yield["assigned_segments"].str.replace(r"[0-9]{4}#", "#") + time_segments_labels["label"] = time_segments_labels["label"].str[:-4] +if time_segments_type == "PERIODIC": + phone_data_yield["assigned_segments"] = phone_data_yield["assigned_segments"].str.replace(r"_RR\d+SS#", "#") + time_segments_labels["label"] = time_segments_labels["label"].str.replace(r"_RR\d+SS$", "") html_file = open(snakemake.output[0], "a", encoding="utf-8") if phone_data_yield.empty: html_file.write("There is no sensor data for " + pid + " (pid) and " + label + " (label).") else: - for time_segment in time_segments_labels["label"]: + data_for_plot = pd.DataFrame() + for time_segment in set(time_segments_labels["label"]): phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment) - - if phone_data_yield_per_segment.empty: - html_file.write("There is no sensor data of " + time_segment + " segments for " + pid + " (pid) and " + label + " (label).
") - else: - # calculate the length (in minute) of per segment instance - phone_data_yield_per_segment["length"] = phone_data_yield_per_segment["timestamps_segment"].str.split(",").apply(lambda x: int((int(x[1])-int(x[0])) / (1000 * 60))) - # calculate the number of sensors logged at least one row of data per minute. - phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(["local_segment", "length", "local_date", "local_hour", "local_minute"])[["sensor", "local_date_time"]].max().reset_index() - # extract local start datetime of the segment from "local_segment" column - phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(phone_data_yield_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0])) - # calculate the number of minutes after local start datetime of the segment - phone_data_yield_per_segment["minutes_after_segment_start"] = ((phone_data_yield_per_segment["local_date_time"] - phone_data_yield_per_segment["local_segment_start_datetimes"]) / pd.Timedelta(minutes=1)).astype("int") - - # impute missing rows with 0 - columns_for_full_index = phone_data_yield_per_segment[["local_segment_start_datetimes", "length"]].drop_duplicates(keep="first") - columns_for_full_index = columns_for_full_index.apply(lambda row: [[row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)], axis=1) - full_index = [] - for columns in columns_for_full_index: - full_index = full_index + columns - full_index = pd.MultiIndex.from_tuples(full_index, names=("local_segment_start_datetimes", "minutes_after_segment_start")) - phone_data_yield_per_segment = phone_data_yield_per_segment.set_index(["local_segment_start_datetimes", "minutes_after_segment_start"]).reindex(full_index).reset_index().fillna(0) - - # transpose the dataframe per local start datetime of the segment and discard the useless index layer - phone_data_yield_per_segment = 
phone_data_yield_per_segment.groupby("local_segment_start_datetimes")[["minutes_after_segment_start", "sensor"]].apply(lambda x: x.set_index("minutes_after_segment_start").transpose()) - phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values("local_segment_start_datetimes") - - # get heatmap - getSensorsPerMinPerSegmentHeatmap(phone_data_yield_per_segment, pid, time_segment, html_file) - + if not phone_data_yield_per_segment.empty: + data_for_plot_per_segment = getDataForPlot(phone_data_yield_per_segment) + if time_segments_type == "EVENT": + data_for_plot = pd.concat([data_for_plot, data_for_plot_per_segment], axis=0) + else: + getSensorsPerMinPerSegmentHeatmap(data_for_plot_per_segment, pid, label, time_segment, html_file) + if time_segments_type == "EVENT": + getSensorsPerMinPerSegmentHeatmap(data_for_plot, pid, label, "event", html_file) html_file.close() diff --git a/src/visualization/histogram_phone_data_yield.py b/src/visualization/histogram_phone_data_yield.py index cd15ec8d..3f765b7b 100644 --- a/src/visualization/histogram_phone_data_yield.py +++ b/src/visualization/histogram_phone_data_yield.py @@ -2,8 +2,14 @@ import pandas as pd import plotly.express as px +time_segments_type = snakemake.params["time_segments_type"] phone_data_yield = pd.read_csv(snakemake.input[0]) +if time_segments_type == "FREQUENCY": + phone_data_yield["local_segment_label"] = phone_data_yield["local_segment_label"].str[:-4] +if time_segments_type == "EVENT": + phone_data_yield["local_segment_label"] = "event" + # make sure the input file contains "phone_data_yield_rapids_ratiovalidyieldedminutes" and "phone_data_yield_rapids_ratiovalidyieldedhours" columns if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns): raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] 
contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].") @@ -12,6 +18,8 @@ html_file = open(snakemake.output[0], "a", encoding="utf-8") if phone_data_yield.empty: html_file.write("There is no sensor data for the sensors in [PHONE_DATA_YIELD][SENSORS].") else: + phone_data_yield.loc[:, ["phone_data_yield_rapids_ratiovalidyieldedminutes", "phone_data_yield_rapids_ratiovalidyieldedhours"]] = phone_data_yield.loc[:, ["phone_data_yield_rapids_ratiovalidyieldedminutes", "phone_data_yield_rapids_ratiovalidyieldedhours"]].round(3).clip(upper=1) + # plot ratio valid yielded minutes histogram fig_ratiovalidyieldedminutes = px.histogram(phone_data_yield, x="phone_data_yield_rapids_ratiovalidyieldedminutes", color="local_segment_label") fig_ratiovalidyieldedminutes.update_layout(title="Histogram of valid yielded minutes ratio per time segment.") diff --git a/tools/config.schema.yaml b/tools/config.schema.yaml index 01e7ec02..d1c30054 100644 --- a/tools/config.schema.yaml +++ b/tools/config.schema.yaml @@ -1187,10 +1187,13 @@ properties: HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT: type: object - required: [PLOT] + required: [PLOT, TIME] properties: PLOT: type: boolean + TIME: + type: string + enum: [ABSOLUTE_TIME, RELATIVE_TIME] HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT: type: object