diff --git a/Snakefile b/Snakefile index 688e7675..04882a79 100644 --- a/Snakefile +++ b/Snakefile @@ -231,20 +231,19 @@ for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys(): if config["HISTOGRAM_PHONE_DATA_YIELD"]["PLOT"]: files_to_compute.append("reports/data_exploration/histogram_phone_data_yield.html") -# visualization for data exploration -# if config["HEATMAP_FEATURES_CORRELATIONS"]["PLOT"]: -# files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_features_correlations.html", min_valid_hours_per_day=config["HEATMAP_FEATURES_CORRELATIONS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) - -# if config["HEATMAP_DAYS_BY_SENSORS"]["PLOT"]: -# files_to_compute.extend(expand("reports/interim/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{pid}/heatmap_days_by_sensors.html", pid=config["PIDS"], min_valid_hours_per_day=config["HEATMAP_DAYS_BY_SENSORS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) -# files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_days_by_sensors_all_participants.html", min_valid_hours_per_day=config["HEATMAP_DAYS_BY_SENSORS"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) +if config["HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT"]["PLOT"]: + files_to_compute.extend(expand("reports/interim/{pid}/heatmap_sensors_per_minute_per_time_segment.html", pid=config["PIDS"])) + files_to_compute.append("reports/data_exploration/heatmap_sensors_per_minute_per_time_segment.html") -# if config["HEATMAP_SENSED_BINS"]["PLOT"]: -# files_to_compute.extend(expand("reports/interim/heatmap_sensed_bins/{pid}/heatmap_sensed_bins.html", pid=config["PIDS"])) -# files_to_compute.extend(["reports/data_exploration/heatmap_sensed_bins_all_participants.html"]) +if config["HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT"]["PLOT"]: + files_to_compute.extend(expand("reports/interim/{pid}/heatmap_sensor_row_count_per_time_segment.html", pid=config["PIDS"])) + files_to_compute.append("reports/data_exploration/heatmap_sensor_row_count_per_time_segment.html") -# if config["OVERALL_COMPLIANCE_HEATMAP"]["PLOT"]: -# files_to_compute.extend(expand("reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/overall_compliance_heatmap.html", min_valid_hours_per_day=config["OVERALL_COMPLIANCE_HEATMAP"]["MIN_VALID_HOURS_PER_DAY"], min_valid_bins_per_hour=config["PHONE_VALID_SENSED_DAYS"]["MIN_VALID_BINS_PER_HOUR"])) +if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]: + files_to_compute.append("reports/data_exploration/heatmap_phone_data_yield_per_participant_per_time_segment.html") + +if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]: + files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html") rule all: diff --git a/config.yaml b/config.yaml index 1aef391f..49cfed3e 100644 --- a/config.yaml +++ b/config.yaml @@ -259,9 +259,6 @@ PHONE_WIFI_VISIBLE: - - - ######################################################################################################################## # FITBIT # ######################################################################################################################## @@ -350,7 +347,6 @@ FITBIT_STEPS_INTRADAY: - ######################################################################################################################## # PLOTS # ######################################################################################################################## @@ -358,32 +354,19 @@ FITBIT_STEPS_INTRADAY: HISTOGRAM_PHONE_DATA_YIELD: PLOT: False -HEATMAP_FEATURES_CORRELATIONS: +HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT: + PLOT: False + +HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT: + PLOT: False + SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE] + +HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT: + PLOT: False + +HEATMAP_FEATURE_CORRELATION_MATRIX: PLOT: False MIN_ROWS_RATIO: 0.5 - MIN_VALID_HOURS_PER_DAY: #*min_valid_hours_per_day - MIN_VALID_BINS_PER_HOUR: #*min_valid_bins_per_hour - PHONE_FEATURES: [accelerometer, activity_recognition, applications_foreground, battery, calls_incoming, calls_missed, calls_outgoing, conversation, light, location_doryab, messages_received, messages_sent, screen] - FITBIT_FEATURES: [fitbit_heartrate, fitbit_step, fitbit_sleep] CORR_THRESHOLD: 0.1 CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"} -HEATMAP_DAYS_BY_SENSORS: - PLOT: False - MIN_VALID_HOURS_PER_DAY: #*min_valid_hours_per_day - MIN_VALID_BINS_PER_HOUR: #*min_valid_bins_per_hour - EXPECTED_NUM_OF_DAYS: -1 - DB_TABLES: [accelerometer, applications_foreground, battery, bluetooth, calls, light, locations, messages, screen, wifi, sensor_wifi, plugin_google_activity_recognition, plugin_ios_activity_recognition, plugin_studentlife_audio_android, plugin_studentlife_audio] - -HEATMAP_SENSED_BINS: - PLOT: False - BIN_SIZE: #*bin_size - -OVERALL_COMPLIANCE_HEATMAP: - PLOT: False - ONLY_SHOW_VALID_DAYS: False - EXPECTED_NUM_OF_DAYS: -1 - BIN_SIZE: #*bin_size - MIN_VALID_HOURS_PER_DAY: #*min_valid_hours_per_day - MIN_VALID_BINS_PER_HOUR: #*min_valid_bins_per_hour - diff --git a/example_profile/Snakefile b/example_profile/Snakefile index 49530a30..acb3b1e1 100644 --- a/example_profile/Snakefile +++ b/example_profile/Snakefile @@ -214,6 +214,20 @@ for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys(): if config["HISTOGRAM_PHONE_DATA_YIELD"]["PLOT"]: files_to_compute.append("reports/data_exploration/histogram_phone_data_yield.html") +if config["HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT"]["PLOT"]: + files_to_compute.extend(expand("reports/interim/{pid}/heatmap_sensors_per_minute_per_time_segment.html", pid=config["PIDS"])) + files_to_compute.append("reports/data_exploration/heatmap_sensors_per_minute_per_time_segment.html") + +if config["HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT"]["PLOT"]: + files_to_compute.extend(expand("reports/interim/{pid}/heatmap_sensor_row_count_per_time_segment.html", pid=config["PIDS"])) + files_to_compute.append("reports/data_exploration/heatmap_sensor_row_count_per_time_segment.html") + +if config["HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT"]["PLOT"]: + files_to_compute.append("reports/data_exploration/heatmap_phone_data_yield_per_participant_per_time_segment.html") + +if config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["PLOT"]: + files_to_compute.append("reports/data_exploration/heatmap_feature_correlation_matrix.html") + # Analysis Workflow Example models, scalers = [], [] for model_name in config["PARAMS_FOR_ANALYSIS"]["MODEL_NAMES"]: diff --git a/example_profile/example_config.yaml b/example_profile/example_config.yaml index 8285c716..b78a731d 100644 --- a/example_profile/example_config.yaml +++ b/example_profile/example_config.yaml @@ -323,6 +323,22 @@ FITBIT_STEPS_INTRADAY: HISTOGRAM_PHONE_DATA_YIELD: PLOT: True +HEATMAP_SENSORS_PER_MINUTE_PER_TIME_SEGMENT: + PLOT: True + +HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT: + PLOT: True + SENSORS: [PHONE_ACCELEROMETER, PHONE_ACTIVITY_RECOGNITION, PHONE_APPLICATIONS_FOREGROUND, PHONE_BATTERY, PHONE_BLUETOOTH, PHONE_CALLS, PHONE_CONVERSATION, PHONE_LIGHT, PHONE_LOCATIONS, PHONE_MESSAGES, PHONE_SCREEN, PHONE_WIFI_CONNECTED, PHONE_WIFI_VISIBLE] + +HEATMAP_PHONE_DATA_YIELD_PER_PARTICIPANT_PER_TIME_SEGMENT: + PLOT: True + +HEATMAP_FEATURE_CORRELATION_MATRIX: + PLOT: TRUE + MIN_ROWS_RATIO: 0.5 + CORR_THRESHOLD: 0.1 + CORR_METHOD: "pearson" # choose from {"pearson", "kendall", "spearman"} + ######################################################################################################################## diff --git a/rules/common.smk b/rules/common.smk index b3b0c815..ef9af1ca 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -1,19 +1,3 @@ -# Common.smk ########################################################################################################## - -def infer_participant_platform(participant_file): - with open(participant_file, encoding="ISO-8859-1") as external_file: - external_file_content = external_file.readlines() - platforms = external_file_content[1].strip().split(",") - if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms): - platform = "android" - else: - platform = platforms[0] - - if platform not in ["android", "ios"]: - raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + platforms + "'") - - return platform - # Features.smk ######################################################################################################### def find_features_files(wildcards): feature_files = [] @@ -38,14 +22,3 @@ def input_merge_sensor_features_for_individual_participants(wildcards): break return feature_files -# Reports.smk ########################################################################################################### - -def optional_heatmap_days_by_sensors_input(wildcards): - platform = infer_participant_platform("data/external/"+wildcards.pid) - - if platform == "android": - tables_platform = [table for table in config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]] # for android, discard any ios tables that may exist - elif platform == "ios": - tables_platform = [table for table in config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist - - return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform) diff --git a/rules/reports.smk b/rules/reports.smk index 6071db09..9e30ed35 100644 --- a/rules/reports.smk +++ b/rules/reports.smk @@ -6,74 +6,66 @@ rule histogram_phone_data_yield: script: "../src/visualization/histogram_phone_data_yield.py" - - - -rule heatmap_features_correlations: +rule heatmap_sensors_per_minute_per_time_segment: input: - features = expand("data/processed/{pid}/{sensor}_{time_segment}.csv", pid=config["PIDS"], sensor=config["HEATMAP_FEATURES_CORRELATIONS"]["PHONE_FEATURES"]+config["HEATMAP_FEATURES_CORRELATIONS"]["FITBIT_FEATURES"], time_segment=config["TIME_SEGMENTS"]), - phone_valid_sensed_days = expand("data/interim/{pid}/phone_valid_sensed_days_{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins.csv", pid=config["PIDS"]) + phone_data_yield = "data/interim/{pid}/phone_yielded_timestamps_with_datetime.csv", + participant_file = "data/external/participant_files/{pid}.yaml", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: - min_rows_ratio = config["HEATMAP_FEATURES_CORRELATIONS"]["MIN_ROWS_RATIO"], - corr_threshold = config["HEATMAP_FEATURES_CORRELATIONS"]["CORR_THRESHOLD"], - corr_method = config["HEATMAP_FEATURES_CORRELATIONS"]["CORR_METHOD"] + pid = "{pid}" output: - "reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_features_correlations.html" + "reports/interim/{pid}/heatmap_sensors_per_minute_per_time_segment.html" script: - "../src/visualization/heatmap_features_correlations.py" + "../src/visualization/heatmap_sensors_per_minute_per_time_segment.py" -rule heatmap_days_by_sensors: +rule merge_heatmap_sensors_per_minute_per_time_segment: input: - sensors = optional_heatmap_days_by_sensors_input, - phone_valid_sensed_days = "data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv" + heatmap_sensors_per_minute_per_time_segment = expand("reports/interim/{pid}/heatmap_sensors_per_minute_per_time_segment.html", pid=config["PIDS"]) + output: + "reports/data_exploration/heatmap_sensors_per_minute_per_time_segment.html" + script: + "../src/visualization/merge_heatmap_sensors_per_minute_per_time_segment.Rmd" + +rule heatmap_sensor_row_count_per_time_segment: + input: + all_sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor = map(str.lower, config["HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT"]["SENSORS"])), + phone_data_yield = "data/processed/features/{pid}/phone_data_yield.csv", + participant_file = "data/external/participant_files/{pid}.yaml", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" params: - pid = "{pid}", - expected_num_of_days = config["HEATMAP_DAYS_BY_SENSORS"]["EXPECTED_NUM_OF_DAYS"] + pid = "{pid}" output: - "reports/interim/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{pid}/heatmap_days_by_sensors.html" + "reports/interim/{pid}/heatmap_sensor_row_count_per_time_segment.html" script: - "../src/visualization/heatmap_days_by_sensors.py" + "../src/visualization/heatmap_sensor_row_count_per_time_segment.py" -rule heatmap_days_by_sensors_all_participants: +rule merge_heatmap_sensor_row_count_per_time_segment: input: - heatmap_rows = expand("reports/interim/{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins/{pid}/heatmap_days_by_sensors.html", pid=config["PIDS"]) + heatmap_sensor_row_count_per_time_segment = expand("reports/interim/{pid}/heatmap_sensor_row_count_per_time_segment.html", pid=config["PIDS"]) output: - "reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/heatmap_days_by_sensors_all_participants.html" + "reports/data_exploration/heatmap_sensor_row_count_per_time_segment.html" script: - "../src/visualization/heatmap_days_by_sensors_all_participants.Rmd" + "../src/visualization/merge_heatmap_sensor_row_count_per_time_segment.Rmd" -rule heatmap_sensed_bins: +rule heatmap_phone_data_yield_per_participant_per_time_segment: input: - sensor = "data/interim/{pid}/phone_sensed_bins.csv", - pid_file = "data/external/{pid}" + phone_data_yield = expand("data/processed/features/{pid}/phone_data_yield.csv", pid=config["PIDS"]), + participant_file = expand("data/external/participant_files/{pid}.yaml", pid=config["PIDS"]), + time_segments_labels = expand("data/interim/time_segments/{pid}_time_segments_labels.csv", pid=config["PIDS"]) + output: + "reports/data_exploration/heatmap_phone_data_yield_per_participant_per_time_segment.html" + script: + "../src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py" + +rule heatmap_feature_correlation_matrix: + input: + all_sensor_features = "data/processed/features/all_participants/all_sensor_features.csv" # before data cleaning params: - pid = "{pid}", - bin_size = config["HEATMAP_SENSED_BINS"]["BIN_SIZE"] + min_rows_ratio = config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["MIN_ROWS_RATIO"], + corr_threshold = config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["CORR_THRESHOLD"], + corr_method = config["HEATMAP_FEATURE_CORRELATION_MATRIX"]["CORR_METHOD"] output: - "reports/interim/heatmap_sensed_bins/{pid}/heatmap_sensed_bins.html" + "reports/data_exploration/heatmap_feature_correlation_matrix.html" script: - "../src/visualization/heatmap_sensed_bins.py" + "../src/visualization/heatmap_feature_correlation_matrix.py" -rule heatmap_sensed_bins_all_participants: - input: - heatmap_sensed_bins = expand("reports/interim/heatmap_sensed_bins/{pid}/heatmap_sensed_bins.html", pid=config["PIDS"]) - output: - "reports/data_exploration/heatmap_sensed_bins_all_participants.html" - script: - "../src/visualization/heatmap_sensed_bins_all_participants.Rmd" - -rule overall_compliance_heatmap: - input: - phone_sensed_bins = expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]), - phone_valid_sensed_days = expand("data/interim/{pid}/phone_valid_sensed_days_{{min_valid_hours_per_day}}hours_{{min_valid_bins_per_hour}}bins.csv", pid=config["PIDS"]), - pid_files = expand("data/external/{pid}", pid=config["PIDS"]) - params: - only_show_valid_days = config["OVERALL_COMPLIANCE_HEATMAP"]["ONLY_SHOW_VALID_DAYS"], - local_timezone = config["PHONE_DATA_CONFIGURATION"]["TIMEZONE"]["VALUE"], - expected_num_of_days = config["OVERALL_COMPLIANCE_HEATMAP"]["EXPECTED_NUM_OF_DAYS"], - bin_size = config["OVERALL_COMPLIANCE_HEATMAP"]["BIN_SIZE"], - min_bins_per_hour = "{min_valid_bins_per_hour}" - output: - "reports/data_exploration/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/overall_compliance_heatmap.html" - script: - "../src/visualization/overall_compliance_heatmap.py" diff --git a/src/visualization/battery_consumption_rates_barchart.py b/src/visualization/battery_consumption_rates_barchart.py deleted file mode 100644 index 148b76ef..00000000 --- a/src/visualization/battery_consumption_rates_barchart.py +++ /dev/null @@ -1,34 +0,0 @@ -import pandas as pd -import datetime -import plotly.io as pio -import plotly.graph_objects as go - -def getBatteryConsumptionRatesBarChart(battery_data, pid): - plot = go.Figure(go.Bar( - x=battery_data["battery_daily_avgconsumptionrate"], - y=battery_data["local_date"].apply(lambda x: x.strftime("%Y/%m/%d")).tolist(), - orientation='h')) - plot.update_layout(title="Daily battery consumption rates bar chart for " + pid + "
Label: " + label + ", device_id: " + device_id, - xaxis_title="battery drains % per hour", - ) - return plot - - - -battery_data = pd.read_csv(snakemake.input["sensor"], parse_dates=["local_date"]) -pid = snakemake.params["pid"] - -with open(snakemake.input["pid_file"], encoding="ISO-8859-1") as external_file: - external_file_content = external_file.readlines() -device_id = external_file_content[0].split(",")[-1] -label = external_file_content[2] - -if battery_data.empty: - empty_html = open(snakemake.output[0], "w") - empty_html.write("There is no battery data for " + pid + "
Label: " + label + ", device_id: " + device_id) - empty_html.close() -else: - battery_data.set_index(["local_date"], inplace=True) - battery_data = battery_data.resample("1D").asfreq().fillna(0).reset_index() - plot = getBatteryConsumptionRatesBarChart(battery_data, pid) - pio.write_html(plot, file=snakemake.output[0], auto_open=False, include_plotlyjs="cdn") \ No newline at end of file diff --git a/src/visualization/compliance_report.Rmd b/src/visualization/compliance_report.Rmd deleted file mode 100644 index 2717875d..00000000 --- a/src/visualization/compliance_report.Rmd +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: "Compliance Report" -author: - - "MoSHI Pipeline" -date: "`r format(Sys.time(), '%d %B, %Y')`" -params: - rmd: "compliance_report.Rmd" -output: - html_document: - highlight: tango - number_sections: no - theme: default - toc: yes - toc_depth: 3 - toc_float: - collapsed: no - smooth_scroll: yes ---- - -```{r include=FALSE} -source("renv/activate.R") -``` - -## Overall phone compliance - -```{r, echo=FALSE} -htmltools::includeHTML(snakemake@input[["compliance_heatmap"]]) -``` - -## Per sensor compliance -```{r, echo=FALSE} -heatmaps <- snakemake@input[["sensor_heatmaps"]] -heatmaps.html <- vector(mode="list", length(heatmaps)) - -for(sensor_id in 1:length(heatmaps)){ - heatmaps.html[[sensor_id]] <- htmltools::includeHTML(heatmaps[sensor_id]) -} -htmltools::tagList(heatmaps.html) -``` diff --git a/src/visualization/heatmap_days_by_sensors.py b/src/visualization/heatmap_days_by_sensors.py deleted file mode 100644 index f1ab53c0..00000000 --- a/src/visualization/heatmap_days_by_sensors.py +++ /dev/null @@ -1,74 +0,0 @@ -import numpy as np -import pandas as pd -import plotly.io as pio -import plotly.graph_objects as go -from datetime import datetime, timedelta - -def getRowCountHeatmap(row_count_sensors_normalized, row_count_sensors, pid, output_path): - plot = go.Figure(data=go.Heatmap(z=row_count_sensors_normalized.T.values.tolist(), - x=[datetime.strftime(idx[0], "%Y/%m/%d")+"("+str(idx[1])+")" for idx in row_count_sensors.index], - y=row_count_sensors.columns.tolist(), - hovertext=row_count_sensors.T.values.tolist(), - hovertemplate="Date: %{x}
Sensor: %{y}
Row count: %{hovertext}", - colorscale="Viridis")) - plot.update_layout(title="Row count heatmap for " + pid) - pio.write_html(plot, file=output_path, auto_open=False, include_plotlyjs="cdn") - - - -phone_valid_sensed_days = pd.read_csv(snakemake.input["phone_valid_sensed_days"], parse_dates=["local_date"], index_col=["local_date"]) -phone_valid_sensed_days = phone_valid_sensed_days[phone_valid_sensed_days["is_valid_sensed_day"] == True] - -row_count_sensors = pd.DataFrame() -for sensor_path in snakemake.input["sensors"]: - sensor_name = sensor_path.split("/")[-1].replace("_with_datetime.csv", "") - # plugin_studentlife_audio_android or plugin_studentlife_audio => conversion; plugin_google_activity_recognition or plugin_ios_activity_recognition => AR; applications_foreground => apps - sensor_name = sensor_name.replace("plugin_studentlife_audio_android", "conversion").replace("plugin_studentlife_audio", "conversion") \ - .replace("plugin_google_activity_recognition", "AR").replace("plugin_ios_activity_recognition", "AR") \ - .replace("applications_foreground", "apps") - - sensor_data = pd.read_csv(sensor_path, encoding="ISO-8859-1", parse_dates=["local_date"], dtype={"label": str}) - if sensor_data.empty: - row_count_sensor = pd.DataFrame(columns=[sensor_name]) - else: - row_count_sensor = sensor_data[["timestamp", "local_date"]].groupby(["local_date"]).count().rename(columns={"timestamp": sensor_name}) - row_count_sensors = row_count_sensors.join(row_count_sensor, how="outer") - -row_count_sensors.index = pd.to_datetime(row_count_sensors.index) -row_count_sensors = row_count_sensors.join(phone_valid_sensed_days[["valid_sensed_hours"]], how="outer") - -if row_count_sensors.empty: - empty_html = open(snakemake.output[0], "w") - empty_html.write("There are no records of sensors in database.") - empty_html.close() -else: - # set date_idx based on the first date - reference_date = row_count_sensors.index.min() - last_date = row_count_sensors.index.max() - row_count_sensors["date_idx"] = (row_count_sensors.index - reference_date).days - row_count_sensors["local_date"] = row_count_sensors.index - row_count_sensors.set_index(["local_date", "date_idx"], inplace=True) - - - expected_num_of_days = int(snakemake.params["expected_num_of_days"]) - if expected_num_of_days < -1: - raise ValueError("EXPECTED_NUM_OF_DAYS of HEATMAP_DAYS_BY_SENSORS section in config.yaml must be larger or equal to -1.") - # if expected_num_of_days = -1, return all dates - expected_num_of_days = (last_date - reference_date).days if expected_num_of_days == -1 else expected_num_of_days - - # add empty rows to make sure different participants have the same date_idx range - date_idx_range = [idx for idx in range(expected_num_of_days)] - date_range = [reference_date + timedelta(days=idx) for idx in date_idx_range] - all_dates = pd.DataFrame({"local_date": date_range, "date_idx": date_idx_range}) - all_dates.set_index(["local_date", "date_idx"], inplace=True) - - row_count_sensors = row_count_sensors.merge(all_dates, left_index=True, right_index=True, how="right") - - # normalize each sensor (column) - if row_count_sensors.count().max() > 1: - row_count_sensors_normalized = row_count_sensors.fillna(np.nan).apply(lambda x: (x - np.nanmin(x)) / (np.nanmax(x) - np.nanmin(x)) if np.nanmax(x) != np.nanmin(x) else (x / np.nanmin(x)), axis=0) - else: - row_count_sensors_normalized = row_count_sensors - - pid = sensor_path.split("/")[2] - getRowCountHeatmap(row_count_sensors_normalized, row_count_sensors, pid, snakemake.output[0]) diff --git a/src/visualization/heatmap_feature_correlation_matrix.py b/src/visualization/heatmap_feature_correlation_matrix.py new file mode 100644 index 00000000..8fb01934 --- /dev/null +++ b/src/visualization/heatmap_feature_correlation_matrix.py @@ -0,0 +1,48 @@ +import numpy as np +import pandas as pd +import plotly.graph_objects as go + + +def getCorrMatrixHeatmap(corr_matrix, time_segment, html_file): + + feature_names = corr_matrix.columns + + fig = go.Figure(data=go.Heatmap(z=corr_matrix.values.tolist(), + x=feature_names, + y=feature_names, + colorscale="Viridis")) + + fig.update_layout(title="Correlation matrix of features of " + time_segment + " segments.") + + html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + + + +min_rows_ratio = snakemake.params["min_rows_ratio"] +corr_threshold = snakemake.params["corr_threshold"] +corr_method = snakemake.params["corr_method"] + +features = pd.read_csv(snakemake.input["all_sensor_features"]) +time_segments = set(features["local_segment_label"]) + +html_file = open(snakemake.output[0], "a", encoding="utf-8") +if features.empty: + html_file.write("There are no features for any participant.") +else: + + for time_segment in time_segments: + features_per_segment = features[features["local_segment_label"] == time_segment] + if features_per_segment.empty: + html_file.write("There are no features for " + time_segment + " segments.
") + else: + # drop useless columns + features_per_segment = features_per_segment.drop(["pid", "local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"], axis=1).astype(float) + # get correlation matrix + corr_matrix = features_per_segment.corr(method=corr_method, min_periods=min_rows_ratio * features_per_segment.shape[0]) + # replace correlation coefficients less than corr_threshold with NA + corr_matrix[(corr_matrix > -corr_threshold) & (corr_matrix < corr_threshold)] = np.nan + + # plot heatmap + getCorrMatrixHeatmap(corr_matrix, time_segment, html_file) + +html_file.close() diff --git a/src/visualization/heatmap_features_correlations.py b/src/visualization/heatmap_features_correlations.py deleted file mode 100644 index 8093a9db..00000000 --- a/src/visualization/heatmap_features_correlations.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -import pandas as pd -import plotly.io as pio -import plotly.graph_objects as go - - -def getCorrMatrixHeatmap(corr_matrix, output_path): - colnames = corr_matrix.columns - plot = go.Figure(data=go.Heatmap(z=corr_matrix.values.tolist(), - x=colnames, - y=colnames, - colorscale="Viridis")) - plot.update_layout(title="Correlation Matrix Heatmap") - pio.write_html(plot, file=output_path, auto_open=False, include_plotlyjs="cdn") - - -min_rows_ratio = snakemake.params["min_rows_ratio"] -corr_threshold = snakemake.params["corr_threshold"] - -# merge features -features, features_all_sensors = pd.DataFrame(columns=["local_date"]), pd.DataFrame(columns=["local_date"]) -pids = set() -last_pid = None -for path in snakemake.input["features"]: - pid = path.split("/")[2] - if pid not in pids: - pids.add(pid) - features_all_sensors["pid"] = last_pid - features = pd.concat([features, features_all_sensors], axis=0, ignore_index=True, sort=False) - features_all_sensors = pd.DataFrame(columns=["local_date"]) - features_per_sensor = pd.read_csv(path) - features_all_sensors = features_all_sensors.merge(features_per_sensor, on="local_date", how="outer") - last_pid = pid - -features_all_sensors["pid"] = last_pid -features = pd.concat([features, features_all_sensors], axis=0, ignore_index=True, sort=False) -features.set_index(["pid", "local_date"], inplace=True) - -# select days based on the input of "phone_valid_sensed_days" -selected_participants_and_days = pd.DataFrame() -for path in snakemake.input["phone_valid_sensed_days"]: - pid = path.split("/")[2] - phone_valid_sensed_days = pd.read_csv(path) - phone_valid_sensed_days = phone_valid_sensed_days[phone_valid_sensed_days["is_valid_sensed_day"] == True] - phone_valid_sensed_days["pid"] = pid - selected_participants_and_days = pd.concat([selected_participants_and_days, phone_valid_sensed_days], axis=0) - -selected_participants_and_days.set_index(["pid", "local_date"], inplace=True) -features = features.loc[features.index.intersection(selected_participants_and_days.index), :] - -# get correlation matrix -features = features.astype(float) -corr_matrix = features.corr(method=snakemake.params["corr_method"], min_periods=min_rows_ratio * features.shape[0]) - -# replace correlation coefficients less than corr_threshold with NA -corr_matrix[(corr_matrix > -corr_threshold) & (corr_matrix < corr_threshold)] = np.nan - -# plot heatmap -getCorrMatrixHeatmap(corr_matrix, snakemake.output[0]) diff --git a/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py b/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py new file mode 100644 index 00000000..fd9595c5 --- /dev/null +++ b/src/visualization/heatmap_phone_data_yield_per_participant_per_time_segment.py @@ -0,0 +1,85 @@ +import pandas as pd +import numpy as np +import plotly.graph_objects as go +import yaml + + + + + +def getPhoneDataYieldHeatmap(data_for_plot, y_axis_labels, time_segment, type, html_file): + + fig = go.Figure(data=go.Heatmap(z=data_for_plot.values.tolist(), + x=data_for_plot.columns.tolist(), + y=y_axis_labels, + hovertext=data_for_plot.values.tolist(), + hovertemplate="Time since first segment: %{x}
Participant: %{y}
Ratiovalidyielded" + type + ": %{z}", + zmin=0, zmax=1, + colorscale="Viridis")) + + fig.update_layout(title="Heatmap of valid yielded " + type + " ratio for " + time_segment + " segments.
y-axis shows participant information (format: pid.label).
x-axis shows the time since their first segment.
z-axis (color) shows valid yielded " + type + " ratio during a segment instance.") + + fig["layout"]["xaxis"].update(side="bottom") + fig["layout"].update(xaxis_title="Time Since First Segment") + fig["layout"].update(margin=dict(t=160)) + + html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + + + + + + + +y_axis_labels, phone_data_yield_minutes, phone_data_yield_hours = [], {}, {} +for phone_data_yield_path, participant_file_path, time_segments_path in zip(snakemake.input["phone_data_yield"], snakemake.input["participant_file"], snakemake.input["time_segments_labels"]): + + # set pid.label as y_axis_label + pid = phone_data_yield_path.split("/")[3] + time_segments = pd.read_csv(time_segments_path, header=0)["label"] + + with open(participant_file_path, "r", encoding="utf-8") as f: + participant_file = yaml.safe_load(f) + label = participant_file["PHONE"]["LABEL"] + + y_axis_label = pid + "." + label + y_axis_labels.append(y_axis_label) + + + phone_data_yield = pd.read_csv(phone_data_yield_path, index_col=["local_segment_start_datetime"], parse_dates=["local_segment_start_datetime"]) + # make sure the phone_data_yield file contains "phone_data_yield_rapids_ratiovalidyieldedminutes" and "phone_data_yield_rapids_ratiovalidyieldedhours" columns + if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns): + raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].") + + if not phone_data_yield.empty: + + for time_segment in time_segments: + phone_data_yield_per_segment = phone_data_yield[phone_data_yield["local_segment_label"] == time_segment] + + if not phone_data_yield_per_segment.empty: + + # set number of minutes after the first start date time of local segments as x_axis_label + phone_data_yield_per_segment.index = phone_data_yield_per_segment.index - phone_data_yield_per_segment.index.min() + + phone_data_yield_minutes_per_segment = phone_data_yield_per_segment[["phone_data_yield_rapids_ratiovalidyieldedminutes"]].rename(columns={"phone_data_yield_rapids_ratiovalidyieldedminutes": y_axis_label}) + phone_data_yield_hours_per_segment = phone_data_yield_per_segment[["phone_data_yield_rapids_ratiovalidyieldedhours"]].rename(columns={"phone_data_yield_rapids_ratiovalidyieldedhours": y_axis_label}) + + if time_segment not in phone_data_yield_minutes.keys(): + phone_data_yield_minutes[time_segment] = phone_data_yield_minutes_per_segment + phone_data_yield_hours[time_segment] = phone_data_yield_hours_per_segment + else: + phone_data_yield_minutes[time_segment] = pd.concat([phone_data_yield_minutes[time_segment], phone_data_yield_minutes_per_segment], axis=1, sort=True) + phone_data_yield_hours[time_segment] = pd.concat([phone_data_yield_hours[time_segment], phone_data_yield_hours_per_segment], axis=1, sort=True) + + +html_file = open(snakemake.output[0], "a", encoding="utf-8") +if len(phone_data_yield_minutes.keys()) == 0: + html_file.write("There is no sensor data for the sensors in [PHONE_DATA_YIELD][SENSORS].") +for time_segment in phone_data_yield_minutes.keys(): + minutes_data_for_plot = phone_data_yield_minutes[time_segment].transpose().reindex(pd.Index(y_axis_labels)).round(3) + hours_data_for_plot = phone_data_yield_hours[time_segment].transpose().reindex(pd.Index(y_axis_labels)).round(3) + + getPhoneDataYieldHeatmap(minutes_data_for_plot, y_axis_labels, time_segment, "minutes", html_file) + getPhoneDataYieldHeatmap(hours_data_for_plot, y_axis_labels, time_segment, "hours", html_file) + +html_file.close() diff --git a/src/visualization/heatmap_rows.py b/src/visualization/heatmap_rows.py deleted file mode 100644 index 764478ec..00000000 --- a/src/visualization/heatmap_rows.py +++ /dev/null @@ -1,68 +0,0 @@ -import pandas as pd -import numpy as np -import plotly.io as pio -import plotly.graph_objects as go -import datetime - -def getComplianceMatrix(dates, compliance_bins): - compliance_matrix = [] - for date in dates: - date_bins = compliance_bins[compliance_bins["local_date"] == date]["count"].tolist() - compliance_matrix.append(date_bins) - return compliance_matrix - - -def getRowCountHeatmap(dates, row_count_per_bin, sensor_name, pid, output_path, bin_size): - bins_per_hour = int(60 / bin_size) - x_axis_labels = ["{0:0=2d}".format(x // bins_per_hour) + ":" + \ - "{0:0=2d}".format(x % bins_per_hour * bin_size) for x in range(24 * bins_per_hour)] - plot = go.Figure(data=go.Heatmap(z=row_count_per_bin, - x=x_axis_labels, - y=[datetime.datetime.strftime(date, '%Y/%m/%d') for date in dates], - colorscale="Viridis")) - plot.update_layout(title="Row count heatmap for " + sensor_name + " of " + pid + "
Label: " + label + ", device_id: " + device_id) - pio.write_html(plot, file=output_path, auto_open=False, include_plotlyjs="cdn") - - - -sensor_data = pd.read_csv(snakemake.input["sensor"], encoding="ISO-8859-1") -sensor_name = snakemake.params["table"] -pid = snakemake.params["pid"] -bin_size = snakemake.params["bin_size"] - -with open(snakemake.input["pid_file"], encoding="ISO-8859-1") as external_file: - external_file_content = external_file.readlines() -device_id = external_file_content[0].split(",")[-1] -label = external_file_content[2] - - -# check if we have sensor data -if sensor_data.empty: - empty_html = open(snakemake.output[0], "w") - empty_html.write("There is no " + sensor_name + " data for " + pid + "
Label: " + label + ", device_id: " + device_id) - empty_html.close() -else: - start_date = sensor_data["local_date"][0] - end_date = sensor_data.at[sensor_data.index[-1],"local_date"] - - sensor_data["local_date_time"] = pd.to_datetime(sensor_data["local_date_time"]) - sensor_data = sensor_data[["local_date_time"]] - sensor_data["count"] = 1 - - # Add first and last day boundaries for resampling - sensor_data = sensor_data.append([pd.Series([datetime.datetime.strptime(start_date + " 00:00:00", "%Y-%m-%d %H:%M:%S"), 0], sensor_data.columns), - pd.Series([datetime.datetime.strptime(end_date + " 23:59:59", "%Y-%m-%d %H:%M:%S"), 0], sensor_data.columns)]) - - # Resample into bins with the size of bin_size - resampled_bins = pd.DataFrame(sensor_data.resample(str(bin_size) + "T", on="local_date_time")["count"].sum()) - - # Extract list of dates for creating the heatmap - resampled_bins.reset_index(inplace=True) - resampled_bins["local_date"] = resampled_bins["local_date_time"].dt.date - dates = resampled_bins["local_date"].drop_duplicates().tolist() - - # Create heatmap - row_count_per_bin = getComplianceMatrix(dates, resampled_bins) - row_count_per_bin = np.asarray(row_count_per_bin) - row_count_per_bin = np.where(row_count_per_bin == 0, np.nan, row_count_per_bin) - getRowCountHeatmap(dates, row_count_per_bin, sensor_name, pid, snakemake.output[0], bin_size) diff --git a/src/visualization/heatmap_sensed_bins.py b/src/visualization/heatmap_sensed_bins.py deleted file mode 100644 index 26639400..00000000 --- a/src/visualization/heatmap_sensed_bins.py +++ /dev/null @@ -1,50 +0,0 @@ -import pandas as pd -import numpy as np -import plotly.io as pio -import plotly.graph_objects as go -import datetime - -def getDatesComplianceMatrix(phone_sensed_bins): - dates = phone_sensed_bins.index - compliance_matrix = [] - for date in dates: - compliance_matrix.append(phone_sensed_bins.loc[date, :].tolist()) - return dates, compliance_matrix - -def getComplianceHeatmap(dates, compliance_matrix, pid, output_path, bin_size): - bins_per_hour = int(60 / bin_size) - x_axis_labels = ["{0:0=2d}".format(x // bins_per_hour) + ":" + \ - "{0:0=2d}".format(x % bins_per_hour * bin_size) for x in range(24 * bins_per_hour)] - plot = go.Figure(data=go.Heatmap(z=compliance_matrix, - x=x_axis_labels, - y=[datetime.datetime.strftime(date, '%Y/%m/%d') for date in dates], - colorscale='Viridis', - colorbar={'tick0': 0,'dtick': 1})) - plot.update_layout(title="Heatmap sensed bins.
Five-minute bins showing how many sensors logged at least one row of data in that period for " + pid + "
Label: " + label + ", device_id: " + device_id) - pio.write_html(plot, file=output_path, auto_open=False, include_plotlyjs="cdn") - -# get current patient id -pid = snakemake.params["pid"] -bin_size = snakemake.params["bin_size"] - -with open(snakemake.input["pid_file"], encoding="ISO-8859-1") as external_file: - external_file_content = external_file.readlines() -device_id = external_file_content[0].split(",")[-1] -label = external_file_content[2] - -phone_sensed_bins = pd.read_csv(snakemake.input["sensor"], parse_dates=["local_date"], index_col="local_date") - -if phone_sensed_bins.empty: - empty_html = open(snakemake.output[0], "w", encoding="ISO-8859-1") - empty_html.write("There is no sensor data for " + pid + "
Label: " + label + ", device_id: " + device_id) - empty_html.close() -else: - # resample to impute missing dates - phone_sensed_bins = phone_sensed_bins.resample("1D").asfreq().fillna(0) - # get dates and compliance_matrix - dates, compliance_matrix = getDatesComplianceMatrix(phone_sensed_bins) - # convert compliance_matrix from list to np.array and replace 0 with np.nan - compliance_matrix = np.asarray(compliance_matrix) - compliance_matrix = np.where(compliance_matrix == 0, np.nan, compliance_matrix) - # get heatmap - getComplianceHeatmap(dates, compliance_matrix, pid, snakemake.output[0], bin_size) \ No newline at end of file diff --git a/src/visualization/heatmap_sensor_row_count_per_time_segment.py b/src/visualization/heatmap_sensor_row_count_per_time_segment.py new file mode 100644 index 00000000..6b62e6e1 --- /dev/null +++ b/src/visualization/heatmap_sensor_row_count_per_time_segment.py @@ -0,0 +1,89 @@ +import pandas as pd +import numpy as np +import plotly.graph_objects as go +from importlib import util +from pathlib import Path +import yaml + + +def getRowCountHeatmap(data_for_plot, scaled_data_for_plot, pid, time_segment, html_file): + + fig = go.Figure(data=go.Heatmap(z=scaled_data_for_plot.values.tolist(), + x=data_for_plot.columns, + y=data_for_plot.index, + hovertext=data_for_plot.values.tolist(), + hovertemplate="Segment start: %{x}
Sensor: %{y}
Row count: %{hovertext}", + zmin=0, zmax=1, + colorscale='Viridis')) + + fig.update_layout(title="Heatmap of sensor row count for " + time_segment + " segments. Pid: " + pid +". Label: " + label + "
y-axis shows the included sensors.
x-axis shows the start (date and time) of a time segment.
z-axis (color) shows row count per sensor per segment instance.") + fig["layout"].update(margin=dict(t=160)) + + html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + + + + +# import filter_data_by_segment from src/features/utils/utils.py +spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "features" / "utils" / "utils.py")) +mod = util.module_from_spec(spec) +spec.loader.exec_module(mod) +filter_data_by_segment = getattr(mod, "filter_data_by_segment") + + + + + +phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], index_col=["local_segment_start_datetime"], parse_dates=["local_segment_start_datetime"]) +# make sure the phone_data_yield file contains "phone_data_yield_rapids_ratiovalidyieldedminutes" and "phone_data_yield_rapids_ratiovalidyieldedhours" columns +if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns): + raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].") +phone_data_yield = phone_data_yield[["local_segment_label", "phone_data_yield_rapids_ratiovalidyieldedminutes", "phone_data_yield_rapids_ratiovalidyieldedhours"]] + +time_segments = pd.read_csv(snakemake.input["time_segments_labels"], header=0)["label"] +pid = snakemake.params["pid"] + +with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f: + participant_file = yaml.safe_load(f) +label = participant_file["PHONE"]["LABEL"] + +sensor_names = [] +sensors_row_count = dict(zip(time_segments, [pd.DataFrame()] * len(time_segments))) + +for sensor_path in snakemake.input["all_sensors"]: + sensor_data = pd.read_csv(sensor_path, usecols=["assigned_segments"]) + sensor_name = sensor_path.split("/")[-1].replace("_with_datetime.csv", "") + sensor_names.append(sensor_name) + + if not sensor_data.empty: + for time_segment in time_segments: + sensor_data_per_segment = filter_data_by_segment(sensor_data, time_segment) + + if not sensor_data_per_segment.empty: + # extract local start datetime of the segment from "local_segment" column + sensor_data_per_segment["local_segment_start_datetime"] = pd.to_datetime(sensor_data_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0])) + sensor_row_count = sensor_data_per_segment.groupby("local_segment_start_datetime")[["local_segment"]].count().rename(columns={"local_segment": sensor_name}) + sensors_row_count[time_segment] = pd.concat([sensors_row_count[time_segment], sensor_row_count], axis=1, sort=False) + +# add phone data yield features and plot heatmap +html_file = open(snakemake.output[0], "a", encoding="utf-8") +sensor_names.extend(["ratiovalidyieldedminutes", "ratiovalidyieldedhours"]) +for time_segment in time_segments: + if not phone_data_yield.empty: + phone_data_yield_per_segment = phone_data_yield[phone_data_yield["local_segment_label"] == time_segment].rename(columns={"phone_data_yield_rapids_ratiovalidyieldedminutes": "ratiovalidyieldedminutes","phone_data_yield_rapids_ratiovalidyieldedhours": "ratiovalidyieldedhours"}).round(3) + if not phone_data_yield_per_segment.empty: + sensors_row_count[time_segment] = pd.concat([sensors_row_count[time_segment], phone_data_yield_per_segment], axis=1, sort=True) + + # consider all the sensors + data_for_plot = sensors_row_count[time_segment].transpose().reindex(pd.Index(sensor_names)) + + if data_for_plot.empty: + html_file.write("There are no records of selected sensors in database for " + time_segment + " segments. Pid: " + pid + ". Label: " + label + ".
") + else: + # except for phone data yield sensor, scale each sensor (row) to the range of [0, 1] + scaled_data_for_plot = data_for_plot.copy() + scaled_data_for_plot.loc[sensor_names[:-2]] = scaled_data_for_plot.fillna(np.nan).loc[sensor_names[:-2]].apply(lambda x: (x - np.nanmin(x)) / (np.nanmax(x) - np.nanmin(x)) if np.nanmax(x) != np.nanmin(x) else (x / np.nanmin(x)), axis=1) + + getRowCountHeatmap(data_for_plot, scaled_data_for_plot, pid, time_segment, html_file) + +html_file.close() diff --git a/src/visualization/heatmap_sensors_per_minute_per_time_segment.py b/src/visualization/heatmap_sensors_per_minute_per_time_segment.py new file mode 100644 index 00000000..dd524322 --- /dev/null +++ b/src/visualization/heatmap_sensors_per_minute_per_time_segment.py @@ -0,0 +1,100 @@ +import pandas as pd +import numpy as np +import plotly.graph_objects as go +from importlib import util +from pathlib import Path +import yaml + + +def colors2colorscale(colors): + colorscale = [] + length = len(colors) + for i in range(length): + if i != length - 1: + colorscale = colorscale + [[i/(length-1), colors[i]], [(i+1)/(length-1), colors[i]]] + else: + colorscale.append([1, colors[i]]) + return colorscale + +def getSensorsPerMinPerSegmentHeatmap(phone_data_yield, pid, time_segment, html_file): + + x_axis_labels = [pd.Timedelta(minutes=x) for x in phone_data_yield.columns] + + fig = go.Figure(data=go.Heatmap(z=phone_data_yield.values.tolist(), + x=x_axis_labels, + y=phone_data_yield.index, + zmin=0, zmax=16, + colorscale=colors2colorscale(colors), + colorbar=dict(thickness=25, tickvals=[1/2 + x for x in range(16)],ticktext=[x for x in range(16)]))) + + fig.update_layout(title="Number of sensors with any data per minute for " + time_segment + " segments. Pid: "+pid+". Label: " + label + "
y-axis shows the start (date and time) of a time segment.
x-axis shows the time since the start of the time segment.
z-axis (color) shows how many sensors logged at least one row of data per minute.") + fig["layout"].update(margin=dict(t=160)) + + html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + + + + + +# import filter_data_by_segment from src/features/utils/utils.py +spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "features" / "utils" / "utils.py")) +mod = util.module_from_spec(spec) +spec.loader.exec_module(mod) +filter_data_by_segment = getattr(mod, "filter_data_by_segment") + + + + + + + + + + +colors = ["red", "#3D0751", "#423176", "#414381", "#3F5688", "#42678B", "#42768C", "#45868B", "#4A968A", "#53A485", "#5FB57E", "#76C170", "#91CF63", "#B4DA55", "#D9E152", "#F8E755", "#DEE00F"] +pid = snakemake.params["pid"] +time_segments_labels = pd.read_csv(snakemake.input["time_segments_labels"], header=0) + +with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f: + participant_file = yaml.safe_load(f) +label = participant_file["PHONE"]["LABEL"] + +phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], parse_dates=["local_date_time"]) + +html_file = open(snakemake.output[0], "a", encoding="utf-8") +if phone_data_yield.empty: + html_file.write("There is no sensor data for " + pid + " (pid) and " + label + " (label).") +else: + for time_segment in time_segments_labels["label"]: + phone_data_yield_per_segment = filter_data_by_segment(phone_data_yield, time_segment) + + if phone_data_yield_per_segment.empty: + html_file.write("There is no sensor data of " + time_segment + " segments for " + pid + " (pid) and " + label + " (label).
") + else: + # calculate the length (in minute) of per segment instance + phone_data_yield_per_segment["length"] = phone_data_yield_per_segment["timestamps_segment"].str.split(",").apply(lambda x: int((int(x[1])-int(x[0])) / (1000 * 60))) + # calculate the number of sensors logged at least one row of data per minute. + phone_data_yield_per_segment = phone_data_yield_per_segment.groupby(["local_segment", "length", "local_date", "local_hour", "local_minute"])[["sensor", "local_date_time"]].max().reset_index() + # extract local start datetime of the segment from "local_segment" column + phone_data_yield_per_segment["local_segment_start_datetimes"] = pd.to_datetime(phone_data_yield_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0])) + # calculate the number of minutes after local start datetime of the segment + phone_data_yield_per_segment["minutes_after_segment_start"] = ((phone_data_yield_per_segment["local_date_time"] - phone_data_yield_per_segment["local_segment_start_datetimes"]) / pd.Timedelta(minutes=1)).astype("int") + + # impute missing rows with 0 + columns_for_full_index = phone_data_yield_per_segment[["local_segment_start_datetimes", "length"]].drop_duplicates(keep="first") + columns_for_full_index = columns_for_full_index.apply(lambda row: [[row["local_segment_start_datetimes"], x] for x in range(row["length"] + 1)], axis=1) + full_index = [] + for columns in columns_for_full_index: + full_index = full_index + columns + full_index = pd.MultiIndex.from_tuples(full_index, names=("local_segment_start_datetimes", "minutes_after_segment_start")) + phone_data_yield_per_segment = phone_data_yield_per_segment.set_index(["local_segment_start_datetimes", "minutes_after_segment_start"]).reindex(full_index).reset_index().fillna(0) + + # transpose the dataframe per local start datetime of the segment and discard the useless index layer + phone_data_yield_per_segment = phone_data_yield_per_segment.groupby("local_segment_start_datetimes")[["minutes_after_segment_start", "sensor"]].apply(lambda x: x.set_index("minutes_after_segment_start").transpose()) + phone_data_yield_per_segment.index = phone_data_yield_per_segment.index.get_level_values("local_segment_start_datetimes") + + # get heatmap + getSensorsPerMinPerSegmentHeatmap(phone_data_yield_per_segment, pid, time_segment, html_file) + + +html_file.close() diff --git a/src/visualization/histogram_phone_data_yield.py b/src/visualization/histogram_phone_data_yield.py index e4a55aaf..cd15ec8d 100644 --- a/src/visualization/histogram_phone_data_yield.py +++ b/src/visualization/histogram_phone_data_yield.py @@ -8,15 +8,18 @@ phone_data_yield = pd.read_csv(snakemake.input[0]) if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns): raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].") -# plot ratio valid yielded minutes histogram -fig_ratiovalidyieldedminutes = px.histogram(phone_data_yield, x="phone_data_yield_rapids_ratiovalidyieldedminutes", color="local_segment_label") -fig_ratiovalidyieldedminutes.update_layout(title="Ratio Valid Yielded Minutes Histogram") - -# plot ratio valid yielded hours histogram -fig_ratiovalidyieldedhours = px.histogram(phone_data_yield, x="phone_data_yield_rapids_ratiovalidyieldedhours", color="local_segment_label") -fig_ratiovalidyieldedhours.update_layout(title="Ratio Valid Yielded Hours Histogram") - - -with open(snakemake.output[0], "a") as html_file: +html_file = open(snakemake.output[0], "a", encoding="utf-8") +if phone_data_yield.empty: + html_file.write("There is no sensor data for the sensors in [PHONE_DATA_YIELD][SENSORS].") +else: + # plot ratio valid yielded minutes histogram + fig_ratiovalidyieldedminutes = px.histogram(phone_data_yield, x="phone_data_yield_rapids_ratiovalidyieldedminutes", color="local_segment_label") + fig_ratiovalidyieldedminutes.update_layout(title="Histogram of valid yielded minutes ratio per time segment.") html_file.write(fig_ratiovalidyieldedminutes.to_html(full_html=False, include_plotlyjs="cdn")) + + # plot ratio valid yielded hours histogram + fig_ratiovalidyieldedhours = px.histogram(phone_data_yield, x="phone_data_yield_rapids_ratiovalidyieldedhours", color="local_segment_label") + fig_ratiovalidyieldedhours.update_layout(title="Histogram of valid yielded hours ratio per time segment.") html_file.write(fig_ratiovalidyieldedhours.to_html(full_html=False, include_plotlyjs="cdn")) + +html_file.close() diff --git a/src/visualization/heatmap_sensed_bins_all_participants.Rmd b/src/visualization/merge_heatmap_sensor_row_count_per_time_segment.Rmd similarity index 63% rename from src/visualization/heatmap_sensed_bins_all_participants.Rmd rename to src/visualization/merge_heatmap_sensor_row_count_per_time_segment.Rmd index e6dbdbbf..b6c8463c 100644 --- a/src/visualization/heatmap_sensed_bins_all_participants.Rmd +++ b/src/visualization/merge_heatmap_sensor_row_count_per_time_segment.Rmd @@ -1,10 +1,10 @@ --- -title: "Heatmap Sensed Bins Report" +title: "Sensor Row Count per Time Segment For All Participants" author: - - "MoSHI Pipeline" + - "RAPIDS" date: "`r format(Sys.time(), '%d %B, %Y')`" params: - rmd: "heatmap_sensed_bins_all_participants.Rmd" + rmd: "merge_heatmap_sensor_row_count_per_time_segment.Rmd" output: html_document: highlight: tango @@ -17,14 +17,17 @@ output: smooth_scroll: yes --- + + ```{r include=FALSE} source("renv/activate.R") ``` -## All phone sensors ```{r, echo=FALSE} -heatmaps <- snakemake@input[["heatmap_sensed_bins"]] +heatmaps <- snakemake@input[["heatmap_sensor_row_count_per_time_segment"]] heatmaps.html <- vector(mode="list", length(heatmaps)) for(pid in 1:length(heatmaps)){ diff --git a/src/visualization/heatmap_days_by_sensors_all_participants.Rmd b/src/visualization/merge_heatmap_sensors_per_minute_per_time_segment.Rmd similarity index 63% rename from src/visualization/heatmap_days_by_sensors_all_participants.Rmd rename to src/visualization/merge_heatmap_sensors_per_minute_per_time_segment.Rmd index cb4303c2..2e1143e0 100644 --- a/src/visualization/heatmap_days_by_sensors_all_participants.Rmd +++ b/src/visualization/merge_heatmap_sensors_per_minute_per_time_segment.Rmd @@ -1,10 +1,10 @@ --- -title: "Heatmap Rows Report" +title: "Sensors per Minute per Time Segment for All Participants" author: - - "MoSHI Pipeline" + - "RAPIDS" date: "`r format(Sys.time(), '%d %B, %Y')`" params: - rmd: "heatmap_days_by_sensors_all_participants.Rmd" + rmd: "merge_heatmap_sensors_per_minute_per_time_segment.Rmd" output: html_document: highlight: tango @@ -17,14 +17,17 @@ output: smooth_scroll: yes --- + + ```{r include=FALSE} source("renv/activate.R") ``` -## All phone sensors ```{r, echo=FALSE} -heatmaps <- snakemake@input[["heatmap_rows"]] +heatmaps <- snakemake@input[["heatmap_sensors_per_minute_per_time_segment"]] heatmaps.html <- vector(mode="list", length(heatmaps)) for(pid in 1:length(heatmaps)){ diff --git a/src/visualization/overall_compliance_heatmap.py b/src/visualization/overall_compliance_heatmap.py deleted file mode 100644 index 877ab0d2..00000000 --- a/src/visualization/overall_compliance_heatmap.py +++ /dev/null @@ -1,102 +0,0 @@ -import pandas as pd -import numpy as np -import plotly.io as pio -import plotly.graph_objects as go -from dateutil import tz -import datetime - -def getOneRow(data_per_participant, last_certain_dates, col_name, row, expected_num_of_days, only_show_valid_days): - - data = pd.read_csv(data_per_participant, index_col=["local_date"]) - - if col_name == "num_sensors": - data["num_sensors"] = data.max(axis=1) - - if only_show_valid_days and col_name == "valid_sensed_hours": - # replace invalid days' valid sensed hours with np.nan to let our heatmap only shows valid days - data.loc[data[data["is_valid_sensed_day"] == False].index, "valid_sensed_hours"] = np.nan - - if expected_num_of_days == -1: - # show all days - data.index = pd.to_datetime(data.index) - start_date = data.index.min() - # upsample data into one day bins - data = data.resample("1D").sum() - data["date_idx"] = (data.index - start_date).days - data.set_index("date_idx", inplace=True, drop=True) - row = row + data[col_name].tolist() - else: - # only show last certain days - for date in last_certain_dates: - if date in data.index: - row.append(data.loc[date][col_name]) - else: - row.append(0) - - return row - -def getOverallComplianceHeatmap(sensors_with_data, valid_sensed_hours, last_certain_dates, bin_size, min_bins_per_hour, expected_num_of_days, output_path): - plot = go.Figure(data=go.Heatmap(z=valid_sensed_hours[last_certain_dates].values, - x=[date.replace("-", "/") for date in last_certain_dates] if expected_num_of_days != -1 else last_certain_dates, - y=[pid + "." + label for pid, label in zip(sensors_with_data["pid"].to_list(), sensors_with_data["label"].to_list())], - text=sensors_with_data[last_certain_dates].values, - hovertemplate="Date: %{x}
Participant: %{y}
Valid sensed hours: %{z}
Number of sensors with data: %{text}" if expected_num_of_days != -1 else "Day index: %{x}
Participant: %{y}
Valid sensed hours: %{z}
Number of sensors with data: %{text}", - colorscale="Viridis", - colorbar={"tick0": 0,"dtick": 1}, - showscale=True)) - if expected_num_of_days != -1: - plot.update_layout(title="Overall compliance heatmap for last " + str(expected_num_of_days) + " days.
Bin's color shows valid sensed hours for that day.
A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes.
You can hover over every day to see the number of sensors with data in that day.") - else: - plot.update_layout(title="Overall compliance heatmap for all days.
Bin's color shows valid sensed hours for that day.
A valid hour has at least one row of any sensor in "+ str(min_bins_per_hour) +" out of " + str(int(60 / bin_size)) + " bins of " + str(bin_size) + " minutes.
You can hover over every day to see the number of sensors with data in that day.") - - plot["layout"]["xaxis"].update(side="bottom") - plot["layout"].update(xaxis_title="Day indexes") - plot["layout"].update(margin=dict(t=160)) - pio.write_html(plot, file=output_path, auto_open=False, include_plotlyjs="cdn") - - -phone_sensed_bins = snakemake.input["phone_sensed_bins"] -phone_valid_sensed_days = snakemake.input["phone_valid_sensed_days"] -pid_files = snakemake.input["pid_files"] -only_show_valid_days = snakemake.params["only_show_valid_days"] -local_timezone = snakemake.params["local_timezone"] -bin_size = snakemake.params["bin_size"] -min_bins_per_hour = snakemake.params["min_bins_per_hour"] -expected_num_of_days = int(snakemake.params["expected_num_of_days"]) - -if expected_num_of_days < -1: - raise ValueError("EXPECTED_NUM_OF_DAYS of OVERALL_COMPLIANCE_HEATMAP section in config.yaml must be larger or equal to -1.") - -last_certain_dates = [] -if expected_num_of_days != -1: - # get the list of dates to show - cur_date = datetime.datetime.now().astimezone(tz.gettz(local_timezone)).date() - for date_offset in range(expected_num_of_days-1, -1, -1): - last_certain_dates.append((cur_date - datetime.timedelta(days=date_offset)).strftime("%Y-%m-%d")) - -sensors_with_data_records, valid_sensed_hours_records = [], [] -for sensors_with_data_individual, valid_sensed_hours_individual, pid_file in zip(phone_sensed_bins, phone_valid_sensed_days, pid_files): - - with open(pid_file, encoding="ISO-8859-1") as external_file: - external_file_content = external_file.readlines() - device_id = external_file_content[0].split(",")[-1].strip() - label = external_file_content[2].strip() - pid = pid_file.split("/")[-1] - - sensors_with_data_records.append(getOneRow(sensors_with_data_individual, last_certain_dates, "num_sensors", [pid, label, device_id], expected_num_of_days, only_show_valid_days)) - valid_sensed_hours_records.append(getOneRow(valid_sensed_hours_individual, last_certain_dates, "valid_sensed_hours", [pid, label, device_id], expected_num_of_days, only_show_valid_days)) - -if expected_num_of_days == -1: - # get the date_idx of all days - total_num_of_days = max([len(x) for x in sensors_with_data_records]) - 3 - last_certain_dates = [date_idx for date_idx in range(total_num_of_days)] - -sensors_with_data = pd.DataFrame(data=sensors_with_data_records, columns=["pid", "label", "device_id"] + last_certain_dates).replace(0, np.nan) -valid_sensed_hours = pd.DataFrame(data=valid_sensed_hours_records, columns=["pid", "label", "device_id"] + last_certain_dates).replace(0, np.nan) - -if sensors_with_data.empty: - empty_html = open(snakemake.output[0], "w") - empty_html.write("There is no sensor data for all participants") - empty_html.close() -else: - getOverallComplianceHeatmap(sensors_with_data, valid_sensed_hours, last_certain_dates, bin_size, min_bins_per_hour, expected_num_of_days, snakemake.output[0]) diff --git a/src/visualization/visualize.py b/src/visualization/visualize.py deleted file mode 100644 index e69de29b..00000000