Update heatmap of sensor row count

pull/147/head
Meng Li 2021-06-25 13:14:07 -04:00
parent e98a8ff7ca
commit bc06477d89
2 changed files with 62 additions and 58 deletions

View File

@@ -35,7 +35,9 @@ rule heatmap_sensor_row_count_per_time_segment:
participant_file = "data/external/participant_files/{pid}.yaml",
time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
params:
pid = "{pid}"
pid = "{pid}",
sensor_names = config["HEATMAP_SENSOR_ROW_COUNT_PER_TIME_SEGMENT"]["SENSORS"],
time_segments_type = config["TIME_SEGMENTS"]["TYPE"]
output:
"reports/interim/{pid}/heatmap_sensor_row_count_per_time_segment.html"
script:

View File

@@ -1,89 +1,91 @@
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from importlib import util
from pathlib import Path
import yaml
def getRowCountHeatmap(data_for_plot, scaled_data_for_plot, pid, time_segment, html_file):
    """Append a row-count heatmap for one time segment to an open HTML file.

    The heatmap colors come from the scaled (0-1) per-sensor row counts in
    ``scaled_data_for_plot``; the raw counts from ``data_for_plot`` are shown
    in the hover text. Relies on the module-level ``label`` (participant
    label) for the figure title.
    """
    raw_counts = data_for_plot.values.tolist()
    heatmap_trace = go.Heatmap(
        z=scaled_data_for_plot.values.tolist(),
        x=data_for_plot.columns,
        y=data_for_plot.index,
        hovertext=raw_counts,
        hovertemplate="Segment start: %{x}<br>Sensor: %{y}<br>Row count: %{hovertext}<extra></extra>",
        zmin=0,
        zmax=1,
        colorscale='Viridis',
    )
    fig = go.Figure(data=heatmap_trace)
    fig.update_layout(title="Heatmap of sensor row count for " + time_segment + " segments. Pid: " + pid +". Label: " + label + "<br>y-axis shows the included sensors.<br>x-axis shows the start (date and time) of a time segment.<br>z-axis (color) shows row count per sensor per segment instance.")
    # Extra top margin so the multi-line title does not overlap the plot area.
    fig["layout"].update(margin=dict(t=160))
    html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn"))
# import filter_data_by_segment from src/features/utils/utils.py
utils_path = Path(snakemake.scriptdir).parent / "features" / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
filter_data_by_segment = utils_module.filter_data_by_segment
def getRowCountHeatmap(data_for_plot, pid, time_segment, html_file):
    """Append a timeline-style heatmap of sensor row counts to an open HTML file.

    Each sensor is one row on the y-axis; each segment instance is a bar
    spanning its start/end datetimes, colored by the scaled row count.
    Relies on the module-level ``label`` (participant label) for the title.

    Returns the (still open) ``html_file`` handle.
    """
    # Hide the raw start/end columns from the hover box; show segment id and
    # both the raw and scaled counts instead.
    hover_columns = {
        "local_segment_start_datetime": False,
        "local_segment_end_datetime": False,
        "local_segment": True,
        "value": True,
        "scaled_value": True,
    }
    fig = px.timeline(
        data_for_plot,
        x_start="local_segment_start_datetime",
        x_end="local_segment_end_datetime",
        y="sensor",
        color="scaled_value",
        color_continuous_scale="Peach",  # alternative: "Viridis"
        opacity=0.7,
        hover_data=hover_columns,
    )
    fig.update_layout(
        title="Heatmap of sensor row count for " + time_segment + " segments. Pid: " + pid +". Label: " + label + "<br>y-axis shows the included sensors.<br>x-axis shows time segments.<br>z-axis (color) shows row count per sensor per segment instance.",
        xaxis=dict(side="bottom", title="Time Segments"),
        yaxis=dict(side="left", title="Sensors"),
        margin=dict(t=160),  # room for the multi-line title
    )
    html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn"))
    return html_file
# NOTE(review): the lines below are diff residue — the old and new versions of the
# same loading code are interleaved without +/- markers. Several statements are
# duplicated (phone_data_yield read, pid, time-segment labels); reconcile to one
# version before running. Indentation was also stripped by the rendering.
# Old version: index phone data yield on the segment start datetime.
phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], index_col=["local_segment_start_datetime"], parse_dates=["local_segment_start_datetime"])
# make sure the phone_data_yield file contains "phone_data_yield_rapids_ratiovalidyieldedminutes" and "phone_data_yield_rapids_ratiovalidyieldedhours" columns
if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns):
raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].")
phone_data_yield = phone_data_yield[["local_segment_label", "phone_data_yield_rapids_ratiovalidyieldedminutes", "phone_data_yield_rapids_ratiovalidyieldedhours"]]
time_segments = pd.read_csv(snakemake.input["time_segments_labels"], header=0)["label"]
pid = snakemake.params["pid"]
# Participant label used in figure titles (read from the participant YAML file).
with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f:
participant_file = yaml.safe_load(f)
label = participant_file["PHONE"]["LABEL"]
# Old version: sensor names discovered from file paths; one DataFrame per segment label.
sensor_names = []
sensors_row_count = dict(zip(time_segments, [pd.DataFrame()] * len(time_segments)))
pid = snakemake.params["pid"]  # NOTE(review): duplicate of the assignment above (old vs new line)
# New version: sensor names and time-segment type come from rule params instead.
sensor_names = [sensor_name.lower() for sensor_name in snakemake.params["sensor_names"]]
time_segments_type = snakemake.params["time_segments_type"]
time_segments_labels = pd.read_csv(snakemake.input["time_segments_labels"], header=0)["label"]
# NOTE(review): diff residue continues — the old loop (keyed by segment start
# datetime, one DataFrame per segment label) is interleaved with the new loop
# (keyed by local_segment, a single wide DataFrame). Keep only one version.
for sensor_path in snakemake.input["all_sensors"]:
# New version: index phone data yield on local_segment and parse both segment datetimes.
phone_data_yield = pd.read_csv(snakemake.input["phone_data_yield"], index_col=["local_segment"], parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"]) #index_col=["local_segment_start_datetime"],
# make sure the phone_data_yield file contains "phone_data_yield_rapids_ratiovalidyieldedminutes" and "phone_data_yield_rapids_ratiovalidyieldedhours" columns
if ("phone_data_yield_rapids_ratiovalidyieldedminutes" not in phone_data_yield.columns) or ("phone_data_yield_rapids_ratiovalidyieldedhours" not in phone_data_yield.columns):
raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].")
# extract row count
sensors_row_count = pd.DataFrame()
for sensor_path, sensor_name in zip(snakemake.input["all_sensors"], sensor_names):
# Only the "assigned_segments" column is needed to count rows per segment.
sensor_data = pd.read_csv(sensor_path, usecols=["assigned_segments"])
# Old version: derive the sensor name from the file path.
sensor_name = sensor_path.split("/")[-1].replace("_with_datetime.csv", "")
sensor_names.append(sensor_name)
sensor_row_count = pd.DataFrame()
if not sensor_data.empty:
for time_segment in time_segments:
for time_segment in time_segments_labels:
sensor_data_per_segment = filter_data_by_segment(sensor_data, time_segment)
if not sensor_data_per_segment.empty:
# extract local start datetime of the segment from "local_segment" column
sensor_data_per_segment["local_segment_start_datetime"] = pd.to_datetime(sensor_data_per_segment["local_segment"].apply(lambda x: x.split("#")[1].split(",")[0]))
# Old version: count rows grouped by segment start datetime, per segment label.
sensor_row_count = sensor_data_per_segment.groupby("local_segment_start_datetime")[["local_segment"]].count().rename(columns={"local_segment": sensor_name})
sensors_row_count[time_segment] = pd.concat([sensors_row_count[time_segment], sensor_row_count], axis=1, sort=False)
# New version: count rows grouped by local_segment id and accumulate across segments.
sensor_row_count = pd.concat([sensor_row_count, sensor_data_per_segment.groupby(["local_segment"])[["local_segment"]].count().rename(columns={"local_segment": sensor_name})], axis=0, sort=False)
sensors_row_count = pd.concat([sensors_row_count, sensor_row_count], axis=1, sort=False)
sensors_row_count.index.name = "local_segment"
# Strip the repeated-rule suffix so the index matches phone_data_yield's index.
sensors_row_count.index = sensors_row_count.index.str.replace(r"_RR\d+SS", "")
# NOTE(review): diff residue continues — the old plotting path (transpose +
# go.Heatmap per segment label) is interleaved with the new path (stacked long
# format + px.timeline). Keep only one version; the scaling logic appears twice.
# New version: join yield ratios with the per-sensor row counts on local_segment.
data_for_plot = phone_data_yield.rename(columns={"phone_data_yield_rapids_ratiovalidyieldedminutes": "ratiovalidyieldedminutes","phone_data_yield_rapids_ratiovalidyieldedhours": "ratiovalidyieldedhours"}).merge(sensors_row_count, how="left", left_index=True, right_index=True).reset_index()
if time_segments_type == "FREQUENCY":
# Drop the numeric suffix so all instances of a frequency segment share one label.
data_for_plot["local_segment_label"] = data_for_plot["local_segment_label"].str[:-4]
elif time_segments_type == "EVENT":
# Event segments are collapsed under a single "event" label.
data_for_plot["local_segment_label"] = "event"
# add phone data yield features and plot heatmap
html_file = open(snakemake.output[0], "a", encoding="utf-8")
# The two yield ratios are plotted alongside the sensors (and excluded from scaling below).
sensor_names.extend(["ratiovalidyieldedminutes", "ratiovalidyieldedhours"])
for time_segment in time_segments:
if not phone_data_yield.empty:
phone_data_yield_per_segment = phone_data_yield[phone_data_yield["local_segment_label"] == time_segment].rename(columns={"phone_data_yield_rapids_ratiovalidyieldedminutes": "ratiovalidyieldedminutes","phone_data_yield_rapids_ratiovalidyieldedhours": "ratiovalidyieldedhours"}).round(3)
if not phone_data_yield_per_segment.empty:
sensors_row_count[time_segment] = pd.concat([sensors_row_count[time_segment], phone_data_yield_per_segment], axis=1, sort=True)
# consider all the sensors
data_for_plot = sensors_row_count[time_segment].transpose().reindex(pd.Index(sensor_names))
if data_for_plot.empty:
html_file = open(snakemake.output[0], "a", encoding="utf-8")
for time_segment in set(data_for_plot["local_segment_label"]):
if not data_for_plot.empty:
data_for_plot_per_segment = data_for_plot[data_for_plot["local_segment_label"] == time_segment]
if data_for_plot_per_segment.empty:
html_file.write("There are no records of selected sensors in database for " + time_segment + " segments. Pid: " + pid + ". Label: " + label + ".<br>")
else:
data_for_plot_per_segment = data_for_plot_per_segment.reindex(columns=["local_segment", "local_segment_start_datetime", "local_segment_end_datetime"] + sensor_names).set_index(["local_segment", "local_segment_start_datetime", "local_segment_end_datetime"])
# except for phone data yield sensor, scale each sensor (row) to the range of [0, 1]
scaled_data_for_plot = data_for_plot.copy()
scaled_data_for_plot.loc[sensor_names[:-2]] = scaled_data_for_plot.fillna(np.nan).loc[sensor_names[:-2]].apply(lambda x: (x - np.nanmin(x)) / (np.nanmax(x) - np.nanmin(x)) if np.nanmax(x) != np.nanmin(x) else (x / np.nanmin(x)), axis=1)
getRowCountHeatmap(data_for_plot, scaled_data_for_plot, pid, time_segment, html_file)
# New version: min-max scale each sensor column to [0, 1], leaving the two yield ratios raw.
scaled_data_for_plot_per_segment = data_for_plot_per_segment.copy()
scaled_data_for_plot_per_segment[sensor_names[:-2]] = scaled_data_for_plot_per_segment.fillna(np.nan)[sensor_names[:-2]].apply(lambda x: (x - np.nanmin(x)) / (np.nanmax(x) - np.nanmin(x)) if np.nanmax(x) != np.nanmin(x) else (x / np.nanmin(x)), axis=0)
# Stack to long format: one row per (segment, sensor) with raw and scaled values.
data_for_plot_processed = pd.concat([data_for_plot_per_segment.stack(dropna=False).to_frame("value"), scaled_data_for_plot_per_segment.stack(dropna=False).to_frame("scaled_value")], axis=1).reset_index().rename(columns={"level_3": "sensor"})
data_for_plot_processed[["value", "scaled_value"]] = data_for_plot_processed[["value", "scaled_value"]].round(3).clip(upper=1)
getRowCountHeatmap(data_for_plot_processed, pid, time_segment, html_file)
html_file.close()