2020-12-04 03:00:32 +01:00
import pandas as pd
import numpy as np
2021-06-25 19:14:07 +02:00
import plotly . express as px
2020-12-04 03:00:32 +01:00
from importlib import util
from pathlib import Path
import yaml
# Load filter_data_by_segment straight from its file (src/features/utils/utils.py),
# located relative to the directory Snakemake reports for this script.
utils_path = Path(snakemake.scriptdir).parent / "features" / "utils" / "utils.py"
spec = util.spec_from_file_location("util", str(utils_path))
mod = util.module_from_spec(spec)
# Execute the module so that its top-level definitions become attributes of `mod`.
spec.loader.exec_module(mod)
filter_data_by_segment = mod.filter_data_by_segment
2021-06-26 01:06:46 +02:00
def getRowCount(sensor_paths, sensor_names, time_segments_labels):
    """Count rows per segment instance for every sensor file.

    Each CSV in ``sensor_paths`` is read (only its ``assigned_segments``
    column), filtered once per time segment label with
    ``filter_data_by_segment``, and its rows are counted per
    ``local_segment``. The per-sensor counts are then joined column-wise.

    Args:
        sensor_paths: Paths of the sensor CSV files.
        sensor_names: Column name to use for each path (paired by position).
        time_segments_labels: Iterable of time segment labels to count for.

    Returns:
        A DataFrame indexed by ``local_segment`` with one column per sensor
        that had at least one row. Sensors without data contribute no
        column. When no sensor has data, an empty frame with an empty
        object-dtype index named ``local_segment`` is returned.
    """
    per_sensor_counts = []
    for sensor_path, sensor_name in zip(sensor_paths, sensor_names):
        sensor_data = pd.read_csv(sensor_path, usecols=["assigned_segments"])
        if sensor_data.empty:
            continue

        per_segment_counts = []
        for time_segment in time_segments_labels:
            sensor_data_per_segment = filter_data_by_segment(sensor_data, time_segment)
            if sensor_data_per_segment.empty:
                continue
            counts = (
                sensor_data_per_segment.groupby(["local_segment"])[["local_segment"]]
                .count()
                .rename(columns={"local_segment": sensor_name})
            )
            per_segment_counts.append(counts)

        # Collect and concatenate once instead of growing a frame inside the loop.
        if per_segment_counts:
            per_sensor_counts.append(pd.concat(per_segment_counts, axis=0, sort=False))

    if not per_sensor_counts:
        # Nothing to count: return a frame whose (string-typed) index can still be
        # merged on, rather than calling .str on a non-string index.
        return pd.DataFrame(index=pd.Index([], dtype="object", name="local_segment"))

    sensors_row_count = pd.concat(per_sensor_counts, axis=1, sort=False)
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # literally by default, which would leave the "_RR<digits>SS" part in place.
    # NOTE(review): presumably this strips a repetition marker from segment ids - confirm.
    sensors_row_count.index = sensors_row_count.index.str.replace(r"_RR\d+SS#", "#", regex=True)
    sensors_row_count.index.name = "local_segment"
    return sensors_row_count
2021-06-25 19:14:07 +02:00
def getRowCountHeatmap(data_for_plot, pid, time_segment, html_file):
    """Draw the row-count heatmap for one time segment and write it to the report.

    The figure is a plotly-express timeline: one bar per segment instance
    (from ``local_segment_start_datetime`` to ``local_segment_end_datetime``),
    one row per ``sensor``, colored by ``scaled_value`` on a fixed [0, 1]
    range.

    Note: the title also uses ``label``, which is read from module scope and
    is not a parameter of this function.

    Args:
        data_for_plot: Long-format frame with the columns referenced above plus
            ``local_segment`` and ``value`` (shown on hover).
        pid: Participant id, concatenated into the title.
        time_segment: Time segment label, concatenated into the title.
        html_file: Writable text file object that receives the figure's HTML.

    Returns:
        The same ``html_file`` object.
    """
    # Hide the raw datetimes on hover; show the segment id and both values.
    hover_columns = {
        "local_segment_start_datetime": False,
        "local_segment_end_datetime": False,
        "local_segment": True,
        "value": True,
        "scaled_value": True,
    }
    fig = px.timeline(
        data_for_plot,
        x_start="local_segment_start_datetime",
        x_end="local_segment_end_datetime",
        y="sensor",
        color="scaled_value",
        color_continuous_scale="Peach",
        range_color=[0, 1],
        opacity=0.7,
        hover_data=hover_columns,
    )

    title = (
        "Heatmap of sensor row count for " + time_segment
        + " segments. Pid: " + pid
        + ". Label: " + label
        + "<br>y-axis shows the included sensors.<br>x-axis shows time segments.<br>z-axis (color) shows row count per sensor per segment instance."
    )
    fig.update_layout(
        title=title,
        xaxis={"side": "bottom", "title": "Time Segments"},
        yaxis={"side": "left", "title": "Sensors"},
        margin={"t": 160},
    )

    # Emit an HTML fragment (not a full page) so several figures can share one file.
    html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn"))
    return html_file
2020-12-04 03:00:32 +01:00
# Participant metadata: only the phone label is read from it here.
with open(snakemake.input["participant_file"], "r", encoding="utf-8") as f:
    participant_file = yaml.safe_load(f)
label = participant_file["PHONE"]["LABEL"]

# Rule parameters and input tables supplied by Snakemake.
pid = snakemake.params["pid"]
sensor_names = [name.lower() for name in snakemake.params["sensor_names"]]
time_segments_type = snakemake.params["time_segments_type"]
time_segments_labels = pd.read_csv(snakemake.input["time_segments_labels"], header=0)["label"]
phone_data_yield = pd.read_csv(
    snakemake.input["phone_data_yield"],
    index_col=["local_segment"],
    parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"],
)

# Both yield-ratio features must be present in the phone_data_yield file.
yield_ratio_columns = [
    "phone_data_yield_rapids_ratiovalidyieldedminutes",
    "phone_data_yield_rapids_ratiovalidyieldedhours",
]
if not all(column in phone_data_yield.columns for column in yield_ratio_columns):
    raise ValueError("Please make sure [PHONE_DATA_YIELD][RAPIDS][COMPUTE] is True AND [PHONE_DATA_YIELD][RAPIDS][FEATURES] contains [ratiovalidyieldedminutes, ratiovalidyieldedhours].")

# Round the ratios to 3 decimals and cap them at 1 so they fit the heatmap's [0, 1] color range.
phone_data_yield.loc[:, yield_ratio_columns] = phone_data_yield.loc[:, yield_ratio_columns].round(3).clip(upper=1)
2021-06-25 19:14:07 +02:00
2021-06-26 01:06:46 +02:00
# Row counts per segment instance, one column per sensor.
sensors_row_count = getRowCount(snakemake.input["all_sensors"], sensor_names, time_segments_labels)

# Shorten the yield-ratio column names, then attach the sensor counts to every
# segment present in phone_data_yield (left join on the local_segment index).
short_yield_names = {
    "phone_data_yield_rapids_ratiovalidyieldedminutes": "ratiovalidyieldedminutes",
    "phone_data_yield_rapids_ratiovalidyieldedhours": "ratiovalidyieldedhours",
}
data_for_plot = phone_data_yield.rename(columns=short_yield_names)
data_for_plot = data_for_plot.merge(sensors_row_count, how="left", left_index=True, right_index=True)
data_for_plot = data_for_plot.reset_index()

if time_segments_type == "FREQUENCY":
    # Drop the last four characters of each label.
    # NOTE(review): presumably a numeric suffix added to frequency segment labels - confirm.
    data_for_plot["local_segment_label"] = data_for_plot["local_segment_label"].str[:-4]
elif time_segments_type == "EVENT":
    # All event segments are plotted together under a single label.
    data_for_plot["local_segment_label"] = "event"

# The two yield ratios are plotted as extra rows after the sensors.
sensor_names += ["ratiovalidyieldedminutes", "ratiovalidyieldedhours"]
2021-06-25 19:14:07 +02:00
def _scale_to_unit_range(column):
    """Min-max scale one sensor column to [0, 1], ignoring NaNs.

    A constant column is divided by its own value instead (so any non-zero
    constant becomes 1). An all-NaN column is returned unchanged.
    """
    if column.isna().all():
        # Nothing to scale; the result would be all-NaN anyway, and this
        # avoids numpy's "All-NaN slice encountered" warning.
        return column
    lowest = np.nanmin(column)
    highest = np.nanmax(column)
    if highest != lowest:
        return (column - lowest) / (highest - lowest)
    return column / lowest


# One heatmap per time segment label is appended to the report. The context
# manager guarantees the file is closed even if building a figure raises.
with open(snakemake.output[0], "a", encoding="utf-8") as html_file:
    # unique() keeps first-appearance order, so the report layout is the same
    # on every run (iterating a set of strings is not).
    for time_segment in data_for_plot["local_segment_label"].unique():
        data_for_plot_per_segment = data_for_plot[data_for_plot["local_segment_label"] == time_segment]

        if data_for_plot_per_segment.empty:
            # Defensive: labels are taken from the data itself, so this is not expected to trigger.
            html_file.write("There are no records of selected sensors in database for " + time_segment + " segments. Pid: " + pid + ". Label: " + label + ".<br>")
            continue

        # Keep only the plotted columns; reindex adds an all-NaN column for any sensor without counts.
        segment_keys = ["local_segment", "local_segment_start_datetime", "local_segment_end_datetime"]
        data_for_plot_per_segment = data_for_plot_per_segment.reindex(columns=segment_keys + sensor_names).set_index(segment_keys)

        # Scale every sensor column to [0, 1]; the last two entries of
        # sensor_names are the phone data yield ratios, which are left as they are.
        scaled_data_for_plot_per_segment = data_for_plot_per_segment.copy()
        count_columns = sensor_names[:-2]
        if count_columns:
            scaled_data_for_plot_per_segment[count_columns] = scaled_data_for_plot_per_segment.fillna(np.nan)[count_columns].apply(_scale_to_unit_range, axis=0)

        # Long format: one row per (segment instance, sensor) with the raw and the scaled value.
        data_for_plot_processed = pd.concat(
            [
                data_for_plot_per_segment.stack(dropna=False).to_frame("value"),
                scaled_data_for_plot_per_segment.stack(dropna=False).round(3).to_frame("scaled_value"),
            ],
            axis=1,
        ).reset_index().rename(columns={"level_3": "sensor"})

        getRowCountHeatmap(data_for_plot_processed, pid, time_segment, html_file)