# 2022-10-17 17:07:33 +02:00
import pandas as pd
import numpy as np
import datetime
import math , sys , yaml
# 2022-10-26 16:16:25 +02:00
from esm_preprocess import clean_up_esm
from esm import classify_sessions_by_completion_time , preprocess_esm
# 2022-10-17 17:07:33 +02:00
# Snakemake injects a global `snakemake` object; expose its named input
# files as a plain dict for convenient lookup below.
input_data_files = dict(snakemake.input)
def format_timestamp(x):
    """Format a duration given in seconds as a space-separated duration string.

    Zero-valued components are omitted, e.g. 3605 -> "1H 5S", 1800 -> "30M".
    A zero duration yields an empty string.

    Args:
        x (int): duration in seconds (non-negative).

    Returns:
        str: duration formatted as "<hours>H <minutes>M <seconds>S".
    """
    # Build only the non-zero components, then join with single spaces.
    # (The original kept an unused `space` flag and duplicated the separator
    # logic in per-component ternaries; a join is equivalent and clearer.)
    parts = []
    if x // 3600 > 0:
        parts.append(f"{x // 3600}H")
    if x % 3600 // 60 > 0:
        parts.append(f"{x % 3600 // 60}M")
    if x % 60 > 0:
        parts.append(f"{x % 60}S")
    return " ".join(parts)
# 2022-10-26 16:16:25 +02:00
# 2022-10-27 16:12:56 +02:00
def extract_ers_from_file(esm_df, device_id):
    """Extract event-related segments (ERS) from one participant's raw ESM answers.

    Keeps only fully completed ESM sessions, measures each session's answering
    duration, and emits one event row per session covering the 30 minutes
    before the questionnaire plus the answering time itself.

    Args:
        esm_df (pd.DataFrame): raw ESM answers; must contain at least
            "device_id", "esm_session" and "timestamp" (ms) columns after
            preprocessing — TODO confirm exact schema against esm_preprocess.
        device_id (str): device identifier written into every output row.

    Returns:
        pd.DataFrame: columns ["label", "event_timestamp", "length", "shift",
        "shift_direction", "device_id"], one row per kept session.
    """
    # NOTE(review): debug leftovers — these mutate *global* pandas display
    # options (printing only, not computation); kept to preserve behavior.
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)

    esm_preprocessed = clean_up_esm(preprocess_esm(esm_df))

    # Keep only sessions whose questionnaire was fully completed.
    classified = classify_sessions_by_completion_time(esm_preprocessed)
    esm_filtered_sessions = classified[classified["session_response"] == 'ema_completed'].reset_index()['esm_session']
    esm_df = esm_preprocessed[esm_preprocessed["esm_session"].isin(esm_filtered_sessions)]

    # Compute session start (min) and end (max) in ONE groupby so that the
    # duration and the event_timestamp stay row-aligned. (Previously the
    # start times came from a second, differently-indexed groupby assigned
    # AFTER the <=15min filter + reset_index, which silently misaligned rows
    # whenever any session was dropped by the filter.)
    extracted_ers = esm_df.groupby(["device_id", "esm_session"])['timestamp'].agg(['min', 'max']).reset_index()
    extracted_ers["timestamp"] = ((extracted_ers["max"] - extracted_ers["min"]) / 1000).apply(math.ceil)  # duration in rounded-up seconds
    extracted_ers["event_timestamp"] = extracted_ers["min"]  # session start (ms)

    # Ensure that the longest duration of questionnaire answering is 15 min.
    extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True)

    time_before_questionnaire = 30 * 60  # in seconds (30 minutes)

    extracted_ers["label"] = "straw_event_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
    # Segment length = answering time plus the 30 minutes preceding it.
    extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(format_timestamp)
    # Shift is constant for every row; format it once instead of per row.
    extracted_ers["shift"] = format_timestamp(time_before_questionnaire)
    extracted_ers["shift_direction"] = -1  # shift backwards in time
    extracted_ers["device_id"] = device_id

    return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]
# 2022-10-27 16:12:56 +02:00
# ---- Snakemake entry point -------------------------------------------------
# Stage "extract": build the ERS file for one participant.
# Stage "merge": concatenate all per-participant ERS files into one table.
if snakemake.params["stage"] == "extract":
    esm_df = pd.read_csv(input_data_files['esm_raw_input'])
    with open(input_data_files['pid_file'], 'r') as stream:
        # safe_load is sufficient for a plain pid/device YAML file and, unlike
        # FullLoader, cannot construct arbitrary Python objects.
        pid_file = yaml.safe_load(stream)
    extracted_ers = extract_ers_from_file(esm_df, pid_file["PHONE"]["DEVICE_IDS"][0])
    extracted_ers.to_csv(snakemake.output[0], index=False)
elif snakemake.params["stage"] == "merge":
    input_data_files = dict(snakemake.input)
    # Seed frame fixes the column set even when there are no input files.
    seed = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])
    # Concatenate once instead of re-concatenating inside the loop (the
    # per-file pd.concat was quadratic in the number of participants).
    straw_events = pd.concat(
        [seed, *(pd.read_csv(input_file) for input_file in input_data_files["ers_files"])],
        axis=0,
        ignore_index=True,
    )
    straw_events.to_csv(snakemake.output[0], index=False)