2022-10-17 17:07:33 +02:00
import pandas as pd
import numpy as np
import datetime
import math , sys , yaml
2022-10-26 16:16:25 +02:00
from esm_preprocess import clean_up_esm
from esm import classify_sessions_by_completion_time , preprocess_esm
2022-10-17 17:07:33 +02:00
# Snapshot of the rule's inputs as a plain dict so they can be looked up by name
# below (e.g. 'esm_raw_input', 'pid_file', 'ers_files').
# NOTE(review): `snakemake` is not imported in this file; presumably it is the
# object injected by Snakemake's `script:` directive - confirm.
input_data_files = dict ( snakemake . input )
2022-10-25 10:53:44 +02:00
def format_timestamp(x):
    """Format a duration in seconds as a compact ``H``/``M``/``S`` string.

    Only the components that are greater than zero are emitted, in the order
    hours, minutes, seconds, separated by single spaces.  Examples:
    ``3661 -> "1H 1M 1S"``, ``1800 -> "30M"``, ``3601 -> "1H 1S"``.

    Args:
        x: Duration in seconds.

    Returns:
        The formatted duration.  When every component is zero (``x == 0``)
        the result is the empty string.
    """
    # Compute each component once; hours are intentionally not capped at 24.
    hours = x // 3600
    minutes = x % 3600 // 60
    seconds = x % 60

    parts = [
        f"{value}{unit}"
        for value, unit in ((hours, "H"), (minutes, "M"), (seconds, "S"))
        if value > 0
    ]
    return " ".join(parts)
2022-10-27 16:12:56 +02:00
def extract_ers_from_file(esm_df, device_id):
    """Derive event-related segments (ERS) from raw ESM answers.

    The ESM data is preprocessed and cleaned, reduced to sessions classified
    as ``'ema_completed'``, and every remaining (device_id, esm_session) pair
    becomes one segment that starts 30 minutes before the first answer of the
    session and lasts until its last answer.

    Args:
        esm_df: Raw ESM data frame; it is passed to ``preprocess_esm`` and
            ``clean_up_esm`` and must yield ``device_id``, ``esm_session`` and
            ``timestamp`` columns.
        device_id: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A DataFrame with the columns ``label``, ``event_timestamp``,
        ``length``, ``shift``, ``shift_direction`` and ``device_id``.

    Notes:
        Reads ``config.yaml`` from the current working directory and the
        module-level ``snakemake.params["pid"]``.  Also sets the pandas
        ``display.max_columns`` option globally.
    """
    # pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)

    # NOTE(review): yaml.load with FullLoader is acceptable for a trusted
    # project config; consider yaml.safe_load if this file can ever come from
    # an untrusted source.
    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    # Currently unused below; the lookup is kept so a config without this key
    # still fails here exactly as before.
    targets_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["TARGETS_METHOD"]

    session_keys = ["device_id", "esm_session"]
    esm_preprocessed = clean_up_esm(preprocess_esm(esm_df))

    # Take only ema_completed sessions responses
    classified = classify_sessions_by_completion_time(esm_preprocessed)
    completed_sessions = classified[classified["session_response"] == 'ema_completed'].reset_index()[session_keys]

    # Match on the (device_id, esm_session) PAIR.  Testing the two columns
    # independently would also keep an uncompleted session whose number
    # happens to equal a completed session number on a different device.
    completed_keys = pd.MultiIndex.from_frame(completed_sessions)
    row_keys = pd.MultiIndex.from_frame(esm_preprocessed[session_keys])
    esm_df = esm_preprocessed.loc[row_keys.isin(completed_keys)]

    # TODO: how do we determine where the question about being distressed was?

    # Extract time-relevant information
    time_before_questionnaire = 30 * 60  # in seconds (30 minutes)
    max_answering_time = 15 * 60  # in seconds (15 minutes)

    # One pass over the sessions gives both the first and the last answer time.
    session_bounds = esm_df.groupby(session_keys)["timestamp"].agg(["min", "max"]).reset_index()
    extracted_ers = pd.DataFrame({
        "device_id": session_bounds["device_id"],
        "event_timestamp": session_bounds["min"],
        # Session duration rounded up to whole seconds (timestamps are divided
        # by 1000, i.e. they are treated as milliseconds).
        "timestamp": ((session_bounds["max"] - session_bounds["min"]) / 1000).apply(math.ceil),
    })

    # Ensure that the longest duration of questionnaire answering is 15 min.
    extracted_ers = extracted_ers[extracted_ers["timestamp"] <= max_answering_time].reset_index(drop=True)

    # Labels are numbered by the row position after filtering.
    extracted_ers["label"] = "straw_event_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
    extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(format_timestamp)
    extracted_ers["shift"] = format_timestamp(time_before_questionnaire)
    extracted_ers["shift_direction"] = -1

    # sys.exit()

    return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]
2022-10-27 16:12:56 +02:00
# Stage dispatch: "extract" builds the segments for one participant, "merge"
# stacks the per-participant segment files into a single CSV.
if snakemake.params["stage"] == "extract":
    esm_df = pd.read_csv(input_data_files['esm_raw_input'])

    # NOTE(review): yaml.load with FullLoader is acceptable for trusted
    # participant files; consider yaml.safe_load otherwise.
    with open(input_data_files['pid_file'], 'r') as stream:
        pid_file = yaml.load(stream, Loader=yaml.FullLoader)

    # Only the first listed device id is passed on.
    extracted_ers = extract_ers_from_file(esm_df, pid_file["PHONE"]["DEVICE_IDS"][0])
    extracted_ers.to_csv(snakemake.output[0], index=False)

elif snakemake.params["stage"] == "merge":
    # `input_data_files` was already built from snakemake.input at the top of
    # the script, so it is not rebuilt here.
    ers_frames = [pd.read_csv(input_file) for input_file in input_data_files["ers_files"]]

    if ers_frames:
        # Concatenate once instead of growing a frame inside the loop.
        straw_events = pd.concat(ers_frames, axis=0, ignore_index=True)
    else:
        # No inputs: still write a header-only file with the expected columns.
        straw_events = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])

    straw_events.to_csv(snakemake.output[0], index=False)