# -*- coding: utf-8 -*-
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.13.0
# kernelspec:
# display_name: straw2analysis
# language: python
# name: straw2analysis
# ---
# %%
# Standard library
import datetime
import math
import os
import sys

# Third-party (np / pd are used in later cells: np.where, np.histogram, pd.cut, ...)
import numpy as np
import pandas as pd
import seaborn as sns

# Make the repository root importable so project-local packages resolve.
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# Project-local imports (require nb_dir on sys.path, so they come after the block above).
import participants.query_db
from features.esm import *
from features.esm_JCQ import *
from features.esm_SAM import *

from IPython.core.interactiveshell import InteractiveShell

# Display every expression result in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"
# %%
participants_inactive_usernames = participants . query_db . get_usernames (
collection_start = datetime . date . fromisoformat ( " 2020-08-01 " )
)
df_esm_inactive = get_esm_data ( participants_inactive_usernames )
# %%
df_esm_preprocessed = preprocess_esm ( df_esm_inactive )
# %% [markdown]
# Investigate stressfulness events

# %%
# Questionnaire length in seconds: ceil((last - first answer timestamp [ms]) / 1000)
# per (device_id, esm_session).
extracted_ers = (
    df_esm_preprocessed.groupby(["device_id", "esm_session"])["timestamp"]
    .apply(lambda x: math.ceil((x.max() - x.min()) / 1000))
    .reset_index()
    .rename(columns={"timestamp": "session_length"})
)
# Ensure that the longest duration of the questionnaire answering is 15 min.
extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True)

# Questionnaire start and end timestamps per session.
session_start_timestamp = (
    df_esm_preprocessed.groupby(["device_id", "esm_session"])["timestamp"]
    .min()
    .to_frame()
    .rename(columns={"timestamp": "session_start_timestamp"})
)
session_end_timestamp = (
    df_esm_preprocessed.groupby(["device_id", "esm_session"])["timestamp"]
    .max()
    .to_frame()
    .rename(columns={"timestamp": "session_end_timestamp"})
)

# Stress-event time (questionnaire id 90) and duration (id 91) user answers.
se_time = (
    df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 90.0]
    .set_index(["device_id", "esm_session"])["esm_user_answer"]
    .to_frame()
    .rename(columns={"esm_user_answer": "se_time"})
)
se_duration = (
    df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 91.0]
    .set_index(["device_id", "esm_session"])["esm_user_answer"]
    .to_frame()
    .rename(columns={"esm_user_answer": "se_duration"})
)

# Make se_durations to the appropriate lengths.
# Extracted targets that will be transferred in the csv file to the cleaning script.
df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.0].columns
se_stressfulness_event_tg = (
    df_esm_preprocessed[df_esm_preprocessed.questionnaire_id == 87.0]
    .set_index(["device_id", "esm_session"])["esm_user_answer"]
    .to_frame()
    .rename(columns={"esm_user_answer": "appraisal_stressfulness_event"})
)

# All relevant features are joined by inner join to remove standalone columns
# (e.g., stressfulness event target has larger count).
extracted_ers = (
    extracted_ers.join(session_start_timestamp, on=["device_id", "esm_session"], how="inner")
    .join(session_end_timestamp, on=["device_id", "esm_session"], how="inner")
    .join(se_stressfulness_event_tg, on=["device_id", "esm_session"], how="inner")
    .join(se_time, on=["device_id", "esm_session"], how="left")
    .join(se_duration, on=["device_id", "esm_session"], how="left")
)

# Filter-out the sessions that are not useful. Because of the ambiguity this excludes:
# (1) straw event times that are marked as "0 - I don't remember"
# (2) straw event durations that are marked as "0 - I don't remember"
extracted_ers = extracted_ers[
    (~extracted_ers.se_time.astype(str).str.startswith("0 - "))
    & (~extracted_ers.se_duration.astype(str).str.startswith("0 - "))
    & (~extracted_ers.se_duration.astype(str).str.startswith("Removed"))
]
extracted_ers.reset_index(drop=True, inplace=True)
# %%
# Add default duration in case the participant answered that no stressful event occurred.
# Prepare data to fit the data structure in the CSV file ...
# Use the questionnaire start as the event time if no stress event occurred.
extracted_ers["se_time"] = extracted_ers["se_time"].fillna(extracted_ers["session_start_timestamp"])

# Type could be an int (timestamp [ms]), which stays the same, or a datetime string,
# which is converted to a timestamp in milliseconds.
extracted_ers["event_timestamp"] = (
    extracted_ers["se_time"]
    .apply(lambda x: x if isinstance(x, int) else pd.to_datetime(x).timestamp() * 1000)
    .astype("int64")
)
# Events precede the questionnaire, so downstream segments shift backwards in time.
extracted_ers["shift_direction"] = -1

""" >>>>> begin section (could be optimized) <<<<< """
# Checks whether the duration is marked with "1 - It's still ongoing", which means that
# the end of the current questionnaire is taken as the end time of the segment.
# Else the user-input duration is taken.
extracted_ers["temp_duration"] = extracted_ers["se_duration"]
extracted_ers["se_duration"] = np.where(
    extracted_ers["se_duration"].astype(str).str.startswith("1 - "),
    extracted_ers["session_end_timestamp"] - extracted_ers["event_timestamp"],
    extracted_ers["se_duration"],
)

# This converts the rows with timestamps in milliseconds and the rows with
# datetime strings to a duration in seconds.
extracted_ers["se_duration"] = extracted_ers["se_duration"].apply(
    lambda x: math.ceil(x / 1000)
    if isinstance(x, int)
    else abs(pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60
)
# TODO: Check whether min se_duration is at least the same duration as the ioi. Filter-out the rest.
""" >>>>> end section <<<<< """
# %% [markdown]
# Count negative values of duration

# %%
# Columns shown when inspecting the duration sanity checks below.
_inspect_cols = ["se_duration", "temp_duration", "session_end_timestamp", "event_timestamp"]
print("Count all: ", extracted_ers[_inspect_cols].shape[0])
print("Count stressed: ", extracted_ers[~extracted_ers["se_duration"].isna()][_inspect_cols].shape[0])
print(
    "Count negative durations (invalid se_time user input): ",
    extracted_ers[extracted_ers["se_duration"] < 0][_inspect_cols].shape[0],
)
print("Count 0 durations: ", extracted_ers[extracted_ers["se_duration"] == 0][_inspect_cols].shape[0])
extracted_ers[extracted_ers["se_duration"] <= 0][_inspect_cols].shape[0]
extracted_ers[(~extracted_ers["se_duration"].isna()) & (extracted_ers["se_duration"] <= 0)][_inspect_cols]

ax = extracted_ers.hist(
    column="se_duration", bins="auto", grid=False, figsize=(12, 8), color="#86bf91", zorder=2, rwidth=0.9
)

hist, bin_edges = np.histogram(extracted_ers["se_duration"].dropna())
hist
bin_edges

# Keep only non-negative durations (negatives stem from invalid se_time user input).
extracted_ers = extracted_ers[extracted_ers["se_duration"] >= 0]
# %%
# bins = [-100000000, 0, 0.0000001, 1200, 7200, 100000000]  # 'neg', 'zero', '<20min', '2h', 'high_pos' ..... right=False
# Duration bin edges in seconds: negative, zero, 5 min, 10 min, 20 min, 1 h, 2 h, 4 h, more.
bins = [-100000000, -0.0000001, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000]
extracted_ers["bins"], edges = pd.cut(
    extracted_ers.se_duration,
    bins=bins,
    labels=["neg", "zero", "5min", "10min", "20min", "1h", "2h", "4h", "more"],
    retbins=True,
    right=True,
)  # ['low', 'medium', 'high']
sns.displot(
    data=extracted_ers.dropna(),
    x="bins",
    binwidth=0.1,
)
# %%
# Sanity view: sessions where the event is not after the questionnaire end.
extracted_ers[extracted_ers["session_end_timestamp"] - extracted_ers["event_timestamp"] >= 0]
extracted_ers["se_time"].value_counts()
pd.set_option("display.max_rows", 100)

# Here we are interested in how far the stress-event times are from the end of the questionnaire.
extracted_ers = extracted_ers[~extracted_ers["se_duration"].isna()]  # Remove no stress events
extracted_ers["diff_se_time_session_end"] = (
    extracted_ers["session_end_timestamp"] - extracted_ers["event_timestamp"]
)

_diff_cols = ["se_duration", "temp_duration", "session_start_timestamp", "event_timestamp"]
print("Count all: ", extracted_ers[_diff_cols].shape[0])
# NOTE: original printed the full frame here despite the "Count" label; report the count.
print(
    "Count negative durations: ",
    extracted_ers[extracted_ers["diff_se_time_session_end"] < 0][_diff_cols].shape[0],
)
print(
    "Count 0 durations: ",
    extracted_ers[extracted_ers["diff_se_time_session_end"] == 0][_diff_cols].shape[0],
)
extracted_ers[extracted_ers["diff_se_time_session_end"] < 0]["diff_se_time_session_end"]
# extracted_ers = extracted_ers[(extracted_ers['diff_se_time_session_end'] > 0)]

# Distance-to-session-end bin edges in seconds: <=0, 5 min, 10 min, 20 min, 1 h, 2 h, 4 h, more.
bins2 = [-100000, 0, 300, 600, 1200, 3600, 7200, 14400, 1000000000]
extracted_ers["bins2"], edges = pd.cut(
    extracted_ers.diff_se_time_session_end,
    bins=bins2,
    labels=["neg_zero", "5min", "10min", "20min", "1h", "2h", "4h", "more"],
    retbins=True,
    right=True,
)  # ['low', 'medium', 'high']
extracted_ers["bins2"]

sns.displot(
    data=extracted_ers.dropna(),
    x="bins2",
    binwidth=0.1,
)
extracted_ers.shape
extracted_ers.dropna().shape

print()
# %%
# Numeric appraisal score: answers look like "<digit> - <label>", so the first
# character parses as the numeric target value.
extracted_ers["appraisal_stressfulness_event_num"] = (
    extracted_ers["appraisal_stressfulness_event"].str[0].astype(int)
)
print(
    "duration-target (corr): ",
    extracted_ers["se_duration"].corr(extracted_ers["appraisal_stressfulness_event_num"]),
)

# %%
# Explore groupby participants?