Compare commits

...

6 Commits

Author SHA1 Message Date
Marcel Martinšek fb8868b77d removed cleaning to make it run 2023-03-31 13:27:01 +00:00
Marcel Martinšek da77f7476c Merge branch 'sociality-task' of https://repo.ijs.si/junoslukan/rapids into sociality-task 2023-03-31 13:08:19 +00:00
Marcel Martinšek 4db8810d08 corrected esm_features index column 2023-03-31 13:08:15 +00:00
junos 7832d7d098 Update R packages. 2023-03-30 20:33:35 +02:00
Marcel Martinšek e7bb9d6702 not working temp 2023-03-30 11:54:51 +00:00
Marcel Martinšek 689f677a3e updated r package dependencies to make it run 2023-03-29 13:09:26 +00:00
7 changed files with 685 additions and 170 deletions

View File

@@ -26,7 +26,7 @@ TIME_SEGMENTS: &time_segments
INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
TAILORED_EVENTS: # Only relevant if TYPE=EVENT
COMPUTE: True
SEGMENTING_METHOD: "stress_event" # 30_before, 90_before, stress_event
SEGMENTING_METHOD: "10_before" # 30_before, 90_before, stress_event
INTERVAL_OF_INTEREST: 10 # duration of event of interest [minutes]
IOI_ERROR_TOLERANCE: 5 # interval of interest error tolerance (before and after IOI) [minutes]
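A quick sketch of what these segmenting options imply. The "*_before" values mirror the extract_ers() changes further down in this diff; the reading of the stress_event parameters is an assumption based on the comments above:

# The "*_before" methods take a fixed window before the questionnaire,
# summed with the questionnaire duration (see extract_ers() below).
minutes_before = {"10_before": 10, "30_before": 30, "90_before": 90}
# "stress_event" instead builds the segment around the reported event itself;
# per the comments above, the event of interest lasts INTERVAL_OF_INTEREST
# (10 min), padded by IOI_ERROR_TOLERANCE (5 min) before and after it (assumption).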
@@ -91,7 +91,7 @@ PHONE_ACTIVITY_RECOGNITION:
EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode.
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
ACTIVITY_CLASSES:
STATIONARY: ["still", "tilting"]
@@ -120,7 +120,7 @@ PHONE_APPLICATIONS_FOREGROUND:
SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
INCLUDE_EPISODE_FEATURES: True
SINGLE_CATEGORIES: ["all", "email"]
MULTIPLE_CATEGORIES:
@@ -155,7 +155,7 @@ PHONE_BATTERY:
EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
SRC_SCRIPT: src/features/phone_battery/rapids/main.py
@@ -169,7 +169,7 @@ PHONE_BLUETOOTH:
SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R
DORYAB:
COMPUTE: True
COMPUTE: False
FEATURES:
ALL:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
@@ -190,7 +190,7 @@ PHONE_CALLS:
CONTAINER: call
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES_TYPE: EPISODES # EVENTS or EPISODES
CALL_TYPES: [missed, incoming, outgoing]
FEATURES:
@@ -219,18 +219,19 @@ PHONE_CONVERSATION: # TODO Adapt for speech
# See https://www.rapids.science/latest/features/phone-data-yield/
PHONE_DATA_YIELD:
SENSORS: [#PHONE_ACCELEROMETER,
PHONE_ACTIVITY_RECOGNITION,
PHONE_APPLICATIONS_FOREGROUND,
PHONE_APPLICATIONS_NOTIFICATIONS,
PHONE_BATTERY,
PHONE_BLUETOOTH,
PHONE_CALLS,
PHONE_LIGHT,
PHONE_LOCATIONS,
PHONE_MESSAGES,
PHONE_SCREEN,
PHONE_WIFI_VISIBLE]
SENSORS: [ #PHONE_ACCELEROMETER,
#PHONE_ACTIVITY_RECOGNITION,
#PHONE_APPLICATIONS_FOREGROUND,
#PHONE_APPLICATIONS_NOTIFICATIONS,
#PHONE_BATTERY,
PHONE_BLUETOOTH #,
#PHONE_CALLS,
#PHONE_LIGHT,
#PHONE_LOCATIONS,
#PHONE_MESSAGES,
#PHONE_SCREEN,
#PHONE_WIFI_VISIBLE
]
PROVIDERS:
RAPIDS:
COMPUTE: True
@@ -243,9 +244,8 @@ PHONE_ESM:
PROVIDERS:
STRAW:
COMPUTE: True
SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
"appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
FEATURES: [mean]
SCALES: ["activities"]
FEATURES: [activities_n_others, activities_inperson, activities_formal]
SRC_SCRIPT: src/features/phone_esm/straw/main.py
# See https://www.rapids.science/latest/features/phone-keyboard/
@@ -262,7 +262,7 @@ PHONE_LIGHT:
CONTAINER: light_sensor
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
SRC_SCRIPT: src/features/phone_light/rapids/main.py
@@ -276,7 +276,7 @@ PHONE_LOCATIONS:
PROVIDERS:
DORYAB:
COMPUTE: True
COMPUTE: False
FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"]
DBSCAN_EPS: 100 # meters
DBSCAN_MINSAMPLES: 5
@@ -291,7 +291,7 @@ PHONE_LOCATIONS:
SRC_SCRIPT: src/features/phone_locations/doryab/main.py
BARNETT:
COMPUTE: True
COMPUTE: False
FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
@@ -309,7 +309,7 @@ PHONE_MESSAGES:
CONTAINER: sms
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
MESSAGES_TYPES : [received, sent]
FEATURES:
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
@@ -321,7 +321,7 @@ PHONE_SCREEN:
CONTAINER: screen
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
REFERENCE_HOUR_FIRST_USE: 0
IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable
IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable
@@ -334,7 +334,7 @@ PHONE_SPEECH:
CONTAINER: speech
PROVIDERS:
STRAW:
COMPUTE: True
COMPUTE: False
FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
SRC_SCRIPT: src/features/phone_speech/straw/main.py
@@ -352,7 +352,7 @@ PHONE_WIFI_VISIBLE:
CONTAINER: wifi
PROVIDERS:
RAPIDS:
COMPUTE: True
COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R
@@ -521,10 +521,10 @@ EMPATICA_ACCELEROMETER:
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
CR:
COMPUTE: True
COMPUTE: False
FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
WINDOWS:
COMPUTE: True
COMPUTE: False
WINDOW_LENGTH: 15 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py
@@ -548,11 +548,11 @@ EMPATICA_TEMPERATURE:
FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
CR:
COMPUTE: True
COMPUTE: False
FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean",
"stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
WINDOWS:
COMPUTE: True
COMPUTE: False
WINDOW_LENGTH: 300 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
SRC_SCRIPT: src/features/empatica_temperature/cr/main.py
@@ -566,14 +566,14 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
CR:
COMPUTE: True
COMPUTE: False
FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic',
'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore',
'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio',
'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease',
'significantDecrease']
WINDOWS:
COMPUTE: True
COMPUTE: False
WINDOW_LENGTH: 60 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero]
IMPUTE_NANS: True
@@ -592,7 +592,7 @@ EMPATICA_BLOOD_VOLUME_PULSE:
FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
WINDOWS:
COMPUTE: True
COMPUTE: False
WINDOW_LENGTH: 300 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py
@@ -606,12 +606,12 @@ EMPATICA_INTER_BEAT_INTERVAL:
FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
CR:
COMPUTE: True
COMPUTE: False
FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
PATCH_WITH_BVP: True
WINDOWS:
COMPUTE: True
COMPUTE: False
WINDOW_LENGTH: 300 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py
@@ -673,12 +673,12 @@ ALL_CLEANING_INDIVIDUAL:
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
DROP_HIGHLY_CORRELATED_FEATURES:
COMPUTE: True
COMPUTE: False
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
STRAW:
COMPUTE: True
COMPUTE: False
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
@@ -686,7 +686,7 @@ ALL_CLEANING_INDIVIDUAL:
COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that contain all (100% of) NaN
COLS_VAR_THRESHOLD: True
DROP_HIGHLY_CORRELATED_FEATURES:
COMPUTE: True
COMPUTE: False
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95
STANDARDIZATION: True
@@ -705,12 +705,12 @@ ALL_CLEANING_OVERALL:
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
DROP_HIGHLY_CORRELATED_FEATURES:
COMPUTE: True
COMPUTE: False
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
STRAW:
COMPUTE: True
COMPUTE: False
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
@@ -718,7 +718,7 @@ ALL_CLEANING_OVERALL:
COLS_NAN_THRESHOLD: 0.8 # set to 1 to remove only columns that contain all (100% of) NaN
COLS_VAR_THRESHOLD: True
DROP_HIGHLY_CORRELATED_FEATURES:
COMPUTE: True
COMPUTE: False
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95
STANDARDIZATION: True
@@ -732,7 +732,7 @@ ALL_CLEANING_OVERALL:
PARAMS_FOR_ANALYSIS:
BASELINE:
COMPUTE: True
COMPUTE: False
FOLDER: data/external/baseline
CONTAINER: [results-survey637813_final.csv, # Slovenia
results-survey358134_final.csv, # Belgium 1
@@ -743,7 +743,7 @@ PARAMS_FOR_ANALYSIS:
CATEGORICAL_FEATURES: [gender]
TARGET:
COMPUTE: True
COMPUTE: False
LABEL: appraisal_stressfulness_event_mean
ALL_LABELS: [appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean]
# PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,

renv.lock
View File

@@ -1,6 +1,6 @@
{
"R": {
"Version": "4.1.2",
"Version": "4.2.3",
"Repositories": [
{
"Name": "CRAN",
@@ -46,10 +46,10 @@
},
"Hmisc": {
"Package": "Hmisc",
"Version": "4.4-2",
"Version": "5.0-1",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "66458e906b2112a8b1639964efd77d7c"
"Repository": "CRAN",
"Hash": "bf9fe82c010a468fb32f913ff56d65e1"
},
"KernSmooth": {
"Package": "KernSmooth",
@@ -104,7 +104,7 @@
"Package": "RPostgres",
"Version": "1.4.4",
"Source": "Repository",
"Repository": "CRAN",
"Repository": "RSPM",
"Hash": "c593ecb8dbca9faf3906431be610ca28"
},
"Rcpp": {
@@ -181,7 +181,7 @@
"Package": "base64enc",
"Version": "0.1-3",
"Source": "Repository",
"Repository": "CRAN",
"Repository": "RSPM",
"Hash": "543776ae6848fde2f48ff3816d0628bc"
},
"bit": {
@@ -221,17 +221,24 @@
},
"broom": {
"Package": "broom",
"Version": "0.7.3",
"Version": "1.0.4",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "5581a5ddc8fe2ac5e0d092ec2de4c4ae"
"Repository": "CRAN",
"Hash": "f62b2504021369a2449c54bbda362d30"
},
"cachem": {
"Package": "cachem",
"Version": "1.0.7",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "cda74447c42f529de601fe4d4050daef"
},
"callr": {
"Package": "callr",
"Version": "3.5.1",
"Version": "3.7.3",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "b7d7f1e926dfcd57c74ce93f5c048e80"
"Repository": "CRAN",
"Hash": "9b2191ede20fa29828139b9900922e51"
},
"caret": {
"Package": "caret",
@@ -263,10 +270,10 @@
},
"cli": {
"Package": "cli",
"Version": "2.2.0",
"Version": "3.6.1",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "3ef298932294b775fa0a3eeaa3a645b0"
"Repository": "CRAN",
"Hash": "89e6d8219950eac806ae0c489052048a"
},
"clipr": {
"Package": "clipr",
@@ -286,7 +293,7 @@
"Package": "codetools",
"Version": "0.2-18",
"Source": "Repository",
"Repository": "CRAN",
"Repository": "RSPM",
"Hash": "019388fc48e48b3da0d3a76ff94608a8"
},
"colorspace": {
@@ -303,6 +310,13 @@
"Repository": "RSPM",
"Hash": "0f22be39ec1d141fd03683c06f3a6e67"
},
"conflicted": {
"Package": "conflicted",
"Version": "1.2.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "bb097fccb22d156624fd07cd2894ddb6"
},
"corpcor": {
"Package": "corpcor",
"Version": "1.6.9",
@@ -319,10 +333,10 @@
},
"cpp11": {
"Package": "cpp11",
"Version": "0.2.4",
"Version": "0.4.3",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "ba66e5a750d39067d888aa7af797fed2"
"Repository": "CRAN",
"Hash": "ed588261931ee3be2c700d22e94a29ab"
},
"crayon": {
"Package": "crayon",
@@ -354,10 +368,10 @@
},
"dbplyr": {
"Package": "dbplyr",
"Version": "2.1.1",
"Version": "2.3.2",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "1f37fa4ab2f5f7eded42f78b9a887182"
"Hash": "d24305b92db333726aed162a2c23a147"
},
"desc": {
"Package": "desc",
@@ -382,17 +396,17 @@
},
"dplyr": {
"Package": "dplyr",
"Version": "1.0.5",
"Version": "1.1.1",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "d0d76c11ec807eb3f000eba4e3eb0f68"
"Repository": "CRAN",
"Hash": "eb5742d256a0d9306d85ea68756d8187"
},
"dtplyr": {
"Package": "dtplyr",
"Version": "1.1.0",
"Version": "1.3.1",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "1e14e4c5b2814de5225312394bc316da"
"Repository": "CRAN",
"Hash": "54ed3ea01b11e81a86544faaecfef8e2"
},
"e1071": {
"Package": "e1071",
@@ -419,7 +433,7 @@
"Package": "evaluate",
"Version": "0.14",
"Source": "Repository",
"Repository": "CRAN",
"Repository": "RSPM",
"Hash": "ec8ca05cffcc70569eaaad8469d2a3a7"
},
"fansi": {
@@ -452,10 +466,10 @@
},
"forcats": {
"Package": "forcats",
"Version": "0.5.0",
"Version": "1.0.0",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "1cb4279e697650f0bd78cd3601ee7576"
"Repository": "CRAN",
"Hash": "1a0a9a3d5083d0d573c4214576f1e690"
},
"foreach": {
"Package": "foreach",
@@ -492,6 +506,13 @@
"Repository": "RSPM",
"Hash": "f568ce73d3d59582b0f7babd0eb33d07"
},
"gargle": {
"Package": "gargle",
"Version": "1.3.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "bb3208dcdfeb2e68bf33c87601b3cbe3"
},
"gclus": {
"Package": "gclus",
"Version": "1.3.2",
@@ -515,10 +536,10 @@
},
"ggplot2": {
"Package": "ggplot2",
"Version": "3.3.2",
"Version": "3.4.1",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "4ded8b439797f7b1693bd3d238d0106b"
"Repository": "CRAN",
"Hash": "d494daf77c4aa7f084dbbe6ca5dcaca7"
},
"ggraph": {
"Package": "ggraph",
@@ -557,16 +578,30 @@
},
"glue": {
"Package": "glue",
"Version": "1.4.2",
"Version": "1.6.2",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "6efd734b14c6471cfe443345f3e35e29"
"Repository": "CRAN",
"Hash": "4f2596dfb05dac67b9dc558e5c6fba2e"
},
"googledrive": {
"Package": "googledrive",
"Version": "2.1.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "e88ba642951bc8d1898ba0d12581850b"
},
"googlesheets4": {
"Package": "googlesheets4",
"Version": "1.1.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "fd7b97bd862a14297b0bb7ed28a3dada"
},
"gower": {
"Package": "gower",
"Version": "0.2.2",
"Source": "Repository",
"Repository": "CRAN",
"Repository": "RSPM",
"Hash": "be6a2b3529928bd803d1c437d1d43152"
},
"graphlayouts": {
@@ -599,10 +634,10 @@
},
"haven": {
"Package": "haven",
"Version": "2.3.1",
"Version": "2.5.2",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "221d0ad75dfa03ebf17b1a4cc5c31dfc"
"Repository": "CRAN",
"Hash": "8b331e659e67d757db0fcc28e689c501"
},
"highr": {
"Package": "highr",
@ -613,10 +648,10 @@
},
"hms": {
"Package": "hms",
"Version": "1.1.1",
"Version": "1.1.3",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "5b8a2dd0fdbe2ab4f6081e6c7be6dfca"
"Hash": "b59377caa7ed00fa41808342002138f9"
},
"htmlTable": {
"Package": "htmlTable",
@@ -648,10 +683,10 @@
},
"httr": {
"Package": "httr",
"Version": "1.4.2",
"Version": "1.4.5",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "a525aba14184fec243f9eaec62fbed43"
"Repository": "CRAN",
"Hash": "f6844033201269bec3ca0097bc6c97b3"
},
"huge": {
"Package": "huge",
@@ -660,6 +695,13 @@
"Repository": "RSPM",
"Hash": "a4cde4dd1d2551edb99a3273a4ad34ea"
},
"ids": {
"Package": "ids",
"Version": "1.0.1",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "99df65cfef20e525ed38c3d2577f7190"
},
"igraph": {
"Package": "igraph",
"Version": "1.2.6",
@@ -704,10 +746,10 @@
},
"jsonlite": {
"Package": "jsonlite",
"Version": "1.7.2",
"Version": "1.8.4",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "98138e0994d41508c7a6b84a0600cfcb"
"Repository": "CRAN",
"Hash": "a4269a09a9b865579b2635c77e572374"
},
"knitr": {
"Package": "knitr",
@@ -760,10 +802,10 @@
},
"lifecycle": {
"Package": "lifecycle",
"Version": "1.0.0",
"Version": "1.0.3",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "3471fb65971f1a7b2d4ae7848cf2db8d"
"Repository": "CRAN",
"Hash": "001cecbeac1cff9301bdc3775ee46a86"
},
"listenv": {
"Package": "listenv",
@@ -774,17 +816,17 @@
},
"lubridate": {
"Package": "lubridate",
"Version": "1.7.9.2",
"Version": "1.9.2",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "5b5b02f621d39a499def7923a5aee746"
"Repository": "CRAN",
"Hash": "e25f18436e3efd42c7c590a1c4c15390"
},
"magrittr": {
"Package": "magrittr",
"Version": "2.0.1",
"Version": "2.0.3",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "41287f1ac7d28a92f0a286ed507928d3"
"Repository": "CRAN",
"Hash": "7ce2733a9826b3aeb1775d56fd305472"
},
"markdown": {
"Package": "markdown",
@@ -800,6 +842,13 @@
"Repository": "RSPM",
"Hash": "67101e7448dfd9add4ac418623060262"
},
"memoise": {
"Package": "memoise",
"Version": "2.0.1",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c"
},
"mgcv": {
"Package": "mgcv",
"Version": "1.8-33",
@@ -830,10 +879,10 @@
},
"modelr": {
"Package": "modelr",
"Version": "0.1.8",
"Version": "0.1.11",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "9fd59716311ee82cba83dc2826fc5577"
"Repository": "CRAN",
"Hash": "4f50122dc256b1b6996a4703fecea821"
},
"munsell": {
"Package": "munsell",
@@ -888,7 +937,7 @@
"Package": "parallelly",
"Version": "1.29.0",
"Source": "Repository",
"Repository": "CRAN",
"Repository": "RSPM",
"Hash": "b5f399c9ce96977e22ef32c20b6cfe87"
},
"pbapply": {
@@ -907,10 +956,10 @@
},
"pillar": {
"Package": "pillar",
"Version": "1.4.7",
"Version": "1.9.0",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "3b3dd89b2ee115a8b54e93a34cd546b4"
"Repository": "CRAN",
"Hash": "15da5a8412f317beeee6175fbc76f4bb"
},
"pkgbuild": {
"Package": "pkgbuild",
@@ -977,10 +1026,10 @@
},
"processx": {
"Package": "processx",
"Version": "3.4.5",
"Version": "3.8.0",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "22aab6098cb14edd0a5973a8438b569b"
"Repository": "CRAN",
"Hash": "a33ee2d9bf07564efb888ad98410da84"
},
"prodlim": {
"Package": "prodlim",
@@ -1000,7 +1049,7 @@
"Package": "progressr",
"Version": "0.9.0",
"Source": "Repository",
"Repository": "CRAN",
"Repository": "RSPM",
"Hash": "ca0d80ecc29903f7579edbabd91f4199"
},
"promises": {
@@ -1033,10 +1082,10 @@
},
"purrr": {
"Package": "purrr",
"Version": "0.3.4",
"Version": "1.0.1",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "97def703420c8ab10d8f0e6c72101e02"
"Repository": "CRAN",
"Hash": "d71c815267c640f17ddbf7f16144b4bb"
},
"qap": {
"Package": "qap",
@@ -1052,6 +1101,13 @@
"Repository": "RSPM",
"Hash": "d35964686307333a7121eb41c7dcd4e0"
},
"ragg": {
"Package": "ragg",
"Version": "1.2.5",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "690bc058ea2b1b8a407d3cfe3dce3ef9"
},
"rappdirs": {
"Package": "rappdirs",
"Version": "0.3.3",
@@ -1061,17 +1117,17 @@
},
"readr": {
"Package": "readr",
"Version": "1.4.0",
"Version": "2.1.4",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "2639976851f71f330264a9c9c3d43a61"
"Repository": "CRAN",
"Hash": "b5047343b3825f37ad9d3b5d89aa1078"
},
"readxl": {
"Package": "readxl",
"Version": "1.3.1",
"Version": "1.4.2",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "63537c483c2dbec8d9e3183b3735254a"
"Repository": "CRAN",
"Hash": "2e6020b1399d95f947ed867045e9ca17"
},
"recipes": {
"Package": "recipes",
@@ -1110,10 +1166,10 @@
},
"reprex": {
"Package": "reprex",
"Version": "0.3.0",
"Version": "2.0.2",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "b06bfb3504cc8a4579fd5567646f745b"
"Repository": "CRAN",
"Hash": "d66fe009d4c20b7ab1927eb405db9ee2"
},
"reshape2": {
"Package": "reshape2",
@@ -1138,10 +1194,10 @@
},
"rlang": {
"Package": "rlang",
"Version": "0.4.10",
"Version": "1.1.0",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "599df23c40a4fce9c7b4764f28c37857"
"Repository": "CRAN",
"Hash": "dc079ccd156cde8647360f473c1fa718"
},
"rmarkdown": {
"Package": "rmarkdown",
@@ -1173,24 +1229,24 @@
},
"rstudioapi": {
"Package": "rstudioapi",
"Version": "0.13",
"Version": "0.14",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "06c85365a03fdaf699966cc1d3cf53ea"
"Repository": "CRAN",
"Hash": "690bd2acc42a9166ce34845884459320"
},
"rvest": {
"Package": "rvest",
"Version": "0.3.6",
"Version": "1.0.3",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "a9795ccb2d608330e841998b67156764"
"Repository": "CRAN",
"Hash": "a4a5ac819a467808c60e36e92ddf195e"
},
"scales": {
"Package": "scales",
"Version": "1.1.1",
"Version": "1.2.1",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "6f76f71042411426ec8df6c54f34e6dd"
"Repository": "CRAN",
"Hash": "906cb23d2f1c5680b8ce439b44c6fa63"
},
"selectr": {
"Package": "selectr",
@@ -1236,17 +1292,17 @@
},
"stringi": {
"Package": "stringi",
"Version": "1.5.3",
"Version": "1.7.12",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "a063ebea753c92910a4cca7b18bc1f05"
"Repository": "CRAN",
"Hash": "ca8bd84263c77310739d2cf64d84d7c9"
},
"stringr": {
"Package": "stringr",
"Version": "1.4.0",
"Version": "1.5.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "0759e6b6c0957edb1311028a49a35e76"
"Hash": "671a4d384ae9d32fc47a14e98bfa3dc8"
},
"survival": {
"Package": "survival",
@@ -1262,6 +1318,13 @@
"Repository": "RSPM",
"Hash": "b227d13e29222b4574486cfcbde077fa"
},
"systemfonts": {
"Package": "systemfonts",
"Version": "1.0.4",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "90b28393209827327de889f49935140a"
},
"testthat": {
"Package": "testthat",
"Version": "3.0.1",
@@ -1269,12 +1332,19 @@
"Repository": "RSPM",
"Hash": "17826764cb92d8b5aae6619896e5a161"
},
"textshaping": {
"Package": "textshaping",
"Version": "0.3.6",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "1ab6223d3670fac7143202cb6a2d43d5"
},
"tibble": {
"Package": "tibble",
"Version": "3.0.4",
"Version": "3.2.1",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "71dffd8544691c520dd8e41ed2d7e070"
"Repository": "CRAN",
"Hash": "a84e2cc86d07289b3b6f5069df7a004c"
},
"tidygraph": {
"Package": "tidygraph",
@@ -1285,24 +1355,24 @@
},
"tidyr": {
"Package": "tidyr",
"Version": "1.1.2",
"Version": "1.3.0",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "c40b2d5824d829190f4b825f4496dfae"
"Repository": "CRAN",
"Hash": "e47debdc7ce599b070c8e78e8ac0cfcf"
},
"tidyselect": {
"Package": "tidyselect",
"Version": "1.1.0",
"Version": "1.2.0",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "6ea435c354e8448819627cf686f66e0a"
"Repository": "CRAN",
"Hash": "79540e5fcd9e0435af547d885f184fd5"
},
"tidyverse": {
"Package": "tidyverse",
"Version": "1.3.0",
"Version": "2.0.0",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "bd51be662f359fa99021f3d51e911490"
"Repository": "CRAN",
"Hash": "c328568cd14ea89a83bd4ca7f54ae07e"
},
"timeDate": {
"Package": "timeDate",
@@ -1311,6 +1381,13 @@
"Repository": "RSPM",
"Hash": "fde4fc571f5f61978652c229d4713845"
},
"timechange": {
"Package": "timechange",
"Version": "0.2.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "8548b44f79a35ba1791308b61e6012d7"
},
"tinytex": {
"Package": "tinytex",
"Version": "0.28",
@@ -1332,6 +1409,13 @@
"Repository": "RSPM",
"Hash": "fc77eb5297507cccfa3349a606061030"
},
"tzdb": {
"Package": "tzdb",
"Version": "0.3.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "b2e1cbce7c903eaf23ec05c58e59fb5e"
},
"utf8": {
"Package": "utf8",
"Version": "1.1.4",
@@ -1339,12 +1423,19 @@
"Repository": "RSPM",
"Hash": "4a5081acfb7b81a572e4384a7aaf2af1"
},
"vctrs": {
"Package": "vctrs",
"Version": "0.3.8",
"uuid": {
"Package": "uuid",
"Version": "1.1-0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "ecf749a1b39ea72bd9b51b76292261f1"
"Hash": "f1cb46c157d080b729159d407be83496"
},
"vctrs": {
"Package": "vctrs",
"Version": "0.6.1",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "06eceb3a5d716fd0654cc23ca3d71a99"
},
"viridis": {
"Package": "viridis",
@@ -1360,6 +1451,13 @@
"Repository": "RSPM",
"Hash": "ce4f6271baa94776db692f1cb2055bee"
},
"vroom": {
"Package": "vroom",
"Version": "1.6.1",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "7015a74373b83ffaef64023f4a0f5033"
},
"waldo": {
"Package": "waldo",
"Version": "0.2.3",
@@ -1376,10 +1474,10 @@
},
"withr": {
"Package": "withr",
"Version": "2.3.0",
"Version": "2.5.0",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "7307d79f58d1885b38c4f4f1a8cb19dd"
"Repository": "CRAN",
"Hash": "c0e49a9760983e81e55cdd9be92e7182"
},
"xfun": {
"Package": "xfun",
@@ -1390,10 +1488,10 @@
},
"xml2": {
"Package": "xml2",
"Version": "1.3.2",
"Version": "1.3.3",
"Source": "Repository",
"Repository": "RSPM",
"Hash": "d4d71a75dd3ea9eb5fa28cc21f9585e2"
"Repository": "CRAN",
"Hash": "40682ed6a969ea5abfd351eb67833adc"
},
"xtable": {
"Package": "xtable",

View File

@@ -0,0 +1,292 @@
import pandas as pd
import numpy as np
id2qc = { 44:["What have you mainly been doing within the last 10 minutes?",
"Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
"Kaj ste v glavnem počeli v zadnjih 10 minutah?"],
45:["What type of individual work?",
"Wat voor soort individueel werk?",
"Kakšno vrsto samostojnega dela ste opravljali?"],
46:["How did you work with others?",
"Hoe heb je met anderen gewerkt?",
"Kako ste sodelovali z drugimi?"],
47:["What type of break?",
"Wat voor soort pauze?",
"Kakšno vrsto odmora ste imeli?"],
48:["Where did you travel between?",
"Waar heb je tussen gereisd?",
"Kam ste potovali?"],
49:["Did you use a computer or phone for that?",
"Heb je daarvoor een computer of telefoon gebruikt?",
"Ste za to uporabljali računalnik ali telefon?"],
50:["What kind of an interaction was that?",
"Wat voor interactie was dat?",
"Kakšne vrste sodelovanja je bilo to?"],
51:["How many people were involved besides yourself?",
"Hoeveel mensen waren er behalve jezelf betrokken?",
"Koliko oseb je bilo poleg vas še vpletenih?"],
# 52:["What have you mainly been doing within the last 10 minutes?",
# "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
# "Kaj ste v glavnem počeli v zadnjih 10 minutah?"]
}
qc2id = {v:k for k,values in id2qc.items() for v in values}
next_questions = { 44: [45,46,47,48],
45:[49,49],
46:[50,50],
47:[],
48:[],
49:[],
50:[51,51],
51:[]
#52:[45,46,47,48],
}
def esm_activities_LTM_features(
df_esm_activities_cleaned: pd.DataFrame,
) -> pd.DataFrame:
""" Function for calculating LTM(Last 10 minutes) features of questionnaire answers. It first corrects the question ids according
to esm_instructions and the updated corpus of question_ids. It then processes each LTM question chain to
find relevant social properties given by the answers such as the number of people interacted with, the formality and whether the socializing was done in person.
Parameters
----------
df_esm_activities_cleaned: pd.DataFrame
A cleaned up dataframe, which must include esm_instructions, esm_user_answer_numeric.
Returns
-------
df_esm_activities_cleaned: pd.DataFrame
The same dataframe with columns which contain:
["correct_ids"] - Corrected question_ids
["ans_seq"] - For each LTM question, the sequence of numerical user answers pertaining to this chain of questions.
["n_others","inperson","formal"]- Properties of known potential social encounters as given by process_answers().
"""
#TODO: preprocess questionnaires
#DONE: correct ids
correct_id_df = correct_activity_qids(df_esm_activities_cleaned)
#DONE: process subquestions
ids = correct_id_df["correct_ids"]
main_q_indices = ids[ids==44].index
q_group = []
i=-1
for id in ids:
if(id==44):
i=i+1
q_group.append(i)
correct_id_df["q_group"] = q_group
ans_seq = correct_id_df.groupby("q_group").agg({"esm_user_answer_numeric":lambda group: list(group)}).rename(columns={"esm_user_answer_numeric":"ans_seq"})
ans_seq.set_index(main_q_indices,inplace=True)
# correct_id_df["ans_seq"] = [[] for i in range(len(correct_id_df))]
# correct_id_df["ans_seq"].loc[main_q_indices] = correct_id_df.groupby("q_group").agg({"esm_user_answer_numeric":lambda group: list(group)}).values.reshape(-1)
#DONE: find types of status for each main question: socializing:[none,irl,online,unknown], num_people:[0,1,2,>2,unknown]
processed_ans_df = process_answers(ans_seq)
# df_out = df_esm_activities_cleaned.join(test)
return df_esm_activities_cleaned.join(processed_ans_df)
"""
possible answer sequences for LTM question chains
#alone
0,0,0 not social
0,0,1 not social
0,1,0 not social
0,1,1 not social
0,2 not social
0,3 not social
0,4 not social
0,5 not social
0,6 not social
#w/ others
1,0,0,0 1 irl
1,0,0,1 2 irl
1,0,0,2 3+ irl
1,0,1,0 1 irl
1,0,1,1 2 irl
1,0,1,2 3+ irl
1,1,0,0 1 online
1,1,0,1 2 online
1,1,0,2 3+ online
1,1,1,0 1 online
1,1,1,1 2 online
1,1,1,2 3+ online
1,2 positive likely to be more than 2
1,3 positive
#break
2,0 ambiguous
2,1 positive irl
2,2 ambiguous
2,3 ambiguous
#transit
3,0 ambiguous
3,1 ambiguous
3,2 ambiguous
"""
#TODO: docstring
def process_answers(df:pd.DataFrame)-> pd.DataFrame:
""" Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
> n_others: Number of other people interacted with in the last 10 minutes
- -1: Number is positive but unknown exactly
- 0: No people/alone
- 1: One extra person
- 2: Two extra people
- 3: More than two extra people
- NaN : Can't say anything with enough certainty.
> inperson:
- True/False: The interaction in question was/wasn't in person.
- None: Can't say anything with enough certainty.
> formal:
- True/False: The interaction in question was/wasn't formal.
- None: Can't say anything with enough certainty.
Args:
df (pd.DataFrame): DataFrame with an "ans_seq" column holding one list of numeric answers per main LTM question.
Returns:
pd.DataFrame: One row per answer sequence with the columns n_others, inperson and formal, indexed like the input.
"""
properties = {"n_others":[],
"inperson":[],
"formal":[]}
for ans_seq in df["ans_seq"]:
n_other = None
inperson = None
formal = None
if(ans_seq[0]==0):
n_other = 0
elif(ans_seq[0]==1):
if(ans_seq[1]==3):
n_other = -1 # answered "Other" but did work with other people
elif(ans_seq[1]==2):
n_other = 3 # assuming more than 2 people participated in the lecture or presentation
elif(ans_seq[1] in [0,1]):
inperson = ans_seq[1]==0 # ans_seq[1]==0 means an in-person (irl) interaction, ==1 means online or by phone
formal = ans_seq[2]==0 # 0 means formal
n_other = ans_seq[3]+1 # ans_seq[3] is on [0,2] so we add 1 to make it [1,3]
elif(ans_seq[0]==2):
formal = False # assuming one does not have a formal meeting during break time
if(ans_seq[1]==1):
n_other = -1
inperson = True
# if the answer is not 1, we don't know anything for sure
elif(ans_seq[0]==3):
# we can't say whether the person was carpooling or driving alone.
pass
properties["n_others"].append(n_other)
properties["inperson"].append(inperson)
properties["formal"].append(formal)
#df = df.join(pd.DataFrame(properties,index=df.index))
return pd.DataFrame(properties,index=df.index)
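A minimal check of process_answers() against the answer-sequence table above; the input frame is illustrative:

import pandas as pd

example = pd.DataFrame({"ans_seq": [[0, 2], [1, 0, 0, 1], [2, 1]]})
print(process_answers(example))
# Expected, per the table above:
#   [0, 2]       -> n_others=0 (alone), inperson=None, formal=None
#   [1, 0, 0, 1] -> n_others=2, inperson=True (irl), formal=True
#   [2, 1]       -> n_others=-1 (others present, count unknown), inperson=True, formal=False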
def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
"""_summary_
Args:
df (pd.DataFrame): _description_
Returns:
pd.DataFrame: Input dataframe with added column "correct_ids"
"""
df["correct_ids"] = df["esm_instructions"].apply(lambda x: qc2id[x])
return df
def process_answers_aggregation(df:pd.core.groupby.generic.DataFrameGroupBy)-> pd.core.groupby.generic.DataFrameGroupBy:
""" Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
> n_others: Number of other people interacted with in the last 10 minutes
- -1: Number is positive but unknown exactly
- 0: No people/alone
- 1: One extra person
- 2: Two extra people
- 3: More than two extra people
- NaN : Can't say anything with enough certainty.
> inperson:
- True/False: The interaction in question was/wasn't in person.
- None: Can't say anything with enough certainty.
> formal:
- True/False: The interaction in question was/wasn't formal.
- None: Can't say anything with enough certainty.
Args:
df (pd.DataFrame): Rows of a single local_segment group, as passed by DataFrameGroupBy.apply; must include esm_user_answer_numeric.
Returns:
pd.DataFrame: The group joined with the columns n_others, inperson and formal.
"""
#print("=======================\nAPPLY START:\ndf=",df.columns,df.local_segment)
properties = {"n_others":[],
"inperson":[],
"formal":[]}
ans_seq = df["esm_user_answer_numeric"].values
n_other = None
inperson = None
formal = None
if(ans_seq[0]==0):
n_other = 0
elif(ans_seq[0]==1):
if(ans_seq[1]==3):
n_other = -1 # answered "Other" but did work with other people
elif(ans_seq[1]==2):
n_other = 3 # assuming more than 2 people participated in the lecture or presentation
elif(ans_seq[1] in [0,1]):
inperson = ans_seq[1]==0 # ans_seq[1]==0 means an in-person (irl) interaction, ==1 means online or by phone
formal = ans_seq[2]==0 # 0 means formal
n_other = ans_seq[3]+1 # ans_seq[3] is on [0,2] so we add 1 to make it [1,3]
elif(ans_seq[0]==2):
formal = False # assuming one does not have a formal meeting during break time
if(ans_seq[1]==1):
n_other = -1
inperson = True
# if the answer is not 1, we don't know anything for sure
elif(ans_seq[0]==3):
# we can't say whether the person was carpooling or driving alone.
pass
properties["n_others"].append(n_other)
properties["inperson"].append(inperson)
properties["formal"].append(formal)
df = df.join(pd.DataFrame(properties,index=df.index))
#print("APPLY END:\ndf=",df[["n_others","inperson","formal"]])
return df
#test stuff
def test():
from esm_preprocess import preprocess_esm,clean_up_esm
df = pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")
df = preprocess_esm(df)
df = clean_up_esm(df)
df = df[df["questionnaire_id"]==97]
original = esm_activities_LTM_features(df)
df["local_segment"] = [str(i)+":"+j for i,j in df[["esm_session","device_id"]].values]
temp = df.groupby("local_segment")
temp2 = temp.apply(process_answers_aggregation)
#compare with original function results
selection = original[original["correct_ids"]==44][["n_others", "inperson", "formal"]]
temp_selection = temp2.loc[selection.index]
temp_selection.compare(selection,keep_shape=True,keep_equal =True)
#print out ans_seq processing results
# import json
# i = 0
# for j,ans in correct_id_df[["esm_json","esm_user_answer"]].values:
# obj = json.loads(j)
# text = obj["esm_instructions"]
# if ("10 minut" in text):
# print("---\n",test.ans_seq.iloc[i])
# print(test[["n_others","inperson","formal"]].values[i])
# i = i+1
# print(text,ans)
#test()

View File

@@ -1,4 +1,8 @@
import pandas as pd
import sys
import warnings
sys.path.append('src/features/phone_esm/straw')
from esm_activities import esm_activities_LTM_features,process_answers_aggregation
QUESTIONNAIRE_IDS = {
"sleep_quality": 1,
@@ -39,24 +43,50 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
esm_data = pd.read_csv(sensor_data_files["sensor_data"])
requested_features = provider["FEATURES"]
# name of the features this function can compute
requested_scales = provider["SCALES"]
base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
"appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
"appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge","activities_n_others","activities_inperson","activities_formal"]
#TODO Check valid questionnaire and feature names.
# the subset of requested features this function can compute
features_to_compute = list(set(requested_features) & set(base_features_names))
esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
if not esm_data.empty:
esm_data = filter_data_by_segment(esm_data, time_segment)
if not esm_data.empty:
esm_features = pd.DataFrame()
for scale in requested_scales:
questionnaire_id = QUESTIONNAIRE_IDS[scale]
mask = esm_data["questionnaire_id"] == questionnaire_id
esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
#print(esm_data.loc[mask].head())
#print(time_segment)
if not mask.any():
temp = sensor_data_files["sensor_data"]
warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}-{time_segment}",RuntimeWarning)
continue
#TODO: calculation of LTM features
if scale=="activities":
requested_subset = [req for req in requested_features if req.startswith("activities")]
if not bool(requested_subset):
continue
# ltm_features = esm_activities_LTM_features(esm_data.loc[mask])
# print(esm_data["esm_json"].values)
# print(mask)
# print(esm_data.loc[mask])
# #ltm_features = ltm_features[ltm_features["correct_ids"==44]]
#print(esm_data.loc[mask]["local_segment"])
ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
#print("PRINTING ltm_features:\n",ltm_features)
ltm_features.rename(columns={"n_others":"activities_n_others","inperson":"activities_inperson","formal":"activities_formal"},inplace=True)
esm_features[requested_subset] = ltm_features.groupby("local_segment").first()[requested_subset]
#print(esm_features.columns)
#print("PRINTING esm_features after rename:\n",ltm_features)
#FIXME: it might be an issue that I'm calculating over the whole time segment and not grouping by "local_segment"
#print("~~~~~~~~~~~~~~~~~~~~~~~~===============================~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n LTM FEATURES STORED... AFTER RETURN:\n",ltm_features,esm_features[["activities_"+req for req in requested_subset]])
if("mean" in features_to_compute):
esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
#TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
esm_features = esm_features.reset_index()
@@ -64,3 +94,15 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
return esm_features
def test_main():
import temp_help
provider = {
"FEATURES":["mean","activities_n_others","activities_inperson","activities_formal"],
"SCALES":['activities']
}
sensor_data_files = {"sensor_data":"data/interim/p069/phone_esm_clean.csv"}
s_feat = straw_features(sensor_data_files,"straw_event_stress_event_p069_110",provider,temp_help.filter_data_by_segment)
print(s_feat)
#test_main()

View File

@@ -67,7 +67,7 @@ def extract_ers(esm_df):
segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]
if segmenting_method in ["30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
if segmenting_method in ["10_before", "30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
""" '30-minutes and 90-minutes before' have the same fundamental logic with couple of deviations that will be explained below.
Both take x-minute period before the questionnaire that is summed with the questionnaire duration.
All questionnaire durations over 15 minutes are excluded from the querying.
@@ -79,7 +79,18 @@ def extract_ers(esm_df):
extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
extracted_ers["shift_direction"] = -1
if segmenting_method == "30_before":
if segmenting_method == "10_before":
"""The method 10-minutes before simply takes 10 minutes before the questionnaire and sums it with the questionnaire duration.
The timestamps are formatted with the help of format_timestamp() method.
"""
time_before_questionnaire = 10 * 60 # in seconds (10 minutes)
#TODO: split into small segments by manipulating length and shift
extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
extracted_ers["shift"] = time_before_questionnaire
extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))
elif segmenting_method == "30_before":
"""The method 30-minutes before simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
The timestamps are formatted with the help of format_timestamp() method.
"""

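To make the new "10_before" branch concrete, a worked example; it assumes extracted_ers["timestamp"] holds the answering duration in seconds, as the 15 * 60 cap above suggests:

# a questionnaire answered in 3 minutes, under "10_before":
answer_duration = 3 * 60                               # 180 s
time_before_questionnaire = 10 * 60                    # 600 s
length = answer_duration + time_before_questionnaire   # 780 s -> a 13-minute segment
shift = time_before_questionnaire                      # segment starts 10 min before the questionnaire
# both values are then rendered as strings via format_timestamp()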
View File

@@ -0,0 +1,70 @@
"""This file is TEMPORARY and intended for testing main.py
"""
def filter_data_by_segment(data, time_segment):
data.dropna(subset=["assigned_segments"], inplace=True)
if(data.shape[0] == 0): # data is empty
data["local_segment"] = data["timestamps_segment"] = None
return data
datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamps_regex = "[0-9]{13}"
segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
data = data.drop(columns=["assigned_segments"])
data = data.dropna(subset = ["local_segment"])
if(data.shape[0] == 0): # there are no rows belonging to time_segment after dropping NA
data["timestamps_segment"] = None
else:
data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True)
# chunk episodes
if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
data = chunk_episodes(data)
return data
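A self-contained sketch of the segment extraction above; the label, datetimes and timestamps are invented for illustration:

import re

datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamps_regex = r"[0-9]{13}"
seg = "[stress_event_p031_1#2023-03-01 10:00:00,2023-03-01 10:13:00;1677664800000,1677665580000]"
pattern = r"\[({}#{},{};{},{})\]".format("stress_event_p031_1", datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
local_segment = re.search(pattern, seg).group(1)
label_part, timestamps_segment = local_segment.split(";", 1)
# label_part         == "stress_event_p031_1#2023-03-01 10:00:00,2023-03-01 10:13:00"
# timestamps_segment == "1677664800000,1677665580000"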
def chunk_episodes(sensor_episodes):
import copy
import pandas as pd
# Deduplicate episodes
# Drop rows where segments of start_timestamp and end_timestamp are the same
sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")
# Delete useless columns
for drop_col in ["local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
del sensor_episodes[drop_col]
# Avoid SettingWithCopyWarning
sensor_episodes = sensor_episodes.copy()
# Unix timestamp for current segment in milliseconds
sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype(int)
# Compute chunked timestamp
sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)
# Compute duration: intersection of current row and segment
sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)
# Merge episodes
cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]
sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()
merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()
merged_sensor_episodes.reset_index(inplace=True)
# Compute datetime
merged_sensor_episodes["local_start_date_time"] = pd.to_datetime(merged_sensor_episodes["start_timestamp"], unit="ms", utc=True)
merged_sensor_episodes["local_start_date_time"] = pd.concat([data["local_start_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
merged_sensor_episodes["local_end_date_time"] = pd.to_datetime(merged_sensor_episodes["end_timestamp"], unit="ms", utc=True)
merged_sensor_episodes["local_end_date_time"] = pd.concat([data["local_end_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
return merged_sensor_episodes
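A worked example of the intersection arithmetic in chunk_episodes(); the millisecond timestamps are invented for illustration:

# an episode from 09:55 to 10:20 UTC, chunked by a segment from 10:00 to 10:13 UTC
start_timestamp, end_timestamp = 1677664500000, 1677666000000
segment_start_timestamp, segment_end_timestamp = 1677664800000, 1677665580000
chunked_start_timestamp = max(start_timestamp, segment_start_timestamp)     # 10:00
chunked_end_timestamp = min(end_timestamp, segment_end_timestamp)           # 10:13
duration = (chunked_end_timestamp - chunked_start_timestamp) / (1000 * 60)  # 13.0 minutes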

View File

@@ -14,6 +14,7 @@ def import_path(path):
sys.modules[module_name] = module
return module
#TODO:check why segments change to int
def filter_data_by_segment(data, time_segment):
data.dropna(subset=["assigned_segments"], inplace=True)
if(data.shape[0] == 0): # data is empty
@@ -151,6 +152,7 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
else:
segment_colums = pd.DataFrame()
print(sensor_features,sensor_features['local_segment'])
sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])