Compare commits

..

6 Commits

Author SHA1 Message Date
Marcel Martinšek fb8868b77d removed cleaning to make it run 2023-03-31 13:27:01 +00:00
Marcel Martinšek da77f7476c Merge branch 'sociality-task' of https://repo.ijs.si/junoslukan/rapids into sociality-task 2023-03-31 13:08:19 +00:00
Marcel Martinšek 4db8810d08 corrected esm_features index column 2023-03-31 13:08:15 +00:00
junos 7832d7d098 Update R packages. 2023-03-30 20:33:35 +02:00
Marcel Martinšek e7bb9d6702 not working temp 2023-03-30 11:54:51 +00:00
Marcel Martinšek 689f677a3e updated r package dependencies to make it run 2023-03-29 13:09:26 +00:00
18 changed files with 673 additions and 1199 deletions

.gitignore

@@ -100,9 +100,6 @@ data/external/*
 !/data/external/wiki_tz.csv
 !/data/external/main_study_usernames.csv
 !/data/external/timezone.csv
-!/data/external/play_store_application_genre_catalogue.csv
-!/data/external/play_store_categories_count.csv
 data/raw/*
 !/data/raw/.gitkeep

config.yaml

@@ -26,7 +26,7 @@ TIME_SEGMENTS: &time_segments
  INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
  TAILORED_EVENTS: # Only relevant if TYPE=EVENT
    COMPUTE: True
-   SEGMENTING_METHOD: "30_before" # 30_before, 90_before, stress_event
+   SEGMENTING_METHOD: "10_before" # 30_before, 90_before, stress_event
    INTERVAL_OF_INTEREST: 10 # duration of event of interest [minutes]
    IOI_ERROR_TOLERANCE: 5 # interval of interest erorr tolerance (before and after IOI) [minutes]
@ -91,7 +91,7 @@ PHONE_ACTIVITY_RECOGNITION:
EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode. EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode.
PROVIDERS: PROVIDERS:
RAPIDS: RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"] FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
ACTIVITY_CLASSES: ACTIVITY_CLASSES:
STATIONARY: ["still", "tilting"] STATIONARY: ["still", "tilting"]
@@ -104,9 +104,9 @@ PHONE_APPLICATIONS_CRASHES:
  CONTAINER: applications_crashes
  APPLICATION_CATEGORIES:
    CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
-   CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
-   UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
-   SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
+   CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
+   UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
+   SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
  PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD

# See https://www.rapids.science/latest/features/phone-applications-foreground/
@@ -114,32 +114,24 @@ PHONE_APPLICATIONS_FOREGROUND:
  CONTAINER: applications
  APPLICATION_CATEGORIES:
    CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scrapped from the Play Store)
-   CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
-   # Refer to data/external/play_store_categories_count.csv for a list of categories (genres) and their frequency.
-   UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
-   SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
+   CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
+   PACKAGE_NAMES_HASHED: True
+   UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
+   SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
  PROVIDERS:
    RAPIDS:
-     COMPUTE: True
+     COMPUTE: False
      INCLUDE_EPISODE_FEATURES: True
-     SINGLE_CATEGORIES: ["Productivity", "Tools", "Communication", "Education", "Social"]
+     SINGLE_CATEGORIES: ["all", "email"]
      MULTIPLE_CATEGORIES:
-       games: ["Puzzle", "Card", "Casual", "Board", "Strategy", "Trivia", "Word", "Adventure", "Role Playing", "Simulation", "Board, Brain Games", "Racing"]
-       social: ["Communication", "Social", "Dating"]
-       productivity: ["Tools", "Productivity", "Finance", "Education", "News & Magazines", "Business", "Books & Reference"]
-       health: ["Health & Fitness", "Lifestyle", "Food & Drink", "Sports", "Medical", "Parenting"]
-       entertainment: ["Shopping", "Music & Audio", "Entertainment", "Travel & Local", "Photography", "Video Players & Editors", "Personalization", "House & Home", "Art & Design", "Auto & Vehicles", "Entertainment,Music & Video",
-         "Puzzle", "Card", "Casual", "Board", "Strategy", "Trivia", "Word", "Adventure", "Role Playing", "Simulation", "Board, Brain Games", "Racing" # Add all games.
-       ]
-       maps_weather: ["Maps & Navigation", "Weather"]
+       social: ["socialnetworks", "socialmediatools"]
+       entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"]
      CUSTOM_CATEGORIES:
-       SINGLE_APPS: []
-     EXCLUDED_CATEGORIES: ["System", "STRAW"]
-     # Note: A special option here is "is_system_app".
-     # This excludes applications that have is_system_app = TRUE, which is a separate column in the table.
-     # However, all of these applications have been assigned System category.
-     # I will therefore filter by that category, which is a superset and is more complete. JL
-     EXCLUDED_APPS: []
+       social_media: ["com.google.android.youtube", "com.snapchat.android", "com.instagram.android", "com.zhiliaoapp.musically", "com.facebook.katana"]
+       dating: ["com.tinder", "com.relance.happycouple", "com.kiwi.joyride"]
+     SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
+     EXCLUDED_CATEGORIES: []
+     EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] # TODO list system apps?
      FEATURES:
        APP_EVENTS: ["countevent", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
        APP_EPISODES: ["countepisode", "minduration", "maxduration", "meanduration", "sumduration"]
@ -163,7 +155,7 @@ PHONE_BATTERY:
EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode. EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
PROVIDERS: PROVIDERS:
RAPIDS: RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"] FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
SRC_SCRIPT: src/features/phone_battery/rapids/main.py SRC_SCRIPT: src/features/phone_battery/rapids/main.py
@ -177,7 +169,7 @@ PHONE_BLUETOOTH:
SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R
DORYAB: DORYAB:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: FEATURES:
ALL: ALL:
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"] DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
@ -198,7 +190,7 @@ PHONE_CALLS:
CONTAINER: call CONTAINER: call
PROVIDERS: PROVIDERS:
RAPIDS: RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
FEATURES_TYPE: EPISODES # EVENTS or EPISODES FEATURES_TYPE: EPISODES # EVENTS or EPISODES
CALL_TYPES: [missed, incoming, outgoing] CALL_TYPES: [missed, incoming, outgoing]
FEATURES: FEATURES:
@@ -227,18 +219,19 @@ PHONE_CONVERSATION: # TODO Adapt for speech

# See https://www.rapids.science/latest/features/phone-data-yield/
PHONE_DATA_YIELD:
- SENSORS: [#PHONE_ACCELEROMETER,
-           PHONE_ACTIVITY_RECOGNITION,
-           PHONE_APPLICATIONS_FOREGROUND,
-           PHONE_APPLICATIONS_NOTIFICATIONS,
-           PHONE_BATTERY,
-           PHONE_BLUETOOTH,
-           PHONE_CALLS,
-           PHONE_LIGHT,
-           PHONE_LOCATIONS,
-           PHONE_MESSAGES,
-           PHONE_SCREEN,
-           PHONE_WIFI_VISIBLE]
+ SENSORS: [ #PHONE_ACCELEROMETER,
+           #PHONE_ACTIVITY_RECOGNITION,
+           #PHONE_APPLICATIONS_FOREGROUND,
+           #PHONE_APPLICATIONS_NOTIFICATIONS,
+           #PHONE_BATTERY,
+           PHONE_BLUETOOTH #,
+           #PHONE_CALLS,
+           #PHONE_LIGHT,
+           #PHONE_LOCATIONS,
+           #PHONE_MESSAGES,
+           #PHONE_SCREEN,
+           #PHONE_WIFI_VISIBLE
+           ]
  PROVIDERS:
    RAPIDS:
      COMPUTE: True

@@ -251,9 +244,8 @@ PHONE_ESM:
  PROVIDERS:
    STRAW:
      COMPUTE: True
-     SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
-               "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
-     FEATURES: [mean]
+     SCALES: ["activities"]
+     FEATURES: [activities_n_others, activities_inperson, activities_formal]
      SRC_SCRIPT: src/features/phone_esm/straw/main.py

# See https://www.rapids.science/latest/features/phone-keyboard/
@ -270,7 +262,7 @@ PHONE_LIGHT:
CONTAINER: light_sensor CONTAINER: light_sensor
PROVIDERS: PROVIDERS:
RAPIDS: RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
SRC_SCRIPT: src/features/phone_light/rapids/main.py SRC_SCRIPT: src/features/phone_light/rapids/main.py
@ -284,7 +276,7 @@ PHONE_LOCATIONS:
PROVIDERS: PROVIDERS:
DORYAB: DORYAB:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"] FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"]
DBSCAN_EPS: 100 # meters DBSCAN_EPS: 100 # meters
DBSCAN_MINSAMPLES: 5 DBSCAN_MINSAMPLES: 5
@ -299,7 +291,7 @@ PHONE_LOCATIONS:
SRC_SCRIPT: src/features/phone_locations/doryab/main.py SRC_SCRIPT: src/features/phone_locations/doryab/main.py
BARNETT: BARNETT:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"] FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates gruped by minute) were used to compute features
@ -317,7 +309,7 @@ PHONE_MESSAGES:
CONTAINER: sms CONTAINER: sms
PROVIDERS: PROVIDERS:
RAPIDS: RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
MESSAGES_TYPES : [received, sent] MESSAGES_TYPES : [received, sent]
FEATURES: FEATURES:
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact] received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
@ -329,7 +321,7 @@ PHONE_SCREEN:
CONTAINER: screen CONTAINER: screen
PROVIDERS: PROVIDERS:
RAPIDS: RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
REFERENCE_HOUR_FIRST_USE: 0 REFERENCE_HOUR_FIRST_USE: 0
IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable
IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable
@ -342,7 +334,7 @@ PHONE_SPEECH:
CONTAINER: speech CONTAINER: speech
PROVIDERS: PROVIDERS:
STRAW: STRAW:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"] FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
SRC_SCRIPT: src/features/phone_speech/straw/main.py SRC_SCRIPT: src/features/phone_speech/straw/main.py
@ -360,7 +352,7 @@ PHONE_WIFI_VISIBLE:
CONTAINER: wifi CONTAINER: wifi
PROVIDERS: PROVIDERS:
RAPIDS: RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"] FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R
@ -529,10 +521,10 @@ EMPATICA_ACCELEROMETER:
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"] FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
CR: CR:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
WINDOWS: WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
WINDOW_LENGTH: 15 # specify window length in seconds WINDOW_LENGTH: 15 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'] SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py
@ -556,11 +548,11 @@ EMPATICA_TEMPERATURE:
FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"] FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
CR: CR:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean", FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean",
"stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"] "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
WINDOWS: WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
WINDOW_LENGTH: 300 # specify window length in seconds WINDOW_LENGTH: 300 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'] SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
SRC_SCRIPT: src/features/empatica_temperature/cr/main.py SRC_SCRIPT: src/features/empatica_temperature/cr/main.py
@ -574,14 +566,14 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"] FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
CR: CR:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic', FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic',
'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore', 'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore',
'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio', 'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio',
'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease', 'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease',
'significantDecrease'] 'significantDecrease']
WINDOWS: WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
WINDOW_LENGTH: 60 # specify window length in seconds WINDOW_LENGTH: 60 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero] SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero]
IMPUTE_NANS: True IMPUTE_NANS: True
@ -600,7 +592,7 @@ EMPATICA_BLOOD_VOLUME_PULSE:
FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
WINDOWS: WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
WINDOW_LENGTH: 300 # specify window length in seconds WINDOW_LENGTH: 300 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan'] SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py
@ -614,12 +606,12 @@ EMPATICA_INTER_BEAT_INTERVAL:
FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"] FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
CR: CR:
-      COMPUTE: True
+      COMPUTE: False
FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features 'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
PATCH_WITH_BVP: True PATCH_WITH_BVP: True
WINDOWS: WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
WINDOW_LENGTH: 300 # specify window length in seconds WINDOW_LENGTH: 300 # specify window length in seconds
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan'] SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py
@ -681,12 +673,12 @@ ALL_CLEANING_INDIVIDUAL:
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
DROP_HIGHLY_CORRELATED_FEATURES: DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95 CORR_THRESHOLD: 0.95
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
STRAW: STRAW:
-      COMPUTE: True
+      COMPUTE: False
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
@ -694,7 +686,7 @@ ALL_CLEANING_INDIVIDUAL:
COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that contains all (100% of) NaN COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that contains all (100% of) NaN
COLS_VAR_THRESHOLD: True COLS_VAR_THRESHOLD: True
DROP_HIGHLY_CORRELATED_FEATURES: DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95 CORR_THRESHOLD: 0.95
STANDARDIZATION: True STANDARDIZATION: True
@ -713,12 +705,12 @@ ALL_CLEANING_OVERALL:
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
DROP_HIGHLY_CORRELATED_FEATURES: DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95 CORR_THRESHOLD: 0.95
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
STRAW: STRAW:
-      COMPUTE: True
+      COMPUTE: False
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
@ -726,7 +718,7 @@ ALL_CLEANING_OVERALL:
COLS_NAN_THRESHOLD: 0.8 # set to 1 to remove only columns that contains all (100% of) NaN COLS_NAN_THRESHOLD: 0.8 # set to 1 to remove only columns that contains all (100% of) NaN
COLS_VAR_THRESHOLD: True COLS_VAR_THRESHOLD: True
DROP_HIGHLY_CORRELATED_FEATURES: DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5 MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
CORR_THRESHOLD: 0.95 CORR_THRESHOLD: 0.95
STANDARDIZATION: True STANDARDIZATION: True
@ -740,7 +732,7 @@ ALL_CLEANING_OVERALL:
PARAMS_FOR_ANALYSIS: PARAMS_FOR_ANALYSIS:
BASELINE: BASELINE:
-      COMPUTE: True
+      COMPUTE: False
FOLDER: data/external/baseline FOLDER: data/external/baseline
CONTAINER: [results-survey637813_final.csv, # Slovenia CONTAINER: [results-survey637813_final.csv, # Slovenia
results-survey358134_final.csv, # Belgium 1 results-survey358134_final.csv, # Belgium 1
@@ -751,8 +743,8 @@ PARAMS_FOR_ANALYSIS:
    CATEGORICAL_FEATURES: [gender]

  TARGET:
-   COMPUTE: True
+   COMPUTE: False
    LABEL: appraisal_stressfulness_event_mean
-   ALL_LABELS: [PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean, JCQ_coworker_support_mean, appraisal_stressfulness_period_mean]
+   ALL_LABELS: [appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean]
    # PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
    # JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean
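For orientation, a minimal sketch (not RAPIDS code; the package names are copied from the CUSTOM_CATEGORIES entries in the diff above, the input frame is hypothetical) of how such custom categories translate into per-category event counts:

import pandas as pd

# Custom categories exactly as listed in the new config above.
CUSTOM_CATEGORIES = {
    "social_media": ["com.google.android.youtube", "com.snapchat.android",
                     "com.instagram.android", "com.zhiliaoapp.musically", "com.facebook.katana"],
    "dating": ["com.tinder", "com.relance.happycouple", "com.kiwi.joyride"],
}

def count_events_per_custom_category(events: pd.DataFrame) -> pd.Series:
    # events: one row per foreground event, with a package_name column (illustrative input).
    package_to_category = {pkg: cat for cat, pkgs in CUSTOM_CATEGORIES.items() for pkg in pkgs}
    return events["package_name"].map(package_to_category).value_counts()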

File diff suppressed because it is too large.

data/external/play_store_categories_count.csv

@ -1,45 +0,0 @@
genre,n
System,261
Tools,96
Productivity,71
Health & Fitness,60
Finance,54
Communication,39
Music & Audio,39
Shopping,38
Lifestyle,33
Education,28
News & Magazines,24
Maps & Navigation,23
Entertainment,21
Business,18
Travel & Local,18
Books & Reference,16
Social,16
Weather,16
Food & Drink,14
Sports,14
Other,13
Photography,13
Puzzle,13
Video Players & Editors,12
Card,9
Casual,9
Personalization,8
Medical,7
Board,5
Strategy,4
House & Home,3
Trivia,3
Word,3
Adventure,2
Art & Design,2
Auto & Vehicles,2
Dating,2
Role Playing,2
STRAW,2
Simulation,2
"Board,Brain Games",1
"Entertainment,Music & Video",1
Parenting,1
Racing,1

environment.yml

@@ -1,30 +1,165 @@
 name: rapids
 channels:
   - conda-forge
-  - defaults
 dependencies:
-  - auto-sklearn
-  - hmmlearn
-  - imbalanced-learn
-  - jsonschema
-  - lightgbm
-  - matplotlib
-  - numpy
-  - pandas
-  - peakutils
-  - pip
-  - plotly
-  - python-dateutil
-  - pytz
-  - pywavelets
-  - pyyaml
-  - scikit-learn
-  - scipy
-  - seaborn
-  - setuptools
-  - bioconda::snakemake
-  - bioconda::snakemake-minimal
-  - tqdm
-  - xgboost
+  - _libgcc_mutex=0.1
+  - _openmp_mutex=4.5
+  - _py-xgboost-mutex=2.0
+  - appdirs=1.4.4
+  - arrow=0.16.0
+  - asn1crypto=1.4.0
+  - astropy=4.2.1
+  - attrs=20.3.0
+  - binaryornot=0.4.4
+  - blas=1.0
+  - brotlipy=0.7.0
+  - bzip2=1.0.8
+  - ca-certificates=2021.7.5
+  - certifi=2021.5.30
+  - cffi=1.14.4
+  - chardet=3.0.4
+  - click=7.1.2
+  - colorama=0.4.4
+  - cookiecutter=1.6.0
+  - cryptography=3.3.1
+  - datrie=0.8.2
+  - docutils=0.16
+  - future=0.18.2
- gitdb=4.0.5
- gitdb2=4.0.2
- gitpython=3.1.11
- idna=2.10
- imbalanced-learn=0.6.2
- importlib-metadata=2.0.0
- importlib_metadata=2.0.0
- intel-openmp=2019.4
- jinja2=2.11.2
- jinja2-time=0.2.0
- joblib=1.0.0
- jsonschema=3.2.0
- ld_impl_linux-64=2.36.1
- libblas=3.8.0
- libcblas=3.8.0
- libcxx=10.0.0
- libcxxabi=10.0.0
- libedit=3.1.20191231
- libffi=3.3
- libgcc-ng=11.2.0
- libgfortran
- libgfortran
- libgfortran
- liblapack=3.8.0
- libopenblas=0.3.10
- libstdcxx-ng=11.2.0
- libxgboost=0.90
- libzlib=1.2.11
- lightgbm=3.1.1
- llvm-openmp=10.0.0
- markupsafe=1.1.1
- mkl
- mkl-service=2.3.0
- mkl_fft=1.2.0
- mkl_random=1.1.1
- more-itertools=8.6.0
- ncurses=6.2
- numpy=1.19.2
- numpy-base=1.19.2
- openblas=0.3.4
- openssl=1.1.1k
- pandas=1.1.5
- pbr=5.5.1
- pip=20.3.3
- plotly=4.14.1
- poyo=0.5.0
- psutil=5.7.2
- py-xgboost=0.90
- pycparser=2.20
- pyerfa=1.7.1.1
- pyopenssl=20.0.1
- pysocks=1.7.1
- python=3.7.9
- python-dateutil=2.8.1
- python_abi=3.7
- pytz=2020.4
- pyyaml=5.3.1
- readline=8.0
- requests=2.25.0
- retrying=1.3.3
- setuptools=51.0.0
- six=1.15.0
- smmap=3.0.4
- smmap2=3.0.1
- sqlite=3.33.0
- threadpoolctl=2.1.0
- tk=8.6.10
- tqdm=4.62.0
- urllib3=1.25.11
- wheel=0.36.2
- whichcraft=0.6.1
- wrapt=1.12.1
- xgboost=0.90
- xz=5.2.5
- yaml=0.2.5
- zipp=3.4.0
- zlib=1.2.11
  - pip:
-   - biosppy
-   - cr_features>=0.2
+   - amply==0.1.4
+   - auto-sklearn==0.14.7
- bidict==0.22.0
- biosppy==0.8.0
- build==0.8.0
- cached-property==1.5.2
- cloudpickle==2.2.0
- configargparse==0.15.1
- configspace==0.4.21
- cr-features==0.2.1
- cycler==0.11.0
- cython==0.29.32
- dask==2022.2.0
- decorator==4.4.2
- distributed==2022.2.0
- distro==1.7.0
- emcee==3.1.2
- fonttools==4.33.2
- fsspec==2022.8.2
- h5py==3.6.0
- heapdict==1.0.1
- hmmlearn==0.2.7
- ipython-genutils==0.2.0
- jupyter-core==4.6.3
- kiwisolver==1.4.2
- liac-arff==2.5.0
- locket==1.0.0
- matplotlib==3.5.1
- msgpack==1.0.4
- nbformat==5.0.7
- opencv-python==4.5.5.64
- packaging==21.3
- partd==1.3.0
- peakutils==1.3.3
- pep517==0.13.0
- pillow==9.1.0
- pulp==2.4
- pynisher==0.6.4
- pyparsing==2.4.7
- pyrfr==0.8.3
- pyrsistent==0.15.5
- pywavelets==1.3.0
- ratelimiter==1.2.0.post0
- scikit-learn==0.24.2
- scipy==1.7.3
- seaborn==0.11.2
- shortuuid==1.0.8
- smac==1.2
- snakemake==5.30.2
- sortedcontainers==2.4.0
- tblib==1.7.0
- tomli==2.0.1
- toolz==0.12.0
- toposort==1.5
- tornado==6.2
- traitlets==4.3.3
- typing-extensions==4.2.0
- zict==2.2.0
prefix: /opt/conda/envs/rapids


@@ -247,8 +247,6 @@ rule empatica_readable_datetime:
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/raw/{pid}/empatica_{sensor}_with_datetime.csv"
-   resources:
-       mem_mb=50000
    script:
        "../src/data/datetime/readable_datetime.R"


@@ -29,17 +29,24 @@ get_genre <- function(apps){
apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
genre_catalogue <- data.frame()
catalogue_source <- snakemake@params[["catalogue_source"]]
+package_names_hashed <- snakemake@params[["package_names_hashed"]]
update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))
+if (length(package_names_hashed) == 0) {package_names_hashed <- FALSE}

if(nrow(apps) > 0){
  if(catalogue_source == "GOOGLE"){
    apps_with_genre <- apps %>% mutate(genre = NA_character_)
  } else if(catalogue_source == "FILE"){
    genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
+   if (package_names_hashed) {
+     apps_with_genre <- left_join(apps, genre_catalogue, by = "package_hash")
+   } else {
      apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
    }
+ }

  if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
    apps_without_genre <- (apps_with_genre %>% filter(is.na(genre)) %>% distinct(package_name))$package_name
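The added branch switches the catalogue join key when package names arrive hashed. The real implementation is the R code above; a rough pandas equivalent, assuming a catalogue with package_hash / package_name and genre columns:

import pandas as pd

def join_genres(apps: pd.DataFrame, genre_catalogue: pd.DataFrame, package_names_hashed: bool) -> pd.DataFrame:
    # Join on the hashed identifier when the raw data only carries package_hash,
    # otherwise fall back to the plain package_name (mirrors the left_join above).
    key = "package_hash" if package_names_hashed else "package_name"
    return apps.merge(genre_catalogue[[key, "genre"]], on=key, how="left")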


@@ -136,9 +136,8 @@ def patch_ibi_with_bvp(ibi_data, bvp_data):
    # Begin with the cr-features part
    try:
        ibi_data, ibi_start_timestamp = empatica2d_to_array(ibi_data_file)
-   except (IndexError, KeyError) as e:
+   except IndexError as e:
        # Checks whether IBI.csv is empty
-       # It may raise a KeyError if df is empty here: startTimeStamp = df.time[0]
        df_test = pd.read_csv(ibi_data_file, names=['timings', 'inter_beat_interval'], header=None)
        if df_test.empty:
            df_test['timestamp'] = df_test['timings']


@@ -120,7 +120,7 @@ def straw_cleaning(sensor_data_files, provider):
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]

    if provider["COLS_VAR_THRESHOLD"]:
-       features.drop(features.std(numeric_only=True)[features.std(numeric_only=True) == 0].index.values, axis=1, inplace=True)
+       features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)

    fe5 = features.copy()

@@ -134,7 +134,7 @@ def straw_cleaning(sensor_data_files, provider):
        valid_features = features[numerical_cols].loc[:, features[numerical_cols].isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * features[numerical_cols].shape[0]]

        corr_matrix = valid_features.corr().abs()
-       upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+       upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]
        features.drop(to_drop, axis=1, inplace=True)

@@ -150,15 +150,13 @@ def straw_cleaning(sensor_data_files, provider):
    return features

-def impute(df, method='zero'):
def k_nearest(df):
    pd.set_option('display.max_columns', None)
    imputer = KNNImputer(n_neighbors=3)
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

+def impute(df, method='zero'):
    return {
        'zero': df.fillna(0),
        'high_number': df.fillna(1500),

@@ -167,7 +165,6 @@ def impute(df, method='zero'):
        'knn': k_nearest(df)
    }[method]

def graph_bf_af(features, phase_name, plt_flag=False):
    if plt_flag:
        sns.set(rc={"figure.figsize":(16, 8)})
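These edits track the older pandas/NumPy versions pinned in the environment.yml above: numeric_only is dropped from std() and bool goes back to the np.bool alias. A standalone sketch of the same variance and correlation drop, written so it runs on both the pinned and current library versions (threshold value illustrative):

import numpy as np
import pandas as pd

def drop_low_value_features(features: pd.DataFrame, corr_threshold: float = 0.95) -> pd.DataFrame:
    numeric = features.select_dtypes(include="number")
    # Drop zero-variance columns.
    features = features.drop(columns=numeric.std()[numeric.std() == 0].index)
    # Drop one column out of every highly correlated pair; astype(bool) avoids the removed np.bool alias.
    corr = features.select_dtypes(include="number").corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [c for c in upper.columns if (upper[c] > corr_threshold).any()]
    return features.drop(columns=to_drop)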


@@ -146,7 +146,7 @@ def straw_cleaning(sensor_data_files, provider, target):

    # (5) REMOVE COLS WHERE VARIANCE IS 0
    if provider["COLS_VAR_THRESHOLD"]:
-       features.drop(features.std(numeric_only=True)[features.std(numeric_only=True) == 0].index.values, axis=1, inplace=True)
+       features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)

    graph_bf_af(features, "6variance_drop")

@@ -200,7 +200,7 @@ def straw_cleaning(sensor_data_files, provider, target):
        valid_features = features[numerical_cols].loc[:, features[numerical_cols].isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * features[numerical_cols].shape[0]]

        corr_matrix = valid_features.corr().abs()
-       upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+       upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]

        # sns.heatmap(corr_matrix, cmap="YlGnBu")

@@ -245,14 +245,12 @@ def straw_cleaning(sensor_data_files, provider, target):
    return features

-def impute(df, method='zero'):
def k_nearest(df):
    imputer = KNNImputer(n_neighbors=3)
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

+def impute(df, method='zero'):
    return {
        'zero': df.fillna(0),
        'high_number': df.fillna(1500),

@@ -261,7 +259,6 @@ def impute(df, method='zero'):
        'knn': k_nearest(df)
    }[method]

def graph_bf_af(features, phase_name, plt_flag=False):
    if plt_flag:
        sns.set(rc={"figure.figsize":(16, 8)})


@@ -15,13 +15,13 @@ def extract_second_order_features(intraday_features, so_features_names, prefix="
    so_features = pd.DataFrame()
    #print(intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).nsmallest())
    if "mean" in so_features_names:
-       so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean(numeric_only=True).add_suffix("_SO_mean")], axis=1)
+       so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean().add_suffix("_SO_mean")], axis=1)
    if "median" in so_features_names:
-       so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median(numeric_only=True).add_suffix("_SO_median")], axis=1)
+       so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1)
    if "sd" in so_features_names:
-       so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std(numeric_only=True).fillna(0).add_suffix("_SO_sd")], axis=1)
+       so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().fillna(0).add_suffix("_SO_sd")], axis=1)
    if "nlargest" in so_features_names: # largest 5 -- maybe there is a faster groupby solution?
        for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
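Here the change again just drops numeric_only from the groupby aggregations to match the pinned pandas. A toy example of the second-order aggregation pattern this function implements (made-up window values):

import pandas as pd

intraday = pd.DataFrame({
    "local_segment": ["s1", "s1", "s2"],
    "level_1": [0, 1, 0],          # per-window index, dropped before aggregating
    "meanHr": [71.0, 75.0, 68.0],  # a first-order feature computed per window
})

# One second-order feature per segment: the mean of the per-window values.
so_mean = (intraday.drop("level_1", axis=1)
                   .groupby("local_segment")
                   .mean()
                   .add_suffix("_SO_mean"))
print(so_mean)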


@@ -26,7 +26,7 @@ def calculate_empatica_data_yield(features): # TODO

    # Assigns 1 to values that are over 1 (in case of windows not being filled fully)
    features[empatica_data_yield_cols] = features[empatica_data_yield_cols].apply(lambda x: [y if y <= 1 or np.isnan(y) else 1 for y in x])
-   features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1, numeric_only=True).fillna(0)
+   features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1).fillna(0)
    features.drop(empatica_data_yield_cols, axis=1, inplace=True) # In case of if the advanced operations will later not be needed (e.g., weighted average)

    return features

src/features/phone_esm/straw/esm_activities.py

@ -0,0 +1,292 @@
import pandas as pd
import numpy as np
id2qc = { 44:["What have you mainly been doing within the last 10 minutes?",
"Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
"Kaj ste v glavnem počeli v zadnjih 10 minutah?"],
45:["What type of individual work?",
"Wat voor soort individueel werk?",
"Kakšno vrsto samostojnega dela ste opravljali?"],
46:["How did you work with others?",
"Hoe heb je met anderen gewerkt?",
"Kako ste sodelovali z drugimi?"],
47:["What type of break?",
"Wat voor soort pauze?",
"Kakšno vrsto odmora ste imeli?"],
48:["Where did you travel between?",
"Waar heb je tussen gereisd?",
"Kam ste potovali?"],
49:["Did you use a computer or phone for that?",
"Heb je daarvoor een computer of telefoon gebruikt?",
"Ste za to uporabljali računalnik ali telefon?"],
50:["What kind of an interaction was that?",
"Wat voor interactie was dat?",
"Kakšne vrste sodelovanja je bilo to?"],
51:["How many people were involved besides yourself?",
"Hoeveel mensen waren er behalve jezelf betrokken?",
"Koliko oseb je bilo poleg vas še vpletenih?"],
# 52:["What have you mainly been doing within the last 10 minutes?",
# "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
# "Kaj ste v glavnem počeli v zadnjih 10 minutah?"]
}
qc2id = {v:k for k,values in id2qc.items() for v in values}
next_questions = { 44: [45,46,47,48],
45:[49,49],
46:[50,50],
47:[],
48:[],
49:[],
50:[51,51],
51:[]
#52:[45,46,47,48],
}
def esm_activities_LTM_features(
df_esm_activities_cleaned: pd.DataFrame,
) -> pd.DataFrame:
""" Function for calculating LTM(Last 10 minutes) features of questionnaire answers. It first corrects the question ids according
to esm_instructions and the updated corpus of question_ids. It then processes each LTM question chain to
find relevant social properties given by the answers such as the number of people interacted with, the formality and whether the socializing was done in person.
Parameters
----------
df_esm_activities_cleaned: pd.DataFrame
A cleaned up dataframe, which must include esm_instructions, esm_user_answer_numeric.
Returns
-------
df_esm_activities_cleaned: pd.DataFrame
The same dataframe with columns which contain:
["correct_ids"] - Corrected question_ids
["ans_seq"] - For each LTM question, the sequence of numerical user answers pertaining to this chain of questions.
["n_others","inperson","formal"]- Properties of known potential social encounters as given by process_answers().
"""
#TODO: preprocess questionnaires
#DONE: correct ids
correct_id_df = correct_activity_qids(df_esm_activities_cleaned)
#DONE: process subquestions
ids = correct_id_df["correct_ids"]
main_q_indices = ids[ids==44].index
q_group = []
i=-1
for id in ids:
if(id==44):
i=i+1
q_group.append(i)
correct_id_df["q_group"] = q_group
ans_seq = correct_id_df.groupby("q_group").agg({"esm_user_answer_numeric":lambda group: list(group)}).rename(columns={"esm_user_answer_numeric":"ans_seq"})
ans_seq.set_index(main_q_indices,inplace=True)
# correct_id_df["ans_seq"] = [[] for i in range(len(correct_id_df))]
# correct_id_df["ans_seq"].loc[main_q_indices] = correct_id_df.groupby("q_group").agg({"esm_user_answer_numeric":lambda group: list(group)}).values.reshape(-1)
#DONE: find types of status for each main question: socializing:[none,irl,online,unknown], num_people:[0,1,2,>2,unknown]
processed_ans_df = process_answers(ans_seq)
# df_out = df_esm_activities_cleaned.join(test)
return df_esm_activities_cleaned.join(processed_ans_df)
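# Usage sketch (mirrors the test() helper at the bottom of this file; the participant file and
# questionnaire_id 97 are the ones that helper already assumes):
#
#   from esm_preprocess import preprocess_esm, clean_up_esm
#   df = clean_up_esm(preprocess_esm(pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")))
#   ltm = esm_activities_LTM_features(df[df["questionnaire_id"] == 97])
#   print(ltm[["correct_ids", "n_others", "inperson", "formal"]].head())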
"""
possible answer sequences for LTM question chains
#alone
0,0,0 not social
0,0,1 not social
0,1,0 not social
0,1,1 not social
0,2 not social
0,3 not social
0,4 not social
0,5 not social
0,6 not social
#w/ others
1,0,0,0 1 irl
1,0,0,1 2 irl
1,0,0,2 3+ irl
1,0,1,0 1 irl
1,0,1,1 2 irl
1,0,1,2 3+ irl
1,1,0,0 1 online
1,1,0,1 2 online
1,1,0,2 3+ online
1,1,1,0 1 online
1,1,1,1 2 online
1,1,1,2 3+ online
1,2 positive likely to be more than 2
1,3 positive
#break
2,0 ambiguous
2,1 positive irl
2,2 ambiguous
2,3 ambiguous
#transit
3,0 ambiguous
3,1 ambiguous
3,2 ambiguous
"""
#TODO: docstring
def process_answers(df:pd.DataFrame)-> pd.DataFrame:
""" Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
> n_others: Number of other people interacted with in the last 10 minutes
- -1: Number is positive but unknown exactly
- 0: No people/alone
- 1: One extra person
- 2: Two extra people
- 3: More than two extra people
- NaN : Can't say anything with enough certainty.
> inperson:
- True/False: The interaction in question was/wasn't in person.
- None: Can't say anything with enough certainty.
> formal:
- True/False: The interaction in question was/wasn't formal.
- None: Can't say anything with enough certainty.
Args:
df (pd.DataFrame): A dataframe with an "ans_seq" column holding one numeric answer sequence per LTM question chain.
Returns:
pd.DataFrame: A dataframe with the derived columns "n_others", "inperson" and "formal", aligned with the input index.
"""
properties = {"n_others":[],
"inperson":[],
"formal":[]}
for ans_seq in df["ans_seq"]:
n_other = None
inperson = None
formal = None
if(ans_seq[0]==0):
n_other = 0
elif(ans_seq[0]==1):
if(ans_seq[1]==3):
n_other = -1 # answered "Other" but did work with other people
elif(ans_seq[1]==2):
n_other = 3 #assuming more than 2 people participated in the lecture or presentation
elif(ans_seq[1] in [0,1]):
inperson = ans_seq[1]==0 #ans[1]==0, means irl interaction, ==1 means online or phone
formal = ans_seq[2]==0#0 means formal
n_other = ans_seq[3]+1 #ans3 is on [0,2] so we add 1 to make it [1,3]
elif(ans_seq[0]==2):
formal = False#assuming one does not have a formal meeting during break time
if(ans_seq[1]==1):
n_other = -1
inperson = True
# if not 1, then we don't know anything for sure
elif(ans_seq[0]==3):
# we can't say whether the person was carpooling or driving alone.
pass
properties["n_others"].append(n_other)
properties["inperson"].append(inperson)
properties["formal"].append(formal)
#df = df.join(pd.DataFrame(properties,index=df.index))
return pd.DataFrame(properties,index=df.index)
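# Worked example for the answer-sequence rules above (hypothetical sequences, following the
# mapping documented in the module-level comment block): [1, 0, 0, 1] is "worked with others,
# in person, formal, with two others", [0, 2] is a solo activity, [2, 1] is a break spent with
# someone in person.
def _example_process_answers() -> pd.DataFrame:
    example = pd.DataFrame({"ans_seq": [[1, 0, 0, 1], [0, 2], [2, 1]]})
    # Expected result: n_others = [2, 0, -1], inperson = [True, None, True], formal = [True, None, False]
    return process_answers(example)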
def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
"""_summary_
Args:
df (pd.DataFrame): _description_
Returns:
pd.DataFrame: Input dataframe with added column "correct_ids"
"""
df["correct_ids"] = df["esm_instructions"].apply(lambda x: qc2id[x])
return df
def process_answers_aggregation(df:pd.core.groupby.generic.DataFrameGroupBy)-> pd.core.groupby.generic.DataFrameGroupBy:
""" Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
> n_others: Number of other people interacted with in the last 10 minutes
- -1: Number is positive but unknown exactly
- 0: No people/alone
- 1: One extra person
- 2: Two extra people
- 3: More than two extra people
- NaN : Can't say anything with enough certainty.
> inperson:
- True/False: The interaction in question was/wasn't in person.
- None: Can't say anything with enough certainty.
> formal:
- True/False: The interaction in question was/wasn't formal.
- None: Can't say anything with enough certainty.
Args:
df (pd.DataFrame): One local_segment group of cleaned ESM answers (as produced by groupby("local_segment")), containing esm_user_answer_numeric.
Returns:
pd.DataFrame: The same group with the added columns "n_others", "inperson" and "formal".
"""
#print("=======================\nAPPLY START:\ndf=",df.columns,df.local_segment)
properties = {"n_others":[],
"inperson":[],
"formal":[]}
ans_seq = df["esm_user_answer_numeric"].values
n_other = None
inperson = None
formal = None
if(ans_seq[0]==0):
n_other = 0
elif(ans_seq[0]==1):
if(ans_seq[1]==3):
n_other = -1 # answered "Other" but did work with other people
elif(ans_seq[1]==2):
n_other = 3 #assuming more than 2 people participated in the lecture or presentation
elif(ans_seq[1] in [0,1]):
inperson = ans_seq[1]==0 #ans[1]==0, means irl interaction, ==1 means online or phone
formal = ans_seq[2]==0#0 means formal
n_other = ans_seq[3]+1 #ans3 is on [0,2] so we add 1 to make it [1,3]
elif(ans_seq[0]==2):
formal = False#assuming one does not have a formal meeting during break time
if(ans_seq[1]==1):
n_other = -1
inperson = True
# if not 1, then we don't know anything for sure
elif(ans_seq[0]==3):
# we can't say whether the person was carpooling or driving alone.
pass
properties["n_others"].append(n_other)
properties["inperson"].append(inperson)
properties["formal"].append(formal)
df = df.join(pd.DataFrame(properties,index=df.index))
#print("APPLY END:\ndf=",df[["n_others","inperson","formal"]])
return df
#test stuff
def test():
from esm_preprocess import preprocess_esm,clean_up_esm
df = pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")
df = preprocess_esm(df)
df = clean_up_esm(df)
df = df[df["questionnaire_id"]==97]
original = esm_activities_LTM_features(df)
df["local_segment"] = [str(i)+":"+j for i,j in df[["esm_session","device_id"]].values]
temp = df.groupby("local_segment")
temp2 = temp.apply(process_answers_aggregation)
#compare with original function results
selection = original[original["correct_ids"]==44][["n_others", "inperson", "formal"]]
temp_selection = temp2.loc[selection.index]
temp_selection.compare(selection,keep_shape=True,keep_equal =True)
#print out ans_seq processing results
# import json
# i = 0
# for j,ans in correct_id_df[["esm_json","esm_user_answer"]].values:
# obj = json.loads(j)
# text = obj["esm_instructions"]
# if ("10 minut" in text):
# print("---\n",test.ans_seq.iloc[i])
# print(test[["n_others","inperson","formal"]].values[i])
# i = i+1
# print(text,ans)
#test()

src/features/phone_esm/straw/main.py

@@ -1,4 +1,8 @@
import pandas as pd
+import sys
+import warnings
+sys.path.append('src/features/phone_esm/straw')
+from esm_activities import esm_activities_LTM_features,process_answers_aggregation

QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,

@@ -39,23 +43,49 @@ QUESTIONNAIRE_IDS = {
def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    esm_data = pd.read_csv(sensor_data_files["sensor_data"])
    requested_features = provider["FEATURES"]
    # name of the features this function can compute
    requested_scales = provider["SCALES"]
    base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
-                           "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
+                           "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge","activities_n_others","activities_inperson","activities_formal"]
    #TODO Check valid questionnaire and feature names.
    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))
    esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
    if not esm_data.empty:
        esm_data = filter_data_by_segment(esm_data, time_segment)
        if not esm_data.empty:
            esm_features = pd.DataFrame()
            for scale in requested_scales:
                questionnaire_id = QUESTIONNAIRE_IDS[scale]
                mask = esm_data["questionnaire_id"] == questionnaire_id
#print(esm_data.loc[mask].head())
#print(time_segment)
if not mask.any():
temp = sensor_data_files["sensor_data"]
warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}-{time_segment}",RuntimeWarning)
continue
#TODO: calculation of LTM features
if scale=="activities":
requested_subset = [req for req in requested_features if req.startswith("activities")]
if not bool(requested_subset):
continue
# ltm_features = esm_activities_LTM_features(esm_data.loc[mask])
# print(esm_data["esm_json"].values)
# print(mask)
# print(esm_data.loc[mask])
# #ltm_features = ltm_features[ltm_features["correct_ids"==44]]
#print(esm_data.loc[mask]["local_segment"])
ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
#print("PRINTING ltm_features:\n",ltm_features)
ltm_features.rename(columns={"n_others":"activities_n_others","inperson":"activities_inperson","formal":"activities_formal"},inplace=True)
esm_features[requested_subset] = ltm_features.groupby("local_segment").first()[requested_subset]
#print(esm_features.columns)
#print("PRINTING esm_features after rename:\n",ltm_features)
#FIXME: it might be an issue that im calculating for whole time segment and not grouping by "local segment"
#print("~~~~~~~~~~~~~~~~~~~~~~~~===============================~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n LTM FEATURES STORED... AFTER RETURN:\n",ltm_features,esm_features[["activities_"+req for req in requested_subset]])
if "mean" in features_to_compute:
esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
#TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
@ -64,3 +94,15 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
return esm_features
def test_main():
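# Ad-hoc check, not part of the regular pipeline: computes straw ESM features for one
# participant/segment using the standalone filter_data_by_segment copy from temp_help.
# The participant (p069) and segment name below are simply the values hard-coded for this test.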
import temp_help
provider = {
"FEATURES":["mean","activities_n_others","activities_inperson","activities_formal"],
"SCALES":['activities']
}
sensor_data_files = {"sensor_data":"data/interim/p069/phone_esm_clean.csv"}
s_feat = straw_features(sensor_data_files,"straw_event_stress_event_p069_110",provider,temp_help.filter_data_by_segment)
print(s_feat)
#test_main()

View File

@ -67,7 +67,7 @@ def extract_ers(esm_df):
segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]
if segmenting_method in ["10_before", "30_before", "90_before"]: # takes a 10-, 30-, or 90-minute period before the questionnaire + the duration of the questionnaire
""" The '10, 30 and 90 minutes before' methods share the same fundamental logic, with a couple of deviations that are explained below.
All of them take an x-minute period before the questionnaire that is summed with the questionnaire duration.
All questionnaire durations over 15 minutes are excluded from the querying.
@ -79,7 +79,18 @@ def extract_ers(esm_df):
extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True) # ensure that the longest questionnaire answering duration is 15 min
extracted_ers["shift_direction"] = -1
if segmenting_method == "10_before":
"""The '10 minutes before' method simply takes the 10 minutes before the questionnaire and sums them with the questionnaire duration.
The timestamps are formatted with the help of the format_timestamp() method.
"""
time_before_questionnaire = 10 * 60 # in seconds (10 minutes)
#TODO: split into smaller segments by manipulating length and shift
extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
extracted_ers["shift"] = time_before_questionnaire
extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))
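# Illustrative example (assumed numbers, not taken from the study data): if a participant
# needed 180 s to answer the questionnaire, then length = format_timestamp(180 + 600) and
# shift = format_timestamp(600), so with shift_direction = -1 the resulting event segment
# covers the 10 minutes before the questionnaire plus the time spent answering it.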
elif segmenting_method == "30_before":
"""The '30 minutes before' method simply takes the 30 minutes before the questionnaire and sums them with the questionnaire duration.
The timestamps are formatted with the help of the format_timestamp() method.
"""
@ -140,8 +151,8 @@ def extract_ers(esm_df):
# Extract 3 targets that will be transferred in the csv file to the cleaning script.
se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})
# All relevant features are joined with an inner join to remove standalone columns (e.g., the stressfulness event target has a larger count)
extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \

View File

@ -0,0 +1,70 @@
"""This file is TEMPORARY and intended for testing main.py
"""
def filter_data_by_segment(data, time_segment):
data.dropna(subset=["assigned_segments"], inplace=True)
if(data.shape[0] == 0): # data is empty
data["local_segment"] = data["timestamps_segment"] = None
return data
datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamps_regex = "[0-9]{13}"
segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
data = data.drop(columns=["assigned_segments"])
data = data.dropna(subset = ["local_segment"])
if(data.shape[0] == 0): # there are no rows belonging to time_segment after dropping NAs
data["timestamps_segment"] = None
else:
data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True)
# chunk episodes
if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
data = chunk_episodes(data)
return data
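# Illustrative example (made-up values): an assigned_segments entry such as
# "[straw_event_stress_event_p069_110#2022-06-01 10:00:00,2022-06-01 10:40:00;1654077600000,1654080000000]"
# matches segment_regex for time_segment == "straw_event_stress_event_p069_110" and is split into
# local_segment = "straw_event_stress_event_p069_110#2022-06-01 10:00:00,2022-06-01 10:40:00" and
# timestamps_segment = "1654077600000,1654080000000".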
def chunk_episodes(sensor_episodes):
import copy
import pandas as pd
# Deduplicate episodes
# Drop rows where segments of start_timestamp and end_timestamp are the same
sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")
# Delete useless columns
for drop_col in ["local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
del sensor_episodes[drop_col]
# Avoid SettingWithCopyWarning
sensor_episodes = sensor_episodes.copy()
# Unix timestamp for current segment in milliseconds
sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype(int)
# Compute chunked timestamp
sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)
# Compute duration: intersection of current row and segment
sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)
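# Illustrative example (made-up times): an episode running 09:55-10:50 inside a segment
# spanning 10:00-10:40 is chunked to 10:00-10:40, so its duration is
# (chunked_end_timestamp - chunked_start_timestamp) / 60000 = 40 minutes of overlap.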
# Merge episodes
cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]
sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()
merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()
merged_sensor_episodes.reset_index(inplace=True)
# Compute datetime
merged_sensor_episodes["local_start_date_time"] = pd.to_datetime(merged_sensor_episodes["start_timestamp"], unit="ms", utc=True)
merged_sensor_episodes["local_start_date_time"] = pd.concat([data["local_start_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
merged_sensor_episodes["local_end_date_time"] = pd.to_datetime(merged_sensor_episodes["end_timestamp"], unit="ms", utc=True)
merged_sensor_episodes["local_end_date_time"] = pd.concat([data["local_end_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
return merged_sensor_episodes
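# A minimal smoke test of chunk_episodes() with made-up values: two raw episodes (09:50-10:05
# and 10:10-10:50 UTC) that both overlap the same 40-minute segment (10:00-10:40 UTC) and are
# therefore merged into a single row whose duration is the summed overlap (5 min + 30 min = 35 min).
if __name__ == "__main__":
    import pandas as pd
    example_episodes = pd.DataFrame({
        "device_id": ["dev1", "dev1"],
        "local_segment": ["straw_event_stress_event_p069_110"] * 2,
        "local_timezone": ["Europe/Ljubljana"] * 2,
        "local_date_time": [None] * 2, "local_date": [None] * 2, "local_time": [None] * 2,
        "local_hour": [None] * 2, "local_minute": [None] * 2,
        "timestamps_segment": ["1654077600000,1654080000000"] * 2,
        "start_timestamp": [1654077000000, 1654078200000],
        "end_timestamp": [1654077900000, 1654080600000],
    })
    print(chunk_episodes(example_episodes))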

View File

@ -115,7 +115,7 @@ cluster_on = provider["CLUSTER_ON"]
strategy = provider["INFER_HOME_LOCATION_STRATEGY"]
days_threshold = provider["MINIMUM_DAYS_TO_DETECT_HOME_CHANGES"]
if not location_data.timestamp.is_monotonic:
location_data.sort_values(by=["timestamp"], inplace=True)
location_data["duration_in_seconds"] = -1 * location_data.timestamp.diff(-1) / 1000
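# Illustrative example (made-up timestamps): for consecutive location fixes at
# t = 1654077600000 ms and t = 1654077660000 ms, diff(-1) gives -60000 for the first row,
# so duration_in_seconds = -1 * -60000 / 1000 = 60 seconds until the next fix
# (the last row ends up with NaN).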

View File

@ -14,6 +14,7 @@ def import_path(path):
sys.modules[module_name] = module
return module
#TODO: check why segments change to int
def filter_data_by_segment(data, time_segment):
data.dropna(subset=["assigned_segments"], inplace=True)
if(data.shape[0] == 0): # data is empty
@ -151,6 +152,7 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
else:
segment_colums = pd.DataFrame()
print(sensor_features, sensor_features['local_segment'])
sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
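# Illustrative example (made-up values): a local_segment such as
# "straw_event_stress_event_p069_110#2022-06-01 10:00:00,2022-06-01 10:40:00"
# is split by the pattern above into local_segment_label = "straw_event_stress_event_p069_110",
# local_segment_start_datetime = "2022-06-01 10:00:00" and local_segment_end_datetime = "2022-06-01 10:40:00".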