Compare commits
6 Commits
master ... sociality-

Author | SHA1 | Date
---|---|---
Marcel Martinšek | fb8868b77d |
Marcel Martinšek | da77f7476c |
Marcel Martinšek | 4db8810d08 |
junos | 7832d7d098 |
Marcel Martinšek | e7bb9d6702 |
Marcel Martinšek | 689f677a3e |
@@ -100,9 +100,6 @@ data/external/*
 !/data/external/wiki_tz.csv
 !/data/external/main_study_usernames.csv
 !/data/external/timezone.csv
-!/data/external/play_store_application_genre_catalogue.csv
-!/data/external/play_store_categories_count.csv
-
 
 data/raw/*
 !/data/raw/.gitkeep
config.yaml
@@ -26,7 +26,7 @@ TIME_SEGMENTS: &time_segments
   INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
   TAILORED_EVENTS: # Only relevant if TYPE=EVENT
     COMPUTE: True
-    SEGMENTING_METHOD: "30_before" # 30_before, 90_before, stress_event
+    SEGMENTING_METHOD: "10_before" # 30_before, 90_before, stress_event
     INTERVAL_OF_INTEREST: 10 # duration of event of interest [minutes]
     IOI_ERROR_TOLERANCE: 5 # interval of interest error tolerance (before and after IOI) [minutes]

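For orientation, a minimal sketch of how the three tailored-event parameters above could combine into a time window around a reported event. The helper name and the exact windowing rule are assumptions for illustration, not the pipeline's own segmentation code.

```python
from datetime import datetime, timedelta

def tailored_event_window(event_time: datetime,
                          segmenting_method: str = "10_before",   # value set in this branch
                          interval_of_interest: int = 10,          # minutes
                          ioi_error_tolerance: int = 5             # minutes
                          ) -> tuple:
    """Illustrative only: derive a (start, end) window for one ESM event.

    Assumes SEGMENTING_METHOD values like "10_before"/"30_before"/"90_before"
    encode how many minutes before the event the interval of interest starts;
    the "stress_event" variant is not handled here.
    """
    minutes_before = int(segmenting_method.split("_")[0])  # "10_before" -> 10
    start = event_time - timedelta(minutes=minutes_before + ioi_error_tolerance)
    end = start + timedelta(minutes=interval_of_interest + 2 * ioi_error_tolerance)
    return start, end

# Example: an event reported at 14:30 with the new "10_before" setting
print(tailored_event_window(datetime(2022, 5, 4, 14, 30)))
```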
@@ -91,7 +91,7 @@ PHONE_ACTIVITY_RECOGNITION:
   EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode.
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
       ACTIVITY_CLASSES:
         STATIONARY: ["still", "tilting"]
@@ -104,9 +104,9 @@ PHONE_APPLICATIONS_CRASHES:
   CONTAINER: applications_crashes
   APPLICATION_CATEGORIES:
     CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
-    CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
+    CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
-    UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
+    UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
-    SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
+    SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
   PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD

 # See https://www.rapids.science/latest/features/phone-applications-foreground/
@@ -114,32 +114,24 @@ PHONE_APPLICATIONS_FOREGROUND:
   CONTAINER: applications
   APPLICATION_CATEGORIES:
     CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
-    CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
-    # Refer to data/external/play_store_categories_count.csv for a list of categories (genres) and their frequency.
+    CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
+    PACKAGE_NAMES_HASHED: True
-    UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
+    UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
-    SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
+    SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       INCLUDE_EPISODE_FEATURES: True
-      SINGLE_CATEGORIES: ["Productivity", "Tools", "Communication", "Education", "Social"]
+      SINGLE_CATEGORIES: ["all", "email"]
       MULTIPLE_CATEGORIES:
-        games: ["Puzzle", "Card", "Casual", "Board", "Strategy", "Trivia", "Word", "Adventure", "Role Playing", "Simulation", "Board, Brain Games", "Racing"]
-        social: ["Communication", "Social", "Dating"]
-        productivity: ["Tools", "Productivity", "Finance", "Education", "News & Magazines", "Business", "Books & Reference"]
-        health: ["Health & Fitness", "Lifestyle", "Food & Drink", "Sports", "Medical", "Parenting"]
-        entertainment: ["Shopping", "Music & Audio", "Entertainment", "Travel & Local", "Photography", "Video Players & Editors", "Personalization", "House & Home", "Art & Design", "Auto & Vehicles", "Entertainment,Music & Video",
-                        "Puzzle", "Card", "Casual", "Board", "Strategy", "Trivia", "Word", "Adventure", "Role Playing", "Simulation", "Board, Brain Games", "Racing" # Add all games.
-                        ]
-        maps_weather: ["Maps & Navigation", "Weather"]
+        social: ["socialnetworks", "socialmediatools"]
+        entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"]
       CUSTOM_CATEGORIES:
-        SINGLE_APPS: []
-        EXCLUDED_CATEGORIES: ["System", "STRAW"]
-        # Note: A special option here is "is_system_app".
-        # This excludes applications that have is_system_app = TRUE, which is a separate column in the table.
-        # However, all of these applications have been assigned System category.
-        # I will therefore filter by that category, which is a superset and is more complete. JL
-        EXCLUDED_APPS: []
+        social_media: ["com.google.android.youtube", "com.snapchat.android", "com.instagram.android", "com.zhiliaoapp.musically", "com.facebook.katana"]
+        dating: ["com.tinder", "com.relance.happycouple", "com.kiwi.joyride"]
+        SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
+        EXCLUDED_CATEGORIES: []
+        EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] # TODO list system apps?
       FEATURES:
         APP_EVENTS: ["countevent", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
         APP_EPISODES: ["countepisode", "minduration", "maxduration", "meanduration", "sumduration"]
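The branch swaps the Play-Store genre groupings for STRAW-specific category lists and adds CUSTOM_CATEGORIES keyed directly on package names. A hedged sketch of how such a custom mapping could be applied to a foreground-apps table; the function and column names are illustrative, not the pipeline's actual API.

```python
import pandas as pd

# Custom categories from the new config (package names as listed in the diff)
CUSTOM_CATEGORIES = {
    "social_media": ["com.google.android.youtube", "com.snapchat.android",
                     "com.instagram.android", "com.zhiliaoapp.musically",
                     "com.facebook.katana"],
    "dating": ["com.tinder", "com.relance.happycouple", "com.kiwi.joyride"],
}

def label_custom_category(apps: pd.DataFrame, package_col: str = "package_name") -> pd.DataFrame:
    """Illustrative helper: tag each row with its custom category, if any."""
    lookup = {pkg: cat for cat, pkgs in CUSTOM_CATEGORIES.items() for pkg in pkgs}
    apps = apps.copy()
    apps["custom_category"] = apps[package_col].map(lookup)
    return apps

# Toy usage
demo = pd.DataFrame({"package_name": ["com.tinder", "com.example.clock"]})
print(label_custom_category(demo))
```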
@@ -163,7 +155,7 @@ PHONE_BATTERY:
   EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
       SRC_SCRIPT: src/features/phone_battery/rapids/main.py

@@ -177,7 +169,7 @@ PHONE_BLUETOOTH:
       SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R

     DORYAB:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES:
         ALL:
           DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
@@ -198,7 +190,7 @@ PHONE_CALLS:
   CONTAINER: call
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES_TYPE: EPISODES # EVENTS or EPISODES
       CALL_TYPES: [missed, incoming, outgoing]
       FEATURES:
@@ -227,18 +219,19 @@ PHONE_CONVERSATION: # TODO Adapt for speech

 # See https://www.rapids.science/latest/features/phone-data-yield/
 PHONE_DATA_YIELD:
-  SENSORS: [#PHONE_ACCELEROMETER,
-            PHONE_ACTIVITY_RECOGNITION,
-            PHONE_APPLICATIONS_FOREGROUND,
-            PHONE_APPLICATIONS_NOTIFICATIONS,
-            PHONE_BATTERY,
-            PHONE_BLUETOOTH,
-            PHONE_CALLS,
-            PHONE_LIGHT,
-            PHONE_LOCATIONS,
-            PHONE_MESSAGES,
-            PHONE_SCREEN,
-            PHONE_WIFI_VISIBLE]
+  SENSORS: [ #PHONE_ACCELEROMETER,
+            #PHONE_ACTIVITY_RECOGNITION,
+            #PHONE_APPLICATIONS_FOREGROUND,
+            #PHONE_APPLICATIONS_NOTIFICATIONS,
+            #PHONE_BATTERY,
+            PHONE_BLUETOOTH #,
+            #PHONE_CALLS,
+            #PHONE_LIGHT,
+            #PHONE_LOCATIONS,
+            #PHONE_MESSAGES,
+            #PHONE_SCREEN,
+            #PHONE_WIFI_VISIBLE
+            ]
   PROVIDERS:
     RAPIDS:
       COMPUTE: True
@@ -251,9 +244,8 @@ PHONE_ESM:
   PROVIDERS:
     STRAW:
       COMPUTE: True
-      SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
-               "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
-      FEATURES: [mean]
+      SCALES: ["activities"]
+      FEATURES: [activities_n_others, activities_inperson, activities_formal]
       SRC_SCRIPT: src/features/phone_esm/straw/main.py

 # See https://www.rapids.science/latest/features/phone-keyboard/
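(The three new ESM features — activities_n_others, activities_inperson, activities_formal — correspond to the n_others, inperson and formal properties computed by the ESM-activities module added at the end of this compare.)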
@@ -270,7 +262,7 @@ PHONE_LIGHT:
   CONTAINER: light_sensor
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
       SRC_SCRIPT: src/features/phone_light/rapids/main.py

@@ -284,7 +276,7 @@ PHONE_LOCATIONS:

   PROVIDERS:
     DORYAB:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"]
       DBSCAN_EPS: 100 # meters
       DBSCAN_MINSAMPLES: 5
@@ -299,7 +291,7 @@ PHONE_LOCATIONS:
       SRC_SCRIPT: src/features/phone_locations/doryab/main.py

     BARNETT:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
       IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
       MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
@@ -317,7 +309,7 @@ PHONE_MESSAGES:
   CONTAINER: sms
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       MESSAGES_TYPES : [received, sent]
       FEATURES:
         received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
@@ -329,7 +321,7 @@ PHONE_SCREEN:
   CONTAINER: screen
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       REFERENCE_HOUR_FIRST_USE: 0
       IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable
       IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable
@@ -342,7 +334,7 @@ PHONE_SPEECH:
   CONTAINER: speech
   PROVIDERS:
     STRAW:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
       SRC_SCRIPT: src/features/phone_speech/straw/main.py

@@ -360,7 +352,7 @@ PHONE_WIFI_VISIBLE:
   CONTAINER: wifi
   PROVIDERS:
     RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
       SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R

@@ -529,10 +521,10 @@ EMPATICA_ACCELEROMETER:
       FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
       SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
     CR:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
       WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
         WINDOW_LENGTH: 15 # specify window length in seconds
         SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
       SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py
@@ -556,11 +548,11 @@ EMPATICA_TEMPERATURE:
       FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
       SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
     CR:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean",
                  "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
       WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
         WINDOW_LENGTH: 300 # specify window length in seconds
         SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
       SRC_SCRIPT: src/features/empatica_temperature/cr/main.py
@@ -574,14 +566,14 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
       FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
       SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
     CR:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic',
                  'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore',
                  'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio',
                  'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease',
                  'significantDecrease']
       WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
         WINDOW_LENGTH: 60 # specify window length in seconds
         SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero]
         IMPUTE_NANS: True
@@ -600,7 +592,7 @@ EMPATICA_BLOOD_VOLUME_PULSE:
       FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                  'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
       WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
         WINDOW_LENGTH: 300 # specify window length in seconds
         SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
       SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py
@@ -614,12 +606,12 @@ EMPATICA_INTER_BEAT_INTERVAL:
       FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
       SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
     CR:
-      COMPUTE: True
+      COMPUTE: False
       FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                  'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
       PATCH_WITH_BVP: True
       WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
         WINDOW_LENGTH: 300 # specify window length in seconds
         SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
       SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py
@@ -681,12 +673,12 @@ ALL_CLEANING_INDIVIDUAL:
       DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
       DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
       DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
         CORR_THRESHOLD: 0.95
       SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
     STRAW:
-      COMPUTE: True
+      COMPUTE: False
       PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
       PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
       EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
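As a reference for the two knobs above, a compact sketch of the correlated-feature drop, mirroring the straw_cleaning code shown further down in this compare; the function name and signature here are illustrative, not the pipeline's own.

```python
import numpy as np
import pandas as pd

def drop_correlated(features: pd.DataFrame,
                    corr_threshold: float = 0.95,
                    min_overlap: float = 0.5) -> pd.DataFrame:
    """Sketch of DROP_HIGHLY_CORRELATED_FEATURES with the configured thresholds."""
    numeric = features.select_dtypes(include=np.number)
    # Only columns with enough non-missing overlap enter the correlation matrix
    valid = numeric.loc[:, numeric.isna().sum() < min_overlap * numeric.shape[0]]
    corr = valid.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [c for c in upper.columns if (upper[c] > corr_threshold).any()]
    return features.drop(columns=to_drop)
```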
@@ -694,7 +686,7 @@ ALL_CLEANING_INDIVIDUAL:
       COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that contains all (100% of) NaN
       COLS_VAR_THRESHOLD: True
       DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
         CORR_THRESHOLD: 0.95
       STANDARDIZATION: True
@@ -713,12 +705,12 @@ ALL_CLEANING_OVERALL:
       DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
       DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
       DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
         CORR_THRESHOLD: 0.95
       SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
     STRAW:
-      COMPUTE: True
+      COMPUTE: False
       PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
       PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
       EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
@@ -726,7 +718,7 @@ ALL_CLEANING_OVERALL:
       COLS_NAN_THRESHOLD: 0.8 # set to 1 to remove only columns that contains all (100% of) NaN
       COLS_VAR_THRESHOLD: True
       DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
         MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
         CORR_THRESHOLD: 0.95
       STANDARDIZATION: True
@@ -740,7 +732,7 @@ ALL_CLEANING_OVERALL:

 PARAMS_FOR_ANALYSIS:
   BASELINE:
-    COMPUTE: True
+    COMPUTE: False
     FOLDER: data/external/baseline
     CONTAINER: [results-survey637813_final.csv, # Slovenia
                 results-survey358134_final.csv, # Belgium 1
@@ -751,8 +743,8 @@ PARAMS_FOR_ANALYSIS:
     CATEGORICAL_FEATURES: [gender]

   TARGET:
-    COMPUTE: True
+    COMPUTE: False
     LABEL: appraisal_stressfulness_event_mean
-    ALL_LABELS: [PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean, JCQ_coworker_support_mean, appraisal_stressfulness_period_mean]
+    ALL_LABELS: [appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean]
     # PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
     # JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean
File diff suppressed because it is too large.
@@ -1,45 +0,0 @@
-genre,n
-System,261
-Tools,96
-Productivity,71
-Health & Fitness,60
-Finance,54
-Communication,39
-Music & Audio,39
-Shopping,38
-Lifestyle,33
-Education,28
-News & Magazines,24
-Maps & Navigation,23
-Entertainment,21
-Business,18
-Travel & Local,18
-Books & Reference,16
-Social,16
-Weather,16
-Food & Drink,14
-Sports,14
-Other,13
-Photography,13
-Puzzle,13
-Video Players & Editors,12
-Card,9
-Casual,9
-Personalization,8
-Medical,7
-Board,5
-Strategy,4
-House & Home,3
-Trivia,3
-Word,3
-Adventure,2
-Art & Design,2
-Auto & Vehicles,2
-Dating,2
-Role Playing,2
-STRAW,2
-Simulation,2
-"Board,Brain Games",1
-"Entertainment,Music & Video",1
-Parenting,1
-Racing,1
environment.yml
@@ -1,30 +1,165 @@
 name: rapids
 channels:
   - conda-forge
+  - defaults
 dependencies:
-  - auto-sklearn
-  - hmmlearn
-  - imbalanced-learn
-  - jsonschema
-  - lightgbm
-  - matplotlib
-  - numpy
-  - pandas
-  - peakutils
-  - pip
-  - plotly
-  - python-dateutil
-  - pytz
-  - pywavelets
-  - pyyaml
-  - scikit-learn
-  - scipy
-  - seaborn
-  - setuptools
-  - bioconda::snakemake
-  - bioconda::snakemake-minimal
-  - tqdm
-  - xgboost
+  - _libgcc_mutex=0.1
+  - _openmp_mutex=4.5
+  - _py-xgboost-mutex=2.0
+  - appdirs=1.4.4
+  - arrow=0.16.0
+  - asn1crypto=1.4.0
+  - astropy=4.2.1
+  - attrs=20.3.0
+  - binaryornot=0.4.4
+  - blas=1.0
+  - brotlipy=0.7.0
+  - bzip2=1.0.8
+  - ca-certificates=2021.7.5
+  - certifi=2021.5.30
+  - cffi=1.14.4
+  - chardet=3.0.4
+  - click=7.1.2
+  - colorama=0.4.4
+  - cookiecutter=1.6.0
+  - cryptography=3.3.1
+  - datrie=0.8.2
+  - docutils=0.16
+  - future=0.18.2
+  - gitdb=4.0.5
+  - gitdb2=4.0.2
+  - gitpython=3.1.11
+  - idna=2.10
+  - imbalanced-learn=0.6.2
+  - importlib-metadata=2.0.0
+  - importlib_metadata=2.0.0
+  - intel-openmp=2019.4
+  - jinja2=2.11.2
+  - jinja2-time=0.2.0
+  - joblib=1.0.0
+  - jsonschema=3.2.0
+  - ld_impl_linux-64=2.36.1
+  - libblas=3.8.0
+  - libcblas=3.8.0
+  - libcxx=10.0.0
+  - libcxxabi=10.0.0
+  - libedit=3.1.20191231
+  - libffi=3.3
+  - libgcc-ng=11.2.0
+  - libgfortran
+  - libgfortran
+  - libgfortran
+  - liblapack=3.8.0
+  - libopenblas=0.3.10
+  - libstdcxx-ng=11.2.0
+  - libxgboost=0.90
+  - libzlib=1.2.11
+  - lightgbm=3.1.1
+  - llvm-openmp=10.0.0
+  - markupsafe=1.1.1
+  - mkl
+  - mkl-service=2.3.0
+  - mkl_fft=1.2.0
+  - mkl_random=1.1.1
+  - more-itertools=8.6.0
+  - ncurses=6.2
+  - numpy=1.19.2
+  - numpy-base=1.19.2
+  - openblas=0.3.4
+  - openssl=1.1.1k
+  - pandas=1.1.5
+  - pbr=5.5.1
+  - pip=20.3.3
+  - plotly=4.14.1
+  - poyo=0.5.0
+  - psutil=5.7.2
+  - py-xgboost=0.90
+  - pycparser=2.20
+  - pyerfa=1.7.1.1
+  - pyopenssl=20.0.1
+  - pysocks=1.7.1
+  - python=3.7.9
+  - python-dateutil=2.8.1
+  - python_abi=3.7
+  - pytz=2020.4
+  - pyyaml=5.3.1
+  - readline=8.0
+  - requests=2.25.0
+  - retrying=1.3.3
+  - setuptools=51.0.0
+  - six=1.15.0
+  - smmap=3.0.4
+  - smmap2=3.0.1
+  - sqlite=3.33.0
+  - threadpoolctl=2.1.0
+  - tk=8.6.10
+  - tqdm=4.62.0
+  - urllib3=1.25.11
+  - wheel=0.36.2
+  - whichcraft=0.6.1
+  - wrapt=1.12.1
+  - xgboost=0.90
+  - xz=5.2.5
+  - yaml=0.2.5
+  - zipp=3.4.0
+  - zlib=1.2.11
   - pip:
-    - biosppy
-    - cr_features>=0.2
+    - amply==0.1.4
+    - auto-sklearn==0.14.7
+    - bidict==0.22.0
+    - biosppy==0.8.0
+    - build==0.8.0
+    - cached-property==1.5.2
+    - cloudpickle==2.2.0
+    - configargparse==0.15.1
+    - configspace==0.4.21
+    - cr-features==0.2.1
+    - cycler==0.11.0
+    - cython==0.29.32
+    - dask==2022.2.0
+    - decorator==4.4.2
+    - distributed==2022.2.0
+    - distro==1.7.0
+    - emcee==3.1.2
+    - fonttools==4.33.2
+    - fsspec==2022.8.2
+    - h5py==3.6.0
+    - heapdict==1.0.1
+    - hmmlearn==0.2.7
+    - ipython-genutils==0.2.0
+    - jupyter-core==4.6.3
+    - kiwisolver==1.4.2
+    - liac-arff==2.5.0
+    - locket==1.0.0
+    - matplotlib==3.5.1
+    - msgpack==1.0.4
+    - nbformat==5.0.7
+    - opencv-python==4.5.5.64
+    - packaging==21.3
+    - partd==1.3.0
+    - peakutils==1.3.3
+    - pep517==0.13.0
+    - pillow==9.1.0
+    - pulp==2.4
+    - pynisher==0.6.4
+    - pyparsing==2.4.7
+    - pyrfr==0.8.3
+    - pyrsistent==0.15.5
+    - pywavelets==1.3.0
+    - ratelimiter==1.2.0.post0
+    - scikit-learn==0.24.2
+    - scipy==1.7.3
+    - seaborn==0.11.2
+    - shortuuid==1.0.8
+    - smac==1.2
+    - snakemake==5.30.2
+    - sortedcontainers==2.4.0
+    - tblib==1.7.0
+    - tomli==2.0.1
+    - toolz==0.12.0
+    - toposort==1.5
+    - tornado==6.2
+    - traitlets==4.3.3
+    - typing-extensions==4.2.0
+    - zict==2.2.0
+prefix: /opt/conda/envs/rapids
@@ -247,8 +247,6 @@ rule empatica_readable_datetime:
         include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
     output:
         "data/raw/{pid}/empatica_{sensor}_with_datetime.csv"
-    resources:
-        mem_mb=50000
     script:
         "../src/data/datetime/readable_datetime.R"

@@ -29,17 +29,24 @@ get_genre <- function(apps){
 apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
 genre_catalogue <- data.frame()
 catalogue_source <- snakemake@params[["catalogue_source"]]
+package_names_hashed <- snakemake@params[["package_names_hashed"]]
 update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
 scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
 apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))

+if (length(package_names_hashed) == 0) {package_names_hashed <- FALSE}
+
 if(nrow(apps) > 0){
   if(catalogue_source == "GOOGLE"){
     apps_with_genre <- apps %>% mutate(genre = NA_character_)
   } else if(catalogue_source == "FILE"){
     genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
+    if (package_names_hashed) {
+      apps_with_genre <- left_join(apps, genre_catalogue, by = "package_hash")
+    } else {
       apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
     }
+  }

 if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
   apps_without_genre <- (apps_with_genre %>% filter(is.na(genre)) %>% distinct(package_name))$package_name
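The added R lines read a package_names_hashed parameter (defaulting to FALSE when it is absent) and, when set, join the genre catalogue on package_hash instead of package_name. A rough pandas equivalent of that conditional join, purely for illustration; the column names follow the R code, everything else is assumed.

```python
import pandas as pd

def join_genres(apps: pd.DataFrame, catalogue: pd.DataFrame,
                package_names_hashed: bool = False) -> pd.DataFrame:
    """Illustrative mirror of the R left_join: hashed installs join on
    'package_hash', plain installs on 'package_name'."""
    key = "package_hash" if package_names_hashed else "package_name"
    return apps.merge(catalogue, on=key, how="left")
```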
@@ -136,9 +136,8 @@ def patch_ibi_with_bvp(ibi_data, bvp_data):
     # Begin with the cr-features part
     try:
         ibi_data, ibi_start_timestamp = empatica2d_to_array(ibi_data_file)
-    except (IndexError, KeyError) as e:
+    except IndexError as e:
         # Checks whether IBI.csv is empty
-        # It may raise a KeyError if df is empty here: startTimeStamp = df.time[0]
         df_test = pd.read_csv(ibi_data_file, names=['timings', 'inter_beat_interval'], header=None)
         if df_test.empty:
             df_test['timestamp'] = df_test['timings']
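The left-hand side also catches KeyError and carries a comment explaining why: indexing the first timestamp of an empty IBI table fails. A minimal illustration of that failure mode, constructed directly with pandas rather than the empatica2d_to_array helper used by the pipeline.

```python
import pandas as pd

# An empty IBI table, analogous to reading an empty IBI.csv with these column names
df = pd.DataFrame(columns=["timings", "inter_beat_interval"])
print(df.empty)  # True
try:
    start = df["timings"][0]   # analogous to `startTimeStamp = df.time[0]` in the removed comment
except KeyError as err:
    print("empty frame -> KeyError:", err)
```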
@@ -120,7 +120,7 @@ def straw_cleaning(sensor_data_files, provider):
     esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]

     if provider["COLS_VAR_THRESHOLD"]:
-        features.drop(features.std(numeric_only=True)[features.std(numeric_only=True) == 0].index.values, axis=1, inplace=True)
+        features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)

     fe5 = features.copy()

@@ -134,7 +134,7 @@ def straw_cleaning(sensor_data_files, provider):
     valid_features = features[numerical_cols].loc[:, features[numerical_cols].isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * features[numerical_cols].shape[0]]

     corr_matrix = valid_features.corr().abs()
-    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
     to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]

     features.drop(to_drop, axis=1, inplace=True)
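The two sides differ only in .astype(bool) versus .astype(np.bool). The NumPy alias np.bool was deprecated in NumPy 1.20 and removed in 1.24, so only the builtin bool (or np.bool_) works on current NumPy. A quick check:

```python
import numpy as np

# Works on all NumPy versions
mask = np.triu(np.ones((3, 3)), k=1).astype(bool)
print(mask)
# np.ones((3, 3)).astype(np.bool) raises AttributeError on NumPy >= 1.24
```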
@@ -150,15 +150,13 @@ def straw_cleaning(sensor_data_files, provider):

     return features

+def impute(df, method='zero'):
+
     def k_nearest(df):
         pd.set_option('display.max_columns', None)
         imputer = KNNImputer(n_neighbors=3)
         return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

-
-
-def impute(df, method='zero'):
-
     return {
         'zero': df.fillna(0),
         'high_number': df.fillna(1500),
@@ -167,7 +165,6 @@ def impute(df, method='zero'):
         'knn': k_nearest(df)
     }[method]

-
 def graph_bf_af(features, phase_name, plt_flag=False):
     if plt_flag:
         sns.set(rc={"figure.figsize":(16, 8)})
@@ -146,7 +146,7 @@ def straw_cleaning(sensor_data_files, provider, target):
     # (5) REMOVE COLS WHERE VARIANCE IS 0

     if provider["COLS_VAR_THRESHOLD"]:
-        features.drop(features.std(numeric_only=True)[features.std(numeric_only=True) == 0].index.values, axis=1, inplace=True)
+        features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)

     graph_bf_af(features, "6variance_drop")

@@ -200,7 +200,7 @@ def straw_cleaning(sensor_data_files, provider, target):
     valid_features = features[numerical_cols].loc[:, features[numerical_cols].isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * features[numerical_cols].shape[0]]

     corr_matrix = valid_features.corr().abs()
-    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
     to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]

     # sns.heatmap(corr_matrix, cmap="YlGnBu")
@@ -245,14 +245,12 @@ def straw_cleaning(sensor_data_files, provider, target):

     return features

+def impute(df, method='zero'):
+
     def k_nearest(df):
         imputer = KNNImputer(n_neighbors=3)
         return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

-
-
-def impute(df, method='zero'):
-
     return {
         'zero': df.fillna(0),
         'high_number': df.fillna(1500),
@@ -261,7 +259,6 @@ def impute(df, method='zero'):
         'knn': k_nearest(df)
     }[method]

-
 def graph_bf_af(features, phase_name, plt_flag=False):
     if plt_flag:
         sns.set(rc={"figure.figsize":(16, 8)})
@@ -15,13 +15,13 @@ def extract_second_order_features(intraday_features, so_features_names, prefix=""):
     so_features = pd.DataFrame()
     #print(intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).nsmallest())
     if "mean" in so_features_names:
-        so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean(numeric_only=True).add_suffix("_SO_mean")], axis=1)
+        so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean().add_suffix("_SO_mean")], axis=1)

     if "median" in so_features_names:
-        so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median(numeric_only=True).add_suffix("_SO_median")], axis=1)
+        so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1)

     if "sd" in so_features_names:
-        so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std(numeric_only=True).fillna(0).add_suffix("_SO_sd")], axis=1)
+        so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().fillna(0).add_suffix("_SO_sd")], axis=1)

     if "nlargest" in so_features_names: # largest 5 -- maybe there is a faster groupby solution?
         for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
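These hunks differ only in whether numeric_only=True is passed to the groupby reductions. In older pandas, mean()/median()/std() silently dropped non-numeric columns (with a deprecation warning from 1.5 on); in newer releases the default flipped and non-numeric columns raise instead, so spelling numeric_only=True keeps the behaviour explicit. A small check:

```python
import pandas as pd

df = pd.DataFrame({"local_segment": ["a", "a", "b"],
                   "value": [1.0, 2.0, 3.0],
                   "label": ["x", "y", "z"]})
# Explicitly ignore the non-numeric 'label' column during aggregation
print(df.groupby("local_segment").mean(numeric_only=True))
```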
@@ -26,7 +26,7 @@ def calculate_empatica_data_yield(features): # TODO
     # Assigns 1 to values that are over 1 (in case of windows not being filled fully)
     features[empatica_data_yield_cols] = features[empatica_data_yield_cols].apply(lambda x: [y if y <= 1 or np.isnan(y) else 1 for y in x])

-    features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1, numeric_only=True).fillna(0)
+    features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1).fillna(0)
     features.drop(empatica_data_yield_cols, axis=1, inplace=True) # In case of if the advanced operations will later not be needed (e.g., weighted average)

     return features
@ -0,0 +1,292 @@
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
id2qc = { 44:["What have you mainly been doing within the last 10 minutes?",
|
||||||
|
"Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
|
||||||
|
"Kaj ste v glavnem počeli v zadnjih 10 minutah?"],
|
||||||
|
45:["What type of individual work?",
|
||||||
|
"Wat voor soort individueel werk?",
|
||||||
|
"Kakšno vrsto samostojnega dela ste opravljali?"],
|
||||||
|
46:["How did you work with others?",
|
||||||
|
"Hoe heb je met anderen gewerkt?",
|
||||||
|
"Kako ste sodelovali z drugimi?"],
|
||||||
|
47:["What type of break?",
|
||||||
|
"Wat voor soort pauze?",
|
||||||
|
"Kakšno vrsto odmora ste imeli?"],
|
||||||
|
48:["Where did you travel between?",
|
||||||
|
"Waar heb je tussen gereisd?",
|
||||||
|
"Kam ste potovali?"],
|
||||||
|
49:["Did you use a computer or phone for that?",
|
||||||
|
"Heb je daarvoor een computer of telefoon gebruikt?",
|
||||||
|
"Ste za to uporabljali računalnik ali telefon?"],
|
||||||
|
50:["What kind of an interaction was that?",
|
||||||
|
"Wat voor interactie was dat?",
|
||||||
|
"Kakšne vrste sodelovanja je bilo to?"],
|
||||||
|
51:["How many people were involved besides yourself?",
|
||||||
|
"Hoeveel mensen waren er behalve jezelf betrokken?",
|
||||||
|
"Koliko oseb je bilo poleg vas še vpletenih?"],
|
||||||
|
# 52:["What have you mainly been doing within the last 10 minutes?",
|
||||||
|
# "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
|
||||||
|
# "Kaj ste v glavnem počeli v zadnjih 10 minutah?"]
|
||||||
|
}
|
||||||
|
qc2id = {v:k for k,values in id2qc.items() for v in values}
|
||||||
|
|
||||||
|
next_questions = { 44: [45,46,47,48],
|
||||||
|
45:[49,49],
|
||||||
|
46:[50,50],
|
||||||
|
47:[],
|
||||||
|
48:[],
|
||||||
|
49:[],
|
||||||
|
50:[51,51],
|
||||||
|
51:[]
|
||||||
|
#52:[45,46,47,48],
|
||||||
|
}
|
||||||
|
|
||||||
|
def esm_activities_LTM_features(
|
||||||
|
df_esm_activities_cleaned: pd.DataFrame,
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
""" Function for calculating LTM(Last 10 minutes) features of questionnaire answers. It first corrects the question ids according
|
||||||
|
to esm_instructions and the updated corpus of question_ids. It then processes each LTM question chain to
|
||||||
|
find relevant social properties given by the answers such as the number of people interacted with, the formality and whether the socializing was done in person.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
df_esm_activities_cleaned: pd.DataFrame
|
||||||
|
A cleaned up dataframe, which must include esm_instructions, esm_user_answer_numeric.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
df_esm_activities_cleaned: pd.DataFrame
|
||||||
|
The same dataframe with columns which contain:
|
||||||
|
["correct_ids"] - Corrected question_ids
|
||||||
|
["ans_seq"] - For each LTM question, the sequence of numerical user answers pertaining to this chain of questions.
|
||||||
|
["n_others","inperson","formal"]- Properties of known potential social encounters as given by process_answers().
|
||||||
|
"""
|
||||||
|
#TODO: preprocess questionaires
|
||||||
|
#DONE: correct ids
|
||||||
|
correct_id_df = correct_activity_qids(df_esm_activities_cleaned)
|
||||||
|
#DONE: process subquestions
|
||||||
|
ids = correct_id_df["correct_ids"]
|
||||||
|
main_q_indices = ids[ids==44].index
|
||||||
|
q_group = []
|
||||||
|
i=-1
|
||||||
|
for id in ids:
|
||||||
|
if(id==44):
|
||||||
|
i=i+1
|
||||||
|
q_group.append(i)
|
||||||
|
correct_id_df["q_group"] = q_group
|
||||||
|
ans_seq = correct_id_df.groupby("q_group").agg({"esm_user_answer_numeric":lambda group: list(group)}).rename(columns={"esm_user_answer_numeric":"ans_seq"})
|
||||||
|
ans_seq.set_index(main_q_indices,inplace=True)
|
||||||
|
# correct_id_df["ans_seq"] = [[] for i in range(len(correct_id_df))]
|
||||||
|
# correct_id_df["ans_seq"].loc[main_q_indices] = correct_id_df.groupby("q_group").agg({"esm_user_answer_numeric":lambda group: list(group)}).values.reshape(-1)
|
||||||
|
#DONE: find types of status for each main question: socializing:[none,irl,online,unknown], num_people:[0,1,2,>2,unknown]
|
||||||
|
processed_ans_df = process_answers(ans_seq)
|
||||||
|
# df_out = df_esm_activities_cleaned.join(test)
|
||||||
|
return df_esm_activities_cleaned.join(processed_ans_df)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
possible answer sequences for LTM question chains
|
||||||
|
|
||||||
|
#alone
|
||||||
|
0,0,0 not social
|
||||||
|
0,0,1 not social
|
||||||
|
0,1,0 not social
|
||||||
|
0,1,1 not social
|
||||||
|
0,2 not social
|
||||||
|
0,3 not social
|
||||||
|
0,4 not social
|
||||||
|
0,5 not social
|
||||||
|
0,6 not social
|
||||||
|
#w/ others
|
||||||
|
1,0,0,0 1 irl
|
||||||
|
1,0,0,1 2 irl
|
||||||
|
1,0,0,2 3+ irl
|
||||||
|
1,0,1,0 1 irl
|
||||||
|
1,0,1,1 2 irl
|
||||||
|
1,0,1,2 3+ irl
|
||||||
|
1,1,0,0 1 online
|
||||||
|
1,1,0,1 2 online
|
||||||
|
1,1,0,2 3+ online
|
||||||
|
1,1,1,0 1 online
|
||||||
|
1,1,1,1 2 online
|
||||||
|
1,1,1,2 3+ online
|
||||||
|
1,2 positive likely to be more than 2
|
||||||
|
1,3 positive
|
||||||
|
#break
|
||||||
|
2,0 ambiguous
|
||||||
|
2,1 positive irl
|
||||||
|
2,2 ambiguous
|
||||||
|
2,3 ambiguous
|
||||||
|
#transit
|
||||||
|
3,0 ambiguous
|
||||||
|
3,1 ambiguous
|
||||||
|
3,2 ambiguous
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#TODO: docstring
def process_answers(df: pd.DataFrame) -> pd.DataFrame:
    """Process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:

    > n_others: Number of other people interacted with in the last 10 minutes
        - -1: Number is positive but not known exactly
        - 0: No people/alone
        - 1: One extra person
        - 2: Two extra people
        - 3: More than two extra people
        - NaN: Can't say anything with enough certainty.
    > inperson:
        - True/False: The interaction in question was/wasn't in person.
        - None: Can't say anything with enough certainty.
    > formal:
        - True/False: The interaction in question was/wasn't formal.
        - None: Can't say anything with enough certainty.

    Args:
        df (pd.DataFrame): Dataframe with an "ans_seq" column holding the answer sequence of each LTM question chain.

    Returns:
        pd.DataFrame: One row per input row with the columns "n_others", "inperson" and "formal".
    """
    properties = {"n_others": [],
                  "inperson": [],
                  "formal": []}
    for ans_seq in df["ans_seq"]:
        n_other = None
        inperson = None
        formal = None
        if ans_seq[0] == 0:
            n_other = 0
        elif ans_seq[0] == 1:
            if ans_seq[1] == 3:
                n_other = -1  # answered "Other" but did work with other people
            elif ans_seq[1] == 2:
                n_other = 3  # assuming more than 2 people participated in the lecture or presentation
            elif ans_seq[1] in [0, 1]:
                inperson = ans_seq[1] == 0  # ans_seq[1]==0 means irl interaction, ==1 means online or phone
                formal = ans_seq[2] == 0  # 0 means formal
                n_other = ans_seq[3] + 1  # ans_seq[3] is on [0,2] so we add 1 to make it [1,3]
        elif ans_seq[0] == 2:
            formal = False  # assuming one does not have a formal meeting during break time
            if ans_seq[1] == 1:
                n_other = -1
                inperson = True
            # if not 1 then we don't know anything for sure
        elif ans_seq[0] == 3:
            # we can't say whether the person was carpooling or driving alone.
            pass
        properties["n_others"].append(n_other)
        properties["inperson"].append(inperson)
        properties["formal"].append(formal)

    # df = df.join(pd.DataFrame(properties, index=df.index))
    return pd.DataFrame(properties, index=df.index)

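As an aside (not part of this commit), a minimal sketch of how process_answers maps a few of the chains listed in the table above; the input sequences and the expected attributes are made up purely for illustration and follow the branching logic of the function:

import pandas as pd

# Hypothetical input: each row carries one answer chain in "ans_seq".
toy = pd.DataFrame({"ans_seq": [
    [0, 2],        # alone                      -> n_others=0
    [1, 0, 0, 1],  # 2 others, irl, formal      -> n_others=2, inperson=True, formal=True
    [1, 1, 1, 0],  # 1 other, online, informal  -> n_others=1, inperson=False, formal=False
    [2, 1],        # break with someone irl     -> n_others=-1, inperson=True, formal=False
    [3, 0],        # transit                    -> all attributes stay None
]})

# Assumes process_answers from the function above is in scope.
print(process_answers(toy))
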
def correct_activity_qids(df: pd.DataFrame) -> pd.DataFrame:
    """Map each ESM question's instruction text to its intended question id via the qc2id lookup.

    Args:
        df (pd.DataFrame): ESM dataframe with an "esm_instructions" column.

    Returns:
        pd.DataFrame: Input dataframe with added column "correct_ids"
    """
    df["correct_ids"] = df["esm_instructions"].apply(lambda x: qc2id[x])
    return df


def process_answers_aggregation(df: pd.core.groupby.generic.DataFrameGroupBy) -> pd.core.groupby.generic.DataFrameGroupBy:
    """Process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:

    > n_others: Number of other people interacted with in the last 10 minutes
        - -1: Number is positive but not known exactly
        - 0: No people/alone
        - 1: One extra person
        - 2: Two extra people
        - 3: More than two extra people
        - NaN: Can't say anything with enough certainty.
    > inperson:
        - True/False: The interaction in question was/wasn't in person.
        - None: Can't say anything with enough certainty.
    > formal:
        - True/False: The interaction in question was/wasn't formal.
        - None: Can't say anything with enough certainty.

    Args:
        df (pd.DataFrame): One group of ESM rows (a single question chain) whose "esm_user_answer_numeric" values form the answer sequence.

    Returns:
        pd.DataFrame: The input group with the columns "n_others", "inperson" and "formal" joined to it.
    """

    #print("=======================\nAPPLY START:\ndf=",df.columns,df.local_segment)
    properties = {"n_others": [],
                  "inperson": [],
                  "formal": []}
    ans_seq = df["esm_user_answer_numeric"].values
    n_other = None
    inperson = None
    formal = None
    if ans_seq[0] == 0:
        n_other = 0
    elif ans_seq[0] == 1:
        if ans_seq[1] == 3:
            n_other = -1  # answered "Other" but did work with other people
        elif ans_seq[1] == 2:
            n_other = 3  # assuming more than 2 people participated in the lecture or presentation
        elif ans_seq[1] in [0, 1]:
            inperson = ans_seq[1] == 0  # ans_seq[1]==0 means irl interaction, ==1 means online or phone
            formal = ans_seq[2] == 0  # 0 means formal
            n_other = ans_seq[3] + 1  # ans_seq[3] is on [0,2] so we add 1 to make it [1,3]
    elif ans_seq[0] == 2:
        formal = False  # assuming one does not have a formal meeting during break time
        if ans_seq[1] == 1:
            n_other = -1
            inperson = True
        # if not 1 then we don't know anything for sure
    elif ans_seq[0] == 3:
        # we can't say whether the person was carpooling or driving alone.
        pass
    properties["n_others"].append(n_other)
    properties["inperson"].append(inperson)
    properties["formal"].append(formal)

    df = df.join(pd.DataFrame(properties, index=df.index))
    #print("APPLY END:\ndf=",df[["n_others","inperson","formal"]])

    return df


#test stuff
def test():
    from esm_preprocess import preprocess_esm, clean_up_esm
    df = pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")
    df = preprocess_esm(df)
    df = clean_up_esm(df)
    df = df[df["questionnaire_id"] == 97]
    original = esm_activities_LTM_features(df)

    df["local_segment"] = [str(i) + ":" + j for i, j in df[["esm_session", "device_id"]].values]
    temp = df.groupby("local_segment")
    temp2 = temp.apply(process_answers_aggregation)

    #compare with original function results
    selection = original[original["correct_ids"] == 44][["n_others", "inperson", "formal"]]
    temp_selection = temp2.loc[selection.index]
    temp_selection.compare(selection, keep_shape=True, keep_equal=True)

    #print out ans_seq processing results
    # import json
    # i = 0
    # for j,ans in correct_id_df[["esm_json","esm_user_answer"]].values:
    #     obj = json.loads(j)
    #     text = obj["esm_instructions"]
    #     if ("10 minut" in text):
    #         print("---\n",test.ans_seq.iloc[i])
    #         print(test[["n_others","inperson","formal"]].values[i])
    #         i = i+1
    #     print(text,ans)

#test()

@ -1,4 +1,8 @@
import pandas as pd
import sys
import warnings
sys.path.append('src/features/phone_esm/straw')
from esm_activities import esm_activities_LTM_features, process_answers_aggregation

QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
@ -39,23 +43,49 @@ QUESTIONNAIRE_IDS = {

def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    esm_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_features = provider["FEATURES"]
    # name of the features this function can compute
    requested_scales = provider["SCALES"]
    base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
                           "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
                           "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge", "activities_n_others", "activities_inperson", "activities_formal"]
    #TODO Check valid questionnaire and feature names.
    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))
    esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
    if not esm_data.empty:
        esm_data = filter_data_by_segment(esm_data, time_segment)

        if not esm_data.empty:
            esm_features = pd.DataFrame()
            for scale in requested_scales:
                questionnaire_id = QUESTIONNAIRE_IDS[scale]
                mask = esm_data["questionnaire_id"] == questionnaire_id
                #print(esm_data.loc[mask].head())
                #print(time_segment)
                if not mask.any():
                    temp = sensor_data_files["sensor_data"]
                    warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}-{time_segment}", RuntimeWarning)
                    continue
                #TODO: calculation of LTM features
                if scale == "activities":
                    requested_subset = [req for req in requested_features if req.startswith("activities")]
                    if not bool(requested_subset):
                        continue
                    # ltm_features = esm_activities_LTM_features(esm_data.loc[mask])
                    # print(esm_data["esm_json"].values)
                    # print(mask)
                    # print(esm_data.loc[mask])
                    # #ltm_features = ltm_features[ltm_features["correct_ids"==44]]
                    #print(esm_data.loc[mask]["local_segment"])
                    ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
                    #print("PRINTING ltm_features:\n",ltm_features)
                    ltm_features.rename(columns={"n_others": "activities_n_others", "inperson": "activities_inperson", "formal": "activities_formal"}, inplace=True)
                    esm_features[requested_subset] = ltm_features.groupby("local_segment").first()[requested_subset]
                    #print(esm_features.columns)
                    #print("PRINTING esm_features after rename:\n",ltm_features)
                    #FIXME: it might be an issue that I'm calculating for the whole time segment and not grouping by "local_segment"
                    #print("~~~~~~~~~~~~~~~~~~~~~~~~===============================~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n LTM FEATURES STORED... AFTER RETURN:\n",ltm_features,esm_features[["activities_"+req for req in requested_subset]])
                if "mean" in features_to_compute:
                    esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
                    #TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.

@ -64,3 +94,15 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
    esm_features.rename(columns={'index': 'local_segment'}, inplace=True)

    return esm_features


def test_main():
    import temp_help
    provider = {
        "FEATURES": ["mean", "activities_n_others", "activities_inperson", "activities_formal"],
        "SCALES": ['activities']
    }
    sensor_data_files = {"sensor_data": "data/interim/p069/phone_esm_clean.csv"}
    s_feat = straw_features(sensor_data_files, "straw_event_stress_event_p069_110", provider, temp_help.filter_data_by_segment)
    print(s_feat)

#test_main()

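A note on the activities branch in the hunk above: process_answers_aggregation attaches the same n_others/inperson/formal values to every row of a segment's question chain, so taking .first() per local_segment collapses each chain to a single feature row. A minimal sketch with made-up values (not study data) of that collapse:

import pandas as pd

# Hypothetical chain of three sub-answers belonging to one segment, after
# process_answers_aggregation has attached and renamed the derived columns.
ltm_features = pd.DataFrame({
    "local_segment": ["seg1", "seg1", "seg1"],
    "activities_n_others": [2, 2, 2],
    "activities_inperson": [True, True, True],
    "activities_formal": [False, False, False],
})

# One row per segment, as assigned into esm_features above.
print(ltm_features.groupby("local_segment").first())
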
@ -67,7 +67,7 @@ def extract_ers(esm_df):

    segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]

    if segmenting_method in ["30_before", "90_before"]: # takes the 30-minute period before the questionnaire + the duration of the questionnaire
    if segmenting_method in ["10_before", "30_before", "90_before"]: # takes the 30-minute period before the questionnaire + the duration of the questionnaire
        """ '30-minutes and 90-minutes before' have the same fundamental logic with a couple of deviations that will be explained below.
        Both take an x-minute period before the questionnaire that is summed with the questionnaire duration.
        All questionnaire durations over 15 minutes are excluded from the querying.
@ -79,7 +79,18 @@ def extract_ers(esm_df):
        extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
        extracted_ers["shift_direction"] = -1

        if segmenting_method == "30_before":
        if segmenting_method == "10_before":
            """The method 10-minutes before simply takes 10 minutes before the questionnaire and sums it with the questionnaire duration.
            The timestamps are formatted with the help of the format_timestamp() method.
            """
            time_before_questionnaire = 10 * 60 # in seconds (10 minutes)
            #TODO: split into small segments by manipulating length and shift
            extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
            extracted_ers["shift"] = time_before_questionnaire
            extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))


        elif segmenting_method == "30_before":
            """The method 30-minutes before simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
            The timestamps are formatted with the help of the format_timestamp() method.
            """
@ -140,8 +151,8 @@ def extract_ers(esm_df):

    # Extracted 3 targets that will be transferred in the csv file to the cleaning script.
    se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
    se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean(numeric_only=True)['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
    se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
    se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean(numeric_only=True)['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})
    se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})

    # All relevant features are joined by inner join to remove standalone columns (e.g., the stressfulness event target has a larger count)
    extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
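To make the 10_before arithmetic above concrete, a small sketch (not part of the repository; format_timestamp is stubbed as a plain pass-through purely for illustration) of how length and shift come out for a questionnaire that took 3 minutes to answer:

# Sketch only: format_timestamp is assumed to convert a value in seconds into
# whatever string format the segment file expects; here it is stubbed as identity.
def format_timestamp(seconds):
    return seconds

questionnaire_duration = 3 * 60          # answering took 3 minutes (180 s)
time_before_questionnaire = 10 * 60      # fixed 10-minute look-back (600 s)

length = format_timestamp(questionnaire_duration + time_before_questionnaire)  # 780 s window
shift = format_timestamp(time_before_questionnaire)                            # window start moves 600 s back (shift_direction = -1)

print(length, shift)  # 780 600
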
@ -0,0 +1,70 @@
"""This file is TEMPORARY and intended for testing main.py
"""

def filter_data_by_segment(data, time_segment):
    data.dropna(subset=["assigned_segments"], inplace=True)
    if(data.shape[0] == 0): # data is empty
        data["local_segment"] = data["timestamps_segment"] = None
        return data

    datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = "[0-9]{13}"
    segment_regex = "\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
    data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
    data = data.drop(columns=["assigned_segments"])
    data = data.dropna(subset=["local_segment"])
    if(data.shape[0] == 0): # there are no rows belonging to time_segment after dropping NAs
        data["timestamps_segment"] = None
    else:
        data[["local_segment", "timestamps_segment"]] = data["local_segment"].str.split(pat=";", n=1, expand=True)

    # chunk episodes
    if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
        data = chunk_episodes(data)

    return data

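For context, a small sketch of what the regex extraction above pulls out of an assigned_segments string; the sample value is made up and only illustrates the expected [label#start,end;start_ts,end_ts] layout:

import pandas as pd

# Hypothetical assigned_segments value following the [label#datetime,datetime;ts,ts] layout
data = pd.DataFrame({"assigned_segments": [
    "[daily#2021-06-01 00:00:00,2021-06-01 23:59:59;1622505600000,1622591999000]"
]})

datetime_regex = "[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
timestamps_regex = "[0-9]{13}"
segment_regex = "\[({}#{},{};{},{})\]".format("daily", datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)

extracted = data["assigned_segments"].str.extract(segment_regex, expand=True)
print(extracted.iloc[0, 0])
# daily#2021-06-01 00:00:00,2021-06-01 23:59:59;1622505600000,1622591999000
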
def chunk_episodes(sensor_episodes):
    import copy
    import pandas as pd

    # Deduplicate episodes
    # Drop rows where segments of start_timestamp and end_timestamp are the same
    sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")

    # Delete useless columns
    for drop_col in ["local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
        del sensor_episodes[drop_col]

    # Avoid SettingWithCopyWarning
    sensor_episodes = sensor_episodes.copy()

    # Unix timestamp for current segment in milliseconds
    sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype(int)

    # Compute chunked timestamp
    sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
    sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)

    # Compute duration: intersection of current row and segment
    sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)

    # Merge episodes
    cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]

    sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
    merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()

    merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
    merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()

    merged_sensor_episodes.reset_index(inplace=True)

    # Compute datetime
    merged_sensor_episodes["local_start_date_time"] = pd.to_datetime(merged_sensor_episodes["start_timestamp"], unit="ms", utc=True)
    merged_sensor_episodes["local_start_date_time"] = pd.concat([data["local_start_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))

    merged_sensor_episodes["local_end_date_time"] = pd.to_datetime(merged_sensor_episodes["end_timestamp"], unit="ms", utc=True)
    merged_sensor_episodes["local_end_date_time"] = pd.concat([data["local_end_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))

    return merged_sensor_episodes

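A quick numeric illustration (made-up millisecond timestamps, not from the study) of the chunking step above: an episode is clipped to its segment, and the duration is the length of the overlap in minutes:

# Hypothetical episode that starts before its segment and ends inside it (timestamps in ms)
start_timestamp = 1_000_000          # episode start
end_timestamp = 1_600_000            # episode end
segment_start_timestamp = 1_300_000  # segment start
segment_end_timestamp = 2_000_000    # segment end

chunked_start = max(start_timestamp, segment_start_timestamp)  # 1_300_000
chunked_end = min(end_timestamp, segment_end_timestamp)        # 1_600_000

duration = (chunked_end - chunked_start) / (1000 * 60)  # 5.0 minutes of overlap
print(duration)
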
@ -115,7 +115,7 @@ cluster_on = provider["CLUSTER_ON"]
strategy = provider["INFER_HOME_LOCATION_STRATEGY"]
days_threshold = provider["MINIMUM_DAYS_TO_DETECT_HOME_CHANGES"]

if not location_data.timestamp.is_monotonic_increasing:
if not location_data.timestamp.is_monotonic:
    location_data.sort_values(by=["timestamp"], inplace=True)

location_data["duration_in_seconds"] = -1 * location_data.timestamp.diff(-1) / 1000
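As a side note on the last line of that hunk, a tiny sketch (with made-up timestamps) of how -1 * diff(-1) / 1000 turns millisecond timestamps into the time until the next location fix, in seconds:

import pandas as pd

# Hypothetical millisecond timestamps of three consecutive location fixes
timestamp = pd.Series([1_000_000, 1_030_000, 1_090_000])

# diff(-1) gives current - next; multiplying by -1 gives next - current, /1000 converts ms -> s
duration_in_seconds = -1 * timestamp.diff(-1) / 1000
print(duration_in_seconds.tolist())  # [30.0, 60.0, nan]
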
@ -14,6 +14,7 @@ def import_path(path):
    sys.modules[module_name] = module
    return module

#TODO: check why segments change to int
def filter_data_by_segment(data, time_segment):
    data.dropna(subset=["assigned_segments"], inplace=True)
    if(data.shape[0] == 0): # data is empty
@ -151,6 +152,7 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
    else:
        segment_colums = pd.DataFrame()
    print(sensor_features, sensor_features['local_segment'])
    sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
    split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
    new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])