removed cleaning to make it run

Merge branch 'sociality-task' of https://repo.ijs.si/junoslukan/rapids into sociality-task
corrected esm_features index column
2023-03-31 13:27:01 +00:00 · 2023-03-31 13:08:19 +00:00 · 2023-03-31 13:08:15 +00:00 · 2023-03-30 20:33:35 +02:00 · 2023-03-30 11:54:51 +00:00 · 2023-03-29 13:09:26 +00:00
18 changed files with 673 additions and 1199 deletions
--- a/.gitignore
+++ b/.gitignore
@ -100,9 +100,6 @@ data/external/*
 !/data/external/wiki_tz.csv
 !/data/external/main_study_usernames.csv
 !/data/external/timezone.csv
 !/data/external/play_store_application_genre_catalogue.csv
 !/data/external/play_store_categories_count.csv
 data/raw/*
 !/data/raw/.gitkeep
--- a/config.yaml
+++ b/config.yaml
@ -26,7 +26,7 @@ TIME_SEGMENTS: &time_segments
  INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
  TAILORED_EVENTS: # Only relevant if TYPE=EVENT
    COMPUTE: True
-    SEGMENTING_METHOD: "30_before" # 30_before, 90_before, stress_event
+    SEGMENTING_METHOD: "10_before" # 30_before, 90_before, stress_event
    INTERVAL_OF_INTEREST: 10 # duration of event of interest [minutes]
    IOI_ERROR_TOLERANCE: 5 # interval of interest error tolerance (before and after IOI) [minutes]
@ -91,7 +91,7 @@ PHONE_ACTIVITY_RECOGNITION:
  EPISODE_THRESHOLD_BETWEEN_ROWS: 5 # minutes. Max time difference for two consecutive rows to be considered within the same AR episode.
  PROVIDERS:
    RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
      ACTIVITY_CLASSES:
        STATIONARY: ["still", "tilting"]
@ -104,9 +104,9 @@ PHONE_APPLICATIONS_CRASHES:
  CONTAINER: applications_crashes
  APPLICATION_CATEGORIES:
    CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
-    CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
+    CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
-    UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
+    UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
-    SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
+    SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
  PROVIDERS: # None implemented yet but this sensor can be used in PHONE_DATA_YIELD
 # See https://www.rapids.science/latest/features/phone-applications-foreground/
@ -114,32 +114,24 @@ PHONE_APPLICATIONS_FOREGROUND:
  CONTAINER: applications
  APPLICATION_CATEGORIES:
    CATALOGUE_SOURCE: FILE # FILE (genres are read from CATALOGUE_FILE) or GOOGLE (genres are scraped from the Play Store)
-    CATALOGUE_FILE: "data/external/play_store_application_genre_catalogue.csv"
+    CATALOGUE_FILE: "data/external/stachl_application_genre_catalogue.csv"
-    # Refer to data/external/play_store_categories_count.csv for a list of categories (genres) and their frequency.
+    PACKAGE_NAMES_HASHED: True
-    UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
+    UPDATE_CATALOGUE_FILE: False # if CATALOGUE_SOURCE is equal to FILE, whether or not to update CATALOGUE_FILE, if CATALOGUE_SOURCE is equal to GOOGLE all scraped genres will be saved to CATALOGUE_FILE
-    SCRAPE_MISSING_CATEGORIES: False # whether to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
+    SCRAPE_MISSING_CATEGORIES: False # whether or not to scrape missing genres, only effective if CATALOGUE_SOURCE is equal to FILE. If CATALOGUE_SOURCE is equal to GOOGLE, all genres are scraped anyway
  PROVIDERS:
    RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
      INCLUDE_EPISODE_FEATURES: True
-      SINGLE_CATEGORIES: ["Productivity", "Tools", "Communication", "Education", "Social"]
+      SINGLE_CATEGORIES: ["all", "email"]
      MULTIPLE_CATEGORIES:
-        games: ["Puzzle", "Card", "Casual", "Board", "Strategy", "Trivia", "Word", "Adventure", "Role Playing", "Simulation", "Board, Brain Games", "Racing"]
+        social: ["socialnetworks", "socialmediatools"]
-        social: ["Communication", "Social", "Dating"]
+        entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"]
        productivity: ["Tools", "Productivity", "Finance", "Education", "News & Magazines", "Business", "Books & Reference"]
        health: ["Health & Fitness", "Lifestyle", "Food & Drink", "Sports", "Medical", "Parenting"]
        entertainment: ["Shopping", "Music & Audio", "Entertainment", "Travel & Local", "Photography", "Video Players & Editors", "Personalization", "House & Home", "Art & Design", "Auto & Vehicles", "Entertainment,Music & Video",
                        "Puzzle", "Card", "Casual", "Board", "Strategy", "Trivia", "Word", "Adventure", "Role Playing", "Simulation", "Board, Brain Games", "Racing" # Add all games.
        ]
        maps_weather: ["Maps & Navigation", "Weather"]
      CUSTOM_CATEGORIES:
-      SINGLE_APPS: []
+        social_media: ["com.google.android.youtube", "com.snapchat.android", "com.instagram.android", "com.zhiliaoapp.musically", "com.facebook.katana"]
-      EXCLUDED_CATEGORIES: ["System", "STRAW"]
+        dating: ["com.tinder", "com.relance.happycouple", "com.kiwi.joyride"]
-      # Note: A special option here is "is_system_app".
+      SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
-      # This excludes applications that have is_system_app = TRUE, which is a separate column in the table.
+      EXCLUDED_CATEGORIES: []
-      # However, all of these applications have been assigned System category.
+      EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"] # TODO list system apps?
      # I will therefore filter by that category, which is a superset and is more complete. JL
      EXCLUDED_APPS: []
      FEATURES: 
        APP_EVENTS: ["countevent", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
        APP_EPISODES: ["countepisode", "minduration", "maxduration", "meanduration", "sumduration"]
@ -163,7 +155,7 @@ PHONE_BATTERY:
  EPISODE_THRESHOLD_BETWEEN_ROWS: 30 # minutes. Max time difference for two consecutive rows to be considered within the same battery episode.
  PROVIDERS:
    RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
      SRC_SCRIPT: src/features/phone_battery/rapids/main.py
@ -177,7 +169,7 @@ PHONE_BLUETOOTH:
      SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R
    DORYAB:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: 
        ALL: 
            DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
@ -198,7 +190,7 @@ PHONE_CALLS:
  CONTAINER: call
  PROVIDERS:
    RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES_TYPE: EPISODES # EVENTS or EPISODES
      CALL_TYPES: [missed, incoming, outgoing]
      FEATURES:
@ -227,18 +219,19 @@ PHONE_CONVERSATION: # TODO Adapt for speech
 # See https://www.rapids.science/latest/features/phone-data-yield/
 PHONE_DATA_YIELD:
-  SENSORS: [#PHONE_ACCELEROMETER,
+  SENSORS: [ #PHONE_ACCELEROMETER,
-            PHONE_ACTIVITY_RECOGNITION,
+            #PHONE_ACTIVITY_RECOGNITION,
-            PHONE_APPLICATIONS_FOREGROUND,
+            #PHONE_APPLICATIONS_FOREGROUND,
-            PHONE_APPLICATIONS_NOTIFICATIONS,
+            #PHONE_APPLICATIONS_NOTIFICATIONS,
-            PHONE_BATTERY,
+            #PHONE_BATTERY,
-            PHONE_BLUETOOTH,
+            PHONE_BLUETOOTH #,
-            PHONE_CALLS,
+            #PHONE_CALLS,
-            PHONE_LIGHT,
+            #PHONE_LIGHT,
-            PHONE_LOCATIONS,
+            #PHONE_LOCATIONS,
-            PHONE_MESSAGES,
+            #PHONE_MESSAGES,
-            PHONE_SCREEN,
+            #PHONE_SCREEN,
-            PHONE_WIFI_VISIBLE]
+            #PHONE_WIFI_VISIBLE
            ]
  PROVIDERS:
    RAPIDS:
      COMPUTE: True
@ -251,9 +244,8 @@ PHONE_ESM:
  PROVIDERS:
    STRAW:
      COMPUTE: True
-      SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support", 
+      SCALES: ["activities"]
-              "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
+      FEATURES: [activities_n_others, activities_inperson, activities_formal]
      FEATURES: [mean]
      SRC_SCRIPT: src/features/phone_esm/straw/main.py
 # See https://www.rapids.science/latest/features/phone-keyboard/
@ -270,7 +262,7 @@ PHONE_LIGHT:
  CONTAINER: light_sensor
  PROVIDERS:
    RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
      SRC_SCRIPT: src/features/phone_light/rapids/main.py
@ -284,7 +276,7 @@ PHONE_LOCATIONS:
  PROVIDERS:
    DORYAB:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["locationvariance","loglocationvariance","totaldistance","avgspeed","varspeed", "numberofsignificantplaces","numberlocationtransitions","radiusgyration","timeattop1location","timeattop2location","timeattop3location","movingtostaticratio","outlierstimepercent","maxlengthstayatclusters","minlengthstayatclusters","avglengthstayatclusters","stdlengthstayatclusters","locationentropy","normalizedlocationentropy","timeathome", "homelabel"]
      DBSCAN_EPS: 100 # meters
      DBSCAN_MINSAMPLES: 5
@ -299,7 +291,7 @@ PHONE_LOCATIONS:
      SRC_SCRIPT: src/features/phone_locations/doryab/main.py
    BARNETT:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
      IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
      MINUTES_DATA_USED: False # Use this for quality control purposes, how many minutes of data (location coordinates grouped by minute) were used to compute features
@ -317,7 +309,7 @@ PHONE_MESSAGES:
  CONTAINER: sms
  PROVIDERS:
    RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
      MESSAGES_TYPES : [received, sent]
      FEATURES: 
        received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
@ -329,7 +321,7 @@ PHONE_SCREEN:
  CONTAINER: screen
  PROVIDERS:
    RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
      REFERENCE_HOUR_FIRST_USE: 0
      IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable
      IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable
@ -342,7 +334,7 @@ PHONE_SPEECH:
  CONTAINER: speech
  PROVIDERS:
    STRAW:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["meanspeech", "stdspeech", "nlargest", "nsmallest", "medianspeech"]
      SRC_SCRIPT: src/features/phone_speech/straw/main.py
@ -360,7 +352,7 @@ PHONE_WIFI_VISIBLE:
  CONTAINER: wifi
  PROVIDERS:
    RAPIDS:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
      SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R
@ -529,10 +521,10 @@ EMPATICA_ACCELEROMETER:
      FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
      SRC_SCRIPT: src/features/empatica_accelerometer/dbdp/main.py
    CR:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["totalMagnitudeBand", "absoluteMeanBand", "varianceBand"] # Acc features
      WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
        WINDOW_LENGTH: 15 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
      SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py
@ -556,11 +548,11 @@ EMPATICA_TEMPERATURE:
      FEATURES: ["maxtemp", "mintemp", "avgtemp", "mediantemp", "modetemp", "stdtemp", "diffmaxmodetemp", "diffminmodetemp", "entropytemp"]
      SRC_SCRIPT: src/features/empatica_temperature/dbdp/main.py
    CR:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ["maximum", "minimum", "meanAbsChange", "longestStrikeAboveMean", "longestStrikeBelowMean", 
                  "stdDev", "median", "meanChange", "sumSquared", "squareSumOfComponent", "sumOfSquareComponents"]
      WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
        WINDOW_LENGTH: 300 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
      SRC_SCRIPT: src/features/empatica_temperature/cr/main.py
@ -574,14 +566,14 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
      FEATURES: ["maxeda", "mineda", "avgeda", "medianeda", "modeeda", "stdeda", "diffmaxmodeeda", "diffminmodeeda", "entropyeda"]
      SRC_SCRIPT: src/features/empatica_electrodermal_activity/dbdp/main.py
    CR:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ['mean', 'std', 'q25', 'q75', 'qd', 'deriv', 'power', 'numPeaks', 'ratePeaks', 'powerPeaks', 'sumPosDeriv', 'propPosDeriv', 'derivTonic', 
                  'sigTonicDifference', 'freqFeats','maxPeakAmplitudeChangeBefore', 'maxPeakAmplitudeChangeAfter', 'avgPeakAmplitudeChangeBefore', 
                  'avgPeakAmplitudeChangeAfter', 'avgPeakChangeRatio', 'maxPeakIncreaseTime', 'maxPeakDecreaseTime', 'maxPeakDuration', 'maxPeakChangeRatio',
                  'avgPeakIncreaseTime', 'avgPeakDecreaseTime', 'avgPeakDuration', 'signalOverallChange', 'changeDuration', 'changeRate', 'significantIncrease', 
                  'significantDecrease']
      WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
        WINDOW_LENGTH: 60 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero]
        IMPUTE_NANS: True
@ -600,7 +592,7 @@ EMPATICA_BLOOD_VOLUME_PULSE:
      FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                  'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features
      WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
        WINDOW_LENGTH: 300 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
      SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py
@ -614,12 +606,12 @@ EMPATICA_INTER_BEAT_INTERVAL:
      FEATURES: ["maxibi", "minibi", "avgibi", "medianibi", "modeibi", "stdibi", "diffmaxmodeibi", "diffminmodeibi", "entropyibi"]
      SRC_SCRIPT: src/features/empatica_inter_beat_interval/dbdp/main.py
    CR:
-      COMPUTE: True
+      COMPUTE: False
      FEATURES: ['meanHr', 'ibi', 'sdnn', 'sdsd', 'rmssd', 'pnn20', 'pnn50', 'sd', 'sd2', 'sd1/sd2', 'numRR', # Time features
                  'VLF', 'LF', 'LFnorm', 'HF', 'HFnorm', 'LF/HF', 'fullIntegral'] # Freq features            
      PATCH_WITH_BVP: True
      WINDOWS:
-        COMPUTE: True
+        COMPUTE: False
        WINDOW_LENGTH: 300 # specify window length in seconds
        SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
      SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py
@ -681,12 +673,12 @@ ALL_CLEANING_INDIVIDUAL:
      DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
    STRAW:
-      COMPUTE: True
+      COMPUTE: False
      PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
      EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
@ -694,7 +686,7 @@ ALL_CLEANING_INDIVIDUAL:
      COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that are entirely (100%) NaN
      COLS_VAR_THRESHOLD: True
      DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      STANDARDIZATION: True
@ -713,12 +705,12 @@ ALL_CLEANING_OVERALL:
      DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
      DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
    STRAW:
-      COMPUTE: True
+      COMPUTE: False
      PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
      PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
      EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
@ -726,7 +718,7 @@ ALL_CLEANING_OVERALL:
      COLS_NAN_THRESHOLD: 0.8 # set to 1 to remove only columns that are entirely (100%) NaN
      COLS_VAR_THRESHOLD: True
      DROP_HIGHLY_CORRELATED_FEATURES:
-        COMPUTE: True
+        COMPUTE: False
        MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
        CORR_THRESHOLD: 0.95
      STANDARDIZATION: True
@ -740,7 +732,7 @@ ALL_CLEANING_OVERALL:
 PARAMS_FOR_ANALYSIS:
  BASELINE:
-    COMPUTE: True
+    COMPUTE: False
    FOLDER: data/external/baseline
    CONTAINER: [results-survey637813_final.csv,  # Slovenia
                results-survey358134_final.csv,  # Belgium 1
@ -751,8 +743,8 @@ PARAMS_FOR_ANALYSIS:
    CATEGORICAL_FEATURES: [gender]
  TARGET:
-    COMPUTE: True
+    COMPUTE: False
    LABEL: appraisal_stressfulness_event_mean
-    ALL_LABELS: [PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean, JCQ_coworker_support_mean, appraisal_stressfulness_period_mean]
+    ALL_LABELS: [appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean]
                # PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean, 
                # JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean
--- a/data/external/play_store_application_genre_catalogue.csv
+++ b/data/external/play_store_application_genre_catalogue.csv
--- a/data/external/play_store_categories_count.csv
+++ b/data/external/play_store_categories_count.csv
@ -1,45 +0,0 @@
 genre,n
 System,261
 Tools,96
 Productivity,71
 Health & Fitness,60
 Finance,54
 Communication,39
 Music & Audio,39
 Shopping,38
 Lifestyle,33
 Education,28
 News & Magazines,24
 Maps & Navigation,23
 Entertainment,21
 Business,18
 Travel & Local,18
 Books & Reference,16
 Social,16
 Weather,16
 Food & Drink,14
 Sports,14
 Other,13
 Photography,13
 Puzzle,13
 Video Players & Editors,12
 Card,9
 Casual,9
 Personalization,8
 Medical,7
 Board,5
 Strategy,4
 House & Home,3
 Trivia,3
 Word,3
 Adventure,2
 Art & Design,2
 Auto & Vehicles,2
 Dating,2
 Role Playing,2
 STRAW,2
 Simulation,2
 "Board,Brain Games",1
 "Entertainment,Music & Video",1
 Parenting,1
 Racing,1
--- a/environment.yml
+++ b/environment.yml
@ -1,30 +1,165 @@
 name: rapids
 channels:
  - conda-forge
  - defaults
 dependencies:
-    - auto-sklearn
+  - _libgcc_mutex=0.1
-    - hmmlearn
+  - _openmp_mutex=4.5
-    - imbalanced-learn
+  - _py-xgboost-mutex=2.0
-    - jsonschema
+  - appdirs=1.4.4
-    - lightgbm
+  - arrow=0.16.0
-    - matplotlib
+  - asn1crypto=1.4.0
-    - numpy
+  - astropy=4.2.1
-    - pandas
+  - attrs=20.3.0
-    - peakutils
+  - binaryornot=0.4.4
-    - pip
+  - blas=1.0
-    - plotly
+  - brotlipy=0.7.0
-    - python-dateutil
+  - bzip2=1.0.8
-    - pytz
+  - ca-certificates=2021.7.5
-    - pywavelets
+  - certifi=2021.5.30
-    - pyyaml
+  - cffi=1.14.4
-    - scikit-learn
+  - chardet=3.0.4
-    - scipy
+  - click=7.1.2
-    - seaborn
+  - colorama=0.4.4
-    - setuptools
+  - cookiecutter=1.6.0
-    - bioconda::snakemake 
+  - cryptography=3.3.1
-    - bioconda::snakemake-minimal
+  - datrie=0.8.2
-    - tqdm
+  - docutils=0.16
-    - xgboost
+  - future=0.18.2
-    - pip:
+  - gitdb=4.0.5
-        - biosppy
+  - gitdb2=4.0.2
-        - cr_features>=0.2
+  - gitpython=3.1.11
  - idna=2.10
  - imbalanced-learn=0.6.2
  - importlib-metadata=2.0.0
  - importlib_metadata=2.0.0
  - intel-openmp=2019.4
  - jinja2=2.11.2
  - jinja2-time=0.2.0
  - joblib=1.0.0
  - jsonschema=3.2.0
  - ld_impl_linux-64=2.36.1
  - libblas=3.8.0
  - libcblas=3.8.0
  - libcxx=10.0.0
  - libcxxabi=10.0.0
  - libedit=3.1.20191231
  - libffi=3.3
  - libgcc-ng=11.2.0
  - libgfortran
  - libgfortran
  - libgfortran
  - liblapack=3.8.0
  - libopenblas=0.3.10
  - libstdcxx-ng=11.2.0
  - libxgboost=0.90
  - libzlib=1.2.11
  - lightgbm=3.1.1
  - llvm-openmp=10.0.0
  - markupsafe=1.1.1
  - mkl
  - mkl-service=2.3.0
  - mkl_fft=1.2.0
  - mkl_random=1.1.1
  - more-itertools=8.6.0
  - ncurses=6.2
  - numpy=1.19.2
  - numpy-base=1.19.2
  - openblas=0.3.4
  - openssl=1.1.1k
  - pandas=1.1.5
  - pbr=5.5.1
  - pip=20.3.3
  - plotly=4.14.1
  - poyo=0.5.0
  - psutil=5.7.2
  - py-xgboost=0.90
  - pycparser=2.20
  - pyerfa=1.7.1.1
  - pyopenssl=20.0.1
  - pysocks=1.7.1
  - python=3.7.9
  - python-dateutil=2.8.1
  - python_abi=3.7
  - pytz=2020.4
  - pyyaml=5.3.1
  - readline=8.0
  - requests=2.25.0
  - retrying=1.3.3
  - setuptools=51.0.0
  - six=1.15.0
  - smmap=3.0.4
  - smmap2=3.0.1
  - sqlite=3.33.0
  - threadpoolctl=2.1.0
  - tk=8.6.10
  - tqdm=4.62.0
  - urllib3=1.25.11
  - wheel=0.36.2
  - whichcraft=0.6.1
  - wrapt=1.12.1
  - xgboost=0.90
  - xz=5.2.5
  - yaml=0.2.5
  - zipp=3.4.0
  - zlib=1.2.11
  - pip:
    - amply==0.1.4
    - auto-sklearn==0.14.7
    - bidict==0.22.0
    - biosppy==0.8.0
    - build==0.8.0
    - cached-property==1.5.2
    - cloudpickle==2.2.0
    - configargparse==0.15.1
    - configspace==0.4.21
    - cr-features==0.2.1
    - cycler==0.11.0
    - cython==0.29.32
    - dask==2022.2.0
    - decorator==4.4.2
    - distributed==2022.2.0
    - distro==1.7.0
    - emcee==3.1.2
    - fonttools==4.33.2
    - fsspec==2022.8.2
    - h5py==3.6.0
    - heapdict==1.0.1
    - hmmlearn==0.2.7
    - ipython-genutils==0.2.0
    - jupyter-core==4.6.3
    - kiwisolver==1.4.2
    - liac-arff==2.5.0
    - locket==1.0.0
    - matplotlib==3.5.1
    - msgpack==1.0.4
    - nbformat==5.0.7
    - opencv-python==4.5.5.64
    - packaging==21.3
    - partd==1.3.0
    - peakutils==1.3.3
    - pep517==0.13.0
    - pillow==9.1.0
    - pulp==2.4
    - pynisher==0.6.4
    - pyparsing==2.4.7
    - pyrfr==0.8.3
    - pyrsistent==0.15.5
    - pywavelets==1.3.0
    - ratelimiter==1.2.0.post0
    - scikit-learn==0.24.2
    - scipy==1.7.3
    - seaborn==0.11.2
    - shortuuid==1.0.8
    - smac==1.2
    - snakemake==5.30.2
    - sortedcontainers==2.4.0
    - tblib==1.7.0
    - tomli==2.0.1
    - toolz==0.12.0
    - toposort==1.5
    - tornado==6.2
    - traitlets==4.3.3
    - typing-extensions==4.2.0
    - zict==2.2.0
 prefix: /opt/conda/envs/rapids
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@ -247,8 +247,6 @@ rule empatica_readable_datetime:
        include_past_periodic_segments = config["TIME_SEGMENTS"]["INCLUDE_PAST_PERIODIC_SEGMENTS"]
    output:
        "data/raw/{pid}/empatica_{sensor}_with_datetime.csv"
    resources:
        mem_mb=50000
    script:
        "../src/data/datetime/readable_datetime.R"
--- a/src/data/application_categories.R
+++ b/src/data/application_categories.R
@ -29,16 +29,23 @@ get_genre <- function(apps){
 apps <- read.csv(snakemake@input[[1]], stringsAsFactors = F)
 genre_catalogue <- data.frame()
 catalogue_source <- snakemake@params[["catalogue_source"]]
 package_names_hashed <- snakemake@params[["package_names_hashed"]]
 update_catalogue_file <- snakemake@params[["update_catalogue_file"]]
 scrape_missing_genres <- snakemake@params[["scrape_missing_genres"]]
 apps_with_genre <- data.frame(matrix(ncol=length(colnames(apps)) + 1,nrow=0, dimnames=list(NULL, c(colnames(apps), "genre"))))
 if (length(package_names_hashed) == 0) {package_names_hashed <- FALSE}
 if(nrow(apps) > 0){
  if(catalogue_source == "GOOGLE"){
    apps_with_genre <- apps %>% mutate(genre = NA_character_)
  } else if(catalogue_source == "FILE"){
    genre_catalogue <- read.csv(snakemake@params[["catalogue_file"]], colClasses = c("character", "character"))
-    apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
+    if (package_names_hashed) {
      apps_with_genre <- left_join(apps, genre_catalogue, by = "package_hash")
    } else {
      apps_with_genre <- left_join(apps, genre_catalogue, by = "package_name")
    }
  }
  if(catalogue_source == "GOOGLE" || (catalogue_source == "FILE" && scrape_missing_genres)){
--- a/src/data/streams/empatica_zip/container.py
+++ b/src/data/streams/empatica_zip/container.py
@ -136,9 +136,8 @@ def patch_ibi_with_bvp(ibi_data, bvp_data):
    # Begin with the cr-features part
    try:
        ibi_data, ibi_start_timestamp = empatica2d_to_array(ibi_data_file)
-    except (IndexError, KeyError) as e:
+    except IndexError as e:
        # Checks whether IBI.csv is empty
        # It may raise a KeyError if df is empty here: startTimeStamp = df.time[0]
        df_test = pd.read_csv(ibi_data_file, names=['timings', 'inter_beat_interval'], header=None)
        if df_test.empty:
            df_test['timestamp'] = df_test['timings']
--- a/src/features/all_cleaning_individual/straw/main.py
+++ b/src/features/all_cleaning_individual/straw/main.py
@ -120,7 +120,7 @@ def straw_cleaning(sensor_data_files, provider):
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]
    if provider["COLS_VAR_THRESHOLD"]:
-        features.drop(features.std(numeric_only=True)[features.std(numeric_only=True) == 0].index.values, axis=1, inplace=True)
+        features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
    fe5 = features.copy()
@ -134,7 +134,7 @@ def straw_cleaning(sensor_data_files, provider):
        valid_features = features[numerical_cols].loc[:, features[numerical_cols].isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * features[numerical_cols].shape[0]]
        corr_matrix = valid_features.corr().abs()
-        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]
        features.drop(to_drop, axis=1, inplace=True)
@ -150,14 +150,12 @@ def straw_cleaning(sensor_data_files, provider):
    return features
 def k_nearest(df):
    pd.set_option('display.max_columns', None)
    imputer = KNNImputer(n_neighbors=3)
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
 def impute(df, method='zero'):
    def k_nearest(df):
        pd.set_option('display.max_columns', None)
        imputer = KNNImputer(n_neighbors=3)
        return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    return {
        'zero': df.fillna(0),
@ -167,7 +165,6 @@ def impute(df, method='zero'):
        'knn': k_nearest(df) 
    }[method]
 def graph_bf_af(features, phase_name, plt_flag=False):
    if plt_flag:
        sns.set(rc={"figure.figsize":(16, 8)})
--- a/src/features/all_cleaning_overall/straw/main.py
+++ b/src/features/all_cleaning_overall/straw/main.py
@ -146,7 +146,7 @@ def straw_cleaning(sensor_data_files, provider, target):
    # (5) REMOVE COLS WHERE VARIANCE IS 0
    if provider["COLS_VAR_THRESHOLD"]:
-        features.drop(features.std(numeric_only=True)[features.std(numeric_only=True) == 0].index.values, axis=1, inplace=True)
+        features.drop(features.std()[features.std() == 0].index.values, axis=1, inplace=True)
    graph_bf_af(features, "6variance_drop")
@ -200,7 +200,7 @@ def straw_cleaning(sensor_data_files, provider, target):
        valid_features = features[numerical_cols].loc[:, features[numerical_cols].isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * features[numerical_cols].shape[0]]
        corr_matrix = valid_features.corr().abs()
-        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        to_drop = [column for column in upper.columns if any(upper[column] > drop_corr_features["CORR_THRESHOLD"])]
        # sns.heatmap(corr_matrix, cmap="YlGnBu")
@ -245,13 +245,11 @@ def straw_cleaning(sensor_data_files, provider, target):
    return features
 def k_nearest(df):
    imputer = KNNImputer(n_neighbors=3)
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
 def impute(df, method='zero'):
    def k_nearest(df):
        imputer = KNNImputer(n_neighbors=3)
        return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    return {
        'zero': df.fillna(0),
@ -261,7 +259,6 @@ def impute(df, method='zero'):
        'knn': k_nearest(df) 
    }[method]
 def graph_bf_af(features, phase_name, plt_flag=False):
    if plt_flag:
        sns.set(rc={"figure.figsize":(16, 8)})
--- a/src/features/cr_features_helper_methods.py
+++ b/src/features/cr_features_helper_methods.py
@ -15,13 +15,13 @@ def extract_second_order_features(intraday_features, so_features_names, prefix="
        so_features = pd.DataFrame()
        #print(intraday_features.drop("level_1", axis=1).groupby(["local_segment"]).nsmallest())
        if "mean" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean(numeric_only=True).add_suffix("_SO_mean")], axis=1)
+            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).mean().add_suffix("_SO_mean")], axis=1)
        if "median" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median(numeric_only=True).add_suffix("_SO_median")], axis=1)
+            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1)
        if "sd" in so_features_names:
-            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std(numeric_only=True).fillna(0).add_suffix("_SO_sd")], axis=1)
+            so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().fillna(0).add_suffix("_SO_sd")], axis=1)
        if "nlargest" in so_features_names: # largest 5 -- maybe there is a faster groupby solution?
            for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
--- a/src/features/empatica_data_yield.py
+++ b/src/features/empatica_data_yield.py
@ -26,7 +26,7 @@ def calculate_empatica_data_yield(features): # TODO
    # Assigns 1 to values that are over 1 (in case of windows not being filled fully)
    features[empatica_data_yield_cols] = features[empatica_data_yield_cols].apply(lambda x: [y if y <= 1 or np.isnan(y) else 1 for y in x])
-    features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1, numeric_only=True).fillna(0)
+    features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1).fillna(0)
    features.drop(empatica_data_yield_cols, axis=1, inplace=True) # In case of if the advanced operations will later not be needed (e.g., weighted average)
    return features
--- a/src/features/phone_esm/straw/esm_activities.py
+++ b/src/features/phone_esm/straw/esm_activities.py
@ -0,0 +1,292 @@
 import pandas as pd
 import numpy as np
 # Instruction text of every LTM (last 10 minutes) question, keyed by question id.
 # Each list holds the English, Dutch and Slovenian wording of the same question.
 id2qc = {  44:["What have you mainly been doing within the last 10 minutes?",
                            "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
                            "Kaj ste v glavnem počeli v zadnjih 10 minutah?"],
                        45:["What type of individual work?",
                            "Wat voor soort individueel werk?",
                            "Kakšno vrsto samostojnega dela ste opravljali?"],
                        46:["How did you work with others?",
                            "Hoe heb je met anderen gewerkt?",
                            "Kako ste sodelovali z drugimi?"],
                        47:["What type of break?",
                            "Wat voor soort pauze?",
                            "Kakšno vrsto odmora ste imeli?"],
                        48:["Where did you travel between?",
                            "Waar heb je tussen gereisd?",
                            "Kam ste potovali?"],
                        49:["Did you use a computer or phone for that?",
                            "Heb je daarvoor een computer of telefoon gebruikt?",
                            "Ste za to uporabljali računalnik ali telefon?"],
                        50:["What kind of an interaction was that?",
                            "Wat voor interactie was dat?",
                            "Kakšne vrste sodelovanja je bilo to?"],
                        51:["How many people were involved besides yourself?",
                            "Hoeveel mensen waren er behalve jezelf betrokken?",
                            "Koliko oseb je bilo poleg vas še vpletenih?"],
                        # 52:["What have you mainly been doing within the last 10 minutes?",
                        #     "Waar ben je voornamelijk mee bezig geweest de laatste 10 minuten?",
                        #     "Kaj ste v glavnem počeli v zadnjih 10 minutah?"]
 }
 # Reverse lookup: instruction text (in any of the languages above) -> question id.
 qc2id = {v:k for k,values in id2qc.items() for v in values}
 # Follow-up question ids for every question id; an empty list ends the chain.
 # NOTE(review): presumably the list is indexed by the numeric answer to the key
 # question -- confirm, nothing in the visible code reads this table.
 next_questions = {  44: [45,46,47,48],
                    45:[49,49],
                    46:[50,50],
                    47:[],
                    48:[],
                    49:[],
                    50:[51,51],
                    51:[]
                    #52:[45,46,47,48],
                 }
 def esm_activities_LTM_features(
    df_esm_activities_cleaned: pd.DataFrame,
 ) -> pd.DataFrame:
    """ Function for calculating LTM (last 10 minutes) features of questionnaire answers.

    Question ids are first corrected from esm_instructions by correct_activity_qids().
    The rows are then split into question chains, each one starting at a main question
    (corrected id 44) and running up to the next main question, and the numeric answers
    of every chain are handed to process_answers() to find the social properties.

    Parameters
    ----------
    df_esm_activities_cleaned: pd.DataFrame
        A cleaned up dataframe, which must include esm_instructions and
        esm_user_answer_numeric. Rows are expected in answering order.
        The frame itself is left unmodified.

    Returns
    -------
    pd.DataFrame
        A copy of the input with added columns:
            ["correct_ids"] - Corrected question_ids
            ["q_group"] - Zero-based number of the question chain a row belongs to
                          (-1 for rows that come before the first main question).
            ["n_others","inperson","formal"] - Properties of known potential social encounters
                          as given by process_answers(), joined on the index label of the
                          main question that starts each chain.
    """
    # Work on a copy: correct_activity_qids() adds its column in place and the caller's
    # frame (often a slice of a bigger frame) should not grow helper columns.
    correct_id_df = correct_activity_qids(df_esm_activities_cleaned.copy())
    is_main_q = correct_id_df["correct_ids"] == 44
    main_q_indices = correct_id_df.index[is_main_q]
    # Every main question opens a new chain, so a running count of them numbers the chains.
    correct_id_df["q_group"] = is_main_q.cumsum() - 1
    # Rows before the first main question belong to no chain; leaving them in would
    # create one group more than there are main questions.
    chained = correct_id_df[correct_id_df["q_group"] >= 0]
    ans_seq = chained.groupby("q_group")["esm_user_answer_numeric"].agg(list).to_frame("ans_seq")
    ans_seq.index = main_q_indices
    # find types of status for each main question
    processed_ans_df = process_answers(ans_seq)
    return correct_id_df.join(processed_ans_df)
 """ 
 possible answer sequences for LTM question chains
 #alone
 0,0,0 not social
 0,0,1 not social
 0,1,0 not social
 0,1,1 not social
 0,2 not social
 0,3 not social
 0,4 not social
 0,5 not social
 0,6 not social
 #w/ others
 1,0,0,0 1 irl
 1,0,0,1 2 irl
 1,0,0,2 3+ irl
 1,0,1,0 1 irl
 1,0,1,1 2 irl
 1,0,1,2 3+ irl
 1,1,0,0 1 online 
 1,1,0,1 2 online 
 1,1,0,2 3+ online 
 1,1,1,0 1 online 
 1,1,1,1 2 online 
 1,1,1,2 3+ online 
 1,2 positive likely to be more than 2
 1,3 positive
 #break
 2,0 ambiguous
 2,1 positive irl
 2,2 ambiguous
 2,3 ambiguous
 #transit
 3,0 ambiguous
 3,1 ambiguous
 3,2 ambiguous
 """
 def process_answers(df:pd.DataFrame)-> pd.DataFrame:
    """ Function to process answer sequences for LTM question chains. It checks the chain of subquestion answers and extracts the following attributes:
        > n_others: Number of other people interacted with in the last 10 minutes
            - -1: Number is positive but unknown exactly
            - 0: No people/alone
            - 1: One extra person
            - 2: Two extra people
            - 3: More than two extra people
            - NaN : Can't say anything with enough certainty.
        > inperson: 
            - True/False: The interaction in question was/wasn't in person.
            - None: Can't say anything with enough certainty.
        > formal: 
            - True/False: The interaction in question was/wasn't formal.
            - None: Can't say anything with enough certainty.

    A chain that is shorter than its branch requires (e.g. an unfinished questionnaire)
    does not raise; the attributes that depend on the missing answers stay unknown.

    Args:
        df (pd.DataFrame): Frame with an "ans_seq" column holding, per row, the sequence
            of numeric answers of one question chain (main question first).

    Returns:
        pd.DataFrame: Columns "n_others", "inperson" and "formal", one row per input row,
            on the same index as df.
    """    
    properties = {"n_others":[],
                  "inperson":[],
                  "formal":[]}
    for ans_seq in df["ans_seq"]:
        # Pad to four answers so that a missing answer reads as None instead of raising IndexError.
        first, second, third, fourth = (list(ans_seq) + [None]*4)[:4]
        n_other = None
        inperson = None
        formal = None
        if first == 0:
            n_other = 0
        elif first == 1:
            if second == 3:
                n_other = -1    # answered "Other" but did work with other people
            elif second == 2:
                n_other = 3 #assuming more than 2 people participated in the lecture or presentation
            elif second in [0,1]:
                inperson = second == 0 #0 means irl interaction, 1 means online or phone
                if third is not None:
                    formal = third == 0 #0 means formal
                if fourth is not None:
                    n_other = fourth + 1 #the answer is on [0,2] so we add 1 to make it [1,3]
        elif first == 2:
            formal = False #assuming one does not have a formal meeting during break time
            if second == 1:
                n_other = -1
                inperson = True
            #if not 1 then we dont know anything for sure
        #first == 3: we cant say whether the person was carpooling or driving alone, everything stays unknown.
        properties["n_others"].append(n_other)
        properties["inperson"].append(inperson)
        properties["formal"].append(formal)
    return pd.DataFrame(properties,index=df.index)
 def correct_activity_qids(df:pd.DataFrame)->pd.DataFrame:
    """Add the corrected question id of every row, looked up from its instruction text.

    Args:
        df (pd.DataFrame): Questionnaire answers with an "esm_instructions" column.
            Every instruction must be a key of qc2id; an unknown text raises KeyError.

    Returns:
        pd.DataFrame: Input dataframe with added column "correct_ids". The column is
            written in place, so the returned object is the frame that was passed in.
    """
    instructions = df["esm_instructions"]
    df["correct_ids"] = instructions.apply(qc2id.__getitem__)
    return df
 def process_answers_aggregation(df:pd.DataFrame)-> pd.DataFrame:
    """ Function to process the answer sequence of one LTM question chain. It is meant to be passed to
        groupby(...).apply(), so it receives the rows of a single group. It checks the chain of
        subquestion answers and extracts the following attributes:
        > n_others: Number of other people interacted with in the last 10 minutes
            - -1: Number is positive but unknown exactly
            - 0: No people/alone
            - 1: One extra person
            - 2: Two extra people
            - 3: More than two extra people
            - None : Can't say anything with enough certainty.
        > inperson: 
            - True/False: The interaction in question was/wasn't in person.
            - None: Can't say anything with enough certainty.
        > formal: 
            - True/False: The interaction in question was/wasn't formal.
            - None: Can't say anything with enough certainty.

    A chain that is shorter than its branch requires does not raise; the attributes that
    depend on the missing answers stay unknown.

    Args:
        df (pd.DataFrame): Rows of one question chain in answering order (main question
            first), with an "esm_user_answer_numeric" column.

    Returns:
        pd.DataFrame: A copy of df with the columns "n_others", "inperson" and "formal"
            added; the same value is repeated on every row of the chain.
    """    
    # Pad to four answers so that a missing answer reads as None instead of raising IndexError.
    answers = df["esm_user_answer_numeric"].tolist()
    first, second, third, fourth = (answers + [None]*4)[:4]
    n_other = None
    inperson = None
    formal = None
    if first == 0:
        n_other = 0
    elif first == 1:
        if second == 3:
            n_other = -1    # answered "Other" but did work with other people
        elif second == 2:
            n_other = 3 #assuming more than 2 people participated in the lecture or presentation
        elif second in [0,1]:
            inperson = second == 0 #0 means irl interaction, 1 means online or phone
            if third is not None:
                formal = third == 0 #0 means formal
            if fourth is not None:
                n_other = fourth + 1 #the answer is on [0,2] so we add 1 to make it [1,3]
    elif first == 2:
        formal = False #assuming one does not have a formal meeting during break time
        if second == 1:
            n_other = -1
            inperson = True
        #if not 1 then we dont know anything for sure
    #first == 3: we cant say whether the person was carpooling or driving alone, everything stays unknown.
    # Scalars are broadcast over all rows of the chain; wrapping them in one-element
    # lists on the group's index fails as soon as the chain has more than one row.
    return df.assign(n_others=n_other, inperson=inperson, formal=formal)
 #test stuff
 def test():
    """Manually run both LTM implementations on one raw ESM file and line up their results.

    Reads data/raw/p031/phone_esm_with_datetime.csv, keeps questionnaire 97 and computes the
    social properties with esm_activities_LTM_features() and with process_answers_aggregation()
    applied per pseudo segment. The comparison frame is built but neither returned nor asserted
    on, so this only shows that the two code paths run.
    """
    from esm_preprocess import preprocess_esm,clean_up_esm
    esm = pd.read_csv("data/raw/p031/phone_esm_with_datetime.csv")
    esm = clean_up_esm(preprocess_esm(esm))
    esm = esm[esm["questionnaire_id"]==97]
    reference = esm_activities_LTM_features(esm)
    # one pseudo segment per (session, device) pair
    esm["local_segment"] = [str(session)+":"+device for session,device in esm[["esm_session","device_id"]].values]
    per_segment = esm.groupby("local_segment").apply(process_answers_aggregation)
    #compare with original function results
    is_main_q = reference["correct_ids"]==44
    expected = reference.loc[is_main_q, ["n_others",  "inperson", "formal"]]
    per_segment.loc[expected.index].compare(expected,keep_shape=True,keep_equal =True)
    #print out ans_seq processing results
    # import json
    # i = 0
    # for j,ans in correct_id_df[["esm_json","esm_user_answer"]].values:
    #     obj = json.loads(j)
    #     text = obj["esm_instructions"]
    #     if ("10 minut" in text):
    #         print("---\n",test.ans_seq.iloc[i])
    #         print(test[["n_others","inperson","formal"]].values[i])
    #         i = i+1
    #     print(text,ans)
 #test()
--- a/src/features/phone_esm/straw/main.py
+++ b/src/features/phone_esm/straw/main.py
@ -1,4 +1,8 @@
 import pandas as pd
 import sys
 import warnings
 sys.path.append('src/features/phone_esm/straw')   
 from esm_activities import esm_activities_LTM_features,process_answers_aggregation
 QUESTIONNAIRE_IDS = {
    "sleep_quality": 1,
@ -39,24 +43,50 @@ QUESTIONNAIRE_IDS = {
 def straw_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    esm_data = pd.read_csv(sensor_data_files["sensor_data"])
    requested_features = provider["FEATURES"]
    # name of the features this function can compute
    requested_scales = provider["SCALES"]
    base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support", 
-                            "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
+                            "appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge","activities_n_others","activities_inperson","activities_formal"]
    #TODO Check valid questionnaire and feature names.
    # the subset of requested features this function can compute
    features_to_compute = list(set(requested_features) & set(base_features_names))
    esm_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
    if not esm_data.empty:
        esm_data = filter_data_by_segment(esm_data, time_segment)
        if not esm_data.empty:
            esm_features = pd.DataFrame()
            for scale in requested_scales:
                questionnaire_id = QUESTIONNAIRE_IDS[scale]
                mask = esm_data["questionnaire_id"] == questionnaire_id
-                esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
+                #print(esm_data.loc[mask].head())
                #print(time_segment)
                if not mask.any():
                    temp = sensor_data_files["sensor_data"]
                    warnings.warn(f"Warning........... No relevant questions for scale {scale} in {temp}-{time_segment}",RuntimeWarning) 
                    continue
                #TODO: calculation of LTM features
                if scale=="activities":
                    requested_subset = [req for req in requested_features if req.startswith("activities")]
                    if not bool(requested_subset):
                        continue
                    # ltm_features = esm_activities_LTM_features(esm_data.loc[mask])
                    # print(esm_data["esm_json"].values)
                    # print(mask)
                    # print(esm_data.loc[mask])
                    # #ltm_features = ltm_features[ltm_features["correct_ids"==44]]
                    #print(esm_data.loc[mask]["local_segment"])
                    ltm_features = esm_data.loc[mask].groupby(["local_segment"]).apply(process_answers_aggregation)
                    #print("PRINTING ltm_features:\n",ltm_features)
                    ltm_features.rename(columns={"n_others":"activities_n_others","inperson":"activities_inperson","formal":"activities_formal"},inplace=True)
                    esm_features[requested_subset] = ltm_features.groupby("local_segment").first()[requested_subset]
                    #print(esm_features.columns)
                    #print("PRINTING esm_features after rename:\n",ltm_features)
                    #FIXME: it might be an issue that im calculating for whole time segment and not grouping by "local segment"
                    #print("~~~~~~~~~~~~~~~~~~~~~~~~===============================~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n     LTM FEATURES STORED...     AFTER RETURN:\n",ltm_features,esm_features[["activities_"+req for req in requested_subset]])
                if("mean" in features_to_compute):
                    esm_features[scale + "_mean"] = esm_data.loc[mask].groupby(["local_segment"])["esm_user_score"].mean()
                #TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
            esm_features = esm_features.reset_index()
@ -64,3 +94,15 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
                esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
    return esm_features
 def test_main():
    """Manually run straw_features() on one cleaned ESM file and print the resulting features.

    Uses the stand-in filter_data_by_segment() from temp_help, a fixed participant file and a
    fixed time segment label.
    """
    import temp_help
    requested = ["mean","activities_n_others","activities_inperson","activities_formal"]
    provider = {"FEATURES":requested, "SCALES":['activities']}
    sensor_data_files = {"sensor_data":"data/interim/p069/phone_esm_clean.csv"}
    segment = "straw_event_stress_event_p069_110"
    s_feat = straw_features(sensor_data_files,segment,provider,temp_help.filter_data_by_segment)
    print(s_feat)
 #test_main()
--- a/src/features/phone_esm/straw/process_user_event_related_segments.py
+++ b/src/features/phone_esm/straw/process_user_event_related_segments.py
@ -67,7 +67,7 @@ def extract_ers(esm_df):
    segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]
-    if segmenting_method in ["30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
+    if segmenting_method in ["10_before", "30_before", "90_before"]: # takes 30-minute peroid before the questionnaire + the duration of the questionnaire
        """ '30-minutes and 90-minutes before' have the same fundamental logic with couple of deviations that will be explained below.
        Both take x-minute period before the questionnaire that is summed with the questionnaire duration.
        All questionnaire durations over 15 minutes are excluded from the querying.
@ -79,7 +79,18 @@ def extract_ers(esm_df):
        extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire anwsering is 15 min 
        extracted_ers["shift_direction"] = -1 
-        if segmenting_method == "30_before":
+        if segmenting_method == "10_before":
            """The method 10-minutes before simply takes 10 minutes before the questionnaire and sums it with the questionnaire duration.
            The timestamps are formatted with the help of format_timestamp() method.
            """
            time_before_questionnaire = 10 * 60 # in seconds (10 minutes)
            #TODO: split into small segments with manipulating lenght and shift
            extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(lambda x: format_timestamp(x))
            extracted_ers["shift"] = time_before_questionnaire
            extracted_ers["shift"] = extracted_ers["shift"].apply(lambda x: format_timestamp(x))
        elif segmenting_method == "30_before":
            """The method 30-minutes before simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
            The timestamps are formatted with the help of format_timestamp() method.
            """
@ -140,8 +151,8 @@ def extract_ers(esm_df):
        # Extracted 3 targets that will be transfered in the csv file to the cleaning script. 
        se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(['device_id', 'esm_session'])['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
-        se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean(numeric_only=True)['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
+        se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
-        se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean(numeric_only=True)['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})
+        se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(["device_id", "esm_session"]).mean()['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})
        # All relevant features are joined by inner join to remove standalone columns (e.g., stressfulness event target has larger count)
        extracted_ers = extracted_ers.join(session_start_timestamp, on=['device_id', 'esm_session'], how='inner') \
--- a/src/features/phone_esm/straw/temp_help.py
+++ b/src/features/phone_esm/straw/temp_help.py
@ -0,0 +1,70 @@
 """This file is TEMPORARY and intended for testing main.py
 """
 def filter_data_by_segment(data, time_segment):
    """Keep the rows assigned to time_segment and split out their segment columns.

    Each "assigned_segments" value is searched for an entry of the form
    [<time_segment>#<start datetime>,<end datetime>;<start timestamp>,<end timestamp>]
    with 13-digit timestamps. Rows without such an entry are dropped.

    Args:
        data (pd.DataFrame): Rows with an "assigned_segments" column. Rows with a missing
            value are dropped from this frame in place and a "local_segment" column is
            written to it before the filtered result is built.
        time_segment (str): Segment label to keep. It is inserted into the regular
            expression as is, so regex metacharacters in it are interpreted.

    Returns:
        pd.DataFrame: The matching rows with "local_segment"
            (<label>#<start datetime>,<end datetime>) and "timestamps_segment"
            (<start timestamp>,<end timestamp>), without "assigned_segments". If the rows
            carry "start_timestamp" and "end_timestamp", the result of chunk_episodes()
            on them is returned instead.
    """
    data.dropna(subset=["assigned_segments"], inplace=True)
    if data.empty: # nothing is assigned to any segment
        data["local_segment"] = data["timestamps_segment"] = None
        return data
    # Raw strings: "\-", "\/" and "\[" are invalid escape sequences in a normal literal.
    datetime_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}"
    timestamps_regex = r"[0-9]{13}"
    segment_regex = r"\[({}#{},{};{},{})\]".format(time_segment, datetime_regex, datetime_regex, timestamps_regex, timestamps_regex)
    data["local_segment"] = data["assigned_segments"].str.extract(segment_regex, expand=True)
    data = data.drop(columns=["assigned_segments"])
    data = data.dropna(subset = ["local_segment"])
    if data.empty: # there are no rows belonging to time_segment after dropping na
        data["timestamps_segment"] = None
    else:
        # "<label>#<start>,<end>;<ts start>,<ts end>" -> the part before ";" and the part after it
        data[["local_segment","timestamps_segment"]] = data["local_segment"].str.split(pat =";",n=1, expand=True)
    # chunk episodes
    if (not data.empty) and ("start_timestamp" in data.columns) and ("end_timestamp" in data.columns):
        data = chunk_episodes(data)
    return data
 def chunk_episodes(sensor_episodes):
    """Clip episodes to their time segment, merge them and add local start/end datetimes.

    Every episode is cut to the part that overlaps its segment (the bounds are read from
    "timestamps_segment"), its duration is computed in minutes, and episodes that agree on
    all remaining columns are merged into one row by summing their durations.

    Args:
        sensor_episodes (pd.DataFrame): Episodes with "start_timestamp", "end_timestamp",
            "local_segment", "timestamps_segment" and "local_timezone" columns, plus the
            local date/time columns that are deleted below. Timestamps are treated as
            milliseconds.

    Returns:
        pd.DataFrame: One row per merged episode with "duration", "start_timestamp",
            "end_timestamp", "local_start_date_time" and "local_end_date_time", next to
            the columns that were used for grouping.
    """
    # NOTE(review): copy is imported but not used anywhere in this function.
    import copy
    import pandas as pd
    # Deduplicate episodes
    # Drop rows where segments of start_timestamp and end_timestamp are the same
    sensor_episodes = sensor_episodes.drop_duplicates(subset=["start_timestamp", "end_timestamp", "local_segment"], keep="first")
    # Delete useless columns
    # NOTE(review): assumes all five columns are present -- del raises KeyError otherwise.
    for drop_col in ["local_date_time", "local_date", "local_time", "local_hour", "local_minute"]:
        del sensor_episodes[drop_col]
    # Avoid SettingWithCopyWarning
    sensor_episodes = sensor_episodes.copy()
    # Unix timestamp for current segment in milliseconds
    sensor_episodes[["segment_start_timestamp", "segment_end_timestamp"]] = sensor_episodes["timestamps_segment"].str.split(",", expand=True).astype(int)
    # Compute chunked timestamp
    # Latest of the two starts and earliest of the two ends, i.e. the overlap of episode and segment
    sensor_episodes["chunked_start_timestamp"] = sensor_episodes[["start_timestamp", "segment_start_timestamp"]].max(axis=1)
    sensor_episodes["chunked_end_timestamp"] = sensor_episodes[["end_timestamp", "segment_end_timestamp"]].min(axis=1)
    # Compute duration: intersection of current row and segment
    # Milliseconds -> minutes
    sensor_episodes["duration"] = (sensor_episodes["chunked_end_timestamp"] - sensor_episodes["chunked_start_timestamp"]) / (1000 * 60)
    # Merge episodes
    # Group on every column that is not a timestamp/duration helper
    cols_for_groupby = [col for col in sensor_episodes.columns if col not in ["timestamps_segment", "timestamp", "assigned_segments", "start_datetime", "end_datetime", "start_timestamp", "end_timestamp", "duration", "chunked_start_timestamp", "chunked_end_timestamp"]]
    sensor_episodes_grouped = sensor_episodes.groupby(by=cols_for_groupby, sort=False, dropna=False)
    merged_sensor_episodes = sensor_episodes_grouped[["duration"]].sum()
    # The merged episode starts at the first row's chunked start and ends at the last row's chunked end
    merged_sensor_episodes["start_timestamp"] = sensor_episodes_grouped["chunked_start_timestamp"].first()
    merged_sensor_episodes["end_timestamp"] = sensor_episodes_grouped["chunked_end_timestamp"].last()
    merged_sensor_episodes.reset_index(inplace=True)
    # Compute datetime
    # Parse as UTC, convert each timezone group to its own zone, then drop the tz info and the microseconds
    merged_sensor_episodes["local_start_date_time"] = pd.to_datetime(merged_sensor_episodes["start_timestamp"], unit="ms", utc=True)
    merged_sensor_episodes["local_start_date_time"] = pd.concat([data["local_start_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
    merged_sensor_episodes["local_end_date_time"] = pd.to_datetime(merged_sensor_episodes["end_timestamp"], unit="ms", utc=True)
    merged_sensor_episodes["local_end_date_time"] = pd.concat([data["local_end_date_time"].dt.tz_convert(tz) for tz, data in merged_sensor_episodes.groupby("local_timezone")]).apply(lambda x: x.tz_localize(None).replace(microsecond=0))
    return merged_sensor_episodes
--- a/src/features/phone_locations/doryab/add_doryab_extra_columns.py
+++ b/src/features/phone_locations/doryab/add_doryab_extra_columns.py
@ -115,7 +115,7 @@ cluster_on = provider["CLUSTER_ON"]
 strategy = provider["INFER_HOME_LOCATION_STRATEGY"]
 days_threshold = provider["MINIMUM_DAYS_TO_DETECT_HOME_CHANGES"]
-if not location_data.timestamp.is_monotonic_increasing:
+if not location_data.timestamp.is_monotonic:
    location_data.sort_values(by=["timestamp"], inplace=True)
 location_data["duration_in_seconds"] = -1 * location_data.timestamp.diff(-1) / 1000
--- a/src/features/utils/utils.py
+++ b/src/features/utils/utils.py
@ -14,6 +14,7 @@ def import_path(path):
    sys.modules[module_name] = module
    return module
 #TODO:check why segments change to int 
 def filter_data_by_segment(data, time_segment):
    data.dropna(subset=["assigned_segments"], inplace=True)
    if(data.shape[0] == 0): # data is empty
@ -151,6 +152,7 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
    else: 
        segment_colums = pd.DataFrame()
        print(sensor_features,sensor_features['local_segment'])
        sensor_features['local_segment'] = sensor_features['local_segment'].str.replace(r'_RR\d+SS', '')
        split_segemnt_columns = sensor_features["local_segment"].str.split(pat="(.*)#(.*),(.*)", expand=True)
        new_segment_columns = split_segemnt_columns.iloc[:,1:4] if split_segemnt_columns.shape[1] == 5 else pd.DataFrame(columns=["local_segment_label", "local_segment_start_datetime","local_segment_end_datetime"])
Author	SHA1	Message	Date
Marcel Martinšek	fb8868b77d	removed cleaning to make it run	2023-03-31 13:27:01 +00:00
Marcel Martinšek	da77f7476c	Merge branch 'sociality-task' of https://repo.ijs.si/junoslukan/rapids into sociality-task	2023-03-31 13:08:19 +00:00
Marcel Martinšek	4db8810d08	corrected esm_features index column	2023-03-31 13:08:15 +00:00
junos	7832d7d098	Update R packages.	2023-03-30 20:33:35 +02:00
Marcel Martinšek	e7bb9d6702	not working temp	2023-03-30 11:54:51 +00:00
Marcel Martinšek	689f677a3e	updated r package dependencies to make it run	2023-03-29 13:09:26 +00:00