Merge branch 'master' of https://repo.ijs.si/junoslukan/rapids
commit
8da7bd71b2
130
Snakefile
130
Snakefile
|
@ -33,12 +33,6 @@ for provider in config["PHONE_DATA_YIELD"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_data_yield.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_DATA_YIELD"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_data_yield.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
|
||||
if config["PHONE_MESSAGES"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -48,12 +42,6 @@ for provider in config["PHONE_MESSAGES"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_messages.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_MESSAGES"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_messages.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
|
||||
if config["PHONE_CALLS"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -68,12 +56,6 @@ for provider in config["PHONE_CALLS"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_calls.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_CALLS"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_calls.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys():
|
||||
if config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -83,12 +65,6 @@ for provider in config["PHONE_BLUETOOTH"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_bluetooth.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_BLUETOOTH"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_bluetooth.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys():
|
||||
if config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -101,12 +77,6 @@ for provider in config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_activity_recognition.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_activity_recognition.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
|
||||
if config["PHONE_BATTERY"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -118,12 +88,6 @@ for provider in config["PHONE_BATTERY"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_battery.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_BATTERY"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_battery.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
|
||||
if config["PHONE_SCREEN"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -140,12 +104,6 @@ for provider in config["PHONE_SCREEN"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_screen.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_SCREEN"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_screen.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys():
|
||||
if config["PHONE_LIGHT"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -155,12 +113,6 @@ for provider in config["PHONE_LIGHT"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_light.csv", pid=config["PIDS"],))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_LIGHT"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_light.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_ACCELEROMETER"]["PROVIDERS"].keys():
|
||||
if config["PHONE_ACCELEROMETER"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -184,12 +136,6 @@ for provider in config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_applications_foreground.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_APPLICATIONS_FOREGROUND"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_applications_foreground.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys():
|
||||
if config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -199,12 +145,6 @@ for provider in config["PHONE_WIFI_VISIBLE"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_wifi_visible.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_WIFI_VISIBLE"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_wifi_visible.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["PHONE_WIFI_CONNECTED"]["PROVIDERS"].keys():
|
||||
if config["PHONE_WIFI_CONNECTED"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -233,12 +173,6 @@ for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_esm.csv", pid=config["PIDS"]))
|
||||
# files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
|
||||
# files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_ESM"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_esm.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
# We can delete these if's as soon as we add feature PROVIDERS to any of these sensors
|
||||
if isinstance(config["PHONE_APPLICATIONS_CRASHES"]["PROVIDERS"], dict):
|
||||
|
@ -304,12 +238,6 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["LIST"] and config["STANDARDIZATION"]["PROVIDERS"]["OTHER"]["COMPUTE"] \
|
||||
and config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_phone_locations.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"].keys():
|
||||
if config["FITBIT_CALORIES_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -400,13 +328,6 @@ for provider in config["EMPATICA_ACCELEROMETER"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_accelerometer.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
|
||||
and config["EMPATICA_ACCELEROMETER"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/interim/{pid}/empatica_accelerometer_features/z_empatica_accelerometer_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_accelerometer.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["EMPATICA_HEARTRATE"]["PROVIDERS"].keys():
|
||||
if config["EMPATICA_HEARTRATE"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -426,13 +347,6 @@ for provider in config["EMPATICA_TEMPERATURE"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_temperature.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
|
||||
and config["EMPATICA_TEMPERATURE"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/interim/{pid}/empatica_temperature_features/z_empatica_temperature_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_temperature.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"].keys():
|
||||
if config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -442,13 +356,6 @@ for provider in config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_electrodermal_activity.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
|
||||
and config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/interim/{pid}/empatica_electrodermal_activity_features/z_empatica_electrodermal_activity_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_electrodermal_activity.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
|
||||
if config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -458,13 +365,6 @@ for provider in config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_blood_volume_pulse.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
|
||||
and config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/interim/{pid}/empatica_blood_volume_pulse_features/z_empatica_blood_volume_pulse_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_blood_volume_pulse.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
|
||||
if config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
|
@ -474,13 +374,6 @@ for provider in config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/empatica_inter_beat_interval.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
if provider in config["STANDARDIZATION"]["PROVIDERS"] and config["STANDARDIZATION"]["PROVIDERS"][provider]["COMPUTE"] \
|
||||
and config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"][provider]["WINDOWS"]["STANDARDIZE_FEATURES"]:
|
||||
files_to_compute.extend(expand("data/interim/{pid}/empatica_inter_beat_interval_features/z_empatica_inter_beat_interval_{language}_{provider_key}_windows.csv", pid=config["PIDS"], language=get_script_language(config["STANDARDIZATION"]["PROVIDERS"][provider]["SRC_SCRIPT"]), provider_key=provider.lower()))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_empatica_inter_beat_interval.csv", pid=config["PIDS"]))
|
||||
if config["STANDARDIZATION"]["MERGE_ALL"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/z_all_sensor_features.csv")
|
||||
|
||||
if isinstance(config["EMPATICA_TAGS"]["PROVIDERS"], dict):
|
||||
for provider in config["EMPATICA_TAGS"]["PROVIDERS"].keys():
|
||||
|
@ -517,24 +410,16 @@ for provider in config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"].keys():
|
|||
if config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
if provider == "STRAW":
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() + "_py.csv", pid=config["PIDS"]))
|
||||
if config["ALL_CLEANING_INDIVIDUAL"]["CLEAN_STANDARDIZED"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features_cleaned_" + provider.lower() + "_py.csv", pid=config["PIDS"]))
|
||||
else:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features_cleaned_" + provider.lower() + "_R.csv", pid=config["PIDS"]))
|
||||
if config["ALL_CLEANING_INDIVIDUAL"]["CLEAN_STANDARDIZED"]:
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/z_all_sensor_features_cleaned_" + provider.lower() + "_R.csv", pid=config["PIDS"]))
|
||||
|
||||
for provider in config["ALL_CLEANING_OVERALL"]["PROVIDERS"].keys():
|
||||
if config["ALL_CLEANING_OVERALL"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
if provider == "STRAW":
|
||||
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +"_py.csv"))
|
||||
if config["ALL_CLEANING_OVERALL"]["CLEAN_STANDARDIZED"]:
|
||||
files_to_compute.extend(expand("data/processed/features/all_participants/z_all_sensor_features_cleaned_" + provider.lower() +"_py.csv"))
|
||||
for target in config["PARAMS_FOR_ANALYSIS"]["TARGET"]["ALL_LABELS"]:
|
||||
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +"_py_(" + target + ").csv"))
|
||||
else:
|
||||
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +"_R.csv"))
|
||||
if config["ALL_CLEANING_OVERALL"]["CLEAN_STANDARDIZED"]:
|
||||
files_to_compute.extend(expand("data/processed/features/all_participants/z_all_sensor_features_cleaned_" + provider.lower() +"_R.csv"))
|
||||
|
||||
files_to_compute.extend(expand("data/processed/features/all_participants/all_sensor_features_cleaned_" + provider.lower() +"_R.csv"))
|
||||
|
||||
# Baseline features
|
||||
if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
|
||||
|
@ -545,12 +430,9 @@ if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
|
|||
|
||||
# Targets (labels)
|
||||
if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
|
||||
# files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
|
||||
# files_to_compute.extend(expand("data/processed/models/population_model/input.csv"))
|
||||
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/z_input.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/models/population_model/z_input.csv"))
|
||||
|
||||
#files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
|
||||
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
|
||||
for target in config["PARAMS_FOR_ANALYSIS"]["TARGET"]["ALL_LABELS"]:
|
||||
files_to_compute.extend(expand("data/processed/models/population_model/input_" + target + ".csv"))
|
||||
|
||||
rule all:
|
||||
input:
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
from pprint import pprint
|
||||
import sklearn.metrics
|
||||
import autosklearn.regression
|
||||
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import yaml
|
||||
|
||||
from sklearn import linear_model, svm, kernel_ridge, gaussian_process
|
||||
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, train_test_split
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from sklearn.impute import SimpleImputer
|
||||
|
||||
# Exploratory script: fit an auto-sklearn regressor on the population-level
# model input for the PANAS negative affect target and print train/test R2.

# Fixed seed so the train/test partition, and therefore the reported scores,
# can be reproduced between runs.
RANDOM_SEED = 42

# Standardized data (per the original author's note).
model_input = pd.read_csv("data/processed/models/population_model/input_PANAS_negative_affect_mean.csv")

# Drop features that are missing for every row, then rows without a label.
model_input.dropna(axis=1, how="all", inplace=True)
model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)

# Columns treated as categorical: two fixed names plus every column whose
# name contains "mostcommonactivity" or "homelabel".
categorical_feature_colnames = ["gender", "startlanguage"]
categorical_feature_colnames += [col for col in model_input.columns if "mostcommonactivity" in col or "homelabel" in col]
# The all-NaN column drop above may have removed one of the fixed names;
# selecting a missing column would raise KeyError, so keep only present ones.
categorical_feature_colnames = [col for col in categorical_feature_colnames if col in model_input.columns]

categorical_features = model_input[categorical_feature_colnames].copy()
if not categorical_features.empty:
    # Impute missing categories with the per-column mode, then one-hot encode.
    # This runs only on a non-empty frame because mode().iloc[0] raises
    # IndexError when there is nothing to take the mode of.
    mode_categorical_features = categorical_features.mode().iloc[0]
    categorical_features = categorical_features.fillna(mode_categorical_features)
    categorical_features = categorical_features.apply(lambda col: col.astype("category"))
    categorical_features = pd.get_dummies(categorical_features)
numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
model_in = pd.concat([numerical_features, categorical_features], axis=1)

# Move the time-segment descriptors into the index so they are not used as
# predictors.
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
model_in.set_index(index_columns, inplace=True)

# NOTE(review): this is a plain random split, so rows from the same "pid" can
# land in both partitions -- confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    model_in.drop(["target", "pid"], axis=1),
    model_in["target"],
    test_size=0.30,
    random_state=RANDOM_SEED,
)

# Time limits are in seconds: two hours for the whole search, two minutes
# per candidate model.
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=7200,
    per_run_time_limit=120
)
automl.fit(X_train, y_train, dataset_name='straw')

print(automl.leaderboard())
pprint(automl.show_models(), indent=4)

train_predictions = automl.predict(X_train)
print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
test_predictions = automl.predict(X_test)
print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

# `sys` is already imported at the top of the script.
sys.exit()
|
93
config.yaml
93
config.yaml
|
@ -21,9 +21,12 @@ CREATE_PARTICIPANT_FILES:
|
|||
|
||||
# See https://www.rapids.science/latest/setup/configuration/#time-segments
|
||||
TIME_SEGMENTS: &time_segments
|
||||
TYPE: PERIODIC # FREQUENCY, PERIODIC, EVENT
|
||||
FILE: "data/external/timesegments_daily.csv"
|
||||
TYPE: EVENT # FREQUENCY, PERIODIC, EVENT
|
||||
FILE: "data/external/straw_events.csv"
|
||||
INCLUDE_PAST_PERIODIC_SEGMENTS: TRUE # Only relevant if TYPE=PERIODIC, see docs
|
||||
TAILORED_EVENTS: # Only relevant if TYPE=EVENT
|
||||
COMPUTE: True
|
||||
SEGMENTING_METHOD: "30_before" # 30_before, 90_before, stress_event
|
||||
|
||||
# See https://www.rapids.science/latest/setup/configuration/#timezone-of-your-study
|
||||
TIMEZONE:
|
||||
|
@ -70,7 +73,6 @@ PHONE_ACCELEROMETER:
|
|||
COMPUTE: False
|
||||
FEATURES: ["maxmagnitude", "minmagnitude", "avgmagnitude", "medianmagnitude", "stdmagnitude"]
|
||||
SRC_SCRIPT: src/features/phone_accelerometer/rapids/main.py
|
||||
|
||||
PANDA:
|
||||
COMPUTE: False
|
||||
VALID_SENSED_MINUTES: False
|
||||
|
@ -93,7 +95,6 @@ PHONE_ACTIVITY_RECOGNITION:
|
|||
STATIONARY: ["still", "tilting"]
|
||||
MOBILE: ["on_foot", "walking", "running", "on_bicycle"]
|
||||
VEHICLE: ["in_vehicle"]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_activity_recognition/rapids/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-applications-crashes/
|
||||
|
@ -134,7 +135,6 @@ PHONE_APPLICATIONS_FOREGROUND:
|
|||
APP_EPISODES: ["countepisode", "minduration", "maxduration", "meanduration", "sumduration"]
|
||||
IGNORE_EPISODES_SHORTER_THAN: 0 # in minutes, set to 0 to disable
|
||||
IGNORE_EPISODES_LONGER_THAN: 300 # in minutes, set to 0 to disable
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_applications_foreground/rapids/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-applications-notifications/
|
||||
|
@ -155,7 +155,6 @@ PHONE_BATTERY:
|
|||
RAPIDS:
|
||||
COMPUTE: True
|
||||
FEATURES: ["countdischarge", "sumdurationdischarge", "countcharge", "sumdurationcharge", "avgconsumptionrate", "maxconsumptionrate"]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_battery/rapids/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-bluetooth/
|
||||
|
@ -163,9 +162,8 @@ PHONE_BLUETOOTH:
|
|||
CONTAINER: bluetooth
|
||||
PROVIDERS:
|
||||
RAPIDS:
|
||||
COMPUTE: True
|
||||
COMPUTE: False
|
||||
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_bluetooth/rapids/main.R
|
||||
|
||||
DORYAB:
|
||||
|
@ -183,7 +181,6 @@ PHONE_BLUETOOTH:
|
|||
DEVICES: ["countscans", "uniquedevices", "meanscans", "stdscans"]
|
||||
SCANS_MOST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
|
||||
SCANS_LEAST_FREQUENT_DEVICE: ["withinsegments", "acrosssegments", "acrossdataset"]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_bluetooth/doryab/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-calls/
|
||||
|
@ -198,7 +195,6 @@ PHONE_CALLS:
|
|||
missed: [count, distinctcontacts, timefirstcall, timelastcall, countmostfrequentcontact]
|
||||
incoming: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
|
||||
outgoing: [count, distinctcontacts, meanduration, sumduration, minduration, maxduration, stdduration, modeduration, entropyduration, timefirstcall, timelastcall, countmostfrequentcontact]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_calls/rapids/main.R
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-conversation/
|
||||
|
@ -238,7 +234,6 @@ PHONE_DATA_YIELD:
|
|||
COMPUTE: True
|
||||
FEATURES: [ratiovalidyieldedminutes, ratiovalidyieldedhours]
|
||||
MINUTE_RATIO_THRESHOLD_FOR_VALID_YIELDED_HOURS: 0.5 # 0 to 1, minimum ratio of valid minutes in an hour for that hour to be considered valid.
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_data_yield/rapids/main.R
|
||||
|
||||
PHONE_ESM:
|
||||
|
@ -246,9 +241,9 @@ PHONE_ESM:
|
|||
PROVIDERS:
|
||||
STRAW:
|
||||
COMPUTE: True
|
||||
SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support"]
|
||||
SCALES: ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
|
||||
"appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
|
||||
FEATURES: [mean]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_esm/straw/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-keyboard/
|
||||
|
@ -267,7 +262,6 @@ PHONE_LIGHT:
|
|||
RAPIDS:
|
||||
COMPUTE: True
|
||||
FEATURES: ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_light/rapids/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-locations/
|
||||
|
@ -292,7 +286,6 @@ PHONE_LOCATIONS:
|
|||
MINIMUM_DAYS_TO_DETECT_HOME_CHANGES: 3
|
||||
CLUSTERING_ALGORITHM: DBSCAN # DBSCAN, OPTICS
|
||||
RADIUS_FOR_HOME: 100
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_locations/doryab/main.py
|
||||
|
||||
BARNETT:
|
||||
|
@ -300,7 +293,6 @@ PHONE_LOCATIONS:
|
|||
FEATURES: ["hometime","disttravelled","rog","maxdiam","maxhomedist","siglocsvisited","avgflightlen","stdflightlen","avgflightdur","stdflightdur","probpause","siglocentropy","circdnrtn","wkenddayrtn"]
|
||||
IF_MULTIPLE_TIMEZONES: USE_MOST_COMMON
|
||||
MINUTES_DATA_USED: False # Use this for quality control purposes: how many minutes of data (location coordinates grouped by minute) were used to compute features
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_locations/barnett/main.R
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-log/
|
||||
|
@ -320,7 +312,6 @@ PHONE_MESSAGES:
|
|||
FEATURES:
|
||||
received: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
|
||||
sent: [count, distinctcontacts, timefirstmessage, timelastmessage, countmostfrequentcontact]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_messages/rapids/main.R
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-screen/
|
||||
|
@ -334,7 +325,6 @@ PHONE_SCREEN:
|
|||
IGNORE_EPISODES_LONGER_THAN: 360 # in minutes, set to 0 to disable
|
||||
FEATURES: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration", "firstuseafter"] # "episodepersensedminutes" needs to be added later
|
||||
EPISODE_TYPES: ["unlock"]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_screen/rapids/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-wifi-connected/
|
||||
|
@ -353,7 +343,6 @@ PHONE_WIFI_VISIBLE:
|
|||
RAPIDS:
|
||||
COMPUTE: True
|
||||
FEATURES: ["countscans", "uniquedevices", "countscansmostuniquedevice"]
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/phone_wifi_visible/rapids/main.R
|
||||
|
||||
|
||||
|
@ -455,7 +444,6 @@ FITBIT_SLEEP_INTRADAY:
|
|||
UNIFIED: [awake, asleep]
|
||||
SLEEP_TYPES: [main, nap, all]
|
||||
SRC_SCRIPT: src/features/fitbit_sleep_intraday/rapids/main.py
|
||||
|
||||
PRICE:
|
||||
COMPUTE: False
|
||||
FEATURES: [avgduration, avgratioduration, avgstarttimeofepisodemain, avgendtimeofepisodemain, avgmidpointofepisodemain, stdstarttimeofepisodemain, stdendtimeofepisodemain, stdmidpointofepisodemain, socialjetlag, rmssdmeanstarttimeofepisodemain, rmssdmeanendtimeofepisodemain, rmssdmeanmidpointofepisodemain, rmssdmedianstarttimeofepisodemain, rmssdmedianendtimeofepisodemain, rmssdmedianmidpointofepisodemain]
|
||||
|
@ -528,7 +516,6 @@ EMPATICA_ACCELEROMETER:
|
|||
COMPUTE: True
|
||||
WINDOW_LENGTH: 15 # specify window length in seconds
|
||||
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/empatica_accelerometer/cr/main.py
|
||||
|
||||
|
||||
|
@ -557,7 +544,6 @@ EMPATICA_TEMPERATURE:
|
|||
COMPUTE: True
|
||||
WINDOW_LENGTH: 300 # specify window length in seconds
|
||||
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/empatica_temperature/cr/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/empatica-electrodermal-activity/
|
||||
|
@ -579,7 +565,6 @@ EMPATICA_ELECTRODERMAL_ACTIVITY:
|
|||
COMPUTE: True
|
||||
WINDOW_LENGTH: 60 # specify window length in seconds
|
||||
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', count_windows, eda_num_peaks_non_zero]
|
||||
STANDARDIZE_FEATURES: True
|
||||
IMPUTE_NANS: True
|
||||
SRC_SCRIPT: src/features/empatica_electrodermal_activity/cr/main.py
|
||||
|
||||
|
@ -599,7 +584,6 @@ EMPATICA_BLOOD_VOLUME_PULSE:
|
|||
COMPUTE: True
|
||||
WINDOW_LENGTH: 300 # specify window length in seconds
|
||||
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/empatica_blood_volume_pulse/cr/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/empatica-inter-beat-interval/
|
||||
|
@ -619,7 +603,6 @@ EMPATICA_INTER_BEAT_INTERVAL:
|
|||
COMPUTE: True
|
||||
WINDOW_LENGTH: 300 # specify window length in seconds
|
||||
SECOND_ORDER_FEATURES: ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows', 'hrv_num_windows_non_nan']
|
||||
STANDARDIZE_FEATURES: True
|
||||
SRC_SCRIPT: src/features/empatica_inter_beat_interval/cr/main.py
|
||||
|
||||
# See https://www.rapids.science/latest/features/empatica-tags/
|
||||
|
@ -667,10 +650,9 @@ HEATMAP_FEATURE_CORRELATION_MATRIX:
|
|||
########################################################################################################################
|
||||
|
||||
ALL_CLEANING_INDIVIDUAL:
|
||||
CLEAN_STANDARDIZED: True
|
||||
PROVIDERS:
|
||||
RAPIDS:
|
||||
COMPUTE: True
|
||||
COMPUTE: False
|
||||
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||
COMPUTE: False
|
||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||
|
@ -684,28 +666,25 @@ ALL_CLEANING_INDIVIDUAL:
|
|||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||
CORR_THRESHOLD: 0.95
|
||||
SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
|
||||
STRAW: # currently the same as RAPIDS provider with a change in selecting the imputation type
|
||||
STRAW:
|
||||
COMPUTE: True
|
||||
IMPUTE_PHONE_SELECTED_EVENT_FEATURES:
|
||||
COMPUTE: False
|
||||
TYPE: median # options: zero, mean, median or k-nearest
|
||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||
COLS_NAN_THRESHOLD: 1 # set to 1 to disable
|
||||
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
||||
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||
ROWS_NAN_THRESHOLD: 0.33 # set to 1 to disable
|
||||
COLS_NAN_THRESHOLD: 0.9 # set to 1 to remove only columns that contain nothing but NaN values (100% NaN)
|
||||
COLS_VAR_THRESHOLD: True
|
||||
ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
|
||||
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
||||
DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
|
||||
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||
COMPUTE: True
|
||||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||
CORR_THRESHOLD: 0.95
|
||||
STANDARDIZATION: True
|
||||
SRC_SCRIPT: src/features/all_cleaning_individual/straw/main.py
|
||||
|
||||
ALL_CLEANING_OVERALL:
|
||||
CLEAN_STANDARDIZED: True
|
||||
PROVIDERS:
|
||||
RAPIDS:
|
||||
COMPUTE: True
|
||||
COMPUTE: False
|
||||
IMPUTE_SELECTED_EVENT_FEATURES:
|
||||
COMPUTE: False
|
||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||
|
@ -719,40 +698,22 @@ ALL_CLEANING_OVERALL:
|
|||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||
CORR_THRESHOLD: 0.95
|
||||
SRC_SCRIPT: src/features/all_cleaning_overall/rapids/main.R
|
||||
STRAW: # currently the same as RAPIDS provider with a change in selecting the imputation type
|
||||
STRAW:
|
||||
COMPUTE: True
|
||||
IMPUTE_PHONE_SELECTED_EVENT_FEATURES:
|
||||
COMPUTE: False
|
||||
TYPE: median # options: zero, mean, median or k-nearest
|
||||
MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33
|
||||
COLS_NAN_THRESHOLD: 1 # set to 1 to disable
|
||||
PHONE_DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_MINUTES # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
||||
PHONE_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||
EMPATICA_DATA_YIELD_RATIO_THRESHOLD: 0.5 # set to 0 to disable
|
||||
ROWS_NAN_THRESHOLD: 0.33 # set to 1 to disable
|
||||
COLS_NAN_THRESHOLD: 0.8 # set to 1 to remove only columns that contain nothing but NaN values (100% NaN)
|
||||
COLS_VAR_THRESHOLD: True
|
||||
ROWS_NAN_THRESHOLD: 1 # set to 1 to disable
|
||||
DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
|
||||
DATA_YIELD_RATIO_THRESHOLD: 0 # set to 0 to disable
|
||||
DROP_HIGHLY_CORRELATED_FEATURES:
|
||||
COMPUTE: True
|
||||
MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
|
||||
CORR_THRESHOLD: 0.95
|
||||
STANDARDIZATION: False
|
||||
SRC_SCRIPT: src/features/all_cleaning_overall/straw/main.py
|
||||
|
||||
|
||||
########################################################################################################################
|
||||
# Z-score standardization #
|
||||
########################################################################################################################
|
||||
|
||||
STANDARDIZATION: # Standardization for both providers is executed if only one of two providers is marked COMPUTE: TRUE
|
||||
MERGE_ALL: True # Creates the joint standardized file for each participant and all participants. Similar to merge_sensor_features_for_all_participants rule
|
||||
PROVIDERS:
|
||||
CR:
|
||||
COMPUTE: True
|
||||
SRC_SCRIPT: src/features/standardization/main.py
|
||||
OTHER:
|
||||
COMPUTE: True
|
||||
LIST: [RAPIDS, DORYAB, BARNETT, STRAW]
|
||||
SRC_SCRIPT: src/features/standardization/main.py
|
||||
|
||||
|
||||
########################################################################################################################
|
||||
# Baseline #
|
||||
########################################################################################################################
|
||||
|
@ -771,4 +732,8 @@ PARAMS_FOR_ANALYSIS:
|
|||
|
||||
TARGET:
|
||||
COMPUTE: True
|
||||
LABEL: PANAS_negative_affect_mean
|
||||
LABEL: appraisal_stressfulness_event_mean
|
||||
ALL_LABELS: [PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
|
||||
JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean]
|
||||
# PANAS_positive_affect_mean, PANAS_negative_affect_mean, JCQ_job_demand_mean, JCQ_job_control_mean, JCQ_supervisor_support_mean,
|
||||
# JCQ_coworker_support_mean, appraisal_stressfulness_period_mean, appraisal_stressfulness_event_mean, appraisal_threat_mean, appraisal_challenge_mean
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
PHONE:
|
||||
DEVICE_IDS: [a748ee1a-1d0b-4ae9-9074-279a2b6ba524] # the participant's AWARE device id
|
||||
PLATFORMS: [android] # or ios
|
||||
LABEL: MyTestP01 # any string
|
||||
START_DATE: 2020-01-01 # this can also be empty
|
||||
END_DATE: 2021-01-01 # this can also be empty
|
||||
DEVICE_IDS: [4b62a655-cbf0-4ac0-a448-06726f45b56a]
|
||||
PLATFORMS: [android]
|
||||
LABEL: uploader_53573
|
||||
START_DATE: 2021-05-21 09:21:24
|
||||
END_DATE: 2021-07-12 17:32:07
|
||||
EMPATICA:
|
||||
DEVICE_IDS: [empatica1]
|
||||
LABEL: test01
|
||||
START_DATE:
|
||||
END_DATE:
|
||||
DEVICE_IDS: [uploader_53573]
|
||||
LABEL: uploader_53573
|
||||
START_DATE: 2021-05-21 09:21:24
|
||||
END_DATE: 2021-07-12 17:32:07
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
label,start_time,length,repeats_on,repeats_value
|
||||
daily,04:00:00,23H 59M 59S,every_day,0
|
||||
working_day,04:00:00,18H 00M 00S,every_day,0
|
||||
|
|
|
|
@ -86,8 +86,6 @@ dependencies:
|
|||
- readline=8.0
|
||||
- requests=2.25.0
|
||||
- retrying=1.3.3
|
||||
- scikit-learn=0.23.2
|
||||
- scipy=1.5.2
|
||||
- setuptools=51.0.0
|
||||
- six=1.15.0
|
||||
- smmap=3.0.4
|
||||
|
@ -107,34 +105,61 @@ dependencies:
|
|||
- zlib=1.2.11
|
||||
- pip:
|
||||
- amply==0.1.4
|
||||
- auto-sklearn==0.14.7
|
||||
- bidict==0.22.0
|
||||
- biosppy==0.8.0
|
||||
- build==0.8.0
|
||||
- cached-property==1.5.2
|
||||
- cloudpickle==2.2.0
|
||||
- configargparse==0.15.1
|
||||
- configspace==0.4.21
|
||||
- cr-features==0.2.1
|
||||
- cycler==0.11.0
|
||||
- cython==0.29.32
|
||||
- dask==2022.2.0
|
||||
- decorator==4.4.2
|
||||
- distributed==2022.2.0
|
||||
- distro==1.7.0
|
||||
- emcee==3.1.2
|
||||
- fonttools==4.33.2
|
||||
- fsspec==2022.8.2
|
||||
- h5py==3.6.0
|
||||
- heapdict==1.0.1
|
||||
- hmmlearn==0.2.7
|
||||
- ipython-genutils==0.2.0
|
||||
- jupyter-core==4.6.3
|
||||
- kiwisolver==1.4.2
|
||||
- liac-arff==2.5.0
|
||||
- locket==1.0.0
|
||||
- matplotlib==3.5.1
|
||||
- msgpack==1.0.4
|
||||
- nbformat==5.0.7
|
||||
- opencv-python==4.5.5.64
|
||||
- packaging==21.3
|
||||
- partd==1.3.0
|
||||
- peakutils==1.3.3
|
||||
- pep517==0.13.0
|
||||
- pillow==9.1.0
|
||||
- pulp==2.4
|
||||
- pynisher==0.6.4
|
||||
- pyparsing==2.4.7
|
||||
- pyrfr==0.8.3
|
||||
- pyrsistent==0.15.5
|
||||
- pywavelets==1.3.0
|
||||
- ratelimiter==1.2.0.post0
|
||||
- scikit-learn==0.24.2
|
||||
- scipy==1.7.3
|
||||
- seaborn==0.11.2
|
||||
- shortuuid==1.0.8
|
||||
- smac==1.2
|
||||
- snakemake==5.30.2
|
||||
- sortedcontainers==2.4.0
|
||||
- tblib==1.7.0
|
||||
- tomli==2.0.1
|
||||
- toolz==0.12.0
|
||||
- toposort==1.5
|
||||
- tornado==6.2
|
||||
- traitlets==4.3.3
|
||||
- typing-extensions==4.2.0
|
||||
- zict==2.2.0
|
||||
prefix: /opt/conda/envs/rapids
|
||||
|
|
|
@ -40,15 +40,6 @@ def find_features_files(wildcards):
|
|||
feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=get_script_language(provider["SRC_SCRIPT"]), provider_key=provider_key.lower()))
|
||||
return(feature_files)
|
||||
|
||||
def find_empaticas_standardized_features_files(wildcards):
|
||||
feature_files = []
|
||||
if "empatica" in wildcards.sensor_key:
|
||||
for provider_key, provider in config[(wildcards.sensor_key).upper()]["PROVIDERS"].items():
|
||||
if provider["COMPUTE"] and provider.get("WINDOWS", False) and provider["WINDOWS"]["COMPUTE"]:
|
||||
if "empatica" in wildcards.sensor_key:
|
||||
feature_files.extend(expand("data/interim/{{pid}}/{sensor_key}_features/z_{sensor_key}_{language}_{provider_key}.csv", sensor_key=wildcards.sensor_key.lower(), language=get_script_language(provider["SRC_SCRIPT"]), provider_key=provider_key.lower()))
|
||||
return(feature_files)
|
||||
|
||||
def find_joint_non_empatica_sensor_files(wildcards):
|
||||
joined_files = []
|
||||
for config_key in config.keys():
|
||||
|
@ -82,18 +73,6 @@ def input_merge_sensor_features_for_individual_participants(wildcards):
|
|||
break
|
||||
return feature_files
|
||||
|
||||
def input_merge_standardized_sensor_features_for_individual_participants(wildcards):
|
||||
feature_files = []
|
||||
for config_key in config.keys():
|
||||
if config_key.startswith(("PHONE", "FITBIT", "EMPATICA")) and "PROVIDERS" in config[config_key] and isinstance(config[config_key]["PROVIDERS"], dict):
|
||||
for provider_key, provider in config[config_key]["PROVIDERS"].items():
|
||||
if "COMPUTE" in provider.keys() and provider["COMPUTE"] and ("STANDARDIZE_FEATURES" in provider.keys() and provider["STANDARDIZE_FEATURES"] or
|
||||
"WINDOWS" in provider.keys() and "STANDARDIZE_FEATURES" in provider["WINDOWS"].keys() and provider["WINDOWS"]["STANDARDIZE_FEATURES"]):
|
||||
feature_files.append("data/processed/features/{pid}/z_" + config_key.lower() + ".csv")
|
||||
break
|
||||
|
||||
return feature_files
|
||||
|
||||
def get_phone_sensor_names():
|
||||
phone_sensor_names = []
|
||||
for config_key in config.keys():
|
||||
|
|
|
@ -796,20 +796,6 @@ rule empatica_accelerometer_python_features:
|
|||
script:
|
||||
"../src/features/entry.py"
|
||||
|
||||
rule empatica_accelerometer_python_features_standardization:
|
||||
input:
|
||||
windows_features_data = "data/interim/{pid}/empatica_accelerometer_features/empatica_accelerometer_python_{provider_key}_windows.csv"
|
||||
params:
|
||||
provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
|
||||
provider_key = "{provider_key}",
|
||||
sensor_key = "empatica_accelerometer",
|
||||
provider_main = config["EMPATICA_ACCELEROMETER"]["PROVIDERS"]["CR"]
|
||||
output:
|
||||
"data/interim/{pid}/empatica_accelerometer_features/z_empatica_accelerometer_python_{provider_key}.csv",
|
||||
"data/interim/{pid}/empatica_accelerometer_features/z_empatica_accelerometer_python_{provider_key}_windows.csv"
|
||||
script:
|
||||
"../src/features/standardization/main.py"
|
||||
|
||||
rule empatica_accelerometer_r_features:
|
||||
input:
|
||||
sensor_data = "data/raw/{pid}/empatica_accelerometer_with_datetime.csv",
|
||||
|
@ -864,20 +850,6 @@ rule empatica_temperature_python_features:
|
|||
script:
|
||||
"../src/features/entry.py"
|
||||
|
||||
rule empatica_temperature_python_features_standardization:
|
||||
input:
|
||||
windows_features_data = "data/interim/{pid}/empatica_temperature_features/empatica_temperature_python_{provider_key}_windows.csv"
|
||||
params:
|
||||
provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
|
||||
provider_key = "{provider_key}",
|
||||
sensor_key = "empatica_temperature",
|
||||
provider_main = config["EMPATICA_TEMPERATURE"]["PROVIDERS"]["CR"]
|
||||
output:
|
||||
"data/interim/{pid}/empatica_temperature_features/z_empatica_temperature_python_{provider_key}.csv",
|
||||
"data/interim/{pid}/empatica_temperature_features/z_empatica_temperature_python_{provider_key}_windows.csv"
|
||||
script:
|
||||
"../src/features/standardization/main.py"
|
||||
|
||||
rule empatica_temperature_r_features:
|
||||
input:
|
||||
sensor_data = "data/raw/{pid}/empatica_temperature_with_datetime.csv",
|
||||
|
@ -905,20 +877,6 @@ rule empatica_electrodermal_activity_python_features:
|
|||
script:
|
||||
"../src/features/entry.py"
|
||||
|
||||
rule empatica_electrodermal_activity_python_features_standardization:
|
||||
input:
|
||||
windows_features_data = "data/interim/{pid}/empatica_electrodermal_activity_features/empatica_electrodermal_activity_python_{provider_key}_windows.csv"
|
||||
params:
|
||||
provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
|
||||
provider_key = "{provider_key}",
|
||||
sensor_key = "empatica_electrodermal_activity",
|
||||
provider_main = config["EMPATICA_ELECTRODERMAL_ACTIVITY"]["PROVIDERS"]["CR"]
|
||||
output:
|
||||
"data/interim/{pid}/empatica_electrodermal_activity_features/z_empatica_electrodermal_activity_python_{provider_key}.csv",
|
||||
"data/interim/{pid}/empatica_electrodermal_activity_features/z_empatica_electrodermal_activity_python_{provider_key}_windows.csv"
|
||||
script:
|
||||
"../src/features/standardization/main.py"
|
||||
|
||||
rule empatica_electrodermal_activity_r_features:
|
||||
input:
|
||||
sensor_data = "data/raw/{pid}/empatica_electrodermal_activity_with_datetime.csv",
|
||||
|
@ -946,20 +904,6 @@ rule empatica_blood_volume_pulse_python_features:
|
|||
script:
|
||||
"../src/features/entry.py"
|
||||
|
||||
rule empatica_blood_volume_pulse_python_cr_features_standardization:
|
||||
input:
|
||||
windows_features_data = "data/interim/{pid}/empatica_blood_volume_pulse_features/empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
|
||||
params:
|
||||
provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
|
||||
provider_key = "{provider_key}",
|
||||
sensor_key = "empatica_blood_volume_pulse",
|
||||
provider_main = config["EMPATICA_BLOOD_VOLUME_PULSE"]["PROVIDERS"]["CR"]
|
||||
output:
|
||||
"data/interim/{pid}/empatica_blood_volume_pulse_features/z_empatica_blood_volume_pulse_python_{provider_key}.csv",
|
||||
"data/interim/{pid}/empatica_blood_volume_pulse_features/z_empatica_blood_volume_pulse_python_{provider_key}_windows.csv"
|
||||
script:
|
||||
"../src/features/standardization/main.py"
|
||||
|
||||
rule empatica_blood_volume_pulse_r_features:
|
||||
input:
|
||||
sensor_data = "data/raw/{pid}/empatica_blood_volume_pulse_with_datetime.csv",
|
||||
|
@ -987,20 +931,6 @@ rule empatica_inter_beat_interval_python_features:
|
|||
script:
|
||||
"../src/features/entry.py"
|
||||
|
||||
rule empatica_inter_beat_interval_python_features_standardization:
|
||||
input:
|
||||
windows_features_data = "data/interim/{pid}/empatica_inter_beat_interval_features/empatica_inter_beat_interval_python_{provider_key}_windows.csv"
|
||||
params:
|
||||
provider = config["STANDARDIZATION"]["PROVIDERS"]["CR"],
|
||||
provider_key = "{provider_key}",
|
||||
sensor_key = "empatica_inter_beat_interval",
|
||||
provider_main = config["EMPATICA_INTER_BEAT_INTERVAL"]["PROVIDERS"]["CR"]
|
||||
output:
|
||||
"data/interim/{pid}/empatica_inter_beat_interval_features/z_empatica_inter_beat_interval_python_{provider_key}.csv",
|
||||
"data/interim/{pid}/empatica_inter_beat_interval_features/z_empatica_inter_beat_interval_python_{provider_key}_windows.csv"
|
||||
script:
|
||||
"../src/features/standardization/main.py"
|
||||
|
||||
rule empatica_inter_beat_interval_r_features:
|
||||
input:
|
||||
sensor_data = "data/raw/{pid}/empatica_inter_beat_interval_with_datetime.csv",
|
||||
|
@ -1048,38 +978,6 @@ rule merge_sensor_features_for_individual_participants:
|
|||
script:
|
||||
"../src/features/utils/merge_sensor_features_for_individual_participants.R"
|
||||
|
||||
rule join_standardized_features_from_empatica:
|
||||
input:
|
||||
sensor_features = find_empaticas_standardized_features_files
|
||||
wildcard_constraints:
|
||||
sensor_key = '(empatica).*'
|
||||
output:
|
||||
"data/processed/features/{pid}/z_{sensor_key}.csv"
|
||||
script:
|
||||
"../src/features/utils/join_features_from_providers.R"
|
||||
|
||||
rule standardize_features_from_providers_no_empatica:
|
||||
input:
|
||||
sensor_features = find_joint_non_empatica_sensor_files
|
||||
wildcard_constraints:
|
||||
sensor_key = '(phone|fitbit).*'
|
||||
params:
|
||||
provider = config["STANDARDIZATION"]["PROVIDERS"]["OTHER"],
|
||||
provider_key = "OTHER",
|
||||
sensor_key = "{sensor_key}"
|
||||
output:
|
||||
"data/processed/features/{pid}/z_{sensor_key}.csv"
|
||||
script:
|
||||
"../src/features/standardization/main.py"
|
||||
|
||||
rule merge_standardized_sensor_features_for_individual_participants:
|
||||
input:
|
||||
feature_files = input_merge_standardized_sensor_features_for_individual_participants
|
||||
output:
|
||||
"data/processed/features/{pid}/z_all_sensor_features.csv"
|
||||
script:
|
||||
"../src/features/utils/merge_sensor_features_for_individual_participants.R"
|
||||
|
||||
rule merge_sensor_features_for_all_participants:
|
||||
input:
|
||||
feature_files = expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])
|
||||
|
@ -1088,14 +986,6 @@ rule merge_sensor_features_for_all_participants:
|
|||
script:
|
||||
"../src/features/utils/merge_sensor_features_for_all_participants.R"
|
||||
|
||||
rule merge_standardized_sensor_features_for_all_participants:
|
||||
input:
|
||||
feature_files = expand("data/processed/features/{pid}/z_all_sensor_features.csv", pid=config["PIDS"])
|
||||
output:
|
||||
"data/processed/features/all_participants/z_all_sensor_features.csv"
|
||||
script:
|
||||
"../src/features/utils/merge_standardized_sensor_features_for_all_participants.R"
|
||||
|
||||
rule clean_sensor_features_for_individual_participants:
|
||||
input:
|
||||
sensor_data = rules.merge_sensor_features_for_individual_participants.output
|
||||
|
@ -1107,7 +997,7 @@ rule clean_sensor_features_for_individual_participants:
|
|||
script_extension = "{script_extension}",
|
||||
sensor_key = "all_cleaning_individual"
|
||||
output:
|
||||
"data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}_{script_extension}.csv" # this will cause problems later on (how to look up the files + standardization, etc.)
|
||||
"data/processed/features/{pid}/all_sensor_features_cleaned_{provider_key}_{script_extension}.csv"
|
||||
script:
|
||||
"../src/features/entry.{params.script_extension}"
|
||||
|
||||
|
@ -1118,37 +1008,9 @@ rule clean_sensor_features_for_all_participants:
|
|||
provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||
provider_key = "{provider_key}",
|
||||
script_extension = "{script_extension}",
|
||||
sensor_key = "all_cleaning_overall"
|
||||
sensor_key = "all_cleaning_overall",
|
||||
target = "{target}"
|
||||
output:
|
||||
"data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}_{script_extension}.csv"
|
||||
"data/processed/features/all_participants/all_sensor_features_cleaned_{provider_key}_{script_extension}_({target}).csv"
|
||||
script:
|
||||
"../src/features/entry.{params.script_extension}"
|
||||
|
||||
rule clean_standardized_sensor_features_for_individual_participants:
|
||||
input:
|
||||
sensor_data = rules.merge_standardized_sensor_features_for_individual_participants.output
|
||||
wildcard_constraints:
|
||||
pid = "("+"|".join(config["PIDS"])+")"
|
||||
params:
|
||||
provider = lambda wildcards: config["ALL_CLEANING_INDIVIDUAL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||
provider_key = "{provider_key}",
|
||||
script_extension = "{script_extension}",
|
||||
sensor_key = "all_cleaning_individual"
|
||||
output:
|
||||
"data/processed/features/{pid}/z_all_sensor_features_cleaned_{provider_key}_{script_extension}.csv"
|
||||
script:
|
||||
"../src/features/entry.{params.script_extension}"
|
||||
|
||||
rule clean_standardized_sensor_features_for_all_participants:
|
||||
input:
|
||||
sensor_data = rules.merge_standardized_sensor_features_for_all_participants.output
|
||||
params:
|
||||
provider = lambda wildcards: config["ALL_CLEANING_OVERALL"]["PROVIDERS"][wildcards.provider_key.upper()],
|
||||
provider_key = "{provider_key}",
|
||||
script_extension = "{script_extension}",
|
||||
sensor_key = "all_cleaning_overall"
|
||||
output:
|
||||
"data/processed/features/all_participants/z_all_sensor_features_cleaned_{provider_key}_{script_extension}.csv"
|
||||
script:
|
||||
"../src/features/entry.{params.script_extension}"
|
||||
|
||||
|
|
|
@ -30,43 +30,23 @@ rule baseline_features:
|
|||
|
||||
rule select_target:
|
||||
input:
|
||||
cleaned_sensor_features = "data/processed/features/{pid}/z_all_sensor_features_cleaned_straw_py.csv"
|
||||
cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_straw_py.csv"
|
||||
params:
|
||||
target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
|
||||
output:
|
||||
"data/processed/models/individual_model/{pid}/z_input.csv"
|
||||
"data/processed/models/individual_model/{pid}/input.csv"
|
||||
script:
|
||||
"../src/models/select_targets.py"
|
||||
|
||||
rule merge_features_and_targets_for_population_model:
|
||||
input:
|
||||
cleaned_sensor_features = "data/processed/features/all_participants/z_all_sensor_features_cleaned_straw_py.csv",
|
||||
cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_straw_py_({target}).csv",
|
||||
demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]),
|
||||
params:
|
||||
target_variable=config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
|
||||
target_variable="{target}"
|
||||
output:
|
||||
"data/processed/models/population_model/z_input.csv"
|
||||
"data/processed/models/population_model/input_{target}.csv"
|
||||
script:
|
||||
"../src/models/merge_features_and_targets_for_population_model.py"
|
||||
|
||||
# rule select_target:
|
||||
# input:
|
||||
# cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_straw_py.csv"
|
||||
# params:
|
||||
# target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
|
||||
# output:
|
||||
# "data/processed/models/individual_model/{pid}/input.csv"
|
||||
# script:
|
||||
# "../src/models/select_targets.py"
|
||||
|
||||
# rule merge_features_and_targets_for_population_model:
|
||||
# input:
|
||||
# cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_straw_py.csv",
|
||||
# demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]),
|
||||
# params:
|
||||
# target_variable=config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
|
||||
# output:
|
||||
# "data/processed/models/population_model/input.csv"
|
||||
# script:
|
||||
# "../src/models/merge_features_and_targets_for_population_model.py"
|
||||
|
||||
|
|
|
@ -249,3 +249,29 @@ rule empatica_readable_datetime:
|
|||
"data/raw/{pid}/empatica_{sensor}_with_datetime.csv"
|
||||
script:
|
||||
"../src/data/datetime/readable_datetime.R"
|
||||
|
||||
|
||||
rule extract_event_information_from_esm:
|
||||
input:
|
||||
esm_raw_input = "data/raw/{pid}/phone_esm_raw.csv",
|
||||
pid_file = "data/external/participant_files/{pid}.yaml"
|
||||
params:
|
||||
stage = "extract",
|
||||
pid = "{pid}"
|
||||
output:
|
||||
"data/raw/ers/{pid}_ers.csv",
|
||||
"data/raw/ers/{pid}_stress_event_targets.csv"
|
||||
script:
|
||||
"../src/features/phone_esm/straw/process_user_event_related_segments.py"
|
||||
|
||||
rule merge_event_related_segments_files:
|
||||
input:
|
||||
ers_files = expand("data/raw/ers/{pid}_ers.csv", pid=config["PIDS"]),
|
||||
se_files = expand("data/raw/ers/{pid}_stress_event_targets.csv", pid=config["PIDS"])
|
||||
params:
|
||||
stage = "merge"
|
||||
output:
|
||||
"data/external/straw_events.csv",
|
||||
"data/external/stress_event_targets.csv"
|
||||
script:
|
||||
"../src/features/phone_esm/straw/process_user_event_related_segments.py"
|
|
@ -5,13 +5,16 @@ options(scipen=999)
|
|||
|
||||
assign_rows_to_segments <- function(data, segments){
|
||||
# This function is used by all segment types; we use data.table because it is fast
|
||||
|
||||
data <- data.table::as.data.table(data)
|
||||
data[, assigned_segments := ""]
|
||||
for(i in seq_len(nrow(segments))) {
|
||||
segment <- segments[i,]
|
||||
|
||||
data[segment$segment_start_ts<= timestamp & segment$segment_end_ts >= timestamp,
|
||||
assigned_segments := stringi::stri_c(assigned_segments, segment$segment_id, sep = "|")]
|
||||
}
|
||||
|
||||
data[,assigned_segments:=substring(assigned_segments, 2)]
|
||||
data
|
||||
}
|
||||
|
|
|
@ -1,88 +1,174 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import math, sys
|
||||
import math, sys, random
|
||||
import yaml
|
||||
|
||||
from sklearn.impute import KNNImputer
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
sys.path.append('/rapids/')
|
||||
from src.features import empatica_data_yield as edy
|
||||
|
||||
pd.set_option('display.max_columns', 20)
|
||||
|
||||
def straw_cleaning(sensor_data_files, provider):
    """Clean the merged sensor feature table for the STRAW analysis.

    The steps run in this order: (1) keep rows that have the configured
    target, (2) drop rows with low phone / Empatica data yield or too many
    NaNs, (3) drop columns with too many NaNs, (4) contextual imputation with
    fixed values, (5) optional standardization, (6) KNN imputation,
    (7) optional zero-variance column removal, (8) optional removal of highly
    correlated columns, (9) final check that no NaNs remain.

    Args:
        sensor_data_files: Mapping whose ``"sensor_data"`` entry is a list;
            its first item is the path of the features CSV.
        provider: Provider configuration. Keys read here:
            ``PHONE_DATA_YIELD_FEATURE``, ``PHONE_DATA_YIELD_RATIO_THRESHOLD``,
            ``EMPATICA_DATA_YIELD_RATIO_THRESHOLD``, ``ROWS_NAN_THRESHOLD``,
            ``COLS_NAN_THRESHOLD``, ``STANDARDIZATION``, ``COLS_VAR_THRESHOLD``
            and ``DROP_HIGHLY_CORRELATED_FEATURES``.

    Returns:
        The cleaned DataFrame. The frame is returned early, uncleaned, when
        the configured target column is absent, and early (empty) when no row
        survives the data-yield filters.

    Raises:
        KeyError: If a data-yield column needed by an enabled threshold is
            missing.
        ValueError: If NaNs are still present after all steps.
    """
    features = pd.read_csv(sensor_data_files["sensor_data"][0])

    # config.yaml is the project's own configuration file; the loader is kept
    # as in the original code.
    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']

    # (1) FILTER OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
    if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
        target = config['PARAMS_FOR_ANALYSIS']['TARGET']['LABEL']  # get target label from config
        if 'phone_esm_straw_' + target in features:
            features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)
        else:
            return features

    # (2.1) QUALITY CHECK (DATA YIELD COLUMN): drop rows where E4 or phone data is low quality
    phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
    phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit

    features = edy.calculate_empatica_data_yield(features)

    phone_threshold = provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]
    empatica_threshold = provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]

    # Only the columns that an enabled threshold will actually read are required.
    # The original guard combined the two checks with `and` and formatted an
    # undefined name, so it could never produce this message.
    required_columns = []
    if phone_threshold:
        required_columns.append(phone_data_yield_column)
    if empatica_threshold:
        required_columns.append("empatica_data_yield")
    if any(column not in features.columns for column in required_columns):
        raise KeyError(f"RAPIDS provider needs to clean the selected event features based on {phone_data_yield_column} and empatica_data_yield columns. For phone data yield, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{phone_data_yield_unit}' in [FEATURES].")

    # Drop rows where phone data yield is less than the given threshold
    if phone_threshold:
        features = features[features[phone_data_yield_column] >= phone_threshold].reset_index(drop=True)

    # Drop rows where empatica data yield is less than the given threshold
    if empatica_threshold:
        features = features[features["empatica_data_yield"] >= empatica_threshold].reset_index(drop=True)

    if features.empty:
        return features

    # (2.2) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # minimal not-NaN values in row
    features = features.dropna(axis=0, thresh=min_count)  # thresh => at least this many not-NaNs

    # (3) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED
    # (strict "<" also drops all-NaN columns; use "<=" if those must be preserved)
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]  # Get target (esm) columns

    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]].copy()

    # Preserve esm cols if deleted (has to come after drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    # (4) CONTEXTUAL IMPUTATION

    # Impute selected phone features with a high number
    high_number_markers = ("timeoffirstuse", "timeoflastuse", "timefirstcall", "timelastcall",
                           "firstuseafter", "timefirstmessages", "timelastmessages")
    impute_w_hn = [col for col in features.columns if any(marker in col for marker in high_number_markers)]
    features[impute_w_hn] = features[impute_w_hn].fillna(1500)

    # Impute special cases (mostcommonactivity) and (homelabel) - nominal/ordinal values
    impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
    features[impute_w_sn] = features[impute_w_sn].fillna(4)

    impute_w_sn2 = [col for col in features.columns if "homelabel" in col]
    features[impute_w_sn2] = features[impute_w_sn2].fillna(1)

    # Special case - loglocationvariance. The original filled impute_w_sn2
    # again here, leaving these columns untouched.
    impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
    features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000)

    # Impute selected phone features with 0
    zero_prefixes = (
        'phone_applications_foreground_rapids_',
        'phone_battery_rapids_',
        'phone_bluetooth_rapids_',
        'phone_light_rapids_',
        'phone_calls_rapids_',
        'phone_messages_rapids_',
        'phone_screen_rapids_',
        'phone_wifi_visible',
    )
    impute_zero = [col for col in features if col.startswith(zero_prefixes)]
    zero_fill_cols = impute_zero + list(esm_cols.columns)
    features[zero_fill_cols] = features[zero_fill_cols].fillna(0)

    # (5) STANDARDIZATION
    if provider["STANDARDIZATION"]:
        features.loc[:, ~features.columns.isin(excluded_columns)] = StandardScaler().fit_transform(features.loc[:, ~features.columns.isin(excluded_columns)])

    # (6) IMPUTATION: IMPUTE DATA WITH KNN METHOD
    impute_cols = [col for col in features.columns if col not in excluded_columns]
    features.reset_index(drop=True, inplace=True)
    features[impute_cols] = impute(features[impute_cols], method="knn")

    # (7) REMOVE COLS WHERE VARIANCE IS 0
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]

    if provider["COLS_VAR_THRESHOLD"]:
        # numeric_only keeps the text columns out of std(); computed once
        column_std = features.std(numeric_only=True)
        features = features.drop(columns=column_std[column_std == 0].index)

    # (8) DROP HIGHLY CORRELATED FEATURES
    drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
    if drop_corr_features["COMPUTE"] and features.shape[0]:  # skip the correlation check when no rows are present

        numeric = features[features.select_dtypes(include=np.number).columns.tolist()]

        # Remove columns where NaN count threshold is passed
        valid_features = numeric.loc[:, numeric.isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * numeric.shape[0]]

        corr_matrix = valid_features.corr().abs()
        # Upper triangle only, so each pair is inspected once (np.bool no longer exists in NumPy)
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if (upper[column] > drop_corr_features["CORR_THRESHOLD"]).any()]

        features = features.drop(columns=to_drop)

    # Preserve esm cols if deleted (has to come after drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    # (9) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
    if features.isna().any().any():
        raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")

    return features
|
||||
|
||||
def impute(df, method='zero'):
    """Return a copy of ``df`` with NaNs filled by the chosen strategy.

    Args:
        df: DataFrame to impute.
        method: One of ``'zero'``, ``'high_number'`` (1500), ``'mean'``,
            ``'median'`` or ``'knn'`` (``'k-nearest'`` is accepted as an
            alias for the older spelling).

    Returns:
        The imputed DataFrame, with the same index and columns as ``df``.

    Raises:
        KeyError: If ``method`` is not a known strategy.
    """

    def k_nearest(frame):
        imputer = KNNImputer(n_neighbors=3)
        # Keep the caller's index so the result aligns on assignment
        return pd.DataFrame(imputer.fit_transform(frame), columns=frame.columns, index=frame.index)

    # Callables instead of values: only the requested strategy is computed.
    # The original dict literal ran every strategy, KNN included, on each call.
    strategies = {
        'zero': lambda: df.fillna(0),
        'high_number': lambda: df.fillna(1500),
        'mean': lambda: df.fillna(df.mean()),
        'median': lambda: df.fillna(df.median()),
        'knn': lambda: k_nearest(df),
        'k-nearest': lambda: k_nearest(df),
    }
    return strategies[method]()
|
||||
|
||||
|
||||
def graph_bf_af(features, phase_name, plt_flag=False):
    """Print the size of ``features`` for one cleaning phase.

    Args:
        features: DataFrame to summarise.
        phase_name: Label printed in the banner and used in the image name.
        plt_flag: When true, also save a heatmap of the NaN mask to
            ``features_overall_nans_<phase_name>.png``.
    """
    if plt_flag:
        sns.set(rc={"figure.figsize":(16, 8)})
        # Draw on a fresh figure and close it afterwards; otherwise every
        # phase keeps drawing onto the same, never-released figure.
        fig = plt.figure()
        sns.heatmap(features.isna(), cbar=False)
        plt.savefig(f'features_overall_nans_{phase_name}.png', bbox_inches='tight')
        plt.close(fig)

    print(f"\n-------------{phase_name}-------------")
    print("Rows number:", features.shape[0])
    print("Columns number:", len(features.columns))
    print("---------------------------------------------\n")
|
||||
|
|
|
@ -1,88 +1,261 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import math, sys
|
||||
import math, sys, random, warnings, yaml
|
||||
|
||||
from sklearn.impute import KNNImputer
|
||||
from sklearn.preprocessing import StandardScaler, minmax_scale
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
sys.path.append('/rapids/')
|
||||
from src.features import empatica_data_yield as edy
|
||||
|
||||
def straw_cleaning(sensor_data_files, provider, target):
    """Clean the merged all-participant feature table for one target.

    The steps run in this order: (1.0) optionally override three appraisal
    targets from the stress-event targets file, (1.1) keep rows that have the
    target, (2) drop rows with low phone / Empatica data yield, (3) contextual
    imputation with fixed values, (4) drop columns with too many NaNs,
    (5) optional zero-variance column removal, (6) drop rows with too many
    NaNs, (7) optional per-participant standardization, (8) KNN imputation,
    (9) optional removal of highly correlated columns, then conversion of the
    nominal columns to ``category`` and (10) a final NaN check.

    Args:
        sensor_data_files: Mapping whose ``"sensor_data"`` entry is a list;
            its first item is the path of the features CSV.
        provider: Provider configuration. Keys read here:
            ``PHONE_DATA_YIELD_FEATURE``, ``PHONE_DATA_YIELD_RATIO_THRESHOLD``,
            ``EMPATICA_DATA_YIELD_RATIO_THRESHOLD``, ``COLS_NAN_THRESHOLD``,
            ``COLS_VAR_THRESHOLD``, ``ROWS_NAN_THRESHOLD``, ``STANDARDIZATION``
            and ``DROP_HIGHLY_CORRELATED_FEATURES``.
        target: Target label; the column used is ``phone_esm_straw_<target>``.

    Returns:
        The cleaned DataFrame, or an empty frame holding only the segment
        columns when no row survives a filtering step.

    Raises:
        KeyError: If the phone or Empatica data-yield column is missing.
        ValueError: If NaNs are still present after all steps.
    """
    features = pd.read_csv(sensor_data_files["sensor_data"][0])

    # config.yaml is the project's own configuration file; the loader is kept
    # as in the original code.
    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]  # Get target (esm) columns

    excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']

    graph_bf_af(features, "1target_rows_before")

    # (1.0) OVERRIDE STRESSFULNESS EVENT TARGETS IF ERS SEGMENTING_METHOD IS "STRESS_EVENT"
    if config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"] == "stress_event":

        stress_events_targets = pd.read_csv("data/external/stress_event_targets.csv")
        all_labels = config['PARAMS_FOR_ANALYSIS']['TARGET']['ALL_LABELS']

        # (target label, column of the stress-event targets file that replaces it)
        overrides = (
            ("appraisal_stressfulness_event_mean", "appraisal_stressfulness_event"),
            ("appraisal_threat_mean", "appraisal_threat"),
            ("appraisal_challenge_mean", "appraisal_challenge"),
        )
        for label, source_col in overrides:
            if label not in all_labels:
                continue
            target_col = 'phone_esm_straw_' + label
            features = features.drop(columns=[target_col])
            replacement = stress_events_targets[["label", source_col]] \
                .rename(columns={'label': 'local_segment_label', source_col: target_col})
            # Inner join: segments without a stress-event row are dropped
            features = features.merge(replacement, on=['local_segment_label'], how='inner')

        esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]  # Get target (esm) columns

    # (1.1) FILTER OUT THE ROWS THAT DO NOT HAVE THE TARGET COLUMN AVAILABLE
    if config['PARAMS_FOR_ANALYSIS']['TARGET']['COMPUTE']:
        features = features[features['phone_esm_straw_' + target].notna()].reset_index(drop=True)

    if features.empty:
        return pd.DataFrame(columns=excluded_columns)

    graph_bf_af(features, "2target_rows_after")

    # (2) QUALITY CHECK (DATA YIELD COLUMN): drop rows where E4 or phone data is low quality
    phone_data_yield_unit = provider["PHONE_DATA_YIELD_FEATURE"].split("_")[3].lower()
    phone_data_yield_column = "phone_data_yield_rapids_ratiovalidyielded" + phone_data_yield_unit

    features = edy.calculate_empatica_data_yield(features)

    # Both columns are read below. The original guard combined the checks with
    # `and` and formatted an undefined name, so it could never produce this message.
    if phone_data_yield_column not in features.columns or "empatica_data_yield" not in features.columns:
        raise KeyError(f"RAPIDS provider needs to clean the selected event features based on {phone_data_yield_column} and empatica_data_yield columns. For phone data yield, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded{phone_data_yield_unit}' in [FEATURES].")

    features[["empatica_data_yield", phone_data_yield_column]].hist()
    plt.savefig('phone_E4_histogram.png', bbox_inches='tight')
    plt.close()  # release the figure once it has been written

    # Drop rows where phone data yield is less than the given threshold
    if provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]:
        features = features[features[phone_data_yield_column] >= provider["PHONE_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)

    # Drop rows where empatica data yield is less than the given threshold
    if provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]:
        features = features[features["empatica_data_yield"] >= provider["EMPATICA_DATA_YIELD_RATIO_THRESHOLD"]].reset_index(drop=True)

    if features.empty:
        return pd.DataFrame(columns=excluded_columns)

    graph_bf_af(features, "3data_yield_drop_rows")

    # (3) CONTEXTUAL IMPUTATION

    # Impute selected phone features with a high number
    high_number_markers = ("timeoffirstuse", "timeoflastuse", "timefirstcall", "timelastcall",
                           "firstuseafter", "timefirstmessages", "timelastmessages")
    impute_w_hn = [col for col in features.columns if any(marker in col for marker in high_number_markers)]
    features[impute_w_hn] = features[impute_w_hn].fillna(1500)

    # Impute special cases (mostcommonactivity) and (homelabel) - nominal/ordinal values
    impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
    features[impute_w_sn] = features[impute_w_sn].fillna(4)

    impute_w_sn2 = [col for col in features.columns if "homelabel" in col]
    features[impute_w_sn2] = features[impute_w_sn2].fillna(1)

    # Special case - loglocationvariance
    impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
    features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000)

    # Impute location features (all doryab location columns except radius of gyration)
    impute_locations = [col for col in features
                        if col.startswith('phone_locations_doryab_') and 'radiusgyration' not in col]

    # Impute selected phone, location, and esm features with 0
    zero_prefixes = (
        'phone_applications_foreground_rapids_',
        'phone_activity_recognition_',
        'phone_battery_rapids_',
        'phone_bluetooth_rapids_',
        'phone_light_rapids_',
        'phone_calls_rapids_',
        'phone_messages_rapids_',
        'phone_screen_rapids_',
        'phone_bluetooth_doryab_',
        'phone_wifi_visible',
    )
    impute_zero = [col for col in features if col.startswith(zero_prefixes)]

    zero_fill_cols = impute_zero + impute_locations + list(esm_cols.columns)
    features[zero_fill_cols] = features[zero_fill_cols].fillna(0)

    graph_bf_af(features, "4context_imp")

    # (4) REMOVE COLS IF THEIR NAN THRESHOLD IS PASSED
    # (strict "<" also drops all-NaN columns; use "<=" if those must be preserved)
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]  # Get target (esm) columns

    features = features.loc[:, features.isna().sum() < provider["COLS_NAN_THRESHOLD"] * features.shape[0]].copy()

    graph_bf_af(features, "5too_much_nans_cols")

    # (5) REMOVE COLS WHERE VARIANCE IS 0
    if provider["COLS_VAR_THRESHOLD"]:
        # numeric_only keeps the text columns out of std(); computed once
        column_std = features.std(numeric_only=True)
        features = features.drop(columns=column_std[column_std == 0].index)

    graph_bf_af(features, "6variance_drop")

    # Preserve esm cols if deleted (has to come after drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    # (6) DO THE ROWS CONSIST OF ENOUGH NON-NAN VALUES?
    min_count = math.ceil((1 - provider["ROWS_NAN_THRESHOLD"]) * features.shape[1])  # minimal not-NaN values in row
    features = features.dropna(axis=0, thresh=min_count)  # thresh => at least this many not-NaNs

    graph_bf_af(features, "7too_much_nans_rows")

    if features.empty:
        return pd.DataFrame(columns=excluded_columns)

    # (7) STANDARDIZATION
    if provider["STANDARDIZATION"]:
        nominal_cols = [col for col in features.columns if "mostcommonactivity" in col or "homelabel" in col]  # Excluded nominal features
        # Expected warning within this code block
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            # 'pid' stays on the right-hand side only so it can serve as the
            # grouping key; each column is scaled within each participant.
            features.loc[:, ~features.columns.isin(excluded_columns + ["pid"] + nominal_cols)] = \
                features.loc[:, ~features.columns.isin(excluded_columns + nominal_cols)].groupby('pid').transform(lambda x: StandardScaler().fit_transform(x.values[:,np.newaxis]).ravel())

    graph_bf_af(features, "8standardization")

    # (8) IMPUTATION: IMPUTE DATA WITH KNN METHOD
    features.reset_index(drop=True, inplace=True)
    impute_cols = [col for col in features.columns if col not in excluded_columns and col != "pid"]

    features[impute_cols] = impute(features[impute_cols], method="knn")

    graph_bf_af(features, "9knn_after")

    # (9) DROP HIGHLY CORRELATED FEATURES
    esm_cols = features.loc[:, features.columns.str.startswith('phone_esm_straw')]

    drop_corr_features = provider["DROP_HIGHLY_CORRELATED_FEATURES"]
    if drop_corr_features["COMPUTE"] and features.shape[0] > 5:  # with few segments (rows), skip the correlation check

        numeric = features[features.select_dtypes(include=np.number).columns.tolist()]

        # Remove columns where NaN count threshold is passed
        valid_features = numeric.loc[:, numeric.isna().sum() < drop_corr_features['MIN_OVERLAP_FOR_CORR_THRESHOLD'] * numeric.shape[0]]

        corr_matrix = valid_features.corr().abs()
        # Upper triangle only, so each pair is inspected once (np.bool no longer exists in NumPy)
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if (upper[column] > drop_corr_features["CORR_THRESHOLD"]).any()]

        features = features.drop(columns=to_drop)

    # Preserve esm cols if deleted (has to come after drop cols operations)
    for esm in esm_cols:
        if esm not in features:
            features[esm] = esm_cols[esm]

    graph_bf_af(features, "10correlation_drop")

    # Transform categorical columns to category dtype
    cat1 = [col for col in features.columns if "mostcommonactivity" in col]
    if cat1:  # mostcommonactivity
        features[cat1] = features[cat1].astype(int).astype('category')

    cat2 = [col for col in features.columns if "homelabel" in col]
    if cat2:  # homelabel
        features[cat2] = features[cat2].astype(int).astype('category')

    # (10) VERIFY IF THERE ARE ANY NANS LEFT IN THE DATAFRAME
    if features.isna().any().any():
        raise ValueError("There are still some NaNs present in the dataframe. Please check for implementation errors.")

    return features
|
||||
|
||||
def impute(df, method='zero'):
    """Return a copy of ``df`` with NaNs filled by the chosen strategy.

    Args:
        df: DataFrame to impute.
        method: One of ``'zero'``, ``'high_number'`` (1500), ``'mean'``,
            ``'median'`` or ``'knn'`` (``'k-nearest'`` is accepted as an
            alias for the older spelling).

    Returns:
        The imputed DataFrame, with the same index and columns as ``df``.

    Raises:
        KeyError: If ``method`` is not a known strategy.
    """

    def k_nearest(frame):
        imputer = KNNImputer(n_neighbors=3)
        # Keep the caller's index so the result aligns on assignment
        return pd.DataFrame(imputer.fit_transform(frame), columns=frame.columns, index=frame.index)

    # Callables instead of values: only the requested strategy is computed.
    # The original dict literal ran every strategy, KNN included, on each call.
    strategies = {
        'zero': lambda: df.fillna(0),
        'high_number': lambda: df.fillna(1500),
        'mean': lambda: df.fillna(df.mean()),
        'median': lambda: df.fillna(df.median()),
        'knn': lambda: k_nearest(df),
        'k-nearest': lambda: k_nearest(df),
    }
    return strategies[method]()
|
||||
|
||||
|
||||
def graph_bf_af(features, phase_name, plt_flag=False):
    """Print the size and NaN count of ``features`` for one cleaning phase.

    Args:
        features: DataFrame to summarise.
        phase_name: Label printed in the banner and used in the image name.
        plt_flag: When true, also save a heatmap of the NaN mask to
            ``features_overall_nans_<phase_name>.png``.
    """
    if plt_flag:
        sns.set(rc={"figure.figsize":(16, 8)})
        # Draw on a fresh figure and close it afterwards; otherwise every
        # phase keeps drawing onto the same, never-released figure.
        fig = plt.figure()
        sns.heatmap(features.isna(), cbar=False)
        plt.savefig(f'features_overall_nans_{phase_name}.png', bbox_inches='tight')
        plt.close(fig)

    print(f"\n-------------{phase_name}-------------")
    print("Rows number:", features.shape[0])
    print("Columns number:", len(features.columns))
    print("NaN values:", features.isna().sum().sum())
    print("---------------------------------------------\n")
|
||||
|
|
|
@ -21,7 +21,7 @@ def extract_second_order_features(intraday_features, so_features_names, prefix="
|
|||
so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).median().add_suffix("_SO_median")], axis=1)
|
||||
|
||||
if "sd" in so_features_names:
|
||||
so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().add_suffix("_SO_sd")], axis=1)
|
||||
so_features = pd.concat([so_features, intraday_features.drop(prefix+"level_1", axis=1).groupby(groupby_cols).std().fillna(0).add_suffix("_SO_sd")], axis=1)
|
||||
|
||||
if "nlargest" in so_features_names: # largest 5 -- maybe there is a faster groupby solution?
|
||||
for column in intraday_features.loc[:, ~intraday_features.columns.isin(groupby_cols+[prefix+"level_1"])]:
|
||||
|
|
|
@ -43,7 +43,11 @@ def extract_acc_features_from_intraday_data(acc_intraday_data, features, window_
|
|||
|
||||
|
||||
def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
|
||||
acc_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])
|
||||
|
||||
data_types = {'local_timezone': 'str', 'device_id': 'str', 'timestamp': 'int64', 'double_values_0': 'float64',
|
||||
'double_values_1': 'float64', 'double_values_2': 'float64', 'local_date_time': 'str', 'local_date': "str",
|
||||
'local_time': "str", 'local_hour': "str", 'local_minute': "str", 'assigned_segments': "str"}
|
||||
acc_intraday_data = pd.read_csv(sensor_data_files["sensor_data"], dtype=data_types)
|
||||
|
||||
requested_intraday_features = provider["FEATURES"]
|
||||
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
|
||||
import sys, yaml
|
||||
|
||||
def calculate_empatica_data_yield(features):
    """Add an ``empatica_data_yield`` column to ``features`` and return it.

    For each of four Empatica sensors the per-segment yield is
    ``<sensor>_cr_SO_windowsCount * WINDOW_LENGTH / segment duration (s)``,
    capped at 1. A sensor whose windows-count column is absent contributes 0.
    ``empatica_data_yield`` is the row-wise mean of the four values, with NaN
    replaced by 0.

    Args:
        features: DataFrame with ``local_segment_start_datetime`` and
            ``local_segment_end_datetime`` columns formatted as
            ``%Y-%m-%d %H:%M:%S``. It is modified in place.

    Returns:
        The same DataFrame with the ``empatica_data_yield`` column added.
    """
    datetime_format = '%Y-%m-%d %H:%M:%S'

    # Time segment duration in seconds for every segment in the features dataframe
    datetime_start = pd.to_datetime(features['local_segment_start_datetime'], format=datetime_format)
    datetime_end = pd.to_datetime(features['local_segment_end_datetime'], format=datetime_format)
    tseg_duration = (datetime_end - datetime_start).dt.total_seconds()

    # config.yaml is the project's own configuration file; the loader is kept
    # as in the original code.
    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    sensors = ["EMPATICA_ACCELEROMETER", "EMPATICA_TEMPERATURE", "EMPATICA_ELECTRODERMAL_ACTIVITY", "EMPATICA_INTER_BEAT_INTERVAL"]

    empatica_data_yield_cols = []
    for sensor in sensors:
        windows_count_col = f"{sensor.lower()}_cr_SO_windowsCount"
        yield_col = f"{sensor.lower()}_data_yield"
        if windows_count_col in features:
            window_length = config[sensor]["PROVIDERS"]["CR"]["WINDOWS"]["WINDOW_LENGTH"]
            features[yield_col] = (features[windows_count_col] * window_length) / tseg_duration
        else:
            features[yield_col] = 0
        empatica_data_yield_cols.append(yield_col)

    # Cap values over 1 (windows that are not filled fully can overshoot); NaN stays NaN
    features[empatica_data_yield_cols] = features[empatica_data_yield_cols].clip(upper=1)

    features["empatica_data_yield"] = features[empatica_data_yield_cols].mean(axis=1).fillna(0)
    # The per-sensor columns are dropped for now; keep them if a more advanced
    # combination (e.g., a weighted average) is needed later.
    features.drop(empatica_data_yield_cols, axis=1, inplace=True)

    return features
|
|
@ -44,7 +44,11 @@ def extract_eda_features_from_intraday_data(eda_intraday_data, features, window_
|
|||
|
||||
|
||||
def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
|
||||
eda_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])
|
||||
|
||||
data_types = {'local_timezone': 'str', 'device_id': 'str', 'timestamp': 'int64', 'electrodermal_activity': 'float64', 'local_date_time': 'str',
|
||||
'local_date': "str", 'local_time': "str", 'local_hour': "str", 'local_minute': "str", 'assigned_segments': "str"}
|
||||
|
||||
eda_intraday_data = pd.read_csv(sensor_data_files["sensor_data"], dtype=data_types)
|
||||
|
||||
requested_intraday_features = provider["FEATURES"]
|
||||
|
||||
|
|
|
@ -50,6 +50,11 @@ def extract_ibi_features_from_intraday_data(ibi_intraday_data, features, window_
|
|||
|
||||
|
||||
def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
|
||||
|
||||
data_types = {'local_timezone': 'str', 'device_id': 'str', 'timestamp': 'int64', 'inter_beat_interval': 'float64', 'timings': 'float64', 'local_date_time': 'str',
|
||||
'local_date': "str", 'local_time': "str", 'local_hour': "str", 'local_minute': "str", 'assigned_segments': "str"}
|
||||
|
||||
temperature_intraday_data = pd.read_csv(sensor_data_files["sensor_data"], dtype=data_types)
|
||||
ibi_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])
|
||||
|
||||
requested_intraday_features = provider["FEATURES"]
|
||||
|
|
|
@ -37,7 +37,10 @@ def extract_temp_features_from_intraday_data(temperature_intraday_data, features
|
|||
|
||||
|
||||
def cr_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
|
||||
temperature_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])
|
||||
data_types = {'local_timezone': 'str', 'device_id': 'str', 'timestamp': 'int64', 'temperature': 'float64', 'local_date_time': 'str',
|
||||
'local_date': "str", 'local_time': "str", 'local_hour': "str", 'local_minute': "str", 'assigned_segments': "str"}
|
||||
|
||||
temperature_intraday_data = pd.read_csv(sensor_data_files["sensor_data"], dtype=data_types)
|
||||
|
||||
requested_intraday_features = provider["FEATURES"]
|
||||
|
||||
|
|
|
@ -13,7 +13,10 @@ calc_windows = True if (provider.get("WINDOWS", False) and provider["WINDOWS"].g
|
|||
|
||||
if sensor_key == "all_cleaning_individual" or sensor_key == "all_cleaning_overall":
|
||||
# Data cleaning
|
||||
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
|
||||
if "overall" in sensor_key:
|
||||
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files, snakemake.params["target"])
|
||||
else:
|
||||
sensor_features = run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files)
|
||||
else:
|
||||
# Extract sensor features
|
||||
del sensor_data_files["time_segments_labels"]
|
||||
|
|
|
@ -37,6 +37,6 @@ def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_se
|
|||
ar_features.index.names = ["local_segment"]
|
||||
ar_features = ar_features.reset_index()
|
||||
|
||||
ar_features.fillna(value={"count": 0, "countuniqueactivities": 0, "durationstationary": 0, "durationmobile": 0, "durationvehicle": 0}, inplace=True)
|
||||
ar_features.fillna(value={"count": 0, "countuniqueactivities": 0, "durationstationary": 0, "durationmobile": 0, "durationvehicle": 0, "mostcommonactivity": 4}, inplace=True)
|
||||
|
||||
return ar_features
|
||||
|
|
|
@ -9,19 +9,19 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features
|
|||
if "timeoffirstuse" in requested_features:
|
||||
time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
||||
if time_first_event.empty:
|
||||
apps_features["timeoffirstuse" + apps_type] = np.nan
|
||||
apps_features["timeoffirstuse" + apps_type] = 1500 # np.nan
|
||||
else:
|
||||
apps_features["timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
|
||||
if "timeoflastuse" in requested_features:
|
||||
time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
||||
if time_last_event.empty:
|
||||
apps_features["timeoflastuse" + apps_type] = np.nan
|
||||
apps_features["timeoflastuse" + apps_type] = 1500 # np.nan
|
||||
else:
|
||||
apps_features["timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
|
||||
if "frequencyentropy" in requested_features:
|
||||
apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
|
||||
if (len(apps_with_count.index) < 2 ):
|
||||
apps_features["frequencyentropy" + apps_type] = np.nan
|
||||
apps_features["frequencyentropy" + apps_type] = 0 # np.nan
|
||||
else:
|
||||
apps_features["frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy)
|
||||
if "countevent" in requested_features:
|
||||
|
@ -43,6 +43,7 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features
|
|||
apps_features["sumduration" + apps_type] = filtered_data.groupby(by = ["local_segment"])["duration"].sum()
|
||||
|
||||
apps_features.index.names = ["local_segment"]
|
||||
|
||||
return apps_features
|
||||
|
||||
def process_app_features(data, requested_features, time_segment, provider, filter_data_by_segment):
|
||||
|
|
|
@ -14,8 +14,8 @@ def deviceFeatures(devices, ownership, common_devices, features_to_compute, feat
|
|||
features = features.join(device_value_counts.groupby("local_segment")["bt_address"].nunique().to_frame("uniquedevices" + ownership), how="outer")
|
||||
if "meanscans" in features_to_compute:
|
||||
features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer")
|
||||
if "stdscans" in features_to_compute:
|
||||
features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership), how="outer")
|
||||
if "stdscans" in features_to_compute:
|
||||
features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership).fillna(0), how="outer")
|
||||
# Most frequent device within segments, across segments, and across dataset
|
||||
if "countscansmostfrequentdevicewithinsegments" in features_to_compute:
|
||||
features = features.join(device_value_counts.groupby("local_segment")["scans"].max().to_frame("countscansmostfrequentdevicewithinsegments" + ownership), how="outer")
|
||||
|
|
|
@ -88,6 +88,16 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
|||
features <- call_features_of_type(calls_of_type, features_type, call_type, time_segment, requested_features)
|
||||
call_features <- merge(call_features, features, all=TRUE)
|
||||
}
|
||||
call_features <- call_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count") | contains("sumduration") | contains("minduration") | contains("maxduration") | contains("meanduration") | contains("modeduration")), list( ~ replace_na(., 0)))
|
||||
|
||||
# Fill selected columns with a high number
|
||||
time_cols <- select(call_features, contains("timefirstcall") | contains("timelastcall")) %>%
|
||||
colnames(.)
|
||||
|
||||
call_features <- call_features %>%
|
||||
mutate_at(., time_cols, ~replace(., is.na(.), 1500))
|
||||
|
||||
# Fill NA values with 0
|
||||
call_features <- call_features %>% mutate_all(~replace(., is.na(.), 0))
|
||||
|
||||
return(call_features)
|
||||
}
|
|
@ -0,0 +1,274 @@
|
|||
from collections.abc import Collection
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pytz import timezone
|
||||
import datetime, json
|
||||
|
||||
# from config.models import ESM, Participant
|
||||
# from features import helper
|
||||
|
||||
ESM_STATUS_ANSWERED = 2
|
||||
|
||||
GROUP_SESSIONS_BY = ["device_id", "esm_session"] # 'participant_id
|
||||
|
||||
SESSION_STATUS_UNANSWERED = "ema_unanswered"
|
||||
SESSION_STATUS_DAY_FINISHED = "day_finished"
|
||||
SESSION_STATUS_COMPLETE = "ema_completed"
|
||||
|
||||
ANSWER_DAY_FINISHED = "DayFinished3421"
|
||||
ANSWER_DAY_OFF = "DayOff3421"
|
||||
ANSWER_SET_EVENING = "DayFinishedSetEvening"
|
||||
|
||||
MAX_MORNING_LENGTH = 3
|
||||
# When the participant was not yet at work at the time of the first (morning) EMA,
|
||||
# only three items were answered.
|
||||
# Two sleep related items and one indicating NOT starting work yet.
|
||||
# Daytime EMAs are all longer, in fact they always consist of at least 6 items.
|
||||
|
||||
|
||||
TZ_LJ = timezone("Europe/Ljubljana")
|
||||
COLUMN_TIMESTAMP = "timestamp"
|
||||
COLUMN_TIMESTAMP_ESM = "double_esm_user_answer_timestamp"
|
||||
|
||||
|
||||
def get_date_from_timestamp(df_aware) -> pd.DataFrame:
    """
    Add a timezone-aware datetime column and a "study day" date column.

    The timestamp is taken from COLUMN_TIMESTAMP_ESM when that column exists and
    from COLUMN_TIMESTAMP otherwise. It is divided by 1000 before conversion
    (i.e. treated as milliseconds) and localised to TZ_LJ.
    The date is the calendar date of the datetime shifted 4 hours back, so that
    anything until 4 AM still counts as the previous day.

    Parameters
    ----------
    df_aware: pd.DataFrame
        Any AWARE-type data containing one of the two timestamp columns.
        Note that the datetime_lj column is written onto this frame in place.

    Returns
    -------
    df_aware: pd.DataFrame
        A dataframe with datetime_lj and date_lj columns added.

    """
    column_timestamp = (
        COLUMN_TIMESTAMP_ESM if COLUMN_TIMESTAMP_ESM in df_aware else COLUMN_TIMESTAMP
    )

    def _to_local_datetime(timestamp_ms):
        return datetime.datetime.fromtimestamp(timestamp_ms / 1000.0, tz=TZ_LJ)

    df_aware["datetime_lj"] = df_aware[column_timestamp].apply(_to_local_datetime)

    # Since daytime EMAs could *theoretically* last beyond midnight, but never after 4 AM,
    # the datetime is first translated to 4 h earlier.
    day_boundary_shift = datetime.timedelta(hours=4)
    shifted_datetime = df_aware["datetime_lj"] - day_boundary_shift

    return df_aware.assign(date_lj=shifted_datetime.dt.date)
|
||||
|
||||
|
||||
def preprocess_esm(df_esm: pd.DataFrame) -> pd.DataFrame:
    """
    Convert timestamps into human-readable datetimes and dates
    and expand the JSON column into several Pandas DF columns.

    Parameters
    ----------
    df_esm: pd.DataFrame
        A dataframe of esm data. It must contain an esm_json column holding
        JSON strings, each of which includes an esm_trigger field.

    Returns
    -------
    df_esm_preprocessed: pd.DataFrame
        A dataframe with added columns: datetime in Ljubljana timezone and all fields from ESM_JSON column.
    """
    df_esm = get_date_from_timestamp(df_esm)

    parsed_json = df_esm["esm_json"].apply(json.loads)
    # The esm_trigger column is already present in the main df.
    df_esm_json = pd.json_normalize(parsed_json).drop(columns=["esm_trigger"])

    # json_normalize yields exactly one row per record but may number them 0..n-1.
    # Re-attach the original row labels so the label-based join below pairs every
    # row with its own JSON fields even when df_esm does not have a default index.
    df_esm_json.index = df_esm.index

    return df_esm.join(df_esm_json)
|
||||
|
||||
|
||||
def classify_sessions_by_completion(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each distinct EMA session, determine how the participant responded to it.
    Possible outcomes are: SESSION_STATUS_UNANSWERED, SESSION_STATUS_DAY_FINISHED, and SESSION_STATUS_COMPLETE
    (plus the trigger name for single-item sessions, see the third step).

    This is done in three steps.

    First, the esm_status is considered.
    If any of the ESMs in a session has a status *other than* "answered", then this session is taken as unfinished.

    Second, the sessions which do not represent full questionnaires are identified.
    These are sessions where participants only marked they are finished with the day or have not yet started working.
    This step overrides the status assigned in the first step.

    Third, the sessions with only one item that are still unclassified are marked with their trigger.
    We never offered questionnaires with single items, so we can be sure these are unfinished.

    Finally, all sessions that remain are marked as completed.
    By going through different possibilities in expl_esm_adherence.ipynb, this turned out to be a reasonable option.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts: pd.Dataframe
        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their statuses and the number of items.
    """
    sessions_grouped = df_esm_preprocessed.groupby(GROUP_SESSIONS_BY)

    # 0. Count the items of each session and start with every status missing.
    df_session_counts = pd.DataFrame(sessions_grouped.count()["timestamp"]).rename(
        columns={"timestamp": "esm_session_count"}
    )
    # The column only ever receives strings, so create it with object dtype
    # instead of as a float column that has to be upcast on first assignment.
    df_session_counts["session_response"] = pd.Series(
        np.nan, index=df_session_counts.index, dtype=object
    )

    # 1. Identify all ESMs with status other than answered.
    esm_not_answered = sessions_grouped.apply(
        lambda x: (x.esm_status != ESM_STATUS_ANSWERED).any()
    )
    df_session_counts.loc[
        esm_not_answered, "session_response"
    ] = SESSION_STATUS_UNANSWERED

    # 2. Identify non-sessions, i.e. answers about the end of the day.
    non_session_answers = [
        ANSWER_DAY_FINISHED,  # I finished working for today.
        ANSWER_DAY_OFF,  # I am not going to work today.
        ANSWER_SET_EVENING,  # When would you like to answer the evening EMA?
    ]
    non_session = sessions_grouped.apply(
        lambda x: x.esm_user_answer.isin(non_session_answers).any()
    )
    df_session_counts.loc[non_session, "session_response"] = SESSION_STATUS_DAY_FINISHED

    # 3. Identify sessions appearing only once, as those were not true EMAs for sure.
    singleton_sessions = (df_session_counts.esm_session_count == 1) & (
        df_session_counts.session_response.isna()
    )
    df_session_1 = df_session_counts[singleton_sessions]
    df_esm_unique_session = df_session_1.join(
        df_esm_preprocessed.set_index(GROUP_SESSIONS_BY), how="left"
    )
    # A singleton session is labelled with the trigger of its only item.
    df_esm_unique_session = df_esm_unique_session.assign(
        session_response=lambda x: x.esm_trigger
    )["session_response"]
    df_session_counts.loc[
        df_esm_unique_session.index, "session_response"
    ] = df_esm_unique_session

    # 4. Mark the remaining sessions as completed.
    df_session_counts.loc[
        df_session_counts.session_response.isna(), "session_response"
    ] = SESSION_STATUS_COMPLETE

    return df_session_counts
|
||||
|
||||
|
||||
def classify_sessions_by_time(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
    """
    For each EMA session, return the time type and the datetime of its earliest record.

    Rows are ordered by datetime_lj and the first value per session is taken,
    so the time type (morning, workday, or evening) is the one in effect when the session started.

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the session ID (esm_session)
        as well as the time and datetime_lj columns.

    Returns
    -------
    df_session_time: pd.DataFrame
        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with their time type and timestamp of first answer.
    """
    chronological = df_esm_preprocessed.sort_values(["datetime_lj"])  # "participant_id"
    first_per_session = chronological.groupby(GROUP_SESSIONS_BY).first()
    df_session_time = first_per_session[["time", "datetime_lj"]]
    return df_session_time
|
||||
|
||||
|
||||
def classify_sessions_by_completion_time(
    df_esm_preprocessed: pd.DataFrame,
) -> pd.DataFrame:
    """
    Classify sessions by completion and by time, and "correct" the time type of some sessions.

    A morning questionnaire could seamlessly transition into a daytime questionnaire
    if the participant was already at work, in which case the "time" label changed mid-session.
    Because classify_sessions_by_time uses the first record of a session,
    such a questionnaire is classified as "morning",
    but for all intents and purposes it can be treated as a "daytime" EMA.

    This scenario is told apart from a true "morning" questionnaire,
    where the participant was NOT yet at work, by the session length:
    "morning" sessions with more than MAX_MORNING_LENGTH items are relabelled "daytime".

    Parameters
    ----------
    df_esm_preprocessed: pd.DataFrame
        A preprocessed dataframe of esm data, which must include the session ID (esm_session).

    Returns
    -------
    df_session_counts_time: pd.DataFrame
        A dataframe of all sessions (grouped by GROUP_SESSIONS_BY) with statuses, the number of items,
        their time type (with some morning EMAs reclassified) and timestamp of first answer.

    """
    completion = classify_sessions_by_completion(df_esm_preprocessed)
    timing = classify_sessions_by_time(df_esm_preprocessed)

    df_session_counts_time = timing.join(completion)

    labelled_morning = df_session_counts_time["time"] == "morning"
    too_long_for_morning = (
        df_session_counts_time["esm_session_count"] > MAX_MORNING_LENGTH
    )
    df_session_counts_time.loc[
        labelled_morning & too_long_for_morning, "time"
    ] = "daytime"

    return df_session_counts_time
|
||||
|
||||
|
||||
# def clean_up_esm(df_esm_preprocessed: pd.DataFrame) -> pd.DataFrame:
|
||||
# """
|
||||
# This function eliminates invalid ESM responses.
|
||||
# It removes unanswered ESMs and those that indicate end of work and similar.
|
||||
# It also extracts a numeric answer from strings such as "4 - I strongly agree".
|
||||
|
||||
# Parameters
|
||||
# ----------
|
||||
# df_esm_preprocessed: pd.DataFrame
|
||||
# A preprocessed dataframe of esm data.
|
||||
|
||||
# Returns
|
||||
# -------
|
||||
# df_esm_clean: pd.DataFrame
|
||||
# A subset of the original dataframe.
|
||||
|
||||
# """
|
||||
# df_esm_clean = df_esm_preprocessed[
|
||||
# df_esm_preprocessed["esm_status"] == ESM_STATUS_ANSWERED
|
||||
# ]
|
||||
# df_esm_clean = df_esm_clean[
|
||||
# ~df_esm_clean["esm_user_answer"].isin(
|
||||
# [ANSWER_DAY_FINISHED, ANSWER_DAY_OFF, ANSWER_SET_EVENING]
|
||||
# )
|
||||
# ]
|
||||
# df_esm_clean["esm_user_answer_numeric"] = np.nan
|
||||
# esm_type_numeric = [
|
||||
# ESM.ESM_TYPE.get("radio"),
|
||||
# ESM.ESM_TYPE.get("scale"),
|
||||
# ESM.ESM_TYPE.get("number"),
|
||||
# ]
|
||||
# df_esm_clean.loc[
|
||||
# df_esm_clean["esm_type"].isin(esm_type_numeric)
|
||||
# ] = df_esm_clean.loc[df_esm_clean["esm_type"].isin(esm_type_numeric)].assign(
|
||||
# esm_user_answer_numeric=lambda x: x.esm_user_answer.str.slice(stop=1).astype(
|
||||
# int
|
||||
# )
|
||||
# )
|
||||
# return df_esm_clean
|
|
@ -42,7 +42,8 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
|
|||
requested_features = provider["FEATURES"]
|
||||
# name of the features this function can compute
|
||||
requested_scales = provider["SCALES"]
|
||||
base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support"]
|
||||
base_features_names = ["PANAS_positive_affect", "PANAS_negative_affect", "JCQ_job_demand", "JCQ_job_control", "JCQ_supervisor_support", "JCQ_coworker_support",
|
||||
"appraisal_stressfulness_period", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]
|
||||
#TODO Check valid questionnaire and feature names.
|
||||
# the subset of requested features this function can compute
|
||||
features_to_compute = list(set(requested_features) & set(base_features_names))
|
||||
|
@ -52,7 +53,6 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
|
|||
|
||||
if not esm_data.empty:
|
||||
esm_features = pd.DataFrame()
|
||||
|
||||
for scale in requested_scales:
|
||||
questionnaire_id = QUESTIONNAIRE_IDS[scale]
|
||||
mask = esm_data["questionnaire_id"] == questionnaire_id
|
||||
|
@ -60,4 +60,7 @@ def straw_features(sensor_data_files, time_segment, provider, filter_data_by_seg
|
|||
#TODO Create the column esm_user_score in esm_clean. Currently, this is only done when reversing.
|
||||
|
||||
esm_features = esm_features.reset_index()
|
||||
if 'index' in esm_features: # In calse of empty esm_features df
|
||||
esm_features.rename(columns={'index': 'local_segment'}, inplace=True)
|
||||
|
||||
return esm_features
|
||||
|
|
|
@ -0,0 +1,220 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import datetime
|
||||
|
||||
import math, sys, yaml
|
||||
|
||||
from esm_preprocess import clean_up_esm
|
||||
from esm import classify_sessions_by_completion_time, preprocess_esm
|
||||
|
||||
input_data_files = dict(snakemake.input)
|
||||
|
||||
def format_timestamp(x):
    """Format a number of seconds as a space-separated "<h>H <m>M <s>S" string.

    A component is only included when it is greater than zero, e.g. 3661 gives
    "1H 1M 1S", 1800 gives "30M" and 45 gives "45S". An input of 0 gives "".

    Args:
        x (int): duration in seconds

    Returns:
        str: the formatted duration
    """
    components = (
        (x // 3600, "H"),
        (x % 3600 // 60, "M"),
        (x % 60, "S"),
    )
    return " ".join(f"{amount}{unit}" for amount, unit in components if amount > 0)
|
||||
|
||||
|
||||
def extract_ers(esm_df):
    """Build the STRAW event-related segments (ERS) table from a participant's ESM data.

    This method has two major functionalities:
        (1) It prepares the STRAW event-related segments with the use of the esm file. The execution protocol
            depends on the segmenting method specified in config.yaml
            (TIME_SEGMENTS -> TAILORED_EVENTS -> SEGMENTING_METHOD).
        (2) It writes a csv with targets and corresponding time segment labels to snakemake.output[1].
            This is later used in the overall cleaning script (straw). For every method other than
            "stress_event" that file only contains a header.

    Only sessions classified as 'ema_completed' are used, and sessions whose answering took longer
    than 15 minutes are excluded. Details about each segmenting method are given next to the
    corresponding branch. Refer to the RAPIDS documentation for the ERS file format:
    https://www.rapids.science/1.9/setup/configuration/#time-segments -> event segments

    Args:
        esm_df (DataFrame): raw esm data of the current participant.

    Returns:
        extracted_ers (DataFrame): dataframe with the columns label, event_timestamp, length, shift,
            shift_direction and device_id, i.e. everything needed to write the event-related segments file.

    Raises:
        Exception: if the configured segmenting method is not one of "30_before", "90_before" or "stress_event".
    """
    # NOTE(review): these change process-wide pandas display options; they look like a debugging aid.
    pd.set_option("display.max_rows", 20)
    pd.set_option("display.max_columns", None)

    with open('config.yaml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    # Create an empty stress_events_targets file (overwritten below by the "stress_event" method)
    pd.DataFrame(columns=["label", "intensity"]).to_csv(snakemake.output[1])

    esm_preprocessed = clean_up_esm(preprocess_esm(esm_df))

    session_keys = ['device_id', 'esm_session']

    # Take only ema_completed sessions responses
    classified = classify_sessions_by_completion_time(esm_preprocessed)
    esm_filtered_sessions = classified[classified["session_response"] == 'ema_completed'].reset_index()[session_keys]
    # Match on the (device_id, esm_session) pair. Testing the two columns independently would also
    # keep a session of one device whose number merely coincides with a completed session of another.
    is_completed_session = pd.MultiIndex.from_frame(esm_preprocessed[session_keys]).isin(
        pd.MultiIndex.from_frame(esm_filtered_sessions)
    )
    esm_df = esm_preprocessed.loc[is_completed_session]

    segmenting_method = config["TIME_SEGMENTS"]["TAILORED_EVENTS"]["SEGMENTING_METHOD"]

    if segmenting_method in ["30_before", "90_before"]:
        # '30-minutes before' and '90-minutes before' share the same fundamental logic with a couple of
        # deviations that are explained below. Both take an x-minute period before the questionnaire
        # that is summed with the questionnaire duration.
        # All questionnaire durations over 15 minutes are excluded from the querying.

        # Extract time-relevant information
        extracted_ers = esm_df.groupby(session_keys)['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index() # questionnaire length
        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)
        extracted_ers[['event_timestamp', 'device_id']] = esm_df.groupby(session_keys)['timestamp'].min().reset_index()[['timestamp', 'device_id']]
        extracted_ers = extracted_ers[extracted_ers["timestamp"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
        extracted_ers["shift_direction"] = -1

        if segmenting_method == "30_before":
            # Simply takes 30 minutes before the questionnaire and sums it with the questionnaire duration.
            # The durations are formatted with the help of format_timestamp().
            time_before_questionnaire = 30 * 60 # in seconds (30 minutes)

            extracted_ers["length"] = (extracted_ers["timestamp"] + time_before_questionnaire).apply(format_timestamp)
            extracted_ers["shift"] = time_before_questionnaire
            extracted_ers["shift"] = extracted_ers["shift"].apply(format_timestamp)

        elif segmenting_method == "90_before":
            # If the time between the current and the previous questionnaire is longer than 90 minutes,
            # 90 minutes are taken; otherwise the actual time difference between the questionnaires is used.
            time_before_questionnaire = 90 * 60 # in seconds (90 minutes)

            extracted_ers[['end_event_timestamp', 'device_id']] = esm_df.groupby(session_keys)['timestamp'].max().reset_index()[['timestamp', 'device_id']]

            # Gap (in ms) between the start of this questionnaire and the end of the previous row's one;
            # the first row is compared against 0, so it always gets capped to the full 90 minutes.
            extracted_ers['diffs'] = extracted_ers['event_timestamp'].astype('int64') - extracted_ers['end_event_timestamp'].shift(1, fill_value=0).astype('int64')
            extracted_ers.loc[extracted_ers['diffs'] > time_before_questionnaire * 1000, 'diffs'] = time_before_questionnaire * 1000

            extracted_ers["diffs"] = (extracted_ers["diffs"] / 1000).apply(math.ceil)

            extracted_ers["length"] = (extracted_ers["timestamp"] + extracted_ers["diffs"]).apply(format_timestamp)
            extracted_ers["shift"] = extracted_ers["diffs"].apply(format_timestamp)

    elif segmenting_method == "stress_event":
        # This is a special case of the method as it consists of two important parts:
        #   (1) Generating the ERS file (same as the methods above) and
        #   (2) Generating the targets file alongside the correct time segment labels.
        #
        # The segments depend on the event time and duration specified by the participant in the next
        # questionnaire. Additionally, 5 minutes before the specified start time of the event are included
        # to account for the participant not remembering the start time precisely; this can be changed
        # with the variable "time_before_event" defined below.

        # Get and join required data
        extracted_ers = esm_df.groupby(session_keys)['timestamp'].apply(lambda x: math.ceil((x.max() - x.min()) / 1000)).reset_index().rename(columns={'timestamp': 'session_length'}) # questionnaire length
        extracted_ers = extracted_ers[extracted_ers["session_length"] <= 15 * 60].reset_index(drop=True) # ensure that the longest duration of the questionnaire answering is 15 min
        session_end_timestamp = esm_df.groupby(session_keys)['timestamp'].max().to_frame().rename(columns={'timestamp': 'session_end_timestamp'}) # questionnaire end timestamp
        se_time = esm_df[esm_df.questionnaire_id == 90.].set_index(session_keys)['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_time'})
        se_duration = esm_df[esm_df.questionnaire_id == 91.].set_index(session_keys)['esm_user_answer'].to_frame().rename(columns={'esm_user_answer': 'se_duration'})

        # Extract 3 targets that will be transferred with the csv file to the cleaning script.
        # The numeric answer column is selected *before* averaging so that text columns are never averaged.
        se_stressfulness_event_tg = esm_df[esm_df.questionnaire_id == 87.].set_index(session_keys)['esm_user_answer_numeric'].to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_stressfulness_event'})
        se_threat_tg = esm_df[esm_df.questionnaire_id == 88.].groupby(session_keys)['esm_user_answer_numeric'].mean().to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_threat'})
        se_challenge_tg = esm_df[esm_df.questionnaire_id == 89.].groupby(session_keys)['esm_user_answer_numeric'].mean().to_frame().rename(columns={'esm_user_answer_numeric': 'appraisal_challenge'})

        # All relevant features are joined by inner join to remove standalone rows (e.g., stressfulness event target has larger count)
        extracted_ers = extracted_ers.join(session_end_timestamp, on=session_keys, how='inner') \
            .join(se_time, on=session_keys, how='inner') \
            .join(se_duration, on=session_keys, how='inner') \
            .join(se_stressfulness_event_tg, on=session_keys, how='inner') \
            .join(se_threat_tg, on=session_keys, how='inner') \
            .join(se_challenge_tg, on=session_keys, how='inner')

        # Filter sessions that are not useful. Because of the ambiguity this excludes:
        # (1) straw event times that are marked as "0 - I don't remember"
        # (2) straw event durations that are marked as "0 - I don't remember"
        extracted_ers = extracted_ers[(~extracted_ers.se_time.str.startswith("0 - ")) & (~extracted_ers.se_duration.str.startswith("0 - "))]

        # Transform data into its final form, ready for the extraction
        extracted_ers.reset_index(drop=True, inplace=True)

        time_before_event = 5 * 60 # in seconds (5 minutes)
        extracted_ers['event_timestamp'] = pd.to_datetime(extracted_ers['se_time']).apply(lambda x: x.timestamp() * 1000).astype('int64')
        extracted_ers['shift_direction'] = -1

        # Checks whether the duration is marked with "1 - It's still ongoing" which means that the end of the current questionnaire
        # is taken as end time of the segment. Else the user input duration is taken.
        extracted_ers['se_duration'] = \
            np.where(
                extracted_ers['se_duration'].str.startswith("1 - "),
                extracted_ers['session_end_timestamp'] - extracted_ers['event_timestamp'],
                extracted_ers['se_duration']
            )

        # Integer entries are durations in milliseconds; the remaining entries are parsed as a
        # datetime whose hour and minute give the duration. Both are converted to seconds.
        extracted_ers['se_duration'] = \
            extracted_ers['se_duration'].apply(lambda x: math.ceil(x / 1000) if isinstance(x, int) else (pd.to_datetime(x).hour * 60 + pd.to_datetime(x).minute) * 60) + time_before_event

        extracted_ers['shift'] = format_timestamp(time_before_event)
        extracted_ers['length'] = extracted_ers['se_duration'].apply(format_timestamp)

        # Drop event_timestamp duplicates in case of user referencing the same event over multiple questionnaires
        extracted_ers.drop_duplicates(subset=["event_timestamp"], keep='first', inplace=True)
        extracted_ers.reset_index(drop=True, inplace=True)

        extracted_ers["label"] = f"straw_event_{segmenting_method}_" + snakemake.params["pid"] + "_" + extracted_ers.index.astype(str).str.zfill(3)

        # Write the csv of extracted ERS labels with targets related to stressfulness event
        extracted_ers[["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"]].to_csv(snakemake.output[1], index=False)

    else:
        raise Exception("Please select correct target method for the event-related segments.")

    return extracted_ers[["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"]]
|
||||
|
||||
|
||||
"""
|
||||
Here the code is executed - this .py file is used both for extraction of the STRAW time_segments file for the individual
|
||||
participant, and also for merging all participant's files into one combined file which is later used for the time segments
|
||||
to all sensors assignment.
|
||||
|
||||
There are two files involved (see rules extract_event_information_from_esm and merge_event_related_segments_files in preprocessing.smk)
|
||||
(1) ERS file which contains all the information about the time segment timings and
|
||||
(2) targets file which has corresponding target value for the segment label which is later used to merge with other features in the cleaning script.
|
||||
For more information, see the comment in the method above.
|
||||
"""
|
||||
if snakemake.params["stage"] == "extract":
    # Per-participant stage: derive the event-related segments from the raw ESM file.
    # extract_ers() also writes the targets file to snakemake.output[1].
    esm_df = pd.read_csv(input_data_files['esm_raw_input'])

    extracted_ers = extract_ers(esm_df)

    extracted_ers.to_csv(snakemake.output[0], index=False)

elif snakemake.params["stage"] == "merge":
    # Merge stage: stack the per-participant files into one ERS file and one targets file.
    input_data_files = dict(snakemake.input)

    # Empty frames that fix the leading column order of the merged files,
    # also when there is nothing to merge.
    straw_events = pd.DataFrame(columns=["label", "event_timestamp", "length", "shift", "shift_direction", "device_id"])
    stress_events_targets = pd.DataFrame(columns=["label", "appraisal_stressfulness_event", "appraisal_threat", "appraisal_challenge"])

    # Read everything first and concatenate once, instead of re-copying the
    # accumulated frame for every input file.
    ers_frames = [pd.read_csv(input_file) for input_file in input_data_files["ers_files"]]
    straw_events = pd.concat([straw_events] + ers_frames, axis=0, ignore_index=True)

    straw_events.to_csv(snakemake.output[0], index=False)

    se_frames = [pd.read_csv(input_file) for input_file in input_data_files["se_files"]]
    stress_events_targets = pd.concat([stress_events_targets] + se_frames, axis=0, ignore_index=True)

    stress_events_targets.to_csv(snakemake.output[1], index=False)
|
||||
|
||||
|
||||
|
|
@ -29,7 +29,7 @@ def rapids_features(sensor_data_files, time_segment, provider, filter_data_by_se
|
|||
if "medianlux" in features_to_compute:
|
||||
light_features["medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median()
|
||||
if "stdlux" in features_to_compute:
|
||||
light_features["stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std()
|
||||
light_features["stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std().fillna(0)
|
||||
|
||||
light_features = light_features.reset_index()
|
||||
|
||||
|
|
|
@ -37,7 +37,8 @@ def variance_and_logvariance_features(location_data, location_features):
|
|||
location_data["longitude_for_wvar"] = (location_data["double_longitude"] - location_data["longitude_wavg"]) ** 2 * location_data["duration"] * 60
|
||||
|
||||
location_features["locationvariance"] = ((location_data_grouped["latitude_for_wvar"].sum() + location_data_grouped["longitude_for_wvar"].sum()) / (location_data_grouped["duration"].sum() * 60 - 1)).fillna(0)
|
||||
location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, np.nan)
|
||||
|
||||
location_features["loglocationvariance"] = np.log10(location_features["locationvariance"]).replace(-np.inf, -1000000)
|
||||
|
||||
return location_features
|
||||
|
||||
|
|
|
@ -65,6 +65,15 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
|||
features <- message_features_of_type(messages_of_type, message_type, time_segment, requested_features)
|
||||
messages_features <- merge(messages_features, features, all=TRUE)
|
||||
}
|
||||
messages_features <- messages_features %>% mutate_at(vars(contains("countmostfrequentcontact") | contains("distinctcontacts") | contains("count")), list( ~ replace_na(., 0)))
|
||||
# Fill selected columns with a high number
|
||||
time_cols <- select(messages_features, contains("timefirstmessages") | contains("timelastmessages")) %>%
|
||||
colnames(.)
|
||||
|
||||
messages_features <- messages_features %>%
|
||||
mutate_at(., time_cols, ~replace(., is.na(.), 1500))
|
||||
|
||||
# Fill NA values with 0
|
||||
messages_features <- messages_features %>% mutate_all(~replace(., is.na(.), 0))
|
||||
|
||||
return(messages_features)
|
||||
}
|
|
@ -15,7 +15,7 @@ def getEpisodeDurationFeatures(screen_data, time_segment, episode, features, ref
|
|||
if "avgduration" in features:
|
||||
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].mean().rename(columns = {"duration":"avgduration" + episode})], axis = 1)
|
||||
if "stdduration" in features:
|
||||
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].std().rename(columns = {"duration":"stdduration" + episode})], axis = 1)
|
||||
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].std().fillna(0).rename(columns = {"duration":"stdduration" + episode})], axis = 1)
|
||||
if "firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) in features:
|
||||
screen_data_episode_after_hour = screen_data_episode.copy()
|
||||
screen_data_episode_after_hour["hour"] = pd.to_datetime(screen_data_episode["local_start_date_time"]).dt.hour
|
||||
|
|
|
@ -9,21 +9,26 @@ compute_wifi_feature <- function(data, feature, time_segment){
|
|||
"countscans" = data %>% summarise(!!feature := n()),
|
||||
"uniquedevices" = data %>% summarise(!!feature := n_distinct(bssid)))
|
||||
return(data)
|
||||
|
||||
} else if(feature == "countscansmostuniquedevice"){
|
||||
# Get the most scanned device
|
||||
mostuniquedevice <- data %>%
|
||||
mostuniquedevice <- data %>%
|
||||
filter(bssid != "") %>%
|
||||
group_by(bssid) %>%
|
||||
mutate(N=n()) %>%
|
||||
ungroup() %>%
|
||||
filter(N == max(N)) %>%
|
||||
head(1) %>% # if there are multiple devices with the same number of scans, pick only the first one
|
||||
pull(bssid)
|
||||
|
||||
data <- data %>% filter_data_by_segment(time_segment)
|
||||
|
||||
return(data %>%
|
||||
filter(bssid == mostuniquedevice) %>%
|
||||
group_by(local_segment) %>%
|
||||
summarise(!!feature := n()) %>%
|
||||
replace(is.na(.), 0))
|
||||
summarise(!!feature := n())
|
||||
)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -43,6 +48,6 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
|||
feature <- compute_wifi_feature(wifi_data, feature_name, time_segment)
|
||||
features <- merge(features, feature, by="local_segment", all = TRUE)
|
||||
}
|
||||
|
||||
features <- features %>% mutate_all(~replace(., is.na(.), 0))
|
||||
return(features)
|
||||
}
|
||||
|
|
|
@ -1,50 +0,0 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
import sys
|
||||
|
||||
# Inputs and provider/sensor identifiers are handed over by Snakemake.
sensor_data_files = dict(snakemake.input)

provider = snakemake.params["provider"]
provider_key = snakemake.params["provider_key"]
sensor_key = snakemake.params["sensor_key"]

pd.set_option('display.max_columns', None)

# Segment-identifying columns that are never passed through the scaler.
segment_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime']

if provider_key == "cr":
    # The "cr" provider works on window-level features and may additionally
    # derive second-order features from the standardized windows.
    sys.path.append('/rapids/src/features/')
    from cr_features_helper_methods import extract_second_order_features

    provider_main = snakemake.params["provider_main"]
    prefix = sensor_key + "_" + provider_key + "_"

    windows_features_data = pd.read_csv(sensor_data_files["windows_features_data"])
    excluded_columns = segment_columns + [prefix + "level_1"]

    if windows_features_data.empty:
        # Nothing to scale: write the empty frame to both outputs.
        windows_features_data.to_csv(snakemake.output[1], index=False)
        windows_features_data.to_csv(snakemake.output[0], index=False)
    else:
        # Standardize every column that is not a segment identifier.
        to_scale = ~windows_features_data.columns.isin(excluded_columns)
        windows_features_data.loc[:, to_scale] = StandardScaler().fit_transform(windows_features_data.loc[:, to_scale])

        windows_features_data.to_csv(snakemake.output[1], index=False)

        windows_config = provider_main["WINDOWS"]
        if windows_config["COMPUTE"] and "SECOND_ORDER_FEATURES" in windows_config:
            so_features_names = windows_config["SECOND_ORDER_FEATURES"]
            windows_so_features_data = extract_second_order_features(windows_features_data, so_features_names, prefix)
            windows_so_features_data.to_csv(snakemake.output[0], index=False)
        else:
            pd.DataFrame().to_csv(snakemake.output[0], index=False)

else:
    # Any other provider: standardize the first feature file whose path ends
    # in "/<sensor_key>.csv" and stop.
    for sensor_features in sensor_data_files["sensor_features"]:
        if "/" + sensor_key + ".csv" not in sensor_features:
            continue

        sensor_data = pd.read_csv(sensor_features)

        if not sensor_data.empty:
            to_scale = ~sensor_data.columns.isin(segment_columns)
            sensor_data.loc[:, to_scale] = StandardScaler().fit_transform(sensor_data.loc[:, to_scale])

        sensor_data.to_csv(snakemake.output[0], index=False)
        break
|
|
@ -160,12 +160,16 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
|
|||
|
||||
return sensor_features
|
||||
|
||||
def run_provider_cleaning_script(provider, provider_key, sensor_key, sensor_data_files, target=False):
    """Load a provider's cleaning script and run its cleaning function once.

    The function looked up is named ``<provider_key.lower()>_cleaning`` inside
    the module found at ``provider["SRC_SCRIPT"]``.

    Args:
        provider: Provider configuration mapping; must contain "SRC_SCRIPT".
        provider_key: Provider name used to build the cleaning function name.
        sensor_key: Sensor name, used only in the progress message.
        sensor_data_files: Input files forwarded to the cleaning function.
        target: When truthy, it is forwarded as a third positional argument;
            when falsy, the cleaning function is called with two arguments
            only, so two-argument cleaning functions keep working.

    Returns:
        Whatever the provider's cleaning function returns.
    """
    print("{} Processing {} {}".format(rapids_log_tag, sensor_key, provider_key))

    cleaning_module = import_path(provider["SRC_SCRIPT"])
    cleaning_function = getattr(cleaning_module, provider_key.lower() + "_cleaning")

    # Call the cleaning function exactly once, with or without the target.
    if target:
        return cleaning_function(sensor_data_files, provider, target)
    return cleaning_function(sensor_data_files, provider)
|
|
@ -1,5 +1,6 @@
|
|||
import pandas as pd
|
||||
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
|
||||
column_names = df_input.columns
|
||||
|
@ -8,9 +9,9 @@ def retain_target_column(df_input: pd.DataFrame, target_variable_name: str):
|
|||
esm_names = column_names[esm_names_index]
|
||||
target_variable_index = esm_names.str.contains(target_variable_name)
|
||||
if all(~target_variable_index):
|
||||
raise ValueError("The requested target (", target_variable_name,
|
||||
")cannot be found in the dataset.",
|
||||
"Please check the names of phone_esm_ columns in z_all_sensor_features_cleaned_straw_py.csv")
|
||||
warnings.warn(f"The requested target (, {target_variable_name} ,)cannot be found in the dataset. Please check the names of phone_esm_ columns in cleaned python file")
|
||||
return None
|
||||
|
||||
sensor_features_plus_target = df_input.drop(esm_names, axis=1)
|
||||
sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index]]
|
||||
# We will only keep one column related to phone_esm and that will be our target variable.
|
||||
|
|
|
@ -12,9 +12,13 @@ for baseline_features_path in snakemake.input["demographic_features"]:
|
|||
all_baseline_features = pd.concat([all_baseline_features, baseline_features], axis=0)
|
||||
|
||||
# merge sensor features and baseline features
|
||||
features = sensor_features.merge(all_baseline_features, on="pid", how="left")
|
||||
if not sensor_features.empty:
|
||||
features = sensor_features.merge(all_baseline_features, on="pid", how="left")
|
||||
|
||||
target_variable_name = snakemake.params["target_variable"]
|
||||
model_input = retain_target_column(features, target_variable_name)
|
||||
target_variable_name = snakemake.params["target_variable"]
|
||||
model_input = retain_target_column(features, target_variable_name)
|
||||
|
||||
model_input.to_csv(snakemake.output[0], index=False)
|
||||
model_input.to_csv(snakemake.output[0], index=False)
|
||||
|
||||
else:
|
||||
sensor_features.to_csv(snakemake.output[0], index=False)
|
||||
|
|
|
@ -6,6 +6,8 @@ cleaned_sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"]
|
|||
target_variable_name = snakemake.params["target_variable"]
|
||||
|
||||
model_input = retain_target_column(cleaned_sensor_features, target_variable_name)
|
||||
model_input.dropna(axis ="index", how="any", subset=["target"], inplace=True)
|
||||
|
||||
model_input.to_csv(snakemake.output[0], index=False)
|
||||
if model_input is None:
|
||||
pd.DataFrame().to_csv(snakemake.output[0])
|
||||
else:
|
||||
model_input.to_csv(snakemake.output[0], index=False)
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 12 KiB |
|
@ -3,8 +3,8 @@ import seaborn as sns
|
|||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
participant = "p031"
|
||||
all_sensors = ["eda", "bvp", "ibi", "temp", "acc"]
|
||||
participant = "p01"
|
||||
all_sensors = ["eda", "ibi", "temp", "acc"]
|
||||
|
||||
for sensor in all_sensors:
|
||||
|
||||
|
|
|
@ -0,0 +1,285 @@
|
|||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv"
df = pd.read_csv(path)

# Column-name fragments identifying each sensor's feature columns; also used
# as the stem of the saved figure names. "locations_barnett" is left out
# because the original script marks it as not working.
SENSOR_PATTERNS = [
    "activity_recognition",
    "applications_foreground",
    "phone_battery",
    "bluetooth_doryab",
    "bluetooth_rapids",
    "phone_calls",
    "data_yield",
    "phone_esm",
    "phone_light",
    "locations_doryab",
    "phone_messages",
    "phone_screen",
    "wifi_visible",
]

# Left-closed bins used for the "cut" heatmap: [-1, 0) -> -1, [0, 1e-12) -> 0,
# [1e-12, 1000) -> 1. Values outside the bins become NaN.
CUT_BINS = [-1, 0, 0.000000000001, 1000]
CUT_LABELS = [-1, 0, 1]


def plot_sensor_heatmaps(features, pattern):
    """Save a missingness heatmap and a binned-value heatmap for one sensor.

    Selects the columns of ``features`` whose name contains ``pattern``, drops
    rows where all of those columns are NaN, and writes two figures:
    ``<pattern>_values`` (NaN mask) and ``cut_<pattern>_values`` (values
    binned with CUT_BINS / CUT_LABELS).
    """
    cols = [col for col in features.columns if pattern in col]
    df_x = features[cols]

    print(len(cols))
    print(df_x)

    df_x = df_x.dropna(axis=0, how="all")
    if df_x.empty:
        # A heatmap of an empty frame cannot be drawn; skip this sensor.
        print(f"No data for '{pattern}', skipping heatmaps")
        return

    sns.heatmap(df_x.isna(), xticklabels=1)
    plt.savefig(f'{pattern}_values', bbox_inches='tight')
    # Close before drawing the next heatmap so it starts on a fresh figure
    # instead of being drawn over this one (and inheriting its colorbar).
    plt.close()

    df_q = pd.DataFrame({
        col: pd.to_numeric(pd.cut(df_x[col], bins=CUT_BINS, labels=CUT_LABELS, right=False))
        for col in df_x
    })

    sns.heatmap(df_q, cbar=False, xticklabels=1)
    plt.savefig(f'cut_{pattern}_values', bbox_inches='tight')
    plt.close()


for sensor_pattern in SENSOR_PATTERNS:
    plot_sensor_heatmaps(df, sensor_pattern)

# All features

print(len(df))
print(df)

sns.heatmap(df.isna())
plt.savefig('all_features', bbox_inches='tight')

# Columns that are entirely NaN, followed by overall NaN count and frame shape.
print(df.columns[df.isna().all()].tolist())
print("All NaNs:", df.isna().sum().sum())
print("Df shape NaNs:", df.shape)
|
|
@ -0,0 +1,70 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
import sys
|
||||
|
||||
sys.path.append('/rapids/')
|
||||
from src.features import cr_features_helper_methods as crhm
|
||||
|
||||
pd.set_option("display.max_columns", None)
features_win = pd.read_csv("data/interim/p031/empatica_temperature_features/empatica_temperature_python_cr_windows.csv", usecols=[0, 1, 2, 3, 4, 5])

so_feature_names = ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows']

# First standardization method: standardize the windows, then extract
# second-order (SO) features.
excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime', "empatica_temperature_cr_level_1"]
z1_windows = features_win.copy()
window_mask = ~z1_windows.columns.isin(excluded_columns)
z1_windows.loc[:, window_mask] = StandardScaler().fit_transform(z1_windows.loc[:, window_mask])
z1 = crhm.extract_second_order_features(z1_windows, so_feature_names, prefix="empatica_temperature_cr_")
z1 = z1.iloc[:, 4:]  # drop the first four columns before comparing

# Second standardization method: extract SO features from the raw windows,
# then standardize them.
so_features_reg = crhm.extract_second_order_features(features_win, so_feature_names, prefix="empatica_temperature_cr_")
so_features_reg = so_features_reg.iloc[:, 4:]
z2 = pd.DataFrame(StandardScaler().fit_transform(so_features_reg), columns=so_features_reg.columns)

# Standardization of the first standardization method values
z1_z = pd.DataFrame(StandardScaler().fit_transform(z1), columns=z1.columns)


def compare_z_scores(statistic, label, titles):
    """Plot and print one SO statistic under the three standardization variants.

    Args:
        statistic: Suffix of the SO feature column (e.g. "sd", "nlargest").
        label: Text used in the printed table's column names.
        titles: Three subplot titles, for z1, z2 and z1_z respectively.
    """
    column = 'empatica_temperature_cr_squareSumOfComponent_X_SO_' + statistic

    fig, axs = plt.subplots(3, figsize=(8, 10))
    for ax, frame, title in zip(axs, (z1, z2, z1_z), titles):
        ax.plot(frame[column])
        ax.set_title(title)

    fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_' + statistic)
    plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_' + statistic, bbox_inches='tight')

    # Columns are assigned one by one so index alignment matches the first
    # column's index, as in the original script.
    showcase = pd.DataFrame()
    showcase['Z1__' + label] = z1[column]
    showcase['Z2__' + label] = z2[column]
    showcase['Z1__' + label + '_STANDARDIZED'] = z1_z[column]
    print(showcase)


# For SD
compare_z_scores(
    "sd",
    "SD",
    (
        "Z1 - standardizirana okna, nato ekstrahiranje značilk SO",
        "Z2 - ekstrahirane značilke SO 'normalnih' vrednosti, nato standardizacija",
        "Standardiziran Z1",
    ),
)

# For nlargest
compare_z_scores(
    "nlargest",
    "nlargest",
    (
        "Z1 - standardizirana okna, nato ekstrahiranje značilk SO",
        "Z2",
        "Standardized Z1",
    ),
)
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import sys
|
||||
|
||||
# Load the raw Empatica accelerometer samples and index them by timestamp
# (the 'timestamp' column is parsed as milliseconds).
acc_raw = pd.read_csv("/rapids/data/raw/p03/empatica_accelerometer_raw.csv")
acc_raw['date'] = pd.to_datetime(acc_raw['timestamp'], unit='ms')
acc_raw.set_index('date', inplace=True)
print(acc_raw)

# Resample the first value column onto a regular 31 ms grid; bins without
# samples come out as NaN.
acc_x = acc_raw['double_values_0'].resample("31ms").mean()
print(acc_x)

# Keep only the samples strictly inside the inspected time window.
st = '2021-05-21 12:28:27'
en = '2021-05-21 12:59:12'
inside_window = (acc_x.index > st) & (acc_x.index < en)
acc_x = acc_x.loc[inside_window]

plt.plot(acc_x)
plt.savefig('NaN.png')
sys.exit()

# NOTE(review): everything below is unreachable because of the sys.exit()
# above; it is kept as found.
plt.plot(acc_x)

esm = pd.read_csv("/rapids/data/raw/p03/phone_esm_raw.csv")
esm['date'] = pd.to_datetime(esm['timestamp'], unit='ms')
# NOTE(review): this indexes the frame with the datetime values themselves,
# which looks unintended - confirm before re-enabling this section.
esm = esm[esm['date']]
esm.set_index('date', inplace=True)
print(esm)

esm = esm['esm_session'].resample("2900ms").mean()

plt.plot(esm)
plt.savefig('NaN.png')
|
Loading…
Reference in New Issue