diff --git a/Snakefile b/Snakefile index dd2a70be..dc023282 100644 --- a/Snakefile +++ b/Snakefile @@ -55,16 +55,22 @@ for provider in config["BLUETOOTH"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="BLUETOOTH".lower())) files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="BLUETOOTH".lower())) -if config["ACTIVITY_RECOGNITION"]["COMPUTE"]: - pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"])) - pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"])) - - for pids,table in zip([pids_android, pids_ios], [config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]): - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/processed/{pid}/{sensor}_deltas.csv", pid=pids, sensor=table)) - files_to_compute.extend(expand("data/processed/{pid}/activity_recognition_{day_segment}.csv",pid=config["PIDS"], day_segment = config["ACTIVITY_RECOGNITION"]["DAY_SEGMENTS"])) +for provider in config["ACTIVITY_RECOGNITION"]["PROVIDERS"].keys(): + if config["ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]: + pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"])) + pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"])) + + for pids,table in zip([pids_android, pids_ios], [config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]): + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table)) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table)) + files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table)) + + files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes_resampled.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="ACTIVITY_RECOGNITION".lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="ACTIVITY_RECOGNITION".lower())) + for provider in config["BATTERY"]["PROVIDERS"].keys(): if config["BATTERY"]["PROVIDERS"][provider]["COMPUTE"]: diff --git a/config.yaml b/config.yaml index b14909be..a852c126 100644 --- a/config.yaml +++ b/config.yaml @@ -113,12 +113,19 @@ BLUETOOTH: ACTIVITY_RECOGNITION: - COMPUTE: False DB_TABLE: ANDROID: plugin_google_activity_recognition IOS: plugin_ios_activity_recognition - DAY_SEGMENTS: *day_segments - FEATURES: ["count","mostcommonactivity","countuniqueactivities","activitychangecount","sumstationary","summobile","sumvehicle"] + PROVIDERS: + RAPIDS: + COMPUTE: False + FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"] + ACTIVITY_CLASSES: + STATIONARY: ["still", "tilting"] + MOBILE: ["on_foot", "walking", "running", "on_bicycle"] + VEHICLE: ["in_vehicle"] + SRC_FOLDER: "rapids" # inside src/features/activity_recognition + SRC_LANGUAGE: "python" BATTERY: DB_TABLE: battery diff --git a/rules/common.smk b/rules/common.smk index 5f16807e..4e247209 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -46,21 +46,19 @@ def find_features_files(wildcards): def optional_ar_input(wildcards): platform = infer_participant_platform("data/external/"+wildcards.pid) - + if platform == "android": - return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv", - "data/interim/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_episodes.csv"] + return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]) elif platform == "ios": - return ["data/raw/{pid}/"+config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]+"_with_datetime_unified.csv", - "data/interim/{pid}/"+config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]+"_episodes.csv"] + return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]) def optional_conversation_input(wildcards): platform = infer_participant_platform("data/external/"+wildcards.pid) if platform == "android": - return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv"] + return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CONVERSATION"]["DB_TABLE"]["ANDROID"])[0] elif platform == "ios": - return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"] + return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CONVERSATION"]["DB_TABLE"]["IOS"])[0] def optional_steps_sleep_input(wildcards): if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED": diff --git a/rules/features.smk b/rules/features.smk index 94225cad..2e8424ad 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -6,70 +6,6 @@ rule join_features_from_providers: script: "../src/features/join_features_from_providers.R" -rule messages_r_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/messages_features/messages_r_{provider_key}.csv" - script: - "../src/features/messages/messages_entry.R" - -rule messages_python_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/messages_features/messages_python_{provider_key}.csv" - script: - "../src/features/messages/messages_entry.py" - -rule calls_python_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/calls_features/calls_python_{provider_key}.csv" - script: - "../src/features/calls/calls_entry.py" - -rule calls_r_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/calls_features/calls_r_{provider_key}.csv" - script: - "../src/features/calls/calls_entry.R" - -rule battery_episodes: - input: - expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor=config["BATTERY"]["DB_TABLE"]) - output: - "data/interim/{pid}/battery_episodes.csv" - script: - "../src/features/battery/episodes/battery_episodes.R" - -rule screen_episodes: - input: - screen = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["SCREEN"]["DB_TABLE"]) - output: - "data/interim/{pid}/screen_episodes.csv" - script: - "../src/features/screen/episodes/screen_episodes.R" - rule resample_episodes: input: "data/interim/{pid}/{sensor}_episodes.csv" @@ -92,178 +28,6 @@ rule resample_episodes_with_datetime: script: "../src/data/readable_datetime.R" - -rule google_activity_recognition_deltas: - input: - expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]) - output: - expand("data/interim/{{pid}}/{sensor}_episodes.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]) - script: - "../src/features/ar/episodes/activity_recognition_episodes.R" - -rule ios_activity_recognition_deltas: - input: - expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]) - output: - expand("data/interim/{{pid}}/{sensor}_episodes.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]) - script: - "../src/features/ar/episodes/activity_recognition_episodes.R" - -rule locations_python_features: - input: - sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}", - output: - "data/interim/{pid}/locations_features/locations_python_{provider_key}.csv" - script: - "../src/features/locations/locations_entry.py" - -rule locations_r_features: - input: - sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/locations_features/locations_r_{provider_key}.csv" - script: - "../src/features/locations/locations_entry.R" - -rule bluetooth_r_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/bluetooth_features/bluetooth_r_{provider_key}.csv" - script: - "../src/features/bluetooth/bluetooth_entry.R" - -rule bluetooth_python_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/bluetooth_features/bluetooth_python_{provider_key}.csv" - script: - "../src/features/bluetooth/bluetooth_entry.py" - -rule activity_features: - input: - optional_ar_input - params: - segment = "{day_segment}", - features = config["ACTIVITY_RECOGNITION"]["FEATURES"] - output: - "data/processed/{pid}/activity_recognition_{day_segment}.csv" - script: - "../src/features/activity_recognition.py" - -rule battery_r_features: - input: - battery_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/battery_features/battery_r_{provider_key}.csv" - script: - "../src/features/battery/battery_entry.R" - -rule battery_python_features: - input: - battery_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/battery_features/battery_python_{provider_key}.csv" - script: - "../src/features/battery/battery_entry.py" - -rule screen_r_features: - input: - screen_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/screen_features/screen_r_{provider_key}.csv" - script: - "../src/features/screen/screen_entry.R" - -rule screen_python_features: - input: - screen_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv", - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/screen_features/screen_python_{provider_key}.csv" - script: - "../src/features/screen/screen_entry.py" - -rule light_r_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/light_features/light_r_{provider_key}.csv" - script: - "../src/features/light/light_entry.R" - -rule light_python_features: - input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"]), - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/light_features/light_python_{provider_key}.csv" - script: - "../src/features/light/light_entry.py" - -rule conversation_r_features: - input: - sensor_data = optional_conversation_input, - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/conversation_features/conversation_r_{provider_key}.csv" - script: - "../src/features/conversation/conversation_entry.R" - -rule conversation_python_features: - input: - sensor_data = optional_conversation_input, - day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" - params: - provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" - output: - "data/interim/{pid}/conversation_features/conversation_python_{provider_key}.csv" - script: - "../src/features/conversation/conversation_entry.py" - rule accelerometer_features: input: expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["ACCELEROMETER"]["DB_TABLE"]), @@ -278,53 +42,315 @@ rule accelerometer_features: script: "../src/features/accelerometer_features.py" +rule activity_recognition_episodes: + input: + optional_ar_input + output: + "data/interim/{pid}/activity_recognition_episodes.csv" + script: + "../src/features/activity_recognition/episodes/activity_recognition_episodes.R" + +rule activity_recognition_r_features: + input: + sensor_episodes = "data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "activity_recognition" + output: + "data/interim/{pid}/activity_recognition_features/activity_recognition_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule activity_recognition_python_features: + input: + sensor_episodes = "data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "activity_recognition" + output: + "data/interim/{pid}/activity_recognition_features/activity_recognition_python_{provider_key}.csv" + script: + "../src/features/entry.py" + rule applications_foreground_r_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]), + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])[0], day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" + provider_key = "{provider_key}", + sensor_key = "applications_foreground" output: "data/interim/{pid}/applications_foreground_features/applications_foreground_r_{provider_key}.csv" script: - "../src/features/applications_foreground/applications_foreground_entry.R" + "../src/features/entry.R" rule applications_foreground_python_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]), + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])[0], day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" + provider_key = "{provider_key}", + sensor_key = "applications_foreground" output: "data/interim/{pid}/applications_foreground_features/applications_foreground_python_{provider_key}.csv" script: - "../src/features/applications_foreground/applications_foreground_entry.py" + "../src/features/entry.py" + +rule battery_episodes: + input: + expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor=config["BATTERY"]["DB_TABLE"]) + output: + "data/interim/{pid}/battery_episodes.csv" + script: + "../src/features/battery/episodes/battery_episodes.R" + +rule battery_r_features: + input: + sensor_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "battery" + output: + "data/interim/{pid}/battery_features/battery_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule battery_python_features: + input: + sensor_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "battery" + output: + "data/interim/{pid}/battery_features/battery_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule bluetooth_r_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "bluetooth" + output: + "data/interim/{pid}/bluetooth_features/bluetooth_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule bluetooth_python_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "bluetooth" + output: + "data/interim/{pid}/bluetooth_features/bluetooth_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule calls_r_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "calls" + output: + "data/interim/{pid}/calls_features/calls_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule calls_python_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "calls" + output: + "data/interim/{pid}/calls_features/calls_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule conversation_r_features: + input: + sensor_data = optional_conversation_input, + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "conversation" + output: + "data/interim/{pid}/conversation_features/conversation_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule conversation_python_features: + input: + sensor_data = optional_conversation_input, + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "conversation" + output: + "data/interim/{pid}/conversation_features/conversation_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule light_r_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "light" + output: + "data/interim/{pid}/light_features/light_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule light_python_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "light" + output: + "data/interim/{pid}/light_features/light_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule locations_r_features: + input: + sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "locations" + output: + "data/interim/{pid}/locations_features/locations_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule locations_python_features: + input: + sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "locations" + output: + "data/interim/{pid}/locations_features/locations_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule messages_r_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "messages" + output: + "data/interim/{pid}/messages_features/messages_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule messages_python_features: + input: + sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])[0], + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "messages" + output: + "data/interim/{pid}/messages_features/messages_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule screen_episodes: + input: + screen = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["SCREEN"]["DB_TABLE"]) + output: + "data/interim/{pid}/screen_episodes.csv" + script: + "../src/features/screen/episodes/screen_episodes.R" + +rule screen_r_features: + input: + sensor_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "screen" + output: + "data/interim/{pid}/screen_features/screen_r_{provider_key}.csv" + script: + "../src/features/entry.R" + +rule screen_python_features: + input: + sensor_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv", + day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" + params: + provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key], + provider_key = "{provider_key}", + sensor_key = "screen" + output: + "data/interim/{pid}/screen_features/screen_python_{provider_key}.csv" + script: + "../src/features/entry.py" rule wifi_r_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower()), + sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower())[0], day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" + provider_key = "{provider_key}", + sensor_key = "wifi" output: "data/interim/{pid}/wifi_features/wifi_r_{provider_key}.csv" script: - "../src/features/wifi/wifi_entry.R" + "../src/features/entry.R" rule wifi_python_features: input: - sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower()), + sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower())[0], day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" params: provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key], - provider_key = "{provider_key}" + provider_key = "{provider_key}", + sensor_key = "wifi" output: "data/interim/{pid}/wifi_features/wifi_python_{provider_key}.csv" script: - "../src/features/wifi/wifi_entry.py" + "../src/features/entry.py" rule fitbit_heartrate_features: input: diff --git a/src/features/activity_recognition.py b/src/features/activity_recognition.py deleted file mode 100644 index 5a3d7117..00000000 --- a/src/features/activity_recognition.py +++ /dev/null @@ -1,15 +0,0 @@ -import pandas as pd -from ar.ar_base import base_ar_features - -ar_data = pd.read_csv(snakemake.input[0],parse_dates=["local_date_time"]) -ar_deltas = pd.read_csv(snakemake.input[1],parse_dates=["local_start_date_time", "local_end_date_time", "local_start_date", "local_end_date"]) -day_segment = snakemake.params["segment"] -requested_features = snakemake.params["features"] -ar_features = pd.DataFrame(columns=["local_date"]) - - -ar_features = ar_features.merge(base_ar_features(ar_data, ar_deltas, day_segment, requested_features), on="local_date", how="outer") - -assert len(requested_features) + 1 == ar_features.shape[1], "The number of features in the output dataframe (=" + str(ar_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your activity recognition feature extraction functions" - -ar_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/ar/episodes/activity_recognition_episodes.R b/src/features/activity_recognition/episodes/activity_recognition_episodes.R similarity index 100% rename from src/features/ar/episodes/activity_recognition_episodes.R rename to src/features/activity_recognition/episodes/activity_recognition_episodes.R diff --git a/src/features/activity_recognition/rapids/main.py b/src/features/activity_recognition/rapids/main.py new file mode 100644 index 00000000..5c777ef2 --- /dev/null +++ b/src/features/activity_recognition/rapids/main.py @@ -0,0 +1,123 @@ +import pandas as pd +import numpy as np + +def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + + chunk_episodes = kwargs["chunk_episodes"] + + ar_episodes = pd.read_csv(sensor_data_files["sensor_episodes"]) + activity_classes = provider["ACTIVITY_CLASSES"] + + # name of the features this function can compute + base_features_names = ["count","mostcommonactivity","countuniqueactivities","durationstationary","durationmobile","durationvehicle"] + # the subset of requested features this function can compute + requested_features = provider["FEATURES"] + features_to_compute = list(set(requested_features) & set(base_features_names)) + + ar_features = pd.DataFrame(columns=["local_segment"] + ["ar_rapids_" + x for x in features_to_compute]) + if not ar_episodes.empty: + ar_episodes = filter_data_by_segment(ar_episodes, day_segment) + + if not ar_episodes.empty: + # chunk episodes + ar_episodes = chunk_episodes(ar_episodes) + + if not ar_episodes.empty: + ar_features = pd.DataFrame() + + if "count" in features_to_compute: + ar_features["ar_rapids_count"] = ar_episodes.groupby(["local_segment"]).count()["episode_id"] + if "mostcommonactivity" in features_to_compute: + ar_features["ar_rapids_mostcommonactivity"] = ar_episodes.groupby(["local_segment"])["activity_type"].agg(lambda x: pd.Series.mode(x)[0]) + if "countuniqueactivities" in features_to_compute: + ar_features["ar_rapids_countuniqueactivities"] = ar_episodes.groupby(["local_segment"])["activity_type"].nunique() + + # duration features + for column, activity_labels in activity_classes.items(): + if "duration" + column.lower() in features_to_compute: + filtered_data = ar_episodes[ar_episodes["activity_name"].isin(pd.Series(activity_labels))] + if not filtered_data.empty: + ar_features["ar_rapids_duration_" + column] = ar_episodes[ar_episodes["activity_name"].isin(pd.Series(activity_labels))].groupby(["local_segment"])["duration"].sum().fillna(0) + else: + ar_features["ar_rapids_duration_" + column] = 0 + + ar_features.index.names = ["local_segment"] + ar_features = ar_features.reset_index() + + return ar_features + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + """ + + if not ar_data.empty: + ar_data = filter_data_by_segment(ar_data, day_segment) + + if not ar_data.empty: + # chunk_episodes + ar_data = chunk_episodes(ar_data) + + if not ar_data.empty: + + ar_data["episode_id"] = ((ar_data.ar_status != ar_data.ar_status.shift()) | (ar_data.start_timestamp - ar_data.end_timestamp.shift() > 1)).cumsum() + grouped = ar_data.groupby(by=["local_segment", "episode_id", "ar_status"]) + ar_episodes= grouped[["duration"]].sum() + ar_episodes["ar_diff"] = grouped["ar_level"].first() - grouped["ar_level"].last() + ar_episodes["ar_consumption_rate"] = ar_episodes["ar_diff"] / ar_episodes["duration"] + ar_episodes.reset_index(inplace=True) + + # for discharge episodes + ar_discharge_episodes = ar_episodes[(ar_episodes["ar_status"] == 3) | (ar_episodes["ar_status"] == 4)] + ar_discharge_features = pd.DataFrame() + if "countdischarge" in features_to_compute: + ar_discharge_features["ar_rapids_countdischarge"] = ar_discharge_episodes.groupby(["local_segment"])["episode_id"].count() + if "sumdurationdischarge" in features_to_compute: + ar_discharge_features["ar_rapids_sumdurationdischarge"] = ar_discharge_episodes.groupby(["local_segment"])["duration"].sum() + if "avgconsumptionrate" in features_to_compute: + ar_discharge_features["ar_rapids_avgconsumptionrate"] = ar_discharge_episodes.groupby(["local_segment"])["ar_consumption_rate"].mean() + if "maxconsumptionrate" in features_to_compute: + ar_discharge_features["ar_rapids_maxconsumptionrate"] = ar_discharge_episodes.groupby(["local_segment"])["ar_consumption_rate"].max() + + # for charge episodes + ar_charge_episodes = ar_episodes[(ar_episodes["ar_status"] == 2) | (ar_episodes["ar_status"] == 5)] + ar_charge_features = pd.DataFrame() + if "countcharge" in features_to_compute: + ar_charge_features["ar_rapids_countcharge"] = ar_charge_episodes.groupby(["local_segment"])["episode_id"].count() + if "sumdurationcharge" in features_to_compute: + ar_charge_features["ar_rapids_sumdurationcharge"] = ar_charge_episodes.groupby(["local_segment"])["duration"].sum() + + # combine discharge features and charge features; fill the missing values with ZERO + ar_features = pd.concat([ar_discharge_features, ar_charge_features], axis=1, sort=True).fillna(0) + + ar_features.index.rename("local_segment", inplace=True) + ar_features = ar_features.reset_index() + + return ar_features + """ \ No newline at end of file diff --git a/src/features/applications_foreground/applications_foreground_entry.R b/src/features/applications_foreground/applications_foreground_entry.R deleted file mode 100644 index 277ab623..00000000 --- a/src/features/applications_foreground/applications_foreground_entry.R +++ /dev/null @@ -1,13 +0,0 @@ -source("renv/activate.R") -source("src/features/utils/utils.R") -library("dplyr") -library("tidyr") - -sensor_data_file <- snakemake@input[["sensor_data"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] -provider <- snakemake@params["provider"][["provider"]] -provider_key <- snakemake@params["provider_key"] - -sensor_features <- fetch_provider_features(provider, provider_key, "applications_foreground", sensor_data_file, day_segments_file) - -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/applications_foreground/applications_foreground_entry.py b/src/features/applications_foreground/applications_foreground_entry.py deleted file mode 100644 index 49b9b141..00000000 --- a/src/features/applications_foreground/applications_foreground_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -sensor_data_file = snakemake.input["sensor_data"][0] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "applications_foreground", sensor_data_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/applications_foreground/rapids/main.py b/src/features/applications_foreground/rapids/main.py index 7b3533f3..6ec2516f 100644 --- a/src/features/applications_foreground/rapids/main.py +++ b/src/features/applications_foreground/rapids/main.py @@ -9,28 +9,31 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features if "timeoffirstuse" in requested_features: time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") if time_first_event.empty: - apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = np.nan + apps_features["apps_rapids_timeoffirstuse" + apps_type] = np.nan else: - apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"] + apps_features["apps_rapids_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"] if "timeoflastuse" in requested_features: time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") if time_last_event.empty: - apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = np.nan + apps_features["apps_rapids_timeoflastuse" + apps_type] = np.nan else: - apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"] + apps_features["apps_rapids_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"] if "frequencyentropy" in requested_features: apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() if (len(apps_with_count.index) < 2 ): - apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = np.nan + apps_features["apps_rapids_frequencyentropy" + apps_type] = np.nan else: - apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy) + apps_features["apps_rapids_frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy) if "count" in requested_features: - apps_features["apps_rapids" + "_count" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"] - apps_features.fillna(value={"apps_rapids" + "_count" + apps_type: 0}, inplace=True) + apps_features["apps_rapids_count" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"] + apps_features.fillna(value={"apps_rapids_count" + apps_type: 0}, inplace=True) return apps_features -def rapids_features(apps_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + + apps_data = pd.read_csv(sensor_data_files["sensor_data"]) + requested_features = provider["FEATURES"] excluded_categories = provider["EXCLUDED_CATEGORIES"] excluded_apps = provider["EXCLUDED_APPS"] @@ -49,10 +52,8 @@ def rapids_features(apps_data, day_segment, provider, filter_data_by_segment, *a apps_data = apps_data[~apps_data["genre"].isin(excluded_categories)] # exclude apps in the excluded_apps list apps_data = apps_data[~apps_data["package_name"].isin(excluded_apps)] - - - apps_features = pd.DataFrame(columns=["local_segment"] + ["apps_rapids_" + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)]]) + apps_features = pd.DataFrame(columns=["local_segment"] + ["apps_rapids_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)]]) if not apps_data.empty: # deep copy the apps_data for the top1global computation apps_data_global = apps_data.copy() diff --git a/src/features/ar/ar_base.py b/src/features/ar/ar_base.py deleted file mode 100644 index 0508545a..00000000 --- a/src/features/ar/ar_base.py +++ /dev/null @@ -1,63 +0,0 @@ -import pandas as pd -import numpy as np -import scipy.stats as stats -from features_utils import splitOvernightEpisodes, splitMultiSegmentEpisodes - -def base_ar_features(ar_data, ar_deltas, day_segment, requested_features): - # name of the features this function can compute - base_features_names = ["count","mostcommonactivity","countuniqueactivities","activitychangecount","sumstationary","summobile","sumvehicle"] - # the subset of requested features this function can compute - features_to_compute = list(set(requested_features) & set(base_features_names)) - - ar_features = pd.DataFrame(columns = ["local_date"] + ["ar_" + day_segment + "_" + x for x in features_to_compute]) - if not ar_data.empty: - ar_deltas = splitOvernightEpisodes(ar_deltas, [],["activity"]) - - if day_segment != "daily": - ar_deltas = splitMultiSegmentEpisodes(ar_deltas, day_segment, []) - - ar_data.local_date_time = pd.to_datetime(ar_data.local_date_time) - resampledData = ar_data.set_index(ar_data.local_date_time) - resampledData.drop(columns=["local_date_time"], inplace=True) - - if day_segment != "daily": - resampledData = resampledData.loc[resampledData["local_day_segment"] == day_segment] - - if not resampledData.empty: - ar_features = pd.DataFrame() - - # finding the count of samples of the day - if "count" in features_to_compute: - ar_features["ar_" + day_segment + "_count"] = resampledData["activity_type"].resample("D").count() - - # finding most common activity of the day - if "mostcommonactivity" in features_to_compute: - ar_features["ar_" + day_segment + "_mostcommonactivity"] = resampledData["activity_type"].resample("D").apply(lambda x: stats.mode(x)[0] if len(stats.mode(x)[0]) != 0 else None) - - # finding different number of activities during a day - if "countuniqueactivities" in features_to_compute: - ar_features["ar_" + day_segment + "_countuniqueactivities"] = resampledData["activity_type"].resample("D").nunique() - - # finding Number of times activity changed - if "activitychangecount" in features_to_compute: - resampledData["activity_type_shift"] = resampledData["activity_type"].shift().fillna(resampledData["activity_type"].head(1)) - resampledData["different_activity"] = np.where(resampledData["activity_type"]!=resampledData["activity_type_shift"],1,0) - ar_features["ar_" + day_segment + "_activitychangecount"] = resampledData["different_activity"].resample("D").sum() - - - deltas_features = {"sumstationary":["still","tilting"], - "summobile":["on_foot","walking","running","on_bicycle"], - "sumvehicle":["in_vehicle"]} - - for column, activity_labels in deltas_features.items(): - if column in features_to_compute: - filtered_data = ar_deltas[ar_deltas["activity"].isin(pd.Series(activity_labels))] - if not filtered_data.empty: - ar_features["ar_" + day_segment + "_" + column] = ar_deltas[ar_deltas["activity"].isin(pd.Series(activity_labels))].groupby(["local_start_date"])["time_diff"].sum().fillna(0) - else: - ar_features["ar_" + day_segment + "_" + column] = 0 - - ar_features.index.names = ["local_date"] - ar_features = ar_features.reset_index() - - return ar_features diff --git a/src/features/battery/battery_entry.R b/src/features/battery/battery_entry.R deleted file mode 100644 index f86dcd91..00000000 --- a/src/features/battery/battery_entry.R +++ /dev/null @@ -1,13 +0,0 @@ -source("renv/activate.R") -source("src/features/utils/utils.R") -library("dplyr") -library("tidyr") - -sensor_data_file <- snakemake@input[["battery_episodes"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] -provider <- snakemake@params["provider"][["provider"]] -provider_key <- snakemake@params["provider_key"] - -sensor_features <- fetch_provider_features(provider, provider_key, "battery", sensor_data_file, day_segments_file) - -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/features/battery/battery_entry.py b/src/features/battery/battery_entry.py deleted file mode 100644 index c38db5d6..00000000 --- a/src/features/battery/battery_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -battery_episodes_file = snakemake.input["battery_episodes"] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "battery", battery_episodes_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/battery/rapids/main.py b/src/features/battery/rapids/main.py index 59a7fec6..6e6000e0 100644 --- a/src/features/battery/rapids/main.py +++ b/src/features/battery/rapids/main.py @@ -1,8 +1,9 @@ import pandas as pd from datetime import datetime, timedelta, time -def rapids_features(battery_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): - +def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + + battery_data = pd.read_csv(sensor_data_files["sensor_episodes"]) chunk_episodes = kwargs["chunk_episodes"] # name of the features this function can compute diff --git a/src/features/battery_features.py b/src/features/battery_features.py deleted file mode 100644 index 0a9c21c6..00000000 --- a/src/features/battery_features.py +++ /dev/null @@ -1,13 +0,0 @@ -import pandas as pd -from battery.battery_base import base_battery_features - -battery_data = pd.read_csv(snakemake.input[0], parse_dates=["local_start_date_time", "local_end_date_time", "local_start_date", "local_end_date"]) -day_segment = snakemake.params["day_segment"] -requested_features = snakemake.params["features"] -battery_features = pd.DataFrame(columns=["local_date"]) - -battery_features = battery_features.merge(base_battery_features(battery_data, day_segment, requested_features), on="local_date", how="outer") - -assert len(requested_features) + 1 == battery_features.shape[1], "The number of features in the output dataframe (=" + str(battery_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your battery feature extraction functions" - -battery_features.to_csv(snakemake.output[0], index=False) diff --git a/src/features/bluetooth/bluetooth_entry.R b/src/features/bluetooth/bluetooth_entry.R deleted file mode 100644 index fb0280ca..00000000 --- a/src/features/bluetooth/bluetooth_entry.R +++ /dev/null @@ -1,13 +0,0 @@ -source("renv/activate.R") -source("src/features/utils/utils.R") -library("dplyr") -library("tidyr") - -sensor_data_file <- snakemake@input[["sensor_data"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] -provider <- snakemake@params["provider"][["provider"]] -provider_key <- snakemake@params["provider_key"] - -sensor_features <- fetch_provider_features(provider, provider_key, "bluetooth", sensor_data_file, day_segments_file) - -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/bluetooth/bluetooth_entry.py b/src/features/bluetooth/bluetooth_entry.py deleted file mode 100644 index 78b804d6..00000000 --- a/src/features/bluetooth/bluetooth_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -sensor_data_file = snakemake.input["sensor_data"][0] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "bluetooth", sensor_data_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/bluetooth/rapids/main.R b/src/features/bluetooth/rapids/main.R index cba86eed..ae4b81dd 100644 --- a/src/features/bluetooth/rapids/main.R +++ b/src/features/bluetooth/rapids/main.R @@ -27,24 +27,26 @@ compute_bluetooth_feature <- function(data, feature, day_segment){ } } -rapids_features <- function(bluetooth_data, day_segment, provider){ - requested_features <- provider[["FEATURES"]] - - # Output dataframe - features = data.frame(local_segment = character(), stringsAsFactors = FALSE) +rapids_features <- function(sensor_data_files, day_segment, provider){ + + bluetooth_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) + requested_features <- provider[["FEATURES"]] + + # Output dataframe + features = data.frame(local_segment = character(), stringsAsFactors = FALSE) - # The name of the features this function can compute - base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice") + # The name of the features this function can compute + base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice") - # The subset of requested features this function can compute - features_to_compute <- intersect(base_features_names, requested_features) + # The subset of requested features this function can compute + features_to_compute <- intersect(base_features_names, requested_features) - for(feature_name in features_to_compute){ - feature <- compute_bluetooth_feature(bluetooth_data, feature_name, day_segment) - features <- merge(features, feature, by="local_segment", all = TRUE) - } + for(feature_name in features_to_compute){ + feature <- compute_bluetooth_feature(bluetooth_data, feature_name, day_segment) + features <- merge(features, feature, by="local_segment", all = TRUE) + } - features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0))) + features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0))) - return(features) + return(features) } \ No newline at end of file diff --git a/src/features/calls/calls_entry.py b/src/features/calls/calls_entry.py deleted file mode 100644 index 828c4718..00000000 --- a/src/features/calls/calls_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -sensor_data_file = snakemake.input["sensor_data"][0] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "calls", sensor_data_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/calls/rapids/main.R b/src/features/calls/rapids/main.R index 1ff01884..436fd71e 100644 --- a/src/features/calls/rapids/main.R +++ b/src/features/calls/rapids/main.R @@ -62,8 +62,9 @@ call_features_of_type <- function(calls, call_type, day_segment, requested_featu return(features) } -rapids_features <- function(calls, day_segment, provider){ - calls <- calls %>% filter_data_by_segment(day_segment) +rapids_features <- function(sensor_data_files, day_segment, provider){ + calls_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) + calls_data <- calls_data %>% filter_data_by_segment(day_segment) call_types = provider[["CALL_TYPES"]] call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment")) @@ -74,7 +75,7 @@ rapids_features <- function(calls, day_segment, provider){ stop(paste("Call type can online be incoming, outgoing or missed but instead you typed: ", call_type, " in config[CALLS][CALL_TYPES]")) requested_features <- provider[["FEATURES"]][[call_type]] - calls_of_type <- calls %>% filter(call_type == call_type_label) + calls_of_type <- calls_data %>% filter(call_type == call_type_label) features <- call_features_of_type(calls_of_type, call_type, day_segment, requested_features) call_features <- merge(call_features, features, all=TRUE) diff --git a/src/features/conversation/conversation_entry.R b/src/features/conversation/conversation_entry.R deleted file mode 100644 index bf5d4fe9..00000000 --- a/src/features/conversation/conversation_entry.R +++ /dev/null @@ -1,13 +0,0 @@ -source("renv/activate.R") -source("src/features/utils/utils.R") -library("dplyr") -library("tidyr") - -sensor_data_file <- snakemake@input[["sensor_data"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] -provider <- snakemake@params["provider"][["provider"]] -provider_key <- snakemake@params["provider_key"] - -sensor_features <- fetch_provider_features(provider, provider_key, "conversation", sensor_data_file, day_segments_file) - -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/conversation/conversation_entry.py b/src/features/conversation/conversation_entry.py deleted file mode 100644 index 41eee92d..00000000 --- a/src/features/conversation/conversation_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -sensor_data_file = snakemake.input["sensor_data"][0] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "conversation", sensor_data_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/conversation/rapids/main.py b/src/features/conversation/rapids/main.py index 3863293e..52112bd3 100644 --- a/src/features/conversation/rapids/main.py +++ b/src/features/conversation/rapids/main.py @@ -1,8 +1,9 @@ import pandas as pd import numpy as np -# def rapids_features(conversation_data, day_segment, requested_features,recordingMinutes,pausedMinutes,expectedMinutes): -def rapids_features(conversation_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + + conversation_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] recordingMinutes = provider["RECORDING_MINUTES"] @@ -20,7 +21,7 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg # the subset of requested features this function can compute features_to_compute = list(set(requested_features) & set(base_features_names)) - conversation_features = pd.DataFrame(columns=["local_segment"] + ["conversation_rapids" + "_" + x for x in features_to_compute]) + conversation_features = pd.DataFrame(columns=["local_segment"] + ["conversation_rapids_" + x for x in features_to_compute]) if not conversation_data.empty: conversation_data = filter_data_by_segment(conversation_data, day_segment) @@ -30,19 +31,19 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg conversation_data = conversation_data.drop_duplicates(subset=["local_date", "local_time"], keep="first") if "minutessilence" in features_to_compute: - conversation_features["conversation_rapids" + "_minutessilence"] = conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60 + conversation_features["conversation_rapids_minutessilence"] = conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60 if "minutesnoise" in features_to_compute: - conversation_features["conversation_rapids" + "_minutesnoise"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60 + conversation_features["conversation_rapids_minutesnoise"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60 if "minutesvoice" in features_to_compute: - conversation_features["conversation_rapids" + "_minutesvoice"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60 + conversation_features["conversation_rapids_minutesvoice"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60 if "minutesunknown" in features_to_compute: - conversation_features["conversation_rapids" + "_minutesunknown"] = conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60 + conversation_features["conversation_rapids_minutesunknown"] = conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60 if "countconversation" in features_to_compute: - conversation_features["conversation_rapids" + "_countconversation"] = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['double_convo_start'].nunique() + conversation_features["conversation_rapids_countconversation"] = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['double_convo_start'].nunique() conv_duration = (conversation_data['double_convo_end']/1000 - conversation_data['double_convo_start']/1000)/60 conversation_data = conversation_data.assign(conv_duration = conv_duration.values) @@ -50,43 +51,43 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg conv_totalDuration = conversation_data[(conversation_data['inference'] >= 0) & (conversation_data['inference'] < 4)].groupby(["local_segment"])['inference'].count()/60 if "silencesensedfraction" in features_to_compute: - conversation_features["conversation_rapids" + "_silencesensedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration + conversation_features["conversation_rapids_silencesensedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration if "noisesensedfraction" in features_to_compute: - conversation_features["conversation_rapids" + "_noisesensedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration + conversation_features["conversation_rapids_noisesensedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration if "voicesensedfraction" in features_to_compute: - conversation_features["conversation_rapids" + "_voicesensedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration + conversation_features["conversation_rapids_voicesensedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration if "unknownsensedfraction" in features_to_compute: - conversation_features["conversation_rapids" + "_unknownsensedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration + conversation_features["conversation_rapids_unknownsensedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration if "silenceexpectedfraction" in features_to_compute: - conversation_features["conversation_rapids" + "_silenceexpectedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes + conversation_features["conversation_rapids_silenceexpectedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes if "noiseexpectedfraction" in features_to_compute: - conversation_features["conversation_rapids" + "_noiseexpectedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes + conversation_features["conversation_rapids_noiseexpectedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes if "voiceexpectedfraction" in features_to_compute: - conversation_features["conversation_rapids" + "_voiceexpectedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes + conversation_features["conversation_rapids_voiceexpectedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes if "unknownexpectedfraction" in features_to_compute: - conversation_features["conversation_rapids" + "_unknownexpectedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes + conversation_features["conversation_rapids_unknownexpectedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes if "sumconversationduration" in features_to_compute: - conversation_features["conversation_rapids" + "_sumconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].sum() + conversation_features["conversation_rapids_sumconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].sum() if "avgconversationduration" in features_to_compute: - conversation_features["conversation_rapids" + "_avgconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].mean() + conversation_features["conversation_rapids_avgconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].mean() if "sdconversationduration" in features_to_compute: - conversation_features["conversation_rapids" + "_sdconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].std() + conversation_features["conversation_rapids_sdconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].std() if "minconversationduration" in features_to_compute: - conversation_features["conversation_rapids" + "_minconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].min() + conversation_features["conversation_rapids_minconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].min() if "maxconversationduration" in features_to_compute: - conversation_features["conversation_rapids" + "_maxconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].max() + conversation_features["conversation_rapids_maxconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].max() if "timefirstconversation" in features_to_compute: timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].min() @@ -94,9 +95,9 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg for date in list(timestampsLastConversation.index): lastimestamp = timestampsLastConversation.loc[date] lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0] - conversation_features.loc[date,"conversation_rapids" + "_timefirstconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1]) + conversation_features.loc[date,"conversation_rapids_timefirstconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1]) else: - conversation_features["conversation_rapids" + "_timefirstconversation"] = np.nan + conversation_features["conversation_rapids_timefirstconversation"] = np.nan if "timelastconversation" in features_to_compute: timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].max() @@ -104,39 +105,39 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg for date in list(timestampsLastConversation.index): lastimestamp = timestampsLastConversation.loc[date] lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0] - conversation_features.loc[date,"conversation_rapids" + "_timelastconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1]) + conversation_features.loc[date,"conversation_rapids_timelastconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1]) else: - conversation_features["conversation_rapids" + "_timelastconversation"] = np.nan + conversation_features["conversation_rapids_timelastconversation"] = np.nan if "noisesumenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].sum() + conversation_features["conversation_rapids_noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].sum() if "noiseavgenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].mean() + conversation_features["conversation_rapids_noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].mean() if "noisesdenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].std() + conversation_features["conversation_rapids_noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].std() if "noiseminenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].min() + conversation_features["conversation_rapids_noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].min() if "noisemaxenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].max() + conversation_features["conversation_rapids_noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].max() if "voicesumenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].sum() + conversation_features["conversation_rapids_voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].sum() if "voiceavgenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].mean() + conversation_features["conversation_rapids_voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].mean() if "voicesdenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].std() + conversation_features["conversation_rapids_voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].std() if "voiceminenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].min() + conversation_features["conversation_rapids_voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].min() if "voicemaxenergy" in features_to_compute: - conversation_features["conversation_rapids" + "_voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].max() + conversation_features["conversation_rapids_voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].max() conversation_features = conversation_features.reset_index() diff --git a/src/features/calls/calls_entry.R b/src/features/entry.R similarity index 57% rename from src/features/calls/calls_entry.R rename to src/features/entry.R index bea2c7cb..0318ce45 100644 --- a/src/features/calls/calls_entry.R +++ b/src/features/entry.R @@ -3,11 +3,14 @@ source("src/features/utils/utils.R") library("dplyr") library("tidyr") -sensor_data_file <- snakemake@input[["sensor_data"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] +sensor_data_files <- snakemake@input +sensor_data_files$day_segments_labels <- NULL +day_segments_file <- snakemake@input[["day_segments_labels"]] + provider <- snakemake@params["provider"][["provider"]] provider_key <- snakemake@params["provider_key"] +sensor_key <- snakemake@params["sensor_key"] -sensor_features <- fetch_provider_features(provider, provider_key, "calls", sensor_data_file, day_segments_file) +sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, day_segments_file) -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) +write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/features/entry.py b/src/features/entry.py new file mode 100644 index 00000000..c76c0c4f --- /dev/null +++ b/src/features/entry.py @@ -0,0 +1,14 @@ +import pandas as pd +from utils.utils import fetch_provider_features + +sensor_data_files = dict(snakemake.input) +del sensor_data_files["day_segments_labels"] +day_segments_file = snakemake.input["day_segments_labels"] + +provider = snakemake.params["provider"] +provider_key = snakemake.params["provider_key"] +sensor_key = snakemake.params["sensor_key"] + +sensor_features = fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, day_segments_file) + +sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/light/light_entry.R b/src/features/light/light_entry.R deleted file mode 100644 index 99f8dca4..00000000 --- a/src/features/light/light_entry.R +++ /dev/null @@ -1,13 +0,0 @@ -source("renv/activate.R") -source("src/features/utils/utils.R") -library("dplyr") -library("tidyr") - -sensor_data_file <- snakemake@input[["sensor_data"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] -provider <- snakemake@params["provider"][["provider"]] -provider_key <- snakemake@params["provider_key"] - -sensor_features <- fetch_provider_features(provider, provider_key, "light", sensor_data_file, day_segments_file) - -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/light/light_entry.py b/src/features/light/light_entry.py deleted file mode 100644 index f0ee5e8f..00000000 --- a/src/features/light/light_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -sensor_data_file = snakemake.input["sensor_data"][0] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "light", sensor_data_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/light/rapids/main.py b/src/features/light/rapids/main.py index ffe31ea0..f7d2bbdb 100644 --- a/src/features/light/rapids/main.py +++ b/src/features/light/rapids/main.py @@ -1,33 +1,35 @@ import pandas as pd import numpy as np -def rapids_features(light_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + + light_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] # name of the features this function can compute base_features_names = ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] # the subset of requested features this function can compute features_to_compute = list(set(requested_features) & set(base_features_names)) - light_features = pd.DataFrame(columns=["local_segment"] + ["light_rapids_" + "_" + x for x in features_to_compute]) + light_features = pd.DataFrame(columns=["local_segment"] + ["light_rapids_" + x for x in features_to_compute]) if not light_data.empty: light_data = filter_data_by_segment(light_data, day_segment) if not light_data.empty: light_features = pd.DataFrame() if "count" in features_to_compute: - light_features["light_rapids_" + "_count"] = light_data.groupby(["local_segment"]).count()["timestamp"] + light_features["light_rapids_count"] = light_data.groupby(["local_segment"]).count()["timestamp"] # get light ambient luminance related features if "maxlux" in features_to_compute: - light_features["light_rapids_" + "_maxlux"] = light_data.groupby(["local_segment"])["double_light_lux"].max() + light_features["light_rapids_maxlux"] = light_data.groupby(["local_segment"])["double_light_lux"].max() if "minlux" in features_to_compute: - light_features["light_rapids_" + "_minlux"] = light_data.groupby(["local_segment"])["double_light_lux"].min() + light_features["light_rapids_minlux"] = light_data.groupby(["local_segment"])["double_light_lux"].min() if "avglux" in features_to_compute: - light_features["light_rapids_" + "_avglux"] = light_data.groupby(["local_segment"])["double_light_lux"].mean() + light_features["light_rapids_avglux"] = light_data.groupby(["local_segment"])["double_light_lux"].mean() if "medianlux" in features_to_compute: - light_features["light_rapids_" + "_medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median() + light_features["light_rapids_medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median() if "stdlux" in features_to_compute: - light_features["light_rapids_" + "_stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std() + light_features["light_rapids_stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std() light_features = light_features.reset_index() diff --git a/src/features/locations/barnett/main.R b/src/features/locations/barnett/main.R index 9aafe523..08a61890 100644 --- a/src/features/locations/barnett/main.R +++ b/src/features/locations/barnett/main.R @@ -27,8 +27,11 @@ create_empty_file <- function(requested_features){ ) %>% select(all_of(requested_features))) } -barnett_features <- function(location_data, day_segment, params){ +barnett_features <- function(sensor_data_files, day_segment, params){ + + location_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) location_features <- NULL + location <- location_data accuracy_limit <- params[["ACCURACY_LIMIT"]] timezone <- params[["TIMEZONE"]] diff --git a/src/features/locations/doryab/main.py b/src/features/locations/doryab/main.py index 0f0d4037..70097d0c 100644 --- a/src/features/locations/doryab/main.py +++ b/src/features/locations/doryab/main.py @@ -4,7 +4,9 @@ from astropy.timeseries import LombScargle from sklearn.cluster import DBSCAN from math import radians, cos, sin, asin, sqrt -def doryab_features(location_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + + location_data = pd.read_csv(sensor_data_files["sensor_data"]) requested_features = provider["FEATURES"] dbscan_eps = provider["DBSCAN_EPS"] dbscan_minsamples = provider["DBSCAN_MINSAMPLES"] diff --git a/src/features/locations/locations_entry.R b/src/features/locations/locations_entry.R deleted file mode 100644 index f8786432..00000000 --- a/src/features/locations/locations_entry.R +++ /dev/null @@ -1,13 +0,0 @@ -source("renv/activate.R") -source("src/features/utils/utils.R") -library("dplyr") -library("tidyr") - -sensor_data_file <- snakemake@input[["sensor_data"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] -provider <- snakemake@params["provider"][["provider"]] -provider_key <- snakemake@params["provider_key"] - -sensor_features <- fetch_provider_features(provider, provider_key, "locations", sensor_data_file, day_segments_file) - -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/locations/locations_entry.py b/src/features/locations/locations_entry.py deleted file mode 100644 index 9ff0470f..00000000 --- a/src/features/locations/locations_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -sensor_data_file = snakemake.input["sensor_data"][0] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "locations", sensor_data_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/messages/messages_entry.R b/src/features/messages/messages_entry.R deleted file mode 100644 index 63b0fa47..00000000 --- a/src/features/messages/messages_entry.R +++ /dev/null @@ -1,13 +0,0 @@ -source("renv/activate.R") -source("src/features/utils/utils.R") -library("dplyr") -library("tidyr") - -sensor_data_file <- snakemake@input[["sensor_data"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] -provider <- snakemake@params["provider"][["provider"]] -provider_key <- snakemake@params["provider_key"] - -sensor_features <- fetch_provider_features(provider, provider_key, "messages", sensor_data_file, day_segments_file) - -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/messages/messages_entry.py b/src/features/messages/messages_entry.py deleted file mode 100644 index ab46b28f..00000000 --- a/src/features/messages/messages_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -sensor_data_file = snakemake.input["sensor_data"][0] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "messages", sensor_data_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/messages/rapids/main.R b/src/features/messages/rapids/main.R index 7ec08232..53f226c0 100644 --- a/src/features/messages/rapids/main.R +++ b/src/features/messages/rapids/main.R @@ -50,8 +50,9 @@ message_features_of_type <- function(messages, messages_type, day_segment, reque return(features) } -rapids_features <- function(messages, day_segment, provider){ - messages <- messages %>% filter_data_by_segment(day_segment) +rapids_features <- function(sensor_data_files, day_segment, provider){ + messages_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) + messages_data <- messages_data %>% filter_data_by_segment(day_segment) messages_types = provider[["MESSAGES_TYPES"]] messages_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment")) @@ -62,7 +63,7 @@ rapids_features <- function(messages, day_segment, provider){ stop(paste("Message type can online be received or sent but instead you typed: ", message_type, " in config[MESSAGES][MESSAGES_TYPES]")) requested_features <- provider[["FEATURES"]][[message_type]] - messages_of_type <- messages %>% filter(message_type == message_type_label) + messages_of_type <- messages_data %>% filter(message_type == message_type_label) features <- message_features_of_type(messages_of_type, message_type, day_segment, requested_features) messages_features <- merge(messages_features, features, all=TRUE) diff --git a/src/features/screen/rapids/main.py b/src/features/screen/rapids/main.py index 6250b8c7..96170a0c 100644 --- a/src/features/screen/rapids/main.py +++ b/src/features/screen/rapids/main.py @@ -25,7 +25,9 @@ def getEpisodeDurationFeatures(screen_data, day_segment, episode, features, refe return duration_helper -def rapids_features(screen_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): +def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): + + screen_data = pd.read_csv(sensor_data_files["sensor_episodes"]) reference_hour_first_use = provider["REFERENCE_HOUR_FIRST_USE"] requested_features_episodes = provider["FEATURES"] diff --git a/src/features/screen/screen_entry.R b/src/features/screen/screen_entry.R deleted file mode 100644 index bf970e2b..00000000 --- a/src/features/screen/screen_entry.R +++ /dev/null @@ -1,13 +0,0 @@ -source("renv/activate.R") -source("src/features/utils/utils.R") -library("dplyr") -library("tidyr") - -sensor_data_file <- snakemake@input[["screen_episodes"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] -provider <- snakemake@params["provider"][["provider"]] -provider_key <- snakemake@params["provider_key"] - -sensor_features <- fetch_provider_features(provider, provider_key, "screen", sensor_data_file, day_segments_file) - -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) \ No newline at end of file diff --git a/src/features/screen/screen_entry.py b/src/features/screen/screen_entry.py deleted file mode 100644 index 83199ce3..00000000 --- a/src/features/screen/screen_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -screen_episodes_file = snakemake.input["screen_episodes"] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "screen", screen_episodes_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file diff --git a/src/features/utils/utils.R b/src/features/utils/utils.R index 7199295a..bfc3cb71 100644 --- a/src/features/utils/utils.R +++ b/src/features/utils/utils.R @@ -43,24 +43,23 @@ chunk_episodes <- function(sensor_episodes){ return(chunked_episodes) } -fetch_provider_features <- function(provider, provider_key, config_key, sensor_data_file, day_segments_file){ +fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_data_files, day_segments_file){ sensor_features <- data.frame(local_segment = character(), stringsAsFactors = FALSE) - sensor_data <- read.csv(sensor_data_file, stringsAsFactors = FALSE) day_segments_labels <- read.csv(day_segments_file, stringsAsFactors = FALSE) if(!"FEATURES" %in% names(provider)) - stop(paste0("Provider config[", config_key,"][PROVIDERS][", provider_key,"] is missing a FEATURES attribute in config.yaml")) + stop(paste0("Provider config[", sensor_key,"][PROVIDERS][", provider_key,"] is missing a FEATURES attribute in config.yaml")) if(provider[["COMPUTE"]] == TRUE){ - code_path <- paste0("src/features/", config_key,"/", provider[["SRC_FOLDER"]], "/main.R") + code_path <- paste0("src/features/", sensor_key,"/", provider[["SRC_FOLDER"]], "/main.R") source(code_path) features_function <- match.fun(paste0(provider[["SRC_FOLDER"]], "_features")) day_segments <- day_segments_labels %>% pull(label) for (day_segment in day_segments){ - print(paste(rapids_log_tag,"Processing", config_key, provider_key, day_segment)) + print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, day_segment)) - features <- features_function(sensor_data, day_segment, provider) + features <- features_function(sensor_data_files, day_segment, provider) # Check all features names contain the provider key so they are unique features_names <- colnames(features %>% select(-local_segment)) diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py index 688f2be6..86741291 100644 --- a/src/features/utils/utils.py +++ b/src/features/utils/utils.py @@ -67,24 +67,24 @@ def chunk_episodes(sensor_episodes): return merged_sensor_episodes -def fetch_provider_features(provider, provider_key, config_key, sensor_data_file, day_segments_file): +def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, day_segments_file): import pandas as pd from importlib import import_module, util sensor_features = pd.DataFrame(columns=["local_segment"]) - sensor_data = pd.read_csv(sensor_data_file) day_segments_labels = pd.read_csv(day_segments_file, header=0) if "FEATURES" not in provider: - raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(config_key.upper(), provider_key)) + raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key)) if provider["COMPUTE"] == True: - code_path = provider["SRC_FOLDER"] + ".main" + + code_path = sensor_key + "." + provider["SRC_FOLDER"] + ".main" feature_module = import_module(code_path) feature_function = getattr(feature_module, provider["SRC_FOLDER"] + "_features") for day_segment in day_segments_labels["label"]: - print("{} Processing {} {} {}".format(rapids_log_tag, config_key, provider_key, day_segment)) - features = feature_function(sensor_data, day_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes) + print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, day_segment)) + features = feature_function(sensor_data_files, day_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes) sensor_features = sensor_features.merge(features, how="outer") else: for feature in provider["FEATURES"]: diff --git a/src/features/wifi/rapids/main.R b/src/features/wifi/rapids/main.R index ab61e5d5..7c4ea072 100644 --- a/src/features/wifi/rapids/main.R +++ b/src/features/wifi/rapids/main.R @@ -25,21 +25,22 @@ compute_wifi_feature <- function(data, feature, day_segment){ } } -rapids_features <- function(wifi_data, day_segment, provider){ - requested_features <- provider[["FEATURES"]] - # Output dataframe - features = data.frame(local_segment = character(), stringsAsFactors = FALSE) +rapids_features <- function(sensor_data_files, day_segment, provider){ + wifi_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE) + requested_features <- provider[["FEATURES"]] + # Output dataframe + features = data.frame(local_segment = character(), stringsAsFactors = FALSE) - # The name of the features this function can compute - base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice") + # The name of the features this function can compute + base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice") - # The subset of requested features this function can compute - features_to_compute <- intersect(base_features_names, requested_features) + # The subset of requested features this function can compute + features_to_compute <- intersect(base_features_names, requested_features) - for(feature_name in features_to_compute){ - feature <- compute_wifi_feature(wifi_data, feature_name, day_segment) - features <- merge(features, feature, by="local_segment", all = TRUE) - } + for(feature_name in features_to_compute){ + feature <- compute_wifi_feature(wifi_data, feature_name, day_segment) + features <- merge(features, feature, by="local_segment", all = TRUE) + } - return(features) + return(features) } diff --git a/src/features/wifi/wifi_entry.R b/src/features/wifi/wifi_entry.R deleted file mode 100644 index 1a825360..00000000 --- a/src/features/wifi/wifi_entry.R +++ /dev/null @@ -1,13 +0,0 @@ -source("renv/activate.R") -source("src/features/utils/utils.R") -library("dplyr") -library("tidyr") - -sensor_data_file <- snakemake@input[["sensor_data"]] -day_segments_file <- snakemake@input[["day_segments_labels"]] -provider <- snakemake@params["provider"][["provider"]] -provider_key <- snakemake@params["provider_key"] - -sensor_features <- fetch_provider_features(provider, provider_key, "wifi", sensor_data_file, day_segments_file) - -write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) diff --git a/src/features/wifi/wifi_entry.py b/src/features/wifi/wifi_entry.py deleted file mode 100644 index ffe8bb2f..00000000 --- a/src/features/wifi/wifi_entry.py +++ /dev/null @@ -1,18 +0,0 @@ -import pandas as pd -from importlib import import_module, util -from pathlib import Path - -# import fetch_provider_features from src/features/utils/utils.py -spec = util.spec_from_file_location("util", str(Path(snakemake.scriptdir).parent / "utils" / "utils.py")) -mod = util.module_from_spec(spec) -spec.loader.exec_module(mod) -fetch_provider_features = getattr(mod, "fetch_provider_features") - -sensor_data_file = snakemake.input["sensor_data"][0] -day_segments_file = snakemake.input["day_segments_labels"] -provider = snakemake.params["provider"] -provider_key = snakemake.params["provider_key"] - -sensor_features = fetch_provider_features(provider, provider_key, "wifi", sensor_data_file, day_segments_file) - -sensor_features.to_csv(snakemake.output[0], index=False) \ No newline at end of file