Update AR module for segments; Refactor input format

pull/103/head
Meng Li 2020-10-07 18:11:06 -04:00
parent 01ab59a3b6
commit 236b1cd809
43 changed files with 571 additions and 766 deletions

View File

@ -55,16 +55,22 @@ for provider in config["BLUETOOTH"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="BLUETOOTH".lower())) files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["BLUETOOTH"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="BLUETOOTH".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="BLUETOOTH".lower())) files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="BLUETOOTH".lower()))
if config["ACTIVITY_RECOGNITION"]["COMPUTE"]: for provider in config["ACTIVITY_RECOGNITION"]["PROVIDERS"].keys():
pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"])) if config["ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["COMPUTE"]:
pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"])) pids_android = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "android", config["PIDS"]))
pids_ios = list(filter(lambda pid: infer_participant_platform("data/external/" + pid) == "ios", config["PIDS"]))
for pids,table in zip([pids_android, pids_ios], [config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]):
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table)) for pids,table in zip([pids_android, pids_ios], [config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]]):
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table)) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table)) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/processed/{pid}/{sensor}_deltas.csv", pid=pids, sensor=table)) files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=pids, sensor=table))
files_to_compute.extend(expand("data/processed/{pid}/activity_recognition_{day_segment}.csv",pid=config["PIDS"], day_segment = config["ACTIVITY_RECOGNITION"]["DAY_SEGMENTS"]))
files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes_resampled.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["ACTIVITY_RECOGNITION"]["PROVIDERS"][provider]["SRC_LANGUAGE"], provider_key=provider, sensor_key="ACTIVITY_RECOGNITION".lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key="ACTIVITY_RECOGNITION".lower()))
for provider in config["BATTERY"]["PROVIDERS"].keys(): for provider in config["BATTERY"]["PROVIDERS"].keys():
if config["BATTERY"]["PROVIDERS"][provider]["COMPUTE"]: if config["BATTERY"]["PROVIDERS"][provider]["COMPUTE"]:

View File

@ -113,12 +113,19 @@ BLUETOOTH:
ACTIVITY_RECOGNITION: ACTIVITY_RECOGNITION:
COMPUTE: False
DB_TABLE: DB_TABLE:
ANDROID: plugin_google_activity_recognition ANDROID: plugin_google_activity_recognition
IOS: plugin_ios_activity_recognition IOS: plugin_ios_activity_recognition
DAY_SEGMENTS: *day_segments PROVIDERS:
FEATURES: ["count","mostcommonactivity","countuniqueactivities","activitychangecount","sumstationary","summobile","sumvehicle"] RAPIDS:
COMPUTE: False
FEATURES: ["count", "mostcommonactivity", "countuniqueactivities", "durationstationary", "durationmobile", "durationvehicle"]
ACTIVITY_CLASSES:
STATIONARY: ["still", "tilting"]
MOBILE: ["on_foot", "walking", "running", "on_bicycle"]
VEHICLE: ["in_vehicle"]
SRC_FOLDER: "rapids" # inside src/features/activity_recognition
SRC_LANGUAGE: "python"
BATTERY: BATTERY:
DB_TABLE: battery DB_TABLE: battery

View File

@ -46,21 +46,19 @@ def find_features_files(wildcards):
def optional_ar_input(wildcards): def optional_ar_input(wildcards):
platform = infer_participant_platform("data/external/"+wildcards.pid) platform = infer_participant_platform("data/external/"+wildcards.pid)
if platform == "android": if platform == "android":
return ["data/raw/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv", return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"])
"data/interim/{pid}/" + config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"] + "_episodes.csv"]
elif platform == "ios": elif platform == "ios":
return ["data/raw/{pid}/"+config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]+"_with_datetime_unified.csv", return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"])
"data/interim/{pid}/"+config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]+"_episodes.csv"]
def optional_conversation_input(wildcards): def optional_conversation_input(wildcards):
platform = infer_participant_platform("data/external/"+wildcards.pid) platform = infer_participant_platform("data/external/"+wildcards.pid)
if platform == "android": if platform == "android":
return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["ANDROID"] + "_with_datetime_unified.csv"] return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CONVERSATION"]["DB_TABLE"]["ANDROID"])[0]
elif platform == "ios": elif platform == "ios":
return ["data/raw/{pid}/" + config["CONVERSATION"]["DB_TABLE"]["IOS"] + "_with_datetime_unified.csv"] return expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CONVERSATION"]["DB_TABLE"]["IOS"])[0]
def optional_steps_sleep_input(wildcards): def optional_steps_sleep_input(wildcards):
if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED": if config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"] == True and config["STEP"]["EXCLUDE_SLEEP"]["TYPE"] == "FITBIT_BASED":

View File

@ -6,70 +6,6 @@ rule join_features_from_providers:
script: script:
"../src/features/join_features_from_providers.R" "../src/features/join_features_from_providers.R"
# MESSAGES features (R): runs messages_entry.R and writes one CSV per pid and provider_key.
rule messages_r_features:
input:
# {{pid}} is escaped so expand() only fills {sensor}; pid stays a rule wildcard.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/messages_features/messages_r_{provider_key}.csv"
script:
"../src/features/messages/messages_entry.R"
# MESSAGES features (Python): runs messages_entry.py and writes one CSV per pid and provider_key.
rule messages_python_features:
input:
# {{pid}} is escaped so expand() only fills {sensor}; pid stays a rule wildcard.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/messages_features/messages_python_{provider_key}.csv"
script:
"../src/features/messages/messages_entry.py"
# CALLS features (Python): runs calls_entry.py on the "_with_datetime_unified" calls table.
rule calls_python_features:
input:
# {{pid}} is escaped so expand() only fills {sensor}; pid stays a rule wildcard.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/calls_features/calls_python_{provider_key}.csv"
script:
"../src/features/calls/calls_entry.py"
# CALLS features (R): runs calls_entry.R on the "_with_datetime_unified" calls table.
rule calls_r_features:
input:
# {{pid}} is escaped so expand() only fills {sensor}; pid stays a rule wildcard.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/calls_features/calls_r_{provider_key}.csv"
script:
"../src/features/calls/calls_entry.R"
# Builds data/interim/{pid}/battery_episodes.csv from the raw battery table via battery_episodes.R.
rule battery_episodes:
input:
# Unnamed input: the "_raw.csv" file of the table named in config["BATTERY"]["DB_TABLE"].
expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor=config["BATTERY"]["DB_TABLE"])
output:
"data/interim/{pid}/battery_episodes.csv"
script:
"../src/features/battery/episodes/battery_episodes.R"
# Builds data/interim/{pid}/screen_episodes.csv from the unified screen table via screen_episodes.R.
rule screen_episodes:
input:
# Named input "screen": the "_with_datetime_unified.csv" file of config["SCREEN"]["DB_TABLE"].
screen = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["SCREEN"]["DB_TABLE"])
output:
"data/interim/{pid}/screen_episodes.csv"
script:
"../src/features/screen/episodes/screen_episodes.R"
rule resample_episodes: rule resample_episodes:
input: input:
"data/interim/{pid}/{sensor}_episodes.csv" "data/interim/{pid}/{sensor}_episodes.csv"
@ -92,178 +28,6 @@ rule resample_episodes_with_datetime:
script: script:
"../src/data/readable_datetime.R" "../src/data/readable_datetime.R"
# Android activity recognition: turns the unified table into "<table>_episodes.csv".
# NOTE(review): the rule name says "deltas" but the output file is named "episodes".
rule google_activity_recognition_deltas:
input:
expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"])
output:
# The output name is derived from the Android table name in config.
expand("data/interim/{{pid}}/{sensor}_episodes.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"])
script:
"../src/features/ar/episodes/activity_recognition_episodes.R"
# iOS activity recognition: turns the unified table into "<table>_episodes.csv".
# Uses the same script as the Android rule; only the configured table name differs.
rule ios_activity_recognition_deltas:
input:
expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"])
output:
# The output name is derived from the iOS table name in config.
expand("data/interim/{{pid}}/{sensor}_episodes.csv", sensor=config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"])
script:
"../src/features/ar/episodes/activity_recognition_episodes.R"
# LOCATIONS features (Python): runs locations_entry.py on the processed locations file.
rule locations_python_features:
input:
# The file name is built from both the table name and config["LOCATIONS"]["LOCATIONS_TO_USE"].
sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
output:
"data/interim/{pid}/locations_features/locations_python_{provider_key}.csv"
script:
"../src/features/locations/locations_entry.py"
# LOCATIONS features (R): runs locations_entry.R on the processed locations file.
rule locations_r_features:
input:
# The file name is built from both the table name and config["LOCATIONS"]["LOCATIONS_TO_USE"].
sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/locations_features/locations_r_{provider_key}.csv"
script:
"../src/features/locations/locations_entry.R"
# BLUETOOTH features (R): runs bluetooth_entry.R and writes one CSV per pid and provider_key.
rule bluetooth_r_features:
input:
# {{pid}} is escaped so expand() only fills {sensor}; pid stays a rule wildcard.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/bluetooth_features/bluetooth_r_{provider_key}.csv"
script:
"../src/features/bluetooth/bluetooth_entry.R"
# BLUETOOTH features (Python): runs bluetooth_entry.py and writes one CSV per pid and provider_key.
rule bluetooth_python_features:
input:
# {{pid}} is escaped so expand() only fills {sensor}; pid stays a rule wildcard.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/bluetooth_features/bluetooth_python_{provider_key}.csv"
script:
"../src/features/bluetooth/bluetooth_entry.py"
# Activity recognition features: runs activity_recognition.py once per pid and day_segment.
rule activity_features:
input:
# Input function defined outside this rule; Snakemake calls it with the job's wildcards.
optional_ar_input
params:
segment = "{day_segment}",
# The feature list is taken directly from the sensor-level config (no providers here).
features = config["ACTIVITY_RECOGNITION"]["FEATURES"]
output:
"data/processed/{pid}/activity_recognition_{day_segment}.csv"
script:
"../src/features/activity_recognition.py"
# BATTERY features (R): runs battery_entry.R on the resampled battery episodes.
rule battery_r_features:
input:
# Works on episodes ("_episodes_resampled_with_datetime.csv"), not on the raw table.
battery_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/battery_features/battery_r_{provider_key}.csv"
script:
"../src/features/battery/battery_entry.R"
# BATTERY features (Python): runs battery_entry.py on the resampled battery episodes.
rule battery_python_features:
input:
# Works on episodes ("_episodes_resampled_with_datetime.csv"), not on the raw table.
battery_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/battery_features/battery_python_{provider_key}.csv"
script:
"../src/features/battery/battery_entry.py"
# SCREEN features (R): runs screen_entry.R on the resampled screen episodes.
rule screen_r_features:
input:
# Works on episodes ("_episodes_resampled_with_datetime.csv"), not on the raw table.
screen_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/screen_features/screen_r_{provider_key}.csv"
script:
"../src/features/screen/screen_entry.R"
# SCREEN features (Python): runs screen_entry.py on the resampled screen episodes.
rule screen_python_features:
input:
# Works on episodes ("_episodes_resampled_with_datetime.csv"), not on the raw table.
screen_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/screen_features/screen_python_{provider_key}.csv"
script:
"../src/features/screen/screen_entry.py"
# LIGHT features (R): runs light_entry.R and writes one CSV per pid and provider_key.
rule light_r_features:
input:
# {{pid}} is escaped so expand() only fills {sensor}; pid stays a rule wildcard.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/light_features/light_r_{provider_key}.csv"
script:
"../src/features/light/light_entry.R"
# LIGHT features (Python): runs light_entry.py and writes one CSV per pid and provider_key.
rule light_python_features:
input:
# {{pid}} is escaped so expand() only fills {sensor}; pid stays a rule wildcard.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"]),
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/light_features/light_python_{provider_key}.csv"
script:
"../src/features/light/light_entry.py"
# CONVERSATION features (R): runs conversation_entry.R and writes one CSV per pid and provider_key.
rule conversation_r_features:
input:
# Input function defined outside this rule; Snakemake calls it with the job's wildcards.
sensor_data = optional_conversation_input,
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/conversation_features/conversation_r_{provider_key}.csv"
script:
"../src/features/conversation/conversation_entry.R"
# CONVERSATION features (Python): runs conversation_entry.py and writes one CSV per pid and provider_key.
rule conversation_python_features:
input:
# Input function defined outside this rule; Snakemake calls it with the job's wildcards.
sensor_data = optional_conversation_input,
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}"
output:
"data/interim/{pid}/conversation_features/conversation_python_{provider_key}.csv"
script:
"../src/features/conversation/conversation_entry.py"
rule accelerometer_features: rule accelerometer_features:
input: input:
expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["ACCELEROMETER"]["DB_TABLE"]), expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["ACCELEROMETER"]["DB_TABLE"]),
@ -278,53 +42,315 @@ rule accelerometer_features:
script: script:
"../src/features/accelerometer_features.py" "../src/features/accelerometer_features.py"
# Builds data/interim/{pid}/activity_recognition_episodes.csv via activity_recognition_episodes.R.
# The output name does not depend on the source table, so it is the same for every participant.
rule activity_recognition_episodes:
input:
# Input function defined outside this rule; Snakemake calls it with the job's wildcards.
optional_ar_input
output:
"data/interim/{pid}/activity_recognition_episodes.csv"
script:
"../src/features/activity_recognition/episodes/activity_recognition_episodes.R"
# ACTIVITY_RECOGNITION features (R): runs the shared entry.R on the resampled AR episodes.
rule activity_recognition_r_features:
input:
sensor_episodes = "data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.R.
sensor_key = "activity_recognition"
output:
"data/interim/{pid}/activity_recognition_features/activity_recognition_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# ACTIVITY_RECOGNITION features (Python): runs the shared entry.py on the resampled AR episodes.
rule activity_recognition_python_features:
input:
sensor_episodes = "data/interim/{pid}/activity_recognition_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["ACTIVITY_RECOGNITION"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.py.
sensor_key = "activity_recognition"
output:
"data/interim/{pid}/activity_recognition_features/activity_recognition_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule applications_foreground_r_features: rule applications_foreground_r_features:
input: input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]), sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params: params:
provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}" provider_key = "{provider_key}",
sensor_key = "applications_foreground"
output: output:
"data/interim/{pid}/applications_foreground_features/applications_foreground_r_{provider_key}.csv" "data/interim/{pid}/applications_foreground_features/applications_foreground_r_{provider_key}.csv"
script: script:
"../src/features/applications_foreground/applications_foreground_entry.R" "../src/features/entry.R"
rule applications_foreground_python_features: rule applications_foreground_python_features:
input: input:
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"]), sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_with_genre.csv", sensor=config["APPLICATIONS_FOREGROUND"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params: params:
provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key], provider = lambda wildcards: config["APPLICATIONS_FOREGROUND"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}" provider_key = "{provider_key}",
sensor_key = "applications_foreground"
output: output:
"data/interim/{pid}/applications_foreground_features/applications_foreground_python_{provider_key}.csv" "data/interim/{pid}/applications_foreground_features/applications_foreground_python_{provider_key}.csv"
script: script:
"../src/features/applications_foreground/applications_foreground_entry.py" "../src/features/entry.py"
# Builds data/interim/{pid}/battery_episodes.csv from the raw battery table via battery_episodes.R.
rule battery_episodes:
input:
# Unnamed input: the "_raw.csv" file of the table named in config["BATTERY"]["DB_TABLE"].
expand("data/raw/{{pid}}/{sensor}_raw.csv", sensor=config["BATTERY"]["DB_TABLE"])
output:
"data/interim/{pid}/battery_episodes.csv"
script:
"../src/features/battery/episodes/battery_episodes.R"
# BATTERY features (R): runs the shared entry.R on the resampled battery episodes.
rule battery_r_features:
input:
sensor_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.R.
sensor_key = "battery"
output:
"data/interim/{pid}/battery_features/battery_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# BATTERY features (Python): runs the shared entry.py on the resampled battery episodes.
rule battery_python_features:
input:
sensor_episodes = "data/interim/{pid}/battery_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["BATTERY"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.py.
sensor_key = "battery"
output:
"data/interim/{pid}/battery_features/battery_python_{provider_key}.csv"
script:
"../src/features/entry.py"
# BLUETOOTH features (R): runs the shared entry.R and writes one CSV per pid and provider_key.
rule bluetooth_r_features:
input:
# expand() yields a list; [0] passes its first path as a plain string.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.R.
sensor_key = "bluetooth"
output:
"data/interim/{pid}/bluetooth_features/bluetooth_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# BLUETOOTH features (Python): runs the shared entry.py and writes one CSV per pid and provider_key.
rule bluetooth_python_features:
input:
# expand() yields a list; [0] passes its first path as a plain string.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["BLUETOOTH"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["BLUETOOTH"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.py.
sensor_key = "bluetooth"
output:
"data/interim/{pid}/bluetooth_features/bluetooth_python_{provider_key}.csv"
script:
"../src/features/entry.py"
# CALLS features (R): runs the shared entry.R on the "_with_datetime_unified" calls table.
rule calls_r_features:
input:
# expand() yields a list; [0] passes its first path as a plain string.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.R.
sensor_key = "calls"
output:
"data/interim/{pid}/calls_features/calls_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# CALLS features (Python): runs the shared entry.py on the "_with_datetime_unified" calls table.
rule calls_python_features:
input:
# expand() yields a list; [0] passes its first path as a plain string.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["CALLS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.py.
sensor_key = "calls"
output:
"data/interim/{pid}/calls_features/calls_python_{provider_key}.csv"
script:
"../src/features/entry.py"
# CONVERSATION features (R): runs the shared entry.R and writes one CSV per pid and provider_key.
rule conversation_r_features:
input:
# Input function defined outside this rule; Snakemake calls it with the job's wildcards.
sensor_data = optional_conversation_input,
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.R.
sensor_key = "conversation"
output:
"data/interim/{pid}/conversation_features/conversation_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# CONVERSATION features (Python): runs the shared entry.py and writes one CSV per pid and provider_key.
rule conversation_python_features:
input:
# Input function defined outside this rule; Snakemake calls it with the job's wildcards.
sensor_data = optional_conversation_input,
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["CONVERSATION"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.py.
sensor_key = "conversation"
output:
"data/interim/{pid}/conversation_features/conversation_python_{provider_key}.csv"
script:
"../src/features/entry.py"
# LIGHT features (R): runs the shared entry.R and writes one CSV per pid and provider_key.
rule light_r_features:
input:
# expand() yields a list; [0] passes its first path as a plain string.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.R.
sensor_key = "light"
output:
"data/interim/{pid}/light_features/light_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# LIGHT features (Python): runs the shared entry.py and writes one CSV per pid and provider_key.
rule light_python_features:
input:
# expand() yields a list; [0] passes its first path as a plain string.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["LIGHT"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.py.
sensor_key = "light"
output:
"data/interim/{pid}/light_features/light_python_{provider_key}.csv"
script:
"../src/features/entry.py"
# LOCATIONS features (R): runs the shared entry.R on the processed locations file.
rule locations_r_features:
input:
# The name is built from the table name and LOCATIONS_TO_USE; [0] passes the first expanded path as a string.
sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.R.
sensor_key = "locations"
output:
"data/interim/{pid}/locations_features/locations_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# LOCATIONS features (Python): runs the shared entry.py on the processed locations file.
rule locations_python_features:
input:
# The name is built from the table name and LOCATIONS_TO_USE; [0] passes the first expanded path as a string.
sensor_data = expand("data/interim/{{pid}}/{sensor}_processed_{locations_to_use}_with_datetime.csv", sensor=config["LOCATIONS"]["DB_TABLE"], locations_to_use=config["LOCATIONS"]["LOCATIONS_TO_USE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["LOCATIONS"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.py.
sensor_key = "locations"
output:
"data/interim/{pid}/locations_features/locations_python_{provider_key}.csv"
script:
"../src/features/entry.py"
# MESSAGES features (R): runs the shared entry.R and writes one CSV per pid and provider_key.
rule messages_r_features:
input:
# expand() yields a list; [0] passes its first path as a plain string.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.R.
sensor_key = "messages"
output:
"data/interim/{pid}/messages_features/messages_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# MESSAGES features (Python): runs the shared entry.py and writes one CSV per pid and provider_key.
rule messages_python_features:
input:
# expand() yields a list; [0] passes its first path as a plain string.
sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["MESSAGES"]["DB_TABLE"])[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["MESSAGES"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.py.
sensor_key = "messages"
output:
"data/interim/{pid}/messages_features/messages_python_{provider_key}.csv"
script:
"../src/features/entry.py"
# Builds data/interim/{pid}/screen_episodes.csv from the unified screen table via screen_episodes.R.
rule screen_episodes:
input:
# Named input "screen": the "_with_datetime_unified.csv" file of config["SCREEN"]["DB_TABLE"].
screen = expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["SCREEN"]["DB_TABLE"])
output:
"data/interim/{pid}/screen_episodes.csv"
script:
"../src/features/screen/episodes/screen_episodes.R"
# SCREEN features (R): runs the shared entry.R on the resampled screen episodes.
rule screen_r_features:
input:
sensor_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.R.
sensor_key = "screen"
output:
"data/interim/{pid}/screen_features/screen_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# SCREEN features (Python): runs the shared entry.py on the resampled screen episodes.
rule screen_python_features:
input:
sensor_episodes = "data/interim/{pid}/screen_episodes_resampled_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params:
# Provider settings are read from config, selected by the provider_key wildcard.
provider = lambda wildcards: config["SCREEN"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}",
# Presumably lets the shared entry script identify the sensor; confirm in entry.py.
sensor_key = "screen"
output:
"data/interim/{pid}/screen_features/screen_python_{provider_key}.csv"
script:
"../src/features/entry.py"
rule wifi_r_features: rule wifi_r_features:
input: input:
sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower()), sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower())[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params: params:
provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key], provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}" provider_key = "{provider_key}",
sensor_key = "wifi"
output: output:
"data/interim/{pid}/wifi_features/wifi_r_{provider_key}.csv" "data/interim/{pid}/wifi_features/wifi_r_{provider_key}.csv"
script: script:
"../src/features/wifi/wifi_entry.R" "../src/features/entry.R"
rule wifi_python_features: rule wifi_python_features:
input: input:
sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower()), sensor_data = expand("data/raw/{{pid}}/{sensor_key}_with_datetime_visibleandconnected.csv", sensor_key="WIFI".lower())[0],
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params: params:
provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key], provider = lambda wildcards: config["WIFI"]["PROVIDERS"][wildcards.provider_key],
provider_key = "{provider_key}" provider_key = "{provider_key}",
sensor_key = "wifi"
output: output:
"data/interim/{pid}/wifi_features/wifi_python_{provider_key}.csv" "data/interim/{pid}/wifi_features/wifi_python_{provider_key}.csv"
script: script:
"../src/features/wifi/wifi_entry.py" "../src/features/entry.py"
rule fitbit_heartrate_features: rule fitbit_heartrate_features:
input: input:

View File

@ -1,15 +0,0 @@
import pandas as pd
from ar.ar_base import base_ar_features
# Reads the two CSV inputs handed over by Snakemake, computes the requested
# activity recognition features and writes them to the rule's first output.

# input[0] carries a local_date_time column; input[1] carries start/end date columns.
readings = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"])
delta_date_columns = ["local_start_date_time", "local_end_date_time", "local_start_date", "local_end_date"]
deltas = pd.read_csv(snakemake.input[1], parse_dates=delta_date_columns)

segment = snakemake.params["segment"]
feature_names = snakemake.params["features"]

# Outer-merge onto an empty frame keyed by local_date, exactly as the result is assembled upstream.
computed = base_ar_features(readings, deltas, segment, feature_names)
ar_features = pd.DataFrame(columns=["local_date"]).merge(computed, on="local_date", how="outer")

# Sanity check: one column per requested feature plus local_date.
n_columns = ar_features.shape[1]
assert len(feature_names) + 1 == n_columns, "The number of features in the output dataframe (=" + str(n_columns) + ") does not match the expected value (=" + str(len(feature_names)) + " + 1). Verify your activity recognition feature extraction functions"

ar_features.to_csv(snakemake.output[0], index=False)

View File

@ -0,0 +1,123 @@
import pandas as pd
import numpy as np
def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute per-segment activity recognition features from an episodes CSV.

    Parameters
    ----------
    sensor_data_files : mapping
        ``sensor_data_files["sensor_episodes"]`` is the path of the CSV that is
        loaded with ``pd.read_csv``.
    day_segment :
        Forwarded unchanged to ``filter_data_by_segment``.
    provider : mapping
        ``provider["FEATURES"]`` lists the requested feature names and
        ``provider["ACTIVITY_CLASSES"]`` maps a class name (e.g. ``"STATIONARY"``)
        to the ``activity_name`` labels that belong to it.
    filter_data_by_segment : callable
        Called as ``filter_data_by_segment(episodes, day_segment)``; the frame it
        returns is grouped by its ``local_segment`` column.
    **kwargs :
        Must contain ``chunk_episodes``, a callable applied to the filtered episodes.

    Returns
    -------
    pandas.DataFrame
        A ``local_segment`` column plus one ``ar_rapids_<feature>`` column per
        computed feature. Every segment that has at least one episode gets a row;
        duration features are 0 for segments without a matching activity.
    """
    chunk_episodes = kwargs["chunk_episodes"]

    ar_episodes = pd.read_csv(sensor_data_files["sensor_episodes"])
    activity_classes = provider["ACTIVITY_CLASSES"]

    # name of the features this function can compute
    base_features_names = ["count","mostcommonactivity","countuniqueactivities","durationstationary","durationmobile","durationvehicle"]
    # the subset of requested features this function can compute
    requested_features = provider["FEATURES"]
    features_to_compute = list(set(requested_features) & set(base_features_names))

    ar_features = pd.DataFrame(columns=["local_segment"] + ["ar_rapids_" + x for x in features_to_compute])
    if not ar_episodes.empty:
        ar_episodes = filter_data_by_segment(ar_episodes, day_segment)

        if not ar_episodes.empty:
            # chunk episodes
            ar_episodes = chunk_episodes(ar_episodes)

        if not ar_episodes.empty:
            episodes_by_segment = ar_episodes.groupby(["local_segment"])
            # Fix the output index up front so no segment is dropped, whichever
            # feature happens to be assigned first.
            ar_features = pd.DataFrame(index=episodes_by_segment.size().index)

            if "count" in features_to_compute:
                ar_features["ar_rapids_count"] = episodes_by_segment["episode_id"].count()
            if "mostcommonactivity" in features_to_compute:
                # mode() can return several values on ties; the first one is kept
                ar_features["ar_rapids_mostcommonactivity"] = episodes_by_segment["activity_type"].agg(lambda x: pd.Series.mode(x)[0])
            if "countuniqueactivities" in features_to_compute:
                ar_features["ar_rapids_countuniqueactivities"] = episodes_by_segment["activity_type"].nunique()

            # duration features
            for column, activity_labels in activity_classes.items():
                feature_name = "duration" + column.lower()
                if feature_name in features_to_compute:
                    filtered_data = ar_episodes[ar_episodes["activity_name"].isin(pd.Series(activity_labels))]
                    durations = filtered_data.groupby(["local_segment"])["duration"].sum()
                    # Name the column after the requested feature (matches the
                    # empty-result schema) and report 0 for segments without
                    # any episode of this class.
                    ar_features["ar_rapids_" + feature_name] = durations.reindex(ar_features.index, fill_value=0)

            ar_features.index.names = ["local_segment"]
            ar_features = ar_features.reset_index()

    return ar_features
"""
if not ar_data.empty:
ar_data = filter_data_by_segment(ar_data, day_segment)
if not ar_data.empty:
# chunk_episodes
ar_data = chunk_episodes(ar_data)
if not ar_data.empty:
ar_data["episode_id"] = ((ar_data.ar_status != ar_data.ar_status.shift()) | (ar_data.start_timestamp - ar_data.end_timestamp.shift() > 1)).cumsum()
grouped = ar_data.groupby(by=["local_segment", "episode_id", "ar_status"])
ar_episodes= grouped[["duration"]].sum()
ar_episodes["ar_diff"] = grouped["ar_level"].first() - grouped["ar_level"].last()
ar_episodes["ar_consumption_rate"] = ar_episodes["ar_diff"] / ar_episodes["duration"]
ar_episodes.reset_index(inplace=True)
# for discharge episodes
ar_discharge_episodes = ar_episodes[(ar_episodes["ar_status"] == 3) | (ar_episodes["ar_status"] == 4)]
ar_discharge_features = pd.DataFrame()
if "countdischarge" in features_to_compute:
ar_discharge_features["ar_rapids_countdischarge"] = ar_discharge_episodes.groupby(["local_segment"])["episode_id"].count()
if "sumdurationdischarge" in features_to_compute:
ar_discharge_features["ar_rapids_sumdurationdischarge"] = ar_discharge_episodes.groupby(["local_segment"])["duration"].sum()
if "avgconsumptionrate" in features_to_compute:
ar_discharge_features["ar_rapids_avgconsumptionrate"] = ar_discharge_episodes.groupby(["local_segment"])["ar_consumption_rate"].mean()
if "maxconsumptionrate" in features_to_compute:
ar_discharge_features["ar_rapids_maxconsumptionrate"] = ar_discharge_episodes.groupby(["local_segment"])["ar_consumption_rate"].max()
# for charge episodes
ar_charge_episodes = ar_episodes[(ar_episodes["ar_status"] == 2) | (ar_episodes["ar_status"] == 5)]
ar_charge_features = pd.DataFrame()
if "countcharge" in features_to_compute:
ar_charge_features["ar_rapids_countcharge"] = ar_charge_episodes.groupby(["local_segment"])["episode_id"].count()
if "sumdurationcharge" in features_to_compute:
ar_charge_features["ar_rapids_sumdurationcharge"] = ar_charge_episodes.groupby(["local_segment"])["duration"].sum()
# combine discharge features and charge features; fill the missing values with ZERO
ar_features = pd.concat([ar_discharge_features, ar_charge_features], axis=1, sort=True).fillna(0)
ar_features.index.rename("local_segment", inplace=True)
ar_features = ar_features.reset_index()
return ar_features
"""

View File

@ -1,13 +0,0 @@
source("renv/activate.R")
source("src/features/utils/utils.R")
library("dplyr")
library("tidyr")
# Paths supplied by the Snakemake rule
day_segments_file <- snakemake@input[["day_segments_labels"]]
sensor_data_file <- snakemake@input[["sensor_data"]]

# `provider` is extracted with `[[`; `provider_key` is kept exactly as `[` returns it
provider_key <- snakemake@params["provider_key"]
provider <- snakemake@params["provider"][["provider"]]

# Compute the provider's features for applications_foreground and save them
sensor_features <- fetch_provider_features(
  provider,
  provider_key,
  "applications_foreground",
  sensor_data_file,
  day_segments_file
)
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load src/features/utils/utils.py by file path and take fetch_provider_features from it
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# "sensor_data" is indexed with [0], so the first file of that input is used
sensor_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "applications_foreground",
    snakemake.input["sensor_data"][0],
    snakemake.input["day_segments_labels"],
)
sensor_features.to_csv(snakemake.output[0], index=False)

View File

@ -9,28 +9,31 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features
if "timeoffirstuse" in requested_features: if "timeoffirstuse" in requested_features:
time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
if time_first_event.empty: if time_first_event.empty:
apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = np.nan apps_features["apps_rapids_timeoffirstuse" + apps_type] = np.nan
else: else:
apps_features["apps_rapids" + "_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"] apps_features["apps_rapids_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
if "timeoflastuse" in requested_features: if "timeoflastuse" in requested_features:
time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment") time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
if time_last_event.empty: if time_last_event.empty:
apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = np.nan apps_features["apps_rapids_timeoflastuse" + apps_type] = np.nan
else: else:
apps_features["apps_rapids" + "_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"] apps_features["apps_rapids_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
if "frequencyentropy" in requested_features: if "frequencyentropy" in requested_features:
apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index() apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
if (len(apps_with_count.index) < 2 ): if (len(apps_with_count.index) < 2 ):
apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = np.nan apps_features["apps_rapids_frequencyentropy" + apps_type] = np.nan
else: else:
apps_features["apps_rapids" + "_frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy) apps_features["apps_rapids_frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy)
if "count" in requested_features: if "count" in requested_features:
apps_features["apps_rapids" + "_count" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"] apps_features["apps_rapids_count" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"]
apps_features.fillna(value={"apps_rapids" + "_count" + apps_type: 0}, inplace=True) apps_features.fillna(value={"apps_rapids_count" + apps_type: 0}, inplace=True)
return apps_features return apps_features
def rapids_features(apps_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
apps_data = pd.read_csv(sensor_data_files["sensor_data"])
requested_features = provider["FEATURES"] requested_features = provider["FEATURES"]
excluded_categories = provider["EXCLUDED_CATEGORIES"] excluded_categories = provider["EXCLUDED_CATEGORIES"]
excluded_apps = provider["EXCLUDED_APPS"] excluded_apps = provider["EXCLUDED_APPS"]
@ -49,10 +52,8 @@ def rapids_features(apps_data, day_segment, provider, filter_data_by_segment, *a
apps_data = apps_data[~apps_data["genre"].isin(excluded_categories)] apps_data = apps_data[~apps_data["genre"].isin(excluded_categories)]
# exclude apps in the excluded_apps list # exclude apps in the excluded_apps list
apps_data = apps_data[~apps_data["package_name"].isin(excluded_apps)] apps_data = apps_data[~apps_data["package_name"].isin(excluded_apps)]
apps_features = pd.DataFrame(columns=["local_segment"] + ["apps_rapids_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)]])
apps_features = pd.DataFrame(columns=["local_segment"] + ["apps_rapids_" + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)]])
if not apps_data.empty: if not apps_data.empty:
# deep copy the apps_data for the top1global computation # deep copy the apps_data for the top1global computation
apps_data_global = apps_data.copy() apps_data_global = apps_data.copy()

View File

@ -1,63 +0,0 @@
import pandas as pd
import numpy as np
import scipy.stats as stats
from features_utils import splitOvernightEpisodes, splitMultiSegmentEpisodes
def base_ar_features(ar_data, ar_deltas, day_segment, requested_features):
    """Compute daily activity recognition features for one day segment.

    ar_data holds the activity samples (its local_date_time column is converted
    to datetime in place and used as the resampling index); ar_deltas holds the
    activity episodes whose time_diff is summed per local_start_date.
    Returns a frame with a local_date column and one "ar_<day_segment>_<feature>"
    column per computed feature; the frame has no rows when ar_data is empty or
    no sample belongs to the segment.
    """
    # features this function knows how to compute, intersected with the request
    supported_features = ["count","mostcommonactivity","countuniqueactivities","activitychangecount","sumstationary","summobile","sumvehicle"]
    features_to_compute = list(set(requested_features) & set(supported_features))
    column_prefix = "ar_" + day_segment + "_"

    ar_features = pd.DataFrame(columns=["local_date"] + [column_prefix + name for name in features_to_compute])
    if ar_data.empty:
        return ar_features

    segment_is_daily = day_segment == "daily"

    ar_deltas = splitOvernightEpisodes(ar_deltas, [], ["activity"])
    if not segment_is_daily:
        ar_deltas = splitMultiSegmentEpisodes(ar_deltas, day_segment, [])

    # index the samples by their local date-time so they can be resampled per day
    ar_data.local_date_time = pd.to_datetime(ar_data.local_date_time)
    samples = ar_data.set_index(ar_data.local_date_time)
    samples.drop(columns=["local_date_time"], inplace=True)
    if not segment_is_daily:
        samples = samples.loc[samples["local_day_segment"] == day_segment]
    if samples.empty:
        return ar_features

    def most_common(values):
        modes = stats.mode(values)[0]
        return modes if len(modes) != 0 else None

    ar_features = pd.DataFrame()

    # number of samples per day
    if "count" in features_to_compute:
        ar_features[column_prefix + "count"] = samples["activity_type"].resample("D").count()

    # most frequent activity per day
    if "mostcommonactivity" in features_to_compute:
        ar_features[column_prefix + "mostcommonactivity"] = samples["activity_type"].resample("D").apply(most_common)

    # number of distinct activities per day
    if "countuniqueactivities" in features_to_compute:
        ar_features[column_prefix + "countuniqueactivities"] = samples["activity_type"].resample("D").nunique()

    # number of times the activity differs from the previous sample
    if "activitychangecount" in features_to_compute:
        first_activity = samples["activity_type"].head(1)
        samples["activity_type_shift"] = samples["activity_type"].shift().fillna(first_activity)
        samples["different_activity"] = np.where(samples["activity_type"] != samples["activity_type_shift"], 1, 0)
        ar_features[column_prefix + "activitychangecount"] = samples["different_activity"].resample("D").sum()

    # summed episode time per activity class
    delta_classes = (
        ("sumstationary", ["still","tilting"]),
        ("summobile", ["on_foot","walking","running","on_bicycle"]),
        ("sumvehicle", ["in_vehicle"]),
    )
    for feature, labels in delta_classes:
        if feature not in features_to_compute:
            continue
        matching = ar_deltas[ar_deltas["activity"].isin(pd.Series(labels))]
        if matching.empty:
            ar_features[column_prefix + feature] = 0
        else:
            ar_features[column_prefix + feature] = matching.groupby(["local_start_date"])["time_diff"].sum().fillna(0)

    ar_features.index.names = ["local_date"]
    return ar_features.reset_index()

View File

@ -1,13 +0,0 @@
source("renv/activate.R")
source("src/features/utils/utils.R")
library("dplyr")
library("tidyr")
# Paths supplied by the Snakemake rule; the sensor file is the "battery_episodes" input
day_segments_file <- snakemake@input[["day_segments_labels"]]
sensor_data_file <- snakemake@input[["battery_episodes"]]

# `provider` is extracted with `[[`; `provider_key` is kept exactly as `[` returns it
provider_key <- snakemake@params["provider_key"]
provider <- snakemake@params["provider"][["provider"]]

# Compute the provider's battery features and save them
sensor_features <- fetch_provider_features(
  provider,
  provider_key,
  "battery",
  sensor_data_file,
  day_segments_file
)
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load src/features/utils/utils.py by file path and take fetch_provider_features from it
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# The battery sensor file comes from the "battery_episodes" input (used as is, not indexed)
sensor_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "battery",
    snakemake.input["battery_episodes"],
    snakemake.input["day_segments_labels"],
)
sensor_features.to_csv(snakemake.output[0], index=False)

View File

@ -1,8 +1,9 @@
import pandas as pd import pandas as pd
from datetime import datetime, timedelta, time from datetime import datetime, timedelta, time
def rapids_features(battery_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
battery_data = pd.read_csv(sensor_data_files["sensor_episodes"])
chunk_episodes = kwargs["chunk_episodes"] chunk_episodes = kwargs["chunk_episodes"]
# name of the features this function can compute # name of the features this function can compute

View File

@ -1,13 +0,0 @@
import pandas as pd
from battery.battery_base import base_battery_features
# Read the CSV input handed over by Snakemake, parsing its date columns.
battery_episodes = pd.read_csv(
    snakemake.input[0],
    parse_dates=["local_start_date_time", "local_end_date_time", "local_start_date", "local_end_date"],
)
requested_features = snakemake.params["features"]
day_segment = snakemake.params["day_segment"]

# Outer-merge onto a frame that only has "local_date" so the key column always exists.
computed = base_battery_features(battery_episodes, day_segment, requested_features)
battery_features = pd.DataFrame(columns=["local_date"]).merge(computed, on="local_date", how="outer")

# One column per requested feature plus the "local_date" key is expected.
actual_columns = battery_features.shape[1]
assert len(requested_features) + 1 == actual_columns, "The number of features in the output dataframe (=" + str(actual_columns) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your battery feature extraction functions"

battery_features.to_csv(snakemake.output[0], index=False)

View File

@ -1,13 +0,0 @@
source("renv/activate.R")
source("src/features/utils/utils.R")
library("dplyr")
library("tidyr")
# Paths supplied by the Snakemake rule
day_segments_file <- snakemake@input[["day_segments_labels"]]
sensor_data_file <- snakemake@input[["sensor_data"]]

# `provider` is extracted with `[[`; `provider_key` is kept exactly as `[` returns it
provider_key <- snakemake@params["provider_key"]
provider <- snakemake@params["provider"][["provider"]]

# Compute the provider's bluetooth features and save them
sensor_features <- fetch_provider_features(
  provider,
  provider_key,
  "bluetooth",
  sensor_data_file,
  day_segments_file
)
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load src/features/utils/utils.py by file path and take fetch_provider_features from it
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# "sensor_data" is indexed with [0], so the first file of that input is used
sensor_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "bluetooth",
    snakemake.input["sensor_data"][0],
    snakemake.input["day_segments_labels"],
)
sensor_features.to_csv(snakemake.output[0], index=False)

View File

@ -27,24 +27,26 @@ compute_bluetooth_feature <- function(data, feature, day_segment){
} }
} }
rapids_features <- function(bluetooth_data, day_segment, provider){ rapids_features <- function(sensor_data_files, day_segment, provider){
requested_features <- provider[["FEATURES"]]
bluetooth_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
# Output dataframe requested_features <- provider[["FEATURES"]]
features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
# Output dataframe
features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
# The name of the features this function can compute # The name of the features this function can compute
base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice") base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice")
# The subset of requested features this function can compute # The subset of requested features this function can compute
features_to_compute <- intersect(base_features_names, requested_features) features_to_compute <- intersect(base_features_names, requested_features)
for(feature_name in features_to_compute){ for(feature_name in features_to_compute){
feature <- compute_bluetooth_feature(bluetooth_data, feature_name, day_segment) feature <- compute_bluetooth_feature(bluetooth_data, feature_name, day_segment)
features <- merge(features, feature, by="local_segment", all = TRUE) features <- merge(features, feature, by="local_segment", all = TRUE)
} }
features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0))) features <- features %>% mutate_at(vars(contains("countscansmostuniquedevice")), list( ~ replace_na(., 0)))
return(features) return(features)
} }

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load src/features/utils/utils.py by file path and take fetch_provider_features from it
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# "sensor_data" is indexed with [0], so the first file of that input is used
sensor_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "calls",
    snakemake.input["sensor_data"][0],
    snakemake.input["day_segments_labels"],
)
sensor_features.to_csv(snakemake.output[0], index=False)

View File

@ -62,8 +62,9 @@ call_features_of_type <- function(calls, call_type, day_segment, requested_featu
return(features) return(features)
} }
rapids_features <- function(calls, day_segment, provider){ rapids_features <- function(sensor_data_files, day_segment, provider){
calls <- calls %>% filter_data_by_segment(day_segment) calls_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
calls_data <- calls_data %>% filter_data_by_segment(day_segment)
call_types = provider[["CALL_TYPES"]] call_types = provider[["CALL_TYPES"]]
call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment")) call_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
@ -74,7 +75,7 @@ rapids_features <- function(calls, day_segment, provider){
stop(paste("Call type can online be incoming, outgoing or missed but instead you typed: ", call_type, " in config[CALLS][CALL_TYPES]")) stop(paste("Call type can online be incoming, outgoing or missed but instead you typed: ", call_type, " in config[CALLS][CALL_TYPES]"))
requested_features <- provider[["FEATURES"]][[call_type]] requested_features <- provider[["FEATURES"]][[call_type]]
calls_of_type <- calls %>% filter(call_type == call_type_label) calls_of_type <- calls_data %>% filter(call_type == call_type_label)
features <- call_features_of_type(calls_of_type, call_type, day_segment, requested_features) features <- call_features_of_type(calls_of_type, call_type, day_segment, requested_features)
call_features <- merge(call_features, features, all=TRUE) call_features <- merge(call_features, features, all=TRUE)

View File

@ -1,13 +0,0 @@
source("renv/activate.R")
source("src/features/utils/utils.R")
library("dplyr")
library("tidyr")
# Paths supplied by the Snakemake rule
day_segments_file <- snakemake@input[["day_segments_labels"]]
sensor_data_file <- snakemake@input[["sensor_data"]]

# `provider` is extracted with `[[`; `provider_key` is kept exactly as `[` returns it
provider_key <- snakemake@params["provider_key"]
provider <- snakemake@params["provider"][["provider"]]

# Compute the provider's conversation features and save them
sensor_features <- fetch_provider_features(
  provider,
  provider_key,
  "conversation",
  sensor_data_file,
  day_segments_file
)
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load src/features/utils/utils.py by file path and take fetch_provider_features from it
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# "sensor_data" is indexed with [0], so the first file of that input is used
sensor_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "conversation",
    snakemake.input["sensor_data"][0],
    snakemake.input["day_segments_labels"],
)
sensor_features.to_csv(snakemake.output[0], index=False)

View File

@ -1,8 +1,9 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
# def rapids_features(conversation_data, day_segment, requested_features,recordingMinutes,pausedMinutes,expectedMinutes): def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
def rapids_features(conversation_data, day_segment, provider, filter_data_by_segment, *args, **kwargs):
conversation_data = pd.read_csv(sensor_data_files["sensor_data"])
requested_features = provider["FEATURES"] requested_features = provider["FEATURES"]
recordingMinutes = provider["RECORDING_MINUTES"] recordingMinutes = provider["RECORDING_MINUTES"]
@ -20,7 +21,7 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg
# the subset of requested features this function can compute # the subset of requested features this function can compute
features_to_compute = list(set(requested_features) & set(base_features_names)) features_to_compute = list(set(requested_features) & set(base_features_names))
conversation_features = pd.DataFrame(columns=["local_segment"] + ["conversation_rapids" + "_" + x for x in features_to_compute]) conversation_features = pd.DataFrame(columns=["local_segment"] + ["conversation_rapids_" + x for x in features_to_compute])
if not conversation_data.empty: if not conversation_data.empty:
conversation_data = filter_data_by_segment(conversation_data, day_segment) conversation_data = filter_data_by_segment(conversation_data, day_segment)
@ -30,19 +31,19 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg
conversation_data = conversation_data.drop_duplicates(subset=["local_date", "local_time"], keep="first") conversation_data = conversation_data.drop_duplicates(subset=["local_date", "local_time"], keep="first")
if "minutessilence" in features_to_compute: if "minutessilence" in features_to_compute:
conversation_features["conversation_rapids" + "_minutessilence"] = conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60 conversation_features["conversation_rapids_minutessilence"] = conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60
if "minutesnoise" in features_to_compute: if "minutesnoise" in features_to_compute:
conversation_features["conversation_rapids" + "_minutesnoise"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60 conversation_features["conversation_rapids_minutesnoise"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60
if "minutesvoice" in features_to_compute: if "minutesvoice" in features_to_compute:
conversation_features["conversation_rapids" + "_minutesvoice"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60 conversation_features["conversation_rapids_minutesvoice"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60
if "minutesunknown" in features_to_compute: if "minutesunknown" in features_to_compute:
conversation_features["conversation_rapids" + "_minutesunknown"] = conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60 conversation_features["conversation_rapids_minutesunknown"] = conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60
if "countconversation" in features_to_compute: if "countconversation" in features_to_compute:
conversation_features["conversation_rapids" + "_countconversation"] = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['double_convo_start'].nunique() conversation_features["conversation_rapids_countconversation"] = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['double_convo_start'].nunique()
conv_duration = (conversation_data['double_convo_end']/1000 - conversation_data['double_convo_start']/1000)/60 conv_duration = (conversation_data['double_convo_end']/1000 - conversation_data['double_convo_start']/1000)/60
conversation_data = conversation_data.assign(conv_duration = conv_duration.values) conversation_data = conversation_data.assign(conv_duration = conv_duration.values)
@ -50,43 +51,43 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg
conv_totalDuration = conversation_data[(conversation_data['inference'] >= 0) & (conversation_data['inference'] < 4)].groupby(["local_segment"])['inference'].count()/60 conv_totalDuration = conversation_data[(conversation_data['inference'] >= 0) & (conversation_data['inference'] < 4)].groupby(["local_segment"])['inference'].count()/60
if "silencesensedfraction" in features_to_compute: if "silencesensedfraction" in features_to_compute:
conversation_features["conversation_rapids" + "_silencesensedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration conversation_features["conversation_rapids_silencesensedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
if "noisesensedfraction" in features_to_compute: if "noisesensedfraction" in features_to_compute:
conversation_features["conversation_rapids" + "_noisesensedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration conversation_features["conversation_rapids_noisesensedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
if "voicesensedfraction" in features_to_compute: if "voicesensedfraction" in features_to_compute:
conversation_features["conversation_rapids" + "_voicesensedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration conversation_features["conversation_rapids_voicesensedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
if "unknownsensedfraction" in features_to_compute: if "unknownsensedfraction" in features_to_compute:
conversation_features["conversation_rapids" + "_unknownsensedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration conversation_features["conversation_rapids_unknownsensedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
if "silenceexpectedfraction" in features_to_compute: if "silenceexpectedfraction" in features_to_compute:
conversation_features["conversation_rapids" + "_silenceexpectedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes conversation_features["conversation_rapids_silenceexpectedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
if "noiseexpectedfraction" in features_to_compute: if "noiseexpectedfraction" in features_to_compute:
conversation_features["conversation_rapids" + "_noiseexpectedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes conversation_features["conversation_rapids_noiseexpectedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
if "voiceexpectedfraction" in features_to_compute: if "voiceexpectedfraction" in features_to_compute:
conversation_features["conversation_rapids" + "_voiceexpectedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes conversation_features["conversation_rapids_voiceexpectedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
if "unknownexpectedfraction" in features_to_compute: if "unknownexpectedfraction" in features_to_compute:
conversation_features["conversation_rapids" + "_unknownexpectedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes conversation_features["conversation_rapids_unknownexpectedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
if "sumconversationduration" in features_to_compute: if "sumconversationduration" in features_to_compute:
conversation_features["conversation_rapids" + "_sumconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].sum() conversation_features["conversation_rapids_sumconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].sum()
if "avgconversationduration" in features_to_compute: if "avgconversationduration" in features_to_compute:
conversation_features["conversation_rapids" + "_avgconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].mean() conversation_features["conversation_rapids_avgconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].mean()
if "sdconversationduration" in features_to_compute: if "sdconversationduration" in features_to_compute:
conversation_features["conversation_rapids" + "_sdconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].std() conversation_features["conversation_rapids_sdconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].std()
if "minconversationduration" in features_to_compute: if "minconversationduration" in features_to_compute:
conversation_features["conversation_rapids" + "_minconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].min() conversation_features["conversation_rapids_minconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].min()
if "maxconversationduration" in features_to_compute: if "maxconversationduration" in features_to_compute:
conversation_features["conversation_rapids" + "_maxconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].max() conversation_features["conversation_rapids_maxconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].max()
if "timefirstconversation" in features_to_compute: if "timefirstconversation" in features_to_compute:
timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].min() timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].min()
@ -94,9 +95,9 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg
for date in list(timestampsLastConversation.index): for date in list(timestampsLastConversation.index):
lastimestamp = timestampsLastConversation.loc[date] lastimestamp = timestampsLastConversation.loc[date]
lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0] lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0]
conversation_features.loc[date,"conversation_rapids" + "_timefirstconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1]) conversation_features.loc[date,"conversation_rapids_timefirstconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
else: else:
conversation_features["conversation_rapids" + "_timefirstconversation"] = np.nan conversation_features["conversation_rapids_timefirstconversation"] = np.nan
if "timelastconversation" in features_to_compute: if "timelastconversation" in features_to_compute:
timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].max() timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].max()
@ -104,39 +105,39 @@ def rapids_features(conversation_data, day_segment, provider, filter_data_by_seg
for date in list(timestampsLastConversation.index): for date in list(timestampsLastConversation.index):
lastimestamp = timestampsLastConversation.loc[date] lastimestamp = timestampsLastConversation.loc[date]
lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0] lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0]
conversation_features.loc[date,"conversation_rapids" + "_timelastconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1]) conversation_features.loc[date,"conversation_rapids_timelastconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
else: else:
conversation_features["conversation_rapids" + "_timelastconversation"] = np.nan conversation_features["conversation_rapids_timelastconversation"] = np.nan
if "noisesumenergy" in features_to_compute: if "noisesumenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].sum() conversation_features["conversation_rapids_noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].sum()
if "noiseavgenergy" in features_to_compute: if "noiseavgenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].mean() conversation_features["conversation_rapids_noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].mean()
if "noisesdenergy" in features_to_compute: if "noisesdenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].std() conversation_features["conversation_rapids_noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].std()
if "noiseminenergy" in features_to_compute: if "noiseminenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].min() conversation_features["conversation_rapids_noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].min()
if "noisemaxenergy" in features_to_compute: if "noisemaxenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].max() conversation_features["conversation_rapids_noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].max()
if "voicesumenergy" in features_to_compute: if "voicesumenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].sum() conversation_features["conversation_rapids_voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].sum()
if "voiceavgenergy" in features_to_compute: if "voiceavgenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].mean() conversation_features["conversation_rapids_voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].mean()
if "voicesdenergy" in features_to_compute: if "voicesdenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].std() conversation_features["conversation_rapids_voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].std()
if "voiceminenergy" in features_to_compute: if "voiceminenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].min() conversation_features["conversation_rapids_voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].min()
if "voicemaxenergy" in features_to_compute: if "voicemaxenergy" in features_to_compute:
conversation_features["conversation_rapids" + "_voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].max() conversation_features["conversation_rapids_voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].max()
conversation_features = conversation_features.reset_index() conversation_features = conversation_features.reset_index()

View File

@ -3,11 +3,14 @@ source("src/features/utils/utils.R")
library("dplyr") library("dplyr")
library("tidyr") library("tidyr")
sensor_data_file <- snakemake@input[["sensor_data"]] sensor_data_files <- snakemake@input
day_segments_file <- snakemake@input[["day_segments_labels"]] sensor_data_files$day_segments_labels <- NULL
day_segments_file <- snakemake@input[["day_segments_labels"]]
provider <- snakemake@params["provider"][["provider"]] provider <- snakemake@params["provider"][["provider"]]
provider_key <- snakemake@params["provider_key"] provider_key <- snakemake@params["provider_key"]
sensor_key <- snakemake@params["sensor_key"]
sensor_features <- fetch_provider_features(provider, provider_key, "calls", sensor_data_file, day_segments_file) sensor_features <- fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, day_segments_file)
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE) write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -0,0 +1,14 @@
import pandas as pd
from utils.utils import fetch_provider_features
# Snakemake entry script shared by the sensors: gather the rule's inputs and
# params, let fetch_provider_features compute the features and save them as CSV.

# Every named input except the day segment labels is passed on as one dict;
# the labels file is passed separately.
provider_inputs = dict(snakemake.input)
provider_inputs.pop("day_segments_labels")
segments_path = snakemake.input["day_segments_labels"]

computed_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    snakemake.params["sensor_key"],
    provider_inputs,
    segments_path,
)

# The features go to the first output declared by the rule, without the index.
computed_features.to_csv(snakemake.output[0], index=False)

View File

@ -1,13 +0,0 @@
# Snakemake entry script: computes the "light" features of one provider and
# writes them as CSV to the first output of the rule.
# NOTE(review): presumably activates the project's renv library - confirm.
source("renv/activate.R")
# fetch_provider_features() is not defined in this script; presumably it is
# provided by this utilities file - TODO confirm.
source("src/features/utils/utils.R")
library("dplyr")
library("tidyr")
# Named inputs of the rule: the sensor data file and the day segment labels file.
sensor_data_file <- snakemake@input[["sensor_data"]]
day_segments_file <- snakemake@input[["day_segments_labels"]]
# The provider settings are unwrapped with [[ ]] after the [ ] subset.
provider <- snakemake@params["provider"][["provider"]]
# NOTE(review): single-bracket indexing; if params is a list this keeps
# provider_key wrapped in a one-element list instead of a bare value.
provider_key <- snakemake@params["provider_key"]
# "light" is the sensor/config key handed to the shared helper.
sensor_features <- fetch_provider_features(provider, provider_key, "light", sensor_data_file, day_segments_file)
# Save the result without row names.
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load utils/utils.py (located beside this script's folder) straight from its
# path and take fetch_provider_features from it.
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# Inputs of the rule: the first "sensor_data" file and the day segment labels.
light_data_path = snakemake.input["sensor_data"][0]
segments_path = snakemake.input["day_segments_labels"]

light_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "light",
    light_data_path,
    segments_path,
)

# The features go to the first output declared by the rule, without the index.
light_features.to_csv(snakemake.output[0], index=False)

View File

@ -1,33 +1,35 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
def rapids_features(light_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
light_data = pd.read_csv(sensor_data_files["sensor_data"])
requested_features = provider["FEATURES"] requested_features = provider["FEATURES"]
# name of the features this function can compute # name of the features this function can compute
base_features_names = ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] base_features_names = ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
# the subset of requested features this function can compute # the subset of requested features this function can compute
features_to_compute = list(set(requested_features) & set(base_features_names)) features_to_compute = list(set(requested_features) & set(base_features_names))
light_features = pd.DataFrame(columns=["local_segment"] + ["light_rapids_" + "_" + x for x in features_to_compute]) light_features = pd.DataFrame(columns=["local_segment"] + ["light_rapids_" + x for x in features_to_compute])
if not light_data.empty: if not light_data.empty:
light_data = filter_data_by_segment(light_data, day_segment) light_data = filter_data_by_segment(light_data, day_segment)
if not light_data.empty: if not light_data.empty:
light_features = pd.DataFrame() light_features = pd.DataFrame()
if "count" in features_to_compute: if "count" in features_to_compute:
light_features["light_rapids_" + "_count"] = light_data.groupby(["local_segment"]).count()["timestamp"] light_features["light_rapids_count"] = light_data.groupby(["local_segment"]).count()["timestamp"]
# get light ambient luminance related features # get light ambient luminance related features
if "maxlux" in features_to_compute: if "maxlux" in features_to_compute:
light_features["light_rapids_" + "_maxlux"] = light_data.groupby(["local_segment"])["double_light_lux"].max() light_features["light_rapids_maxlux"] = light_data.groupby(["local_segment"])["double_light_lux"].max()
if "minlux" in features_to_compute: if "minlux" in features_to_compute:
light_features["light_rapids_" + "_minlux"] = light_data.groupby(["local_segment"])["double_light_lux"].min() light_features["light_rapids_minlux"] = light_data.groupby(["local_segment"])["double_light_lux"].min()
if "avglux" in features_to_compute: if "avglux" in features_to_compute:
light_features["light_rapids_" + "_avglux"] = light_data.groupby(["local_segment"])["double_light_lux"].mean() light_features["light_rapids_avglux"] = light_data.groupby(["local_segment"])["double_light_lux"].mean()
if "medianlux" in features_to_compute: if "medianlux" in features_to_compute:
light_features["light_rapids_" + "_medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median() light_features["light_rapids_medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median()
if "stdlux" in features_to_compute: if "stdlux" in features_to_compute:
light_features["light_rapids_" + "_stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std() light_features["light_rapids_stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std()
light_features = light_features.reset_index() light_features = light_features.reset_index()

View File

@ -27,8 +27,11 @@ create_empty_file <- function(requested_features){
) %>% select(all_of(requested_features))) ) %>% select(all_of(requested_features)))
} }
barnett_features <- function(location_data, day_segment, params){ barnett_features <- function(sensor_data_files, day_segment, params){
location_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
location_features <- NULL location_features <- NULL
location <- location_data location <- location_data
accuracy_limit <- params[["ACCURACY_LIMIT"]] accuracy_limit <- params[["ACCURACY_LIMIT"]]
timezone <- params[["TIMEZONE"]] timezone <- params[["TIMEZONE"]]

View File

@ -4,7 +4,9 @@ from astropy.timeseries import LombScargle
from sklearn.cluster import DBSCAN from sklearn.cluster import DBSCAN
from math import radians, cos, sin, asin, sqrt from math import radians, cos, sin, asin, sqrt
def doryab_features(location_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
location_data = pd.read_csv(sensor_data_files["sensor_data"])
requested_features = provider["FEATURES"] requested_features = provider["FEATURES"]
dbscan_eps = provider["DBSCAN_EPS"] dbscan_eps = provider["DBSCAN_EPS"]
dbscan_minsamples = provider["DBSCAN_MINSAMPLES"] dbscan_minsamples = provider["DBSCAN_MINSAMPLES"]

View File

@ -1,13 +0,0 @@
# Snakemake entry script: computes the "locations" features of one provider
# and writes them as CSV to the first output of the rule.
# NOTE(review): presumably activates the project's renv library - confirm.
source("renv/activate.R")
# fetch_provider_features() is not defined in this script; presumably it is
# provided by this utilities file - TODO confirm.
source("src/features/utils/utils.R")
library("dplyr")
library("tidyr")
# Named inputs of the rule: the sensor data file and the day segment labels file.
sensor_data_file <- snakemake@input[["sensor_data"]]
day_segments_file <- snakemake@input[["day_segments_labels"]]
# The provider settings are unwrapped with [[ ]] after the [ ] subset.
provider <- snakemake@params["provider"][["provider"]]
# NOTE(review): single-bracket indexing; if params is a list this keeps
# provider_key wrapped in a one-element list instead of a bare value.
provider_key <- snakemake@params["provider_key"]
# "locations" is the sensor/config key handed to the shared helper.
sensor_features <- fetch_provider_features(provider, provider_key, "locations", sensor_data_file, day_segments_file)
# Save the result without row names.
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load utils/utils.py (located beside this script's folder) straight from its
# path and take fetch_provider_features from it.
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# Inputs of the rule: the first "sensor_data" file and the day segment labels.
locations_data_path = snakemake.input["sensor_data"][0]
segments_path = snakemake.input["day_segments_labels"]

locations_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "locations",
    locations_data_path,
    segments_path,
)

# The features go to the first output declared by the rule, without the index.
locations_features.to_csv(snakemake.output[0], index=False)

View File

@ -1,13 +0,0 @@
# Snakemake entry script: computes the "messages" features of one provider
# and writes them as CSV to the first output of the rule.
# NOTE(review): presumably activates the project's renv library - confirm.
source("renv/activate.R")
# fetch_provider_features() is not defined in this script; presumably it is
# provided by this utilities file - TODO confirm.
source("src/features/utils/utils.R")
library("dplyr")
library("tidyr")
# Named inputs of the rule: the sensor data file and the day segment labels file.
sensor_data_file <- snakemake@input[["sensor_data"]]
day_segments_file <- snakemake@input[["day_segments_labels"]]
# The provider settings are unwrapped with [[ ]] after the [ ] subset.
provider <- snakemake@params["provider"][["provider"]]
# NOTE(review): single-bracket indexing; if params is a list this keeps
# provider_key wrapped in a one-element list instead of a bare value.
provider_key <- snakemake@params["provider_key"]
# "messages" is the sensor/config key handed to the shared helper.
sensor_features <- fetch_provider_features(provider, provider_key, "messages", sensor_data_file, day_segments_file)
# Save the result without row names.
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load utils/utils.py (located beside this script's folder) straight from its
# path and take fetch_provider_features from it.
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# Inputs of the rule: the first "sensor_data" file and the day segment labels.
messages_data_path = snakemake.input["sensor_data"][0]
segments_path = snakemake.input["day_segments_labels"]

messages_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "messages",
    messages_data_path,
    segments_path,
)

# The features go to the first output declared by the rule, without the index.
messages_features.to_csv(snakemake.output[0], index=False)

View File

@ -50,8 +50,9 @@ message_features_of_type <- function(messages, messages_type, day_segment, reque
return(features) return(features)
} }
rapids_features <- function(messages, day_segment, provider){ rapids_features <- function(sensor_data_files, day_segment, provider){
messages <- messages %>% filter_data_by_segment(day_segment) messages_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
messages_data <- messages_data %>% filter_data_by_segment(day_segment)
messages_types = provider[["MESSAGES_TYPES"]] messages_types = provider[["MESSAGES_TYPES"]]
messages_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment")) messages_features <- setNames(data.frame(matrix(ncol=1, nrow=0)), c("local_segment"))
@ -62,7 +63,7 @@ rapids_features <- function(messages, day_segment, provider){
stop(paste("Message type can online be received or sent but instead you typed: ", message_type, " in config[MESSAGES][MESSAGES_TYPES]")) stop(paste("Message type can online be received or sent but instead you typed: ", message_type, " in config[MESSAGES][MESSAGES_TYPES]"))
requested_features <- provider[["FEATURES"]][[message_type]] requested_features <- provider[["FEATURES"]][[message_type]]
messages_of_type <- messages %>% filter(message_type == message_type_label) messages_of_type <- messages_data %>% filter(message_type == message_type_label)
features <- message_features_of_type(messages_of_type, message_type, day_segment, requested_features) features <- message_features_of_type(messages_of_type, message_type, day_segment, requested_features)
messages_features <- merge(messages_features, features, all=TRUE) messages_features <- merge(messages_features, features, all=TRUE)

View File

@ -25,7 +25,9 @@ def getEpisodeDurationFeatures(screen_data, day_segment, episode, features, refe
return duration_helper return duration_helper
def rapids_features(screen_data, day_segment, provider, filter_data_by_segment, *args, **kwargs): def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
screen_data = pd.read_csv(sensor_data_files["sensor_episodes"])
reference_hour_first_use = provider["REFERENCE_HOUR_FIRST_USE"] reference_hour_first_use = provider["REFERENCE_HOUR_FIRST_USE"]
requested_features_episodes = provider["FEATURES"] requested_features_episodes = provider["FEATURES"]

View File

@ -1,13 +0,0 @@
# Snakemake entry script: computes the "screen" features of one provider and
# writes them as CSV to the first output of the rule.
# NOTE(review): presumably activates the project's renv library - confirm.
source("renv/activate.R")
# fetch_provider_features() is not defined in this script; presumably it is
# provided by this utilities file - TODO confirm.
source("src/features/utils/utils.R")
library("dplyr")
library("tidyr")
# Unlike a "sensor_data" input, this script reads the rule's
# "screen_episodes" input, plus the day segment labels file.
sensor_data_file <- snakemake@input[["screen_episodes"]]
day_segments_file <- snakemake@input[["day_segments_labels"]]
# The provider settings are unwrapped with [[ ]] after the [ ] subset.
provider <- snakemake@params["provider"][["provider"]]
# NOTE(review): single-bracket indexing; if params is a list this keeps
# provider_key wrapped in a one-element list instead of a bare value.
provider_key <- snakemake@params["provider_key"]
# "screen" is the sensor/config key handed to the shared helper.
sensor_features <- fetch_provider_features(provider, provider_key, "screen", sensor_data_file, day_segments_file)
# Save the result without row names.
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load utils/utils.py (located beside this script's folder) straight from its
# path and take fetch_provider_features from it.
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# Inputs of the rule: the "screen_episodes" input (passed on as is) and the
# day segment labels.
episodes_path = snakemake.input["screen_episodes"]
segments_path = snakemake.input["day_segments_labels"]

screen_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "screen",
    episodes_path,
    segments_path,
)

# The features go to the first output declared by the rule, without the index.
screen_features.to_csv(snakemake.output[0], index=False)

View File

@ -43,24 +43,23 @@ chunk_episodes <- function(sensor_episodes){
return(chunked_episodes) return(chunked_episodes)
} }
fetch_provider_features <- function(provider, provider_key, config_key, sensor_data_file, day_segments_file){ fetch_provider_features <- function(provider, provider_key, sensor_key, sensor_data_files, day_segments_file){
sensor_features <- data.frame(local_segment = character(), stringsAsFactors = FALSE) sensor_features <- data.frame(local_segment = character(), stringsAsFactors = FALSE)
sensor_data <- read.csv(sensor_data_file, stringsAsFactors = FALSE)
day_segments_labels <- read.csv(day_segments_file, stringsAsFactors = FALSE) day_segments_labels <- read.csv(day_segments_file, stringsAsFactors = FALSE)
if(!"FEATURES" %in% names(provider)) if(!"FEATURES" %in% names(provider))
stop(paste0("Provider config[", config_key,"][PROVIDERS][", provider_key,"] is missing a FEATURES attribute in config.yaml")) stop(paste0("Provider config[", sensor_key,"][PROVIDERS][", provider_key,"] is missing a FEATURES attribute in config.yaml"))
if(provider[["COMPUTE"]] == TRUE){ if(provider[["COMPUTE"]] == TRUE){
code_path <- paste0("src/features/", config_key,"/", provider[["SRC_FOLDER"]], "/main.R") code_path <- paste0("src/features/", sensor_key,"/", provider[["SRC_FOLDER"]], "/main.R")
source(code_path) source(code_path)
features_function <- match.fun(paste0(provider[["SRC_FOLDER"]], "_features")) features_function <- match.fun(paste0(provider[["SRC_FOLDER"]], "_features"))
day_segments <- day_segments_labels %>% pull(label) day_segments <- day_segments_labels %>% pull(label)
for (day_segment in day_segments){ for (day_segment in day_segments){
print(paste(rapids_log_tag,"Processing", config_key, provider_key, day_segment)) print(paste(rapids_log_tag,"Processing", sensor_key, provider_key, day_segment))
features <- features_function(sensor_data, day_segment, provider) features <- features_function(sensor_data_files, day_segment, provider)
# Check all features names contain the provider key so they are unique # Check all features names contain the provider key so they are unique
features_names <- colnames(features %>% select(-local_segment)) features_names <- colnames(features %>% select(-local_segment))

View File

@ -67,24 +67,24 @@ def chunk_episodes(sensor_episodes):
return merged_sensor_episodes return merged_sensor_episodes
def fetch_provider_features(provider, provider_key, config_key, sensor_data_file, day_segments_file): def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_files, day_segments_file):
import pandas as pd import pandas as pd
from importlib import import_module, util from importlib import import_module, util
sensor_features = pd.DataFrame(columns=["local_segment"]) sensor_features = pd.DataFrame(columns=["local_segment"])
sensor_data = pd.read_csv(sensor_data_file)
day_segments_labels = pd.read_csv(day_segments_file, header=0) day_segments_labels = pd.read_csv(day_segments_file, header=0)
if "FEATURES" not in provider: if "FEATURES" not in provider:
raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(config_key.upper(), provider_key)) raise ValueError("Provider config[{}][PROVIDERS][{}] is missing a FEATURES attribute in config.yaml".format(sensor_key.upper(), provider_key))
if provider["COMPUTE"] == True: if provider["COMPUTE"] == True:
code_path = provider["SRC_FOLDER"] + ".main"
code_path = sensor_key + "." + provider["SRC_FOLDER"] + ".main"
feature_module = import_module(code_path) feature_module = import_module(code_path)
feature_function = getattr(feature_module, provider["SRC_FOLDER"] + "_features") feature_function = getattr(feature_module, provider["SRC_FOLDER"] + "_features")
for day_segment in day_segments_labels["label"]: for day_segment in day_segments_labels["label"]:
print("{} Processing {} {} {}".format(rapids_log_tag, config_key, provider_key, day_segment)) print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, day_segment))
features = feature_function(sensor_data, day_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes) features = feature_function(sensor_data_files, day_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes)
sensor_features = sensor_features.merge(features, how="outer") sensor_features = sensor_features.merge(features, how="outer")
else: else:
for feature in provider["FEATURES"]: for feature in provider["FEATURES"]:

View File

@ -25,21 +25,22 @@ compute_wifi_feature <- function(data, feature, day_segment){
} }
} }
rapids_features <- function(wifi_data, day_segment, provider){ rapids_features <- function(sensor_data_files, day_segment, provider){
requested_features <- provider[["FEATURES"]] wifi_data <- read.csv(sensor_data_files[["sensor_data"]], stringsAsFactors = FALSE)
# Output dataframe requested_features <- provider[["FEATURES"]]
features = data.frame(local_segment = character(), stringsAsFactors = FALSE) # Output dataframe
features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
# The name of the features this function can compute # The name of the features this function can compute
base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice") base_features_names <- c("countscans", "uniquedevices", "countscansmostuniquedevice")
# The subset of requested features this function can compute # The subset of requested features this function can compute
features_to_compute <- intersect(base_features_names, requested_features) features_to_compute <- intersect(base_features_names, requested_features)
for(feature_name in features_to_compute){ for(feature_name in features_to_compute){
feature <- compute_wifi_feature(wifi_data, feature_name, day_segment) feature <- compute_wifi_feature(wifi_data, feature_name, day_segment)
features <- merge(features, feature, by="local_segment", all = TRUE) features <- merge(features, feature, by="local_segment", all = TRUE)
} }
return(features) return(features)
} }

View File

@ -1,13 +0,0 @@
# Snakemake entry script: computes the "wifi" features of one provider and
# writes them as CSV to the first output of the rule.
# NOTE(review): presumably activates the project's renv library - confirm.
source("renv/activate.R")
# fetch_provider_features() is not defined in this script; presumably it is
# provided by this utilities file - TODO confirm.
source("src/features/utils/utils.R")
library("dplyr")
library("tidyr")
# Named inputs of the rule: the sensor data file and the day segment labels file.
sensor_data_file <- snakemake@input[["sensor_data"]]
day_segments_file <- snakemake@input[["day_segments_labels"]]
# The provider settings are unwrapped with [[ ]] after the [ ] subset.
provider <- snakemake@params["provider"][["provider"]]
# NOTE(review): single-bracket indexing; if params is a list this keeps
# provider_key wrapped in a one-element list instead of a bare value.
provider_key <- snakemake@params["provider_key"]
# "wifi" is the sensor/config key handed to the shared helper.
sensor_features <- fetch_provider_features(provider, provider_key, "wifi", sensor_data_file, day_segments_file)
# Save the result without row names.
write.csv(sensor_features, snakemake@output[[1]], row.names = FALSE)

View File

@ -1,18 +0,0 @@
import pandas as pd
from importlib import import_module, util
from pathlib import Path
# Load utils/utils.py (located beside this script's folder) straight from its
# path and take fetch_provider_features from it.
utils_path = Path(snakemake.scriptdir).parent / "utils" / "utils.py"
utils_spec = util.spec_from_file_location("util", str(utils_path))
utils_module = util.module_from_spec(utils_spec)
utils_spec.loader.exec_module(utils_module)
fetch_provider_features = utils_module.fetch_provider_features

# Inputs of the rule: the first "sensor_data" file and the day segment labels.
wifi_data_path = snakemake.input["sensor_data"][0]
segments_path = snakemake.input["day_segments_labels"]

wifi_features = fetch_provider_features(
    snakemake.params["provider"],
    snakemake.params["provider_key"],
    "wifi",
    wifi_data_path,
    segments_path,
)

# The features go to the first output declared by the rule, without the index.
wifi_features.to_csv(snakemake.output[0], index=False)