# Register the files Snakemake has to build for every LIGHT feature provider
# that is switched on in the config.
for provider, provider_config in config["LIGHT"]["PROVIDERS"].items():
    if not provider_config["COMPUTE"]:
        continue  # provider disabled: nothing to request for it

    light_table = config["LIGHT"]["DB_TABLE"]
    light_key = "LIGHT".lower()

    # Raw sensor dump and its datetime-annotated copy.
    files_to_compute.extend(
        expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=light_table)
    )
    files_to_compute.extend(
        expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=light_table)
    )
    # Per-provider features, one file per participant / language / provider.
    files_to_compute.extend(
        expand(
            "data/interim/{pid}/{sensor_key}_features/{sensor_key}_{language}_{provider_key}.csv",
            pid=config["PIDS"],
            language=provider_config["SRC_LANGUAGE"],
            provider_key=provider,
            sensor_key=light_key,
        )
    )
    # Final per-participant feature file for the sensor.
    files_to_compute.extend(
        expand("data/processed/features/{pid}/{sensor_key}.csv", pid=config["PIDS"], sensor_key=light_key)
    )
def _light_provider_settings(wildcards):
    """Return the config section of the LIGHT provider named by the provider_key wildcard."""
    return config["LIGHT"]["PROVIDERS"][wildcards.provider_key]


# LIGHT features for providers whose entry script is written in R.
rule light_r_features:
    input:
        sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"]),
        day_segments_labels = "data/interim/day_segments_labels.csv"
    params:
        provider = _light_provider_settings,
        provider_key = "{provider_key}"
    output:
        "data/interim/{pid}/light_features/light_r_{provider_key}.csv"
    script:
        "../src/features/light/light_entry.R"


# LIGHT features for providers whose entry script is written in Python.
rule light_python_features:
    input:
        sensor_data = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["LIGHT"]["DB_TABLE"]),
        day_segments_labels = "data/interim/day_segments_labels.csv"
    params:
        provider = _light_provider_settings,
        provider_key = "{provider_key}"
    output:
        "data/interim/{pid}/light_features/light_python_{provider_key}.csv"
    script:
        "../src/features/light/light_entry.py"
numpy as np - -def base_light_features(light_data, day_segment, requested_features): - # name of the features this function can compute - base_features_names = ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"] - # the subset of requested features this function can compute - features_to_compute = list(set(requested_features) & set(base_features_names)) - - light_features = pd.DataFrame(columns=["local_date"] + ["light_" + day_segment + "_" + x for x in features_to_compute]) - if not light_data.empty: - if day_segment != "daily": - light_data =light_data[light_data["local_day_segment"] == day_segment] - - if not light_data.empty: - light_features = pd.DataFrame() - if "count" in features_to_compute: - light_features["light_" + day_segment + "_count"] = light_data.groupby(["local_date"]).count()["timestamp"] - - # get light ambient luminance related features - if "maxlux" in features_to_compute: - light_features["light_" + day_segment + "_maxlux"] = light_data.groupby(["local_date"])["double_light_lux"].max() - if "minlux" in features_to_compute: - light_features["light_" + day_segment + "_minlux"] = light_data.groupby(["local_date"])["double_light_lux"].min() - if "avglux" in features_to_compute: - light_features["light_" + day_segment + "_avglux"] = light_data.groupby(["local_date"])["double_light_lux"].mean() - if "medianlux" in features_to_compute: - light_features["light_" + day_segment + "_medianlux"] = light_data.groupby(["local_date"])["double_light_lux"].median() - if "stdlux" in features_to_compute: - light_features["light_" + day_segment + "_stdlux"] = light_data.groupby(["local_date"])["double_light_lux"].std().fillna('NA') - - light_features = light_features.reset_index() - - return light_features \ No newline at end of file diff --git a/src/features/light/light_entry.R b/src/features/light/light_entry.R new file mode 100644 index 00000000..99f8dca4 --- /dev/null +++ b/src/features/light/light_entry.R @@ -0,0 +1,13 @@ 
import pandas as pd
import numpy as np


def rapids_features(light_data, day_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the RAPIDS light features for every ``local_segment``.

    Args:
        light_data: DataFrame of light readings. The frame returned by
            ``filter_data_by_segment`` must contain the ``local_segment``,
            ``timestamp`` and ``double_light_lux`` columns used below.
        day_segment: Segment description, forwarded unchanged to
            ``filter_data_by_segment``.
        provider: Provider configuration mapping; only
            ``provider["FEATURES"]`` (the requested feature names) is read.
        filter_data_by_segment: Callable invoked as
            ``filter_data_by_segment(light_data, day_segment)``; its result
            replaces ``light_data``. It is only called for non-empty input.
        *args: Accepted and ignored.
        **kwargs: Accepted and ignored.

    Returns:
        A DataFrame whose first column is ``local_segment`` followed by one
        ``light_rapids__<feature>`` column per requested feature that this
        function supports, always in the order count, maxlux, minlux, avglux,
        medianlux, stdlux. The frame has no rows when there is no data (before
        or after filtering) or when none of the requested features is
        supported.
    """
    # Features this function can compute, in the order their columns are emitted.
    base_features_names = ["count", "maxlux", "minlux", "avglux", "medianlux", "stdlux"]
    requested_features = set(provider["FEATURES"])
    # Filter the ordered list instead of intersecting two sets: a set
    # intersection has no defined order, which made the columns of the empty
    # result differ from those of the non-empty result.
    features_to_compute = [name for name in base_features_names if name in requested_features]

    # NOTE: this evaluates to "light_rapids__" (double underscore); it is kept
    # as is so the emitted column names do not change.
    column_prefix = "light_rapids_" + "_"
    empty_features = pd.DataFrame(
        columns=["local_segment"] + [column_prefix + name for name in features_to_compute]
    )

    if light_data.empty:
        return empty_features

    light_data = filter_data_by_segment(light_data, day_segment)
    # Without rows or without any computable feature there is nothing to
    # aggregate; return the schema-only frame so "local_segment" is always present.
    if light_data.empty or not features_to_compute:
        return empty_features

    # Group once and reuse the grouping for every feature.
    by_segment = light_data.groupby(["local_segment"])
    lux_by_segment = by_segment["double_light_lux"]
    aggregations = {
        "count": by_segment["timestamp"].count,
        "maxlux": lux_by_segment.max,
        "minlux": lux_by_segment.min,
        "avglux": lux_by_segment.mean,
        "medianlux": lux_by_segment.median,
        # The sample std is NaN for a segment with a single reading; such
        # gaps are reported as the string 'NA', as before.
        "stdlux": lambda: lux_by_segment.std().fillna('NA'),
    }

    light_features = pd.DataFrame()
    for name in features_to_compute:
        light_features[column_prefix + name] = aggregations[name]()

    # Move the local_segment group keys from the index into a regular column.
    return light_features.reset_index()