From a71efd6b85fb6016e33f6e69dfcc492902bc32b2 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Wed, 11 Nov 2020 21:16:48 -0500
Subject: [PATCH] Split FITBIT_STEPS into FITBIT_STEPS_SUMMARY and
 FITBIT_STEPS_INTRADAY

---
 Snakefile                               | 25 +++---
 config.yaml                             | 34 ++++-----
 rules/features.smk                      | 64 +++++++++-------
 rules/preprocessing.smk                 | 10 +--
 src/data/fitbit_parse_steps.py          | 76 +++++++++++--------
 .../rapids/main.py                      | 51 +++----------
 .../fitbit_steps_summary/rapids/main.py | 67 ++++++++++++++++
 7 files changed, 189 insertions(+), 138 deletions(-)
 rename src/features/{fitbit_steps => fitbit_steps_intraday}/rapids/main.py (75%)
 create mode 100644 src/features/fitbit_steps_summary/rapids/main.py

diff --git a/Snakefile b/Snakefile
index 80240417..ab7ed4eb 100644
--- a/Snakefile
+++ b/Snakefile
@@ -144,9 +144,6 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
         files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
         files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
 
-if config["FITBIT_STEPS"]["TABLE_FORMAT"] not in ["JSON", "CSV"]:
-    raise ValueError("config['FITBIT_STEPS']['TABLE_FORMAT'] should be JSON or CSV but you typed" + config["FITBIT_STEPS"]["TABLE_FORMAT"])
-
 if config["FITBIT_CALORIES"]["TABLE_FORMAT"] not in ["JSON", "CSV"]:
     raise ValueError("config['FITBIT_CALORIES']['TABLE_FORMAT'] should be JSON or CSV but you typed" + config["FITBIT_CALORIES"]["TABLE_FORMAT"])
 
@@ -170,13 +167,21 @@ for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys():
         files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
         files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"]))
 
-for provider in config["FITBIT_STEPS"]["PROVIDERS"].keys():
-    if config["FITBIT_STEPS"]["PROVIDERS"][provider]["COMPUTE"]:
-        files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_{fitbit_data_type}_raw.csv", pid=config["PIDS"], fitbit_data_type=(["json"] if config["FITBIT_STEPS"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"])))
-        files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_{fitbit_data_type}_parsed.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"]))
-        files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_{fitbit_data_type}_parsed_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"]))
-        files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_features/fitbit_steps_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
-        files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps.csv", pid=config["PIDS"]))
+for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys():
+    if config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]:
+        files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_raw.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
+        files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_summary.csv", pid=config["PIDS"]))
+
+for provider in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].keys():
+    if config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["COMPUTE"]:
+        files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_raw.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
+        files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_intraday.csv", pid=config["PIDS"]))
 
 for provider in config["FITBIT_CALORIES"]["PROVIDERS"].keys():
     if config["FITBIT_CALORIES"]["PROVIDERS"][provider]["COMPUTE"]:
diff --git a/config.yaml b/config.yaml
index d4701353..94d71942 100644
--- a/config.yaml
+++ b/config.yaml
@@ -277,31 +277,27 @@ FITBIT_HEARTRATE_INTRADAY:
       SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_intraday
       SRC_LANGUAGE: "python"
 
-FITBIT_STEPS:
-  TABLE_FORMAT: JSON # JSON or CSV. If your JSON or CSV data are files change [DEVICE_DATA][FITBIT][SOURCE][TYPE] to FILES
-  TABLE:
-    JSON: fitbit_steps
-    CSV:
-      SUMMARY: steps_summary
-      INTRADAY: steps_intraday
-  EXCLUDE_SLEEP: # you can exclude sleep periods from the step features computation
-    EXCLUDE: False
-    TYPE: FIXED # FIXED OR FITBIT_BASED (configure FITBIT_SLEEP section)
-    FIXED:
-      START: "23:00"
-      END: "07:00"
+FITBIT_STEPS_SUMMARY:
+  TABLE: steps_summary
+  PROVIDERS:
+    RAPIDS:
+      COMPUTE: False
+      FEATURES: ["maxsumsteps", "minsumsteps", "avgsumsteps", "mediansumsteps", "stdsumsteps"]
+      SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_summary
+      SRC_LANGUAGE: "python"
+
+FITBIT_STEPS_INTRADAY:
+  TABLE: steps_intraday
   PROVIDERS:
     RAPIDS:
       COMPUTE: False
       FEATURES:
-        SUMMARY: ["maxsumsteps", "minsumsteps", "avgsumsteps", "mediansumsteps", "stdsumsteps"]
-        INTRADAY:
-          STEPS: ["sum", "max", "min", "avg", "std"]
-          SEDENTARY_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
-          ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
+        STEPS: ["sum", "max", "min", "avg", "std"]
+        SEDENTARY_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
+        ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
       THRESHOLD_ACTIVE_BOUT: 10 # steps
       INCLUDE_ZERO_STEP_ROWS: False
-      SRC_FOLDER: "rapids" # inside src/features/fitbit_steps
+      SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_intraday
       SRC_LANGUAGE: "python"
 
 FITBIT_SLEEP:
diff --git a/rules/features.smk b/rules/features.smk
index 1e3e1709..8348a386 100644
--- a/rules/features.smk
+++ b/rules/features.smk
@@ -424,51 +424,57 @@ rule fitbit_heartrate_intraday_r_features:
     script:
         "../src/features/entry.R"
 
-rule fitbit_steps_python_features:
+rule fitbit_steps_summary_python_features:
     input:
-        sensor_data = expand("data/raw/{{pid}}/fitbit_steps_{fitbit_data_type}_parsed_with_datetime.csv", fitbit_data_type=["summary", "intraday"]),
+        sensor_data = "data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv",
         day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
     params:
-        provider = lambda wildcards: config["FITBIT_STEPS"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider = lambda wildcards: config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
         provider_key = "{provider_key}",
-        sensor_key = "fitbit_steps"
+        sensor_key = "fitbit_steps_summary"
     output:
-        "data/interim/{pid}/fitbit_steps_features/fitbit_steps_python_{provider_key}.csv"
+        "data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_python_{provider_key}.csv"
     script:
         "../src/features/entry.py"
 
-rule fitbit_steps_r_features:
+rule fitbit_steps_summary_r_features:
     input:
-        sensor_data = expand("data/raw/{{pid}}/fitbit_steps_{fitbit_data_type}_parsed_with_datetime.csv", fitbit_data_type=["summary", "intraday"]),
+        sensor_data = "data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv",
         day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
     params:
-        provider = lambda wildcards: config["FITBIT_STEPS"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider = lambda wildcards: config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
         provider_key = "{provider_key}",
-        sensor_key = "fitbit_steps"
+        sensor_key = "fitbit_steps_summary"
     output:
-        "data/interim/{pid}/fitbit_steps_features/fitbit_steps_r_{provider_key}.csv"
+        "data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_r_{provider_key}.csv"
     script:
         "../src/features/entry.R"
 
-# rule fitbit_step_features:
-#     input:
-#         step_data = "data/raw/{pid}/fitbit_step_intraday_with_datetime.csv",
-#         sleep_data = optional_steps_sleep_input
-#     params:
-#         day_segment = "{day_segment}",
-#         features_all_steps = config["STEP"]["FEATURES"]["ALL_STEPS"],
-#         features_sedentary_bout = config["STEP"]["FEATURES"]["SEDENTARY_BOUT"],
-#         features_active_bout = config["STEP"]["FEATURES"]["ACTIVE_BOUT"],
-#         threshold_active_bout = config["STEP"]["THRESHOLD_ACTIVE_BOUT"],
-#         include_zero_step_rows = config["STEP"]["INCLUDE_ZERO_STEP_ROWS"],
-#         exclude_sleep = config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"],
-#         exclude_sleep_type = config["STEP"]["EXCLUDE_SLEEP"]["TYPE"],
-#         exclude_sleep_fixed_start = config["STEP"]["EXCLUDE_SLEEP"]["FIXED"]["START"],
-#         exclude_sleep_fixed_end = config["STEP"]["EXCLUDE_SLEEP"]["FIXED"]["END"],
-#     output:
-#         "data/processed/{pid}/fitbit_step_{day_segment}.csv"
-#     script:
-#         "../src/features/fitbit_step_features.py"
+rule fitbit_steps_intraday_python_features:
+    input:
+        sensor_data = "data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv",
+        day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
+    params:
+        provider = lambda wildcards: config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        sensor_key = "fitbit_steps_intraday"
+    output:
+        "data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_python_{provider_key}.csv"
+    script:
+        "../src/features/entry.py"
+
+rule fitbit_steps_intraday_r_features:
+    input:
+        sensor_data = "data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv",
+        day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
+    params:
+        provider = lambda wildcards: config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        sensor_key = "fitbit_steps_intraday"
+    output:
+        "data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_r_{provider_key}.csv"
+    script:
+        "../src/features/entry.R"
 
 # rule fitbit_sleep_features:
 #     input:
diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk
index b2da8756..02d9be7e 100644
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@@ -195,14 +195,14 @@ rule fitbit_parse_heartrate:
 
 rule fitbit_parse_steps:
     input:
-        data = expand("data/raw/{{pid}}/fitbit_steps_{fitbit_data_type}_raw.csv", fitbit_data_type = (["json"] if config["FITBIT_STEPS"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"]))
+        "data/raw/{pid}/fitbit_steps_{fitbit_data_type}_raw.csv"
     params:
         timezone = config["DEVICE_DATA"]["PHONE"]["TIMEZONE"]["VALUE"],
-        table = config["FITBIT_STEPS"]["TABLE"],
-        table_format = config["FITBIT_STEPS"]["TABLE_FORMAT"]
+        table = lambda wildcards: config["FITBIT_STEPS_"+str(wildcards.fitbit_data_type).upper()]["TABLE"],
+        column_format = config["DEVICE_DATA"]["FITBIT"]["SOURCE"]["COLUMN_FORMAT"],
+        fitbit_data_type = "{fitbit_data_type}"
     output:
-        summary_data = "data/raw/{pid}/fitbit_steps_summary_parsed.csv",
-        intraday_data = "data/raw/{pid}/fitbit_steps_intraday_parsed.csv"
+        "data/raw/{pid}/fitbit_steps_{fitbit_data_type}_parsed.csv"
     script:
         "../src/data/fitbit_parse_steps.py"
 
diff --git a/src/data/fitbit_parse_steps.py b/src/data/fitbit_parse_steps.py
index e7c94987..773a02bc 100644
--- a/src/data/fitbit_parse_steps.py
+++ b/src/data/fitbit_parse_steps.py
@@ -7,55 +7,65 @@ from math import trunc
 STEPS_COLUMNS = ("device_id", "steps", "local_date_time", "timestamp")
 
-def parseStepsData(steps_data):
+def parseStepsData(steps_data, fitbit_data_type):
     if steps_data.empty:
         return pd.DataFrame(), pd.DataFrame(columns=STEPS_INTRADAY_COLUMNS)
     device_id = steps_data["device_id"].iloc[0]
     records_summary, records_intraday = [], []
+
     # Parse JSON into individual records
     for record in steps_data.fitbit_data:
         record = json.loads(record)  # Parse text into JSON
-
+        curr_date = datetime.strptime(record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
+
         # Parse summary data
-        curr_date = datetime.strptime(
-            record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
-
-        row_summary = (device_id,
-                       record["activities-steps"][0]["value"],
-                       curr_date,
-                       0)
-
-        records_summary.append(row_summary)
+        if fitbit_data_type == "summary":
+
+            row_summary = (device_id,
+                           record["activities-steps"][0]["value"],
+                           curr_date,
+                           0)
+
+            records_summary.append(row_summary)
 
         # Parse intraday data
-        dataset = record["activities-steps-intraday"]["dataset"]
-        for data in dataset:
-            d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
-            d_datetime = datetime.combine(curr_date, d_time)
+        if fitbit_data_type == "intraday":
+            dataset = record["activities-steps-intraday"]["dataset"]
+            for data in dataset:
+                d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
+                d_datetime = datetime.combine(curr_date, d_time)
 
-            row_intraday = (device_id,
-                            data["value"],
-                            d_datetime,
-                            0)
+                row_intraday = (device_id,
+                                data["value"],
+                                d_datetime,
+                                0)
+
+                records_intraday.append(row_intraday)
+
+    if fitbit_data_type == "summary":
+        parsed_data = pd.DataFrame(data=records_summary, columns=STEPS_COLUMNS)
+    elif fitbit_data_type == "intraday":
+        parsed_data = pd.DataFrame(data=records_intraday, columns=STEPS_COLUMNS)
+    else:
+        raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
+
+    return parsed_data
 
-            records_intraday.append(row_intraday)
 
-    return pd.DataFrame(data=records_summary, columns=STEPS_COLUMNS), pd.DataFrame(data=records_intraday, columns=STEPS_COLUMNS)
 
-table_format = snakemake.params["table_format"]
 timezone = snakemake.params["timezone"]
+column_format = snakemake.params["column_format"]
+fitbit_data_type = snakemake.params["fitbit_data_type"]
 
-if table_format == "JSON":
+if column_format == "JSON":
     json_raw = pd.read_csv(snakemake.input[0])
-    summary, intraday = parseStepsData(json_raw)
-elif table_format == "CSV":
-    summary = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
-    intraday = pd.read_csv(snakemake.input[1], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
+    parsed_data = parseStepsData(json_raw, fitbit_data_type)
+elif column_format == "PLAIN_TEXT":
+    parsed_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
+else:
+    raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
 
-if summary.shape[0] > 0:
-    summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
-if intraday.shape[0] > 0:
-    intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
+if parsed_data.shape[0] > 0:
+    parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
 
-summary.to_csv(snakemake.output["summary_data"], index=False)
-intraday.to_csv(snakemake.output["intraday_data"], index=False)
\ No newline at end of file
+parsed_data.to_csv(snakemake.output[0], index=False)
diff --git a/src/features/fitbit_steps/rapids/main.py b/src/features/fitbit_steps_intraday/rapids/main.py
similarity index 75%
rename from src/features/fitbit_steps/rapids/main.py
rename to src/features/fitbit_steps_intraday/rapids/main.py
index 66777724..8bb5013b 100644
--- a/src/features/fitbit_steps/rapids/main.py
+++ b/src/features/fitbit_steps_intraday/rapids/main.py
@@ -38,16 +38,6 @@ def getBouts(steps_data):
 
     return bouts
 
-def extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_compute):
-    steps_summary_features = pd.DataFrame()
-
-    # statistics features of daily steps count
-    steps_summary_features = statsFeatures(steps_summary_data, summary_features_to_compute, "sumsteps", steps_summary_features)
-
-    steps_summary_features.reset_index(inplace=True)
-
-    return steps_summary_features
-
 def extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_bout, intraday_features_to_compute_steps, intraday_features_to_compute_sedentarybout, intraday_features_to_compute_activebout, steps_intraday_features):
 
     steps_intraday_features = pd.DataFrame()
@@ -73,22 +63,20 @@ def extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_b
 
     return steps_intraday_features
 
+
 def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
 
     threshold_active_bout = provider["THRESHOLD_ACTIVE_BOUT"]
     include_zero_step_rows = provider["INCLUDE_ZERO_STEP_ROWS"]
 
-    steps_summary_data = pd.read_csv(sensor_data_files["sensor_data"][0])
-    steps_intraday_data = pd.read_csv(sensor_data_files["sensor_data"][1])
+    steps_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])
 
-    requested_summary_features = ["summary" + x for x in provider["FEATURES"]["SUMMARY"]]
-    requested_intraday_features = provider["FEATURES"]["INTRADAY"]
+    requested_intraday_features = provider["FEATURES"]
     requested_intraday_features_steps = ["intraday" + x + "steps" for x in requested_intraday_features["STEPS"]]
     requested_intraday_features_sedentarybout = ["intraday" + x + "sedentarybout" for x in requested_intraday_features["SEDENTARY_BOUT"]]
     requested_intraday_features_activebout = ["intraday" + x + "activebout" for x in requested_intraday_features["ACTIVE_BOUT"]]
 
     # name of the features this function can compute
-    base_summary_features = ["summarymaxsumsteps", "summaryminsumsteps", "summaryavgsumsteps", "summarymediansumsteps", "summarystdsumsteps"]
     base_intraday_features_steps = ["intradaysumsteps", "intradaymaxsteps", "intradayminsteps", "intradayavgsteps", "intradaystdsteps"]
     base_intraday_features_sedentarybout = ["intradaycountepisodesedentarybout", "intradaysumdurationsedentarybout", "intradaymaxdurationsedentarybout", "intradaymindurationsedentarybout", "intradayavgdurationsedentarybout", "intradaystddurationsedentarybout"]
     base_intraday_features_activebout = ["intradaycountepisodeactivebout", "intradaysumdurationactivebout", "intradaymaxdurationactivebout", "intradaymindurationactivebout", "intradayavgdurationactivebout", "intradaystddurationactivebout"]
@@ -97,25 +85,8 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     intraday_features_to_compute_sedentarybout = list(set(requested_intraday_features_sedentarybout) & set(base_intraday_features_sedentarybout))
     intraday_features_to_compute_activebout = list(set(requested_intraday_features_activebout) & set(base_intraday_features_activebout))
-    summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features))
 
     intraday_features_to_compute = intraday_features_to_compute_steps + intraday_features_to_compute_sedentarybout + intraday_features_to_compute_activebout
 
-    # extract features from summary data
-    steps_summary_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in summary_features_to_compute])
-    if not steps_summary_data.empty:
-        steps_summary_data = filter_data_by_segment(steps_summary_data, day_segment)
-
-        if not steps_summary_data.empty:
-            # only keep the segments start at 00:00:00 and end at 23:59:59
-            datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00"
-            datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
-
-            segment_regex = "{}#{},{}".format(day_segment, datetime_start_regex, datetime_end_regex)
-            steps_summary_data = steps_summary_data[steps_summary_data["local_segment"].str.match(segment_regex)]
-
-            if not steps_summary_data.empty:
-                steps_summary_features = extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_compute)
-
     # extract features from intraday features
     steps_intraday_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in intraday_features_to_compute])
     if not steps_intraday_data.empty:
@@ -124,18 +95,14 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
         if not steps_intraday_data.empty:
             steps_intraday_features = extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_bout, intraday_features_to_compute_steps, intraday_features_to_compute_sedentarybout, intraday_features_to_compute_activebout, steps_intraday_features)
 
-    # merge summary features and intraday features
-    steps_features = steps_intraday_features.merge(steps_summary_features, on=["local_segment"], how="outer")
-
-
     # exclude rows when the total step count is ZERO during the whole day
     if not include_zero_step_rows:
-        steps_features.index = steps_features["local_segment"].apply(lambda segment: segment.split("#")[1][:10])
+        steps_intraday_features.index = steps_intraday_features["local_segment"].apply(lambda segment: segment.split("#")[1][:10])
 
-        steps_features["dailycountstep"] = steps_intraday_data.groupby(["local_date"])["steps"].sum()
-        steps_features = steps_features.query("dailycountstep != 0")
+        steps_intraday_features["dailycountstep"] = steps_intraday_data.groupby(["local_date"])["steps"].sum()
+        steps_intraday_features = steps_intraday_features.query("dailycountstep != 0")
 
-        del steps_features["dailycountstep"]
-    steps_features.reset_index(drop=True, inplace=True)
+        del steps_intraday_features["dailycountstep"]
+    steps_intraday_features.reset_index(drop=True, inplace=True)
 
-    return steps_features
+    return steps_intraday_features
diff --git a/src/features/fitbit_steps_summary/rapids/main.py b/src/features/fitbit_steps_summary/rapids/main.py
new file mode 100644
index 00000000..25953f34
--- /dev/null
+++ b/src/features/fitbit_steps_summary/rapids/main.py
@@ -0,0 +1,67 @@
+import pandas as pd
+import numpy as np
+
+def statsFeatures(steps_data, features_to_compute, features_type, steps_features):
+    if features_type == "steps" or features_type == "sumsteps":
+        col_name = "steps"
+    elif features_type == "durationsedentarybout" or features_type == "durationactivebout":
+        col_name = "duration"
+    else:
+        raise ValueError("features_type can only be one of ['steps', 'sumsteps', 'durationsedentarybout', 'durationactivebout'].")
+
+    if ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode") in features_to_compute:
+        steps_features["steps_rapids_" + ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode")] = steps_data.groupby(["local_segment"])[col_name].count()
+    if ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type in features_to_compute:
+        steps_features["steps_rapids_" + ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type] = steps_data.groupby(["local_segment"])[col_name].sum()
+    if ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type in features_to_compute:
+        steps_features["steps_rapids_" + ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type] = steps_data.groupby(["local_segment"])[col_name].max()
+    if ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type in features_to_compute:
+        steps_features["steps_rapids_" + ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type] = steps_data.groupby(["local_segment"])[col_name].min()
+    if ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type in features_to_compute:
+        steps_features["steps_rapids_" + ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type] = steps_data.groupby(["local_segment"])[col_name].mean()
+    if ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type in features_to_compute:
+        steps_features["steps_rapids_" + ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type] = steps_data.groupby(["local_segment"])[col_name].median()
+    if ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type in features_to_compute:
+        steps_features["steps_rapids_" + ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type] = steps_data.groupby(["local_segment"])[col_name].std()
+
+    return steps_features
+
+def extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_compute):
+    steps_summary_features = pd.DataFrame()
+
+    # statistics features of daily steps count
+    steps_summary_features = statsFeatures(steps_summary_data, summary_features_to_compute, "sumsteps", steps_summary_features)
+
+    steps_summary_features.reset_index(inplace=True)
+
+    return steps_summary_features
+
+
+
+def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
+
+    steps_summary_data = pd.read_csv(sensor_data_files["sensor_data"])
+    requested_summary_features = ["summary" + x for x in provider["FEATURES"]]
+
+    # name of the features this function can compute
+    base_summary_features = ["summarymaxsumsteps", "summaryminsumsteps", "summaryavgsumsteps", "summarymediansumsteps", "summarystdsumsteps"]
+    # the subset of requested features this function can compute
+    summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features))
+
+    # extract features from summary data
+    steps_summary_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in summary_features_to_compute])
+    if not steps_summary_data.empty:
+        steps_summary_data = filter_data_by_segment(steps_summary_data, day_segment)
+
+        if not steps_summary_data.empty:
+            # only keep the segments start at 00:00:00 and end at 23:59:59
+            datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00"
+            datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
+
+            segment_regex = "{}#{},{}".format(day_segment, datetime_start_regex, datetime_end_regex)
+            steps_summary_data = steps_summary_data[steps_summary_data["local_segment"].str.match(segment_regex)]
+
+            if not steps_summary_data.empty:
+                steps_summary_features = extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_compute)
+
+    return steps_summary_features