# Contents of src/features/fitbit_step/fitbit_step_base.py as introduced by the
# "Refactor fitbit step features" patch, restored to regular formatting.
import numpy as np
import pandas as pd


def base_fitbit_step_features(step_data, day_segment, requested_features, threshold_active_bout, include_zero_step_rows):
    """Compute per-day Fitbit step features for one day segment.

    Parameters
    ----------
    step_data : pandas.DataFrame
        Intraday step rows. Reads the columns ``local_date_time`` (must be a
        datetime column, it becomes the resampling index), ``steps`` and,
        when ``day_segment`` is not ``"daily"``, ``local_day_segment``.
    day_segment : str
        ``"daily"`` keeps every row; any other value keeps only the rows whose
        ``local_day_segment`` equals it. It is also embedded in the output
        column names.
    requested_features : dict
        Holds the lists ``features_all_steps``, ``features_sedentary_bout``
        and ``features_active_bout``. Names this function does not know are
        ignored.
    threshold_active_bout : int or str
        Steps per minute at or above which a row counts as active (passed
        through ``int``).
    include_zero_step_rows : bool
        When false, days whose total step count is zero are dropped.

    Returns
    -------
    pandas.DataFrame
        A ``local_date`` column followed by one
        ``step_<day_segment>_<feature>`` column per computed feature. The
        column order is the same whether or not any rows are produced:
        all-steps features, bout counts, sedentary durations, active
        durations.
    """
    # Names of the features this function can compute.
    base_features_allsteps = ["sumallsteps", "maxallsteps", "minallsteps", "avgallsteps", "stdallsteps"]
    base_features_sedentarybout = ["countsedentarybout", "maxdurationsedentarybout", "mindurationsedentarybout",
                                   "avgdurationsedentarybout", "stddurationsedentarybout", "sumdurationsedentarybout"]
    base_features_activebout = ["countactivebout", "maxdurationactivebout", "mindurationactivebout",
                                "avgdurationactivebout", "stddurationactivebout"]

    # The subset of requested features this function can compute, in a
    # deterministic order (a set intersection would be hash-ordered).
    features_to_compute_allsteps = _select_supported(requested_features["features_all_steps"], base_features_allsteps)
    features_to_compute_sedentarybout = _select_supported(requested_features["features_sedentary_bout"], base_features_sedentarybout)
    features_to_compute_activebout = _select_supported(requested_features["features_active_bout"], base_features_activebout)

    # Feature name -> resampler method, and feature-name prefix -> statistic.
    allsteps_methods = {"sumallsteps": "sum", "maxallsteps": "max", "minallsteps": "min",
                        "avgallsteps": "mean", "stdallsteps": "std"}
    duration_stats = {"max": "max", "min": "min", "avg": "mean", "std": "std", "sum": "sum"}
    bout_kinds = (("sedentary", features_to_compute_sedentarybout), ("active", features_to_compute_activebout))

    ordered_features = list(features_to_compute_allsteps)
    ordered_features += ["count" + kind + "bout" for kind, wanted in bout_kinds
                         if "count" + kind + "bout" in wanted]
    ordered_features += [alias + "duration" + kind + "bout" for kind, wanted in bout_kinds
                         for alias in duration_stats if alias + "duration" + kind + "bout" in wanted]

    prefix = "step_" + str(day_segment) + "_"
    output_columns = [prefix + feature for feature in ordered_features]

    if not step_data.empty and day_segment != "daily":
        step_data = step_data[step_data["local_day_segment"] == day_segment]
    if step_data.empty:
        return pd.DataFrame(columns=["local_date"] + output_columns)

    # Passing the Series (not the label) keeps local_date_time as a column too.
    resampled_data = step_data.set_index(step_data["local_date_time"]).rename_axis("datetime")

    # NOTE(review): when day_segment is not "daily" the rows were filtered
    # before this gap is measured, so the first row of each day carries the
    # whole gap since the previous day's segment -- confirm this is intended.
    resampled_data["time_diff_minutes"] = _minutes_since_previous_row(resampled_data["local_date_time"])

    # Sedentary: fewer than threshold_active_bout steps per elapsed minute.
    # Active: at least threshold_active_bout steps per elapsed minute.
    is_sedentary = resampled_data["steps"] < int(threshold_active_bout) * resampled_data["time_diff_minutes"]
    resampled_data["active_sedentary"] = np.where(is_sedentary, "sedentary", "active")

    # Consecutive rows with the same label share a bout id.
    label = resampled_data["active_sedentary"]
    resampled_data["active_sedentary_groups"] = (label != label.shift()).cumsum().values

    # Bouts are attributed to the calendar day of local_date_time, so their
    # statistics carry the same datetime labels as the daily resampled rows.
    # The local_date column cannot be used for this: the caller reads it as
    # plain strings, which would not align with a DatetimeIndex.
    resampled_data["bout_day"] = resampled_data.index.normalize()

    # Total minutes of each bout, split at day boundaries.
    bout_minutes = resampled_data.groupby(["bout_day", "active_sedentary", "active_sedentary_groups"])["time_diff_minutes"].sum()

    # Per-day, per-kind statistics of bout length. String aliases keep the
    # sample standard deviation (ddof=1).
    bout_stats = (
        bout_minutes.groupby(level=["bout_day", "active_sedentary"])
        .agg(["max", "min", "mean", "std", "sum"])
        .reset_index(level="active_sedentary")
    )

    steps_by_day = resampled_data["steps"].resample("D")
    daily_total_steps = steps_by_day.sum()
    step_features = pd.DataFrame(index=daily_total_steps.index)

    for feature in features_to_compute_allsteps:
        step_features[prefix + feature] = getattr(steps_by_day, allsteps_methods[feature])()

    for kind, wanted in bout_kinds:
        if "count" + kind + "bout" in wanted:
            step_features[prefix + "count" + kind + "bout"] = _daily_bout_count(resampled_data, kind, step_features.index)
        kind_stats = bout_stats[bout_stats["active_sedentary"] == kind]
        for alias, stat in duration_stats.items():
            feature = alias + "duration" + kind + "bout"
            if feature in wanted:
                # Assignment aligns on the day; days without such bouts get NaN.
                step_features[prefix + feature] = kind_stats[stat]

    step_features = step_features[output_columns]

    # Exclude days whose total step count is zero.
    if not include_zero_step_rows:
        step_features = step_features.loc[daily_total_steps.ne(0)]

    return step_features.rename_axis("local_date").reset_index()


def _select_supported(requested, supported):
    """Return the supported feature names that were requested, in supported order."""
    wanted = set(requested)
    return [name for name in supported if name in wanted]


def _minutes_since_previous_row(timestamps):
    """Return whole minutes elapsed since the previous row, as integers."""
    gaps = timestamps.diff()
    if len(gaps) > 1:
        # The first row has no predecessor; reuse the gap of the second row.
        first_gap = gaps.iloc[1]
    else:
        # Single row: there is no second gap to borrow.
        # Assumes 1-minute sampling -- TODO confirm against the data source.
        first_gap = pd.Timedelta(minutes=1)
    return gaps.fillna(first_gap).dt.total_seconds().div(60).astype(int)


def _daily_bout_count(resampled_data, kind, daily_index):
    """Return the number of distinct bouts of ``kind`` per calendar day."""
    bout_ids = resampled_data.loc[resampled_data["active_sedentary"] == kind, "active_sedentary_groups"]
    if bout_ids.empty:
        return pd.Series(np.nan, index=daily_index)
    return bout_ids.resample("D").nunique()
columns) -else: - finalDataset = pd.DataFrame() +step_features = step_features.merge(base_fitbit_step_features(step_data, day_segment, requested_features, threshold_active_bout, include_zero_step_rows), on="local_date", how="outer") - #Preprocessing: - data.local_date_time = pd.to_datetime(data.local_date_time) - resampledData = data.set_index(data.local_date_time) - resampledData.index.names = ['datetime'] - resampledData['time_diff_minutes'] = resampledData['local_date_time'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds().div(60).astype(int) +assert np.sum([len(x) for x in requested_features.values()]) + 1 == step_features.shape[1], "The number of features in the output dataframe (=" + str(step_features.shape[1]) + ") does not match the expected value (=" + str(np.sum([len(x) for x in requested_features.values()])) + " + 1). Verify your fitbit step feature extraction functions" - #Sedentary Bout when you have less than 10 steps in a minute - #Active Bout when you have greater or equal to 10 steps in a minute - resampledData['active_sedentary'] = np.where(resampledData['steps']