Fix step bouts features
parent
4af14eca4b
commit
29b04b0601
|
@ -154,10 +154,10 @@ STEP:
|
|||
END: "07:00"
|
||||
FEATURES:
|
||||
ALL_STEPS: ["sumallsteps", "maxallsteps", "minallsteps", "avgallsteps", "stdallsteps"]
|
||||
SEDENTARY_BOUT: ["countsedentarybout", "maxdurationsedentarybout", "mindurationsedentarybout", "avgdurationsedentarybout", "stddurationsedentarybout", "sumdurationsedentarybout"]
|
||||
ACTIVE_BOUT: ["countactivebout", "maxdurationactivebout", "mindurationactivebout", "avgdurationactivebout", "stddurationactivebout"]
|
||||
SEDENTARY_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
|
||||
ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
|
||||
THRESHOLD_ACTIVE_BOUT: 10 # steps
|
||||
INCLUDE_ZERO_STEP_ROWS: True
|
||||
INCLUDE_ZERO_STEP_ROWS: False
|
||||
|
||||
SLEEP:
|
||||
COMPUTE: False
|
||||
|
|
|
@ -1,6 +1,52 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
def getBouts(step_data, time_interval):
    """Collapse step records into consecutive active/sedentary bouts.

    Args:
        step_data: DataFrame with a datetime "local_date_time" column and an
            "isactivebout" column (the caller in this file fills it with 0/1).
        time_interval: width of the resampling bins, in minutes. Must be a
            positive number (int or float).

    Returns:
        DataFrame with one row per (isactivebout, group_idx, local_date)
        combination and the columns "isactivebout", "group_idx", "local_date"
        and "duration". "duration" is the number of non-empty bins in the
        group multiplied by time_interval, i.e. the bout length in minutes.

    Raises:
        ValueError: if time_interval is NaN, zero or negative.
    """
    # "not x > 0" (instead of "x <= 0") also rejects NaN, which is what
    # diff().min() yields when there are fewer than two records.
    if not time_interval > 0:
        raise ValueError("time_interval must be a positive number of minutes, got " + str(time_interval) + ".")

    # Use a Timedelta rule rather than the string str(time_interval) + "T":
    # the "T" alias is deprecated in recent pandas releases and a float
    # interval would otherwise be spelled as e.g. "1.0T".
    bin_width = pd.Timedelta(minutes=time_interval)

    # resample the data into time_interval minute bins; min_count=1 leaves
    # "isactivebout" as NA for bins that contain no record
    resampled_step_minute = step_data.resample(bin_width, on="local_date_time")["isactivebout"].sum(min_count=1).to_frame()

    # number the runs of consecutive equal "isactivebout" values; an empty
    # (NA) bin never compares equal, so a gap in the data always ends a run
    is_new_run = resampled_step_minute["isactivebout"] != resampled_step_minute["isactivebout"].shift()
    resampled_step_minute["group_idx"] = is_new_run.cumsum()

    # drop the empty bins and turn the datetime index back into a column
    resampled_step_minute = resampled_step_minute.dropna(subset=["isactivebout"]).reset_index()
    resampled_step_minute["local_date"] = resampled_step_minute["local_date_time"].dt.date

    # count the bins of every run per date; a run that crosses midnight is
    # split into one row per date because "local_date" is a grouping key
    bouts = (
        resampled_step_minute
        .groupby(["isactivebout", "group_idx", "local_date"])
        .count()
        .rename(columns={"local_date_time": "duration"})
        .reset_index()
    )
    # convert the bin count into minutes
    bouts["duration"] = bouts["duration"] * time_interval

    return bouts
|
||||
|
||||
def statsFeatures(step_data, day_segment, features_to_compute, features_type, step_features):
    """Add per-date summary statistics of one column to step_features.

    Args:
        step_data: DataFrame with a "local_date" column plus a "steps" column
            (features_type "allsteps") or a "duration" column (bout types).
        day_segment: label inserted into every output column name.
        features_to_compute: collection of requested feature names, e.g.
            "sumallsteps" or "countepisodesedentarybout".
        features_type: one of "allsteps", "durationsedentarybout" or
            "durationactivebout".
        step_features: DataFrame indexed by date that receives the new
            columns, named "step_<day_segment>_<feature>".

    Returns:
        The DataFrame holding the features. It is usually the object passed
        in, but a new object is returned when the index had to be extended,
        so callers must use the return value.

    Raises:
        ValueError: if features_type is not one of the supported values.
    """
    if features_type == "allsteps":
        col_name = "steps"
    elif features_type in ("durationsedentarybout", "durationactivebout"):
        col_name = "duration"
    else:
        raise ValueError("features_type can only be one of ['allsteps', 'durationsedentarybout', 'durationactivebout'].")

    # requested feature name -> pandas aggregation that produces it
    aggregations = [
        ("count" + features_type.replace("duration", "episode"), "count"),
        ("sum" + features_type, "sum"),
        ("max" + features_type, "max"),
        ("min" + features_type, "min"),
        ("avg" + features_type, "mean"),
        ("median" + features_type, "median"),
        ("std" + features_type, "std"),
    ]
    requested = [(name, aggregation) for name, aggregation in aggregations if name in features_to_compute]
    if not requested:
        return step_features

    # build the grouping once instead of once per statistic
    grouped = step_data.groupby(["local_date"])[col_name]

    for feature_name, aggregation in requested:
        values = grouped.agg(aggregation)

        # Column assignment aligns on the existing index and silently drops
        # dates that are not in it yet (e.g. a day with active bouts but no
        # sedentary bouts), so append those dates first. An empty frame
        # adopts the index of the first assigned column on its own.
        if len(step_features.index) > 0 and step_features.index.is_unique:
            missing_dates = values.index.difference(step_features.index)
            if len(missing_dates) > 0:
                step_features = step_features.reindex(step_features.index.append(missing_dates))

        # str() keeps non-string day segment labels from raising TypeError
        step_features["step_" + str(day_segment) + "_" + feature_name] = values

    return step_features
|
||||
|
||||
def base_fitbit_step_features(step_data, day_segment, requested_features, threshold_active_bout, include_zero_step_rows):
|
||||
requested_features_allsteps = requested_features["features_all_steps"]
|
||||
requested_features_sedentarybout = requested_features["features_sedentary_bout"]
|
||||
|
@ -8,8 +54,8 @@ def base_fitbit_step_features(step_data, day_segment, requested_features, thresh
|
|||
|
||||
# name of the features this function can compute
|
||||
base_features_allsteps = ["sumallsteps", "maxallsteps", "minallsteps", "avgallsteps", "stdallsteps"]
|
||||
base_features_sedentarybout = ["countsedentarybout", "maxdurationsedentarybout", "mindurationsedentarybout", "avgdurationsedentarybout", "stddurationsedentarybout", "sumdurationsedentarybout"]
|
||||
base_features_activebout = ["countactivebout", "maxdurationactivebout", "mindurationactivebout", "avgdurationactivebout", "stddurationactivebout"]
|
||||
base_features_sedentarybout = ["countepisodesedentarybout", "sumdurationsedentarybout", "maxdurationsedentarybout", "mindurationsedentarybout", "avgdurationsedentarybout", "stddurationsedentarybout"]
|
||||
base_features_activebout = ["countepisodeactivebout", "sumdurationactivebout", "maxdurationactivebout", "mindurationactivebout", "avgdurationactivebout", "stddurationactivebout"]
|
||||
# the subset of requested features this function can compute
|
||||
features_to_compute_allsteps = list(set(requested_features_allsteps) & set(base_features_allsteps))
|
||||
features_to_compute_sedentarybout = list(set(requested_features_sedentarybout) & set(base_features_sedentarybout))
|
||||
|
@ -25,70 +71,33 @@ def base_fitbit_step_features(step_data, day_segment, requested_features, thresh
|
|||
if not step_data.empty:
|
||||
step_features = pd.DataFrame()
|
||||
|
||||
resampled_data = step_data.set_index(step_data.local_date_time)
|
||||
resampled_data.index.names = ["datetime"]
|
||||
# statistics features of step count
|
||||
step_features = statsFeatures(step_data, day_segment, features_to_compute_allsteps, "allsteps", step_features)
|
||||
|
||||
# Replace the first element of time_diff_minutes with its second element
|
||||
resampled_data["time_diff_minutes"] = resampled_data["local_date_time"].diff().fillna(resampled_data["local_date_time"].diff()[1]).dt.total_seconds().div(60).astype(int)
|
||||
# calculate time interval between two records in minutes
|
||||
time_interval = step_data["local_date_time"].diff().min().total_seconds() / 60
|
||||
|
||||
# Sedentary Bout when you have less than 10 steps in a minute
|
||||
# Active Bout when you have greater or equal to 10 steps in a minute
|
||||
resampled_data["active_sedentary"] = np.where(resampled_data["steps"] < int(threshold_active_bout) * resampled_data["time_diff_minutes"],"sedentary","active")
|
||||
# sedentary bout: less than THRESHOLD_ACTIVE_BOUT (default: 10) steps in a minute
|
||||
# active bout: greater or equal to THRESHOLD_ACTIVE_BOUT (default: 10) steps in a minute
|
||||
isactivebout = np.where(step_data["steps"] < int(threshold_active_bout) * time_interval, 0, 1)
|
||||
step_data = step_data.assign(isactivebout = isactivebout)
|
||||
|
||||
# Time Calculations of sedentary/active bouts:
|
||||
resampled_data["active_sedentary_groups"] = (resampled_data.active_sedentary != resampled_data.active_sedentary.shift()).cumsum().values
|
||||
bouts = getBouts(step_data, time_interval)
|
||||
|
||||
# Get the total minutes for each episode
|
||||
minutes_per_episode = resampled_data.groupby(["local_date","active_sedentary","active_sedentary_groups"])["time_diff_minutes"].sum()
|
||||
# statistics features of sedentary bout
|
||||
sedentary_bout = bouts[bouts["isactivebout"] == 0]
|
||||
step_features = statsFeatures(sedentary_bout, day_segment, features_to_compute_sedentarybout, "durationsedentarybout", step_features)
|
||||
|
||||
# Get Stats for all episodes in terms of minutes
|
||||
stats_per_episode = minutes_per_episode.groupby(["local_date", "active_sedentary"]).agg([max, min, np.mean, np.std, np.sum])
|
||||
mux = pd.MultiIndex.from_product([stats_per_episode.index.levels[0], stats_per_episode.index.levels[1]], names=["local_date", "active_sedentary"])
|
||||
stats_per_episode = stats_per_episode.reindex(mux, fill_value=None).reset_index()
|
||||
stats_per_episode.set_index("local_date", inplace = True)
|
||||
# statistics features of active bout
|
||||
active_bout = bouts[bouts["isactivebout"] == 1]
|
||||
step_features = statsFeatures(active_bout, day_segment, features_to_compute_activebout, "durationactivebout", step_features)
|
||||
|
||||
# Descriptive Statistics Features:
|
||||
if "sumallsteps" in features_to_compute_allsteps:
|
||||
step_features["step_" + str(day_segment) + "_sumallsteps"] = resampled_data["steps"].resample("D").sum()
|
||||
if "maxallsteps" in features_to_compute_allsteps:
|
||||
step_features["step_" + str(day_segment) + "_maxallsteps"] = resampled_data["steps"].resample("D").max()
|
||||
if "minallsteps" in features_to_compute_allsteps:
|
||||
step_features["step_" + str(day_segment) + "_minallsteps"] = resampled_data["steps"].resample("D").min()
|
||||
if "avgallsteps" in features_to_compute_allsteps:
|
||||
step_features["step_" + str(day_segment) + "_avgallsteps"] = resampled_data["steps"].resample("D").mean()
|
||||
if "stdallsteps" in features_to_compute_allsteps:
|
||||
step_features["step_" + str(day_segment) + "_stdallsteps"] = resampled_data["steps"].resample("D").std()
|
||||
|
||||
if "countsedentarybout" in features_to_compute_sedentarybout:
|
||||
step_features["step_" + str(day_segment) + "_countsedentarybout"] = resampled_data[resampled_data["active_sedentary"] == "sedentary"]["active_sedentary_groups"].resample("D").nunique()
|
||||
if "countactivebout" in features_to_compute_activebout:
|
||||
step_features["step_" + str(day_segment) + "_countactivebout"] = resampled_data[resampled_data["active_sedentary"] == "active"]["active_sedentary_groups"].resample("D").nunique()
|
||||
if "maxdurationsedentarybout" in features_to_compute_sedentarybout:
|
||||
step_features["step_" + str(day_segment) + "_maxdurationsedentarybout"] = stats_per_episode[stats_per_episode["active_sedentary"]=="sedentary"]["max"]
|
||||
if "mindurationsedentarybout" in features_to_compute_sedentarybout:
|
||||
step_features["step_" + str(day_segment) + "_mindurationsedentarybout"] = stats_per_episode[stats_per_episode["active_sedentary"]=="sedentary"]["min"]
|
||||
if "avgdurationsedentarybout" in features_to_compute_sedentarybout:
|
||||
step_features["step_" + str(day_segment) + "_avgdurationsedentarybout"] = stats_per_episode[stats_per_episode["active_sedentary"]=="sedentary"]["mean"]
|
||||
if "stddurationsedentarybout" in features_to_compute_sedentarybout:
|
||||
step_features["step_" + str(day_segment) + "_stddurationsedentarybout"] = stats_per_episode[stats_per_episode["active_sedentary"]=="sedentary"]["std"]
|
||||
if "sumdurationsedentarybout" in features_to_compute_sedentarybout:
|
||||
step_features["step_" + str(day_segment) + "_sumdurationsedentarybout"] = stats_per_episode[stats_per_episode["active_sedentary"]=="sedentary"]["sum"]
|
||||
if "maxdurationactivebout" in features_to_compute_activebout:
|
||||
step_features["step_" + str(day_segment) + "_maxdurationactivebout"] = stats_per_episode[stats_per_episode["active_sedentary"]== "active"]["max"]
|
||||
if "mindurationactivebout" in features_to_compute_activebout:
|
||||
step_features["step_" + str(day_segment) + "_mindurationactivebout"] = stats_per_episode[stats_per_episode["active_sedentary"]== "active"]["min"]
|
||||
if "avgdurationactivebout" in features_to_compute_activebout:
|
||||
step_features["step_" + str(day_segment) + "_avgdurationactivebout"] = stats_per_episode[stats_per_episode["active_sedentary"]== "active"]["mean"]
|
||||
if "stddurationactivebout" in features_to_compute_activebout:
|
||||
step_features["step_" + str(day_segment) + "_stddurationactivebout"] = stats_per_episode[stats_per_episode["active_sedentary"]== "active"]["std"]
|
||||
|
||||
#Exclude data when the total step count is ZERO during the whole epoch
|
||||
# exclude data when the total step count is ZERO during the whole epoch
|
||||
if not include_zero_step_rows:
|
||||
step_features["sumallsteps_aux"] = resampled_data["steps"].resample("D").sum()
|
||||
step_features["sumallsteps_aux"] = step_data.groupby(["local_date"])["steps"].sum()
|
||||
step_features = step_features.query("sumallsteps_aux != 0")
|
||||
del step_features["sumallsteps_aux"]
|
||||
|
||||
step_features.index.names = ["local_date"]
|
||||
step_features = step_features.reset_index()
|
||||
|
||||
return step_features
|
||||
|
|
|
@ -42,8 +42,8 @@ exclude_sleep_fixed_end = snakemake.params["exclude_sleep_fixed_end"]
|
|||
step_features = pd.DataFrame(columns=["local_date"])
|
||||
requested_features = {}
|
||||
requested_features["features_all_steps"] = snakemake.params["features_all_steps"]
|
||||
requested_features["features_sedentary_bout"] = snakemake.params["features_sedentary_bout"]
|
||||
requested_features["features_active_bout"] = snakemake.params["features_active_bout"]
|
||||
requested_features["features_sedentary_bout"] = [feature + "sedentarybout" for feature in snakemake.params["features_sedentary_bout"]]
|
||||
requested_features["features_active_bout"] = [feature + "activebout" for feature in snakemake.params["features_active_bout"]]
|
||||
|
||||
if exclude_sleep == True:
|
||||
if exclude_sleep_type == "FIXED":
|
||||
|
|
Loading…
Reference in New Issue