Split FITBIT_STEPS into FITBIT_STEPS_SUMMARY and FITBIT_STEPS_INTRADAY

pull/103/head
Meng Li 2020-11-11 21:16:48 -05:00
parent b7e22b7440
commit a71efd6b85
7 changed files with 189 additions and 138 deletions

View File

@ -144,9 +144,6 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/interim/{pid}/phone_locations_features/phone_locations_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_LOCATIONS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/phone_locations.csv", pid=config["PIDS"]))
if config["FITBIT_STEPS"]["TABLE_FORMAT"] not in ["JSON", "CSV"]:
raise ValueError("config['FITBIT_STEPS']['TABLE_FORMAT'] should be JSON or CSV but you typed" + config["FITBIT_STEPS"]["TABLE_FORMAT"])
if config["FITBIT_CALORIES"]["TABLE_FORMAT"] not in ["JSON", "CSV"]: if config["FITBIT_CALORIES"]["TABLE_FORMAT"] not in ["JSON", "CSV"]:
raise ValueError("config['FITBIT_CALORIES']['TABLE_FORMAT'] should be JSON or CSV but you typed" + config["FITBIT_CALORIES"]["TABLE_FORMAT"]) raise ValueError("config['FITBIT_CALORIES']['TABLE_FORMAT'] should be JSON or CSV but you typed" + config["FITBIT_CALORIES"]["TABLE_FORMAT"])
@ -170,13 +167,21 @@ for provider in config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/interim/{pid}/fitbit_heartrate_intraday_features/fitbit_heartrate_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_HEARTRATE_INTRADAY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_heartrate_intraday.csv", pid=config["PIDS"]))
for provider in config["FITBIT_STEPS"]["PROVIDERS"].keys(): for provider in config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"].keys():
if config["FITBIT_STEPS"]["PROVIDERS"][provider]["COMPUTE"]: if config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["COMPUTE"]:
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_{fitbit_data_type}_raw.csv", pid=config["PIDS"], fitbit_data_type=(["json"] if config["FITBIT_STEPS"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"]))) files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_raw.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_{fitbit_data_type}_parsed.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_{fitbit_data_type}_parsed_with_datetime.csv", pid=config["PIDS"], fitbit_data_type=["summary", "intraday"])) files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_features/fitbit_steps_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps.csv", pid=config["PIDS"])) files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_summary.csv", pid=config["PIDS"]))
# Register every FITBIT_STEPS_INTRADAY target file for each provider whose
# COMPUTE flag is switched on in the config.
for provider, provider_settings in config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"].items():
    if not provider_settings["COMPUTE"]:
        continue
    # raw -> parsed -> parsed_with_datetime files, one of each per participant
    files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_raw.csv", pid=config["PIDS"]))
    files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed.csv", pid=config["PIDS"]))
    files_to_compute.extend(expand("data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv", pid=config["PIDS"]))
    # per-provider feature file, named after the provider's source language and key
    files_to_compute.extend(expand("data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_{language}_{provider_key}.csv", pid=config["PIDS"], language=provider_settings["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
    # processed feature file per participant
    files_to_compute.extend(expand("data/processed/features/{pid}/fitbit_steps_intraday.csv", pid=config["PIDS"]))
for provider in config["FITBIT_CALORIES"]["PROVIDERS"].keys(): for provider in config["FITBIT_CALORIES"]["PROVIDERS"].keys():
if config["FITBIT_CALORIES"]["PROVIDERS"][provider]["COMPUTE"]: if config["FITBIT_CALORIES"]["PROVIDERS"][provider]["COMPUTE"]:

View File

@ -277,31 +277,27 @@ FITBIT_HEARTRATE_INTRADAY:
SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_intraday SRC_FOLDER: "rapids" # inside src/features/fitbit_heartrate_intraday
SRC_LANGUAGE: "python" SRC_LANGUAGE: "python"
FITBIT_STEPS: FITBIT_STEPS_SUMMARY:
TABLE_FORMAT: JSON # JSON or CSV. If your JSON or CSV data are files change [DEVICE_DATA][FITBIT][SOURCE][TYPE] to FILES TABLE: steps_summary
TABLE: PROVIDERS:
JSON: fitbit_steps RAPIDS:
CSV: COMPUTE: False
SUMMARY: steps_summary FEATURES: ["maxsumsteps", "minsumsteps", "avgsumsteps", "mediansumsteps", "stdsumsteps"]
INTRADAY: steps_intraday SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_summary
EXCLUDE_SLEEP: # you can exclude sleep periods from the step features computation SRC_LANGUAGE: "python"
EXCLUDE: False
TYPE: FIXED # FIXED OR FITBIT_BASED (configure FITBIT_SLEEP section) FITBIT_STEPS_INTRADAY:
FIXED: TABLE: steps_intraday
START: "23:00"
END: "07:00"
PROVIDERS: PROVIDERS:
RAPIDS: RAPIDS:
COMPUTE: False COMPUTE: False
FEATURES: FEATURES:
SUMMARY: ["maxsumsteps", "minsumsteps", "avgsumsteps", "mediansumsteps", "stdsumsteps"] STEPS: ["sum", "max", "min", "avg", "std"]
INTRADAY: SEDENTARY_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
STEPS: ["sum", "max", "min", "avg", "std"] ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
SEDENTARY_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
ACTIVE_BOUT: ["countepisode", "sumduration", "maxduration", "minduration", "avgduration", "stdduration"]
THRESHOLD_ACTIVE_BOUT: 10 # steps THRESHOLD_ACTIVE_BOUT: 10 # steps
INCLUDE_ZERO_STEP_ROWS: False INCLUDE_ZERO_STEP_ROWS: False
SRC_FOLDER: "rapids" # inside src/features/fitbit_steps SRC_FOLDER: "rapids" # inside src/features/fitbit_steps_intraday
SRC_LANGUAGE: "python" SRC_LANGUAGE: "python"
FITBIT_SLEEP: FITBIT_SLEEP:

View File

@ -424,51 +424,57 @@ rule fitbit_heartrate_intraday_r_features:
script: script:
"../src/features/entry.R" "../src/features/entry.R"
rule fitbit_steps_python_features: rule fitbit_steps_summary_python_features:
input: input:
sensor_data = expand("data/raw/{{pid}}/fitbit_steps_{fitbit_data_type}_parsed_with_datetime.csv", fitbit_data_type=["summary", "intraday"]), sensor_data = "data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params: params:
provider = lambda wildcards: config["FITBIT_STEPS"]["PROVIDERS"][wildcards.provider_key.upper()], provider = lambda wildcards: config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}", provider_key = "{provider_key}",
sensor_key = "fitbit_steps" sensor_key = "fitbit_steps_summary"
output: output:
"data/interim/{pid}/fitbit_steps_features/fitbit_steps_python_{provider_key}.csv" "data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_python_{provider_key}.csv"
script: script:
"../src/features/entry.py" "../src/features/entry.py"
rule fitbit_steps_r_features: rule fitbit_steps_summary_r_features:
input: input:
sensor_data = expand("data/raw/{{pid}}/fitbit_steps_{fitbit_data_type}_parsed_with_datetime.csv", fitbit_data_type=["summary", "intraday"]), sensor_data = "data/raw/{pid}/fitbit_steps_summary_parsed_with_datetime.csv",
day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv" day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
params: params:
provider = lambda wildcards: config["FITBIT_STEPS"]["PROVIDERS"][wildcards.provider_key.upper()], provider = lambda wildcards: config["FITBIT_STEPS_SUMMARY"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}", provider_key = "{provider_key}",
sensor_key = "fitbit_steps" sensor_key = "fitbit_steps_summary"
output: output:
"data/interim/{pid}/fitbit_steps_features/fitbit_steps_r_{provider_key}.csv" "data/interim/{pid}/fitbit_steps_summary_features/fitbit_steps_summary_r_{provider_key}.csv"
script: script:
"../src/features/entry.R" "../src/features/entry.R"
# rule fitbit_step_features: rule fitbit_steps_intraday_python_features:
# input: input:
# step_data = "data/raw/{pid}/fitbit_step_intraday_with_datetime.csv", sensor_data = "data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv",
# sleep_data = optional_steps_sleep_input day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
# params: params:
# day_segment = "{day_segment}", provider = lambda wildcards: config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()],
# features_all_steps = config["STEP"]["FEATURES"]["ALL_STEPS"], provider_key = "{provider_key}",
# features_sedentary_bout = config["STEP"]["FEATURES"]["SEDENTARY_BOUT"], sensor_key = "fitbit_steps_intraday"
# features_active_bout = config["STEP"]["FEATURES"]["ACTIVE_BOUT"], output:
# threshold_active_bout = config["STEP"]["THRESHOLD_ACTIVE_BOUT"], "data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_python_{provider_key}.csv"
# include_zero_step_rows = config["STEP"]["INCLUDE_ZERO_STEP_ROWS"], script:
# exclude_sleep = config["STEP"]["EXCLUDE_SLEEP"]["EXCLUDE"], "../src/features/entry.py"
# exclude_sleep_type = config["STEP"]["EXCLUDE_SLEEP"]["TYPE"],
# exclude_sleep_fixed_start = config["STEP"]["EXCLUDE_SLEEP"]["FIXED"]["START"], rule fitbit_steps_intraday_r_features:
# exclude_sleep_fixed_end = config["STEP"]["EXCLUDE_SLEEP"]["FIXED"]["END"], input:
# output: sensor_data = "data/raw/{pid}/fitbit_steps_intraday_parsed_with_datetime.csv",
# "data/processed/{pid}/fitbit_step_{day_segment}.csv" day_segments_labels = "data/interim/day_segments/{pid}_day_segments_labels.csv"
# script: params:
# "../src/features/fitbit_step_features.py" provider = lambda wildcards: config["FITBIT_STEPS_INTRADAY"]["PROVIDERS"][wildcards.provider_key.upper()],
provider_key = "{provider_key}",
sensor_key = "fitbit_steps_intraday"
output:
"data/interim/{pid}/fitbit_steps_intraday_features/fitbit_steps_intraday_r_{provider_key}.csv"
script:
"../src/features/entry.R"
# rule fitbit_sleep_features: # rule fitbit_sleep_features:
# input: # input:

View File

@ -195,14 +195,14 @@ rule fitbit_parse_heartrate:
rule fitbit_parse_steps: rule fitbit_parse_steps:
input: input:
data = expand("data/raw/{{pid}}/fitbit_steps_{fitbit_data_type}_raw.csv", fitbit_data_type = (["json"] if config["FITBIT_STEPS"]["TABLE_FORMAT"] == "JSON" else ["summary", "intraday"])) "data/raw/{pid}/fitbit_steps_{fitbit_data_type}_raw.csv"
params: params:
timezone = config["DEVICE_DATA"]["PHONE"]["TIMEZONE"]["VALUE"], timezone = config["DEVICE_DATA"]["PHONE"]["TIMEZONE"]["VALUE"],
table = config["FITBIT_STEPS"]["TABLE"], table = lambda wildcards: config["FITBIT_STEPS_"+str(wildcards.fitbit_data_type).upper()]["TABLE"],
table_format = config["FITBIT_STEPS"]["TABLE_FORMAT"] column_format = config["DEVICE_DATA"]["FITBIT"]["SOURCE"]["COLUMN_FORMAT"],
fitbit_data_type = "{fitbit_data_type}"
output: output:
summary_data = "data/raw/{pid}/fitbit_steps_summary_parsed.csv", "data/raw/{pid}/fitbit_steps_{fitbit_data_type}_parsed.csv"
intraday_data = "data/raw/{pid}/fitbit_steps_intraday_parsed.csv"
script: script:
"../src/data/fitbit_parse_steps.py" "../src/data/fitbit_parse_steps.py"

View File

@ -7,55 +7,65 @@ from math import trunc
STEPS_COLUMNS = ("device_id", "steps", "local_date_time", "timestamp") STEPS_COLUMNS = ("device_id", "steps", "local_date_time", "timestamp")
def parseStepsData(steps_data): def parseStepsData(steps_data, fitbit_data_type):
if steps_data.empty: if steps_data.empty:
return pd.DataFrame(), pd.DataFrame(columns=STEPS_INTRADAY_COLUMNS) return pd.DataFrame(), pd.DataFrame(columns=STEPS_INTRADAY_COLUMNS)
device_id = steps_data["device_id"].iloc[0] device_id = steps_data["device_id"].iloc[0]
records_summary, records_intraday = [], [] records_summary, records_intraday = [], []
# Parse JSON into individual records # Parse JSON into individual records
for record in steps_data.fitbit_data: for record in steps_data.fitbit_data:
record = json.loads(record) # Parse text into JSON record = json.loads(record) # Parse text into JSON
curr_date = datetime.strptime(record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
# Parse summary data # Parse summary data
curr_date = datetime.strptime( if fitbit_data_type == "summary":
record["activities-steps"][0]["dateTime"], "%Y-%m-%d")
row_summary = (device_id,
row_summary = (device_id, record["activities-steps"][0]["value"],
record["activities-steps"][0]["value"], curr_date,
curr_date, 0)
0)
records_summary.append(row_summary)
records_summary.append(row_summary)
# Parse intraday data # Parse intraday data
dataset = record["activities-steps-intraday"]["dataset"] if fitbit_data_type == "intraday":
for data in dataset: dataset = record["activities-steps-intraday"]["dataset"]
d_time = datetime.strptime(data["time"], '%H:%M:%S').time() for data in dataset:
d_datetime = datetime.combine(curr_date, d_time) d_time = datetime.strptime(data["time"], '%H:%M:%S').time()
d_datetime = datetime.combine(curr_date, d_time)
row_intraday = (device_id, row_intraday = (device_id,
data["value"], data["value"],
d_datetime, d_datetime,
0) 0)
records_intraday.append(row_intraday)
if fitbit_data_type == "summary":
parsed_data = pd.DataFrame(data=records_summary, columns=STEPS_COLUMNS)
elif fitbit_data_type == "intraday":
parsed_data = pd.DataFrame(data=records_intraday, columns=STEPS_COLUMNS)
else:
raise ValueError("fitbit_data_type can only be one of ['summary', 'intraday'].")
return parsed_data
records_intraday.append(row_intraday)
return pd.DataFrame(data=records_summary, columns=STEPS_COLUMNS), pd.DataFrame(data=records_intraday, columns=STEPS_COLUMNS)
table_format = snakemake.params["table_format"]
timezone = snakemake.params["timezone"] timezone = snakemake.params["timezone"]
column_format = snakemake.params["column_format"]
fitbit_data_type = snakemake.params["fitbit_data_type"]
if table_format == "JSON": if column_format == "JSON":
json_raw = pd.read_csv(snakemake.input[0]) json_raw = pd.read_csv(snakemake.input[0])
summary, intraday = parseStepsData(json_raw) parsed_data = parseStepsData(json_raw, fitbit_data_type)
elif table_format == "CSV": elif column_format == "PLAIN_TEXT":
summary = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) parsed_data = pd.read_csv(snakemake.input[0], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None))
intraday = pd.read_csv(snakemake.input[1], parse_dates=["local_date_time"], date_parser=lambda col: pd.to_datetime(col).tz_localize(None)) else:
raise ValueError("column_format can only be one of ['JSON', 'PLAIN_TEXT'].")
if summary.shape[0] > 0: if parsed_data.shape[0] > 0:
summary["timestamp"] = summary["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6 parsed_data["timestamp"] = parsed_data["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
if intraday.shape[0] > 0:
intraday["timestamp"] = intraday["local_date_time"].dt.tz_localize(timezone).astype(np.int64) // 10**6
summary.to_csv(snakemake.output["summary_data"], index=False) parsed_data.to_csv(snakemake.output[0], index=False)
intraday.to_csv(snakemake.output["intraday_data"], index=False)

View File

@ -38,16 +38,6 @@ def getBouts(steps_data):
return bouts return bouts
def extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_compute):
steps_summary_features = pd.DataFrame()
# statistics features of daily steps count
steps_summary_features = statsFeatures(steps_summary_data, summary_features_to_compute, "sumsteps", steps_summary_features)
steps_summary_features.reset_index(inplace=True)
return steps_summary_features
def extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_bout, intraday_features_to_compute_steps, intraday_features_to_compute_sedentarybout, intraday_features_to_compute_activebout, steps_intraday_features): def extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_bout, intraday_features_to_compute_steps, intraday_features_to_compute_sedentarybout, intraday_features_to_compute_activebout, steps_intraday_features):
steps_intraday_features = pd.DataFrame() steps_intraday_features = pd.DataFrame()
@ -73,22 +63,20 @@ def extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_b
return steps_intraday_features return steps_intraday_features
def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs): def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
threshold_active_bout = provider["THRESHOLD_ACTIVE_BOUT"] threshold_active_bout = provider["THRESHOLD_ACTIVE_BOUT"]
include_zero_step_rows = provider["INCLUDE_ZERO_STEP_ROWS"] include_zero_step_rows = provider["INCLUDE_ZERO_STEP_ROWS"]
steps_summary_data = pd.read_csv(sensor_data_files["sensor_data"][0]) steps_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])
steps_intraday_data = pd.read_csv(sensor_data_files["sensor_data"][1])
requested_summary_features = ["summary" + x for x in provider["FEATURES"]["SUMMARY"]] requested_intraday_features = provider["FEATURES"]
requested_intraday_features = provider["FEATURES"]["INTRADAY"]
requested_intraday_features_steps = ["intraday" + x + "steps" for x in requested_intraday_features["STEPS"]] requested_intraday_features_steps = ["intraday" + x + "steps" for x in requested_intraday_features["STEPS"]]
requested_intraday_features_sedentarybout = ["intraday" + x + "sedentarybout" for x in requested_intraday_features["SEDENTARY_BOUT"]] requested_intraday_features_sedentarybout = ["intraday" + x + "sedentarybout" for x in requested_intraday_features["SEDENTARY_BOUT"]]
requested_intraday_features_activebout = ["intraday" + x + "activebout" for x in requested_intraday_features["ACTIVE_BOUT"]] requested_intraday_features_activebout = ["intraday" + x + "activebout" for x in requested_intraday_features["ACTIVE_BOUT"]]
# name of the features this function can compute # name of the features this function can compute
base_summary_features = ["summarymaxsumsteps", "summaryminsumsteps", "summaryavgsumsteps", "summarymediansumsteps", "summarystdsumsteps"]
base_intraday_features_steps = ["intradaysumsteps", "intradaymaxsteps", "intradayminsteps", "intradayavgsteps", "intradaystdsteps"] base_intraday_features_steps = ["intradaysumsteps", "intradaymaxsteps", "intradayminsteps", "intradayavgsteps", "intradaystdsteps"]
base_intraday_features_sedentarybout = ["intradaycountepisodesedentarybout", "intradaysumdurationsedentarybout", "intradaymaxdurationsedentarybout", "intradaymindurationsedentarybout", "intradayavgdurationsedentarybout", "intradaystddurationsedentarybout"] base_intraday_features_sedentarybout = ["intradaycountepisodesedentarybout", "intradaysumdurationsedentarybout", "intradaymaxdurationsedentarybout", "intradaymindurationsedentarybout", "intradayavgdurationsedentarybout", "intradaystddurationsedentarybout"]
base_intraday_features_activebout = ["intradaycountepisodeactivebout", "intradaysumdurationactivebout", "intradaymaxdurationactivebout", "intradaymindurationactivebout", "intradayavgdurationactivebout", "intradaystddurationactivebout"] base_intraday_features_activebout = ["intradaycountepisodeactivebout", "intradaysumdurationactivebout", "intradaymaxdurationactivebout", "intradaymindurationactivebout", "intradayavgdurationactivebout", "intradaystddurationactivebout"]
@ -97,25 +85,8 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
intraday_features_to_compute_sedentarybout = list(set(requested_intraday_features_sedentarybout) & set(base_intraday_features_sedentarybout)) intraday_features_to_compute_sedentarybout = list(set(requested_intraday_features_sedentarybout) & set(base_intraday_features_sedentarybout))
intraday_features_to_compute_activebout = list(set(requested_intraday_features_activebout) & set(base_intraday_features_activebout)) intraday_features_to_compute_activebout = list(set(requested_intraday_features_activebout) & set(base_intraday_features_activebout))
summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features))
intraday_features_to_compute = intraday_features_to_compute_steps + intraday_features_to_compute_sedentarybout + intraday_features_to_compute_activebout intraday_features_to_compute = intraday_features_to_compute_steps + intraday_features_to_compute_sedentarybout + intraday_features_to_compute_activebout
# extract features from summary data
steps_summary_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in summary_features_to_compute])
if not steps_summary_data.empty:
steps_summary_data = filter_data_by_segment(steps_summary_data, day_segment)
if not steps_summary_data.empty:
# only keep the segments start at 00:00:00 and end at 23:59:59
datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00"
datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
segment_regex = "{}#{},{}".format(day_segment, datetime_start_regex, datetime_end_regex)
steps_summary_data = steps_summary_data[steps_summary_data["local_segment"].str.match(segment_regex)]
if not steps_summary_data.empty:
steps_summary_features = extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_compute)
# extract features from intraday features # extract features from intraday features
steps_intraday_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in intraday_features_to_compute]) steps_intraday_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in intraday_features_to_compute])
if not steps_intraday_data.empty: if not steps_intraday_data.empty:
@ -124,18 +95,14 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
if not steps_intraday_data.empty: if not steps_intraday_data.empty:
steps_intraday_features = extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_bout, intraday_features_to_compute_steps, intraday_features_to_compute_sedentarybout, intraday_features_to_compute_activebout, steps_intraday_features) steps_intraday_features = extractStepsFeaturesFromIntradayData(steps_intraday_data, threshold_active_bout, intraday_features_to_compute_steps, intraday_features_to_compute_sedentarybout, intraday_features_to_compute_activebout, steps_intraday_features)
# merge summary features and intraday features
steps_features = steps_intraday_features.merge(steps_summary_features, on=["local_segment"], how="outer")
# exclude rows when the total step count is ZERO during the whole day # exclude rows when the total step count is ZERO during the whole day
if not include_zero_step_rows: if not include_zero_step_rows:
steps_features.index = steps_features["local_segment"].apply(lambda segment: segment.split("#")[1][:10]) steps_intraday_features.index = steps_intraday_features["local_segment"].apply(lambda segment: segment.split("#")[1][:10])
steps_features["dailycountstep"] = steps_intraday_data.groupby(["local_date"])["steps"].sum() steps_intraday_features["dailycountstep"] = steps_intraday_data.groupby(["local_date"])["steps"].sum()
steps_features = steps_features.query("dailycountstep != 0") steps_intraday_features = steps_intraday_features.query("dailycountstep != 0")
del steps_features["dailycountstep"] del steps_intraday_features["dailycountstep"]
steps_features.reset_index(drop=True, inplace=True) steps_intraday_features.reset_index(drop=True, inplace=True)
return steps_features return steps_intraday_features

View File

@ -0,0 +1,67 @@
import pandas as pd
import numpy as np
def statsFeatures(steps_data, features_to_compute, features_type, steps_features):
    """Add per-segment statistics of a steps/duration column to ``steps_features``.

    For every statistic whose feature name is listed in ``features_to_compute``,
    the values of the relevant column are grouped by ``local_segment`` and the
    aggregated series is stored in ``steps_features`` under the column name
    ``"steps_rapids_" + <feature name>``.

    Feature names are built as ``<prefix><stat><features_type>`` where the
    prefix is ``"summary"`` when ``features_type == "sumsteps"`` and
    ``"intraday"`` otherwise. The count feature replaces ``"duration"`` with
    ``"episode"`` in ``features_type`` (e.g. ``intradaycountepisodeactivebout``).

    Args:
        steps_data: DataFrame with a ``local_segment`` column and either a
            ``steps`` or a ``duration`` column, depending on ``features_type``.
        features_to_compute: collection of feature names to compute.
        features_type: one of ``"steps"``, ``"sumsteps"``,
            ``"durationsedentarybout"`` or ``"durationactivebout"``.
        steps_features: DataFrame that receives the new columns (mutated in
            place and also returned).

    Returns:
        ``steps_features`` with one extra column per computed feature.

    Raises:
        ValueError: if ``features_type`` is not one of the supported values.
    """
    if features_type in ("steps", "sumsteps"):
        col_name = "steps"
    elif features_type in ("durationsedentarybout", "durationactivebout"):
        col_name = "duration"
    else:
        raise ValueError("features_type can only be one of ['steps', 'sumsteps', 'durationsedentarybout', 'durationactivebout'].")

    prefix = "summary" if features_type == "sumsteps" else "intraday"

    # (statistic name used in the feature, feature suffix, GroupBy method).
    # The order here fixes the order of the output columns.
    statistics = (
        ("count", features_type.replace("duration", "episode"), "count"),
        ("sum", features_type, "sum"),
        ("max", features_type, "max"),
        ("min", features_type, "min"),
        ("avg", features_type, "mean"),
        ("median", features_type, "median"),
        ("std", features_type, "std"),
    )

    # Built on first use so that nothing is grouped (and no column is looked
    # up) when none of the features was requested.
    grouped = None
    for stat_name, suffix, method in statistics:
        feature = prefix + stat_name + suffix
        if feature not in features_to_compute:
            continue
        if grouped is None:
            grouped = steps_data.groupby(["local_segment"])[col_name]
        steps_features["steps_rapids_" + feature] = getattr(grouped, method)()

    return steps_features
def extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_compute):
    """Compute the requested summary step statistics.

    Delegates to ``statsFeatures`` with the ``"sumsteps"`` feature type,
    starting from an empty DataFrame, then moves the resulting index into a
    regular column.

    Args:
        steps_summary_data: summary steps data handed to ``statsFeatures``.
        summary_features_to_compute: names of the summary features to compute.

    Returns:
        DataFrame of the computed features with the index reset.
    """
    # statistics features of daily steps count
    summary_stats = statsFeatures(steps_summary_data, summary_features_to_compute, "sumsteps", pd.DataFrame())
    return summary_stats.reset_index()
def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the RAPIDS Fitbit steps *summary* features for one day segment.

    Args:
        sensor_data_files: mapping whose ``"sensor_data"`` entry is handed to
            ``pd.read_csv`` to load the summary steps data.
        day_segment: day segment label; it is forwarded to
            ``filter_data_by_segment`` and used as the prefix of the regular
            expression that selects full-day segments.
        provider: provider configuration; ``provider["FEATURES"]`` lists the
            requested feature names without the ``"summary"`` prefix.
        filter_data_by_segment: callable invoked as
            ``filter_data_by_segment(data, day_segment)``. Its result is
            expected to carry a ``local_segment`` column, which is read
            right after the call.

    Returns:
        DataFrame with a ``local_segment`` column plus one
        ``steps_rapids_<feature>`` column per computed feature. When there is
        no data left at any filtering stage, an empty frame with those
        columns is returned.
    """
    steps_summary_data = pd.read_csv(sensor_data_files["sensor_data"])

    requested_summary_features = {"summary" + x for x in provider["FEATURES"]}

    # name of the features this function can compute
    base_summary_features = ["summarymaxsumsteps", "summaryminsumsteps", "summaryavgsumsteps", "summarymediansumsteps", "summarystdsumsteps"]
    # the subset of requested features this function can compute; iterating
    # over the base list (instead of intersecting two sets) keeps the column
    # order of the empty result identical across runs
    summary_features_to_compute = [x for x in base_summary_features if x in requested_summary_features]

    # result returned whenever no data survives the filters below
    steps_summary_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in summary_features_to_compute])

    # extract features from summary data
    if steps_summary_data.empty:
        return steps_summary_features

    steps_summary_data = filter_data_by_segment(steps_summary_data, day_segment)
    if steps_summary_data.empty:
        return steps_summary_features

    # only keep the segments start at 00:00:00 and end at 23:59:59
    datetime_start_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 00:00:00"
    datetime_end_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2} 23:59:59"
    segment_regex = "{}#{},{}".format(day_segment, datetime_start_regex, datetime_end_regex)
    steps_summary_data = steps_summary_data[steps_summary_data["local_segment"].str.match(segment_regex)]
    if steps_summary_data.empty:
        return steps_summary_features

    return extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_compute)