Update Python feature scripts to add sensor and provider names automatically

pull/103/head
Meng Li 2020-11-30 14:42:19 -05:00
parent 0202b9cee1
commit 016bdbfe8c
15 changed files with 229 additions and 302 deletions
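Note on the change: the provider scripts below no longer hard-code column prefixes such as "heartrate_rapids_" (or the "intraday"/"summary" stems) into their feature names; per the commit title, the sensor and provider names are now prepended automatically outside these scripts. A minimal sketch of the idea, assuming a shared wrapper does the renaming (the helper name and call site are illustrative, not the actual RAPIDS code):

    import pandas as pd

    def add_sensor_provider_prefix(features, sensor_key, provider_key):
        # Hypothetical helper: RAPIDS' shared entry point is assumed to do the
        # equivalent renaming; the exact function and location are not shown here.
        prefix = sensor_key.lower() + "_" + provider_key.lower() + "_"
        renamed = {col: prefix + col for col in features.columns if col != "local_segment"}
        return features.rename(columns=renamed)

    # Example: a provider script now returns plain names like "maxhr";
    # the wrapper turns them into "heartrate_rapids_maxhr".
    features = pd.DataFrame({"local_segment": ["2020-11-30#daily"], "maxhr": [92]})
    print(add_sensor_provider_prefix(features, "HEARTRATE", "RAPIDS").columns.tolist())
    # ['local_segment', 'heartrate_rapids_maxhr']

With this in place, each script can request and emit plain names like "maxhr", and every provider gets consistent sensor_provider_feature columns without repeating the prefix logic.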

View File

@@ -18,31 +18,31 @@ def statsFeatures(heartrate_data, features, features_type, heartrate_features):
     else:
         raise ValueError("features_type can only be one of ['hr', 'restinghr', 'caloriesoutofrange', 'caloriesfatburn', 'caloriescardio', 'caloriespeak'].")
-    if "intradaysum" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaysum" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].sum()
-    if "intradaymax" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaymax" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max()
-    if "intradaymin" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaymin" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
-    if "intradayavg" + features_type in features:
-        heartrate_features["heartrate_rapids_intradayavg" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].mean()
-    if "intradaymedian" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaymedian" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].median()
-    if "intradaymode" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaymode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0])
-    if "intradaystd" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaystd" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].std()
-    if "intradaydiffmaxmode" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaydiffmaxmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0])
-    if "intradaydiffminmode" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaydiffminmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0]) - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
-    if "intradayentropy" + features_type in features:
-        heartrate_features["heartrate_rapids_intradayentropy" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(entropy)
+    if "sum" + features_type in features:
+        heartrate_features["sum" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].sum()
+    if "max" + features_type in features:
+        heartrate_features["max" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max()
+    if "min" + features_type in features:
+        heartrate_features["min" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
+    if "avg" + features_type in features:
+        heartrate_features["avg" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].mean()
+    if "median" + features_type in features:
+        heartrate_features["median" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].median()
+    if "mode" + features_type in features:
+        heartrate_features["mode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0])
+    if "std" + features_type in features:
+        heartrate_features["std" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].std()
+    if "diffmaxmode" + features_type in features:
+        heartrate_features["diffmaxmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0])
+    if "diffminmode" + features_type in features:
+        heartrate_features["diffminmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0]) - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
+    if "entropy" + features_type in features:
+        heartrate_features["entropy" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(entropy)
     return heartrate_features

 def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, day_segment, filter_data_by_segment):
-    heartrate_intraday_features = pd.DataFrame(columns=["local_segment"] + ["heartrate_rapids_" + x for x in features])
+    heartrate_intraday_features = pd.DataFrame(columns=["local_segment"] + features)
     if not heartrate_intraday_data.empty:
         num_rows_per_minute = heartrate_intraday_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"]
         heartrate_intraday_data = filter_data_by_segment(heartrate_intraday_data, day_segment)
@@ -54,10 +54,10 @@ def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, day_seg
             heartrate_intraday_features = statsFeatures(heartrate_intraday_data, features, "hr", heartrate_intraday_features)
             # get number of minutes in each heart rate zone
-            for feature_name in list(set(["intradayminutesonoutofrangezone", "intradayminutesonfatburnzone", "intradayminutesoncardiozone", "intradayminutesonpeakzone"]) & set(features)):
+            for feature_name in list(set(["minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"]) & set(features)):
                 heartrate_zone = heartrate_intraday_data[heartrate_intraday_data["heartrate_zone"] == feature_name[17:-4]]
-                heartrate_intraday_features["heartrate_rapids_" + feature_name] = heartrate_zone.groupby(["local_segment"])["device_id"].count() / num_rows_per_minute
-                heartrate_intraday_features.fillna(value={"heartrate_rapids_" + feature_name: 0}, inplace=True)
+                heartrate_intraday_features[feature_name] = heartrate_zone.groupby(["local_segment"])["device_id"].count() / num_rows_per_minute
+                heartrate_intraday_features.fillna(value={feature_name: 0}, inplace=True)
             heartrate_intraday_features.reset_index(inplace=True)
     return heartrate_intraday_features
@@ -67,9 +67,9 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     heartrate_intraday_data = pd.read_csv(sensor_data_files["sensor_data"])
-    requested_intraday_features = ["intraday" + x for x in provider["FEATURES"]]
+    requested_intraday_features = provider["FEATURES"]
     # name of the features this function can compute
-    base_intraday_features_names = ["intradaymaxhr", "intradayminhr", "intradayavghr", "intradaymedianhr", "intradaymodehr", "intradaystdhr", "intradaydiffmaxmodehr", "intradaydiffminmodehr", "intradayentropyhr", "intradayminutesonoutofrangezone", "intradayminutesonfatburnzone", "intradayminutesoncardiozone", "intradayminutesonpeakzone"]
+    base_intraday_features_names = ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"]
     # the subset of requested features this function can compute
     intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names))
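Aside: the repeated if/groupby blocks in statsFeatures above all share one shape (check membership, aggregate the grouped column, store under the feature stem). Not part of the commit, but an equivalent table-driven sketch for readers, with argument names illustrative:

    import pandas as pd
    from scipy.stats import entropy

    # Aggregations keyed by the un-prefixed feature stems used in this diff.
    AGGS = {
        "sum": "sum", "max": "max", "min": "min", "avg": "mean",
        "median": "median", "std": "std",
        "mode": lambda x: pd.Series.mode(x)[0],
        "entropy": entropy,
    }

    def stats_features(data, features, features_type, col_name, out):
        grouped = data.groupby("local_segment")[col_name]
        for stem, agg in AGGS.items():
            if stem + features_type in features:
                out[stem + features_type] = grouped.agg(agg)
        # diffmaxmode/diffminmode combine two aggregations, so they stay explicit.
        if "diffmaxmode" + features_type in features:
            out["diffmaxmode" + features_type] = grouped.max() - grouped.agg(AGGS["mode"])
        if "diffminmode" + features_type in features:
            out["diffminmode" + features_type] = grouped.agg(AGGS["mode"]) - grouped.min()
        return out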

View File

@@ -18,26 +18,26 @@ def statsFeatures(heartrate_data, features, features_type, heartrate_features):
     else:
         raise ValueError("features_type can only be one of ['hr', 'restinghr', 'caloriesoutofrange', 'caloriesfatburn', 'caloriescardio', 'caloriespeak'].")
-    if "summarysum" + features_type in features:
-        heartrate_features["heartrate_rapids_summarysum" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].sum()
-    if "summarymax" + features_type in features:
-        heartrate_features["heartrate_rapids_summarymax" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max()
-    if "summarymin" + features_type in features:
-        heartrate_features["heartrate_rapids_summarymin" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
-    if "summaryavg" + features_type in features:
-        heartrate_features["heartrate_rapids_summaryavg" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].mean()
-    if "summarymedian" + features_type in features:
-        heartrate_features["heartrate_rapids_summarymedian" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].median()
-    if "summarymode" + features_type in features:
-        heartrate_features["heartrate_rapids_summarymode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0])
-    if "summarystd" + features_type in features:
-        heartrate_features["heartrate_rapids_summarystd" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].std()
-    if "summarydiffmaxmode" + features_type in features:
-        heartrate_features["heartrate_rapids_summarydiffmaxmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0])
-    if "summarydiffminmode" + features_type in features:
-        heartrate_features["heartrate_rapids_summarydiffminmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0]) - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
-    if "summaryentropy" + features_type in features:
-        heartrate_features["heartrate_rapids_summaryentropy" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(entropy)
+    if "sum" + features_type in features:
+        heartrate_features["sum" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].sum()
+    if "max" + features_type in features:
+        heartrate_features["max" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max()
+    if "min" + features_type in features:
+        heartrate_features["min" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
+    if "avg" + features_type in features:
+        heartrate_features["avg" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].mean()
+    if "median" + features_type in features:
+        heartrate_features["median" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].median()
+    if "mode" + features_type in features:
+        heartrate_features["mode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0])
+    if "std" + features_type in features:
+        heartrate_features["std" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].std()
+    if "diffmaxmode" + features_type in features:
+        heartrate_features["diffmaxmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0])
+    if "diffminmode" + features_type in features:
+        heartrate_features["diffminmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0]) - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
+    if "entropy" + features_type in features:
+        heartrate_features["entropy" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(entropy)
     return heartrate_features
@@ -63,14 +63,14 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     heartrate_summary_data = pd.read_csv(sensor_data_files["sensor_data"])
-    requested_summary_features = ["summary" + x for x in provider["FEATURES"]]
+    requested_summary_features = provider["FEATURES"]
     # name of the features this function can compute
-    base_summary_features_names = ["summarymaxrestinghr", "summaryminrestinghr", "summaryavgrestinghr", "summarymedianrestinghr", "summarymoderestinghr", "summarystdrestinghr", "summarydiffmaxmoderestinghr", "summarydiffminmoderestinghr", "summaryentropyrestinghr", "summarysumcaloriesoutofrange", "summarymaxcaloriesoutofrange", "summarymincaloriesoutofrange", "summaryavgcaloriesoutofrange", "summarymediancaloriesoutofrange", "summarystdcaloriesoutofrange", "summaryentropycaloriesoutofrange", "summarysumcaloriesfatburn", "summarymaxcaloriesfatburn", "summarymincaloriesfatburn", "summaryavgcaloriesfatburn", "summarymediancaloriesfatburn", "summarystdcaloriesfatburn", "summaryentropycaloriesfatburn", "summarysumcaloriescardio", "summarymaxcaloriescardio", "summarymincaloriescardio", "summaryavgcaloriescardio", "summarymediancaloriescardio", "summarystdcaloriescardio", "summaryentropycaloriescardio", "summarysumcaloriespeak", "summarymaxcaloriespeak", "summarymincaloriespeak", "summaryavgcaloriespeak", "summarymediancaloriespeak", "summarystdcaloriespeak", "summaryentropycaloriespeak"]
+    base_summary_features_names = ["maxrestinghr", "minrestinghr", "avgrestinghr", "medianrestinghr", "moderestinghr", "stdrestinghr", "diffmaxmoderestinghr", "diffminmoderestinghr", "entropyrestinghr", "sumcaloriesoutofrange", "maxcaloriesoutofrange", "mincaloriesoutofrange", "avgcaloriesoutofrange", "mediancaloriesoutofrange", "stdcaloriesoutofrange", "entropycaloriesoutofrange", "sumcaloriesfatburn", "maxcaloriesfatburn", "mincaloriesfatburn", "avgcaloriesfatburn", "mediancaloriesfatburn", "stdcaloriesfatburn", "entropycaloriesfatburn", "sumcaloriescardio", "maxcaloriescardio", "mincaloriescardio", "avgcaloriescardio", "mediancaloriescardio", "stdcaloriescardio", "entropycaloriescardio", "sumcaloriespeak", "maxcaloriespeak", "mincaloriespeak", "avgcaloriespeak", "mediancaloriespeak", "stdcaloriespeak", "entropycaloriespeak"]
     # the subset of requested features this function can compute
     summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features_names))
     # extract features from summary data
-    heartrate_summary_features = pd.DataFrame(columns=["local_segment"] + ["heartrate_rapids_" + x for x in summary_features_to_compute])
+    heartrate_summary_features = pd.DataFrame(columns=["local_segment"] + summary_features_to_compute)
     if not heartrate_summary_data.empty:
         heartrate_summary_data = filter_data_by_segment(heartrate_summary_data, day_segment)

View File

@@ -13,36 +13,36 @@ def extractSleepFeaturesFromSummaryData(sleep_summary_data, summary_features, sl
     features_sum = sleep_summary_data[["local_segment", "minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed"]].groupby(["local_segment"]).sum()
-    if "summarysumdurationafterwakeup" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "sleep_rapids_summarysumdurationafterwakeup" + sleep_type})
-    if "summarysumdurationasleep" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "sleep_rapids_summarysumdurationasleep" + sleep_type})
-    if "summarysumdurationawake" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "sleep_rapids_summarysumdurationawake" + sleep_type})
-    if "summarysumdurationtofallasleep" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "sleep_rapids_summarysumdurationtofallasleep" + sleep_type})
-    if "summarysumdurationinbed" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "sleep_rapids_summarysumdurationinbed" + sleep_type})
+    if "sumdurationafterwakeup" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "sumdurationafterwakeup" + sleep_type})
+    if "sumdurationasleep" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "sumdurationasleep" + sleep_type})
+    if "sumdurationawake" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "sumdurationawake" + sleep_type})
+    if "sumdurationtofallasleep" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "sumdurationtofallasleep" + sleep_type})
+    if "sumdurationinbed" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "sumdurationinbed" + sleep_type})
     features_avg = sleep_summary_data[["local_segment", "efficiency", "minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed"]].groupby(["local_segment"]).mean()
-    if "summaryavgefficiency" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["efficiency"]], how="outer").rename(columns={"efficiency": "sleep_rapids_summaryavgefficiency" + sleep_type})
-    if "summaryavgdurationafterwakeup" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "sleep_rapids_summaryavgdurationafterwakeup" + sleep_type})
-    if "summaryavgdurationasleep" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "sleep_rapids_summaryavgdurationasleep" + sleep_type})
-    if "summaryavgdurationawake" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "sleep_rapids_summaryavgdurationawake" + sleep_type})
-    if "summaryavgdurationtofallasleep" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "sleep_rapids_summaryavgdurationtofallasleep" + sleep_type})
-    if "summaryavgdurationinbed" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "sleep_rapids_summaryavgdurationinbed" + sleep_type})
+    if "avgefficiency" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["efficiency"]], how="outer").rename(columns={"efficiency": "avgefficiency" + sleep_type})
+    if "avgdurationafterwakeup" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "avgdurationafterwakeup" + sleep_type})
+    if "avgdurationasleep" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "avgdurationasleep" + sleep_type})
+    if "avgdurationawake" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "avgdurationawake" + sleep_type})
+    if "avgdurationtofallasleep" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "avgdurationtofallasleep" + sleep_type})
+    if "avgdurationinbed" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "avgdurationinbed" + sleep_type})
     features_count = sleep_summary_data[["local_segment", "timestamp"]].groupby(["local_segment"]).count()
-    if "summarycountepisode" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_count[["timestamp"]], how="outer").rename(columns={"timestamp": "sleep_rapids_summarycountepisode" + sleep_type})
+    if "countepisode" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_count[["timestamp"]], how="outer").rename(columns={"timestamp": "countepisode" + sleep_type})
     return sleep_summary_features
@@ -51,11 +51,11 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     sleep_summary_data = pd.read_csv(sensor_data_files["sensor_data"])
-    requested_summary_features = ["summary" + x for x in provider["FEATURES"]]
+    requested_summary_features = provider["FEATURES"]
     requested_sleep_types = provider["SLEEP_TYPES"]
     # name of the features this function can compute
-    base_summary_features = ["summarycountepisode", "summaryavgefficiency", "summarysumdurationafterwakeup", "summarysumdurationasleep", "summarysumdurationawake", "summarysumdurationtofallasleep", "summarysumdurationinbed", "summaryavgdurationafterwakeup", "summaryavgdurationasleep", "summaryavgdurationawake", "summaryavgdurationtofallasleep", "summaryavgdurationinbed"]
+    base_summary_features = ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"]
     base_sleep_types = ["main", "nap", "all"]
     # the subset of requested features this function can compute
     summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features))
@@ -63,10 +63,10 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     # full names
     features_fullnames_to_compute = ["".join(feature) for feature in itertools.product(summary_features_to_compute, sleep_types_to_compute)]
-    colnames_can_be_zero = ["sleep_rapids_" + x for x in [col for col in features_fullnames_to_compute if "summaryavgefficiency" not in col]]
+    colnames_can_be_zero = [col for col in features_fullnames_to_compute if "avgefficiency" not in col]
     # extract features from summary data
-    sleep_summary_features = pd.DataFrame(columns=["local_segment"] + ["sleep_rapids_" + x for x in features_fullnames_to_compute])
+    sleep_summary_features = pd.DataFrame(columns=["local_segment"] + features_fullnames_to_compute)
     if not sleep_summary_data.empty:
         sleep_summary_data = filter_data_by_segment(sleep_summary_data, day_segment)
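For reference, the itertools.product call in the hunk above crosses each statistic with each sleep type to build the full feature names; a quick check of what it yields, using values from this diff:

    import itertools

    summary_features_to_compute = ["countepisode", "avgefficiency"]
    sleep_types_to_compute = ["main", "nap", "all"]
    print(["".join(f) for f in itertools.product(summary_features_to_compute, sleep_types_to_compute)])
    # ['countepisodemain', 'countepisodenap', 'countepisodeall',
    #  'avgefficiencymain', 'avgefficiencynap', 'avgefficiencyall']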

View File

@@ -9,20 +9,20 @@ def statsFeatures(steps_data, features_to_compute, features_type, steps_features
     else:
         raise ValueError("features_type can only be one of ['steps', 'sumsteps', 'durationsedentarybout', 'durationactivebout'].")
-    if ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode") in features_to_compute:
-        steps_features["steps_rapids_" + ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode")] = steps_data.groupby(["local_segment"])[col_name].count()
-    if ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type] = steps_data.groupby(["local_segment"])[col_name].sum()
-    if ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type] = steps_data.groupby(["local_segment"])[col_name].max()
-    if ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type] = steps_data.groupby(["local_segment"])[col_name].min()
-    if ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type] = steps_data.groupby(["local_segment"])[col_name].mean()
-    if ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type] = steps_data.groupby(["local_segment"])[col_name].median()
-    if ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type] = steps_data.groupby(["local_segment"])[col_name].std()
+    if "count" + features_type.replace("duration", "episode") in features_to_compute:
+        steps_features["count" + features_type.replace("duration", "episode")] = steps_data.groupby(["local_segment"])[col_name].count()
+    if "sum" + features_type in features_to_compute:
+        steps_features["sum" + features_type] = steps_data.groupby(["local_segment"])[col_name].sum()
+    if "max" + features_type in features_to_compute:
+        steps_features["max" + features_type] = steps_data.groupby(["local_segment"])[col_name].max()
+    if "min" + features_type in features_to_compute:
+        steps_features["min" + features_type] = steps_data.groupby(["local_segment"])[col_name].min()
+    if "avg" + features_type in features_to_compute:
+        steps_features["avg" + features_type] = steps_data.groupby(["local_segment"])[col_name].mean()
+    if "median" + features_type in features_to_compute:
+        steps_features["median" + features_type] = steps_data.groupby(["local_segment"])[col_name].median()
+    if "std" + features_type in features_to_compute:
+        steps_features["std" + features_type] = steps_data.groupby(["local_segment"])[col_name].std()
     return steps_features
@@ -73,13 +73,13 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     requested_intraday_features = provider["FEATURES"]
-    requested_intraday_features_steps = ["intraday" + x + "steps" for x in requested_intraday_features["STEPS"]]
-    requested_intraday_features_sedentarybout = ["intraday" + x + "sedentarybout" for x in requested_intraday_features["SEDENTARY_BOUT"]]
-    requested_intraday_features_activebout = ["intraday" + x + "activebout" for x in requested_intraday_features["ACTIVE_BOUT"]]
+    requested_intraday_features_steps = [x + "steps" for x in requested_intraday_features["STEPS"]]
+    requested_intraday_features_sedentarybout = [x + "sedentarybout" for x in requested_intraday_features["SEDENTARY_BOUT"]]
+    requested_intraday_features_activebout = [x + "activebout" for x in requested_intraday_features["ACTIVE_BOUT"]]
     # name of the features this function can compute
-    base_intraday_features_steps = ["intradaysumsteps", "intradaymaxsteps", "intradayminsteps", "intradayavgsteps", "intradaystdsteps"]
-    base_intraday_features_sedentarybout = ["intradaycountepisodesedentarybout", "intradaysumdurationsedentarybout", "intradaymaxdurationsedentarybout", "intradaymindurationsedentarybout", "intradayavgdurationsedentarybout", "intradaystddurationsedentarybout"]
-    base_intraday_features_activebout = ["intradaycountepisodeactivebout", "intradaysumdurationactivebout", "intradaymaxdurationactivebout", "intradaymindurationactivebout", "intradayavgdurationactivebout", "intradaystddurationactivebout"]
+    base_intraday_features_steps = ["sumsteps", "maxsteps", "minsteps", "avgsteps", "stdsteps"]
+    base_intraday_features_sedentarybout = ["countepisodesedentarybout", "sumdurationsedentarybout", "maxdurationsedentarybout", "mindurationsedentarybout", "avgdurationsedentarybout", "stddurationsedentarybout"]
+    base_intraday_features_activebout = ["countepisodeactivebout", "sumdurationactivebout", "maxdurationactivebout", "mindurationactivebout", "avgdurationactivebout", "stddurationactivebout"]
     # the subset of requested features this function can compute
     intraday_features_to_compute_steps = list(set(requested_intraday_features_steps) & set(base_intraday_features_steps))
     intraday_features_to_compute_sedentarybout = list(set(requested_intraday_features_sedentarybout) & set(base_intraday_features_sedentarybout))
@@ -88,7 +88,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     intraday_features_to_compute = intraday_features_to_compute_steps + intraday_features_to_compute_sedentarybout + intraday_features_to_compute_activebout
     # extract features from intraday features
-    steps_intraday_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in intraday_features_to_compute])
+    steps_intraday_features = pd.DataFrame(columns=["local_segment"] + intraday_features_to_compute)
     if not steps_intraday_data.empty:
         steps_intraday_data = filter_data_by_segment(steps_intraday_data, day_segment)

View File

@@ -9,20 +9,20 @@ def statsFeatures(steps_data, features_to_compute, features_type, steps_features
     else:
         raise ValueError("features_type can only be one of ['steps', 'sumsteps', 'durationsedentarybout', 'durationactivebout'].")
-    if ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode") in features_to_compute:
-        steps_features["steps_rapids_" + ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode")] = steps_data.groupby(["local_segment"])[col_name].count()
-    if ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type] = steps_data.groupby(["local_segment"])[col_name].sum()
-    if ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type] = steps_data.groupby(["local_segment"])[col_name].max()
-    if ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type] = steps_data.groupby(["local_segment"])[col_name].min()
-    if ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type] = steps_data.groupby(["local_segment"])[col_name].mean()
-    if ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type] = steps_data.groupby(["local_segment"])[col_name].median()
-    if ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type] = steps_data.groupby(["local_segment"])[col_name].std()
+    if "count" + features_type.replace("duration", "episode") in features_to_compute:
+        steps_features["count" + features_type.replace("duration", "episode")] = steps_data.groupby(["local_segment"])[col_name].count()
+    if "sum" + features_type in features_to_compute:
+        steps_features["sum" + features_type] = steps_data.groupby(["local_segment"])[col_name].sum()
+    if "max" + features_type in features_to_compute:
+        steps_features["max" + features_type] = steps_data.groupby(["local_segment"])[col_name].max()
+    if "min" + features_type in features_to_compute:
+        steps_features["min" + features_type] = steps_data.groupby(["local_segment"])[col_name].min()
+    if "avg" + features_type in features_to_compute:
+        steps_features["avg" + features_type] = steps_data.groupby(["local_segment"])[col_name].mean()
+    if "median" + features_type in features_to_compute:
+        steps_features["median" + features_type] = steps_data.groupby(["local_segment"])[col_name].median()
+    if "std" + features_type in features_to_compute:
+        steps_features["std" + features_type] = steps_data.groupby(["local_segment"])[col_name].std()
     return steps_features
@@ -41,15 +41,15 @@ def extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_

 def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
     steps_summary_data = pd.read_csv(sensor_data_files["sensor_data"])
-    requested_summary_features = ["summary" + x for x in provider["FEATURES"]]
+    requested_summary_features = provider["FEATURES"]
     # name of the features this function can compute
-    base_summary_features = ["summarymaxsumsteps", "summaryminsumsteps", "summaryavgsumsteps", "summarymediansumsteps", "summarystdsumsteps"]
+    base_summary_features = ["maxsumsteps", "minsumsteps", "avgsumsteps", "mediansumsteps", "stdsumsteps"]
     # the subset of requested features this function can compute
     summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features))
     # extract features from summary data
-    steps_summary_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in summary_features_to_compute])
+    steps_summary_features = pd.DataFrame(columns=["local_segment"] + summary_features_to_compute)
     if not steps_summary_data.empty:
         steps_summary_data = filter_data_by_segment(steps_summary_data, day_segment)

View File

@@ -26,17 +26,17 @@ def getActivityEpisodes(acc_minute):

 def statsFeatures(acc_data, features_to_compute, features_type, acc_features):
     if "sum" + features_type in features_to_compute:
-        acc_features["acc_panda_sum" + features_type] = acc_data.groupby(["local_segment"])["duration"].sum()
+        acc_features["sum" + features_type] = acc_data.groupby(["local_segment"])["duration"].sum()
     if "max" + features_type in features_to_compute:
-        acc_features["acc_panda_max" + features_type] = acc_data.groupby(["local_segment"])["duration"].max()
+        acc_features["max" + features_type] = acc_data.groupby(["local_segment"])["duration"].max()
     if "min" + features_type in features_to_compute:
-        acc_features["acc_panda_min" + features_type] = acc_data.groupby(["local_segment"])["duration"].min()
+        acc_features["min" + features_type] = acc_data.groupby(["local_segment"])["duration"].min()
     if "avg" + features_type in features_to_compute:
-        acc_features["acc_panda_avg" + features_type] = acc_data.groupby(["local_segment"])["duration"].mean()
+        acc_features["avg" + features_type] = acc_data.groupby(["local_segment"])["duration"].mean()
     if "median" + features_type in features_to_compute:
-        acc_features["acc_panda_median" + features_type] = acc_data.groupby(["local_segment"])["duration"].median()
+        acc_features["median" + features_type] = acc_data.groupby(["local_segment"])["duration"].median()
     if "std" + features_type in features_to_compute:
-        acc_features["acc_panda_std" + features_type] = acc_data.groupby(["local_segment"])["duration"].std()
+        acc_features["std" + features_type] = acc_data.groupby(["local_segment"])["duration"].std()
     return acc_features
@@ -56,7 +56,7 @@ def panda_features(sensor_data_files, day_segment, provider, filter_data_by_segm
     features_to_compute = features_to_compute_exertionalactivityepisode + features_to_compute_nonexertionalactivityepisode + (["validsensedminutes"] if valid_sensed_minutes else [])
-    acc_features = pd.DataFrame(columns=["local_segment"] + ["acc_panda_" + x for x in features_to_compute])
+    acc_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not acc_data.empty:
         acc_data = filter_data_by_segment(acc_data, day_segment)
@@ -72,7 +72,7 @@ def panda_features(sensor_data_files, day_segment, provider, filter_data_by_segm
         acc_minute.reset_index(inplace=True)
         if valid_sensed_minutes:
-            acc_features["acc_panda_validsensedminutes"] = acc_minute.groupby(["local_segment"])["isexertionalactivity"].count()
+            acc_features["validsensedminutes"] = acc_minute.groupby(["local_segment"])["isexertionalactivity"].count()
         activity_episodes = getActivityEpisodes(acc_minute)
         # compute exertional episodes features

View File

@@ -10,7 +10,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))
-    acc_features = pd.DataFrame(columns=["local_segment"] + ["acc_rapids_" + x for x in features_to_compute])
+    acc_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not acc_data.empty:
         acc_data = filter_data_by_segment(acc_data, day_segment)
@@ -21,15 +21,15 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
         acc_data = acc_data.assign(magnitude = magnitude.values)
         if "maxmagnitude" in features_to_compute:
-            acc_features["acc_rapids_maxmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].max()
+            acc_features["maxmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].max()
         if "minmagnitude" in features_to_compute:
-            acc_features["acc_rapids_minmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].min()
+            acc_features["minmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].min()
         if "avgmagnitude" in features_to_compute:
-            acc_features["acc_rapids_avgmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].mean()
+            acc_features["avgmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].mean()
         if "medianmagnitude" in features_to_compute:
-            acc_features["acc_rapids_medianmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].median()
+            acc_features["medianmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].median()
         if "stdmagnitude" in features_to_compute:
-            acc_features["acc_rapids_stdmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].std()
+            acc_features["stdmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].std()
         acc_features = acc_features.reset_index()

View File

@@ -12,7 +12,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     requested_features = provider["FEATURES"]
     features_to_compute = list(set(requested_features) & set(base_features_names))
-    ar_features = pd.DataFrame(columns=["local_segment"] + ["ar_rapids_" + x for x in features_to_compute])
+    ar_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not ar_episodes.empty:
         ar_episodes = filter_data_by_segment(ar_episodes, day_segment)
@@ -20,98 +20,22 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
             ar_features = pd.DataFrame()
             if "count" in features_to_compute:
-                ar_features["ar_rapids_count"] = ar_episodes.groupby(["local_segment"]).count()["episode_id"]
+                ar_features["count"] = ar_episodes.groupby(["local_segment"]).count()["episode_id"]
             if "mostcommonactivity" in features_to_compute:
-                ar_features["ar_rapids_mostcommonactivity"] = ar_episodes.groupby(["local_segment"])["activity_type"].agg(lambda x: pd.Series.mode(x)[0])
+                ar_features["mostcommonactivity"] = ar_episodes.groupby(["local_segment"])["activity_type"].agg(lambda x: pd.Series.mode(x)[0])
             if "countuniqueactivities" in features_to_compute:
-                ar_features["ar_rapids_countuniqueactivities"] = ar_episodes.groupby(["local_segment"])["activity_type"].nunique()
+                ar_features["countuniqueactivities"] = ar_episodes.groupby(["local_segment"])["activity_type"].nunique()
             # duration features
             for column, activity_labels in activity_classes.items():
                 if "duration" + column.lower() in features_to_compute:
                     filtered_data = ar_episodes[ar_episodes["activity_name"].isin(pd.Series(activity_labels))]
                     if not filtered_data.empty:
-                        ar_features["ar_rapids_duration" + column.lower()] = ar_episodes[ar_episodes["activity_name"].isin(pd.Series(activity_labels))].groupby(["local_segment"])["duration"].sum().fillna(0)
+                        ar_features["duration" + column.lower()] = ar_episodes[ar_episodes["activity_name"].isin(pd.Series(activity_labels))].groupby(["local_segment"])["duration"].sum().fillna(0)
                     else:
-                        ar_features["ar_rapids_duration" + column.lower()] = 0
+                        ar_features["duration" + column.lower()] = 0
             ar_features.index.names = ["local_segment"]
             ar_features = ar_features.reset_index()
     return ar_features
-
-"""
-if not ar_data.empty:
-    ar_data = filter_data_by_segment(ar_data, day_segment)
-    if not ar_data.empty:
-        # chunk_episodes
-        ar_data = chunk_episodes(ar_data)
-    if not ar_data.empty:
-        ar_data["episode_id"] = ((ar_data.ar_status != ar_data.ar_status.shift()) | (ar_data.start_timestamp - ar_data.end_timestamp.shift() > 1)).cumsum()
-        grouped = ar_data.groupby(by=["local_segment", "episode_id", "ar_status"])
-        ar_episodes= grouped[["duration"]].sum()
-        ar_episodes["ar_diff"] = grouped["ar_level"].first() - grouped["ar_level"].last()
-        ar_episodes["ar_consumption_rate"] = ar_episodes["ar_diff"] / ar_episodes["duration"]
-        ar_episodes.reset_index(inplace=True)
-        # for discharge episodes
-        ar_discharge_episodes = ar_episodes[(ar_episodes["ar_status"] == 3) | (ar_episodes["ar_status"] == 4)]
-        ar_discharge_features = pd.DataFrame()
-        if "countdischarge" in features_to_compute:
-            ar_discharge_features["ar_rapids_countdischarge"] = ar_discharge_episodes.groupby(["local_segment"])["episode_id"].count()
-        if "sumdurationdischarge" in features_to_compute:
-            ar_discharge_features["ar_rapids_sumdurationdischarge"] = ar_discharge_episodes.groupby(["local_segment"])["duration"].sum()
-        if "avgconsumptionrate" in features_to_compute:
-            ar_discharge_features["ar_rapids_avgconsumptionrate"] = ar_discharge_episodes.groupby(["local_segment"])["ar_consumption_rate"].mean()
-        if "maxconsumptionrate" in features_to_compute:
-            ar_discharge_features["ar_rapids_maxconsumptionrate"] = ar_discharge_episodes.groupby(["local_segment"])["ar_consumption_rate"].max()
-        # for charge episodes
-        ar_charge_episodes = ar_episodes[(ar_episodes["ar_status"] == 2) | (ar_episodes["ar_status"] == 5)]
-        ar_charge_features = pd.DataFrame()
-        if "countcharge" in features_to_compute:
-            ar_charge_features["ar_rapids_countcharge"] = ar_charge_episodes.groupby(["local_segment"])["episode_id"].count()
-        if "sumdurationcharge" in features_to_compute:
-            ar_charge_features["ar_rapids_sumdurationcharge"] = ar_charge_episodes.groupby(["local_segment"])["duration"].sum()
-        # combine discharge features and charge features; fill the missing values with ZERO
-        ar_features = pd.concat([ar_discharge_features, ar_charge_features], axis=1, sort=True).fillna(0)
-        ar_features.index.rename("local_segment", inplace=True)
-        ar_features = ar_features.reset_index()
-return ar_features
-"""

View File

@@ -9,24 +9,24 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features
     if "timeoffirstuse" in requested_features:
         time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
         if time_first_event.empty:
-            apps_features["apps_rapids_timeoffirstuse" + apps_type] = np.nan
+            apps_features["timeoffirstuse" + apps_type] = np.nan
         else:
-            apps_features["apps_rapids_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
+            apps_features["timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
     if "timeoflastuse" in requested_features:
         time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
         if time_last_event.empty:
-            apps_features["apps_rapids_timeoflastuse" + apps_type] = np.nan
+            apps_features["timeoflastuse" + apps_type] = np.nan
         else:
-            apps_features["apps_rapids_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
+            apps_features["timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
     if "frequencyentropy" in requested_features:
         apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
         if (len(apps_with_count.index) < 2 ):
-            apps_features["apps_rapids_frequencyentropy" + apps_type] = np.nan
+            apps_features["frequencyentropy" + apps_type] = np.nan
         else:
-            apps_features["apps_rapids_frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy)
+            apps_features["frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy)
     if "count" in requested_features:
-        apps_features["apps_rapids_count" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"]
-        apps_features.fillna(value={"apps_rapids_count" + apps_type: 0}, inplace=True)
+        apps_features["count" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"]
+        apps_features.fillna(value={"count" + apps_type: 0}, inplace=True)
     return apps_features
@@ -53,7 +53,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     # exclude apps in the excluded_apps list
     apps_data = apps_data[~apps_data["package_name"].isin(excluded_apps)]
-    apps_features = pd.DataFrame(columns=["local_segment"] + ["apps_rapids_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)]])
+    apps_features = pd.DataFrame(columns=["local_segment"] + ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)])
     if not apps_data.empty:
         # deep copy the apps_data for the top1global computation
         apps_data_global = apps_data.copy()

View File

@@ -11,7 +11,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     requested_features = provider["FEATURES"]
     features_to_compute = list(set(requested_features) & set(base_features_names))
-    battery_features = pd.DataFrame(columns=["local_segment"] + ["battery_rapids_" + x for x in features_to_compute])
+    battery_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not battery_data.empty:
         battery_data = filter_data_by_segment(battery_data, day_segment)
@@ -28,21 +28,21 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
             battery_discharge_episodes = battery_episodes[(battery_episodes["battery_status"] == 3) | (battery_episodes["battery_status"] == 4)]
             battery_discharge_features = pd.DataFrame()
             if "countdischarge" in features_to_compute:
-                battery_discharge_features["battery_rapids_countdischarge"] = battery_discharge_episodes.groupby(["local_segment"])["episode_id"].count()
+                battery_discharge_features["countdischarge"] = battery_discharge_episodes.groupby(["local_segment"])["episode_id"].count()
             if "sumdurationdischarge" in features_to_compute:
-                battery_discharge_features["battery_rapids_sumdurationdischarge"] = battery_discharge_episodes.groupby(["local_segment"])["duration"].sum()
+                battery_discharge_features["sumdurationdischarge"] = battery_discharge_episodes.groupby(["local_segment"])["duration"].sum()
             if "avgconsumptionrate" in features_to_compute:
-                battery_discharge_features["battery_rapids_avgconsumptionrate"] = battery_discharge_episodes.groupby(["local_segment"])["battery_consumption_rate"].mean()
+                battery_discharge_features["avgconsumptionrate"] = battery_discharge_episodes.groupby(["local_segment"])["battery_consumption_rate"].mean()
             if "maxconsumptionrate" in features_to_compute:
-                battery_discharge_features["battery_rapids_maxconsumptionrate"] = battery_discharge_episodes.groupby(["local_segment"])["battery_consumption_rate"].max()
+                battery_discharge_features["maxconsumptionrate"] = battery_discharge_episodes.groupby(["local_segment"])["battery_consumption_rate"].max()
             # for charge episodes
             battery_charge_episodes = battery_episodes[(battery_episodes["battery_status"] == 2) | (battery_episodes["battery_status"] == 5)]
             battery_charge_features = pd.DataFrame()
             if "countcharge" in features_to_compute:
-                battery_charge_features["battery_rapids_countcharge"] = battery_charge_episodes.groupby(["local_segment"])["episode_id"].count()
+                battery_charge_features["countcharge"] = battery_charge_episodes.groupby(["local_segment"])["episode_id"].count()
             if "sumdurationcharge" in features_to_compute:
-                battery_charge_features["battery_rapids_sumdurationcharge"] = battery_charge_episodes.groupby(["local_segment"])["duration"].sum()
+                battery_charge_features["sumdurationcharge"] = battery_charge_episodes.groupby(["local_segment"])["duration"].sum()
             # combine discharge features and charge features; fill the missing values with ZERO
             battery_features = pd.concat([battery_discharge_features, battery_charge_features], axis=1, sort=True).fillna(0)

View File

@@ -21,7 +21,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))
-    conversation_features = pd.DataFrame(columns=["local_segment"] + ["conversation_rapids_" + x for x in features_to_compute])
+    conversation_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not conversation_data.empty:
         conversation_data = filter_data_by_segment(conversation_data, day_segment)
@@ -31,19 +31,19 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
         conversation_data = conversation_data.drop_duplicates(subset=["local_date", "local_time"], keep="first")
         if "minutessilence" in features_to_compute:
-            conversation_features["conversation_rapids_minutessilence"] = conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60
+            conversation_features["minutessilence"] = conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60
         if "minutesnoise" in features_to_compute:
-            conversation_features["conversation_rapids_minutesnoise"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60
+            conversation_features["minutesnoise"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60
         if "minutesvoice" in features_to_compute:
-            conversation_features["conversation_rapids_minutesvoice"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60
+            conversation_features["minutesvoice"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60
         if "minutesunknown" in features_to_compute:
-            conversation_features["conversation_rapids_minutesunknown"] = conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60
+            conversation_features["minutesunknown"] = conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60
         if "countconversation" in features_to_compute:
-            conversation_features["conversation_rapids_countconversation"] = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['double_convo_start'].nunique()
+            conversation_features["countconversation"] = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['double_convo_start'].nunique()
         conv_duration = (conversation_data['double_convo_end']/1000 - conversation_data['double_convo_start']/1000)/60
         conversation_data = conversation_data.assign(conv_duration = conv_duration.values)
@@ -51,43 +51,43 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
         conv_totalDuration = conversation_data[(conversation_data['inference'] >= 0) & (conversation_data['inference'] < 4)].groupby(["local_segment"])['inference'].count()/60
         if "silencesensedfraction" in features_to_compute:
-            conversation_features["conversation_rapids_silencesensedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
+            conversation_features["silencesensedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
         if "noisesensedfraction" in features_to_compute:
-            conversation_features["conversation_rapids_noisesensedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
+            conversation_features["noisesensedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
         if "voicesensedfraction" in features_to_compute:
-            conversation_features["conversation_rapids_voicesensedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
+            conversation_features["voicesensedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
         if "unknownsensedfraction" in features_to_compute:
-            conversation_features["conversation_rapids_unknownsensedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
+            conversation_features["unknownsensedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
         if "silenceexpectedfraction" in features_to_compute:
-            conversation_features["conversation_rapids_silenceexpectedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
+            conversation_features["silenceexpectedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
         if "noiseexpectedfraction" in features_to_compute:
-            conversation_features["conversation_rapids_noiseexpectedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
+            conversation_features["noiseexpectedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
         if "voiceexpectedfraction" in features_to_compute:
-            conversation_features["conversation_rapids_voiceexpectedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
+            conversation_features["voiceexpectedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
         if "unknownexpectedfraction" in features_to_compute:
-            conversation_features["conversation_rapids_unknownexpectedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
+            conversation_features["unknownexpectedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
         if "sumconversationduration" in features_to_compute:
-            conversation_features["conversation_rapids_sumconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].sum()
+            conversation_features["sumconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].sum()
         if "avgconversationduration" in features_to_compute:
-            conversation_features["conversation_rapids_avgconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].mean()
+            conversation_features["avgconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].mean()
         if "sdconversationduration" in features_to_compute:
-            conversation_features["conversation_rapids_sdconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].std()
+            conversation_features["sdconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].std()
         if "minconversationduration" in features_to_compute:
-            conversation_features["conversation_rapids_minconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].min()
+            conversation_features["minconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].min()
         if "maxconversationduration" in features_to_compute:
-            conversation_features["conversation_rapids_maxconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].max()
+            conversation_features["maxconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].max()
         if "timefirstconversation" in features_to_compute:
             timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].min()
@@ -95,9 +95,9 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
for date in list(timestampsLastConversation.index):
lastimestamp = timestampsLastConversation.loc[date]
lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0]
conversation_features.loc[date,"conversation_rapids_timefirstconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
conversation_features.loc[date,"timefirstconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
else:
conversation_features["conversation_rapids_timefirstconversation"] = np.nan
conversation_features["timefirstconversation"] = np.nan
if "timelastconversation" in features_to_compute:
timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].max()
@@ -105,39 +105,39 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
for date in list(timestampsLastConversation.index):
lastimestamp = timestampsLastConversation.loc[date]
lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0]
conversation_features.loc[date,"conversation_rapids_timelastconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
conversation_features.loc[date,"timelastconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
else:
conversation_features["conversation_rapids_timelastconversation"] = np.nan
conversation_features["timelastconversation"] = np.nan
if "noisesumenergy" in features_to_compute:
conversation_features["conversation_rapids_noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].sum()
conversation_features["noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].sum()
if "noiseavgenergy" in features_to_compute:
conversation_features["conversation_rapids_noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].mean()
conversation_features["noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].mean()
if "noisesdenergy" in features_to_compute:
conversation_features["conversation_rapids_noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].std()
conversation_features["noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].std()
if "noiseminenergy" in features_to_compute:
conversation_features["conversation_rapids_noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].min()
conversation_features["noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].min()
if "noisemaxenergy" in features_to_compute:
conversation_features["conversation_rapids_noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].max()
conversation_features["noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].max()
if "voicesumenergy" in features_to_compute:
conversation_features["conversation_rapids_voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].sum()
conversation_features["voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].sum()
if "voiceavgenergy" in features_to_compute:
conversation_features["conversation_rapids_voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].mean()
conversation_features["voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].mean()
if "voicesdenergy" in features_to_compute:
conversation_features["conversation_rapids_voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].std()
conversation_features["voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].std()
if "voiceminenergy" in features_to_compute:
conversation_features["conversation_rapids_voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].min()
conversation_features["voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].min()
if "voicemaxenergy" in features_to_compute:
conversation_features["conversation_rapids_voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].max()
conversation_features["voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].max()
conversation_features = conversation_features.reset_index()
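
Each sensed/expected fraction above follows one pattern: count the inference samples of a given class per local_segment, convert the count to minutes (the /60), and divide by a duration. The sketch below reproduces it on toy data; conv_totalDuration is assumed here to be total sensed minutes per segment (it is computed upstream of this hunk), and the inference codes 0-3 stand for silence, noise, voice and unknown, matching the equality checks above. The HH:MM-to-minutes conversion used by the time*conversation features is included at the end.

import pandas as pd

# Toy conversation samples: one row per inference sample, as in the hunk above.
conversation_data = pd.DataFrame({
    "local_segment": ["2020-01-01"] * 4 + ["2020-01-02"] * 2,
    "inference": [1, 1, 2, 0, 2, 3],
})

# Assumption: total sensed minutes per segment (one sample per minute, /60).
conv_totalDuration = conversation_data.groupby("local_segment")["inference"].count() / 60

# Same pattern as the diff: fraction of sensed time classified as voice (code 2).
voicesensedfraction = (
    conversation_data[conversation_data["inference"] == 2]
    .groupby("local_segment")["inference"].count() / 60
) / conv_totalDuration
print(voicesensedfraction.to_dict())  # {'2020-01-01': 0.25, '2020-01-02': 0.5}

# timefirstconversation/timelastconversation turn a local_time string into
# minutes past midnight exactly like this:
lasttime = "14:42:19"
print(int(lasttime.split(":")[0]) * 60 + int(lasttime.split(":")[1]))  # 882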


@@ -10,26 +10,26 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
# the subset of requested features this function can compute
features_to_compute = list(set(requested_features) & set(base_features_names))
light_features = pd.DataFrame(columns=["local_segment"] + ["light_rapids_" + x for x in features_to_compute])
light_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
if not light_data.empty:
light_data = filter_data_by_segment(light_data, day_segment)
if not light_data.empty:
light_features = pd.DataFrame()
if "count" in features_to_compute:
light_features["light_rapids_count"] = light_data.groupby(["local_segment"]).count()["timestamp"]
light_features["count"] = light_data.groupby(["local_segment"]).count()["timestamp"]
# get light ambient luminance related features
if "maxlux" in features_to_compute:
light_features["light_rapids_maxlux"] = light_data.groupby(["local_segment"])["double_light_lux"].max()
light_features["maxlux"] = light_data.groupby(["local_segment"])["double_light_lux"].max()
if "minlux" in features_to_compute:
light_features["light_rapids_minlux"] = light_data.groupby(["local_segment"])["double_light_lux"].min()
light_features["minlux"] = light_data.groupby(["local_segment"])["double_light_lux"].min()
if "avglux" in features_to_compute:
light_features["light_rapids_avglux"] = light_data.groupby(["local_segment"])["double_light_lux"].mean()
light_features["avglux"] = light_data.groupby(["local_segment"])["double_light_lux"].mean()
if "medianlux" in features_to_compute:
light_features["light_rapids_medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median()
light_features["medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median()
if "stdlux" in features_to_compute:
light_features["light_rapids_stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std()
light_features["stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std()
light_features = light_features.reset_index()
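
The lux features are plain per-segment aggregations of double_light_lux. As a sketch of the same computation (not what the commit ships), pandas named aggregation can produce all five in one pass on toy data:

import pandas as pd

light_data = pd.DataFrame({
    "local_segment": ["seg1", "seg1", "seg2"],
    "double_light_lux": [10.0, 30.0, 5.0],
    "timestamp": [1, 2, 3],
})

light_features = pd.DataFrame()
# count uses the timestamp column, as in the diff.
light_features["count"] = light_data.groupby("local_segment")["timestamp"].count()
# One .agg() call instead of five separate groupbys; names match the diff's.
light_features = light_features.join(
    light_data.groupby("local_segment")["double_light_lux"]
    .agg(maxlux="max", minlux="min", avglux="mean", medianlux="median", stdlux="std")
)
print(light_features)  # seg2's stdlux is NaN with a single sample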


@@ -26,12 +26,12 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg
if location_data.empty:
location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute])
location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
else:
location_data = filter_data_by_segment(location_data, day_segment)
if location_data.empty:
location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute])
location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
else:
location_features = pd.DataFrame()
@@ -40,7 +40,7 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg
if "minutesdataused" in features_to_compute:
for localDate in location_data["local_segment"].unique():
location_features.loc[localDate,"locations_doryab_minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])
location_features.loc[localDate,"minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])
location_features.index.name = 'local_segment'
@@ -52,10 +52,10 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg
return location_features
if "locationvariance" in features_to_compute:
location_features["locations_doryab_locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
location_features["locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
if "loglocationvariance" in features_to_compute:
location_features["locations_doryab_loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)
location_features["loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)
preComputedDistanceandSpeed = pd.DataFrame()
@@ -67,85 +67,85 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg
if "totaldistance" in features_to_compute:
for localDate in location_data['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]
location_features.loc[localDate,"totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]
if "averagespeed" in features_to_compute:
for localDate in location_data['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]
location_features.loc[localDate,"averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]
if "varspeed" in features_to_compute:
for localDate in location_data['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]
location_features.loc[localDate,"varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]
if "circadianmovement" in features_to_compute:
for localDate in location_data['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_circadianmovement"] = circadian_movement(location_data[location_data['local_segment']==localDate])
location_features.loc[localDate,"circadianmovement"] = circadian_movement(location_data[location_data['local_segment']==localDate])
newLocationData = cluster_and_label(location_data, eps= distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)
if "numberofsignificantplaces" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_segment']==localDate])
location_features.loc[localDate,"numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_segment']==localDate])
if "numberlocationtransitions" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_segment']==localDate])
location_features.loc[localDate,"numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_segment']==localDate])
if "radiusgyration" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
if "timeattop1location" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],1,sampling_frequency)
location_features.loc[localDate,"timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],1,sampling_frequency)
if "timeattop2location" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],2,sampling_frequency)
location_features.loc[localDate,"timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],2,sampling_frequency)
if "timeattop3location" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],3,sampling_frequency)
location_features.loc[localDate,"timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],3,sampling_frequency)
if "movingtostaticratio" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_movingtostaticratio"] = (newLocationData[newLocationData['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)
location_features.loc[localDate,"movingtostaticratio"] = (newLocationData[newLocationData['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)
if "outlierstimepercent" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
location_features.loc[localDate,"outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
preComputedmaxminCluster = pd.DataFrame()
for localDate in newLocationData['local_segment'].unique():
smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = smax
preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"] = smin
preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = sstd
preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = smean
preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"] = smax
preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"] = smin
preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"] = sstd
preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"] = smean
if "maxlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"]
location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"]
if "minlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"]
location_features.loc[localDate,"minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"]
if "stdlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"]
location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"]
if "meanlengthstayatclusters" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"]
location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"]
if "locationentropy" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_locationentropy"] = location_entropy(newLocationData[newLocationData['local_segment']==localDate])
location_features.loc[localDate,"locationentropy"] = location_entropy(newLocationData[newLocationData['local_segment']==localDate])
if "normalizedlocationentropy" in features_to_compute:
for localDate in newLocationData['local_segment'].unique():
location_features.loc[localDate,"locations_doryab_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_segment']==localDate])
location_features.loc[localDate,"normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_segment']==localDate])
location_features = location_features.reset_index()
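
The cluster-stay block above computes all four stay-length statistics once per segment into a helper frame, then copies out only the requested ones, so the expensive helper runs once no matter how many of the four features were asked for. A self-contained sketch: len_stay_at_clusters_in_minutes is stubbed with stand-in values, and its (max, min, std, mean) return order is only assumed from the unpacking in the hunk above.

import pandas as pd

def len_stay_at_clusters_in_minutes(segment_rows, sampling_frequency):
    # Stub for the diff's helper; the real version derives these from cluster labels.
    return 42.0, 3.0, 11.5, 20.0

segments = {"2020-01-01": None, "2020-01-02": None}  # toy segment -> rows

pre = pd.DataFrame()
for segment, rows in segments.items():
    smax, smin, sstd, smean = len_stay_at_clusters_in_minutes(rows, 1)
    pre.loc[segment, "maxlengthstayatclusters"] = smax
    pre.loc[segment, "minlengthstayatclusters"] = smin
    pre.loc[segment, "stdlengthstayatclusters"] = sstd
    pre.loc[segment, "meanlengthstayatclusters"] = smean

requested = ["maxlengthstayatclusters", "meanlengthstayatclusters"]
location_features = pre[requested].copy()  # only the requested stats are kept
print(location_features)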


@@ -5,23 +5,23 @@ def getEpisodeDurationFeatures(screen_data, day_segment, episode, features, refe
screen_data_episode = screen_data[screen_data["episode"] == episode]
duration_helper = pd.DataFrame()
if "countepisode" in features:
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].count().rename(columns = {"duration": "screen_rapids_countepisode" + episode})], axis = 1)
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].count().rename(columns = {"duration": "countepisode" + episode})], axis = 1)
if "sumduration" in features:
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].sum().rename(columns = {"duration": "screen_rapids_sumduration" + episode})], axis = 1)
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].sum().rename(columns = {"duration": "sumduration" + episode})], axis = 1)
if "maxduration" in features:
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].max().rename(columns = {"duration": "screen_rapids_maxduration" + episode})], axis = 1)
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].max().rename(columns = {"duration": "maxduration" + episode})], axis = 1)
if "minduration" in features:
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].min().rename(columns = {"duration": "screen_rapids_minduration" + episode})], axis = 1)
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].min().rename(columns = {"duration": "minduration" + episode})], axis = 1)
if "avgduration" in features:
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].mean().rename(columns = {"duration":"screen_rapids_avgduration" + episode})], axis = 1)
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].mean().rename(columns = {"duration":"avgduration" + episode})], axis = 1)
if "stdduration" in features:
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].std().rename(columns = {"duration":"screen_rapids_stdduration" + episode})], axis = 1)
duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].std().rename(columns = {"duration":"stdduration" + episode})], axis = 1)
if "firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) in features:
screen_data_episode_after_hour = screen_data_episode.copy()
screen_data_episode_after_hour["hour"] = pd.to_datetime(screen_data_episode["local_start_date_time"]).dt.hour
screen_data_episode_after_hour = screen_data_episode_after_hour[screen_data_episode_after_hour["hour"] >= reference_hour_first_use]
duration_helper = pd.concat([duration_helper, pd.DataFrame(screen_data_episode_after_hour.groupby(["local_segment"])[["local_start_date_time"]].min().local_start_date_time.apply(lambda x: (x.to_pydatetime().hour - reference_hour_first_use) * 60 + x.to_pydatetime().minute + (x.to_pydatetime().second / 60))).rename(columns = {"local_start_date_time":"screen_rapids_firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) + episode})], axis = 1)
duration_helper = pd.concat([duration_helper, pd.DataFrame(screen_data_episode_after_hour.groupby(["local_segment"])[["local_start_date_time"]].min().local_start_date_time.apply(lambda x: (x.to_pydatetime().hour - reference_hour_first_use) * 60 + x.to_pydatetime().minute + (x.to_pydatetime().second / 60))).rename(columns = {"local_start_date_time":"firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) + episode})], axis = 1)
return duration_helper
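
The first-use feature name embeds the reference hour zero-padded to two digits, using the same format string as above:

reference_hour_first_use = 7
feature_name = "firstuseafter" + "{0:0=2d}".format(reference_hour_first_use)
print(feature_name)  # firstuseafter07
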
@@ -45,7 +45,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
features_episodes_to_compute = ["firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) if feature_name == "firstuseafter" else feature_name for feature_name in features_episodes_to_compute]
features_to_compute = ["".join(feature) for feature in itertools.product(features_episodes_to_compute, episode_type_to_compute)]
screen_features = pd.DataFrame(columns=["local_segment"]+["screen_rapids_" + x for x in features_to_compute])
screen_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
if not screen_data.empty:
screen_data = filter_data_by_segment(screen_data, day_segment)
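
features_to_compute above is the cross product of every requested episode-level feature with every episode type. A sketch; "unlock" is an assumed example type here, since the real types come from the provider config, not from this diff:

import itertools

features_episodes_to_compute = ["countepisode", "sumduration", "firstuseafter07"]
episode_type_to_compute = ["unlock"]
features_to_compute = ["".join(pair) for pair in
                       itertools.product(features_episodes_to_compute, episode_type_to_compute)]
print(features_to_compute)
# ['countepisodeunlock', 'sumdurationunlock', 'firstuseafter07unlock']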


@@ -87,6 +87,9 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
for day_segment in day_segments_labels["label"]:
print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, day_segment))
features = feature_function(sensor_data_files, day_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes)
if not "local_segment" in features.columns:
raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check the provider script is using such function and is not removing 'local_segment' by accident (" + code_path + ")\n The 'local_segment' column is used to index a provider's features (each row corresponds to a different day segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_"+ provider_key + "_"), col) for col in features.columns]
sensor_features = sensor_features.merge(features, how="outer")
else:
for feature in provider["FEATURES"]:
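
The renaming line added in this hunk is the point of the whole commit: provider scripts now return plain feature names, and fetch_provider_features() prefixes each one with the sensor and provider keys in a single place, leaving local_segment untouched. Applied to toy light output:

import pandas as pd

sensor_key, provider_key = "light", "rapids"
features = pd.DataFrame(columns=["local_segment", "count", "maxlux"])
# Verbatim renaming expression from the hunk above:
features.columns = ["{}{}".format(
    "" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col)
    for col in features.columns]
print(list(features.columns))
# ['local_segment', 'light_rapids_count', 'light_rapids_maxlux']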