From 016bdbfe8c6c09dc0e45d3f73c2a54c956b3f966 Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Mon, 30 Nov 2020 14:42:19 -0500
Subject: [PATCH] Update Python feature scripts to add sensor and provider
 names automatically

---
 .../fitbit_heartrate_intraday/rapids/main.py  | 52 +++++------
 .../fitbit_heartrate_summary/rapids/main.py   | 46 +++++-----
 .../fitbit_sleep_summary/rapids/main.py       | 56 ++++++------
 .../fitbit_steps_intraday/rapids/main.py      | 42 ++++-----
 .../fitbit_steps_summary/rapids/main.py       | 34 +++----
 .../phone_accelerometer/panda/main.py         | 16 ++--
 .../phone_accelerometer/rapids/main.py        | 12 +--
 .../phone_activity_recognition/rapids/main.py | 88 ++-----------------
 .../rapids/main.py                            | 18 ++--
 src/features/phone_battery/rapids/main.py     | 14 +--
 .../phone_conversation/rapids/main.py         | 66 +++++++-------
 src/features/phone_light/rapids/main.py       | 14 +--
 src/features/phone_locations/doryab/main.py   | 54 ++++++------
 src/features/phone_screen/rapids/main.py      | 16 ++--
 src/features/utils/utils.py                   |  3 +
 15 files changed, 229 insertions(+), 302 deletions(-)

diff --git a/src/features/fitbit_heartrate_intraday/rapids/main.py b/src/features/fitbit_heartrate_intraday/rapids/main.py
index f5100b26..5224e0df 100644
--- a/src/features/fitbit_heartrate_intraday/rapids/main.py
+++ b/src/features/fitbit_heartrate_intraday/rapids/main.py
@@ -18,31 +18,31 @@ def statsFeatures(heartrate_data, features, features_type, heartrate_features):
     else:
         raise ValueError("features_type can only be one of ['hr', 'restinghr', 'caloriesoutofrange', 'caloriesfatburn', 'caloriescardio', 'caloriespeak'].")
 
-    if "intradaysum" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaysum" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].sum()
-    if "intradaymax" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaymax" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max()
-    if "intradaymin" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaymin" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
-    if "intradayavg" + features_type in features:
-        heartrate_features["heartrate_rapids_intradayavg" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].mean()
-    if "intradaymedian" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaymedian" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].median()
-    if "intradaymode" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaymode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0])
-    if "intradaystd" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaystd" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].std()
-    if "intradaydiffmaxmode" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaydiffmaxmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0])
-    if "intradaydiffminmode" + features_type in features:
-        heartrate_features["heartrate_rapids_intradaydiffminmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0]) - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
-    if "intradayentropy" + features_type in features:
-        heartrate_features["heartrate_rapids_intradayentropy" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(entropy)
+    if "sum" + features_type in features:
+        heartrate_features["sum" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].sum()
+    if "max" + features_type in features:
+        heartrate_features["max" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max()
+    if "min" + features_type in features:
+        heartrate_features["min" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
+    if "avg" + features_type in features:
+        heartrate_features["avg" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].mean()
+    if "median" + features_type in features:
+        heartrate_features["median" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].median()
+    if "mode" + features_type in features:
+        heartrate_features["mode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0])
+    if "std" + features_type in features:
+        heartrate_features["std" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].std()
+    if "diffmaxmode" + features_type in features:
+        heartrate_features["diffmaxmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0])
+    if "diffminmode" + features_type in features:
+        heartrate_features["diffminmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: pd.Series.mode(x)[0]) - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min()
+    if "entropy" + features_type in features:
+        heartrate_features["entropy" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(entropy)
 
     return heartrate_features
 
 def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, day_segment, filter_data_by_segment):
-    heartrate_intraday_features = pd.DataFrame(columns=["local_segment"] + ["heartrate_rapids_" + x for x in features])
+    heartrate_intraday_features = pd.DataFrame(columns=["local_segment"] + features)
     if not heartrate_intraday_data.empty:
         num_rows_per_minute = heartrate_intraday_data.groupby(["local_date", "local_hour", "local_minute"]).count().mean()["device_id"]
         heartrate_intraday_data = filter_data_by_segment(heartrate_intraday_data, day_segment)
@@ -54,10 +54,10 @@ def extractHRFeaturesFromIntradayData(heartrate_intraday_data, features, day_seg
             heartrate_intraday_features = statsFeatures(heartrate_intraday_data, features, "hr", heartrate_intraday_features)
 
             # get number of minutes in each heart rate zone
-            for feature_name in list(set(["intradayminutesonoutofrangezone", "intradayminutesonfatburnzone", "intradayminutesoncardiozone", "intradayminutesonpeakzone"]) & set(features)):
"intradayminutesonpeakzone"]) & set(features)): + for feature_name in list(set(["minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"]) & set(features)): heartrate_zone = heartrate_intraday_data[heartrate_intraday_data["heartrate_zone"] == feature_name[17:-4]] - heartrate_intraday_features["heartrate_rapids_" + feature_name] = heartrate_zone.groupby(["local_segment"])["device_id"].count() / num_rows_per_minute - heartrate_intraday_features.fillna(value={"heartrate_rapids_" + feature_name: 0}, inplace=True) + heartrate_intraday_features[feature_name] = heartrate_zone.groupby(["local_segment"])["device_id"].count() / num_rows_per_minute + heartrate_intraday_features.fillna(value={feature_name: 0}, inplace=True) heartrate_intraday_features.reset_index(inplace=True) return heartrate_intraday_features @@ -67,9 +67,9 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg heartrate_intraday_data = pd.read_csv(sensor_data_files["sensor_data"]) - requested_intraday_features = ["intraday" + x for x in provider["FEATURES"]] + requested_intraday_features = provider["FEATURES"] # name of the features this function can compute - base_intraday_features_names = ["intradaymaxhr", "intradayminhr", "intradayavghr", "intradaymedianhr", "intradaymodehr", "intradaystdhr", "intradaydiffmaxmodehr", "intradaydiffminmodehr", "intradayentropyhr", "intradayminutesonoutofrangezone", "intradayminutesonfatburnzone", "intradayminutesoncardiozone", "intradayminutesonpeakzone"] + base_intraday_features_names = ["maxhr", "minhr", "avghr", "medianhr", "modehr", "stdhr", "diffmaxmodehr", "diffminmodehr", "entropyhr", "minutesonoutofrangezone", "minutesonfatburnzone", "minutesoncardiozone", "minutesonpeakzone"] # the subset of requested features this function can compute intraday_features_to_compute = list(set(requested_intraday_features) & set(base_intraday_features_names)) diff --git a/src/features/fitbit_heartrate_summary/rapids/main.py b/src/features/fitbit_heartrate_summary/rapids/main.py index 8114047c..b07f6ec3 100644 --- a/src/features/fitbit_heartrate_summary/rapids/main.py +++ b/src/features/fitbit_heartrate_summary/rapids/main.py @@ -18,26 +18,26 @@ def statsFeatures(heartrate_data, features, features_type, heartrate_features): else: raise ValueError("features_type can only be one of ['hr', 'restinghr', 'caloriesoutofrange', 'caloriesfatburn', 'caloriescardio', 'caloriespeak'].") - if "summarysum" + features_type in features: - heartrate_features["heartrate_rapids_summarysum" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].sum() - if "summarymax" + features_type in features: - heartrate_features["heartrate_rapids_summarymax" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() - if "summarymin" + features_type in features: - heartrate_features["heartrate_rapids_summarymin" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min() - if "summaryavg" + features_type in features: - heartrate_features["heartrate_rapids_summaryavg" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].mean() - if "summarymedian" + features_type in features: - heartrate_features["heartrate_rapids_summarymedian" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].median() - if "summarymode" + features_type in features: - 
heartrate_features["heartrate_rapids_summarymode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0]) - if "summarystd" + features_type in features: - heartrate_features["heartrate_rapids_summarystd" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].std() - if "summarydiffmaxmode" + features_type in features: - heartrate_features["heartrate_rapids_summarydiffmaxmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0]) - if "summarydiffminmode" + features_type in features: - heartrate_features["heartrate_rapids_summarydiffminmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0]) - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min() - if "summaryentropy" + features_type in features: - heartrate_features["heartrate_rapids_summaryentropy" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(entropy) + if "sum" + features_type in features: + heartrate_features["sum" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].sum() + if "max" + features_type in features: + heartrate_features["max" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() + if "min" + features_type in features: + heartrate_features["min" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min() + if "avg" + features_type in features: + heartrate_features["avg" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].mean() + if "median" + features_type in features: + heartrate_features["median" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].median() + if "mode" + features_type in features: + heartrate_features["mode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0]) + if "std" + features_type in features: + heartrate_features["std" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].std() + if "diffmaxmode" + features_type in features: + heartrate_features["diffmaxmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].max() - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0]) + if "diffminmode" + features_type in features: + heartrate_features["diffminmode" + features_type] = heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].agg(lambda x: None if len(pd.Series.mode(x)) == 0 else pd.Series.mode(x)[0]) - heartrate_data[["local_segment", col_name]].groupby(["local_segment"])[col_name].min() + if "entropy" + features_type in features: + heartrate_features["entropy" + features_type] = heartrate_data[["local_segment", 
col_name]].groupby(["local_segment"])[col_name].agg(entropy) return heartrate_features @@ -63,14 +63,14 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg heartrate_summary_data = pd.read_csv(sensor_data_files["sensor_data"]) - requested_summary_features = ["summary" + x for x in provider["FEATURES"]] + requested_summary_features = provider["FEATURES"] # name of the features this function can compute - base_summary_features_names = ["summarymaxrestinghr", "summaryminrestinghr", "summaryavgrestinghr", "summarymedianrestinghr", "summarymoderestinghr", "summarystdrestinghr", "summarydiffmaxmoderestinghr", "summarydiffminmoderestinghr", "summaryentropyrestinghr", "summarysumcaloriesoutofrange", "summarymaxcaloriesoutofrange", "summarymincaloriesoutofrange", "summaryavgcaloriesoutofrange", "summarymediancaloriesoutofrange", "summarystdcaloriesoutofrange", "summaryentropycaloriesoutofrange", "summarysumcaloriesfatburn", "summarymaxcaloriesfatburn", "summarymincaloriesfatburn", "summaryavgcaloriesfatburn", "summarymediancaloriesfatburn", "summarystdcaloriesfatburn", "summaryentropycaloriesfatburn", "summarysumcaloriescardio", "summarymaxcaloriescardio", "summarymincaloriescardio", "summaryavgcaloriescardio", "summarymediancaloriescardio", "summarystdcaloriescardio", "summaryentropycaloriescardio", "summarysumcaloriespeak", "summarymaxcaloriespeak", "summarymincaloriespeak", "summaryavgcaloriespeak", "summarymediancaloriespeak", "summarystdcaloriespeak", "summaryentropycaloriespeak"] + base_summary_features_names = ["maxrestinghr", "minrestinghr", "avgrestinghr", "medianrestinghr", "moderestinghr", "stdrestinghr", "diffmaxmoderestinghr", "diffminmoderestinghr", "entropyrestinghr", "sumcaloriesoutofrange", "maxcaloriesoutofrange", "mincaloriesoutofrange", "avgcaloriesoutofrange", "mediancaloriesoutofrange", "stdcaloriesoutofrange", "entropycaloriesoutofrange", "sumcaloriesfatburn", "maxcaloriesfatburn", "mincaloriesfatburn", "avgcaloriesfatburn", "mediancaloriesfatburn", "stdcaloriesfatburn", "entropycaloriesfatburn", "sumcaloriescardio", "maxcaloriescardio", "mincaloriescardio", "avgcaloriescardio", "mediancaloriescardio", "stdcaloriescardio", "entropycaloriescardio", "sumcaloriespeak", "maxcaloriespeak", "mincaloriespeak", "avgcaloriespeak", "mediancaloriespeak", "stdcaloriespeak", "entropycaloriespeak"] # the subset of requested features this function can compute summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features_names)) # extract features from summary data - heartrate_summary_features = pd.DataFrame(columns=["local_segment"] + ["heartrate_rapids_" + x for x in summary_features_to_compute]) + heartrate_summary_features = pd.DataFrame(columns=["local_segment"] + summary_features_to_compute) if not heartrate_summary_data.empty: heartrate_summary_data = filter_data_by_segment(heartrate_summary_data, day_segment) diff --git a/src/features/fitbit_sleep_summary/rapids/main.py b/src/features/fitbit_sleep_summary/rapids/main.py index f42f750e..46ce2052 100644 --- a/src/features/fitbit_sleep_summary/rapids/main.py +++ b/src/features/fitbit_sleep_summary/rapids/main.py @@ -13,36 +13,36 @@ def extractSleepFeaturesFromSummaryData(sleep_summary_data, summary_features, sl features_sum = sleep_summary_data[["local_segment", "minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed"]].groupby(["local_segment"]).sum() - if "summarysumdurationafterwakeup" in summary_features: - 
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "sleep_rapids_summarysumdurationafterwakeup" + sleep_type})
-    if "summarysumdurationasleep" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "sleep_rapids_summarysumdurationasleep" + sleep_type})
-    if "summarysumdurationawake" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "sleep_rapids_summarysumdurationawake" + sleep_type})
-    if "summarysumdurationtofallasleep" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "sleep_rapids_summarysumdurationtofallasleep" + sleep_type})
-    if "summarysumdurationinbed" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "sleep_rapids_summarysumdurationinbed" + sleep_type})
+    if "sumdurationafterwakeup" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "sumdurationafterwakeup" + sleep_type})
+    if "sumdurationasleep" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "sumdurationasleep" + sleep_type})
+    if "sumdurationawake" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "sumdurationawake" + sleep_type})
+    if "sumdurationtofallasleep" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "sumdurationtofallasleep" + sleep_type})
+    if "sumdurationinbed" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_sum[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "sumdurationinbed" + sleep_type})
 
     features_avg = sleep_summary_data[["local_segment", "efficiency", "minutes_after_wakeup", "minutes_asleep", "minutes_awake", "minutes_to_fall_asleep", "minutes_in_bed"]].groupby(["local_segment"]).mean()
 
-    if "summaryavgefficiency" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["efficiency"]], how="outer").rename(columns={"efficiency": "sleep_rapids_summaryavgefficiency" + sleep_type})
-    if "summaryavgdurationafterwakeup" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "sleep_rapids_summaryavgdurationafterwakeup" + sleep_type})
-    if "summaryavgdurationasleep" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "sleep_rapids_summaryavgdurationasleep" + sleep_type})
-    if "summaryavgdurationawake" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "sleep_rapids_summaryavgdurationawake" + sleep_type})
-    if "summaryavgdurationtofallasleep" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "sleep_rapids_summaryavgdurationtofallasleep" + sleep_type})
-    if "summaryavgdurationinbed" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "sleep_rapids_summaryavgdurationinbed" + sleep_type})
+    if "avgefficiency" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["efficiency"]], how="outer").rename(columns={"efficiency": "avgefficiency" + sleep_type})
+    if "avgdurationafterwakeup" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_after_wakeup"]], how="outer").rename(columns={"minutes_after_wakeup": "avgdurationafterwakeup" + sleep_type})
+    if "avgdurationasleep" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_asleep"]], how="outer").rename(columns={"minutes_asleep": "avgdurationasleep" + sleep_type})
+    if "avgdurationawake" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_awake"]], how="outer").rename(columns={"minutes_awake": "avgdurationawake" + sleep_type})
+    if "avgdurationtofallasleep" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_to_fall_asleep"]], how="outer").rename(columns={"minutes_to_fall_asleep": "avgdurationtofallasleep" + sleep_type})
+    if "avgdurationinbed" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_avg[["minutes_in_bed"]], how="outer").rename(columns={"minutes_in_bed": "avgdurationinbed" + sleep_type})
 
     features_count = sleep_summary_data[["local_segment", "timestamp"]].groupby(["local_segment"]).count()
 
-    if "summarycountepisode" in summary_features:
-        sleep_summary_features = sleep_summary_features.join(features_count[["timestamp"]], how="outer").rename(columns={"timestamp": "sleep_rapids_summarycountepisode" + sleep_type})
+    if "countepisode" in summary_features:
+        sleep_summary_features = sleep_summary_features.join(features_count[["timestamp"]], how="outer").rename(columns={"timestamp": "countepisode" + sleep_type})
 
     return sleep_summary_features
 
@@ -51,11 +51,11 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
 
     sleep_summary_data = pd.read_csv(sensor_data_files["sensor_data"])
 
-    requested_summary_features = ["summary" + x for x in provider["FEATURES"]]
+    requested_summary_features = provider["FEATURES"]
     requested_sleep_types = provider["SLEEP_TYPES"]
 
     # name of the features this function can compute
-    base_summary_features = ["summarycountepisode", "summaryavgefficiency", "summarysumdurationafterwakeup", "summarysumdurationasleep", "summarysumdurationawake", "summarysumdurationtofallasleep", "summarysumdurationinbed", "summaryavgdurationafterwakeup", "summaryavgdurationasleep", "summaryavgdurationawake", "summaryavgdurationtofallasleep", "summaryavgdurationinbed"]
+    base_summary_features = ["countepisode", "avgefficiency", "sumdurationafterwakeup", "sumdurationasleep", "sumdurationawake", "sumdurationtofallasleep", "sumdurationinbed", "avgdurationafterwakeup", "avgdurationasleep", "avgdurationawake", "avgdurationtofallasleep", "avgdurationinbed"]
     base_sleep_types = ["main", "nap", "all"]
     # the subset of requested features this function can compute
list(set(requested_summary_features) & set(base_summary_features)) @@ -63,10 +63,10 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg # full names features_fullnames_to_compute = ["".join(feature) for feature in itertools.product(summary_features_to_compute, sleep_types_to_compute)] - colnames_can_be_zero = ["sleep_rapids_" + x for x in [col for col in features_fullnames_to_compute if "summaryavgefficiency" not in col]] + colnames_can_be_zero = [col for col in features_fullnames_to_compute if "avgefficiency" not in col] # extract features from summary data - sleep_summary_features = pd.DataFrame(columns=["local_segment"] + ["sleep_rapids_" + x for x in features_fullnames_to_compute]) + sleep_summary_features = pd.DataFrame(columns=["local_segment"] + features_fullnames_to_compute) if not sleep_summary_data.empty: sleep_summary_data = filter_data_by_segment(sleep_summary_data, day_segment) diff --git a/src/features/fitbit_steps_intraday/rapids/main.py b/src/features/fitbit_steps_intraday/rapids/main.py index 8bb5013b..8e508d4e 100644 --- a/src/features/fitbit_steps_intraday/rapids/main.py +++ b/src/features/fitbit_steps_intraday/rapids/main.py @@ -9,20 +9,20 @@ def statsFeatures(steps_data, features_to_compute, features_type, steps_features else: raise ValueError("features_type can only be one of ['steps', 'sumsteps', 'durationsedentarybout', 'durationactivebout'].") - if ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode") in features_to_compute: - steps_features["steps_rapids_" + ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode")] = steps_data.groupby(["local_segment"])[col_name].count() - if ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type in features_to_compute: - steps_features["steps_rapids_" + ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type] = steps_data.groupby(["local_segment"])[col_name].sum() - if ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type in features_to_compute: - steps_features["steps_rapids_" + ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type] = steps_data.groupby(["local_segment"])[col_name].max() - if ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type in features_to_compute: - steps_features["steps_rapids_" + ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type] = steps_data.groupby(["local_segment"])[col_name].min() - if ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type in features_to_compute: - steps_features["steps_rapids_" + ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type] = steps_data.groupby(["local_segment"])[col_name].mean() - if ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type in features_to_compute: - steps_features["steps_rapids_" + ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type] = steps_data.groupby(["local_segment"])[col_name].median() - if ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type in features_to_compute: - steps_features["steps_rapids_" + ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type] = steps_data.groupby(["local_segment"])[col_name].std() + if "count" + features_type.replace("duration", "episode") in 
+        steps_features["count" + features_type.replace("duration", "episode")] = steps_data.groupby(["local_segment"])[col_name].count()
+    if "sum" + features_type in features_to_compute:
+        steps_features["sum" + features_type] = steps_data.groupby(["local_segment"])[col_name].sum()
+    if "max" + features_type in features_to_compute:
+        steps_features["max" + features_type] = steps_data.groupby(["local_segment"])[col_name].max()
+    if "min" + features_type in features_to_compute:
+        steps_features["min" + features_type] = steps_data.groupby(["local_segment"])[col_name].min()
+    if "avg" + features_type in features_to_compute:
+        steps_features["avg" + features_type] = steps_data.groupby(["local_segment"])[col_name].mean()
+    if "median" + features_type in features_to_compute:
+        steps_features["median" + features_type] = steps_data.groupby(["local_segment"])[col_name].median()
+    if "std" + features_type in features_to_compute:
+        steps_features["std" + features_type] = steps_data.groupby(["local_segment"])[col_name].std()
 
     return steps_features
 
@@ -73,13 +73,13 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
 
     requested_intraday_features = provider["FEATURES"]
 
-    requested_intraday_features_steps = ["intraday" + x + "steps" for x in requested_intraday_features["STEPS"]]
-    requested_intraday_features_sedentarybout = ["intraday" + x + "sedentarybout" for x in requested_intraday_features["SEDENTARY_BOUT"]]
-    requested_intraday_features_activebout = ["intraday" + x + "activebout" for x in requested_intraday_features["ACTIVE_BOUT"]]
+    requested_intraday_features_steps = [x + "steps" for x in requested_intraday_features["STEPS"]]
+    requested_intraday_features_sedentarybout = [x + "sedentarybout" for x in requested_intraday_features["SEDENTARY_BOUT"]]
+    requested_intraday_features_activebout = [x + "activebout" for x in requested_intraday_features["ACTIVE_BOUT"]]
     # name of the features this function can compute
-    base_intraday_features_steps = ["intradaysumsteps", "intradaymaxsteps", "intradayminsteps", "intradayavgsteps", "intradaystdsteps"]
-    base_intraday_features_sedentarybout = ["intradaycountepisodesedentarybout", "intradaysumdurationsedentarybout", "intradaymaxdurationsedentarybout", "intradaymindurationsedentarybout", "intradayavgdurationsedentarybout", "intradaystddurationsedentarybout"]
-    base_intraday_features_activebout = ["intradaycountepisodeactivebout", "intradaysumdurationactivebout", "intradaymaxdurationactivebout", "intradaymindurationactivebout", "intradayavgdurationactivebout", "intradaystddurationactivebout"]
+    base_intraday_features_steps = ["sumsteps", "maxsteps", "minsteps", "avgsteps", "stdsteps"]
+    base_intraday_features_sedentarybout = ["countepisodesedentarybout", "sumdurationsedentarybout", "maxdurationsedentarybout", "mindurationsedentarybout", "avgdurationsedentarybout", "stddurationsedentarybout"]
+    base_intraday_features_activebout = ["countepisodeactivebout", "sumdurationactivebout", "maxdurationactivebout", "mindurationactivebout", "avgdurationactivebout", "stddurationactivebout"]
     # the subset of requested features this function can compute
     intraday_features_to_compute_steps = list(set(requested_intraday_features_steps) & set(base_intraday_features_steps))
     intraday_features_to_compute_sedentarybout = list(set(requested_intraday_features_sedentarybout) & set(base_intraday_features_sedentarybout))
@@ -88,7 +88,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     intraday_features_to_compute = intraday_features_to_compute_steps + intraday_features_to_compute_sedentarybout + intraday_features_to_compute_activebout
 
     # extract features from intraday features
-    steps_intraday_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in intraday_features_to_compute])
+    steps_intraday_features = pd.DataFrame(columns=["local_segment"] + intraday_features_to_compute)
     if not steps_intraday_data.empty:
         steps_intraday_data = filter_data_by_segment(steps_intraday_data, day_segment)
 
diff --git a/src/features/fitbit_steps_summary/rapids/main.py b/src/features/fitbit_steps_summary/rapids/main.py
index 25953f34..e49a895f 100644
--- a/src/features/fitbit_steps_summary/rapids/main.py
+++ b/src/features/fitbit_steps_summary/rapids/main.py
@@ -9,20 +9,20 @@ def statsFeatures(steps_data, features_to_compute, features_type, steps_features
     else:
         raise ValueError("features_type can only be one of ['steps', 'sumsteps', 'durationsedentarybout', 'durationactivebout'].")
 
-    if ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode") in features_to_compute:
-        steps_features["steps_rapids_" + ("summarycount" if features_type == "sumsteps" else "intradaycount") + features_type.replace("duration", "episode")] = steps_data.groupby(["local_segment"])[col_name].count()
-    if ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarysum" if features_type == "sumsteps" else "intradaysum") + features_type] = steps_data.groupby(["local_segment"])[col_name].sum()
-    if ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarymax" if features_type == "sumsteps" else "intradaymax") + features_type] = steps_data.groupby(["local_segment"])[col_name].max()
-    if ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarymin" if features_type == "sumsteps" else "intradaymin") + features_type] = steps_data.groupby(["local_segment"])[col_name].min()
-    if ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summaryavg" if features_type == "sumsteps" else "intradayavg") + features_type] = steps_data.groupby(["local_segment"])[col_name].mean()
-    if ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarymedian" if features_type == "sumsteps" else "intradaymedian") + features_type] = steps_data.groupby(["local_segment"])[col_name].median()
-    if ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type in features_to_compute:
-        steps_features["steps_rapids_" + ("summarystd" if features_type == "sumsteps" else "intradaystd") + features_type] = steps_data.groupby(["local_segment"])[col_name].std()
+    if "count" + features_type.replace("duration", "episode") in features_to_compute:
+        steps_features["count" + features_type.replace("duration", "episode")] = steps_data.groupby(["local_segment"])[col_name].count()
+    if "sum" + features_type in features_to_compute:
+        steps_features["sum" + features_type] = steps_data.groupby(["local_segment"])[col_name].sum()
+    if "max" + features_type in features_to_compute:
+        steps_features["max" + features_type] = steps_data.groupby(["local_segment"])[col_name].max()
+    if "min" + features_type in features_to_compute:
+        steps_features["min" + features_type] = steps_data.groupby(["local_segment"])[col_name].min()
+    if "avg" + features_type in features_to_compute:
+        steps_features["avg" + features_type] = steps_data.groupby(["local_segment"])[col_name].mean()
+    if "median" + features_type in features_to_compute:
+        steps_features["median" + features_type] = steps_data.groupby(["local_segment"])[col_name].median()
+    if "std" + features_type in features_to_compute:
+        steps_features["std" + features_type] = steps_data.groupby(["local_segment"])[col_name].std()
 
     return steps_features
 
@@ -41,15 +41,15 @@ def extractStepsFeaturesFromSummaryData(steps_summary_data, summary_features_to_
 
 def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_segment, *args, **kwargs):
 
     steps_summary_data = pd.read_csv(sensor_data_files["sensor_data"])
 
-    requested_summary_features = ["summary" + x for x in provider["FEATURES"]]
+    requested_summary_features = provider["FEATURES"]
     # name of the features this function can compute
-    base_summary_features = ["summarymaxsumsteps", "summaryminsumsteps", "summaryavgsumsteps", "summarymediansumsteps", "summarystdsumsteps"]
+    base_summary_features = ["maxsumsteps", "minsumsteps", "avgsumsteps", "mediansumsteps", "stdsumsteps"]
     # the subset of requested features this function can compute
     summary_features_to_compute = list(set(requested_summary_features) & set(base_summary_features))
 
     # extract features from summary data
-    steps_summary_features = pd.DataFrame(columns=["local_segment"] + ["steps_rapids_" + x for x in summary_features_to_compute])
+    steps_summary_features = pd.DataFrame(columns=["local_segment"] + summary_features_to_compute)
     if not steps_summary_data.empty:
         steps_summary_data = filter_data_by_segment(steps_summary_data, day_segment)
 
diff --git a/src/features/phone_accelerometer/panda/main.py b/src/features/phone_accelerometer/panda/main.py
index 0e6a50cd..1b1139df 100644
--- a/src/features/phone_accelerometer/panda/main.py
+++ b/src/features/phone_accelerometer/panda/main.py
@@ -26,17 +26,17 @@ def getActivityEpisodes(acc_minute):
 
 
 def statsFeatures(acc_data, features_to_compute, features_type, acc_features):
     if "sum" + features_type in features_to_compute:
-        acc_features["acc_panda_sum" + features_type] = acc_data.groupby(["local_segment"])["duration"].sum()
+        acc_features["sum" + features_type] = acc_data.groupby(["local_segment"])["duration"].sum()
     if "max" + features_type in features_to_compute:
-        acc_features["acc_panda_max" + features_type] = acc_data.groupby(["local_segment"])["duration"].max()
+        acc_features["max" + features_type] = acc_data.groupby(["local_segment"])["duration"].max()
     if "min" + features_type in features_to_compute:
-        acc_features["acc_panda_min" + features_type] = acc_data.groupby(["local_segment"])["duration"].min()
+        acc_features["min" + features_type] = acc_data.groupby(["local_segment"])["duration"].min()
     if "avg" + features_type in features_to_compute:
-        acc_features["acc_panda_avg" + features_type] = acc_data.groupby(["local_segment"])["duration"].mean()
+        acc_features["avg" + features_type] = acc_data.groupby(["local_segment"])["duration"].mean()
     if "median" + features_type in features_to_compute:
-        acc_features["acc_panda_median" + features_type] = acc_data.groupby(["local_segment"])["duration"].median()
+        acc_features["median" + features_type] = acc_data.groupby(["local_segment"])["duration"].median()
     if "std" + features_type in features_to_compute:
-        acc_features["acc_panda_std" + features_type] = acc_data.groupby(["local_segment"])["duration"].std()
+        acc_features["std" + features_type] = acc_data.groupby(["local_segment"])["duration"].std()
 
     return acc_features
 
@@ -56,7 +56,7 @@ def panda_features(sensor_data_files, day_segment, provider, filter_data_by_segm
 
     features_to_compute = features_to_compute_exertionalactivityepisode + features_to_compute_nonexertionalactivityepisode + (["validsensedminutes"] if valid_sensed_minutes else [])
 
-    acc_features = pd.DataFrame(columns=["local_segment"] + ["acc_panda_" + x for x in features_to_compute])
+    acc_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not acc_data.empty:
         acc_data = filter_data_by_segment(acc_data, day_segment)
 
@@ -72,7 +72,7 @@ def panda_features(sensor_data_files, day_segment, provider, filter_data_by_segm
             acc_minute.reset_index(inplace=True)
 
             if valid_sensed_minutes:
-                acc_features["acc_panda_validsensedminutes"] = acc_minute.groupby(["local_segment"])["isexertionalactivity"].count()
+                acc_features["validsensedminutes"] = acc_minute.groupby(["local_segment"])["isexertionalactivity"].count()
 
             activity_episodes = getActivityEpisodes(acc_minute)
             # compute exertional episodes features
diff --git a/src/features/phone_accelerometer/rapids/main.py b/src/features/phone_accelerometer/rapids/main.py
index 09920343..dc197066 100644
--- a/src/features/phone_accelerometer/rapids/main.py
+++ b/src/features/phone_accelerometer/rapids/main.py
@@ -10,7 +10,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))
 
-    acc_features = pd.DataFrame(columns=["local_segment"] + ["acc_rapids_" + x for x in features_to_compute])
+    acc_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not acc_data.empty:
         acc_data = filter_data_by_segment(acc_data, day_segment)
 
@@ -21,15 +21,15 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
             acc_data = acc_data.assign(magnitude = magnitude.values)
 
             if "maxmagnitude" in features_to_compute:
-                acc_features["acc_rapids_maxmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].max()
+                acc_features["maxmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].max()
             if "minmagnitude" in features_to_compute:
-                acc_features["acc_rapids_minmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].min()
+                acc_features["minmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].min()
             if "avgmagnitude" in features_to_compute:
-                acc_features["acc_rapids_avgmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].mean()
+                acc_features["avgmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].mean()
             if "medianmagnitude" in features_to_compute:
-                acc_features["acc_rapids_medianmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].median()
+                acc_features["medianmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].median()
             if "stdmagnitude" in features_to_compute:
-                acc_features["acc_rapids_stdmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].std()
+                acc_features["stdmagnitude"] = acc_data.groupby(["local_segment"])["magnitude"].std()
 
     acc_features = acc_features.reset_index()
 
diff --git a/src/features/phone_activity_recognition/rapids/main.py b/src/features/phone_activity_recognition/rapids/main.py
index 6d838325..099562fd 100644
--- a/src/features/phone_activity_recognition/rapids/main.py
+++ b/src/features/phone_activity_recognition/rapids/main.py
@@ -12,7 +12,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     requested_features = provider["FEATURES"]
     features_to_compute = list(set(requested_features) & set(base_features_names))
 
-    ar_features = pd.DataFrame(columns=["local_segment"] + ["ar_rapids_" + x for x in features_to_compute])
+    ar_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not ar_episodes.empty:
         ar_episodes = filter_data_by_segment(ar_episodes, day_segment)
 
@@ -20,98 +20,22 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
             ar_features = pd.DataFrame()
 
             if "count" in features_to_compute:
-                ar_features["ar_rapids_count"] = ar_episodes.groupby(["local_segment"]).count()["episode_id"]
+                ar_features["count"] = ar_episodes.groupby(["local_segment"]).count()["episode_id"]
             if "mostcommonactivity" in features_to_compute:
-                ar_features["ar_rapids_mostcommonactivity"] = ar_episodes.groupby(["local_segment"])["activity_type"].agg(lambda x: pd.Series.mode(x)[0])
+                ar_features["mostcommonactivity"] = ar_episodes.groupby(["local_segment"])["activity_type"].agg(lambda x: pd.Series.mode(x)[0])
             if "countuniqueactivities" in features_to_compute:
-                ar_features["ar_rapids_countuniqueactivities"] = ar_episodes.groupby(["local_segment"])["activity_type"].nunique()
+                ar_features["countuniqueactivities"] = ar_episodes.groupby(["local_segment"])["activity_type"].nunique()
 
             # duration features
             for column, activity_labels in activity_classes.items():
                 if "duration" + column.lower() in features_to_compute:
                     filtered_data = ar_episodes[ar_episodes["activity_name"].isin(pd.Series(activity_labels))]
                     if not filtered_data.empty:
-                        ar_features["ar_rapids_duration" + column.lower()] = ar_episodes[ar_episodes["activity_name"].isin(pd.Series(activity_labels))].groupby(["local_segment"])["duration"].sum().fillna(0)
+                        ar_features["duration" + column.lower()] = ar_episodes[ar_episodes["activity_name"].isin(pd.Series(activity_labels))].groupby(["local_segment"])["duration"].sum().fillna(0)
                     else:
-                        ar_features["ar_rapids_duration" + column.lower()] = 0
+                        ar_features["duration" + column.lower()] = 0
 
             ar_features.index.names = ["local_segment"]
             ar_features = ar_features.reset_index()
 
     return ar_features
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    """
-
-    if not ar_data.empty:
-        ar_data = filter_data_by_segment(ar_data, day_segment)
-
-        if not ar_data.empty:
-            # chunk_episodes
-            ar_data = chunk_episodes(ar_data)
-
-        if not ar_data.empty:
-
-            ar_data["episode_id"] = ((ar_data.ar_status != ar_data.ar_status.shift()) | (ar_data.start_timestamp - ar_data.end_timestamp.shift() > 1)).cumsum()
-            grouped = ar_data.groupby(by=["local_segment", "episode_id", "ar_status"])
-            ar_episodes= grouped[["duration"]].sum()
-            ar_episodes["ar_diff"] = grouped["ar_level"].first() - grouped["ar_level"].last()
-            ar_episodes["ar_consumption_rate"] = ar_episodes["ar_diff"] / ar_episodes["duration"]
-            ar_episodes.reset_index(inplace=True)
-
-            # for discharge episodes
-            ar_discharge_episodes = ar_episodes[(ar_episodes["ar_status"] == 3) | (ar_episodes["ar_status"] == 4)]
-            ar_discharge_features = pd.DataFrame()
-            if "countdischarge" in features_to_compute:
-                ar_discharge_features["ar_rapids_countdischarge"] = ar_discharge_episodes.groupby(["local_segment"])["episode_id"].count()
-            if "sumdurationdischarge" in features_to_compute:
-                ar_discharge_features["ar_rapids_sumdurationdischarge"] = ar_discharge_episodes.groupby(["local_segment"])["duration"].sum()
-            if "avgconsumptionrate" in features_to_compute:
-                ar_discharge_features["ar_rapids_avgconsumptionrate"] = ar_discharge_episodes.groupby(["local_segment"])["ar_consumption_rate"].mean()
-            if "maxconsumptionrate" in features_to_compute:
-                ar_discharge_features["ar_rapids_maxconsumptionrate"] = ar_discharge_episodes.groupby(["local_segment"])["ar_consumption_rate"].max()
-
-            # for charge episodes
-            ar_charge_episodes = ar_episodes[(ar_episodes["ar_status"] == 2) | (ar_episodes["ar_status"] == 5)]
-            ar_charge_features = pd.DataFrame()
-            if "countcharge" in features_to_compute:
-                ar_charge_features["ar_rapids_countcharge"] = ar_charge_episodes.groupby(["local_segment"])["episode_id"].count()
-            if "sumdurationcharge" in features_to_compute:
-                ar_charge_features["ar_rapids_sumdurationcharge"] = ar_charge_episodes.groupby(["local_segment"])["duration"].sum()
-
-            # combine discharge features and charge features; fill the missing values with ZERO
-            ar_features = pd.concat([ar_discharge_features, ar_charge_features], axis=1, sort=True).fillna(0)
-
-            ar_features.index.rename("local_segment", inplace=True)
-            ar_features = ar_features.reset_index()
-
-    return ar_features
-    """
\ No newline at end of file
diff --git a/src/features/phone_applications_foreground/rapids/main.py b/src/features/phone_applications_foreground/rapids/main.py
index 6ec2516f..305e07fc 100644
--- a/src/features/phone_applications_foreground/rapids/main.py
+++ b/src/features/phone_applications_foreground/rapids/main.py
@@ -9,24 +9,24 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features
     if "timeoffirstuse" in requested_features:
         time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
         if time_first_event.empty:
-            apps_features["apps_rapids_timeoffirstuse" + apps_type] = np.nan
+            apps_features["timeoffirstuse" + apps_type] = np.nan
         else:
-            apps_features["apps_rapids_timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
+            apps_features["timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
     if "timeoflastuse" in requested_features:
         time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
         if time_last_event.empty:
-            apps_features["apps_rapids_timeoflastuse" + apps_type] = np.nan
+            apps_features["timeoflastuse" + apps_type] = np.nan
         else:
-            apps_features["apps_rapids_timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
+            apps_features["timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
     if "frequencyentropy" in requested_features:
         apps_with_count = filtered_data.groupby(["local_segment","application_name"]).count().sort_values(by="timestamp", ascending=False).reset_index()
         if (len(apps_with_count.index) < 2 ):
-            apps_features["apps_rapids_frequencyentropy" + apps_type] = np.nan
+            apps_features["frequencyentropy" + apps_type] = np.nan
         else:
-            apps_features["apps_rapids_frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy)
+            apps_features["frequencyentropy" + apps_type] = apps_with_count.groupby("local_segment")["timestamp"].agg(entropy)
     if "count" in requested_features:
-        apps_features["apps_rapids_count" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"]
-        apps_features.fillna(value={"apps_rapids_count" + apps_type: 0}, inplace=True)
+        apps_features["count" + apps_type] = filtered_data.groupby(["local_segment"]).count()["timestamp"]
+        apps_features.fillna(value={"count" + apps_type: 0}, inplace=True)
 
     return apps_features
 
@@ -53,7 +53,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     # exclude apps in the excluded_apps list
     apps_data = apps_data[~apps_data["package_name"].isin(excluded_apps)]
 
-    apps_features = pd.DataFrame(columns=["local_segment"] + ["apps_rapids_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)]])
+    apps_features = pd.DataFrame(columns=["local_segment"] + ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + single_apps)])
     if not apps_data.empty:
         # deep copy the apps_data for the top1global computation
         apps_data_global = apps_data.copy()
 
diff --git a/src/features/phone_battery/rapids/main.py b/src/features/phone_battery/rapids/main.py
index 67294b86..20702571 100644
--- a/src/features/phone_battery/rapids/main.py
+++ b/src/features/phone_battery/rapids/main.py
@@ -11,7 +11,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     requested_features = provider["FEATURES"]
     features_to_compute = list(set(requested_features) & set(base_features_names))
 
-    battery_features = pd.DataFrame(columns=["local_segment"] + ["battery_rapids_" + x for x in features_to_compute])
+    battery_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not battery_data.empty:
         battery_data = filter_data_by_segment(battery_data, day_segment)
 
@@ -28,21 +28,21 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
             battery_discharge_episodes = battery_episodes[(battery_episodes["battery_status"] == 3) | (battery_episodes["battery_status"] == 4)]
             battery_discharge_features = pd.DataFrame()
             if "countdischarge" in features_to_compute:
-                battery_discharge_features["battery_rapids_countdischarge"] = battery_discharge_episodes.groupby(["local_segment"])["episode_id"].count()
+                battery_discharge_features["countdischarge"] = battery_discharge_episodes.groupby(["local_segment"])["episode_id"].count()
             if "sumdurationdischarge" in features_to_compute:
-                battery_discharge_features["battery_rapids_sumdurationdischarge"] = battery_discharge_episodes.groupby(["local_segment"])["duration"].sum()
+                battery_discharge_features["sumdurationdischarge"] = battery_discharge_episodes.groupby(["local_segment"])["duration"].sum()
             if "avgconsumptionrate" in features_to_compute:
-                battery_discharge_features["battery_rapids_avgconsumptionrate"] = battery_discharge_episodes.groupby(["local_segment"])["battery_consumption_rate"].mean()
+                battery_discharge_features["avgconsumptionrate"] = battery_discharge_episodes.groupby(["local_segment"])["battery_consumption_rate"].mean()
             if "maxconsumptionrate" in features_to_compute:
-                battery_discharge_features["battery_rapids_maxconsumptionrate"] = battery_discharge_episodes.groupby(["local_segment"])["battery_consumption_rate"].max()
+                battery_discharge_features["maxconsumptionrate"] = battery_discharge_episodes.groupby(["local_segment"])["battery_consumption_rate"].max()
 
             # for charge episodes
             battery_charge_episodes = battery_episodes[(battery_episodes["battery_status"] == 2) | (battery_episodes["battery_status"] == 5)]
             battery_charge_features = pd.DataFrame()
             if "countcharge" in features_to_compute:
-                battery_charge_features["battery_rapids_countcharge"] = battery_charge_episodes.groupby(["local_segment"])["episode_id"].count()
+                battery_charge_features["countcharge"] = battery_charge_episodes.groupby(["local_segment"])["episode_id"].count()
             if "sumdurationcharge" in features_to_compute:
-                battery_charge_features["battery_rapids_sumdurationcharge"] = battery_charge_episodes.groupby(["local_segment"])["duration"].sum()
+                battery_charge_features["sumdurationcharge"] = battery_charge_episodes.groupby(["local_segment"])["duration"].sum()
 
             # combine discharge features and charge features; fill the missing values with ZERO
             battery_features = pd.concat([battery_discharge_features, battery_charge_features], axis=1, sort=True).fillna(0)
 
diff --git a/src/features/phone_conversation/rapids/main.py b/src/features/phone_conversation/rapids/main.py
index 52112bd3..f04e9c68 100644
--- a/src/features/phone_conversation/rapids/main.py
+++ b/src/features/phone_conversation/rapids/main.py
@@ -21,7 +21,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))
 
-    conversation_features = pd.DataFrame(columns=["local_segment"] + ["conversation_rapids_" + x for x in features_to_compute])
+    conversation_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not conversation_data.empty:
         conversation_data = filter_data_by_segment(conversation_data, day_segment)
 
@@ -31,19 +31,19 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
             conversation_data = conversation_data.drop_duplicates(subset=["local_date", "local_time"], keep="first")
 
             if "minutessilence" in features_to_compute:
-                conversation_features["conversation_rapids_minutessilence"] = conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60
+                conversation_features["minutessilence"] = conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60
 
             if "minutesnoise" in features_to_compute:
-                conversation_features["conversation_rapids_minutesnoise"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60
+                conversation_features["minutesnoise"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60
 
             if "minutesvoice" in features_to_compute:
-                conversation_features["conversation_rapids_minutesvoice"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60
+                conversation_features["minutesvoice"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60
 
             if "minutesunknown" in features_to_compute:
-                conversation_features["conversation_rapids_minutesunknown"] = conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60
+                conversation_features["minutesunknown"] = conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60
 
             if "countconversation" in features_to_compute:
-                conversation_features["conversation_rapids_countconversation"] = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['double_convo_start'].nunique()
+                conversation_features["countconversation"] = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['double_convo_start'].nunique()
 
             conv_duration = (conversation_data['double_convo_end']/1000 - conversation_data['double_convo_start']/1000)/60
             conversation_data = conversation_data.assign(conv_duration = conv_duration.values)
 
@@ -51,43 +51,43 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
             conv_totalDuration = conversation_data[(conversation_data['inference'] >= 0) & (conversation_data['inference'] < 4)].groupby(["local_segment"])['inference'].count()/60
 
             if "silencesensedfraction" in features_to_compute:
-                conversation_features["conversation_rapids_silencesensedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
+                conversation_features["silencesensedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
 
             if "noisesensedfraction" in features_to_compute:
-                conversation_features["conversation_rapids_noisesensedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
+                conversation_features["noisesensedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
 
             if "voicesensedfraction" in features_to_compute:
-                conversation_features["conversation_rapids_voicesensedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
+                conversation_features["voicesensedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
 
             if "unknownsensedfraction" in features_to_compute:
-                conversation_features["conversation_rapids_unknownsensedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
+                conversation_features["unknownsensedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ conv_totalDuration
 
             if "silenceexpectedfraction" in features_to_compute:
-                conversation_features["conversation_rapids_silenceexpectedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
+                conversation_features["silenceexpectedfraction"] = (conversation_data[conversation_data['inference']==0].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
 
             if "noiseexpectedfraction" in features_to_compute:
-                conversation_features["conversation_rapids_noiseexpectedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
+                conversation_features["noiseexpectedfraction"] = (conversation_data[conversation_data['inference']==1].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
 
             if "voiceexpectedfraction" in features_to_compute:
-                conversation_features["conversation_rapids_voiceexpectedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
+                conversation_features["voiceexpectedfraction"] = (conversation_data[conversation_data['inference']==2].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
 
             if "unknownexpectedfraction" in features_to_compute:
-                conversation_features["conversation_rapids_unknownexpectedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
+                conversation_features["unknownexpectedfraction"] = (conversation_data[conversation_data['inference']==3].groupby(["local_segment"])['inference'].count()/60)/ expectedMinutes
 
             if "sumconversationduration" in features_to_compute:
-                conversation_features["conversation_rapids_sumconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].sum()
+                conversation_features["sumconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].sum()
 
             if "avgconversationduration" in features_to_compute:
-                conversation_features["conversation_rapids_avgconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].mean()
+                conversation_features["avgconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].mean()
 
             if "sdconversationduration" in features_to_compute:
-                conversation_features["conversation_rapids_sdconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].std()
+                conversation_features["sdconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].std()
 
             if "minconversationduration" in features_to_compute:
-                conversation_features["conversation_rapids_minconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].min()
+                conversation_features["minconversationduration"] = conversation_data[conversation_data["conv_duration"] > 0].groupby(["local_segment"])["conv_duration"].min()
 
             if "maxconversationduration" in features_to_compute:
-                conversation_features["conversation_rapids_maxconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].max()
+                conversation_features["maxconversationduration"] = conversation_data.groupby(["local_segment"])["conv_duration"].max()
 
             if "timefirstconversation" in features_to_compute:
                 timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].min()
@@ -95,9 +95,9 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
                     for date in list(timestampsLastConversation.index):
                         lastimestamp = timestampsLastConversation.loc[date]
                         lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0]
-                        conversation_features.loc[date,"conversation_rapids_timefirstconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
+                        conversation_features.loc[date,"timefirstconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
                 else:
-                    conversation_features["conversation_rapids_timefirstconversation"] = np.nan
+                    conversation_features["timefirstconversation"] = np.nan
 
             if "timelastconversation" in features_to_compute:
                 timestampsLastConversation = conversation_data[conversation_data["double_convo_start"] > 0].groupby(["local_segment"])['timestamp'].max()
@@ -105,39 +105,39 @@
                 for date in list(timestampsLastConversation.index):
                     lastimestamp = timestampsLastConversation.loc[date]
                     lasttime = (conversation_data.query('timestamp == @lastimestamp', inplace = False))['local_time'].iat[0]
-                    conversation_features.loc[date,"conversation_rapids_timelastconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
+                    conversation_features.loc[date,"timelastconversation"] = int(lasttime.split(':')[0])*60 + int(lasttime.split(':')[1])
                 else:
-                    conversation_features["conversation_rapids_timelastconversation"] = np.nan
+                    conversation_features["timelastconversation"] = np.nan

             if "noisesumenergy" in features_to_compute:
-                conversation_features["conversation_rapids_noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].sum()
+                conversation_features["noisesumenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].sum()

             if "noiseavgenergy" in features_to_compute:
-                conversation_features["conversation_rapids_noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].mean()
+                conversation_features["noiseavgenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].mean()

             if "noisesdenergy" in features_to_compute:
-                conversation_features["conversation_rapids_noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].std()
+                conversation_features["noisesdenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].std()

             if "noiseminenergy" in features_to_compute:
-                conversation_features["conversation_rapids_noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].min()
+                conversation_features["noiseminenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].min()

             if "noisemaxenergy" in features_to_compute:
-                conversation_features["conversation_rapids_noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].max()
+                conversation_features["noisemaxenergy"] = conversation_data[conversation_data['inference']==1].groupby(["local_segment"])["double_energy"].max()

             if "voicesumenergy" in features_to_compute:
-                conversation_features["conversation_rapids_voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].sum()
+                conversation_features["voicesumenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].sum()

             if "voiceavgenergy" in features_to_compute:
-                conversation_features["conversation_rapids_voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].mean()
+                conversation_features["voiceavgenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].mean()

             if "voicesdenergy" in features_to_compute:
-                conversation_features["conversation_rapids_voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].std()
+                conversation_features["voicesdenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].std()

             if "voiceminenergy" in features_to_compute:
-                conversation_features["conversation_rapids_voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].min()
+                conversation_features["voiceminenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].min()

             if "voicemaxenergy" in features_to_compute:
-                conversation_features["conversation_rapids_voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].max()
+                conversation_features["voicemaxenergy"] = conversation_data[conversation_data['inference']==2].groupby(["local_segment"])["double_energy"].max()

             conversation_features = conversation_features.reset_index()
diff --git a/src/features/phone_light/rapids/main.py b/src/features/phone_light/rapids/main.py
index f7d2bbdb..9231a2d9 100644
--- a/src/features/phone_light/rapids/main.py
+++ b/src/features/phone_light/rapids/main.py
@@ -10,26 +10,26 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     # the subset of requested features this function can compute
     features_to_compute = list(set(requested_features) & set(base_features_names))

-    light_features = pd.DataFrame(columns=["local_segment"] + ["light_rapids_" + x for x in features_to_compute])
+    light_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not light_data.empty:
         light_data = filter_data_by_segment(light_data, day_segment)

         if not light_data.empty:
             light_features = pd.DataFrame()
             if "count" in features_to_compute:
-                light_features["light_rapids_count"] = light_data.groupby(["local_segment"]).count()["timestamp"]
+                light_features["count"] = light_data.groupby(["local_segment"]).count()["timestamp"]

             # get light ambient luminance related features
             if "maxlux" in features_to_compute:
-                light_features["light_rapids_maxlux"] = light_data.groupby(["local_segment"])["double_light_lux"].max()
+                light_features["maxlux"] = light_data.groupby(["local_segment"])["double_light_lux"].max()

             if "minlux" in features_to_compute:
-                light_features["light_rapids_minlux"] = light_data.groupby(["local_segment"])["double_light_lux"].min()
+                light_features["minlux"] = light_data.groupby(["local_segment"])["double_light_lux"].min()

             if "avglux" in features_to_compute:
-                light_features["light_rapids_avglux"] = light_data.groupby(["local_segment"])["double_light_lux"].mean()
+                light_features["avglux"] = light_data.groupby(["local_segment"])["double_light_lux"].mean()

             if "medianlux" in features_to_compute:
-                light_features["light_rapids_medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median()
+                light_features["medianlux"] = light_data.groupby(["local_segment"])["double_light_lux"].median()

             if "stdlux" in features_to_compute:
-                light_features["light_rapids_stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std()
+                light_features["stdlux"] = light_data.groupby(["local_segment"])["double_light_lux"].std()

             light_features = light_features.reset_index()
diff --git a/src/features/phone_locations/doryab/main.py b/src/features/phone_locations/doryab/main.py
index c3657336..4164811c 100644
--- a/src/features/phone_locations/doryab/main.py
+++ b/src/features/phone_locations/doryab/main.py
@@ -26,12 +26,12 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg

     if location_data.empty:
-        location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute])
+        location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     else:
         location_data = filter_data_by_segment(location_data, day_segment)

         if location_data.empty:
-            location_features = pd.DataFrame(columns=["local_segment"] + ["locations_doryab_" + x for x in features_to_compute])
+            location_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
         else:
             location_features = pd.DataFrame()

@@ -40,7 +40,7 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg

             if "minutesdataused" in features_to_compute:
                 for localDate in location_data["local_segment"].unique():
-                    location_features.loc[localDate,"locations_doryab_minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])
+                    location_features.loc[localDate,"minutesdataused"] = getMinutesData(location_data[location_data["local_segment"]==localDate])

             location_features.index.name = 'local_segment'

@@ -52,10 +52,10 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg
                 return location_features

             if "locationvariance" in features_to_compute:
-                location_features["locations_doryab_locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
+                location_features["locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()

             if "loglocationvariance" in features_to_compute:
-                location_features["locations_doryab_loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)
+                location_features["loglocationvariance"] = (location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()).apply(lambda x: np.log10(x) if x > 0 else None)

             preComputedDistanceandSpeed = pd.DataFrame()

@@ -67,85 +67,85 @@ def doryab_features(sensor_data_files, day_segment, provider, filter_data_by_seg

             if "totaldistance" in features_to_compute:
                 for localDate in location_data['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]
+                    location_features.loc[localDate,"totaldistance"] = preComputedDistanceandSpeed.loc[localDate,"distance"]

             if "averagespeed" in features_to_compute:
                 for localDate in location_data['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]
+                    location_features.loc[localDate,"averagespeed"] = preComputedDistanceandSpeed.loc[localDate,"avgspeed"]

             if "varspeed" in features_to_compute:
                 for localDate in location_data['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]
+                    location_features.loc[localDate,"varspeed"] = preComputedDistanceandSpeed.loc[localDate,"varspeed"]

             if "circadianmovement" in features_to_compute:
                 for localDate in location_data['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_circadianmovement"] = circadian_movement(location_data[location_data['local_segment']==localDate])
+                    location_features.loc[localDate,"circadianmovement"] = circadian_movement(location_data[location_data['local_segment']==localDate])

             newLocationData = cluster_and_label(location_data, eps= distance_to_degrees(dbscan_eps), min_samples=dbscan_minsamples)

             if "numberofsignificantplaces" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_segment']==localDate])
+                    location_features.loc[localDate,"numberofsignificantplaces"] = number_of_significant_places(newLocationData[newLocationData['local_segment']==localDate])

             if "numberlocationtransitions" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_segment']==localDate])
+                    location_features.loc[localDate,"numberlocationtransitions"] = number_location_transitions(newLocationData[newLocationData['local_segment']==localDate])

             if "radiusgyration" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
+                    location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)

             if "timeattop1location" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],1,sampling_frequency)
+                    location_features.loc[localDate,"timeattop1"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],1,sampling_frequency)

             if "timeattop2location" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],2,sampling_frequency)
+                    location_features.loc[localDate,"timeattop2"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],2,sampling_frequency)

             if "timeattop3location" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],3,sampling_frequency)
+                    location_features.loc[localDate,"timeattop3"] = time_at_topn_clusters_in_group(newLocationData[newLocationData['local_segment']==localDate],3,sampling_frequency)

             if "movingtostaticratio" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_movingtostaticratio"] = (newLocationData[newLocationData['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)
+                    location_features.loc[localDate,"movingtostaticratio"] = (newLocationData[newLocationData['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)

             if "outlierstimepercent" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
+                    location_features.loc[localDate,"outlierstimepercent"] = outliers_time_percent(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)

             preComputedmaxminCluster = pd.DataFrame()

             for localDate in newLocationData['local_segment'].unique():
                 smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(newLocationData[newLocationData['local_segment']==localDate],sampling_frequency)
-                preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = smax
-                preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"] = smin
-                preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = sstd
-                preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = smean
+                preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"] = smax
+                preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"] = smin
+                preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"] = sstd
+                preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"] = smean

             if "maxlengthstayatclusters" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_maxlengthstayatclusters"]
+                    location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"]

             if "minlengthstayatclusters" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_minlengthstayatclusters"]
+                    location_features.loc[localDate,"minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"]

             if "stdlengthstayatclusters" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_stdlengthstayatclusters"]
+                    location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"]

             if "meanlengthstayatclusters" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"locations_doryab_meanlengthstayatclusters"]
+                    location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"]

             if "locationentropy" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_locationentropy"] = location_entropy(newLocationData[newLocationData['local_segment']==localDate])
+                    location_features.loc[localDate,"locationentropy"] = location_entropy(newLocationData[newLocationData['local_segment']==localDate])

             if "normalizedlocationentropy" in features_to_compute:
                 for localDate in newLocationData['local_segment'].unique():
-                    location_features.loc[localDate,"locations_doryab_normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_segment']==localDate])
+                    location_features.loc[localDate,"normalizedlocationentropy"] = location_entropy_normalized(newLocationData[newLocationData['local_segment']==localDate])

             location_features = location_features.reset_index()
diff --git a/src/features/phone_screen/rapids/main.py b/src/features/phone_screen/rapids/main.py
index f008c997..fe49a53c 100644
--- a/src/features/phone_screen/rapids/main.py
+++ b/src/features/phone_screen/rapids/main.py
@@ -5,23 +5,23 @@ def getEpisodeDurationFeatures(screen_data, day_segment, episode, features, refe
     screen_data_episode = screen_data[screen_data["episode"] == episode]
     duration_helper = pd.DataFrame()
     if "countepisode" in features:
-        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].count().rename(columns = {"duration": "screen_rapids_countepisode" + episode})], axis = 1)
+        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].count().rename(columns = {"duration": "countepisode" + episode})], axis = 1)
     if "sumduration" in features:
-        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].sum().rename(columns = {"duration": "screen_rapids_sumduration" + episode})], axis = 1)
+        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].sum().rename(columns = {"duration": "sumduration" + episode})], axis = 1)
     if "maxduration" in features:
-        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].max().rename(columns = {"duration": "screen_rapids_maxduration" + episode})], axis = 1)
+        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].max().rename(columns = {"duration": "maxduration" + episode})], axis = 1)
     if "minduration" in features:
-        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].min().rename(columns = {"duration": "screen_rapids_minduration" + episode})], axis = 1)
+        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].min().rename(columns = {"duration": "minduration" + episode})], axis = 1)
     if "avgduration" in features:
-        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].mean().rename(columns = {"duration":"screen_rapids_avgduration" + episode})], axis = 1)
+        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].mean().rename(columns = {"duration":"avgduration" + episode})], axis = 1)
     if "stdduration" in features:
-        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].std().rename(columns = {"duration":"screen_rapids_stdduration" + episode})], axis = 1)
+        duration_helper = pd.concat([duration_helper, screen_data_episode.groupby(["local_segment"])[["duration"]].std().rename(columns = {"duration":"stdduration" + episode})], axis = 1)
     if "firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) in features:
         screen_data_episode_after_hour = screen_data_episode.copy()
         screen_data_episode_after_hour["hour"] = pd.to_datetime(screen_data_episode["local_start_date_time"]).dt.hour
         screen_data_episode_after_hour = screen_data_episode_after_hour[screen_data_episode_after_hour["hour"] >= reference_hour_first_use]

-        duration_helper = pd.concat([duration_helper, pd.DataFrame(screen_data_episode_after_hour.groupby(["local_segment"])[["local_start_date_time"]].min().local_start_date_time.apply(lambda x: (x.to_pydatetime().hour - reference_hour_first_use) * 60 + x.to_pydatetime().minute + (x.to_pydatetime().second / 60))).rename(columns = {"local_start_date_time":"screen_rapids_firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) + episode})], axis = 1)
+        duration_helper = pd.concat([duration_helper, pd.DataFrame(screen_data_episode_after_hour.groupby(["local_segment"])[["local_start_date_time"]].min().local_start_date_time.apply(lambda x: (x.to_pydatetime().hour - reference_hour_first_use) * 60 + x.to_pydatetime().minute + (x.to_pydatetime().second / 60))).rename(columns = {"local_start_date_time":"firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) + episode})], axis = 1)

     return duration_helper

@@ -45,7 +45,7 @@ def rapids_features(sensor_data_files, day_segment, provider, filter_data_by_seg
     features_episodes_to_compute = ["firstuseafter" + "{0:0=2d}".format(reference_hour_first_use) if feature_name == "firstuseafter" else feature_name for feature_name in features_episodes_to_compute]
     features_to_compute = ["".join(feature) for feature in itertools.product(features_episodes_to_compute, episode_type_to_compute)]

-    screen_features = pd.DataFrame(columns=["local_segment"]+["screen_rapids_" + x for x in features_to_compute])
+    screen_features = pd.DataFrame(columns=["local_segment"] + features_to_compute)
     if not screen_data.empty:
         screen_data = filter_data_by_segment(screen_data, day_segment)

diff --git a/src/features/utils/utils.py b/src/features/utils/utils.py
index 7ce950b7..525a2089 100644
--- a/src/features/utils/utils.py
+++ b/src/features/utils/utils.py
@@ -87,6 +87,9 @@ def fetch_provider_features(provider, provider_key, sensor_key, sensor_data_file
         for day_segment in day_segments_labels["label"]:
             print("{} Processing {} {} {}".format(rapids_log_tag, sensor_key, provider_key, day_segment))
             features = feature_function(sensor_data_files, day_segment, provider, filter_data_by_segment=filter_data_by_segment, chunk_episodes=chunk_episodes)
+            if "local_segment" not in features.columns:
+                raise ValueError("The dataframe returned by the " + sensor_key + " provider '" + provider_key + "' is missing the 'local_segment' column added by the 'filter_data_by_segment()' function. Check that the provider script uses that function and does not drop 'local_segment' by accident (" + code_path + ").\nThe 'local_segment' column is used to index a provider's features: each row corresponds to a different day segment instance (e.g. 2020-01-01, 2020-01-02, 2020-01-03, etc.)")
+            features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features.columns]
             sensor_features = sensor_features.merge(features, how="outer")
     else:
         for feature in provider["FEATURES"]:
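
For reference, the renaming line added to fetch_provider_features() is what lets every provider script above drop its hard-coded '<sensor>_<provider>_' prefix: each provider now returns bare feature names, and the prefix is applied once, centrally. A minimal standalone sketch of that rule (toy values; sensor_key "phone_light" and the tiny dataframe are illustrative only, not part of the patch):

import pandas as pd

# Hypothetical example values; any sensor/provider pair is prefixed the same way.
sensor_key, provider_key = "phone_light", "rapids"
features = pd.DataFrame({"local_segment": ["2020-01-01"], "count": [10], "maxlux": [300.0]})

# Same renaming rule as in fetch_provider_features(): every column except
# 'local_segment' gets the '<sensor>_<provider>_' prefix.
features.columns = ["{}{}".format("" if col.startswith("local_segment") else (sensor_key + "_" + provider_key + "_"), col) for col in features.columns]

print(list(features.columns))
# ['local_segment', 'phone_light_rapids_count', 'phone_light_rapids_maxlux']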