From 17f41588d8ad0d12ec2ca461e6b02daac73adf13 Mon Sep 17 00:00:00 2001 From: Meng Li <34143965+Meng6@users.noreply.github.com> Date: Fri, 26 Jun 2020 11:25:25 -0400 Subject: [PATCH] Fix the bug of feature extraction code for some sensors: break when there is no data for an epoch --- .../accelerometer/accelerometer_base.py | 10 ++-- .../applications_foreground_base.py | 5 +- src/features/ar/ar_base.py | 26 +++++---- src/features/battery/battery_base.py | 54 +++++++++---------- .../conversation/conversation_base.py | 10 ++-- src/features/light/light_base.py | 5 +- 6 files changed, 50 insertions(+), 60 deletions(-) diff --git a/src/features/accelerometer/accelerometer_base.py b/src/features/accelerometer/accelerometer_base.py index fcd2691f..230584ce 100644 --- a/src/features/accelerometer/accelerometer_base.py +++ b/src/features/accelerometer/accelerometer_base.py @@ -70,14 +70,12 @@ def base_accelerometer_features(acc_data, day_segment, requested_features, valid features_to_compute = features_to_compute_magnitude + features_to_compute_exertionalactivityepisode + features_to_compute_nonexertionalactivityepisode + (["validsensedminutes"] if valid_sensed_minutes else []) - if acc_data.empty: - acc_features = pd.DataFrame(columns=["local_date"] + ["acc_" + day_segment + "_" + x for x in features_to_compute]) - else: + acc_features = pd.DataFrame(columns=["local_date"] + ["acc_" + day_segment + "_" + x for x in features_to_compute]) + if not acc_data.empty: if day_segment != "daily": acc_data = acc_data[acc_data["local_day_segment"] == day_segment] - if acc_data.empty: - acc_features = pd.DataFrame(columns=["local_date"] + ["acc_" + day_segment + "_" + x for x in features_to_compute]) - else: + + if not acc_data.empty: acc_features = pd.DataFrame() # get magnitude related features: magnitude = sqrt(x^2+y^2+z^2) magnitude = acc_data.apply(lambda row: np.sqrt(row["double_values_0"] ** 2 + row["double_values_1"] ** 2 + row["double_values_2"] ** 2), axis=1) diff --git 
a/src/features/applications_foreground/applications_foreground_base.py b/src/features/applications_foreground/applications_foreground_base.py index 976d9938..517ae424 100644 --- a/src/features/applications_foreground/applications_foreground_base.py +++ b/src/features/applications_foreground/applications_foreground_base.py @@ -28,9 +28,8 @@ def base_applications_foreground_features(apps_data, day_segment, requested_feat # deep copy the apps_data for the top1global computation apps_data_global = apps_data.copy() - if apps_data.empty: - apps_features = pd.DataFrame(columns=["local_date"] + ["apps_" + day_segment + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + apps)]]) - else: + apps_features = pd.DataFrame(columns=["local_date"] + ["apps_" + day_segment + "_" + x for x in ["".join(feature) for feature in itertools.product(requested_features, single_categories + multiple_categories + apps)]]) + if not apps_data.empty: if day_segment != "daily": apps_data =apps_data[apps_data["local_day_segment"] == day_segment] diff --git a/src/features/ar/ar_base.py b/src/features/ar/ar_base.py index 0e3265bf..75e20ed6 100644 --- a/src/features/ar/ar_base.py +++ b/src/features/ar/ar_base.py @@ -9,10 +9,8 @@ def base_ar_features(ar_data, ar_deltas, day_segment, requested_features): # the subset of requested features this function can compute features_to_compute = list(set(requested_features) & set(base_features_names)) - if ar_data.empty: - ar_features = pd.DataFrame(columns = ["local_date"] + ["ar_" + day_segment + "_" + x for x in features_to_compute]) - else: - ar_features = pd.DataFrame() + ar_features = pd.DataFrame(columns = ["local_date"] + ["ar_" + day_segment + "_" + x for x in features_to_compute]) + if not ar_data.empty: ar_deltas = splitOvernightEpisodes(ar_deltas, [],["activity"]) if day_segment != "daily": @@ -22,25 +20,25 @@ def base_ar_features(ar_data, ar_deltas, day_segment, 
requested_features): resampledData = ar_data.set_index(ar_data.local_date_time) resampledData.drop(columns=["local_date_time"], inplace=True) - if(day_segment!="daily"): + if day_segment != "daily": resampledData = resampledData.loc[resampledData["local_day_segment"] == day_segment] - if resampledData.empty: - ar_features = pd.DataFrame(columns = ["ar_" + day_segment + "_" + x for x in features_to_compute]) - else: - #Finding the count of samples of the day + if not resampledData.empty: + ar_features = pd.DataFrame() + + # finding the count of samples of the day if "count" in features_to_compute: ar_features["ar_" + day_segment + "_count"] = resampledData["activity_type"].resample("D").count() - #Finding most common activity of the day + # finding most common activity of the day if "mostcommonactivity" in features_to_compute: ar_features["ar_" + day_segment + "_mostcommonactivity"] = resampledData["activity_type"].resample("D").apply(lambda x: stats.mode(x)[0] if len(stats.mode(x)[0]) != 0 else None) - #finding different number of activities during a day + # finding different number of activities during a day if "countuniqueactivities" in features_to_compute: ar_features["ar_" + day_segment + "_countuniqueactivities"] = resampledData["activity_type"].resample("D").nunique() - #finding Number of times activity changed + # finding Number of times activity changed if "activitychangecount" in features_to_compute: resampledData["activity_type_shift"] = resampledData["activity_type"].shift().fillna(resampledData["activity_type"].head(1)) resampledData["different_activity"] = np.where(resampledData["activity_type"]!=resampledData["activity_type_shift"],1,0) @@ -55,7 +53,7 @@ def base_ar_features(ar_data, ar_deltas, day_segment, requested_features): if column in features_to_compute: ar_features["ar_" + day_segment + "_" + column] = ar_deltas[ar_deltas["activity"].isin(pd.Series(activity_labels))].groupby(["local_start_date"])["time_diff"].sum() - ar_features.index.names = 
["local_date"] - ar_features = ar_features.reset_index() + ar_features.index.names = ["local_date"] + ar_features = ar_features.reset_index() return ar_features diff --git a/src/features/battery/battery_base.py b/src/features/battery/battery_base.py index b4865304..c6cdedf6 100644 --- a/src/features/battery/battery_base.py +++ b/src/features/battery/battery_base.py @@ -9,40 +9,40 @@ def base_battery_features(battery_data, day_segment, requested_features): # the subset of requested features this function can compute features_to_compute = list(set(requested_features) & set(base_features_names)) - if battery_data.empty: - battery_features = pd.DataFrame(columns=["local_date"] + ["battery_" + day_segment + "_" + x for x in features_to_compute]) - else: + battery_features = pd.DataFrame(columns=["local_date"] + ["battery_" + day_segment + "_" + x for x in features_to_compute]) + if not battery_data.empty: battery_data = splitOvernightEpisodes(battery_data, ["battery_diff"], []) if day_segment != "daily": battery_data = splitMultiSegmentEpisodes(battery_data, day_segment, ["battery_diff"]) + + if not battery_data.empty: + battery_data["battery_consumption_rate"] = battery_data["battery_diff"] / battery_data["time_diff"] - battery_data["battery_consumption_rate"] = battery_data["battery_diff"] / battery_data["time_diff"] + # for battery_data_discharge: + battery_data_discharge = battery_data[battery_data["battery_diff"] > 0] + battery_discharge_features = pd.DataFrame() + if "countdischarge" in features_to_compute: + battery_discharge_features["battery_"+day_segment+"_countdischarge"] = battery_data_discharge.groupby(["local_start_date"])["local_start_date"].count() + if "sumdurationdischarge" in features_to_compute: + battery_discharge_features["battery_"+day_segment+"_sumdurationdischarge"] = battery_data_discharge.groupby(["local_start_date"])["time_diff"].sum() + if "avgconsumptionrate" in features_to_compute: + 
battery_discharge_features["battery_"+day_segment+"_avgconsumptionrate"] = battery_data_discharge.groupby(["local_start_date"])["battery_consumption_rate"].mean() + if "maxconsumptionrate" in features_to_compute: + battery_discharge_features["battery_"+day_segment+"_maxconsumptionrate"] = battery_data_discharge.groupby(["local_start_date"])["battery_consumption_rate"].max() - # for battery_data_discharge: - battery_data_discharge = battery_data[battery_data["battery_diff"] > 0] - battery_discharge_features = pd.DataFrame() - if "countdischarge" in features_to_compute: - battery_discharge_features["battery_"+day_segment+"_countdischarge"] = battery_data_discharge.groupby(["local_start_date"])["local_start_date"].count() - if "sumdurationdischarge" in features_to_compute: - battery_discharge_features["battery_"+day_segment+"_sumdurationdischarge"] = battery_data_discharge.groupby(["local_start_date"])["time_diff"].sum() - if "avgconsumptionrate" in features_to_compute: - battery_discharge_features["battery_"+day_segment+"_avgconsumptionrate"] = battery_data_discharge.groupby(["local_start_date"])["battery_consumption_rate"].mean() - if "maxconsumptionrate" in features_to_compute: - battery_discharge_features["battery_"+day_segment+"_maxconsumptionrate"] = battery_data_discharge.groupby(["local_start_date"])["battery_consumption_rate"].max() + # for battery_data_charge: + battery_data_charge = battery_data[battery_data["battery_diff"] <= 0] + battery_charge_features = pd.DataFrame() + if "countcharge" in features_to_compute: + battery_charge_features["battery_"+day_segment+"_countcharge"] = battery_data_charge.groupby(["local_start_date"])["local_start_date"].count() + if "sumdurationcharge" in features_to_compute: + battery_charge_features["battery_"+day_segment+"_sumdurationcharge"] = battery_data_charge.groupby(["local_start_date"])["time_diff"].sum() - # for battery_data_charge: - battery_data_charge = battery_data[battery_data["battery_diff"] <= 0] - 
battery_charge_features = pd.DataFrame() - if "countcharge" in features_to_compute: - battery_charge_features["battery_"+day_segment+"_countcharge"] = battery_data_charge.groupby(["local_start_date"])["local_start_date"].count() - if "sumdurationcharge" in features_to_compute: - battery_charge_features["battery_"+day_segment+"_sumdurationcharge"] = battery_data_charge.groupby(["local_start_date"])["time_diff"].sum() + # combine discharge features and charge features; fill the missing values with ZERO + battery_features = pd.concat([battery_discharge_features, battery_charge_features], axis=1, sort=True).fillna(0) - # combine discharge features and charge features; fill the missing values with ZERO - battery_features = pd.concat([battery_discharge_features, battery_charge_features], axis=1, sort=True).fillna(0) - - battery_features.index.rename("local_date", inplace=True) - battery_features = battery_features.reset_index() + battery_features.index.rename("local_date", inplace=True) + battery_features = battery_features.reset_index() return battery_features diff --git a/src/features/conversation/conversation_base.py b/src/features/conversation/conversation_base.py index 7d69e179..d9857d3f 100644 --- a/src/features/conversation/conversation_base.py +++ b/src/features/conversation/conversation_base.py @@ -11,16 +11,12 @@ def base_conversation_features(conversation_data, day_segment, requested_feature # the subset of requested features this function can compute features_to_compute = list(set(requested_features) & set(base_features_names)) - - if conversation_data.empty: - conversation_features = pd.DataFrame(columns=["local_date"] + ["conversation_" + day_segment + "_" + x for x in features_to_compute]) - else: + conversation_features = pd.DataFrame(columns=["local_date"] + ["conversation_" + day_segment + "_" + x for x in features_to_compute]) + if not conversation_data.empty: if day_segment != "daily": conversation_data = 
conversation_data[conversation_data["local_day_segment"] == day_segment] - if conversation_data.empty: - conversation_features = pd.DataFrame(columns=["local_date"] + ["conversation_" + day_segment + "_" + x for x in features_to_compute]) - else: + if not conversation_data.empty: conversation_features = pd.DataFrame() conversation_data = conversation_data.drop_duplicates(subset = 'local_time', keep= 'first') diff --git a/src/features/light/light_base.py b/src/features/light/light_base.py index e00843fb..73c816ab 100644 --- a/src/features/light/light_base.py +++ b/src/features/light/light_base.py @@ -6,9 +6,8 @@ def base_light_features(light_data, day_segment, requested_features): # the subset of requested features this function can compute features_to_compute = list(set(requested_features) & set(base_features_names)) - if light_data.empty: - light_features = pd.DataFrame(columns=["local_date"] + ["light_" + day_segment + "_" + x for x in features_to_compute]) - else: + light_features = pd.DataFrame(columns=["local_date"] + ["light_" + day_segment + "_" + x for x in features_to_compute]) + if not light_data.empty: if day_segment != "daily": light_data =light_data[light_data["local_day_segment"] == day_segment]