From aa1baaf948d54e16cd7296835826ced016711c8b Mon Sep 17 00:00:00 2001
From: Meng Li <34143965+Meng6@users.noreply.github.com>
Date: Thu, 4 Jun 2020 19:32:28 -0400
Subject: [PATCH] Refactor activity recognition features

---
 config.yaml                          |  2 +-
 src/features/activity_recognition.py | 68 ++++------------------------
 src/features/ar/ar_base.py           | 61 +++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 59 deletions(-)
 create mode 100644 src/features/ar/ar_base.py

diff --git a/config.yaml b/config.yaml
index f71b6642..264c1fae 100644
--- a/config.yaml
+++ b/config.yaml
@@ -81,7 +81,7 @@ BLUETOOTH:
 
 ACTIVITY_RECOGNITION:
   DAY_SEGMENTS: *day_segments
-  FEATURES: ['count','mostcommonactivity','countuniqueactivities','activitychangecount','sumstationary','summobile','sumvehicle']
+  FEATURES: ["count","mostcommonactivity","countuniqueactivities","activitychangecount","sumstationary","summobile","sumvehicle"]
 
 BATTERY:
   DAY_SEGMENTS: *day_segments
diff --git a/src/features/activity_recognition.py b/src/features/activity_recognition.py
index 19ccbf9e..5a3d7117 100644
--- a/src/features/activity_recognition.py
+++ b/src/features/activity_recognition.py
@@ -1,63 +1,15 @@
 import pandas as pd
-import numpy as np
-import scipy.stats as stats
-from features_utils import splitOvernightEpisodes, splitMultiSegmentEpisodes
+from ar.ar_base import base_ar_features
 
-day_segment = snakemake.params["segment"]
-features = snakemake.params["features"]
-
-#Read csv into a pandas dataframe
-data = pd.read_csv(snakemake.input[0],parse_dates=["local_date_time"])
+ar_data = pd.read_csv(snakemake.input[0],parse_dates=["local_date_time"])
 ar_deltas = pd.read_csv(snakemake.input[1],parse_dates=["local_start_date_time", "local_end_date_time", "local_start_date", "local_end_date"])
-columns = list("ar_" + str(day_segment) + "_" + column for column in features)
-
-if data.empty:
-    finalDataset = pd.DataFrame(columns = columns)
-else:
-    finalDataset = pd.DataFrame()
-    ar_deltas = splitOvernightEpisodes(ar_deltas, [],['activity'])
-
-    if day_segment != "daily":
-        ar_deltas = splitMultiSegmentEpisodes(ar_deltas, day_segment, [])
-
-    data.local_date_time = pd.to_datetime(data.local_date_time)
-    resampledData = data.set_index(data.local_date_time)
-    resampledData.drop(columns=['local_date_time'],inplace=True)
-
-    if(day_segment!='daily'):
-        resampledData = resampledData.loc[resampledData['local_day_segment'] == str(day_segment)]
-
-    if resampledData.empty:
-        finalDataset = pd.DataFrame(columns = columns)
-    else:
-        #Finding the count of samples of the day
-        if("count" in features):
-            finalDataset["ar_" + str(day_segment) + "_count"] = resampledData['activity_type'].resample('D').count()
-
-        #Finding most common activity of the day
-        if("mostcommonactivity" in features):
-            finalDataset["ar_" + str(day_segment) + "_mostcommonactivity"] = resampledData['activity_type'].resample('D').apply(lambda x: stats.mode(x)[0] if len(stats.mode(x)[0]) != 0 else None)
-
-        #finding different number of activities during a day
-        if("countuniqueactivities" in features):
-            finalDataset["ar_" + str(day_segment) + "_countuniqueactivities"] = resampledData['activity_type'].resample('D').nunique()
-
-        #finding Number of times activity changed
-        if("activitychangecount" in features):
-            resampledData['activity_type_shift'] = resampledData['activity_type'].shift().fillna(resampledData['activity_type'].head(1))
-            resampledData['different_activity'] = np.where(resampledData['activity_type']!=resampledData['activity_type_shift'],1,0)
-            finalDataset["ar_" + str(day_segment) + "_activitychangecount"] = resampledData['different_activity'].resample('D').sum()
 
+day_segment = snakemake.params["segment"]
+requested_features = snakemake.params["features"]
+ar_features = pd.DataFrame(columns=["local_date"])
 
-        deltas_features = {'sumstationary':['still','tilting'],
-                           'summobile':['on_foot','walking','running','on_bicycle'],
-                           'sumvehicle':['in_vehicle']}
-
-        for column, activity_labels in deltas_features.items():
-            if column in features:
-                finalDataset["ar_" + str(day_segment) + "_"+str(column)] = (ar_deltas[ar_deltas['activity'].isin(pd.Series(activity_labels))]
-                    .groupby(['local_start_date'])['time_diff']
-                    .agg({"ar_" + str(day_segment) + "_" + str(column) :'sum'}))
-
-finalDataset.index.names = ['local_date']
-finalDataset.to_csv(snakemake.output[0])
+ar_features = ar_features.merge(base_ar_features(ar_data, ar_deltas, day_segment, requested_features), on="local_date", how="outer")
+
+assert len(requested_features) + 1 == ar_features.shape[1], "The number of features in the output dataframe (=" + str(ar_features.shape[1]) + ") does not match the expected value (=" + str(len(requested_features)) + " + 1). Verify your activity recognition feature extraction functions"
+
+ar_features.to_csv(snakemake.output[0], index=False)
\ No newline at end of file
diff --git a/src/features/ar/ar_base.py b/src/features/ar/ar_base.py
new file mode 100644
index 00000000..0e3265bf
--- /dev/null
+++ b/src/features/ar/ar_base.py
@@ -0,0 +1,61 @@
+import pandas as pd
+import numpy as np
+import scipy.stats as stats
+from features_utils import splitOvernightEpisodes, splitMultiSegmentEpisodes
+
+def base_ar_features(ar_data, ar_deltas, day_segment, requested_features):
+    # name of the features this function can compute
+    base_features_names = ["count","mostcommonactivity","countuniqueactivities","activitychangecount","sumstationary","summobile","sumvehicle"]
+    # the subset of requested features this function can compute
+    features_to_compute = list(set(requested_features) & set(base_features_names))
+
+    if ar_data.empty:
+        ar_features = pd.DataFrame(columns = ["local_date"] + ["ar_" + day_segment + "_" + x for x in features_to_compute])
+    else:
+        ar_features = pd.DataFrame()
+        ar_deltas = splitOvernightEpisodes(ar_deltas, [],["activity"])
+
+        if day_segment != "daily":
+            ar_deltas = splitMultiSegmentEpisodes(ar_deltas, day_segment, [])
+
+        ar_data.local_date_time = pd.to_datetime(ar_data.local_date_time)
+        resampledData = ar_data.set_index(ar_data.local_date_time)
+        resampledData.drop(columns=["local_date_time"], inplace=True)
+
+        if(day_segment!="daily"):
+            resampledData = resampledData.loc[resampledData["local_day_segment"] == day_segment]
+
+        if resampledData.empty:
+            ar_features = pd.DataFrame(columns = ["ar_" + day_segment + "_" + x for x in features_to_compute])
+        else:
+            #Finding the count of samples of the day
+            if "count" in features_to_compute:
+                ar_features["ar_" + day_segment + "_count"] = resampledData["activity_type"].resample("D").count()
+
+            #Finding most common activity of the day
+            if "mostcommonactivity" in features_to_compute:
+                ar_features["ar_" + day_segment + "_mostcommonactivity"] = resampledData["activity_type"].resample("D").apply(lambda x: stats.mode(x)[0] if len(stats.mode(x)[0]) != 0 else None)
+
+            #finding different number of activities during a day
+            if "countuniqueactivities" in features_to_compute:
+                ar_features["ar_" + day_segment + "_countuniqueactivities"] = resampledData["activity_type"].resample("D").nunique()
+
+            #finding Number of times activity changed
+            if "activitychangecount" in features_to_compute:
+                resampledData["activity_type_shift"] = resampledData["activity_type"].shift().fillna(resampledData["activity_type"].head(1))
+                resampledData["different_activity"] = np.where(resampledData["activity_type"]!=resampledData["activity_type_shift"],1,0)
+                ar_features["ar_" + day_segment + "_activitychangecount"] = resampledData["different_activity"].resample("D").sum()
+
+
+            deltas_features = {"sumstationary":["still","tilting"],
+                               "summobile":["on_foot","walking","running","on_bicycle"],
+                               "sumvehicle":["in_vehicle"]}
+
+            for column, activity_labels in deltas_features.items():
+                if column in features_to_compute:
+                    ar_features["ar_" + day_segment + "_" + column] = ar_deltas[ar_deltas["activity"].isin(pd.Series(activity_labels))].groupby(["local_start_date"])["time_diff"].sum()
+
+        ar_features.index.names = ["local_date"]
+        ar_features = ar_features.reset_index()
+
+    return ar_features