From 4d52e2d980258070b79b923410bda9b110b52ece Mon Sep 17 00:00:00 2001
From: JulioV
Date: Wed, 4 Dec 2019 12:04:20 -0500
Subject: [PATCH] Fix bugs with empty dataframes in screen and gar metrics

---
Review notes (this section is ignored by `git am`):

- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. The text of every removed ("-") and context line is
  unchanged, but leading whitespace and the position of blank context lines
  are a best guess; use `git apply --ignore-whitespace` if the context does
  not match byte-for-byte.
- google_activity_recognition.py: the added 'activity_type_shift' line no
  longer passes inplace=True to fillna(). With inplace=True, fillna() returns
  None, so the column was all None, every row compared as different from its
  predecessor, and the daily change count equalled the daily row count.
- google_activity_recognition.py: the added per-label sums use
  .sum().to_frame(name) instead of .agg({name: 'sum'}). Renaming through a
  dict on a SeriesGroupBy is deprecated and raises SpecificationError from
  pandas 1.0 onwards; the resulting one-column frame is the same.
- Because two added lines changed, the post-image blob id on the "index" line
  of google_activity_recognition.py is no longer exact.

 Snakefile                                   |  2 +-
 src/features/google_activity_recognition.py | 49 +++++++++++----------
 src/features/screen_metrics.py              | 12 +++--
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/Snakefile b/Snakefile
index 86c4d16e..1b60f3d9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -9,7 +9,7 @@ rule all:
         expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
         expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
         expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"]),
-        expand("data/processed/{pid}/google_activity_recognition_deltas.csv", pid=config["PIDS"]),
+        expand("data/processed/{pid}/plugin_google_activity_recognition_deltas.csv", pid=config["PIDS"]),
         expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]),
         expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]),
         expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv",
diff --git a/src/features/google_activity_recognition.py b/src/features/google_activity_recognition.py
index 9040b2f4..bdfac29e 100644
--- a/src/features/google_activity_recognition.py
+++ b/src/features/google_activity_recognition.py
@@ -27,30 +27,33 @@ else:
 
     if(day_segment!='daily'):
         resampledData = resampledData.loc[resampledData['local_day_segment'] == str(day_segment)]
+
+    if resampledData.empty:
+        finalDataset = pd.DataFrame(columns = columns)
+    else:
+        count = resampledData['activity_type'].resample('D').count()
+
+        #Finding most common activity of the day
+        mostCommonActivity = resampledData['activity_type'].resample('D').apply(lambda x:stats.mode(x)[0])
+
+        #finding different number of activities during a day
+        uniqueActivities = resampledData['activity_type'].resample('D').nunique()
+
+        #finding Number of times activity changed
+        resampledData['activity_type_shift'] = resampledData['activity_type'].shift().fillna(resampledData['activity_type'].head(1))
+        resampledData['different_activity'] = np.where(resampledData['activity_type']!=resampledData['activity_type_shift'],1,0)
+        countChanges = resampledData['different_activity'].resample('D').sum()
+        finalDataset = pd.concat([count, mostCommonActivity, uniqueActivities, countChanges],axis=1)
+
+        deltas_metrics = {'sumstationary':['still','tilting'],
+                          'summobile':['on_foot','running','on_bicycle'],
+                          'sumvehicle':['in_vehicle']}
 
-    count = resampledData['activity_type'].resample('D').count()
-
-    #Finding most common activity of the day
-    mostCommonActivity = resampledData['activity_type'].resample('D').apply(lambda x:stats.mode(x)[0])
-
-    #finding different number of activities during a day
-    uniqueActivities = resampledData['activity_type'].resample('D').nunique()
-
-    #finding Number of times activity changed
-    resampledData['activity_type_shift'] = resampledData['activity_type'].shift().fillna(resampledData['activity_type'].head(1),inplace=True)
-    resampledData['different_activity'] = np.where(resampledData['activity_type']!=resampledData['activity_type_shift'],1,0)
-    countChanges = resampledData['different_activity'].resample('D').sum()
-    finalDataset = pd.concat([count, mostCommonActivity, uniqueActivities, countChanges],axis=1)
-
-    deltas_metrics = {'sumstationary':['still','tilting'],
-                      'summobile':['on_foot','running','on_bicycle'],
-                      'sumvehicle':['on_vehicle']}
-
-    for column, activity_labels in deltas_metrics.items():
-        metric = (ar_deltas[ar_deltas['activity'].isin(pd.Series(activity_labels))]
-                    .groupby(['local_start_date'])['time_diff']
-                    .agg({"ar_" + str(day_segment) + "_" + str(column) :'sum'}))
-        finalDataset = finalDataset.merge(metric,how='outer',left_index=True,right_index=True)
+        for column, activity_labels in deltas_metrics.items():
+            metric = (ar_deltas[ar_deltas['activity'].isin(pd.Series(activity_labels))]
+                        .groupby(['local_start_date'])['time_diff']
+                        .sum().to_frame("ar_" + str(day_segment) + "_" + str(column)))
+            finalDataset = finalDataset.merge(metric,how='outer',left_index=True,right_index=True)
 
     finalDataset.fillna(0,inplace=True)
     finalDataset.index.names = ['local_date']
diff --git a/src/features/screen_metrics.py b/src/features/screen_metrics.py
index 2e04cbc9..c01dc871 100644
--- a/src/features/screen_metrics.py
+++ b/src/features/screen_metrics.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy as np
 import datetime
+import itertools
 from datetime import datetime, timedelta, time
 from features_utils import splitOvernightEpisodes, splitMultiSegmentEpisodes
 
@@ -73,9 +74,14 @@ else:
 
     # extract features for events and episodes
     event_features = getEventFeatures(screen_data, metrics_event)
-    duration_features = pd.DataFrame()
-    for episode in episodes:
-        duration_features = pd.concat([duration_features, getEpisodeDurationFeatures(screen_deltas, episode, metrics_deltas)], axis=1)
+
+    if screen_deltas.empty:
+        metrics_deltas_name = ["".join(metric) for metric in itertools.product(metrics_deltas,episodes)]
+        duration_features = pd.DataFrame(columns=["screen_" + day_segment + "_" + x for x in metrics_deltas_name])
+    else:
+        duration_features = pd.DataFrame()
+        for episode in episodes:
+            duration_features = pd.concat([duration_features, getEpisodeDurationFeatures(screen_deltas, episode, metrics_deltas)], axis=1)
 
     screen_features = pd.concat([event_features, duration_features], axis = 1).fillna(0)
     screen_features = screen_features.rename_axis("local_date").reset_index()