Fix bugs with empty dataframes in screen and gar metrics

JulioV 2019-12-04 12:04:20 -05:00
parent 0bd946b53c
commit 4d52e2d980
3 changed files with 36 additions and 27 deletions

View File

@@ -9,7 +9,7 @@ rule all:
         expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["SENSORS"]),
         expand("data/processed/{pid}/battery_deltas.csv", pid=config["PIDS"]),
         expand("data/processed/{pid}/screen_deltas.csv", pid=config["PIDS"]),
-        expand("data/processed/{pid}/google_activity_recognition_deltas.csv", pid=config["PIDS"]),
+        expand("data/processed/{pid}/plugin_google_activity_recognition_deltas.csv", pid=config["PIDS"]),
         expand("data/interim/{pid}/phone_valid_sensed_days.csv", pid=config["PIDS"]),
         expand("data/interim/{pid}/phone_sensed_bins.csv", pid=config["PIDS"]),
         expand("data/processed/{pid}/sms_{sms_type}_{day_segment}.csv",

View File

@@ -27,30 +27,33 @@ else:
     if(day_segment!='daily'):
         resampledData = resampledData.loc[resampledData['local_day_segment'] == str(day_segment)]
 
-    count = resampledData['activity_type'].resample('D').count()
-
-    #Finding most common activity of the day
-    mostCommonActivity = resampledData['activity_type'].resample('D').apply(lambda x:stats.mode(x)[0])
-
-    #finding different number of activities during a day
-    uniqueActivities = resampledData['activity_type'].resample('D').nunique()
-
-    #finding Number of times activity changed
-    resampledData['activity_type_shift'] = resampledData['activity_type'].shift().fillna(resampledData['activity_type'].head(1),inplace=True)
-    resampledData['different_activity'] = np.where(resampledData['activity_type']!=resampledData['activity_type_shift'],1,0)
-    countChanges = resampledData['different_activity'].resample('D').sum()
-    finalDataset = pd.concat([count, mostCommonActivity, uniqueActivities, countChanges],axis=1)
-
-    deltas_metrics = {'sumstationary':['still','tilting'],
-                      'summobile':['on_foot','running','on_bicycle'],
-                      'sumvehicle':['on_vehicle']}
-
-    for column, activity_labels in deltas_metrics.items():
-        metric = (ar_deltas[ar_deltas['activity'].isin(pd.Series(activity_labels))]
-                    .groupby(['local_start_date'])['time_diff']
-                    .agg({"ar_" + str(day_segment) + "_" + str(column) :'sum'}))
-        finalDataset = finalDataset.merge(metric,how='outer',left_index=True,right_index=True)
+    if resampledData.empty:
+        finalDataset = pd.DataFrame(columns = columns)
+    else:
+        count = resampledData['activity_type'].resample('D').count()
+        #Finding most common activity of the day
+        mostCommonActivity = resampledData['activity_type'].resample('D').apply(lambda x:stats.mode(x)[0])
+        #finding different number of activities during a day
+        uniqueActivities = resampledData['activity_type'].resample('D').nunique()
+        #finding Number of times activity changed
+        resampledData['activity_type_shift'] = resampledData['activity_type'].shift().fillna(resampledData['activity_type'].head(1),inplace=True)
+        resampledData['different_activity'] = np.where(resampledData['activity_type']!=resampledData['activity_type_shift'],1,0)
+        countChanges = resampledData['different_activity'].resample('D').sum()
+        finalDataset = pd.concat([count, mostCommonActivity, uniqueActivities, countChanges],axis=1)
+
+        deltas_metrics = {'sumstationary':['still','tilting'],
+                          'summobile':['on_foot','running','on_bicycle'],
+                          'sumvehicle':['in_vehicle']}
+
+        for column, activity_labels in deltas_metrics.items():
+            metric = (ar_deltas[ar_deltas['activity'].isin(pd.Series(activity_labels))]
+                        .groupby(['local_start_date'])['time_diff']
+                        .agg({"ar_" + str(day_segment) + "_" + str(column) :'sum'}))
+            finalDataset = finalDataset.merge(metric,how='outer',left_index=True,right_index=True)
 
     finalDataset.fillna(0,inplace=True)
     finalDataset.index.names = ['local_date']
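Besides wrapping the feature computation in an emptiness guard, this hunk corrects the vehicle label: Google's activity recognition API emits `in_vehicle`, so the old `on_vehicle` matched no rows and `sumvehicle` could never accumulate time. The guard itself matters because computing features on an empty frame either fails outright or yields a frame without the expected feature columns, breaking the rules that read this CSV later. A minimal sketch of the pattern, with hypothetical stand-ins (`columns` is assumed to carry the expected output schema; the real list is defined earlier in the script):

    import pandas as pd

    columns = ["count", "mostcommonactivity"]  # assumed schema, not the real list
    resampledData = pd.DataFrame({"activity_type": pd.Series(dtype="object")},
                                 index=pd.DatetimeIndex([], name="local_date"))

    if resampledData.empty:
        # Emit a frame that still carries the expected header so
        # downstream readers find the columns they need.
        finalDataset = pd.DataFrame(columns=columns)
    else:
        finalDataset = resampledData["activity_type"].resample("D").count().to_frame()

Two pre-existing quirks survive the commit untouched: `fillna(..., inplace=True)` returns None, so `activity_type_shift` is assigned None rather than a backfilled series, and passing a renaming dict to `SeriesGroupBy.agg` was deprecated in pandas 0.20 and removed in 1.0.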

View File

@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy as np
 import datetime
+import itertools
 from datetime import datetime, timedelta, time
 from features_utils import splitOvernightEpisodes, splitMultiSegmentEpisodes
@@ -73,9 +74,14 @@ else:
     # extract features for events and episodes
     event_features = getEventFeatures(screen_data, metrics_event)
 
-    duration_features = pd.DataFrame()
-    for episode in episodes:
-        duration_features = pd.concat([duration_features, getEpisodeDurationFeatures(screen_deltas, episode, metrics_deltas)], axis=1)
+    if screen_deltas.empty:
+        metrics_deltas_name = ["".join(metric) for metric in itertools.product(metrics_deltas,episodes)]
+        duration_features = pd.DataFrame(columns=["screen_" + day_segment + "_" + x for x in metrics_deltas_name])
+    else:
+        duration_features = pd.DataFrame()
+        for episode in episodes:
+            duration_features = pd.concat([duration_features, getEpisodeDurationFeatures(screen_deltas, episode, metrics_deltas)], axis=1)
 
     screen_features = pd.concat([event_features, duration_features], axis = 1).fillna(0)
     screen_features = screen_features.rename_axis("local_date").reset_index()
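When `screen_deltas` is empty, the new branch still emits a `duration_features` frame whose columns follow the `screen_<day_segment>_<metric><episode>` pattern, so the final concat and the CSV header stay stable even for participants with no screen episodes. A short demo of the name construction, with hypothetical metric and episode values (the real lists come from the pipeline configuration):

    import itertools

    metrics_deltas = ["sumduration", "maxduration"]  # assumed values
    episodes = ["unlock"]                            # assumed values

    # product() pairs every metric with every episode; "".join fuses
    # each pair into a single feature name.
    metrics_deltas_name = ["".join(metric) for metric in itertools.product(metrics_deltas, episodes)]
    print(["screen_daily_" + x for x in metrics_deltas_name])
    # ['screen_daily_sumdurationunlock', 'screen_daily_maxdurationunlock']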