From c5a0c1e0d6e074f1e4dbf405023e62bc7c56def5 Mon Sep 17 00:00:00 2001 From: Echhit Joshi <32146180+EchhitJoshi@users.noreply.github.com> Date: Mon, 18 Nov 2019 14:22:08 -0500 Subject: [PATCH] Updated G_A_R features with epochs --- Snakefile | 3 +- config.yaml | 6 ++- rules/features.snakefile | 5 ++- src/features/google_activity_recognition.py | 42 +++++++-------------- 4 files changed, 25 insertions(+), 31 deletions(-) diff --git a/Snakefile b/Snakefile index 7cac0462..4552e465 100644 --- a/Snakefile +++ b/Snakefile @@ -21,7 +21,8 @@ rule all: expand("data/processed/{pid}/bluetooth_{segment}.csv", pid=config["PIDS"], segment = config["BLUETOOTH"]["DAY_SEGMENTS"]), - expand("data/processed/{pid}/google_activity_recognition.csv",pid=config["PIDS"]), + expand("data/processed/{pid}/google_activity_recognition_{segment}.csv",pid=config["PIDS"], + segment = config["GOOGLE_ACTIVITY_RECOGNITION"]["DAY_SEGMENTS"]), expand("data/processed/{pid}/battery_daily.csv", pid=config["PIDS"]), # Reports expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]), diff --git a/config.yaml b/config.yaml index 91c19d45..67ec49e0 100644 --- a/config.yaml +++ b/config.yaml @@ -52,4 +52,8 @@ BARNETT_LOCATION: BLUETOOTH: DAY_SEGMENTS: *day_segments - METRICS: ["countscans", "uniquedevices", "countscansmostuniquedevice"] \ No newline at end of file + METRICS: ["countscans", "uniquedevices", "countscansmostuniquedevice"] + +GOOGLE_ACTIVITY_RECOGNITION: + DAY_SEGMENTS: *day_segments + METRICS: ['count','most_common_activity','number_unique_activities','activity_change_count'] diff --git a/rules/features.snakefile b/rules/features.snakefile index 43fab67c..572efe76 100644 --- a/rules/features.snakefile +++ b/rules/features.snakefile @@ -55,8 +55,11 @@ rule bluetooth_metrics: rule activity_metrics: input: "data/raw/{pid}/plugin_google_activity_recognition_with_datetime.csv" + params: + segment = "{day_segment}", + metrics = 
config["GOOGLE_ACTIVITY_RECOGNITION"]["METRICS"] output: - "data/processed/{pid}/google_activity_recognition.csv" + "data/processed/{pid}/google_activity_recognition_{day_segment}.csv" script: "../src/features/google_activity_recognition.py" diff --git a/src/features/google_activity_recognition.py b/src/features/google_activity_recognition.py index 95becab3..943fbb6a 100644 --- a/src/features/google_activity_recognition.py +++ b/src/features/google_activity_recognition.py @@ -2,51 +2,37 @@ import pandas as pd import numpy as np import scipy.stats as stats +day_segment = snakemake.params["segment"] + #Read csv into a pandas dataframe data = pd.read_csv(snakemake.input[0]) -column = ['local_date_time','count','most_common_activity','number_unique_activities','activity_change_count'] -finalDataset = pd.DataFrame(columns=column) -finalDataset.set_index('local_date_time',inplace=True) +columns = ['count','most_common_activity','number_unique_activities','activity_change_count'] +columns = list("ar_" + str(day_segment) + "_" + column for column in columns) if data.empty: - finalDataset.to_csv(snakemake.output[0]) - + finalDataset = pd.DataFrame(columns = columns) else: - #Resampling each of the required features as a pandas series data.local_date_time = pd.to_datetime(data.local_date_time) resampledData = data.set_index(data.local_date_time) - resampledData = resampledData[~resampledData.index.duplicated()] - resampledData.rename_axis('time',axis='columns',inplace=True) resampledData.drop(columns=['local_date_time'],inplace=True) - #Finding count grouped by day - count = pd.DataFrame() + if(day_segment!='daily'): + resampledData = resampledData.loc[resampledData['local_day_segment'] == str(day_segment)] + count = resampledData['activity_type'].resample('D').count() - count = count.rename(columns={"activity_type":"count"}) #Finding most common activity of the day - mostCommonActivity = pd.DataFrame() mostCommonActivity = 
resampledData['activity_type'].resample('D').apply(lambda x:stats.mode(x)[0]) - mostCommonActivity = mostCommonActivity.rename(columns={'activity_type':'most_common_activity'}) #finding different number of activities during a day - uniqueActivities = pd.DataFrame() - # countChanges = resampledData.to_period('D').groupby(resampledData.index)['activity_type'].value_counts() uniqueActivities = resampledData['activity_type'].resample('D').nunique() #finding Number of times activity changed - resampledData['activity_type_shift'] = resampledData['activity_type'].shift() - resampledData['activity_type_shift'].fillna(resampledData['activity_type'].head(1),inplace=True) - #resampledData['different_activity'] = resampledData['activity_type'].apply(lambda x: 0 if resampledData['activity_type'] == resampledData['activity_type_shift'] else 1, axis=1) - resampledData['different_activity']=np.where(resampledData['activity_type']!=resampledData['activity_type_shift'],1,0) - countChanges = pd.DataFrame() + resampledData['activity_type_shift'] = resampledData['activity_type'].shift().fillna(resampledData['activity_type'].head(1)) + resampledData['different_activity'] = np.where(resampledData['activity_type']!=resampledData['activity_type_shift'],1,0) countChanges = resampledData['different_activity'].resample('D').sum() + finalDataset = pd.concat([count, mostCommonActivity, uniqueActivities, countChanges],axis=1) - #Concatenating all the processed data only, no other sensor data is added here for simplicity - finalDataset = pd.DataFrame() - finalDataset = pd.concat([count,mostCommonActivity,uniqueActivities,countChanges],axis=1) - finalDataset.rename(columns={0:"count",1:'most_common_activity','activity_type':'number_unique_activities','different_activity':'activity_change_count'},inplace = True) - - #Export final dataframe with extracted features to respective PID - finalDataset.to_csv(snakemake.output[0]) - +finalDataset.index.names = ['local_date'] 
+finalDataset.columns=columns +finalDataset.to_csv(snakemake.output[0]) \ No newline at end of file