diff --git a/Snakefile b/Snakefile
index 26f4f1a2..f5829896 100644
--- a/Snakefile
+++ b/Snakefile
@@ -28,6 +28,7 @@ rule all:
         expand("data/processed/{pid}/bluetooth_{segment}.csv",
                             pid=config["PIDS"],
                             segment = config["BLUETOOTH"]["DAY_SEGMENTS"]),
+        expand("data/processed/{pid}/activity_extracted.csv",pid=config["PIDS"]),
         # Reports
         expand("reports/figures/{pid}/{sensor}_heatmap_rows.html", pid=config["PIDS"], sensor=config["SENSORS"]),
         expand("reports/figures/{pid}/compliance_heatmap.html", pid=config["PIDS"], sensor=config["SENSORS"]),
diff --git a/rules/features.snakefile b/rules/features.snakefile
index cfdb5097..e1e98065 100644
--- a/rules/features.snakefile
+++ b/rules/features.snakefile
@@ -50,4 +50,12 @@ rule bluetooth_metrics:
     output:
         "data/processed/{pid}/bluetooth_{day_segment}.csv"
     script:
-        "../src/features/bluetooth_metrics.R"
\ No newline at end of file
+        "../src/features/bluetooth_metrics.R"
+
+rule activity_metrics:
+    input:
+        "data/raw/{pid}/plugin_google_activity_recognition_with_datetime.csv"
+    output:
+        "data/processed/{pid}/activity_extracted.csv"
+    script:
+        "../src/features/activity_recognition.py"
diff --git a/src/features/activity_recognition.py b/src/features/activity_recognition.py
new file mode 100644
index 00000000..0e814e26
--- /dev/null
+++ b/src/features/activity_recognition.py
@@ -0,0 +1,36 @@
+import pandas as pd
+import numpy as np
+import scipy.stats as stats
+
+
+def most_common_activity(activities):
+    """Return the most frequent activity, or NaN when there are no values."""
+    modes = activities.mode()
+    # Series.mode() sorts ties ascending, so the smallest tied value is reported.
+    return modes.iloc[0] if not modes.empty else np.nan
+
+
+def extract_daily_features(data):
+    """Summarise activity recognition rows into one row of features per day.
+
+    data must provide the columns local_date_time and activity_type. The result
+    is indexed by day and has the columns count, most_common_activity and
+    activity_changes_count (the number of distinct activities seen that day).
+    """
+    indexed = data.set_index(pd.to_datetime(data["local_date_time"]))
+    # Keep the first row per timestamp so repeated records are counted once.
+    indexed = indexed[~indexed.index.duplicated()]
+    daily = indexed["activity_type"].resample("D")
+
+    # Name the columns while concatenating instead of renaming them afterwards:
+    # Series.rename() has no columns argument.
+    return pd.concat(
+        [daily.count(), daily.apply(most_common_activity), daily.nunique()],
+        axis=1,
+        keys=["count", "most_common_activity", "activity_changes_count"],
+    )
+
+
+# Read the sensor data for one participant and export the extracted features
+data = pd.read_csv(snakemake.input[0])
+extract_daily_features(data).to_csv(snakemake.output[0])