Add new sentimental plugin features (implemented by @adisb01)

2021-03-05 11:04:11 -05:00 · 2021-03-05 11:04:11 -05:00 · 09ca9725c0
parent fc5b5eead8
commit 09ca9725c0
4 changed files with 158 additions and 0 deletions
--- a/9
+++ b/9
@ -218,6 +218,15 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")

+for provider in config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"].keys():
+    if config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][provider]["COMPUTE"]:
+        files_to_compute.extend(expand("data/raw/{pid}/phone_plugin_sentimental_raw.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv", pid=config["PIDS"]))
+        files_to_compute.extend(expand("data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
+        files_to_compute.extend(expand("data/processed/features/{pid}/phone_plugin_sentimental.csv", pid=config["PIDS"],))
+        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
+        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
+
 for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys():
    if config["FITBIT_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]:
        files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"]))
--- a/config.yaml
+++ b/config.yaml
@ -282,6 +282,15 @@ PHONE_MESSAGES:
      SRC_LANGUAGE: "r"
      SRC_FOLDER: "rapids" # inside src/features/phone_messages

+# Settings for the PHONE_PLUGIN_SENTIMENTAL sensor and its feature providers.
+PHONE_PLUGIN_SENTIMENTAL:
+  TABLE: plugin_sentimental_study_data # presumably the source table with the raw plugin data; confirm against the download rule
+  PROVIDERS:
+    WWBP:
+      COMPUTE: False # set to True to add this provider's output files to the workflow targets
+      FEATURES: [] # include "app" to compute one feature per app and word category instead of one per word category
+      SRC_FOLDER: "wwbp" # inside src/features/phone_plugin_sentimental
+      SRC_LANGUAGE: "python" # lower-cased into the features file name, which selects the python or r features rule
+
 # See https://www.rapids.science/latest/features/phone-screen/
 PHONE_SCREEN:
  TABLE: screen
--- a/rules/features.smk
+++ b/rules/features.smk
@ -418,6 +418,32 @@ rule phone_messages_r_features:
    script:
        "../src/features/entry.R"

+# Runs the Python feature entry script for one participant and one
+# PHONE_PLUGIN_SENTIMENTAL provider, writing that provider's features file.
+rule phone_plugin_sentimental_python_features:
+    input:
+        sensor_data = "data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv",
+        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
+    params:
+        # Provider keys are upper case in config.yaml but lower case in file names,
+        # so the wildcard is upper-cased before the lookup.
+        provider = lambda wildcards: config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        sensor_key = "phone_plugin_sentimental"
+    output:
+        "data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_python_{provider_key}.csv"
+    script:
+        "../src/features/entry.py"
+
+# Runs the R feature entry script for one participant and one
+# PHONE_PLUGIN_SENTIMENTAL provider, writing that provider's features file.
+rule phone_plugin_sentimental_r_features:
+    input:
+        # Same input file as the Python rule: the sensor file carries the "phone_" prefix.
+        sensor_data = "data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv",
+        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
+    params:
+        # Provider keys are upper case in config.yaml but lower case in file names,
+        # so the wildcard is upper-cased before the lookup.
+        provider = lambda wildcards: config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][wildcards.provider_key.upper()],
+        provider_key = "{provider_key}",
+        sensor_key = "phone_plugin_sentimental"
+    output:
+        "data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_r_{provider_key}.csv"
+    script:
+        "../src/features/entry.R"
+
 rule screen_episodes:
    input:
        screen = "data/raw/{pid}/phone_screen_with_datetime_unified.csv"
--- a/src/features/phone_plugin_sentimental/wwbp/main.py
+++ b/src/features/phone_plugin_sentimental/wwbp/main.py
@ -0,0 +1,114 @@
+import pandas as pd
+import numpy as np
+
+def wwbp_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): 
+
+    sentiment_data = pd.read_csv(sensor_data_files["sensor_data"])
+    sentiment_features = pd.DataFrame(columns=["local_segment"])
+
+    # Get relevant information about provided features 
+    app_included = "app" in provider["FEATURES"]
+
+    if not sentiment_data.empty:
+
+        sentiment_data = filter_data_by_segment(sentiment_data, time_segment)
+
+        if not sentiment_data.empty:
+
+            # Split the data into groups by time segment
+            segments = sentiment_data.groupby("local_segment")
+            time_segments = []
+
+            # Get all tuples of app_category features to calculate
+            if app_included:
+                small_df = sentiment_data.drop(sentiment_data.columns.difference(['app_name', 'word_category']), axis=1)
+                tuples = list(small_df.groupby(['app_name', 'word_category']).groups)
+                categories = list(filter(lambda x : x[1] != 'total_words' and x[1]==x[1], tuples))
+
+            # Get all word categories features to calculate 
+            else:
+                categories = sentiment_data.word_category.unique().tolist()
+                categories.remove('total_words')
+
+            # Aggregate each segment data into a single instance 
+            for _, segment_df in segments:
+                instance = process_local_segment(segment_df, categories, app_included)
+                time_segments.append(instance)
+
+            # Combine the data into a final dataframe
+            sentiment_features = pd.concat(time_segments)
+
+    return sentiment_features
+
+
+# Handles local segment logic and processing
+def process_local_segment(df, categories, app_included):
+
+    if not app_included:
+        # Simply add the relevant features
+        features_df = insert_features(df, categories, False)
+
+    else:
+        # Get the features for each app-category combination 
+        app_groups = df.groupby(['app_name'])
+        processed = []
+        for _, group in app_groups:
+            p = insert_features(group, categories, True)
+            processed.append(p)
+
+        # Combine the data into one instance 
+        features_df = pd.concat(processed).groupby(['local_segment'], as_index = False).sum()
+    
+    # Add the device_id column and return the data
+    features_df['device_id'] = df['device_id'].values[0]
+
+    return features_df
+
+
+# Calculates features in a particular local segment 
+def insert_features(df, categories, app_included=False):
+
+    app = df['app_name'].values[0]
+
+    # Map each word_category to its score 
+    category_to_score = {}
+    totals_df = pd.DataFrame()
+    totals_df['score'] = df.groupby(["word_category"])['double_sentiment_score'].sum()
+    for index, row in totals_df.iterrows():                
+        category_to_score[index] = row['score']
+
+    # Get the total number of words in the time segment
+    total_words = category_to_score['total_words']
+    category_to_score.pop('total_words')
+
+    # Populate data with the available scores otherwise fill in 0
+    data = {}
+    for c in categories:
+
+        # c is a tuple (app, word_category)
+        if app_included:
+            tuple_app = c[0]
+            tuple_cat = c[1]
+
+            # Calculate the normalized score if c present 
+            feature = tuple_cat + "_" + tuple_app
+            if tuple_app == app and tuple_cat in category_to_score:
+                data[feature] = category_to_score[tuple_cat] / total_words
+            else:
+                data[feature] = 0
+
+        # c is just a word_category
+        else: 
+            # Calculate the normalized score if c is present 
+            feature = c 
+            if c in category_to_score:
+                data[feature] = category_to_score[c] / total_words
+            else:
+                data[feature] = 0
+
+    # Create a dataframe from the data
+    data['local_segment'] = df['local_segment'].values[0]
+    processed_df = pd.DataFrame([data])
+
+    return processed_df
+