Implemented by @adisb01 new plugin sentimental features

2021-03-05 11:04:11 -05:00
4 changed files with 158 additions and 0 deletions
--- a/9
+++ b/9
@ -218,6 +218,15 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
 for provider in config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"].keys():
    if config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][provider]["COMPUTE"]:
        files_to_compute.extend(expand("data/raw/{pid}/phone_plugin_sentimental_raw.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv", pid=config["PIDS"]))
        files_to_compute.extend(expand("data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower()))
        files_to_compute.extend(expand("data/processed/features/{pid}/phone_plugin_sentimental.csv", pid=config["PIDS"],))
        files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
        files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
 for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys():
    if config["FITBIT_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]:
        files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"]))
--- a/config.yaml
+++ b/config.yaml
@ -282,6 +282,15 @@ PHONE_MESSAGES:
      SRC_LANGUAGE: "r"
      SRC_FOLDER: "rapids" # inside src/features/phone_messages
 PHONE_PLUGIN_SENTIMENTAL:
  TABLE: plugin_sentimental_study_data
  PROVIDERS:
    WWBP:
      COMPUTE: False
      FEATURES: []
      SRC_FOLDER: "wwbp"
      SRC_LANGUAGE: "python"
 # See https://www.rapids.science/latest/features/phone-screen/
 PHONE_SCREEN:
  TABLE: screen
--- a/rules/features.smk
+++ b/rules/features.smk
@ -418,6 +418,32 @@ rule phone_messages_r_features:
    script:
        "../src/features/entry.R"
 rule phone_plugin_sentimental_python_features:
    input:
        sensor_data = "data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv",
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    params:
        provider = lambda wildcards: config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][wildcards.provider_key.upper()],
        provider_key = "{provider_key}",
        sensor_key = "phone_plugin_sentimental"
    output:
        "data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_python_{provider_key}.csv"
    script:
        "../src/features/entry.py"
 rule phone_plugin_sentimental_r_features:
    input:
        sensor_data = "data/raw/{pid}/plugin_sentimental_with_datetime.csv",
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    params:
        provider = lambda wildcards: config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][wildcards.provider_key.upper()],
        provider_key = "{provider_key}",
        sensor_key = "phone_plugin_sentimental"
    output:
        "data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_r_{provider_key}.csv"
    script:
        "../src/features/entry.R"
 rule screen_episodes:
    input:
        screen = "data/raw/{pid}/phone_screen_with_datetime_unified.csv"
--- a/src/features/phone_plugin_sentimental/wwbp/main.py
+++ b/src/features/phone_plugin_sentimental/wwbp/main.py
@ -0,0 +1,114 @@
 import pandas as pd
 import numpy as np
 def wwbp_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): 
    sentiment_data = pd.read_csv(sensor_data_files["sensor_data"])
    sentiment_features = pd.DataFrame(columns=["local_segment"])
    # Get relevant information about provided features 
    app_included = "app" in provider["FEATURES"]
    if not sentiment_data.empty:
        sentiment_data = filter_data_by_segment(sentiment_data, time_segment)
        if not sentiment_data.empty:
            # Split the data into groups by time segment
            segments = sentiment_data.groupby("local_segment")
            time_segments = []
            # Get all tuples of app_category features to calculate
            if app_included:
                small_df = sentiment_data.drop(sentiment_data.columns.difference(['app_name', 'word_category']), axis=1)
                tuples = list(small_df.groupby(['app_name', 'word_category']).groups)
                categories = list(filter(lambda x : x[1] != 'total_words' and x[1]==x[1], tuples))
            # Get all word categories features to calculate 
            else:
                categories = sentiment_data.word_category.unique().tolist()
                categories.remove('total_words')
            # Aggregate each segment data into a single instance 
            for _, segment_df in segments:
                instance = process_local_segment(segment_df, categories, app_included)
                time_segments.append(instance)
            # Combine the data into a final dataframe
            sentiment_features = pd.concat(time_segments)
    return sentiment_features
 # Handles local segment logic and processing
 def process_local_segment(df, categories, app_included):
    if not app_included:
        # Simply add the relevant features
        features_df = insert_features(df, categories, False)
    else:
        # Get the features for each app-category combination 
        app_groups = df.groupby(['app_name'])
        processed = []
        for _, group in app_groups:
            p = insert_features(group, categories, True)
            processed.append(p)
        # Combine the data into one instance 
        features_df = pd.concat(processed).groupby(['local_segment'], as_index = False).sum()
    # Add the device_id column and return the data
    features_df['device_id'] = df['device_id'].values[0]
    return features_df
 # Calculates features in a particular local segment 
 def insert_features(df, categories, app_included=False):
    app = df['app_name'].values[0]
    # Map each word_category to its score 
    category_to_score = {}
    totals_df = pd.DataFrame()
    totals_df['score'] = df.groupby(["word_category"])['double_sentiment_score'].sum()
    for index, row in totals_df.iterrows():                
        category_to_score[index] = row['score']
    # Get the total number of words in the time segment
    total_words = category_to_score['total_words']
    category_to_score.pop('total_words')
    # Populate data with the available scores otherwise fill in 0
    data = {}
    for c in categories:
        # c is a tuple (app, word_category)
        if app_included:
            tuple_app = c[0]
            tuple_cat = c[1]
            # Calculate the normalized score if c present 
            feature = tuple_cat + "_" + tuple_app
            if tuple_app == app and tuple_cat in category_to_score:
                data[feature] = category_to_score[tuple_cat] / total_words
            else:
                data[feature] = 0
        # c is just a word_category
        else: 
            # Calculate the normalized score if c is present 
            feature = c 
            if c in category_to_score:
                data[feature] = category_to_score[c] / total_words
            else:
                data[feature] = 0
    # Create a dataframe from the data
    data['local_segment'] = df['local_segment'].values[0]
    processed_df = pd.DataFrame([data])
    return processed_df