diff --git a/Snakefile b/Snakefile index 136773c0..c728ac11 100644 --- a/Snakefile +++ b/Snakefile @@ -218,6 +218,15 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys(): files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") +for provider in config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"].keys(): + if config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][provider]["COMPUTE"]: + files_to_compute.extend(expand("data/raw/{pid}/phone_plugin_sentimental_raw.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv", pid=config["PIDS"])) + files_to_compute.extend(expand("data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_{language}_{provider_key}.csv", pid=config["PIDS"], language=config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][provider]["SRC_LANGUAGE"].lower(), provider_key=provider.lower())) + files_to_compute.extend(expand("data/processed/features/{pid}/phone_plugin_sentimental.csv", pid=config["PIDS"],)) + files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"])) + files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv") + for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys(): if config["FITBIT_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]: files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"])) diff --git a/config.yaml b/config.yaml index c3e45b01..e7266e8e 100644 --- a/config.yaml +++ b/config.yaml @@ -282,6 +282,15 @@ PHONE_MESSAGES: SRC_LANGUAGE: "r" SRC_FOLDER: "rapids" # inside src/features/phone_messages +PHONE_PLUGIN_SENTIMENTAL: + TABLE: plugin_sentimental_study_data + PROVIDERS: + WWBP: + COMPUTE: False + FEATURES: [] + SRC_FOLDER: "wwbp" + SRC_LANGUAGE: "python" + # See 
https://www.rapids.science/latest/features/phone-screen/ PHONE_SCREEN: TABLE: screen diff --git a/rules/features.smk b/rules/features.smk index 77190b04..a92d02bc 100644 --- a/rules/features.smk +++ b/rules/features.smk @@ -418,6 +418,32 @@ rule phone_messages_r_features: script: "../src/features/entry.R" +rule phone_plugin_sentimental_python_features: + input: + sensor_data = "data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_plugin_sentimental" + output: + "data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_python_{provider_key}.csv" + script: + "../src/features/entry.py" + +rule phone_plugin_sentimental_r_features: + input: + sensor_data = "data/raw/{pid}/plugin_sentimental_with_datetime.csv", + time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv" + params: + provider = lambda wildcards: config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][wildcards.provider_key.upper()], + provider_key = "{provider_key}", + sensor_key = "phone_plugin_sentimental" + output: + "data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_r_{provider_key}.csv" + script: + "../src/features/entry.R" + rule screen_episodes: input: screen = "data/raw/{pid}/phone_screen_with_datetime_unified.csv" diff --git a/src/features/phone_plugin_sentimental/wwbp/main.py b/src/features/phone_plugin_sentimental/wwbp/main.py new file mode 100644 index 00000000..847a5faf --- /dev/null +++ b/src/features/phone_plugin_sentimental/wwbp/main.py @@ -0,0 +1,114 @@ +import pandas as pd +import numpy as np + +def wwbp_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs): + + sentiment_data = 
pd.read_csv(sensor_data_files["sensor_data"]) + sentiment_features = pd.DataFrame(columns=["local_segment"]) + + # Get relevant information about provided features + app_included = "app" in provider["FEATURES"] + + if not sentiment_data.empty: + + sentiment_data = filter_data_by_segment(sentiment_data, time_segment) + + if not sentiment_data.empty: + + # Split the data into groups by time segment + segments = sentiment_data.groupby("local_segment") + time_segments = [] + + # Get all tuples of app_category features to calculate + if app_included: + small_df = sentiment_data.drop(sentiment_data.columns.difference(['app_name', 'word_category']), axis=1) + tuples = list(small_df.groupby(['app_name', 'word_category']).groups) + categories = list(filter(lambda x : x[1] != 'total_words' and x[1]==x[1], tuples)) + + # Get all word categories features to calculate + else: + categories = sentiment_data.word_category.unique().tolist() + categories.remove('total_words') + + # Aggregate each segment data into a single instance + for _, segment_df in segments: + instance = process_local_segment(segment_df, categories, app_included) + time_segments.append(instance) + + # Combine the data into a final dataframe + sentiment_features = pd.concat(time_segments) + + return sentiment_features + + +# Handles local segment logic and processing +def process_local_segment(df, categories, app_included): + + if not app_included: + # Simply add the relevant features + features_df = insert_features(df, categories, False) + + else: + # Get the features for each app-category combination + app_groups = df.groupby(['app_name']) + processed = [] + for _, group in app_groups: + p = insert_features(group, categories, True) + processed.append(p) + + # Combine the data into one instance + features_df = pd.concat(processed).groupby(['local_segment'], as_index = False).sum() + + # Add the device_id column and return the data + features_df['device_id'] = df['device_id'].values[0] + + return features_df + 
# Calculates features in a particular local segment
def insert_features(df, categories, app_included=False):
    """Build a one-row dataframe of normalized sentiment scores.

    Scores in ``double_sentiment_score`` are summed per ``word_category`` and
    divided by the summed score of the special ``total_words`` category.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to aggregate. Must contain ``word_category``,
        ``double_sentiment_score`` and ``local_segment``; ``app_name`` is only
        required when ``app_included`` is True.
    categories : list
        Features to emit, in output column order. Each item is a word
        category, or an ``(app_name, word_category)`` tuple when
        ``app_included`` is True.
    app_included : bool, default False
        When True, columns are named ``<word_category>_<app_name>`` and only
        tuples whose app equals the app of the first row of ``df`` can get a
        non-zero value.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per requested feature followed by
        ``local_segment`` (taken from the first row of ``df``). Features that
        are absent from ``df`` are filled with 0.
    """
    # The app name is only compared against in the per-app layout, so do not
    # require the column to exist otherwise.
    app = df['app_name'].values[0] if app_included else None

    # Map each word_category to its summed score
    category_to_score = (
        df.groupby("word_category")['double_sentiment_score'].sum().to_dict()
    )

    # Total number of words in the segment; a missing row is treated as zero
    # words instead of raising KeyError.
    total_words = category_to_score.pop('total_words', 0)

    def normalized(category):
        # With no words recorded there is nothing to normalize by; emit 0
        # (the same filler used for absent categories) rather than inf/NaN.
        if not total_words or category not in category_to_score:
            return 0
        return category_to_score[category] / total_words

    # Populate data with the available scores otherwise fill in 0
    data = {}
    for c in categories:
        if app_included:
            # c is a tuple (app, word_category)
            tuple_app, tuple_cat = c[0], c[1]
            feature = tuple_cat + "_" + tuple_app
            data[feature] = normalized(tuple_cat) if tuple_app == app else 0
        else:
            # c is just a word_category
            data[c] = normalized(c)

    # Create a dataframe from the data
    data['local_segment'] = df['local_segment'].values[0]
    return pd.DataFrame([data])