Implemented by @adisb01 new plugin sentimental features
parent
fc5b5eead8
commit
09ca9725c0
|
@ -218,6 +218,15 @@ for provider in config["PHONE_LOCATIONS"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
|
||||
files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
|
||||
# Register every file the PHONE_PLUGIN_SENTIMENTAL providers must produce.
# Only providers whose COMPUTE flag is set contribute targets.
for provider, provider_config in config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"].items():
    if not provider_config["COMPUTE"]:
        continue

    # Raw plugin data, before and after datetime columns are attached
    files_to_compute.extend(expand("data/raw/{pid}/phone_plugin_sentimental_raw.csv", pid=config["PIDS"]))
    files_to_compute.extend(expand("data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv", pid=config["PIDS"]))

    # Per-provider features; SRC_LANGUAGE is part of the file name and so selects the python or r rule
    files_to_compute.extend(
        expand(
            "data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_{language}_{provider_key}.csv",
            pid=config["PIDS"],
            language=provider_config["SRC_LANGUAGE"].lower(),
            provider_key=provider.lower(),
        )
    )

    # Per-participant sensor features and the merged feature files
    files_to_compute.extend(expand("data/processed/features/{pid}/phone_plugin_sentimental.csv", pid=config["PIDS"]))
    files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv", pid=config["PIDS"]))
    files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
|
||||
for provider in config["FITBIT_DATA_YIELD"]["PROVIDERS"].keys():
|
||||
if config["FITBIT_DATA_YIELD"]["PROVIDERS"][provider]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/raw/{pid}/fitbit_heartrate_intraday_raw.csv", pid=config["PIDS"]))
|
||||
|
|
|
@ -282,6 +282,15 @@ PHONE_MESSAGES:
|
|||
SRC_LANGUAGE: "r"
|
||||
SRC_FOLDER: "rapids" # inside src/features/phone_messages
|
||||
|
||||
# Settings for the phone sentimental plugin sensor
PHONE_PLUGIN_SENTIMENTAL:
  TABLE: plugin_sentimental_study_data
  PROVIDERS:
    WWBP:
      # Set to True to add this provider's files to the pipeline targets
      COMPUTE: False
      # Include "app" to get one feature per (app, word category) instead of one per word category
      FEATURES: []
      SRC_FOLDER: "wwbp"
      # Lower-cased into the feature file name, which selects the python or r feature rule
      SRC_LANGUAGE: "python"
|
||||
|
||||
# See https://www.rapids.science/latest/features/phone-screen/
|
||||
PHONE_SCREEN:
|
||||
TABLE: screen
|
||||
|
|
|
@ -418,6 +418,32 @@ rule phone_messages_r_features:
|
|||
script:
|
||||
"../src/features/entry.R"
|
||||
|
||||
# Compute the sentimental-plugin features of one participant ({pid}) for one
# provider ({provider_key}) by running the Python feature entry script.
rule phone_plugin_sentimental_python_features:
    input:
        sensor_data = "data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv",
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    params:
        # Provider keys are upper case in the config but lower case in file names
        provider = lambda wildcards: config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][wildcards.provider_key.upper()],
        provider_key = "{provider_key}",
        sensor_key = "phone_plugin_sentimental"
    output:
        "data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_python_{provider_key}.csv"
    script:
        "../src/features/entry.py"
|
||||
|
||||
# Compute the sentimental-plugin features of one participant ({pid}) for one
# provider ({provider_key}) by running the R feature entry script.
rule phone_plugin_sentimental_r_features:
    input:
        # Must carry the "phone_" prefix: this is the file name the pipeline
        # requests as a target and the one the Python feature rule reads.
        sensor_data = "data/raw/{pid}/phone_plugin_sentimental_with_datetime.csv",
        time_segments_labels = "data/interim/time_segments/{pid}_time_segments_labels.csv"
    params:
        # Provider keys are upper case in the config but lower case in file names
        provider = lambda wildcards: config["PHONE_PLUGIN_SENTIMENTAL"]["PROVIDERS"][wildcards.provider_key.upper()],
        provider_key = "{provider_key}",
        sensor_key = "phone_plugin_sentimental"
    output:
        "data/interim/{pid}/phone_plugin_sentimental_features/phone_plugin_sentimental_r_{provider_key}.csv"
    script:
        "../src/features/entry.R"
|
||||
|
||||
rule screen_episodes:
|
||||
input:
|
||||
screen = "data/raw/{pid}/phone_screen_with_datetime_unified.csv"
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
def wwbp_features(sensor_data_files, time_segment, provider, filter_data_by_segment, *args, **kwargs):
    """Compute the WWBP sentiment features for one time segment.

    Parameters
    ----------
    sensor_data_files : mapping
        Must contain ``"sensor_data"``, the path of the CSV file to read.
    time_segment
        Passed unchanged to ``filter_data_by_segment``.
    provider : mapping
        Provider configuration. When ``"app"`` is listed in
        ``provider["FEATURES"]`` one feature is produced per
        (app_name, word_category) pair, otherwise one per word_category.
    filter_data_by_segment : callable
        Called as ``filter_data_by_segment(data, time_segment)``. The frame it
        returns is grouped by its ``local_segment`` column.

    Returns
    -------
    pandas.DataFrame
        One row per local segment as built by ``process_local_segment``. When
        there is nothing to aggregate, an empty frame with only a
        ``local_segment`` column is returned.
    """
    sentiment_data = pd.read_csv(sensor_data_files["sensor_data"])
    sentiment_features = pd.DataFrame(columns=["local_segment"])

    # Get relevant information about provided features
    app_included = "app" in provider["FEATURES"]

    if sentiment_data.empty:
        return sentiment_features

    sentiment_data = filter_data_by_segment(sentiment_data, time_segment)
    if sentiment_data.empty:
        return sentiment_features

    # 'total_words' rows are the normalising denominator, not a feature, and
    # missing (NaN) labels cannot name a feature column, so both are skipped.
    if app_included:
        # Every (app_name, word_category) pair observed in the data
        pairs = sentiment_data.groupby(['app_name', 'word_category']).groups
        categories = [
            (app, category)
            for app, category in pairs
            if category != 'total_words' and pd.notna(app) and pd.notna(category)
        ]
    else:
        # Every word_category observed in the data, in order of appearance
        categories = [
            category
            for category in sentiment_data.word_category.unique().tolist()
            if category != 'total_words' and pd.notna(category)
        ]

    # Aggregate each segment's rows into a single instance
    time_segments = [
        process_local_segment(segment_df, categories, app_included)
        for _, segment_df in sentiment_data.groupby("local_segment")
    ]

    # pd.concat raises on an empty list, so fall back to the empty result
    if not time_segments:
        return sentiment_features

    # Combine the data into a final dataframe
    return pd.concat(time_segments)
|
||||
|
||||
|
||||
def process_local_segment(df, categories, app_included):
    """Aggregate the rows of one local segment into a single-row frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one local segment. Its ``device_id`` column is read here, and
        ``app_name`` as well when ``app_included`` is true.
    categories : list
        Features to compute, forwarded to ``insert_features``.
    app_included : bool
        When true, features are computed per app and then merged.

    Returns
    -------
    pandas.DataFrame
        The segment's features plus a ``device_id`` column taken from the
        first row of ``df``.
    """
    if app_included:
        # One single-row frame per app. insert_features writes 0 for features
        # that belong to other apps, so summing by segment merges the rows.
        per_app = [
            insert_features(app_df, categories, True)
            for _, app_df in df.groupby(['app_name'])
        ]
        features_df = pd.concat(per_app).groupby(['local_segment'], as_index = False).sum()
    else:
        # Word-category features are computed over the whole segment at once
        features_df = insert_features(df, categories, False)

    # Add the device_id column and return the data
    features_df['device_id'] = df['device_id'].values[0]
    return features_df
|
||||
|
||||
|
||||
def insert_features(df, categories, app_included=False):
    """Compute the normalised sentiment features for one slice of a segment.

    The ``double_sentiment_score`` values are summed per ``word_category`` and
    each requested category's sum is divided by the sum of the
    ``'total_words'`` category.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to aggregate. Reads the ``word_category``,
        ``double_sentiment_score`` and ``local_segment`` columns, and
        ``app_name`` when ``app_included`` is true.
    categories : list
        Word categories to report, or ``(app_name, word_category)`` tuples
        when ``app_included`` is true.
    app_included : bool, default False
        When true, features are named ``"<word_category>_<app_name>"`` and
        only the tuples whose app matches the first ``app_name`` in ``df``
        receive a score; all others are 0.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per requested feature followed by
        ``local_segment``. A feature is 0 when its category is absent from
        ``df``, or when ``df`` has no ``'total_words'`` rows or their sum is
        zero (there is nothing to normalise by).
    """
    # Total score of every word category present in this slice
    category_to_score = df.groupby("word_category")['double_sentiment_score'].sum().to_dict()

    # 'total_words' is the normalising denominator, not a feature of its own.
    # Defaulting to 0 avoids a KeyError when the slice has no such row.
    total_words = category_to_score.pop('total_words', 0)

    def normalized(category):
        # Absent category, or no words to normalise by -> 0
        if category not in category_to_score or not total_words:
            return 0
        return category_to_score[category] / total_words

    data = {}
    if app_included:
        # Each entry of categories is a tuple (app, word_category)
        app = df['app_name'].values[0]
        for tuple_app, tuple_cat in categories:
            feature = tuple_cat + "_" + tuple_app
            data[feature] = normalized(tuple_cat) if tuple_app == app else 0
    else:
        # Each entry of categories is just a word_category
        for category in categories:
            data[category] = normalized(category)

    # Create a single-row dataframe from the data
    data['local_segment'] = df['local_segment'].values[0]
    return pd.DataFrame([data])
|
||||
|
Loading…
Reference in New Issue