Compare commits

...

12 Commits

Author SHA1 Message Date
junos 11c64cfc1a Include all participants again. 2022-04-12 17:20:19 +02:00
junos a6a37c7bd9 Drop NaN targets.
This mirrors INNER join in merge_features_and_targets_for_individual_model.py:

data = pd.concat([sensor_features, targets[["target"]]], axis=1, join="inner")
2022-04-12 17:01:49 +02:00
junos 9f5edf1c2b Revert "Add a rule for model baselines."
The example was for a classification rather than regression problem.

This reverts commit 9ab0c8f289.

# Conflicts:
#	rules/models.smk
2022-04-12 16:59:42 +02:00
junos 4ad261fae5 Rename baseline features AGAIN.
Correct other mistakes.
2022-04-12 16:55:01 +02:00
junos 9ab0c8f289 Add a rule for model baselines.
Add baselines and helper functions to main models dir.
2022-04-12 14:23:58 +02:00
junos 570d2eb656 Add the file for population model to Snakefile. 2022-04-12 14:11:40 +02:00
junos f5688f6154 Add a rule to merge sensor and baseline features.
And select target as before.
2022-04-08 15:42:04 +02:00
junos b1f356c3f7 Extract a function to be used elsewhere. 2022-04-08 15:36:32 +02:00
junos 7ff3dcf5fc Move and rename target variable. 2022-04-06 18:21:09 +02:00
junos 50c0defca7 Select target columns (no parsing necessary). 2022-04-06 18:16:49 +02:00
junos ac86221662 [WIP] Add a rule to parse targets.
Does nothing for now.
2022-04-06 17:47:03 +02:00
junos baa94c4c4e Correct additional error in feature file naming.
Add the final feature file to the list in Snakefile.
2022-04-06 17:29:17 +02:00
8 changed files with 99 additions and 20 deletions

View File

@ -169,7 +169,8 @@ for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_raw.csv",pid=config["PIDS"]))
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_with_datetime.csv",pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_clean.csv",pid=config["PIDS"]))
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_features/phone_esm_clean_{provider_key}.csv",pid=config["PIDS"],provider_key=provider.lower()))
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_features/phone_esm_{language}_{provider_key}.csv",pid=config["PIDS"],language=get_script_language(config["PHONE_ESM"]["PROVIDERS"][provider]["SRC_SCRIPT"]),provider_key=provider.lower()))
files_to_compute.extend(expand("data/processed/features/{pid}/phone_esm.csv", pid=config["PIDS"]))
#files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
#files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
@ -419,6 +420,13 @@ if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
files_to_compute.extend(expand("data/interim/{pid}/baseline_questionnaires.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
# Targets (labels)
if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
files_to_compute.extend(expand("data/processed/models/population_model/input.csv"))
#files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
# Default target: building "all" requires every file accumulated in
# files_to_compute by the feature/target configuration blocks above.
rule all:
    input:
        files_to_compute

View File

@ -3,7 +3,7 @@
########################################################################################################################
# See https://www.rapids.science/latest/setup/configuration/#participant-files
PIDS: ['p031']
PIDS: ['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107']
# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
CREATE_PARTICIPANT_FILES:
@ -638,16 +638,16 @@ ALL_CLEANING_OVERALL:
PARAMS_FOR_ANALYSIS:
BASELINE:
COMPUTE: False
COMPUTE: True
FOLDER: data/external/baseline
CONTAINER: [results-survey637813_final.csv, # Slovenia
results-survey358134_final.csv, # Belgium 1
results-survey413767_final.csv # Belgium 2
]
QUESTION_LIST: survey637813+question_text.csv
FEATURES: [age, gender, startlanguage, demand, control, demand_control_ratio]
FEATURES: [age, gender, startlanguage, limesurvey_demand, limesurvey_control, limesurvey_demand_control_ratio, limesurvey_demand_control_ratio_quartile]
CATEGORICAL_FEATURES: [gender]
TARGET:
SCALE: [positive_affect, negative_affect]
COMPUTE: True
LABEL: PANAS_negative_affect_mean

View File

@ -341,7 +341,7 @@ rule esm_features:
provider_key = "{provider_key}",
sensor_key = "phone_esm",
scales=lambda wildcards: config["PHONE_ESM"]["PROVIDERS"][wildcards.provider_key.upper()]["SCALES"]
output: "data/interim/{pid}/phone_esm_features/phone_esm_clean_{provider_key}.csv"
output: "data/interim/{pid}/phone_esm_features/phone_esm_python_{provider_key}.csv"
script:
"../src/features/entry.py"

View File

@ -27,3 +27,25 @@ rule baseline_features:
features="data/processed/features/{pid}/baseline_features.csv"
script:
"../src/data/baseline_features.py"
# Build the per-participant (individual) model input: from the cleaned
# sensor features, keep only the configured ESM target column.
rule select_target:
    input:
        cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_rapids.csv"
    params:
        # Name of the ESM column to keep as the label (e.g. PANAS_negative_affect_mean).
        target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
    output:
        "data/processed/models/individual_model/{pid}/input.csv"
    script:
        "../src/models/select_targets.py"
# Build the population-model input: merge every participant's baseline
# (demographic) features onto the pooled cleaned sensor features, then
# keep the configured ESM target column.
rule merge_features_and_targets_for_population_model:
    input:
        cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv",
        # One baseline CSV per participant; the script recovers the pid from each path.
        demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]),
    params:
        target_variable=config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
    output:
        "data/processed/models/population_model/input.csv"
    script:
        "../src/models/merge_features_and_targets_for_population_model.py"

View File

@ -60,15 +60,15 @@ if not participant_info.empty:
0, "startlanguage"
]
if (
("demand" in requested_features)
or ("control" in requested_features)
or ("demand_control_ratio" in requested_features)
("limesurvey_demand" in requested_features)
or ("limesurvey_control" in requested_features)
or ("limesurvey_demand_control_ratio" in requested_features)
):
participant_info_t = participant_info.T
rows_baseline = participant_info_t.index
if ("demand" in requested_features) or (
"demand_control_ratio" in requested_features
if ("limesurvey_demand" in requested_features) or (
"limesurvey_demand_control_ratio" in requested_features
):
# Find questions about demand, but disregard time (duration of filling in questionnaire)
rows_demand = rows_baseline.str.startswith(
@ -96,12 +96,12 @@ if not participant_info.empty:
)
baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
if "demand" in requested_features:
baseline_features.loc[0, "demand"] = limesurvey_demand[
baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
"score"
].sum()
if ("control" in requested_features) or (
"demand_control_ratio" in requested_features
if ("limesurvey_control" in requested_features) or (
"limesurvey_demand_control_ratio" in requested_features
):
# Find questions about control, but disregard time (duration of filling in questionnaire)
rows_control = rows_baseline.str.startswith(
@ -130,12 +130,12 @@ if not participant_info.empty:
baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)
if "control" in requested_features:
baseline_features.loc[0, "control"] = limesurvey_control[
if "limesurvey_control" in requested_features:
baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
"score"
].sum()
if "demand_control_ratio" in requested_features:
if "limesurvey_demand_control_ratio" in requested_features:
limesurvey_demand_control_ratio = (
limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
)
@ -167,10 +167,10 @@ if not participant_info.empty:
limesurvey_quartile = np.nan
baseline_features.loc[
0, "demand_control_ratio"
0, "limesurvey_demand_control_ratio"
] = limesurvey_demand_control_ratio
baseline_features.loc[
0, "demand_control_ratio_quartile"
0, "limesurvey_demand_control_ratio_quartile"
] = limesurvey_quartile
if not baseline_interim.empty:

View File

@ -0,0 +1,18 @@
import pandas as pd
def retain_target_column(df_input: pd.DataFrame, target_variable_name: str) -> pd.DataFrame:
    """Drop all phone_esm columns except the requested target.

    Columns whose names start with "phone_esm_straw" are ESM outcome
    columns, not features for our purposes. All of them are removed; the
    one matching ``target_variable_name`` is added back as the last
    column of the data frame, renamed to "target".

    Parameters
    ----------
    df_input: feature data frame containing phone_esm_straw columns.
    target_variable_name: substring identifying the ESM column to keep.

    Returns
    -------
    A copy of ``df_input`` without phone_esm columns, plus a "target" column.

    Raises
    ------
    ValueError: if no phone_esm_straw column matches ``target_variable_name``.
    """
    column_names = df_input.columns
    esm_names_index = column_names.str.startswith("phone_esm_straw")
    esm_names = column_names[esm_names_index]
    # NOTE: str.contains is a substring match; assumes exactly one ESM
    # column matches the requested target — TODO confirm against config.
    target_variable_index = esm_names.str.contains(target_variable_name)
    if all(~target_variable_index):
        # BUG FIX: the original passed several arguments to ValueError,
        # which rendered the message as a tuple with missing spaces.
        raise ValueError(
            f"The requested target ({target_variable_name}) cannot be found in the dataset. "
            "Please check the names of phone_esm_ columns in all_sensor_features_cleaned_rapids.csv"
        )
    sensor_features_plus_target = df_input.drop(esm_names, axis=1)
    # Keep only one column related to phone_esm: add it back at the very
    # end of the data frame and rename it to "target".
    sensor_features_plus_target["target"] = df_input[esm_names[target_variable_index]]
    return sensor_features_plus_target

View File

@ -0,0 +1,20 @@
import pandas as pd
from helper import retain_target_column

# Population-model input: merge per-participant baseline (demographic)
# features onto the pooled cleaned sensor features, then select the target.
sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])

# Each baseline CSV lives under data/processed/features/{pid}/...; tag
# every row with the pid recovered from the path so we can merge on it.
baseline_frames = []
for path in snakemake.input["demographic_features"]:
    participant = path.split("/")[3]
    baseline_frames.append(pd.read_csv(path).assign(pid=participant))
if baseline_frames:
    all_baseline_features = pd.concat(baseline_frames, axis=0)
else:
    all_baseline_features = pd.DataFrame()

# merge sensor features and baseline features
features = sensor_features.merge(all_baseline_features, on="pid", how="left")
model_input = retain_target_column(features, snakemake.params["target_variable"])
model_input.to_csv(snakemake.output[0], index=False)

View File

@ -0,0 +1,11 @@
import pandas as pd
from helper import retain_target_column

# Individual-model input: keep only the configured ESM target column and
# drop the rows where that target is missing.
sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])
model_input = retain_target_column(sensor_features, snakemake.params["target_variable"])
model_input = model_input.dropna(axis="index", how="any", subset=["target"])
model_input.to_csv(snakemake.output[0], index=False)