Compare commits
12 Commits
d2fbef5234
...
11c64cfc1a
Author | SHA1 | Date |
---|---|---|
junos | 11c64cfc1a | |
junos | a6a37c7bd9 | |
junos | 9f5edf1c2b | |
junos | 4ad261fae5 | |
junos | 9ab0c8f289 | |
junos | 570d2eb656 | |
junos | f5688f6154 | |
junos | b1f356c3f7 | |
junos | 7ff3dcf5fc | |
junos | 50c0defca7 | |
junos | ac86221662 | |
junos | baa94c4c4e |
10
Snakefile
10
Snakefile
|
@ -169,7 +169,8 @@ for provider in config["PHONE_ESM"]["PROVIDERS"].keys():
|
|||
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_raw.csv",pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/raw/{pid}/phone_esm_with_datetime.csv",pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_clean.csv",pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_features/phone_esm_clean_{provider_key}.csv",pid=config["PIDS"],provider_key=provider.lower()))
|
||||
files_to_compute.extend(expand("data/interim/{pid}/phone_esm_features/phone_esm_{language}_{provider_key}.csv",pid=config["PIDS"],language=get_script_language(config["PHONE_ESM"]["PROVIDERS"][provider]["SRC_SCRIPT"]),provider_key=provider.lower()))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/phone_esm.csv", pid=config["PIDS"]))
|
||||
#files_to_compute.extend(expand("data/processed/features/{pid}/all_sensor_features.csv",pid=config["PIDS"]))
|
||||
#files_to_compute.append("data/processed/features/all_participants/all_sensor_features.csv")
|
||||
|
||||
|
@ -419,6 +420,13 @@ if config["PARAMS_FOR_ANALYSIS"]["BASELINE"]["COMPUTE"]:
|
|||
files_to_compute.extend(expand("data/interim/{pid}/baseline_questionnaires.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]))
|
||||
|
||||
# Targets (labels)
|
||||
if config["PARAMS_FOR_ANALYSIS"]["TARGET"]["COMPUTE"]:
|
||||
files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/input.csv", pid=config["PIDS"]))
|
||||
files_to_compute.extend(expand("data/processed/models/population_model/input.csv"))
|
||||
|
||||
#files_to_compute.extend(expand("data/processed/models/individual_model/{pid}/output_{cv_method}/baselines.csv", pid=config["PIDS"], cv_method=config["PARAMS_FOR_ANALYSIS"]["CV_METHODS"]))
|
||||
|
||||
rule all:
|
||||
input:
|
||||
files_to_compute
|
||||
|
|
10
config.yaml
10
config.yaml
|
@ -3,7 +3,7 @@
|
|||
########################################################################################################################
|
||||
|
||||
# See https://www.rapids.science/latest/setup/configuration/#participant-files
|
||||
PIDS: ['p031']
|
||||
PIDS: ['p031', 'p032', 'p033', 'p034', 'p035', 'p036', 'p037', 'p038', 'p039', 'p040', 'p042', 'p043', 'p044', 'p045', 'p046', 'p049', 'p050', 'p052', 'p053', 'p054', 'p055', 'p057', 'p058', 'p059', 'p060', 'p061', 'p062', 'p064', 'p067', 'p068', 'p069', 'p070', 'p071', 'p072', 'p073', 'p074', 'p075', 'p076', 'p077', 'p078', 'p079', 'p080', 'p081', 'p082', 'p083', 'p084', 'p085', 'p086', 'p088', 'p089', 'p090', 'p091', 'p092', 'p093', 'p106', 'p107']
|
||||
|
||||
# See https://www.rapids.science/latest/setup/configuration/#automatic-creation-of-participant-files
|
||||
CREATE_PARTICIPANT_FILES:
|
||||
|
@ -638,16 +638,16 @@ ALL_CLEANING_OVERALL:
|
|||
|
||||
PARAMS_FOR_ANALYSIS:
|
||||
BASELINE:
|
||||
COMPUTE: False
|
||||
COMPUTE: True
|
||||
FOLDER: data/external/baseline
|
||||
CONTAINER: [results-survey637813_final.csv, # Slovenia
|
||||
results-survey358134_final.csv, # Belgium 1
|
||||
results-survey413767_final.csv # Belgium 2
|
||||
]
|
||||
QUESTION_LIST: survey637813+question_text.csv
|
||||
FEATURES: [age, gender, startlanguage, demand, control, demand_control_ratio]
|
||||
FEATURES: [age, gender, startlanguage, limesurvey_demand, limesurvey_control, limesurvey_demand_control_ratio, limesurvey_demand_control_ratio_quartile]
|
||||
CATEGORICAL_FEATURES: [gender]
|
||||
|
||||
TARGET:
|
||||
SCALE: [positive_affect, negative_affect]
|
||||
|
||||
COMPUTE: True
|
||||
LABEL: PANAS_negative_affect_mean
|
||||
|
|
|
@ -341,7 +341,7 @@ rule esm_features:
|
|||
provider_key = "{provider_key}",
|
||||
sensor_key = "phone_esm",
|
||||
scales=lambda wildcards: config["PHONE_ESM"]["PROVIDERS"][wildcards.provider_key.upper()]["SCALES"]
|
||||
output: "data/interim/{pid}/phone_esm_features/phone_esm_clean_{provider_key}.csv"
|
||||
output: "data/interim/{pid}/phone_esm_features/phone_esm_python_{provider_key}.csv"
|
||||
script:
|
||||
"../src/features/entry.py"
|
||||
|
||||
|
|
|
@ -27,3 +27,25 @@ rule baseline_features:
|
|||
features="data/processed/features/{pid}/baseline_features.csv"
|
||||
script:
|
||||
"../src/data/baseline_features.py"
|
||||
|
||||
# Produce the individual-model input for one participant ({pid}): the script receives that
# participant's cleaned sensor features and the label configured under
# PARAMS_FOR_ANALYSIS -> TARGET -> LABEL as `target_variable`.
rule select_target:
|
||||
input:
|
||||
cleaned_sensor_features = "data/processed/features/{pid}/all_sensor_features_cleaned_rapids.csv"
|
||||
params:
|
||||
target_variable = config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
|
||||
output:
|
||||
"data/processed/models/individual_model/{pid}/input.csv"
|
||||
script:
|
||||
"../src/models/select_targets.py"
|
||||
|
||||
# Produce the single population-model input: the script receives the cleaned sensor features
# of all participants plus one baseline_features.csv per pid listed in config["PIDS"], and the
# label configured under PARAMS_FOR_ANALYSIS -> TARGET -> LABEL as `target_variable`.
rule merge_features_and_targets_for_population_model:
|
||||
input:
|
||||
cleaned_sensor_features = "data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv",
|
||||
demographic_features = expand("data/processed/features/{pid}/baseline_features.csv", pid=config["PIDS"]),
|
||||
params:
|
||||
target_variable=config["PARAMS_FOR_ANALYSIS"]["TARGET"]["LABEL"]
|
||||
output:
|
||||
"data/processed/models/population_model/input.csv"
|
||||
script:
|
||||
"../src/models/merge_features_and_targets_for_population_model.py"
|
||||
|
||||
|
|
|
@ -60,15 +60,15 @@ if not participant_info.empty:
|
|||
0, "startlanguage"
|
||||
]
|
||||
if (
|
||||
("demand" in requested_features)
|
||||
or ("control" in requested_features)
|
||||
or ("demand_control_ratio" in requested_features)
|
||||
("limesurvey_demand" in requested_features)
|
||||
or ("limesurvey_control" in requested_features)
|
||||
or ("limesurvey_demand_control_ratio" in requested_features)
|
||||
):
|
||||
participant_info_t = participant_info.T
|
||||
rows_baseline = participant_info_t.index
|
||||
|
||||
if ("demand" in requested_features) or (
|
||||
"demand_control_ratio" in requested_features
|
||||
if ("limesurvey_demand" in requested_features) or (
|
||||
"limesurvey_demand_control_ratio" in requested_features
|
||||
):
|
||||
# Find questions about demand, but disregard time (duration of filling in questionnaire)
|
||||
rows_demand = rows_baseline.str.startswith(
|
||||
|
@ -96,12 +96,12 @@ if not participant_info.empty:
|
|||
)
|
||||
baseline_interim = pd.concat([baseline_interim, limesurvey_demand], axis=0, ignore_index=True)
|
||||
if "demand" in requested_features:
|
||||
baseline_features.loc[0, "demand"] = limesurvey_demand[
|
||||
baseline_features.loc[0, "limesurvey_demand"] = limesurvey_demand[
|
||||
"score"
|
||||
].sum()
|
||||
|
||||
if ("control" in requested_features) or (
|
||||
"demand_control_ratio" in requested_features
|
||||
if ("limesurvey_control" in requested_features) or (
|
||||
"limesurvey_demand_control_ratio" in requested_features
|
||||
):
|
||||
# Find questions about control, but disregard time (duration of filling in questionnaire)
|
||||
rows_control = rows_baseline.str.startswith(
|
||||
|
@ -130,12 +130,12 @@ if not participant_info.empty:
|
|||
|
||||
baseline_interim = pd.concat([baseline_interim, limesurvey_control], axis=0, ignore_index=True)
|
||||
|
||||
if "control" in requested_features:
|
||||
baseline_features.loc[0, "control"] = limesurvey_control[
|
||||
if "limesurvey_control" in requested_features:
|
||||
baseline_features.loc[0, "limesurvey_control"] = limesurvey_control[
|
||||
"score"
|
||||
].sum()
|
||||
|
||||
if "demand_control_ratio" in requested_features:
|
||||
if "limesurvey_demand_control_ratio" in requested_features:
|
||||
limesurvey_demand_control_ratio = (
|
||||
limesurvey_demand["score"].sum() / limesurvey_control["score"].sum()
|
||||
)
|
||||
|
@ -167,10 +167,10 @@ if not participant_info.empty:
|
|||
limesurvey_quartile = np.nan
|
||||
|
||||
baseline_features.loc[
|
||||
0, "demand_control_ratio"
|
||||
0, "limesurvey_demand_control_ratio"
|
||||
] = limesurvey_demand_control_ratio
|
||||
baseline_features.loc[
|
||||
0, "demand_control_ratio_quartile"
|
||||
0, "limesurvey_demand_control_ratio_quartile"
|
||||
] = limesurvey_quartile
|
||||
|
||||
if not baseline_interim.empty:
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
import pandas as pd
|
||||
|
||||
|
||||
def retain_target_column(df_input: pd.DataFrame, target_variable_name: str) -> pd.DataFrame:
    """Drop every ``phone_esm_straw*`` column and keep one of them as ``target``.

    Args:
        df_input: Data frame whose column labels are strings; it is not modified.
        target_variable_name: Text identifying the target column. It is matched
            against the ``phone_esm_straw*`` column names with ``str.contains``,
            which interprets it as a regular expression by default.

    Returns:
        A new data frame without the ``phone_esm_straw*`` columns and with the
        selected column appended at the end under the name ``target``.

    Raises:
        ValueError: If no ``phone_esm_straw*`` column matches the requested
            target, or if more than one does.
    """
    column_names = df_input.columns
    # Find all columns coming from phone_esm, since these are not features for our purposes and we will drop them.
    esm_names = column_names[column_names.str.startswith("phone_esm_straw")]

    target_variable_index = esm_names.str.contains(target_variable_name)
    if not target_variable_index.any():
        # Build ONE message string; passing several arguments to ValueError
        # would turn the message into a tuple.
        raise ValueError(
            f"The requested target ({target_variable_name}) cannot be found in the dataset. "
            "Please check the names of phone_esm_ columns in all_sensor_features_cleaned_rapids.csv"
        )

    target_columns = esm_names[target_variable_index]
    if len(target_columns) > 1:
        # Several matching columns cannot be stored in the single "target" column.
        raise ValueError(
            f"The requested target ({target_variable_name}) matches more than one column: "
            f"{', '.join(target_columns)}. Please use a more specific target name."
        )

    # We will only keep one column related to phone_esm and that will be our target variable.
    # Add it back to the very end of the data frame and rename it to target.
    sensor_features_plus_target = df_input.drop(columns=esm_names)
    sensor_features_plus_target["target"] = df_input[target_columns[0]]
    return sensor_features_plus_target
|
|
@ -0,0 +1,20 @@
|
|||
import pandas as pd
|
||||
|
||||
from helper import retain_target_column
|
||||
|
||||
# Cleaned sensor features of all participants, as provided by the Snakemake rule.
sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])

# Read each participant's baseline features and tag the rows with the participant id.
baseline_frames = []
for baseline_features_path in snakemake.input["demographic_features"]:
    # The rule passes paths of the form data/processed/features/{pid}/baseline_features.csv,
    # so the participant id is the fourth path component.
    pid = baseline_features_path.split("/")[3]
    baseline_frames.append(pd.read_csv(baseline_features_path).assign(pid=pid))

# Concatenate once instead of growing a data frame inside the loop: this avoids
# re-copying the accumulated rows on every iteration and avoids concatenating
# onto an empty seed frame.
all_baseline_features = pd.concat(baseline_frames, axis=0) if baseline_frames else pd.DataFrame()

# merge sensor features and baseline features
features = sensor_features.merge(all_baseline_features, on="pid", how="left")

target_variable_name = snakemake.params["target_variable"]
model_input = retain_target_column(features, target_variable_name)

model_input.to_csv(snakemake.output[0], index=False)
|
|
@ -0,0 +1,11 @@
|
|||
import pandas as pd
|
||||
|
||||
from helper import retain_target_column
|
||||
|
||||
# Label to use as the target, as passed in by the Snakemake rule.
target_variable_name = snakemake.params["target_variable"]
cleaned_sensor_features = pd.read_csv(snakemake.input["cleaned_sensor_features"])

model_input = retain_target_column(cleaned_sensor_features, target_variable_name)
# Keep only the rows that actually have a target value.
model_input = model_input.dropna(subset=["target"])

model_input.to_csv(snakemake.output[0], index=False)
|
Loading…
Reference in New Issue