diff --git a/exploration/ml_pipeline_classification.py b/exploration/ml_pipeline_classification.py
index 233dffc..3acefcb 100644
--- a/exploration/ml_pipeline_classification.py
+++ b/exploration/ml_pipeline_classification.py
@@ -51,17 +51,19 @@ cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could
 n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs
 
 # %% jupyter={"source_hidden": true}
-model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
+model_input = pd.read_csv("../data/stressfulness_event_nonstandardized/input_appraisal_stressfulness_event_mean.csv")
 
 # %% jupyter={"source_hidden": true}
 index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
 model_input.set_index(index_columns, inplace=True)
+model_input['target'].value_counts()
 
 # %% jupyter={"source_hidden": true}
-bins = [-10, -1, 1, 10] # bins for z-scored targets
-model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'medium', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
+# bins = [-10, -1, 1, 10] # bins for z-scored targets
+bins = [0, 1, 4] # bins for stressfulness (1-4) target
+model_input['target'], edges = pd.cut(model_input.target, bins=bins, labels=['low', 'high'], retbins=True, right=True) #['low', 'medium', 'high']
 model_input['target'].value_counts(), edges
 
-model_input = model_input[model_input['target'] != "medium"]
+# model_input = model_input[model_input['target'] != "medium"]
 model_input['target'] = model_input['target'].astype(str).apply(lambda x: 0 if x == "low" else 1)
 model_input['target'].value_counts()
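
A minimal sketch of what the new binning does (not part of the patch; it assumes pandas is imported as pd and uses made-up stressfulness scores): the 1-4 scale is cut into right-closed intervals (0, 1] -> 'low' and (1, 4] -> 'high', then encoded as a binary target.

import pandas as pd

scores = pd.Series([1, 2, 3, 4, 1, 4])  # hypothetical raw stressfulness values (1-4)

# Same right-closed bins as the patch: (0, 1] -> 'low', (1, 4] -> 'high'
labels, edges = pd.cut(scores, bins=[0, 1, 4], labels=['low', 'high'], retbins=True, right=True)
print(labels.value_counts())  # high: 4, low: 2
print(edges)                  # [0 1 4]

# Binary encoding used downstream in the script: 'low' -> 0, everything else -> 1
target = labels.astype(str).apply(lambda x: 0 if x == "low" else 1)
print(target.tolist())        # [0, 1, 1, 1, 0, 1]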