diff --git a/.gitignore b/.gitignore index d5b862d..31e264b 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ __pycache__/ /data/*input*.csv /data/daily* /data/intradaily* +/data/30min* +/presentation/*scores.csv +/presentation/Results.ods diff --git a/config/environment.yml b/config/environment.yml index 62cb210..192bc90 100644 --- a/config/environment.yml +++ b/config/environment.yml @@ -9,6 +9,7 @@ dependencies: - flake8 - jupyterlab - jupytext + - lightgbm - mypy - nodejs - pandas diff --git a/exploration/ml_pipeline_classification_with_clustering.py b/exploration/ml_pipeline_classification_with_clustering.py index 0bf4417..0f730db 100644 --- a/exploration/ml_pipeline_classification_with_clustering.py +++ b/exploration/ml_pipeline_classification_with_clustering.py @@ -31,7 +31,6 @@ from sklearn.impute import SimpleImputer from sklearn.dummy import DummyClassifier from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble -from lightgbm import LGBMClassifier import xgboost as xg from sklearn.cluster import KMeans @@ -52,12 +51,12 @@ from machine_learning.classification_models import ClassificationModels # %% [markdown] # ## Set script's parameters -n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter) +n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter) cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter) n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs # %% jupyter={"source_hidden": true} -model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv") +model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv") index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"] clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance @@ -66,7 +65,7 @@ model_input.columns[list(model_input.columns).index('age'):-1] lime_cols = [col for col in model_input if col.startswith('limesurvey')] lime_cols -lime_col = 'limesurvey_demand_control_ratio' +lime_col = 'limesurvey_demand_control_ratio_quartile' clust_col = lime_col model_input[clust_col].describe() @@ -75,9 +74,11 @@ model_input[clust_col].describe() # %% jupyter={"source_hidden": true} # Filter-out outlier rows by clust_col -model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)] +#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)] +#print(model_input) uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True) +uniq = uniq.dropna() plt.bar(uniq['pid'], uniq[clust_col]) # %% jupyter={"source_hidden": true} @@ -182,3 +183,14 @@ for k in range(n_clusters): # %% jupyter={"source_hidden": true} # Get overall results cm.get_total_models_scores(n_clusters=n_clusters) + +# %% +final_scores = pd.DataFrame() +for model in cmodels: + final_scores = pd.concat([final_scores,pd.DataFrame.from_dict(cmodels[model])]) + +# %% +final_scores + +# %% +