Cluster by demand_control_ratio_quartile.
parent 2bb95657d8
commit 1516d1c000
@@ -12,3 +12,6 @@ __pycache__/
/data/*input*.csv
/data/daily*
/data/intradaily*
/data/30min*
/presentation/*scores.csv
/presentation/Results.ods
@@ -9,6 +9,7 @@ dependencies:
  - flake8
  - jupyterlab
  - jupytext
  - lightgbm
  - mypy
  - nodejs
  - pandas
@@ -31,7 +31,6 @@ from sklearn.impute import SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn import linear_model, svm, naive_bayes, neighbors, tree, ensemble
from lightgbm import LGBMClassifier
import xgboost as xg

from sklearn.cluster import KMeans
@@ -52,12 +51,12 @@ from machine_learning.classification_models import ClassificationModels

# %% [markdown]
# ## Set script's parameters
n_clusters = 5 # Number of clusters (could be regarded as a hyperparameter)
n_clusters = 4 # Number of clusters (could be regarded as a hyperparameter)
cv_method_str = 'logo' # logo, halflogo, 5kfold # Cross-validation method (could be regarded as a hyperparameter)
n_sl = 1 # Number of largest/smallest accuracies (of particular CV) outputs

# %% jupyter={"source_hidden": true}
model_input = pd.read_csv("../data/intradaily_30_min_all_targets/input_JCQ_job_demand_mean.csv")
model_input = pd.read_csv("../data/30min_all_target_inputs/input_JCQ_job_demand_mean.csv")
index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]

clust_col = model_input.set_index(index_columns).var().idxmax() # age is a col with the highest variance
@@ -66,7 +65,7 @@ model_input.columns[list(model_input.columns).index('age'):-1]

lime_cols = [col for col in model_input if col.startswith('limesurvey')]
lime_cols
lime_col = 'limesurvey_demand_control_ratio'
lime_col = 'limesurvey_demand_control_ratio_quartile'
clust_col = lime_col

model_input[clust_col].describe()
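As a rough illustration of what clustering on the new quartile column amounts to, here is a minimal, self-contained sketch: the column name and n_clusters = 4 come from the script above, while the toy participant frame and every value in it are made up for demonstration.

import pandas as pd
from sklearn.cluster import KMeans

# Toy frame: one row per participant with the chosen clustering column (values are illustrative).
uniq = pd.DataFrame({
    "pid": ["p01", "p02", "p03", "p04", "p05", "p06"],
    "limesurvey_demand_control_ratio_quartile": [1, 1, 2, 3, 4, 4],
})

n_clusters = 4  # same value the script sets; with a quartile column this is roughly one cluster per quartile
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
uniq["cluster"] = kmeans.fit_predict(uniq[["limesurvey_demand_control_ratio_quartile"]])
print(uniq.sort_values("cluster"))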
@@ -75,9 +74,11 @@ model_input[clust_col].describe()
# %% jupyter={"source_hidden": true}

# Filter-out outlier rows by clust_col
model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]
#model_input = model_input[(np.abs(stats.zscore(model_input[clust_col])) < 3)]

#print(model_input)
uniq = model_input[[clust_col, 'pid']].drop_duplicates().reset_index(drop=True)
uniq = uniq.dropna()
plt.bar(uniq['pid'], uniq[clust_col])

# %% jupyter={"source_hidden": true}
@@ -182,3 +183,14 @@ for k in range(n_clusters):
# %% jupyter={"source_hidden": true}
# Get overall results
cm.get_total_models_scores(n_clusters=n_clusters)

# %%
final_scores = pd.DataFrame()
for model in cmodels:
    final_scores = pd.concat([final_scores,pd.DataFrame.from_dict(cmodels[model])])

# %%
final_scores

# %%
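For reference, a hedged sketch of the concatenation pattern the new cell relies on: cmodels here is a stand-in dict of per-model score lists, and both the model names and the numbers are placeholders rather than the project's actual results.

import pandas as pd

# Stand-in for cmodels: model name -> per-cluster metric lists (placeholder values).
cmodels = {
    "logistic_reg": {"test_accuracy": [0.61, 0.58, 0.64, 0.60]},
    "lightgbm": {"test_accuracy": [0.66, 0.63, 0.69, 0.65]},
}

final_scores = pd.DataFrame()
for model in cmodels:
    scores = pd.DataFrame.from_dict(cmodels[model])
    scores["model"] = model  # record which model each row of scores belongs to
    final_scores = pd.concat([final_scores, scores])

print(final_scores)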