diff --git a/automl_test.py b/automl_test.py
new file mode 100644
index 00000000..405da670
--- /dev/null
+++ b/automl_test.py
@@ -0,0 +1,55 @@
+from pprint import pprint
+import sklearn.metrics
+import autosklearn.regression
+
+import datetime
+import importlib
+import os
+import sys
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import yaml
+
+from sklearn import linear_model, svm, kernel_ridge, gaussian_process
+from sklearn.model_selection import LeaveOneGroupOut, cross_val_score, train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.impute import SimpleImputer
+
+model_input = pd.read_csv("data/processed/models/population_model/z_input.csv")  # Standardized data
+
+model_input.dropna(axis=1, how="all", inplace=True)
+model_input.dropna(axis=0, how="any", subset=["target"], inplace=True)
+
+categorical_feature_colnames = ["gender", "startlanguage"]
+categorical_features = model_input[categorical_feature_colnames].copy()
+mode_categorical_features = categorical_features.mode().iloc[0]
+categorical_features = categorical_features.fillna(mode_categorical_features)
+categorical_features = categorical_features.apply(lambda col: col.astype("category"))
+if not categorical_features.empty:
+    categorical_features = pd.get_dummies(categorical_features)
+numerical_features = model_input.drop(categorical_feature_colnames, axis=1)
+model_in = pd.concat([numerical_features, categorical_features], axis=1)
+
+index_columns = ["local_segment", "local_segment_label", "local_segment_start_datetime", "local_segment_end_datetime"]
+model_in.set_index(index_columns, inplace=True)
+
+X_train, X_test, y_train, y_test = train_test_split(model_in.drop(["target", "pid"], axis=1), model_in["target"], test_size=0.30)
+
+automl = autosklearn.regression.AutoSklearnRegressor(
+    time_left_for_this_task=14400,
+    per_run_time_limit=120
+)
+automl.fit(X_train, y_train, dataset_name='straw')
+
+print(automl.leaderboard())
+pprint(automl.show_models(), indent=4)
+
+train_predictions = automl.predict(X_train)
+print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
+test_predictions = automl.predict(X_test)
+print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))
+
+sys.exit()
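Note on the split above: rows from the same participant (the dropped "pid" column) can land in both train and test with a plain random split. A group-aware split avoids that leakage; a minimal sketch, assuming model_in still carries "pid", using scikit-learn's GroupShuffleSplit:

from sklearn.model_selection import GroupShuffleSplit

# Hold out whole participants, keyed on "pid", instead of splitting rows at random.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=0)
train_idx, test_idx = next(splitter.split(model_in, groups=model_in["pid"]))
X_train = model_in.iloc[train_idx].drop(["target", "pid"], axis=1)
X_test = model_in.iloc[test_idx].drop(["target", "pid"], axis=1)
y_train = model_in.iloc[train_idx]["target"]
y_test = model_in.iloc[test_idx]["target"]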
diff --git a/environment.yml b/environment.yml
index cba49edc..536149bc 100644
--- a/environment.yml
+++ b/environment.yml
@@ -86,8 +86,6 @@ dependencies:
   - readline=8.0
   - requests=2.25.0
   - retrying=1.3.3
-  - scikit-learn=0.23.2
-  - scipy=1.5.2
   - setuptools=51.0.0
   - six=1.15.0
   - smmap=3.0.4
@@ -107,34 +105,61 @@ dependencies:
   - zlib=1.2.11
   - pip:
     - amply==0.1.4
+    - auto-sklearn==0.14.7
     - bidict==0.22.0
     - biosppy==0.8.0
+    - build==0.8.0
     - cached-property==1.5.2
+    - cloudpickle==2.2.0
     - configargparse==0.15.1
+    - configspace==0.4.21
     - cr-features==0.2.1
     - cycler==0.11.0
+    - cython==0.29.32
+    - dask==2022.2.0
     - decorator==4.4.2
+    - distributed==2022.2.0
+    - distro==1.7.0
+    - emcee==3.1.2
     - fonttools==4.33.2
+    - fsspec==2022.8.2
     - h5py==3.6.0
+    - heapdict==1.0.1
     - hmmlearn==0.2.7
     - ipython-genutils==0.2.0
     - jupyter-core==4.6.3
     - kiwisolver==1.4.2
+    - liac-arff==2.5.0
+    - locket==1.0.0
     - matplotlib==3.5.1
+    - msgpack==1.0.4
     - nbformat==5.0.7
     - opencv-python==4.5.5.64
     - packaging==21.3
+    - partd==1.3.0
     - peakutils==1.3.3
+    - pep517==0.13.0
     - pillow==9.1.0
     - pulp==2.4
+    - pynisher==0.6.4
     - pyparsing==2.4.7
+    - pyrfr==0.8.3
     - pyrsistent==0.15.5
     - pywavelets==1.3.0
     - ratelimiter==1.2.0.post0
+    - scikit-learn==0.24.2
+    - scipy==1.7.3
     - seaborn==0.11.2
     - shortuuid==1.0.8
+    - smac==1.2
     - snakemake==5.30.2
+    - sortedcontainers==2.4.0
+    - tblib==1.7.0
+    - tomli==2.0.1
+    - toolz==0.12.0
     - toposort==1.5
+    - tornado==6.2
     - traitlets==4.3.3
     - typing-extensions==4.2.0
+    - zict==2.2.0
 prefix: /opt/conda/envs/rapids
diff --git a/features_nans.png b/features_nans.png
deleted file mode 100644
index eef88765..00000000
Binary files a/features_nans.png and /dev/null differ
diff --git a/src/features/phone_bluetooth/doryab/main.py b/src/features/phone_bluetooth/doryab/main.py
index 6efec19a..cb272ebc 100644
--- a/src/features/phone_bluetooth/doryab/main.py
+++ b/src/features/phone_bluetooth/doryab/main.py
@@ -15,6 +15,7 @@ def deviceFeatures(devices, ownership, common_devices, features_to_compute, feat
     if "meanscans" in features_to_compute:
         features = features.join(device_value_counts.groupby("local_segment")["scans"].mean().to_frame("meanscans" + ownership), how="outer")
     if "stdscans" in features_to_compute:
+        # TODO: std scans
         features = features.join(device_value_counts.groupby("local_segment")["scans"].std().to_frame("stdscans" + ownership), how="outer")
     # Most frequent device within segments, across segments, and across dataset
     if "countscansmostfrequentdevicewithinsegments" in features_to_compute:
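A note for the "TODO: std scans" line above: pandas computes the sample standard deviation (ddof=1), which is undefined for a single-row group, so stdscans comes out NaN whenever a segment has only one scan count per device. A minimal illustration on hypothetical data:

import pandas as pd

# Segment "a" has two scan counts, segment "b" only one.
df = pd.DataFrame({"local_segment": ["a", "a", "b"], "scans": [2, 4, 7]})
print(df.groupby("local_segment")["scans"].std())
# a    1.414214
# b         NaN   <- single-row groups have no sample std with the default ddof=1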
location_features[["totaldistance", "avgspeed", "varspeed"]] = location_features[["totaldistance", "avgspeed", "varspeed"]].fillna(0) # stationary features diff --git a/tests/scripts/doryab_values.py b/tests/scripts/doryab_values.py new file mode 100644 index 00000000..2f171999 --- /dev/null +++ b/tests/scripts/doryab_values.py @@ -0,0 +1,28 @@ +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + + +path = "/rapids/data/processed/features/all_participants/all_sensor_features.csv" +df = pd.read_csv(path) + +# Bluetooth +doryab_cols_bt = [col for col in df.columns if "bluetooth_doryab" in col] +df_bt = df[doryab_cols_bt] + +print(len(doryab_cols_bt)) +print(df_bt) + +sns.heatmap(df_bt, xticklabels=1) +plt.savefig(f'bluetooth_doryab_values', bbox_inches='tight') + +# Location +doryab_cols_loc = [col for col in df.columns if "locations_doryab" in col] +df_loc = df[doryab_cols_loc] + +print(len(doryab_cols_loc)) +print(df_loc) + +sns.heatmap(df_loc, xticklabels=1) +plt.savefig(f'locations_doryab_values', bbox_inches='tight') + diff --git a/tests/scripts/standardization_methods_test.py b/tests/scripts/standardization_methods_test.py new file mode 100644 index 00000000..0747339d --- /dev/null +++ b/tests/scripts/standardization_methods_test.py @@ -0,0 +1,70 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from sklearn.preprocessing import StandardScaler +import sys + +sys.path.append('/rapids/') +from src.features import cr_features_helper_methods as crhm + +pd.set_option("display.max_columns", None) +features_win = pd.read_csv("data/interim/p031/empatica_temperature_features/empatica_temperature_python_cr_windows.csv", usecols=[0, 1, 2, 3, 4, 5]) + +# First standardization method +excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime', "empatica_temperature_cr_level_1"] +z1_windows = features_win.copy() +z1_windows.loc[:, ~z1_windows.columns.isin(excluded_columns)] = StandardScaler().fit_transform(z1_windows.loc[:, ~z1_windows.columns.isin(excluded_columns)]) +z1 = crhm.extract_second_order_features(z1_windows, ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'], prefix="empatica_temperature_cr_") +z1 = z1.iloc[:,4:] +# print(z1) + +# Second standardization method +so_features_reg = crhm.extract_second_order_features(features_win, ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'], prefix="empatica_temperature_cr_") +so_features_reg = so_features_reg.iloc[:,4:] +z2 = pd.DataFrame(StandardScaler().fit_transform(so_features_reg), columns=so_features_reg.columns) +# print(z2) + +# Standardization of the first standardization method values +z1_z = pd.DataFrame(StandardScaler().fit_transform(z1), columns=z1.columns) +# print(z1_z) + +# For SD +fig, axs = plt.subplots(3, figsize=(8, 10)) +axs[0].plot(z1['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']) +axs[0].set_title("Z1 - standardizirana okna, nato ekstrahiranje značilk SO") + +axs[1].plot(z2['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']) +axs[1].set_title("Z2 - ekstrahirane značilke SO 'normalnih' vrednosti, nato standardizacija") + +axs[2].plot(z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']) +axs[2].set_title("Standardiziran Z1") + +fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_sd') +plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_sd', bbox_inches='tight') + +showcase = pd.DataFrame() +showcase['Z1__SD'] = 
diff --git a/tests/scripts/standardization_methods_test.py b/tests/scripts/standardization_methods_test.py
new file mode 100644
index 00000000..0747339d
--- /dev/null
+++ b/tests/scripts/standardization_methods_test.py
@@ -0,0 +1,70 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import StandardScaler
+import sys
+
+sys.path.append('/rapids/')
+from src.features import cr_features_helper_methods as crhm
+
+pd.set_option("display.max_columns", None)
+features_win = pd.read_csv("data/interim/p031/empatica_temperature_features/empatica_temperature_python_cr_windows.csv", usecols=[0, 1, 2, 3, 4, 5])
+
+# First standardization method: standardize the windows, then extract second-order (SO) features
+excluded_columns = ['local_segment', 'local_segment_label', 'local_segment_start_datetime', 'local_segment_end_datetime', "empatica_temperature_cr_level_1"]
+z1_windows = features_win.copy()
+z1_windows.loc[:, ~z1_windows.columns.isin(excluded_columns)] = StandardScaler().fit_transform(z1_windows.loc[:, ~z1_windows.columns.isin(excluded_columns)])
+z1 = crhm.extract_second_order_features(z1_windows, ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'], prefix="empatica_temperature_cr_")
+z1 = z1.iloc[:, 4:]
+# print(z1)
+
+# Second standardization method: extract SO features from the raw windows, then standardize
+so_features_reg = crhm.extract_second_order_features(features_win, ['mean', 'median', 'sd', 'nlargest', 'nsmallest', 'count_windows'], prefix="empatica_temperature_cr_")
+so_features_reg = so_features_reg.iloc[:, 4:]
+z2 = pd.DataFrame(StandardScaler().fit_transform(so_features_reg), columns=so_features_reg.columns)
+# print(z2)
+
+# Re-standardization of the first method's values
+z1_z = pd.DataFrame(StandardScaler().fit_transform(z1), columns=z1.columns)
+# print(z1_z)
+
+# For SD
+fig, axs = plt.subplots(3, figsize=(8, 10))
+axs[0].plot(z1['empatica_temperature_cr_squareSumOfComponent_X_SO_sd'])
+axs[0].set_title("Z1 - standardized windows, then SO feature extraction")
+
+axs[1].plot(z2['empatica_temperature_cr_squareSumOfComponent_X_SO_sd'])
+axs[1].set_title("Z2 - SO features extracted from the 'raw' values, then standardization")
+
+axs[2].plot(z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_sd'])
+axs[2].set_title("Standardized Z1")
+
+fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_sd')
+plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_sd', bbox_inches='tight')
+
+showcase = pd.DataFrame()
+showcase['Z1__SD'] = z1['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']
+showcase['Z2__SD'] = z2['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']
+showcase['Z1__SD_STANDARDIZED'] = z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_sd']
+print(showcase)
+
+# For nlargest
+fig, axs = plt.subplots(3, figsize=(8, 10))
+axs[0].plot(z1['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'])
+axs[0].set_title("Z1 - standardized windows, then SO feature extraction")
+
+axs[1].plot(z2['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'])
+axs[1].set_title("Z2 - SO features extracted from the 'raw' values, then standardization")
+
+axs[2].plot(z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest'])
+axs[2].set_title("Standardized Z1")
+
+fig.suptitle('Z-Score methods for temperature_squareSumOfComponent_SO_nlargest')
+plt.savefig('z_score_comparison_temperature_squareSumOfComponent_X_SO_nlargest', bbox_inches='tight')
+
+showcase2 = pd.DataFrame()
+showcase2['Z1__nlargest'] = z1['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest']
+showcase2['Z2__nlargest'] = z2['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest']
+showcase2['Z1__nlargest_STANDARDIZED'] = z1_z['empatica_temperature_cr_squareSumOfComponent_X_SO_nlargest']
+print(showcase2)
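A closing note on what tests/scripts/standardization_methods_test.py probes: StandardScaler applies a per-column affine map, and second-order statistics such as mean, median, and (up to scale) sd are affine-equivariant, so Z1 and Z2 should be perfectly correlated for those features and differ only in offset and scale. A self-contained numeric check of that reasoning on synthetic data (not project data):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(10.0, 3.0, size=100)       # synthetic "window" values
segments = np.repeat(np.arange(10), 10)   # 10 segments of 10 windows each

z = (x - x.mean()) / x.std()              # standardize first (Z1 order)
so_raw = np.array([x[segments == s].mean() for s in range(10)])  # SO mean of raw windows (Z2 order, pre-scaling)
so_z1 = np.array([z[segments == s].mean() for s in range(10)])   # SO mean of standardized windows

# so_z1 is an affine transform of so_raw, so the correlation is 1 (up to float error).
print(np.corrcoef(so_raw, so_z1)[0, 1])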