Compare commits

...

9 Commits

Author SHA1 Message Date
junos 6295cc8e91 Add baseline data capabilities. 2022-02-04 18:39:32 +01:00
junos 360ec7de4b Update RAPIDS. 2022-02-04 17:20:11 +01:00
junos e177b15058 Clean features across participants.
Explore the best linear regression feature.
2022-01-19 13:41:09 +01:00
junos 832eb6137e ML with RAPIDS and missing values. 2022-01-19 12:53:03 +01:00
junos 702b091d73 Read RAPIDS features and create columns. 2022-01-07 17:00:12 +01:00
junos 257a044227 Update RAPIDS. 2022-01-07 12:22:50 +01:00
junos ae358f1e24 Various improvements of RAPIDS. 2021-12-15 20:22:22 +01:00
junos 4dee4b6fc1 Add info about updating RAPIDS. 2021-12-15 20:21:59 +01:00
junos ed3483ace4 Update RAPIDS to v1.7.1. 2021-12-15 18:35:26 +01:00
4 changed files with 215 additions and 3 deletions

1
.gitignore vendored
View File

@ -6,3 +6,4 @@ __pycache__/
/config/*.ipynb /config/*.ipynb
/statistical_analysis/*.ipynb /statistical_analysis/*.ipynb
/machine_learning/intermediate_results/ /machine_learning/intermediate_results/
/data/features/

View File

@ -143,3 +143,19 @@ If this still fails, `dos2unix` can be used to change them.
### System has not been booted with systemd as init system (PID 1) ### System has not been booted with systemd as init system (PID 1)
See [the installation issue above](#Timezone-environment-variable-for-tidyverse-(relevant-for-WSL2)). See [the installation issue above](#Timezone-environment-variable-for-tidyverse-(relevant-for-WSL2)).
## Update RAPIDS
To update RAPIDS, first pull and merge [origin]( https://github.com/carissalow/rapids), such as with:
```commandline
git fetch --progress "origin" refs/heads/master
git merge --no-ff origin/master
```
Next, update the conda and R virtual environment.
```bash
R -e 'renv::restore(repos = c(CRAN = "https://packagemanager.rstudio.com/all/__linux__/focal/latest"))'
```

View File

@ -6,7 +6,7 @@
# extension: .py # extension: .py
# format_name: percent # format_name: percent
# format_version: '1.3' # format_version: '1.3'
# jupytext_version: 1.12.0 # jupytext_version: 1.13.0
# kernelspec: # kernelspec:
# display_name: straw2analysis # display_name: straw2analysis
# language: python # language: python
@ -21,11 +21,15 @@ import os
import sys import sys
import numpy as np import numpy as np
import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import yaml import yaml
from pyprojroot import here
from sklearn import linear_model from sklearn import linear_model
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
nb_dir = os.path.split(os.getcwd())[0] nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path: if nb_dir not in sys.path:
@ -257,4 +261,195 @@ model_validation.cross_validate()
# %% # %%
model_validation.groups model_validation.groups
# %% [markdown]
# # Use RAPIDS
# %% # %%
with open(here("rapids/config.yaml"), "r") as file:
rapids_config = yaml.safe_load(file)
# %%
for key in rapids_config.keys():
if isinstance(rapids_config[key], dict): # Remove top-level configs
if ("PROVIDERS" in rapids_config[key]): # Retain features (that have providers)
if rapids_config[key]["PROVIDERS"]: # Remove non-implemented features
for provider in rapids_config[key]["PROVIDERS"]:
if rapids_config[key]["PROVIDERS"][provider]["COMPUTE"]: # Check that the features were actually calculated
if "FEATURES" in rapids_config[key]["PROVIDERS"][provider]:
print(key)
print(provider)
print(rapids_config[key]["PROVIDERS"][provider]["FEATURES"])
# %%
features_rapids = pd.read_csv(here("rapids/data/processed/features/all_participants/all_sensor_features.csv"), parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"])
# %%
features_rapids.columns
# %%
features_rapids = features_rapids.assign(date_lj=lambda x: x.local_segment_start_datetime.dt.date)
# %%
features_rapids["participant_id"] = features_rapids["pid"].str.extract("(\d+)")
features_rapids["participant_id"] = pd.to_numeric(features_rapids["participant_id"])
features_rapids.set_index(["participant_id", "date_lj"], inplace=True)
# %%
with open("../machine_learning/config/minimal_labels.yaml", "r") as file:
labels_params = yaml.safe_load(file)
# %%
labels = machine_learning.labels.Labels(**labels_params)
labels.set_participants_label("all")
# %%
labels.aggregate_labels(cached=True)
labels_read = labels.get_aggregated_labels()
labels_read = labels_read.reset_index()
labels_read["date_lj"] = labels_read["date_lj"].dt.date
labels_read.set_index(["participant_id", "date_lj"], inplace=True)
# date_lj column is parsed as a date and represented as Timestamp, when read from csv.
# When calculated, it is represented as date.
# %%
features_rapids.shape
# %%
labels_read.shape
# %%
features_labels = features_rapids.join(labels_read, how="inner").reset_index()
# %%
features_labels.shape
# %%
features_labels.columns
# %%
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# %%
feature_columns = features_labels.columns[6:-3]
label_column = "NA"
group_column = "pid"
# %%
lin_reg_rapids = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
logo.get_n_splits(
features_labels[feature_columns],
features_labels[label_column],
groups=features_labels[group_column],
)
# %%
cross_val_score(
lin_reg_rapids,
X=imputer.fit_transform(features_labels[feature_columns]),
y=features_labels[label_column],
groups=features_labels[group_column],
cv=logo,
n_jobs=-1,
scoring="r2",
)
# %%
sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(features_labels[feature_columns].isna(), cbar=False)
# %% [markdown] tags=[]
# ```yaml
# ALL_CLEANING_INDIVIDUAL:
# PROVIDERS:
# RAPIDS:
# COMPUTE: True
# IMPUTE_SELECTED_EVENT_FEATURES: # Fill NAs with 0 only for event-based features, see table below
# COMPUTE: True
# MIN_DATA_YIELDED_MINUTES_TO_IMPUTE: 0.33 # Any feature value in a time segment instance with phone data yield > [MIN_DATA_YIELDED_MINUTES_TO_IMPUTE] will be replaced with a zero.
# COLS_NAN_THRESHOLD: 0.3 # Discard columns with missing value ratios higher than [COLS_NAN_THRESHOLD]. Set to 1 to disable
# COLS_VAR_THRESHOLD: True # Set to True to discard columns with zero variance
# ROWS_NAN_THRESHOLD: 1 # Discard rows with missing value ratios higher than [ROWS_NAN_THRESHOLD]. Set to 1 to disable
# DATA_YIELD_FEATURE: RATIO_VALID_YIELDED_HOURS # RATIO_VALID_YIELDED_HOURS or RATIO_VALID_YIELDED_MINUTES
# DATA_YIELD_RATIO_THRESHOLD: 0.3 # Discard rows with ratiovalidyieldedhours or ratiovalidyieldedminutes feature less than [DATA_YIELD_RATIO_THRESHOLD]. The feature name is determined by [DATA_YIELD_FEATURE] parameter. Set to 0 to disable
# DROP_HIGHLY_CORRELATED_FEATURES:
# COMPUTE: False
# MIN_OVERLAP_FOR_CORR_THRESHOLD: 0.5
# CORR_THRESHOLD: 0.95
# SRC_SCRIPT: src/features/all_cleaning_individual/rapids/main.R
# ```
# %%
features_rapids_cleaned = pd.read_csv(here("rapids/data/processed/features/all_participants/all_sensor_features_cleaned_rapids.csv"), parse_dates=["local_segment_start_datetime", "local_segment_end_datetime"])
features_rapids_cleaned = features_rapids_cleaned.assign(date_lj=lambda x: x.local_segment_start_datetime.dt.date)
features_rapids_cleaned["participant_id"] = features_rapids_cleaned["pid"].str.extract("(\d+)")
features_rapids_cleaned["participant_id"] = pd.to_numeric(features_rapids_cleaned["participant_id"])
features_rapids_cleaned.set_index(["participant_id", "date_lj"], inplace=True)
# %%
features_cleaned_labels = features_rapids_cleaned.join(labels_read, how="inner").reset_index()
feature_clean_columns = features_cleaned_labels.columns[6:-3]
# %%
print(feature_columns.shape)
print(feature_clean_columns.shape)
# %%
sns.set(rc={"figure.figsize":(16, 8)})
sns.heatmap(features_cleaned_labels[feature_clean_columns].isna(), cbar=False)
# %%
lin_reg_rapids_clean = linear_model.LinearRegression()
logo = LeaveOneGroupOut()
logo.get_n_splits(
features_cleaned_labels[feature_clean_columns],
features_cleaned_labels[label_column],
groups=features_cleaned_labels[group_column],
)
# %%
features_clean_imputed = imputer.fit_transform(features_cleaned_labels[feature_clean_columns])
# %%
cross_val_score(
lin_reg_rapids_clean,
X=features_clean_imputed,
y=features_cleaned_labels[label_column],
groups=features_cleaned_labels[group_column],
cv=logo,
n_jobs=-1,
scoring="r2",
)
# %%
lin_reg_full = linear_model.LinearRegression()
lin_reg_full.fit(features_clean_imputed,features_cleaned_labels[label_column])
# %%
NA_pred = lin_reg_full.predict(features_clean_imputed)
# %%
# The coefficients
print("Coefficients: \n", lin_reg_full.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(features_cleaned_labels[label_column], NA_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(features_cleaned_labels[label_column], NA_pred))
# %%
feature_clean_columns[np.argmax(lin_reg_full.coef_)]
# %% [markdown]
# Ratio between stationary time and total location sensed time. A lat/long coordinate pair is labeled as stationary if its speed (distance/time) to the next coordinate pair is less than 1km/hr. A higher value represents a more stationary routine.
# %%
plt.scatter(features_clean_imputed[:,np.argmax(lin_reg_full.coef_)], features_cleaned_labels[label_column], color="black")
plt.scatter(features_clean_imputed[:,np.argmax(lin_reg_full.coef_)], NA_pred, color="red", linewidth=3)
plt.xticks()
plt.yticks()
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()

2
rapids

@ -1 +1 @@
Subproject commit e5cc02501f629c96641dfd1bcd1f7fcfd0d55462 Subproject commit bf9c764c97f076f4af288f7afa1a32931996b2db