2021-11-19 16:34:36 +01:00
source ( " renv/activate.R" )
library ( tidyr )
library ( " dplyr" , warn.conflicts = F )
library ( tidyverse )
library ( caret )
library ( corrr )
rapids_cleaning <- function ( sensor_data_files , provider ) {
clean_features <- read.csv ( sensor_data_files [ [ " sensor_data" ] ] , stringsAsFactors = FALSE )
impute_selected_event_features <- provider [ [ " IMPUTE_SELECTED_EVENT_FEATURES" ] ]
cols_nan_threshold <- as.numeric ( provider [ [ " COLS_NAN_THRESHOLD" ] ] )
drop_zero_variance_columns <- as.logical ( provider [ [ " COLS_VAR_THRESHOLD" ] ] )
rows_nan_threshold <- as.numeric ( provider [ [ " ROWS_NAN_THRESHOLD" ] ] )
data_yield_unit <- tolower ( str_split_fixed ( provider [ [ " DATA_YIELD_FEATURE" ] ] , " _" , 4 ) [ [4 ] ] )
data_yield_column <- paste0 ( " phone_data_yield_rapids_ratiovalidyielded" , data_yield_unit )
data_yield_ratio_threshold <- as.numeric ( provider [ [ " DATA_YIELD_RATIO_THRESHOLD" ] ] )
drop_highly_correlated_features <- provider [ [ " DROP_HIGHLY_CORRELATED_FEATURES" ] ]
# Impute selected event features
if ( as.logical ( impute_selected_event_features $ COMPUTE ) ) {
if ( ! " phone_data_yield_rapids_ratiovalidyieldedminutes" %in% colnames ( clean_features ) ) {
stop ( " Error: RAPIDS provider needs to impute the selected event features based on phone_data_yield_rapids_ratiovalidyieldedminutes column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyieldedminutes' in [FEATURES]." )
}
column_names <- colnames ( clean_features )
selected_apps_features <- column_names [grepl ( " ^phone_applications_foreground_rapids_(countevent|countepisode|minduration|maxduration|meanduration|sumduration)" , column_names ) ]
selected_battery_features <- column_names [grepl ( " ^phone_battery_rapids_" , column_names ) ]
selected_calls_features <- column_names [grepl ( " ^phone_calls_rapids_.*_(count|distinctcontacts|sumduration|minduration|maxduration|meanduration|modeduration)" , column_names ) ]
selected_keyboard_features <- column_names [grepl ( " ^phone_keyboard_rapids_(sessioncount|averagesessionlength|changeintextlengthlessthanminusone|changeintextlengthequaltominusone|changeintextlengthequaltoone|changeintextlengthmorethanone|maxtextlength|totalkeyboardtouches)" , column_names ) ]
selected_messages_features <- column_names [grepl ( " ^phone_messages_rapids_.*_(count|distinctcontacts)" , column_names ) ]
selected_screen_features <- column_names [grepl ( " ^phone_screen_rapids_(sumduration|maxduration|minduration|avgduration|countepisode)" , column_names ) ]
selected_wifi_features <- column_names [grepl ( " ^phone_wifi_(connected|visible)_rapids_" , column_names ) ]
selected_columns <- c ( selected_apps_features , selected_battery_features , selected_calls_features , selected_keyboard_features , selected_messages_features , selected_screen_features , selected_wifi_features )
clean_features [selected_columns ] [is.na ( clean_features [selected_columns ] ) & ( clean_features $ phone_data_yield_rapids_ratiovalidyieldedminutes > impute_selected_event_features $ MIN_DATA_YIELDED_MINUTES_TO_IMPUTE ) ] <- 0
}
# Drop rows with the value of data_yield_column less than data_yield_ratio_threshold
if ( ! data_yield_column %in% colnames ( clean_features ) ) {
stop ( paste0 ( " Error: RAPIDS provider needs to clean data based on " , data_yield_column , " column, please set config[PHONE_DATA_YIELD][PROVIDERS][RAPIDS][COMPUTE] to True and include 'ratiovalidyielded" , data_yield_unit , " ' in [FEATURES]." ) )
}
clean_features <- clean_features %>%
filter ( .[ [data_yield_column ] ] >= data_yield_ratio_threshold )
# Drop columns with a percentage of NA values above cols_nan_threshold
if ( nrow ( clean_features ) )
2022-04-13 16:01:31 +02:00
clean_features <- clean_features %>% select ( where ( ~ sum ( is.na ( .) ) / length ( .) <= cols_nan_threshold ) , starts_with ( " phone_esm" ) )
2021-11-19 16:34:36 +01:00
# Drop columns with zero variance
if ( drop_zero_variance_columns )
2022-04-13 16:01:31 +02:00
clean_features <- clean_features %>% select_if ( grepl ( " pid|local_segment|local_segment_label|local_segment_start_datetime|local_segment_end_datetime|phone_esm" , names ( .) ) | sapply ( ., n_distinct , na.rm = T ) > 1 )
2021-11-19 16:34:36 +01:00
# Drop highly correlated features
if ( as.logical ( drop_highly_correlated_features $ COMPUTE ) ) {
min_overlap_for_corr_threshold <- as.numeric ( drop_highly_correlated_features $ MIN_OVERLAP_FOR_CORR_THRESHOLD )
corr_threshold <- as.numeric ( drop_highly_correlated_features $ CORR_THRESHOLD )
features_for_corr <- clean_features %>%
select_if ( is.numeric ) %>%
select_if ( sapply ( ., n_distinct , na.rm = T ) > 1 )
valid_pairs <- crossprod ( ! is.na ( features_for_corr ) ) >= min_overlap_for_corr_threshold * nrow ( features_for_corr )
if ( ( nrow ( features_for_corr ) != 0 ) & ( ncol ( features_for_corr ) != 0 ) ) {
highly_correlated_features <- features_for_corr %>%
correlate ( use = " pairwise.complete.obs" , method = " spearman" ) %>%
column_to_rownames ( ., var = " term" ) %>%
as.matrix ( ) %>%
replace ( ! valid_pairs | is.na ( .) , 0 ) %>%
findCorrelation ( ., cutoff = corr_threshold , verbose = F , names = T )
clean_features <- clean_features [ , ! names ( clean_features ) %in% highly_correlated_features ]
}
}
# Drop rows with a percentage of NA values above rows_nan_threshold
clean_features <- clean_features %>%
mutate ( percentage_na = rowSums ( is.na ( .) ) / ncol ( .) ) %>%
filter ( percentage_na <= rows_nan_threshold ) %>%
select ( - percentage_na )
return ( clean_features )
}