A few modifications to some imputation values in the cleaning script and feature extraction.
parent
9884b383cf
commit
1ad25bb572
|
@ -75,7 +75,8 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
"firstuseafter" in col or
|
"firstuseafter" in col or
|
||||||
"timefirstmessages" in col or
|
"timefirstmessages" in col or
|
||||||
"timelastmessages" in col]
|
"timelastmessages" in col]
|
||||||
features[impute_w_hn] = impute(features[impute_w_hn], method="high_number")
|
features[impute_w_hn] = features[impute_w_hn].fillna(1500)
|
||||||
|
|
||||||
|
|
||||||
# Impute special case (mostcommonactivity) and (homelabel)
|
# Impute special case (mostcommonactivity) and (homelabel)
|
||||||
impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
|
impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
|
||||||
|
@ -84,6 +85,10 @@ def straw_cleaning(sensor_data_files, provider):
|
||||||
impute_w_sn2 = [col for col in features.columns if "homelabel" in col]
|
impute_w_sn2 = [col for col in features.columns if "homelabel" in col]
|
||||||
features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value
|
features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value
|
||||||
|
|
||||||
|
impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
|
||||||
|
features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000) # Special case of imputation - loglocationvariance
|
||||||
|
|
||||||
|
|
||||||
# Impute selected phone features with 0
|
# Impute selected phone features with 0
|
||||||
impute_zero = [col for col in features if \
|
impute_zero = [col for col in features if \
|
||||||
col.startswith('phone_applications_foreground_rapids_') or
|
col.startswith('phone_applications_foreground_rapids_') or
|
||||||
|
@ -151,7 +156,7 @@ def impute(df, method='zero'):
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'zero': df.fillna(0),
|
'zero': df.fillna(0),
|
||||||
'high_number': df.fillna(1000000),
|
'high_number': df.fillna(1500),
|
||||||
'mean': df.fillna(df.mean()),
|
'mean': df.fillna(df.mean()),
|
||||||
'median': df.fillna(df.median()),
|
'median': df.fillna(df.median()),
|
||||||
'knn': k_nearest(df)
|
'knn': k_nearest(df)
|
||||||
|
|
|
@ -71,9 +71,7 @@ def straw_cleaning(sensor_data_files, provider, target):
|
||||||
"firstuseafter" in col or
|
"firstuseafter" in col or
|
||||||
"timefirstmessages" in col or
|
"timefirstmessages" in col or
|
||||||
"timelastmessages" in col]
|
"timelastmessages" in col]
|
||||||
features[impute_w_hn] = impute(features[impute_w_hn], method="high_number")
|
features[impute_w_hn] = features[impute_w_hn].fillna(1500)
|
||||||
|
|
||||||
graph_bf_af(features, "4high_number_imp")
|
|
||||||
|
|
||||||
# Impute special case (mostcommonactivity) and (homelabel)
|
# Impute special case (mostcommonactivity) and (homelabel)
|
||||||
impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
|
impute_w_sn = [col for col in features.columns if "mostcommonactivity" in col]
|
||||||
|
@ -82,6 +80,9 @@ def straw_cleaning(sensor_data_files, provider, target):
|
||||||
impute_w_sn2 = [col for col in features.columns if "homelabel" in col]
|
impute_w_sn2 = [col for col in features.columns if "homelabel" in col]
|
||||||
features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value
|
features[impute_w_sn2] = features[impute_w_sn2].fillna(1) # Special case of imputation - nominal/ordinal value
|
||||||
|
|
||||||
|
impute_w_sn3 = [col for col in features.columns if "loglocationvariance" in col]
|
||||||
|
features[impute_w_sn3] = features[impute_w_sn3].fillna(-1000000) # Special case of imputation - loglocationvariance
|
||||||
|
|
||||||
# Impute selected phone features with 0
|
# Impute selected phone features with 0
|
||||||
impute_zero = [col for col in features if \
|
impute_zero = [col for col in features if \
|
||||||
col.startswith('phone_applications_foreground_rapids_') or
|
col.startswith('phone_applications_foreground_rapids_') or
|
||||||
|
@ -189,7 +190,7 @@ def impute(df, method='zero'):
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'zero': df.fillna(0),
|
'zero': df.fillna(0),
|
||||||
'high_number': df.fillna(1000000),
|
'high_number': df.fillna(1500),
|
||||||
'mean': df.fillna(df.mean()),
|
'mean': df.fillna(df.mean()),
|
||||||
'median': df.fillna(df.median()),
|
'median': df.fillna(df.median()),
|
||||||
'knn': k_nearest(df)
|
'knn': k_nearest(df)
|
||||||
|
|
|
@ -9,13 +9,13 @@ def compute_features(filtered_data, apps_type, requested_features, apps_features
|
||||||
if "timeoffirstuse" in requested_features:
|
if "timeoffirstuse" in requested_features:
|
||||||
time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
time_first_event = filtered_data.sort_values(by="timestamp", ascending=True).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
||||||
if time_first_event.empty:
|
if time_first_event.empty:
|
||||||
apps_features["timeoffirstuse" + apps_type] = 1000000 # np.nan
|
apps_features["timeoffirstuse" + apps_type] = 1500 # np.nan
|
||||||
else:
|
else:
|
||||||
apps_features["timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
|
apps_features["timeoffirstuse" + apps_type] = time_first_event["local_hour"] * 60 + time_first_event["local_minute"]
|
||||||
if "timeoflastuse" in requested_features:
|
if "timeoflastuse" in requested_features:
|
||||||
time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
time_last_event = filtered_data.sort_values(by="timestamp", ascending=False).drop_duplicates(subset="local_segment", keep="first").set_index("local_segment")
|
||||||
if time_last_event.empty:
|
if time_last_event.empty:
|
||||||
apps_features["timeoflastuse" + apps_type] = 1000000 # np.nan
|
apps_features["timeoflastuse" + apps_type] = 1500 # np.nan
|
||||||
else:
|
else:
|
||||||
apps_features["timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
|
apps_features["timeoflastuse" + apps_type] = time_last_event["local_hour"] * 60 + time_last_event["local_minute"]
|
||||||
if "frequencyentropy" in requested_features:
|
if "frequencyentropy" in requested_features:
|
||||||
|
|
|
@ -94,7 +94,7 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
colnames(.)
|
colnames(.)
|
||||||
|
|
||||||
call_features <- call_features %>%
|
call_features <- call_features %>%
|
||||||
mutate_at(., time_cols, ~replace(., is.na(.), 1000000))
|
mutate_at(., time_cols, ~replace(., is.na(.), 1500))
|
||||||
|
|
||||||
# Fill NA values with 0
|
# Fill NA values with 0
|
||||||
call_features <- call_features %>% mutate_all(~replace(., is.na(.), 0))
|
call_features <- call_features %>% mutate_all(~replace(., is.na(.), 0))
|
||||||
|
|
|
@ -70,7 +70,7 @@ rapids_features <- function(sensor_data_files, time_segment, provider){
|
||||||
colnames(.)
|
colnames(.)
|
||||||
|
|
||||||
messages_features <- messages_features %>%
|
messages_features <- messages_features %>%
|
||||||
mutate_at(., time_cols, ~replace(., is.na(.), 1000000))
|
mutate_at(., time_cols, ~replace(., is.na(.), 1500))
|
||||||
|
|
||||||
# Fill NA values with 0
|
# Fill NA values with 0
|
||||||
messages_features <- messages_features %>% mutate_all(~replace(., is.na(.), 0))
|
messages_features <- messages_features %>% mutate_all(~replace(., is.na(.), 0))
|
||||||
|
|
Loading…
Reference in New Issue