Finalise new day segment structure with calls as example
parent 4b6ba12e62
commit 31ec5b0da4
@@ -41,7 +41,7 @@ if config["CALLS"]["COMPUTE"]:
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_raw.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
     files_to_compute.extend(expand("data/raw/{pid}/{sensor}_with_datetime_unified.csv", pid=config["PIDS"], sensor=config["CALLS"]["DB_TABLE"]))
-    files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}_{day_segment}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"], day_segment = config["CALLS"]["DAY_SEGMENTS"]))
+    files_to_compute.extend(expand("data/processed/{pid}/calls_{call_type}.csv", pid=config["PIDS"], call_type=config["CALLS"]["TYPES"], day_segment = config["CALLS"]["DAY_SEGMENTS"]))
 
 if config["BARNETT_LOCATION"]["COMPUTE"]:
     if config["BARNETT_LOCATION"]["LOCATIONS_TO_USE"] == "RESAMPLE_FUSED":

@@ -4,7 +4,8 @@ PIDS: [t01]
 
 # Global var with common day segments
 DAY_SEGMENTS: &day_segments
-    "data/external/daysegments_default.csv"
+    TYPE: INTERVAL_EVERY_DAY # FREQUENCY_EVERY_DAY, INTERVAL_EVERY_DAY, INTERVAL_FLEXIBLE_DAY
+    FILE: "data/external/daysegments_interval.csv"
 
 # Global timezone
 # Use codes from https://en.wikipedia.org/wiki/List_of_tz_database_time_zones

@@ -1,4 +0,0 @@
-label,start,end
-daily,00:00, 23:59
-morning,06:00, 11:59
-eveningblue,18:00, 21:59

@@ -1,4 +1,2 @@
-label,start,end
-daily,00:00, 23:59
-morning,06:00, 11:59
-evening,18:00, 23:59
+label,start_time,length
+daily,00:00:00,"23H 59M 59S"

@@ -0,0 +1,8 @@
+label,start_date_time,length,shift,shift_direction
+stress,2020-05-04 11:30:00,1hours,30minutes,-1
+stress,2020-05-04 13:30:00,1hours,30minutes,-1
+stress1,2020-05-04 11:30:00,1hours,30minutes,-1
+stress2,2020-05-04 13:30:00,1hours,30minutes,-1
+weekly,2020-04-21 00:00:00,7days,0,0
+weekly,2020-04-28 00:00:00,7days,0,0
+weekly,2020-05-05 00:00:00,7days,0,0

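The file above is the new INTERVAL_FLEXIBLE_DAY (event) format: each row anchors a segment to a concrete timestamp, and shift/shift_direction move that anchor before (-1) or after (1) the event. A minimal Python sketch of the intended arithmetic, assuming lubridate-style period strings such as "1hours" and "30minutes":

    import pandas as pd

    # one row of the sample file above
    start = pd.Timestamp("2020-05-04 11:30:00")
    shift = pd.Timedelta(minutes=30)           # "30minutes"
    shift_direction = -1                       # shift the start backwards
    length = pd.Timedelta(hours=1)             # "1hours"

    segment_start = start + shift * (1 if shift_direction >= 0 else -1)
    segment_end = segment_start + length
    print(segment_start, "->", segment_end)    # 2020-05-04 11:00:00 -> 2020-05-04 12:00:00
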
@@ -0,0 +1,2 @@
+label,length
+tenminutes,10

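This is the FREQUENCY_EVERY_DAY format: a single row whose length (in minutes) slices every day into equal slots. A rough sketch of what parse_frequency_segments produces for this file, under the new '{}{:04}' labeling scheme (tenminutes0000 through tenminutes0143):

    import pandas as pd

    length = 10  # minutes, from the tenminutes row above
    slots = pd.date_range("2020-01-01", periods=(24 * 60) // length, freq="10min")
    table = pd.DataFrame({"start_time": slots.strftime("%H:%M"), "length": length})
    table["label"] = ["tenminutes{:04}".format(i) for i in range(len(table))]
    print(table.head(3))  # starts 00:00, 00:10, 00:20 ...
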
@@ -0,0 +1,6 @@
+label,start_time,length
+daily,00:00:00,23H 59M 59S
+morning,06:00:00,5H 59M 59S
+afternoon,12:00:00,5H 59M 59S
+evening,18:00:00,5H 59M 59S
+night,00:00:00,5H 59M 59S

@@ -26,6 +26,22 @@ def optional_phone_sensed_bins_input(wildcards):
     return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
 
+def find_day_segments_input_file(wildcards):
+    for key, values in config.items():
+        if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor:
+            if "DAY_SEGMENTS" in config[key]:
+                return config[key]["DAY_SEGMENTS"]["FILE"]
+            else:
+                raise ValueError("{} should have a [DAY_SEGMENTS][FILE] parameter containing the path to its day segments file".format(wildcards.sensor))
+
+def find_day_segments_input_type(wildcards):
+    for key, values in config.items():
+        if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor:
+            if "DAY_SEGMENTS" in config[key]:
+                return config[key]["DAY_SEGMENTS"]["TYPE"]
+            else:
+                raise ValueError("{} should have a [DAY_SEGMENTS][TYPE] parameter containing INTERVAL, FREQUENCY, or EVENT".format(wildcards.sensor))
+
 # Features.smk #########################################################################################################
 
 def optional_ar_input(wildcards):

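For context, a rough standalone illustration of how these helpers resolve a sensor wildcard against the config; the dict and the Wildcards class are made-up stand-ins for Snakemake's own objects:

    config = {"CALLS": {"DB_TABLE": "calls",
                        "DAY_SEGMENTS": {"TYPE": "INTERVAL_EVERY_DAY",
                                         "FILE": "data/external/daysegments_interval.csv"}}}

    class Wildcards:  # stand-in for Snakemake's wildcards object
        sensor = "calls"

    # same scan as find_day_segments_input_file
    for key, values in config.items():
        if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == Wildcards.sensor:
            print(config[key]["DAY_SEGMENTS"]["FILE"])  # data/external/daysegments_interval.csv
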
@@ -111,11 +127,3 @@ def optional_heatmap_days_by_sensors_input(wildcards):
     tables_platform = [table for table in config["HEATMAP_DAYS_BY_SENSORS"]["DB_TABLES"] if table not in [config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"]]] # for ios, discard any android tables that may exist
 
     return expand("data/raw/{{pid}}/{table}_with_datetime.csv", table = tables_platform)
-
-def find_day_segments_input_file(wildcards):
-    for key, values in config.items():
-        if "DB_TABLE" in config[key] and config[key]["DB_TABLE"] == wildcards.sensor:
-            if "DAY_SEGMENTS" in config[key]:
-                return config[key]["DAY_SEGMENTS"]
-            else:
-                raise ValueError("{} should have a DAY_SEGMENTS parameter containing the path to its day segments file".format(wildcards.sensor))

@@ -12,13 +12,13 @@ rule messages_features:
 
 rule call_features:
     input:
-        expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"])
+        expand("data/raw/{{pid}}/{sensor}_with_datetime_unified.csv", sensor=config["CALLS"]["DB_TABLE"]),
+        day_segments_labels = expand("data/interim/{sensor}_day_segments_labels.csv", sensor=config["CALLS"]["DB_TABLE"])
     params:
         call_type = "{call_type}",
-        day_segment = "{day_segment}",
         features = lambda wildcards: config["CALLS"]["FEATURES"][wildcards.call_type]
     output:
-        "data/processed/{pid}/calls_{call_type}_{day_segment}.csv"
+        "data/processed/{pid}/calls_{call_type}.csv"
     script:
         "../src/features/call_features.R"

@@ -41,8 +41,11 @@ rule download_dataset:
 rule compute_day_segments:
     input:
         find_day_segments_input_file
+    params:
+        day_segments_type = find_day_segments_input_type
     output:
         segments_file = "data/interim/{sensor}_day_segments.csv",
+        segments_labels_file = "data/interim/{sensor}_day_segments_labels.csv",
     script:
         "../src/data/compute_day_segments.py"

@@ -62,7 +65,8 @@ rule readable_datetime:
         day_segments = "data/interim/{sensor}_day_segments.csv"
     params:
         timezones = None,
-        fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"]
+        fixed_timezone = config["READABLE_DATETIME"]["FIXED_TIMEZONE"],
+        day_segments_type = find_day_segments_input_type
     wildcard_constraints:
         sensor = '.*(' + '|'.join([re.escape(x) for x in PHONE_SENSORS]) + ').*' # only process smartphone sensors, not fitbit
     output:

@@ -1,54 +1,89 @@
 import pandas as pd
 
-def is_valid_frequency_segments(day_segments):
+def is_valid_frequency_segments(day_segments, day_segments_file):
     """
     returns true if day_segment has the expected structure for generating frequency segments;
     raises ValueError exception otherwise.
     """
-    if day_segments is None:
-        message = 'Table of frequency segmentation info is None. ' \
-                  'Check the file under DAY_SEGMENTS in config.yaml'
-        raise ValueError(message)
-
+    valid_columns = ["label", "length"]
+    if len(list(set(day_segments.columns) - set(valid_columns))) > 0:
+        error_message = 'The FREQUENCY_EVERY_DAY day segments file in [DAY_SEGMENTS][FILE] must have two columns: label, and length ' \
+                        'but instead we found {}. Modify {}'.format(list(day_segments.columns), day_segments_file)
+        raise ValueError(error_message)
 
-    if day_segments.shape[0] == 0:
-        message = 'Table of frequency segmentation info is empty. ' \
-                  'Check the file under DAY_SEGMENTS in config.yaml'
-        raise ValueError(message)
     if day_segments.shape[0] > 1:
-        message = 'Table of frequency segmentation info provides multiple specification but only one is allowed. ' \
-                  'Check the file under DAY_SEGMENTS in config.yaml'
-        raise ValueError(message)
-
-    if 'length' not in day_segments.columns:
-        message = 'Table of frequency segmentation info must provide segment length. ' \
-                  'Check the file under DAY_SEGMENTS in config.yaml'
-        raise ValueError(message)
-    if 'label' not in day_segments.columns:
-        message = 'Table of frequency segmentation info must provide segment label. ' \
-                  'Check the file under DAY_SEGMENTS in config.yaml'
+        message = 'The FREQUENCY_EVERY_DAY day segments file in [DAY_SEGMENTS][FILE] can only have 1 row.' \
+                  'Modify {}'.format(day_segments_file)
         raise ValueError(message)
 
     if not pd.api.types.is_integer_dtype(day_segments.dtypes['length']):
-        message = 'Only integer segment length is allowed in the table of frequency segmentation; ' \
-                  'found {}. Check the file under DAY_SEGMENTS in config.yaml'.format(day_segments.dtypes['length'])
+        message = 'The column length in the FREQUENCY_EVERY_DAY day segments file in [DAY_SEGMENTS][FILE] must be integer but instead is ' \
+                  '{}. Modify {}'.format(day_segments.dtypes['length'], day_segments_file)
         raise ValueError(message)
 
     if day_segments.iloc[0].loc['length'] < 0:
-        message = 'Only positive integer segment length is allowed in the table of frequency segmentation; ' \
-                  'found {}. Check the file under DAY_SEGMENTS in config.yaml'.format(day_segments.iloc[0].loc['length'])
+        message = 'The value in column length in the FREQUENCY_EVERY_DAY day segments file in [DAY_SEGMENTS][FILE] must be positive but instead is ' \
+                  '{}. Modify {}'.format(day_segments.iloc[0].loc['length'], day_segments_file)
         raise ValueError(message)
     if day_segments.iloc[0].loc['length'] >= 1440:
-        message = 'Segment length in the table of frequency segmentation should be shorter than a day (in minutes); ' \
-                  'found {}. Check the file under DAY_SEGMENTS in config.yaml'.format(day_segments.iloc[0].loc['length'])
+        message = 'The column length in the FREQUENCY_EVERY_DAY day segments file in [DAY_SEGMENTS][FILE] must be shorter than a day in minutes (1440) but instead is ' \
+                  '{}. Modify {}'.format(day_segments.iloc[0].loc['length'], day_segments_file)
         raise ValueError(message)
 
     return True
 
-def is_valid_interval_segments(day_segments):
+def is_valid_interval_segments(day_segments, day_segments_file):
+    day_segments = day_segments.copy(deep=True)
+
+    valid_columns = ["label", "start_time", "length"]
+    if len(list(set(day_segments.columns) - set(valid_columns))) > 0:
+        error_message = 'The INTERVAL_EVERY_DAY day segments file in [DAY_SEGMENTS][FILE] must have three columns: label, start_time and length ' \
+                        'but instead we found {}. Modify {}'.format(list(day_segments.columns), day_segments_file)
+        raise ValueError(error_message)
+
+    try:
+        day_segments["start_time"] = pd.to_datetime(day_segments["start_time"])
+    except ValueError as err:
+        raise ValueError("At least one start_time in the INTERVAL_EVERY_DAY day segments file in [DAY_SEGMENTS][FILE] has an invalid format, it should be HH:MM in 24hr clock({}). Modify {}".format(err, day_segments_file))
+
+    if(day_segments.shape[0] != day_segments.drop_duplicates().shape[0]):
+        error_message = 'The INTERVAL_EVERY_DAY day segments file in [DAY_SEGMENTS][FILE] has two or more rows that are identical. ' \
+                        'Modify {}'.format(day_segments_file)
+        raise ValueError(error_message)
+
+    # TODO Validate string format for lubridate
 
     return True
 
-def is_valid_event_segments(day_segments):
-    return False
+def is_valid_event_segments(day_segments, day_segments_file):
+    day_segments = day_segments.copy(deep=True)
+
+    valid_columns = ["label", "start_date_time", "length", "shift", "shift_direction"]
+    if len(list(set(day_segments.columns) - set(valid_columns))) > 0:
+        error_message = 'The INTERVAL_FLEXIBLE_DAY day segments file in [DAY_SEGMENTS][FILE] must have five columns: label, start_date_time, length, shift and shift_direction ' \
+                        'but instead we found {}. Modify {}'.format(list(day_segments.columns), day_segments_file)
+        raise ValueError(error_message)
+
+    try:
+        day_segments["start_date_time"] = pd.to_datetime(day_segments["start_date_time"], format='%Y-%m-%d %H:%M:%S', errors='raise')
+    except ValueError as err:
+        raise ValueError("At least one start_date_time has an invalid format, it should be YYYY-MM-DD HH:MM:SS in 24hr clock({}). Modify {}".format(err, day_segments_file))
+
+    valid_shift_direction_values = [1, -1, 0]
+    provided_values = day_segments["shift_direction"].unique()
+    if len(list(set(provided_values) - set(valid_shift_direction_values))) > 0:
+        error_message = 'The values of shift_direction column in the INTERVAL_FLEXIBLE_DAY day segments file in [DAY_SEGMENTS][FILE] can only be 1, -1 or 0 ' \
+                        'but instead we found {}. Modify {}'.format(provided_values, day_segments_file)
+        raise ValueError(error_message)
+
+    if(day_segments.shape[0] != day_segments.drop_duplicates().shape[0]):
+        error_message = 'The INTERVAL_FLEXIBLE_DAY day segments file in [DAY_SEGMENTS][FILE] has two or more rows that are identical. ' \
+                        'Modify {}'.format(day_segments_file)
+        raise ValueError(error_message)
+
+    # TODO Validate string format for lubridate of length and shift
+    return True
 
 
 def parse_frequency_segments(day_segments: pd.DataFrame) -> pd.DataFrame:

@@ -70,26 +105,22 @@ def parse_frequency_segments(day_segments: pd.DataFrame) -> pd.DataFrame:
     slots = ['{:02d}:{:02d}'.format(x.hour, x.minute) for x in slots]
 
     table = pd.DataFrame(slots, columns=['start_time'])
     table['end_time'] = table['start_time'].shift(-1)
+    table['length'] = day_segments.iloc[0].loc['length']
     table = table.iloc[:-1, :]
 
     label = day_segments.loc[0, 'label']
     table['label'] = range(0, table.shape[0])
-    table['label'] = table['label'].apply(lambda x: '{}_{:04}'.format(label, x))
+    table['label'] = table['label'].apply(lambda x: '{}{:04}'.format(label, x))
 
-    table['local_date'] = None
-
-    return table[['local_date', 'start_time', 'end_time', 'label']]
+    return table[['start_time', 'length', 'label']]
 
 def parse_interval_segments(day_segments):
-    day_segments["local_date"] = 1
-    day_segments = day_segments.rename(columns={"start": "start_time", "end":"end_time"})
     return day_segments
 
 def parse_event_segments(day_segments):
     return day_segments
 
-def parse_day_segments(day_segments_file):
+def parse_day_segments(day_segments_file, segments_type):
     # Add code to validate and parse frequencies, intervals, and events
     # Expected formats:
     # Frequency: label, length columns (e.g. my_prefix, 5) length has to be in minutes (int)

@@ -98,15 +129,27 @@ def parse_day_segments(day_segments_file):
     # Our output should have local_date, start_time, end_time, label. In the readable_datetime script, If local_date has the same value for all rows, every segment will be applied for all days, otherwise each segment will be applied only to its local_date
     day_segments = pd.read_csv(day_segments_file)
 
-    if(is_valid_frequency_segments(day_segments)):
+    if day_segments is None:
+        message = 'The day segments file in [DAY_SEGMENTS][FILE] is None. Modify {}'.format(local_date)
+        raise ValueError(message)
+
+    if day_segments.shape[0] == 0:
+        message = 'The day segments file in [DAY_SEGMENTS][FILE] is empty. Modify {}'.format(local_date)
+        raise ValueError(message)
+
+    if(segments_type not in ["FREQUENCY_EVERY_DAY", "INTERVAL_EVERY_DAY", "INTERVAL_FLEXIBLE_DAY"]):
+        raise ValueError("[DAY_SEGMENTS][TYPE] can only be FREQUENCY_EVERY_DAY, INTERVAL_EVERY_DAY, or INTERVAL_FLEXIBLE_DAY")
+
+    if(segments_type == "FREQUENCY_EVERY_DAY" and is_valid_frequency_segments(day_segments, day_segments_file)):
         day_segments = parse_frequency_segments(day_segments)
-    elif(is_valid_interval_segments(day_segments)):
+    elif(segments_type == "INTERVAL_EVERY_DAY" and is_valid_interval_segments(day_segments, day_segments_file)):
         day_segments = parse_interval_segments(day_segments)
-    elif(is_valid_event_segments(day_segments)):
+    elif(segments_type == "INTERVAL_FLEXIBLE_DAY" and is_valid_event_segments(day_segments, day_segments_file)):
         day_segments = parse_event_segments(day_segments)
     else:
         raise ValueError("{} does not have a format compatible with frequency, interval or event day segments. Please refer to [LINK]".format(day_segments_file))
     return day_segments
 
-day_segments = parse_day_segments(snakemake.input[0])
-day_segments.to_csv(snakemake.output["segments_file"], index=False)
+final_day_segments = parse_day_segments(snakemake.input[0], snakemake.params["day_segments_type"])
+final_day_segments.to_csv(snakemake.output["segments_file"], index=False)
+pd.DataFrame({"label" : final_day_segments["label"].unique()}).to_csv(snakemake.output["segments_labels_file"], index=False)

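A sketch of the new entry point when driven outside Snakemake, assuming the functions above are importable; the type string now comes from [DAY_SEGMENTS][TYPE] via the rule's params instead of being inferred from the file's shape:

    import pandas as pd

    day_segments = parse_day_segments("data/external/daysegments_interval.csv", "INTERVAL_EVERY_DAY")
    day_segments.to_csv("data/interim/calls_day_segments.csv", index=False)
    # the labels file feeds the per-segment loop in call_features.R
    pd.DataFrame({"label": day_segments["label"].unique()}).to_csv("data/interim/calls_day_segments_labels.csv", index=False)
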
@@ -5,37 +5,92 @@ library("readr")
 library("lubridate")
 
 input <- read.csv(snakemake@input[["sensor_input"]]) %>% arrange(timestamp)
-day_segments <- read.csv(snakemake@input[["day_segments"]]) %>% filter(label != "daily") #daily is done by default by all scripts
+day_segments <- read.csv(snakemake@input[["day_segments"]])
+day_segments_type <- snakemake@params[["day_segments_type"]]
 sensor_output <- snakemake@output[[1]]
 timezone_periods <- snakemake@params[["timezone_periods"]]
 fixed_timezone <- snakemake@params[["fixed_timezone"]]
 
-assign_to_day_segment <- function(data, day_segments){
-  data <- data %>% mutate(local_day_segment = NA)
+assign_to_day_segment <- function(data, day_segments, day_segments_type, fixed_timezone){
 
-  # All segments belong to the same date, so we assume all days have the same segments
-  if(length(unique(day_segments$local_date)) == 1){
-    data <- data %>% mutate(local_time_obj = lubridate::hms(local_time))
-    day_segments <- day_segments %>% mutate(start_time = lubridate::hm(start_time),
-                                            end_time = lubridate::hm(end_time))
+  if(day_segments_type == "FREQUENCY_EVERY_DAY"){
+    data <- data %>% mutate(local_date_time_obj = lubridate::parse_date_time(local_time, orders = c("HMS", "HM")))
+    day_segments <- day_segments %>% mutate(start_time = lubridate::parse_date_time(start_time, orders = c("HMS", "HM")),
+                                            end_time = start_time + minutes(length))
 
     # Create a new column for each day_segment
     for(row_id in 1:nrow(day_segments)){
       row = day_segments[row_id,]
-      data <- data %>% mutate(local_day_segment = ifelse(local_time_obj >= row$start_time & local_time_obj <= row$end_time, row$label, local_day_segment))
+      data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj >= row$start_time & local_date_time_obj < row$end_time,
+                                                                                        paste0("[",
+                                                                                               row$label, "_",
+                                                                                               local_date, "_",
+                                                                                               paste(str_pad(hour(row$start_time),2, pad="0"), str_pad(minute(row$start_time),2, pad="0"), str_pad(second(row$start_time),2, pad="0"),sep =":"),
+                                                                                               "]"), NA))
     }
-    data <- data %>% select(-local_time_obj)
-  # Segments belong to different dates, so each day can have different segments
-  }else{
-    data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time))
-    day_segments <- day_segments %>% mutate(start_local_date_time_obj = lubridate::ymd_hm(paste(local_date, start_time)),
-                                            end_local_date_time_obj = lubridate::ymd_hm(paste(local_date, end_time)),
+
+  } else if (day_segments_type == "INTERVAL_EVERY_DAY"){
+
+    data_dates <- data %>% select(local_date) %>% distinct(local_date)
+    inferred_day_segments <- crossing(day_segments, data_dates) %>%
+      mutate(start_local_date_time_obj = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
+             end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
+             date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj)) %>%
+      group_by(label, local_date) %>%
+      mutate(group_start_datetime = lubridate::parse_date_time(paste(local_date, start_time), orders = c("Ymd HMS", "Ymd HM"), tz = fixed_timezone),
+             group_end_datetime = group_start_datetime + lubridate::period(length),
+             group_start_datetime = min(group_start_datetime),
+             group_end_datetime = max(group_end_datetime)) %>%
+      ungroup()
+
+    data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
+
+    # Create a new column for each day_segment
+    for(row_id in 1:nrow(inferred_day_segments)){
+      row = inferred_day_segments[row_id,]
+      data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
+                                                                                        paste0("[",
+                                                                                               paste(sep= "#",
+                                                                                                     row$label,
+                                                                                                     lubridate::date(row$group_start_datetime),
+                                                                                                     paste(str_pad(hour(row$group_start_datetime),2, pad="0"), str_pad(minute(row$group_start_datetime),2, pad="0"), str_pad(second(row$group_start_datetime),2, pad="0"),sep =":"),
+                                                                                                     lubridate::date(row$group_end_datetime),
+                                                                                                     paste(str_pad(hour(row$group_end_datetime),2, pad="0"), str_pad(minute(row$group_end_datetime),2, pad="0"), str_pad(second(row$group_end_datetime),2, pad="0"),sep =":")
+                                                                                               ),
+                                                                                               "]"), NA))
     }
+
+  } else if ( day_segments_type == "INTERVAL_FLEXIBLE_DAY"){
+    data <- data %>% mutate(local_date_time_obj = lubridate::ymd_hms(local_date_time, tz = fixed_timezone))
+    day_segments <- day_segments %>% mutate(shift = ifelse(shift == "0", "0seconds", shift),
+                                            start_local_date_time_obj = lubridate::ymd_hms(start_date_time, tz = fixed_timezone) + (lubridate::period(shift) * ifelse(shift_direction >= 0, 1, -1)),
+                                            end_local_date_time_obj = start_local_date_time_obj + lubridate::period(length),
+                                            date_time_interval = lubridate::interval(start_local_date_time_obj, end_local_date_time_obj))
 
     # Create a new column for each day_segment
     for(row_id in 1:nrow(day_segments)){
       row = day_segments[row_id,]
-      data <- data %>% mutate(local_day_segment = ifelse(local_date_time_obj %within% row$date_time_interval, row$label, local_day_segment))
+      print(row$length)
+      data <- data %>% mutate(!!paste("local_day_segment", row_id, sep = "_") := ifelse(local_date_time_obj %within% row$date_time_interval,
+                                                                                        paste0("[",
+                                                                                               paste(sep= "#",
+                                                                                                     row$label,
+                                                                                                     lubridate::date(row$start_local_date_time_obj),
+                                                                                                     paste(str_pad(hour(row$start_local_date_time_obj),2, pad="0"), str_pad(minute(row$start_local_date_time_obj),2, pad="0"), str_pad(second(row$start_local_date_time_obj),2, pad="0"),sep =":"),
+                                                                                                     lubridate::date(row$end_local_date_time_obj),
+                                                                                                     paste(str_pad(hour(row$end_local_date_time_obj),2, pad="0"), str_pad(minute(row$end_local_date_time_obj),2, pad="0"), str_pad(second(row$end_local_date_time_obj),2, pad="0"),sep =":")
+                                                                                               ),
+                                                                                               "]"), NA))
     }
-    data <- data %>% select(-local_date_time_obj)
   }
+
+  # Join all day_segments in a single column
+  data <- data %>%
+    unite("assigned_segments", starts_with("local_day_segment"), sep = "|", na.rm = TRUE) %>%
+    select(-local_date_time_obj)
 
   return(data)
 }

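The net effect of the rewritten assign_to_day_segment is that overlapping segments no longer clobber a single local_day_segment column; each matching segment gets its own column, and unite() collapses them into one pipe-separated assigned_segments string. A toy pandas equivalent of that final step, with made-up segment values:

    import pandas as pd

    df = pd.DataFrame({
        "local_day_segment_1": ["[daily#2020-05-04#00:00:00#2020-05-04#23:59:59]", None],
        "local_day_segment_2": ["[morning#2020-05-04#06:00:00#2020-05-04#11:59:59]", None],
    })
    df["assigned_segments"] = df.filter(like="local_day_segment").apply(
        lambda r: "|".join(v for v in r if pd.notna(v)), axis=1)
    print(df["assigned_segments"].iloc[0])
    # [daily#2020-05-04#00:00:00#2020-05-04#23:59:59]|[morning#2020-05-04#06:00:00#2020-05-04#11:59:59]
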
@@ -45,28 +100,31 @@ split_local_date_time <- function(data, day_segments){
     separate(local_time, c("local_hour", "local_minute"), ":", remove = FALSE, extra = "drop") %>%
     mutate(local_hour = as.numeric(local_hour),
            local_minute = as.numeric(local_minute))
 
-  split_data <- assign_to_day_segment(split_data, day_segments)
-
   return(split_data)
 }
 
 if(!is.null(timezone_periods)){
-  timezones <- read_csv(timezone_periods)
-  tz_starts <- timezones$start
-  output <- input %>%
-    mutate(timezone = findInterval(timestamp / 1000, tz_starts), # Set an interval ID based on timezones' start column
-           timezone = ifelse(timezone == 0, 1, timezone), # Correct the first timezone ID
-           timezone = recode(timezone, !!! timezones$timezone), # Swap IDs for text labels
-           timezone = as.character(timezone)) %>%
-    rowwise() %>%
-    mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
-           local_date_time = format(utc_date_time, tz = timezone, usetz = T))
-  output <- split_local_date_time(output, day_segments)
-  write.csv(output, sensor_output)
+  # TODO: Not active yet
+  # timezones <- read_csv(timezone_periods)
+  # tz_starts <- timezones$start
+  # output <- input %>%
+  #   mutate(timezone = findInterval(timestamp / 1000, tz_starts), # Set an interval ID based on timezones' start column
+  #          timezone = ifelse(timezone == 0, 1, timezone), # Correct the first timezone ID
+  #          timezone = recode(timezone, !!! timezones$timezone), # Swap IDs for text labels
+  #          timezone = as.character(timezone)) %>%
+  #   rowwise() %>%
+  #   mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
+  #          local_date_time = format(utc_date_time, tz = timezone, usetz = T, "%Y-%m-%d %H:%M:%S"))
+  # output <- split_local_date_time(output, day_segments)
+  # TODO: Implement day segment assigment with support for multiple timezones
+  # output <- assign_to_day_segment(output, day_segments, day_segments_type, fixed_timezone)
+  # write.csv(output, sensor_output)
 } else if(!is.null(fixed_timezone)){
-  output <- input %>%
-    mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
-           local_date_time = format(utc_date_time, tz = fixed_timezone, usetz = F))
-  output <- split_local_date_time(output, day_segments)
-  write_csv(output, sensor_output)
+  output <- input %>%
+    mutate(utc_date_time = as.POSIXct(timestamp/1000, origin="1970-01-01", tz="UTC"),
+           local_date_time = format(utc_date_time, tz = fixed_timezone, usetz = F, "%Y-%m-%d %H:%M:%S"))
+  output <- split_local_date_time(output, day_segments)
+  output <- assign_to_day_segment(output, day_segments, day_segments_type, fixed_timezone)
+  write_csv(output, sensor_output)
 }

@@ -54,7 +54,7 @@ unify_ios_calls <- function(ios_calls){
                                          local_time = first(local_time),
                                          local_hour = first(local_hour),
                                          local_minute = first(local_minute),
-                                         local_day_segment = first(local_day_segment))
+                                         assigned_segments = first(assigned_segments))
   }
   else {
     ios_calls <- ios_calls %>% summarise(call_type_sequence = paste(call_type, collapse = ","), call_duration = sum(call_duration), timestamp = first(timestamp))

@@ -1,13 +1,5 @@
 library('tidyr')
-
-filter_by_day_segment <- function(data, day_segment) {
-  if(day_segment %in% c("morning", "afternoon", "evening", "night"))
-    data <- data %>% filter(local_day_segment == day_segment)
-  else if(day_segment == "daily")
-    return(data)
-  else
-    return(data %>% head(0))
-}
+library('stringr')
 
 Mode <- function(v) {
   uniqv <- unique(v)

@@ -16,7 +8,7 @@ Mode <- function(v) {
 
 base_call_features <- function(calls, call_type, day_segment, requested_features){
   # Output dataframe
-  features = data.frame(local_date = character(), stringsAsFactors = FALSE)
+  features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
 
   # The name of the features this function can compute
   base_features_names <- c("count", "distinctcontacts", "meanduration", "sumduration", "minduration", "maxduration", "stdduration", "modeduration", "entropyduration", "timefirstcall", "timelastcall", "countmostfrequentcontact")

@@ -28,13 +20,21 @@ base_call_features <- function(calls, call_type, day_segment, requested_features
   call_type_label = ifelse(call_type == "incoming", "1", ifelse(call_type == "outgoing", "2", ifelse(call_type == "missed", "3", NA)))
   if(is.na(call_type_label))
     stop(paste("Call type can online be incoming, outgoing or missed but instead you typed: ", call_type))
-  calls <- calls %>% filter(call_type == call_type_label) %>% filter_by_day_segment(day_segment)
 
+  # Filter the rows that belong to day_segment, and put the segment full name in a new column for grouping
+  date_regex = "[0-9]{4}[\\-|\\/][0-9]{2}[\\-|\\/][0-9]{2}"
+  hour_regex = "[0-9]{2}:[0-9]{2}:[0-9]{2}"
+  calls <- calls %>%
+    filter(call_type == call_type_label) %>%
+    filter(grepl(paste0("\\[", day_segment, "#"),assigned_segments)) %>%
+    mutate(local_segment = str_extract(assigned_segments, paste0("\\[", day_segment, "#", date_regex, "#", hour_regex, "#", date_regex, "#", hour_regex, "\\]")),
+           local_segment = str_sub(local_segment, 2, -2)) # get rid of first and last character([])
 
   # If there are not features or data to work with, return an empty df with appropiate columns names
   if(length(features_to_compute) == 0)
     return(features)
   if(nrow(calls) < 1)
-    return(cbind(features, read.csv(text = paste(paste("call", call_type, day_segment, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
+    return(cbind(features, read.csv(text = paste(paste("call", call_type, features_to_compute, sep = "_"), collapse = ","), stringsAsFactors = FALSE)))
 
   for(feature_name in features_to_compute){
     if(feature_name == "countmostfrequentcontact"){

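The filter/mutate pair above relies on the [label#start_date#start_time#end_date#end_time] entries written by readable_datetime.R. A Python sketch of the same extraction, reusing the commit's own regexes on a made-up assigned_segments value:

    import re

    assigned = "[daily#2020-05-04#00:00:00#2020-05-04#23:59:59]|[morning#2020-05-04#06:00:00#2020-05-04#11:59:59]"
    date_regex = r"[0-9]{4}[\-|\/][0-9]{2}[\-|\/][0-9]{2}"
    hour_regex = r"[0-9]{2}:[0-9]{2}:[0-9]{2}"
    pattern = r"\[morning#" + date_regex + "#" + hour_regex + "#" + date_regex + "#" + hour_regex + r"\]"
    local_segment = re.search(pattern, assigned).group(0)[1:-1]  # strip the brackets, like str_sub(.., 2, -2)
    print(local_segment)  # morning#2020-05-04#06:00:00#2020-05-04#11:59:59
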
@@ -48,28 +48,28 @@ base_call_features <- function(calls, call_type, day_segment, requested_features
         pull(trace)
       feature <- calls %>%
         filter(trace == mostfrequentcontact) %>%
-        group_by(local_date) %>%
-        summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := n()) %>%
+        group_by(local_segment) %>%
+        summarise(!!paste("call", call_type, feature_name, sep = "_") := n()) %>%
         replace(is.na(.), 0)
-      features <- merge(features, feature, by="local_date", all = TRUE)
+      features <- merge(features, feature, by="local_segment", all = TRUE)
     } else {
       feature <- calls %>%
-        group_by(local_date)
+        group_by(local_segment)
 
       feature <- switch(feature_name,
-        "count" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := n()),
-        "distinctcontacts" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := n_distinct(trace)),
-        "meanduration" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := mean(call_duration)),
-        "sumduration" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := sum(call_duration)),
-        "minduration" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := min(call_duration)),
-        "maxduration" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := max(call_duration)),
-        "stdduration" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := sd(call_duration)),
-        "modeduration" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := Mode(call_duration)),
-        "entropyduration" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := entropy.MillerMadow(call_duration)),
-        "timefirstcall" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
-        "timelastcall" = feature %>% summarise(!!paste("call", call_type, day_segment, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
+        "count" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := n()),
+        "distinctcontacts" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := n_distinct(trace)),
+        "meanduration" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := mean(call_duration)),
+        "sumduration" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := sum(call_duration)),
+        "minduration" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := min(call_duration)),
+        "maxduration" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := max(call_duration)),
+        "stdduration" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := sd(call_duration)),
+        "modeduration" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := Mode(call_duration)),
+        "entropyduration" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := entropy.MillerMadow(call_duration)),
+        "timefirstcall" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := first(local_hour) * 60 + first(local_minute)),
+        "timelastcall" = feature %>% summarise(!!paste("call", call_type, feature_name, sep = "_") := last(local_hour) * 60 + last(local_minute)))
 
-      features <- merge(features, feature, by="local_date", all = TRUE)
+      features <- merge(features, feature, by="local_segment", all = TRUE)
     }
   }
   features <- features %>% mutate_at(vars(contains("countmostfrequentcontact")), list( ~ replace_na(., 0)))

@@ -1,18 +1,23 @@
 source("renv/activate.R")
 source("src/features/call/call_base.R")
 library(dplyr)
 library(entropy)
 
 calls <- read.csv(snakemake@input[[1]], stringsAsFactors = FALSE)
-day_segment <- snakemake@params[["day_segment"]]
+day_segments_labels <- read.csv(snakemake@input[["day_segments_labels"]])
 requested_features <- snakemake@params[["features"]]
 call_type <- snakemake@params[["call_type"]]
-features = data.frame(local_date = character(), stringsAsFactors = FALSE)
+features = data.frame(local_segment = character(), stringsAsFactors = FALSE)
 
 # Compute base Call features
-features <- merge(features, base_call_features(calls, call_type, day_segment, requested_features), by="local_date", all = TRUE)
+day_segments <- day_segments_labels %>% pull(label)
+for (day_segment in day_segments)
+  features <- merge(features, base_call_features(calls, call_type, day_segment, requested_features), all = TRUE)
 
 if(ncol(features) != length(requested_features) + 1)
   stop(paste0("The number of features in the output dataframe (=", ncol(features),") does not match the expected value (=", length(requested_features)," + 1). Verify your Call feature extraction functions"))
 
+features <- features %>% separate(col = local_segment,
+                                  into = c("segment", "local_start_date", "local_start_time", "local_end_date", "local_end_time"),
+                                  sep = "#",
+                                  remove = FALSE)
+
 write.csv(features, snakemake@output[[1]], row.names = FALSE)