Fixed the features dependent on time duration.
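These features previously turned row counts into durations by multiplying with a fixed sampling frequency; this change computes the real duration of each location sample from consecutive timestamps (the new timeInSeconds column) and sums those durations instead. A minimal sketch of the idea, using a made-up DataFrame for illustration only (not part of the repository):

    import pandas as pd

    # hypothetical input: millisecond timestamps plus cluster labels
    df = pd.DataFrame({
        "timestamp": [0, 60000, 180000, 240000],
        "location_label": [1, 1, 2, 2],
    })

    # seconds until the next sample, as introduced in this commit
    df["timeInSeconds"] = (df.timestamp.diff(-1) * -1) / 1000

    # time spent in each cluster is now a sum of real durations,
    # not a row count multiplied by an assumed sampling frequency
    print(df.groupby("location_label")["timeInSeconds"].sum())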
parent 4746d7ab6c
commit 0bbf15f52e
@@ -45,7 +45,6 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     if cluster_on == "PARTICIPANT_DATASET":
         location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters)
         location_data = filter_data_by_segment(location_data, time_segment)
 
     elif cluster_on == "TIME_SEGMENT":
         location_data = filter_data_by_segment(location_data, time_segment)
         location_data = cluster_and_label(location_data,clustering_algorithm,threshold_static,**hyperparameters)
@@ -73,6 +72,7 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
         location_features = location_features.reset_index(drop=True)
         return location_features
 
+    location_data['timeInSeconds'] = (location_data.timestamp.diff(-1)* -1)/1000
     if "locationvariance" in features_to_compute:
         location_features["locationvariance"] = location_data.groupby(['local_segment'])['double_latitude'].var() + location_data.groupby(['local_segment'])['double_longitude'].var()
@@ -118,49 +118,52 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
     for localDate in stationaryLocations['local_segment'].unique():
         location_features.loc[localDate,"radiusgyration"] = radius_of_gyration(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
 
+    preComputedTimeArray = pd.DataFrame()
+    for localDate in stationaryLocations["local_segment"].unique():
+        top1,top2,top3,smax,smin,sstd,smean = len_stay_timeattopn(stationaryLocations[stationaryLocations["local_segment"]==localDate])
+        preComputedTimeArray.loc[localDate,"timeattop1"] = top1
+        preComputedTimeArray.loc[localDate,"timeattop2"] = top2
+        preComputedTimeArray.loc[localDate,"timeattop3"] = top3
+        preComputedTimeArray.loc[localDate,"maxlengthstayatclusters"] = smax
+        preComputedTimeArray.loc[localDate,"minlengthstayatclusters"] = smin
+        preComputedTimeArray.loc[localDate,"stdlengthstayatclusters"] = sstd
+        preComputedTimeArray.loc[localDate,"meanlengthstayatclusters"] = smean
+
     if "timeattop1location" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"timeattop1"] = time_at_topn_clusters_in_group(stationaryLocations[stationaryLocations['local_segment']==localDate],1,sampling_frequency)
+            location_features.loc[localDate,"timeattop1"] = preComputedTimeArray.loc[localDate,"timeattop1"]
 
     if "timeattop2location" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"timeattop2"] = time_at_topn_clusters_in_group(stationaryLocations[stationaryLocations['local_segment']==localDate],2,sampling_frequency)
+            location_features.loc[localDate,"timeattop2"] = preComputedTimeArray.loc[localDate,"timeattop2"]
 
     if "timeattop3location" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"timeattop3"] = time_at_topn_clusters_in_group(stationaryLocations[stationaryLocations['local_segment']==localDate],3,sampling_frequency)
+            location_features.loc[localDate,"timeattop3"] = preComputedTimeArray.loc[localDate,"timeattop3"]
 
     if "movingtostaticratio" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"movingtostaticratio"] = (stationaryLocations[stationaryLocations['local_segment']==localDate].shape[0]*sampling_frequency) / (location_data[location_data['local_segment']==localDate].shape[0] * sampling_frequency)
+            location_features.loc[localDate,"movingtostaticratio"] = (stationaryLocations[stationaryLocations['local_segment']==localDate]['timeInSeconds'].sum()) / (location_data[location_data['local_segment']==localDate]['timeInSeconds'].sum())
 
     if "outlierstimepercent" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"outlierstimepercent"] = outliers_time_percent(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
-
-    preComputedmaxminCluster = pd.DataFrame()
-    for localDate in stationaryLocations['local_segment'].unique():
-        smax, smin, sstd,smean = len_stay_at_clusters_in_minutes(stationaryLocations[stationaryLocations['local_segment']==localDate],sampling_frequency)
-        preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"] = smax
-        preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"] = smin
-        preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"] = sstd
-        preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"] = smean
+            location_features.loc[localDate,"outlierstimepercent"] = outlier_time_percent_new(stationaryLocations[stationaryLocations['local_segment']==localDate])
 
     if "maxlengthstayatclusters" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"maxlengthstayatclusters"]
+            location_features.loc[localDate,"maxlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"maxlengthstayatclusters"]
 
     if "minlengthstayatclusters" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"minlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"minlengthstayatclusters"]
+            location_features.loc[localDate,"minlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"minlengthstayatclusters"]
 
     if "stdlengthstayatclusters" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"stdlengthstayatclusters"]
+            location_features.loc[localDate,"stdlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"stdlengthstayatclusters"]
 
     if "meanlengthstayatclusters" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
-            location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedmaxminCluster.loc[localDate,"meanlengthstayatclusters"]
+            location_features.loc[localDate,"meanlengthstayatclusters"] = preComputedTimeArray.loc[localDate,"meanlengthstayatclusters"]
 
     if "locationentropy" in features_to_compute:
         for localDate in stationaryLocations['local_segment'].unique():
@@ -174,6 +177,21 @@ def doryab_features(sensor_data_files, time_segment, provider, filter_data_by_se
 
     return location_features
 
+def len_stay_timeattopn(locationData):
+    if locationData is None or len(locationData) == 0:
+        return (None, None, None,None, None, None, None)
+
+    calculationDf = locationData[locationData["location_label"] >= 1][['location_label','timeInSeconds']]
+    calculationDf[calculationDf['timeInSeconds'] >= 300]['timeInSeconds'] = 60
+    timeArray = calculationDf.groupby('location_label')['timeInSeconds'].sum().reset_index()['timeInSeconds'].sort_values(ascending=False)/60
+
+    if len(timeArray) > 2:
+        return (timeArray[0],timeArray[1],timeArray[2],timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
+    elif len(timeArray)==2:
+        return (timeArray[0],timeArray[1],None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
+    else:
+        return (timeArray[0],None,None,timeArray.max(),timeArray.min(),timeArray.std(),timeArray.mean())
+
 def getMinutesData(locationData):
@@ -188,7 +206,6 @@ def distance_to_degrees(d):
 
 def get_all_travel_distances_meters_speed(locationData,threshold,maximum_gap_allowed):
 
-    locationData['timeInSeconds'] = (locationData.timestamp.diff(-1)* -1)/1000
     lat_lon_temp = locationData[locationData['timeInSeconds'] <= maximum_gap_allowed][['double_latitude','double_longitude','timeInSeconds']]
 
     if lat_lon_temp.empty:
@@ -314,7 +331,6 @@ def rank_count_map(clusters):
     labels, counts = tuple(np.unique(clusters, return_counts = True))
     sorted_by_count = [x for (y,x) in sorted(zip(counts, labels), reverse = True)]
     label_to_rank = {label : rank + 1 for (label, rank) in [(sorted_by_count[i],i) for i in range(len(sorted_by_count))]}
 
     return lambda x: label_to_rank.get(x, -1)
@@ -367,78 +383,15 @@ def radius_of_gyration(locationData,sampling_frequency):
 
     return np.sqrt(final_rog)
 
-def time_at_topn_clusters_in_group(locationData,n,sampling_frequency): # relevant only for global location features since, top3_clusters = top3_clusters_in_group for local
-
-    if locationData is None or len(locationData) == 0:
+def outlier_time_percent_new(locationData):
+    if locationData is None or len(locationData)==0:
         return None
-
-    locationData = locationData[locationData["location_label"] >= 1] # remove outliers/ cluster noise
-    valcounts = locationData["location_label"].value_counts().to_dict()
-    sorted_valcounts = sorted(valcounts.items(), key=lambda kv: (-kv[1], kv[0]))
-
-    if len(sorted_valcounts) >= n:
-        topn = sorted_valcounts[n-1]
-        topn_time = topn[1] * sampling_frequency
-    else:
-        topn_time = None
-
-    return topn_time
-
-def outliers_time_percent(locationData,sampling_frequency):
-    if locationData is None or len(locationData) == 0:
-        return None
-    clusters = locationData["location_label"]
-    numoutliers = clusters[(clusters == -1)].sum() * sampling_frequency
-    numtotal = len(clusters) * sampling_frequency
-    return (float(-1*numoutliers) / numtotal)
-
-
-def moving_time_percent(locationData):
-    if locationData is None or len(locationData) == 0:
-        return None
-    lbls = locationData["location_label"]
-    nummoving = lbls.isnull().sum()
-    numtotal = len(lbls)
-
-    return (float(nummoving) / numtotal)
-
-def len_stay_at_clusters_in_minutes(locationData,sampling_frequency):
-    if locationData is None or len(locationData) == 0:
-        return (None, None, None,None)
-
-    lenstays = []
-    count = 0
-    prev_loc_label = None
-    for row in locationData.iterrows():
-        cur_loc_label = row[1]["location_label"]
-        if np.isnan(cur_loc_label):
-            continue
-        elif prev_loc_label == None:
-            prev_loc_label = int(cur_loc_label)
-            count += 1
-        else:
-            if prev_loc_label == int(cur_loc_label):
-                count += 1
-            else:
-                lenstays.append(count)
-                prev_loc_label = int(cur_loc_label)
-                count = 0 + 1
-    if count > 0: # in case of no transition
-        lenstays.append(count)
-    lenstays = np.array(lenstays) * sampling_frequency
-
-    if len(lenstays) > 0:
-        smax = np.max(lenstays)
-        smin = np.min(lenstays)
-        sstd = np.std(lenstays)
-        smean = np.mean(lenstays)
-    else:
-        smax = None
-        smin = None
-        sstd = None
-        smean = None
-    return (smax, smin, sstd, smean)
+
+    clustersDf = locationData[["location_label","timeInSeconds"]]
+    numoutliers = clustersDf[clustersDf["location_label"]== -1]["timeInSeconds"].sum()
+    numtotal = clustersDf.timeInSeconds.sum()
+
+    return numoutliers/numtotal
 
 def location_entropy(locationData):
     if locationData is None or len(locationData) == 0: