rapids/calculatingfeatures/eda_explorer/load_files.py

264 lines
9.5 KiB
Python

import pandas as pd
import scipy.signal as scisig
import os
import numpy as np
def get_user_input(prompt):
try:
return raw_input(prompt)
except NameError:
return input(prompt)
def getInputLoadFile():
'''Asks user for type of file and file path. Loads corresponding data.
OUTPUT:
data: DataFrame, index is a list of timestamps at 8Hz, columns include
AccelZ, AccelY, AccelX, Temp, EDA, filtered_eda
'''
print("Please enter information about your EDA file... ")
dataType = get_user_input("\tData Type (e4, q, shimmer, or misc): ")
if dataType=='q':
filepath = get_user_input("\tFile path: ")
filepath_confirm = filepath
data = loadData_Qsensor(filepath)
elif dataType=='e4':
filepath = get_user_input("\tPath to E4 directory: ")
filepath_confirm = os.path.join(filepath,"EDA.csv")
data = loadData_E4(filepath)
elif dataType=='shimmer':
filepath = get_user_input("\tFile path: ")
filepath_confirm = filepath
data = loadData_shimmer(filepath)
elif dataType=="misc":
filepath = get_user_input("\tFile path: ")
filepath_confirm = filepath
data = loadData_misc(filepath)
else:
print("Error: not a valid file choice")
return data, filepath_confirm
def getOutputPath():
print("")
print("Where would you like to save the computed output file?")
outfile = get_user_input('\tFile name: ')
outputPath = get_user_input('\tFile directory (./ for this directory): ')
fullOutputPath = os.path.join(outputPath,outfile)
if fullOutputPath[-4:] != '.csv':
fullOutputPath = fullOutputPath+'.csv'
return fullOutputPath
def loadData_Qsensor(filepath):
'''
This function loads the Q sensor data, uses a lowpass butterworth filter on the EDA signal
Note: currently assumes sampling rate of 8hz, 16hz, 32hz; if sampling rate is 16hz or 32hz the signal is downsampled
INPUT:
filepath: string, path to input file
OUTPUT:
data: DataFrame, index is a list of timestamps at 8Hz, columns include AccelZ, AccelY, AccelX, Temp, EDA, filtered_eda
'''
# Get header info
try:
header_info = pd.io.parsers.read_csv(filepath, nrows=5)
except IOError:
print("Error!! Couldn't load file, make sure the filepath is correct and you are using a csv from the q sensor software\n\n")
return
# Get sample rate
sampleRate = int((header_info.iloc[3,0]).split(":")[1].strip())
# Get the raw data
data = pd.io.parsers.read_csv(filepath, skiprows=7)
data = data.reset_index()
# Reset the index to be a time and reset the column headers
data.columns = ['AccelZ','AccelY','AccelX','Battery','Temp','EDA']
# Get Start Time
startTime = pd.to_datetime(header_info.iloc[4,0][12:-10])
# Make sure data has a sample rate of 8Hz
data = interpolateDataTo8Hz(data,sampleRate,startTime)
# Remove Battery Column
data = data[['AccelZ','AccelY','AccelX','Temp','EDA']]
# Get the filtered data using a low-pass butterworth filter (cutoff:1hz, fs:8hz, order:6)
data['filtered_eda'] = butter_lowpass_filter(data['EDA'], 1.0, 8, 6)
return data
def _loadSingleFile_E4(filepath,list_of_columns, expected_sample_rate,freq):
# Load data
data = pd.read_csv(filepath)
# Get the startTime and sample rate
startTime = pd.to_datetime(float(data.columns.values[0]),unit="s")
sampleRate = float(data.iloc[0][0])
data = data[data.index!=0]
data.index = data.index-1
# Reset the data frame assuming expected_sample_rate
data.columns = list_of_columns
if sampleRate != expected_sample_rate:
print('ERROR, NOT SAMPLED AT {0}HZ. PROBLEMS WILL OCCUR\n'.format(expected_sample_rate))
# Make sure data has a sample rate of 8Hz
data = interpolateDataTo8Hz(data,sampleRate,startTime)
return data
def loadData_E4(filepath):
# Load EDA data
eda_data = _loadSingleFile_E4(os.path.join(filepath,'EDA.csv'),["EDA"],4,"250L")
# Get the filtered data using a low-pass butterworth filter (cutoff:1hz, fs:8hz, order:6)
eda_data['filtered_eda'] = butter_lowpass_filter(eda_data['EDA'], 1.0, 8, 6)
# Load ACC data
acc_data = _loadSingleFile_E4(os.path.join(filepath,'ACC.csv'),["AccelX","AccelY","AccelZ"],32,"31250U")
# Scale the accelometer to +-2g
acc_data[["AccelX","AccelY","AccelZ"]] = acc_data[["AccelX","AccelY","AccelZ"]]/64.0
# Load Temperature data
temperature_data = _loadSingleFile_E4(os.path.join(filepath,'TEMP.csv'),["Temp"],4,"250L")
data = eda_data.join(acc_data, how='outer')
data = data.join(temperature_data, how='outer')
# E4 sometimes records different length files - adjust as necessary
min_length = min(len(acc_data), len(eda_data), len(temperature_data))
return data[:min_length]
def loadData_shimmer(filepath):
data = pd.read_csv(filepath, sep='\t', skiprows=(0,1))
orig_cols = data.columns
rename_cols = {}
for search, new_col in [['Timestamp','Timestamp'],
['Accel_LN_X', 'AccelX'], ['Accel_LN_Y', 'AccelY'], ['Accel_LN_Z', 'AccelZ'],
['Skin_Conductance', 'EDA']]:
orig = [c for c in orig_cols if search in c]
if len(orig) == 0:
continue
rename_cols[orig[0]] = new_col
data.rename(columns=rename_cols, inplace=True)
# TODO: Assuming no temperature is recorded
data['Temp'] = 0
# Drop the units row and unnecessary columns
data = data[data['Timestamp'] != 'ms']
data.index = pd.to_datetime(data['Timestamp'], unit='ms')
data = data[['AccelZ', 'AccelY', 'AccelX', 'Temp', 'EDA']]
for c in ['AccelZ', 'AccelY', 'AccelX', 'Temp', 'EDA']:
data[c] = pd.to_numeric(data[c])
# Convert to 8Hz
data = data.resample("125L").mean()
data.interpolate(inplace=True)
# Get the filtered data using a low-pass butterworth filter (cutoff:1hz, fs:8hz, order:6)
data['filtered_eda'] = butter_lowpass_filter(data['EDA'], 1.0, 8, 6)
return data
def loadData_getColNames(data_columns):
print("Here are the data columns of your file: ")
print(data_columns)
# Find the column names for each of the 5 data streams
colnames = ['EDA data','Temperature data','Acceleration X','Acceleration Y','Acceleration Z']
new_colnames = ['','','','','']
for i in range(len(new_colnames)):
new_colnames[i] = get_user_input("Column name that contains "+colnames[i]+": ")
while (new_colnames[i] not in data_columns):
print("Column not found. Please try again")
print("Here are the data columns of your file: ")
print(data_columns)
new_colnames[i] = get_user_input("Column name that contains "+colnames[i]+": ")
# Get user input on sample rate
sampleRate = get_user_input("Enter sample rate (must be an integer power of 2): ")
while (sampleRate.isdigit()==False) or (np.log(int(sampleRate))/np.log(2) != np.floor(np.log(int(sampleRate))/np.log(2))):
print("Not an integer power of two")
sampleRate = get_user_input("Enter sample rate (must be a integer power of 2): ")
sampleRate = int(sampleRate)
# Get user input on start time
startTime = pd.to_datetime(get_user_input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): "))
while type(startTime)==str:
print("Not a valid date/time")
startTime = pd.to_datetime(get_user_input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): "))
return sampleRate, startTime, new_colnames
def loadData_misc(filepath):
# Load data
data = pd.read_csv(filepath)
# Get the correct colnames
sampleRate, startTime, new_colnames = loadData_getColNames(data.columns.values)
data.rename(columns=dict(zip(new_colnames,['EDA','Temp','AccelX','AccelY','AccelZ'])), inplace=True)
data = data[['AccelZ','AccelY','AccelX','Temp','EDA']]
# Make sure data has a sample rate of 8Hz
data = interpolateDataTo8Hz(data,sampleRate,startTime)
# Get the filtered data using a low-pass butterworth filter (cutoff:1hz, fs:8hz, order:6)
data['filtered_eda'] = butter_lowpass_filter(data['EDA'], 1.0, 8, 6)
return data
def interpolateDataTo8Hz(data,sample_rate,startTime):
if sample_rate<8:
# Upsample by linear interpolation
if sample_rate==2:
data.index = pd.date_range(start=startTime, periods=len(data), freq='500L')
elif sample_rate==4:
data.index = pd.date_range(start=startTime, periods=len(data), freq='250L')
data = data.resample("125L").mean()
else:
if sample_rate>8:
# Downsample
idx_range = list(range(0,len(data))) # TODO: double check this one
data = data.iloc[idx_range[0::int(int(sample_rate)/8)]]
# Set the index to be 8Hz
data.index = pd.date_range(start=startTime, periods=len(data), freq='125L')
# Interpolate all empty values
data = interpolateEmptyValues(data)
return data
def interpolateEmptyValues(data):
cols = data.columns.values
for c in cols:
data.loc[:, c] = data[c].interpolate()
return data
def butter_lowpass(cutoff, fs, order=5):
# Filtering Helper functions
sos = scisig.butter(order, cutoff, btype='low', analog=False, output='sos', fs=fs)
return sos
def butter_lowpass_filter(data, cutoff, fs, order=5):
# Filtering Helper functions
sos = butter_lowpass(cutoff, fs, order=order)
y = scisig.sosfilt(sos, data)
return y