# Utility functions for working with 3D gaze data.
# Authors: Brendan David-John, Candace Peacock

import itertools
import os
import sys
import math
import json
import glob

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.utils import resample

subjs_skip = []
discrete_feats = ['sac_det', 'fix_det']

# Physiological upper bound on saccade speed (deg/s), used to reject outliers:
# https://pubmed.ncbi.nlm.nih.gov/25713524/ &
# https://www.sciencedirect.com/science/article/abs/pii/0025556475900759
max_sacc_speed = 800


# https://www.geeksforgeeks.org/python-make-a-list-of-intervals-with-sequential-numbers/
def intervals_extract(iterable):
    iterable = sorted(set(iterable))
    for key, group in itertools.groupby(enumerate(iterable),
                                        lambda t: t[1] - t[0]):
        group = list(group)
        yield [group[0][1], group[-1][1]]


# find consecutive chunks of True values in bools, and return start and end indices
# input: 1D np array
# output: 2D np array with indices
def intervals_from_bool(bools):
    # get python list of indices
    bool_idx = np.where(bools == True)[0].tolist()
    # pull out the intervals
    intervals = intervals_extract(bool_idx)
    # convert to flat np array
    intervals = [item for sublist in intervals for item in sublist]
    # unroll it so start and end points fall into 1st/2nd column
    intervals = np.reshape(intervals, (-1, 2))
    return intervals


'''
Function to compute angular gaze velocity using a numerically stable atan2 method
Inputs:
    gaze_vectors: Nx3 array with N 3D gaze vectors (x,y,z)
    dt: 1D array with N-1 elements containing the timestep between consecutive gaze samples
Output:
    1D array with N elements containing angular gaze velocity in degrees per second
'''
def compute_gaze_velocity(gaze_vectors, dt):
    # pair each vector i with vector i+1
    gaze_vectors_1 = gaze_vectors[0:-1, :]
    gaze_vectors_2 = gaze_vectors[1:, :]
    # numerically stable method: 2*atan2(norm(u-v), norm(u+v)),
    # assuming u and v are already normalized
    gaze_dist = 2 * np.arctan2(
        np.sqrt(np.square(gaze_vectors_1 - gaze_vectors_2).sum(axis=1)),
        np.sqrt(np.square(gaze_vectors_1 + gaze_vectors_2).sum(axis=1)))
    gaze_velocity = np.rad2deg(gaze_dist) / dt
    # make nans and infs zero
    np.nan_to_num(gaze_velocity, copy=False, posinf=0.0, neginf=0.0)
    # correct outliers over max_sacc_speed deg/s by interpolating across good samples
    gaze_velocity_good_idx = (gaze_velocity <= max_sacc_speed).nonzero()[0]
    gaze_velocity = np.interp(range(gaze_velocity.shape[0]),
                              gaze_velocity_good_idx,
                              gaze_velocity[gaze_velocity_good_idx])
    # prepend zero for the first element, so size matches input
    return np.insert(gaze_velocity, 0, 0.0)


'''
Function to compute angular gaze velocity using the arccos-of-dot-product method
Inputs:
    gaze_vectors: Nx3 array with N 3D gaze vectors (x,y,z)
    dt: 1D array with N-1 elements containing the timestep between consecutive gaze samples
Output:
    1D array with N elements containing angular gaze velocity in degrees per second
'''
def compute_gaze_velocity_naive(gaze_vectors, dt):
    # pair each vector i with vector i+1
    gaze_vectors_1 = gaze_vectors[0:-1, :]
    gaze_vectors_2 = gaze_vectors[1:, :]
    # arccos of dot product method, assuming vectors are normalized
    gaze_dist_naive = np.arccos(np.sum(gaze_vectors_1 * gaze_vectors_2, axis=1))
    gaze_velocity_naive = np.rad2deg(gaze_dist_naive) / dt
    # make nans and infs zero
    np.nan_to_num(gaze_velocity_naive, copy=False, posinf=0.0, neginf=0.0)
    # correct outliers over max_sacc_speed deg/s by interpolating across good samples
    gaze_velocity_naive_good_idx = (gaze_velocity_naive <= max_sacc_speed).nonzero()[0]
    gaze_velocity_naive = np.interp(range(gaze_velocity_naive.shape[0]),
                                    gaze_velocity_naive_good_idx,
                                    gaze_velocity_naive[gaze_velocity_naive_good_idx])
    # prepend zero for the first element, so size matches input
    return np.insert(gaze_velocity_naive, 0, 0.0)
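# Illustrative sketch, not part of the original module: compares the stable
# atan2 velocity estimate against the naive arccos estimate on a synthetic
# sweep. The 100 Hz rate and 10 deg/s rotation below are arbitrary assumptions.
def _example_velocity_comparison():
    ts = np.linspace(0.0, 1.0, 101)           # 101 samples over 1 second
    angles = np.deg2rad(10.0 * ts)            # constant 10 deg/s rotation
    vecs = np.column_stack((np.sin(angles),
                            np.zeros_like(angles),
                            np.cos(angles)))  # unit vectors in the x-z plane
    dt = np.diff(ts)                          # N-1 per-sample timesteps
    v_stable = compute_gaze_velocity(vecs, dt)
    v_naive = compute_gaze_velocity_naive(vecs, dt)
    # both should hover near 10 deg/s; the naive estimate loses precision for
    # small angles because arccos is ill-conditioned near a dot product of 1.0
    return v_stable, v_naive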
'''
Function to classify gaze samples as saccade events using velocity thresholding.
Inputs:
    gaze_velocity: 1D array with N elements representing gaze velocity in degrees per second
    time_stamps: 1D array with N elements representing timestamps in seconds
    ivt_threshold: scalar in degrees per second; samples above this value are classified as saccade
    min_dur: scalar in seconds; saccades shorter than this are discarded
    max_dur: scalar in seconds; saccades longer than this are discarded
Output:
    2 element tuple.
    First element: 1D boolean array with N elements where True indicates a sample
        is part of a saccade, and False if not
    Second element: Mx2 array, where M is the number of saccade events detected;
        the first column is the start index, the second the end index
'''
def saccade_classification_ivt(gaze_velocity, time_stamps,
                               ivt_threshold=100.0, min_dur=.012, max_dur=2.0):
    # find samples over threshold
    sac_bool = gaze_velocity > ivt_threshold
    sac_intervals = intervals_from_bool(sac_bool)
    # discard saccades shorter than min_dur or longer than max_dur
    sacc_durs = time_stamps[sac_intervals[:, 1]] - time_stamps[sac_intervals[:, 0]]
    short_sacc_idx = np.where((sacc_durs < min_dur) | (sacc_durs > max_dur))[0]
    # use short_sacc_idx to remove the bad intervals, and set those bad
    # intervals to False in sac_bool
    bad_sac_intervals = sac_intervals[short_sacc_idx, :]
    sac_intervals_clean = np.delete(sac_intervals, short_sacc_idx, 0)
    num_bad_intervals = len(bad_sac_intervals)
    for i in range(num_bad_intervals):
        curr_interval = bad_sac_intervals[i, :]
        sac_bool[curr_interval[0]:curr_interval[1] + 1] = False
    return (sac_bool, sac_intervals_clean)
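# Illustrative sketch, not part of the original module: the intended IVT
# pipeline chains velocity estimation into saccade classification. 'vecs'
# and 'ts' follow the conventions of the velocity example above.
def _example_saccade_pipeline(vecs, ts):
    dt = np.diff(ts)
    vel = compute_gaze_velocity(vecs, dt)
    sac_bool, sac_intervals = saccade_classification_ivt(vel, ts,
                                                         ivt_threshold=100.0,
                                                         min_dur=0.012)
    # sac_intervals[k] holds the [start, end] sample indices of saccade k
    return sac_bool, sac_intervals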
'''
Function to classify gaze samples as fixation events using velocity thresholding.
Inputs:
    gaze_velocity: 1D array with N elements representing gaze velocity in degrees per second
    time_stamps: 1D array with N elements representing timestamps in seconds
    ivt_threshold: scalar in degrees per second; samples below this value are classified as fixation
    min_dur: scalar in seconds; fixations shorter than this are discarded
    max_dur: scalar in seconds; fixations longer than this are discarded
Output:
    2 element tuple.
    First element: 1D boolean array with N elements where True indicates a sample
        is part of a fixation, and False if not
    Second element: Mx2 array, where M is the number of fixation events detected;
        the first column is the start index, the second the end index
'''
def fixation_classification_ivt(gaze_velocity, time_stamps,
                                ivt_threshold=20.0, min_dur=0.100, max_dur=2.0):
    # find samples under threshold
    fix_bool = gaze_velocity < ivt_threshold
    fix_intervals = intervals_from_bool(fix_bool)
    # discard fixations shorter than min_dur or longer than max_dur
    fix_durs = time_stamps[fix_intervals[:, 1]] - time_stamps[fix_intervals[:, 0]]
    short_fix_idx = np.where((fix_durs < min_dur) | (fix_durs > max_dur))[0]
    # use short_fix_idx to remove the bad intervals, and set those bad
    # intervals to False in fix_bool (tail reconstructed to mirror the saccade case)
    bad_fix_intervals = fix_intervals[short_fix_idx, :]
    fix_intervals_clean = np.delete(fix_intervals, short_fix_idx, 0)
    num_bad_intervals = len(bad_fix_intervals)
    for i in range(num_bad_intervals):
        curr_interval = bad_fix_intervals[i, :]
        fix_bool[curr_interval[0]:curr_interval[1] + 1] = False
    return (fix_bool, fix_intervals_clean)


# NOTE: the original implementation of compute_dispersion did not survive
# extraction; this reconstruction measures the maximum angular deviation from
# the window's mean direction, a common dispersion measure for 3D I-DT.
def compute_dispersion(gaze_vectors):
    centroid = gaze_vectors.mean(axis=0)
    centroid = centroid / np.linalg.norm(centroid)
    cos_angles = np.clip(gaze_vectors @ centroid, -1.0, 1.0)
    return np.rad2deg(np.arccos(cos_angles)).max()


'''
Function to classify gaze samples as fixation events using dispersion
thresholding (I-DT). The function name, signature, and window initialization
were reconstructed from the surviving loop body; the default max_disp value
is an assumption.
Inputs:
    gaze_vectors: Nx3 array with N 3D gaze vectors (x,y,z)
    time_stamps: 1D array with N elements representing timestamps in seconds
    sac_bool: 1D boolean array from saccade_classification_ivt; conflicting
        samples are resolved in favor of saccades
    max_disp: scalar in degrees; windows with dispersion at or below this are fixations
    min_dur: scalar in seconds; fixations shorter than this are discarded
    max_dur: scalar in seconds; fixations longer than this are discarded
Output:
    2 element tuple (fix_bool, fix_intervals_clean), as in fixation_classification_ivt
'''
def fixation_classification_idt(gaze_vectors, time_stamps, sac_bool,
                                max_disp=1.0, min_dur=0.100, max_dur=2.0):
    fix_bool = np.zeros(len(gaze_vectors), dtype=bool)
    current = 0
    while current < len(gaze_vectors):
        # initialize a window spanning the minimum fixation duration
        t0 = time_stamps[current]
        t1 = t0 + min_dur
        last = current
        for r in range(current, len(gaze_vectors)):
            if time_stamps[r] >= t0 and time_stamps[r] <= t1:
                last = r
            elif time_stamps[r] > t1:
                break
        window_range = [current, last]
        # now check the dispersion in this window
        dispersion = compute_dispersion(gaze_vectors[current:last + 1, :])
        if dispersion <= max_disp:
            # add new points while the dispersion stays under the threshold
            while dispersion <= max_disp and last + 1 < len(gaze_vectors):
                last += 1
                window_range = [current, last]
                dispersion = compute_dispersion(gaze_vectors[current:last + 1, :])
            # dispersion threshold is exceeded:
            # mark a fixation over [current, last]
            fix_bool[current:last + 1] = True
            current = last + 1  # this will move the pointer to a novel window
        else:
            current += 1
            last = current
    # correct with saccade bool: find points marked as both, change to False in fix_bool
    conflict_idx = np.where((fix_bool == True) & (sac_bool == True))[0]
    fix_bool[conflict_idx] = False
    fix_intervals = intervals_from_bool(fix_bool)
    # discard fixations shorter than min_dur or longer than max_dur
    fix_durs = time_stamps[fix_intervals[:, 1]] - time_stamps[fix_intervals[:, 0]]
    short_fix_idx = np.where((fix_durs < min_dur) | (fix_durs > max_dur))[0]
    # use short_fix_idx to remove the bad intervals, and set those bad
    # intervals to False in fix_bool
    bad_fix_intervals = fix_intervals[short_fix_idx, :]
    fix_intervals_clean = np.delete(fix_intervals, short_fix_idx, 0)
    num_bad_intervals = len(bad_fix_intervals)
    for i in range(num_bad_intervals):
        curr_interval = bad_fix_intervals[i, :]
        fix_bool[curr_interval[0]:curr_interval[1] + 1] = False
    return (fix_bool, fix_intervals_clean)
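# Illustrative sketch, not part of the original module (names follow the
# reconstruction above): run I-DT fixation detection after IVT saccade
# detection so conflicting samples resolve in favor of saccades. 'vecs' and
# 'ts' follow the conventions of the velocity example.
def _example_fixation_pipeline(vecs, ts):
    dt = np.diff(ts)
    vel = compute_gaze_velocity(vecs, dt)
    sac_bool, _ = saccade_classification_ivt(vel, ts)
    fix_bool, fix_intervals = fixation_classification_idt(vecs, ts, sac_bool,
                                                          max_disp=1.0)
    # per-fixation durations in seconds
    fix_durs = ts[fix_intervals[:, 1]] - ts[fix_intervals[:, 0]]
    return fix_bool, fix_durs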
'''
Summary: Compute a continuous time series for feature values by averaging over
    events in the last N seconds
Inputs:
    features: 2D array with rows corresponding to events, and columns corresponding to feature values
    fixation_times: 1D array containing the timestamps corresponding to each row of features
    time_stamps: 1D array representing every sample in the time series, containing times in seconds
    N: scalar indicating the number of seconds within which past events are averaged
Output:
    2D array, with rows corresponding to each sample in time_stamps and a column for each feature signal.
'''
def compute_feature_signal_time_avg(features, fixation_times, time_stamps, N):
    num_features = features.shape[1]
    num_rows = len(time_stamps)
    feature_signal = np.zeros((num_rows, num_features), dtype=float)
    # loop over each time stamp, grab the events that fall within the last N
    # seconds, average them, and write the result to the current row
    for i in range(num_rows):
        curr_time = time_stamps[i]
        # get events N seconds prior to curr_time
        curr_events = features[(fixation_times >= curr_time - N) &
                               (fixation_times <= curr_time), :]
        curr_row = np.zeros((1, num_features), dtype=float)
        if len(curr_events) > 0:
            curr_row = np.mean(curr_events, 0)
        feature_signal[i, :] = curr_row
    return feature_signal


'''
Summary: Compute a continuous time series for feature values by averaging over
    the past N events
Inputs:
    features: 2D array with rows corresponding to events, and columns corresponding to feature values
    fixation_times: 1D array containing the timestamps corresponding to each row of features
    time_stamps: 1D array representing every sample in the time series, containing times in seconds
    N: scalar indicating the number of past events to consider when computing average feature values
Output:
    2D array, with rows corresponding to each sample in time_stamps and a column for each feature signal.
'''
def compute_feature_signal_event_avg(features, fixation_times, time_stamps, N):
    num_features = features.shape[1]
    num_rows = len(time_stamps)
    feature_signal = np.zeros((num_rows, num_features), dtype=float)
    # loop over each time stamp, grab the past N events, average, add to row
    for i in range(num_rows):
        curr_time = time_stamps[i]
        # get events prior to curr_time
        curr_events = features[(fixation_times <= curr_time), :]
        # grab the last N of them if possible
        prior_events = curr_events[-N:, :]
        curr_row = np.zeros((1, num_features), dtype=float)
        if len(prior_events) > 0:
            curr_row = np.mean(prior_events, 0)
        feature_signal[i, :] = curr_row
    return feature_signal


# wrapper function to call feature signal methods
# NOTE: compute_feature_signal_event_discrete is referenced below but is not
# defined in this module; it is assumed to be provided elsewhere.
def compute_feature_signal(method, features, event_times, time_stamps, N):
    if method == 'time_avg':
        return compute_feature_signal_time_avg(features, event_times, time_stamps, N)
    elif method == 'event_avg':
        return compute_feature_signal_event_avg(features, event_times, time_stamps, N)
    elif method == 'event_discrete':
        return compute_feature_signal_event_discrete(features, event_times, time_stamps, N)
    else:
        return None
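# Illustrative sketch, not part of the original module: turning per-event
# features into a dense signal aligned with raw samples. The feature matrix
# and event times below are synthetic assumptions.
def _example_feature_signal():
    ts = np.linspace(0.0, 10.0, 1001)                    # raw samples, ~100 Hz
    fixation_times = np.array([0.5, 2.0, 4.5, 7.0])      # one time per event
    feats = np.array([[0.20], [0.35], [0.15], [0.40]])   # e.g. fixation duration
    # average all events that occurred within the last 2 seconds of each sample
    sig = compute_feature_signal('time_avg', feats, fixation_times, ts, 2.0)
    return sig  # shape (1001, 1)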
# returns a three-column np array (x,y,z) containing head-corrected gaze vectors
def head_correction(df, param):
    gaze_vectors = df[['combined.gazeDirection.x',
                       'combined.gazeDirection.y',
                       'combined.gazeDirection.z']].values
    num_rows = np.size(gaze_vectors, 0)
    transformed_gaze_vectors = np.zeros(gaze_vectors.shape)
    if param.transform_type == 'coordinate_frame':
        # transform vectors using the head's coordinate frame
        head_pos = df[['head.pos.x', 'head.pos.y', 'head.pos.z']].values
        # BUGFIX: the up/right column sets were swapped in the original
        head_up = df[['head.up.x', 'head.up.y', 'head.up.z']].values
        head_right = df[['head.right.x', 'head.right.y', 'head.right.z']].values
        head_dir = df[['head.dir.x', 'head.dir.y', 'head.dir.z']].values
        for i in range(num_rows):
            # get current gaze vector and transforms
            gaze_vector = gaze_vectors[i, :]
            curr_head_up = head_up[i, :]
            curr_head_right = head_right[i, :]
            curr_head_dir = head_dir[i, :]
            '''Transformation matrix of the form:
                [ R.x, R.y, R.z, 0,
                  U.x, U.y, U.z, 0,
                  D.x, D.y, D.z, 0,
                  0,   0,   0,   1 ]
            '''
            # make transformation matrix
            transform_mat = np.vstack((np.append(curr_head_right, 0),
                                       np.append(curr_head_up, 0),
                                       np.append(curr_head_dir, 0),
                                       np.array([0.0, 0.0, 0.0, 1.0])))
            # add homogeneous coordinate to vector
            gaze_vector = np.append(gaze_vector, 1.0)
            # right-multiply row vector with transpose of matrix as defined above
            transformed_gaze_vector = np.matmul(gaze_vector, transform_mat.T)
            # place transformed gaze_vector back into gaze_vectors
            # without the homogeneous coordinate
            transformed_gaze_vectors[i, :] = transformed_gaze_vector[0:3]
        # normalize all rows again to account for any rounding errors
        row_norm_vals = np.sqrt(np.square(transformed_gaze_vectors).sum(axis=1))
        transformed_gaze_vectors = transformed_gaze_vectors / row_norm_vals[:, None]
    elif param.transform_type == 'quaternions':
        q_x = df['RotationX'].values
        q_y = df['RotationY'].values
        q_z = df['RotationZ'].values
        q_w = df['RotationW'].values
        for i in range(num_rows):
            # get current gaze vector and transforms
            gaze_vector = gaze_vectors[i, :]
            curr_q_x = q_x[i]
            curr_q_y = q_y[i]
            curr_q_z = q_z[i]
            curr_q_w = q_w[i]
            # first column
            one_one = np.square(curr_q_w) + np.square(curr_q_x) - np.square(curr_q_y) - np.square(curr_q_z)
            two_one = 2 * (curr_q_w * curr_q_z + curr_q_x * curr_q_y)
            three_one = 2 * (curr_q_x * curr_q_z - curr_q_w * curr_q_y)
            # second column
            one_two = 2 * (curr_q_x * curr_q_y - curr_q_w * curr_q_z)
            # BUGFIX: the original repeated the first diagonal term here; for a
            # unit quaternion the second diagonal entry is w^2 - x^2 + y^2 - z^2
            two_two = np.square(curr_q_w) - np.square(curr_q_x) + np.square(curr_q_y) - np.square(curr_q_z)
            three_two = 2 * (curr_q_w * curr_q_x - curr_q_y * curr_q_z)
            # third column
            one_three = 2 * (curr_q_w * curr_q_y - curr_q_x * curr_q_z)
            two_three = 2 * (curr_q_y * curr_q_z - curr_q_w * curr_q_x)
            # BUGFIX: likewise, the third diagonal entry is w^2 - x^2 - y^2 + z^2
            three_three = np.square(curr_q_w) - np.square(curr_q_x) - np.square(curr_q_y) + np.square(curr_q_z)
            # compute rotation matrix as per Diaz et al.
            rot_mat = np.array([[one_one, two_one, three_one],
                                [one_two, two_two, three_two],
                                [one_three, two_three, three_three]])
            transformed_gaze_vectors[i, :] = np.matmul(rot_mat, gaze_vector)
        # normalize all rows again to account for any rounding errors
        row_norm_vals = np.sqrt(np.square(transformed_gaze_vectors).sum(axis=1))
        transformed_gaze_vectors = transformed_gaze_vectors / row_norm_vals[:, None]
    return transformed_gaze_vectors


def window_signal_overlap(X, y, window_size=10):
    M, N = X.shape
    windows = np.array([])
    lbls = []
    FP_indxs = np.where(y == 1)[0]
    TP_indxs = np.where(y == -1)[0]
    combined = np.hstack((FP_indxs, TP_indxs))
    for i in combined:
        window = np.array([])
        # every window of length window_size that contains sample i
        starts = range(max(0, i - window_size + 1), i + 1)
        ends = range(i + 1, min(M, i + window_size))
        for (s, e) in zip(starts, ends):
            window = X[s:e]
            curr_size = window.shape[0]
            window = window.reshape(-1, curr_size * N)
            # window is clipped from front or back, add zero padding
            if curr_size < window_size:
                padding_size = window_size * N - curr_size * N
                padding = np.zeros(shape=(1, padding_size))
                if i - window_size < 0:
                    window = np.hstack((padding, window))
                else:
                    window = np.hstack((window, padding))
            if windows.shape[0] == 0:
                windows = window
            else:
                windows = np.vstack((windows, window))
            lbl = y[i]
            lbls.append(lbl)
    lbls = np.array(lbls, dtype=int).reshape(-1, 1)
    return (windows, lbls)


def window_signal_center(X, y, window_size=10):
    M, N = X.shape
    windows = np.array([])
    lbls = []
    FP_indxs = np.where(y == 1)[0]
    TP_indxs = np.where(y == -1)[0]
    combined = np.hstack((FP_indxs, TP_indxs))
    for i in combined:
        # window centered on sample i
        start = max(0, i - int(window_size / 2))
        end = min(M, i + int(window_size / 2))
        window = X[start:end, :]
        curr_size = window.shape[0]
        window = window.reshape(-1, curr_size * N)
        # window is clipped from front or back, add zero padding
        if curr_size < window_size:
            padding_size = window_size * N - curr_size * N
            padding = np.zeros(shape=(1, padding_size))
            if i - int(window_size / 2) < 0:
                window = np.hstack((padding, window))
            else:
                window = np.hstack((window, padding))
        if windows.shape[0] == 0:
            windows = window
        else:
            windows = np.vstack((windows, window))
        lbl = y[i]
        lbls.append(lbl)
    lbls = np.array(lbls, dtype=int).reshape(-1, 1)
    return (windows, lbls)
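# Illustrative sketch, not part of the original module: centered windowing
# around labeled events. X and y are synthetic; y follows the module's
# convention of +1 for false-positive and -1 for true-positive events.
def _example_window_center():
    rng = np.random.default_rng(0)
    X = rng.standard_normal((100, 3))    # 100 samples, 3 features
    y = np.zeros(100, dtype=int)
    y[10], y[50] = 1, -1                 # two labeled events
    windows, lbls = window_signal_center(X, y, window_size=10)
    return windows.shape, lbls.ravel()   # ((2, 30), array([ 1, -1]))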
def window_signal_start(X, y, window_size=10):
    M, N = X.shape
    windows = np.array([])
    lbls = []
    FP_indxs = np.where(y == 1)[0]
    TP_indxs = np.where(y == -1)[0]
    combined = np.hstack((FP_indxs, TP_indxs))
    for i in combined:
        # window starting at sample i
        start = i
        end = min(M, i + window_size)
        window = X[start:end, :]
        curr_size = window.shape[0]
        window = window.reshape(-1, curr_size * N)
        # window is clipped from the back, add zero padding
        if curr_size < window_size:
            padding_size = window_size * N - curr_size * N
            padding = np.zeros(shape=(1, padding_size))
            window = np.hstack((window, padding))
        if windows.shape[0] == 0:
            windows = window
        else:
            windows = np.vstack((windows, window))
        lbl = y[i]
        lbls.append(lbl)
    lbls = np.array(lbls, dtype=int).reshape(-1, 1)
    return (windows, lbls)


# wrapper function to call windowing methods
def window_signal(X, y, mode='center', window_size=10):
    if mode == 'center':
        return window_signal_center(X, y, window_size=window_size)
    elif mode == 'start':
        return window_signal_start(X, y, window_size=window_size)
    elif mode == 'overlap':
        return window_signal_overlap(X, y, window_size=window_size)


# build a CSV header with one column per feature per window position,
# followed by a trailing 'label' column
def get_header(features, window_size):
    header = ""
    for i in range(window_size):
        h = ['%s_%d' % (f, i) for f in features]
        h = ",".join(h)
        if not len(header):
            header = h
            continue
        header = '%s,%s' % (header, h)
    header = '%s,%s' % (header, 'label')
    return header


def is_discrete_feat(feat):
    isdisc = False
    for f in discrete_feats:
        if f in feat:
            isdisc = True
            break
    return isdisc


def read_data(path):
    files = glob.glob(os.path.join(path, '*'))
    y = []
    X = []
    subjs = []
    discrete_cols = []
    cols = []
    for file in files:
        # subject ID is the filename prefix before the first underscore
        subj = file.split(os.sep)[-1]
        subj = subj.split('.')[0]
        subj = subj.split('_')[0]
        if subj in subjs_skip:
            continue
        data = pd.read_csv(file)
        if not len(discrete_cols):
            cols = data.columns
            discrete_cols = []
            for i in range(len(cols)):
                if is_discrete_feat(cols[i]):
                    discrete_cols.append(cols[i])
        if not len(y):
            y = data['label'].to_numpy().reshape(-1, 1)
        else:
            y = np.vstack((y, data['label'].to_numpy().reshape(-1, 1)))
        curr_X = data[data.columns[data.columns != 'label']].to_numpy()
        if not len(X):
            X = curr_X
        else:
            X = np.vstack((X, curr_X))
        M = curr_X.shape[0]
        curr_subjs = np.repeat(subj, M).reshape(-1, 1)
        if not len(subjs):
            subjs = curr_subjs
        else:
            subjs = np.vstack((subjs, curr_subjs))
    return (X, y, subjs, discrete_cols, cols)


def read_feat_data(path):
    '''Reads feature data in directory 'path' and stores it in a pandas DataFrame'''
    files = glob.glob(os.path.join(path, '*'))
    combined = pd.DataFrame()
    for file in files:
        data = pd.read_csv(file, dtype={'selection_type': 'str'}, index_col=0)
        # filenames follow '<subject>_<block>.csv'
        filename = file.split(os.sep)[-1]
        filename = filename.split('.')[0]
        subj = filename.split('_')[0]
        data['sbj'] = subj
        block = filename.split('_')[1]
        data['block'] = block
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        combined = pd.concat([combined, data])
    return combined


def read_window_data(main_dir, MODE, WINDOW_SIZE, data_type='train'):
    data_dir = os.path.join(main_dir, 'w_%s_%d' % (MODE, WINDOW_SIZE))
    data_dir = os.path.join(data_dir, data_type)
    # BUGFIX: the original read from main_dir, ignoring the data_dir it just built
    X, y, subjs, discrete_cols, cols = read_data(data_dir)
    return X, y, subjs, discrete_cols, cols
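# Illustrative sketch, not part of the original module: loading windowed
# training data. The 'data/windows' path is a placeholder; the layout
# 'w_<MODE>_<WINDOW_SIZE>/<data_type>' is the one read_window_data expects.
def _example_read_windows():
    X, y, subjs, discrete_cols, cols = read_window_data('data/windows',
                                                        'center', 10,
                                                        data_type='train')
    return X.shape, np.unique(subjs)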
def normalize_subj_data(X, y, subjs, exclude_idxs):
    X_out, y_out = [], []
    sbjs = np.unique(subjs)
    for sbj in sbjs:
        subj_idxs = np.where(subjs == sbj)[0]
        X_subj, y_subj = X[subj_idxs], y[subj_idxs]
        if X_subj.shape[0] < 1:
            continue
        if len(exclude_idxs):
            # z-score all columns except the excluded (e.g. discrete) ones
            mask = ~np.isin(np.arange(X.shape[1]), exclude_idxs)
            X_subj[:, mask] = scale(X_subj[:, mask])
        else:
            # BUGFIX: the original assigned to a misspelled 'X_Subj',
            # silently discarding the scaled values
            X_subj = scale(X_subj)
        if not len(X_out):
            X_out = X_subj
            y_out = y_subj
        else:
            X_out = np.vstack((X_out, X_subj))
            y_out = np.vstack((y_out, y_subj))
    return X_out, y_out


def get_feat_indxs(columns, query_cols):
    indxs = []
    for i in range(len(columns)):
        # strip the trailing '_<window index>' suffix added by get_header
        c = columns[i]
        c = c.split('_')
        c.pop()
        c = '_'.join(c)
        if c in query_cols:
            indxs.append(i)
    return indxs


def filter_features(featfilepath, X, headers, count_thresh=1):
    # 'squeeze=True' was removed from pd.read_csv in pandas 2.0 and is not
    # needed here, since the frame is indexed by column name below
    feat_counter = pd.read_csv(featfilepath)
    imp_indxs = np.where(feat_counter['num'] > count_thresh)[0]
    important_feats = feat_counter['feat'].values[imp_indxs]
    indxs = get_feat_indxs(headers, important_feats)
    X = X[:, indxs]
    return X
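# Illustrative sketch, not part of the original module: per-subject z-scoring
# followed by feature filtering. 'feat_counts.csv' is a placeholder path; it
# is expected to hold the 'feat' and 'num' columns that filter_features reads.
def _example_normalize_and_filter(X, y, subjs, headers, discrete_idxs):
    X_norm, y_norm = normalize_subj_data(X, y, subjs, discrete_idxs)
    X_filt = filter_features('feat_counts.csv', X_norm, headers, count_thresh=1)
    return X_filt, y_norm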