old musi stuff
This commit is contained in:
745 utils.py Normal file
@@ -0,0 +1,745 @@
# Utility functions for working with 3D gaze data.

# Authors: Brendan David-John, Candace Peacock

import glob
import itertools
import json
import math
import os
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.utils import resample


subjs_skip = []
discrete_feats = ['sac_det','fix_det']

# Peak saccade velocity cap (deg/s), based on:
# https://pubmed.ncbi.nlm.nih.gov/25713524/ & https://www.sciencedirect.com/science/article/abs/pii/0025556475900759
max_sacc_speed = 800

# https://www.geeksforgeeks.org/python-make-a-list-of-intervals-with-sequential-numbers/
def intervals_extract(iterable):
    iterable = sorted(set(iterable))
    for key, group in itertools.groupby(enumerate(iterable),
                                        lambda t: t[1] - t[0]):
        group = list(group)
        yield [group[0][1], group[-1][1]]

# Find consecutive chunks of True values in bools, and return start and end indices.
# Input: 1D np array
# Output: 2D np array with indices
def intervals_from_bool(bools):
    # get python list of indices
    bool_idx = np.where(bools)[0].tolist()

    # pull out the intervals
    intervals = intervals_extract(bool_idx)

    # convert to flat np array
    intervals = [item for sublist in intervals for item in sublist]

    # unroll it so start and end points fall into 1st/2nd column
    intervals = np.reshape(intervals, (-1, 2))

    return intervals

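# Illustrative sketch (not part of the original module): how the interval
# helpers behave on a small boolean array. The `_demo_` name is hypothetical.
def _demo_intervals_from_bool():
    bools = np.array([False, True, True, False, True, True, True, False])
    # True runs cover indices 1-2 and 4-6, so we expect [[1, 2], [4, 6]].
    print(intervals_from_bool(bools))
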
'''
Function to compute angular gaze velocity using the numerically stable atan2 method.
Inputs: gaze_vectors: Nx3 array with N 3D gaze vectors (x,y,z),
        dt: 1D array with N-1 elements containing the timestep between consecutive gaze samples
Output: 1D array with N elements containing angular gaze velocity in degrees/second
'''
def compute_gaze_velocity(gaze_vectors,dt):
    # pair each vector i with vector i+1
    gaze_vectors_1 = gaze_vectors[0:-1,:]
    gaze_vectors_2 = gaze_vectors[1:,:]

    # numerically stable method: 2*atan2(norm(u-v),norm(u+v)), assuming u and v are already normalized
    gaze_dist = 2*np.arctan2(np.sqrt(np.square(gaze_vectors_1-gaze_vectors_2).sum(axis=1)),np.sqrt(np.square(gaze_vectors_1+gaze_vectors_2).sum(axis=1)))
    gaze_velocity = np.rad2deg(gaze_dist) / dt

    # make nans and infs zero
    np.nan_to_num(gaze_velocity,copy=False,posinf=0.0,neginf=0.0)

    # correct outliers over 800 deg/s by interpolating across good samples
    gaze_velocity_good_idx = (gaze_velocity <= max_sacc_speed).nonzero()[0]
    gaze_velocity = np.interp(range(gaze_velocity.shape[0]),gaze_velocity_good_idx,gaze_velocity[gaze_velocity_good_idx])

    # prepend zero for the first element, so size matches input
    return np.insert(gaze_velocity,0,0.0)

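# Illustrative sketch (assumed usage, not from the original code): build unit
# gaze vectors from a synthetic horizontal sweep and compute their velocity.
def _demo_compute_gaze_velocity():
    t = np.linspace(0.0, 1.0, 100)                      # 100 samples over 1 s
    az = np.deg2rad(10.0 * t)                           # 10 deg/s horizontal sweep
    gaze = np.column_stack((np.sin(az), np.zeros_like(az), np.cos(az)))
    vel = compute_gaze_velocity(gaze, np.diff(t))       # dt has N-1 entries
    print(vel.mean())                                   # roughly 10 deg/s after the leading zero
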
'''
Function to compute angular gaze velocity using the arccos dot product method.
Inputs: gaze_vectors: Nx3 array with N 3D gaze vectors (x,y,z),
        dt: 1D array with N-1 elements containing the timestep between consecutive gaze samples
Output: 1D array with N elements containing angular gaze velocity in degrees/second
'''
def compute_gaze_velocity_naive(gaze_vectors,dt):
    # pair each vector i with vector i+1
    gaze_vectors_1 = gaze_vectors[0:-1,:]
    gaze_vectors_2 = gaze_vectors[1:,:]

    # arccos of dot product method, assuming vectors are normalized
    gaze_dist_naive = np.arccos(np.sum(gaze_vectors_1*gaze_vectors_2, axis=1))
    gaze_velocity_naive = np.rad2deg(gaze_dist_naive) / dt

    # make nans and infs zero (arccos returns nan when rounding pushes the dot product past 1)
    np.nan_to_num(gaze_velocity_naive,copy=False,posinf=0.0,neginf=0.0)

    # correct outliers over 800 deg/s by interpolating across good samples
    gaze_velocity_naive_good_idx = (gaze_velocity_naive <= max_sacc_speed).nonzero()[0]
    gaze_velocity_naive = np.interp(range(gaze_velocity_naive.shape[0]),gaze_velocity_naive_good_idx,gaze_velocity_naive[gaze_velocity_naive_good_idx])

    # prepend zero for the first element, so size matches input
    return np.insert(gaze_velocity_naive,0,0.0)

'''
Function to classify gaze samples as saccade events using velocity thresholding (I-VT).
Inputs: gaze_velocity: 1D array with N elements representing gaze velocity in degrees/second,
        time_stamps: 1D array with N elements representing timestamps in seconds,
        ivt_threshold: scalar in degrees/second; samples above this value are classified as saccade
        min_dur: scalar in seconds; saccades shorter than this are discarded
        max_dur: scalar in seconds; saccades longer than this are discarded
Output: 2 element tuple.
        First element: 1D boolean array with N elements where True indicates a sample is part of a saccade, and False if not
        Second element: Mx2 array, where M is the number of saccade events detected; the first column is the start index, the second the end index
'''
def saccade_classification_ivt(gaze_velocity,time_stamps,ivt_threshold=100.0,min_dur=.012,max_dur=2.0):
    # find samples over threshold
    sac_bool = gaze_velocity > ivt_threshold
    sac_intervals = intervals_from_bool(sac_bool)

    # flag saccades outside the allowed duration range
    sacc_durs = time_stamps[sac_intervals[:,1]] - time_stamps[sac_intervals[:,0]]
    short_sacc_idx = np.where((sacc_durs<min_dur)|(sacc_durs>max_dur))[0]

    # use short_sacc_idx to remove the bad intervals, and set those bad intervals to False in sac_bool
    bad_sac_intervals = sac_intervals[short_sacc_idx,:]
    sac_intervals_clean = np.delete(sac_intervals,short_sacc_idx,0)

    num_bad_intervals = len(bad_sac_intervals)
    for i in range(num_bad_intervals):
        curr_interval = bad_sac_intervals[i,:]
        sac_bool[curr_interval[0]:curr_interval[1]+1] = False

    return (sac_bool,sac_intervals_clean)

'''
Function to classify gaze samples as fixation events using velocity thresholding (I-VT).
Inputs: gaze_velocity: 1D array with N elements representing gaze velocity in degrees/second,
        time_stamps: 1D array with N elements representing timestamps in seconds,
        ivt_threshold: scalar in degrees/second; samples below this value are classified as fixation
        min_dur: scalar in seconds; fixations shorter than this are discarded
        max_dur: scalar in seconds; fixations longer than this are discarded
Output: 2 element tuple.
        First element: 1D boolean array with N elements where True indicates a sample is part of a fixation, and False if not
        Second element: Mx2 array, where M is the number of fixation events detected; the first column is the start index, the second the end index
'''
def fixation_classification_ivt(gaze_velocity,time_stamps,ivt_threshold=20.0,min_dur=0.100,max_dur=2.0):
    # find samples under threshold
    fix_bool = gaze_velocity < ivt_threshold

    fix_intervals = intervals_from_bool(fix_bool)

    # flag fixations outside the allowed duration range
    fix_durs = time_stamps[fix_intervals[:,1]] - time_stamps[fix_intervals[:,0]]
    short_fix_idx = np.where((fix_durs<min_dur) | (fix_durs>max_dur))[0]

    # use short_fix_idx to remove the bad intervals, and set those bad intervals to False in fix_bool
    bad_fix_intervals = fix_intervals[short_fix_idx,:]
    fix_intervals_clean = np.delete(fix_intervals,short_fix_idx,0)

    num_bad_intervals = len(bad_fix_intervals)
    for i in range(num_bad_intervals):
        curr_interval = bad_fix_intervals[i,:]
        fix_bool[curr_interval[0]:curr_interval[1]+1] = False

    return (fix_bool,fix_intervals_clean)

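# Illustrative sketch (assumed usage, synthetic data): run the full I-VT
# pipeline, velocity followed by saccade and fixation classification. The
# ramp parameters below are made up to trigger one clear saccade.
def _demo_ivt_pipeline():
    t = np.arange(0.0, 2.0, 0.01)                      # 200 samples at 100 Hz
    # 10 deg sweep over 0.05 s starting at t=1.0 -> ~200 deg/s during the ramp
    az = np.deg2rad(10.0 * np.clip((t - 1.0) / 0.05, 0.0, 1.0))
    gaze = np.column_stack((np.sin(az), np.zeros_like(az), np.cos(az)))
    vel = compute_gaze_velocity(gaze, np.diff(t))
    sac_bool, sac_intervals = saccade_classification_ivt(vel, t)
    fix_bool, fix_intervals = fixation_classification_ivt(vel, t)
    print(sac_intervals, fix_intervals)                # one saccade, two fixations
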
'''
Compute angular dispersion over an input set of 3D gaze vectors, based on the maximum angular displacement
of a sample in the window from the centroid.
Inputs: gaze_vectors: Nx3 array with N 3D gaze vectors (x,y,z)
Output: Maximum dispersion in degrees
'''
def compute_dispersion(gaze_vectors):
    # get gaze distances in angle from the centroid of the gaze positions
    centroid = np.mean(gaze_vectors,0)
    # re-normalize the centroid so the equal-norm assumption of the atan2 formula below holds
    centroid = centroid / np.linalg.norm(centroid)

    # numerically stable method: 2*atan2(norm(u-v),norm(u+v)), assuming u and v are already normalized
    gaze_dist = np.rad2deg(2*np.arctan2(np.sqrt(np.square(gaze_vectors-centroid).sum(axis=1)),np.sqrt(np.square(gaze_vectors+centroid).sum(axis=1))))

    # make nans and infs zero
    np.nan_to_num(gaze_dist,copy=False,posinf=0.0,neginf=0.0)

    # return maximum displacement
    return np.max(gaze_dist)

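# Illustrative sketch (not from the original code): the dispersion of three
# unit vectors spread one degree apart around the z axis should come out
# near 1 degree, the max displacement from their centroid.
def _demo_compute_dispersion():
    az = np.deg2rad(np.array([-1.0, 0.0, 1.0]))
    gaze = np.column_stack((np.sin(az), np.zeros_like(az), np.cos(az)))
    print(compute_dispersion(gaze))  # roughly 1.0
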
'''
Function to classify gaze samples as fixation events using I-DT, by enforcing a maximum angular dispersion within a time window of minimum duration.
Implementation based on: https://github.com/ecekt/eyegaze/blob/master/gaze.py and https://github.com/ASAPLableni/VR-centred_I-DT_algorithm
Inputs: gaze_vectors: Nx3 array with N 3D gaze vectors (x,y,z)
        time_stamps: 1D array with N elements representing timestamps in seconds,
        sac_bool: 1D boolean array indicating which samples were marked as saccades by I-VT, used to ensure samples do not get more than one label
        min_dur: scalar in seconds; fixations shorter than this are discarded
        max_dur: scalar in seconds; fixations longer than this are discarded
        max_disp: scalar in degrees; maximum spatial dispersion allowed within a time window for a fixation
Output: 2 element tuple.
        First element: 1D boolean array with N elements where True indicates a sample is part of a fixation, and False if not
        Second element: Mx2 array, where M is the number of fixation events detected; the first column is the start index, the second the end index
'''
def fixation_classification_idt(gaze_vectors,time_stamps,sac_bool,min_dur=0.100,max_dur=2.0,max_disp=1.0):
    fix_bool = np.zeros(sac_bool.shape, dtype=bool)
    window_range = [0,0]

    current = 0 # pointer to the current beginning point of the window
    last = 0
    fix_idx = []

    while (current < len(gaze_vectors)):
        t0 = time_stamps[current] # beginning time
        t1 = t0 + min_dur # time after a minimum fixation duration has been observed

        for r in range(current, len(gaze_vectors)):
            if (time_stamps[r] >= t0 and time_stamps[r] <= t1):
                last = r
            elif time_stamps[r] > t1:
                break

        window_range = [current,last]

        # now check the dispersion in this window
        dispersion = compute_dispersion(gaze_vectors[current:last+1,:])

        if (dispersion <= max_disp):
            # grow the window while dispersion stays under the threshold
            while (dispersion <= max_disp and last + 1 < len(gaze_vectors)):
                last += 1
                window_range = [current,last]
                dispersion = compute_dispersion(gaze_vectors[current:last+1,:])

            # dispersion threshold is exceeded: mark a fixation over [current,last]
            fix_bool[current:last+1] = True

            current = last + 1 # move the pointer to a novel window

        else:
            current += 1
            last = current

    # correct with the saccade bool: find points marked as both, change them to False in fix_bool
    conflict_idx = np.where(fix_bool & sac_bool)[0]
    fix_bool[conflict_idx] = False

    fix_intervals = intervals_from_bool(fix_bool)

    # flag fixations shorter than the minimum duration or longer than the maximum
    fix_durs = time_stamps[fix_intervals[:,1]] - time_stamps[fix_intervals[:,0]]
    short_fix_idx = np.where((fix_durs<min_dur) | (fix_durs>max_dur))[0]

    # use short_fix_idx to remove the bad intervals, and set those bad intervals to False in fix_bool
    bad_fix_intervals = fix_intervals[short_fix_idx,:]
    fix_intervals_clean = np.delete(fix_intervals,short_fix_idx,0)

    num_bad_intervals = len(bad_fix_intervals)
    for i in range(num_bad_intervals):
        curr_interval = bad_fix_intervals[i,:]
        fix_bool[curr_interval[0]:curr_interval[1]+1] = False

    return (fix_bool,fix_intervals_clean)

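# Illustrative sketch (assumed usage): I-DT on a perfectly still synthetic
# gaze with no pre-labeled saccade samples; the whole recording should come
# back as a single fixation.
def _demo_fixation_idt():
    t = np.arange(0.0, 1.0, 0.01)
    gaze = np.tile(np.array([0.0, 0.0, 1.0]), (len(t), 1))
    sac_bool = np.zeros(len(t), dtype=bool)
    fix_bool, fix_intervals = fixation_classification_idt(gaze, t, sac_bool)
    print(fix_intervals)  # expect [[0 99]]
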
'''
Summary: Compute a continuous time series of feature values by averaging over events in the last N seconds.
Inputs: features: 2D array with rows corresponding to events, and columns corresponding to feature values
        fixation_times: 1D array containing the timestamps corresponding to each row of features
        time_stamps: 1D array representing every sample in the time series, containing times in seconds
        N: scalar indicating how many seconds into the past events are considered when computing average feature values
Output: 2D array, with rows corresponding to each sample in time_stamps and a column for each feature signal.
'''
def compute_feature_signal_time_avg(features,fixation_times,time_stamps,N):
    num_features = features.shape[1]
    num_rows = len(time_stamps)
    feature_signal = np.zeros((num_rows,num_features),dtype=float)

    # loop over each time stamp, grab the events that fall in [curr_time-N, curr_time], average, and add to the row
    for i in range(num_rows):
        curr_time = time_stamps[i]

        # get events up to N seconds prior to curr_time
        curr_events = features[(fixation_times>=curr_time-N) & (fixation_times<=curr_time),:]

        curr_row = np.zeros((1,num_features),dtype=float)
        if len(curr_events) > 0:
            curr_row = np.mean(curr_events,0)
        feature_signal[i,:] = curr_row

    return feature_signal

'''
Summary: Compute a continuous time series of feature values by averaging over the past N events.
Inputs: features: 2D array with rows corresponding to events, and columns corresponding to feature values
        fixation_times: 1D array containing the timestamps corresponding to each row of features
        time_stamps: 1D array representing every sample in the time series, containing times in seconds
        N: scalar indicating the number of past events to consider when computing average feature values
Output: 2D array, with rows corresponding to each sample in time_stamps and a column for each feature signal.
'''
def compute_feature_signal_event_avg(features,fixation_times,time_stamps,N):
    num_features = features.shape[1]
    num_rows = len(time_stamps)
    feature_signal = np.zeros((num_rows,num_features),dtype=float)

    # loop over each time stamp, grab the past N events, average, and add to the row
    for i in range(num_rows):
        curr_time = time_stamps[i]

        # get events prior to curr_time
        curr_events = features[(fixation_times<=curr_time),:]

        # grab the last N of them if possible
        prior_events = curr_events[-N:,:]

        curr_row = np.zeros((1,num_features),dtype=float)
        if len(prior_events) > 0:
            curr_row = np.mean(prior_events,0)
        feature_signal[i,:] = curr_row

    return feature_signal

# Wrapper function to dispatch to the feature signal methods.
def compute_feature_signal(method,features,event_times,time_stamps,N):
    if method == 'time_avg':
        return compute_feature_signal_time_avg(features,event_times,time_stamps,N)
    elif method == 'event_avg':
        return compute_feature_signal_event_avg(features,event_times,time_stamps,N)
    elif method == 'event_discrete':
        # note: compute_feature_signal_event_discrete is not defined in this file
        return compute_feature_signal_event_discrete(features,event_times,time_stamps,N)
    else:
        return None

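# Illustrative sketch (assumed usage): turn two fixation-level feature rows
# into a per-sample signal by averaging events from the last 1.0 s. All
# values here are synthetic.
def _demo_compute_feature_signal():
    features = np.array([[1.0, 10.0], [3.0, 30.0]])   # two events, two features
    event_times = np.array([0.2, 0.7])
    time_stamps = np.linspace(0.0, 1.0, 11)
    signal = compute_feature_signal('time_avg', features, event_times, time_stamps, 1.0)
    print(signal.shape)  # (11, 2); rows before 0.2 s stay zero
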
# Returns a three-column np array (x,y,z) containing head-corrected gaze vectors.
def head_correction(df,param):
    gaze_vectors = df[['combined.gazeDirection.x','combined.gazeDirection.y','combined.gazeDirection.z']].values

    num_rows = np.size(gaze_vectors,0)
    transformed_gaze_vectors = np.zeros(gaze_vectors.shape)

    if param.transform_type == 'coordinate_frame':
        # transform vectors
        head_pos = df[['head.pos.x','head.pos.y','head.pos.z']].values
        head_up = df[['head.up.x','head.up.y','head.up.z']].values
        head_right = df[['head.right.x','head.right.y','head.right.z']].values
        head_dir = df[['head.dir.x','head.dir.y','head.dir.z']].values

        for i in range(num_rows):
            # get current gaze vector and transforms
            gaze_vector = gaze_vectors[i,:]
            curr_head_up = head_up[i,:]
            curr_head_right = head_right[i,:]
            curr_head_dir = head_dir[i,:]

            '''Transformation matrix of the form:
               [ R.x, R.y, R.z, 0,
                 U.x, U.y, U.z, 0,
                 D.x, D.y, D.z, 0,
                 0,   0,   0,   1 ]
            '''
            # make transformation matrix
            transform_mat = np.vstack((np.append(curr_head_right,0),
                                       np.append(curr_head_up,0),
                                       np.append(curr_head_dir,0),
                                       np.array([0.0, 0.0, 0.0, 1.0])))

            # add homogeneous coordinate to vector
            gaze_vector = np.append(gaze_vector,1.0)

            # right-multiply row vector with transpose of the matrix as defined above
            transformed_gaze_vector = np.matmul(gaze_vector,transform_mat.T)

            # place transformed gaze_vector back into gaze_vectors w/o homogeneous coordinate
            transformed_gaze_vectors[i,:] = transformed_gaze_vector[0:3]

        # normalize all rows again to account for any rounding errors
        row_norm_vals = np.sqrt(np.square(transformed_gaze_vectors).sum(axis=1))
        transformed_gaze_vectors = transformed_gaze_vectors/row_norm_vals[:,None]

    elif param.transform_type == 'quaternions':
        q_x = df['RotationX'].values
        q_y = df['RotationY'].values
        q_z = df['RotationZ'].values
        q_w = df['RotationW'].values

        for i in range(num_rows):
            # get current gaze vector and transforms
            gaze_vector = gaze_vectors[i,:]

            curr_q_x = q_x[i]
            curr_q_y = q_y[i]
            curr_q_z = q_z[i]
            curr_q_w = q_w[i]

            # first column
            one_one = np.square(curr_q_w) + np.square(curr_q_x) - np.square(curr_q_y) - np.square(curr_q_z)
            two_one = 2 * (curr_q_w*curr_q_z + curr_q_x*curr_q_y)
            three_one = 2 * (curr_q_x*curr_q_z - curr_q_w*curr_q_y)

            # second column
            one_two = 2 * (curr_q_x*curr_q_y - curr_q_w*curr_q_z)
            two_two = np.square(curr_q_w) - np.square(curr_q_x) + np.square(curr_q_y) - np.square(curr_q_z)
            three_two = 2 * (curr_q_w*curr_q_x - curr_q_y*curr_q_z)

            # third column
            one_three = 2 * (curr_q_w*curr_q_y - curr_q_x*curr_q_z)
            two_three = 2 * (curr_q_y*curr_q_z - curr_q_w*curr_q_x)
            three_three = np.square(curr_q_w) - np.square(curr_q_x) - np.square(curr_q_y) + np.square(curr_q_z)

            # compute rotation matrix as per Diaz et al.
            rot_mat = np.array([[one_one, two_one, three_one],
                                [one_two, two_two, three_two],
                                [one_three, two_three, three_three]])

            transformed_gaze_vectors[i,:] = np.matmul(rot_mat,gaze_vector)

        # normalize all rows again to account for any rounding errors
        row_norm_vals = np.sqrt(np.square(transformed_gaze_vectors).sum(axis=1))
        transformed_gaze_vectors = transformed_gaze_vectors/row_norm_vals[:,None]

    return transformed_gaze_vectors

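# Illustrative sketch (assumed usage): run the quaternion branch with an
# identity rotation, which should return the input gaze vector unchanged.
# The Params class is a hypothetical stand-in for the project's parameter object.
def _demo_head_correction():
    class Params:
        transform_type = 'quaternions'
    df = pd.DataFrame({'combined.gazeDirection.x': [0.0], 'combined.gazeDirection.y': [0.0],
                       'combined.gazeDirection.z': [1.0], 'RotationX': [0.0], 'RotationY': [0.0],
                       'RotationZ': [0.0], 'RotationW': [1.0]})
    print(head_correction(df, Params()))  # expect [[0., 0., 1.]]
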
def window_signal_overlap(X, y, window_size=10):
    M, N = X.shape
    windows = np.array([])
    lbls = []

    FP_indxs = np.where(y==1)[0]
    TP_indxs = np.where(y==-1)[0]
    combined = np.hstack((FP_indxs, TP_indxs))

    for i in combined:
        window = np.array([])
        starts = range(max(0, i-window_size+1), i+1)
        ends = range(i+1, min(M, i+window_size))

        for (s, e) in zip(starts, ends):
            window = X[s:e]
            curr_size = window.shape[0]
            window = window.reshape(-1, curr_size*N)

            # window is clipped from front or back, add zero padding
            if curr_size < window_size:
                padding_size = window_size*N - curr_size*N
                padding = np.zeros(shape=(1,padding_size))
                if i-window_size < 0:
                    window = np.hstack((padding,window))
                else:
                    window = np.hstack((window, padding))

            if windows.shape[0] == 0:
                windows = window
            else:
                windows = np.vstack((windows, window))
            lbl = y[i]
            lbls.append(lbl)

    lbls = np.array(lbls, dtype=int).reshape(-1,1)
    return (windows, lbls)

def window_signal_center(X, y, window_size=10):
    M, N = X.shape
    windows = np.array([])
    lbls = []

    FP_indxs = np.where(y==1)[0]
    TP_indxs = np.where(y==-1)[0]
    combined = np.hstack((FP_indxs, TP_indxs))

    for i in combined:
        start = max(0, i-int(window_size/2))
        end = min(M, i+int(window_size/2))

        window = X[start:end, :]
        curr_size = window.shape[0]
        window = window.reshape(-1, curr_size*N)

        # window is clipped from front or back, add zero padding
        if curr_size < window_size:
            padding_size = window_size*N - curr_size*N
            padding = np.zeros(shape=(1,padding_size))
            if i-int(window_size/2) < 0:
                window = np.hstack((padding,window))
            else:
                window = np.hstack((window, padding))

        if windows.shape[0] == 0:
            windows = window
        else:
            windows = np.vstack((windows, window))
        lbl = y[i]
        lbls.append(lbl)

    lbls = np.array(lbls, dtype=int).reshape(-1,1)
    return (windows, lbls)

def window_signal_start(X, y, window_size=10):
    M, N = X.shape
    windows = np.array([])
    lbls = []

    FP_indxs = np.where(y==1)[0]
    TP_indxs = np.where(y==-1)[0]
    combined = np.hstack((FP_indxs, TP_indxs))

    for i in combined:
        start = i
        end = min(M, i+window_size)

        window = X[start:end, :]
        curr_size = window.shape[0]
        window = window.reshape(-1, curr_size*N)

        # window is clipped from the back, add zero padding
        if curr_size < window_size:
            padding_size = window_size*N - curr_size*N
            padding = np.zeros(shape=(1,padding_size))
            window = np.hstack((window, padding))

        if windows.shape[0] == 0:
            windows = window
        else:
            windows = np.vstack((windows, window))
        lbl = y[i]
        lbls.append(lbl)

    lbls = np.array(lbls, dtype=int).reshape(-1,1)
    return (windows, lbls)

def window_signal(X, y, mode='center', window_size=10):
    if mode == 'center':
        return window_signal_center(X, y, window_size=window_size)
    elif mode == 'start':
        return window_signal_start(X, y, window_size=window_size)
    elif mode == 'overlap':
        return window_signal_overlap(X, y, window_size=window_size)
    else:
        return None

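# Illustrative sketch (assumed usage): window a small labeled signal. Only
# rows labeled 1 or -1 produce windows; everything here is synthetic.
def _demo_window_signal():
    rng = np.random.default_rng(0)
    X = rng.standard_normal((50, 3))
    y = np.zeros((50, 1), dtype=int)
    y[10] = 1
    y[30] = -1
    windows, lbls = window_signal(X, y, mode='center', window_size=10)
    print(windows.shape, lbls.ravel())  # (2, 30): two windows of 10*3 features
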
def get_header(features, window_size):
    header = ""
    for i in range(window_size):
        h = ['%s_%d'%(f, i) for f in features]
        h = ",".join(h)
        if not len(header):
            header = h
            continue
        header = '%s,%s'%(header, h)
    header = '%s,%s'%(header, 'label')
    return header

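# Illustrative example: get_header(['vel', 'disp'], 2)
# returns 'vel_0,disp_0,vel_1,disp_1,label'.
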
def is_discrete_feat(feat):
    isdisc = False
    for f in discrete_feats:
        if f in feat:
            isdisc = True
            break
    return isdisc

def read_data(path):
    files = glob.glob(os.path.join(path, '*'))
    y = []
    X = []
    subjs = []
    discrete_cols = []
    cols = []

    for file in files:
        # subject ID is the filename up to the first '.' or '_'
        subj = file.split(os.sep)[-1]
        subj = subj.split('.')[0]
        subj = subj.split('_')[0]

        if subj in subjs_skip:
            continue

        data = pd.read_csv(file)

        if not len(discrete_cols):
            cols = data.columns
            discrete_cols = []
            for i in range(len(cols)):
                if is_discrete_feat(cols[i]):
                    discrete_cols.append(cols[i])

        if not len(y):
            y = data['label'].to_numpy().reshape(-1, 1)
        else:
            y = np.vstack((y, data['label'].to_numpy().reshape(-1,1)))

        curr_X = data[data.columns[data.columns != 'label']].to_numpy()

        if not len(X):
            X = curr_X
        else:
            X = np.vstack((X, curr_X))

        M = curr_X.shape[0]
        curr_subjs = np.repeat(subj, M).reshape(-1, 1)
        if not len(subjs):
            subjs = curr_subjs
        else:
            subjs = np.vstack((subjs, curr_subjs))

    return (X, y, subjs, discrete_cols, cols)

def read_feat_data(path):
    '''Reads feature data in directory `path` and stores it in a pandas DataFrame.'''
    files = glob.glob(os.path.join(path, '*'))

    combined = pd.DataFrame()
    for file in files:
        data = pd.read_csv(file, dtype={'selection_type': 'str'}, index_col=0)

        # filenames are expected to look like '<subject>_<block>.csv'
        filename = file.split(os.sep)[-1]
        filename = filename.split('.')[0]
        subj = filename.split('_')[0]
        data['sbj'] = subj
        block = filename.split('_')[1]
        data['block'] = block

        combined = pd.concat([combined, data])

    return combined

def read_window_data(main_dir, MODE, WINDOW_SIZE, data_type='train'):
    data_dir = os.path.join(main_dir, 'w_%s_%d'%(MODE, WINDOW_SIZE))
    data_dir = os.path.join(data_dir, data_type)
    X, y, subjs, discrete_cols, cols = read_data(data_dir)
    return X, y, subjs, discrete_cols, cols

def normalize_subj_data(X, y, subjs, exclude_idxs):
    X_out, y_out = [], []
    sbjs = np.unique(subjs)
    for sbj in sbjs:
        subj_idxs = np.where(subjs==sbj)[0]
        X_subj, y_subj = X[subj_idxs], y[subj_idxs]

        if X_subj.shape[0] < 1:
            continue

        # z-score all columns except the excluded (e.g. discrete) ones
        if len(exclude_idxs):
            mask = ~np.isin(np.arange(X.shape[1]), exclude_idxs)
            X_subj[:,mask] = scale(X_subj[:,mask])
        else:
            X_subj = scale(X_subj)

        if not len(X_out):
            X_out = X_subj
            y_out = y_subj
        else:
            X_out = np.vstack((X_out, X_subj))
            y_out = np.vstack((y_out, y_subj))
    return X_out, y_out

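# Illustrative sketch (synthetic data): per-subject z-scoring with no
# excluded columns; each subject block is scaled independently.
def _demo_normalize_subj_data():
    X = np.arange(12, dtype=float).reshape(6, 2)
    y = np.ones((6, 1), dtype=int)
    subjs = np.array(['s1']*3 + ['s2']*3).reshape(-1, 1)
    X_norm, y_norm = normalize_subj_data(X, y, subjs, [])
    print(X_norm.mean(axis=0))  # near zero within each subject block
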
def get_feat_indxs(columns, query_cols):
    indxs = []
    for i in range(len(columns)):
        # strip the trailing '_<window index>' suffix to recover the base feature name
        c = columns[i]
        c = c.split('_')
        c.pop()
        c = '_'.join(c)
        if c in query_cols:
            indxs.append(i)
    return indxs

def filter_features(featfilepath, X, headers, count_thresh=1):
    feat_counter = pd.read_csv(featfilepath)

    # keep only features selected more than count_thresh times
    imp_indxs = np.where(feat_counter['num']>count_thresh)[0]
    important_feats = feat_counter['feat'].values[imp_indxs]
    indxs = get_feat_indxs(headers, important_feats)
    X = X[:,indxs]
    return X