Spaces:

sofieff
/

NeuroMusicLab

Sleeping

App Files Files Community

NeuroMusicLab / data_processor.py

sofieff

ready for deploy

66947ed 3 months ago

raw

history blame contribute delete

9.13 kB

	"""
	EEG Data Processing Module
	-------------------------
	Handles EEG data loading, preprocessing, and epoching for real-time classification.
	Adapted from the original eeg_motor_imagery.py script.
	"""

	import scipy.io
	import numpy as np
	import mne
	import pandas as pd
	from typing import List, Tuple

	class EEGDataProcessor:
	    """
	    Processes EEG data from .mat files for motor imagery classification.

	    Instance state:
	        fs: Sampling frequency of the most recently built Raw object
	            (set by ``create_raw_object``; ``None`` until then).
	        ch_names: Channel names of the most recently built Raw object
	            (set by ``create_raw_object``; ``None`` until then).
	        event_id: Mapping from class name to the integer marker code that
	            appears in the label vector of a recording.
	    """

	    # Auxiliary channels removed from every recording before analysis.
	    _AUX_CHANNELS = ('X3', 'X5')

	    def __init__(self):
	        self.fs = None
	        self.ch_names = None
	        self.event_id = {
	            "left_hand": 1,
	            "right_hand": 2,
	            "neutral": 3,
	            "left_leg": 4,
	            "tongue": 5,
	            "right_leg": 6,
	        }
	        # Cursor used by simulate_real_time_data(mode="sequential").
	        self._sequential_idx = 0

	    def load_mat_file(self, file_path: str) -> Tuple[np.ndarray, np.ndarray, List[str], int]:
	        """Load and parse a single .mat EEG file.

	        The file must contain a struct named ``o`` whose fields are read by
	        position: index 2 is the sampling frequency, 4 the label vector,
	        5 the signal matrix and 6 the channel names.

	        Args:
	            file_path: Path of the .mat file.

	        Returns:
	            signals, labels (flattened), channel names, sampling frequency.
	        """
	        mat = scipy.io.loadmat(file_path)
	        content = mat['o'][0, 0]

	        labels = content[4].flatten()
	        signals = content[5]
	        chan_names_raw = content[6]
	        # Each entry is a nested MATLAB cell; unwrap it to the bare name.
	        channels = [ch[0][0] for ch in chan_names_raw]
	        fs = int(content[2][0, 0])

	        return signals, labels, channels, fs

	    def create_raw_object(self, signals: np.ndarray, channels: List[str], fs: int,
	                          drop_ground_electrodes: bool = True) -> "mne.io.RawArray":
	        """Create MNE Raw object from signal data.

	        Also records the resulting channel names and sampling frequency on
	        ``self.ch_names`` / ``self.fs``.

	        Args:
	            signals: Signal matrix with one column per entry of ``channels``.
	            channels: Channel names, in column order.
	            fs: Sampling frequency.
	            drop_ground_electrodes: When true, remove the auxiliary channels
	                'X3' and 'X5' (if present) before building the Raw object.

	        Returns:
	            The Raw object, with every channel typed as "eeg".
	        """
	        df = pd.DataFrame(signals, columns=channels)

	        if drop_ground_electrodes:
	            # Drop auxiliary channels that should be excluded
	            columns_to_drop = [ch for ch in channels if ch in self._AUX_CHANNELS]

	            df = df.drop(columns=columns_to_drop, errors="ignore")
	            print(f"Dropped auxiliary channels {columns_to_drop}. Remaining channels: {len(df.columns)}")

	        # MNE expects [n_channels, n_timepoints].
	        eeg = df.values.T
	        ch_names = df.columns.tolist()

	        self.ch_names = ch_names
	        self.fs = fs

	        info = mne.create_info(ch_names=ch_names, sfreq=fs, ch_types="eeg")
	        raw = mne.io.RawArray(eeg, info)

	        return raw

	    def extract_events(self, labels: np.ndarray) -> np.ndarray:
	        """Extract events from label array.

	        An event is reported at every sample where the label changes from
	        zero to a non-zero code. A code that is already active at sample 0
	        has no such transition and is therefore not reported.

	        Args:
	            labels: 1-D array of per-sample marker codes (0 = no marker).

	        Returns:
	            Integer array with columns (onset sample, 0, event code),
	            restricted to the codes listed in ``self.event_id``.
	        """
	        onsets = np.where((labels[1:] != 0) & (labels[:-1] == 0))[0] + 1
	        event_codes = labels[onsets].astype(int)
	        events = np.c_[onsets, np.zeros_like(onsets), event_codes]

	        # Keep only relevant events
	        valid_codes = list(self.event_id.values())
	        mask = np.isin(events[:, 2], valid_codes)

	        return events[mask]

	    def create_epochs(self, raw: "mne.io.RawArray", events: np.ndarray,
	                      tmin: float = 0, tmax: float = 1.5, event_id=None) -> "mne.Epochs":
	        """Create epochs from raw data and events.

	        Args:
	            raw: Continuous recording.
	            events: Event array as produced by ``extract_events``.
	            tmin: Epoch start relative to the event.
	            tmax: Epoch end relative to the event.
	            event_id: Name-to-code mapping; defaults to ``self.event_id``.

	        Returns:
	            Preloaded epochs without baseline correction.
	        """
	        if event_id is None:
	            event_id = self.event_id
	        return mne.Epochs(
	            raw,
	            events=events,
	            event_id=event_id,
	            tmin=tmin,
	            tmax=tmax,
	            baseline=None,
	            preload=True,
	        )

	    def _load_raw(self, file_path: str) -> Tuple["mne.io.RawArray", np.ndarray, int]:
	        """Load one file and return (Raw without auxiliary channels, labels, fs)."""
	        signals, labels, channels, fs = self.load_mat_file(file_path)
	        raw = self.create_raw_object(signals, channels, fs, drop_ground_electrodes=True)
	        return raw, labels, fs

	    def process_files(self, file_paths: List[str]) -> Tuple[np.ndarray, np.ndarray, List[str]]:
	        """Process multiple EEG files and return combined data.

	        Only events with codes 1, 2, 4 and 6 are epoched.

	        Args:
	            file_paths: List of .mat file paths (at least one).

	        Returns:
	            X: float32 epoch data.
	            y: int64 labels equal to the event code minus one, i.e. values
	                from {0, 1, 3, 5}.
	            ch_names: Channel names matching the channel axis of ``X``
	                (auxiliary channels already removed).

	        Raises:
	            ValueError: If ``file_paths`` is empty.
	        """
	        file_paths = list(file_paths)
	        if not file_paths:
	            raise ValueError("file_paths must contain at least one .mat file")

	        allowed_labels = {1, 2, 4, 6}
	        allowed_event_id = {k: v for k, v in self.event_id.items() if v in allowed_labels}

	        all_epochs = []
	        for file_path in file_paths:
	            raw, labels, _ = self._load_raw(file_path)
	            events = self.extract_events(labels)
	            # only keep allowed labels
	            events = events[np.isin(events[:, -1], list(allowed_labels))]
	            # create epochs only for allowed labels
	            all_epochs.append(self.create_epochs(raw, events, event_id=allowed_event_id))

	        if len(all_epochs) > 1:
	            epochs_combined = mne.concatenate_epochs(all_epochs)
	        else:
	            epochs_combined = all_epochs[0]

	        # Convert to arrays for model input
	        X = epochs_combined.get_data().astype("float32")
	        y = (epochs_combined.events[:, -1] - 1).astype("int64")
	        # Take the names from the epochs themselves: the list returned by
	        # load_mat_file still contains the dropped auxiliary channels.
	        ch_names = list(epochs_combined.ch_names)
	        return X, y, ch_names

	    def load_continuous_data(self, file_paths: List[str]) -> Tuple[np.ndarray, int]:
	        """
	        Load continuous raw EEG data without epoching.

	        Args:
	            file_paths: List of .mat file paths (at least one)

	        Returns:
	            raw_data: Continuous EEG data [n_channels, n_timepoints], all
	                files concatenated along the time axis
	            fs: Sampling frequency of the last file; the files are not
	                checked for a common sampling frequency

	        Raises:
	            ValueError: If ``file_paths`` is empty.
	        """
	        file_paths = list(file_paths)
	        if not file_paths:
	            raise ValueError("file_paths must contain at least one .mat file")

	        all_raw_data = []
	        fs = None
	        for file_path in file_paths:
	            raw, _, fs = self._load_raw(file_path)
	            # Extract continuous data (no epoching)
	            all_raw_data.append(raw.get_data())  # [n_channels, n_timepoints]

	        # Concatenate all continuous data along time axis
	        combined_raw = np.concatenate(all_raw_data, axis=1)

	        return combined_raw, fs

	    def prepare_loso_split(self, file_paths: List[str], test_session_idx: int = 0) -> Tuple:
	        """
	        Prepare Leave-One-Session-Out (LOSO) split for EEG data.

	        Args:
	            file_paths: List of .mat file paths (one per session, at least two)
	            test_session_idx: Index of the session to use for testing;
	                negative values count from the end

	        Returns:
	            X_train, y_train, X_test, y_test, session_info

	        Raises:
	            ValueError: If fewer than two files are given.
	            IndexError: If ``test_session_idx`` is out of range.
	        """
	        file_paths = list(file_paths)
	        n_sessions = len(file_paths)
	        if n_sessions < 2:
	            raise ValueError("LOSO split needs at least two files (one test, one or more train)")
	        if not -n_sessions <= test_session_idx < n_sessions:
	            raise IndexError(
	                f"test_session_idx {test_session_idx} is out of range for {n_sessions} files"
	            )
	        # Normalise negative indices so the held-out session is really
	        # excluded from the training set below.
	        test_idx = test_session_idx % n_sessions

	        all_sessions_data = []
	        session_info = []

	        # Load each session separately
	        for i, file_path in enumerate(file_paths):
	            raw, labels, fs = self._load_raw(file_path)
	            events = self.extract_events(labels)
	            epochs = self.create_epochs(raw, events)

	            # Convert to arrays
	            X_subject = epochs.get_data().astype("float32")
	            y_subject = (epochs.events[:, -1] - 1).astype("int64")
	            all_sessions_data.append((X_subject, y_subject))
	            session_info.append({
	                'file_path': file_path,
	                'subject_id': f"Subject_{i+1}",
	                'n_epochs': len(X_subject),
	                # Channels actually present in the data (auxiliary ones removed).
	                'channels': list(raw.ch_names),
	                'fs': fs
	            })

	        # LOSO split: one session for test, others for train
	        X_test, y_test = all_sessions_data[test_idx]
	        train_sessions = [sess for i, sess in enumerate(all_sessions_data) if i != test_idx]

	        # Combine training sessions
	        X_train = np.concatenate([sess[0] for sess in train_sessions], axis=0)
	        y_train = np.concatenate([sess[1] for sess in train_sessions], axis=0)

	        print("LOSO Split:")
	        print(f" Test Subject: {session_info[test_idx]['subject_id']} ({len(X_test)} epochs)")
	        print(f" Train Subjects: {len(train_sessions)} subjects ({len(X_train)} epochs)")

	        return X_train, y_train, X_test, y_test, session_info

	    def simulate_real_time_data(self, X: np.ndarray, y: np.ndarray, mode: str = "random") -> Tuple[np.ndarray, int]:
	        """
	        Simulate real-time EEG data for demo purposes.

	        Args:
	            X: EEG data array (currently epoched data)
	            y: Labels array
	            mode: "random", "sequential", or "class_balanced"; any other
	                value falls back to "random"

	        Returns:
	            Single epoch and its true label

	        Raises:
	            ValueError: If ``X`` contains no epochs.
	        """
	        n_epochs = len(X)
	        if n_epochs == 0:
	            raise ValueError("X must contain at least one epoch")

	        if mode == "sequential":
	            # Walk through the epochs in order, wrapping around at the end.
	            # The cursor lives on the instance so it survives between calls.
	            idx = getattr(self, "_sequential_idx", 0) % n_epochs
	            self._sequential_idx = idx + 1
	        elif mode == "class_balanced":
	            # Pick a class uniformly first, then an epoch of that class, so
	            # rare classes are drawn as often as frequent ones.
	            available_classes = np.unique(y)
	            target_class = np.random.choice(available_classes)
	            class_indices = np.where(y == target_class)[0]
	            idx = np.random.choice(class_indices)
	        else:
	            # "random" and any unrecognised mode.
	            idx = np.random.randint(0, n_epochs)

	        return X[idx], y[idx]