Upload run_incrementalpca.py
AlternativeModels/IncrementalPCA/run_incrementalpca.py
ADDED
@@ -0,0 +1,97 @@
+# https://stackoverflow.com/questions/31428581/incremental-pca-on-big-data
+import h5py
+import numpy as np
+from sklearn.decomposition import IncrementalPCA
+from scipy.io import loadmat
+import pickle
+
+inverse_transformation_path = "./mask/"
+filestr = "washu120_subsample1_randperm0_timeseries"
+n_samples = 27255  # number of time points to use; set to None to use all the data
+# Construct the path to the left inverse transformation matrix file and load it
+left_file_path = inverse_transformation_path + "Left_fMRI2Grid_192_by_192_NN.mat"
+left_data = loadmat(left_file_path)
+Left_inverse_transformation = left_data["inverse_transformation"]
+
+# Construct the path to the right inverse transformation matrix file and load it
+right_file_path = inverse_transformation_path + "Right_fMRI2Grid_192_by_192_NN.mat"
+right_data = loadmat(right_file_path)
+Right_inverse_transformation = right_data["inverse_transformation"]
+
+# h5filename = '/mnt/leuthardte/Active/Cindy/Data/BCP_Jan2023_10moTo20mogroupavg.h5'
+# h5filename = '/mnt/leuthardte/Active/Cindy/Data/120_allsubs_corr.dconn.h5'
+h5filename = (
+    "/mnt/leuthardte/Active/Cindy/Data/" + filestr + ".h5"
+)
+
+# some parameters
+batch_size = 5451  # Adjust based on your system's memory capacity
+
+n_components = 100
+ipca = IncrementalPCA(n_components=n_components)
+
+if not n_samples:
+    # Get the sample size from the HDF5 file
+    with h5py.File(h5filename, "r") as file:
+        n_samples = file["/LeftData"].shape[0]
+        print(n_samples)
+
+# abort if the sample size is not divisible by the batch size
+print(n_samples)
+assert n_samples % batch_size == 0, "batch size not a factor of sample size"
+
+for start_idx in range(0, n_samples, batch_size):
+    end_idx = start_idx + batch_size
+
+    print(start_idx)
+
+    # Load one batch of data from the HDF5 file
+    with h5py.File(h5filename, "r") as file:
+        reconL = file["/LeftData"][start_idx:end_idx, :, :, :]
+        reconR = file["/RightData"][start_idx:end_idx, :, :, :]
+
+    # Perform matrix and tensor manipulations
+    # Transpose and reshape 'reconL' and 'reconR' as in the MATLAB permute and reshape (the transpose here is the identity permutation)
+    corticalrecon_L = (
+        Left_inverse_transformation
+        @ reconL.transpose(0, 1, 2, 3).reshape(batch_size, -1).T
+    )
+    corticalrecon_R = (
+        Right_inverse_transformation
+        @ reconR.transpose(0, 1, 2, 3).reshape(batch_size, -1).T
+    )
+
+    # Concatenate the left and right reconstructions
+    recon_dtseries = np.vstack((corticalrecon_L, corticalrecon_R))
+    # print(recon_dtseries.shape)
+
+    # recon_dtseries[recon_dtseries == 0] = 1  # 2024/11/18: commented out for the timeseries data
+    # 2024/06/08: make the diagonals equal to 1 instead of 0
+
+    ipca.partial_fit(recon_dtseries.T)
+
+principal_components = ipca.components_
+
+# Save the trained model
+with open(
+    "./IncrementalPCA/pca_model_"
+    + filestr
+    + "_zdim"
+    + str(n_components)
+    + ".pkl",
+    "wb",
+) as f:
+    pickle.dump(ipca, f)
+
+print(ipca.mean_)
+
+# np.savetxt(
+#     "./IncrementalPCA/principal_components_washu120_subsample10_randperm1_train100_zdim"
+#     + str(n_components)
+#     + ".txt",
+#     principal_components,
+#     fmt="%f",
+# )
+
+# obtain the latent representations with the loaded components
+# z = principal_components @ recon_dtseries