// 1. Import transformers.js
import { pipeline, env } from './transformers.min.js';

// 2. Configuration and constants
env.allowLocalModels = true;
const MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5';
const EMBEDDING_DIM = 256;
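// nomic-embed-text-v1.5 is a Matryoshka-style model: its full 768-dim output
// can be truncated, and this app keeps only the first 256 dimensions.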
const INDEX_PATH = 'data/quotes_index.bin';
const APPROX_MODEL_SIZE_MB = 180; // Approximate observed download size of the nomic-embed-text-v1.5 model files

const DB_NAME = 'QuoteSearchDB';
const DB_VERSION = 1;
const STORE_NAME = 'quoteIndex';

let model;       // transformers.js feature-extraction pipeline, created on demand
let indexData;   // { metadata, embeddings, embeddingsByteLength }, downloaded or read from cache
let isReady = false;

// IndexedDB Helper Functions
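// Minimal promise wrappers over the callback-based IndexedDB API; each helper
// opens its own short-lived connection and closes it once the request settles.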
function openDB() {
    return new Promise((resolve, reject) => {
        const request = indexedDB.open(DB_NAME, DB_VERSION);

        request.onupgradeneeded = (event) => {
            const db = event.target.result;
            db.createObjectStore(STORE_NAME, { keyPath: 'id' });
        };

        request.onsuccess = (event) => {
            resolve(event.target.result);
        };

        request.onerror = (event) => {
            reject('IndexedDB error: ' + (event.target.error ? event.target.error.message : 'unknown error'));
        };
    });
}

async function getFromDB(key) {
    const db = await openDB();
    return new Promise((resolve, reject) => {
        const transaction = db.transaction([STORE_NAME], 'readonly');
        const store = transaction.objectStore(STORE_NAME);
        const request = store.get(key);

        request.onsuccess = () => {
            db.close(); // release the connection so a later deleteDatabase is not blocked
            resolve(request.result ? request.result.value : null);
        };

        request.onerror = () => {
            db.close();
            reject('Error getting data from DB');
        };
    });
}

async function putInDB(key, value) {
    const db = await openDB();
    return new Promise((resolve, reject) => {
        const transaction = db.transaction([STORE_NAME], 'readwrite');
        const store = transaction.objectStore(STORE_NAME);
        const request = store.put({ id: key, value: value });

        request.onsuccess = () => {
            db.close();
            resolve();
        };

        request.onerror = () => {
            db.close();
            reject('Error putting data in DB');
        };
    });
}

async function deleteFromDB(key) {
    const db = await openDB();
    return new Promise((resolve, reject) => {
        const transaction = db.transaction([STORE_NAME], 'readwrite');
        const store = transaction.objectStore(STORE_NAME);
        const request = store.delete(key);

        request.onsuccess = () => {
            db.close();
            resolve();
        };

        request.onerror = () => {
            db.close();
            reject('Error deleting data from DB');
        };
    });
}

// 3. Load Model and Index
async function loadModel() {
    try {
        // Inform the UI that the model will be downloaded/loaded
        self.postMessage({ type: 'loading', payload: 'Downloading model (this may take a while)...' });

        // Load the model with a progress callback
        model = await pipeline('feature-extraction', MODEL_NAME, {
            progress_callback: (progress) => {
                // Make it explicit that this progress refers to the model download/load
                const detailMessage = `Model ${progress.status}: ${progress.file || ''} ${Math.floor(progress.progress || 0)}%`;
                self.postMessage({ type: 'progress', payload: { ...progress, detail: detailMessage } });
            }
        });
    } catch (error) {
        console.error('Error loading model:', error);
        self.postMessage({ type: 'error', payload: error.message });
        throw error; // Re-throw to prevent further execution if model fails to load
    }
}

async function loadIndex() {
    try {
        self.postMessage({ type: 'loading', payload: 'Checking for cached index file...' });
        const cachedIndex = await getFromDB('quoteIndexData');

        if (cachedIndex) {
            indexData = cachedIndex;
            self.postMessage({ type: 'loading', payload: 'Index loaded from cache.' });
        } else {
            // Inform UI that the index file will be downloaded
            self.postMessage({ type: 'loading', payload: 'Downloading index file (this may take a while)...' });

            // Fetch and parse the index file with progress reporting
            const response = await fetch(INDEX_PATH);
            if (!response.ok) {
                throw new Error(`Failed to fetch index: HTTP ${response.status}`);
            }
            const contentLength = response.headers.get('Content-Length');
            const total = contentLength ? parseInt(contentLength, 10) : 0; // header may be absent
            let loaded = 0;

            const reader = response.body.getReader();
            const chunks = [];

            while (true) {
                const { done, value } = await reader.read();
                if (done) {
                    break;
                }
                chunks.push(value);
                loaded += value.length;

                const progress = {
                    status: 'Downloading Index',
                    progress: total ? (loaded / total) * 100 : 0,
                    file: INDEX_PATH,
                    detail: total
                        ? `Downloading index file: ${Math.floor((loaded / total) * 100)}% (${(loaded / (1024 * 1024)).toFixed(2)}MB / ${(total / (1024 * 1024)).toFixed(2)}MB)`
                        : `Downloading index file: ${(loaded / (1024 * 1024)).toFixed(2)}MB so far`
                };
                self.postMessage({ type: 'progress', payload: progress });
            }

            const buffer = await new Response(new Blob(chunks)).arrayBuffer();

            // Parse the binary file
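            // Expected layout (mirroring whatever script built quotes_index.bin):
            //   [uint32 numQuotes][uint16 embeddingDim][float32 scale]
            //   [uint32 metadataSize][uint8 metadataFormat, newer files only]
            //   [metadata JSON bytes][int8 quantized embeddings]
            // Typed-array reads assume the platform's (little-endian) byte order.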
            let offset = 0;
            const numQuotes = new Uint32Array(buffer.slice(offset, offset + 4))[0];
            offset += 4;
            const embeddingDim = new Uint16Array(buffer.slice(offset, offset + 2))[0];
            offset += 2;
            const scale = new Float32Array(buffer.slice(offset, offset + 4))[0];
            offset += 4;
            const metadataSize = new Uint32Array(buffer.slice(offset, offset + 4))[0];
            offset += 4;

            // Read metadata format flag (1 byte) first, then metadata bytes
            let metadata_format = 0; // 0 = uncompressed JSON (legacy), 1 = gzip-compressed JSON
            if (offset + 1 <= buffer.byteLength) {
                metadata_format = new Uint8Array(buffer.slice(offset, offset + 1))[0];
                offset += 1;
            }

            let metadataBytes = buffer.slice(offset, offset + metadataSize);
            offset += metadataSize;

            async function decodeMetadata(bytes, format) {
                const decoder = new TextDecoder('utf-8');
                if (format === 0) {
                    return JSON.parse(decoder.decode(bytes));
                } else if (format === 1) {
                    // Try using native DecompressionStream if available
                    if (typeof DecompressionStream !== 'undefined') {
                        const ds = new DecompressionStream('gzip');
                        const decompressed = await new Response(new Blob([bytes]).stream().pipeThrough(ds)).arrayBuffer();
                        return JSON.parse(decoder.decode(decompressed));
                    } else {
                        // Fallback: DecompressionStream is not available. To support gzip in
                        // older browsers, add a browser-ready pako build to the repo and
                        // implement the decompression here. For now, fail with a clear error.
                        throw new Error('Gzip decompression not available: add DecompressionStream support or include pako in the worker.');
                    }
                } else {
                    throw new Error('Unknown metadata format: ' + format);
                }
            }

            const metadata = await decodeMetadata(metadataBytes, metadata_format);

            const quantizedEmbeddings = new Int8Array(buffer.slice(offset));

            // De-quantize embeddings (processing step) with progress reporting
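            // This assumes symmetric int8 quantization on the build side,
            // i.e. q = round(x * scale), so x ≈ q / scale recovers the floats.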
            const embeddings = new Float32Array(quantizedEmbeddings.length);
            const totalEmbeddings = quantizedEmbeddings.length;
            const updateInterval = Math.floor(totalEmbeddings / 100); // Update every 1%

            for (let i = 0; i < totalEmbeddings; i++) {
                embeddings[i] = quantizedEmbeddings[i] / scale;
                if (updateInterval > 0 && i % updateInterval === 0) {
                    // This is processing (de-quantization), not download
                    const progress = { 
                        status: 'Processing index (de-quantizing)',
                        progress: (i / totalEmbeddings) * 100,
                        file: INDEX_PATH,
                        detail: `Processing index: ${Math.floor((i / totalEmbeddings) * 100)}%`
                    };
                    self.postMessage({ type: 'progress', payload: progress });
                }
            }
            indexData = {
                metadata,
                embeddings: reshape(embeddings, [numQuotes, embeddingDim]),
                embeddingsByteLength: quantizedEmbeddings.byteLength // Store byteLength
            };
            await putInDB('quoteIndexData', indexData); // Store in IndexedDB
        }
    } catch (error) {
        console.error('Error loading index:', error);
        self.postMessage({ type: 'error', payload: error.message });
        throw error;
    }
}

async function load() {
    try {
        await loadIndex();
        isReady = true;
        self.postMessage({ type: 'ready' });
    } catch (error) {
        console.error('Error in worker load function:', error);
        self.postMessage({ type: 'error', payload: error.message });
    }
}

// 4. Listen for Messages
self.onmessage = async (event) => {
    const { type, payload } = event.data;

    if (type === 'search') {
        try {
            if (!indexData) {
                // Be explicit: the index may need to be downloaded before the first search
                self.postMessage({ type: 'loading', payload: 'Downloading index before running your search...' });
                await loadIndex();
            }
            if (!model) {
                self.postMessage({ type: 'loading', payload: 'Downloading model before running your search...' });
                await loadModel();
            }
            self.postMessage({ type: 'loading', payload: 'Searching...' });
            const results = await search(payload);
            self.postMessage({ type: 'results', payload: results });
        } catch (error) {
            // loadIndex/loadModel post an 'error' message before re-throwing;
            // catching here avoids an unhandled promise rejection in the worker.
            console.error('Search failed:', error);
        }
    } else if (type === 'deleteData') {
        self.postMessage({ type: 'loading', payload: 'Deleting cached data...' });
        // Run cleanup with a timeout so UI doesn't get stuck if some API call blocks or takes too long
        const cleanup = async () => {
            // 1) Delete the specific object from the DB if present
            try {
                await deleteFromDB('quoteIndexData');
            } catch (e) {
                // ignore individual delete error and proceed to full DB deletion
            }

            // 2) Delete the entire IndexedDB database to ensure all stored artifacts are removed
            await new Promise((resolve, reject) => {
                const deleteRequest = indexedDB.deleteDatabase(DB_NAME);
                deleteRequest.onsuccess = () => resolve();
                deleteRequest.onerror = () => reject(deleteRequest.error || new Error('Failed to delete IndexedDB'));
                deleteRequest.onblocked = () => {
                    // If deletion is blocked, still try to continue with cache cleanup
                    resolve();
                };
            });

            // 3) Clear Cache Storage entries that may contain model or transformers resources
            try {
                if (typeof caches !== 'undefined' && caches.keys) {
                    const cacheNames = await caches.keys();
                    for (const cacheName of cacheNames) {
                        // Delete transformers related caches and any caches that contain the model name
                        if (cacheName.startsWith('transformers-cache') || cacheName.includes(MODEL_NAME.replace(/\//g, '-')) || cacheName.includes('nomic')) {
                            await caches.delete(cacheName);
                        }
                    }
                }
            } catch (e) {
                // Non-fatal
                console.warn('Cache cleanup error', e);
            }

            // 4) Clear localStorage keys commonly used by transformers.js or model caches (best-effort)
            try {
                if (typeof localStorage !== 'undefined') {
                    const keysToClear = [];
                    for (let i = 0; i < localStorage.length; i++) {
                        const key = localStorage.key(i);
                        if (!key) continue;
                        if (key.startsWith('transformers') || key.includes('transformers') || key.includes('nomic') || key.includes('hf_')) {
                            keysToClear.push(key);
                        }
                    }
                    for (const k of keysToClear) localStorage.removeItem(k);
                }
            } catch (e) {
                // localStorage may not be available in worker-like contexts; ignore
            }

            // 5) Clear in-memory references
            indexData = null;
            model = null;
            isReady = false;
        };

        // Timeout in ms
        const TIMEOUT_MS = 8000;

        try {
            await Promise.race([
                cleanup(),
                new Promise((_, reject) => setTimeout(() => reject(new Error('cleanup-timeout')), TIMEOUT_MS))
            ]);
            // If cleanup completed within timeout
            self.postMessage({ type: 'dataDeleted', payload: 'Cached data deleted successfully.' });
        } catch (error) {
            if (error && error.message === 'cleanup-timeout') {
                console.warn('deleteData: cleanup timed out');
                // Post a success-like message so UI doesn't stay stuck; note that some cleanup may still be pending
                self.postMessage({ type: 'dataDeleted', payload: 'Cached data deletion attempted (timed out). Some cleanup may remain.' });
            } else {
                console.error('deleteData error:', error);
                self.postMessage({ type: 'error', payload: 'Failed to delete cached data: ' + (error && error.message ? error.message : String(error)) });
            }
        }
    } else if (type === 'getIndexSize') {
        try {
            let totalSize = 0;
            let indexCached = false;
            let modelCached = false;

            // Check if index is cached
            const cachedIndex = await getFromDB('quoteIndexData');
            if (cachedIndex) {
                totalSize += JSON.stringify(cachedIndex.metadata).length + cachedIndex.embeddingsByteLength;
                indexCached = true;
            } else {
                const response = await fetch(INDEX_PATH, { method: 'HEAD' });
                const contentLength = response.headers.get('Content-Length');
                totalSize += contentLength ? parseInt(contentLength, 10) : 0; // header may be absent
            }

            // Add approximate model size
            totalSize += APPROX_MODEL_SIZE_MB * 1024 * 1024; // Convert MB to bytes
            // Heuristic: assume model is cached if `model` object is initialized
            if (model) {
                modelCached = true;
            }

            self.postMessage({ type: 'indexSize', payload: { size: totalSize, indexCached: indexCached, modelCached: modelCached } });
        } catch (error) {
            self.postMessage({ type: 'error', payload: 'Failed to get index size: ' + error.message });
        }
    }
};

// 5. Search Function
async function search(query) {
    if (!model || !indexData) {
        return [];
    }

    // Generate query embedding
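    // nomic embedding models expect a task prefix ("search_query: " here) on the
    // input text. Truncating to EMBEDDING_DIM breaks the unit norm, but
    // cosineSimilarity below re-normalizes, so no explicit re-normalization is needed.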
    const queryEmbedding = await model("search_query: " + query, { pooling: 'mean', normalize: true });
    const truncatedQueryEmbedding = queryEmbedding.data.slice(0, EMBEDDING_DIM);

    // Calculate cosine similarities
    const similarities = [];
    for (let i = 0; i < indexData.embeddings.length; i++) {
        const similarity = cosineSimilarity(truncatedQueryEmbedding, indexData.embeddings[i]);
        similarities.push({ index: i, similarity });
    }

    // Sort by similarity
    similarities.sort((a, b) => b.similarity - a.similarity);

    // Get top 30 results
    const topResults = similarities.slice(0, 30).map(item => {
        return indexData.metadata[item.index];
    });

    return topResults;
}

// 6. Helper Functions
function cosineSimilarity(vecA, vecB) {
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < vecA.length; i++) {
        dotProduct += vecA[i] * vecB[i];
        normA += vecA[i] * vecA[i];
        normB += vecB[i] * vecB[i];
    }
    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}

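// Split a flat embedding array into shape[0] rows of shape[1] values each.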
function reshape(array, shape) {
    const reshaped = [];
    let offset = 0;
    for (let i = 0; i < shape[0]; i++) {
        reshaped.push(array.slice(offset, offset + shape[1]));
        offset += shape[1];
    }
    return reshaped;
}
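
// Example main-thread usage (a sketch for reference only; the worker filename
// and the UI handlers are assumptions, not part of this file):
//
//   const worker = new Worker('worker.js', { type: 'module' });
//   worker.onmessage = ({ data }) => {
//       switch (data.type) {
//           case 'loading':  /* show data.payload as a status line */ break;
//           case 'progress': /* render data.payload.detail in a progress bar */ break;
//           case 'results':  /* display data.payload, an array of quote metadata */ break;
//           case 'error':    /* surface data.payload to the user */ break;
//       }
//   };
//   worker.postMessage({ type: 'search', payload: 'perseverance' });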