// 1. Import transformers.js
import { pipeline, env } from './transformers.min.js';

// 2. Constants
env.allowLocalModels = true;
const MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5';
const EMBEDDING_DIM = 256;
const INDEX_PATH = 'data/quotes_index.bin';
const APPROX_MODEL_SIZE_MB = 180; // Actual observed size of nomic-embed-text-v1.5 model
const DB_NAME = 'QuoteSearchDB';
const DB_VERSION = 1;
const STORE_NAME = 'quoteIndex';

let model;
let indexData;
let isReady = false;

// IndexedDB Helper Functions
function openDB() {
  return new Promise((resolve, reject) => {
    const request = indexedDB.open(DB_NAME, DB_VERSION);
    request.onupgradeneeded = (event) => {
      const db = event.target.result;
      db.createObjectStore(STORE_NAME, { keyPath: 'id' });
    };
    request.onsuccess = (event) => {
      resolve(event.target.result);
    };
    request.onerror = (event) => {
      reject('IndexedDB error: ' + event.target.error);
    };
  });
}

async function getFromDB(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction([STORE_NAME], 'readonly');
    const store = transaction.objectStore(STORE_NAME);
    const request = store.get(key);
    request.onsuccess = () => {
      resolve(request.result ? request.result.value : null);
    };
    request.onerror = () => {
      reject('Error getting data from DB');
    };
  });
}

async function putInDB(key, value) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction([STORE_NAME], 'readwrite');
    const store = transaction.objectStore(STORE_NAME);
    const request = store.put({ id: key, value: value });
    request.onsuccess = () => {
      resolve();
    };
    request.onerror = () => {
      reject('Error putting data in DB');
    };
  });
}

async function deleteFromDB(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction([STORE_NAME], 'readwrite');
    const store = transaction.objectStore(STORE_NAME);
    const request = store.delete(key);
    request.onsuccess = () => {
      resolve();
    };
    request.onerror = () => {
      reject('Error deleting data from DB');
    };
  });
}

// 3. Load Model and Index
async function loadModel() {
  try {
    // Inform the UI that the model will be downloaded/loaded
    self.postMessage({ type: 'loading', payload: 'Downloading model (this may take a while)...' });

    // Load the model with a progress callback
    model = await pipeline('feature-extraction', MODEL_NAME, {
      progress_callback: (progress) => {
        // Make it explicit that this progress refers to model download or model file operations
        const detailMessage = `Downloading model ${progress.status}: ${progress.file || ''} ${Math.floor(progress.progress || 0)}%`;
        self.postMessage({ type: 'progress', payload: { ...progress, detail: detailMessage } });
      }
    });
  } catch (error) {
    console.error('Error loading model:', error);
    self.postMessage({ type: 'error', payload: error.message });
    throw error; // Re-throw to prevent further execution if model fails to load
  }
}
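// For reference, the on-disk layout that loadIndex() below expects, reconstructed
// from the parsing code (values are read via typed-array views, i.e. in platform
// byte order, which is little-endian in practice; the writer script is not part
// of this file):
//
//   uint32   numQuotes
//   uint16   embeddingDim
//   float32  scale            de-quantization divisor
//   uint32   metadataSize     byte length of the metadata blob
//   uint8    metadataFormat   0 = plain JSON, 1 = gzip-compressed JSON
//   bytes    metadata         metadataSize bytes of UTF-8 JSON
//   int8[]   embeddings       numQuotes * embeddingDim quantized values (rest of file)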
async function loadIndex() {
  try {
    self.postMessage({ type: 'loading', payload: 'Checking for cached index file...' });
    const cachedIndex = await getFromDB('quoteIndexData');
    if (cachedIndex) {
      indexData = cachedIndex;
      self.postMessage({ type: 'loading', payload: 'Index loaded from cache.' });
    } else {
      // Inform UI that the index file will be downloaded
      self.postMessage({ type: 'loading', payload: 'Downloading index file (this may take a while)...' });

      // Fetch and parse the index file with progress reporting
      const response = await fetch(INDEX_PATH);
      if (!response.ok) {
        throw new Error(`Failed to fetch index: ${response.status} ${response.statusText}`);
      }
      const contentLength = response.headers.get('Content-Length');
      const total = contentLength ? parseInt(contentLength, 10) : 0; // 0 = unknown size
      let loaded = 0;
      const reader = response.body.getReader();
      const chunks = [];
      while (true) {
        const { done, value } = await reader.read();
        if (done) {
          break;
        }
        chunks.push(value);
        loaded += value.length;
        const percent = total ? (loaded / total) * 100 : 0;
        const progress = {
          status: 'Downloading Index',
          progress: percent,
          file: INDEX_PATH,
          detail: `Downloading index file: ${Math.floor(percent)}% (${(loaded / (1024 * 1024)).toFixed(2)}MB / ${(total / (1024 * 1024)).toFixed(2)}MB)`
        };
        self.postMessage({ type: 'progress', payload: progress });
      }
      const buffer = await new Response(new Blob(chunks)).arrayBuffer();

      // Parse the binary file (see the layout comment above loadIndex)
      let offset = 0;
      const numQuotes = new Uint32Array(buffer.slice(offset, offset + 4))[0];
      offset += 4;
      const embeddingDim = new Uint16Array(buffer.slice(offset, offset + 2))[0];
      offset += 2;
      const scale = new Float32Array(buffer.slice(offset, offset + 4))[0];
      offset += 4;
      const metadataSize = new Uint32Array(buffer.slice(offset, offset + 4))[0];
      offset += 4;

      // Read metadata format flag (1 byte) first, then metadata bytes
      let metadata_format = 0; // 0 = uncompressed JSON (legacy), 1 = gzip-compressed JSON
      if (offset + 1 <= buffer.byteLength) {
        metadata_format = new Uint8Array(buffer.slice(offset, offset + 1))[0];
        offset += 1;
      }
      const metadataBytes = buffer.slice(offset, offset + metadataSize);
      offset += metadataSize;

      async function decodeMetadata(bytes, format) {
        const decoder = new TextDecoder('utf-8');
        if (format === 0) {
          return JSON.parse(decoder.decode(bytes));
        } else if (format === 1) {
          // Try using native DecompressionStream if available
          if (typeof DecompressionStream !== 'undefined') {
            const ds = new DecompressionStream('gzip');
            const decompressed = await new Response(new Blob([bytes]).stream().pipeThrough(ds)).arrayBuffer();
            return JSON.parse(decoder.decode(decompressed));
          } else {
            // Fallback: DecompressionStream is not available. To support gzip in
            // older browsers, add a browser-ready pako build to the repo and
            // implement the decompression here. For now, fail with a clear error.
            throw new Error('Gzip decompression not available: add DecompressionStream support or include pako in the worker.');
          }
        } else {
          throw new Error('Unknown metadata format: ' + format);
        }
      }

      const metadata = await decodeMetadata(metadataBytes, metadata_format);
      const quantizedEmbeddings = new Int8Array(buffer.slice(offset));

      // De-quantize embeddings (processing step) with progress reporting
      const embeddings = new Float32Array(quantizedEmbeddings.length);
      const totalEmbeddings = quantizedEmbeddings.length;
      const updateInterval = Math.floor(totalEmbeddings / 100); // Update every 1%
      for (let i = 0; i < totalEmbeddings; i++) {
        embeddings[i] = quantizedEmbeddings[i] / scale;
        if (updateInterval > 0 && i % updateInterval === 0) {
          // This is processing (de-quantization), not download
          const progress = {
            status: 'Processing index (de-quantizing)',
            progress: (i / totalEmbeddings) * 100,
            file: INDEX_PATH,
            detail: `Processing index: ${Math.floor((i / totalEmbeddings) * 100)}%`
          };
          self.postMessage({ type: 'progress', payload: progress });
        }
      }

      indexData = {
        metadata,
        embeddings: reshape(embeddings, [numQuotes, embeddingDim]),
        embeddingsByteLength: quantizedEmbeddings.byteLength // Store byteLength
      };
      await putInDB('quoteIndexData', indexData); // Store in IndexedDB
    }
  } catch (error) {
    console.error('Error loading index:', error);
    self.postMessage({ type: 'error', payload: error.message });
    throw error;
  }
}
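// For context, a minimal sketch of the offline quantization that loadIndex()
// inverts. This is hypothetical -- the real index is produced by a separate
// build script that is not part of this file -- but given embeddings[i] =
// quantized[i] / scale above, the writer plausibly did the reverse:
//
//   function quantize(float32Values) {
//     const maxAbs = float32Values.reduce((m, v) => Math.max(m, Math.abs(v)), 0);
//     const scale = 127 / maxAbs; // map the largest magnitude onto the int8 range
//     const q = new Int8Array(float32Values.length);
//     for (let i = 0; i < q.length; i++) q[i] = Math.round(float32Values[i] * scale);
//     return { q, scale };
//   }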
async function load() {
  try {
    await loadIndex();
    isReady = true;
    self.postMessage({ type: 'ready' });
  } catch (error) {
    console.error('Error in worker load function:', error);
    self.postMessage({ type: 'error', payload: error.message });
  }
}
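// Example wiring from the main thread -- a sketch only, assuming this file is
// served as './worker.js' and loaded as a module worker; updateProgressBar,
// renderQuotes, and showError are hypothetical UI helpers:
//
//   const worker = new Worker('./worker.js', { type: 'module' });
//   worker.onmessage = ({ data }) => {
//     switch (data.type) {
//       case 'loading':
//       case 'progress':    updateProgressBar(data.payload); break;
//       case 'ready':       updateProgressBar('Ready.'); break;
//       case 'results':     renderQuotes(data.payload); break;
//       case 'dataDeleted': updateProgressBar(data.payload); break;
//       case 'error':       showError(data.payload); break;
//     }
//   };
//   worker.postMessage({ type: 'load' });                            // warm the index
//   worker.postMessage({ type: 'search', payload: 'perseverance' }); // run a query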
// 4. Listen for Messages
self.onmessage = async (event) => {
  const { type, payload } = event.data;
  if (type === 'load') {
    // Explicit pre-load request from the UI; without this branch, load() (and
    // its 'ready' message) would never run, since nothing else calls it.
    await load();
  } else if (type === 'search') {
    try {
      if (!indexData) {
        // Be explicit: index may be downloaded before search
        self.postMessage({ type: 'loading', payload: 'Downloading index before running your search...' });
        await loadIndex();
      }
      if (!model) {
        self.postMessage({ type: 'loading', payload: 'Downloading model before running your search...' });
        await loadModel();
      }
      self.postMessage({ type: 'loading', payload: 'Searching...' });
      const results = await search(payload);
      self.postMessage({ type: 'results', payload: results });
    } catch (error) {
      // loadIndex/loadModel already post an 'error' message; catching here just
      // prevents an unhandled rejection inside the message handler.
      console.error('search error:', error);
    }
  } else if (type === 'deleteData') {
    self.postMessage({ type: 'loading', payload: 'Deleting cached data...' });

    // Run cleanup with a timeout so the UI doesn't get stuck if some API call blocks or takes too long
    const cleanup = async () => {
      // 1) Delete the specific object from the DB if present
      try {
        await deleteFromDB('quoteIndexData');
      } catch (e) {
        // Ignore individual delete error and proceed to full DB deletion
      }

      // 2) Delete the entire IndexedDB database to ensure all stored artifacts are removed
      await new Promise((resolve, reject) => {
        const deleteRequest = indexedDB.deleteDatabase(DB_NAME);
        deleteRequest.onsuccess = () => resolve();
        deleteRequest.onerror = () => reject(deleteRequest.error || new Error('Failed to delete IndexedDB'));
        deleteRequest.onblocked = () => {
          // If deletion is blocked, still try to continue with cache cleanup
          resolve();
        };
      });

      // 3) Clear Cache Storage entries that may contain model or transformers resources
      try {
        if (typeof caches !== 'undefined' && caches.keys) {
          const cacheNames = await caches.keys();
          for (const cacheName of cacheNames) {
            // Delete transformers-related caches and any caches that contain the model name
            if (cacheName.startsWith('transformers-cache') ||
                cacheName.includes(MODEL_NAME.replace(/\//g, '-')) ||
                cacheName.includes('nomic')) {
              await caches.delete(cacheName);
            }
          }
        }
      } catch (e) {
        // Non-fatal
        console.warn('Cache cleanup error', e);
      }

      // 4) Clear localStorage keys commonly used by transformers.js or model caches (best-effort)
      try {
        if (typeof localStorage !== 'undefined') {
          const keysToClear = [];
          for (let i = 0; i < localStorage.length; i++) {
            const key = localStorage.key(i);
            if (!key) continue;
            if (key.startsWith('transformers') || key.includes('transformers') ||
                key.includes('nomic') || key.includes('hf_')) {
              keysToClear.push(key);
            }
          }
          for (const k of keysToClear) localStorage.removeItem(k);
        }
      } catch (e) {
        // localStorage is not exposed to workers; the typeof guard makes this a harmless no-op there
      }

      // 5) Clear in-memory references
      indexData = null;
      model = null;
      isReady = false;
    };

    // Timeout in ms
    const TIMEOUT_MS = 8000;
    try {
      await Promise.race([
        cleanup(),
        new Promise((_, reject) => setTimeout(() => reject(new Error('cleanup-timeout')), TIMEOUT_MS))
      ]);
      // Cleanup completed within the timeout
      self.postMessage({ type: 'dataDeleted', payload: 'Cached data deleted successfully.' });
    } catch (error) {
      if (error && error.message === 'cleanup-timeout') {
        console.warn('deleteData: cleanup timed out');
        // Post a success-like message so the UI doesn't stay stuck; some cleanup may still be pending
        self.postMessage({ type: 'dataDeleted', payload: 'Cached data deletion attempted (timed out). Some cleanup may remain.' });
      } else {
        console.error('deleteData error:', error);
        self.postMessage({ type: 'error', payload: 'Failed to delete cached data: ' + (error && error.message ? error.message : String(error)) });
      }
    }
  } else if (type === 'getIndexSize') {
    try {
      let totalSize = 0;
      let indexCached = false;
      let modelCached = false;

      // Check if index is cached
      const cachedIndex = await getFromDB('quoteIndexData');
      if (cachedIndex) {
        totalSize += JSON.stringify(cachedIndex.metadata).length + cachedIndex.embeddingsByteLength;
        indexCached = true;
      } else {
        const response = await fetch(INDEX_PATH, { method: 'HEAD' });
        const contentLength = response.headers.get('Content-Length');
        totalSize += contentLength ? parseInt(contentLength, 10) : 0;
      }

      // Add approximate model size
      totalSize += APPROX_MODEL_SIZE_MB * 1024 * 1024; // Convert MB to bytes

      // Heuristic: assume model is cached if `model` object is initialized
      if (model) {
        modelCached = true;
      }

      self.postMessage({
        type: 'indexSize',
        payload: { size: totalSize, indexCached: indexCached, modelCached: modelCached }
      });
    } catch (error) {
      self.postMessage({ type: 'error', payload: 'Failed to get index size: ' + error.message });
    }
  }
};
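// Note on the size reported above: the model portion is the APPROX_MODEL_SIZE_MB
// constant, not a measurement. Where the StorageManager API is available in the
// worker, an origin-wide figure can serve as a sanity check -- a sketch:
//
//   async function estimateOriginUsage() {
//     if (navigator.storage && navigator.storage.estimate) {
//       const { usage, quota } = await navigator.storage.estimate();
//       return { usedMB: usage / (1024 * 1024), quotaMB: quota / (1024 * 1024) };
//     }
//     return null; // API not supported in this context
//   }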
// 5. Search Function
async function search(query) {
  if (!model || !indexData) {
    return [];
  }

  // Generate query embedding (nomic embedding models expect the "search_query: " task prefix)
  const queryEmbedding = await model('search_query: ' + query, { pooling: 'mean', normalize: true });
  // Truncate to the first EMBEDDING_DIM values; cosineSimilarity re-normalizes
  // both vectors, so the truncated query needs no further normalization here.
  const truncatedQueryEmbedding = queryEmbedding.data.slice(0, EMBEDDING_DIM);

  // Calculate cosine similarities
  const similarities = [];
  for (let i = 0; i < indexData.embeddings.length; i++) {
    const similarity = cosineSimilarity(truncatedQueryEmbedding, indexData.embeddings[i]);
    similarities.push({ index: i, similarity });
  }

  // Sort by similarity, descending
  similarities.sort((a, b) => b.similarity - a.similarity);

  // Get top 30 results
  const topResults = similarities.slice(0, 30).map(item => indexData.metadata[item.index]);
  return topResults;
}

// 6. Helper Functions
function cosineSimilarity(vecA, vecB) {
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < vecA.length; i++) {
    dotProduct += vecA[i] * vecB[i];
    normA += vecA[i] * vecA[i];
    normB += vecB[i] * vecB[i];
  }
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}

function reshape(array, shape) {
  const reshaped = [];
  let offset = 0;
  for (let i = 0; i < shape[0]; i++) {
    reshaped.push(array.slice(offset, offset + shape[1]));
    offset += shape[1];
  }
  return reshaped;
}
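// A possible optimization, not applied above: since each row's norm never changes
// after de-quantization, the norms could be computed once at load time so that
// search() only needs a dot product per row -- a sketch:
//
//   function precomputeNorms(rows) {
//     return rows.map(row => {
//       let sumSquares = 0;
//       for (let i = 0; i < row.length; i++) sumSquares += row[i] * row[i];
//       return Math.sqrt(sumSquares);
//     });
//   }
//   // cosine(query, rows[i]) === dot(query, rows[i]) / (queryNorm * norms[i])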