// 1. Import transformers.js
import { pipeline, env } from './transformers.min.js';
// 2. Constants
env.allowLocalModels = true;
const MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5';
const EMBEDDING_DIM = 256;
const INDEX_PATH = 'data/quotes_index.bin';
const APPROX_MODEL_SIZE_MB = 180; // Actual observed size of the nomic-embed-text-v1.5 model
const DB_NAME = 'QuoteSearchDB';
const DB_VERSION = 1;
const STORE_NAME = 'quoteIndex';
let model;
let indexData;
let isReady = false;
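// Message protocol: the page sends { type: 'search' | 'deleteData' | 'getIndexSize', payload }
// and this worker replies with { type, payload } messages of type 'loading', 'progress',
// 'ready', 'results', 'error', 'dataDeleted', or 'indexSize'.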
// IndexedDB Helper Functions (records are stored as { id, value } pairs)
function openDB() {
  return new Promise((resolve, reject) => {
    const request = indexedDB.open(DB_NAME, DB_VERSION);
    request.onupgradeneeded = (event) => {
      const db = event.target.result;
      db.createObjectStore(STORE_NAME, { keyPath: 'id' });
    };
    request.onsuccess = (event) => {
      resolve(event.target.result);
    };
    request.onerror = () => {
      // `request.error` is a DOMException; `event.target.errorCode` no longer exists.
      // Reject with an Error so callers can rely on `error.message`.
      reject(request.error || new Error('IndexedDB error'));
    };
  });
}
async function getFromDB(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction([STORE_NAME], 'readonly');
    const store = transaction.objectStore(STORE_NAME);
    const request = store.get(key);
    request.onsuccess = () => {
      resolve(request.result ? request.result.value : null);
    };
    request.onerror = () => {
      reject(request.error || new Error('Error getting data from DB'));
    };
  });
}
async function putInDB(key, value) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction([STORE_NAME], 'readwrite');
    const store = transaction.objectStore(STORE_NAME);
    const request = store.put({ id: key, value: value });
    request.onsuccess = () => {
      resolve();
    };
    request.onerror = () => {
      reject(request.error || new Error('Error putting data in DB'));
    };
  });
}
async function deleteFromDB(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction([STORE_NAME], 'readwrite');
    const store = transaction.objectStore(STORE_NAME);
    const request = store.delete(key);
    request.onsuccess = () => {
      resolve();
    };
    request.onerror = () => {
      reject(request.error || new Error('Error deleting data from DB'));
    };
  });
}
// 3. Load Model and Index
async function loadModel() {
  try {
    // Inform the UI that the model will be downloaded/loaded
    self.postMessage({ type: 'loading', payload: 'Downloading model (this may take a while)...' });
    // Load the model with a progress callback
    model = await pipeline('feature-extraction', MODEL_NAME, {
      progress_callback: (progress) => {
        // Make it explicit that this progress refers to model download or model file operations
        const detailMessage = `Downloading model ${progress.status}: ${progress.file || ''} ${Math.floor(progress.progress || 0)}%`;
        self.postMessage({ type: 'progress', payload: { ...progress, detail: detailMessage } });
      }
    });
  } catch (error) {
    console.error('Error loading model:', error);
    self.postMessage({ type: 'error', payload: error.message });
    throw error; // Re-throw to prevent further execution if the model fails to load
  }
}
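// Note: transformers.js persists downloaded model files in the browser's Cache
// Storage (the 'transformers-cache' entries cleared by the deleteData handler
// below), so repeat visits load the model without re-downloading.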
async function loadIndex() {
  try {
    self.postMessage({ type: 'loading', payload: 'Checking for cached index file...' });
    const cachedIndex = await getFromDB('quoteIndexData');
    if (cachedIndex) {
      indexData = cachedIndex;
      self.postMessage({ type: 'loading', payload: 'Index loaded from cache.' });
    } else {
      // Inform the UI that the index file will be downloaded
      self.postMessage({ type: 'loading', payload: 'Downloading index file (this may take a while)...' });
      // Fetch and parse the index file with progress reporting
      const response = await fetch(INDEX_PATH);
      if (!response.ok) {
        throw new Error(`Failed to fetch index: ${response.status} ${response.statusText}`);
      }
      // Content-Length can be absent (e.g. chunked responses); fall back to 0 so
      // the percentage math below degrades gracefully instead of producing NaN.
      const total = parseInt(response.headers.get('Content-Length') || '0', 10);
      let loaded = 0;
      const reader = response.body.getReader();
      const chunks = [];
      while (true) {
        const { done, value } = await reader.read();
        if (done) {
          break;
        }
        chunks.push(value);
        loaded += value.length;
        const percent = total > 0 ? (loaded / total) * 100 : 0;
        const mb = (n) => (n / (1024 * 1024)).toFixed(2);
        const progress = {
          status: 'Downloading Index',
          progress: percent,
          file: INDEX_PATH,
          detail: total > 0
            ? `Downloading index file: ${Math.floor(percent)}% (${mb(loaded)}MB / ${mb(total)}MB)`
            : `Downloading index file: ${mb(loaded)}MB so far`
        };
        self.postMessage({ type: 'progress', payload: progress });
      }
      const buffer = await new Response(new Blob(chunks)).arrayBuffer();
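      // Binary index layout (little-endian), parsed field by field below:
      //   uint32  numQuotes
      //   uint16  embeddingDim
      //   float32 scale          (de-quantization divisor)
      //   uint32  metadataSize   (byte length of the JSON metadata)
      //   uint8   format flag    (0 = plain JSON, 1 = gzip JSON; absent in legacy files)
      //   bytes   metadata       (metadataSize bytes)
      //   int8[]  quantized embeddings (numQuotes * embeddingDim values)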
      // Parse the binary file
      let offset = 0;
      const numQuotes = new Uint32Array(buffer.slice(offset, offset + 4))[0];
      offset += 4;
      const embeddingDim = new Uint16Array(buffer.slice(offset, offset + 2))[0];
      offset += 2;
      const scale = new Float32Array(buffer.slice(offset, offset + 4))[0];
      offset += 4;
      const metadataSize = new Uint32Array(buffer.slice(offset, offset + 4))[0];
      offset += 4;
      // Read the metadata format flag (1 byte) first, then the metadata bytes
      let metadata_format = 0; // 0 = uncompressed JSON (legacy), 1 = gzip-compressed JSON
      if (offset + 1 <= buffer.byteLength) {
        metadata_format = new Uint8Array(buffer.slice(offset, offset + 1))[0];
        offset += 1;
      }
      const metadataBytes = buffer.slice(offset, offset + metadataSize);
      offset += metadataSize;
      async function decodeMetadata(bytes, format) {
        const decoder = new TextDecoder('utf-8');
        if (format === 0) {
          return JSON.parse(decoder.decode(bytes));
        } else if (format === 1) {
          // Try the native DecompressionStream if available
          if (typeof DecompressionStream !== 'undefined') {
            const ds = new DecompressionStream('gzip');
            const decompressed = await new Response(new Blob([bytes]).stream().pipeThrough(ds)).arrayBuffer();
            return JSON.parse(decoder.decode(decompressed));
          } else {
            // Fallback: DecompressionStream is not available. To support gzip in
            // older browsers, add a browser-ready pako build to the repo and
            // implement the decompression here. For now, fail with a clear error.
            throw new Error('Gzip decompression not available: add DecompressionStream support or include pako in the worker.');
          }
        } else {
          throw new Error('Unknown metadata format: ' + format);
        }
      }
      const metadata = await decodeMetadata(metadataBytes, metadata_format);
      const quantizedEmbeddings = new Int8Array(buffer.slice(offset));
      // De-quantize embeddings (a processing step) with progress reporting
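      // Symmetric de-quantization: assuming the index builder stored each float as
      // round(x * scale) in int8 (the inverse of the division below), the original
      // value is recovered as v / scale; e.g. scale = 127 maps int8 64 back to ~0.504.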
      const embeddings = new Float32Array(quantizedEmbeddings.length);
      const totalEmbeddings = quantizedEmbeddings.length;
      const updateInterval = Math.floor(totalEmbeddings / 100); // Update every 1%
      for (let i = 0; i < totalEmbeddings; i++) {
        embeddings[i] = quantizedEmbeddings[i] / scale;
        if (updateInterval > 0 && i % updateInterval === 0) {
          // This is processing (de-quantization), not download
          const progress = {
            status: 'Processing index (de-quantizing)',
            progress: (i / totalEmbeddings) * 100,
            file: INDEX_PATH,
            detail: `Processing index: ${Math.floor((i / totalEmbeddings) * 100)}%`
          };
          self.postMessage({ type: 'progress', payload: progress });
        }
      }
      indexData = {
        metadata,
        embeddings: reshape(embeddings, [numQuotes, embeddingDim]),
        embeddingsByteLength: quantizedEmbeddings.byteLength // Store byteLength for size reporting
      };
      await putInDB('quoteIndexData', indexData); // Cache in IndexedDB
    }
  } catch (error) {
    console.error('Error loading index:', error);
    self.postMessage({ type: 'error', payload: error.message });
    throw error;
  }
}
async function load() {
  try {
    await loadIndex();
    isReady = true;
    self.postMessage({ type: 'ready' });
  } catch (error) {
    console.error('Error in worker load function:', error);
    self.postMessage({ type: 'error', payload: error.message });
  }
}
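// Note: load() warms only the index; the model itself is downloaded lazily by
// the 'search' handler below the first time a query arrives.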
// 4. Listen for Messages
self.onmessage = async (event) => {
  const { type, payload } = event.data;
  if (type === 'search') {
    try {
      if (!indexData) {
        // The index may need to be downloaded before the first search
        self.postMessage({ type: 'loading', payload: 'Downloading index before running your search...' });
        await loadIndex();
      }
      if (!model) {
        self.postMessage({ type: 'loading', payload: 'Downloading model before running your search...' });
        await loadModel();
      }
    } catch (error) {
      // loadIndex/loadModel already posted an 'error' message; stop here
      return;
    }
    self.postMessage({ type: 'loading', payload: 'Searching...' });
    const results = await search(payload);
    self.postMessage({ type: 'results', payload: results });
  } else if (type === 'deleteData') {
    self.postMessage({ type: 'loading', payload: 'Deleting cached data...' });
    // Run cleanup with a timeout so the UI doesn't get stuck if an API call blocks or takes too long
    const cleanup = async () => {
      // 1) Delete the specific object from the DB if present
      try {
        await deleteFromDB('quoteIndexData');
      } catch (e) {
        // Ignore the individual delete error and proceed to full DB deletion
      }
      // 2) Delete the entire IndexedDB database to ensure all stored artifacts are removed
      await new Promise((resolve, reject) => {
        const deleteRequest = indexedDB.deleteDatabase(DB_NAME);
        deleteRequest.onsuccess = () => resolve();
        deleteRequest.onerror = () => reject(deleteRequest.error || new Error('Failed to delete IndexedDB'));
        deleteRequest.onblocked = () => {
          // If deletion is blocked, still try to continue with cache cleanup
          resolve();
        };
      });
      // 3) Clear Cache Storage entries that may contain model or transformers resources
      try {
        if (typeof caches !== 'undefined' && caches.keys) {
          const cacheNames = await caches.keys();
          for (const cacheName of cacheNames) {
            // Delete transformers-related caches and any caches that reference the model name
            if (cacheName.startsWith('transformers-cache') || cacheName.includes(MODEL_NAME.replace(/\//g, '-')) || cacheName.includes('nomic')) {
              await caches.delete(cacheName);
            }
          }
        }
      } catch (e) {
        // Non-fatal
        console.warn('Cache cleanup error', e);
      }
      // 4) Clear localStorage keys commonly used by transformers.js or model caches (best-effort)
      try {
        if (typeof localStorage !== 'undefined') {
          const keysToClear = [];
          for (let i = 0; i < localStorage.length; i++) {
            const key = localStorage.key(i);
            if (!key) continue;
            if (key.includes('transformers') || key.includes('nomic') || key.includes('hf_')) {
              keysToClear.push(key);
            }
          }
          for (const k of keysToClear) localStorage.removeItem(k);
        }
      } catch (e) {
        // localStorage is not available in worker contexts; ignore
      }
      // 5) Clear in-memory references
      indexData = null;
      model = null;
      isReady = false;
    };
    // Timeout in ms
    const TIMEOUT_MS = 8000;
    try {
      await Promise.race([
        cleanup(),
        new Promise((_, reject) => setTimeout(() => reject(new Error('cleanup-timeout')), TIMEOUT_MS))
      ]);
      // Cleanup completed within the timeout
      self.postMessage({ type: 'dataDeleted', payload: 'Cached data deleted successfully.' });
    } catch (error) {
      if (error && error.message === 'cleanup-timeout') {
        console.warn('deleteData: cleanup timed out');
        // Post a success-like message so the UI doesn't stay stuck; some cleanup may still be pending
        self.postMessage({ type: 'dataDeleted', payload: 'Cached data deletion attempted (timed out). Some cleanup may remain.' });
      } else {
        console.error('deleteData error:', error);
        self.postMessage({ type: 'error', payload: 'Failed to delete cached data: ' + (error && error.message ? error.message : String(error)) });
      }
    }
  } else if (type === 'getIndexSize') {
    try {
      let totalSize = 0;
      let indexCached = false;
      let modelCached = false;
      // Check if the index is cached
      const cachedIndex = await getFromDB('quoteIndexData');
      if (cachedIndex) {
        totalSize += JSON.stringify(cachedIndex.metadata).length + cachedIndex.embeddingsByteLength;
        indexCached = true;
      } else {
        const response = await fetch(INDEX_PATH, { method: 'HEAD' });
        // Content-Length may be absent; treat it as 0 rather than NaN
        totalSize += parseInt(response.headers.get('Content-Length') || '0', 10);
      }
      // Add the approximate model size
      totalSize += APPROX_MODEL_SIZE_MB * 1024 * 1024; // Convert MB to bytes
      // Heuristic: assume the model is cached if the `model` object is initialized
      if (model) {
        modelCached = true;
      }
      self.postMessage({ type: 'indexSize', payload: { size: totalSize, indexCached: indexCached, modelCached: modelCached } });
    } catch (error) {
      self.postMessage({ type: 'error', payload: 'Failed to get index size: ' + error.message });
    }
  }
};
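// Example main-thread wiring (a sketch; the filename and handler names are
// illustrative, not taken from this repo):
//   const worker = new Worker('worker.js', { type: 'module' });
//   worker.onmessage = ({ data }) => {
//     if (data.type === 'results') renderQuotes(data.payload);     // hypothetical renderer
//     if (data.type === 'progress') updateProgress(data.payload);  // hypothetical progress bar
//   };
//   worker.postMessage({ type: 'search', payload: 'courage' });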
// 5. Search Function
async function search(query) {
  if (!model || !indexData) {
    return [];
  }
  // Generate query embedding
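  // nomic-embed models expect a task prefix on inputs ("search_query: " here;
  // documents are typically embedded with "search_document: "). v1.5 is
  // Matryoshka-trained, so its 768-dim output can be truncated to the first
  // EMBEDDING_DIM components; cosineSimilarity() normalizes both vectors, so
  // no explicit re-normalization of the truncated query is needed.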
  const queryEmbedding = await model('search_query: ' + query, { pooling: 'mean', normalize: true });
  const truncatedQueryEmbedding = queryEmbedding.data.slice(0, EMBEDDING_DIM);
  // Calculate cosine similarities against every quote embedding
  const similarities = [];
  for (let i = 0; i < indexData.embeddings.length; i++) {
    const similarity = cosineSimilarity(truncatedQueryEmbedding, indexData.embeddings[i]);
    similarities.push({ index: i, similarity });
  }
  // Sort by similarity, descending
  similarities.sort((a, b) => b.similarity - a.similarity);
  // Return the metadata for the top 30 results
  const topResults = similarities.slice(0, 30).map(item => indexData.metadata[item.index]);
  return topResults;
}
// 6. Helper Functions
function cosineSimilarity(vecA, vecB) {
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < vecA.length; i++) {
    dotProduct += vecA[i] * vecB[i];
    normA += vecA[i] * vecA[i];
    normB += vecB[i] * vecB[i];
  }
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
// Split a flat array into shape[0] rows of shape[1] elements each
function reshape(array, shape) {
  const reshaped = [];
  let offset = 0;
  for (let i = 0; i < shape[0]; i++) {
    reshaped.push(array.slice(offset, offset + shape[1]));
    offset += shape[1];
  }
  return reshaped;
}