// QuoteSearch / worker.js
// 1. Import transformers.js
import { pipeline, env } from './transformers.min.js';
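// NOTE: because this file uses ES module imports, it must be created as a
// module worker on the main thread: new Worker('./worker.js', { type: 'module' })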
// 2. Constants
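// Let transformers.js resolve model files from a locally served path
// (env.localModelPath) in addition to fetching them from the Hugging Face Hub.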
env.allowLocalModels = true;
const MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5';
const EMBEDDING_DIM = 256;
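// nomic-embed-text-v1.5 natively outputs 768-dim embeddings; it is trained with
// Matryoshka representation learning, so keeping only the first 256 dimensions
// (and re-normalizing) preserves most of the retrieval quality at 1/3 the size.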
const INDEX_PATH = 'data/quotes_index.bin';
const APPROX_MODEL_SIZE_MB = 180; // Actual observed size of nomic-embed-text-v1.5 model
const DB_NAME = 'QuoteSearchDB';
const DB_VERSION = 1;
const STORE_NAME = 'quoteIndex';
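// In-memory worker state, populated lazily on first use.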
let model;
let indexData;
let isReady = false;
// IndexedDB Helper Functions
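// Thin promise wrappers around the callback-based IndexedDB API.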
function openDB() {
return new Promise((resolve, reject) => {
const request = indexedDB.open(DB_NAME, DB_VERSION);
request.onupgradeneeded = (event) => {
const db = event.target.result;
db.createObjectStore(STORE_NAME, { keyPath: 'id' });
};
request.onsuccess = (event) => {
resolve(event.target.result);
};
request.onerror = (event) => {
      reject('IndexedDB error: ' + event.target.error);
};
});
}
async function getFromDB(key) {
const db = await openDB();
return new Promise((resolve, reject) => {
const transaction = db.transaction([STORE_NAME], 'readonly');
const store = transaction.objectStore(STORE_NAME);
const request = store.get(key);
request.onsuccess = () => {
resolve(request.result ? request.result.value : null);
};
request.onerror = () => {
reject('Error getting data from DB');
};
});
}
async function putInDB(key, value) {
const db = await openDB();
return new Promise((resolve, reject) => {
const transaction = db.transaction([STORE_NAME], 'readwrite');
const store = transaction.objectStore(STORE_NAME);
const request = store.put({ id: key, value: value });
request.onsuccess = () => {
resolve();
};
request.onerror = () => {
reject('Error putting data in DB');
};
});
}
async function deleteFromDB(key) {
const db = await openDB();
return new Promise((resolve, reject) => {
const transaction = db.transaction([STORE_NAME], 'readwrite');
const store = transaction.objectStore(STORE_NAME);
const request = store.delete(key);
request.onsuccess = () => {
resolve();
};
request.onerror = () => {
reject('Error deleting data from DB');
};
});
}
// 3. Load Model and Index
async function loadModel() {
try {
// Inform the UI that the model will be downloaded/loaded
self.postMessage({ type: 'loading', payload: 'Downloading model (this may take a while)...' });
// Load the model with a progress callback
model = await pipeline('feature-extraction', MODEL_NAME, {
progress_callback: (progress) => {
// Make it explicit that this progress refers to model download or model file operations
        const detailMessage = `Model ${progress.status}: ${progress.file || ''} ${Math.floor(progress.progress || 0)}%`;
self.postMessage({ type: 'progress', payload: { ...progress, detail: detailMessage } });
}
});
} catch (error) {
console.error('Error loading model:', error);
self.postMessage({ type: 'error', payload: error.message });
throw error; // Re-throw to prevent further execution if model fails to load
}
}
async function loadIndex() {
try {
self.postMessage({ type: 'loading', payload: 'Checking for cached index file...' });
const cachedIndex = await getFromDB('quoteIndexData');
if (cachedIndex) {
indexData = cachedIndex;
self.postMessage({ type: 'loading', payload: 'Index loaded from cache.' });
} else {
// Inform UI that the index file will be downloaded
self.postMessage({ type: 'loading', payload: 'Downloading index file (this may take a while)...' });
// Fetch and parse the index file with progress reporting
      const response = await fetch(INDEX_PATH);
      if (!response.ok) {
        throw new Error(`Failed to fetch index file: ${response.status} ${response.statusText}`);
      }
      const contentLength = response.headers.get('Content-Length');
      // Content-Length may be absent (e.g. chunked transfer encoding); fall back to 0
      const total = contentLength ? parseInt(contentLength, 10) : 0;
      let loaded = 0;
const reader = response.body.getReader();
const chunks = [];
while (true) {
const { done, value } = await reader.read();
if (done) {
break;
}
chunks.push(value);
loaded += value.length;
        // Only report percentages when the total size is known
        if (total > 0) {
          const progress = {
            status: 'Downloading Index',
            progress: (loaded / total) * 100,
            file: INDEX_PATH,
            detail: `Downloading index file: ${Math.floor((loaded / total) * 100)}% (${(loaded / (1024 * 1024)).toFixed(2)}MB / ${(total / (1024 * 1024)).toFixed(2)}MB)`
          };
          self.postMessage({ type: 'progress', payload: progress });
        }
}
const buffer = await new Response(new Blob(chunks)).arrayBuffer();
// Parse the binary file
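      // Layout, matching the reads below (typed arrays use platform endianness,
      // little-endian in practice; each field is read from a fresh slice, so
      // alignment is never an issue):
      //   uint32   numQuotes
      //   uint16   embeddingDim
      //   float32  scale            (de-quantization: float = int8 / scale)
      //   uint32   metadataSize     (length of the JSON metadata in bytes)
      //   uint8    metadataFormat   (0 = plain JSON, 1 = gzip-compressed JSON)
      //   bytes    metadata         (metadataSize bytes)
      //   int8[]   quantizedEmbeddings (numQuotes * embeddingDim values)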
let offset = 0;
const numQuotes = new Uint32Array(buffer.slice(offset, offset + 4))[0];
offset += 4;
const embeddingDim = new Uint16Array(buffer.slice(offset, offset + 2))[0];
offset += 2;
const scale = new Float32Array(buffer.slice(offset, offset + 4))[0];
offset += 4;
const metadataSize = new Uint32Array(buffer.slice(offset, offset + 4))[0];
offset += 4;
// Read metadata format flag (1 byte) first, then metadata bytes
      let metadataFormat = 0; // 0 = uncompressed JSON (legacy), 1 = gzip-compressed JSON
      if (offset + 1 <= buffer.byteLength) {
        metadataFormat = new Uint8Array(buffer.slice(offset, offset + 1))[0];
        offset += 1;
      }
      const metadataBytes = buffer.slice(offset, offset + metadataSize);
offset += metadataSize;
async function decodeMetadata(bytes, format) {
const decoder = new TextDecoder('utf-8');
if (format === 0) {
return JSON.parse(decoder.decode(bytes));
} else if (format === 1) {
// Try using native DecompressionStream if available
if (typeof DecompressionStream !== 'undefined') {
const ds = new DecompressionStream('gzip');
const decompressed = await new Response(new Blob([bytes]).stream().pipeThrough(ds)).arrayBuffer();
return JSON.parse(decoder.decode(decompressed));
} else {
// Fallback: DecompressionStream is not available. To support gzip in
// older browsers, add a browser-ready pako build to the repo and
// implement the decompression here. For now, fail with a clear error.
throw new Error('Gzip decompression not available: add DecompressionStream support or include pako in the worker.');
}
} else {
throw new Error('Unknown metadata format: ' + format);
}
}
      const metadata = await decodeMetadata(metadataBytes, metadataFormat);
const quantizedEmbeddings = new Int8Array(buffer.slice(offset));
// De-quantize embeddings (processing step) with progress reporting
const embeddings = new Float32Array(quantizedEmbeddings.length);
const totalEmbeddings = quantizedEmbeddings.length;
const updateInterval = Math.floor(totalEmbeddings / 100); // Update every 1%
for (let i = 0; i < totalEmbeddings; i++) {
embeddings[i] = quantizedEmbeddings[i] / scale;
if (updateInterval > 0 && i % updateInterval === 0) {
// This is processing (de-quantization), not download
const progress = {
status: 'Processing index (de-quantizing)',
progress: (i / totalEmbeddings) * 100,
file: INDEX_PATH,
detail: `Processing index: ${Math.floor((i / totalEmbeddings) * 100)}%`
};
self.postMessage({ type: 'progress', payload: progress });
}
}
indexData = {
metadata,
embeddings: reshape(embeddings, [numQuotes, embeddingDim]),
embeddingsByteLength: quantizedEmbeddings.byteLength // Store byteLength
};
await putInDB('quoteIndexData', indexData); // Store in IndexedDB
}
} catch (error) {
console.error('Error loading index:', error);
self.postMessage({ type: 'error', payload: error.message });
throw error;
}
}
async function load() {
try {
await loadIndex();
isReady = true;
self.postMessage({ type: 'ready' });
} catch (error) {
console.error('Error in worker load function:', error);
self.postMessage({ type: 'error', payload: error.message });
}
}
// 4. Listen for Messages
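// Incoming message types (event.data.type):
//   'load'         -> eagerly loads the index; replies with 'ready'
//   'search'       -> payload is the query string; replies with 'results'
//   'deleteData'   -> clears IndexedDB, Cache Storage, and in-memory state; replies with 'dataDeleted'
//   'getIndexSize' -> replies with 'indexSize' (approximate cached footprint in bytes)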
self.onmessage = async (event) => {
const { type, payload } = event.data;
  if (type === 'load') {
    // Optional eager initialization: the UI can post { type: 'load' } to warm
    // the index cache and receive a 'ready' message before the first search.
    await load();
  } else if (type === 'search') {
    try {
      if (!indexData) {
        // Be explicit: the index may need to be downloaded before the search can run
        self.postMessage({ type: 'loading', payload: 'Downloading index before running your search...' });
        await loadIndex();
      }
      if (!model) {
        self.postMessage({ type: 'loading', payload: 'Downloading model before running your search...' });
        await loadModel();
      }
      self.postMessage({ type: 'loading', payload: 'Searching...' });
      const results = await search(payload);
      self.postMessage({ type: 'results', payload: results });
    } catch (error) {
      // loadIndex/loadModel post an 'error' message before re-throwing; catching
      // here just prevents an unhandled promise rejection in the worker.
    }
} else if (type === 'deleteData') {
self.postMessage({ type: 'loading', payload: 'Deleting cached data...' });
    // Run cleanup with a timeout so the UI doesn't get stuck if an API call blocks or takes too long
const cleanup = async () => {
// 1) Delete the specific object from the DB if present
try {
await deleteFromDB('quoteIndexData');
} catch (e) {
// ignore individual delete error and proceed to full DB deletion
}
// 2) Delete the entire IndexedDB database to ensure all stored artifacts are removed
await new Promise((resolve, reject) => {
const deleteRequest = indexedDB.deleteDatabase(DB_NAME);
deleteRequest.onsuccess = () => resolve();
deleteRequest.onerror = () => reject(deleteRequest.error || new Error('Failed to delete IndexedDB'));
deleteRequest.onblocked = () => {
// If deletion is blocked, still try to continue with cache cleanup
resolve();
};
});
// 3) Clear Cache Storage entries that may contain model or transformers resources
try {
if (typeof caches !== 'undefined' && caches.keys) {
const cacheNames = await caches.keys();
for (const cacheName of cacheNames) {
// Delete transformers related caches and any caches that contain the model name
if (cacheName.startsWith('transformers-cache') || cacheName.includes(MODEL_NAME.replace(/\//g, '-')) || cacheName.includes('nomic')) {
await caches.delete(cacheName);
}
}
}
} catch (e) {
// Non-fatal
console.warn('Cache cleanup error', e);
}
// 4) Clear localStorage keys commonly used by transformers.js or model caches (best-effort)
try {
if (typeof localStorage !== 'undefined') {
const keysToClear = [];
for (let i = 0; i < localStorage.length; i++) {
const key = localStorage.key(i);
if (!key) continue;
if (key.startsWith('transformers') || key.includes('transformers') || key.includes('nomic') || key.includes('hf_')) {
keysToClear.push(key);
}
}
for (const k of keysToClear) localStorage.removeItem(k);
}
} catch (e) {
// localStorage may not be available in worker-like contexts; ignore
}
// 5) Clear in-memory references
indexData = null;
model = null;
isReady = false;
};
// Timeout in ms
const TIMEOUT_MS = 8000;
try {
await Promise.race([
cleanup(),
new Promise((_, reject) => setTimeout(() => reject(new Error('cleanup-timeout')), TIMEOUT_MS))
]);
// If cleanup completed within timeout
self.postMessage({ type: 'dataDeleted', payload: 'Cached data deleted successfully.' });
} catch (error) {
if (error && error.message === 'cleanup-timeout') {
console.warn('deleteData: cleanup timed out');
        // Post a success-like message so the UI doesn't stay stuck; some cleanup may still be pending
self.postMessage({ type: 'dataDeleted', payload: 'Cached data deletion attempted (timed out). Some cleanup may remain.' });
} else {
console.error('deleteData error:', error);
self.postMessage({ type: 'error', payload: 'Failed to delete cached data: ' + (error && error.message ? error.message : String(error)) });
}
}
} else if (type === 'getIndexSize') {
try {
let totalSize = 0;
let indexCached = false;
let modelCached = false;
// Check if index is cached
const cachedIndex = await getFromDB('quoteIndexData');
if (cachedIndex) {
totalSize += JSON.stringify(cachedIndex.metadata).length + cachedIndex.embeddingsByteLength;
indexCached = true;
} else {
        const response = await fetch(INDEX_PATH, { method: 'HEAD' });
        const contentLength = response.headers.get('Content-Length');
        // Guard against a missing Content-Length header (parseInt(null, 10) is NaN)
        totalSize += contentLength ? parseInt(contentLength, 10) : 0;
}
// Add approximate model size
totalSize += APPROX_MODEL_SIZE_MB * 1024 * 1024; // Convert MB to bytes
// Heuristic: assume model is cached if `model` object is initialized
if (model) {
modelCached = true;
}
self.postMessage({ type: 'indexSize', payload: { size: totalSize, indexCached: indexCached, modelCached: modelCached } });
} catch (error) {
self.postMessage({ type: 'error', payload: 'Failed to get index size: ' + error.message });
}
}
};
// 5. Search Function
async function search(query) {
if (!model || !indexData) {
return [];
}
// Generate query embedding
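  // nomic-embed models are trained with task prefixes; queries use 'search_query: '
  // (the corpus side is expected to have been embedded with 'search_document: ')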
const queryEmbedding = await model("search_query: " + query, { pooling: 'mean', normalize: true });
const truncatedQueryEmbedding = queryEmbedding.data.slice(0, EMBEDDING_DIM);
// Calculate cosine similarities
const similarities = [];
for (let i = 0; i < indexData.embeddings.length; i++) {
const similarity = cosineSimilarity(truncatedQueryEmbedding, indexData.embeddings[i]);
similarities.push({ index: i, similarity });
}
// Sort by similarity
similarities.sort((a, b) => b.similarity - a.similarity);
// Get top 30 results
const topResults = similarities.slice(0, 30).map(item => {
return indexData.metadata[item.index];
});
return topResults;
}
// 6. Helper Functions
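// Cosine similarity with explicit normalization. Re-normalizing matters here:
// truncating a unit-length embedding to its first EMBEDDING_DIM components
// leaves it un-normalized.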
function cosineSimilarity(vecA, vecB) {
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < vecA.length; i++) {
dotProduct += vecA[i] * vecB[i];
normA += vecA[i] * vecA[i];
normB += vecB[i] * vecB[i];
}
  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  return denominator === 0 ? 0 : dotProduct / denominator;
}
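// Split a flat typed array into shape[0] rows of shape[1] elements each.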
function reshape(array, shape) {
const reshaped = [];
let offset = 0;
for (let i = 0; i < shape[0]; i++) {
reshaped.push(array.slice(offset, offset + shape[1]));
offset += shape[1];
}
return reshaped;
}
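// Example main-thread usage (a minimal sketch; renderResults is a hypothetical
// callback, not part of this worker):
//
//   const worker = new Worker('./worker.js', { type: 'module' });
//   worker.onmessage = ({ data }) => {
//     switch (data.type) {
//       case 'progress':    console.log(data.payload.detail); break;
//       case 'loading':     console.log(data.payload); break;
//       case 'results':     renderResults(data.payload); break;
//       case 'dataDeleted': console.log(data.payload); break;
//       case 'error':       console.error(data.payload); break;
//     }
//   };
//   worker.postMessage({ type: 'load' });
//   worker.postMessage({ type: 'search', payload: 'courage in adversity' });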