import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import json
import gzip
import struct

# 1. Constants
MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5'
EMBEDDING_DIM = 256
INPUT_CSV_PATH = 'data/quotes.csv'
OUTPUT_BINARY_PATH = 'data/quotes_index.bin'
BATCH_SIZE = 64 # Optimized batch size for GPU processing

# 2. Load Data
def load_quotes(file_path):
    """Loads quotes from a CSV file using pandas, skipping malformed lines."""
    # Define column names explicitly
    column_names = ['quote', 'author', 'category']
    # Read CSV with explicit separator, quote character, and skip bad lines.
    # header=None because we are providing names explicitly.
    df = pd.read_csv(
        file_path,
        sep=',',
        quotechar='"',
        header=None,
        names=column_names,
        on_bad_lines='skip'  # Skip lines that pandas cannot parse into 3 columns
    )
    # Filter out rows where the category contains uppercase letters
    initial_rows = len(df)
    df = df[df['category'].apply(lambda x: isinstance(x, str) and not any(c.isupper() for c in x))]
    filtered_rows = len(df)
    if initial_rows - filtered_rows > 0:
        print(f"Ignored {initial_rows - filtered_rows} rows due to uppercase letters in category.")
    # Ensure author is a string for grouping (empty string for missing authors)
    df['author'] = df['author'].fillna('').astype(str)
    # Group by quote and author to deduplicate entries.
    # We intentionally ignore categories to reduce metadata size in the output index.
    grouped = {}
    for _, row in df.iterrows():
        quote = row['quote']
        author = row['author']
        # Build a case-insensitive key for deduplication
        quote_key = quote.lower().strip() if isinstance(quote, str) else ''
        author_key = author.lower().strip() if isinstance(author, str) else ''
        key = (quote_key, author_key)
        if key not in grouped:
            grouped[key] = {'quote': quote, 'author': author}
    # Build records from grouped data; do NOT include categories
    records = []
    for key, data in grouped.items():
        orig_author = data['author'] if data['author'] != '' else None
        records.append({
            'quote': data['quote'],
            'author': orig_author
        })
    # Prepend the required prefix for retrieval tasks
    for r in records:
        r['quote_for_embedding'] = "search_query: " + r['quote']
    return records
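
# Illustrative example of the expected (headerless) input format — the sample row
# below is hypothetical and not taken from the actual dataset:
#   "Be yourself; everyone else is already taken.",Oscar Wilde,inspirational
# Each row is parsed as quote,author,category; rows whose category contains
# uppercase letters are treated as malformed and skipped by the filter above.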

# 3. Generate Embeddings
def generate_embeddings(quotes, model_name, embedding_dim):
    """Generates and truncates embeddings for a list of quotes."""
    model = SentenceTransformer(model_name, trust_remote_code=True)
    # The model automatically uses the GPU if available
    embeddings = model.encode(
        [q['quote_for_embedding'] for q in quotes],
        convert_to_tensor=True,
        batch_size=BATCH_SIZE,
        show_progress_bar=True  # Display progress bar for embedding generation
    )
    # Truncate embeddings to the desired dimension
    truncated_embeddings = embeddings[:, :embedding_dim]
    return truncated_embeddings.cpu().numpy()
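
# nomic-embed-text-v1.5 is trained with Matryoshka representation learning, and
# truncated vectors are typically re-normalized to unit length before similarity
# search. The helper below is an illustrative sketch only (it is not called by
# this pipeline), and `renormalize_truncated` is a hypothetical name.
def renormalize_truncated(embeddings):
    """L2-normalize truncated embeddings row-wise (illustrative sketch)."""
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # Avoid division by zero for all-zero rows
    return embeddings / norms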

# 4. Quantize Embeddings
def quantize_embeddings(embeddings):
    """Quantizes float32 embeddings to int8 using symmetric scaling."""
    # Calculate the scale factor from the largest absolute value
    abs_max = np.abs(embeddings).max()
    scale = 127.0 / abs_max if abs_max != 0 else 0
    # Round, clip to the int8 range, and cast
    quantized_embeddings = np.clip(np.round(embeddings * scale), -127, 127).astype(np.int8)
    return quantized_embeddings, scale
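
# For reference, the approximate inverse of the quantization above is a single
# division by the scale factor. The helper below is an illustrative sketch only
# (it is not called by this pipeline); `dequantize_embeddings` is a hypothetical name.
def dequantize_embeddings(quantized, scale):
    """Recover approximate float32 embeddings from int8 values (sketch)."""
    if scale == 0:
        return np.zeros(quantized.shape, dtype=np.float32)
    return quantized.astype(np.float32) / scale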

def main():
    """Main function to run the offline processing pipeline."""
    print("Starting offline processing...")
    # Load quotes
    quotes = load_quotes(INPUT_CSV_PATH)
    print(f"Loaded {len(quotes)} quotes.")
    # Generate embeddings
    print("Generating embeddings...")
    float_embeddings = generate_embeddings(quotes, MODEL_NAME, EMBEDDING_DIM)
    print(f"Generated float embeddings with shape: {float_embeddings.shape}")
    # Quantize embeddings
    print("Quantizing embeddings...")
    quantized_embeddings, scale = quantize_embeddings(float_embeddings)
    print(f"Quantized embeddings with shape: {quantized_embeddings.shape}")
    print(f"Quantization scale factor: {scale}")
    # Prepare metadata without categories to reduce index size
    metadata = [
        {"quote": q["quote"], "author": q["author"]}
        for q in quotes
    ]
    # Replace NaN values with None for JSON compatibility
    for item in metadata:
        for key, value in list(item.items()):
            if isinstance(value, float) and np.isnan(value):
                item[key] = None
    # After cleaning metadata, serialize and compress once
    metadata_json = json.dumps(metadata, separators=(",", ":"))
    metadata_bytes_uncompressed = metadata_json.encode('utf-8')
    # Compress metadata with gzip to reduce index size on disk
    metadata_bytes = gzip.compress(metadata_bytes_uncompressed)
    # metadata format: 0 = uncompressed JSON (legacy), 1 = gzip-compressed JSON
    metadata_format = 1
    # Pack data into a binary file
    print("Packaging data into binary file...")
    with open(OUTPUT_BINARY_PATH, 'wb') as f:
        # Header
        f.write(struct.pack('<I', len(quotes)))          # 4 bytes: number of quotes
        f.write(struct.pack('<H', EMBEDDING_DIM))        # 2 bytes: embedding dimension
        f.write(struct.pack('<f', scale))                # 4 bytes: quantization scale factor
        f.write(struct.pack('<I', len(metadata_bytes)))  # 4 bytes: metadata size
        f.write(struct.pack('<B', metadata_format))      # 1 byte: metadata format flag
        # Metadata (possibly compressed)
        f.write(metadata_bytes)
        # Embeddings
        f.write(quantized_embeddings.tobytes())
    print(f"Offline processing complete. Index file saved to {OUTPUT_BINARY_PATH}")

if __name__ == "__main__":
    main()