import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import json
import gzip
import struct

# 1. Constants
MODEL_NAME = 'nomic-ai/nomic-embed-text-v1.5'
EMBEDDING_DIM = 256
INPUT_CSV_PATH = 'data/quotes.csv'
OUTPUT_BINARY_PATH = 'data/quotes_index.bin'
BATCH_SIZE = 64  # Encoding batch size; tune to fit available GPU memory

# 2. Load Data
def load_quotes(file_path):
    """Loads quotes from a CSV file using pandas, skipping malformed lines."""
    # Define column names explicitly
    column_names = ['quote', 'author', 'category']
    
    # Read CSV with explicit separator, quote character, and skip bad lines
    # header=None because we are providing names explicitly
    df = pd.read_csv(
        file_path,
        sep=',',
        quotechar='"',
        header=None,
        names=column_names,
        on_bad_lines='skip' # Skip lines that pandas cannot parse into 3 columns
    )

    # Drop rows with missing quote text so later string operations don't fail
    df = df.dropna(subset=['quote'])

    # Filter out rows where the category contains uppercase letters
    initial_rows = len(df)
    df = df[df['category'].apply(lambda x: isinstance(x, str) and not any(c.isupper() for c in x))]
    filtered_rows = len(df)
    if initial_rows - filtered_rows > 0:
        print(f"Ignored {initial_rows - filtered_rows} rows due to uppercase letters in category.")

    # Ensure author is a string for grouping (empty string for missing authors)
    df['author'] = df['author'].fillna('').astype(str)

    # Group by quote and author to deduplicate entries.
    # We intentionally ignore categories to reduce metadata size in the output index.
    grouped = {}
    for _, row in df.iterrows():
        quote = row['quote']
        author = row['author']

        # Build a case-insensitive key for deduplication
        quote_key = quote.lower().strip() if isinstance(quote, str) else ''
        author_key = author.lower().strip() if isinstance(author, str) else ''
        key = (quote_key, author_key)

        if key not in grouped:
            grouped[key] = { 'quote': quote, 'author': author }

    # Build records from grouped data; do NOT include categories
    records = []
    for key, data in grouped.items():
        orig_author = data['author'] if data['author'] != '' else None
        records.append({
            'quote': data['quote'],
            'author': orig_author
        })

    # Prepend the task prefix required by nomic-embed-text for retrieval.
    # Indexed quotes are documents, so they get the "search_document: " prefix;
    # the "search_query: " prefix is reserved for query-time embeddings.
    for r in records:
        r['quote_for_embedding'] = "search_document: " + r['quote']

    return records
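
# For illustration only (not executed by the pipeline): given an input row such as
#   "Simplicity is the ultimate sophistication.","Leonardo da Vinci","wisdom"
# load_quotes yields a record shaped like
#   {'quote': 'Simplicity is the ultimate sophistication.',
#    'author': 'Leonardo da Vinci',
#    'quote_for_embedding': 'search_document: Simplicity is the ultimate sophistication.'}
# The category column is only used for filtering and is intentionally not kept.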

# 3. Generate Embeddings
def generate_embeddings(quotes, model_name, embedding_dim):
    """Generates and truncates embeddings for a list of quotes."""
    model = SentenceTransformer(model_name, trust_remote_code=True)
    # The model automatically uses the GPU if available
    embeddings = model.encode(
        [q['quote_for_embedding'] for q in quotes],
        convert_to_tensor=True,
        batch_size=BATCH_SIZE,
        show_progress_bar=True # Display progress bar for embedding generation
    )
    # Truncate to the desired Matryoshka dimension, then re-normalize so the
    # truncated vectors are unit length again for cosine / dot-product scoring
    truncated_embeddings = embeddings[:, :embedding_dim]
    truncated_embeddings = torch.nn.functional.normalize(truncated_embeddings, p=2, dim=1)
    return truncated_embeddings.cpu().numpy()
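
# A minimal sketch of the matching query-time step, for reference only; this
# offline pipeline never calls it. `embed_query` is an illustrative helper:
# queries take the "search_query: " prefix and must be truncated and
# re-normalized to the same dimension as the indexed quotes before scoring.
def embed_query(model, text, embedding_dim=EMBEDDING_DIM):
    query_embedding = model.encode(["search_query: " + text], convert_to_tensor=True)
    query_embedding = query_embedding[:, :embedding_dim]
    query_embedding = torch.nn.functional.normalize(query_embedding, p=2, dim=1)
    return query_embedding.cpu().numpy()[0]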

# 4. Quantize Embeddings
def quantize_embeddings(embeddings):
    """Quantizes float32 embeddings to int8."""
    # Symmetric quantization: map the largest absolute value to 127
    abs_max = np.abs(embeddings).max()
    scale = 127.0 / abs_max if abs_max != 0 else 1.0  # 1.0 keeps dequantization well-defined

    # Scale, round to the nearest integer, and clip to the signed 8-bit range
    quantized_embeddings = np.clip(np.round(embeddings * scale), -127, 127).astype(np.int8)
    
    return quantized_embeddings, scale
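
# A minimal sketch of the inverse mapping, for reference only; the pipeline
# never calls it. A consumer of the index can recover approximate float
# embeddings by dividing the stored int8 values by the scale factor.
def dequantize_embeddings(quantized_embeddings, scale):
    """Approximately reconstructs float embeddings from int8 values and the scale."""
    return quantized_embeddings.astype(np.float32) / scale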

def main():
    """Main function to run the offline processing pipeline."""
    print("Starting offline processing...")

    # Load quotes
    quotes = load_quotes(INPUT_CSV_PATH)
    print(f"Loaded {len(quotes)} quotes.")

    # Generate embeddings
    print("Generating embeddings...")
    float_embeddings = generate_embeddings(quotes, MODEL_NAME, EMBEDDING_DIM)
    print(f"Generated float embeddings with shape: {float_embeddings.shape}")

    # Quantize embeddings
    print("Quantizing embeddings...")
    quantized_embeddings, scale = quantize_embeddings(float_embeddings)
    print(f"Quantized embeddings with shape: {quantized_embeddings.shape}")
    print(f"Quantization scale factor: {scale}")

    # Prepare metadata without categories to reduce index size
    metadata = [
        {"quote": q["quote"], "author": q["author"]}
        for q in quotes
    ]

    # Replace NaN values with None for JSON compatibility
    for item in metadata:
        for key, value in list(item.items()):
            if isinstance(value, float) and np.isnan(value):
                item[key] = None

    # After cleaning metadata, serialize and compress once
    metadata_json = json.dumps(metadata, separators=(",", ":"))
    metadata_bytes_uncompressed = metadata_json.encode('utf-8')

    # Compress metadata with gzip to reduce index size on disk
    metadata_bytes = gzip.compress(metadata_bytes_uncompressed)
    # metadata format: 0 = uncompressed JSON (legacy), 1 = gzip-compressed JSON
    metadata_format = 1

    # Pack data into a binary file
    print("Packaging data into binary file...")
    with open(OUTPUT_BINARY_PATH, 'wb') as f:
        # Header
        f.write(struct.pack('<I', len(quotes)))  # 4 bytes: number of quotes
        f.write(struct.pack('<H', EMBEDDING_DIM)) # 2 bytes: embedding dimension
        f.write(struct.pack('<f', scale)) # 4 bytes: quantization scale factor
        f.write(struct.pack('<I', len(metadata_bytes))) # 4 bytes: metadata size
        f.write(struct.pack('<B', metadata_format)) # 1 byte: metadata format flag

        # Metadata (possibly compressed)
        f.write(metadata_bytes)

        # Embeddings
        f.write(quantized_embeddings.tobytes())

    print(f"Offline processing complete. Index file saved to {OUTPUT_BINARY_PATH}")

if __name__ == "__main__":
    main()