import json
import sys
import os

# Placeholder for FAISS/SentenceTransformer (to be installed)
# This script prepares the data for vectorization


def load_memories(memory_path):
    """Load memories from a JSONL file, skipping blank or malformed lines."""
    data = []
    with open(memory_path, 'r') as f:
        for line in f:
            if line.strip():
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError:
                    continue
    print(f"Loaded {len(data)} memories.")
    return data


def prepare_corpus(memories):
    """Flatten each memory into a single text chunk suitable for embedding."""
    corpus = []
    ids = []
    for m in memories:
        # Combine relevant fields for embedding
        text = f"{m.get('content', '')} {m.get('context', '')} {' '.join(m.get('tags', []))}"
        corpus.append(text)
        ids.append(m.get('id'))
    return ids, corpus


if __name__ == "__main__":
    memory_file = "haim/data/memory.jsonl"
    if not os.path.exists(memory_file):
        print(f"Error: {memory_file} not found.")
        sys.exit(1)

    ids, corpus = prepare_corpus(load_memories(memory_file))
    print(f"Prepared {len(corpus)} text chunks for embedding.")

    # Save prepared corpus for the actual vectorization step
    output_file = "haim/vector_core/corpus_ready.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)  # ensure output dir exists
    with open(output_file, "w") as f:
        json.dump({"ids": ids, "corpus": corpus}, f)
    print(f"Corpus saved to {output_file}")
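

# --- Hedged sketch: the vectorization step that consumes corpus_ready.json ---
# A minimal illustration of what the follow-up step could look like once FAISS and
# SentenceTransformer are installed (see the placeholder note above). The model name
# "all-MiniLM-L6-v2", the function name build_index, and the index path are
# illustrative assumptions, not choices confirmed by this repo.

def build_index(corpus_path="haim/vector_core/corpus_ready.json",
                index_path="haim/vector_core/memory.index"):
    # Deferred imports so the preparation script above runs without these deps.
    from sentence_transformers import SentenceTransformer
    import faiss
    import numpy as np

    with open(corpus_path) as f:
        prepared = json.load(f)

    model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model
    embeddings = model.encode(prepared["corpus"], convert_to_numpy=True).astype(np.float32)
    faiss.normalize_L2(embeddings)  # normalize so inner product == cosine similarity
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, index_path)
    return prepared["ids"], index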