"""Build the vector store: load PDFs, chunk their text, embed, and pickle."""

import os
import pickle

from tqdm import tqdm  # NOTE(review): not used in this file - confirm before removing

from config.rag_config import RAGConfig
from src.pdf_loader import load_pdfs, chunk_text
from src.embedder import Embedder


def prepare_embeddings() -> None:
    """Generate embeddings for every PDF text chunk and persist them to disk.

    Steps, in order:
      1. Load the PDF texts from ``config.pdf_dir``.
      2. Split each text with ``chunk_text`` using ``config.chunk_size`` and
         ``config.chunk_overlap``.
      3. Embed all chunks with ``Embedder.embed_texts``.
      4. Pickle ``{"texts": ..., "embeddings": ...}`` to
         ``config.vector_db_path``, creating the parent directory if needed.

    Progress messages are printed to stdout. Nothing is returned.
    """
    config = RAGConfig()
    embedder = Embedder(config)

    print("📂 加载 PDF 文件...")
    pdf_texts = load_pdfs(config.pdf_dir)

    print("✂️ 切分文本...")
    all_chunks = []
    for text in pdf_texts:
        all_chunks.extend(chunk_text(text, config.chunk_size, config.chunk_overlap))

    print("🧠 生成 embeddings...")
    embeddings = embedder.embed_texts(all_chunks)

    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when the path has one.
    parent_dir = os.path.dirname(config.vector_db_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # NOTE: the store is a pickle; whoever loads it must only unpickle files
    # from a trusted location.
    with open(config.vector_db_path, "wb") as f:
        pickle.dump({"texts": all_chunks, "embeddings": embeddings}, f)

    print(f"✅ 向量库已保存到 {config.vector_db_path}, 共 {len(all_chunks)} 段文本")


if __name__ == "__main__":
    prepare_embeddings()