Spaces:
Sleeping
Sleeping
| import os | |
| import pickle | |
| from tqdm import tqdm | |
| from config.rag_config import RAGConfig | |
| from src.pdf_loader import load_pdfs, chunk_text | |
| from src.embedder import Embedder | |
def prepare_embeddings():
    """Build the on-disk vector store from the configured PDF directory.

    Steps, in order:
      1. Load PDF texts with ``load_pdfs(config.pdf_dir)``.
      2. Split every text with ``chunk_text`` using ``config.chunk_size`` and
         ``config.chunk_overlap``.
      3. Embed all chunks in one call to ``embedder.embed_texts``.
      4. Pickle ``{"texts": ..., "embeddings": ...}`` to
         ``config.vector_db_path``, creating the parent directory if needed.

    Progress messages are printed to stdout. Returns ``None``.
    """
    config = RAGConfig()
    embedder = Embedder(config)

    print("📂 加载 PDF 文件...")
    pdf_texts = load_pdfs(config.pdf_dir)

    print("✂️ 切分文本...")
    all_chunks = []
    for text in pdf_texts:
        all_chunks.extend(chunk_text(text, config.chunk_size, config.chunk_overlap))

    print("🧠 生成 embeddings...")
    embeddings = embedder.embed_texts(all_chunks)

    # os.path.dirname() returns "" when the path is a bare filename, and
    # os.makedirs("") raises FileNotFoundError — only create a directory
    # when the path actually has a directory component.
    target_dir = os.path.dirname(config.vector_db_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(config.vector_db_path, "wb") as f:
        pickle.dump({"texts": all_chunks, "embeddings": embeddings}, f)

    print(f"✅ 向量库已保存到 {config.vector_db_path}, 共 {len(all_chunks)} 段文本")
if __name__ == "__main__":
    # Executed as a script: run the full embedding-preparation pipeline once.
    prepare_embeddings()