hf-rag-multi / prepare_embeddings.py
siyu618's picture
Upload 18 files
94f5c4b verified
raw
history blame
939 Bytes
import os
import pickle
from tqdm import tqdm
from config.rag_config import RAGConfig
from src.pdf_loader import load_pdfs, chunk_text
from src.embedder import Embedder
def prepare_embeddings():
    """Build the on-disk vector store from the configured PDF directory.

    Steps, in order:
      1. Load the PDF texts from ``config.pdf_dir`` via ``load_pdfs``.
      2. Split every text with ``chunk_text`` using ``config.chunk_size``
         and ``config.chunk_overlap``.
      3. Embed all chunks in one call to ``Embedder.embed_texts``.
      4. Pickle ``{"texts": ..., "embeddings": ...}`` to
         ``config.vector_db_path``, creating the parent directory if the
         path has one.

    Returns:
        None. The result is written to disk and progress is printed.
    """
    config = RAGConfig()
    embedder = Embedder(config)

    # Progress message: loading PDF files.
    print("📂 加载 PDF 文件...")
    pdf_texts = load_pdfs(config.pdf_dir)

    # Progress message: splitting text into chunks.
    print("✂️ 切分文本...")
    all_chunks = []
    for text in pdf_texts:
        all_chunks.extend(
            chunk_text(text, config.chunk_size, config.chunk_overlap)
        )

    # Progress message: generating embeddings.
    print("🧠 生成 embeddings...")
    embeddings = embedder.embed_texts(all_chunks)

    # os.path.dirname() returns "" for a bare filename such as "db.pkl",
    # and os.makedirs("") raises FileNotFoundError, so only create the
    # directory when the path actually has a directory component.
    db_dir = os.path.dirname(config.vector_db_path)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    # NOTE(review): the store is a pickle file; whatever reads it back
    # should only unpickle files from a trusted location.
    with open(config.vector_db_path, "wb") as f:
        pickle.dump({"texts": all_chunks, "embeddings": embeddings}, f)

    # Final message: where the store was saved and how many chunks it holds.
    print(f"✅ 向量库已保存到 {config.vector_db_path}, 共 {len(all_chunks)} 段文本")
# Script entry point: build the vector store only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    prepare_embeddings()