File size: 939 Bytes
94f5c4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
import pickle
from tqdm import tqdm
from config.rag_config import RAGConfig
from src.pdf_loader import load_pdfs, chunk_text
from src.embedder import Embedder

def prepare_embeddings() -> None:
    """Build the vector store from the configured PDFs and save it to disk.

    Steps, in order:
      1. Load the PDF texts from ``config.pdf_dir``.
      2. Split every text with ``chunk_text`` using ``config.chunk_size``
         and ``config.chunk_overlap``.
      3. Embed all chunks with a single ``embed_texts`` call.
      4. Pickle a dict with the keys ``"texts"`` and ``"embeddings"`` to
         ``config.vector_db_path``, creating the parent directory if needed.

    Progress messages are printed to stdout. Nothing is returned.
    """
    config = RAGConfig()
    embedder = Embedder(config)

    print("📂 加载 PDF 文件...")
    pdf_texts = load_pdfs(config.pdf_dir)

    print("✂️ 切分文本...")
    all_chunks = []
    for text in pdf_texts:
        all_chunks.extend(chunk_text(text, config.chunk_size, config.chunk_overlap))

    print("🧠 生成 embeddings...")
    embeddings = embedder.embed_texts(all_chunks)

    # dirname() is "" when vector_db_path is a bare file name, and
    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when there actually is a directory component.
    target_dir = os.path.dirname(config.vector_db_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # NOTE: the store is a pickle; whoever loads it must only unpickle files
    # from a trusted source.
    with open(config.vector_db_path, "wb") as f:
        pickle.dump({"texts": all_chunks, "embeddings": embeddings}, f)

    print(f"✅ 向量库已保存到 {config.vector_db_path}, 共 {len(all_chunks)} 段文本")

# Script entry point: build and save the vector store only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    prepare_embeddings()