File size: 830 Bytes
94f5c4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from dataclasses import dataclass
from typing import Optional

@dataclass
class RAGConfig:
    """Configuration values for a retrieval-augmented generation (RAG) setup.

    Every field has a default, so ``RAGConfig()`` is usable as-is and any
    individual setting can be overridden by keyword argument.

    ``generation_kwargs`` uses ``None`` as a sentinel: when it is left as
    ``None``, ``__post_init__`` replaces it with a freshly built dict of
    default generation settings, so instances never share one mutable
    default dict.
    """

    # Embedding model
    embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    normalize_embeddings: bool = True

    # Retrieval parameters
    top_k: int = 5
    similarity_threshold: float = 0.4

    # LLM model
    llm_model_name: str = "facebook/rag-token-base"
    llm_max_length: int = 512
    # None means "use the defaults filled in by __post_init__"; the field is
    # therefore Optional rather than a plain dict.
    generation_kwargs: Optional[dict] = None

    # PDF paths
    pdf_dir: str = "data/pdfs"
    vector_db_path: str = "data/embeddings/vector_store.pkl"

    # Chunk configuration
    chunk_size: int = 500
    chunk_overlap: int = 100

    def __post_init__(self) -> None:
        """Fill in default generation settings when none were supplied.

        Only ``None`` triggers the defaults; any dict the caller passes
        (including an empty one) is kept as given.
        """
        if self.generation_kwargs is None:
            # Built per instance so mutations on one config do not leak
            # into another.
            self.generation_kwargs = {
                "max_new_tokens": 200,
                "temperature": 0.7,
                "do_sample": True,
            }