import os
from pathlib import Path

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub


class KnowledgeManager:
    """Loads .txt files, builds a FAISS index, and answers questions over them."""

    def __init__(self, knowledge_dir="."):  # root dir by default
        self.knowledge_dir = Path(knowledge_dir)
        self.documents = []
        self.embeddings = None
        self.vectorstore = None
        self.retriever = None
        self.llm = None
        self.qa_chain = None

        self._load_documents()
        if self.documents:
            self._initialize_embeddings()
            self._initialize_vectorstore()
            self._initialize_llm()
            self._initialize_qa_chain()

    def _load_documents(self):
        if not self.knowledge_dir.exists():
            raise FileNotFoundError(f"Directory {self.knowledge_dir} does not exist.")
        files = list(self.knowledge_dir.glob("*.txt"))
        if not files:
            raise FileNotFoundError(
                f"No .txt files found in {self.knowledge_dir}. "
                "Please upload your knowledge base files to the root directory."
            )
        for file in files:
            loader = TextLoader(str(file))
            self.documents.extend(loader.load())
        # Split the loaded documents into overlapping chunks for retrieval.
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        self.documents = splitter.split_documents(self.documents)

    def _initialize_embeddings(self):
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

    def _initialize_vectorstore(self):
        # Build the FAISS index from the chunks and expose it as a retriever.
        self.vectorstore = FAISS.from_documents(self.documents, self.embeddings)
        self.retriever = self.vectorstore.as_retriever()

    def _initialize_llm(self):
        self.llm = HuggingFaceHub(
            repo_id="google/flan-t5-small",
            model_kwargs={"temperature": 0, "max_length": 256},
        )

    def _initialize_qa_chain(self):
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm, chain_type="stuff", retriever=self.retriever
        )

    def ask(self, query):
        if not self.qa_chain:
            return "Knowledge base not initialized properly."
        return self.qa_chain.run(query)

    def get_knowledge_summary(self):
        return f"Loaded {len(self.documents)} document chunks from {self.knowledge_dir}"