Spaces:

hugging2021
/

open-webui-rag-system

Build error

App Files Files Community

hugging2021 committed on Jul 20

Commit

e3fafe9

verified ·

1 Parent(s): cec55e8

Update rag_system.py

Browse files

Files changed (1) hide show

rag_system.py +134 -37

rag_system.py CHANGED Viewed

@@ -13,10 +13,10 @@ from langchain_community.vectorstores import FAISS
 try:
     import fitz  # PyMuPDF
     PYMUPDF_AVAILABLE = True
-    print("✅ PyMuPDF library available")
 except ImportError:
     PYMUPDF_AVAILABLE = False
-    print("⚠️ PyMuPDF library is not installed. Install with: pip install PyMuPDF")
 # PDF processing utilities
 import pytesseract
@@ -396,45 +396,142 @@ def split_documents(documents, chunk_size=800, chunk_overlap=100):
 # Main Execution
 # --------------------------------
-if __name__ == "__main__":
-    folder = "dataset_test"
-    log("PyMuPDF-based document processing started")
-    docs = load_documents(folder)
-    log("Document loading complete")
-    # Page information check
-    log("Page information summary:")
-    page_info = {}
-    for doc in docs:
         source = doc.metadata.get('source', 'unknown')
         page = doc.metadata.get('page', 'unknown')
         doc_type = doc.metadata.get('type', 'unknown')
-        if source not in page_info:
-            page_info[source] = {'pages': set(), 'types': set()}
-        page_info[source]['pages'].add(page)
-        page_info[source]['types'].add(doc_type)
-    for source, info in page_info.items():
-        max_page = max(info['pages']) if info['pages'] and isinstance(max(info['pages']), int) else 'unknown'
-        log(f"  {os.path.basename(source)}: {max_page} pages, type: {info['types']}")
-    chunks = split_documents(docs)
-    log("E5-Large-Instruct embedding preparation")
-    embedding_model = HuggingFaceEmbeddings(
-        model_name="intfloat/e5-large-v2",
-        model_kwargs={"device": "cuda"}
-    )
-    vectorstore = FAISS.from_documents(chunks, embedding_model)
-    vectorstore.save_local("vector_db")
-    log(f"Total number of documents: {len(docs)}")
-    log(f"Total number of chunks: {len(chunks)}")
-    log("FAISS save complete: vector_db")
-    # Sample output with page information
-    log("\nSample including actual page information:")
-    for i, chunk in enumerate(chunks[:5]):
-        meta = chunk.metadata
-        log(f"  Chunk {i+1}: {meta.get('type')} | Page {meta.get('page')} | {os.path.basename(meta.get('source', 'unknown'))}")

 try:
     import fitz  # PyMuPDF
     PYMUPDF_AVAILABLE = True
+    print("PyMuPDF library available")
 except ImportError:
     PYMUPDF_AVAILABLE = False
+    print("PyMuPDF library is not installed. Install with: pip install PyMuPDF")
 # PDF processing utilities
 import pytesseract
 # Main Execution
 # --------------------------------
+def build_rag_chain(llm, vectorstore, language="en", k=7):  # returns the RetrievalQA chain built below
+    """Build RAG Chain"""
+    question_prompt, refine_prompt = create_refine_prompts_with_pages(language)  # prompt pair for `language`; helper is defined outside this hunk
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="refine",
+        retriever=vectorstore.as_retriever(search_kwargs={"k": k}),  # k is forwarded to the retriever's search_kwargs
+        chain_type_kwargs={
+            "question_prompt": question_prompt,
+            "refine_prompt": refine_prompt
+        },
+        return_source_documents=True  # ask_question_with_pages reads result["source_documents"]
+    )
+    return qa_chain
+def ask_question_with_pages(qa_chain, question):  # prints the answer plus a per-file source summary, then returns the raw chain result
+    """Process questions"""
+    result = qa_chain({"query": question})  # the chain is called directly with a {"query": ...} dict
+    # Extract only the text after A: from the result
+    answer = result['result']
+    final_answer = answer.split("A:")[-1].strip() if "A:" in answer else answer.strip()  # keeps the text after the LAST "A:" when one is present
+    print(f"\nQuestion: {question}")
+    print(f"\nFinal Answer: {final_answer}")
+    # Metadata debugging info (disabled)
+    # debug_metadata_info(result["source_documents"])
+    # Organize reference documents by page
+    print("\nReference Document Summary:")
+    source_info = {}  # filename -> {'pages', 'sections', 'types', 'total_pages'}
+    for doc in result["source_documents"]:
         source = doc.metadata.get('source', 'unknown')
         page = doc.metadata.get('page', 'unknown')
         doc_type = doc.metadata.get('type', 'unknown')
+        section = doc.metadata.get('section', None)
+        total_pages = doc.metadata.get('total_pages', None)
+        filename = doc.metadata.get('filename', 'unknown')
+        if filename == 'unknown':
+            filename = os.path.basename(source) if source != 'unknown' else 'unknown'  # fall back to the basename of the source path
+        if filename not in source_info:
+            source_info[filename] = {
+                'pages': set(),
+                'sections': set(),
+                'types': set(),
+                'total_pages': total_pages  # NOTE(review): taken from the first chunk seen for this file; later chunks do not update it
+            }
+        if page != 'unknown':
+            if isinstance(page, str) and page.startswith('section'):  # string pages starting with 'section' are grouped with sections
+                source_info[filename]['sections'].add(page)
+            else:
+                source_info[filename]['pages'].add(page)
+        if section is not None:
+            source_info[filename]['sections'].add(f"section {section}")
+        source_info[filename]['types'].add(doc_type)
+    # Result output
+    total_chunks = len(result["source_documents"])
+    print(f"Total chunks used: {total_chunks}")
+    for filename, info in source_info.items():
+        print(f"\n- {filename}")
+        # Total page count information
+        if info['total_pages']:
+            print(f"  Total page count: {info['total_pages']}")
+        # Page information output
+        if info['pages']:
+            pages_list = list(info['pages'])  # NOTE(review): set iteration order, not sorted (sections below are sorted)
+            print(f"  Pages: {', '.join(map(str, pages_list))}")
+        # Section information output
+        if info['sections']:
+            sections_list = sorted(list(info['sections']))
+            print(f"  Sections: {', '.join(sections_list)}")
+        # If no pages or sections are present
+        if not info['pages'] and not info['sections']:
+            print(f"  Pages: No information")
+        # Output document type
+        types_str = ', '.join(sorted(info['types']))
+        print(f"  Type: {types_str}")
+    return result
+# Existing ask_question function is replaced with ask_question_with_pages
+def ask_question(qa_chain, question):
+    """Wrapper function for compatibility"""
+    return ask_question_with_pages(qa_chain, question)  # pure delegation: same arguments, same return value
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="RAG refine system (supports page numbers)")  # NOTE(review): no argparse import is visible in this diff view — confirm it exists at file top
+    parser.add_argument("--vector_store", type=str, default="vector_db", help="Vector store path")
+    parser.add_argument("--model", type=str, default="LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct", help="LLM model ID")  # NOTE(review): args.model is never passed to load_llama_model() below
+    parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"], help="Device to use")
+    parser.add_argument("--k", type=int, default=7, help="Number of documents to retrieve")
+    parser.add_argument("--language", type=str, default="en", choices=["ko", "en"], help="Language to use")
+    parser.add_argument("--query", type=str, help="Question (runs interactive mode if not provided)")
+    args = parser.parse_args()
+    embeddings = get_embeddings(device=args.device)  # helper defined outside this hunk
+    vectorstore = load_vector_store(embeddings, load_path=args.vector_store)  # helper defined outside this hunk
+    llm = load_llama_model()  # helper defined outside this hunk; called with no arguments
+    from rag_system import build_rag_chain, ask_question_with_pages  # Added so the new ask_question_with_pages code can be used from the console. NOTE(review): this imports the file's own module name; both functions are already defined above.
+    qa_chain = build_rag_chain(llm, vectorstore, language=args.language, k=args.k)
+    print("RAG system with page number support ready!")
+    if args.query:
+        ask_question_with_pages(qa_chain, args.query)  # one-shot mode: answer the --query value once
+    else:
+        print("Starting interactive mode (enter 'exit', 'quit' to finish)")
+        while True:
+            try:
+                query = input("Question: ").strip()
+                if query.lower() in ["exit", "quit"]:
+                    break
+                if query:  # Prevent empty input
+                    ask_question_with_pages(qa_chain, query)
+            except KeyboardInterrupt:
+                print("\n\nExiting program.")
+                break
+            except Exception as e:  # report the error and keep the interactive loop running
+                print(f"Error occurred: {e}\nPlease try again.")