Commit 2ae29dd
Parent(s): 1f9040d
Reranker for NVIDIA and HF
- .txt +0 -0
- requirements.txt +2 -0
- test.ipynb +0 -0
- web2json/__pycache__/ai_extractor.cpython-311.pyc +0 -0
- web2json/__pycache__/pipeline.cpython-311.pyc +0 -0
- web2json/__pycache__/postprocessor.cpython-311.pyc +0 -0
- web2json/__pycache__/preprocessor.cpython-311.pyc +0 -0
- web2json/ai_extractor.py +120 -60
- web2json/pipeline.py +4 -4
.txt
ADDED
File without changes
requirements.txt
CHANGED
@@ -13,4 +13,6 @@ langchain-text-splitters
 sentence-transformers
 openai
 html_chunking
+langchain_nvidia_ai_endpoints
+langchain_core
 lxml
test.ipynb
CHANGED
The diff for this file is too large to render. See raw diff.
web2json/__pycache__/ai_extractor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ

web2json/__pycache__/pipeline.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ

web2json/__pycache__/postprocessor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/postprocessor.cpython-311.pyc and b/web2json/__pycache__/postprocessor.cpython-311.pyc differ

web2json/__pycache__/preprocessor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ
web2json/ai_extractor.py
CHANGED
@@ -11,12 +11,16 @@ from google.genai import types
 from pydantic import BaseModel
 from concurrent.futures import ThreadPoolExecutor
 from html_chunking import get_html_chunks
+from langchain_nvidia_ai_endpoints import NVIDIARerank
+from langchain_core.documents import Document
 from abc import ABC, abstractmethod
 from typing import List, Any, Dict, Tuple, Optional
 import re
 import json
 from langchain_text_splitters import HTMLHeaderTextSplitter
 from sentence_transformers import SentenceTransformer
+import requests
+
 class LLMClient(ABC):
     """
     Abstract base class for calling LLM APIs.
@@ -227,6 +231,93 @@ class NvidiaLLMClient(LLMClient):
                 # You could set results[idx] = None or a default string
                 results[idx] = f"<failed after retries>"
         return results
+
+
+class NvidiaRerankerClient(LLMClient):
+    """
+    Concrete implementation of LLMClient for the NVIDIA API (non-streaming).
+    """
+
+    def __init__(self, config: dict):
+        self.model_name = config.get("model_name", "nvidia/llama-3.2-nv-rerankqa-1b-v2")
+        self.client = NVIDIARerank(
+            model=self.model_name,
+            api_key=os.getenv("NVIDIA_API_KEY"),
+        )
+
+    def set_model(self, model_name: str):
+        """
+        Set the model name for the NVIDIA API client.
+
+        Args:
+            model_name (str): The name of the model to use.
+        """
+        self.model_name = model_name
+
+    @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
+    def call_api(self, prompt: str) -> str:
+        pass
+
+    def call_batch(self, prompts, max_workers=8):
+        pass
+
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from typing import List, Dict
+
+
+class HFRerankerClient(LLMClient):
+    """
+    Hugging Face Reranker client using Qwen/Qwen1.5-MoE-A14B-Chat reranking style (0.6B variant).
+    """
+
+    def __init__(self, model_name: str = "Qwen/Qwen3-Reranker-0.6B", device: str = None):
+        """
+        Initialize the Hugging Face reranker.
+        """
+        self.model_name = model_name
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name).to(self.device)
+        self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
+        self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
+
+    def rerank(self, query: str, passages: List[str], top_k: int = 3) -> List[str]:
+        """
+        Rerank passages based on relevance to query.
+
+        Args:
+            query (str): Query string.
+            passages (List[str]): List of passages.
+            top_k (int): Number of top passages to return.
+
+        Returns:
+            List[str]: Top-k most relevant passages.
+        """
+        inputs = [self.tokenizer(f"{query} [SEP] {p}", return_tensors="pt", truncation=True, padding=True).to(self.device) for p in passages]
+        scores = []
+
+        with torch.no_grad():
+            for inp in inputs:
+                logits = self.model(**inp).logits
+                score = torch.softmax(logits, dim=1)[0, 1].item()  # probability of relevance
+                scores.append(score)
+
+        print(f"Scores for passages: {scores}")
+
+        top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
+        print(f"top indices: {top_indices}")
+        return [passages[i] for i in top_indices]
+
+    @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
+    def call_api(self, prompt: str) -> str:
+        pass
+
+    def call_batch(self, prompts, max_workers=8):
+        pass
 
 
 class AIExtractor:
@@ -264,7 +355,7 @@ class LLMClassifierExtractor(AIExtractor):
     Extractor that uses an LLM to classify and extract structured information from text content.
     This class is designed to handle classification tasks where the LLM generates structured output based on a provided schema.
     """
-    def __init__(self, llm_client: LLMClient, prompt_template: str, classifier_prompt: str, ):
+    def __init__(self, reranker: LLMClient, llm_client: LLMClient, prompt_template: str, classifier_prompt: str, ):
         """
         Initializes the LLMClassifierExtractor with an LLM client and a prompt template.
 
@@ -273,6 +364,7 @@ class LLMClassifierExtractor(AIExtractor):
             prompt_template (str): The template to use for generating prompts for the LLM.
         """
         super().__init__(llm_client, prompt_template)
+        self.reranker = reranker
         self.classifier_prompt = classifier_prompt
 
     def chunk_content(self, content: str , max_tokens: int = 500, is_clean: bool = True) -> List[str]:
@@ -288,79 +380,47 @@ class LLMClassifierExtractor(AIExtractor):
         # Use the get_html_chunks function to split the content into chunks
         return get_html_chunks(html=content, max_tokens=max_tokens, is_clean_html=is_clean, attr_cutoff_len=5)
 
-
-    def classify_chunks(self, chunks: List[str], schema: BaseModel) -> List[Dict[str, Any]]:
-        """
-        Classifies each chunk using the LLM based on the provided schema.
 
-
-
-            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+    def classify_chunks(self, passages, top_k=3, hf: bool = False): # reranker
+        query = self.classifier_prompt
 
-
-
-
-        prompts = [self.classifier_prompt.format(content=chunk, schema=schema.model_json_schema()) for chunk in chunks]
-        classified_chunks = []
-        responses = self.llm_client.call_batch(prompts)
-        for response in responses:
-            # extract the json from the response
-            json_data = extract_markdown_json(response)
-            if json_data:
-                classified_chunks.append(json_data)
-            else:
-                classified_chunks.append({
-                    "error": "Failed to extract JSON from response",
-                    "relevant": 1,
-                })
-        return classified_chunks
+        if hf:
+            print("Using Hugging Face reranker for classification.")
+            return self.reranker.rerank(query, passages, top_k=top_k)
 
-
+        # NVIDIA reranker path
+        responses = self.reranker.client.compress_documents(
+            query=query,
+            documents=[Document(page_content=passage) for passage in passages]
+        )
+        return [response.page_content for response in responses[:top_k]]
+
+    def extract(self, content, schema, hf: bool = False):
         """
         Extracts structured information from the given content based on the provided schema.
 
         Args:
             content (str): The raw content to extract information from.
             schema (BaseModel): A Pydantic model defining the structure of the expected output.
-
-        Returns:
-            str: The structured JSON object as a string.
+            hf (bool): Whether to use the Hugging Face reranker or NVIDIA (default).
         """
-
-
-        print(f"Content successfully chunked
-
-
-
-        print(f"Classified {classified_chunks} chunks.")
-        positive_chunks = []
-        for i, chunk in enumerate(classified_chunks):
-            if chunk.get("relevant", 0) > 0:
-                positive_chunks.append(chunks[i])
-        if len(positive_chunks) == 0:
-            positive_chunks = chunks
-        filtered_content = "\n\n".join(positive_chunks)
-        print(f"Filtered content for extraction: {filtered_content}") # Log the first 500 characters of filtered content
+        chunks = self.chunk_content(content, max_tokens=1500)
+        print(f"Content successfully chunked into {len(chunks)}.")
+        print(f"Content successfully chunked: {chunks}")
+        classified_chunks = self.classify_chunks(chunks, hf=hf) # conditional reranker
+        filtered_content = "\n\n".join(classified_chunks)
+
         if not filtered_content:
             print("Warning: No relevant chunks found. Returning empty response.")
             return "{}"
-
+
         prompt = self.prompt_template.format(content=filtered_content, schema=schema.model_json_schema())
-        print(f"Generated prompt for extraction: {prompt[:500]}...")
-        # Call the LLM to extract structured information
+        # print(f"Generated prompt for extraction: {prompt[:500]}...")
         llm_response = self.llm_client.call_api(prompt)
-        print(f"LLM response: {llm_response[:500]}...")
-
-
-
-        return "{}"
-
-        # json_response = extract_markdown_json(llm_response)
-        # if json_response is None:
-        #     print("Warning: Failed to extract JSON from LLM response. Returning empty response.")
-        #     return "{}"
-
-        return llm_response
+        # print(f"LLM response: {llm_response[:500]}...")
+
+        return llm_response or "{}"
+
 
 # TODO: RAGExtractor class
 class RAGExtractor(AIExtractor):
@@ -486,7 +546,7 @@ class RAGExtractor(AIExtractor):
 
         if not query:
             query = f"Extract information based on the following JSON schema: {schema.model_json_schema()}"
-            print(f"No explicit query provided for retrieval. Using default: '{query[:100]}...'")
+            # print(f"No explicit query provided for retrieval. Using default: '{query[:100]}...'")
 
         chunks = self._langchain_HHTS(content)
         print(f"Content successfully chunked into {len(chunks)} pieces.")
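For context, a minimal wiring sketch (not part of this commit) showing how the new reranker clients plug into LLMClassifierExtractor. The schema, prompt strings, and model names are illustrative, and it assumes the existing NvidiaLLMClient also accepts a config dict (its constructor is not shown in this diff).

# Hypothetical usage sketch; anything marked "assumed" or "example" is not defined in this diff.
from pydantic import BaseModel
from web2json.ai_extractor import (
    NvidiaLLMClient,        # existing generator client; config shape assumed
    NvidiaRerankerClient,   # new: NVIDIA rerank endpoint
    HFRerankerClient,       # new: local Hugging Face reranker
    LLMClassifierExtractor,
)

class Product(BaseModel):   # example schema, not from the repo
    name: str
    price: str

prompt_template = "Extract JSON matching this schema:\n{schema}\n\nContent:\n{content}"
classifier_prompt = "Passages describing the product name and price."

llm = NvidiaLLMClient({"model_name": "meta/llama-3.1-8b-instruct"})                    # assumed config shape
reranker = NvidiaRerankerClient({"model_name": "nvidia/llama-3.2-nv-rerankqa-1b-v2"})
# reranker = HFRerankerClient()  # local alternative; pass hf=True to extract()

extractor = LLMClassifierExtractor(reranker, llm, prompt_template, classifier_prompt)
html_content = "<html>...</html>"  # page to extract from
print(extractor.extract(html_content, Product, hf=False))

Note the design change in this commit: classify_chunks no longer asks the LLM for per-chunk relevance labels; it returns the top-k reranked chunks directly, with classifier_prompt reused as the rerank query.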
web2json/pipeline.py
CHANGED
@@ -13,7 +13,7 @@ class Pipeline:
         self.ai_extractor = ai_extractor
         self.postprocessor = postprocessor
 
-    def run(self, content: str, is_url: bool, schema:BaseModel) -> dict:
+    def run(self, content: str, is_url: bool, schema:BaseModel, hf=False) -> dict:
         """
         Run the entire pipeline: preprocess, extract, and postprocess.
 
@@ -27,11 +27,11 @@ class Pipeline:
         """
         # Step 1: Preprocess the content
         preprocessed_content = self.preprocessor.preprocess(content, is_url)
-        print(f"Preprocessed content: {preprocessed_content}...")
+        # print(f"Preprocessed content: {preprocessed_content}...")
         print('+'*80)
         # Step 2: Extract structured information using AI
-        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
-        print(f"Extracted data: {extracted_data[:100]}...")
+        extracted_data = self.ai_extractor.extract(preprocessed_content, schema, hf=hf)
+        # print(f"Extracted data: {extracted_data[:100]}...")
         print('+'*80)
         # Step 3: Post-process the extracted data
         final_output = self.postprocessor.process(extracted_data)
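A matching end-to-end call through the pipeline might look like the sketch below; the Preprocessor, PostProcessor, and Pipeline constructor signatures are assumed from the module layout and are not shown in this diff.

# Hypothetical end-to-end run; the constructor names and keyword arguments below are assumptions.
from web2json.preprocessor import Preprocessor      # assumed class name
from web2json.postprocessor import PostProcessor    # assumed class name
from web2json.pipeline import Pipeline

pipeline = Pipeline(preprocessor=Preprocessor(),    # assumed keyword arguments
                    ai_extractor=extractor,         # the LLMClassifierExtractor from the sketch above
                    postprocessor=PostProcessor())

# hf=False (default) routes chunk classification through the NVIDIA reranker endpoint;
# hf=True uses the local Hugging Face reranker instead.
result = pipeline.run("https://example.com/product", is_url=True, schema=Product, hf=False)
print(result)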