Commit c67f04e
Parent(s): 4ed1b4f

done with version 2

Files changed:
- app.py +54 -4
- requirements.txt +3 -1
- web2json/__pycache__/ai_extractor.cpython-311.pyc +0 -0
- web2json/__pycache__/pipeline.cpython-311.pyc +0 -0
- web2json/__pycache__/preprocessor.cpython-311.pyc +0 -0
- web2json/ai_extractor.py +240 -6
- web2json/pipeline.py +1 -1
- web2json/preprocessor.py +78 -3
app.py
CHANGED

@@ -3,7 +3,7 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor,
+from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
@@ -170,16 +170,66 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
     - Preserve the original formatting and context where relevant
     - Return the extracted data in the format specified by the schema"""
 
+    classification_prompt_template = """
+    # HTML Chunk Relevance Classification Prompt
+
+    You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
+
+    ## Instructions:
+    1. Carefully examine the provided HTML chunk
+    2. Compare it against the given schema/criteria
+    3. Determine if the HTML chunk contains content that matches or is relevant to the schema
+    4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
+
+    ## Input Format:
+    **Schema/Criteria:**
+    {schema}
+
+    **HTML Chunk:**
+    ```html
+    {content}
+    ```
+
+    ## Output Format:
+    Your response must be ONLY a valid JSON object with no additional text:
+
+    ```json
+    {{
+        "relevant": 1
+    }}
+    ```
+
+    OR
+
+    ```json
+    {{
+        "relevant": 0
+    }}
+    ```
+
+    ## Classification Rules:
+    - Output 1 if the HTML chunk contains content that matches the schema criteria
+    - Output 0 if the HTML chunk does not contain relevant content
+    - Consider semantic meaning, not just exact keyword matches
+    - Look at text content, attributes, structure, and context
+    - Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
+    - Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
+    - Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
+    - The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
+
+    CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
+    """
     # Initialize pipeline components
     # TODO: improve the RAG system and optimize (don't instantiate every time)
-    preprocessor = BasicPreprocessor(config={'keep_tags':
+    preprocessor = BasicPreprocessor(config={'keep_tags': True})
     try:
-        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}
 
     # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
-    ai_extractor =
+    ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
     postprocessor = PostProcessor()
     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
 
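Net effect of the app.py changes: the Gemini client is swapped for NVIDIA's OpenAI-compatible endpoint, and extraction now goes through the new classifier-based extractor. Below is a minimal sketch of wiring the same components outside Gradio; the schema model and the stand-in template strings are illustrative, and since the Pipeline entry point isn't visible in this commit, the sketch stops at construction:

```python
import os
from pydantic import BaseModel, Field
from web2json.preprocessor import BasicPreprocessor
from web2json.postprocessor import PostProcessor
from web2json.ai_extractor import LLMClassifierExtractor, NvidiaLLMClient
from web2json.pipeline import Pipeline

# Hypothetical schema: the extractor only needs schema.model_json_schema(),
# so any Pydantic model works here.
class ProductSchema(BaseModel):
    name: str = Field(description="Product name")
    price: str = Field(description="Price as displayed on the page")

llm = NvidiaLLMClient(config={
    "api_key": os.getenv("NVIDIA_API_KEY"),
    "model_name": "qwen/qwen2.5-7b-instruct",
})

extractor = LLMClassifierExtractor(
    llm_client=llm,
    # Stand-ins for the app's full prompt strings; both must expose
    # {schema} and {content} placeholders, which extract() formats.
    prompt_template="Schema:\n{schema}\n\nContent:\n{content}",
    classifier_prompt='Schema:\n{schema}\n\nHTML:\n{content}\n\nAnswer {{"relevant": 1}} or {{"relevant": 0}}.',
)

pipeline = Pipeline(BasicPreprocessor(config={"keep_tags": True}), extractor, PostProcessor())
```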
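The classification template above is filled in once per HTML chunk, and the model's reply is parsed back by extract_markdown_json (added in ai_extractor.py below). A small round-trip check with a simulated model reply, assuming the classification_prompt_template string from the hunk above is in scope:

```python
from pydantic import BaseModel
from web2json.ai_extractor import extract_markdown_json

class TitleSchema(BaseModel):
    title: str

# classify_chunks fills the template exactly like this:
prompt = classification_prompt_template.format(
    content="<h1>Widget</h1>",
    schema=TitleSchema.model_json_schema(),
)

# Simulated model reply: the prompt demands bare JSON, but the parser
# only matches a ```json fence, so a fenced reply parses...
assert extract_markdown_json('```json\n{"relevant": 1}\n```') == {"relevant": 1}

# ...while a bare reply yields None, which classify_chunks treats as
# "keep the chunk" by defaulting to relevant: 1.
assert extract_markdown_json('{"relevant": 1}') is None
```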
requirements.txt
CHANGED

@@ -10,4 +10,6 @@ json_repair
 numpy
 langchain
 langchain-text-splitters
-sentence-transformers
+sentence-transformers
+openai
+html_chunking
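Of the two new dependencies, openai backs the NvidiaLLMClient added below and html_chunking provides get_html_chunks, which drives chunk-level classification. A quick sketch with made-up HTML, using the same keyword arguments the commit passes in chunk_content:

```python
from html_chunking import get_html_chunks

html = "<html><body>" + "<p>item</p>" * 200 + "</body></html>"

# max_tokens caps each chunk's size, is_clean_html pre-cleans the markup,
# and attr_cutoff_len truncates long attribute values (mirroring the call
# in LLMClassifierExtractor.chunk_content).
chunks = get_html_chunks(html=html, max_tokens=500, is_clean_html=True, attr_cutoff_len=5)
print(f"{len(chunks)} chunks")
```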
web2json/__pycache__/ai_extractor.cpython-311.pyc
CHANGED

Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ

web2json/__pycache__/pipeline.cpython-311.pyc
CHANGED

Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ

web2json/__pycache__/preprocessor.cpython-311.pyc
CHANGED

Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ
web2json/ai_extractor.py
CHANGED

@@ -1,14 +1,22 @@
 import os
-
+import time
+import numpy as np
 from google import genai
+from openai import OpenAI
+import time
+import random
+from openai import RateLimitError
+from functools import wraps
 from google.genai import types
 from pydantic import BaseModel
-
-from
-import
+from concurrent.futures import ThreadPoolExecutor
+from html_chunking import get_html_chunks
+from abc import ABC, abstractmethod
+from typing import List, Any, Dict, Tuple, Optional
+import re
+import json
 from langchain_text_splitters import HTMLHeaderTextSplitter
 from sentence_transformers import SentenceTransformer
-
 class LLMClient(ABC):
     """
     Abstract base class for calling LLM APIs.
@@ -96,7 +104,130 @@ class GeminiLLMClient(LLMClient):
         # Combine all output parts into a single string
         return response.text
 
-
+def extract_markdown_json(text: str) -> Optional[Dict[str, Any]]:
+    """
+    Find the first Markdown ```json ...``` block in `text`,
+    parse it as JSON, and return the resulting dict.
+    Returns None if no valid JSON block is found.
+    """
+    # 1) Look specifically for a ```json code fence
+    fence_match = re.search(
+        r"```json\s*(\{.*?\})\s*```",
+        text,
+        re.DOTALL | re.IGNORECASE
+    )
+    if not fence_match:
+        return None
+
+    json_str = fence_match.group(1)
+    try:
+        return json.loads(json_str)
+    except json.JSONDecodeError:
+        return None
+
+def retry_on_ratelimit(max_retries=5, base_delay=1.0, max_delay=10.0):
+    def deco(fn):
+        @wraps(fn)
+        def wrapped(*args, **kwargs):
+            delay = base_delay
+            for attempt in range(max_retries):
+                try:
+                    return fn(*args, **kwargs)
+                except RateLimitError:
+                    if attempt == max_retries - 1:
+                        # give up
+                        raise
+                    # back off + jitter
+                    sleep = min(max_delay, delay) + random.uniform(0, delay)
+                    time.sleep(sleep)
+                    delay *= 2
+            # unreachable
+        return wrapped
+    return deco
+class NvidiaLLMClient(LLMClient):
+    """
+    Concrete implementation of LLMClient for the NVIDIA API (non-streaming).
+    """
+
+    def __init__(self, config: dict):
+        """
+        Initializes the NvidiaLLMClient with an API key, model name, and optional generation settings.
+
+        Args:
+            config (dict): Configuration containing:
+                - 'api_key': (optional) API key for NVIDIA (falls back to NVIDIA_API_KEY env var)
+                - 'model_name': (optional) the model to use (default 'google/gemma-3-1b-it')
+                - 'generation_config': (optional) dict of generation parameters like temperature, top_p, etc.
+        """
+        api_key = config.get("api_key") or os.environ.get("NVIDIA_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "API key for NVIDIA must be provided in config['api_key'] or NVIDIA_API_KEY env var."
+            )
+
+        self.client = OpenAI(
+            base_url="https://integrate.api.nvidia.com/v1",
+            api_key=api_key
+        )
+        self.model_name = config.get("model_name", "google/gemma-3-1b-it")
+
+        # Store generation settings with sensible defaults
+        gen_conf = config.get("generation_config", {})
+        self.temperature = gen_conf.get("temperature", 0.1)
+        self.top_p = gen_conf.get("top_p", 0.7)
+        self.max_tokens = gen_conf.get("max_tokens", 512)
+
+    def set_model(self, model_name: str):
+        """
+        Set the model name for the NVIDIA API client.
+
+        Args:
+            model_name (str): The name of the model to use.
+        """
+        self.model_name = model_name
+
+    @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
+    def call_api(self, prompt: str) -> str:
+        """
+        Call the NVIDIA API with the given prompt (non-streaming).
+
+        Args:
+            prompt (str): The input text for the API.
+
+        Returns:
+            str: The generated text from the NVIDIA API.
+        """
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            top_p=self.top_p,
+            max_tokens=self.max_tokens
+            # stream is omitted (defaults to False)
+        )
+        # print("DONE")
+        # For the standard (non-streaming) response:
+        # choices[0].message.content holds the generated text
+        return response.choices[0].message.content
+
+    def call_batch(self, prompts, max_workers=8):
+        """
+        Parallel batch with isolated errors: each prompt that still
+        fails after retries will raise, but others succeed.
+        """
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+        results = [None] * len(prompts)
+        with ThreadPoolExecutor(max_workers=max_workers) as ex:
+            futures = {ex.submit(self.call_api, p): i for i, p in enumerate(prompts)}
+            for fut in as_completed(futures):
+                idx = futures[fut]
+                try:
+                    results[idx] = fut.result()
+                except RateLimitError:
+                    # You could set results[idx] = None or a default string
+                    results[idx] = "<failed after retries>"
+        return results
+
 
 class AIExtractor:
     def __init__(self, llm_client: LLMClient, prompt_template: str):
@@ -127,6 +258,109 @@ class AIExtractor:
         # print(f"Generated prompt: {prompt}")
         response = self.llm_client.call_api(prompt)
         return response
+
+class LLMClassifierExtractor(AIExtractor):
+    """
+    Extractor that uses an LLM to classify and extract structured information from text content.
+    This class is designed to handle classification tasks where the LLM generates structured output based on a provided schema.
+    """
+    def __init__(self, llm_client: LLMClient, prompt_template: str, classifier_prompt: str):
+        """
+        Initializes the LLMClassifierExtractor with an LLM client and a prompt template.
+
+        Args:
+            llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
+            prompt_template (str): The template to use for generating prompts for the LLM.
+        """
+        super().__init__(llm_client, prompt_template)
+        self.classifier_prompt = classifier_prompt
+
+    def chunk_content(self, content: str, max_tokens: int = 500, is_clean: bool = True) -> List[str]:
+        """
+        Splits the content into manageable chunks for processing.
+
+        Args:
+            content (str): The raw content to be chunked.
+
+        Returns:
+            List[str]: A list of text chunks.
+        """
+        # Use the get_html_chunks function to split the content into chunks
+        return get_html_chunks(html=content, max_tokens=max_tokens, is_clean_html=is_clean, attr_cutoff_len=5)
+
+    def classify_chunks(self, chunks: List[str], schema: BaseModel) -> List[Dict[str, Any]]:
+        """
+        Classifies each chunk using the LLM based on the provided schema.
+
+        Args:
+            chunks (List[str]): A list of text chunks to classify.
+            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries containing classified information.
+        """
+        prompts = [self.classifier_prompt.format(content=chunk, schema=schema.model_json_schema()) for chunk in chunks]
+        classified_chunks = []
+        responses = self.llm_client.call_batch(prompts)
+        for response in responses:
+            # extract the json from the response
+            json_data = extract_markdown_json(response)
+            if json_data:
+                classified_chunks.append(json_data)
+            else:
+                classified_chunks.append({
+                    "error": "Failed to extract JSON from response",
+                    "relevant": 1,
+                })
+        return classified_chunks
+
+    def extract(self, content: str, schema: BaseModel) -> str:
+        """
+        Extracts structured information from the given content based on the provided schema.
+
+        Args:
+            content (str): The raw content to extract information from.
+            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+        Returns:
+            str: The structured JSON object as a string.
+        """
+        # Chunk the HTML
+        chunks = self.chunk_content(content, max_tokens=1500)
+        print(f"Content successfully chunked into {len(chunks)} pieces.")
+        # Classify each chunk using the LLM
+        classified_chunks = self.classify_chunks(chunks, schema)
+        # Concatenate the positive classified chunks into a single string
+        print(f"Classified {classified_chunks} chunks.")
+        positive_chunks = []
+        for i, chunk in enumerate(classified_chunks):
+            if chunk.get("relevant", 0) > 0:
+                positive_chunks.append(chunks[i])
+        if len(positive_chunks) == 0:
+            positive_chunks = chunks
+        filtered_content = "\n\n".join(positive_chunks)
+        print(f"Filtered content for extraction: {filtered_content}")
+        if not filtered_content:
+            print("Warning: No relevant chunks found. Returning empty response.")
+            return "{}"
+        # Generate the final prompt for extraction
+        prompt = self.prompt_template.format(content=filtered_content, schema=schema.model_json_schema())
+        print(f"Generated prompt for extraction: {prompt[:500]}...")
+        # Call the LLM to extract structured information
+        llm_response = self.llm_client.call_api(prompt)
+        print(f"LLM response: {llm_response[:500]}...")
+        # Return the structured response
+        if not llm_response:
+            print("Warning: LLM response is empty. Returning empty response.")
+            return "{}"
+
+        # json_response = extract_markdown_json(llm_response)
+        # if json_response is None:
+        #     print("Warning: Failed to extract JSON from LLM response. Returning empty response.")
+        #     return "{}"
+
+        return llm_response
 
 # TODO: RAGExtractor class
 class RAGExtractor(AIExtractor):
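NvidiaLLMClient wraps NVIDIA's OpenAI-compatible endpoint: call_api is decorated with retry_on_ratelimit (exponential backoff plus jitter on RateLimitError), and call_batch fans prompts out over a thread pool while isolating per-prompt failures. A usage sketch; the model choice and prompts are illustrative and an NVIDIA_API_KEY is assumed to be set:

```python
import os
from web2json.ai_extractor import NvidiaLLMClient

client = NvidiaLLMClient(config={
    "api_key": os.getenv("NVIDIA_API_KEY"),
    "model_name": "qwen/qwen2.5-7b-instruct",
    "generation_config": {"temperature": 0.1, "top_p": 0.7, "max_tokens": 512},
})

# Single call: retries transparently on RateLimitError, up to 6 attempts.
print(client.call_api("Say hello in one word."))

# Batch: results come back in input order; a prompt that is still
# rate-limited after all retries yields the sentinel string
# "<failed after retries>" instead of raising.
answers = client.call_batch(["2+2?", "Capital of France?"], max_workers=4)
print(answers)
```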
web2json/pipeline.py
CHANGED

@@ -27,7 +27,7 @@ class Pipeline:
         """
         # Step 1: Preprocess the content
         preprocessed_content = self.preprocessor.preprocess(content, is_url)
-        print(f"Preprocessed content: {preprocessed_content
+        print(f"Preprocessed content: {preprocessed_content}...")
         print('+'*80)
         # Step 2: Extract structured information using AI
         extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
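This hunk only repairs a truncated print statement, but it shows the Pipeline's flow: preprocess, then extract. A sketch of the surrounding method's likely shape; the method name and the postprocess step are assumptions, since neither is visible in this diff:

```python
# Hypothetical reconstruction of the method around this hunk; only the
# preprocess and extract lines are confirmed by the commit.
class PipelineSketch:
    def __init__(self, preprocessor, ai_extractor, postprocessor):
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema):
        # Step 1: Preprocess the content (shown in the diff)
        preprocessed_content = self.preprocessor.preprocess(content, is_url)
        # Step 2: Extract structured information using AI (shown in the diff)
        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
        # Step 3: Post-process (assumed; method name hypothetical)
        return self.postprocessor.process(extracted_data)
```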
web2json/preprocessor.py
CHANGED

@@ -4,6 +4,74 @@ from bs4 import BeautifulSoup , Comment
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
 
+class HTMLCleaner:
+    DEFAULT_REMOVE_TAGS = [
+        "script", "style"
+    ]
+
+    def __init__(self, config: dict = None):
+        self.config = config or {}
+        # allow custom tags to remove
+        self.remove_tags = set(self.DEFAULT_REMOVE_TAGS) | set(self.config.get("extra_remove_tags", []))
+
+    def _clean_html(self, html_content: str) -> str:
+        """
+        Cleans up the given HTML content by:
+        - Removing specified tags and their content.
+        - Stripping HTML comments.
+        - Optionally stripping out all attributes.
+        - Optionally flattening hyperlinks.
+        - Removing empty tags.
+        - Extracting and returning cleaned HTML or visible text.
+
+        Args:
+            html_content (str): The HTML content to clean.
+
+        Returns:
+            str: The cleaned HTML (if keep_tags=True) or normalized text.
+        """
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove unwanted tags entirely
+        for tag_name in self.remove_tags:
+            for tag in soup.find_all(tag_name):
+                tag.decompose()
+
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        # Strip attributes if requested
+        if self.config.get("strip_attrs", False):
+            for tag in soup.find_all(True):
+                tag.attrs = {}
+
+        # Flatten hyperlinks if requested
+        if self.config.get("strip_links", False):
+            for a in soup.find_all('a'):
+                a.replace_with(a.get_text())
+
+        # Remove empty tags (no text and no non-empty children)
+        for tag in soup.find_all(True):
+            if not tag.get_text(strip=True):
+                tag.decompose()
+
+        # Convert soup to HTML string if preserving tags
+        if self.config.get('keep_tags', False):
+            html_str = str(soup)
+            # Remove any empty lines
+            html_str = re.sub(r'(?m)^[ \t]*\n', '', html_str)
+            return html_str.strip()
+
+        # Extract visible text
+        text = soup.get_text(separator="\n", strip=True)
+        # Remove empty lines
+        lines = [line for line in text.splitlines() if line.strip()]
+        clean_text = "\n".join(lines)
+        # Normalize whitespace within lines
+        clean_text = re.sub(r'\s+', ' ', clean_text)
+
+        return clean_text.strip()
 
 class Preprocessor(ABC):
     """
@@ -136,9 +204,16 @@ class BasicPreprocessor(Preprocessor):
 
 
         # Clean the HTML content
-        cleaned_content = self._clean_html(html_content)
-
-
+        # cleaned_content = self._clean_html(html_content)
+        cleaner = HTMLCleaner({
+            'keep_tags': True if self.config.get('keep_tags', False) else False,
+            'strip_attrs': True,
+            'strip_links': True,
+            'extra_remove_tags': ['header', 'footer']
+        })
+        clean = cleaner._clean_html(html_content=html_content)
+
+        return clean.strip()  # Return the cleaned text content, stripped of leading/trailing whitespace
 
 
 
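HTMLCleaner centralizes the cleanup that BasicPreprocessor previously did inline via _clean_html. A small usage sketch mirroring the config BasicPreprocessor now passes; the sample HTML is made up and the expected output is approximate:

```python
from web2json.preprocessor import HTMLCleaner

html = """
<html><head><script>track()</script><style>p{color:red}</style></head>
<body>
  <header>Site nav</header>
  <!-- a comment -->
  <p class="x"><a href="/item">Widget</a> costs $9.99</p>
  <div></div>
  <footer>(c) 2024</footer>
</body></html>
"""

cleaner = HTMLCleaner({
    'keep_tags': True,            # return cleaned HTML rather than visible text
    'strip_attrs': True,          # drop class/id/href/etc.
    'strip_links': True,          # replace <a> with its text
    'extra_remove_tags': ['header', 'footer'],
})
print(cleaner._clean_html(html_content=html))
# Approximate result: <html><body><p>Widget costs $9.99</p></body></html>
# (script/style/header/footer gone, comment stripped, empty div and the
# now-empty head removed, link flattened to its text)
```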