Commit c67f04e
Parent(s): 4ed1b4f

done with version 2

Files changed:
- app.py +54 -4
- requirements.txt +3 -1
- web2json/__pycache__/ai_extractor.cpython-311.pyc +0 -0
- web2json/__pycache__/pipeline.cpython-311.pyc +0 -0
- web2json/__pycache__/preprocessor.cpython-311.pyc +0 -0
- web2json/ai_extractor.py +240 -6
- web2json/pipeline.py +1 -1
- web2json/preprocessor.py +78 -3
app.py
CHANGED

@@ -3,7 +3,7 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor,
+from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
@@ -170,16 +170,66 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str,
     - Preserve the original formatting and context where relevant
     - Return the extracted data in the format specified by the schema"""
 
+    classification_prompt_template = """
+    # HTML Chunk Relevance Classification Prompt
+
+    You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
+
+    ## Instructions:
+    1. Carefully examine the provided HTML chunk
+    2. Compare it against the given schema/criteria
+    3. Determine if the HTML chunk contains content that matches or is relevant to the schema
+    4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
+
+    ## Input Format:
+    **Schema/Criteria:**
+    {schema}
+
+    **HTML Chunk:**
+    ```html
+    {content}
+    ```
+
+    ## Output Format:
+    Your response must be ONLY a valid JSON object with no additional text:
+
+    ```json
+    {{
+        "relevant": 1
+    }}
+    ```
+
+    OR
+
+    ```json
+    {{
+        "relevant": 0
+    }}
+    ```
+
+    ## Classification Rules:
+    - Output 1 if the HTML chunk contains content that matches the schema criteria
+    - Output 0 if the HTML chunk does not contain relevant content
+    - Consider semantic meaning, not just exact keyword matches
+    - Look at text content, attributes, structure, and context
+    - Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
+    - Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
+    - Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
+    - The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
+
+    CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
+    """
     # Initialize pipeline components
     # TODO: improve the RAG system and optimize (don't instantiate every time)
-    preprocessor = BasicPreprocessor(config={'keep_tags':
+    preprocessor = BasicPreprocessor(config={'keep_tags': True})
     try:
-        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}
 
     # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
-    ai_extractor =
+    ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
     postprocessor = PostProcessor()
     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
 
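Net effect of the app.py changes: the Gemini client is swapped for NVIDIA's OpenAI-compatible endpoint, and extraction now goes through the new classifier-based extractor. Below is a minimal sketch of wiring the same components outside Gradio; the schema model and the stand-in template strings are illustrative, and since the Pipeline entry point isn't visible in this commit, the sketch stops at construction:

```python
import os
from pydantic import BaseModel, Field
from web2json.preprocessor import BasicPreprocessor
from web2json.postprocessor import PostProcessor
from web2json.ai_extractor import LLMClassifierExtractor, NvidiaLLMClient
from web2json.pipeline import Pipeline

# Hypothetical schema: the extractor only needs schema.model_json_schema(),
# so any Pydantic model works here.
class ProductSchema(BaseModel):
    name: str = Field(description="Product name")
    price: str = Field(description="Price as displayed on the page")

llm = NvidiaLLMClient(config={
    "api_key": os.getenv("NVIDIA_API_KEY"),
    "model_name": "qwen/qwen2.5-7b-instruct",
})

extractor = LLMClassifierExtractor(
    llm_client=llm,
    # Stand-ins for the app's full prompt strings; both must expose
    # {schema} and {content} placeholders, which extract() formats.
    prompt_template="Schema:\n{schema}\n\nContent:\n{content}",
    classifier_prompt='Schema:\n{schema}\n\nHTML:\n{content}\n\nAnswer {{"relevant": 1}} or {{"relevant": 0}}.',
)

pipeline = Pipeline(BasicPreprocessor(config={"keep_tags": True}), extractor, PostProcessor())
```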
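The classification template above is filled in once per HTML chunk, and the model's reply is parsed back by extract_markdown_json (added in ai_extractor.py below). A small round-trip check with a simulated model reply, assuming the classification_prompt_template string from the hunk above is in scope:

```python
from pydantic import BaseModel
from web2json.ai_extractor import extract_markdown_json

class TitleSchema(BaseModel):
    title: str

# classify_chunks fills the template exactly like this:
prompt = classification_prompt_template.format(
    content="<h1>Widget</h1>",
    schema=TitleSchema.model_json_schema(),
)

# Simulated model reply: the prompt demands bare JSON, but the parser
# only matches a ```json fence, so a fenced reply parses...
assert extract_markdown_json('```json\n{"relevant": 1}\n```') == {"relevant": 1}

# ...while a bare reply yields None, which classify_chunks treats as
# "keep the chunk" by defaulting to relevant: 1.
assert extract_markdown_json('{"relevant": 1}') is None
```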
requirements.txt
CHANGED

@@ -10,4 +10,6 @@ json_repair
 numpy
 langchain
 langchain-text-splitters
-sentence-transformers
+sentence-transformers
+openai
+html_chunking
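Of the two new dependencies, openai backs the NvidiaLLMClient added below and html_chunking provides get_html_chunks, which drives chunk-level classification. A quick sketch with made-up HTML, using the same keyword arguments the commit passes in chunk_content:

```python
from html_chunking import get_html_chunks

html = "<html><body>" + "<p>item</p>" * 200 + "</body></html>"

# max_tokens caps each chunk's size, is_clean_html pre-cleans the markup,
# and attr_cutoff_len truncates long attribute values (mirroring the call
# in LLMClassifierExtractor.chunk_content).
chunks = get_html_chunks(html=html, max_tokens=500, is_clean_html=True, attr_cutoff_len=5)
print(f"{len(chunks)} chunks")
```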
web2json/__pycache__/ai_extractor.cpython-311.pyc
CHANGED

Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ

web2json/__pycache__/pipeline.cpython-311.pyc
CHANGED

Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ

web2json/__pycache__/preprocessor.cpython-311.pyc
CHANGED

Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ
web2json/ai_extractor.py
CHANGED

@@ -1,14 +1,22 @@
 import os
-
+import time
+import numpy as np
 from google import genai
+from openai import OpenAI
+import time
+import random
+from openai import RateLimitError
+from functools import wraps
 from google.genai import types
 from pydantic import BaseModel
-
-from
-import
+from concurrent.futures import ThreadPoolExecutor
+from html_chunking import get_html_chunks
+from abc import ABC, abstractmethod
+from typing import List, Any, Dict, Tuple, Optional
+import re
+import json
 from langchain_text_splitters import HTMLHeaderTextSplitter
 from sentence_transformers import SentenceTransformer
-
 class LLMClient(ABC):
     """
     Abstract base class for calling LLM APIs.
@@ -96,7 +104,130 @@ class GeminiLLMClient(LLMClient):
         # Combine all output parts into a single string
         return response.text
 
-
+def extract_markdown_json(text: str) -> Optional[Dict[str, Any]]:
+    """
+    Find the first Markdown ```json ...``` block in `text`,
+    parse it as JSON, and return the resulting dict.
+    Returns None if no valid JSON block is found.
+    """
+    # 1) Look specifically for a ```json code fence
+    fence_match = re.search(
+        r"```json\s*(\{.*?\})\s*```",
+        text,
+        re.DOTALL | re.IGNORECASE
+    )
+    if not fence_match:
+        return None
+
+    json_str = fence_match.group(1)
+    try:
+        return json.loads(json_str)
+    except json.JSONDecodeError:
+        return None
+
+def retry_on_ratelimit(max_retries=5, base_delay=1.0, max_delay=10.0):
+    def deco(fn):
+        @wraps(fn)
+        def wrapped(*args, **kwargs):
+            delay = base_delay
+            for attempt in range(max_retries):
+                try:
+                    return fn(*args, **kwargs)
+                except RateLimitError:
+                    if attempt == max_retries - 1:
+                        # give up
+                        raise
+                    # back off + jitter
+                    sleep = min(max_delay, delay) + random.uniform(0, delay)
+                    time.sleep(sleep)
+                    delay *= 2
+            # unreachable
+        return wrapped
+    return deco
+class NvidiaLLMClient(LLMClient):
+    """
+    Concrete implementation of LLMClient for the NVIDIA API (non-streaming).
+    """
+
+    def __init__(self, config: dict):
+        """
+        Initializes the NvidiaLLMClient with an API key, model name, and optional generation settings.
+
+        Args:
+            config (dict): Configuration containing:
+                - 'api_key': (optional) API key for NVIDIA (falls back to NVIDIA_API_KEY env var)
+                - 'model_name': (optional) the model to use (default 'google/gemma-3-1b-it')
+                - 'generation_config': (optional) dict of generation parameters like temperature, top_p, etc.
+        """
+        api_key = config.get("api_key") or os.environ.get("NVIDIA_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "API key for NVIDIA must be provided in config['api_key'] or NVIDIA_API_KEY env var."
+            )
+
+        self.client = OpenAI(
+            base_url="https://integrate.api.nvidia.com/v1",
+            api_key=api_key
+        )
+        self.model_name = config.get("model_name", "google/gemma-3-1b-it")
+
+        # Store generation settings with sensible defaults
+        gen_conf = config.get("generation_config", {})
+        self.temperature = gen_conf.get("temperature", 0.1)
+        self.top_p = gen_conf.get("top_p", 0.7)
+        self.max_tokens = gen_conf.get("max_tokens", 512)
+
+    def set_model(self, model_name: str):
+        """
+        Set the model name for the NVIDIA API client.
+
+        Args:
+            model_name (str): The name of the model to use.
+        """
+        self.model_name = model_name
+
+    @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
+    def call_api(self, prompt: str) -> str:
+        """
+        Call the NVIDIA API with the given prompt (non-streaming).
+
+        Args:
+            prompt (str): The input text for the API.
+
+        Returns:
+            str: The generated text from the NVIDIA API.
+        """
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            top_p=self.top_p,
+            max_tokens=self.max_tokens
+            # stream is omitted (defaults to False)
+        )
+        # print("DONE")
+        # For the standard (non-streaming) response:
+        # choices[0].message.content holds the generated text
+        return response.choices[0].message.content
+
+    def call_batch(self, prompts, max_workers=8):
+        """
+        Parallel batch with isolated errors: each prompt that still
+        fails after retries will raise, but others succeed.
+        """
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+        results = [None] * len(prompts)
+        with ThreadPoolExecutor(max_workers=max_workers) as ex:
+            futures = {ex.submit(self.call_api, p): i for i, p in enumerate(prompts)}
+            for fut in as_completed(futures):
+                idx = futures[fut]
+                try:
+                    results[idx] = fut.result()
+                except RateLimitError:
+                    # You could set results[idx] = None or a default string
+                    results[idx] = "<failed after retries>"
+        return results
+
 
 class AIExtractor:
     def __init__(self, llm_client: LLMClient, prompt_template: str):
@@ -127,6 +258,109 @@ class AIExtractor:
         # print(f"Generated prompt: {prompt}")
         response = self.llm_client.call_api(prompt)
         return response
+
+class LLMClassifierExtractor(AIExtractor):
+    """
+    Extractor that uses an LLM to classify and extract structured information from text content.
+    This class is designed to handle classification tasks where the LLM generates structured output based on a provided schema.
+    """
+    def __init__(self, llm_client: LLMClient, prompt_template: str, classifier_prompt: str):
+        """
+        Initializes the LLMClassifierExtractor with an LLM client and a prompt template.
+
+        Args:
+            llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
+            prompt_template (str): The template to use for generating prompts for the LLM.
+        """
+        super().__init__(llm_client, prompt_template)
+        self.classifier_prompt = classifier_prompt
+
+    def chunk_content(self, content: str, max_tokens: int = 500, is_clean: bool = True) -> List[str]:
+        """
+        Splits the content into manageable chunks for processing.
+
+        Args:
+            content (str): The raw content to be chunked.
+
+        Returns:
+            List[str]: A list of text chunks.
+        """
+        # Use the get_html_chunks function to split the content into chunks
+        return get_html_chunks(html=content, max_tokens=max_tokens, is_clean_html=is_clean, attr_cutoff_len=5)
+
+    def classify_chunks(self, chunks: List[str], schema: BaseModel) -> List[Dict[str, Any]]:
+        """
+        Classifies each chunk using the LLM based on the provided schema.
+
+        Args:
+            chunks (List[str]): A list of text chunks to classify.
+            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries containing classified information.
+        """
+        prompts = [self.classifier_prompt.format(content=chunk, schema=schema.model_json_schema()) for chunk in chunks]
+        classified_chunks = []
+        responses = self.llm_client.call_batch(prompts)
+        for response in responses:
+            # extract the json from the response
+            json_data = extract_markdown_json(response)
+            if json_data:
+                classified_chunks.append(json_data)
+            else:
+                classified_chunks.append({
+                    "error": "Failed to extract JSON from response",
+                    "relevant": 1,
+                })
+        return classified_chunks
+
+    def extract(self, content: str, schema: BaseModel) -> str:
+        """
+        Extracts structured information from the given content based on the provided schema.
+
+        Args:
+            content (str): The raw content to extract information from.
+            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+        Returns:
+            str: The structured JSON object as a string.
+        """
+        # Chunk the HTML
+        chunks = self.chunk_content(content, max_tokens=1500)
+        print(f"Content successfully chunked into {len(chunks)} pieces.")
+        # Classify each chunk using the LLM
+        classified_chunks = self.classify_chunks(chunks, schema)
+        # Concatenate the positive classified chunks into a single string
+        print(f"Classified {classified_chunks} chunks.")
+        positive_chunks = []
+        for i, chunk in enumerate(classified_chunks):
+            if chunk.get("relevant", 0) > 0:
+                positive_chunks.append(chunks[i])
+        if len(positive_chunks) == 0:
+            positive_chunks = chunks
+        filtered_content = "\n\n".join(positive_chunks)
+        print(f"Filtered content for extraction: {filtered_content}")
+        if not filtered_content:
+            print("Warning: No relevant chunks found. Returning empty response.")
+            return "{}"
+        # Generate the final prompt for extraction
+        prompt = self.prompt_template.format(content=filtered_content, schema=schema.model_json_schema())
+        print(f"Generated prompt for extraction: {prompt[:500]}...")
+        # Call the LLM to extract structured information
+        llm_response = self.llm_client.call_api(prompt)
+        print(f"LLM response: {llm_response[:500]}...")
+        # Return the structured response
+        if not llm_response:
+            print("Warning: LLM response is empty. Returning empty response.")
+            return "{}"
+
+        # json_response = extract_markdown_json(llm_response)
+        # if json_response is None:
+        #     print("Warning: Failed to extract JSON from LLM response. Returning empty response.")
+        #     return "{}"
+
+        return llm_response
 
 # TODO: RAGExtractor class
 class RAGExtractor(AIExtractor):
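NvidiaLLMClient wraps NVIDIA's OpenAI-compatible endpoint: call_api is decorated with retry_on_ratelimit (exponential backoff plus jitter on RateLimitError), and call_batch fans prompts out over a thread pool while isolating per-prompt failures. A usage sketch; the model choice and prompts are illustrative and an NVIDIA_API_KEY is assumed to be set:

```python
import os
from web2json.ai_extractor import NvidiaLLMClient

client = NvidiaLLMClient(config={
    "api_key": os.getenv("NVIDIA_API_KEY"),
    "model_name": "qwen/qwen2.5-7b-instruct",
    "generation_config": {"temperature": 0.1, "top_p": 0.7, "max_tokens": 512},
})

# Single call: retries transparently on RateLimitError, up to 6 attempts.
print(client.call_api("Say hello in one word."))

# Batch: results come back in input order; a prompt that is still
# rate-limited after all retries yields the sentinel string
# "<failed after retries>" instead of raising.
answers = client.call_batch(["2+2?", "Capital of France?"], max_workers=4)
print(answers)
```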
web2json/pipeline.py
CHANGED

@@ -27,7 +27,7 @@ class Pipeline:
         """
         # Step 1: Preprocess the content
         preprocessed_content = self.preprocessor.preprocess(content, is_url)
-        print(f"Preprocessed content: {preprocessed_content
+        print(f"Preprocessed content: {preprocessed_content}...")
         print('+'*80)
         # Step 2: Extract structured information using AI
         extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
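This hunk only repairs a truncated print statement, but it shows the Pipeline's flow: preprocess, then extract. A sketch of the surrounding method's likely shape; the method name and the postprocess step are assumptions, since neither is visible in this diff:

```python
# Hypothetical reconstruction of the method around this hunk; only the
# preprocess and extract lines are confirmed by the commit.
class PipelineSketch:
    def __init__(self, preprocessor, ai_extractor, postprocessor):
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema):
        # Step 1: Preprocess the content (shown in the diff)
        preprocessed_content = self.preprocessor.preprocess(content, is_url)
        # Step 2: Extract structured information using AI (shown in the diff)
        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
        # Step 3: Post-process (assumed; method name hypothetical)
        return self.postprocessor.process(extracted_data)
```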
web2json/preprocessor.py
CHANGED

@@ -4,6 +4,74 @@ from bs4 import BeautifulSoup , Comment
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
 
+class HTMLCleaner:
+    DEFAULT_REMOVE_TAGS = [
+        "script", "style"
+    ]
+
+    def __init__(self, config: dict = None):
+        self.config = config or {}
+        # allow custom tags to remove
+        self.remove_tags = set(self.DEFAULT_REMOVE_TAGS) | set(self.config.get("extra_remove_tags", []))
+
+    def _clean_html(self, html_content: str) -> str:
+        """
+        Cleans up the given HTML content by:
+        - Removing specified tags and their content.
+        - Stripping HTML comments.
+        - Optionally stripping out all attributes.
+        - Optionally flattening hyperlinks.
+        - Removing empty tags.
+        - Extracting and returning cleaned HTML or visible text.
+
+        Args:
+            html_content (str): The HTML content to clean.
+
+        Returns:
+            str: The cleaned HTML (if keep_tags=True) or normalized text.
+        """
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove unwanted tags entirely
+        for tag_name in self.remove_tags:
+            for tag in soup.find_all(tag_name):
+                tag.decompose()
+
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        # Strip attributes if requested
+        if self.config.get("strip_attrs", False):
+            for tag in soup.find_all(True):
+                tag.attrs = {}
+
+        # Flatten hyperlinks if requested
+        if self.config.get("strip_links", False):
+            for a in soup.find_all('a'):
+                a.replace_with(a.get_text())
+
+        # Remove empty tags (no text and no non-empty children)
+        for tag in soup.find_all(True):
+            if not tag.get_text(strip=True):
+                tag.decompose()
+
+        # Convert soup to HTML string if preserving tags
+        if self.config.get('keep_tags', False):
+            html_str = str(soup)
+            # Remove any empty lines
+            html_str = re.sub(r'(?m)^[ \t]*\n', '', html_str)
+            return html_str.strip()
+
+        # Extract visible text
+        text = soup.get_text(separator="\n", strip=True)
+        # Remove empty lines
+        lines = [line for line in text.splitlines() if line.strip()]
+        clean_text = "\n".join(lines)
+        # Normalize whitespace within lines
+        clean_text = re.sub(r'\s+', ' ', clean_text)
+
+        return clean_text.strip()
 
 class Preprocessor(ABC):
     """
@@ -136,9 +204,16 @@ class BasicPreprocessor(Preprocessor):
 
 
         # Clean the HTML content
-        cleaned_content = self._clean_html(html_content)
-
-
+        # cleaned_content = self._clean_html(html_content)
+        cleaner = HTMLCleaner({
+            'keep_tags': True if self.config.get('keep_tags', False) else False,
+            'strip_attrs': True,
+            'strip_links': True,
+            'extra_remove_tags': ['header', 'footer']
+        })
+        clean = cleaner._clean_html(html_content=html_content)
+
+        return clean.strip()  # Return the cleaned text content, stripped of leading/trailing whitespace
 
 
 
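HTMLCleaner centralizes the cleanup that BasicPreprocessor previously did inline via _clean_html. A small usage sketch mirroring the config BasicPreprocessor now passes; the sample HTML is made up and the expected output is approximate:

```python
from web2json.preprocessor import HTMLCleaner

html = """
<html><head><script>track()</script><style>p{color:red}</style></head>
<body>
  <header>Site nav</header>
  <!-- a comment -->
  <p class="x"><a href="/item">Widget</a> costs $9.99</p>
  <div></div>
  <footer>(c) 2024</footer>
</body></html>
"""

cleaner = HTMLCleaner({
    'keep_tags': True,            # return cleaned HTML rather than visible text
    'strip_attrs': True,          # drop class/id/href/etc.
    'strip_links': True,          # replace <a> with its text
    'extra_remove_tags': ['header', 'footer'],
})
print(cleaner._clean_html(html_content=html))
# Approximate result: <html><body><p>Widget costs $9.99</p></body></html>
# (script/style/header/footer gone, comment stripped, empty div and the
# now-empty head removed, link flattened to its text)
```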