only1ahmed committed
Commit a8e3cab · verified · 1 Parent(s): 3228c1d

Upload 6 files

Files changed (6)
  1. .gitattributes +35 -35
  2. .gitignore +7 -0
  3. .txt +0 -0
  4. README.md +15 -13
  5. app.py +308 -0
  6. requirements.txt +27 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
+ .env
+ *.ipynb
+ venv
+ *.csv
+ *.json
+ *.jsonl
+ vllm*
.txt ADDED
File without changes
README.md CHANGED
@@ -1,13 +1,15 @@
- ---
- title: Personal
- emoji: 🏆
- colorFrom: green
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.35.0
- app_file: app.py
- pinned: false
- short_description: personal projects
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: MCP Server Web2JSON
+ emoji: 🖇️
+ colorFrom: blue
+ colorTo: green
+ sdk: gradio
+ sdk_version: 5.33.0
+ app_file: app.py
+ pinned: true
+ tags: [mcp-server-track]
+ ---
+
+ [Video overview of the agent demo](https://youtu.be/wd0kjOVoGn8)
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,308 @@
+ import json
+ import pandas as pd
+ import gradio as gr
+ from typing import Dict, Any, Type
+ from web2json.preprocessor import BasicPreprocessor
+ from web2json.ai_extractor import AIExtractor, LLMClassifierExtractor, NvidiaLLMClient, NvidiaRerankerClient, ModalRerankerClient
+ from web2json.postprocessor import PostProcessor
+ from web2json.pipeline import Pipeline
+ from pydantic import BaseModel, Field, create_model
+ import os
+ import dotenv
+ import random
+ import numpy as np
+ import torch
+
+ dotenv.load_dotenv()
+
+ def seed_everything(seed=42):
+     """Seed Python, NumPy, and PyTorch RNGs for reproducible runs."""
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)  # if using multi-GPU
+
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ seed_everything(22)
+
+ def parse_schema_input(schema_input: str) -> Type[BaseModel]:
+     """
+     Convert user schema input to a Pydantic BaseModel.
+     Supports multiple input formats:
+     1. JSON schema format
+     2. Python class definition
+     3. Simple field definitions
+     """
+     schema_input = schema_input.strip()
+
+     if not schema_input:
+         # Default schema if none provided
+         return create_model('DefaultSchema',
+                             title=(str, Field(description="Title of the content")),
+                             content=(str, Field(description="Main content")))
+
+     try:
+         # Try parsing as JSON schema
+         if schema_input.startswith('{'):
+             schema_dict = json.loads(schema_input)
+             return json_schema_to_basemodel(schema_dict)
+         # Try parsing as Python class definition
+         elif 'class ' in schema_input and 'BaseModel' in schema_input:
+             return python_class_to_basemodel(schema_input)
+         # Try parsing as simple field definitions
+         else:
+             return simple_fields_to_basemodel(schema_input)
+     except Exception as e:
+         raise ValueError(f"Could not parse schema: {str(e)}. Please check your schema format.")
+
+ def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
+     """Convert a JSON schema to a BaseModel."""
+     fields = {}
+     properties = schema_dict.get('properties', {})
+     required = schema_dict.get('required', [])
+
+     for field_name, field_info in properties.items():
+         field_type = get_python_type(field_info.get('type', 'string'))
+         field_description = field_info.get('description', '')
+
+         if field_name in required:
+             fields[field_name] = (field_type, Field(description=field_description))
+         else:
+             fields[field_name] = (field_type, Field(default=None, description=field_description))
+
+     return create_model('DynamicSchema', **fields)
+
+ def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
+     """Convert a Python class definition to a BaseModel."""
+     try:
+         # Execute the class definition in a restricted namespace
+         namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
+                      'float': float, 'bool': bool, 'list': list, 'dict': dict}
+         exec(class_definition, namespace)
+
+         # Find the class that inherits from BaseModel
+         for name, obj in namespace.items():
+             if (isinstance(obj, type) and
+                     issubclass(obj, BaseModel) and
+                     obj != BaseModel):
+                 return obj
+
+         raise ValueError("No BaseModel class found in definition")
+     except Exception as e:
+         raise ValueError(f"Invalid Python class definition: {str(e)}")
+
+ def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
+     """Convert simple field definitions to a BaseModel."""
+     fields = {}
+
+     for line in fields_text.strip().split('\n'):
+         line = line.strip()
+         if not line or line.startswith('#'):
+             continue
+
+         # Parse field definition (e.g., "name: str = description")
+         if ':' in line:
+             parts = line.split(':', 1)
+             field_name = parts[0].strip()
+
+             type_and_desc = parts[1].strip()
+             if '=' in type_and_desc:
+                 type_part, desc_part = type_and_desc.split('=', 1)
+                 field_type = get_python_type(type_part.strip())
+                 description = desc_part.strip().strip('"\'')
+             else:
+                 field_type = get_python_type(type_and_desc.strip())
+                 description = ""
+
+             fields[field_name] = (field_type, Field(description=description))
+         else:
+             # Simple field name only
+             field_name = line.strip()
+             fields[field_name] = (str, Field(description=""))
+
+     if not fields:
+         raise ValueError("No valid fields found in schema definition")
+
+     return create_model('DynamicSchema', **fields)
+
+ def get_python_type(type_str: str):
+     """Convert a type string to a Python type."""
+     type_str = type_str.lower().strip()
+     type_mapping = {
+         'string': str, 'str': str,
+         'integer': int, 'int': int,
+         'number': float, 'float': float,
+         'boolean': bool, 'bool': bool,
+         'array': list, 'list': list,
+         'object': dict, 'dict': dict
+     }
+     return type_mapping.get(type_str, str)
+
+ def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
+     """Wrapper that converts the schema input to a BaseModel before extraction."""
+     try:
+         # Parse the schema input into a BaseModel
+         schema_model = parse_schema_input(schema_input)
+
+         # Call the original function
+         return webpage_to_json(content, is_url, schema_model)
+
+     except Exception as e:
+         return {"error": f"Schema parsing error: {str(e)}"}
+
+ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
+     """
+     Extract structured JSON information from the given content based on a specified schema.
+     This function sets up a processing pipeline that includes:
+     - Preprocessing the input content.
+     - Using an AI language model to extract information according to the provided schema.
+     - Postprocessing the extracted output to match the exact schema requirements.
+     Parameters:
+         content (str): The input content to analyze; either raw text or a URL.
+         is_url (bool): Whether the provided content is a URL (True) or raw text (False).
+         schema (BaseModel): A Pydantic BaseModel defining the expected structure and data types of the output.
+     Returns:
+         Dict[str, Any]: A dictionary containing the extracted data matching the schema. If initialization
+         or processing fails, the dictionary includes an "error" key with a descriptive message.
+     """
+     prompt_template = """Extract the following information from the provided content according to the specified schema.
+
+ Content to analyze:
+ {content}
+
+ Schema requirements:
+ {schema}
+
+ Instructions:
+ - Extract only information that is explicitly present in the content
+ - Follow the exact structure and data types specified in the schema
+ - If a required field cannot be found, indicate this clearly
+ - Preserve the original formatting and context where relevant
+ - Return the extracted data in the format specified by the schema
+ - Stick strictly to the schema; do not add or invent fields
+ - If the schema asks for an array, extract one whenever the content contains matching items"""
+
+     # The schema's own JSON form doubles as the classifier/reranker prompt
+     classification_prompt_template = schema.model_json_schema()
+     # Initialize pipeline components
+     # TODO: improve the RAG system and optimize (don't instantiate every time)
+     preprocessor = BasicPreprocessor(config={'keep_tags': True})
+     try:
+         # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+         llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'google/gemma-3n-e2b-it'})
+         # reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'nv-rerank-qa-mistral-4b:1'})
+         reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-rerank.modal.run")
+     except Exception as e:
+         return {"error": f"Failed to initialize LLM client: {str(e)}"}
+
+     # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
+     ai_extractor = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
+     postprocessor = PostProcessor()
+     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
+
+     try:
+         result = pipeline.run(content, is_url, schema)
+         print("-" * 80)
+         print(f"Processed result: {result}")
+         return result
+     except Exception as e:
+         return {"error": f"Processing error: {str(e)}"}
+
+ # Example schemas for the user
+ example_schemas = """
+ **Example Schema Formats:**
+
+ 1. **Simple field definitions:**
+ ```
+ title: str = Page title
+ price: float = Product price
+ description: str = Product description
+ available: bool = Is available
+ ```
+
+ 2. **JSON Schema:**
+ ```json
+ {
+     "properties": {
+         "title": {"type": "string", "description": "Page title"},
+         "price": {"type": "number", "description": "Product price"},
+         "description": {"type": "string", "description": "Product description"}
+     },
+     "required": ["title"]
+ }
+ ```
+
+ 3. **Python Class Definition:**
+ ```python
+ class ProductSchema(BaseModel):
+     title: str = Field(description="Product title")
+     price: float = Field(description="Product price")
+     description: str = Field(description="Product description")
+     available: bool = Field(default=False, description="Availability status")
+ ```
+ """
+
+ # Build Gradio Interface
+ demo = gr.Interface(
+     fn=webpage_to_json_wrapper,
+     inputs=[
+         gr.Textbox(
+             label="Content (URL or Raw Text)",
+             lines=10,
+             placeholder="Enter URL or paste raw HTML/text here."
+         ),
+         gr.Checkbox(label="Content is URL?", value=False),
+         gr.Textbox(
+             label="Schema Definition",
+             lines=15,
+             placeholder="Define your extraction schema (see examples below)",
+             info=example_schemas
+         )
+     ],
+     outputs=gr.JSON(label="Output JSON"),
+     title="Webpage to JSON Converter",
+     description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
+     examples=[
+         [
+             "https://example.com",
+             True,
+             "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
+         ],
+         [
+             "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
+             False,
+             '''{
+                 "type": "object",
+                 "properties": {
+                     "title": {"type": "string", "description": "Name of the product"},
+                     "price": {"type": "number", "description": "Price of the product"},
+                     "description": {"type": "string", "description": "Detailed description of the product"},
+                     "availability": {"type": "boolean", "description": "Whether the product is in stock (true) or not (false)"}
+                 },
+                 "required": ["title", "price"]
+             }'''
+         ]
+     ]
+ )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
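
Review note: a minimal sketch of how the new schema parser behaves, assuming `app.py` is importable as `app` with its dependencies installed (importing it runs the module-level setup: `dotenv.load_dotenv()`, seeding, and building the `gr.Interface`). Everything called below is defined in the diff above.

```python
# Sketch only: exercise the three input formats parse_schema_input accepts.
from app import parse_schema_input

# 1. Simple field definitions ("name: type = description", one per line)
Simple = parse_schema_input("title: str = Page title\nprice: float = Product price")

# 2. JSON schema: required fields stay required, the rest default to None
Json = parse_schema_input(
    '{"properties": {"title": {"type": "string", "description": "Page title"}}, '
    '"required": ["title"]}'
)

# 3. Python class definition: exec'd, and the BaseModel subclass is returned
Cls = parse_schema_input(
    "class Product(BaseModel):\n"
    "    title: str = Field(description='Product title')"
)

# Each call returns a pydantic model class (pydantic v2 API shown)
print(Simple.model_json_schema())
```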
requirements.txt ADDED
@@ -0,0 +1,27 @@
+ pandas
+ gradio
+ gradio[mcp]
+ pydantic
+ python-dotenv
+ beautifulsoup4
+ requests
+ google-genai
+ json_repair
+ numpy
+ langchain
+ langchain-text-splitters
+ sentence-transformers
+ openai
+ html_chunking
+ langchain_nvidia_ai_endpoints
+ langchain_core
+ lxml
+ pdfkit
+ html2text
+ inscriptis
+ trafilatura
+ markdownify
+ beautifulsoup4
+ readabilipy
+ docling
+ htmlrag
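
A closing cross-file note: `gradio[mcp]` here pairs with `demo.launch(mcp_server=True)` at the bottom of app.py, which per Gradio's MCP support should expose `webpage_to_json_wrapper` as an MCP tool alongside the web UI (presumably the point of the `mcp-server-track` tag added to the README). A hedged sketch of calling that entry point directly; it assumes a valid `NVIDIA_API_KEY` in `.env`, the `web2json` package on the path, and network access to the Modal reranker endpoint:

```python
# Sketch only: call the Gradio/MCP entry point directly.
from app import webpage_to_json_wrapper

result = webpage_to_json_wrapper(
    "<h1>Sample Product</h1><p>Price: $29.99</p>",                # raw HTML
    False,                                                        # not a URL
    "title: str = Product title\nprice: float = Product price",   # simple schema
)
print(result)  # schema-shaped dict, or {"error": ...} on failure
```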