only1ahmed committed
Commit a8e3cab · verified · 1 Parent(s): 3228c1d

Upload 6 files

Files changed (6)
  1. .gitattributes +35 -35
  2. .gitignore +7 -0
  3. .txt +0 -0
  4. README.md +15 -13
  5. app.py +308 -0
  6. requirements.txt +27 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
+ .env
+ *.ipynb
+ venv
+ *.csv
+ *.json
+ *.jsonl
+ vllm*
.txt ADDED
File without changes
README.md CHANGED
@@ -1,13 +1,15 @@
- ---
- title: Personal
- emoji: 🏆
- colorFrom: green
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.35.0
- app_file: app.py
- pinned: false
- short_description: personal projects
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: MCP Server Web2JSON
+ emoji: 🖇️
+ colorFrom: blue
+ colorTo: green
+ sdk: gradio
+ sdk_version: 5.33.0
+ app_file: app.py
+ pinned: true
+ tags: [mcp-server-track]
+ ---
+
+ [Video overview of the agent demo](https://youtu.be/wd0kjOVoGn8)
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,308 @@
+ import json
+ import pandas as pd
+ import gradio as gr
+ from typing import Dict, Any, Type
+ from web2json.preprocessor import BasicPreprocessor
+ from web2json.ai_extractor import AIExtractor, LLMClassifierExtractor, NvidiaLLMClient, NvidiaRerankerClient, ModalRerankerClient
+ from web2json.postprocessor import PostProcessor
+ from web2json.pipeline import Pipeline
+ from pydantic import BaseModel, Field, create_model
+ import os
+ import dotenv
+ import random
+ import numpy as np
+ import torch
+
+ dotenv.load_dotenv()
+
+ def seed_everything(seed=42):
+     """Seed Python, NumPy, and PyTorch RNGs for reproducible runs."""
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)  # if using multi-GPU
+
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ seed_everything(22)
+
+ def parse_schema_input(schema_input: str) -> Type[BaseModel]:
+     """
+     Convert user schema input to a Pydantic BaseModel.
+     Supports multiple input formats:
+     1. JSON schema format
+     2. Python class definition
+     3. Simple field definitions
+     """
+     schema_input = schema_input.strip()
+
+     if not schema_input:
+         # Default schema if none provided
+         return create_model('DefaultSchema',
+                             title=(str, Field(description="Title of the content")),
+                             content=(str, Field(description="Main content")))
+
+     try:
+         # Try parsing as JSON schema
+         if schema_input.startswith('{'):
+             schema_dict = json.loads(schema_input)
+             return json_schema_to_basemodel(schema_dict)
+         # Try parsing as Python class definition
+         elif 'class ' in schema_input and 'BaseModel' in schema_input:
+             return python_class_to_basemodel(schema_input)
+         # Try parsing as simple field definitions
+         else:
+             return simple_fields_to_basemodel(schema_input)
+     except Exception as e:
+         raise ValueError(f"Could not parse schema: {str(e)}. Please check your schema format.")
+
+ def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
+     """Convert a JSON schema to a BaseModel."""
+     fields = {}
+     properties = schema_dict.get('properties', {})
+     required = schema_dict.get('required', [])
+
+     for field_name, field_info in properties.items():
+         field_type = get_python_type(field_info.get('type', 'string'))
+         field_description = field_info.get('description', '')
+
+         if field_name in required:
+             fields[field_name] = (field_type, Field(description=field_description))
+         else:
+             fields[field_name] = (field_type, Field(default=None, description=field_description))
+
+     return create_model('DynamicSchema', **fields)
+
+ def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
+     """Convert a Python class definition to a BaseModel."""
+     try:
+         # Execute the class definition in a restricted namespace
+         namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
+                      'float': float, 'bool': bool, 'list': list, 'dict': dict}
+         exec(class_definition, namespace)
+
+         # Find the class that inherits from BaseModel
+         for name, obj in namespace.items():
+             if (isinstance(obj, type) and
+                     issubclass(obj, BaseModel) and
+                     obj != BaseModel):
+                 return obj
+
+         raise ValueError("No BaseModel class found in definition")
+     except Exception as e:
+         raise ValueError(f"Invalid Python class definition: {str(e)}")
+
+ def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
+     """Convert simple field definitions to a BaseModel."""
+     fields = {}
+
+     for line in fields_text.strip().split('\n'):
+         line = line.strip()
+         if not line or line.startswith('#'):
+             continue
+
+         # Parse field definition (e.g., "name: str = description")
+         if ':' in line:
+             parts = line.split(':', 1)
+             field_name = parts[0].strip()
+
+             type_and_desc = parts[1].strip()
+             if '=' in type_and_desc:
+                 type_part, desc_part = type_and_desc.split('=', 1)
+                 field_type = get_python_type(type_part.strip())
+                 description = desc_part.strip().strip('"\'')
+             else:
+                 field_type = get_python_type(type_and_desc.strip())
+                 description = ""
+
+             fields[field_name] = (field_type, Field(description=description))
+         else:
+             # Simple field name only
+             field_name = line.strip()
+             fields[field_name] = (str, Field(description=""))
+
+     if not fields:
+         raise ValueError("No valid fields found in schema definition")
+
+     return create_model('DynamicSchema', **fields)
+
+ def get_python_type(type_str: str):
+     """Convert a type string to a Python type."""
+     type_str = type_str.lower().strip()
+     type_mapping = {
+         'string': str, 'str': str,
+         'integer': int, 'int': int,
+         'number': float, 'float': float,
+         'boolean': bool, 'bool': bool,
+         'array': list, 'list': list,
+         'object': dict, 'dict': dict
+     }
+     return type_mapping.get(type_str, str)
+
+ def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
+     """Wrapper that converts the schema input to a BaseModel before extraction."""
+     try:
+         # Parse the schema input into a BaseModel
+         schema_model = parse_schema_input(schema_input)
+
+         # Call the original function
+         return webpage_to_json(content, is_url, schema_model)
+
+     except Exception as e:
+         return {"error": f"Schema parsing error: {str(e)}"}
+
+ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
+     """
+     Extract structured JSON information from the given content based on a specified schema.
+     This function sets up a processing pipeline that includes:
+     - Preprocessing the input content.
+     - Using an AI language model to extract information according to the provided schema.
+     - Postprocessing the extracted output to match the exact schema requirements.
+     Parameters:
+         content (str): The input content to analyze; either raw text or a URL.
+         is_url (bool): Whether the provided content is a URL (True) or raw text (False).
+         schema (BaseModel): A Pydantic BaseModel defining the expected structure and data types of the output.
+     Returns:
+         Dict[str, Any]: A dictionary containing the extracted data matching the schema. If initialization
+         or processing fails, the dictionary includes an "error" key with a descriptive message.
+     """
+     prompt_template = """Extract the following information from the provided content according to the specified schema.
+
+ Content to analyze:
+ {content}
+
+ Schema requirements:
+ {schema}
+
+ Instructions:
+ - Extract only information that is explicitly present in the content
+ - Follow the exact structure and data types specified in the schema
+ - If a required field cannot be found, indicate this clearly
+ - Preserve the original formatting and context where relevant
+ - Return the extracted data in the format specified by the schema
+ - Stick strictly to the schema; do not add or invent fields
+ - If the schema asks for an array, extract one whenever the content contains matching items"""
+
+     # The schema's own JSON form doubles as the classifier/reranker prompt
+     classification_prompt_template = schema.model_json_schema()
+     # Initialize pipeline components
+     # TODO: improve the RAG system and optimize (don't instantiate every time)
+     preprocessor = BasicPreprocessor(config={'keep_tags': True})
+     try:
+         # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+         llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'google/gemma-3n-e2b-it'})
+         # reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'nv-rerank-qa-mistral-4b:1'})
+         reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-rerank.modal.run")
+     except Exception as e:
+         return {"error": f"Failed to initialize LLM client: {str(e)}"}
+
+     # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
+     ai_extractor = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
+     postprocessor = PostProcessor()
+     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
+
+     try:
+         result = pipeline.run(content, is_url, schema)
+         print("-" * 80)
+         print(f"Processed result: {result}")
+         return result
+     except Exception as e:
+         return {"error": f"Processing error: {str(e)}"}
+
+ # Example schemas for the user
+ example_schemas = """
+ **Example Schema Formats:**
+
+ 1. **Simple field definitions:**
+ ```
+ title: str = Page title
+ price: float = Product price
+ description: str = Product description
+ available: bool = Is available
+ ```
+
+ 2. **JSON Schema:**
+ ```json
+ {
+     "properties": {
+         "title": {"type": "string", "description": "Page title"},
+         "price": {"type": "number", "description": "Product price"},
+         "description": {"type": "string", "description": "Product description"}
+     },
+     "required": ["title"]
+ }
+ ```
+
+ 3. **Python Class Definition:**
+ ```python
+ class ProductSchema(BaseModel):
+     title: str = Field(description="Product title")
+     price: float = Field(description="Product price")
+     description: str = Field(description="Product description")
+     available: bool = Field(default=False, description="Availability status")
+ ```
+ """
+
+ # Build Gradio Interface
+ demo = gr.Interface(
+     fn=webpage_to_json_wrapper,
+     inputs=[
+         gr.Textbox(
+             label="Content (URL or Raw Text)",
+             lines=10,
+             placeholder="Enter URL or paste raw HTML/text here."
+         ),
+         gr.Checkbox(label="Content is URL?", value=False),
+         gr.Textbox(
+             label="Schema Definition",
+             lines=15,
+             placeholder="Define your extraction schema (see examples below)",
+             info=example_schemas
+         )
+     ],
+     outputs=gr.JSON(label="Output JSON"),
+     title="Webpage to JSON Converter",
+     description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
+     examples=[
+         [
+             "https://example.com",
+             True,
+             "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
+         ],
+         [
+             "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
+             False,
+             '''{
+                 "type": "object",
+                 "properties": {
+                     "title": {"type": "string", "description": "Name of the product"},
+                     "price": {"type": "number", "description": "Price of the product"},
+                     "description": {"type": "string", "description": "Detailed description of the product"},
+                     "availability": {"type": "boolean", "description": "Whether the product is in stock (true) or not (false)"}
+                 },
+                 "required": ["title", "price"]
+             }'''
+         ]
+     ]
+ )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
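
Review note: a minimal sketch of how the new schema parser behaves, assuming `app.py` is importable as `app` with its dependencies installed (importing it runs the module-level setup: `dotenv.load_dotenv()`, seeding, and building the `gr.Interface`). Everything called below is defined in the diff above.

```python
# Sketch only: exercise the three input formats parse_schema_input accepts.
from app import parse_schema_input

# 1. Simple field definitions ("name: type = description", one per line)
Simple = parse_schema_input("title: str = Page title\nprice: float = Product price")

# 2. JSON schema: required fields stay required, the rest default to None
Json = parse_schema_input(
    '{"properties": {"title": {"type": "string", "description": "Page title"}}, '
    '"required": ["title"]}'
)

# 3. Python class definition: exec'd, and the BaseModel subclass is returned
Cls = parse_schema_input(
    "class Product(BaseModel):\n"
    "    title: str = Field(description='Product title')"
)

# Each call returns a pydantic model class (pydantic v2 API shown)
print(Simple.model_json_schema())
```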
requirements.txt ADDED
@@ -0,0 +1,27 @@
+ pandas
+ gradio
+ gradio[mcp]
+ pydantic
+ python-dotenv
+ beautifulsoup4
+ requests
+ google-genai
+ json_repair
+ numpy
+ langchain
+ langchain-text-splitters
+ sentence-transformers
+ openai
+ html_chunking
+ langchain_nvidia_ai_endpoints
+ langchain_core
+ lxml
+ pdfkit
+ html2text
+ inscriptis
+ trafilatura
+ markdownify
+ beautifulsoup4
+ readabilipy
+ docling
+ htmlrag
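
A closing cross-file note: `gradio[mcp]` here pairs with `demo.launch(mcp_server=True)` at the bottom of app.py, which per Gradio's MCP support should expose `webpage_to_json_wrapper` as an MCP tool alongside the web UI (presumably the point of the `mcp-server-track` tag added to the README). A hedged sketch of calling that entry point directly; it assumes a valid `NVIDIA_API_KEY` in `.env`, the `web2json` package on the path, and network access to the Modal reranker endpoint:

```python
# Sketch only: call the Gradio/MCP entry point directly.
from app import webpage_to_json_wrapper

result = webpage_to_json_wrapper(
    "<h1>Sample Product</h1><p>Price: $29.99</p>",                # raw HTML
    False,                                                        # not a URL
    "title: str = Product title\nprice: float = Product price",   # simple schema
)
print(result)  # schema-shaped dict, or {"error": ...} on failure
```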