# OneServerToRuleThemAll / scraper / data_extractor.py
from bs4 import BeautifulSoup, Comment
from typing import Dict, List
import re
from urllib.parse import urljoin, urlparse
class DataExtractor:
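    """Extract LLM-ready structured data (content, metadata, structure, links, images) from HTML."""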
def __init__(self):
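        # CSS selectors for page noise (ads, navigation, chrome, scripts) stripped before extraction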
self.ignore_selectors = [
'.advertisement',
'.ad',
'.banner',
'.popup',
'#footer',
'.footer',
'.sidebar',
'nav',
'.navbar',
'.menu',
'header',
'#header',
'script',
'style',
'noscript',
'iframe',
            # note: '<meta>' is intentionally NOT stripped here, because _clean_html runs before
            # _extract_metadata, which still needs the description tag
'link',
'[class*="ad-"]',
'[id*="ad-"]',
'.cookie-notice',
'.modal',
'form',
'input',
'button',
'.social-media',
'.comments-section',
'.widget'
]
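        # CSS selectors that typically carry the page's substantive content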
self.content_selectors = [
'.main-content',
'article',
'p',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'div.content',
'.post',
'.article-body',
'.content-body',
'section',
'main',
'ul',
'ol',
'li',
'table',
'td',
'th',
'blockquote',
'pre',
'.text',
'[class*="content"]',
'[class*="post"]',
'[class*="article"]',
'div:not([class*="ad"]):not([class*="banner"]):not([class*="sidebar"])'
]
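        # Minimum character count for a text block to be kept as content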
self.min_text_length = 200
def extract_structured_data(self, html: str, url: str) -> Dict:
"""Extract structured data from HTML for LLM consumption"""
soup = BeautifulSoup(html, 'lxml')
# Remove unwanted elements
self._clean_html(soup)
return {
"content": self._extract_content(soup),
"metadata": self._extract_metadata(soup, url),
"structure": self._extract_structure(soup),
"links": self._extract_links(soup, url),
"images": self._extract_images(soup, url),
"text_summary": self._extract_text_summary(soup)
}
def _clean_html(self, soup: BeautifulSoup):
"""Remove unwanted elements for cleaner extraction"""
for selector in self.ignore_selectors:
for element in soup.select(selector):
element.decompose()
        # Remove HTML comment nodes (scripts and styles are already handled by ignore_selectors)
        for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
            element.extract()
    def _extract_content(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract main content blocks"""
        content_blocks = []
        seen_ids = set()  # content selectors overlap, so skip elements already captured
        for selector in self.content_selectors:
            elements = soup.select(selector)
            for elem in elements:
                if id(elem) in seen_ids:
                    continue
                seen_ids.add(id(elem))
                # separator=' ' keeps words from running together across nested tags
                text = elem.get_text(separator=' ', strip=True)
                if len(text) >= self.min_text_length:
                    content_blocks.append({
                        "tag": elem.name,
                        "text": text,
                        "html": str(elem),
                        "attributes": dict(elem.attrs) if elem.attrs else {}
                    })
        return content_blocks
def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
"""Extract page metadata"""
title = soup.find('title')
meta_desc = soup.find('meta', attrs={'name': 'description'})
return {
"title": title.get_text().strip() if title else "",
"description": meta_desc.get('content', '') if meta_desc else "",
"url": url,
"domain": urlparse(url).netloc,
"headings": self._extract_headings(soup)
}
def _extract_headings(self, soup: BeautifulSoup) -> List[Dict]:
"""Extract heading hierarchy for structure"""
headings = []
for i in range(1, 7):
for heading in soup.find_all(f'h{i}'):
headings.append({
"level": i,
"text": heading.get_text().strip(),
"id": heading.get('id', '')
})
return headings
def _extract_structure(self, soup: BeautifulSoup) -> Dict:
"""Extract DOM structure for relationships"""
return {
"sections": len(soup.find_all(['section', 'article', 'div'])),
"paragraphs": len(soup.find_all('p')),
"lists": len(soup.find_all(['ul', 'ol'])),
"tables": len(soup.find_all('table')),
"forms": len(soup.find_all('form'))
}
def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
"""Extract all links for relationship mapping"""
links = []
for link in soup.find_all('a', href=True):
href = urljoin(base_url, link['href'])
links.append({
"url": href,
"text": link.get_text().strip(),
"internal": urlparse(href).netloc == urlparse(base_url).netloc
})
return links[:50] # Limit for performance
def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
"""Extract images with context"""
images = []
for img in soup.find_all('img', src=True):
images.append({
"src": urljoin(base_url, img['src']),
"alt": img.get('alt', ''),
"caption": img.get('title', '')
})
return images[:20] # Limit for performance
def _extract_text_summary(self, soup: BeautifulSoup) -> str:
"""Extract clean text for LLM processing"""
        text = soup.get_text(separator=' ')  # separator avoids fusing words from adjacent tags
# Clean whitespace and normalize
text = re.sub(r'\s+', ' ', text).strip()
return text[:5000] # Limit for token efficiency
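# Minimal usage sketch (illustrative only, not part of the scraper pipeline): any fetched
# HTML string plus its URL can be fed to extract_structured_data(). The markup below is
# made up for the example; in the real scraper the HTML would come from an HTTP fetch.
if __name__ == "__main__":
    body_text = "This sentence is repeated so the paragraph clears the minimum length filter. " * 5
    sample_html = (
        "<html><head><title>Example</title>"
        '<meta name="description" content="Demo page"></head>'
        f"<body><article><h1>Heading</h1><p>{body_text}</p>"
        '<a href="/about">About</a></article></body></html>'
    )
    extractor = DataExtractor()
    result = extractor.extract_structured_data(sample_html, "https://example.com/page")
    print(result["metadata"]["title"], "-", len(result["content"]), "content blocks")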