meirk-brd
committed on
Commit
·
d21fa10
1
Parent(s):
9220e18
Extract canonical URL from HTML when smolagents fetches page content
Browse files
tool.py
CHANGED
|
@@ -293,7 +293,19 @@ class BrightDataDatasetTool(Tool):
|
|
| 293 |
if not text:
|
| 294 |
return None
|
| 295 |
|
| 296 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
match = re.search(r"(https?://[^\s\"'<>]+)", text)
|
| 298 |
if match:
|
| 299 |
return match.group(1)
|
|
|
|
| 293 |
if not text:
|
| 294 |
return None
|
| 295 |
|
| 296 |
+
# If text looks like HTML, try to extract canonical URL first
|
| 297 |
+
if text.strip().startswith(("<!doctype", "<!DOCTYPE", "<html", "<HTML")):
|
| 298 |
+
# Look for canonical URL in HTML
|
| 299 |
+
canonical_match = re.search(r'<link\s+rel=["\']canonical["\']\s+href=["\'](https?://[^"\']+)["\']', text, re.IGNORECASE)
|
| 300 |
+
if canonical_match:
|
| 301 |
+
return canonical_match.group(1)
|
| 302 |
+
|
| 303 |
+
# Look for og:url meta tag
|
| 304 |
+
og_url_match = re.search(r'<meta\s+property=["\']og:url["\']\s+content=["\'](https?://[^"\']+)["\']', text, re.IGNORECASE)
|
| 305 |
+
if og_url_match:
|
| 306 |
+
return og_url_match.group(1)
|
| 307 |
+
|
| 308 |
+
# direct http/https - find first URL
|
| 309 |
match = re.search(r"(https?://[^\s\"'<>]+)", text)
|
| 310 |
if match:
|
| 311 |
return match.group(1)
|