Spaces:

BrightData
/

brightdata-dataset-tool

Running

meirk-brd committed 6 days ago

Commit

d21fa10

1 Parent(s): 9220e18

Extract canonical URL from HTML when smolagents fetches page content

Files changed (1) hide show

tool.py CHANGED Viewed

@@ -293,7 +293,19 @@ class BrightDataDatasetTool(Tool):
         if not text:
             return None
-        # direct http/https
         match = re.search(r"(https?://[^\s\"'<>]+)", text)
         if match:
             return match.group(1)

         if not text:
             return None
+        # If text looks like HTML, try to extract canonical URL first
+        if text.strip().startswith(("<!doctype", "<!DOCTYPE", "<html", "<HTML")):
+            # Look for canonical URL in HTML
+            canonical_match = re.search(r'<link\s+rel=["\']canonical["\']\s+href=["\'](https?://[^"\']+)["\']', text, re.IGNORECASE)
+            if canonical_match:
+                return canonical_match.group(1)
+            # Look for og:url meta tag
+            og_url_match = re.search(r'<meta\s+property=["\']og:url["\']\s+content=["\'](https?://[^"\']+)["\']', text, re.IGNORECASE)
+            if og_url_match:
+                return og_url_match.group(1)
+        # direct http/https - find first URL
         match = re.search(r"(https?://[^\s\"'<>]+)", text)
         if match:
             return match.group(1)