meirk-brd committed on
Commit
d21fa10
·
1 Parent(s): 9220e18

Extract canonical URL from HTML when smolagents fetches page content

Browse files
Files changed (1) hide show
  1. tool.py +13 -1
tool.py CHANGED
@@ -293,7 +293,19 @@ class BrightDataDatasetTool(Tool):
293
  if not text:
294
  return None
295
 
296
- # direct http/https
 
 
 
 
 
 
 
 
 
 
 
 
297
  match = re.search(r"(https?://[^\s\"'<>]+)", text)
298
  if match:
299
  return match.group(1)
 
293
  if not text:
294
  return None
295
 
296
+ # If text looks like HTML, try to extract canonical URL first
297
+ if text.strip().startswith(("<!doctype", "<!DOCTYPE", "<html", "<HTML")):
298
+ # Look for canonical URL in HTML
299
+ canonical_match = re.search(r'<link\s+rel=["\']canonical["\']\s+href=["\'](https?://[^"\']+)["\']', text, re.IGNORECASE)
300
+ if canonical_match:
301
+ return canonical_match.group(1)
302
+
303
+ # Look for og:url meta tag
304
+ og_url_match = re.search(r'<meta\s+property=["\']og:url["\']\s+content=["\'](https?://[^"\']+)["\']', text, re.IGNORECASE)
305
+ if og_url_match:
306
+ return og_url_match.group(1)
307
+
308
+ # direct http/https - find first URL
309
  match = re.search(r"(https?://[^\s\"'<>]+)", text)
310
  if match:
311
  return match.group(1)