Spaces:

BrightData
/

brightdata-dataset-tool

Running

App Files Community

meirk-brd committed 16 days ago

Commit

dc8b2a7

1 Parent(s): 0cc79aa

extract url

Browse files

Files changed (1) hide show

tool.py +34 -15

tool.py CHANGED Viewed

@@ -3,6 +3,7 @@ from __future__ import annotations
 import ast
 import json
 import os
 import time
 from typing import Any, Dict, List, Optional
@@ -211,23 +212,22 @@ class BrightDataDatasetTool(Tool):
             return None
         if isinstance(raw, str):
-            if raw.lstrip().startswith("<"):
-                return None
             if raw.strip().startswith("{") and "orig_name" in raw:
                 parsed = self._parse_file_dict_string(raw)
                 if parsed:
                     raw = parsed
                 else:
-                    return self._ensure_scheme(raw)
             else:
-                return self._ensure_scheme(raw)
         if isinstance(raw, dict):
             file_path = raw.get("path")
             if isinstance(file_path, str) and os.path.isfile(file_path):
-                text = self._read_text_file(file_path)
-                if text:
-                    return self._ensure_scheme(text)
             orig_name = raw.get("orig_name")
             if (
@@ -235,7 +235,9 @@ class BrightDataDatasetTool(Tool):
                 and orig_name
                 and (orig_name.startswith(("http://", "https://")) or "." in orig_name)
             ):
-                return self._ensure_scheme(orig_name)
             url_value = raw.get("url")
             if isinstance(url_value, str):
@@ -243,9 +245,14 @@ class BrightDataDatasetTool(Tool):
                     return url_value
                 if url_value.startswith("/gradio_api/file="):
                     file_path = url_value.split("=", 1)[-1]
-                    text = self._read_text_file(file_path)
-                    if text:
-                        return self._ensure_scheme(text)
                 return None
         return None
@@ -267,12 +274,24 @@ class BrightDataDatasetTool(Tool):
             return None
         try:
             with open(path, "r", encoding="utf-8", errors="ignore") as fh:
-                for line in fh:
-                    text = line.strip()
-                    if text and not text.startswith("<"):
-                        return text
         except OSError:
             return None
         return None
     def _get_gradio_app_code(self, tool_module_name: str = "tool") -> str:

 import ast
 import json
 import os
+import re
 import time
 from typing import Any, Dict, List, Optional
             return None
         if isinstance(raw, str):
             if raw.strip().startswith("{") and "orig_name" in raw:
                 parsed = self._parse_file_dict_string(raw)
                 if parsed:
                     raw = parsed
                 else:
+                    return self._extract_url_from_text(raw)
             else:
+                return self._extract_url_from_text(raw)
         if isinstance(raw, dict):
             file_path = raw.get("path")
             if isinstance(file_path, str) and os.path.isfile(file_path):
+                content = self._read_text_file(file_path)
+                url_from_file = self._extract_url_from_text(content or "")
+                if url_from_file:
+                    return url_from_file
             orig_name = raw.get("orig_name")
             if (
                 and orig_name
                 and (orig_name.startswith(("http://", "https://")) or "." in orig_name)
             ):
+                extracted = self._extract_url_from_text(orig_name)
+                if extracted:
+                    return extracted
             url_value = raw.get("url")
             if isinstance(url_value, str):
                     return url_value
                 if url_value.startswith("/gradio_api/file="):
                     file_path = url_value.split("=", 1)[-1]
+                    content = self._read_text_file(file_path)
+                    url_from_file = self._extract_url_from_text(content or "")
+                    if url_from_file:
+                        return url_from_file
+                else:
+                    extracted = self._extract_url_from_text(url_value)
+                    if extracted:
+                        return extracted
                 return None
         return None
             return None
         try:
             with open(path, "r", encoding="utf-8", errors="ignore") as fh:
+                return fh.read()
         except OSError:
             return None
+    def _extract_url_from_text(self, text: str) -> Optional[str]:
+        if not text:
+            return None
+        # direct http/https
+        match = re.search(r"(https?://[^\s\"'<>]+)", text)
+        if match:
+            return match.group(1)
+        # domain/path without scheme
+        match_domain = re.search(r"\b([A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:/[^\s\"'<>]*)?)", text)
+        if match_domain:
+            return self._ensure_scheme(match_domain.group(1))
         return None
     def _get_gradio_app_code(self, tool_module_name: str = "tool") -> str: