meirk-brd
committed on
Commit
·
dc8b2a7
1
Parent(s):
0cc79aa
extract url
Browse files
tool.py
CHANGED
|
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
| 3 |
import ast
|
| 4 |
import json
|
| 5 |
import os
|
|
|
|
| 6 |
import time
|
| 7 |
from typing import Any, Dict, List, Optional
|
| 8 |
|
|
@@ -211,23 +212,22 @@ class BrightDataDatasetTool(Tool):
|
|
| 211 |
return None
|
| 212 |
|
| 213 |
if isinstance(raw, str):
|
| 214 |
-
if raw.lstrip().startswith("<"):
|
| 215 |
-
return None
|
| 216 |
if raw.strip().startswith("{") and "orig_name" in raw:
|
| 217 |
parsed = self._parse_file_dict_string(raw)
|
| 218 |
if parsed:
|
| 219 |
raw = parsed
|
| 220 |
else:
|
| 221 |
-
return self.
|
| 222 |
else:
|
| 223 |
-
return self.
|
| 224 |
|
| 225 |
if isinstance(raw, dict):
|
| 226 |
file_path = raw.get("path")
|
| 227 |
if isinstance(file_path, str) and os.path.isfile(file_path):
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
|
|
|
| 231 |
|
| 232 |
orig_name = raw.get("orig_name")
|
| 233 |
if (
|
|
@@ -235,7 +235,9 @@ class BrightDataDatasetTool(Tool):
|
|
| 235 |
and orig_name
|
| 236 |
and (orig_name.startswith(("http://", "https://")) or "." in orig_name)
|
| 237 |
):
|
| 238 |
-
|
|
|
|
|
|
|
| 239 |
|
| 240 |
url_value = raw.get("url")
|
| 241 |
if isinstance(url_value, str):
|
|
@@ -243,9 +245,14 @@ class BrightDataDatasetTool(Tool):
|
|
| 243 |
return url_value
|
| 244 |
if url_value.startswith("/gradio_api/file="):
|
| 245 |
file_path = url_value.split("=", 1)[-1]
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
return None
|
| 250 |
|
| 251 |
return None
|
|
@@ -267,12 +274,24 @@ class BrightDataDatasetTool(Tool):
|
|
| 267 |
return None
|
| 268 |
try:
|
| 269 |
with open(path, "r", encoding="utf-8", errors="ignore") as fh:
|
| 270 |
-
|
| 271 |
-
text = line.strip()
|
| 272 |
-
if text and not text.startswith("<"):
|
| 273 |
-
return text
|
| 274 |
except OSError:
|
| 275 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
return None
|
| 277 |
|
| 278 |
def _get_gradio_app_code(self, tool_module_name: str = "tool") -> str:
|
|
|
|
| 3 |
import ast
|
| 4 |
import json
|
| 5 |
import os
|
| 6 |
+
import re
|
| 7 |
import time
|
| 8 |
from typing import Any, Dict, List, Optional
|
| 9 |
|
|
|
|
| 212 |
return None
|
| 213 |
|
| 214 |
if isinstance(raw, str):
|
|
|
|
|
|
|
| 215 |
if raw.strip().startswith("{") and "orig_name" in raw:
|
| 216 |
parsed = self._parse_file_dict_string(raw)
|
| 217 |
if parsed:
|
| 218 |
raw = parsed
|
| 219 |
else:
|
| 220 |
+
return self._extract_url_from_text(raw)
|
| 221 |
else:
|
| 222 |
+
return self._extract_url_from_text(raw)
|
| 223 |
|
| 224 |
if isinstance(raw, dict):
|
| 225 |
file_path = raw.get("path")
|
| 226 |
if isinstance(file_path, str) and os.path.isfile(file_path):
|
| 227 |
+
content = self._read_text_file(file_path)
|
| 228 |
+
url_from_file = self._extract_url_from_text(content or "")
|
| 229 |
+
if url_from_file:
|
| 230 |
+
return url_from_file
|
| 231 |
|
| 232 |
orig_name = raw.get("orig_name")
|
| 233 |
if (
|
|
|
|
| 235 |
and orig_name
|
| 236 |
and (orig_name.startswith(("http://", "https://")) or "." in orig_name)
|
| 237 |
):
|
| 238 |
+
extracted = self._extract_url_from_text(orig_name)
|
| 239 |
+
if extracted:
|
| 240 |
+
return extracted
|
| 241 |
|
| 242 |
url_value = raw.get("url")
|
| 243 |
if isinstance(url_value, str):
|
|
|
|
| 245 |
return url_value
|
| 246 |
if url_value.startswith("/gradio_api/file="):
|
| 247 |
file_path = url_value.split("=", 1)[-1]
|
| 248 |
+
content = self._read_text_file(file_path)
|
| 249 |
+
url_from_file = self._extract_url_from_text(content or "")
|
| 250 |
+
if url_from_file:
|
| 251 |
+
return url_from_file
|
| 252 |
+
else:
|
| 253 |
+
extracted = self._extract_url_from_text(url_value)
|
| 254 |
+
if extracted:
|
| 255 |
+
return extracted
|
| 256 |
return None
|
| 257 |
|
| 258 |
return None
|
|
|
|
| 274 |
return None
|
| 275 |
try:
|
| 276 |
with open(path, "r", encoding="utf-8", errors="ignore") as fh:
|
| 277 |
+
return fh.read()
|
|
|
|
|
|
|
|
|
|
| 278 |
except OSError:
|
| 279 |
return None
|
| 280 |
+
|
| 281 |
+
def _extract_url_from_text(self, text: str) -> Optional[str]:
|
| 282 |
+
if not text:
|
| 283 |
+
return None
|
| 284 |
+
|
| 285 |
+
# direct http/https
|
| 286 |
+
match = re.search(r"(https?://[^\s\"'<>]+)", text)
|
| 287 |
+
if match:
|
| 288 |
+
return match.group(1)
|
| 289 |
+
|
| 290 |
+
# domain/path without scheme
|
| 291 |
+
match_domain = re.search(r"\b([A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:/[^\s\"'<>]*)?)", text)
|
| 292 |
+
if match_domain:
|
| 293 |
+
return self._ensure_scheme(match_domain.group(1))
|
| 294 |
+
|
| 295 |
return None
|
| 296 |
|
| 297 |
def _get_gradio_app_code(self, tool_module_name: str = "tool") -> str:
|