meirk-brd committed on
Commit
dc8b2a7
·
1 Parent(s): 0cc79aa

extract url

Browse files
Files changed (1) hide show
  1. tool.py +34 -15
tool.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
  import ast
4
  import json
5
  import os
 
6
  import time
7
  from typing import Any, Dict, List, Optional
8
 
@@ -211,23 +212,22 @@ class BrightDataDatasetTool(Tool):
211
  return None
212
 
213
  if isinstance(raw, str):
214
- if raw.lstrip().startswith("<"):
215
- return None
216
  if raw.strip().startswith("{") and "orig_name" in raw:
217
  parsed = self._parse_file_dict_string(raw)
218
  if parsed:
219
  raw = parsed
220
  else:
221
- return self._ensure_scheme(raw)
222
  else:
223
- return self._ensure_scheme(raw)
224
 
225
  if isinstance(raw, dict):
226
  file_path = raw.get("path")
227
  if isinstance(file_path, str) and os.path.isfile(file_path):
228
- text = self._read_text_file(file_path)
229
- if text:
230
- return self._ensure_scheme(text)
 
231
 
232
  orig_name = raw.get("orig_name")
233
  if (
@@ -235,7 +235,9 @@ class BrightDataDatasetTool(Tool):
235
  and orig_name
236
  and (orig_name.startswith(("http://", "https://")) or "." in orig_name)
237
  ):
238
- return self._ensure_scheme(orig_name)
 
 
239
 
240
  url_value = raw.get("url")
241
  if isinstance(url_value, str):
@@ -243,9 +245,14 @@ class BrightDataDatasetTool(Tool):
243
  return url_value
244
  if url_value.startswith("/gradio_api/file="):
245
  file_path = url_value.split("=", 1)[-1]
246
- text = self._read_text_file(file_path)
247
- if text:
248
- return self._ensure_scheme(text)
 
 
 
 
 
249
  return None
250
 
251
  return None
@@ -267,12 +274,24 @@ class BrightDataDatasetTool(Tool):
267
  return None
268
  try:
269
  with open(path, "r", encoding="utf-8", errors="ignore") as fh:
270
- for line in fh:
271
- text = line.strip()
272
- if text and not text.startswith("<"):
273
- return text
274
  except OSError:
275
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  return None
277
 
278
  def _get_gradio_app_code(self, tool_module_name: str = "tool") -> str:
 
3
  import ast
4
  import json
5
  import os
6
+ import re
7
  import time
8
  from typing import Any, Dict, List, Optional
9
 
 
212
  return None
213
 
214
  if isinstance(raw, str):
 
 
215
  if raw.strip().startswith("{") and "orig_name" in raw:
216
  parsed = self._parse_file_dict_string(raw)
217
  if parsed:
218
  raw = parsed
219
  else:
220
+ return self._extract_url_from_text(raw)
221
  else:
222
+ return self._extract_url_from_text(raw)
223
 
224
  if isinstance(raw, dict):
225
  file_path = raw.get("path")
226
  if isinstance(file_path, str) and os.path.isfile(file_path):
227
+ content = self._read_text_file(file_path)
228
+ url_from_file = self._extract_url_from_text(content or "")
229
+ if url_from_file:
230
+ return url_from_file
231
 
232
  orig_name = raw.get("orig_name")
233
  if (
 
235
  and orig_name
236
  and (orig_name.startswith(("http://", "https://")) or "." in orig_name)
237
  ):
238
+ extracted = self._extract_url_from_text(orig_name)
239
+ if extracted:
240
+ return extracted
241
 
242
  url_value = raw.get("url")
243
  if isinstance(url_value, str):
 
245
  return url_value
246
  if url_value.startswith("/gradio_api/file="):
247
  file_path = url_value.split("=", 1)[-1]
248
+ content = self._read_text_file(file_path)
249
+ url_from_file = self._extract_url_from_text(content or "")
250
+ if url_from_file:
251
+ return url_from_file
252
+ else:
253
+ extracted = self._extract_url_from_text(url_value)
254
+ if extracted:
255
+ return extracted
256
  return None
257
 
258
  return None
 
274
  return None
275
  try:
276
  with open(path, "r", encoding="utf-8", errors="ignore") as fh:
277
+ return fh.read()
 
 
 
278
  except OSError:
279
  return None
280
+
281
+ def _extract_url_from_text(self, text: str) -> Optional[str]:
282
+ if not text:
283
+ return None
284
+
285
+ # direct http/https
286
+ match = re.search(r"(https?://[^\s\"'<>]+)", text)
287
+ if match:
288
+ return match.group(1)
289
+
290
+ # domain/path without scheme
291
+ match_domain = re.search(r"\b([A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:/[^\s\"'<>]*)?)", text)
292
+ if match_domain:
293
+ return self._ensure_scheme(match_domain.group(1))
294
+
295
  return None
296
 
297
  def _get_gradio_app_code(self, tool_module_name: str = "tool") -> str: