Spaces:

rkihacker
/

Scrap

Paused

rkihacker committed on Sep 18

Commit

9942a54

verified ·

1 Parent(s): 25c899c

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -47,6 +47,8 @@ USER_AGENT_ROTATION = True
 # Context management
 CONTEXT_WINDOW_SIZE = 10_000_000
 MAX_CONTEXT_SIZE = 2_000_000
 # Initialize fake user agent generator
 try:
@@ -133,6 +135,9 @@ def clean_url(url: str) -> str:
 async def check_robots_txt(url: str) -> bool:
     """Check if scraping is allowed by robots.txt."""
     try:
         domain_match = re.search(r'https?://([^/]+)', url)
         if not domain_match:
@@ -154,7 +159,8 @@ async def check_robots_txt(url: str) -> bool:
         return True
     except Exception as e:
         logging.warning(f"Could not check robots.txt for {url}: {e}")
-        return False
 async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
     """Perform a real search using DuckDuckGo (Lite/HTML) with multi-endpoint fallback to reduce 202 issues."""

 # Context management
 CONTEXT_WINDOW_SIZE = 10_000_000
 MAX_CONTEXT_SIZE = 2_000_000
+## Robots.txt behavior (user requested scraping even if disallowed)
+RESPECT_ROBOTS_TXT = False
 # Initialize fake user agent generator
 try:
 async def check_robots_txt(url: str) -> bool:
     """Check if scraping is allowed by robots.txt."""
+    # If configured to ignore robots.txt, always allow
+    if not RESPECT_ROBOTS_TXT:
+        return True
     try:
         domain_match = re.search(r'https?://([^/]+)', url)
         if not domain_match:
         return True
     except Exception as e:
         logging.warning(f"Could not check robots.txt for {url}: {e}")
+        # Default to allow on failure to check
+        return True
 async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
     """Perform a real search using DuckDuckGo (Lite/HTML) with multi-endpoint fallback to reduce 202 issues."""