Update main.py
Browse files
main.py
CHANGED
|
@@ -47,6 +47,8 @@ USER_AGENT_ROTATION = True
|
|
| 47 |
# Context management
|
| 48 |
CONTEXT_WINDOW_SIZE = 10_000_000
|
| 49 |
MAX_CONTEXT_SIZE = 2_000_000
|
|
|
|
|
|
|
| 50 |
|
| 51 |
# Initialize fake user agent generator
|
| 52 |
try:
|
|
@@ -133,6 +135,9 @@ def clean_url(url: str) -> str:
|
|
| 133 |
|
| 134 |
async def check_robots_txt(url: str) -> bool:
|
| 135 |
"""Check if scraping is allowed by robots.txt."""
|
|
|
|
|
|
|
|
|
|
| 136 |
try:
|
| 137 |
domain_match = re.search(r'https?://([^/]+)', url)
|
| 138 |
if not domain_match:
|
|
@@ -154,7 +159,8 @@ async def check_robots_txt(url: str) -> bool:
|
|
| 154 |
return True
|
| 155 |
except Exception as e:
|
| 156 |
logging.warning(f"Could not check robots.txt for {url}: {e}")
|
| 157 |
-
|
|
|
|
| 158 |
|
| 159 |
async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
|
| 160 |
"""Perform a real search using DuckDuckGo (Lite/HTML) with multi-endpoint fallback to reduce 202 issues."""
|
|
|
|
| 47 |
# Context management
|
| 48 |
CONTEXT_WINDOW_SIZE = 10_000_000
|
| 49 |
MAX_CONTEXT_SIZE = 2_000_000
|
| 50 |
+
# Robots.txt behavior: when False, robots.txt is ignored and scraping proceeds even where it is disallowed (user-requested)
|
| 51 |
+
RESPECT_ROBOTS_TXT = False
|
| 52 |
|
| 53 |
# Initialize fake user agent generator
|
| 54 |
try:
|
|
|
|
| 135 |
|
| 136 |
async def check_robots_txt(url: str) -> bool:
|
| 137 |
"""Check if scraping is allowed by robots.txt."""
|
| 138 |
+
# If configured to ignore robots.txt, always allow
|
| 139 |
+
if not RESPECT_ROBOTS_TXT:
|
| 140 |
+
return True
|
| 141 |
try:
|
| 142 |
domain_match = re.search(r'https?://([^/]+)', url)
|
| 143 |
if not domain_match:
|
|
|
|
| 159 |
return True
|
| 160 |
except Exception as e:
|
| 161 |
logging.warning(f"Could not check robots.txt for {url}: {e}")
|
| 162 |
+
# Fail open: if robots.txt cannot be checked, allow scraping
|
| 163 |
+
return True
|
| 164 |
|
| 165 |
async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
|
| 166 |
"""Perform a real search using DuckDuckGo (Lite/HTML) with multi-endpoint fallback to reduce 202 issues."""
|