rkihacker committed
Commit 9942a54 · verified · 1 Parent(s): 25c899c

Update main.py

Files changed (1):
  1. main.py +7 -1
main.py CHANGED
@@ -47,6 +47,8 @@ USER_AGENT_ROTATION = True
 # Context management
 CONTEXT_WINDOW_SIZE = 10_000_000
 MAX_CONTEXT_SIZE = 2_000_000
+## Robots.txt behavior (user requested scraping even if disallowed)
+RESPECT_ROBOTS_TXT = False
 
 # Initialize fake user agent generator
 try:
@@ -133,6 +135,9 @@ def clean_url(url: str) -> str:
 
 async def check_robots_txt(url: str) -> bool:
     """Check if scraping is allowed by robots.txt."""
+    # If configured to ignore robots.txt, always allow
+    if not RESPECT_ROBOTS_TXT:
+        return True
     try:
         domain_match = re.search(r'https?://([^/]+)', url)
         if not domain_match:
@@ -154,7 +159,8 @@ async def check_robots_txt(url: str) -> bool:
         return True
     except Exception as e:
         logging.warning(f"Could not check robots.txt for {url}: {e}")
-        return False
+        # Default to allow on failure to check
+        return True
 
 async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
     """Perform a real search using DuckDuckGo (Lite/HTML) with multi-endpoint fallback to reduce 202 issues."""