Commit 25c899c (verified) by rkihacker
Parent: c830710

Update main.py

Files changed (1): main.py (+129 -55)
main.py CHANGED
@@ -108,8 +108,12 @@ def clean_url(url: str) -> str:
     if not url:
         return ""
 
-    if url.startswith('//duckduckgo.com/l/'):
-        url = f"https:{url}"
+    # Handle DuckDuckGo redirect links like //duckduckgo.com/l/?uddg=... or /l/?uddg=...
+    if url.startswith('//duckduckgo.com/l/') or url.startswith('/l/?'):
+        if url.startswith('//'):
+            url = f"https:{url}"
+        elif url.startswith('/'):
+            url = f"https://duckduckgo.com{url}"
     try:
         parsed = urlparse(url)
         query_params = parsed.query
@@ -117,7 +121,7 @@ def clean_url(url: str) -> str:
         match = re.search(r'uddg=([^&]+)', query_params)
         if match:
             return unquote(match.group(1))
-    except:
+    except Exception:
         pass
 
     if url.startswith('//'):
@@ -153,68 +157,135 @@ async def check_robots_txt(url: str) -> bool:
         return False
 
 async def fetch_search_results(query: str, max_results: int = 5) -> List[dict]:
-    """Perform a real search using DuckDuckGo's HTML interface with robust retry logic."""
-    headers = {
-        "User-Agent": await get_real_user_agent(),
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-        "Accept-Language": "en-US,en;q=0.5",
+    """Perform a real search using DuckDuckGo (Lite/HTML) with multi-endpoint fallback to reduce 202 issues."""
+    ua_hdr = await get_real_user_agent()
+    common_headers = {
+        "User-Agent": ua_hdr,
+        "Accept-Language": "en-US,en;q=0.9",
+        "DNT": "1",
+        "Cache-Control": "no-cache",
+        "Pragma": "no-cache",
         "Referer": "https://duckduckgo.com/",
-        "DNT": "1"
     }
 
+    # Try Lite first (very lightweight HTML), then HTML mirrors
+    endpoints = [
+        {"name": "lite-get", "method": "GET", "url": lambda q: f"https://lite.duckduckgo.com/lite/?q={quote_plus(q)}&kl=us-en&bing_market=us-en"},
+        # Per provided openapi.json: POST /lite/ with query params
+        {"name": "lite-post", "method": "POST", "url": lambda q: f"https://lite.duckduckgo.com/lite/?q={quote_plus(q)}&kl=us-en&bing_market=us-en"},
+        {"name": "html-mirror", "method": "GET", "url": lambda q: f"https://html.duckduckgo.com/html/?q={quote_plus(q)}"},
+        {"name": "html", "method": "GET", "url": lambda q: f"https://duckduckgo.com/html/?q={quote_plus(q)}"},
+    ]
+
+    def parse_results_from_html(html: str) -> List[dict]:
+        soup = BeautifulSoup(html, 'html.parser')
+        results: List[dict] = []
+
+        # Primary selectors (full HTML interface)
+        candidates = soup.select('.result__body')
+        if not candidates:
+            candidates = soup.select('.result')
+
+        for result in candidates:
+            try:
+                title_elem = result.select_one('.result__title .result__a') or result.select_one('.result__a')
+                if not title_elem:
+                    # Lite fallback: find first anchor in this block
+                    title_elem = result.find('a')
+                if not title_elem:
+                    continue
+                link = title_elem.get('href')
+                if not link:
+                    continue
+                snippet_elem = result.select_one('.result__snippet') or result.find('p')
+                clean_link = clean_url(link)
+                if not clean_link or clean_link.startswith('javascript:'):
+                    continue
+                snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
+                title_text = title_elem.get_text(strip=True)
+                results.append({'title': title_text, 'link': clean_link, 'snippet': snippet})
+            except Exception as e:
+                logging.warning(f"Error parsing search result: {e}")
+                continue
+
+        # DuckDuckGo Lite often uses simple anchors; target likely link patterns first
+        if not results:
+            lite_links = soup.select('a[href*="/l/?uddg="]')
+            for a in lite_links:
+                try:
+                    href = a.get('href')
+                    title_text = a.get_text(strip=True)
+                    if not href or not title_text:
+                        continue
+                    clean_link = clean_url(href)
+                    if not clean_link or clean_link.startswith('javascript:'):
+                        continue
+                    results.append({'title': title_text, 'link': clean_link, 'snippet': ''})
+                    if len(results) >= max_results:
+                        break
+                except Exception:
+                    continue
+
+        # If still empty, do a very generic anchor scrape (fallback)
+        if not results:
+            anchors = soup.find_all('a', href=True)
+            for a in anchors:
+                text = a.get_text(strip=True)
+                href = a['href']
+                if not text or not href:
+                    continue
+                if '/l/?' in href or href.startswith('http') or href.startswith('//'):
+                    clean_link = clean_url(href)
+                    if clean_link and not clean_link.startswith('javascript:'):
+                        results.append({'title': text, 'link': clean_link, 'snippet': ''})
+                if len(results) >= max_results * 2:
+                    break
+
+        return results[:max_results]
+
     for attempt in range(RETRY_ATTEMPTS):
         try:
-            search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
             async with aiohttp.ClientSession() as session:
-                async with session.get(search_url, headers=headers, timeout=10) as response:
-                    if response.status != 200:
-                        if response.status == 202:
-                            logging.warning(f"Search attempt {attempt + 1} failed with status 202 for query '{query}'")
-                            if attempt < RETRY_ATTEMPTS - 1:
-                                await asyncio.sleep(RETRY_DELAY)
+                for ep in endpoints:
+                    url = ep['url'](query)
+                    headers = {**common_headers, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"}
+                    try:
+                        if ep['method'] == 'GET':
+                            resp = await session.get(url, headers=headers, timeout=12)
+                        else:
+                            # POST with querystring parameters as specified; no body required
+                            resp = await session.post(url, headers=headers, timeout=12)
+                        async with resp as response:
+                            if response.status == 200:
+                                html = await response.text()
+                                results = parse_results_from_html(html)
+                                if results:
+                                    logging.info(f"Found {len(results)} real search results for '{query}' via {ep['name']}")
+                                    return results
+                                # If empty, try next endpoint
+                                logging.warning(f"No results parsed from {ep['name']} for '{query}', trying next endpoint...")
                                 continue
-                        logging.warning(f"Search failed with status {response.status} for query '{query}'")
-                        return []
-
-                    html = await response.text()
-                    soup = BeautifulSoup(html, 'html.parser')
-
-                    results = []
-                    for selector in ['.result__body', '.result__a', '.result']:
-                        if len(results) >= max_results:
-                            break
-
-                        for result in soup.select(selector)[:max_results]:
-                            try:
-                                title_elem = result.select_one('.result__title .result__a') or result.select_one('.result__a')
-                                if not title_elem:
-                                    continue
-
-                                link = title_elem['href']
-                                snippet_elem = result.select_one('.result__snippet')
-
-                                clean_link = clean_url(link)
-                                if not clean_link or clean_link.startswith('javascript:'):
-                                    continue
-
-                                snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
-                                results.append({
-                                    'title': title_elem.get_text(strip=True),
-                                    'link': clean_link,
-                                    'snippet': snippet
-                                })
-                            except Exception as e:
-                                logging.warning(f"Error parsing search result: {e}")
+                            elif response.status == 202:
+                                logging.warning(f"Search attempt {attempt + 1} got 202 at {ep['name']} for '{query}', trying next endpoint")
+                                continue
+                            else:
+                                logging.warning(f"Search failed with status {response.status} at {ep['name']} for '{query}'")
                                 continue
+                    except asyncio.TimeoutError:
+                        logging.warning(f"Timeout contacting {ep['name']} for '{query}'")
+                        continue
+                    except Exception as e:
+                        logging.warning(f"Error contacting {ep['name']} for '{query}': {e}")
+                        continue
 
-            logging.info(f"Found {len(results)} real search results for '{query}'")
-            return results[:max_results]
         except Exception as e:
             logging.error(f"Search attempt {attempt + 1} failed for '{query}': {e}")
-            if attempt < RETRY_ATTEMPTS - 1:
-                await asyncio.sleep(RETRY_DELAY)
-                continue
-            logging.error(f"All {RETRY_ATTEMPTS} search attempts failed for '{query}'")
+
+        # Backoff before next multi-endpoint attempt
+        if attempt < RETRY_ATTEMPTS - 1:
+            await asyncio.sleep(RETRY_DELAY)
+
+        logging.error(f"All {RETRY_ATTEMPTS} search attempts failed across endpoints for '{query}'")
     return []
 
 async def process_web_source(session: aiohttp.ClientSession, source: dict, timeout: int = 15) -> Tuple[str, dict]:
@@ -780,7 +851,10 @@ Evidence and notes from crawled sources (trimmed):
         f"[{s['id']}] {s['title']} — {s['url']}" for s in sources_catalog
     ]
     refs_md = "\n".join(refs_md_lines)
-    yield format_sse({"event": "chunk", "data": {"text": refs_md, "section": "references"}})
+    # Back-compat: plain chunk
+    yield format_sse({"event": "chunk", "data": refs_md})
+    # New: section-tagged chunk
+    yield format_sse({"event": "section_chunk", "data": {"text": refs_md, "section": "references"}})
 
     duration = time.time() - start_time
     stats = {
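
For reference, a minimal standalone sketch of the redirect unwrapping that the updated clean_url now performs. The helper name unwrap_ddg_redirect and the sample URL are illustrative only and are not part of main.py; the logic mirrors the added branch plus the existing uddg extraction.

# Illustrative sketch only: mirrors the DuckDuckGo redirect handling added to clean_url.
# The function name and example URL are hypothetical, not taken from main.py.
import re
from urllib.parse import urlparse, unquote

def unwrap_ddg_redirect(url: str) -> str:
    # Normalize protocol-relative or root-relative DuckDuckGo redirect links
    if url.startswith('//duckduckgo.com/l/') or url.startswith('/l/?'):
        if url.startswith('//'):
            url = f"https:{url}"
        elif url.startswith('/'):
            url = f"https://duckduckgo.com{url}"
    # Extract the real target from the uddg query parameter, if present
    match = re.search(r'uddg=([^&]+)', urlparse(url).query)
    if match:
        return unquote(match.group(1))
    return url

# Example:
# unwrap_ddg_redirect('//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc')
# -> 'https://example.com/page'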
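Because the references are now emitted twice (a plain "chunk" event for older clients and a "section_chunk" event carrying the section tag), a consumer needs to dispatch on the event name. A rough client-side sketch follows, assuming format_sse serializes each payload as a JSON object on an SSE "data:" line; that framing is not shown in this diff and is an assumption.

# Hedged sketch of a consumer handling both events; assumes each SSE "data:" line
# carries the JSON payload passed to format_sse ({"event": ..., "data": ...}).
import json

def handle_sse_line(line: str, report_sections: dict) -> None:
    if not line.startswith("data:"):
        return
    payload = json.loads(line[len("data:"):].strip())
    event, data = payload.get("event"), payload.get("data")
    if event == "chunk":
        # Back-compat path: data is plain text
        report_sections.setdefault("body", []).append(data)
    elif event == "section_chunk":
        # New path: data is {"text": ..., "section": ...}
        report_sections.setdefault(data["section"], []).append(data["text"])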