Spaces:

akra35567
/

akira

Running

App Files Files Community

akira / modules /web_search.py

akra35567

Update modules/web_search.py

b70cf63 about 1 month ago

raw

history blame

8.76 kB

	"""
	WebSearch — Módulo para busca de notícias (WebScraping) e pesquisa geral (API Placeholder).

	- Angola News: Fontes fixas (Angop, Novo Jornal, Jornal de Angola, etc.)
	- Busca Geral: Placeholder para integração de API externa (ex: Google Search API, Serper API)
	- Cache: 15 minutos (900 segundos)
	"""

	import time
	import re
	import requests
	from typing import List, Dict, Any
	from loguru import logger
	from bs4 import BeautifulSoup
	import os

	# Load the project configuration so API keys can be read from it later.
	try:
	    # The configuration is expected to live in modules/config.py.
	    import modules.config as config
	except ImportError:
	    # config.py is not importable: fall back to an empty stand-in object so
	    # the name ``config`` is always bound.
	    class ConfigMock:
	        """Empty placeholder used when modules.config cannot be imported."""

	    config = ConfigMock()

	# Logger setup for this module: register the file "web_search.log" as a
	# sink at INFO level, with rotation configured as "10 MB". This runs once,
	# at import time.
	logger.add("web_search.log", rotation="10 MB", level="INFO")


	class SimpleCache:
	    """Minimal in-memory cache whose entries expire after a time-to-live.

	    Entry age is measured with ``time.monotonic()`` so that adjustments to
	    the system clock can neither expire entries early nor keep them alive
	    longer than intended. Expired entries are dropped lazily on ``get`` and
	    swept on every ``set``, which keeps the cache from growing without
	    bound when many distinct keys are written and never read again.
	    """

	    def __init__(self, ttl: int = 900):  # 900 s = 15 min
	        """Create an empty cache.

	        Args:
	            ttl: Lifetime of an entry in seconds. An entry whose age is
	                greater than or equal to ``ttl`` is treated as expired.
	        """
	        self.ttl = ttl
	        # key -> (value, monotonic timestamp taken when the value was stored)
	        self._data: Dict[str, Any] = {}

	    def get(self, key: str):
	        """Return the value stored under ``key``, or ``None``.

	        ``None`` is returned both when the key was never stored and when
	        its entry has expired; an expired entry is removed as a side effect.
	        """
	        entry = self._data.get(key)
	        if entry is None:
	            return None
	        value, timestamp = entry
	        if time.monotonic() - timestamp < self.ttl:
	            return value
	        del self._data[key]
	        return None

	    def set(self, key: str, value: Any):
	        """Store ``value`` under ``key`` and restart its TTL.

	        Every entry that has already expired is removed first, so stale
	        keys do not accumulate. The sweep is linear in the number of
	        stored entries.
	        """
	        now = time.monotonic()
	        expired = [
	            k for k, (_, stored_at) in self._data.items()
	            if now - stored_at >= self.ttl
	        ]
	        for k in expired:
	            del self._data[k]
	        self._data[key] = (value, now)


	class WebSearch:
	    """Search manager for Angolan news (web scraping) and general web search.

	    News headlines are scraped from a fixed list of sites and the formatted
	    result is kept in ``self.cache``. General search is still a placeholder
	    that returns a fixed message until an external search API is wired in.
	    """

	    _TIMEOUT = 8          # seconds allowed for each HTTP request
	    _MAX_POR_FONTE = 3    # headlines taken from each source
	    _MAX_NOTICIAS = 5     # headlines kept in the final answer
	    _MIN_TITULO = 20      # titles must be strictly longer than this
	    _MAX_TEXTO = 200      # maximum length of a cleaned text snippet

	    def __init__(self):
	        """Create the cache, the HTTP session and the list of news sources."""
	        self.cache = SimpleCache(ttl=900)
	        self.session = requests.Session()
	        # Browser-like headers, to reduce the chance of being blocked.
	        self.session.headers.update({
	            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
	            "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7"
	        })
	        # Angolan news sources; the _buscar_* methods address them by index.
	        self.fontes_angola = [
	            "https://www.angop.ao/ultimas",
	            "https://www.novojornal.co.ao/",
	            "https://www.jornaldeangola.ao/",
	            "https://www.verangola.net/va/noticias"
	        ]

	    def _limpar_texto(self, texto: str) -> str:
	        """Collapse whitespace runs into single spaces and cap the length.

	        Returns an empty string for empty or ``None`` input. The result is
	        stripped and then truncated to ``_MAX_TEXTO`` characters so it stays
	        small inside the LLM prompt.
	        """
	        if not texto:
	            return ""
	        return re.sub(r"\s+", " ", texto).strip()[: self._MAX_TEXTO]

	    @staticmethod
	    def _normalizar_link(base_url: str, href: str) -> str:
	        """Resolve ``href`` against ``base_url`` and return an absolute URL.

	        Absolute links are returned unchanged; root-relative, relative and
	        protocol-relative links are resolved. A missing or empty ``href``
	        yields an empty string.
	        """
	        # Imported here so this class does not depend on a new file-level import.
	        from urllib.parse import urljoin

	        if not href:
	            return ""
	        return urljoin(base_url, href.strip())

	    def _raspar_fonte(self, indice: int, nome: str, seletor_itens: str,
	                      seletor_titulo: str = "", seletor_link: str = "") -> List[Dict]:
	        """Scrape up to ``_MAX_POR_FONTE`` headlines from one source.

	        Args:
	            indice: Position of the source URL in ``self.fontes_angola``.
	            nome: Human-readable source name, used in log messages.
	            seletor_itens: CSS selector matching one element per headline.
	            seletor_titulo: CSS selector, relative to each item, of the
	                element holding the title text. Empty means the item itself.
	            seletor_link: CSS selector, relative to each item, of the anchor
	                holding the link. Empty means the title element.

	        Returns:
	            A list of ``{"titulo": ..., "link": ...}`` dicts. Any failure
	            (network error, bad status, parsing problem) is logged and
	            produces an empty list; this method never raises.
	        """
	        try:
	            url = self.fontes_angola[indice]
	            r = self.session.get(url, timeout=self._TIMEOUT)
	            if r.status_code != 200:
	                logger.warning(f"{nome} respondeu com status {r.status_code}")
	                return []
	            soup = BeautifulSoup(r.text, 'html.parser')
	            noticias = []
	            for item in soup.select(seletor_itens)[: self._MAX_POR_FONTE]:
	                titulo = item.select_one(seletor_titulo) if seletor_titulo else item
	                link = item.select_one(seletor_link) if seletor_link else titulo
	                if titulo is None or link is None:
	                    continue
	                noticias.append({
	                    "titulo": self._limpar_texto(titulo.get_text()),
	                    "link": self._normalizar_link(url, link.get('href', ''))
	                })
	            return noticias
	        except Exception as e:
	            # Best effort by design: one broken source must not break the rest.
	            logger.warning(f"{nome} falhou: {e}")
	            return []

	    # --- GENERAL SEARCH (PLACEHOLDER) ---
	    def buscar_geral(self, query: str) -> str:
	        """Return general-knowledge web search results for ``query``.

	        WARNING: this is a PLACEHOLDER. It always returns a fixed message
	        explaining that an external search API (e.g. Serper or the Google
	        Search API) still has to be integrated. The answer is cached under
	        the query with surrounding whitespace removed and lower-cased; a
	        ``None`` query is treated as an empty one.
	        """
	        cache_key = f"busca_geral_{(query or '').strip().lower()}"
	        cached = self.cache.get(cache_key)
	        if cached:
	            return cached

	        logger.warning(f"PLACEHOLDER: Executando busca geral para '{query}'. É necessária integração de API externa.")

	        # Replace the assignment below with the real search API call.
	        fallback_response = "Sem informações de cultura geral disponíveis. Para ativar a pesquisa em tempo real, configure e integre uma API de busca (como Serper ou Google Search API) na função 'buscar_geral' do web_search.py."

	        self.cache.set(cache_key, fallback_response)
	        return fallback_response

	    # --- ANGOLAN NEWS (WEB SCRAPING) ---

	    def _buscar_angop(self) -> List[Dict]:
	        """Scrape headlines from Angop (source index 0)."""
	        return self._raspar_fonte(0, "Angop", '.ultimas-noticias .item', 'h3 a', 'a')

	    def _buscar_novojornal(self) -> List[Dict]:
	        """Scrape headlines from Novo Jornal (source index 1)."""
	        return self._raspar_fonte(1, "Novo Jornal", '.noticia-lista .titulo', 'a')

	    def _buscar_jornaldeangola(self) -> List[Dict]:
	        """Scrape headlines from Jornal de Angola (source index 2)."""
	        # The item selector already matches the anchors themselves.
	        return self._raspar_fonte(2, "Jornal de Angola", '.ultimas .titulo a')

	    def _buscar_verangola(self) -> List[Dict]:
	        """Scrape headlines from VerAngola (source index 3)."""
	        # The site markup may change; '.noticia-item' is the current entry point.
	        return self._raspar_fonte(3, "VerAngola", '.noticia-item', 'h3 a')

	    @classmethod
	    def _filtrar_unicas(cls, noticias: List[Dict]) -> List[Dict]:
	        """Drop short and duplicate headlines, keeping the first occurrences.

	        Titles are compared case-insensitively. Titles of ``_MIN_TITULO``
	        characters or fewer are discarded. At most ``_MAX_NOTICIAS`` items
	        are returned, in their original order.
	        """
	        vistos = set()
	        unicas = []
	        for noticia in noticias:
	            chave = noticia.get("titulo", "").lower()
	            if chave in vistos or len(chave) <= cls._MIN_TITULO:
	                continue
	            vistos.add(chave)
	            unicas.append(noticia)
	            if len(unicas) >= cls._MAX_NOTICIAS:
	                break
	        return unicas

	    @staticmethod
	    def _formatar_contexto(noticias: List[Dict]) -> str:
	        """Render headlines as a numbered list for injection into the prompt.

	        Only the titles are included; links are not part of the LLM context.
	        """
	        linhas = ["NOTÍCIAS RECENTES DE ANGOLA (CONTEXTO):"]
	        linhas.extend(f"[{i}] {n['titulo']}" for i, n in enumerate(noticias, 1))
	        return "\n".join(linhas).strip()

	    def pesquisar_noticias_angola(self) -> str:
	        """Return the latest Angolan headlines as a prompt-ready text block.

	        The result is served from the cache when available. Otherwise every
	        source is scraped in turn, the headlines are de-duplicated and
	        formatted, and the text is cached. When no headline survives, a
	        fixed "no news" message is returned and cached as well, so failing
	        sources are not hit again on every request.

	        The original docstring notes that api.py calls this method when it
	        detects a news intent.
	        """
	        cache_key = "noticias_angola"
	        cached = self.cache.get(cache_key)
	        if cached:
	            return cached

	        todas: List[Dict] = []
	        fontes = (
	            self._buscar_angop,
	            self._buscar_novojornal,
	            self._buscar_jornaldeangola,
	            self._buscar_verangola,
	        )
	        for buscar in fontes:
	            # Each source is isolated so one failure cannot skip the others.
	            try:
	                todas.extend(buscar())
	            except Exception as e:
	                logger.error(f"Erro no pipeline de scraping: {e}")

	        unicas = self._filtrar_unicas(todas)
	        if not unicas:
	            fallback = "Sem notícias recentes de Angola disponíveis no momento."
	            self.cache.set(cache_key, fallback)
	            return fallback

	        texto = self._formatar_contexto(unicas)
	        self.cache.set(cache_key, texto)
	        return texto