"""
WebSearch — Módulo para busca de notícias (WebScraping) e pesquisa geral (API Placeholder).

- Angola News: Fontes fixas (Angop, Novo Jornal, Jornal de Angola, etc.)
- Busca Geral: Placeholder para integração de API externa (ex: Google Search API, Serper API)
- Cache: 15 minutos (900 segundos)
"""

import os
import re
import time
from typing import Any, Dict, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from loguru import logger

# Import the project config so API keys can be read from it in the future.
try:
    # Assumes the config module lives at modules/config.py.
    import modules.config as config
except ImportError:
    # Fallback when modules.config cannot be imported: bind `config` to an
    # empty stand-in object so the name always exists at module level.
    class ConfigMock:
        pass
    config = ConfigMock()

# Add a file sink for log output: writes to "web_search.log" (a path relative
# to the process working directory), rotating at 10 MB, minimum level INFO.
# NOTE(review): this runs at import time, and loguru's `logger` is presumably
# shared process-wide, so the sink likely also receives records emitted by
# other modules — confirm whether a `filter` is wanted here.
logger.add("web_search.log", rotation="10 MB", level="INFO")


class SimpleCache:
    """Simple in-memory key/value cache with a fixed Time-To-Live (TTL).

    Each entry is stored together with the monotonic-clock reading taken when
    it was written. The monotonic clock is used instead of wall-clock time so
    that system clock adjustments cannot extend or shorten an entry's life.

    Expired entries are removed both when they are read and whenever a new
    entry is written, so keys that are never requested again do not pile up.

    Note: ``get`` returns ``None`` for a missing or expired key, so a stored
    value of ``None`` cannot be told apart from a cache miss.
    """

    def __init__(self, ttl: int = 900):  # 15 min
        """Create an empty cache.

        Args:
            ttl: Lifetime of each entry, in seconds.
        """
        self.ttl = ttl
        # key -> (value, time.monotonic() reading at write time)
        self._data: Dict[str, Any] = {}

    def get(self, key: str):
        """Return the value stored under *key*, or ``None`` if absent/expired."""
        entry = self._data.get(key)
        if entry is None:
            return None
        value, stored_at = entry
        if time.monotonic() - stored_at < self.ttl:
            return value
        # Entry outlived its TTL: drop it so it does not linger in memory.
        del self._data[key]
        return None

    def set(self, key: str, value: Any):
        """Store *value* under *key*, restarting that key's TTL."""
        now = time.monotonic()
        # Purge everything that has already expired; without this, entries
        # whose keys are never read again would stay in the dict forever.
        expired = [
            k for k, (_, stored_at) in self._data.items()
            if now - stored_at >= self.ttl
        ]
        for k in expired:
            del self._data[k]
        self._data[key] = (value, now)


class WebSearch:
    """Search manager: scraped Angola news headlines plus a general-search placeholder.

    - ``pesquisar_noticias_angola`` scrapes a fixed list of news sites and
      returns a short numbered list of headlines meant to be injected into an
      LLM prompt.
    - ``buscar_geral`` is a placeholder that returns a fixed message until an
      external search API is integrated.

    Both entry points store their result in ``self.cache`` (TTL 900 seconds).
    """

    # Timeout, in seconds, applied to every scraping request.
    _TIMEOUT = 8
    # Maximum number of items taken from each source page.
    _MAX_POR_FONTE = 3
    # Maximum number of headlines in the final answer.
    _MAX_NOTICIAS = 5
    # Titles with this many characters or fewer are dropped by the final filter.
    _MIN_TITULO = 20
    # Maximum length of a cleaned text, to keep the LLM context small.
    _MAX_CHARS = 200

    def __init__(self):
        """Create the cache, the HTTP session and the list of news sources."""
        self.cache = SimpleCache(ttl=900)
        self.session = requests.Session()
        # Browser-like headers, to reduce the chance of the scraper being blocked.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
            "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7"
        })
        # Angola news sources (web scraping). The order matters: each
        # _buscar_* method reads its URL from a fixed index of this list.
        self.fontes_angola = [
            "https://www.angop.ao/ultimas",
            "https://www.novojornal.co.ao/",
            "https://www.jornaldeangola.ao/",
            "https://www.verangola.net/va/noticias"
        ]

    def _limpar_texto(self, texto: str) -> str:
        """Collapse whitespace in *texto* and cap its length for the LLM.

        Returns an empty string for empty/None input. Runs of whitespace
        (including line breaks and tabs) become a single space, the result is
        stripped and cut to ``_MAX_CHARS`` characters.
        """
        if not texto:
            return ""
        texto = re.sub(r'[\s\n\t]+', ' ', texto)
        # rstrip after the cut: truncation may land right after a space.
        return texto.strip()[:self._MAX_CHARS].rstrip()

    @staticmethod
    def _link_absoluto(base: str, href: str) -> str:
        """Resolve *href* against the page URL *base*.

        Absolute links are returned unchanged, relative ones (with or without
        a leading slash) are resolved against *base*, and an empty *href*
        yields an empty string.
        """
        return urljoin(base, href) if href else ""

    def _obter_soup(self, url: str):
        """Fetch *url* and return its parsed HTML, or ``None`` on a non-200 status.

        Network errors are not caught here; callers handle them.
        """
        r = self.session.get(url, timeout=self._TIMEOUT)
        if r.status_code != 200:
            logger.warning(f"{url} respondeu com status {r.status_code}")
            return None
        return BeautifulSoup(r.text, 'html.parser')

    def _noticia(self, base: str, titulo_tag, link_tag) -> Dict:
        """Build one news dict (``titulo``, ``link``) from the given anchor tags."""
        return {
            "titulo": self._limpar_texto(titulo_tag.get_text()),
            "link": self._link_absoluto(base, link_tag.get('href', '')),
        }

    # --- MAIN GENERAL SEARCH FUNCTION (PLACEHOLDER) ---
    def buscar_geral(self, query: str) -> str:
        """Return web search results for general-knowledge questions.

        WARNING: this is a PLACEHOLDER. It performs no search; it logs a
        warning and returns a fixed message. To make it work, an external
        search API (e.g. Serper, Google Search API) must replace the fallback
        block below.

        Args:
            query: The search text; its lowercase form is used in the cache key.

        Returns:
            The cached value for this query if present, otherwise the fixed
            fallback message (which is then cached).
        """
        cache_key = f"busca_geral_{query.lower()}"
        cached = self.cache.get(cache_key)
        if cached:
            return cached

        logger.warning(f"PLACEHOLDER: Executando busca geral para '{query}'. É necessária integração de API externa.")

        # THE BLOCK BELOW MUST BE REPLACED BY THE REAL SEARCH API CALL

        # --- PLACEHOLDER START ---
        fallback_response = "Sem informações de cultura geral disponíveis. Para ativar a pesquisa em tempo real, configure e integre uma API de busca (como Serper ou Google Search API) na função 'buscar_geral' do web_search.py."
        # --- PLACEHOLDER END ---

        self.cache.set(cache_key, fallback_response)
        return fallback_response

    # --- ANGOLA NEWS SEARCH (WEB SCRAPING) ---
    # Every scraper is best-effort: any failure is logged and yields [].

    def _buscar_angop(self) -> List[Dict]:
        """Extract up to ``_MAX_POR_FONTE`` news items from Angop."""
        fonte = self.fontes_angola[0]
        try:
            soup = self._obter_soup(fonte)
            if soup is None:
                return []
            noticias = []
            for item in soup.select('.ultimas-noticias .item')[:self._MAX_POR_FONTE]:
                titulo = item.select_one('h3 a')
                # The link is taken from the first anchor inside the item.
                link = item.select_one('a')
                if titulo and link:
                    noticias.append(self._noticia(fonte, titulo, link))
            return noticias
        except Exception as e:
            logger.warning(f"Angop falhou: {e}")
            return []

    def _buscar_novojornal(self) -> List[Dict]:
        """Extract up to ``_MAX_POR_FONTE`` news items from Novo Jornal."""
        fonte = self.fontes_angola[1]
        try:
            soup = self._obter_soup(fonte)
            if soup is None:
                return []
            noticias = []
            for item in soup.select('.noticia-lista .titulo')[:self._MAX_POR_FONTE]:
                a = item.find('a')
                if a:
                    noticias.append(self._noticia(fonte, a, a))
            return noticias
        except Exception as e:
            logger.warning(f"Novo Jornal falhou: {e}")
            return []

    def _buscar_jornaldeangola(self) -> List[Dict]:
        """Extract up to ``_MAX_POR_FONTE`` news items from Jornal de Angola."""
        fonte = self.fontes_angola[2]
        try:
            soup = self._obter_soup(fonte)
            if soup is None:
                return []
            ancoras = soup.select('.ultimas .titulo a')[:self._MAX_POR_FONTE]
            return [self._noticia(fonte, a, a) for a in ancoras]
        except Exception as e:
            logger.warning(f"Jornal de Angola falhou: {e}")
            return []

    def _buscar_verangola(self) -> List[Dict]:
        """Extract up to ``_MAX_POR_FONTE`` news items from VerAngola."""
        fonte = self.fontes_angola[3]
        try:
            soup = self._obter_soup(fonte)
            if soup is None:
                return []
            noticias = []
            # Selectors may change; .noticia-item is the current starting point.
            for item in soup.select('.noticia-item')[:self._MAX_POR_FONTE]:
                titulo = item.select_one('h3 a')
                if titulo:
                    noticias.append(self._noticia(fonte, titulo, titulo))
            return noticias
        except Exception as e:
            logger.warning(f"VerAngola falhou: {e}")
            return []

    def pesquisar_noticias_angola(self) -> str:
        """Return the most recent Angola headlines obtained by web scraping.

        Sources are queried in a fixed order; titles are de-duplicated
        case-insensitively, titles of ``_MIN_TITULO`` characters or fewer are
        dropped, and at most ``_MAX_NOTICIAS`` headlines are kept.

        Returns:
            A header line followed by one ``[n] title`` line per headline, or
            a fixed fallback message when no headline was found. Either result
            is cached under the key ``"noticias_angola"``.
        """
        cache_key = "noticias_angola"
        cached = self.cache.get(cache_key)
        if cached:
            return cached

        fontes = (
            self._buscar_angop,
            self._buscar_novojornal,
            self._buscar_jornaldeangola,
            self._buscar_verangola,
        )
        todas = []
        for buscar in fontes:
            # Guard each source on its own so that one unexpected failure
            # does not prevent the remaining sources from being queried.
            try:
                todas.extend(buscar())
            except Exception as e:
                logger.error(f"Erro no pipeline de scraping: {e}")

        # Filter out short titles and remove duplicates.
        vistos = set()
        unicas = []
        for n in todas:
            t = n["titulo"].lower()
            if t in vistos or len(t) <= self._MIN_TITULO:
                continue
            vistos.add(t)
            unicas.append(n)
            if len(unicas) >= self._MAX_NOTICIAS:
                break

        if not unicas:
            fallback = "Sem notícias recentes de Angola disponíveis no momento."
            self.cache.set(cache_key, fallback)
            return fallback

        # Format the answer for injection into the LLM prompt; only the
        # title is relevant as context.
        linhas = [f"[{i}] {n['titulo']}" for i, n in enumerate(unicas, 1)]
        texto = ("NOTÍCIAS RECENTES DE ANGOLA (CONTEXTO):\n" + "\n".join(linhas)).strip()

        self.cache.set(cache_key, texto)
        return texto