Spaces:

akra35567
/

akira

Running

App Files Files Community

akra35567 committed on Nov 6

Commit

6ca7a85

1 Parent(s): f3f8380

Update modules/web_search.py

Browse files

Files changed (1) hide show

modules/web_search.py +158 -223

modules/web_search.py CHANGED Viewed

@@ -1,236 +1,171 @@
-# modules/treinamento.py
 """
-Sistema de treinamento avançado para Akira IA.
-- Implementação de um sistema de treinamento periódico e aprendizado em tempo real.
-- Usa um modelo NLP pesado para gerar embeddings.
 """
-import threading
 import time
-import logging
-import sqlite3
 import re
-import json
-import collections
-from typing import Optional, Any, List, Dict, Tuple
-from dataclasses import dataclass
-# Importa o módulo de configuração (necessário para constantes como DB_PATH)
-import modules.config as config
-from .database import Database # Importação adicionada para type hints (necessário para o método __init__)
-logger = logging.getLogger(__name__)
-# MODELO PESADO (Solicitado): paraphrase-multilingual-mpnet-base-v2
-try:
-    from sentence_transformers import SentenceTransformer
-    MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
-    logger.info(f"Modelo NLP carregado: {MODEL_NAME}")
-except Exception as e:
-    logger.warning(f"sentence_transformers não disponível: {e}")
-    SentenceTransformer = None
-    MODEL_NAME = None
-# Listas angolanas (Conforme fornecido pelo usuário)
-PALAVRAS_POSITIVAS = ['bom', 'ótimo', 'incrível', 'feliz', 'alegre', 'fixe', 'bué', 'top', 'show', 'adoro', 'rsrs', 'kkk']
-PALAVRAS_NEGATIVAS = ['ruim', 'péssimo', 'triste', 'ódio', 'puto', 'merda', 'caralho', 'chateado']
-GIRIAS_ANGOLANAS = ['mano', 'puto', 'cota', 'mwangolé', 'kota', 'oroh', 'bué', 'fixe', 'baza', 'kuduro']
-PALAVRAS_RUDES = ['caralho', 'puto', 'merda', 'fdp', 'vsf', 'burro', 'idiota', 'parvo']
-@dataclass
-class Interacao:
-    usuario: str
-    mensagem: str
-    resposta: str
-    numero: str
-    is_reply: bool = False
-    mensagem_original: str = ""
-class Treinamento:
-    """
-    Treinamento contínuo da Akira:
-    - Registra interações
-    - Analisa tom, emoção, gírias
-    - Heurístico para LLMs (sem fine-tuning pesado)
-    """
-    def __init__(self, db: Database, contexto: Optional[Any] = None, interval_hours: int = 1):
-        self.db = db
-        # O contexto é opcional, mas se existir, deve ser uma instância de Contexto
-        self.contexto = contexto
-        self.interval_hours = interval_hours
-        self._thread = None
-        self._running = False
-        self._model = None
-        # Usuários privilegiados
-        self.privileged_users = ['244937035662', 'isaac', 'isaac quarenta']
-    def _ensure_nlp_model(self):
-        """Carrega o modelo NLP pesado se ainda não estiver carregado."""
-        if self._model is not None:
-            return
-        if SentenceTransformer is None:
-            return
         try:
-            logger.info(f"Carregando modelo NLP pesado: {MODEL_NAME}...")
-            # Usa o MODEL_NAME definido globalmente
-            self._model = SentenceTransformer(MODEL_NAME)
-            logger.info(f"Modelo NLP carregado com sucesso: {MODEL_NAME}")
         except Exception as e:
-            logger.error(f"Falha ao carregar modelo NLP ({MODEL_NAME}): {e}")
-            logger.warning("Verifique se seu ambiente tem o HF_TOKEN correto e acesso à internet.")
-            self._model = None
-    def registrar_interacao(self, usuario: str, mensagem: str, resposta: str, numero: str = '', is_reply: bool = False, mensagem_original: str = ''):
-        """Salva + aprende na hora"""
         try:
-            # Assumindo que db.salvar_mensagem agora aceita todos os 6 parâmetros
-            self.db.salvar_mensagem(usuario, mensagem, resposta, numero, is_reply, mensagem_original)
-            self._aprender_em_tempo_real(numero, mensagem, resposta)
-            logger.info(f"Interação registrada e aprendida em tempo real: {numero}")
         except Exception as e:
-            logger.warning(f'Erro ao registrar interação: {e}')
-    def _aprender_em_tempo_real(self, numero: str, msg: str, resp: str):
-        """Executa análise heurística e NLP em tempo real."""
-        if not numero or numero == 'unknown':
-            return
-        texto = f"{msg} {resp}".lower()
-        # === ANÁLISE NLP (se disponível) ===
-        self._ensure_nlp_model()
-        if self._model:
-            try:
-                # O numpy.tobytes() garante que o embedding é salvo como BLOB (bytes)
-                emb = self._model.encode(texto).tobytes()
-                # Uso do novo método db.salvar_embedding (assumindo que existe na Database)
-                # O embedding é salvo com o texto (key) para referência futura
-                self.db.salvar_embedding(texto, emb)
-            except Exception as e:
-                logger.warning(f"Erro ao gerar/salvar embedding: {e}")
-                pass
-        # === ANÁLISE HEURÍSTICA ===
-        rude = any(p in texto for p in PALAVRAS_RUDES)
-        tom = 'rude' if rude else 'casual'
-        # Filtra palavras comuns e conta a frequência para top_girias
-        palavras = [p for p in re.findall(r'\b\w{4,}\b', texto)
-                      if p not in {'não', 'que', 'com', 'pra', 'uma', 'ele', 'ela', 'por', 'pra', 'tudo', 'bem', 'como'}]
-        contador = collections.Counter(palavras)
-        # Seleciona palavras com frequência > 1, excluindo as gírias conhecidas
-        top_girias = [w for w, c in contador.most_common(5) if c > 1 and w not in GIRIAS_ANGOLANAS]
-        # Salvar tom
-        intensidade = 0.9 if rude else 0.6
-        # Uso do novo método db.registrar_tom_usuario (assumindo que existe)
-        self.db.registrar_tom_usuario(numero, tom, intensidade, texto[:100])
-        # Salvar gírias
-        for giria in top_girias:
-            significado = "gíria rude" if rude else "gíria local"
-            # Uso do novo método db.salvar_giria_aprendida (assumindo que existe)
-            self.db.salvar_giria_aprendida(numero, giria, significado, texto[:100])
-        # Emoção: Puxa do contexto
-        if self.contexto and hasattr(self.contexto, 'analisar_emocoes_mensagem'):
-            # O Contexto.analisar_emocoes_mensagem retorna str ('feliz', 'triste', etc.)
-            emocao_str = self.contexto.analisar_emocoes_mensagem(msg)
-            # Adapta para o formato JSON esperado por salvar_aprendizado_detalhado
-            analise = {emocao_str: 1.0, "texto_original": msg}
-            self.db.salvar_aprendizado_detalhado(numero, "emocao_recente", json.dumps(analise))
-        else:
-            logger.debug(f"Contexto não disponível para análise de emoção em tempo real para {numero}.")
-    # ================================================================
-    # HEURÍSTICO PARA MISTRAL (TREINAMENTO PERIÓDICO)
-    # ================================================================
-    def _prepare_prompt_for_mistral(self, interacoes: List[Interacao]) -> str:
-        """Prepara prompt para LLM baseado em interações (exemplo de adaptação)."""
-        examples = []
-        for i in interacoes:
-            # Adaptando para um formato mais 'chat'
-            prompt = f"U: {i.mensagem}\nA: {i.resposta}"
-            examples.append(prompt)
-        return "\n".join(examples)
-    def train_once(self):
-        """Treinamento heurístico periódico: analisa usuários e salva preferências."""
-        logger.info("Treinamento heurístico iniciado...")
-        self._analisar_usuarios()
-        self._salvar_ultimo_treino()
-        logger.info("Treinamento concluído.")
-    def _analisar_usuarios(self):
-        """Analisa o histórico dos usuários para determinar tom e gírias."""
-        usuarios = set()
-        # Recupera números distintos de usuários que interagiram
-        rows = self.db._execute_with_retry("SELECT DISTINCT numero FROM mensagens WHERE numero IS NOT NULL AND numero != ''")
-        if rows:
-            for r in rows:
-                usuarios.add(r[0])
-        for num in usuarios:
-            # Recupera as 20 últimas mensagens (entrada e resposta)
-            msgs = self.db.recuperar_mensagens(num, limite=20)
-            if len(msgs) < 3: continue
-            tom = self._detectar_tom(msgs, num)
-            # Uso do novo método db.salvar_preferencia_tom (assumindo que existe)
-            self.db.salvar_preferencia_tom(num, tom)
-    def _detectar_tom(self, mensagens: List[Tuple], numero: str) -> str:
-        """Detecta o tom predominante do usuário."""
-        if numero in self.privileged_users:
-            return 'formal'
-        counter = collections.Counter()
-        for msg, _ in mensagens: # (mensagem_usuario, resposta_ia)
-            msg_l = (msg or '').lower()
-            if any(p in msg_l for p in PALAVRAS_RUDES):
-                counter['rude'] += 1
-            elif any(p in msg_l for p in ['por favor', 'obrigado', 'excelência']):
-                counter['formal'] += 1
-            elif any(p in msg_l for p in GIRIAS_ANGOLANAS):
-                counter['casual'] += 1
-            else:
-                counter['neutro'] += 1
-        return counter.most_common(1)[0][0] if counter else 'neutro'
-    def _salvar_ultimo_treino(self):
-        """Salva o timestamp do último treinamento."""
         try:
-            self.db.salvar_info_geral('ultimo_treino', str(time.time()))
-        except Exception:
             pass
-    # ================================================================
-    # LOOP DE TREINAMENTO (Mantido)
-    # ================================================================
-    def _run_loop(self):
-        """Loop principal do treinamento periódico."""
-        interval = max(1, self.interval_hours) * 3600
-        logger.info(f"Treinamento heurístico a cada {self.interval_hours}h")
-        while self._running:
-            try:
-                self.train_once()
-            except Exception as e:
-                logger.exception(f"Erro no treinamento: {e}")
-            for _ in range(int(interval)):
-                if not self._running:
                     break
-                time.sleep(1)
-        logger.info("Treinamento parado.")
-    def start_periodic_training(self):
-        """Inicia o treinamento periódico em uma thread separada."""
-        if self._running: return
-        self._running = True
-        self._thread = threading.Thread(target=self._run_loop, daemon=True)
-        self._thread.start()
-    def stop(self):
-        """Para o treinamento periódico."""
-        self._running = False
-        if self._thread:
-            # Tenta juntar a thread por 5 segundos
-            self._thread.join(timeout=5)

 """
+WebSearch — Busca notícias recentes de Angola
+- Fontes: Angop, Novo Jornal, Jornal de Angola, etc.
+- Cache: 15 minutos
+- Fallback: "Sem notícias recentes."
+- 100% seguro (requests + try/except)
 """
 import time
 import re
+import requests
+from typing import List, Dict
+from loguru import logger
+from bs4 import BeautifulSoup
+import modules.config as config
class SimpleCache:
    """Minimal in-memory key/value cache with a single time-to-live.

    Each entry is stored together with the clock reading taken when it was
    written. The monotonic clock is used so that adjustments to the system
    wall clock (NTP sync, manual changes) can neither expire entries early
    nor keep them alive past their TTL.
    """

    def __init__(self, ttl: int = 900):  # 15 min
        """Create an empty cache.

        Args:
            ttl: Lifetime of an entry in seconds.
        """
        self.ttl = ttl
        self._data = {}

    def get(self, key: str):
        """Return the value stored under ``key``, or ``None``.

        ``None`` is returned both when the key was never stored and when its
        entry is older than ``ttl`` seconds; an expired entry is removed
        from the cache as a side effect.
        """
        try:
            value, stored_at = self._data[key]
        except KeyError:
            return None
        if time.monotonic() - stored_at < self.ttl:
            return value
        # Expired: drop it so stale entries do not accumulate.
        self._data.pop(key, None)
        return None

    def set(self, key: str, value):
        """Store ``value`` under ``key`` and restart its TTL."""
        self._data[key] = (value, time.monotonic())
class WebSearch:
    """Collect recent headlines from a fixed list of Angolan news sites.

    Each ``_buscar_*`` method downloads one site and returns up to three
    ``{"titulo", "link"}`` dictionaries; any failure is logged and turned
    into an empty list. ``pesquisar_noticias_angola`` merges the sources,
    removes duplicate titles and returns a short numbered text, caching the
    result (including the fallback messages) for the cache TTL.

    NOTE(review): the CSS selectors below presume the current markup of each
    site; they cannot be verified from this file alone.
    """

    # Anything that is not a letter, digit, whitespace or basic punctuation
    # is dropped from headlines.
    _RE_NAO_PERMITIDOS = re.compile(r'[^a-zA-ZÀ-ÿ0-9\s\.,!?]')
    _RE_ESPACOS = re.compile(r'\s+')

    def __init__(self):
        """Set up the result cache, the HTTP session and the source URLs."""
        self.cache = SimpleCache(ttl=900)
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36"
        })
        # Order matters: the fetchers index into this list by position.
        self.fontes = [
            "https://www.angop.ao/ultimas",
            "https://www.novojornal.co.ao/",
            "https://www.jornaldeangola.ao/",
            "https://www.verangola.net/va/noticias"
        ]

    def _limpar_texto(self, texto: str) -> str:
        """Return ``texto`` sanitized and limited to 200 characters.

        Disallowed characters are removed *before* whitespace is collapsed,
        so removing a symbol that sat between two spaces does not leave a
        double space behind. Empty or ``None`` input yields ``""``.
        """
        if not texto:
            return ""
        texto = self._RE_NAO_PERMITIDOS.sub('', texto)
        texto = self._RE_ESPACOS.sub(' ', texto).strip()
        # rstrip again: the cut may land right after a space.
        return texto[:200].rstrip()

    @staticmethod
    def _link_absoluto(base: str, href: str) -> str:
        """Resolve ``href`` against the page URL ``base``; ``""`` if empty."""
        from urllib.parse import urljoin

        if not href:
            return ""
        return urljoin(base, href)

    def _obter_soup(self, url: str):
        """Download ``url`` and return the parsed page, or ``None`` on non-200."""
        r = self.session.get(url, timeout=8)
        if r.status_code != 200:
            return None
        return BeautifulSoup(r.text, 'html.parser')

    def _buscar_angop(self) -> List[Dict]:
        """Return up to three headlines from Angop (``fontes[0]``)."""
        url = self.fontes[0]
        try:
            soup = self._obter_soup(url)
            if soup is None:
                return []
            noticias = []
            for item in soup.select('.ultimas-noticias .item')[:3]:
                titulo = item.select_one('h3 a')
                link = item.select_one('a')
                if titulo and link:
                    noticias.append({
                        "titulo": self._limpar_texto(titulo.get_text()),
                        "link": self._link_absoluto(url, link.get('href', '')),
                    })
            return noticias
        except Exception as e:
            logger.warning(f"Angop falhou: {e}")
            return []

    def _buscar_novojornal(self) -> List[Dict]:
        """Return up to three headlines from Novo Jornal (``fontes[1]``)."""
        url = self.fontes[1]
        try:
            soup = self._obter_soup(url)
            if soup is None:
                return []
            noticias = []
            for item in soup.select('.noticia-lista .titulo')[:3]:
                a = item.find('a')
                if a:
                    noticias.append({
                        "titulo": self._limpar_texto(a.get_text()),
                        "link": self._link_absoluto(url, a.get('href', '')),
                    })
            return noticias
        except Exception as e:
            logger.warning(f"Novo Jornal falhou: {e}")
            return []

    def _buscar_jornaldeangola(self) -> List[Dict]:
        """Return up to three headlines from Jornal de Angola (``fontes[2]``)."""
        url = self.fontes[2]
        try:
            soup = self._obter_soup(url)
            if soup is None:
                return []
            return [
                {
                    "titulo": self._limpar_texto(a.get_text()),
                    "link": self._link_absoluto(url, a.get('href', '')),
                }
                for a in soup.select('.ultimas .titulo a')[:3]
            ]
        except Exception as e:
            logger.warning(f"Jornal de Angola falhou: {e}")
            return []

    def _buscar_verangola(self) -> List[Dict]:
        """Return up to three headlines from VerAngola (``fontes[3]``)."""
        url = self.fontes[3]
        try:
            soup = self._obter_soup(url)
            if soup is None:
                return []
            noticias = []
            for item in soup.select('.noticia-item')[:3]:
                titulo = item.select_one('h3 a')
                if titulo:
                    noticias.append({
                        "titulo": self._limpar_texto(titulo.get_text()),
                        "link": self._link_absoluto(url, titulo.get('href', '')),
                    })
            return noticias
        except Exception as e:
            logger.warning(f"VerAngola falhou: {e}")
            return []

    def pesquisar_noticias_angola(self) -> str:
        """Return a numbered list of up to three recent headlines.

        The result is served from the cache while it is fresh. Otherwise all
        sources are queried in order, titles of 20 characters or fewer and
        repeated titles (case-insensitive) are skipped, and the first three
        remaining titles are formatted. When nothing usable is found a
        fallback sentence is returned. Whatever is returned is also cached,
        so a failed run is not retried until the TTL expires.
        """
        cache_key = "noticias_angola"
        cached = self.cache.get(cache_key)
        if cached is not None:
            return cached

        fetchers = (
            self._buscar_angop,
            self._buscar_novojornal,
            self._buscar_jornaldeangola,
            self._buscar_verangola,
        )
        todas: List[Dict] = []
        for buscar in fetchers:
            # Guard each source separately so one unexpected failure does not
            # skip the remaining sources; KeyboardInterrupt/SystemExit still
            # propagate because only Exception is caught.
            try:
                todas.extend(buscar())
            except Exception as e:
                logger.warning(f"Fonte falhou ({buscar.__name__}): {e}")

        if not todas:
            fallback = "Sem notícias recentes."
            self.cache.set(cache_key, fallback)
            return fallback

        # Remove duplicates by title, keeping the first three distinct ones.
        vistos = set()
        unicas = []
        for n in todas:
            t = n["titulo"].lower()
            if len(t) <= 20 or t in vistos:
                continue
            vistos.add(t)
            unicas.append(n)
            if len(unicas) >= 3:
                break

        if not unicas:
            fallback = "Sem notícias relevantes."
            self.cache.set(cache_key, fallback)
            return fallback

        linhas = [f"{i}. {n['titulo']}" for i, n in enumerate(unicas, 1)]
        texto = ("NOTÍCIAS RECENTES:\n" + "\n".join(linhas)).strip()
        self.cache.set(cache_key, texto)
        return texto