import os
from typing import Any, List, Mapping, Optional

import requests
from huggingface_hub import hf_hub_download
from langchain.chains import RetrievalQA
from langchain.llms.base import LLM
from langchain_core.prompts import PromptTemplate


# --- Custom LangChain LLM Wrapper for Hybrid Approach ---
class HybridLLM(LLM):
    api_url: str = ""
    local_llm: Any = None

    @property
    def _llm_type(self) -> str:
        return "hybrid_llm"

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
        # 1. Try the Colab API first
        if self.api_url:
            try:
                print(f"🌐 Calling Colab API: {self.api_url}")
                response = requests.post(
                    f"{self.api_url}/generate",
                    json={"prompt": prompt, "max_tokens": 512},
                    timeout=30,  # 30s timeout
                )
                if response.status_code == 200:
                    return response.json()["response"]
                else:
                    print(f"⚠️ API Error {response.status_code}: {response.text}")
            except Exception as e:
                print(f"⚠️ API Connection Failed: {e}")

        # 2. Fall back to the local GGUF model
        if self.local_llm:
            print("💻 Using Local GGUF Fallback...")
            # llama-cpp-python accepts a raw prompt string, so pass it through directly
            output = self.local_llm(
                prompt,
                max_tokens=512,
                stop=["<|im_end|>", "User:", "Input:"],
                echo=False,
            )
            return output["choices"][0]["text"]

        return "❌ Error: No working LLM available (API failed and no local model)."

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"api_url": self.api_url}


class LLMClient:
    def __init__(self, vector_store=None):
        """Initialize the hybrid LLM client."""
        self.vector_store = vector_store
        self.api_url = os.environ.get("COLAB_API_URL", "")  # read from environment variable
        self.local_llm = None

        # Load the local GGUF model eagerly so it is ready as a backup whenever
        # the API is missing or unreachable. A 2B model quantized to Q4_K_M
        # fits comfortably in 16 GB of RAM.
        try:
            print("📂 Loading Local Qwen3-VL-2B-Thinking (GGUF)...")
            from llama_cpp import Llama

            repo_id = "Qwen/Qwen3-VL-2B-Thinking-GGUF"
            filename = "Qwen3VL-2B-Thinking-Q4_K_M.gguf"
            model_path = hf_hub_download(repo_id=repo_id, filename=filename)

            self.local_llm = Llama(
                model_path=model_path,
                n_ctx=2048,
                n_threads=2,  # use 2 vCPUs
                verbose=False,
            )
            print("✅ Local GGUF Model Ready!")
        except Exception as e:
            print(f"⚠️ Could not load local GGUF: {e}")

        # Create the hybrid LangChain wrapper
        self.llm = HybridLLM(api_url=self.api_url, local_llm=self.local_llm)

    def analyze(self, text, context_chunks=None):
        """Analyze text using a LangChain RetrievalQA chain."""
        if not self.vector_store:
            return "❌ Vector Store not initialized."

        # Custom prompt template (ChatML format)
        template = """<|im_start|>system
You are a cybersecurity expert. Task: Determine whether the input is 'PHISHING' or 'BENIGN' (Safe).
Respond in the following format:
LABEL: [PHISHING or BENIGN]
EXPLANATION: [A brief Vietnamese explanation]

Context: {context}
<|im_end|>
<|im_start|>user
Input: {question}

Short Analysis:<|im_end|>
<|im_start|>assistant
"""
        PROMPT = PromptTemplate(
            template=template,
            input_variables=["context", "question"],
        )

        # Create the QA chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(search_kwargs={"k": 3}),
            chain_type_kwargs={"prompt": PROMPT},
        )

        try:
            print("🤖 Generating response...")
            response = qa_chain.invoke({"query": text})
            return response["result"]
        except Exception as e:
            return f"❌ Error: {str(e)}"
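

# --- Usage sketch (illustrative only) ---
# A minimal example of wiring the client up, assuming a LangChain-compatible
# vector store (e.g. FAISS or Chroma) was built elsewhere; the
# `load_vector_store()` helper mentioned below is hypothetical and not
# defined in this module.
if __name__ == "__main__":
    # vector_store = load_vector_store()  # hypothetical: returns a LangChain vector store
    vector_store = None  # without a store, analyze() returns the initialization error

    client = LLMClient(vector_store=vector_store)
    result = client.analyze(
        "Your account has been suspended. Verify now at http://secure-login.example.com"
    )
    print(result)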