Spaces:
Paused
Paused
| """ | |
| LLM implementation using Hugging Face Inference Endpoint with OpenAI compatibility. | |
| """ | |
| import requests | |
| import os | |
| import json | |
| import logging | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
| logger = logging.getLogger(__name__) | |
| # Endpoint configuration | |
| HF_API_KEY = os.environ.get("HF_API_KEY", "") | |
| ENDPOINT_URL = os.environ.get("ENDPOINT_URL", "https://cg01ow7izccjx1b2.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions") | |
| # Verify configuration | |
| if not HF_API_KEY: | |
| logger.warning("HF_API_KEY environment variable not set") | |
| if not ENDPOINT_URL: | |
| logger.warning("ENDPOINT_URL environment variable not set") | |
| # Memory store for conversation history | |
| conversation_memory = {} | |
| def run_llm(input_text, max_tokens=512, temperature=0.7): | |
| """ | |
| Process input text through HF Inference Endpoint. | |
| Args: | |
| input_text: User input to process | |
| max_tokens: Maximum tokens to generate | |
| temperature: Temperature for sampling (higher = more random) | |
| Returns: | |
| Generated response text | |
| """ | |
| headers = { | |
| "Authorization": f"Bearer {HF_API_KEY}", | |
| "Content-Type": "application/json" | |
| } | |
| # Format messages in OpenAI format | |
| messages = [ | |
| {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."}, | |
| {"role": "user", "content": input_text} | |
| ] | |
| payload = { | |
| "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | |
| "messages": messages, | |
| "max_tokens": max_tokens, | |
| "temperature": temperature | |
| } | |
| logger.info(f"Sending request to endpoint: {ENDPOINT_URL[:30]}...") | |
| try: | |
| response = requests.post(ENDPOINT_URL, headers=headers, json=payload) | |
| response.raise_for_status() | |
| result = response.json() | |
| response_text = result["choices"][0]["message"]["content"] | |
| return response_text | |
| except requests.exceptions.RequestException as e: | |
| error_msg = f"Error calling endpoint: {str(e)}" | |
| if hasattr(e, 'response') and e.response is not None: | |
| error_msg += f" - Status code: {e.response.status_code}, Response: {e.response.text}" | |
| logger.error(error_msg) | |
| return f"Error generating response: {str(e)}" | |
| def run_llm_with_memory(input_text, session_id="default", max_tokens=512, temperature=0.7): | |
| """ | |
| Process input with conversation memory. | |
| Args: | |
| input_text: User input to process | |
| session_id: Unique identifier for conversation | |
| max_tokens: Maximum tokens to generate | |
| temperature: Temperature for sampling | |
| Returns: | |
| Generated response text | |
| """ | |
| # Initialize memory if needed | |
| if session_id not in conversation_memory: | |
| conversation_memory[session_id] = [ | |
| {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."} | |
| ] | |
| # Add current input to memory | |
| conversation_memory[session_id].append({"role": "user", "content": input_text}) | |
| # Prepare the full conversation history | |
| messages = conversation_memory[session_id].copy() | |
| # Keep only the last 10 messages to avoid context length issues | |
| if len(messages) > 10: | |
| # Always keep the system message | |
| messages = [messages[0]] + messages[-9:] | |
| headers = { | |
| "Authorization": f"Bearer {HF_API_KEY}", | |
| "Content-Type": "application/json" | |
| } | |
| payload = { | |
| "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | |
| "messages": messages, | |
| "max_tokens": max_tokens, | |
| "temperature": temperature | |
| } | |
| logger.info(f"Sending memory-based request for session {session_id}") | |
| try: | |
| response = requests.post(ENDPOINT_URL, headers=headers, json=payload) | |
| response.raise_for_status() | |
| result = response.json() | |
| response_text = result["choices"][0]["message"]["content"] | |
| # Save response to memory | |
| conversation_memory[session_id].append({"role": "assistant", "content": response_text}) | |
| return response_text | |
| except requests.exceptions.RequestException as e: | |
| error_msg = f"Error calling endpoint: {str(e)}" | |
| if hasattr(e, 'response') and e.response is not None: | |
| error_msg += f" - Status code: {e.response.status_code}, Response: {e.response.text}" | |
| logger.error(error_msg) | |
| return f"Error generating response: {str(e)}" | |
| def clear_memory(session_id="default"): | |
| """ | |
| Clear conversation memory for a specific session. | |
| Args: | |
| session_id: Unique identifier for conversation | |
| """ | |
| if session_id in conversation_memory: | |
| conversation_memory[session_id] = [ | |
| {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."} | |
| ] | |
| return True | |
| return False | |
| def get_memory_sessions(): | |
| """ | |
| Get list of active memory sessions. | |
| Returns: | |
| List of session IDs | |
| """ | |
| return list(conversation_memory.keys()) | |
| def get_model_info(): | |
| """ | |
| Get information about the connected model endpoint. | |
| Returns: | |
| Dictionary with endpoint information | |
| """ | |
| return { | |
| "endpoint_url": ENDPOINT_URL, | |
| "memory_sessions": len(conversation_memory), | |
| "model_type": "Meta-Llama-3.1-8B-Instruct (Inference Endpoint)" | |
| } | |
| def test_endpoint(): | |
| """ | |
| Test the endpoint connection. | |
| Returns: | |
| Status information | |
| """ | |
| try: | |
| response = run_llm("Hello, this is a test message. Please respond with a short greeting.") | |
| return { | |
| "status": "connected", | |
| "message": "Successfully connected to endpoint", | |
| "sample_response": response[:50] + "..." if len(response) > 50 else response | |
| } | |
| except Exception as e: | |
| return { | |
| "status": "error", | |
| "message": f"Failed to connect to endpoint: {str(e)}" | |
| } |