import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from typing import Optional


class CognitiveLLM:
    def __init__(self, model_name: str = "Qwen/Qwen3-8B", device: Optional[str] = None):
        """
        Initialize the Cognitive LLM with the specified model.

        Args:
            model_name: Name of the model to use (default: Qwen/Qwen3-8B)
            device: Device to run the model on ('cuda', 'mps', or 'cpu').
                Auto-detects if None.
        """
        self.model_name = model_name
        if device:
            self.device = device
        elif torch.cuda.is_available():
            self.device = 'cuda'
        elif torch.backends.mps.is_available():
            self.device = 'mps'
        else:
            self.device = 'cpu'

        print(f"Loading {model_name} on {self.device}...")

        # Load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )

        # Build model-loading kwargs. 4-bit quantization (bitsandbytes) and
        # FlashAttention 2 are CUDA-only, so fall back to plain bf16 elsewhere.
        # FlashAttention 2 also requires the flash-attn package to be installed.
        model_kwargs = {
            "device_map": "auto",
            "trust_remote_code": True,
            "torch_dtype": torch.bfloat16,
        }
        if self.device.startswith('cuda'):
            model_kwargs["attn_implementation"] = "flash_attention_2"
            model_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )

        # Load model (with 4-bit quantization for efficiency where supported)
        self.model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

        # Create a text generation pipeline around the preloaded model.
        # The model was already placed by device_map="auto", so no device
        # argument is passed here.
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
        )

        print(f"Model {model_name} loaded successfully on {self.device}")

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
        **generation_kwargs
    ) -> str:
        """
        Generate text from a prompt using the loaded model.

        Args:
            prompt: Input text prompt
            max_new_tokens: Maximum number of tokens to generate
            temperature: Sampling temperature (lower = more focused,
                higher = more creative)
            top_p: Nucleus sampling parameter
            **generation_kwargs: Additional generation parameters

        Returns:
            Generated text
        """
        # Format the prompt for Qwen3 chat; the pipeline applies the model's
        # chat template automatically.
        messages = [
            {"role": "user", "content": prompt}
        ]

        # Generate response
        response = self.pipe(
            messages,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            **generation_kwargs
        )

        # The pipeline returns the full conversation; the assistant's reply is
        # the last message. Note that Qwen3 models may prepend a
        # <think>...</think> reasoning block to the reply.
        return response[0]["generated_text"][-1]["content"]


def main():
    # Initialize the cognitive LLM
    llm = CognitiveLLM()

    print("\nCognitive LLM initialized. Type 'quit' to exit.")
    print("Enter your prompt:")

    # Interactive loop
    while True:
        try:
            user_input = input(">> ")
            if user_input.lower() in ('quit', 'exit', 'q'):
                break
            if not user_input.strip():
                continue

            # Generate and print the response
            response = llm.generate(user_input)
            print("\nResponse:")
            print(response)
            print("\n---\nEnter another prompt or 'quit' to exit:")
        except KeyboardInterrupt:
            print("\nExiting...")
            break
        except Exception as e:
            print(f"\nError: {e}")
            continue


if __name__ == "__main__":
    main()
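# Non-interactive usage sketch (the prompt and generation values below are
# illustrative, not tuned recommendations), assuming the model weights are
# available locally or downloadable from the Hugging Face Hub:
#
#     llm = CognitiveLLM()
#     answer = llm.generate(
#         "Summarize the trade-offs of 4-bit quantization in two sentences.",
#         max_new_tokens=128,
#         temperature=0.3,
#     )
#     print(answer)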