import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from typing import Optional, Dict, Any


class CognitiveLLM:
    def __init__(self, model_name: str = "Qwen/Qwen3-7B-Instruct", device: Optional[str] = None):
        """
        Initialize the Cognitive LLM with the specified model.

        Args:
            model_name: Name of the model to use (default: Qwen/Qwen3-7B-Instruct)
            device: Device to run the model on ('cuda', 'mps', or 'cpu'). Auto-detects if None.
        """
        self.model_name = model_name
        if device:
            self.device = device
        elif torch.cuda.is_available():
            self.device = 'cuda'
        elif torch.backends.mps.is_available():
            self.device = 'mps'
        else:
            self.device = 'cpu'
        print(f"Loading {model_name} on {self.device}...")

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )

        # Load the model; 4-bit quantization (bitsandbytes) and FlashAttention 2
        # are enabled only on CUDA, where they are supported
        model_kwargs = {
            "device_map": "auto",
            "trust_remote_code": True,
            "torch_dtype": torch.bfloat16,
        }
        if self.device == 'cuda':
            model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
            model_kwargs["attn_implementation"] = "flash_attention_2"
        self.model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

        # Create the text-generation pipeline; the model is already placed on
        # devices by device_map, so no device argument is passed here
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer
        )
        print(f"Model {model_name} loaded successfully on {self.device}")

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
        **generation_kwargs
    ) -> str:
        """
        Generate text from a prompt using the loaded model.

        Args:
            prompt: Input text prompt
            max_new_tokens: Maximum number of tokens to generate
            temperature: Sampling temperature (lower = more focused, higher = more creative)
            top_p: Nucleus sampling parameter
            **generation_kwargs: Additional generation parameters

        Returns:
            Generated text
        """
        # Format the prompt as a chat message; the pipeline applies the model's chat template
        messages = [
            {"role": "user", "content": prompt}
        ]

        # Generate the response
        response = self.pipe(
            messages,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            **generation_kwargs
        )

        # The chat pipeline returns the whole conversation; the last message is the assistant's reply
        return response[0]["generated_text"][-1]["content"]


def main():
    # Initialize the cognitive LLM
    llm = CognitiveLLM()
    print("\nCognitive LLM initialized. Type 'quit' to exit.")
    print("Enter your prompt:")

    # Interactive loop
    while True:
        try:
            user_input = input(">> ")
            if user_input.lower() in ['quit', 'exit', 'q']:
                break
            if user_input.strip() == '':
                continue

            # Generate a response
            response = llm.generate(user_input)
            print("\nResponse:")
            print(response)
            print("\n---\nEnter another prompt or 'quit' to exit:")
        except KeyboardInterrupt:
            print("\nExiting...")
            break
        except Exception as e:
            print(f"\nError: {str(e)}")
            continue


if __name__ == "__main__":
    main()
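
# --- Programmatic usage (sketch) ---
# A minimal example of calling CognitiveLLM from another script instead of the
# interactive loop above, assuming the default model fits on the detected
# device. The module name, prompt, and sampling values below are illustrative
# assumptions, not part of this file.
#
#   from app import CognitiveLLM  # "app" is an assumed module/file name
#
#   llm = CognitiveLLM()
#   answer = llm.generate(
#       "Explain nucleus sampling (top-p) in one paragraph.",
#       max_new_tokens=128,
#       temperature=0.3,
#   )
#   print(answer)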