```python
# app.py
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import torch
import gradio as gr

# Model name from Hugging Face
MODEL_NAME = "TheBloke/Mistral-7B-v0.1-AWQ"

# Load the model
print("Loading Mistral 7B v0.1 AWQ model...")
model = AutoAWQForCausalLM.from_quantized(
    MODEL_NAME,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=False)
print("Model loaded successfully!")

# Text generation function
def generate_text(prompt, temperature, max_tokens):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=int(max_tokens),  # Gradio sliders return floats, so cast to int
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Clean the output (remove the original prompt from the response)
    if prompt in response:
        response = response[len(prompt):].strip()
    return response

# Gradio interface
interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=3, placeholder="Ask Mistral something...", label="Prompt"),
        gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(50, 1024, value=512, step=10, label="Max Tokens")
    ],
    outputs=gr.Textbox(lines=10, label="Response"),
    title="Mistral 7B v0.1 AWQ",
    description="Run the quantized Mistral 7B v0.1 model locally or on Google Colab using Gradio.",
    theme="default"
)

if __name__ == "__main__":
    interface.launch(share=True)
```
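Once the app is running, the same endpoint can also be called programmatically instead of through the browser UI. The snippet below is a minimal sketch using `gradio_client`; the local URL and the `/predict` API name are assumptions based on the defaults that `interface.launch()` and `gr.Interface` use, so swap in your `share` link or Space URL if you are running remotely.

```python
# query_app.py - minimal client sketch (assumes app.py above is already running)
from gradio_client import Client

# URL is an assumption: replace with the printed share link or your Space URL
client = Client("http://127.0.0.1:7860/")

# Positional arguments map to the Interface inputs: prompt, temperature, max tokens
result = client.predict(
    "Explain AWQ quantization in one paragraph.",  # Prompt
    0.7,                                           # Temperature
    256,                                           # Max Tokens
    api_name="/predict",
)
print(result)
```

This keeps the quantized model loaded in one process while other scripts or notebooks reuse it over HTTP.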