# import torch
# print(torch.cuda.is_available())           # Should return True
# print(torch.cuda.get_device_name(0))       # Should return 'Tesla T4'
# print(torch.cuda.get_device_capability(0))

import llama_cpp
from llama_cpp import Llama
# import llama_cpp.llama_tokenizer
import gradio as gr
from huggingface_hub import hf_hub_download
model_name = "large-traversaal/Alif-1.0-8B-Instruct"
model_file = "model-Q8_0.gguf"

# Download the GGUF model weights from the Hugging Face Hub
model_path_file = hf_hub_download(
    repo_id=model_name,
    filename=model_file,
)
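# hf_hub_download returns the local cache path of the downloaded file; printing it
# makes it easy to confirm which GGUF file the Llama constructor below will load.
# (Optional sketch: hf_hub_download also accepts local_dir= to pin the download to
# a specific folder instead of the shared cache.)
print(f"Downloaded GGUF model to: {model_path_file}")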
# Alternative loaders (kept for reference):
# llama = llama_cpp.Llama.from_pretrained(
#     repo_id="large-traversaal/Alif-1.0-8B-Instruct",
#     filename="*model-Q6_K.gguf",
#     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
#         "large-traversaal/Alif-1.0-8B-Instruct"
#     ),
#     verbose=False,
# )
# llama = Llama(model_path="./model-Q8_0.gguf", verbose=False)
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,   # Number of layers to offload; adjust based on available VRAM
    n_threads=8,       # Match the number of CPU cores
    n_batch=512,       # Prompt-processing batch size (trade-off with VRAM usage)
    n_ctx=4096,        # Context window size
    verbose=True       # Enable debug logging
)
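# Optional smoke test (a minimal sketch, not part of the original flow): run one
# short, non-streaming completion to confirm the model loaded and GPU offload works
# before wiring up the UI.
# smoke = llama("Hello", max_tokens=8, echo=False)
# print(smoke["choices"][0]["text"])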
| chat_prompt = """You are Urdu Chatbot. Write approriate response for given instruction:{inp} Response:""" | |
| # prompt = "قابل تجدید توانائی کیا ہے؟" | |
| prompt = "شہر کراچی کے بارے میں بتاؤ" | |
| # prompt = chat_prompt.format(inp=prompt) | |
| # response = llama(prompt, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True) # Enable streaming | |
| # # prompt = "قابل تجدید توانائی کیا ہے؟" | |
| # stop_tokens = ["\n\n", "<|end_of_text|>"] # Stops after natural pauses or end-of-text token | |
# Generate text with streaming output
def chat_with_ai(prompt):
    """Stream the model's response to the given user prompt."""
    query = chat_prompt.format(inp=prompt)

    # response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)  # Enable streaming
    response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming

    # Chat-completion alternative (kept for reference):
    # response = llama.create_chat_completion(
    #     messages=[
    #         {"role": "system", "content": "You are an Urdu Chatbot."},
    #         {"role": "user", "content": prompt},
    #     ],
    #     stream=True,
    # )

    # Accumulate the streamed chunks and yield the running text so the UI
    # can update the output box incrementally.
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
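# Quick command-line check of the streaming generator (a sketch; uncomment to try
# it without the Gradio UI). chat_with_ai yields the accumulated text, so the last
# yielded value is the full response.
# final_text = ""
# for partial in chat_with_ai("شہر کراچی کے بارے میں بتاؤ"):  # "Tell me about the city of Karachi."
#     final_text = partial
# print(final_text)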
# Gradio UI setup
demo = gr.Interface(
    fn=chat_with_ai,   # Streaming function
    inputs="text",     # User input
    outputs="text",    # Model response
    title="💬 Streaming AI Chatbot",
    description="Enter a prompt and get a streamed response from Llama.cpp (GGUF)."
)
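# Note: on older Gradio releases, streaming a generator function required enabling
# the request queue (demo.queue()) before launch(); recent versions enable the
# queue by default, so launch() alone is typically sufficient.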
# Launch the Gradio app
demo.launch(share=True)