Spaces: Build error

The Space fails to build with the following app.py:
import threading

import gradio as gr
import spaces  # HF Spaces helper (ZeroGPU decorators); unused in this CPU-only app
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# NOTE: despite the original comment, this checkpoint is pre-quantized with
# bitsandbytes (bnb-4bit), so loading it on CPU still pulls in bitsandbytes
# and is a likely source of the build error.
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="cpu",           # run on CPU
    torch_dtype=torch.float32,  # full precision on CPU
)
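If bitsandbytes is the culprit, a safer route on CPU is to load the unquantized base checkpoint and attach the adapter to that instead. This is a sketch, assuming the LoRA targets the standard Qwen2.5-3B-Instruct architecture and that Qwen/Qwen2.5-3B-Instruct is an acceptable substitute for the bnb-4bit repo:

# Assumed alternative: full-precision base model, no bitsandbytes dependency
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    device_map="cpu",
    torch_dtype=torch.float32,
)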
# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B",
)

# Move the model to CPU explicitly (PEFT does not always move it automatically)
model.to("cpu")
model.eval()  # inference mode: disables dropout
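Optionally, the adapter can be folded into the base weights, which removes the per-forward LoRA overhead on CPU. merge_and_unload() is a standard PEFT call; whether the speedup matters here is an assumption, so it is left commented out:

# Optional: merge LoRA deltas into the base weights (returns a plain transformers model)
# model = model.merge_and_unload()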
# Load the tokenizer (same tokenizer as the base Qwen2.5 checkpoint)
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")
def generate_response(prompt):
    # Wrap the user question in a step-by-step reasoning prompt
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )
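    # Alternative (an assumption, not in the original app): Qwen2.5-Instruct is
    # chat-tuned, so the tokenizer's chat template may prompt it more reliably
    # than a raw string:
    #   messages = [{"role": "user", "content": reasoning_prompt}]
    #   chat_text = tokenizer.apply_chat_template(
    #       messages, tokenize=False, add_generation_prompt=True
    #   )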
    # Tokenize and keep the tensors on CPU to match the model
    inputs = tokenizer(reasoning_prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to("cpu")
    attention_mask = inputs["attention_mask"].to("cpu")  # avoids a generate() warning
    # Stream tokens as they are generated; skip_prompt keeps the input
    # prompt out of the streamed output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generation_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer,
    )

    # Run generation in a background thread so this generator can consume the streamer
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Accumulate chunks: each yield replaces the Gradio textbox contents,
    # so yielding individual chunks would display one fragment at a time
    generated = ""
    for new_text in streamer:
        generated += new_text
        yield generated
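Since generate_response is a plain Python generator, it can be smoke-tested locally without the UI. This is illustrative only, and is commented out so it does not run at Space startup:

# Example local check:
#   last = ""
#   for partial in generate_response("Why is the sky blue?"):
#       last = partial
#   print(last)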
# Define the Gradio UI; because fn is a generator, the output textbox streams
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Reasoning Inference",
    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
    allow_flagging="never",  # newer Gradio releases use flagging_mode="never" instead
)
# Launch the Gradio app; share=True is unnecessary on Hugging Face Spaces,
# which exposes the app automatically
demo.launch()
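A Spaces build error often originates in requirements.txt rather than app.py. A plausible dependency list for this script, derived from the imports above (gradio itself is supplied by the Space's SDK setting, and any version pins would be guesswork, so none are given):

transformers
peft
accelerate     # needed when passing device_map to from_pretrained
torch
spaces
bitsandbytes   # only if the bnb-4bit base checkpoint is kept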