import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from PIL import Image

# Load model and processor directly
# Using device_map="auto" to handle GPU/CPU automatically
print("Loading Fara-7B model...")
processor = AutoProcessor.from_pretrained("microsoft/Fara-7B", trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    "microsoft/Fara-7B",
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
print("Model loaded successfully!")


def chat(message, history, image):
    """
    Chat function using the local Fara-7B model.
    """
    if not message and not image:
        return "Please provide text or an image."

    # Prepare content list for the model
    content = []

    # Add image if provided
    if image:
        content.append({"type": "image", "image": image})

    # Add text if provided
    if message:
        content.append({"type": "text", "text": message})
    elif image:
        # If only an image is provided, ask for a description
        content.append({"type": "text", "text": "Describe this image and what actions I can take."})

    # Construct messages in chat-template format
    messages = [
        {
            "role": "user",
            "content": content
        }
    ]

    try:
        # Process inputs
        # The processor handles the image and text formatting
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        # Generate response
        outputs = model.generate(**inputs, max_new_tokens=500)

        # Decode only the newly generated tokens (skip the prompt)
        generated_text = processor.decode(
            outputs[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        )

        return generated_text
    except Exception as e:
        return f"Error generating response: {str(e)}"


# Create a simple Gradio interface
with gr.Blocks(title="Fara-7B Simple Chat") as demo:
    gr.Markdown("# 🤖 Fara-7B Simple Chat")
    gr.Markdown("Running microsoft/Fara-7B directly using transformers.")

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="Upload Screenshot (Optional)")
        with gr.Column(scale=2):
            chatbot = gr.ChatInterface(
                fn=chat,
                additional_inputs=[image_input],
                type="messages"
            )

if __name__ == "__main__":
    demo.launch()