import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import upload_file
import os
import uuid
import logging
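
# Send logging.info messages to the Space logs (root logger defaults to WARNING)
logging.basicConfig(level=logging.INFO)
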
# Model configuration
MID = "apple/FastVLM-0.5B"
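# Sentinel id spliced into input_ids to mark where the image embeddings go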
IMAGE_TOKEN_INDEX = -200
# Your HF model repo where you want to upload results
HF_MODEL = "rahul7star/VideoExplain"  # change if needed
# Load model and tokenizer (lazy load)
tok = None
model = None

def load_model():
    global tok, model
    if tok is None or model is None:
        print("Loading model...")
        tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
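        # Prefer GPU with fp16 when available; otherwise run on CPU in fp32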
        if torch.cuda.is_available():
            device = "cuda"
            dtype = torch.float16
        else:
            device = "cpu"
            dtype = torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            MID,
            torch_dtype=dtype,
            device_map=device,
            trust_remote_code=True,
        )
        print(f"Model loaded on {device.upper()} successfully!")
    return tok, model

def upload_to_hf(image_path, summary_text):
    """Upload image + summary text to Hugging Face model repo"""
    unique_folder = f"image_{uuid.uuid4().hex[:8]}"
    logging.info(f"Creating new HF folder: {unique_folder} in repo {HF_MODEL}")

    # Upload image
    img_filename = os.path.basename(image_path)
    img_hf_path = f"{unique_folder}/{img_filename}"
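    # upload_file needs a write-scoped token; set HUGGINGFACE_HUB_TOKEN in the Space secrets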
    upload_file(
        path_or_fileobj=image_path,
        path_in_repo=img_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded image to HF: {img_hf_path}")

    # Upload summary text
    summary_file = "/tmp/summary.txt"
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(summary_text)
    summary_hf_path = f"{unique_folder}/summary.txt"
    upload_file(
        path_or_fileobj=summary_file,
        path_in_repo=summary_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded summary to HF: {summary_hf_path}")
    return f"Uploaded to Hugging Face under {unique_folder}"

def caption_image(image, custom_prompt=None):
    """Generate caption + upload image+caption to HF"""
    if image is None:
        return "Please upload an image first."
    try:
        # Save uploaded image locally (needed for upload)
        temp_img = "/tmp/uploaded_image.png"
        image.save(temp_img)

        # Load model
        tok, model = load_model()
        if image.mode != "RGB":
            image = image.convert("RGB")

        prompt = custom_prompt if custom_prompt else "Describe this image in detail."
        messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
        rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
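        # Split at the "<image>" placeholder and splice the sentinel token between the halves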
        pre, post = rendered.split("<image>", 1)
        pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
        post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
        img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
        input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
        attention_mask = torch.ones_like(input_ids, device=model.device)
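
        # Preprocess the image with the vision tower's own processor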
        px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
        px = px.to(model.device, dtype=model.dtype)
        with torch.no_grad():
            out = model.generate(
                inputs=input_ids,
                attention_mask=attention_mask,
                images=px,
                max_new_tokens=128,
                do_sample=False,
            )
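
        # Keep only the assistant's reply from the decoded transcript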
        generated_text = tok.decode(out[0], skip_special_tokens=True)
        response = generated_text.split("assistant")[-1].strip() if "assistant" in generated_text else generated_text

        # Upload image + caption to HF repo
        upload_status = upload_to_hf(temp_img, response)
        return f"{response}\n\n---\n{upload_status}"
    except Exception as e:
        return f"Error generating caption: {str(e)}"

# Gradio UI
with gr.Blocks(title="FastVLM Image Captioning") as demo:
    gr.Markdown("# 🖼️ FastVLM Image Captioning")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            custom_prompt = gr.Textbox(
                label="Custom Prompt (Optional)",
                placeholder="Leave empty for default prompt",
                lines=2,
            )
            generate_btn = gr.Button("Generate + Upload", variant="primary")
            clear_btn = gr.ClearButton([image_input, custom_prompt])
        with gr.Column():
            output = gr.Textbox(label="Generated Caption + Upload Status", lines=8, show_copy_button=True)
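
    # Wire the button: caption the image, then append the upload status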
    generate_btn.click(caption_image, [image_input, custom_prompt], output)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)