import os
import uuid
import logging
import tempfile
from datetime import datetime

import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import upload_file
# Model configuration
MID = "apple/FastVLM-0.5B"
IMAGE_TOKEN_INDEX = -200
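# Repo that receives the uploaded images and their captions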
HF_MODEL = "rahul7star/ImageExplain"
# Load model and tokenizer (lazy load)
tok = None
model = None
def load_model():
    global tok, model
    if tok is None or model is None:
        print("Loading model...")
        tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
        if torch.cuda.is_available():
            device = "cuda"
            dtype = torch.float16
        else:
            device = "cpu"
            dtype = torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            MID,
            torch_dtype=dtype,
            device_map=device,
            trust_remote_code=True,
        )
        print(f"Model loaded on {device.upper()} successfully!")
    return tok, model

def upload_to_hf(image_path, summary_text):
    """Upload the captioned image and a summary text file to the HF repo."""
    today_str = datetime.now().strftime("%Y-%m-%d")
    date_folder = f"{today_str}-APPLE-IMAGE_FOLDER"
    # Unique subfolder for this upload
    unique_subfolder = f"upload_{uuid.uuid4().hex[:8]}"
    hf_folder = f"{date_folder}/{unique_subfolder}"
    logging.info(f"Uploading files to HF folder: {hf_folder} in repo {HF_MODEL}")

    # --- Upload image ---
    image_filename = os.path.basename(image_path)
    image_hf_path = f"{hf_folder}/{image_filename}"
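    # upload_file authenticates with HUGGINGFACE_HUB_TOKEN; the Space needs a
    # token with write access to HF_MODEL or the upload will fail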
    upload_file(
        path_or_fileobj=image_path,
        path_in_repo=image_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded image to HF: {image_hf_path}")
    # --- Upload summary.txt ---
    summary_filename = f"summary_{os.path.splitext(image_filename)[0]}.txt"
    summary_file = os.path.join(tempfile.gettempdir(), summary_filename)
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(summary_text)

    summary_hf_path = f"{hf_folder}/{summary_filename}"
    upload_file(
        path_or_fileobj=summary_file,
        path_in_repo=summary_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded summary to HF: {summary_hf_path}")
    return hf_folder

def caption_image(image, custom_prompt=None):
    """Generate caption + upload image + caption to HF"""
    if image is None:
        return "Please upload an image first."

    try:
        # Save uploaded image locally (needed for upload)
        temp_img = "/tmp/uploaded_image.png"
        image.save(temp_img)

        # Load model
        tok, model = load_model()

        if image.mode != "RGB":
            image = image.convert("RGB")

        prompt = custom_prompt if custom_prompt else "Describe this image in detail."
        messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
        rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
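        # Split the rendered prompt at the <image> placeholder and splice the
        # image token id between the two tokenized halves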
        pre, post = rendered.split("<image>", 1)
        pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
        post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
        img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
        input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
        attention_mask = torch.ones_like(input_ids, device=model.device)
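        # Preprocess the image with the model's own vision-tower processor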
        px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
        px = px.to(model.device, dtype=model.dtype)
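        # Greedy decoding (do_sample=False), capped at 128 new tokens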
        with torch.no_grad():
            out = model.generate(
                inputs=input_ids,
                attention_mask=attention_mask,
                images=px,
                max_new_tokens=128,
                do_sample=False,
            )

        generated_text = tok.decode(out[0], skip_special_tokens=True)
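        # Keep only the text after the final "assistant" marker, if present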
        response = generated_text.split("assistant")[-1].strip() if "assistant" in generated_text else generated_text

        # Upload image + caption to HF repo
        hf_folder = upload_to_hf(temp_img, response)
        return f"{response}\n\n---\nUploaded to: {hf_folder}"
    except Exception as e:
        return f"Error generating caption: {str(e)}"
# Gradio UI
with gr.Blocks(title="FastVLM Image Captioning") as demo:
gr.Markdown("# ๐ผ๏ธ FastVLM Image Captioning")
# ๐ Add hyperlink here
gr.Markdown(
"### ๐ For **Video Analysis**, click here: "
"[Video-Analysis-AppleFastVLM-7B](https://huggingface.co/spaces/rahul7star/Video-Analysis-AppleFastVLM-7B)"
)
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            custom_prompt = gr.Textbox(
                label="Custom Prompt (Optional)",
                placeholder="Leave empty for default prompt",
                lines=2,
            )
            generate_btn = gr.Button("Generate + Upload", variant="primary")
            clear_btn = gr.ClearButton([image_input, custom_prompt])
        with gr.Column():
            output = gr.Textbox(label="Generated Caption + Upload Status", lines=8, show_copy_button=True)

    generate_btn.click(caption_image, [image_input, custom_prompt], output)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)