import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import upload_file
import os
import uuid
import logging

# Configure the root logger, otherwise the logging.info() calls below are
# silently dropped (the default level is WARNING)
logging.basicConfig(level=logging.INFO)

# Model configuration
MID = "apple/FastVLM-0.5B"
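# Sentinel token ID marking where image features go in the prompt; FastVLM's
# remote code replaces it with vision-tower embeddings during generation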
IMAGE_TOKEN_INDEX = -200

# Your HF model repo where you want to upload results
HF_MODEL = "rahul7star/VideoExplain"  # change if needed

# Tokenizer and model are loaded lazily on first request
tok = None
model = None

def load_model():
    global tok, model
    if tok is None or model is None:
        print("Loading model...")
        tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
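        # fp16 on GPU for speed and memory; fp32 on CPU, where half-precision
        # ops are poorly supported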
        if torch.cuda.is_available():
            device = "cuda"
            dtype = torch.float16
        else:
            device = "cpu"
            dtype = torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            MID,
            torch_dtype=dtype,
            device_map=device,
            trust_remote_code=True,
        )
        print(f"Model loaded on {device.upper()} successfully!")
    return tok, model


def upload_to_hf(image_path, summary_text):
    """Upload image + summary text to Hugging Face model repo"""
    unique_folder = f"image_{uuid.uuid4().hex[:8]}"
    logging.info(f"Creating new HF folder: {unique_folder} in repo {HF_MODEL}")

    # Upload image
    img_filename = os.path.basename(image_path)
    img_hf_path = f"{unique_folder}/{img_filename}"
    upload_file(
        path_or_fileobj=image_path,
        path_in_repo=img_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded image to HF: {img_hf_path}")

    # Upload summary text
    summary_file = "/tmp/summary.txt"
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(summary_text)

    summary_hf_path = f"{unique_folder}/summary.txt"
    upload_file(
        path_or_fileobj=summary_file,
        path_in_repo=summary_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded summary to HF: {summary_hf_path}")

    return f"Uploaded to Hugging Face under {unique_folder}"


def caption_image(image, custom_prompt=None):
    """Generate caption + upload image+caption to HF"""
    if image is None:
        return "Please upload an image first."
    try:
        # Save the upload to a fixed temp path (needed for the HF upload);
        # note that concurrent requests would overwrite this file
        temp_img = "/tmp/uploaded_image.png"
        image.save(temp_img)

        # Load model
        tok, model = load_model()
        if image.mode != "RGB":
            image = image.convert("RGB")

        prompt = custom_prompt if custom_prompt else "Describe this image in detail."
        messages = [{"role": "user", "content": f"<image>\n{prompt}"}]

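        # Render the chat template to text, then splice the IMAGE_TOKEN_INDEX
        # sentinel in place of the literal "<image>" placeholder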
        rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        pre, post = rendered.split("<image>", 1)
        pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
        post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
        img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
        input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
        attention_mask = torch.ones_like(input_ids, device=model.device)

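        # Preprocess with the model's own vision-tower image processor and
        # match the model's device and dtype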
        px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
        px = px.to(model.device, dtype=model.dtype)

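        # FastVLM's remote-code generate() takes the pixel values via the
        # `images` kwarg; do_sample=False gives deterministic greedy decoding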
        with torch.no_grad():
            out = model.generate(
                inputs=input_ids,
                attention_mask=attention_mask,
                images=px,
                max_new_tokens=128,
                do_sample=False,
            )
        generated_text = tok.decode(out[0], skip_special_tokens=True)
        # Keep only the assistant turn if the chat-template role marker
        # survives decoding
        if "assistant" in generated_text:
            response = generated_text.split("assistant")[-1].strip()
        else:
            response = generated_text

        # Upload image + caption to HF repo
        upload_status = upload_to_hf(temp_img, response)

        return f"{response}\n\n---\n{upload_status}"

    except Exception as e:
        return f"Error generating caption: {str(e)}"

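# Hypothetical programmatic use, bypassing the Gradio UI:
#   from PIL import Image
#   print(caption_image(Image.open("example.jpg"), "What is in this image?"))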

# Gradio UI
with gr.Blocks(title="FastVLM Image Captioning") as demo:
    gr.Markdown("# 🖼️ FastVLM Image Captioning")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            custom_prompt = gr.Textbox(
                label="Custom Prompt (Optional)",
                placeholder="Leave empty for default prompt",
                lines=2
            )
            generate_btn = gr.Button("Generate + Upload", variant="primary")
            clear_btn = gr.ClearButton([image_input, custom_prompt])
        with gr.Column():
            output = gr.Textbox(label="Generated Caption + Upload Status", lines=8, show_copy_button=True)

    generate_btn.click(caption_image, [image_input, custom_prompt], output)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)