import os
import uuid
import logging
import tempfile
from datetime import datetime

import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import upload_file

# Emit the logging.info messages below; the default level (WARNING) would hide them.
logging.basicConfig(level=logging.INFO)

# Model configuration
MID = "apple/FastVLM-0.5B"
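# LLaVA-style sentinel id: spliced into input_ids where the image features go,
# and swapped for the real image embeddings by the model's remote code.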
IMAGE_TOKEN_INDEX = -200


HF_MODEL = "rahul7star/ImageExplain"  # repo that receives the image + caption uploads

# Load model and tokenizer (lazy load)
tok = None
model = None

def load_model():
    global tok, model
    if tok is None or model is None:
        print("Loading model...")
        tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
        if torch.cuda.is_available():
            device = "cuda"
            dtype = torch.float16
        else:
            device = "cpu"
            dtype = torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            MID,
            torch_dtype=dtype,
            device_map=device,
            trust_remote_code=True,
        )
        print(f"Model loaded on {device.upper()} successfully!")
    return tok, model


def upload_to_hf(image_path, caption_text):
    """Upload the image and its caption text to HF_MODEL, grouped by date.

    Requires a HUGGINGFACE_HUB_TOKEN env var with write access to the repo.
    """
    # Date-stamped top-level folder, e.g. "2025-01-31-APPLE-IMAGE_FOLDER"
    today_str = datetime.now().strftime("%Y-%m-%d")
    date_folder = f"{today_str}-APPLE-IMAGE_FOLDER"

    # Unique subfolder for this upload
    unique_subfolder = f"upload_{uuid.uuid4().hex[:8]}"
    hf_folder = f"{date_folder}/{unique_subfolder}"
    logging.info(f"Uploading files to HF folder: {hf_folder} in repo {HF_MODEL}")

    # --- Upload the image ---
    image_filename = os.path.basename(image_path)
    image_hf_path = f"{hf_folder}/{image_filename}"
    upload_file(
        path_or_fileobj=image_path,
        path_in_repo=image_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded image to HF: {image_hf_path}")

    # --- Upload the caption as summary.txt ---
    summary_filename = f"summary_{os.path.splitext(image_filename)[0]}.txt"
    summary_file = os.path.join(tempfile.gettempdir(), summary_filename)
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(caption_text)

    summary_hf_path = f"{hf_folder}/{summary_filename}"
    upload_file(
        path_or_fileobj=summary_file,
        path_in_repo=summary_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded caption to HF: {summary_hf_path}")

    return hf_folder

def caption_image(image, custom_prompt=None):
    """Generate caption + upload image+caption to HF"""
    if image is None:
        return "Please upload an image first."
    try:
        # Save the uploaded image locally (needed for the HF upload)
        temp_img = os.path.join(tempfile.gettempdir(), "uploaded_image.png")
        image.save(temp_img)

        # Load model
        tok, model = load_model()
        if image.mode != "RGB":
            image = image.convert("RGB")

        prompt = custom_prompt if custom_prompt else "Describe this image in detail."
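        # The "<image>" tag marks where the image features will be inserted.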
        messages = [{"role": "user", "content": f"<image>\n{prompt}"}]

        rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
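        # Tokenize the text on either side of "<image>" and splice the image
        # sentinel id between the two halves.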
        pre, post = rendered.split("<image>", 1)
        pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
        post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
        img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
        input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
        attention_mask = torch.ones_like(input_ids, device=model.device)

        # Preprocess with the vision tower's bundled image processor so the
        # resizing/normalization matches the checkpoint.
        px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
        px = px.to(model.device, dtype=model.dtype)

        # Greedy decoding (do_sample=False) keeps captions deterministic.
        with torch.no_grad():
            out = model.generate(
                inputs=input_ids,
                attention_mask=attention_mask,
                images=px,
                max_new_tokens=128,
                do_sample=False,
            )
        generated_text = tok.decode(out[0], skip_special_tokens=True)
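        # decode() returns the full chat transcript, so keep only the assistant's reply.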
        response = generated_text.split("assistant")[-1].strip() if "assistant" in generated_text else generated_text

        # Upload image + caption to the HF repo; returns the destination folder
        hf_folder = upload_to_hf(temp_img, response)

        return f"{response}\n\n---\nUploaded to: {hf_folder}"

    except Exception as e:
        return f"Error generating caption: {str(e)}"


# Gradio UI
with gr.Blocks(title="FastVLM Image Captioning") as demo:
    gr.Markdown("# 🖼️ FastVLM Image Captioning")
    gr.Markdown(
        "### 🔗 For **Video Analysis**, click here: "
        "[Video-Analysis-AppleFastVLM-7B](https://huggingface.co/spaces/rahul7star/Video-Analysis-AppleFastVLM-7B)"
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            custom_prompt = gr.Textbox(
                label="Custom Prompt (Optional)",
                placeholder="Leave empty for default prompt",
                lines=2
            )
            generate_btn = gr.Button("Generate + Upload", variant="primary")
            clear_btn = gr.ClearButton([image_input, custom_prompt])
        with gr.Column():
            output = gr.Textbox(label="Generated Caption + Upload Status", lines=8, show_copy_button=True)

    generate_btn.click(caption_image, [image_input, custom_prompt], output)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)