rahul7star committed
Commit 51d9af5 · verified · 1 Parent(s): aab02c4

Create app1.py

Files changed (1)
  1. app1.py +145 -0
app1.py ADDED
@@ -0,0 +1,145 @@
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from huggingface_hub import upload_file
+ import os
+ import uuid
+ import logging
+
+ # Without configuration the root logger hides INFO, so the logging.info() calls below would be silent
+ logging.basicConfig(level=logging.INFO)
+
+ # Model configuration
+ MID = "apple/FastVLM-0.5B"
+ IMAGE_TOKEN_INDEX = -200  # placeholder id the model code replaces with projected image features
+
+ # Your HF model repo where you want to upload results
+ HF_MODEL = "rahul7star/VideoExplain"  # change if needed
+
+ # Load model and tokenizer (lazy load)
+ tok = None
+ model = None
+
+ def load_model():
+     """Load the tokenizer and model once, on first use."""
+     global tok, model
+     if tok is None or model is None:
+         print("Loading model...")
+         tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
+         if torch.cuda.is_available():
+             device = "cuda"
+             dtype = torch.float16
+         else:
+             device = "cpu"
+             dtype = torch.float32
+         model = AutoModelForCausalLM.from_pretrained(
+             MID,
+             torch_dtype=dtype,
+             device_map=device,
+             trust_remote_code=True,
+         )
+         print(f"Model loaded on {device.upper()} successfully!")
+     return tok, model
+
+
+ def upload_to_hf(image_path, summary_text):
+     """Upload image + summary text to Hugging Face model repo"""
+     unique_folder = f"image_{uuid.uuid4().hex[:8]}"
+     logging.info(f"Creating new HF folder: {unique_folder} in repo {HF_MODEL}")
+
+     # Upload image
+     img_filename = os.path.basename(image_path)
+     img_hf_path = f"{unique_folder}/{img_filename}"
+     upload_file(
+         path_or_fileobj=image_path,
+         path_in_repo=img_hf_path,
+         repo_id=HF_MODEL,
+         repo_type="model",
+         token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
+     )
+     logging.info(f"✅ Uploaded image to HF: {img_hf_path}")
+
+     # Upload summary text
+     summary_file = "/tmp/summary.txt"
+     with open(summary_file, "w", encoding="utf-8") as f:
+         f.write(summary_text)
+
+     summary_hf_path = f"{unique_folder}/summary.txt"
+     upload_file(
+         path_or_fileobj=summary_file,
+         path_in_repo=summary_hf_path,
+         repo_id=HF_MODEL,
+         repo_type="model",
+         token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
+     )
+     logging.info(f"✅ Uploaded summary to HF: {summary_hf_path}")
+
+     return f"Uploaded to Hugging Face under {unique_folder}"
+
+
+ def caption_image(image, custom_prompt=None):
+     """Generate caption + upload image+caption to HF"""
+     if image is None:
+         return "Please upload an image first."
+     try:
+         # Save uploaded image locally (needed for upload)
+         temp_img = "/tmp/uploaded_image.png"
+         image.save(temp_img)
+
+         # Load model
+         tok, model = load_model()
+         if image.mode != "RGB":
+             image = image.convert("RGB")
+
+         prompt = custom_prompt if custom_prompt else "Describe this image in detail."
+         messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
+
+         # Tokenize the text around the <image> placeholder, then splice in the sentinel id
+         rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+         pre, post = rendered.split("<image>", 1)
+         pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
+         post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
+         img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
+         input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
+         attention_mask = torch.ones_like(input_ids, device=model.device)
+
+         # Preprocess pixels with the model's own vision tower
+         px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
+         px = px.to(model.device, dtype=model.dtype)
+
+         with torch.no_grad():
+             out = model.generate(
+                 inputs=input_ids,
+                 attention_mask=attention_mask,
+                 images=px,
+                 max_new_tokens=128,
+                 do_sample=False,
+             )
+         generated_text = tok.decode(out[0], skip_special_tokens=True)
+         response = generated_text.split("assistant")[-1].strip() if "assistant" in generated_text else generated_text
+
+         # Upload image + caption to HF repo
+         upload_status = upload_to_hf(temp_img, response)
+
+         return f"{response}\n\n---\n{upload_status}"
+
+     except Exception as e:
+         return f"Error generating caption: {str(e)}"
+
+
+ # Gradio UI
+ with gr.Blocks(title="FastVLM Image Captioning") as demo:
+     gr.Markdown("# 🖼️ FastVLM Image Captioning")
+
+     with gr.Row():
+         with gr.Column():
+             image_input = gr.Image(type="pil", label="Upload Image")
+             custom_prompt = gr.Textbox(
+                 label="Custom Prompt (Optional)",
+                 placeholder="Leave empty for default prompt",
+                 lines=2
+             )
+             generate_btn = gr.Button("Generate + Upload", variant="primary")
+             clear_btn = gr.ClearButton([image_input, custom_prompt])
+         with gr.Column():
+             output = gr.Textbox(label="Generated Caption + Upload Status", lines=8, show_copy_button=True)
+
+     generate_btn.click(caption_image, [image_input, custom_prompt], output)
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
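
A quick way to exercise this file outside the Gradio UI (a sketch, not part of the commit): the snippet below assumes the file is saved as app1.py on the import path, that a local test.png exists, and that the HUGGINGFACE_HUB_TOKEN environment variable holds a token with write access to rahul7star/VideoExplain, since caption_image uploads its result after captioning. The module and image names are hypothetical.

    from PIL import Image
    import app1  # hypothetical module name; importing is safe since demo.launch() is guarded by __main__

    # First call loads the model lazily, then captions the image and uploads to the HF repo
    print(app1.caption_image(Image.open("test.png"), "What is happening in this photo?"))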