paligemma-hf

Runtime error

App Files Files Community

hermanhelf committed on Jun 29, 2024

Commit

9002c41

1 Parent(s): d914d44

simple edit to make simple

Browse files

Files changed (1) hide show

app.py +58 -55

app.py CHANGED Viewed

@@ -14,7 +14,8 @@ import numpy as np
 import spaces
-model_id = "google/paligemma-3b-mix-448"
 COLORS = ['#4285f4', '#db4437', '#f4b400', '#0f9d58', '#e48ef1']
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(device)
@@ -64,23 +65,24 @@ def parse_segmentation(input_image, input_text):
 ######## Demo
-INTRO_TEXT = """## PaliGemma demo\n\n
-| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
-| [Blogpost](https://huggingface.co/blog/paligemma)
-|\n\n
-PaliGemma is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
-built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
-vision model and the [Gemma](https://arxiv.org/abs/2403.08295) language model. PaliGemma is designed as a versatile
-model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
-answering, text reading, object detection and object segmentation.
-\n\n
-This space includes models fine-tuned on a mix of downstream tasks, **inferred via 🤗 transformers**.
-See the [Blogpost](https://huggingface.co/blog/paligemma) and
-[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
-for detailed information how to use and fine-tune PaliGemma models.
-\n\n
-**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
-"""
 with gr.Blocks(css="style.css") as demo:
@@ -92,14 +94,15 @@ with gr.Blocks(css="style.css") as demo:
         text_output = gr.Text(label="Text Output")
         chat_btn = gr.Button()
-        tokens = gr.Slider(
-            label="Max New Tokens",
-            info="Set to larger for longer generation.",
-            minimum=10,
-            maximum=100,
-            value=20,
-            step=10,
-        )
     chat_inputs = [
         image,
@@ -127,34 +130,34 @@ with gr.Blocks(css="style.css") as demo:
         examples=examples,
         inputs=chat_inputs,
     )
-  with gr.Tab("Segment/Detect"):
-    image = gr.Image(type="pil")
-    seg_input = gr.Text(label="Entities to Segment/Detect")
-    seg_btn = gr.Button("Submit")
-    annotated_image = gr.AnnotatedImage(label="Output")
-    examples = [["./cats.png", "segment cats"],
-                ["./bee.jpg", "detect bee"],
-               ["./examples/barsik.jpg", "segment cat"],
-               ["./bird.jpg", "segment bird ; bird ; plant"]]
-    gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
-    gr.Examples(
-        examples=examples,
-        inputs=[image, seg_input],
-    )
-    seg_inputs = [
-        image,
-        seg_input
-        ]
-    seg_outputs = [
-        annotated_image
-    ]
-    seg_btn.click(
-        fn=parse_segmentation,
-        inputs=seg_inputs,
-        outputs=seg_outputs,
-    )
@@ -323,4 +326,4 @@ def extract_objs(text, width, height, unique_labels=False):
 #########
 if __name__ == "__main__":
-    demo.queue(max_size=10).launch(debug=True)

 import spaces
+# model_id = "google/paligemma-3b-mix-448"
+model_id = "hermanhelf/paligemma"
 COLORS = ['#4285f4', '#db4437', '#f4b400', '#0f9d58', '#e48ef1']
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(device)
 ######## Demo
+# INTRO_TEXT = """## PaliGemma demo\n\n
+# | [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
+# | [Blogpost](https://huggingface.co/blog/paligemma)
+# |\n\n
+# PaliGemma is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
+# built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
+# vision model and the [Gemma](https://arxiv.org/abs/2403.08295) language model. PaliGemma is designed as a versatile
+# model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
+# answering, text reading, object detection and object segmentation.
+# \n\n
+# This space includes models fine-tuned on a mix of downstream tasks, **inferred via 🤗 transformers**.
+# See the [Blogpost](https://huggingface.co/blog/paligemma) and
+# [README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
+# for detailed information how to use and fine-tune PaliGemma models.
+# \n\n
+# **This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
+# """
+INTRO_TEXT = "## Demo\n\n"  # Short Markdown heading; the original multi-paragraph intro above is disabled.
 with gr.Blocks(css="style.css") as demo:
         text_output = gr.Text(label="Text Output")
         chat_btn = gr.Button()
+        # tokens = gr.Slider(
+        #     label="Max New Tokens",
+        #     info="Set to larger for longer generation.",
+        #     minimum=10,
+        #     maximum=100,
+        #     value=20,
+        #     step=10,
+        # )
+        tokens = 20
     chat_inputs = [
         image,
         examples=examples,
         inputs=chat_inputs,
     )
+  # with gr.Tab("Segment/Detect"):
+  #   image = gr.Image(type="pil")
+  #   seg_input = gr.Text(label="Entities to Segment/Detect")
+  #   seg_btn = gr.Button("Submit")
+  #   annotated_image = gr.AnnotatedImage(label="Output")
+  #   examples = [["./cats.png", "segment cats"],
+  #               ["./bee.jpg", "detect bee"],
+  #              ["./examples/barsik.jpg", "segment cat"],
+  #              ["./bird.jpg", "segment bird ; bird ; plant"]]
+  #   gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
+  #   gr.Examples(
+  #       examples=examples,
+  #       inputs=[image, seg_input],
+  #   )
+  #   seg_inputs = [
+  #       image,
+  #       seg_input
+  #       ]
+  #   seg_outputs = [
+  #       annotated_image
+  #   ]
+  #   seg_btn.click(
+  #       fn=parse_segmentation,
+  #       inputs=seg_inputs,
+  #       outputs=seg_outputs,
+  #   )
 #########
 if __name__ == "__main__":
+    demo.queue(max_size=10).launch(debug=True)