Commit 719ecfd
alessandro trinca tornidor committed
Parent(s): 995f2bf
[refactor] fix missing parse_args() functions, fix inference()
Files changed:
- app.py +54 -36
- utils/constants.py +47 -0
app.py
CHANGED
@@ -1,32 +1,20 @@
 import argparse
-import cv2
-import gradio as gr
 import json
 import logging
-import nh3
-import numpy as np
 import os
-import re
 import sys
-import
-
-
-
+from typing import Callable
+
+import gradio as gr
+import nh3
+from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
-from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
-from typing import Callable
 
-from
-from model.llava import conversation as conversation_lib
-from model.llava.mm_utils import tokenizer_image_token
-from model.segment_anything.utils.transforms import ResizeLongestSide
-from utils import session_logger
-from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
-                         DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
+from utils import constants, session_logger
 
-session_logger.change_logging(logging.DEBUG)
+session_logger.change_logging(logging.DEBUG)
 
 CUSTOM_GRADIO_PATH = "/"
 app = FastAPI(title="lisa_app", version="1.0")

@@ -48,6 +36,37 @@ def health() -> str:
     return json.dumps({"msg": "request failed"})
 
 
+@session_logger.set_uuid_logging
+def parse_args(args_to_parse):
+    parser = argparse.ArgumentParser(description="LISA chat")
+    parser.add_argument("--version", default="xinlai/LISA-13B-llama2-v1")
+    parser.add_argument("--vis_save_path", default="./vis_output", type=str)
+    parser.add_argument(
+        "--precision",
+        default="fp16",
+        type=str,
+        choices=["fp32", "bf16", "fp16"],
+        help="precision for inference",
+    )
+    parser.add_argument("--image_size", default=1024, type=int, help="image size")
+    parser.add_argument("--model_max_length", default=512, type=int)
+    parser.add_argument("--lora_r", default=8, type=int)
+    parser.add_argument(
+        "--vision-tower", default="openai/clip-vit-large-patch14", type=str
+    )
+    parser.add_argument("--local-rank", default=0, type=int, help="node rank")
+    parser.add_argument("--load_in_8bit", action="store_true", default=False)
+    parser.add_argument("--load_in_4bit", action="store_true", default=False)
+    parser.add_argument("--use_mm_start_end", action="store_true", default=True)
+    parser.add_argument(
+        "--conv_type",
+        default="llava_v1",
+        type=str,
+        choices=["llava_v1", "llava_llama_2"],
+    )
+    return parser.parse_args(args_to_parse)
+
+
 @session_logger.set_uuid_logging
 def get_cleaned_input(input_str):
     logging.info(f"start cleaning of input_str: {input_str}.")
@@ -85,12 +104,11 @@ def get_inference_model_by_args(args_to_parse):
 
     @session_logger.set_uuid_logging
     def inference(input_str, input_image):
-
-
-
-
-        logging.info(f"
-
+        logging.info(f"start cleaning input_str: {input_str}, type {type(input_str)}.")
+        output_str = get_cleaned_input(input_str)
+        logging.info(f"cleaned output_str: {output_str}, type {type(output_str)}.")
+        output_image = input_image
+        logging.info(f"output_image type: {type(output_image)}.")
         return output_image, output_str
 
     return inference

@@ -100,20 +118,20 @@ def get_inference_model_by_args(args_to_parse):
 def get_gradio_interface(fn_inference: Callable):
     return gr.Interface(
         fn_inference,
-
+        inputs=[
             gr.Textbox(lines=1, placeholder=None, label="Text Instruction"),
             gr.Image(type="filepath", label="Input Image")
-
-
+        ],
+        outputs=[
             gr.Image(type="pil", label="Segmentation Output"),
-
-
-        title=title,
-        description=description,
-        article=article,
-        examples=examples,
-        allow_flagging="auto"
-    )
+            gr.Textbox(lines=1, placeholder=None, label="Text Output")
+        ],
+        title=constants.title,
+        description=constants.description,
+        article=constants.article,
+        examples=constants.examples,
+        allow_flagging="auto"
+    )
 
 
 args = parse_args(sys.argv[1:])
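
Together, the two hunks above rebuild inference() as a closure returned by get_inference_model_by_args() and hand it to gr.Interface through explicit inputs=/outputs= keyword lists. A hedged sketch of how these pieces typically wire into the FastAPI app declared at the top of the file; the mount_gradio_app call is an assumption suggested by the CUSTOM_GRADIO_PATH constant, not a line shown in this diff:

    # Sketch, not the repo's code: wiring the functions from this diff together.
    import sys
    import gradio as gr
    from fastapi import FastAPI

    app = FastAPI(title="lisa_app", version="1.0")
    args = parse_args(sys.argv[1:])                   # module-level call shown above
    fn_inference = get_inference_model_by_args(args)  # returns the inner inference()
    io = get_gradio_interface(fn_inference)
    # Assumed follow-up step: serve the Gradio UI under CUSTOM_GRADIO_PATH ("/").
    app = gr.mount_gradio_app(app, io, path="/")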
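get_cleaned_input() appears in this diff only as context; its body is not part of the commit. Given the nh3 import that survives the refactor, a sanitizer in that spirit might look like the following (purely illustrative; the tags argument and the function body are assumptions, not the repo's code):

    import logging
    import nh3

    def get_cleaned_input(input_str: str) -> str:
        # Strip all HTML tags from the user prompt, keeping only the text.
        logging.info(f"start cleaning of input_str: {input_str}.")
        return nh3.clean(input_str, tags=set())
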
utils/constants.py
ADDED
@@ -0,0 +1,47 @@
+# Gradio
+examples = [
+    [
+        "Where can the driver see the car speed in this image? Please output segmentation mask.",
+        "./resources/imgs/example1.jpg",
+    ],
+    [
+        "Can you segment the food that tastes spicy and hot?",
+        "./resources/imgs/example2.jpg",
+    ],
+    [
+        "Assuming you are an autonomous driving robot, what part of the diagram would you manipulate to control the direction of travel? Please output segmentation mask and explain why.",
+        "./resources/imgs/example1.jpg",
+    ],
+    [
+        "What can make the woman stand higher? Please output segmentation mask and explain why.",
+        "./resources/imgs/example3.jpg",
+    ],
+]
+output_labels = ["Segmentation Output"]
+
+title = "LISA: Reasoning Segmentation via Large Language Model"
+
+description = """
+<font size=4>
+This is the online demo of LISA. \n
+If multiple users are using it at the same time, they will enter a queue, which may delay some time. \n
+**Note**: **Different prompts can lead to significantly varied results**. \n
+**Note**: Please try to **standardize** your input text prompts to **avoid ambiguity**, and also pay attention to whether the **punctuations** of the input are correct. \n
+**Note**: Current model is **LISA-13B-llama2-v0-explanatory**, and 4-bit quantization may impair text-generation quality. \n
+**Usage**: <br>
+ (1) To let LISA **segment something**, input prompt like: "Can you segment xxx in this image?", "What is xxx in this image? Please output segmentation mask."; <br>
+ (2) To let LISA **output an explanation**, input prompt like: "What is xxx in this image? Please output segmentation mask and explain why."; <br>
+ (3) To obtain **solely language output**, you can input like what you should do in current multi-modal LLM (e.g., LLaVA). <br>
+Hope you can enjoy our work!
+</font>
+"""
+
+article = """
+<p style='text-align: center'>
+<a href='https://arxiv.org/abs/2308.00692' target='_blank'>
+Preprint Paper
+</a>
+\n
+<p style='text-align: center'>
+<a href='https://github.com/dvlab-research/LISA' target='_blank'> Github Repo </a></p>
+"""
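
Since app.py now reads all of its UI copy from utils.constants, the interface can be reproduced standalone. A self-contained sketch (the echo function is a stand-in for the real model, and the import path assumes you run from the repo root):

    # Standalone sketch: feeding utils/constants.py into a gr.Interface.
    import gradio as gr
    from utils import constants

    def echo(text, image):
        # Placeholder for the real inference(): returns the inputs unchanged.
        return image, text

    demo = gr.Interface(
        echo,
        inputs=[
            gr.Textbox(lines=1, label="Text Instruction"),
            gr.Image(type="filepath", label="Input Image"),
        ],
        outputs=[
            gr.Image(type="pil", label="Segmentation Output"),
            gr.Textbox(lines=1, label="Text Output"),
        ],
        title=constants.title,
        description=constants.description,
        article=constants.article,
        examples=constants.examples,  # each entry is [prompt, image path]
    )

    if __name__ == "__main__":
        demo.launch()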