Update app.py
app.py CHANGED
@@ -4,7 +4,7 @@ import os
 import numpy as np
 from groq import Groq
 import spaces
-from transformers import AutoModel, AutoTokenizer
+from transformers import AutoModel, AutoTokenizer
 from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
 from parler_tts import ParlerTTSForConditionalGeneration
 import soundfile as sf
@@ -20,8 +20,6 @@ import requests
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 MODEL = 'llama3-groq-70b-8192-tool-use-preview'
 
-############### MINICPM HAS AN ERROR, IT NEEDS TO BE REPLACED ###############
-
 # Load MiniCPM-V-2_6 with 4-bit quantization
 text_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
                                        device_map="auto", torch_dtype=torch.bfloat16)
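A side note on this block: the comment still says MiniCPM-V-2_6 and "4-bit quantization", but the checkpoint is openbmb/MiniCPM-V-2 and the from_pretrained call only sets bfloat16, so no quantization is applied. If 4-bit loading is actually wanted (and the model's remote code supports it), a minimal sketch with bitsandbytes would look like the following; the settings are illustrative and not part of this commit.

import torch
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

# Illustrative 4-bit (NF4) load via bitsandbytes; not what the commit does.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
text_model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-V-2',
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True)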
@@ -30,9 +28,9 @@ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_co
 tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
 tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
 
-image_model = UNet2DConditionModel.
-image_pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", unet=image_model, torch_dtype=torch.float16
-image_pipe.scheduler = EulerDiscreteScheduler.
+image_model = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet")
+image_pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", unet=image_model, torch_dtype=torch.float16)
+image_pipe.scheduler = EulerDiscreteScheduler.from_pretrained(image_pipe.scheduler.config, timestep_spacing="trailing")
 
 # Initialize voice-only mode
 def play_voice_output(response):
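One caveat on the new scheduler line: in diffusers, EulerDiscreteScheduler.from_pretrained expects a repo id or path, whereas rebuilding a scheduler from an existing pipeline's config is normally done with from_config. A minimal sketch of the usual idiom, assuming the same base checkpoint (this is the standard diffusers pattern, not what the commit itself contains):

import torch
from diffusers import StableDiffusionXLPipeline, EulerDiscreteScheduler

# Standard diffusers idiom: rebuild the scheduler from the pipeline's own config.
image_pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
image_pipe.scheduler = EulerDiscreteScheduler.from_config(
    image_pipe.scheduler.config, timestep_spacing="trailing"
)

Loading the UNet separately from the same base repo, as the new lines do, only matters if a different UNet checkpoint is meant to be swapped in; otherwise the pipeline already contains it.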
@@ -174,13 +172,13 @@ def initialize_tools():
         }
     ]
     return tools
+
 @spaces.GPU()
-# Gradio Interface
 def main_interface(user_prompt, image=None, video=None, audio=None, doc=None, voice_only=False):
-    text_model
+    text_model.to(device='cuda', dtype=torch.bfloat16)
     tts_model.to("cuda")
     image_model.to("cuda", torch.float16)
-
+    image_pipe.to("cuda")
     response = handle_input(user_prompt, image=image, video=video, audio=audio, doc=doc)
     if voice_only:
         audio_file = play_voice_output(response)
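The .to("cuda") calls sit inside the @spaces.GPU()-decorated function because of the ZeroGPU execution model on Hugging Face Spaces: a GPU is attached only while a decorated function runs, so models are created on CPU at import time and moved to CUDA per call. A stripped-down sketch of that pattern (illustrative names, not the full app):

import spaces
import torch
from transformers import AutoModel

# Loaded on CPU when the Space starts; no GPU is available yet.
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2', trust_remote_code=True,
                                  torch_dtype=torch.bfloat16)

@spaces.GPU()
def run(prompt):
    # ZeroGPU attaches a GPU only for the duration of this call.
    model.to('cuda')
    # ... run inference here ...
    return prompt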
@@ -205,4 +203,4 @@ with gr.Blocks() as demo:
         outputs=output
     )
 
-demo.launch(inline=False)
+demo.launch(inline=False)
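The body of play_voice_output is not shown in this diff. With the Parler-TTS model and tokenizer loaded above, the usual generation pattern from the Parler-TTS README is sketched below; the voice description and output path are assumptions, not part of the commit.

import soundfile as sf

def play_voice_output(response):
    # Assumed voice description; any natural-language speaker description works.
    description = "A clear, friendly female voice with a neutral accent speaks at a moderate pace."
    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to("cuda")
    prompt_input_ids = tts_tokenizer(response, return_tensors="pt").input_ids.to("cuda")
    generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()
    sf.write("output.wav", audio_arr, tts_model.config.sampling_rate)
    return "output.wav"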