manueljohnson063 committed on
Commit
50d7c9d
·
verified ·
1 Parent(s): 87552f0

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +233 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ from nemo.collections.speechlm2.models import SALM
4
+ import torch
5
+ import tempfile
6
+ import os
7
+
8
+ # Load model using official NVIDIA NeMo approach
9
+ model_id = "nvidia/canary-qwen-2.5b"
10
+ print("Loading NVIDIA Canary-Qwen-2.5B model using NeMo...")
11
+ model = SALM.from_pretrained(model_id)
12
+
13
+ def generate_text(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
14
+ """Generate text using the NVIDIA NeMo model (LLM mode)"""
15
+
16
+ try:
17
+ # Use LLM mode (text-only) as per official documentation
18
+ with model.llm.disable_adapter():
19
+ answer_ids = model.generate(
20
+ prompts=[[{"role": "user", "content": prompt}]],
21
+ max_new_tokens=max_tokens,
22
+ temperature=temperature,
23
+ top_p=top_p,
24
+ do_sample=True
25
+ )
26
+
27
+ # Convert IDs to text using model's tokenizer
28
+ response = model.tokenizer.ids_to_text(answer_ids[0].cpu())
29
+ return response
30
+
31
+ except Exception as e:
32
+ return f"Error generating text: {str(e)}"
33
+
34
+ def transcribe_audio(audio_file, user_prompt="Transcribe the following:"):
35
+ """Transcribe audio using ASR mode"""
36
+
37
+ try:
38
+ if audio_file is None:
39
+ return "No audio file provided"
40
+
41
+ # Use ASR mode (speech-to-text) as per official documentation
42
+ answer_ids = model.generate(
43
+ prompts=[
44
+ [{"role": "user", "content": f"{user_prompt} {model.audio_locator_tag}", "audio": [audio_file]}]
45
+ ],
46
+ max_new_tokens=128,
47
+ )
48
+
49
+ # Convert IDs to text
50
+ transcript = model.tokenizer.ids_to_text(answer_ids[0].cpu())
51
+ return transcript
52
+
53
+ except Exception as e:
54
+ return f"Error transcribing audio: {str(e)}"
55
+
56
+ def chat_interface(message, history, max_tokens, temperature, top_p):
57
+ """Chat interface for Gradio"""
58
+
59
+ # Build conversation context
60
+ conversation = ""
61
+ for user_msg, bot_msg in history:
62
+ conversation += f"User: {user_msg}\nAssistant: {bot_msg}\n"
63
+
64
+ conversation += f"User: {message}\nAssistant: "
65
+
66
+ # Generate response
67
+ response = generate_text(conversation, max_tokens, temperature, top_p)
68
+
69
+ # Update history
70
+ history.append((message, response))
71
+
72
+ return "", history
73
+
74
+ # Create Gradio interface
75
+ with gr.Blocks(title="NVIDIA Canary-Qwen-2.5B Chat") as demo:
76
+
77
+ gr.HTML("""
78
+ <div style="text-align: center;">
79
+ <h1>πŸ€– NVIDIA Canary-Qwen-2.5B</h1>
80
+ <p>Official NeMo implementation - Speech-to-Text & Text Generation</p>
81
+ <p><strong>Capabilities:</strong> Audio Transcription + Text Chat</p>
82
+ </div>
83
+ """)
84
+
85
+ with gr.Tab("🎀 Audio Transcription (ASR)"):
86
+ with gr.Row():
87
+ with gr.Column():
88
+ audio_input = gr.Audio(
89
+ label="Upload Audio File (.wav or .flac)",
90
+ type="filepath",
91
+ format="wav"
92
+ )
93
+
94
+ asr_prompt = gr.Textbox(
95
+ label="Custom Prompt (optional)",
96
+ value="Transcribe the following:",
97
+ placeholder="Enter custom transcription prompt..."
98
+ )
99
+
100
+ transcribe_btn = gr.Button("🎀 Transcribe Audio", variant="primary")
101
+
102
+ transcript_output = gr.Textbox(
103
+ label="Transcription Result",
104
+ lines=8,
105
+ max_lines=15
106
+ )
107
+
108
+ gr.Examples(
109
+ examples=[
110
+ ["Transcribe the following:"],
111
+ ["Please transcribe this audio in detail:"],
112
+ ["Convert this speech to text:"]
113
+ ],
114
+ inputs=[asr_prompt]
115
+ )
116
+
117
+ with gr.Tab("πŸ’¬ Text Chat (LLM)"):
118
+ with gr.Row():
119
+ with gr.Column(scale=3):
120
+ chatbot = gr.Chatbot(height=400)
121
+ msg = gr.Textbox(label="Your message", placeholder="Type here...")
122
+
123
+ with gr.Row():
124
+ submit_btn = gr.Button("Send", variant="primary")
125
+ clear_btn = gr.Button("Clear Chat")
126
+
127
+ with gr.Column(scale=1):
128
+ gr.Markdown("### βš™οΈ Settings")
129
+
130
+ max_tokens = gr.Slider(
131
+ minimum=10, maximum=500, value=200, step=10,
132
+ label="Max Tokens"
133
+ )
134
+
135
+ temperature = gr.Slider(
136
+ minimum=0.1, maximum=2.0, value=0.7, step=0.1,
137
+ label="Temperature"
138
+ )
139
+
140
+ top_p = gr.Slider(
141
+ minimum=0.1, maximum=1.0, value=0.9, step=0.05,
142
+ label="Top-p"
143
+ )
144
+
145
+ with gr.Tab("πŸ“ Single Generation"):
146
+ with gr.Column():
147
+ prompt_input = gr.Textbox(
148
+ label="Prompt",
149
+ placeholder="Enter your prompt...",
150
+ lines=5
151
+ )
152
+
153
+ generate_btn = gr.Button("Generate", variant="primary")
154
+
155
+ output_text = gr.Textbox(
156
+ label="Generated Text",
157
+ lines=10,
158
+ max_lines=20
159
+ )
160
+
161
+ with gr.Row():
162
+ single_max_tokens = gr.Slider(10, 500, 200, label="Max Tokens")
163
+ single_temperature = gr.Slider(0.1, 2.0, 0.7, label="Temperature")
164
+ single_top_p = gr.Slider(0.1, 1.0, 0.9, label="Top-p")
165
+
166
+ with gr.Tab("ℹ️ Model Info"):
167
+ gr.Markdown("""
168
+ ## NVIDIA Canary-Qwen-2.5B Model Information
169
+
170
+ ### Capabilities:
171
+ - 🎀 **Audio Transcription (ASR)**: Convert speech to text
172
+ - πŸ’¬ **Text Generation (LLM)**: Chat and text completion
173
+ - 🎯 **Multimodal**: Combines audio and text processing
174
+
175
+ ### Model Details:
176
+ - **Size**: 2.5 billion parameters
177
+ - **Framework**: NVIDIA NeMo
178
+ - **Audio Input**: 16kHz mono-channel .wav or .flac files
179
+ - **Languages**: Multiple languages supported
180
+
181
+ ### Usage Tips:
182
+ 1. **For Audio**: Upload .wav or .flac files (16kHz recommended)
183
+ 2. **For Text**: Use natural language prompts
184
+ 3. **Custom Prompts**: You can modify transcription prompts
185
+ 4. **Parameters**: Adjust temperature and tokens for different outputs
186
+
187
+ ### Official Documentation:
188
+ - [Model Card](https://huggingface.co/nvidia/canary-qwen-2.5b)
189
+ - [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)
190
+ """)
191
+
192
+ # Event handlers
193
+ transcribe_btn.click(
194
+ transcribe_audio,
195
+ inputs=[audio_input, asr_prompt],
196
+ outputs=[transcript_output]
197
+ )
198
+
199
+ # Event handlers
200
+ submit_btn.click(
201
+ chat_interface,
202
+ inputs=[msg, chatbot, max_tokens, temperature, top_p],
203
+ outputs=[msg, chatbot]
204
+ )
205
+
206
+ msg.submit(
207
+ chat_interface,
208
+ inputs=[msg, chatbot, max_tokens, temperature, top_p],
209
+ outputs=[msg, chatbot]
210
+ )
211
+
212
+ clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])
213
+
214
+ generate_btn.click(
215
+ generate_text,
216
+ inputs=[prompt_input, single_max_tokens, single_temperature, single_top_p],
217
+ outputs=[output_text]
218
+ )
219
+
220
+ # Example prompts
221
+ gr.Examples(
222
+ examples=[
223
+ ["Explain quantum computing in simple terms"],
224
+ ["Write a short story about AI"],
225
+ ["What are the benefits of renewable energy?"],
226
+ ["How do neural networks work?"],
227
+ ["Summarize the key points about machine learning"]
228
+ ],
229
+ inputs=[prompt_input]
230
+ )
231
+
232
+ if __name__ == "__main__":
233
+ demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch>=2.6.0
2
+ gradio>=4.0.0
3
+ nemo_toolkit[asr,tts] @ git+https://github.com/NVIDIA/NeMo.git
4
+ accelerate>=0.20.0