multimodalart and Claude Opus 4.6 committed
Commit e36805d · Parent: d3bb466

Fix NeMo CUDA error 35 on ZeroGPU by disabling CUDA graphs in decoding config


The previous fix disabled CUDA graphs on the decoding_computer object, but
transcribe(timestamps=True) calls change_decoding_strategy() which rebuilds
the decoder from cfg, re-enabling CUDA graphs. Setting
cfg.decoding.greedy.use_cuda_graph_decoder=False persists across rebuilds.
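For reference, a minimal sketch of the pattern (checkpoint path and audio file are placeholders, not taken from this repo; it assumes an RNNT/TDT-style NeMo model whose greedy decoding config exposes use_cuda_graph_decoder, as the one patched here does):

import torch
import nemo.collections.asr as nemo_asr
from omegaconf import open_dict

# Placeholder checkpoint; the commit does not show which model the Space loads.
model = nemo_asr.models.ASRModel.restore_from(
    "path/to/en_asr.nemo", map_location=torch.device("cuda")
)
model.eval()

# Persist the flag in the config itself: transcribe(timestamps=True) calls
# change_decoding_strategy(), which rebuilds the decoder from
# model.cfg.decoding, so a flag set only on the live decoding_computer is
# thrown away, while a config-level flag is re-read on every rebuild.
with open_dict(model.cfg.decoding):
    model.cfg.decoding.greedy.use_cuda_graph_decoder = False

out = model.transcribe(["vocals.wav"], timestamps=True)  # placeholder audio file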

Also loads the English ASR model eagerly at startup so ZeroGPU can properly
hijack CUDA calls at global scope, instead of lazy-loading inside a
@spaces.GPU-decorated function.
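Roughly, the Space-side pattern this enables looks like the following (app-level names and model paths are illustrative; LyricTranscriber's keyword arguments are taken from the diff below):

import spaces
from preprocess.tools.lyric_transcription import LyricTranscriber

# Built at module import time, where ZeroGPU intercepts CUDA initialization;
# with this commit the English NeMo model loads here too, instead of inside
# the GPU-decorated function.
transcriber = LyricTranscriber(
    device="cuda",
    zh_model_path="path/to/zh_asr",       # placeholder
    en_model_path="path/to/en_asr.nemo",  # placeholder
    verbose=True,
)

@spaces.GPU
def transcribe_english(wav_fn: str):
    # Only the per-request GPU work runs inside the decorated function.
    return transcriber.en_model.process(wav_fn)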

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

preprocess/tools/lyric_transcription.py CHANGED
@@ -150,12 +150,15 @@ class _ASREnModel:
             map_location=device,
         )
         self.model.eval()
-        # Disable CUDA Graphs to avoid "CUDA failure! 35" on CUDA 12.8 + PyTorch 2.9
-        # See: https://github.com/NVIDIA-NeMo/NeMo/issues/15145
-        if hasattr(self.model, 'decoding') and hasattr(self.model.decoding, 'decoding'):
-            comp = getattr(self.model.decoding.decoding, 'decoding_computer', None)
-            if comp is not None and hasattr(comp, 'disable_cuda_graphs'):
-                comp.disable_cuda_graphs()
+        # Disable CUDA Graphs via the decoding config to avoid
+        # "CUDA failure! 35" (cudaErrorInsufficientDriver) on
+        # CUDA 12.8 + ZeroGPU where the driver is too old for graph capture.
+        # This must be set in the config (not on the decoding_computer) because
+        # transcribe(timestamps=True) calls change_decoding_strategy() which
+        # rebuilds the decoder from cfg.
+        from omegaconf import open_dict
+        with open_dict(self.model.cfg.decoding):
+            self.model.cfg.decoding.greedy.use_cuda_graph_decoder = False
 
     @staticmethod
     def _clean_word(word: str) -> str:
@@ -219,9 +222,6 @@ class LyricTranscriber:
             verbose (bool): Whether to print verbose logs.
         """
         self.verbose = verbose
-        self.device = device
-        self.zh_model_path = zh_model_path
-        self.en_model_path = en_model_path
 
         if self.verbose:
             print(
@@ -233,8 +233,9 @@ class LyricTranscriber:
         # Always initialize Chinese ASR.
         self.zh_model = _ASRZhModel(device=device, model_path=zh_model_path)
 
-        # English ASR will be lazily initialized on first English request to avoid long waiting cost when importing NeMo
-        self.en_model = None
+        # Initialize English ASR eagerly so the model is loaded at global
+        # scope where ZeroGPU can hijack CUDA calls properly.
+        self.en_model = _ASREnModel(model_path=en_model_path, device=device)
 
         if self.verbose:
             print("[lyric transcription] init: success")
@@ -256,11 +257,6 @@ class LyricTranscriber:
 
         lang = (language or "auto").lower()
         if lang in {"english"}:
-            if self.en_model is None:
-                # Lazy-load NeMo model only when English is actually used.
-                if v:
-                    print("[lyric transcription] init English ASR, please make sure NeMo is installed")
-                self.en_model = _ASREnModel(model_path=self.en_model_path, device=self.device)
             out = self.en_model.process(wav_fn)
         else:
             out = self.zh_model.process(wav_fn)
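
A quick smoke test for the combined change (paths are placeholders; _ASREnModel's constructor keywords and process() method are as shown in the diff above):

from preprocess.tools.lyric_transcription import _ASREnModel

en = _ASREnModel(model_path="path/to/en_asr.nemo", device="cuda")  # placeholder path
# The decoder rebuild triggered by timestamped transcription now picks up
# use_cuda_graph_decoder=False from cfg, so this should complete without
# "CUDA failure! 35".
out = en.process("sample_vocals.wav")  # placeholder audio file
print(out)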