multimodalart and Claude Opus 4.6 committed
Commit e36805d · Parent: d3bb466

Fix NeMo CUDA error 35 on ZeroGPU by disabling CUDA graphs in decoding config


The previous fix disabled CUDA graphs on the decoding_computer object, but
transcribe(timestamps=True) calls change_decoding_strategy() which rebuilds
the decoder from cfg, re-enabling CUDA graphs. Setting
cfg.decoding.greedy.use_cuda_graph_decoder=False persists across rebuilds.
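For reference, a minimal sketch of the pattern (checkpoint path and audio file are placeholders, not taken from this repo; it assumes an RNNT/TDT-style NeMo model whose greedy decoding config exposes use_cuda_graph_decoder, as the one patched here does):

import torch
import nemo.collections.asr as nemo_asr
from omegaconf import open_dict

# Placeholder checkpoint; the commit does not show which model the Space loads.
model = nemo_asr.models.ASRModel.restore_from(
    "path/to/en_asr.nemo", map_location=torch.device("cuda")
)
model.eval()

# Persist the flag in the config itself: transcribe(timestamps=True) calls
# change_decoding_strategy(), which rebuilds the decoder from
# model.cfg.decoding, so a flag set only on the live decoding_computer is
# thrown away, while a config-level flag is re-read on every rebuild.
with open_dict(model.cfg.decoding):
    model.cfg.decoding.greedy.use_cuda_graph_decoder = False

out = model.transcribe(["vocals.wav"], timestamps=True)  # placeholder audio file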

Also loads the English ASR model eagerly at startup so ZeroGPU can properly
hijack CUDA calls at global scope, instead of lazy-loading inside a
@spaces.GPU-decorated function.
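Roughly, the Space-side pattern this enables looks like the following (app-level names and model paths are illustrative; LyricTranscriber's keyword arguments are taken from the diff below):

import spaces
from preprocess.tools.lyric_transcription import LyricTranscriber

# Built at module import time, where ZeroGPU intercepts CUDA initialization;
# with this commit the English NeMo model loads here too, instead of inside
# the GPU-decorated function.
transcriber = LyricTranscriber(
    device="cuda",
    zh_model_path="path/to/zh_asr",       # placeholder
    en_model_path="path/to/en_asr.nemo",  # placeholder
    verbose=True,
)

@spaces.GPU
def transcribe_english(wav_fn: str):
    # Only the per-request GPU work runs inside the decorated function.
    return transcriber.en_model.process(wav_fn)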

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

preprocess/tools/lyric_transcription.py CHANGED
@@ -150,12 +150,15 @@ class _ASREnModel:
             map_location=device,
         )
         self.model.eval()
-        # Disable CUDA Graphs to avoid "CUDA failure! 35" on CUDA 12.8 + PyTorch 2.9
-        # See: https://github.com/NVIDIA-NeMo/NeMo/issues/15145
-        if hasattr(self.model, 'decoding') and hasattr(self.model.decoding, 'decoding'):
-            comp = getattr(self.model.decoding.decoding, 'decoding_computer', None)
-            if comp is not None and hasattr(comp, 'disable_cuda_graphs'):
-                comp.disable_cuda_graphs()
+        # Disable CUDA Graphs via the decoding config to avoid
+        # "CUDA failure! 35" (cudaErrorInsufficientDriver) on
+        # CUDA 12.8 + ZeroGPU where the driver is too old for graph capture.
+        # This must be set in the config (not on the decoding_computer) because
+        # transcribe(timestamps=True) calls change_decoding_strategy() which
+        # rebuilds the decoder from cfg.
+        from omegaconf import open_dict
+        with open_dict(self.model.cfg.decoding):
+            self.model.cfg.decoding.greedy.use_cuda_graph_decoder = False
 
     @staticmethod
     def _clean_word(word: str) -> str:
@@ -219,9 +222,6 @@ class LyricTranscriber:
             verbose (bool): Whether to print verbose logs.
         """
         self.verbose = verbose
-        self.device = device
-        self.zh_model_path = zh_model_path
-        self.en_model_path = en_model_path
 
         if self.verbose:
             print(
@@ -233,8 +233,9 @@ class LyricTranscriber:
         # Always initialize Chinese ASR.
         self.zh_model = _ASRZhModel(device=device, model_path=zh_model_path)
 
-        # English ASR will be lazily initialized on first English request to avoid long waiting cost when importing NeMo
-        self.en_model = None
+        # Initialize English ASR eagerly so the model is loaded at global
+        # scope where ZeroGPU can hijack CUDA calls properly.
+        self.en_model = _ASREnModel(model_path=en_model_path, device=device)
 
         if self.verbose:
             print("[lyric transcription] init: success")
@@ -256,11 +257,6 @@ class LyricTranscriber:
 
         lang = (language or "auto").lower()
         if lang in {"english"}:
-            if self.en_model is None:
-                # Lazy-load NeMo model only when English is actually used.
-                if v:
-                    print("[lyric transcription] init English ASR, please make sure NeMo is installed")
-                self.en_model = _ASREnModel(model_path=self.en_model_path, device=self.device)
             out = self.en_model.process(wav_fn)
         else:
             out = self.zh_model.process(wav_fn)
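
A quick smoke test for the combined change (paths are placeholders; _ASREnModel's constructor keywords and process() method are as shown in the diff above):

from preprocess.tools.lyric_transcription import _ASREnModel

en = _ASREnModel(model_path="path/to/en_asr.nemo", device="cuda")  # placeholder path
# The decoder rebuild triggered by timestamped transcription now picks up
# use_cuda_graph_decoder=False from cfg, so this should complete without
# "CUDA failure! 35".
out = en.process("sample_vocals.wav")  # placeholder audio file
print(out)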