Spaces:

RobotsMali
/

RobotsMali_ASR_DEMO

Sleeping

App Files Community

binaryMao committed on Oct 21

Commit

3b37f5b

verified ·

1 Parent(s): fd40e91

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -19

app.py CHANGED Viewed

@@ -10,9 +10,19 @@ import soundfile as sf
 import numpy as np
 # --- IMPORTS NEMO ---
-import nemo.collections.asr as nemo_asr
-import nemo.collections.nlp as nemo_nlp
-# --------------------
 # ----------------------------------------------------------------------
 # CONSTANTES DE CONFIGURATION
@@ -79,6 +89,7 @@ def load_punct_model():
             print("-> Modèle de ponctuation chargé avec succès.")
         except Exception as e:
             print(f"!!! AVERTISSEMENT: Échec du chargement du modèle de ponctuation {PUNCT_MODEL_NAME}. La sortie restera brute. Détail: {e}")
     return punct_pipeline
 # ----------------------------------------------------------------------
@@ -106,6 +117,7 @@ def transcribe_audio(model_name: str, audio_path: str):
         # ----------------------------------------------------------------
         yield f"**[1/4] CHARGEMENT AUDIO...** Préparation du fichier original (Mono @ 16kHz). ⚙️"
         full_audio_data, sr = librosa.load(audio_path, sr=SR_TARGET, mono=True)
         total_duration = len(full_audio_data) / SR_TARGET
@@ -128,7 +140,6 @@ def transcribe_audio(model_name: str, audio_path: str):
         # --- BARRE DE PROGRESSION SIMULÉE ---
         for progress_percent in range(0, 91, 10):
             time.sleep(0.3)
-            # Utilisation de la syntaxe correcte : progress(valeur_flottante, description)
             progress(progress_percent / 100, desc=f"Progression ASR ({progress_percent}%)")
         yield f"**[3/4] FINALISATION...** Inférence en cours sur le GPU. 🚀"
@@ -138,10 +149,11 @@ def transcribe_audio(model_name: str, audio_path: str):
         transcriptions = asr_model.transcribe([temp_full_path], batch_size=1)
         # --- GESTION DE L'OBJET HYPOTHESIS ---
-        transcription_text_final = ""
         if transcriptions and transcriptions[0]:
             hyp_object = transcriptions[0]
             if hasattr(hyp_object, 'text'):
                 transcription_text_final = hyp_object.text.strip()
             elif isinstance(hyp_object, str):
@@ -162,6 +174,7 @@ def transcribe_audio(model_name: str, audio_path: str):
         # --- POST-TRAITEMENT (PONCTUATION & CASSE) ---
         punct_model = load_punct_model()
         if punct_model and transcription_text_final != "[Transcription vide ou échec ASR]":
             yield f"**[4/4] POST-TRAITEMENT...** Correction de la ponctuation et de la casse pour la lisibilité. ✨"
             # Termine la barre de progression
@@ -173,21 +186,33 @@ def transcribe_audio(model_name: str, audio_path: str):
                     processed_text = corrected_list[0].strip()
             except Exception as pc_error:
                 print(f"!!! Échec du post-traitement de ponctuation : {pc_error}")
                 yield "⚠️ Échec de la correction de ponctuation. Affichage du texte brut."
-        # 1. EN-TÊTE D'INFORMATION
         output = f"**Modèle Utilisé :** `{model_short_name}` (NeMo)\n"
         output += f"**Durée de l'Audio :** {total_duration:.1f} secondes\n"
         output += f"**Temps de Traitement Total :** {duration:.2f} secondes\n"
         output += f"***\n"
-        # 2. PRÉSENTATION LYRICS PROPRE
         output += "**RÉSULTAT DE LA TRANSCRIPTION (Lyrics) :**\n"
-        formatted_lyrics = processed_text.replace('\n', ' ').strip().replace('. ', '.\n\n>>> ').replace('? ', '?\n\n>>> ')
-        if not formatted_lyrics.startswith('>>> '):
-            formatted_lyrics = '>>> ' + formatted_lyrics
-        output += formatted_lyrics
         # 3. NOTE FINALE
         output += "\n\n*Traitement complet de l'audio sans découpage (chunking).* "
@@ -195,7 +220,7 @@ def transcribe_audio(model_name: str, audio_path: str):
         yield output
     except RuntimeError as e:
-        yield f"❌ Erreur critique lors du chargement : {str(e)}"
     except Exception as e:
         yield f"❌ Erreur générale lors de la transcription complète : {e}"
@@ -210,26 +235,38 @@ def transcribe_audio(model_name: str, audio_path: str):
 # 3. PRÉ-CHARGEMENT ET INTERFACE GRADIO
 # ----------------------------------------------------------------------
-INITIAL_DESCRIPTION = "Sélectionnez un modèle ASR de RobotsMali, puis enregistrez ou téléchargez un fichier audio pour obtenir la transcription."
 if ROBOTSMALI_MODELS:
     default_model = ROBOTSMALI_MODELS[0]
     try:
         # Tente de charger le modèle par défaut au démarrage
         load_pipeline(default_model)
-        default_model_short_name = default_model.split('/')[-1]
         INITIAL_DESCRIPTION = (
-            f"✅ Le modèle par défaut `{default_model_short_name}` (NeMo) a été **préchargé et réchauffé** avec succès. "
-            f"**Attention :** Le traitement se fait sur l'audio complet. Les longs fichiers peuvent planter la RAM. "
-            f"Téléchargez ou enregistrez votre audio pour transcrire."
         )
     except RuntimeError as e:
-        default_model_short_name = default_model.split('/')[-1]
         INITIAL_DESCRIPTION = (
             f"❌ ERREUR CRITIQUE AU DÉMARRAGE : Impossible de charger le modèle `{default_model_short_name}`. "
             f"**Veuillez sélectionner un autre modèle dans la liste**. "
             f"Détails de l'erreur : {str(e)}"
         )
 model_dropdown = gr.Dropdown(
     label="1. Sélectionner un Modèle RobotsMali",
@@ -242,7 +279,7 @@ audio_input = gr.Audio(
     label="2. Télécharger ou Enregistrer l'Audio",
     type="filepath",
     sources=["microphone", "upload"],
-    format="mp3")
 text_output = gr.Markdown(
     label="3. Résultat de la Transcription ASR")

 import numpy as np
 # --- IMPORTS NEMO ---
+# Nécessite : pip install nemo_toolkit['asr'] nemo_toolkit['nlp']
+try:
+    import nemo.collections.asr as nemo_asr
+    import nemo.collections.nlp as nemo_nlp
+except ImportError:
+    print("!!! AVERTISSEMENT : NeMo ASR ou NLP n'est pas installé. Les modèles ne fonctionneront pas.")
+    # On définit des substituts pour permettre au script de s'exécuter jusqu'à l'interface
+    class DummyASRModel:
+        def from_pretrained(self, model_name): raise RuntimeError("NeMo ASR not installed.")
+    class DummyNLPModel:
+        def from_pretrained(self, model_name): raise RuntimeError("NeMo NLP not installed.")
+    nemo_asr = type('nemo_asr', (object,), {'models': type('models', (object,), {'ASRModel': DummyASRModel})})
+    nemo_nlp = type('nemo_nlp', (object,), {'models': type('models', (object,), {'PunctuationCapitalizationModel': DummyNLPModel})})
 # ----------------------------------------------------------------------
 # CONSTANTES DE CONFIGURATION
             print("-> Modèle de ponctuation chargé avec succès.")
         except Exception as e:
             print(f"!!! AVERTISSEMENT: Échec du chargement du modèle de ponctuation {PUNCT_MODEL_NAME}. La sortie restera brute. Détail: {e}")
+            punct_pipeline = False # Marquer comme tentative échouée
     return punct_pipeline
 # ----------------------------------------------------------------------
         # ----------------------------------------------------------------
         yield f"**[1/4] CHARGEMENT AUDIO...** Préparation du fichier original (Mono @ 16kHz). ⚙️"
+        # NOTE : Utilisation de librosa pour garantir le 16kHz et mono
         full_audio_data, sr = librosa.load(audio_path, sr=SR_TARGET, mono=True)
         total_duration = len(full_audio_data) / SR_TARGET
         # --- BARRE DE PROGRESSION SIMULÉE ---
         for progress_percent in range(0, 91, 10):
             time.sleep(0.3)
             progress(progress_percent / 100, desc=f"Progression ASR ({progress_percent}%)")
         yield f"**[3/4] FINALISATION...** Inférence en cours sur le GPU. 🚀"
         transcriptions = asr_model.transcribe([temp_full_path], batch_size=1)
         # --- GESTION DE L'OBJET HYPOTHESIS ---
+        transcription_text_final = "[Transcription vide ou échec ASR]"
         if transcriptions and transcriptions[0]:
             hyp_object = transcriptions[0]
+            # Tente d'extraire le texte de l'objet de retour de NeMo
             if hasattr(hyp_object, 'text'):
                 transcription_text_final = hyp_object.text.strip()
             elif isinstance(hyp_object, str):
         # --- POST-TRAITEMENT (PONCTUATION & CASSE) ---
         punct_model = load_punct_model()
         if punct_model and transcription_text_final != "[Transcription vide ou échec ASR]":
             yield f"**[4/4] POST-TRAITEMENT...** Correction de la ponctuation et de la casse pour la lisibilité. ✨"
             # Termine la barre de progression
                     processed_text = corrected_list[0].strip()
             except Exception as pc_error:
                 print(f"!!! Échec du post-traitement de ponctuation : {pc_error}")
+                # Le texte brut (transcription_text_final) reste dans processed_text
                 yield "⚠️ Échec de la correction de ponctuation. Affichage du texte brut."
+        # 1. EN-TÊTE D'INFORMATION - LE NOM DU MODÈLE EST MAINTENANT DYNAMIQUE ICI
         output = f"**Modèle Utilisé :** `{model_short_name}` (NeMo)\n"
         output += f"**Durée de l'Audio :** {total_duration:.1f} secondes\n"
         output += f"**Temps de Traitement Total :** {duration:.2f} secondes\n"
         output += f"***\n"
+        # 2. PRÉSENTATION LYRICS PROPRE (AMÉLIORÉE)
         output += "**RÉSULTAT DE LA TRANSCRIPTION (Lyrics) :**\n"
+        output += "---\n" # Séparateur visuel pour la section lyrics
+        # Nettoyage et normalisation de base
+        clean_text = processed_text.replace('\n', ' ').strip()
+        # Remplacer les séparateurs de phrases par un double saut de ligne pour simuler des paragraphes/strophes
+        formatted_lyrics = clean_text.replace('. ', '.\n\n').replace('? ', '?\n\n').replace('! ', '!\n\n')
+        # Ajouter le bloc de citation (>) au début de chaque ligne pour un rendu plus clair en Markdown
+        final_lines = []
+        for line in formatted_lyrics.split('\n'):
+            if line.strip():
+                final_lines.append('> ' + line.strip())
+        output += '\n'.join(final_lines)
+        output += "\n---\n"
         # 3. NOTE FINALE
         output += "\n\n*Traitement complet de l'audio sans découpage (chunking).* "
         yield output
     except RuntimeError as e:
+        yield f"❌ Erreur critique lors du chargement ou de l'inférence : {str(e)}"
     except Exception as e:
         yield f"❌ Erreur générale lors de la transcription complète : {e}"
 # 3. PRÉ-CHARGEMENT ET INTERFACE GRADIO
 # ----------------------------------------------------------------------
+# --- MODIFICATIONS APPLIQUÉES ICI ---
+# 1. On donne une description initiale GÉNÉRALE
+INITIAL_DESCRIPTION_BASE = (
+    "Sélectionnez un modèle ASR de RobotsMali, puis enregistrez ou téléchargez un fichier audio pour obtenir la transcription. "
+    "Attention : Le traitement se fait sur l'audio complet. Les longs fichiers peuvent planter la RAM."
+)
+INITIAL_DESCRIPTION = INITIAL_DESCRIPTION_BASE
 if ROBOTSMALI_MODELS:
     default_model = ROBOTSMALI_MODELS[0]
+    default_model_short_name = default_model.split('/')[-1]
     try:
         # Tente de charger le modèle par défaut au démarrage
         load_pipeline(default_model)
+        # 2. On ajoute seulement le statut de préchargement au message d'information initial
         INITIAL_DESCRIPTION = (
+            f"✅ Modèle par défaut `{default_model_short_name}` **préchargé et réchauffé** avec succès. "
+            f"{INITIAL_DESCRIPTION_BASE}"
         )
     except RuntimeError as e:
         INITIAL_DESCRIPTION = (
             f"❌ ERREUR CRITIQUE AU DÉMARRAGE : Impossible de charger le modèle `{default_model_short_name}`. "
             f"**Veuillez sélectionner un autre modèle dans la liste**. "
             f"Détails de l'erreur : {str(e)}"
         )
+    except Exception as e:
+        INITIAL_DESCRIPTION = (
+            f"❌ ERREUR CRITIQUE : Problème de configuration (peut-être NeMo/CUDA). Détails : {str(e)}"
+        )
+# -------------------------------------
 model_dropdown = gr.Dropdown(
     label="1. Sélectionner un Modèle RobotsMali",
     label="2. Télécharger ou Enregistrer l'Audio",
     type="filepath",
     sources=["microphone", "upload"],
+    format="mp3")
 text_output = gr.Markdown(
     label="3. Résultat de la Transcription ASR")