Spaces:

Yoni232
/

count-the-notes

Running on Zero

App Files Files Community

Yoni232 committed 4 days ago

Commit

a86b9b2

1 Parent(s): 1de6d5a

changed output file name to input file name and fixed stereo to mono bug

Browse files

Files changed (1) hide show

app.py +25 -12

app.py CHANGED Viewed

@@ -12,10 +12,16 @@ from pathlib import Path
 import numpy as np
 import soundfile as sf
 import librosa
 from onsets_and_frames.hf_model import CountEMModel
 from onsets_and_frames.constants import SAMPLE_RATE
 # Cache for loaded models to avoid reloading
 model_cache = {}
@@ -23,9 +29,9 @@ model_cache = {}
 def load_model(model_name: str) -> CountEMModel:
     """Load model from cache or download from Hugging Face Hub."""
     if model_name not in model_cache:
-        print(f"Loading model: {model_name}")
         model_cache[model_name] = CountEMModel.from_pretrained(model_name)
-        print(f"Model loaded successfully")
     return model_cache[model_name]
@@ -61,6 +67,7 @@ def transcribe_audio(
         # Extract audio data
         # Gradio Audio component returns (sample_rate, audio_array) or audio file path
         if isinstance(audio_input, tuple):
             sr, audio = audio_input
             # Convert to float32 if needed
@@ -70,7 +77,9 @@ def transcribe_audio(
                 audio = audio.astype(np.float32) / 2147483648.0
         elif isinstance(audio_input, str):
             # Audio file path provided
-            audio, sr = librosa.load(audio_input, sr=None, mono=False)
         else:
             return None, f"Error: Unexpected audio input type: {type(audio_input)}"
@@ -80,7 +89,7 @@ def transcribe_audio(
         # Resample to 16kHz if needed
         if sr != SAMPLE_RATE:
-            print(f"Resampling from {sr}Hz to {SAMPLE_RATE}Hz")
             audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
             sr = SAMPLE_RATE
@@ -96,16 +105,20 @@ def transcribe_audio(
         # Load model
         status = f"Loading {model_choice} model..."
-        print(status)
         model = load_model(model_name)
         # Transcribe
         status = f"Transcribing {duration:.1f} seconds of audio..."
-        print(status)
-        # Create temporary MIDI file
-        with tempfile.NamedTemporaryFile(suffix=".mid", delete=False) as tmp:
-            output_path = tmp.name
         model.transcribe_to_midi(
             audio,
@@ -130,7 +143,7 @@ Download your MIDI file using the button below.
     except Exception as e:
         error_msg = f"Error during transcription: {str(e)}"
-        print(error_msg)
         return None, error_msg
@@ -238,9 +251,9 @@ with gr.Blocks(title="CountEM - Music Transcription") as demo:
 if __name__ == "__main__":
     # Pre-load the default model to speed up first transcription
-    print("Pre-loading default model...")
     load_model("Yoni232/countem-musicnet")
-    print("Model pre-loaded. Starting Gradio interface...")
     # Launch the demo
     demo.launch(

 import numpy as np
 import soundfile as sf
 import librosa
+import logging
 from onsets_and_frames.hf_model import CountEMModel
 from onsets_and_frames.constants import SAMPLE_RATE
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Cache for loaded models to avoid reloading
 model_cache = {}
 def load_model(model_name: str) -> CountEMModel:
     """Load model from cache or download from Hugging Face Hub."""
     if model_name not in model_cache:
+        logger.info(f"Loading model: {model_name}")
         model_cache[model_name] = CountEMModel.from_pretrained(model_name)
+        logger.info(f"Model loaded successfully")
     return model_cache[model_name]
         # Extract audio data
         # Gradio Audio component returns (sample_rate, audio_array) or audio file path
+        input_filename = None
         if isinstance(audio_input, tuple):
             sr, audio = audio_input
             # Convert to float32 if needed
                 audio = audio.astype(np.float32) / 2147483648.0
         elif isinstance(audio_input, str):
             # Audio file path provided
+            audio, sr = librosa.load(audio_input, sr=None, mono=True)
+            # Extract filename for output naming
+            input_filename = Path(audio_input).stem
         else:
             return None, f"Error: Unexpected audio input type: {type(audio_input)}"
         # Resample to 16kHz if needed
         if sr != SAMPLE_RATE:
+            logger.info(f"Resampling from {sr}Hz to {SAMPLE_RATE}Hz")
             audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
             sr = SAMPLE_RATE
         # Load model
         status = f"Loading {model_choice} model..."
+        logger.info(status)
         model = load_model(model_name)
         # Transcribe
         status = f"Transcribing {duration:.1f} seconds of audio..."
+        logger.info(status)
+        # Create temporary MIDI file with original filename if available
+        if input_filename:
+            temp_dir = tempfile.gettempdir()
+            output_path = os.path.join(temp_dir, f"{input_filename}.mid")
+        else:
+            with tempfile.NamedTemporaryFile(suffix=".mid", delete=False) as tmp:
+                output_path = tmp.name
         model.transcribe_to_midi(
             audio,
     except Exception as e:
         error_msg = f"Error during transcription: {str(e)}"
+        logger.error(error_msg)
         return None, error_msg
 if __name__ == "__main__":
     # Pre-load the default model to speed up first transcription
+    logger.info("Pre-loading default model...")
     load_model("Yoni232/countem-musicnet")
+    logger.info("Model pre-loaded. Starting Gradio interface...")
     # Launch the demo
     demo.launch(