Yoni232 committed on
Commit 2dbc854 · 1 Parent(s): 56be069

added app.py and updated README

Files changed (2)
  1. README.md +41 -1
  2. app.py +250 -0
README.md CHANGED
@@ -10,4 +10,44 @@ pinned: false
  license: cc-by-4.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # CountEM - Automatic Music Transcription
+
+ Upload a piano/music recording and transcribe it to MIDI using the CountEM framework.
+
+ ## About
+
+ This is a Gradio demo for **CountEM**, a histogram-based supervision approach for Automatic Music Transcription (AMT) presented at ISMIR 2025.
+
+ **Paper:** [Count the Notes: Histogram-Based Supervision for Automatic Music Transcription](https://arxiv.org/abs/2511.14250)
+
+ ## Models
+
+ - [countem-musicnet](https://huggingface.co/Yoni-Yaffe/countem-musicnet) - Trained on MusicNet dataset (recommended)
+ - [countem-synth](https://huggingface.co/Yoni-Yaffe/countem-synth) - Trained on synthetic data
+
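Outside the UI, the app.py added in this commit drives these checkpoints through just two calls: `CountEMModel.from_pretrained` and `transcribe_to_midi`. A minimal sketch of that path, assuming the `onsets_and_frames` package from the GitHub repo is importable and the input file is already 16 kHz mono (app.py shows the full resampling and mono-mixing preprocessing; `piano.wav` is a placeholder):

```python
import soundfile as sf

# Provided by the count-the-notes repository (see Links below)
from onsets_and_frames.hf_model import CountEMModel

# Download (and locally cache) a pre-trained checkpoint from the Hub
model = CountEMModel.from_pretrained("Yoni-Yaffe/countem-musicnet")

# The demo feeds the model 16 kHz mono float32 audio
audio, sr = sf.read("piano.wav", dtype="float32")

# Thresholds mirror the demo's slider defaults
model.transcribe_to_midi(
    audio,
    "piano.mid",
    onset_threshold=0.5,
    frame_threshold=0.5,
)
```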
+ ## Links
+
+ - [GitHub Repository](https://github.com/Yoni-Yaffe/count-the-notes)
+ - [Project Page](https://yoni-yaffe.github.io/count-the-notes/)
+ - [ArXiv Paper](https://arxiv.org/abs/2511.14250)
+
+ ## Citation
+
+ If you use this work, please cite:
+
+ ```bibtex
+ @misc{yaffe2025countnoteshistogrambasedsupervision,
+       title={Count The Notes: Histogram-Based Supervision for Automatic Music Transcription},
+       author={Jonathan Yaffe and Ben Maman and Meinard Müller and Amit H. Bermano},
+       year={2025},
+       eprint={2511.14250},
+       archivePrefix={arXiv},
+       primaryClass={cs.SD},
+       url={https://arxiv.org/abs/2511.14250},
+ }
+ ```
+
+ ## License
+
+ CC-BY-4.0
app.py ADDED
@@ -0,0 +1,250 @@
+ """
+ Gradio demo for CountEM Automatic Music Transcription.
+
+ This demo allows users to upload audio files and transcribe them to MIDI
+ using pre-trained models from Hugging Face Hub.
+ """
+
+ import gradio as gr
+ import tempfile
+ import os
+ from pathlib import Path
+ import numpy as np
+ import soundfile as sf
+ import librosa
+ from onsets_and_frames.hf_model import CountEMModel
+ from onsets_and_frames.constants import SAMPLE_RATE
+
+
+ # Cache for loaded models to avoid reloading
+ model_cache = {}
+
+
+ def load_model(model_name: str) -> CountEMModel:
+     """Load model from cache or download from Hugging Face Hub."""
+     if model_name not in model_cache:
+         print(f"Loading model: {model_name}")
+         model_cache[model_name] = CountEMModel.from_pretrained(model_name)
+         print("Model loaded successfully")
+     return model_cache[model_name]
+
+
+ def transcribe_audio(
+     audio_input,
+     model_choice: str,
+     onset_threshold: float,
+     frame_threshold: float,
+ ) -> tuple:
+     """
+     Transcribe audio to MIDI.
+
+     Args:
+         audio_input: File path or (sample_rate, audio_data) tuple from the Gradio Audio component
+         model_choice: Model to use ("MusicNet" or "Synth")
+         onset_threshold: Threshold for onset detection
+         frame_threshold: Threshold for frame detection
+
+     Returns:
+         Tuple of (output_midi_path, status_message)
+     """
+     try:
+         # Handle empty input
+         if audio_input is None:
+             return None, "Error: Please upload an audio file"
+
+         # Map model choice to HuggingFace repo ID
+         model_map = {
+             "MusicNet (Recommended)": "Yoni-Yaffe/countem-musicnet",
+             "Synth": "Yoni-Yaffe/countem-synth",
+         }
+         model_name = model_map[model_choice]
+
+         # Extract audio data
+         # The Gradio Audio component returns (sample_rate, audio_array) or an audio file path
+         if isinstance(audio_input, tuple):
+             sr, audio = audio_input
+             # Convert integer PCM to float32 in [-1, 1] if needed
+             if audio.dtype == np.int16:
+                 audio = audio.astype(np.float32) / 32768.0
+             elif audio.dtype == np.int32:
+                 audio = audio.astype(np.float32) / 2147483648.0
+         elif isinstance(audio_input, str):
+             # Audio file path provided
+             audio, sr = sf.read(audio_input, dtype="float32")
+         else:
+             return None, f"Error: Unexpected audio input type: {type(audio_input)}"
+
+         # Convert stereo to mono if needed
+         if len(audio.shape) > 1:
+             audio = audio.mean(axis=1)
+
+         # Resample to 16kHz if needed
+         if sr != SAMPLE_RATE:
+             print(f"Resampling from {sr}Hz to {SAMPLE_RATE}Hz")
+             audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
+             sr = SAMPLE_RATE
+
+         # Check audio length
+         duration = len(audio) / sr
+         if duration < 0.5:
+             return None, "Error: Audio is too short (minimum 0.5 seconds)"
+         if duration > 600:  # 10 minutes
+             return (
+                 None,
+                 f"Error: Audio is too long ({duration:.1f}s). Maximum is 10 minutes (600s).",
+             )
+
+         # Load model
+         status = f"Loading {model_choice} model..."
+         print(status)
+         model = load_model(model_name)
+
+         # Transcribe
+         status = f"Transcribing {duration:.1f} seconds of audio..."
+         print(status)
+
+         # Create a temporary MIDI file; delete=False keeps it on disk after the
+         # with-block so Gradio can serve it for download
+         with tempfile.NamedTemporaryFile(suffix=".mid", delete=False) as tmp:
+             output_path = tmp.name
+
+         model.transcribe_to_midi(
+             audio,
+             output_path,
+             onset_threshold=onset_threshold,
+             frame_threshold=frame_threshold,
+         )
+
+         # Success message
+         success_msg = f"""
+ ✓ Transcription complete!
+ - Model: {model_choice}
+ - Duration: {duration:.2f} seconds
+ - Sample rate: {sr} Hz
+ - Onset threshold: {onset_threshold}
+ - Frame threshold: {frame_threshold}
+
+ Download your MIDI file using the button below.
+ """
+
+         return output_path, success_msg.strip()
+
+     except Exception as e:
+         error_msg = f"Error during transcription: {str(e)}"
+         print(error_msg)
+         return None, error_msg
+
+
+ # Build Gradio interface
+ with gr.Blocks(title="CountEM - Music Transcription") as demo:
+     gr.Markdown(
+         """
+ # CountEM - Automatic Music Transcription
+
+ Upload a piano/music recording and transcribe it to MIDI using the CountEM framework.
+
+ **Paper:** [Count the Notes: Histogram-Based Supervision for Automatic Music Transcription](https://arxiv.org/abs/2511.14250) (ISMIR 2025)
+
+ **Models on Hugging Face:**
+ - [countem-musicnet](https://huggingface.co/Yoni-Yaffe/countem-musicnet) - Trained on MusicNet dataset
+ - [countem-synth](https://huggingface.co/Yoni-Yaffe/countem-synth) - Trained on synthetic data
+ """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             # Input section
+             audio_input = gr.Audio(
+                 label="Upload Audio File",
+                 type="filepath",
+                 sources=["upload"],
+             )
+
+             model_choice = gr.Radio(
+                 choices=["MusicNet (Recommended)", "Synth"],
+                 value="MusicNet (Recommended)",
+                 label="Model Selection",
+                 info="MusicNet model is trained on real piano recordings, Synth on synthetic data",
+             )
+
+             with gr.Row():
+                 onset_threshold = gr.Slider(
+                     minimum=0.1,
+                     maximum=0.9,
+                     value=0.5,
+                     step=0.05,
+                     label="Onset Threshold",
+                     info="Higher = fewer notes detected",
+                 )
+                 frame_threshold = gr.Slider(
+                     minimum=0.1,
+                     maximum=0.9,
+                     value=0.5,
+                     step=0.05,
+                     label="Frame Threshold",
+                     info="Higher = shorter note durations",
+                 )
+
+             transcribe_btn = gr.Button("Transcribe to MIDI", variant="primary")
+
+         with gr.Column():
+             # Output section
+             output_midi = gr.File(label="Download MIDI", interactive=False)
+             status_output = gr.Textbox(
+                 label="Status",
+                 lines=10,
+                 interactive=False,
+                 placeholder="Upload audio and click 'Transcribe to MIDI' to start...",
+             )
+
+     # Usage notes
+     gr.Markdown(
+         """
+ ### Notes:
+ - Audio will be automatically resampled to 16kHz if needed
+ - Supports common formats: WAV, FLAC, MP3
+ - Maximum duration: 10 minutes
+ - Best results with piano recordings at 16kHz
+ - Processing time depends on audio length (typically a few seconds per minute of audio)
+ """
+     )
+
+     # Connect button to function
+     transcribe_btn.click(
+         fn=transcribe_audio,
+         inputs=[audio_input, model_choice, onset_threshold, frame_threshold],
+         outputs=[output_midi, status_output],
+     )
+
+     gr.Markdown(
+         """
+ ---
+ **Project Links:**
+ - [GitHub Repository](https://github.com/Yoni-Yaffe/count-the-notes)
+ - [Project Page](https://yoni-yaffe.github.io/count-the-notes/)
+ - [ArXiv Paper](https://arxiv.org/abs/2511.14250)
+
+ If you use this work, please cite:
+ ```
+ @inproceedings{yaffe2025countem,
+   title={Count the Notes: Histogram-Based Supervision for Automatic Music Transcription},
+   author={Jonathan Yaffe and Ben Maman and Meinard Müller and Amit Bermano},
+   booktitle={Proc. ISMIR},
+   year={2025}
+ }
+ ```
+ """
+     )
+
+
+ if __name__ == "__main__":
+     # Pre-load the default model to speed up the first transcription
+     print("Pre-loading default model...")
+     load_model("Yoni-Yaffe/countem-musicnet")
+     print("Model pre-loaded. Starting Gradio interface...")
+
+     # Launch the demo
+     demo.launch(
+         share=False,  # Set to True to create a public link
+         server_name="0.0.0.0",  # Allow access from the network
+         server_port=7860,
+     )
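Because the button handler is exposed through Gradio's API, a deployed copy of this app can also be driven remotely with `gradio_client`. A hedged sketch, assuming Gradio's default endpoint naming (typically the handler function's name) and a hypothetical Space id, neither of which appears in this commit:

```python
from gradio_client import Client, handle_file

# Hypothetical Space id; substitute the id of the Space hosting this app.py
client = Client("Yoni232/countem-demo")

# The endpoint name is an assumption; run client.view_api() to confirm it
midi_path, status = client.predict(
    handle_file("my_recording.wav"),  # audio_input
    "MusicNet (Recommended)",         # model_choice
    0.5,                              # onset_threshold
    0.5,                              # frame_threshold
    api_name="/transcribe_audio",
)
print(status)
```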