Yoni232 committed on
Commit 05d6e12
1 Parent(s): 80e5ec8

added source code of model and transcription scripts

onsets_and_frames/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .constants import *
2
+ from .dataset import EMDATASET
3
+ from .mel import melspectrogram
4
+ from .transcriber import OnsetsAndFrames, OnsetsNoFrames
5
+ from .utils import *
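These re-exports make the main entry points importable from the package root. A one-line illustrative sketch (not part of the commit), assuming the repository root is on `PYTHONPATH`:

```python
# Illustrative only: import the re-exported names from the package root.
from onsets_and_frames import EMDATASET, OnsetsAndFrames, melspectrogram
```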
onsets_and_frames/constants.py ADDED
@@ -0,0 +1,26 @@
1
+ import torch
2
+
3
+
4
+ SAMPLE_RATE = 16000
5
+ HOP_LENGTH = 512
6
+ ONSET_LENGTH = HOP_LENGTH
7
+ OFFSET_LENGTH = HOP_LENGTH
8
+
9
+ HOPS_IN_ONSET = ONSET_LENGTH // HOP_LENGTH
10
+ HOPS_IN_OFFSET = OFFSET_LENGTH // HOP_LENGTH
11
+ MIN_MIDI = 21
12
+ MAX_MIDI = 108
13
+ N_KEYS = MAX_MIDI - MIN_MIDI + 1
14
+
15
+ DTW_FACTOR = 3
16
+
17
+ N_MELS = 229
18
+ MEL_FMIN = 30
19
+ MEL_FMAX = SAMPLE_RATE // 2
20
+ WINDOW_LENGTH = 2048
21
+
22
+ SEQ_LEN = 327680  # 640 frames, about 20.5 seconds at 16 kHz
23
+
24
+ DRUM_CHANNEL = 9
25
+
26
+ DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
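For orientation, a small sketch (not part of the commit) of the time resolution these constants imply:

```python
# Illustrative arithmetic derived from the constants above.
SAMPLE_RATE = 16000
HOP_LENGTH = 512
SEQ_LEN = 327680

frames_per_second = SAMPLE_RATE / HOP_LENGTH   # 31.25 frames per second
frames_per_sequence = SEQ_LEN // HOP_LENGTH    # 640 frames per training sequence
seconds_per_sequence = SEQ_LEN / SAMPLE_RATE   # 20.48 seconds of audio
```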
onsets_and_frames/dataset.py ADDED
@@ -0,0 +1,719 @@
1
+ import os
2
+ import random
3
+ import sys
4
+ import time
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import soundfile
9
+ import torch
10
+ from torch.utils.data import Dataset
11
+ from tqdm import tqdm
12
+
13
+ from onsets_and_frames import constants
14
+ from onsets_and_frames.constants import DEFAULT_DEVICE, N_KEYS, SAMPLE_RATE
15
+ from onsets_and_frames.mel import melspectrogram
16
+ from onsets_and_frames.midi_utils import (
17
+ midi_to_frames,
18
+ save_midi_alignments_and_predictions,
19
+ )
20
+ from onsets_and_frames.utils import (
21
+ get_diff,
22
+ get_logger,
23
+ get_peaks,
24
+ shift_label,
25
+ smooth_labels,
26
+ )
27
+
28
+
29
+ class EMDATASET(Dataset):
30
+ def __init__(
31
+ self,
32
+ audio_path="NoteEM_audio",
33
+ tsv_path="NoteEM_tsv",
34
+ labels_path="NoteEm_labels",
35
+ groups=None,
36
+ sequence_length=None,
37
+ seed=42,
38
+ device=DEFAULT_DEVICE,
39
+ instrument_map=None,
40
+ update_instruments=False,
41
+ transcriber=None,
42
+ conversion_map=None,
43
+ pitch_shift=True,
44
+ pitch_shift_limit=5,
45
+ keep_eval_files=False,
46
+ n_eval=1,
47
+ evaluation_list=None,
48
+ only_eval=False,
49
+ save_to_memory=False,
50
+ smooth_labels=False,
51
+ use_onset_mask=False,
52
+ ):
53
+ # Get the dataset logger (logging system should already be initialized by train.py)
54
+ self.logger = get_logger("dataset")
55
+
56
+ self.audio_path = audio_path
57
+ self.tsv_path = tsv_path
58
+ self.labels_path = labels_path
59
+ self.sequence_length = sequence_length
60
+ self.device = device
61
+ self.random = np.random.RandomState(seed)
62
+ self.groups = groups
63
+ self.conversion_map = conversion_map
64
+ self.eval_file_list = []
65
+ self.file_list = self.files(
66
+ self.groups,
67
+ pitch_shift=pitch_shift,
68
+ keep_eval_files=keep_eval_files,
69
+ n_eval=n_eval,
70
+ evaluation_list=evaluation_list,
71
+ pitch_shift_limit=pitch_shift_limit,
72
+ )
73
+ self.save_to_memory = save_to_memory
74
+ self.smooth_labels = smooth_labels
75
+ self.use_onset_mask = use_onset_mask
76
+ self.pitch_shift_limit = pitch_shift_limit
77
+
78
+ self.logger.debug("Save to memory is %s", self.save_to_memory)
79
+ self.logger.info("len file list %d", len(self.file_list))
80
+ self.logger.info("\n\n")
81
+
82
+ if instrument_map is None:
83
+ self.get_instruments(conversion_map=conversion_map)
84
+ else:
85
+ self.instruments = instrument_map
86
+ if update_instruments:
87
+ self.add_instruments()
88
+ self.transcriber = transcriber
89
+ if only_eval:
90
+ return
91
+ self.load_pts(self.file_list)
92
+ self.data = []
93
+ self.logger.info("Reading files...")
94
+ for input_files in tqdm(self.file_list, desc="creating data list"):
95
+ flac, _ = input_files
96
+ audio_len = librosa.get_duration(path=flac)
97
+ minutes = int(np.ceil(audio_len / 60))
98
+ copies = minutes
99
+ for _ in range(copies):
100
+ self.data.append(input_files)
101
+ random.shuffle(self.data)
102
+
103
+ def flac_to_pt_path(self, flac):
104
+ pt_fname = os.path.basename(flac).replace(".flac", ".pt")
105
+ pt_path = os.path.join(self.labels_path, pt_fname)
106
+ return pt_path
107
+
108
+ def __len__(self):
109
+ return len(self.data)
110
+
111
+ def files(
112
+ self,
113
+ groups,
114
+ pitch_shift=True,
115
+ keep_eval_files=False,
116
+ n_eval=1,
117
+ evaluation_list=None,
118
+ pitch_shift_limit=5,
119
+ ):
120
+ self.path = self.audio_path
121
+ tsvs_path = self.tsv_path
122
+ self.logger.info("tsv path: %s", tsvs_path)
123
+ self.logger.info("Evaluation list: %s", evaluation_list)
124
+ res = []
125
+ self.logger.info("keep eval files: %s", keep_eval_files)
126
+ self.logger.info("n eval: %d", n_eval)
127
+ for group in groups:
128
+ tsvs = os.listdir(tsvs_path + os.sep + group)
129
+ tsvs = sorted(tsvs)
130
+ if keep_eval_files and evaluation_list is None:
131
+ eval_tsvs = tsvs[:n_eval]
132
+ tsvs = tsvs[n_eval:]
133
+ elif keep_eval_files and evaluation_list is not None:
134
+ eval_tsvs_names = [
135
+ i.split("#")[0].split(".flac")[0].split(".tsv")[0]
136
+ for i in evaluation_list
137
+ ]
138
+ eval_tsvs = [
139
+ i
140
+ for i in tsvs
141
+ if i.split("#")[0].split(".tsv")[0] in eval_tsvs_names
142
+ ]
143
+ tsvs = [i for i in tsvs if i not in eval_tsvs]
144
+ else:
145
+ eval_tsvs = []
146
+ self.logger.info("len tsvs: %d", len(tsvs))
147
+
148
+ tsvs_names = [t.split(".tsv")[0].split("#")[0] for t in tsvs]
149
+ eval_tsvs_names = [t.split(".tsv")[0].split("#")[0] for t in eval_tsvs]
150
+ for shft in range(-5, 6):
151
+ if (shft != 0 and not pitch_shift) or abs(shft) > pitch_shift_limit:
152
+ continue
153
+ curr_fls_pth = self.path + os.sep + group + "#{}".format(shft)
154
+
155
+ fls = os.listdir(curr_fls_pth)
156
+ orig_files = fls
157
+ # print(f"files names before\n {fls}")
158
+ fls = [
159
+ i for i in fls if i.split("#")[0] in tsvs_names
160
+ ] # in case we don't have the corresponding midi
161
+ missing_fls = [i for i in orig_files if i not in fls]
162
+ if len(missing_fls) > 0:
163
+ self.logger.warning("missing files: %s", missing_fls)
164
+ fls_names = [i.split("#")[0].split(".flac")[0] for i in fls]
165
+ tsvs = [
166
+ i for i in tsvs if i.split(".tsv")[0].split("#")[0] in fls_names
167
+ ]
168
+ assert len(tsvs) == len(fls)
169
+ # print(f"files names after\n {fls}")
170
+ fls = sorted(fls)
171
+
172
+ if shft == 0:
173
+ eval_fls = os.listdir(curr_fls_pth)
174
+ # print(f"files names\n {eval_fls}")
175
+ eval_fls = [
176
+ i for i in eval_fls if i.split("#")[0] in eval_tsvs_names
177
+ ] # in case we don't have the corresponding midi
178
+ eval_fls_names = [i.split("#")[0] for i in eval_fls]
179
+ eval_tsvs = [
180
+ i
181
+ for i in eval_tsvs
182
+ if i.split(".tsv")[0].split("#")[0] in eval_fls_names
183
+ ]
184
+ assert len(eval_fls_names) == len(eval_tsvs_names)
185
+ # print(f"files names\n {eval_fls}")
186
+ eval_fls = sorted(eval_fls)
187
+ for f, t in zip(eval_fls, eval_tsvs):
188
+ self.eval_file_list.append(
189
+ (
190
+ curr_fls_pth + os.sep + f,
191
+ tsvs_path + os.sep + group + os.sep + t,
192
+ )
193
+ )
194
+
195
+ for f, t in zip(fls, tsvs):
196
+ res.append(
197
+ (
198
+ curr_fls_pth + os.sep + f,
199
+ tsvs_path + os.sep + group + os.sep + t,
200
+ )
201
+ )
202
+
203
+ for flac, tsv in res:
204
+ if (
205
+ os.path.basename(flac).split("#")[0].split(".flac")[0]
206
+ != os.path.basename(tsv).split("#")[0].split(".tsv")[0]
207
+ ):
208
+ self.logger.warning("found mismatch in the files: ")
209
+ self.logger.warning("flac: %s", os.path.basename(flac).split("#")[0])
210
+ self.logger.warning("tsv: %s", os.path.basename(tsv).split("#")[0])
211
+ self.logger.warning("please check the input files")
212
+ sys.exit(1)
213
+ return res
214
+
215
+ def get_instruments(self, conversion_map=None):
216
+ instruments = set()
217
+ for _, f in self.file_list:
218
+ events = np.loadtxt(f, delimiter="\t", skiprows=1)
219
+ curr_instruments = set(events[:, -1])
220
+ if conversion_map is not None:
221
+ curr_instruments = {
222
+ conversion_map[c] if c in conversion_map else c
223
+ for c in curr_instruments
224
+ }
225
+ instruments = instruments.union(curr_instruments)
226
+ instruments = [int(elem) for elem in instruments if elem < 115]
227
+ if conversion_map is not None:
228
+ instruments = [i for i in instruments if i in conversion_map]
229
+ instruments = list(set(instruments))
230
+ if 0 in instruments:
231
+ piano_ind = instruments.index(0)
232
+ instruments.pop(piano_ind)
233
+ instruments.insert(0, 0)
234
+ self.instruments = instruments
235
+ self.instruments = list(
236
+ set(self.instruments) - set(range(88, 104)) - set(range(112, 150))
237
+ )
238
+ self.logger.info("Dataset instruments: %s", self.instruments)
239
+ self.logger.info("Total: %d instruments", len(self.instruments))
240
+
241
+ def add_instruments(self):
242
+ for _, f in self.file_list:
243
+ events = np.loadtxt(f, delimiter="\t", skiprows=1)
244
+ curr_instruments = set(events[:, -1])
245
+ new_instruments = curr_instruments - set(self.instruments)
246
+ self.instruments += list(new_instruments)
247
+ instruments = [int(elem) for elem in self.instruments if (elem < 115)]
248
+ self.instruments = instruments
249
+
250
+ def __getitem__(self, index):
251
+ data = self.load(*self.data[index])
252
+ # result = dict(path=data['path'])
253
+ midi_length = len(data["label"])
254
+ n_steps = self.sequence_length // constants.HOP_LENGTH
255
+ if midi_length < n_steps:
256
+ step_begin = 0
257
+ step_end = midi_length
258
+ else:
259
+ step_begin = self.random.randint(max(midi_length - n_steps, 1))
260
+ step_end = step_begin + n_steps
261
+ begin = step_begin * constants.HOP_LENGTH
262
+ end = begin + self.sequence_length
263
+
264
+ audio = (
265
+ data["audio"][begin:end].float().div_(32768.0)
266
+ ) # torch.ShortTensor → float
267
+ label = data["label"][step_begin:step_end].clone() # torch.Tensor
268
+
269
+ if audio.shape[0] < self.sequence_length:
270
+ pad_amt = self.sequence_length - audio.shape[0]
271
+ audio = torch.cat([audio, torch.zeros(pad_amt, dtype=audio.dtype)], dim=0)
272
+
273
+ if label.shape[0] < n_steps:
274
+ pad_amt = n_steps - label.shape[0]
275
+ label = torch.cat(
276
+ [label, torch.zeros((pad_amt, *label.shape[1:]), dtype=label.dtype)],
277
+ dim=0,
278
+ )
279
+
280
+ audio = torch.clamp(audio, -1.0, 1.0)
281
+ result = {"path": data["path"], "audio": audio, "label": label}
282
+ if "velocity" in data:
283
+ result["velocity"] = data["velocity"][step_begin:step_end, ...]
284
+ result["velocity"] = result["velocity"].float() / 128.0
285
+
286
+ if result["label"].max() < 3:
287
+ result["onset"] = result["label"].float()
288
+ else:
289
+ result["onset"] = (result["label"] == 3).float()
290
+
291
+ result["offset"] = (result["label"] == 1).float()
292
+ result["frame"] = (result["label"] > 1).float()
293
+
294
+ if self.smooth_labels:
295
+ result["onset"] = smooth_labels(result["onset"])
296
+ if self.use_onset_mask:
297
+ if "onset_mask" in data:
298
+ result["onset_mask"] = data["onset_mask"][
299
+ step_begin:step_end, ...
300
+ ].float()
301
+ else:
302
+ result["onset_mask"] = torch.ones_like(result["onset"]).float()
303
+ if "frame_mask" in data:
304
+ result["frame_mask"] = data["frame_mask"][
305
+ step_begin:step_end, ...
306
+ ].float()
307
+ else:
308
+ result["frame_mask"] = torch.ones_like(result["frame"]).float()
309
+
310
+ shape = result["frame"].shape
311
+ keys = N_KEYS
312
+ new_shape = shape[:-1] + (shape[-1] // keys, keys)
313
+ result["big_frame"] = result["frame"]
314
+ result["frame"], _ = result["frame"].reshape(new_shape).max(axis=-2)
315
+
316
+ # if 'frame_mask' not in data:
317
+ # result['frame_mask'] = torch.ones_like(result['frame']).to(self.device).float()
318
+
319
+ result["big_offset"] = result["offset"]
320
+ result["offset"], _ = result["offset"].reshape(new_shape).max(axis=-2)
321
+ result["group"] = self.data[index][0].split(os.sep)[-2].split("#")[0]
322
+
323
+ return result
324
+
325
+ def load(self, audio_path, tsv_path):
326
+ if self.save_to_memory:
327
+ data = self.pts[audio_path]
328
+ else:
329
+ data = torch.load(self.flac_to_pt_path(audio_path))
330
+ if len(data["audio"].shape) > 1:
331
+ data["audio"] = (data["audio"].float().mean(dim=-1)).short()
332
+ if "label" in data:
333
+ return data
334
+ else:
335
+ piece, part = audio_path.split(os.sep)[-2:]
336
+ piece_split = piece.split("#")
337
+ if len(piece_split) == 2:
338
+ piece, shift1 = piece_split
339
+ else:
340
+ piece, shift1 = "#".join(piece_split[:2]), piece_split[-1]
341
+ part_split = part.split("#")
342
+ if len(part_split) == 2:
343
+ part, shift2 = part_split
344
+ else:
345
+ part, shift2 = "#".join(part_split[:2]), part_split[-1]
346
+ shift2, _ = shift2.split(".")
347
+ assert shift1 == shift2
348
+ shift = shift1
349
+ assert int(shift) != 0
350
+ orig = audio_path.replace("#{}".format(shift), "#0")
351
+ if self.save_to_memory:
352
+ orig_data = self.pts[orig]
353
+ else:
354
+ orig_data = torch.load(self.flac_to_pt_path(orig))
355
+ res = {}
356
+ res["label"] = shift_label(orig_data["label"], int(shift))
357
+ res["path"] = audio_path
358
+ res["audio"] = data["audio"]
359
+ if "velocity" in orig_data:
360
+ res["velocity"] = shift_label(orig_data["velocity"], int(shift))
361
+ if "onset_mask" in orig_data:
362
+ res["onset_mask"] = shift_label(orig_data["onset_mask"], int(shift))
363
+ if "frame_mask" in orig_data:
364
+ res["frame_mask"] = shift_label(orig_data["frame_mask"], int(shift))
365
+ return res
366
+
367
+ def load_pts(self, files):
368
+ self.pts = {}
369
+ self.logger.info("loading pts...")
370
+ for flac, tsv in tqdm(files, desc="loading pts"):
371
+ # print('flac, tsv', flac, tsv)
372
+ if os.path.isfile(
373
+ self.labels_path
374
+ + os.sep
375
+ + flac.split(os.sep)[-1].replace(".flac", ".pt")
376
+ ):
377
+ if self.save_to_memory:
378
+ self.pts[flac] = torch.load(
379
+ self.labels_path
380
+ + os.sep
381
+ + flac.split(os.sep)[-1].replace(".flac", ".pt")
382
+ )
383
+ else:
384
+ if flac.count("#") != 2:
385
+ self.logger.debug("file path does not contain exactly two '#': %s", flac)
386
+ audio, sr = soundfile.read(flac, dtype="int16")
387
+ if len(audio.shape) == 2:
388
+ audio = audio.astype(float).mean(axis=1)
389
+ else:
390
+ audio = audio.astype(float)
391
+ audio = audio.astype(np.int16)
392
+ self.logger.debug("audio len: %d", len(audio))
393
+ assert sr == SAMPLE_RATE
394
+ audio = torch.ShortTensor(audio)
395
+ if "#0" not in flac:
396
+ assert "#" in flac
397
+ data = {"audio": audio}
398
+ if self.save_to_memory:
399
+ self.pts[flac] = data
400
+ torch.save(data, self.flac_to_pt_path(flac))
401
+ continue
402
+ midi = np.loadtxt(tsv, delimiter="\t", skiprows=1)
403
+ unaligned_label = midi_to_frames(
404
+ midi, self.instruments, conversion_map=self.conversion_map
405
+ )
406
+ if len(self.instruments) == 1:
407
+ unaligned_label = unaligned_label[:, -N_KEYS:]
408
+ if len(unaligned_label) < self.sequence_length // constants.HOP_LENGTH:
409
+ diff = self.sequence_length // constants.HOP_LENGTH - len(
410
+ unaligned_label
411
+ )
412
+ pad = torch.zeros(
413
+ (diff, unaligned_label.shape[1]), dtype=unaligned_label.dtype
414
+ )
415
+ unaligned_label = torch.cat((unaligned_label, pad), dim=0)
416
+
417
+ group = flac.split(os.sep)[-2].split("#")[0]
418
+ data = dict(
419
+ path=self.labels_path + os.sep + flac.split(os.sep)[-1],
420
+ audio=audio,
421
+ unaligned_label=unaligned_label,
422
+ group=group,
423
+ BON=float("inf"),
424
+ BON_VEC=np.full(unaligned_label.shape[1], float("inf")),
425
+ )
426
+
427
+ torch.save(data, self.flac_to_pt_path(flac))
428
+ if self.save_to_memory:
429
+ self.pts[flac] = data
430
+
431
+ def update_pts_counting(
432
+ self,
433
+ transcriber,
434
+ counting_window_length,
435
+ POS=1.1,
436
+ NEG=-0.001,
437
+ FRAME_POS=0.5,
438
+ to_save=None,
439
+ first=False,
440
+ update=True,
441
+ BEST_DIST=False,
442
+ peak_size=3,
443
+ BEST_DIST_VEC=False,
444
+ counting_window_hop=0,
445
+ ):
446
+ self.logger.info("Updating pts...")
447
+ self.logger.info("First %s", first)
448
+ total_counting_time = 0.0 # Initialize total time for counting-based alignment
449
+
450
+ self.logger.info("POS, NEG: %s, %s", POS, NEG)
451
+ if to_save is not None:
452
+ os.makedirs(to_save, exist_ok=True)
453
+ self.logger.info("There are %d pts", len(self.pts))
454
+ update_count = 0
455
+ sys.stdout.flush()
456
+ only_pitch_0_files = [f for f in self.file_list if "#0" in f[0]]
457
+ for input_files in tqdm(only_pitch_0_files, desc="updating pts"):
458
+ flac, tsv = input_files
459
+ data = torch.load(self.flac_to_pt_path(flac))
460
+ if "unaligned_label" not in data:
461
+ self.logger.warning("No unaligned labels for %s", flac)
462
+ continue
463
+ audio_inp = data["audio"].float() / 32768.0
464
+ MAX_TIME = 5 * 60 * SAMPLE_RATE
465
+ audio_inp_len = len(audio_inp)
466
+ if audio_inp_len > MAX_TIME:
467
+ n_segments = int(np.ceil(audio_inp_len / MAX_TIME))
468
+ self.logger.debug("Long audio, splitting to %d segments", n_segments)
469
+ seg_len = MAX_TIME
470
+ onsets_preds = []
471
+ offset_preds = []
472
+ frame_preds = []
473
+ for i_s in range(n_segments):
474
+ curr = (
475
+ audio_inp[i_s * seg_len : (i_s + 1) * seg_len]
476
+ .unsqueeze(0)
477
+ .cuda()
478
+ )
479
+ curr_mel = melspectrogram(
480
+ curr.reshape(-1, curr.shape[-1])[:, :-1]
481
+ ).transpose(-1, -2)
482
+ (
483
+ curr_onset_pred,
484
+ curr_offset_pred,
485
+ _,
486
+ curr_frame_pred,
487
+ curr_velocity_pred,
488
+ ) = transcriber(curr_mel)
489
+ onsets_preds.append(curr_onset_pred)
490
+ offset_preds.append(curr_offset_pred)
491
+ frame_preds.append(curr_frame_pred)
492
+ onset_pred = torch.cat(onsets_preds, dim=1)
493
+ offset_pred = torch.cat(offset_preds, dim=1)
494
+ frame_pred = torch.cat(frame_preds, dim=1)
495
+ else:
496
+ audio_inp = audio_inp.unsqueeze(0).cuda()
497
+ mel = melspectrogram(
498
+ audio_inp.reshape(-1, audio_inp.shape[-1])[:, :-1]
499
+ ).transpose(-1, -2)
500
+ onset_pred, offset_pred, _, frame_pred, _ = transcriber(mel)
501
+ self.logger.debug("Done predicting.")
502
+
503
+ # We assume onset predictions are of length N_KEYS * (len(instruments) + 1),
504
+ # first N_KEYS classes are the first instrument, next N_KEYS classes are the next instrument, etc.,
505
+ # and last N_KEYS classes are for pitch regardless of instrument
506
+ # Currently, frame and offset predictions are only N_KEYS classes.
507
+ onset_pred = onset_pred.detach().squeeze().cpu()
508
+ frame_pred = frame_pred.detach().squeeze().cpu()
509
+
510
+ PEAK_SIZE = peak_size
511
+ self.logger.debug("PEAK_SIZE: %d", PEAK_SIZE)
512
+ # we peak-pick the onset predictions to keep only local-maximum onsets
513
+ if peak_size > 0:
514
+ peaks = get_peaks(
515
+ onset_pred, PEAK_SIZE
516
+ ) # we only want local peaks, in a 7-frame neighborhood, 3 to each side.
517
+ onset_pred[~peaks] = 0
518
+
519
+ unaligned_onsets = (data["unaligned_label"] == 3).float().numpy()
520
+
521
+ onset_pred_np = onset_pred.numpy()
522
+ frame_pred_np = frame_pred.numpy()
523
+
524
+ pred_bag_of_notes = (onset_pred_np[:, -N_KEYS:] >= 0.5).sum(axis=0)
525
+ gt_bag_of_notes = unaligned_onsets[:, -N_KEYS:].astype(bool).sum(axis=0)
526
+ bon_dist = (((pred_bag_of_notes - gt_bag_of_notes) ** 2).sum()) ** 0.5
527
+
528
+ pred_bag_of_notes_with_inst = (onset_pred_np >= 0.5).sum(axis=0)
529
+ gt_bag_of_notes_with_inst = unaligned_onsets.astype(bool).sum(axis=0)
530
+ bon_dist_vec = np.abs(
531
+ pred_bag_of_notes_with_inst - gt_bag_of_notes_with_inst
532
+ )
533
+
534
+ bon_dist /= gt_bag_of_notes.sum()
535
+ self.logger.debug("bag of notes dist: %f", bon_dist)
536
+ ####
537
+
538
+ aligned_onsets = np.zeros(onset_pred_np.shape, dtype=bool)
539
+ aligned_frames = np.zeros(onset_pred_np.shape, dtype=bool)
540
+
541
+ # This block is the main difference between the counting approach and the DTW approach.
542
+ # In the counting approach we label the audio by counting note onsets: For each onset pitch class,
543
+ # denote by K the number of times it occurs in the unaligned label. We simply take the K highest local
544
+ # peaks predicted by the current model.
545
+ # Split unaligned onsets into chunks of size counting_window_length
546
+ self.logger.debug(
547
+ "unaligned onsets shape: %s, counting window length: %d, counting window hop: %d",
548
+ unaligned_onsets.shape,
549
+ counting_window_length,
550
+ counting_window_hop,
551
+ )
552
+ assert counting_window_hop <= counting_window_length
553
+ if counting_window_hop == 0:
554
+ counting_window_hop = counting_window_length
555
+
556
+ num_chunks = (
557
+ 1
558
+ if counting_window_length == 0
559
+ else int(np.ceil(len(unaligned_onsets) / counting_window_hop))
560
+ )
561
+
562
+ self.logger.debug("number of chunks: %d", num_chunks)
563
+ start_time = time.time()
564
+ for chunk_idx in range(num_chunks):
565
+ start_idx = chunk_idx * counting_window_hop
566
+ if counting_window_length == 0:
567
+ end_idx = max(len(unaligned_onsets), len(onset_pred_np))
568
+ else:
569
+ end_idx = min(
570
+ start_idx + counting_window_length, len(unaligned_onsets)
571
+ )
572
+ chunk_onsets = unaligned_onsets[start_idx:end_idx]
573
+ chunk_onsets_count = (
574
+ (data["unaligned_label"][start_idx:end_idx, :] == 3)
575
+ .sum(dim=0)
576
+ .numpy()
577
+ )
578
+
579
+ for f, f_count in enumerate(chunk_onsets_count):
580
+ if f_count == 0:
581
+ continue
582
+ f_most_likely = np.sort(
583
+ onset_pred_np[start_idx:end_idx, f].argsort()[::-1][:f_count]
584
+ )
585
+ f_most_likely += start_idx # Adjust indices to the original size
586
+ aligned_onsets[f_most_likely, f] = 1
587
+
588
+ f_unaligned = chunk_onsets[:, f].nonzero()
589
+ assert len(f_unaligned) == 1
590
+ f_unaligned = f_unaligned[0]
591
+
592
+ counting_duration = time.time() - start_time
593
+ total_counting_time += counting_duration
594
+ self.logger.debug(
595
+ "Counting alignment for file '%s' took %.2f seconds.",
596
+ flac,
597
+ counting_duration,
598
+ )
599
+
600
+ # Pseudo labels: a POS threshold greater than 1 is equivalent to not using pseudo labels
601
+ pseudo_onsets = (onset_pred_np >= POS) & (~aligned_onsets)
602
+
603
+ onset_label = np.maximum(pseudo_onsets, aligned_onsets)
604
+
605
+ # in this project we do not train the frame stack, but we compute the labels anyway
606
+ pseudo_frames = np.zeros(pseudo_onsets.shape, dtype=pseudo_onsets.dtype)
607
+ pseudo_offsets = np.zeros(pseudo_onsets.shape, dtype=pseudo_onsets.dtype)
608
+ for t, f in zip(*onset_label.nonzero()):
609
+ t_off = t
610
+ while (
611
+ t_off < len(pseudo_frames)
612
+ and frame_pred[t_off, f % N_KEYS] >= FRAME_POS
613
+ ):
614
+ t_off += 1
615
+ pseudo_frames[t:t_off, f] = 1
616
+ if t_off < len(pseudo_offsets):
617
+ pseudo_offsets[t_off, f] = 1
618
+ frame_label = np.maximum(pseudo_frames, aligned_frames)
619
+ offset_label = get_diff(frame_label, offset=True)
620
+
621
+ label = np.maximum(2 * frame_label, offset_label)
622
+ label = np.maximum(3 * onset_label, label).astype(np.uint8)
623
+
624
+ if to_save is not None:
625
+ save_midi_alignments_and_predictions(
626
+ to_save,
627
+ data["path"],
628
+ self.instruments,
629
+ aligned_onsets,
630
+ aligned_frames,
631
+ onset_pred_np,
632
+ frame_pred_np,
633
+ prefix="",
634
+ group=data["group"],
635
+ )
636
+ prev_bon_dist = data.get("BON", float("inf"))
637
+ prev_bon_dist_vec = data.get("BON_VEC", None)
638
+ if update:
639
+ if BEST_DIST_VEC:
640
+ self.logger.debug("Updated Labels")
641
+ if prev_bon_dist_vec is None:
642
+ raise ValueError(
643
+ "BEST_DIST_VEC is True but no previous BON_VEC found"
644
+ )
645
+ prev_label = data["label"]
646
+ new_label = torch.from_numpy(label).byte()
647
+ if first:
648
+ prev_label = new_label
649
+ update_count += 1
650
+ else:
651
+ updated_flag = False
652
+ num_pitches_updated = 0
653
+ for k in range(prev_label.shape[1]):
654
+ if prev_bon_dist_vec[k] > bon_dist_vec[k]:
655
+ prev_label[:, k] = new_label[:, k]
656
+ prev_bon_dist_vec[k] = bon_dist_vec[k]
657
+ num_pitches_updated += 1
658
+ updated_flag = True
659
+ if updated_flag:
660
+ update_count += 1
661
+ self.logger.debug("Updated %d pitches", num_pitches_updated)
662
+ data["label"] = prev_label
663
+ data["BON_VEC"] = prev_bon_dist_vec
664
+ self.logger.debug("saved updated pt")
665
+ torch.save(
666
+ data,
667
+ self.labels_path
668
+ + os.sep
669
+ + flac.split(os.sep)[-1]
670
+ .replace(".flac", ".pt")
671
+ .replace(".mp3", ".pt"),
672
+ )
673
+
674
+ elif not BEST_DIST or bon_dist < prev_bon_dist:
675
+ update_count += 1
676
+ self.logger.debug("Updated Labels")
677
+
678
+ data["label"] = torch.from_numpy(label).byte()
679
+
680
+ data["BON"] = bon_dist
681
+ self.logger.debug("saved updated pt")
682
+ torch.save(
683
+ data,
684
+ self.labels_path
685
+ + os.sep
686
+ + flac.split(os.sep)[-1]
687
+ .replace(".flac", ".pt")
688
+ .replace(".mp3", ".pt"),
689
+ )
690
+
691
+ if bon_dist < prev_bon_dist:
692
+ self.logger.debug(
693
+ "Bag of notes distance improved from %f to %f",
694
+ prev_bon_dist,
695
+ bon_dist,
696
+ )
697
+ data["BON"] = bon_dist
698
+
699
+ if to_save is not None and BEST_DIST:
700
+ os.makedirs(to_save + "/BEST_BON", exist_ok=True)
701
+ save_midi_alignments_and_predictions(
702
+ to_save + "/BEST_BON",
703
+ data["path"],
704
+ self.instruments,
705
+ aligned_onsets,
706
+ aligned_frames,
707
+ onset_pred_np,
708
+ frame_pred_np,
709
+ prefix="BEST_BON",
710
+ group=data["group"],
711
+ use_time=False,
712
+ )
713
+
714
+ self.logger.info(
715
+ "Updated %d pts out of %d", update_count, len(only_pitch_0_files)
716
+ )
717
+ self.logger.info(
718
+ "Total counting alignment time for all files: %.2f seconds.", total_counting_time
719
+ )
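The counting-based alignment in `update_pts_counting` keeps, for each onset pitch class, the K highest local peaks predicted by the current model, where K is that pitch's onset count in the unaligned label. A standalone sketch of that core idea (illustrative, not the committed implementation):

```python
# Illustrative sketch of counting-based onset alignment.
import numpy as np

def counting_align(onset_pred: np.ndarray, unaligned_onsets: np.ndarray) -> np.ndarray:
    """For each pitch class, keep the K highest predicted onset peaks,
    where K is that pitch's onset count in the unaligned label."""
    aligned = np.zeros(onset_pred.shape, dtype=bool)
    counts = unaligned_onsets.astype(bool).sum(axis=0)  # K per pitch class
    for f, k in enumerate(counts):
        if k == 0:
            continue
        # frame indices of the K largest predicted peaks, kept in time order
        top_k = np.sort(onset_pred[:, f].argsort()[::-1][:k])
        aligned[top_k, f] = True
    return aligned
```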
onsets_and_frames/decoding.py ADDED
@@ -0,0 +1,102 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
+ def extract_notes(onsets, frames, velocity, onset_threshold=0.5, frame_threshold=0.5):
6
+ """
7
+ Finds the note timings based on the onsets and frames information
8
+
9
+ Parameters
10
+ ----------
11
+ onsets: torch.FloatTensor, shape = [frames, bins]
12
+ frames: torch.FloatTensor, shape = [frames, bins]
13
+ velocity: torch.FloatTensor, shape = [frames, bins]
14
+ onset_threshold: float
15
+ frame_threshold: float
16
+
17
+ Returns
18
+ -------
19
+ pitches: np.ndarray of bin_indices
20
+ intervals: np.ndarray of rows containing (onset_index, offset_index)
21
+ velocities: np.ndarray of velocity values
22
+ """
23
+ # onsets_forward = torch.roll(onsets, shifts=(1, 0), dims=(0, 1))
24
+ # onsets_forward[0, :] = 0
25
+ # onsets_backward = torch.roll(onsets, shifts=(-1, 0), dims=(0, 1))
26
+ # onsets_backward[-1, :] = 0
27
+ # onsets_peak = torch.logical_and(onsets >= onsets_forward, onsets >= onsets_backward)
28
+ # onsets_peak = torch.logical_and(onsets >= 0.25, onsets_peak)
29
+
30
+ onsets = (onsets > onset_threshold).cpu().to(torch.uint8)
31
+ frames = (frames > frame_threshold).cpu().to(torch.uint8)
32
+ onset_diff = torch.cat([onsets[:1, :], onsets[1:, :] - onsets[:-1, :]], dim=0) == 1
33
+ # onset_diff = torch.cat([frames[:1, :], frames[1:, :] - frames[:-1, :]], dim=0) == 1
34
+
35
+ pitches = []
36
+ intervals = []
37
+ velocities = []
38
+
39
+ # for nonzero in onsets_peak.nonzero(as_tuple=False):
40
+ for nonzero in onset_diff.nonzero(as_tuple=False):
41
+ frame = nonzero[0].item()
42
+ pitch = nonzero[1].item()
43
+
44
+ onset = frame
45
+ offset = frame
46
+ velocity_samples = []
47
+
48
+ while onsets[offset, pitch].item() or frames[offset, pitch].item():
49
+ if onsets[offset, pitch].item():
50
+ # if frames[offset, pitch].item():
51
+ velocity_samples.append(velocity[offset, pitch].item())
52
+ offset += 1
53
+ if offset == onsets.shape[0]:
54
+ break
55
+
56
+ if offset > onset:
57
+ pitches.append(pitch)
58
+ intervals.append([onset, offset])
59
+ velocities.append(
60
+ np.mean(velocity_samples) if len(velocity_samples) > 0 else 0
61
+ )
62
+
63
+ return np.array(pitches), np.array(intervals), np.array(velocities)
64
+
65
+
66
+ def notes_to_frames(pitches, intervals, shape, mask=None):
67
+ """
68
+ Takes lists specifying note sequences and returns the corresponding frame-wise piano-roll representation
69
+
70
+ Parameters
71
+ ----------
72
+ pitches: list of pitch bin indices
73
+ intervals: list of [onset, offset] ranges of bin indices
74
+ shape: the shape of the original piano roll, [n_frames, n_bins]
75
+
76
+ Returns
77
+ -------
78
+ time: np.ndarray containing the frame indices
79
+ freqs: list of np.ndarray, each containing the frequency bin indices
80
+ """
81
+ roll = np.zeros(tuple(shape))
82
+ for pitch, (onset, offset) in zip(pitches, intervals):
83
+ # print('pitch', pitch, onset, offset)
84
+ # print('onset offset', onset, offset, pitch)
85
+ roll[onset:offset, pitch] = 1
86
+ if mask is not None:
87
+ roll *= mask
88
+ time = np.arange(roll.shape[0])
89
+ freqs = [roll[t, :].nonzero()[0] for t in time]
90
+ # if mask_size is not None:
91
+ # mask = np.zeros(tuple(shape))
92
+ # notes = roll.shape[1]
93
+ # for n in range(notes):
94
+ # onset_d = roll[1:, n] - roll[: -1, n]
95
+ # print('unique', np.unique(onset_d))
96
+ # onset_d[onset_d < 0] = 0
97
+ # print('n', n, onset_d.sum())
98
+ # onset_d = np.concatenate((np.zeros((1, 1)), roll[1:, n] - roll[: -1, n]))
99
+ # onset_d[onset_d < 0] = 0
100
+ # for r in range(mask_size):
101
+ # mask[:, n] += np.roll(onset_d, r)
102
+ return time, freqs
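A minimal usage sketch of the two helpers above (illustrative; `onset_pred`, `frame_pred`, and `velocity_pred` are assumed to be `(n_frames, n_bins)` tensors produced by the transcriber):

```python
# Illustrative usage of extract_notes / notes_to_frames (not part of the commit).
from onsets_and_frames.decoding import extract_notes, notes_to_frames

pitches, intervals, velocities = extract_notes(
    onset_pred, frame_pred, velocity_pred,
    onset_threshold=0.5, frame_threshold=0.5,
)
time_idx, freq_bins = notes_to_frames(pitches, intervals, frame_pred.shape)
```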
onsets_and_frames/hf_model.py ADDED
@@ -0,0 +1,364 @@
1
+ """
2
+ Hugging Face Hub-compatible wrapper for CountEM music transcription models.
3
+ """
4
+ from pathlib import Path
5
+ from typing import Union, Tuple
6
+ import numpy as np
7
+ import torch
8
+ import soundfile as sf
9
+ from huggingface_hub import PyTorchModelHubMixin
10
+
11
+ from onsets_and_frames.transcriber import OnsetsAndFrames
12
+ from onsets_and_frames.mel import MelSpectrogram
13
+ from onsets_and_frames.midi_utils import frames2midi
14
+ from onsets_and_frames.constants import (
15
+ N_MELS,
16
+ MIN_MIDI,
17
+ MAX_MIDI,
18
+ HOP_LENGTH,
19
+ SAMPLE_RATE,
20
+ WINDOW_LENGTH,
21
+ MEL_FMIN,
22
+ MEL_FMAX,
23
+ )
24
+
25
+
26
+ class CountEMModel(
27
+ OnsetsAndFrames,
28
+ PyTorchModelHubMixin,
29
+ # Optional metadata that gets pushed to model card
30
+ library_name="countem",
31
+ tags=["audio", "music-transcription", "automatic-music-transcription", "midi"],
32
+ license="cc-by-4.0",
33
+ repo_url="https://github.com/Yoni-Yaffe/count-the-notes",
34
+ paper_url="https://arxiv.org/abs/2511.14250",
35
+ ):
36
+ """
37
+ Hugging Face Hub-compatible wrapper for CountEM automatic music transcription models.
38
+
39
+ This model performs automatic music transcription (AMT) from audio to MIDI.
40
+ It uses the Onsets & Frames architecture trained with the CountEM framework,
41
+ which enables training with weak, unordered note count histograms.
42
+
43
+ Example usage:
44
+ ```python
45
+ from onsets_and_frames.hf_model import CountEMModel
46
+ import soundfile as sf
47
+
48
+ # Load model from Hub
49
+ model = CountEMModel.from_pretrained("Yoni-Yaffe/countem-musicnet")
50
+
51
+ # Load audio (must be 16kHz)
52
+ audio, sr = sf.read("audio.flac")
53
+ assert sr == 16000, "Audio must be 16kHz"
54
+
55
+ # Transcribe to MIDI
56
+ model.transcribe_to_midi(audio, "output.mid")
57
+ ```
58
+
59
+ Args:
60
+ model_complexity: Complexity multiplier for the model (default: 64)
61
+ onset_complexity: Complexity multiplier for onset stack (default: 1.5)
62
+ n_instruments: Number of instruments to transcribe (default: 1)
63
+ """
64
+
65
+ def __init__(
66
+ self,
67
+ model_complexity: int = 64,
68
+ onset_complexity: float = 1.5,
69
+ n_instruments: int = 1,
70
+ **kwargs
71
+ ):
72
+ # Initialize the base OnsetsAndFrames model
73
+ n_keys = MAX_MIDI - MIN_MIDI + 1
74
+ OnsetsAndFrames.__init__(
75
+ self,
76
+ input_features=N_MELS,
77
+ output_features=n_keys,
78
+ model_complexity=model_complexity,
79
+ onset_complexity=onset_complexity,
80
+ n_instruments=n_instruments,
81
+ )
82
+
83
+ # Store config for HF Hub
84
+ self.config = {
85
+ "model_complexity": model_complexity,
86
+ "onset_complexity": onset_complexity,
87
+ "n_instruments": n_instruments,
88
+ "n_mels": N_MELS,
89
+ "n_keys": n_keys,
90
+ "sample_rate": SAMPLE_RATE,
91
+ "hop_length": HOP_LENGTH,
92
+ }
93
+
94
+ # Add mel spectrogram as a submodule for proper device management
95
+ # This ensures the mel transform moves with the model when calling .to(device)
96
+ self.melspectrogram = MelSpectrogram(
97
+ n_mels=N_MELS,
98
+ sample_rate=SAMPLE_RATE,
99
+ filter_length=WINDOW_LENGTH,
100
+ hop_length=HOP_LENGTH,
101
+ mel_fmin=MEL_FMIN,
102
+ mel_fmax=MEL_FMAX,
103
+ )
104
+
105
+ def forward(self, audio: Union[np.ndarray, torch.Tensor]):
106
+ """
107
+ Forward pass that accepts raw audio waveforms.
108
+
109
+ Unlike the parent OnsetsAndFrames which expects mel spectrograms,
110
+ this forward method accepts raw audio and converts it internally.
111
+
112
+ Args:
113
+ audio: Raw audio waveform, shape (batch, n_samples) or (n_samples,)
114
+ Should be normalized to [-1, 1] or will be normalized automatically
115
+
116
+ Returns:
117
+ Tuple of (onset_pred, offset_pred, activation_pred, frame_pred, velocity_pred)
118
+ """
119
+ # Convert to torch tensor if needed
120
+ if isinstance(audio, np.ndarray):
121
+ audio = torch.from_numpy(audio).float()
122
+
123
+ # Ensure audio is in range [-1, 1]
124
+ if audio.dtype == torch.int16:
125
+ audio = audio.float() / 32768.0
126
+ elif audio.max() > 1.0 or audio.min() < -1.0:
127
+ audio = audio / max(abs(audio.max()), abs(audio.min()))
128
+
129
+ # Add batch dimension if needed
130
+ if audio.dim() == 1:
131
+ audio = audio.unsqueeze(0)
132
+
133
+ device = next(self.parameters()).device
134
+ audio = audio.to(device)
135
+
136
+ # Remove last sample to fix frame count mismatch
137
+ audio = audio[:, :-1]
138
+
139
+ mel = self.melspectrogram(audio)
140
+
141
+ # Transpose to (batch, time, features) format expected by parent model
142
+ mel = mel.transpose(-1, -2)
143
+
144
+ return super().forward(mel)
145
+
146
+ @torch.no_grad()
147
+ def transcribe(
148
+ self,
149
+ audio: Union[np.ndarray, torch.Tensor],
150
+ onset_threshold: float = 0.5,
151
+ frame_threshold: float = 0.5,
152
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
153
+ """
154
+ Transcribe audio to note predictions.
155
+
156
+ Automatically handles long audio by splitting into segments (max 5 minutes each)
157
+ to avoid memory issues.
158
+
159
+ Args:
160
+ audio: Audio waveform, shape (n_samples,), normalized to [-1, 1]
161
+ onset_threshold: Threshold for onset detection (default: 0.5)
162
+ frame_threshold: Threshold for frame detection (default: 0.5)
163
+
164
+ Returns:
165
+ Tuple of (onset_pred, offset_pred, activation_pred, frame_pred, velocity_pred)
166
+ All are numpy arrays of shape (n_frames, 88) except velocity which may vary
167
+ """
168
+ self.eval()
169
+
170
+ # Convert to torch tensor if needed
171
+ if isinstance(audio, np.ndarray):
172
+ audio = torch.from_numpy(audio).float()
173
+
174
+ # Ensure audio is 1D (convert stereo to mono if needed)
175
+ if audio.dim() > 1:
176
+ # If stereo or multi-channel, take mean across channels
177
+ audio = audio.mean(dim=-1 if audio.shape[-1] <= 2 else 0)
178
+
179
+ # Normalize audio
180
+ if audio.dtype == torch.int16:
181
+ audio = audio.float() / 32768.0
182
+ elif audio.max() > 1.0 or audio.min() < -1.0:
183
+ audio = audio / max(abs(audio.max()), abs(audio.min()))
184
+
185
+ device = next(self.parameters()).device
186
+ audio = audio.to(device)
187
+
188
+ # Handle long audio by segmenting
189
+ MAX_TIME = 5 * 60 * SAMPLE_RATE # 5 minutes
190
+ audio_len = len(audio)
191
+
192
+ if audio_len > MAX_TIME:
193
+ # Split into segments
194
+ n_segments = int(np.ceil(audio_len / MAX_TIME))
195
+ seg_len = MAX_TIME
196
+
197
+ onset_preds = []
198
+ offset_preds = []
199
+ activation_preds = []
200
+ frame_preds = []
201
+ velocity_preds = []
202
+
203
+ for i_s in range(n_segments):
204
+ start = i_s * seg_len
205
+ end = min((i_s + 1) * seg_len, audio_len)
206
+ segment = audio[start:end]
207
+
208
+ # Forward pass on segment
209
+ onset_seg, offset_seg, activation_seg, frame_seg, velocity_seg = self(segment)
210
+
211
+ onset_preds.append(onset_seg)
212
+ offset_preds.append(offset_seg)
213
+ activation_preds.append(activation_seg)
214
+ frame_preds.append(frame_seg)
215
+ velocity_preds.append(velocity_seg)
216
+
217
+ # Concatenate along time dimension (dim=1)
218
+ onset_pred = torch.cat(onset_preds, dim=1)
219
+ offset_pred = torch.cat(offset_preds, dim=1)
220
+ activation_pred = torch.cat(activation_preds, dim=1)
221
+ frame_pred = torch.cat(frame_preds, dim=1)
222
+ velocity_pred = torch.cat(velocity_preds, dim=1)
223
+ else:
224
+ # Short audio, process directly
225
+ onset_pred, offset_pred, activation_pred, frame_pred, velocity_pred = self(audio)
226
+
227
+ # Convert to numpy and remove batch dimension
228
+ onset_pred = onset_pred.squeeze(0).cpu().numpy()
229
+ offset_pred = offset_pred.squeeze(0).cpu().numpy()
230
+ activation_pred = activation_pred.squeeze(0).cpu().numpy()
231
+ frame_pred = frame_pred.squeeze(0).cpu().numpy()
232
+ velocity_pred = velocity_pred.squeeze(0).cpu().numpy()
233
+
234
+ return onset_pred, offset_pred, activation_pred, frame_pred, velocity_pred
235
+
236
+ def transcribe_to_midi(
237
+ self,
238
+ audio: Union[np.ndarray, torch.Tensor, str, Path],
239
+ output_path: Union[str, Path],
240
+ onset_threshold: float = 0.5,
241
+ frame_threshold: float = 0.5,
242
+ ) -> None:
243
+ """
244
+ Transcribe audio to MIDI file.
245
+
246
+ Args:
247
+ audio: Audio waveform, numpy array, torch tensor, or path to audio file
248
+ output_path: Path to save MIDI file
249
+ onset_threshold: Threshold for onset detection (default: 0.5)
250
+ frame_threshold: Threshold for frame detection (default: 0.5)
251
+ """
252
+ # Load audio from file if path is provided
253
+ if isinstance(audio, (str, Path)):
254
+ audio, sr = sf.read(audio, dtype="float32")
255
+ if sr != SAMPLE_RATE:
256
+ raise ValueError(
257
+ f"Audio must be {SAMPLE_RATE}Hz, got {sr}Hz. "
258
+ f"Please resample to {SAMPLE_RATE}Hz first."
259
+ )
260
+
261
+ # Get predictions
262
+ onset_pred, offset_pred, _, frame_pred, velocity_pred = self.transcribe(
263
+ audio, onset_threshold, frame_threshold
264
+ )
265
+
266
+ # Default instrument mapping (piano)
267
+ inst_mapping = {0: 0} # instrument 0 -> MIDI program 0 (Acoustic Grand Piano)
268
+
269
+ # Convert predictions to MIDI
270
+ frames2midi(
271
+ str(output_path),
272
+ onset_pred,
273
+ frame_pred,
274
+ velocity_pred,
275
+ onset_threshold=onset_threshold,
276
+ frame_threshold=frame_threshold,
277
+ scaling=HOP_LENGTH / SAMPLE_RATE,
278
+ inst_mapping=inst_mapping,
279
+ )
280
+
281
+ def to_legacy(self) -> OnsetsAndFrames:
282
+ """
283
+ Convert this HuggingFace-compatible model to a legacy OnsetsAndFrames instance.
284
+
285
+ This is useful for:
286
+ - Fine-tuning models downloaded from HuggingFace Hub using existing training code
287
+ - Using HF models with existing inference scripts that expect OnsetsAndFrames
288
+
289
+ The legacy model will use the global melspectrogram from mel.py instead of
290
+ the instance-specific one in this model.
291
+
292
+ Returns:
293
+ OnsetsAndFrames instance with copied weights
294
+ """
295
+ # Create legacy model with same architecture
296
+ legacy_model = OnsetsAndFrames(
297
+ input_features=self.config['n_mels'],
298
+ output_features=self.config['n_keys'],
299
+ model_complexity=self.config['model_complexity'],
300
+ onset_complexity=self.config['onset_complexity'],
301
+ n_instruments=self.config['n_instruments']
302
+ )
303
+
304
+ # Get the state dict and filter out melspectrogram keys
305
+ state_dict = self.state_dict()
306
+ legacy_state_dict = {k: v for k, v in state_dict.items() if not k.startswith('melspectrogram.')}
307
+
308
+ # Copy state dict (only model weights, not mel spectrogram)
309
+ # The legacy model will use the global melspectrogram
310
+ legacy_model.load_state_dict(legacy_state_dict)
311
+
312
+ return legacy_model
313
+
314
+ @classmethod
315
+ def from_legacy_checkpoint(
316
+ cls,
317
+ checkpoint_path: Union[str, Path],
318
+ **kwargs
319
+ ) -> "CountEMModel":
320
+ """
321
+ Load a model from a legacy checkpoint (saved with torch.save(model)).
322
+
323
+ This is useful for converting old checkpoints to the new HF-compatible format.
324
+
325
+ Args:
326
+ checkpoint_path: Path to the legacy .pt checkpoint file
327
+ **kwargs: Additional arguments for model initialization
328
+
329
+ Returns:
330
+ CountEMModel instance with loaded weights
331
+ """
332
+ # Load the legacy checkpoint
333
+ legacy_model = torch.load(checkpoint_path, map_location="cpu")
334
+
335
+ # Extract configuration from the loaded model
336
+ # Infer model_complexity from the model structure
337
+ # ConvStack.cnn[0] is the first Conv2d layer with out_channels = model_size // 16
338
+ first_conv_channels = legacy_model.offset_stack[0].cnn[0].out_channels
339
+ model_size = first_conv_channels * 16
340
+ model_complexity = model_size // 16
341
+
342
+ # Infer onset_complexity
343
+ onset_first_conv_channels = legacy_model.onset_stack[0].cnn[0].out_channels
344
+ onset_model_size = onset_first_conv_channels * 16
345
+ onset_complexity = onset_model_size / model_size
346
+
347
+ # Infer n_instruments from output layer
348
+ # onset_stack[2] is the Linear layer
349
+ onset_out_features = legacy_model.onset_stack[2].out_features
350
+ n_keys = MAX_MIDI - MIN_MIDI + 1
351
+ n_instruments = onset_out_features // n_keys
352
+
353
+ # Create new model with the same configuration
354
+ model = cls(
355
+ model_complexity=model_complexity,
356
+ onset_complexity=onset_complexity,
357
+ n_instruments=n_instruments,
358
+ **kwargs
359
+ )
360
+
361
+ # Copy the state dict (strict=False because new model has melspectrogram submodule)
362
+ model.load_state_dict(legacy_model.state_dict(), strict=False)
363
+
364
+ return model
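A short illustrative sketch of converting a legacy checkpoint with `from_legacy_checkpoint` and saving it in the Hub format (the paths are placeholders; `save_pretrained` is provided by `PyTorchModelHubMixin`):

```python
# Illustrative conversion flow; paths are placeholders, not from the commit.
from onsets_and_frames.hf_model import CountEMModel

model = CountEMModel.from_legacy_checkpoint("checkpoints/legacy_model.pt")
model.save_pretrained("countem_converted")   # HF-format directory (PyTorchModelHubMixin)
legacy = model.to_legacy()                   # back to a plain OnsetsAndFrames instance
```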
onsets_and_frames/lstm.py ADDED
@@ -0,0 +1,96 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ class BiLSTM(nn.Module):
6
+ inference_chunk_length = 512
7
+
8
+ def __init__(self, input_features, recurrent_features, use_gru=False, dropout=0.0):
9
+ super().__init__()
10
+ self.rnn = (nn.LSTM if not use_gru else nn.GRU)(
11
+ input_features,
12
+ recurrent_features,
13
+ batch_first=True,
14
+ bidirectional=True,
15
+ dropout=dropout,
16
+ )
17
+
18
+ def forward(self, x):
19
+ if self.training:
20
+ return self.rnn(x)[0]
21
+ else:
22
+ # evaluation mode: support for longer sequences that do not fit in memory
23
+ batch_size, sequence_length, input_features = x.shape
24
+ hidden_size = self.rnn.hidden_size
25
+ num_directions = 2 if self.rnn.bidirectional else 1
26
+
27
+ h = torch.zeros(num_directions, batch_size, hidden_size, device=x.device)
28
+ c = torch.zeros(num_directions, batch_size, hidden_size, device=x.device)
29
+ output = torch.zeros(
30
+ batch_size,
31
+ sequence_length,
32
+ num_directions * hidden_size,
33
+ device=x.device,
34
+ )
35
+
36
+ # forward direction
37
+ slices = range(0, sequence_length, self.inference_chunk_length)
38
+ for start in slices:
39
+ end = start + self.inference_chunk_length
40
+ output[:, start:end, :], (h, c) = self.rnn(x[:, start:end, :], (h, c))
41
+
42
+ # reverse direction
43
+ if self.rnn.bidirectional:
44
+ h.zero_()
45
+ c.zero_()
46
+
47
+ for start in reversed(slices):
48
+ end = start + self.inference_chunk_length
49
+ result, (h, c) = self.rnn(x[:, start:end, :], (h, c))
50
+ output[:, start:end, hidden_size:] = result[:, :, hidden_size:]
51
+
52
+ return output
53
+
54
+
55
+ class UniLSTM(nn.Module):
56
+ inference_chunk_length = 512
57
+
58
+ def __init__(self, input_features, recurrent_features):
59
+ super().__init__()
60
+ self.rnn = nn.LSTM(input_features, recurrent_features, batch_first=True)
61
+
62
+ def forward(self, x):
63
+ if self.training:
64
+ return self.rnn(x)[0]
65
+ else:
66
+ # evaluation mode: support for longer sequences that do not fit in memory
67
+ batch_size, sequence_length, input_features = x.shape
68
+ hidden_size = self.rnn.hidden_size
69
+ num_directions = 2 if self.rnn.bidirectional else 1
70
+
71
+ h = torch.zeros(num_directions, batch_size, hidden_size, device=x.device)
72
+ c = torch.zeros(num_directions, batch_size, hidden_size, device=x.device)
73
+ output = torch.zeros(
74
+ batch_size,
75
+ sequence_length,
76
+ num_directions * hidden_size,
77
+ device=x.device,
78
+ )
79
+
80
+ # forward direction
81
+ slices = range(0, sequence_length, self.inference_chunk_length)
82
+ for start in slices:
83
+ end = start + self.inference_chunk_length
84
+ output[:, start:end, :], (h, c) = self.rnn(x[:, start:end, :], (h, c))
85
+
86
+ # reverse direction
87
+ if self.rnn.bidirectional:
88
+ h.zero_()
89
+ c.zero_()
90
+
91
+ for start in reversed(slices):
92
+ end = start + self.inference_chunk_length
93
+ result, (h, c) = self.rnn(x[:, start:end, :], (h, c))
94
+ output[:, start:end, hidden_size:] = result[:, :, hidden_size:]
95
+
96
+ return output
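A brief sketch of exercising the chunked evaluation path above (shapes are illustrative, not part of the commit):

```python
# Illustrative: the chunked code path only runs in eval mode.
import torch
from onsets_and_frames.lstm import BiLSTM

rnn = BiLSTM(input_features=229, recurrent_features=128)
rnn.eval()
with torch.no_grad():
    x = torch.randn(1, 2048, 229)  # longer than inference_chunk_length (512)
    out = rnn(x)                   # shape (1, 2048, 256): 2 * recurrent_features
```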
onsets_and_frames/mel.py ADDED
@@ -0,0 +1,136 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from librosa.filters import mel
5
+ from librosa.util import pad_center
6
+ from scipy.signal import get_window
7
+ from torch.autograd import Variable
8
+
9
+ from onsets_and_frames.constants import (
10
+ DEFAULT_DEVICE,
11
+ HOP_LENGTH,
12
+ MEL_FMAX,
13
+ MEL_FMIN,
14
+ N_MELS,
15
+ SAMPLE_RATE,
16
+ WINDOW_LENGTH,
17
+ )
18
+
19
+
20
+ class STFT(torch.nn.Module):
21
+ """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
22
+
23
+ def __init__(self, filter_length, hop_length, win_length=None, window="hann"):
24
+ super(STFT, self).__init__()
25
+ if win_length is None:
26
+ win_length = filter_length
27
+
28
+ self.filter_length = filter_length
29
+ self.hop_length = hop_length
30
+ self.win_length = win_length
31
+ self.window = window
32
+ self.forward_transform = None
33
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
34
+
35
+ cutoff = int((self.filter_length / 2 + 1))
36
+ fourier_basis = np.vstack(
37
+ [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
38
+ )
39
+
40
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
41
+
42
+ if window is not None:
43
+ assert filter_length >= win_length
44
+ # get window and zero center pad it to filter_length
45
+ fft_window = get_window(window, win_length, fftbins=True)
46
+ fft_window = pad_center(fft_window, size=filter_length)
47
+ fft_window = torch.from_numpy(fft_window).float()
48
+
49
+ # window the bases
50
+ forward_basis *= fft_window
51
+
52
+ self.register_buffer("forward_basis", forward_basis.float())
53
+
54
+ def forward(self, input_data):
55
+ num_batches = input_data.size(0)
56
+ num_samples = input_data.size(1)
57
+
58
+ # similar to librosa, reflect-pad the input
59
+ input_data = input_data.view(num_batches, 1, num_samples)
60
+ # print('inp before', input_data.shape)
61
+ input_data = F.pad(
62
+ input_data.unsqueeze(1),
63
+ (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
64
+ mode="reflect",
65
+ )
66
+ input_data = input_data.squeeze(1)
67
+ # print('inp after', input_data.shape)
68
+
69
+ forward_transform = F.conv1d(
70
+ input_data,
71
+ Variable(self.forward_basis, requires_grad=False),
72
+ stride=self.hop_length,
73
+ padding=0,
74
+ )
75
+ # print('fwd', forward_transform.shape)
76
+
77
+ cutoff = int((self.filter_length / 2) + 1)
78
+ real_part = forward_transform[:, :cutoff, :]
79
+ imag_part = forward_transform[:, cutoff:, :]
80
+
81
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
82
+ phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
83
+
84
+ return magnitude, phase
85
+
86
+
87
+ class MelSpectrogram(torch.nn.Module):
88
+ def __init__(
89
+ self,
90
+ n_mels,
91
+ sample_rate,
92
+ filter_length,
93
+ hop_length,
94
+ win_length=None,
95
+ mel_fmin=0.0,
96
+ mel_fmax=None,
97
+ ):
98
+ super(MelSpectrogram, self).__init__()
99
+ self.stft = STFT(filter_length, hop_length, win_length)
100
+
101
+ mel_basis = mel(
102
+ sr=sample_rate,
103
+ n_fft=filter_length,
104
+ n_mels=n_mels,
105
+ fmin=mel_fmin,
106
+ fmax=mel_fmax,
107
+ htk=True,
108
+ )
109
+ mel_basis = torch.from_numpy(mel_basis).float()
110
+ self.register_buffer("mel_basis", mel_basis)
111
+
112
+ def forward(self, y):
113
+ """Computes mel-spectrograms from a batch of waves
114
+ PARAMS
115
+ ------
116
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
117
+ RETURNS
118
+ -------
119
+ mel_output: torch.FloatTensor of shape (B, T, n_mels)
120
+ """
121
+ assert torch.min(y.data) >= -1
122
+ assert torch.max(y.data) <= 1
123
+
124
+ magnitudes, phases = self.stft(y)
125
+ magnitudes = magnitudes.data
126
+
127
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
128
+ mel_output = torch.log(torch.clamp(mel_output, min=1e-5))
129
+ return mel_output
130
+
131
+
132
+ # the default melspectrogram converter across the project
133
+ melspectrogram = MelSpectrogram(
134
+ N_MELS, SAMPLE_RATE, WINDOW_LENGTH, HOP_LENGTH, mel_fmin=MEL_FMIN, mel_fmax=MEL_FMAX
135
+ )
136
+ melspectrogram.to(DEFAULT_DEVICE)
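A minimal sketch of calling the module-level `melspectrogram` above on a batch of waveforms scaled to [-1, 1] (illustrative, not part of the commit):

```python
# Illustrative: audio must already be at SAMPLE_RATE and scaled to [-1, 1].
import torch
from onsets_and_frames.constants import DEFAULT_DEVICE, SAMPLE_RATE
from onsets_and_frames.mel import melspectrogram

audio = torch.zeros(1, SAMPLE_RATE * 2, device=DEFAULT_DEVICE)  # two seconds of silence
mel = melspectrogram(audio)       # (1, N_MELS, n_frames)
mel = mel.transpose(-1, -2)       # (1, n_frames, N_MELS), the layout the transcriber consumes
```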
onsets_and_frames/midi_utils.py ADDED
@@ -0,0 +1,655 @@
1
+ import os
2
+ from datetime import datetime
3
+
4
+ import mido
5
+ import numpy as np
6
+ import torch
7
+ from mido import Message, MidiFile, MidiTrack
8
+
9
+ from onsets_and_frames.constants import (
10
+ DRUM_CHANNEL,
11
+ HOP_LENGTH,
12
+ HOPS_IN_OFFSET,
13
+ HOPS_IN_ONSET,
14
+ MAX_MIDI,
15
+ MIN_MIDI,
16
+ N_KEYS,
17
+ SAMPLE_RATE,
18
+ )
19
+
20
+ from .utils import max_inst
21
+
22
+
23
+ def midi_to_hz(m):
24
+ return 440.0 * (2.0 ** ((m - 69.0) / 12.0))
25
+
26
+
27
+ def hz_to_midi(h):
28
+ return 12.0 * np.log2(h / (440.0)) + 69.0
29
+
30
+
31
+ def midi_to_frames(midi, instruments, conversion_map=None):
32
+ n_keys = MAX_MIDI - MIN_MIDI + 1
33
+ midi_length = int((max(midi[:, 1]) + 1) * SAMPLE_RATE)
34
+ n_steps = (midi_length - 1) // HOP_LENGTH + 1
35
+ n_channels = len(instruments) + 1
36
+ label = torch.zeros(n_steps, n_keys * n_channels, dtype=torch.uint8)
37
+ for onset, offset, note, vel, instrument in midi:
38
+ f = int(note) - MIN_MIDI
39
+ if 104 > instrument > 87 or instrument > 111:
40
+ continue
41
+ if f >= n_keys or f < 0:
42
+ continue
43
+ assert 0 < vel < 128
44
+ instrument = int(instrument)
45
+ if conversion_map is not None:
46
+ if instrument not in conversion_map:
47
+ continue
48
+ instrument = conversion_map[instrument]
49
+ left = int(round(onset * SAMPLE_RATE / HOP_LENGTH))
50
+ onset_right = min(n_steps, left + HOPS_IN_ONSET)
51
+ frame_right = int(round(offset * SAMPLE_RATE / HOP_LENGTH))
52
+ frame_right = min(n_steps, frame_right)
53
+ offset_right = min(n_steps, frame_right + HOPS_IN_OFFSET)
54
+ if int(instrument) not in instruments:
55
+ continue
56
+ chan = instruments.index(int(instrument))
57
+ label[left:onset_right, n_keys * chan + f] = 3
58
+ label[onset_right:frame_right, n_keys * chan + f] = 2
59
+ label[frame_right:offset_right, n_keys * chan + f] = 1
60
+
61
+ inv_chan = len(instruments)
62
+ label[left:onset_right, n_keys * inv_chan + f] = 3
63
+ label[onset_right:frame_right, n_keys * inv_chan + f] = 2
64
+ label[frame_right:offset_right, n_keys * inv_chan + f] = 1
65
+
66
+ return label
67
+
68
+
69
+ """
70
+ Convert piano roll to list of notes, pitch only.
71
+ """
72
+
73
+
74
+ def extract_notes_np_pitch(
75
+ onsets, frames, velocity, onset_threshold=0.5, frame_threshold=0.5
76
+ ):
77
+ onsets = (onsets > onset_threshold).astype(np.uint8)
78
+ frames = (frames > frame_threshold).astype(np.uint8)
79
+ onset_diff = (
80
+ np.concatenate([onsets[:1, :], onsets[1:, :] - onsets[:-1, :]], axis=0) == 1
81
+ )
82
+
83
+ pitches = []
84
+ intervals = []
85
+ velocities = []
86
+
87
+ for nonzero in np.transpose(np.nonzero(onset_diff)):
88
+ frame = nonzero[0].item()
89
+ pitch = nonzero[1].item()
90
+
91
+ onset = frame
92
+ offset = frame
93
+ velocity_samples = []
94
+
95
+ while onsets[offset, pitch] or frames[offset, pitch]:
96
+ if onsets[offset, pitch]:
97
+ velocity_samples.append(velocity[offset, pitch])
98
+ offset += 1
99
+ if offset == onsets.shape[0]:
100
+ break
101
+
102
+ if offset > onset:
103
+ pitches.append(pitch)
104
+ intervals.append([onset, offset])
105
+ velocities.append(
106
+ np.mean(velocity_samples) if len(velocity_samples) > 0 else 0
107
+ )
108
+ return np.array(pitches), np.array(intervals), np.array(velocities)
109
+
110
+
111
+ def extract_notes_np_rescaled(
112
+ onsets, frames, velocity, onset_threshold=0.5, frame_threshold=0.5
113
+ ):
114
+ pitches, intervals, velocities, instruments = extract_notes_np(
115
+ onsets, frames, velocity, onset_threshold, frame_threshold
116
+ )
117
+ pitches += MIN_MIDI
118
+ scaling = HOP_LENGTH / SAMPLE_RATE
119
+ intervals = (intervals * scaling).reshape(-1, 2)
120
+ return pitches, intervals, velocities, instruments
121
+
122
+
123
+ """
124
+ Convert piano roll to list of notes, pitch and instrument.
125
+ """
126
+
127
+
128
+ def extract_notes_np(
129
+ onsets,
130
+ frames,
131
+ velocity,
132
+ onset_threshold=0.5,
133
+ frame_threshold=0.5,
134
+ onset_threshold_vec=None,
135
+ ):
136
+ if onset_threshold_vec is not None:
137
+ onsets = (onsets > np.array(onset_threshold_vec)).astype(np.uint8)
138
+ else:
139
+ onsets = (onsets > onset_threshold).astype(np.uint8)
140
+
141
+ frames = (frames > frame_threshold).astype(np.uint8)
142
+ onset_diff = (
143
+ np.concatenate([onsets[:1, :], onsets[1:, :] - onsets[:-1, :]], axis=0) == 1
144
+ )
145
+
146
+ if onsets.shape[-1] != frames.shape[-1]:
147
+ num_instruments = onsets.shape[1] / frames.shape[1]
148
+ assert num_instruments.is_integer()
149
+ num_instruments = int(num_instruments)
150
+ frames = np.tile(frames, (1, num_instruments))
151
+
152
+ pitches = []
153
+ intervals = []
154
+ velocities = []
155
+ instruments = []
156
+
157
+ for nonzero in np.transpose(np.nonzero(onset_diff)):
158
+ frame = nonzero[0].item()
159
+ pitch = nonzero[1].item()
160
+
161
+ onset = frame
162
+ offset = frame
163
+ velocity_samples = []
164
+
165
+ while onsets[offset, pitch] or frames[offset, pitch]:
166
+ if onsets[offset, pitch]:
167
+ velocity_samples.append(velocity[offset, pitch])
168
+ offset += 1
169
+ if offset == onsets.shape[0]:
170
+ break
171
+
172
+ if offset > onset:
173
+ pitch, instrument = pitch % N_KEYS, pitch // N_KEYS
174
+
175
+ pitches.append(pitch)
176
+ intervals.append([onset, offset])
177
+ velocities.append(
178
+ np.mean(velocity_samples) if len(velocity_samples) > 0 else 0
179
+ )
180
+ instruments.append(instrument)
181
+ return (
182
+ np.array(pitches),
183
+ np.array(intervals),
184
+ np.array(velocities),
185
+ np.array(instruments),
186
+ )
187
+
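The instrument-aware variant decodes each onset column c as (pitch = c % N_KEYS, instrument = c // N_KEYS); a toy sketch follows (frames may be pitch-only, as they are tiled across instruments inside the function):

import numpy as np

from onsets_and_frames.constants import N_KEYS
from onsets_and_frames.midi_utils import extract_notes_np

T = 50
onsets = np.zeros((T, 2 * N_KEYS), dtype=np.float32)
frames = np.zeros((T, N_KEYS), dtype=np.float32)
velocity = np.full((T, 2 * N_KEYS), 0.6, dtype=np.float32)
onsets[5, N_KEYS + 30] = 1.0   # key 30 on the second instrument channel
frames[5:15, 30] = 1.0
p, i, v, inst = extract_notes_np(onsets, frames, velocity)
print(p, i.tolist(), inst)     # [30] [[5, 15]] [1]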
188
+
189
+ def append_track_multi(file, pitches, intervals, velocities, ins, single_ins=False):
190
+ track = MidiTrack()
191
+ file.tracks.append(track)
192
+ chan = len(file.tracks) - 1
193
+ if chan >= DRUM_CHANNEL:
194
+ chan += 1
195
+ if chan > 15:
196
+ print(f"invalid chan {chan}")
197
+ chan = 15
198
+ track.append(
199
+ Message(
200
+ "program_change", channel=chan, program=ins if not single_ins else 0, time=0
201
+ )
202
+ )
203
+
204
+ ticks_per_second = file.ticks_per_beat * 2.0
205
+
206
+ events = []
207
+ for i in range(len(pitches)):
208
+ events.append(
209
+ dict(
210
+ type="on",
211
+ pitch=pitches[i],
212
+ time=intervals[i][0],
213
+ velocity=velocities[i],
214
+ )
215
+ )
216
+ events.append(
217
+ dict(
218
+ type="off",
219
+ pitch=pitches[i],
220
+ time=intervals[i][1],
221
+ velocity=velocities[i],
222
+ )
223
+ )
224
+ events.sort(key=lambda row: row["time"])
225
+
226
+ last_tick = 0
227
+ for event in events:
228
+ current_tick = int(event["time"] * ticks_per_second)
229
+ velocity = int(event["velocity"] * 127)
230
+ if velocity > 127:
231
+ velocity = 127
232
+ pitch = int(round(hz_to_midi(event["pitch"])))
233
+ track.append(
234
+ Message(
235
+ "note_" + event["type"],
236
+ channel=chan,
237
+ note=pitch,
238
+ velocity=velocity,
239
+ time=current_tick - last_tick,
240
+ )
241
+ )
242
+ # try:
243
+ # track.append(Message('note_' + event['type'], channel=chan, note=pitch, velocity=velocity, time=current_tick - last_tick))
244
+ # except Exception as e:
245
+ # print('Err Message', 'note_' + event['type'], pitch, velocity, current_tick - last_tick)
246
+ # track.append(Message('note_' + event['type'], channel=chan, note=pitch, velocity=max(0, velocity), time=current_tick - last_tick))
247
+ # if velocity >= 0:
248
+ # raise e
249
+ last_tick = current_tick
250
+
251
+
252
+ def append_track(file, pitches, intervals, velocities):
253
+ track = MidiTrack()
254
+ file.tracks.append(track)
255
+ ticks_per_second = file.ticks_per_beat * 2.0
256
+
257
+ events = []
258
+ for i in range(len(pitches)):
259
+ events.append(
260
+ dict(
261
+ type="on",
262
+ pitch=pitches[i],
263
+ time=intervals[i][0],
264
+ velocity=velocities[i],
265
+ )
266
+ )
267
+ events.append(
268
+ dict(
269
+ type="off",
270
+ pitch=pitches[i],
271
+ time=intervals[i][1],
272
+ velocity=velocities[i],
273
+ )
274
+ )
275
+ events.sort(key=lambda row: row["time"])
276
+
277
+ last_tick = 0
278
+ for event in events:
279
+ current_tick = int(event["time"] * ticks_per_second)
280
+ velocity = int(event["velocity"] * 127)
281
+ if velocity > 127:
282
+ velocity = 127
283
+ pitch = int(round(hz_to_midi(event["pitch"])))
284
+ try:
285
+ track.append(
286
+ Message(
287
+ "note_" + event["type"],
288
+ note=pitch,
289
+ velocity=velocity,
290
+ time=current_tick - last_tick,
291
+ )
292
+ )
293
+ except Exception as e:
294
+ print(
295
+ "Err Message",
296
+ "note_" + event["type"],
297
+ pitch,
298
+ velocity,
299
+ current_tick - last_tick,
300
+ )
301
+ track.append(
302
+ Message(
303
+ "note_" + event["type"],
304
+ note=pitch,
305
+ velocity=max(0, velocity),
306
+ time=current_tick - last_tick,
307
+ )
308
+ )
309
+ if velocity >= 0:
310
+ raise e
311
+ last_tick = current_tick
312
+
313
+
314
+ def save_midi(path, pitches, intervals, velocities, insts=None):
315
+ """
316
+ Save extracted notes as a MIDI file
317
+ Parameters
318
+ ----------
319
+ path: the path to save the MIDI file
320
+ pitches: np.ndarray of bin_indices
321
+ intervals: list of (onset_index, offset_index)
322
+ velocities: list of velocity values
323
+ """
324
+ file = MidiFile()
325
+ if isinstance(pitches, list):
326
+ for p, i, v, ins in zip(pitches, intervals, velocities, insts):
327
+ append_track_multi(file, p, i, v, ins)
328
+ else:
329
+ append_track(file, pitches, intervals, velocities)
330
+ file.save(path)
331
+
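A minimal sketch of writing a single-track file with save_midi; pitches are given in Hz and velocities in [0, 1], since append_track converts with hz_to_midi and scales by 127. The output path is a hypothetical example.

import numpy as np

from onsets_and_frames.midi_utils import midi_to_hz, save_midi

pitches = np.array([midi_to_hz(60), midi_to_hz(64)])    # C4, E4 as frequencies
intervals = np.array([[0.0, 0.5], [0.5, 1.0]])          # onset/offset times in seconds
velocities = np.array([0.5, 0.5])
save_midi("toy.mid", pitches, intervals, velocities)    # "toy.mid" is a hypothetical output path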
332
+
333
+ def frames2midi(
334
+ save_path,
335
+ onsets,
336
+ frames,
337
+ vels,
338
+ onset_threshold=0.5,
339
+ frame_threshold=0.5,
340
+ scaling=HOP_LENGTH / SAMPLE_RATE,
341
+ inst_mapping=None,
342
+ onset_threshold_vec=None,
343
+ ):
344
+ p_est, i_est, v_est, inst_est = extract_notes_np(
345
+ onsets,
346
+ frames,
347
+ vels,
348
+ onset_threshold,
349
+ frame_threshold,
350
+ onset_threshold_vec=onset_threshold_vec,
351
+ )
352
+ i_est = (i_est * scaling).reshape(-1, 2)
353
+
354
+ p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])
355
+
356
+ inst_set = set(inst_est)
357
+ inst_set = sorted(list(inst_set))
358
+
359
+ p_est_lst = {}
360
+ i_est_lst = {}
361
+ v_est_lst = {}
362
+ assert len(p_est) == len(i_est) == len(v_est) == len(inst_est)
363
+ for p, i, v, ins in zip(p_est, i_est, v_est, inst_est):
364
+ if ins in p_est_lst:
365
+ p_est_lst[ins].append(p)
366
+ else:
367
+ p_est_lst[ins] = [p]
368
+ if ins in i_est_lst:
369
+ i_est_lst[ins].append(i)
370
+ else:
371
+ i_est_lst[ins] = [i]
372
+ if ins in v_est_lst:
373
+ v_est_lst[ins].append(v)
374
+ else:
375
+ v_est_lst[ins] = [v]
376
+ for elem in [p_est_lst, i_est_lst, v_est_lst]:
377
+ for k, v in elem.items():
378
+ elem[k] = np.array(v)
379
+ inst_set = [e for e in inst_set if e in p_est_lst]
380
+ # inst_set = [INSTRUMENT_MAPPING[e] for e in inst_set if e in p_est_lst]
381
+ p_est_lst = [p_est_lst[ins] for ins in inst_set if ins in p_est_lst]
382
+ i_est_lst = [i_est_lst[ins] for ins in inst_set if ins in i_est_lst]
383
+ v_est_lst = [v_est_lst[ins] for ins in inst_set if ins in v_est_lst]
384
+ assert len(p_est_lst) == len(i_est_lst) == len(v_est_lst) == len(inst_set)
385
+ inst_set = [inst_mapping[e] for e in inst_set]
386
+ save_midi(save_path, p_est_lst, i_est_lst, v_est_lst, inst_set)
387
+
388
+
389
+ def frames2midi_pitch(
390
+ save_path,
391
+ onsets,
392
+ frames,
393
+ vels,
394
+ onset_threshold=0.5,
395
+ frame_threshold=0.5,
396
+ scaling=HOP_LENGTH / SAMPLE_RATE,
397
+ ):
398
+ p_est, i_est, v_est = extract_notes_np_pitch(
399
+ onsets, frames, vels, onset_threshold, frame_threshold
400
+ )
401
+ i_est = (i_est * scaling).reshape(-1, 2)
402
+ p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])
403
+ print("Saving midi in", save_path)
404
+ save_midi(save_path, p_est, i_est, v_est)
405
+
406
+
407
+ def parse_midi_multi(path, force_instrument=None):
408
+ """open midi file and return np.array of (onset, offset, note, velocity, instrument) rows"""
409
+ try:
410
+ midi = mido.MidiFile(path)
411
+ except Exception:
412
+ print("could not open midi", path)
413
+ return
414
+
415
+ time = 0
416
+
417
+ events = []
418
+
419
+ control_changes = []
420
+ program_changes = []
421
+
422
+ sustain = {}
423
+
424
+ all_channels = set()
425
+
426
+ instruments = {} # mapping of channel: instrument
427
+
428
+ for message in midi:
429
+ time += message.time
430
+ if hasattr(message, "channel"):
431
+ if message.channel == DRUM_CHANNEL:
432
+ continue
433
+
434
+ if (
435
+ message.type == "control_change"
436
+ and message.control == 64
437
+ and (message.value >= 64) != sustain.get(message.channel, False)
438
+ ):
439
+ sustain[message.channel] = message.value >= 64
440
+ event_type = "sustain_on" if sustain[message.channel] else "sustain_off"
441
+ event = dict(
442
+ index=len(events), time=time, type=event_type, note=None, velocity=0
443
+ )
444
+ event["channel"] = message.channel
445
+ event["sustain"] = sustain[message.channel]
446
+ events.append(event)
447
+
448
+ if message.type == "control_change" and message.control != 64:
449
+ control_changes.append(
450
+ (time, message.control, message.value, message.channel)
451
+ )
452
+
453
+ if message.type == "program_change":
454
+ program_changes.append((time, message.program, message.channel))
455
+ instruments[message.channel] = instruments.get(message.channel, []) + [
456
+ (message.program, time)
457
+ ]
458
+
459
+ if "note" in message.type:
460
+ # MIDI offsets can be either 'note_off' events or 'note_on' with zero velocity
461
+ velocity = message.velocity if message.type == "note_on" else 0
462
+ event = dict(
463
+ index=len(events),
464
+ time=time,
465
+ type="note",
466
+ note=message.note,
467
+ velocity=velocity,
468
+ sustain=sustain.get(message.channel, False),
469
+ )
470
+ event["channel"] = message.channel
471
+ events.append(event)
472
+
473
+ if hasattr(message, "channel"):
474
+ all_channels.add(message.channel)
475
+
476
+ if len(instruments) == 0:
477
+ instruments = {c: [(0, 0)] for c in all_channels}
478
+ if len(all_channels) > len(instruments):
479
+ for e in all_channels - set(instruments.keys()):
480
+ instruments[e] = [(0, 0)]
481
+
482
+ if force_instrument is not None:
483
+ instruments = {c: [(force_instrument, 0)] for c in all_channels}
484
+
485
+ this_instruments = set()
486
+ for v in instruments.values():
487
+ this_instruments = this_instruments.union(set(x[0] for x in v))
488
+
489
+ notes = []
490
+ for i, onset in enumerate(events):
491
+ if onset["velocity"] == 0:
492
+ continue
493
+ offset = next(
494
+ n
495
+ for n in events[i + 1 :]
496
+ if (n["note"] == onset["note"] and n["channel"] == onset["channel"])
497
+ or n is events[-1]
498
+ )
499
+ if "sustain" not in offset:
500
+ print("offset without sustain", offset)
501
+ if offset["sustain"] and offset is not events[-1]:
502
+ # if the sustain pedal is active at offset, find when the sustain ends
503
+ offset = next(
504
+ n
505
+ for n in events[offset["index"] + 1 :]
506
+ if (n["type"] == "sustain_off" and n["channel"] == onset["channel"])
507
+ or n is events[-1]
508
+ )
509
+ for k, v in instruments.items():
510
+ if len(set(v)) == 1 and len(v) > 1:
511
+ instruments[k] = list(set(v))
512
+ for k, v in instruments.items():
513
+ instruments[k] = sorted(v, key=lambda x: x[1])
514
+ if len(instruments[onset["channel"]]) == 1:
515
+ instrument = instruments[onset["channel"]][0][0]
516
+ else:
517
+ ind = 0
518
+ while (
519
+ ind < len(instruments[onset["channel"]])
520
+ and onset["time"] >= instruments[onset["channel"]][ind][1]
521
+ ):
522
+ ind += 1
523
+ if ind > 0:
524
+ ind -= 1
525
+ instrument = instruments[onset["channel"]][ind][0]
526
+ if onset["channel"] == DRUM_CHANNEL:
527
+ print("skipping drum note")
528
+ continue
529
+ note = (
530
+ onset["time"],
531
+ offset["time"],
532
+ onset["note"],
533
+ onset["velocity"],
534
+ instrument,
535
+ )
536
+ notes.append(note)
537
+
538
+ res = np.array(notes)
539
+ return res
540
+
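Usage is straightforward; a sketch with a hypothetical path (the function returns None if the file cannot be opened):

from onsets_and_frames.midi_utils import parse_midi_multi

notes = parse_midi_multi("toy.mid")  # hypothetical path
if notes is not None and len(notes):
    print(notes[0])  # [onset_sec, offset_sec, pitch, velocity, program]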
541
+
542
+ def save_midi_alignments_and_predictions(
543
+ save_path,
544
+ data_path,
545
+ inst_mapping,
546
+ aligned_onsets,
547
+ aligned_frames,
548
+ onset_pred_np,
549
+ frame_pred_np,
550
+ prefix="",
551
+ use_time=True,
552
+ group=None,
553
+ ):
554
+ inst_only = len(inst_mapping) * N_KEYS
555
+ time_now = datetime.now().strftime("%y%m%d-%H%M%S") if use_time else ""
556
+ if len(prefix) > 0:
557
+ prefix = "_{}".format(prefix)
558
+
559
+ # Save the aligned label. If training on a small dataset, or on a single performance in order to label it and
560
+ # later add it to a larger dataset, it is recommended to use this MIDI as the label.
561
+ frames2midi(
562
+ save_path
563
+ + os.sep
564
+ + data_path.replace(".flac", "").split(os.sep)[-1]
565
+ + prefix
566
+ + "_alignment_"
567
+ + time_now
568
+ + ".mid",
569
+ aligned_onsets[:, :inst_only],
570
+ aligned_frames[:, :inst_only],
571
+ 64.0 * aligned_onsets[:, :inst_only],
572
+ inst_mapping=inst_mapping,
573
+ )
574
+ return  # NOTE: early return; only the aligned-label MIDI above is written, the prediction MIDIs below are skipped
575
+
576
+ # # Aligned label, pitch-only, on the piano.
577
+ # frames2midi_pitch(save_path + os.sep + data_path.replace('.flac', '').split(os.sep)[-1] + prefix + '_alignment_pitch_' + time_now + '.mid',
578
+ # aligned_onsets[:, -N_KEYS:], aligned_frames[:, -N_KEYS:],
579
+ # 64. * aligned_onsets[:, -N_KEYS:])
580
+
581
+ predicted_onsets = onset_pred_np >= 0.5
582
+ predicted_frames = frame_pred_np >= 0.5
583
+
584
+ # # Raw pitch with instrument prediction - will probably have lower recall, depending on the model's strength.
585
+ # frames2midi(save_path + os.sep + data_path.replace('.flac', '').split(os.sep)[-1] + prefix + '_pred_' + time_now + '.mid',
586
+ # predicted_onsets[:, : inst_only], predicted_frames[:, : inst_only],
587
+ # 64. * predicted_onsets[:, : inst_only],
588
+ # inst_mapping=inst_mapping)
589
+
590
+ # Pitch prediction played on the piano - will have high recall, since it does not differentiate between instruments.
591
+ frames2midi_pitch(
592
+ save_path
593
+ + os.sep
594
+ + data_path.replace(".flac", "").split(os.sep)[-1]
595
+ + prefix
596
+ + "_pred_pitch_"
597
+ + time_now
598
+ + ".mid",
599
+ predicted_onsets[:, -N_KEYS:],
600
+ predicted_frames[:, -N_KEYS:],
601
+ 64.0 * predicted_onsets[:, -N_KEYS:],
602
+ )
603
+
604
+ # Pitch prediction, with choice of most likely instrument for each detected note.
605
+ if len(inst_mapping) > 1:
606
+ max_pred_onsets = max_inst(onset_pred_np)
607
+ frames2midi(
608
+ save_path
609
+ + os.sep
610
+ + data_path.replace(".flac", "").split(os.sep)[-1]
611
+ + prefix
612
+ + "_pred_inst_"
613
+ + time_now
614
+ + ".mid",
615
+ max_pred_onsets[:, :inst_only],
616
+ predicted_frames[:, :inst_only],
617
+ 64.0 * max_pred_onsets[:, :inst_only],
618
+ inst_mapping=inst_mapping,
619
+ )
620
+
621
+ pseudo_onsets = (onset_pred_np >= 0.5) & (~aligned_onsets)
622
+ onset_label = np.maximum(pseudo_onsets, aligned_onsets)
623
+
624
+ pseudo_frames = np.zeros(pseudo_onsets.shape, dtype=pseudo_onsets.dtype)
625
+ for t, f in zip(*onset_label.nonzero()):
626
+ t_off = t
627
+ while t_off < len(pseudo_frames) and frame_pred_np[t_off, f % N_KEYS] >= 0.5:
628
+ t_off += 1
629
+ pseudo_frames[t:t_off, f] = 1
630
+ frame_label = np.maximum(pseudo_frames, aligned_frames)
631
+
632
+ # pseudo_frames = (frame_pred_np >= 0.5) & (~aligned_frames)
633
+ # frame_label = np.maximum(pseudo_frames, aligned_frames)
634
+
635
+ frames2midi(
636
+ save_path
637
+ + os.sep
638
+ + data_path.replace(".flac", "").split(os.sep)[-1]
639
+ + prefix
640
+ + "_pred_align_max_"
641
+ + time_now
642
+ + ".mid",
643
+ onset_label[:, :inst_only],
644
+ frame_label[:, :inst_only],
645
+ 64.0 * onset_label[:, :inst_only],
646
+ inst_mapping=inst_mapping,
647
+ )
648
+ # if group is not None:
649
+ # gorup_path = os.path.join(save_path, 'pred_alignment_max', group)
650
+ # file_name = os.path.basename(data_path).replace('.flac', '_pred_align_max.mid')
651
+ # os.makedirs(gorup_path, exist_ok=True)
652
+ # frames2midi(os.path.join(gorup_path, file_name),
653
+ # onset_label[:, : inst_only], frame_label[:, : inst_only],
654
+ # 64. * onset_label[:, : inst_only],
655
+ # inst_mapping=inst_mapping)
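A sketch of driving the writer above with toy arrays in place of real aligned labels and network output; because of the early return in the function, only the aligned-label MIDI is written. The paths and inst_mapping here are hypothetical.

import numpy as np

from onsets_and_frames.constants import N_KEYS
from onsets_and_frames.midi_utils import save_midi_alignments_and_predictions

inst_mapping = [0]                            # a single instrument (MIDI program 0)
T = 200
cols = (len(inst_mapping) + 1) * N_KEYS       # instrument channels plus the pitch-only channel
aligned_onsets = np.zeros((T, cols), dtype=bool)
aligned_frames = np.zeros((T, cols), dtype=bool)
aligned_onsets[10, 40] = True                 # one toy note
aligned_frames[10:30, 40] = True
onset_pred = np.zeros((T, cols), dtype=np.float32)
frame_pred = np.zeros((T, N_KEYS), dtype=np.float32)
save_midi_alignments_and_predictions(
    ".", "performance.flac", inst_mapping,    # "." and "performance.flac" are hypothetical paths
    aligned_onsets, aligned_frames, onset_pred, frame_pred, prefix="demo",
)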
onsets_and_frames/transcriber.py ADDED
@@ -0,0 +1,276 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+ from onsets_and_frames.constants import MAX_MIDI, MIN_MIDI, N_KEYS
6
+
7
+ from .lstm import BiLSTM
8
+ from .mel import melspectrogram
9
+
10
+
11
+ class ConvStack(nn.Module):
12
+ def __init__(self, input_features, output_features):
13
+ super().__init__()
14
+
15
+ # input is batch_size * 1 channel * frames * input_features
16
+ self.cnn = nn.Sequential(
17
+ # layer 0
18
+ nn.Conv2d(1, output_features // 16, (3, 3), padding=1),
19
+ nn.BatchNorm2d(output_features // 16),
20
+ nn.ReLU(),
21
+ # layer 1
22
+ nn.Conv2d(output_features // 16, output_features // 16, (3, 3), padding=1),
23
+ nn.BatchNorm2d(output_features // 16),
24
+ nn.ReLU(),
25
+ # layer 2
26
+ nn.MaxPool2d((1, 2)),
27
+ nn.Dropout(0.25),
28
+ nn.Conv2d(output_features // 16, output_features // 8, (3, 3), padding=1),
29
+ nn.BatchNorm2d(output_features // 8),
30
+ nn.ReLU(),
31
+ # layer 3
32
+ nn.MaxPool2d((1, 2)),
33
+ nn.Dropout(0.25),
34
+ )
35
+ self.fc = nn.Sequential(
36
+ nn.Linear((output_features // 8) * (input_features // 4), output_features),
37
+ nn.Dropout(0.5),
38
+ )
39
+
40
+ def forward(self, mel):
41
+ x = mel.view(mel.size(0), 1, mel.size(1), mel.size(2))
42
+ x = self.cnn(x)
43
+ x = x.transpose(1, 2).flatten(-2)
44
+ x = self.fc(x)
45
+ return x
46
+
47
+
48
+ class OnsetsAndFrames(nn.Module):
49
+ def __init__(
50
+ self,
51
+ input_features,
52
+ output_features,
53
+ model_complexity=48,
54
+ onset_complexity=1,
55
+ n_instruments=13,
56
+ ):
57
+ nn.Module.__init__(self)
58
+ model_size = model_complexity * 16
59
+ sequence_model = lambda input_size, output_size: BiLSTM(
60
+ input_size, output_size // 2
61
+ )
62
+
63
+ onset_model_size = int(onset_complexity * model_size)
64
+ self.onset_stack = nn.Sequential(
65
+ ConvStack(input_features, onset_model_size),
66
+ sequence_model(onset_model_size, onset_model_size),
67
+ nn.Linear(onset_model_size, output_features * n_instruments),
68
+ nn.Sigmoid(),
69
+ )
70
+ self.offset_stack = nn.Sequential(
71
+ ConvStack(input_features, model_size),
72
+ sequence_model(model_size, model_size),
73
+ nn.Linear(model_size, output_features),
74
+ nn.Sigmoid(),
75
+ )
76
+ self.frame_stack = nn.Sequential(
77
+ ConvStack(input_features, model_size),
78
+ nn.Linear(model_size, output_features),
79
+ nn.Sigmoid(),
80
+ )
81
+ self.combined_stack = nn.Sequential(
82
+ sequence_model(output_features * 3, model_size),
83
+ nn.Linear(model_size, output_features),
84
+ nn.Sigmoid(),
85
+ )
86
+ self.velocity_stack = nn.Sequential(
87
+ ConvStack(input_features, model_size),
88
+ nn.Linear(model_size, output_features * n_instruments),
89
+ )
90
+
91
+ def forward(self, mel):
92
+ onset_pred = self.onset_stack(mel)
93
+ offset_pred = self.offset_stack(mel)
94
+ activation_pred = self.frame_stack(mel)
95
+
96
+ onset_detached = onset_pred.detach()
97
+ shape = onset_detached.shape
98
+ keys = MAX_MIDI - MIN_MIDI + 1
99
+ new_shape = shape[:-1] + (shape[-1] // keys, keys)
100
+ onset_detached = onset_detached.reshape(new_shape)
101
+ onset_detached, _ = onset_detached.max(axis=-2)
102
+
103
+ offset_detached = offset_pred.detach()
104
+
105
+ combined_pred = torch.cat(
106
+ [onset_detached, offset_detached, activation_pred], dim=-1
107
+ )
108
+ frame_pred = self.combined_stack(combined_pred)
109
+ velocity_pred = self.velocity_stack(mel)
110
+ return onset_pred, offset_pred, activation_pred, frame_pred, velocity_pred
111
+
112
+ def run_on_batch(
113
+ self,
114
+ batch,
115
+ parallel_model=None,
116
+ positive_weight=2.0,
117
+ inv_positive_weight=2.0,
118
+ with_onset_mask=False,
119
+ ):
120
+ audio_label = batch["audio"]
121
+
122
+ onset_label = batch["onset"]
123
+ offset_label = batch["offset"]
124
+ frame_label = batch["frame"]
125
+ if "velocity" in batch:
126
+ velocity_label = batch["velocity"]
127
+ mel = melspectrogram(
128
+ audio_label.reshape(-1, audio_label.shape[-1])[:, :-1]
129
+ ).transpose(-1, -2)
130
+
131
+ if not parallel_model:
132
+ onset_pred, offset_pred, _, frame_pred, velocity_pred = self(mel)
133
+ else:
134
+ onset_pred, offset_pred, _, frame_pred, velocity_pred = parallel_model(mel)
135
+
136
+ predictions = {
137
+ "onset": onset_pred.reshape(*onset_label.shape),
138
+ "offset": offset_pred.reshape(*offset_label.shape),
139
+ "frame": frame_pred.reshape(*frame_label.shape),
140
+ # 'velocity': velocity_pred.reshape(*velocity_label.shape)
141
+ }
142
+
143
+ if "velocity" in batch:
144
+ predictions["velocity"] = velocity_pred.reshape(*velocity_label.shape)
145
+
146
+ losses = {
147
+ "loss/onset": F.binary_cross_entropy(
148
+ predictions["onset"], onset_label, reduction="none"
149
+ ),
150
+ "loss/offset": F.binary_cross_entropy(
151
+ predictions["offset"], offset_label, reduction="none"
152
+ ),
153
+ "loss/frame": F.binary_cross_entropy(
154
+ predictions["frame"], frame_label, reduction="none"
155
+ ),
156
+ # 'loss/velocity': self.velocity_loss(predictions['velocity'], velocity_label, onset_label)
157
+ }
158
+ if "velocity" in batch:
159
+ losses["loss/velocity"] = self.velocity_loss(
160
+ predictions["velocity"], velocity_label, onset_label
161
+ )
162
+
163
+ onset_mask = 1.0 * onset_label
164
+ onset_mask[..., :-N_KEYS] *= positive_weight - 1
165
+ onset_mask[..., -N_KEYS:] *= inv_positive_weight - 1
166
+ onset_mask += 1
167
+ if with_onset_mask:
168
+ if "onset_mask" in batch:
169
+ onset_mask = onset_mask * batch["onset_mask"]
170
+ # if 'onset_mask' in batch:
171
+ # onset_mask += batch['onset_mask']
172
+
173
+ offset_mask = 1.0 * offset_label
174
+ offset_positive_weight = 2.0
175
+ offset_mask *= offset_positive_weight - 1
176
+ offset_mask += 1.0
177
+
178
+ frame_mask = 1.0 * frame_label
179
+ frame_positive_weight = 2.0
180
+ frame_mask *= frame_positive_weight - 1
181
+ frame_mask += 1.0
182
+
183
+ for loss_key, mask in zip(
184
+ ["onset", "offset", "frame"], [onset_mask, offset_mask, frame_mask]
185
+ ):
186
+ losses["loss/" + loss_key] = (mask * losses["loss/" + loss_key]).mean()
187
+
188
+ return predictions, losses
189
+
190
+ def velocity_loss(self, velocity_pred, velocity_label, onset_label):
191
+ denominator = onset_label.sum()
192
+ if denominator.item() == 0:
193
+ return denominator
194
+ else:
195
+ return (
196
+ onset_label * (velocity_label - velocity_pred) ** 2
197
+ ).sum() / denominator
198
+
199
+
200
+ # same implementation as OnsetsAndFrames, but with only onset stack
201
+ class OnsetsNoFrames(nn.Module):
202
+ def __init__(
203
+ self,
204
+ input_features,
205
+ output_features,
206
+ model_complexity=48,
207
+ onset_complexity=1,
208
+ n_instruments=13,
209
+ ):
210
+ nn.Module.__init__(self)
211
+ model_size = model_complexity * 16
212
+ sequence_model = lambda input_size, output_size: BiLSTM(
213
+ input_size, output_size // 2
214
+ )
215
+
216
+ onset_model_size = int(onset_complexity * model_size)
217
+ self.onset_stack = nn.Sequential(
218
+ ConvStack(input_features, onset_model_size),
219
+ sequence_model(onset_model_size, onset_model_size),
220
+ nn.Linear(onset_model_size, output_features * n_instruments),
221
+ nn.Sigmoid(),
222
+ )
223
+
224
+ def forward(self, mel):
225
+ onset_pred = self.onset_stack(mel)
226
+
227
+ onset_detached = onset_pred.detach()
228
+ shape = onset_detached.shape
229
+ keys = MAX_MIDI - MIN_MIDI + 1
230
+ new_shape = shape[:-1] + (shape[-1] // keys, keys)
231
+ onset_detached = onset_detached.reshape(new_shape)
232
+ onset_detached, _ = onset_detached.max(axis=-2)
233
+
234
+ return onset_pred
235
+
236
+ def run_on_batch(
237
+ self,
238
+ batch,
239
+ parallel_model=None,
240
+ positive_weight=2.0,
241
+ inv_positive_weight=2.0,
242
+ with_onset_mask=False,
243
+ ):
244
+ audio_label = batch["audio"]
245
+
246
+ onset_label = batch["onset"]
247
+ mel = melspectrogram(
248
+ audio_label.reshape(-1, audio_label.shape[-1])[:, :-1]
249
+ ).transpose(-1, -2)
250
+
251
+ if not parallel_model:
252
+ onset_pred = self(mel)
253
+ else:
254
+ onset_pred = parallel_model(mel)
255
+
256
+ predictions = {
257
+ "onset": onset_pred,
258
+ }
259
+
260
+ losses = {
261
+ "loss/onset": F.binary_cross_entropy(
262
+ predictions["onset"], onset_label, reduction="none"
263
+ ),
264
+ }
265
+
266
+ onset_mask = 1.0 * onset_label
267
+ onset_mask[..., :-N_KEYS] *= positive_weight - 1
268
+ onset_mask[..., -N_KEYS:] *= inv_positive_weight - 1
269
+ onset_mask += 1
270
+ if with_onset_mask:
271
+ if "onset_mask" in batch:
272
+ onset_mask = onset_mask * batch["onset_mask"]
273
+
274
+ losses["loss/onset"] = (onset_mask * losses["loss/onset"]).mean()
275
+
276
+ return predictions, losses
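A shape sketch for the transcriber stacks above (toy batch and frame count; N_MELS and N_KEYS come from the repo's constants):

import torch

from onsets_and_frames.constants import N_KEYS, N_MELS
from onsets_and_frames.transcriber import OnsetsAndFrames

model = OnsetsAndFrames(N_MELS, N_KEYS, n_instruments=2)
mel = torch.randn(1, 100, N_MELS)   # (batch, frames, mel bins)
onset, offset, activation, frame, velocity = model(mel)
print(onset.shape)  # torch.Size([1, 100, 176]): N_KEYS * n_instruments onset columns
print(frame.shape)  # torch.Size([1, 100, 88]): pitch-only frame columns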
onsets_and_frames/utils.py ADDED
@@ -0,0 +1,245 @@
1
+ import logging
2
+ import os
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ from onsets_and_frames.constants import (
9
+ DTW_FACTOR,
10
+ HOP_LENGTH,
11
+ MAX_MIDI,
12
+ MIN_MIDI,
13
+ N_KEYS,
14
+ )
15
+
16
+
17
+ def cycle(iterable):
18
+ while True:
19
+ for item in iterable:
20
+ yield item
21
+
22
+
23
+ def shift_label(label, shift):
24
+ if shift == 0:
25
+ return label
26
+ assert len(label.shape) == 2
27
+ t, p = label.shape
28
+ keys, instruments = N_KEYS, p // N_KEYS
29
+ label_zero_pad = torch.zeros(t, instruments, abs(shift), dtype=label.dtype, device=label.device)  # match label's device
30
+ label = label.reshape(t, instruments, keys)
31
+ to_cat = (
32
+ (label_zero_pad, label[:, :, :-shift])
33
+ if shift > 0
34
+ else (label[:, :, -shift:], label_zero_pad)
35
+ )
36
+ label = torch.cat(to_cat, dim=-1)
37
+ return label.reshape(t, p)
38
+
39
+
40
+ def get_peaks(notes, win_size, gpu=False):
41
+ constraints = []
42
+ notes = notes.cpu()
43
+ for i in range(1, win_size + 1):
44
+ forward = torch.roll(notes, i, 0)
45
+ forward[:i, ...] = 0 # assume time axis is 0
46
+ backward = torch.roll(notes, -i, 0)
47
+ backward[-i:, ...] = 0
48
+ constraints.extend([forward, backward])
49
+ res = torch.ones(notes.shape, dtype=bool)
50
+ for elem in constraints:
51
+ res = res & (notes >= elem)
52
+ return res if not gpu else res.cuda()
53
+
54
+
55
+ def get_peaks_numpy(notes, win_size):
56
+ """
57
+ Detect peaks in a NumPy array based on a window size.
58
+
59
+ Args:
60
+ notes (np.ndarray): Input array, shape (frames, ...).
61
+ win_size (int): Window size for detecting peaks.
62
+
63
+ Returns:
64
+ np.ndarray: Boolean array indicating peaks, same shape as `notes`.
65
+ """
66
+ # Initialize constraints
67
+ constraints = []
68
+ notes = np.array(notes) # Ensure input is a NumPy array
69
+
70
+ for i in range(1, win_size + 1):
71
+ # Roll array forward and backward
72
+ forward = np.roll(notes, i, axis=0)
73
+ backward = np.roll(notes, -i, axis=0)
74
+
75
+ # Zero out invalid regions
76
+ forward[:i, ...] = 0
77
+ backward[-i:, ...] = 0
78
+
79
+ constraints.extend([forward, backward])
80
+
81
+ # Initialize result with all True
82
+ res = np.ones_like(notes, dtype=bool)
83
+
84
+ # Apply constraints
85
+ for elem in constraints:
86
+ res &= notes >= elem
87
+
88
+ return res
89
+
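A toy peak-detection example: a frame survives only if it is >= every neighbour within win_size frames on both sides.

import numpy as np

from onsets_and_frames.utils import get_peaks_numpy

x = np.array([[0.1], [0.9], [0.2], [0.3], [0.8], [0.1]])
print(get_peaks_numpy(x, win_size=1).ravel())
# [False  True False False  True False]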
90
+
91
+ def get_diff(notes, offset=True):
92
+ rolled = np.roll(notes, 1, axis=0)
93
+ rolled[0, ...] = 0
94
+ return (rolled & (~notes)) if offset else (notes & (~rolled))
95
+
96
+
97
+ def compress_across_octave(notes):
98
+ keys = MAX_MIDI - MIN_MIDI + 1
99
+ time, instruments = notes.shape[0], notes.shape[1] // keys
100
+ notes_reshaped = notes.reshape((time, instruments, keys))
101
+ notes_reshaped = notes_reshaped.max(axis=1)
102
+ octaves = keys // 12
103
+ res = np.zeros((time, 12), dtype=np.uint8)
104
+ for i in range(octaves):
105
+ curr_octave = notes_reshaped[:, i * 12 : (i + 1) * 12]
106
+ res = np.maximum(res, curr_octave)
107
+ return res
108
+
109
+
110
+ def compress_time(notes, factor):
111
+ t, p = notes.shape
112
+ res = np.zeros((t // factor, p), dtype=notes.dtype)
113
+ for i in range(t // factor):
114
+ res[i, :] = notes[i * factor : (i + 1) * factor, :].max(axis=0)
115
+ return res
116
+
117
+
118
+ def get_matches(index1, index2):
119
+ matches = {}
120
+ for i1, i2 in zip(index1, index2):
121
+ # matches[i1] = matches.get(i1, []) + [i2]
122
+ if i1 not in matches:
123
+ matches[i1] = []
124
+ matches[i1].append(i2)
125
+ return matches
126
+
127
+
128
+ """
129
+ Extend a temporal range to WINDOW_SIZE_SRC if it is shorter than that.
130
+ WINDOW_SIZE_SRC defaults to 28 frames for 256 hop length (assuming DTW_FACTOR=3), which is ~0.5 second.
131
+ """
132
+
133
+
134
+ def get_margin(
135
+ t_sources, max_len, WINDOW_SIZE_SRC=11 * (512 // HOP_LENGTH) + 2 * DTW_FACTOR
136
+ ):
137
+ margin = max(0, (WINDOW_SIZE_SRC - len(t_sources)) // 2)
138
+ t_sources_left = list(range(max(t_sources[0] - margin, 0), t_sources[0]))
139
+ t_sources_right = list(
140
+ range(t_sources[-1], min(t_sources[-1] + margin, max_len - 1))
141
+ )
142
+ t_sources_extended = t_sources_left + t_sources + t_sources_right
143
+ return t_sources_extended
144
+
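A numeric sketch of get_margin padding a 3-frame span up to a 9-frame window (note that the right padding starts at the last source frame, so that index is repeated):

from onsets_and_frames.utils import get_margin

print(get_margin([50, 51, 52], max_len=1000, WINDOW_SIZE_SRC=9))
# [47, 48, 49, 50, 51, 52, 52, 53, 54]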
145
+
146
+ def get_inactive_instruments(target_onsets, T):
147
+ keys = MAX_MIDI - MIN_MIDI + 1
148
+ time, instruments = target_onsets.shape[0], target_onsets.shape[1] // keys
149
+ notes_reshaped = target_onsets.reshape((time, instruments, keys))
150
+ active_instruments = notes_reshaped.max(axis=(0, 2))
151
+ res = np.zeros((T, instruments, keys), dtype=bool)
152
+ for ins in range(instruments):
153
+ if active_instruments[ins] == 0:
154
+ res[:, ins, :] = 1
155
+ return res.reshape((T, instruments * keys)), active_instruments
156
+
157
+
158
+ def max_inst(probs, threshold_vec=None):
159
+ if threshold_vec is None:
160
+ threshold_vec = 0.5
161
+ if probs.shape[-1] == N_KEYS or probs.shape[-1] == N_KEYS * 2:
162
+ # there is only pitch
163
+ return probs
164
+ keys = MAX_MIDI - MIN_MIDI + 1
165
+ instruments = probs.shape[1] // keys
166
+ time = len(probs)
167
+ probs = probs.reshape((time, instruments, keys))
168
+ notes = probs.max(axis=1) >= threshold_vec
169
+ max_instruments = np.argmax(probs[:, :-1, :], axis=1)
170
+ res = np.zeros(probs.shape, dtype=np.uint8)
171
+ for t, p in zip(*(notes.nonzero())):
172
+ res[t, max_instruments[t, p], p] = 1
173
+ res[t, -1, p] = 1
174
+ return res.reshape((time, instruments * keys))
175
+
176
+
177
+ # Define the smoothing function (operates on CPU)
178
+ def smooth_labels(onset_tensor):
179
+ """
180
+ Smooths onset labels using a triangular kernel with 1D convolution along the time axis.
181
+
182
+ Args:
183
+ onset_tensor (torch.Tensor): A (T, F) tensor where T = time steps and F = pitches.
184
+
185
+ Returns:
186
+ torch.Tensor: Smoothed onset tensor with the same shape (T, F).
187
+ """
188
+ # Define the triangular smoothing kernel
189
+ # kernel = torch.tensor([0.2, 0.4, 0.6, 0.8, 1, 0.8, 0.6, 0.4, 0.2],
190
+ # dtype=onset_tensor.dtype).view(1, 1, -1)
191
+ # kernel = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1],
192
+ # dtype=onset_tensor.dtype).view(1, 1, -1)
193
+ kernel = torch.tensor([0.33, 0.67, 1, 0.67, 0.33], dtype=onset_tensor.dtype).view(
194
+ 1, 1, -1
195
+ )
196
+
197
+ onset_tensor = onset_tensor.T.unsqueeze(1) # Now shape is (F, 1, T)
198
+
199
+ # Use 'same' padding so that the output has the same time dimension as the input.
200
+ padding = kernel.shape[-1] // 2
201
+ smoothed = F.conv1d(onset_tensor, kernel, padding=padding)
202
+
203
+ # Reshape back to original shape (T, F)
204
+ return smoothed.squeeze(1).T
205
+
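Smoothing a single impulse onset with the triangular kernel above spreads it over two frames on each side:

import torch

from onsets_and_frames.utils import smooth_labels

onsets = torch.zeros(9, 1)
onsets[4, 0] = 1.0
print(smooth_labels(onsets).squeeze())
# ≈ [0, 0, 0.33, 0.67, 1.00, 0.67, 0.33, 0, 0]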
206
+
207
+ def initialize_logging_system(logdir):
208
+ """Initialize the logging system once with named loggers for train and dataset."""
209
+ log_file = os.path.join(logdir, "training.log")
210
+
211
+ # Create formatter
212
+ formatter = logging.Formatter(
213
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
214
+ )
215
+
216
+ # File handler (shared by all loggers)
217
+ file_handler = logging.FileHandler(log_file)
218
+ file_handler.setLevel(logging.INFO)
219
+ file_handler.setFormatter(formatter)
220
+
221
+ # Console handler (shared by all loggers)
222
+ console_handler = logging.StreamHandler()
223
+ console_handler.setLevel(logging.INFO)
224
+ console_handler.setFormatter(formatter)
225
+
226
+ # Create train logger
227
+ train_logger = logging.getLogger("train")
228
+ train_logger.setLevel(logging.INFO)
229
+ train_logger.handlers.clear()
230
+ train_logger.addHandler(file_handler)
231
+ train_logger.addHandler(console_handler)
232
+
233
+ # Create dataset logger
234
+ dataset_logger = logging.getLogger("dataset")
235
+ dataset_logger.setLevel(logging.INFO)
236
+ dataset_logger.handlers.clear()
237
+ dataset_logger.addHandler(file_handler)
238
+ dataset_logger.addHandler(console_handler)
239
+
240
+ return train_logger, dataset_logger
241
+
242
+
243
+ def get_logger(name):
244
+ """Get a named logger. Call initialize_logging_system first."""
245
+ return logging.getLogger(name)
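Typical usage of the logging helpers; the log directory below is a hypothetical example and must already exist, since initialize_logging_system opens <logdir>/training.log:

from onsets_and_frames.utils import get_logger, initialize_logging_system

train_logger, dataset_logger = initialize_logging_system("runs/example")  # hypothetical, pre-created dir
get_logger("train").info("training started")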