Spaces:
Running
on
Zero
Running
on
Zero
| import torch | |
| import random | |
| def clip_by_length(x, length, factor): | |
| if len(x) <= length: | |
| return x | |
| start = random.randint(0, len(x) - length - 1) | |
| start = start // factor * factor | |
| x = x[start: start + length] | |
| return x | |
| def speech_edit_find_time_stamp(x, token_list): | |
| assert isinstance(x, torch.Tensor) | |
| x, counts = torch.unique_consecutive(x, return_counts=True) | |
| x = [token_list[i.item()] for i in x] | |
| counts = torch.cumsum(counts, dim=0) | |
| counts = counts.cpu().tolist() | |
| # Possible Phones obtained from kaldi: | |
| # (B)egin, (E)nd, (I)nternal and (S)ingleton | |
| # & SIL & SPN_S | |
| # The phone_table doesn't contain SPN_S so it is replaced by <UNK> | |
| ans, buf = [], [] | |
| for phone, count in zip(x, counts): | |
| if phone.endswith('_B') or phone.endswith('_I') or phone.endswith("_E"): | |
| buf.append((phone, count)) | |
| if phone.endswith("_E"): | |
| phone_seq = tuple([x[0] for x in buf]) | |
| count = buf[-1][1] | |
| ans.append((phone_seq, count)) | |
| buf = [] | |
| elif phone == "SIL" or phone.endswith('_S'): | |
| ans.append((phone, count)) | |
| else: | |
| ans.append((phone, count)) # usually SPN_S | |
| # If too short, mask it all. | |
| if len(ans) <= 2: | |
| return (0, ans[-1][1]) | |
| num = random.randint(1, 2) # mask 1-2 words | |
| word_start = random.randint(0, len(ans) - num) | |
| if word_start == 0: | |
| start = 0 | |
| else: | |
| start = ans[word_start - 1][1] | |
| end = ans[word_start + num - 1][1] | |
| return (start, end) | |
| def codec_specaug(codec, mask_id): | |
| """ | |
| Simply specaug on codec audio input. | |
| Apply time mask with max-width 5% of the total length; 10 masks | |
| Apply codec (frequency) mask with only 0 / 1 bin. 1 mask. | |
| """ | |
| T, D = codec.size() | |
| max_len = int(T * 0.05) | |
| for i in range(5): | |
| start = random.randint(0, T - max_len - 1) | |
| length = random.randint(0, max_len) | |
| codec[start: start + length] = mask_id | |
| if random.random() > 1.0: | |
| dim = random.randint(0, D - 1) | |
| codec[:, dim] = mask_id | |
| return codec.view(-1).contiguous() | |