Spaces:

mrfakename
/

DMOSpeech2

Running on Zero

App Files Files Community

DMOSpeech2 / funasr_detach /frontends /windowing.py

mrfakename

Super-squash branch 'main' using huggingface_hub

0102e16 verified 10 months ago

raw

history blame

2.69 kB

	#!/usr/bin/env python3
	# 2020, Technische Universität München; Ludwig Kürzinger
	# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

	"""Sliding Window for raw audio input data."""

	from typing import Optional
	from typing import Tuple

	import torch
	import torch.nn as nn


	class SlidingWindow(nn.Module):
	    """Sliding window over batched, continuous raw audio.

	    Cuts the time axis of a raw audio tensor into overlapping frames of
	    ``win_length`` samples, advancing by ``hop_length`` samples per frame.
	    Combine this module with a pre-encoder compatible with raw audio data,
	    for example Sinc convolutions.

	    Limitations:
	        * Padding is not implemented: trailing samples that do not fill a
	          complete frame are discarded.
	        * No window function (e.g. Hann) is applied to the frame values.
	        * The (padded) time dimension of the input must be at least
	          ``win_length``; otherwise ``Tensor.unfold`` raises a RuntimeError.
	    """

	    def __init__(
	        self,
	        win_length: int = 400,
	        hop_length: int = 160,
	        channels: int = 1,
	        padding: Optional[int] = None,
	        fs=None,
	    ):
	        """Initialize.

	        Args:
	            win_length: Length of one frame, in samples.
	            hop_length: Offset between the starts of consecutive frames.
	            channels: Number of input channels.
	            padding: Padding (placeholder, currently not implemented).
	            fs: Sampling rate (placeholder for compatibility, not used).
	        """
	        super().__init__()
	        self.fs = fs
	        self.win_length = win_length
	        self.hop_length = hop_length
	        self.channels = channels
	        self.padding = padding

	    def forward(
	        self, input: torch.Tensor, input_lengths: torch.Tensor
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Apply a sliding window on the input.

	        Args:
	            input: Raw audio of shape (B, T, C) or (B, T * C) flattened,
	                where C equals ``self.channels``.
	            input_lengths: Number of valid samples of each batch item.

	        Returns:
	            Tensor: Frames of shape (B, N, C, D) with D = ``win_length`` and
	                N = (T - win_length) // hop_length + 1.
	            Tensor: Number of complete frames per batch item. Items shorter
	                than ``win_length`` yield 0 (never a negative count).
	        """
	        B, T = input.size(0), input.size(1)
	        C = self.channels
	        D = self.win_length
	        # (B, T, C) --> (T, B, C). reshape() returns a view whenever one is
	        # possible and only copies for layouts that view() would reject.
	        continuous = input.reshape(B, T, C).permute(1, 0, 2)
	        # unfold() appends the frame dimension last: (N, B, C, D).
	        windowed = continuous.unfold(0, D, self.hop_length)
	        # (N, B, C, D) --> (B, N, C, D)
	        output = windowed.permute(1, 0, 2, 3).contiguous()
	        # Frame count is (len - win) // hop + 1 == (len - win + hop) // hop.
	        # Clamping the numerator at zero before dividing makes items shorter
	        # than one window report 0 frames instead of a negative length, and
	        # keeps the result independent of how negative values are rounded.
	        usable = (input_lengths - self.win_length + self.hop_length).clamp(min=0)
	        output_lengths = usable // self.hop_length
	        return output, output_lengths

	    def output_size(self) -> int:
	        """Return output length of feature dimension D, i.e. the window length."""
	        return self.win_length