Spaces:
Runtime error
Runtime error
| import torch | |
| from torch import nn | |
| from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSeq2SeqLM | |
class CombinedModel(nn.Module):
    """Speech translation pipeline: Wav2Vec2 CTC transcription fed into a seq2seq model.

    ``forward`` transcribes the first audio clip in a batch with the
    speech-to-text (STT) model and then translates the transcription with the
    neural machine translation (NMT) model.

    Attributes:
        SAMPLING_RATE: Sampling rate (Hz) declared to the STT processor.
        MAX_AUDIO_SAMPLES: Clips longer than this many samples are truncated
            by the STT processor.
    """

    SAMPLING_RATE = 16000
    MAX_AUDIO_SAMPLES = 110000

    def __init__(self, stt_model_name, nmt_model_name, device="cuda"):
        """Load both pretrained models and place them on ``device``.

        Args:
            stt_model_name: Identifier passed to ``from_pretrained`` for the
                Wav2Vec2 processor and CTC model.
            nmt_model_name: Identifier passed to ``from_pretrained`` for the
                translation tokenizer and seq2seq model.
            device: Device on which the models run. If a CUDA device is
                requested but CUDA is unavailable, a warning is issued and
                the CPU is used instead.
        """
        super().__init__()

        if torch.device(device).type == "cuda" and not torch.cuda.is_available():
            import warnings

            warnings.warn(
                "CUDA was requested but is not available; falling back to CPU.",
                RuntimeWarning,
                stacklevel=2,
            )
            device = "cpu"

        self.stt_processor = Wav2Vec2Processor.from_pretrained(stt_model_name)
        # The models must live on the same device the inputs are moved to in
        # forward(); otherwise the first matmul fails with a device mismatch.
        self.stt_model = Wav2Vec2ForCTC.from_pretrained(stt_model_name).to(device)
        self.nmt_tokenizer = AutoTokenizer.from_pretrained(nmt_model_name)
        self.nmt_model = AutoModelForSeq2SeqLM.from_pretrained(nmt_model_name).to(device)
        self.device = device

    def forward(self, batch, *args, **kwargs):
        """Transcribe ``batch["audio"][0]`` and translate the transcription.

        Args:
            batch: Mapping whose ``"audio"`` entry is a sequence of clips;
                only the first clip is processed. The clip may be a tensor,
                array or list of samples (assumed to be a 1-D waveform at
                ``SAMPLING_RATE`` Hz -- TODO confirm against the caller).
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            A ``(transcription, translations)`` tuple: the decoded STT output
            and the batch-decoded output of the translation model.
        """
        device = self.device

        # Feature extraction happens on the host, so keep the waveform on the
        # CPU; only the extracted features are moved to the model's device.
        waveform = batch["audio"][0]
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.detach().cpu().numpy()

        features = self.stt_processor(
            waveform,
            sampling_rate=self.SAMPLING_RATE,
            return_tensors="pt",
            max_length=self.MAX_AUDIO_SAMPLES,
            padding=True,
            truncation=True,
        )

        # Some Wav2Vec2 checkpoints do not produce an attention mask at all;
        # pass None in that case instead of failing on a missing key.
        attention_mask = features.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # The result is decoded to text, so no autograd graph is needed.
        with torch.no_grad():
            stt_output = self.stt_model(
                features["input_values"].to(device),
                attention_mask=attention_mask,
            )

        # Select the single clip explicitly: squeeze() would also collapse a
        # one-frame sequence into a 0-d tensor.
        predicted_ids = stt_output.logits.argmax(dim=-1)[0].cpu()
        transcription = self.stt_processor.decode(predicted_ids)

        nmt_inputs = self.nmt_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        generated_ids = self.nmt_model.generate(
            input_ids=nmt_inputs["input_ids"].to(device),
            attention_mask=nmt_inputs["attention_mask"].to(device),
        )
        translations = self.nmt_tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True
        )
        return transcription, translations