farzadab committed on
Commit
c84f28d
·
verified ·
1 Parent(s): 0cde4b8

Update ultravox_processing.py

Browse files
Files changed (1) hide show
  1. ultravox_processing.py +5 -2
ultravox_processing.py CHANGED
@@ -112,7 +112,8 @@ class UltravoxProcessor(transformers.ProcessorMixin):
112
  assert (
113
  tokenizer.eos_token is not None
114
  ), "The tokenizer has no EOS token. Cannot recover."
115
- self.audio_replacement_token_id = tokenizer.get_vocab()[tokenizer.eos_token]
 
116
  if tokenizer.pad_token_id is None:
117
  tokenizer.pad_token_id = tokenizer.eos_token_id
118
 
@@ -326,6 +327,8 @@ class UltravoxProcessor(transformers.ProcessorMixin):
326
  split_input_ids = tokenized_parts["input_ids"]
327
  input_ids: List[int] = []
328
 
 
 
329
  for i, token_len in enumerate(data.get("audio_token_len", [])):
330
  if not audio_is_continuation[i]:
331
  placeholder_index += 1
@@ -338,7 +341,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
338
 
339
  audio_token_start_idx.append(len(input_ids))
340
 
341
- input_ids.extend([self.audio_replacement_token_id] * token_len)
342
 
343
  # Include any tokens after the last audio.
344
  placeholder_index += 1
 
112
  assert (
113
  tokenizer.eos_token is not None
114
  ), "The tokenizer has no EOS token. Cannot recover."
115
+ self.vocab = tokenizer.get_vocab()
116
+ self.audio_replacement = tokenizer.eos_token
117
  if tokenizer.pad_token_id is None:
118
  tokenizer.pad_token_id = tokenizer.eos_token_id
119
 
 
327
  split_input_ids = tokenized_parts["input_ids"]
328
  input_ids: List[int] = []
329
 
330
+ audio_replacement_token_id = self.vocab[self.audio_replacement]
331
+
332
  for i, token_len in enumerate(data.get("audio_token_len", [])):
333
  if not audio_is_continuation[i]:
334
  placeholder_index += 1
 
341
 
342
  audio_token_start_idx.append(len(input_ids))
343
 
344
+ input_ids.extend([audio_replacement_token_id] * token_len)
345
 
346
  # Include any tokens after the last audio.
347
  placeholder_index += 1