Update ultravox_processing.py
Browse files — ultravox_processing.py (+5 lines, −2 lines)
ultravox_processing.py
CHANGED
|
@@ -112,7 +112,8 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
| 112 |
assert (
|
| 113 |
tokenizer.eos_token is not None
|
| 114 |
), "The tokenizer has no EOS token. Cannot recover."
|
| 115 |
-
self.…  [removed line truncated in extraction; replaced by new lines 115–116: `self.vocab = tokenizer.get_vocab()` and `self.audio_replacement = tokenizer.eos_token`]
|
|
|
|
| 116 |
if tokenizer.pad_token_id is None:
|
| 117 |
tokenizer.pad_token_id = tokenizer.eos_token_id
|
| 118 |
|
|
@@ -326,6 +327,8 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
| 326 |
split_input_ids = tokenized_parts["input_ids"]
|
| 327 |
input_ids: List[int] = []
|
| 328 |
|
|
|
|
|
|
|
| 329 |
for i, token_len in enumerate(data.get("audio_token_len", [])):
|
| 330 |
if not audio_is_continuation[i]:
|
| 331 |
placeholder_index += 1
|
|
@@ -338,7 +341,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
|
|
| 338 |
|
| 339 |
audio_token_start_idx.append(len(input_ids))
|
| 340 |
|
| 341 |
-
input_ids.extend([…  [removed line truncated in extraction; replaced by new line 344: `input_ids.extend([audio_replacement_token_id] * token_len)`]
|
| 342 |
|
| 343 |
# Include any tokens after the last audio.
|
| 344 |
placeholder_index += 1
|
|
|
|
| 112 |
assert (
|
| 113 |
tokenizer.eos_token is not None
|
| 114 |
), "The tokenizer has no EOS token. Cannot recover."
|
| 115 |
+
self.vocab = tokenizer.get_vocab()
|
| 116 |
+
self.audio_replacement = tokenizer.eos_token
|
| 117 |
if tokenizer.pad_token_id is None:
|
| 118 |
tokenizer.pad_token_id = tokenizer.eos_token_id
|
| 119 |
|
|
|
|
| 327 |
split_input_ids = tokenized_parts["input_ids"]
|
| 328 |
input_ids: List[int] = []
|
| 329 |
|
| 330 |
+
audio_replacement_token_id = self.vocab[self.audio_replacement]
|
| 331 |
+
|
| 332 |
for i, token_len in enumerate(data.get("audio_token_len", [])):
|
| 333 |
if not audio_is_continuation[i]:
|
| 334 |
placeholder_index += 1
|
|
|
|
| 341 |
|
| 342 |
audio_token_start_idx.append(len(input_ids))
|
| 343 |
|
| 344 |
+
input_ids.extend([audio_replacement_token_id] * token_len)
|
| 345 |
|
| 346 |
# Include any tokens after the last audio.
|
| 347 |
placeholder_index += 1
|