Update app.py
app.py
CHANGED
@@ -23,78 +23,27 @@ import tempfile
 
 # return sanskrit_text, audio_path
 # Load model and tokenizer
-import os
-import sys
-import transformers
-import tensorflow as tf
-from datasets import load_dataset
-from transformers import AutoTokenizer
-from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
-from transformers import AdamWeightDecay
-from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
+# import os
+# import sys
+# import transformers
+# import tensorflow as tf
+# from datasets import load_dataset
+# from transformers import AutoTokenizer
+# from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
+# from transformers import AdamWeightDecay
+# from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
 
-model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
+# model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
 
-from datasets import load_dataset
+# from datasets import load_dataset
 
-raw_datasets = load_dataset("rahular/itihasa", download_mode="force_redownload")
+# raw_datasets = load_dataset("rahular/itihasa", download_mode="force_redownload")
 
-import torch
-from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
-from datasets import load_dataset
+# import torch
+# from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
+# from datasets import load_dataset
 
 # Load the pre-trained English to Hindi model
-model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
-model = MarianMTModel.from_pretrained(model_checkpoint)
-tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)
-
-# Inspect the raw_datasets structure
-print(raw_datasets)
-print(raw_datasets['train'][0]) # Print the first example from the training set
-
-# Tokenization function
-def tokenize_function(examples):
-    # Extract English and Sanskrit translations
-    english_sentences = [item['en'] for item in examples['translation']]
-    sanskrit_sentences = [item['sn'] for item in examples['translation']]
-
-    # Tokenize the English inputs
-    model_inputs = tokenizer(
-        english_sentences,
-        padding="max_length",
-        truncation=True,
-        max_length=128
-    )
-
-    # Tokenize the Sanskrit labels
-    with tokenizer.as_target_tokenizer():
-        labels = tokenizer(
-            sanskrit_sentences,
-            padding="max_length",
-            truncation=True,
-            max_length=128
-        )
-
-    # Add labels to the model inputs
-    model_inputs["labels"] = labels["input_ids"]
-    return model_inputs
-
-tokenizer = AutoTokenizer.from_pretrained(get_model_name())
-
-model = M2M100ForConditionalGeneration.from_pretrained(get_model_name())
-# I dont know wheter this will be of use or not
-
-tokenized_train = raw_datasets['train'].map(tokenize_function, batched=True)
-
-
-
-tokenized_validation = raw_datasets['validation'].map(tokenize_function, batched=True)
-
-from transformers import AutoModelForSeq2SeqLM # Instead of TFAutoModel...
-
-model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
-
-
 # from transformers import M2M100ForConditionalGeneration, AutoModelForCausalLM
 
 # # Load appropriate model based on phase
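Note that the helper removed above tokenized targets with `tokenizer.as_target_tokenizer()`, a context manager that recent transformers releases deprecate in favor of the `text_target=` argument. A minimal sketch of the same helper under the current API, assuming the itihasa schema visible in the old code (each example's `translation` dict carries `en` and `sn` keys) and the tokenizer already loaded above:

def tokenize_function(examples):
    # Each itihasa example looks like {"translation": {"en": ..., "sn": ...}}
    english_sentences = [item["en"] for item in examples["translation"]]
    sanskrit_sentences = [item["sn"] for item in examples["translation"]]
    # text_target= tokenizes the Sanskrit labels in the same call and fills
    # in "labels", replacing the deprecated as_target_tokenizer() block
    return tokenizer(
        english_sentences,
        text_target=sanskrit_sentences,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

Because `text_target=` populates `labels` directly, the manual `model_inputs["labels"] = labels["input_ids"]` assignment from the old code is no longer needed.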
@@ -259,28 +208,6 @@ model___name = "SweUmaVarsh/m2m100-en-sa-translation"
 # shuffle=False,
 # batch_size=8,
 # collate_fn=data_collator,
-# )
-
-# from transformers import create_optimizer
-
-# steps_per_epoch = len(train_dataset)
-# num_train_steps = steps_per_epoch * 1 # 1 epoch in your case
-# num_warmup_steps = int(0.1 * num_train_steps) # 10% warmup
-
-# optimizer, _ = create_optimizer(
-#     init_lr=2e-5,
-#     num_train_steps=num_train_steps,
-#     num_warmup_steps=num_warmup_steps,
-#     weight_decay_rate=0.01
-# )
-
-# model.compile(optimizer=optimizer)
-# model.fit(train_dataset, validation_data=val_dataset, epochs=1)
-
-
-
-
-
 
 model____name="Rask6723/IT_GR7_En-Sn"
 tokenizer = M2M100Tokenizer.from_pretrained(model___name)
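This hunk also drops a commented-out TensorFlow training recipe. For reference, it would read roughly as follows if re-enabled; a sketch only, assuming `train_dataset` and `val_dataset` are `tf.data.Dataset` objects prepared earlier in the file and `model` is a TF seq2seq model (compiled without an explicit loss, so the model's internal loss is used):

from transformers import create_optimizer

steps_per_epoch = len(train_dataset)
num_train_steps = steps_per_epoch * 1            # 1 epoch, as in the deleted comments
num_warmup_steps = int(0.1 * num_train_steps)    # 10% warmup

# AdamW with a linear warmup/decay schedule
optimizer, _ = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    weight_decay_rate=0.01,
)

model.compile(optimizer=optimizer)
model.fit(train_dataset, validation_data=val_dataset, epochs=1)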
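What survives of this section loads only a tokenizer, and the two variable names differ: `model____name` (four underscores) is assigned "Rask6723/IT_GR7_En-Sn", while `from_pretrained` receives `model___name` (three underscores), which the hunk header shows is still bound to "SweUmaVarsh/m2m100-en-sa-translation". A minimal end-to-end inference sketch for an M2M100 English-to-Sanskrit fine-tune like these, assuming the checkpoint ships both model and tokenizer weights; whether a `forced_bos_token_id` for the target language is required depends on how the fine-tune was trained, so it is omitted here:

import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# One of the two checkpoints named in the diff (illustrative choice)
checkpoint = "Rask6723/IT_GR7_En-Sn"
tokenizer = M2M100Tokenizer.from_pretrained(checkpoint)
model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)

text = "The king went to the forest."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    generated = model.generate(**inputs, max_length=128)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])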