# Copy of final
# ================================================================
# = STEP 1: SETUP AND DOWNLOAD (YOUR PROVEN METHOD) =
# ================================================================
import os

print("--- 1. Installing All Libraries ---")
# (The actual install/clone/download commands ran here; a hedged sketch
# of them follows this cell.)
print("✅ Libraries installed.")

print("\n--- 2. Cloning IndicLID Repository ---")
# Using your proven method of changing directories
print("✅ Repository cloned.")

# Navigate into the correct directory structure
print("\n--- 3. Downloading and Unzipping IndicLID Models ---")
print("✅ Download commands executed. Unzipping now...")
print("✅ Unzip commands executed.")

print("\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉")
# ================================================================
# = STEP 2: INITIALIZE MODELS (EXACTLY AS YOUR OLD CODE) =
# ================================================================
import os
import sys
import torch

print("--- Applying your original add_safe_globals fix... ---")
if "/content/IndicLID/Inference" not in sys.path:
    sys.path.append("/content/IndicLID/Inference")

from transformers.models.bert.modeling_bert import (
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
)
from transformers.models.bert.configuration_bert import BertConfig
import torch.nn as nn
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.container import ModuleList
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules.dropout import Dropout

torch.serialization.add_safe_globals([
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
    Embedding, ModuleList, Linear, LayerNorm, Dropout,
])
print("✅ Comprehensive safe globals added successfully.")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from ai4bharat.IndicLID import IndicLID

print("--- Loading all models into memory... ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
print("✅ IndicLID model loaded successfully.")

MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
ip = IndicProcessor(inference=True)
print("✅ IndicTrans2 1B model loaded.")
print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.")
import sys
print(sys.path)
!pip show transformers  # shell command; needs the "!" prefix inside a notebook
# ================================================================
# = STEP 2.5: LOAD ROMANSETU (COMPATIBLE WITH 4.40.2) =
# ================================================================
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---")
# Try smaller, more compatible models first
model_options = [
    "ai4bharat/romansetu-cpt-roman-100m",
    "ai4bharat/romansetu-cpt-roman-200m"
]
rs_model = None
rs_tokenizer = None
for model_id in model_options:
    try:
        print(f"Trying model: {model_id}")
        rs_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        rs_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
        print(f"✅ {model_id} loaded successfully.")
        break
    except Exception as e:
        print(f"❌ {model_id} failed: {e}")
        continue

if rs_model is None:
    print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.")
def translate_with_romansetu(text, max_new_tokens=50):
    if rs_model is None:
        # Fallback: use enhanced transliteration + IndicTrans2
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        try:
            # Try to transliterate and then translate with IndicTrans2
            native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
            pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn")
            inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
            with torch.no_grad():
                out = model.generate(**inputs, num_beams=3, max_length=100)
            dec = tokenizer.batch_decode(out, skip_special_tokens=True)
            post = ip.postprocess_batch(dec, lang="eng_Latn")  # postprocess with the *target* language
            return post[0]
        except Exception:
            return text
    try:
        prompt = f"Translate this romanized Indian text to English: {text}"
        inputs = rs_tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = rs_model.generate(
                inputs.input_ids,
                max_new_tokens=max_new_tokens,
                num_beams=2,
                temperature=0.7,
                do_sample=True,
                pad_token_id=rs_tokenizer.eos_token_id
            )
        full_response = rs_tokenizer.decode(outputs[0], skip_special_tokens=True)  # decode the first (only) sequence, not the batch tensor
        translation = full_response.replace(prompt, "").strip()
        return translation if translation and len(translation) > 2 else text
    except Exception:
        return text

print("✅ RomanSetu/fallback translation function defined.")
print("🎉 SETUP COMPLETE with fallback mechanism.")
# ================================================================
# = STEP 2.6: LOAD INDICXLIT FOR BETTER TRANSLITERATION (CORRECTED) =
# ================================================================
print("--- Installing and loading IndicXlit for better romanized text handling ---")
# Install IndicXlit (compatible with your transformers==4.40.2); the PyPI
# package name below is taken from the IndicXlit README:
# !pip install ai4bharat-transliteration
from ai4bharat.transliteration import XlitEngine
import torch

try:
    # Load IndicXlit engines for different languages (based on official docs)
    xlit_engines = {
        "hindi": XlitEngine("hi", beam_width=4, rescore=True),
        "bengali": XlitEngine("bn", beam_width=4, rescore=True),
        "tamil": XlitEngine("ta", beam_width=4, rescore=True),
        "telugu": XlitEngine("te", beam_width=4, rescore=True),
        "gujarati": XlitEngine("gu", beam_width=4, rescore=True),
        "kannada": XlitEngine("kn", beam_width=4, rescore=True),
        "malayalam": XlitEngine("ml", beam_width=4, rescore=True),
        "punjabi": XlitEngine("pa", beam_width=4, rescore=True),
        "marathi": XlitEngine("mr", beam_width=4, rescore=True),
        "urdu": XlitEngine("ur", beam_width=4, rescore=True),
    }
    print("✅ Multiple IndicXlit engines loaded successfully.")
except Exception as e:
    print(f"❌ Error loading IndicXlit: {e}")
    print("💡 Falling back to basic transliteration.")
    xlit_engines = {}
# Language-name → ISO code map used by the IndicXlit engines
# (defined once instead of twice inside the function).
XLIT_LANG_CODES = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te",
                   "gujarati": "gu", "kannada": "kn", "malayalam": "ml",
                   "punjabi": "pa", "marathi": "mr", "urdu": "ur"}

def enhanced_transliterate_with_xlit(text, target_lang):
    """
    Enhanced transliteration using IndicXlit (based on official API)
    """
    lang_key = target_lang.lower()
    if not xlit_engines or lang_key not in xlit_engines:
        # Fallback to your existing transliteration
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        script_map = {
            "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI,
            "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU,
            "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM,
            "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI,
            "marathi": sanscript.DEVANAGARI, "urdu": "urdu"
        }
        return transliterate(text, sanscript.ITRANS, script_map.get(lang_key, sanscript.DEVANAGARI))
    try:
        # Use IndicXlit for better transliteration (official API)
        engine = xlit_engines[lang_key]
        lang_code = XLIT_LANG_CODES.get(lang_key, "hi")
        if ' ' in text:
            # For sentences, use translit_sentence (returns dict keyed by lang code)
            result = engine.translit_sentence(text)
            return result.get(lang_code, text)
        else:
            # For single words, use translit_word (returns dict with topk results)
            result = engine.translit_word(text, topk=1)
            return result.get(lang_code, [text])[0]
    except Exception as e:
        print(f"IndicXlit error for '{text}': {e}")
        # Fallback if IndicXlit fails
        return text

print("✅ Enhanced transliteration function defined.")
print("🎉 INDICXLIT SETUP COMPLETE.")
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# EXPANDED language mapping to handle misdetections
LID_TO_TRANSLATE = {
    # Hindi variants
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    # Maithili (often confused with Hindi) - map to Hindi
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    # Bengali variants
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    # Assamese (often confused with Bengali) - map to Bengali
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    # Tamil variants
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    # Telugu variants
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    # Kannada variants
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    # Malayalam variants
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    # Gujarati variants
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    # Punjabi variants
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    # Marathi variants
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    # Urdu variants
    "urd_Arab": {"name": "Urdu", "script": "urdu", "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": "urdu", "it_code": "urd_Arab"},
    # Additional commonly misdetected languages
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi → Hindi
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali → Hindi
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani → Hindi
    "gom_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Goan Konkani → Hindi
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo → Hindi
}
def enhanced_transliterate_robust(text, target_script):
    """
    Enhanced transliteration with better romanization handling
    """
    try:
        # Preprocess text for better transliteration
        cleaned_text = text.lower().strip()
        # Map common informal vowel doublings onto ITRANS long vowels.
        # (Aspirate digraphs such as "kh"/"ch"/"bh" already match ITRANS,
        # so they need no rewriting.)
        replacements = {'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'}
        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)
        # Transliterate using your existing library
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
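# Sanity check of the vowel-normalization + ITRANS path; for simple romanized
# Hindi this should yield Devanagari (exact output depends on the
# indic_transliteration version):
print(enhanced_transliterate_robust("aap kaise hain", sanscript.DEVANAGARI))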
def detect_and_translate_robust(texts, batch_size=64):
    """
    Robust detection and translation with expanded language mapping
    """
    results = []
    preds = lid.batch_predict(texts, batch_size)
    for item in preds:
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item
        is_romanized = lang_code.endswith("_Latn")
        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported for translation"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]
                if is_romanized:
                    # Use enhanced transliteration
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})"
                    print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected as {lang_code})"
                # Translate with IndicTrans2
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang="eng_Latn")  # postprocess with the *target* language
                translation = post[0]
            except Exception as e:
                translation = f"Translation error: {str(e)}"
                method = "Error"
        results.append({
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "translation_method": method,
            "english_translation": translation
        })
    return pd.DataFrame(results)
| print("✅ Robust translation function with expanded language mapping defined") | |
| # Test with the same samples | |
| sample_texts = [ | |
| "यहाँ कितने लोग हैं?", | |
| "tum kaha ho", | |
| "aaj mausam suhana hai", | |
| "aap kaise hain", | |
| "আমি ভালো আছি।", | |
| "ami bhalo achi", | |
| "mera naam rahul hai", | |
| "main office jaa raha hun" | |
| ] | |
| print(f"🔍 Testing robust approach with expanded language mapping...") | |
| df_results = detect_and_translate_robust(sample_texts, batch_size=16) | |
| display(df_results) | |
# ================================================================
# = COMPLETE TEST CODE FOR ALL 22 INDIAN LANGUAGES =
# ================================================================
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Official 22 Indian languages sample sentences (native + romanized)
sample_sentences = {
    "Assamese": ("আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"),
    "Bengali": ("তুমি কেমন আছো?", "tumi kemon acho?"),
    "Bodo": ("नांगनि फाथै खौ?", "nangni phathai kho?"),
    "Dogri": ("तुसीं केहे हो?", "tusi kehe ho?"),
    "Gujarati": ("તમે કેમ છો?", "tame kem cho?"),
    "Hindi": ("तुम कैसे हो?", "tum kaise ho?"),
    "Kannada": ("ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"),
    "Kashmiri": ("तुस की छै?", "tus ki chhai?"),
    "Konkani": ("तुम कशें आसा?", "tum kashen asa?"),
    "Maithili": ("अहाँ कथी छी?", "ahaan kathi chhi?"),
    "Malayalam": ("സുഖമായിരോ?", "sukhamaayiro?"),
    "Manipuri": ("नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"),
    "Marathi": ("तू कसा आहेस?", "tu kasa ahes?"),
    "Nepali": ("तिमी कस्तो छौ?", "timi kasto chau?"),
    "Odia": ("ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"),
    "Punjabi": ("ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"),
    "Sanskrit": ("भवतः कथम् अस्ति?", "bhavatah katham asti?"),
    "Santali": ("ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"),
    "Sindhi": ("توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"),
    "Tamil": ("நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"),
    "Telugu": ("మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"),
    "Urdu": ("آپ کیسے ہیں؟", "aap kaise hain?")
}
# Expanded language mapping (covers common misdetections).
# Re-declared here so this test cell can run standalone.
LID_TO_TRANSLATE = {
    # Hindi variants
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Maithili→Hindi
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali→Hindi
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi→Hindi
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani→Hindi
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo→Hindi
    # Bengali variants
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},  # Assamese→Bengali
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    # Tamil variants
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    # Telugu variants
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    # Kannada variants
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    # Malayalam variants
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    # Gujarati variants
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    # Punjabi variants
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    # Marathi variants
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    # Urdu variants
    "urd_Arab": {"name": "Urdu", "script": "urdu", "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": "urdu", "it_code": "urd_Arab"},
}
def enhanced_transliterate_robust(text, target_script):
    """Enhanced transliteration with better romanization handling"""
    try:
        cleaned_text = text.lower().strip()
        # Map informal vowel doublings onto ITRANS long vowels; aspirate
        # digraphs like "kh"/"bh" already match ITRANS and need no rewriting.
        replacements = {'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'}
        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
def test_all_22_languages(texts, batch_size=32):
    """Complete testing function for all 22 languages"""
    results = []
    preds = lid.batch_predict(texts, batch_size)
    for item in preds:
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item
        is_romanized = lang_code.endswith("_Latn")
        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]
                if is_romanized:
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Transliteration+IndicTrans2 (detected: {lang_code})"
                    print(f"Romanized: '{text}' → '{native_text}'")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected: {lang_code})"
                # Translate with IndicTrans2
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang="eng_Latn")  # postprocess with the *target* language
                translation = post[0]
            except Exception as e:
                translation = f"Translation error: {str(e)}"
                method = "Error"
        results.append({
            # Renamed from "language": this column holds a text snippet, not a language name
            "text_preview": text[:20] + "..." if len(text) > 20 else text,
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "method": method,
            "english_translation": translation
        })
    return pd.DataFrame(results)
# Create test dataset with all 44 samples (22 native + 22 romanized)
print("🔍 Creating test dataset for all 22 official Indian languages...")
all_test_texts = []
for lang, (native, roman) in sample_sentences.items():
    all_test_texts.append(native)
    all_test_texts.append(roman)
print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...")

# Run the complete test
df_results = test_all_22_languages(all_test_texts, batch_size=32)

# Display results
print("\n🎯 COMPLETE TEST RESULTS:")
display(df_results)

# Summary statistics
print("\n📈 SUMMARY STATISTICS:")
print(f"Total samples tested: {len(df_results)}")
print(f"Languages detected: {df_results['detected_lang'].nunique()}")
print(f"Native script samples: {len(df_results[df_results['script_type'] == 'Native'])}")
print(f"Romanized samples: {len(df_results[df_results['script_type'] == 'Romanized'])}")
print(f"Successfully translated: {len(df_results[~df_results['english_translation'].str.contains('error|not supported', case=False)])}")
import pandas as pd

def detailed_translation_summary(df_results):
    """
    Generate comprehensive detailed summary of translation results
    """
    # Flag successful translations
    df_results['successful_translation'] = ~df_results['english_translation'].str.contains('error|not supported', case=False, na=False)

    print("\n=========== OVERALL SUMMARY ===========")
    print(f"Total samples tested: {len(df_results)}")
    print(f"Languages detected: {df_results['detected_lang'].nunique()}")
    print(f"Native script samples: {df_results[df_results['script_type'] == 'Native'].shape[0]}")
    print(f"Romanized samples: {df_results[df_results['script_type'] == 'Romanized'].shape[0]}")  # .shape[0] for a count, not the shape tuple
    print(f"Successfully translated: {df_results['successful_translation'].sum()}")
    overall_success_rate = (df_results['successful_translation'].sum() / len(df_results) * 100)
    print(f"Overall success rate: {overall_success_rate:.1f}%")
| print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========") | |
| # Per-language analysis | |
| lang_summary = df_results.groupby('detected_lang').agg( | |
| total_samples=('original_text', 'count'), | |
| native_count=('script_type', lambda x: (x == 'Native').sum()), | |
| romanized_count=('script_type', lambda x: (x == 'Romanized').sum()), | |
| mean_confidence=('confidence', lambda x: pd.to_numeric(x, errors='coerce').mean()), | |
| success=('successful_translation', 'sum'), | |
| error_count=('successful_translation', lambda x: (~x).sum()) | |
| ).reset_index().sort_values('total_samples', ascending=False) | |
| lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1) | |
| print(lang_summary) | |
| print("\n=========== TOP PERFORMING LANGUAGES ===========") | |
| top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False) | |
| if len(top_performers) > 0: | |
| print(top_performers[['detected_lang', 'total_samples', 'success_rate']]) | |
| else: | |
| print("No languages with 90%+ success rate") | |
| print("\n=========== CHALLENGING LANGUAGES ===========") | |
| challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate') | |
| if len(challenging) > 0: | |
| print(challenging[['detected_lang', 'total_samples', 'success_rate']]) | |
| else: | |
| print("No languages with <50% success rate") | |
| print("\n=========== ERROR ANALYSIS ===========") | |
| error_df = df_results[~df_results['successful_translation']] | |
| print(f"Total errors: {len(error_df)}") | |
| if len(error_df) > 0: | |
| print("\nError samples:") | |
| print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']]) | |
| else: | |
| print("No errors found!") | |
| print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========") | |
| script_summary = df_results.groupby('script_type').agg( | |
| total_samples=('original_text', 'count'), | |
| successful=('successful_translation', 'sum'), | |
| success_rate=('successful_translation', lambda x: x.mean() * 100) | |
| ).round(1) | |
| print(script_summary) | |
| print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========") | |
| confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False) | |
| print("Top 10 most confident detections:") | |
| print(confidence_summary.head(10)) | |
| return lang_summary, script_summary, error_df | |
# ===== HOW TO USE =====
print("✅ Detailed summary function defined")
print("\n📋 To run on your test results:")
print("  lang_summary, script_summary, error_df = detailed_translation_summary(df_results)")
print("  display(lang_summary)")
print("  display(error_df)")

lang_summary, script_summary, error_df = detailed_translation_summary(df_results)
display(lang_summary)
display(error_df)