# Copy of final
# ================================================================
# = STEP 1: SETUP AND DOWNLOAD (YOUR PROVEN METHOD) =
# ================================================================
import os

# NOTE: this file is a notebook export; the `!pip install ...`, `!git clone ...`,
# `!wget ...` and `!unzip ...` shell magics that did the actual work in the
# sub-steps below do not survive as plain Python, so only the status prints remain.
print("--- 1. Installing All Libraries ---")
print("✅ Libraries installed.")

print("\n--- 2. Cloning IndicLID Repository ---")
# Using your proven method of changing directories
print("✅ Repository cloned.")

# Navigate into the correct directory structure
print("\n--- 3. Downloading and Unzipping IndicLID Models ---")
print("✅ Download commands executed. Unzipping now...")
print("✅ Unzip commands executed.")

print("\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉")
# ================================================================
# = STEP 2: INITIALIZE MODELS (EXACTLY AS YOUR OLD CODE) =
# ================================================================
import os
import sys
import torch
print("--- Applying your original add_safe_globals fix... ---")
if "/content/IndicLID/Inference" not in sys.path:
sys.path.append("/content/IndicLID/Inference")
from transformers.models.bert.modeling_bert import (
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
)
from transformers.models.bert.configuration_bert import BertConfig
import torch.nn as nn
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.container import ModuleList
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules.dropout import Dropout

torch.serialization.add_safe_globals([
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
    Embedding, ModuleList, Linear, LayerNorm, Dropout,
])
print("✅ Comprehensive safe globals added successfully.")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from ai4bharat.IndicLID import IndicLID
print("--- Loading all models into memory... ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
print("✅ IndicLID model loaded successfully.")
MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
ip = IndicProcessor(inference=True)
print("✅ IndicTrans2 1B model loaded.")
print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.")
import sys
print(sys.path)
# Sanity check from the notebook: `!pip show transformers` (a shell command,
# not valid Python, so it is kept as a comment here).
# ================================================================
# = STEP 2.5: LOAD ROMANSETU (COMPATIBLE WITH 4.40.2) =
# ================================================================
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---")
# Try smaller, more compatible models first
model_options = [
    "ai4bharat/romansetu-cpt-roman-100m",
    "ai4bharat/romansetu-cpt-roman-200m"
]
rs_model = None
rs_tokenizer = None
for model_id in model_options:
    try:
        print(f"Trying model: {model_id}")
        rs_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        rs_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
        print(f"✅ {model_id} loaded successfully.")
        break
    except Exception as e:
        print(f"❌ {model_id} failed: {e}")
        continue

if rs_model is None:
    print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.")
def translate_with_romansetu(text, max_new_tokens=50):
    if rs_model is None:
        # Fallback: use enhanced transliteration + IndicTrans2
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        try:
            # Try to transliterate and then translate with IndicTrans2
            native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
            pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn")
            inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
            with torch.no_grad():
                out = model.generate(**inputs, num_beams=3, max_length=100)
            dec = tokenizer.batch_decode(out, skip_special_tokens=True)
            post = ip.postprocess_batch(dec, lang="hin_Deva")
            return post[0]
        except Exception:
            return text
    try:
        prompt = f"Translate this romanized Indian text to English: {text}"
        inputs = rs_tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = rs_model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=max_new_tokens,
                num_beams=2,
                temperature=0.7,
                do_sample=True,
                pad_token_id=rs_tokenizer.eos_token_id
            )
        # generate() returns a batch of sequences; decode the first (only) one.
        full_response = rs_tokenizer.decode(outputs[0], skip_special_tokens=True)
        translation = full_response.replace(prompt, "").strip()
        return translation if translation and len(translation) > 2 else text
    except Exception:
        return text
print("✅ RomanSetu/fallback translation function defined.")
print("🎉 SETUP COMPLETE with fallback mechanism.")
# ================================================================
# = STEP 2.6: LOAD INDICXLIT FOR BETTER TRANSLITERATION (CORRECTED) =
# ================================================================
print("--- Installing and loading IndicXlit for better romanized text handling ---")
# Install IndicXlit (compatible with your transformers==4.40.2)
from ai4bharat.transliteration import XlitEngine
import torch
try:
    # Load IndicXlit engines for different languages (based on official docs)
    xlit_engines = {
        "hindi": XlitEngine("hi", beam_width=4, rescore=True),
        "bengali": XlitEngine("bn", beam_width=4, rescore=True),
        "tamil": XlitEngine("ta", beam_width=4, rescore=True),
        "telugu": XlitEngine("te", beam_width=4, rescore=True),
        "gujarati": XlitEngine("gu", beam_width=4, rescore=True),
        "kannada": XlitEngine("kn", beam_width=4, rescore=True),
        "malayalam": XlitEngine("ml", beam_width=4, rescore=True),
        "punjabi": XlitEngine("pa", beam_width=4, rescore=True),
        "marathi": XlitEngine("mr", beam_width=4, rescore=True),
        "urdu": XlitEngine("ur", beam_width=4, rescore=True),
    }
    print("✅ Multiple IndicXlit engines loaded successfully.")
except Exception as e:
    print(f"❌ Error loading IndicXlit: {e}")
    print("💡 Falling back to basic transliteration.")
    xlit_engines = {}
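# Direct engine usage per the ai4bharat-transliteration API (the return shapes
# shown here are assumptions, mirrored by the helper defined next):
#     xlit_engines["hindi"].translit_word("namaste", topk=3)    # -> {"hi": [...candidates...]}
#     xlit_engines["hindi"].translit_sentence("tum kaise ho")   # -> {"hi": "..."}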
def enhanced_transliterate_with_xlit(text, target_lang):
    """
    Enhanced transliteration using IndicXlit (based on official API)
    """
    lang_key = target_lang.lower()
    # Language code for each engine (shared by the word and sentence paths)
    lang_codes = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te",
                  "gujarati": "gu", "kannada": "kn", "malayalam": "ml",
                  "punjabi": "pa", "marathi": "mr", "urdu": "ur"}
    if not xlit_engines or lang_key not in xlit_engines:
        # Fallback to your existing transliteration
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        script_map = {
            "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI,
            "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU,
            "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM,
            "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI,
            "marathi": sanscript.DEVANAGARI, "urdu": 'urdu'
        }
        return transliterate(text, sanscript.ITRANS, script_map.get(lang_key, sanscript.DEVANAGARI))
    try:
        # Use IndicXlit for better transliteration (official API)
        engine = xlit_engines[lang_key]
        lang_code = lang_codes.get(lang_key, "hi")
        if ' ' in text:
            # For sentences, use translit_sentence (returns dict with lang code as key)
            result = engine.translit_sentence(text)
            return result.get(lang_code, text)
        else:
            # For single words, use translit_word (returns dict with topk results)
            result = engine.translit_word(text, topk=1)
            return result.get(lang_code, [text])[0]
    except Exception as e:
        print(f"IndicXlit error for '{text}': {e}")
        # Fallback if IndicXlit fails
        return text
print("✅ Enhanced transliteration function defined.")
print("🎉 INDICXLIT SETUP COMPLETE.")
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from IPython.display import display  # display() is used below and is IPython-only
# EXPANDED language mapping to handle misdetections
LID_TO_TRANSLATE = {
    # Hindi variants
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    # Maithili (often confused with Hindi) - map to Hindi
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    # Bengali variants
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    # Assamese (often confused with Bengali) - map to Bengali
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    # Tamil variants
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    # Telugu variants
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    # Kannada variants
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    # Malayalam variants
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    # Gujarati variants
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    # Punjabi variants
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    # Marathi variants
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    # Urdu variants
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    # Additional commonly misdetected languages
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi → Hindi
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali → Hindi
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani → Hindi
    "gom_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Goan Konkani → Hindi
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo → Hindi
}
def enhanced_transliterate_robust(text, target_script):
    """
    Enhanced transliteration with better romanization handling
    """
    try:
        # Preprocess text for better transliteration
        cleaned_text = text.lower().strip()
        # Handle common romanization patterns (the consonant digraph entries
        # are identity mappings; only the vowel entries rewrite the text)
        replacements = {
            'kh': 'kh', 'ch': 'ch', 'th': 'th', 'ph': 'ph',
            'bh': 'bh', 'dh': 'dh', 'gh': 'gh', 'jh': 'jh',
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }
        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)
        # Transliterate using your existing library
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
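# Example (hypothetical input): enhanced_transliterate_robust("aaj mausam suhana hai",
# sanscript.DEVANAGARI) rewrites long vowels ("aa" -> "A", "ee" -> "I", ...) and then
# applies the ITRANS -> Devanagari mapping, giving IndicTrans2 native-script input.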
def detect_and_translate_robust(texts, batch_size=64):
    """
    Robust detection and translation with expanded language mapping
    """
    results = []
    preds = lid.batch_predict(texts, batch_size)
    for item in preds:
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item
        is_romanized = lang_code.endswith("_Latn")
        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported for translation"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]
                if is_romanized:
                    # Use enhanced transliteration
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})"
                    print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected as {lang_code})"
                # Translate with IndicTrans2
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang=src_code)
                translation = post[0]
            except Exception as e:
                translation = f"Translation error: {str(e)}"
                method = "Error"
        results.append({
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "translation_method": method,
            "english_translation": translation
        })
    return pd.DataFrame(results)
print("✅ Robust translation function with expanded language mapping defined")
# Test with the same samples
sample_texts = [
    "यहाँ कितने लोग हैं?",
    "tum kaha ho",
    "aaj mausam suhana hai",
    "aap kaise hain",
    "আমি ভালো আছি।",
    "ami bhalo achi",
    "mera naam rahul hai",
    "main office jaa raha hun"
]
print(f"🔍 Testing robust approach with expanded language mapping...")
df_results = detect_and_translate_robust(sample_texts, batch_size=16)
display(df_results)
# ================================================================
# = COMPLETE TEST CODE FOR ALL 22 INDIAN LANGUAGES =
# ================================================================
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
# Official 22 Indian languages sample sentences (native + romanized)
sample_sentences = {
    "Assamese": ("আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"),
    "Bengali": ("তুমি কেমন আছো?", "tumi kemon acho?"),
    "Bodo": ("नांगनि फाथै खौ?", "nangni phathai kho?"),
    "Dogri": ("तुसीं केहे हो?", "tusi kehe ho?"),
    "Gujarati": ("તમે કેમ છો?", "tame kem cho?"),
    "Hindi": ("तुम कैसे हो?", "tum kaise ho?"),
    "Kannada": ("ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"),
    "Kashmiri": ("तुस की छै?", "tus ki chhai?"),
    "Konkani": ("तुम कशें आसा?", "tum kashen asa?"),
    "Maithili": ("अहाँ कथी छी?", "ahaan kathi chhi?"),
    "Malayalam": ("സുഖമായിരോ?", "sukhamaayiro?"),
    "Manipuri": ("नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"),
    "Marathi": ("तू कसा आहेस?", "tu kasa ahes?"),
    "Nepali": ("तिमी कस्तो छौ?", "timi kasto chau?"),
    "Odia": ("ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"),
    "Punjabi": ("ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"),
    "Sanskrit": ("भवतः कथम् अस्ति?", "bhavatah katham asti?"),
    "Santali": ("ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"),
    "Sindhi": ("توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"),
    "Tamil": ("நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"),
    "Telugu": ("మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"),
    "Urdu": ("آپ کیسے ہیں؟", "aap kaise hain?")
}
# Expanded language mapping (covers common misdetections)
LID_TO_TRANSLATE = {
    # Hindi variants
    "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Maithili→Hindi
    "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
    "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali→Hindi
    "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi→Hindi
    "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani→Hindi
    "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo→Hindi
    # Bengali variants
    "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},  # Assamese→Bengali
    "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
    # Tamil variants
    "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
    # Telugu variants
    "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
    # Kannada variants
    "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
    # Malayalam variants
    "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
    # Gujarati variants
    "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
    # Punjabi variants
    "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
    # Marathi variants
    "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
    # Urdu variants
    "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
    "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
}
def enhanced_transliterate_robust(text, target_script):
    """Enhanced transliteration with better romanization handling"""
    try:
        cleaned_text = text.lower().strip()
        replacements = {
            'kh': 'kh', 'ch': 'ch', 'th': 'th', 'ph': 'ph',
            'bh': 'bh', 'dh': 'dh', 'gh': 'gh', 'jh': 'jh',
            'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
        }
        for old, new in replacements.items():
            cleaned_text = cleaned_text.replace(old, new)
        result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
        return result if result else text
    except Exception as e:
        print(f"Transliteration error: {e}")
        return text
def test_all_22_languages(texts, batch_size=32):
    """Complete testing function for all 22 languages"""
    results = []
    preds = lid.batch_predict(texts, batch_size)
    for item in preds:
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
            model_name = item.get("model", "")
        else:
            text, lang_code, score, model_name = item
        is_romanized = lang_code.endswith("_Latn")
        if lang_code not in LID_TO_TRANSLATE:
            translation = f"Language '{lang_code}' not supported"
            method = "Unsupported"
        else:
            try:
                lang_info = LID_TO_TRANSLATE[lang_code]
                src_code = lang_info["it_code"]
                if is_romanized:
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Transliteration+IndicTrans2 (detected: {lang_code})"
                    print(f"Romanized: '{text}' → '{native_text}'")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected: {lang_code})"
                # Translate with IndicTrans2
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                post = ip.postprocess_batch(dec, lang=src_code)
                translation = post[0]
            except Exception as e:
                translation = f"Translation error: {str(e)}"
                method = "Error"
        results.append({
            "sample": text[:20] + "..." if len(text) > 20 else text,  # truncated preview of the input text
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "method": method,
            "english_translation": translation
        })
    return pd.DataFrame(results)
# Create test dataset with all 44 samples (22 native + 22 romanized)
print("🔍 Creating test dataset for all 22 official Indian languages...")
all_test_texts = []
for lang, (native, roman) in sample_sentences.items():
    all_test_texts.append(native)
    all_test_texts.append(roman)
print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...")
# Run the complete test
df_results = test_all_22_languages(all_test_texts, batch_size=32)
# Display results
print("\n🎯 COMPLETE TEST RESULTS:")
display(df_results)
# Summary statistics
print(f"\n📈 SUMMARY STATISTICS:")
print(f"Total samples tested: {len(df_results)}")
print(f"Languages detected: {df_results['detected_lang'].nunique()}")
print(f"Native script samples: {len(df_results[df_results['script_type'] == 'Native'])}")
print(f"Romanized samples: {len(df_results[df_results['script_type'] == 'Romanized'])}")
print(f"Successfully translated: {len(df_results[~df_results['english_translation'].str.contains('error|not supported', case=False)])}")
import pandas as pd

def detailed_translation_summary(df_results):
    """
    Generate comprehensive detailed summary of translation results
    """
    # Flag successful translations
    df_results['successful_translation'] = ~df_results['english_translation'].str.contains('error|not supported', case=False, na=False)
    print("\n=========== OVERALL SUMMARY ===========")
    print(f"Total samples tested: {len(df_results)}")
    print(f"Languages detected: {df_results['detected_lang'].nunique()}")
    print(f"Native script samples: {df_results[df_results['script_type'] == 'Native'].shape[0]}")
    print(f"Romanized samples: {df_results[df_results['script_type'] == 'Romanized'].shape[0]}")
    print(f"Successfully translated: {df_results['successful_translation'].sum()}")
    overall_success_rate = (df_results['successful_translation'].sum() / len(df_results) * 100)
    print(f"Overall success rate: {overall_success_rate:.1f}%")

    print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========")
    # Per-language analysis
    lang_summary = df_results.groupby('detected_lang').agg(
        total_samples=('original_text', 'count'),
        native_count=('script_type', lambda x: (x == 'Native').sum()),
        romanized_count=('script_type', lambda x: (x == 'Romanized').sum()),
        mean_confidence=('confidence', lambda x: pd.to_numeric(x, errors='coerce').mean()),
        success=('successful_translation', 'sum'),
        error_count=('successful_translation', lambda x: (~x).sum())
    ).reset_index().sort_values('total_samples', ascending=False)
    lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1)
    print(lang_summary)

    print("\n=========== TOP PERFORMING LANGUAGES ===========")
    top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False)
    if len(top_performers) > 0:
        print(top_performers[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with 90%+ success rate")

    print("\n=========== CHALLENGING LANGUAGES ===========")
    challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate')
    if len(challenging) > 0:
        print(challenging[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with <50% success rate")

    print("\n=========== ERROR ANALYSIS ===========")
    error_df = df_results[~df_results['successful_translation']]
    print(f"Total errors: {len(error_df)}")
    if len(error_df) > 0:
        print("\nError samples:")
        print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']])
    else:
        print("No errors found!")

    print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========")
    script_summary = df_results.groupby('script_type').agg(
        total_samples=('original_text', 'count'),
        successful=('successful_translation', 'sum'),
        success_rate=('successful_translation', lambda x: x.mean() * 100)
    ).round(1)
    print(script_summary)

    print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========")
    confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False)
    print("Top 10 most confident detections:")
    print(confidence_summary.head(10))

    return lang_summary, script_summary, error_df
# ===== HOW TO USE =====
print("✅ Detailed summary function defined")
print("\n📋 To run on your test results:")
print(" lang_summary, script_summary, error_df = detailed_translation_summary(df_results)")
print(" display(lang_summary)")
print(" display(error_df)")
lang_summary, script_summary, error_df = detailed_translation_summary(df_results)
display(lang_summary)
display(error_df)