Spaces:

kasimali
/

copy-of-final

Runtime error

File size: 29,729 Bytes

b3a699a

# Copy of final

# ================================================================
# = STEP 1: SETUP AND DOWNLOAD (YOUR PROVEN METHOD)              =
# ================================================================
import os

# Step 1 progress banners.
# NOTE(review): this step only prints status text. The install / clone /
# download / unzip commands the banners describe are not present in this
# file (presumably notebook shell lines dropped on export) — confirm and
# restore them before relying on these messages.
print(
    "--- 1. Installing All Libraries ---",
    "✅ Libraries installed.",
    "\n--- 2. Cloning IndicLID Repository ---",
    "✅ Repository cloned.",
    "\n--- 3. Downloading and Unzipping IndicLID Models ---",
    "✅ Download commands executed. Unzipping now...",
    "✅ Unzip commands executed.",
    "\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉",
    sep="\n",
)


# =========================
# = STEP 2: INITIALIZE MODELS (EXACTLY AS YOUR OLD CODE) =
# =========================
import os
import sys
import torch
# Model initialisation: registers BERT/torch classes as safe for
# deserialisation, then loads the IndicLID detector and the IndicTrans2
# Indic->English model. Statement order matters here: the sys.path tweak must
# precede the `ai4bharat.IndicLID` import, and add_safe_globals must run
# before IndicLID() is constructed.
print("--- Applying your original add_safe_globals fix... ---")

# Make the cloned IndicLID inference package importable.
# NOTE(review): hard-coded Colab-style path — confirm it exists in the target
# environment.
if "/content/IndicLID/Inference" not in sys.path:
    sys.path.append("/content/IndicLID/Inference")

from transformers.models.bert.modeling_bert import (
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
)
from transformers.models.bert.configuration_bert import BertConfig
import torch.nn as nn
from torch.nn.modules.sparse import Embedding
from torch.nn.modules.container import ModuleList
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.modules.dropout import Dropout

# Allow-list these classes for torch's restricted (weights_only) unpickler.
# NOTE(review): presumably needed because the IndicLID checkpoints pickle
# whole module objects rather than plain state dicts — confirm.
torch.serialization.add_safe_globals([
    BertModel, BertPreTrainedModel, BertForSequenceClassification,
    BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
    BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
    Embedding, ModuleList, Linear, LayerNorm, Dropout,
])
print("✅ Comprehensive safe globals added successfully.")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from IndicTransToolkit.processor import IndicProcessor
from ai4bharat.IndicLID import IndicLID

print("--- Loading all models into memory... ---")
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Language identifier; `lid` is used by the detection functions further down.
lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
print("✅ IndicLID model loaded successfully.")

# Indic -> English translation model plus its tokenizer and text processor.
# trust_remote_code=True executes model code shipped in the Hub repository.
MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
ip = IndicProcessor(inference=True)
print("✅ IndicTrans2 1B model loaded.")

print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.")


# Environment diagnostics: print the import search path and the metadata of
# the installed `transformers` package.
import subprocess
import sys

print(sys.path)

# A bare `pip show transformers` is shell/IPython syntax and a SyntaxError in
# a Python file, so pip is invoked through the running interpreter instead.
# check=False keeps this purely informational: a missing package or pip
# failure must not abort the script.
subprocess.run([sys.executable, "-m", "pip", "show", "transformers"], check=False)



# ================================================================
# = STEP 2.5: LOAD ROMANSETU (COMPATIBLE WITH 4.40.2)           =
# ================================================================

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---")

# Candidate checkpoints, tried in order until one loads.
# NOTE(review): confirm these repository ids actually exist on the Hub.
model_options = [
    "ai4bharat/romansetu-cpt-roman-100m",
    "ai4bharat/romansetu-cpt-roman-200m"
]

# Both stay None when no candidate can be loaded; translate_with_romansetu
# checks rs_model to decide whether to use its fallback path.
rs_model = rs_tokenizer = None

for model_id in model_options:
    try:
        print(f"Trying model: {model_id}")
        rs_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        rs_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
        ).to(device)
        print(f"✅ {model_id} loaded successfully.")
    except Exception as load_error:
        # Any failure (download, config, memory, ...) just moves on to the
        # next candidate.
        print(f"❌ {model_id} failed: {load_error}")
    else:
        break

if rs_model is None:
    print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.")

def _translate_romanized_hindi_fallback(text):
    """Transliterate ITRANS-style text to Devanagari and translate it with IndicTrans2.

    Used when no RomanSetu model is loaded. The input is always treated as
    Hindi. Returns the English translation, or ``text`` unchanged when any
    step fails.
    """
    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate

    try:
        native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
        pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn")
        inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            out = model.generate(**inputs, num_beams=3, max_length=100)
        dec = tokenizer.batch_decode(out, skip_special_tokens=True)
        # Post-processing must be told the *target* language (English here),
        # not the source language.
        post = ip.postprocess_batch(dec, lang="eng_Latn")
        return post[0]
    except Exception as exc:
        # Best-effort path: report the problem and hand back the input.
        print(f"RomanSetu fallback translation failed for '{text}': {exc}")
        return text


def translate_with_romansetu(text, max_new_tokens=50):
    """Translate romanized Indic text to English.

    Uses the RomanSetu causal LM when one was loaded (``rs_model``); otherwise
    falls back to transliteration followed by IndicTrans2.

    Args:
        text: Romanized input sentence.
        max_new_tokens: Generation budget for the RomanSetu model.

    Returns:
        The English translation, or ``text`` unchanged when generation fails
        or yields fewer than three characters.
    """
    if rs_model is None:
        return _translate_romanized_hindi_fallback(text)

    try:
        prompt = f"Translate this romanized Indian text to English: {text}"
        inputs = rs_tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            # NOTE: do_sample=True makes the output non-deterministic.
            outputs = rs_model.generate(
                inputs.input_ids,
                max_new_tokens=max_new_tokens,
                num_beams=2,
                temperature=0.7,
                do_sample=True,
                pad_token_id=rs_tokenizer.eos_token_id
            )

        # generate() returns a batch of sequences that start with the prompt
        # tokens. Decode only the first sequence and only its newly generated
        # tail; decoding the whole batch tensor at once is not valid.
        prompt_length = inputs.input_ids.shape[-1]
        translation = rs_tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()
        return translation if len(translation) > 2 else text

    except Exception as exc:
        print(f"RomanSetu translation failed for '{text}': {exc}")
        return text

print("✅ RomanSetu/fallback translation function defined.")
print("🎉 SETUP COMPLETE with fallback mechanism.")


# ================================================================
# = STEP 2.6: LOAD INDICXLIT FOR BETTER TRANSLITERATION (CORRECTED) =
# ================================================================

print("--- Installing and loading IndicXlit for better romanized text handling ---")

# Install IndicXlit (compatible with your transformers==4.40.2)
# NOTE(review): no install command is present here — confirm the package is
# installed by other means.

from ai4bharat.transliteration import XlitEngine
import torch

# One engine per supported language, keyed by lower-case language name.
# If any engine fails to build, the whole table is dropped and callers fall
# back to basic transliteration.
try:
    xlit_engines = {
        language: XlitEngine(iso_code, beam_width=4, rescore=True)
        for language, iso_code in (
            ("hindi", "hi"),
            ("bengali", "bn"),
            ("tamil", "ta"),
            ("telugu", "te"),
            ("gujarati", "gu"),
            ("kannada", "kn"),
            ("malayalam", "ml"),
            ("punjabi", "pa"),
            ("marathi", "mr"),
            ("urdu", "ur"),
        )
    }
    print("✅ Multiple IndicXlit engines loaded successfully.")
except Exception as load_error:
    print(f"❌ Error loading IndicXlit: {load_error}")
    print("💡 Falling back to basic transliteration.")
    xlit_engines = {}

def enhanced_transliterate_with_xlit(text, target_lang):
    """Transliterate romanized ``text`` into the script of ``target_lang``.

    Uses the IndicXlit engine for ``target_lang`` when one is loaded in
    ``xlit_engines``; otherwise falls back to ITRANS-based transliteration
    with ``indic_transliteration``.

    Args:
        text: Romanized word or sentence.
        target_lang: Language name (case-insensitive), e.g. ``"Hindi"``.

    Returns:
        The transliterated string, or ``text`` unchanged when transliteration
        fails or produces no candidate.
    """
    lang_key = target_lang.lower()

    if not xlit_engines or lang_key not in xlit_engines:
        # No IndicXlit engine for this language: rule-based fallback.
        from indic_transliteration import sanscript
        from indic_transliteration.sanscript import transliterate
        # NOTE(review): 'urdu' is passed through as a raw scheme name —
        # confirm indic_transliteration supports it.
        script_map = {
            "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI,
            "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU,
            "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM,
            "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI,
            "marathi": sanscript.DEVANAGARI, "urdu": 'urdu'
        }
        target_script = script_map.get(lang_key, sanscript.DEVANAGARI)
        try:
            return transliterate(text, sanscript.ITRANS, target_script)
        except Exception as e:
            # Same best-effort contract as the IndicXlit path below.
            print(f"Fallback transliteration error for '{text}': {e}")
            return text

    # IndicXlit results are dicts keyed by the engine's two-letter code.
    lang_codes = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te",
                  "gujarati": "gu", "kannada": "kn", "malayalam": "ml",
                  "punjabi": "pa", "marathi": "mr", "urdu": "ur"}

    try:
        engine = xlit_engines[lang_key]
        lang_code = lang_codes.get(lang_key, "hi")

        if ' ' in text:
            # Sentences: the result maps the language code to one string.
            result = engine.translit_sentence(text)
            return result.get(lang_code, text)

        # Single words: the result maps the language code to a candidate
        # list; an empty or missing list falls back to the input.
        result = engine.translit_word(text, topk=1)
        candidates = result.get(lang_code) or [text]
        return candidates[0]

    except Exception as e:
        print(f"IndicXlit error for '{text}': {e}")
        # Fallback if IndicXlit fails
        return text

print("✅ Enhanced transliteration function defined.")
print("🎉 INDICXLIT SETUP COMPLETE.")


import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Detected language label -> how to translate it. Each row is
# (detected label, display name, native script for transliteration,
#  IndicTrans2 source code). Labels that are commonly confused with a major
# language are routed to that language instead of being rejected.
LID_TO_TRANSLATE = {
    lid_code: {"name": name, "script": script, "it_code": it_code}
    for lid_code, name, script, it_code in (
        # Hindi
        ("hin_Deva", "Hindi", sanscript.DEVANAGARI, "hin_Deva"),
        ("hin_Latn", "Hindi", sanscript.DEVANAGARI, "hin_Deva"),
        # Maithili (often confused with Hindi) -> Hindi
        ("mai_Deva", "Hindi", sanscript.DEVANAGARI, "hin_Deva"),
        ("mai_Latn", "Hindi", sanscript.DEVANAGARI, "hin_Deva"),
        # Bengali
        ("ben_Beng", "Bengali", sanscript.BENGALI, "ben_Beng"),
        ("ben_Latn", "Bengali", sanscript.BENGALI, "ben_Beng"),
        # Assamese (often confused with Bengali) -> Bengali
        ("asm_Beng", "Bengali", sanscript.BENGALI, "ben_Beng"),
        ("asm_Latn", "Bengali", sanscript.BENGALI, "ben_Beng"),
        # Tamil (two spellings of the native-script label are accepted)
        ("tam_Tamil", "Tamil", sanscript.TAMIL, "tam_Taml"),
        ("tam_Taml", "Tamil", sanscript.TAMIL, "tam_Taml"),
        ("tam_Latn", "Tamil", sanscript.TAMIL, "tam_Taml"),
        # Telugu
        ("tel_Telu", "Telugu", sanscript.TELUGU, "tel_Telu"),
        ("tel_Latn", "Telugu", sanscript.TELUGU, "tel_Telu"),
        # Kannada
        ("kan_Knda", "Kannada", sanscript.KANNADA, "kan_Knda"),
        ("kan_Latn", "Kannada", sanscript.KANNADA, "kan_Knda"),
        # Malayalam
        ("mal_Mlym", "Malayalam", sanscript.MALAYALAM, "mal_Mlym"),
        ("mal_Latn", "Malayalam", sanscript.MALAYALAM, "mal_Mlym"),
        # Gujarati
        ("guj_Gujr", "Gujarati", sanscript.GUJARATI, "guj_Gujr"),
        ("guj_Latn", "Gujarati", sanscript.GUJARATI, "guj_Gujr"),
        # Punjabi
        ("pan_Guru", "Punjabi", sanscript.GURMUKHI, "pan_Guru"),
        ("pan_Latn", "Punjabi", sanscript.GURMUKHI, "pan_Guru"),
        # Marathi
        ("mar_Deva", "Marathi", sanscript.DEVANAGARI, "mar_Deva"),
        ("mar_Latn", "Marathi", sanscript.DEVANAGARI, "mar_Deva"),
        # Urdu
        ("urd_Arab", "Urdu", 'urdu', "urd_Arab"),
        ("urd_Latn", "Urdu", 'urdu', "urd_Arab"),
        # Other romanized labels that are routed to Hindi
        ("snd_Latn", "Hindi", sanscript.DEVANAGARI, "hin_Deva"),  # Sindhi
        ("nep_Latn", "Hindi", sanscript.DEVANAGARI, "hin_Deva"),  # Nepali
        ("kok_Latn", "Hindi", sanscript.DEVANAGARI, "hin_Deva"),  # Konkani
        ("gom_Latn", "Hindi", sanscript.DEVANAGARI, "hin_Deva"),  # Goan Konkani
        ("brx_Latn", "Hindi", sanscript.DEVANAGARI, "hin_Deva"),  # Bodo
    )
}

def enhanced_transliterate_robust(text, target_script):
    """
    Transliterate casual romanized text into ``target_script``.

    The text is lower-cased and trimmed, doubled vowels are rewritten to
    their ITRANS long-vowel spellings, and the result is passed to
    ``transliterate`` with ITRANS as the source scheme. On any error, or when
    the output is empty, the original ``text`` is returned.
    """
    # Only these digraphs are actually rewritten. The consonant digraphs
    # (kh, ch, th, ph, bh, dh, gh, jh) were mapped to themselves in the
    # earlier table, so they pass through unchanged.
    vowel_rewrites = (("aa", "A"), ("ee", "I"), ("oo", "U"), ("ou", "au"))

    try:
        itrans_text = text.lower().strip()
        for digraph, itrans_form in vowel_rewrites:
            itrans_text = itrans_text.replace(digraph, itrans_form)
        converted = transliterate(itrans_text, sanscript.ITRANS, target_script)
    except Exception as exc:
        print(f"Transliteration error: {exc}")
        return text

    return converted or text

def detect_and_translate_robust(texts, batch_size=64):
    """
    Detect the language of each text and translate it to English.

    Each prediction from ``lid.batch_predict`` is looked up in
    ``LID_TO_TRANSLATE``. Romanized input (label ending in ``_Latn``) is first
    transliterated to the native script, then translated with IndicTrans2.

    Args:
        texts: Input sentences.
        batch_size: Batch size passed to the language identifier.

    Returns:
        A DataFrame with one row per prediction and the columns
        original_text, detected_lang, script_type, confidence,
        translation_method and english_translation. Unsupported labels and
        translation failures are reported in english_translation rather than
        raised.
    """
    target_code = "eng_Latn"
    results = []
    preds = lid.batch_predict(texts, batch_size)

    for item in preds:
        # Predictions are handled both as dicts and as 4-item sequences.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
        else:
            text, lang_code, score, _model_name = item

        is_romanized = lang_code.endswith("_Latn")
        lang_info = LID_TO_TRANSLATE.get(lang_code)

        if lang_info is None:
            translation = f"Language '{lang_code}' not supported for translation"
            method = "Unsupported"
        else:
            try:
                src_code = lang_info["it_code"]

                if is_romanized:
                    # Use enhanced transliteration
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})"
                    print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected as {lang_code})"

                # Translate with IndicTrans2
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang=target_code)
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                # Post-processing takes the language of the *output* text,
                # i.e. the target language, not the source language.
                post = ip.postprocess_batch(dec, lang=target_code)
                translation = post[0]

            except Exception as e:
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "translation_method": method,
            "english_translation": translation
        })

    return pd.DataFrame(results)

print("✅ Robust translation function with expanded language mapping defined")

# Smoke test: a few Hindi/Bengali sentences in native and romanized form.
sample_texts = [
    "यहाँ कितने लोग हैं?",
    "tum kaha ho",
    "aaj mausam suhana hai",
    "aap kaise hain",
    "আমি ভালো আছি।",
    "ami bhalo achi",
    "mera naam rahul hai",
    "main office jaa raha hun"
]

print(f"🔍 Testing robust approach with expanded language mapping...")
df_results = detect_and_translate_robust(sample_texts, batch_size=16)

# `display` only exists inside IPython/Jupyter; fall back to print so the
# file also runs as a plain script instead of raising NameError.
try:
    _show = display
except NameError:
    _show = print
_show(df_results)


# ================================================================
# = COMPLETE TEST CODE FOR ALL 22 INDIAN LANGUAGES              =
# ================================================================

import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Official 22 Indian languages sample sentences (native + romanized)
# Test fixtures: language name -> (native-script sentence, romanized sentence).
# NOTE(review): the sentences are not validated anywhere in this file. Some
# entries (e.g. Kashmiri, Manipuri) are written in Devanagari — confirm that
# matches the script the detector is expected to see for those languages.
sample_sentences = {
    "Assamese": ("আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"),
    "Bengali": ("তুমি কেমন আছো?", "tumi kemon acho?"),
    "Bodo": ("नांगनि फाथै खौ?", "nangni phathai kho?"),
    "Dogri": ("तुसीं केहे हो?", "tusi kehe ho?"),
    "Gujarati": ("તમે કેમ છો?", "tame kem cho?"),
    "Hindi": ("तुम कैसे हो?", "tum kaise ho?"),
    "Kannada": ("ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"),
    "Kashmiri": ("तुस की छै?", "tus ki chhai?"),
    "Konkani": ("तुम कशें आसा?", "tum kashen asa?"),
    "Maithili": ("अहाँ कथी छी?", "ahaan kathi chhi?"),
    "Malayalam": ("സുഖമായിരോ?", "sukhamaayiro?"),
    "Manipuri": ("नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"),
    "Marathi": ("तू कसा आहेस?", "tu kasa ahes?"),
    "Nepali": ("तिमी कस्तो छौ?", "timi kasto chau?"),
    "Odia": ("ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"),
    "Punjabi": ("ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"),
    "Sanskrit": ("भवतः कथम् अस्ति?", "bhavatah katham asti?"),
    "Santali": ("ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"),
    "Sindhi": ("توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"),
    "Tamil": ("நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"),
    "Telugu": ("మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"),
    "Urdu": ("آپ کیسے ہیں؟", "aap kaise hain?")
}

# Detected language label -> translation settings, written as
# ((display name, native script, IndicTrans2 source code), detected labels).
# Every label in a group gets its own settings dict. Labels for languages
# that are commonly misdetected are grouped under the language they are
# translated as.
LID_TO_TRANSLATE = {
    lid_code: {"name": name, "script": script, "it_code": it_code}
    for (name, script, it_code), lid_codes in (
        (
            ("Hindi", sanscript.DEVANAGARI, "hin_Deva"),
            (
                "hin_Deva", "hin_Latn",
                "mai_Deva", "mai_Latn",  # Maithili -> Hindi
                "nep_Latn",              # Nepali -> Hindi
                "snd_Latn",              # Sindhi -> Hindi
                "kok_Latn",              # Konkani -> Hindi
                "brx_Latn",              # Bodo -> Hindi
            ),
        ),
        (
            ("Bengali", sanscript.BENGALI, "ben_Beng"),
            (
                "ben_Beng", "ben_Latn",
                "asm_Beng", "asm_Latn",  # Assamese -> Bengali
            ),
        ),
        (("Tamil", sanscript.TAMIL, "tam_Taml"), ("tam_Tamil", "tam_Taml", "tam_Latn")),
        (("Telugu", sanscript.TELUGU, "tel_Telu"), ("tel_Telu", "tel_Latn")),
        (("Kannada", sanscript.KANNADA, "kan_Knda"), ("kan_Knda", "kan_Latn")),
        (("Malayalam", sanscript.MALAYALAM, "mal_Mlym"), ("mal_Mlym", "mal_Latn")),
        (("Gujarati", sanscript.GUJARATI, "guj_Gujr"), ("guj_Gujr", "guj_Latn")),
        (("Punjabi", sanscript.GURMUKHI, "pan_Guru"), ("pan_Guru", "pan_Latn")),
        (("Marathi", sanscript.DEVANAGARI, "mar_Deva"), ("mar_Deva", "mar_Latn")),
        (("Urdu", 'urdu', "urd_Arab"), ("urd_Arab", "urd_Latn")),
    )
    for lid_code in lid_codes
}

def enhanced_transliterate_robust(text, target_script):
    """Transliterate casual romanized text into ``target_script``.

    Normalises the input (lower-case, trimmed, doubled vowels rewritten to
    ITRANS long vowels) and transliterates it from ITRANS. Returns the
    original ``text`` when transliteration raises or yields an empty result.
    """
    try:
        # Replacement order matters and is kept as aa, ee, oo, ou. Consonant
        # digraphs such as kh/ch/th need no rewriting and are left as typed.
        itrans_ready = (
            text.lower()
            .strip()
            .replace('aa', 'A')
            .replace('ee', 'I')
            .replace('oo', 'U')
            .replace('ou', 'au')
        )
        native = transliterate(itrans_ready, sanscript.ITRANS, target_script)
        if native:
            return native
        return text
    except Exception as failure:
        print(f"Transliteration error: {failure}")
        return text

def test_all_22_languages(texts, batch_size=32):
    """Detect and translate every text, returning one result row per prediction.

    Each prediction from ``lid.batch_predict`` is looked up in
    ``LID_TO_TRANSLATE``. Romanized input (label ending in ``_Latn``) is
    transliterated to the native script first, then translated to English
    with IndicTrans2.

    Args:
        texts: Input sentences.
        batch_size: Batch size passed to the language identifier.

    Returns:
        A DataFrame with the columns language, original_text, detected_lang,
        script_type, confidence, method and english_translation. Unsupported
        labels and translation failures are reported in english_translation
        rather than raised.
    """
    target_code = "eng_Latn"
    results = []
    preds = lid.batch_predict(texts, batch_size)

    for item in preds:
        # Predictions are handled both as dicts and as 4-item sequences.
        if isinstance(item, dict):
            text = item.get("text", "")
            lang_code = item.get("lang", item.get("pred_lang", ""))
            score = float(item.get("score", 0.0))
        else:
            text, lang_code, score, _model_name = item

        is_romanized = lang_code.endswith("_Latn")
        lang_info = LID_TO_TRANSLATE.get(lang_code)

        if lang_info is None:
            translation = f"Language '{lang_code}' not supported"
            method = "Unsupported"
        else:
            try:
                src_code = lang_info["it_code"]

                if is_romanized:
                    native_text = enhanced_transliterate_robust(text, lang_info["script"])
                    method = f"Transliteration+IndicTrans2 (detected: {lang_code})"
                    print(f"Romanized: '{text}' → '{native_text}'")
                else:
                    native_text = text
                    method = f"IndicTrans2 (detected: {lang_code})"

                # Translate with IndicTrans2
                pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang=target_code)
                inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
                with torch.no_grad():
                    out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
                dec = tokenizer.batch_decode(out, skip_special_tokens=True)
                # Post-processing takes the language of the *output* text,
                # i.e. the target language, not the source language.
                post = ip.postprocess_batch(dec, lang=target_code)
                translation = post[0]

            except Exception as e:
                translation = f"Translation error: {str(e)}"
                method = "Error"

        results.append({
            # NOTE: despite its name, this column holds a preview of the
            # input text (first 20 characters), not a language name.
            "language": text[:20] + "..." if len(text) > 20 else text,
            "original_text": text,
            "detected_lang": lang_code,
            "script_type": "Romanized" if is_romanized else "Native",
            "confidence": f"{score:.3f}",
            "method": method,
            "english_translation": translation
        })

    return pd.DataFrame(results)

# Build the test set: for every language, the native sentence followed by its
# romanized counterpart.
print("🔍 Creating test dataset for all 22 official Indian languages...")
all_test_texts = [
    sentence
    for native_and_roman in sample_sentences.values()
    for sentence in native_and_roman
]

print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...")

# Run the complete test
df_results = test_all_22_languages(all_test_texts, batch_size=32)

# `display` only exists inside IPython/Jupyter; fall back to print so the
# file also runs as a plain script instead of raising NameError.
try:
    _show = display
except NameError:
    _show = print

# Display results
print("\n🎯 COMPLETE TEST RESULTS:")
_show(df_results)

# A row counts as failed only when its translation is one of the two failure
# messages produced by test_all_22_languages. Matching at the start of the
# string avoids flagging genuine translations that contain the word "error".
_failed = df_results['english_translation'].str.match(
    r"(?:Translation error:|Language '.*' not supported)", na=False
)

# Summary statistics
print(f"\n📈 SUMMARY STATISTICS:")
print(f"Total samples tested: {len(df_results)}")
print(f"Languages detected: {df_results['detected_lang'].nunique()}")
print(f"Native script samples: {(df_results['script_type'] == 'Native').sum()}")
print(f"Romanized samples: {(df_results['script_type'] == 'Romanized').sum()}")
print(f"Successfully translated: {(~_failed).sum()}")


import pandas as pd

def detailed_translation_summary(df_results):
    """
    Print a detailed report on translation results and return its tables.

    Args:
        df_results: DataFrame with the columns original_text, detected_lang,
            script_type, confidence and english_translation. A boolean
            ``successful_translation`` column is added to it in place.

    Returns:
        ``(lang_summary, script_summary, error_df)``: per-detected-language
        statistics, per-script statistics, and the rows whose translation
        failed. For an empty input, two empty DataFrames and the input itself
        are returned.
    """
    if len(df_results) == 0:
        # Nothing to aggregate; avoid KeyError / division by zero below.
        print("\n=========== OVERALL SUMMARY ===========")
        print("Total samples tested: 0")
        print("No results to summarise.")
        return pd.DataFrame(), pd.DataFrame(), df_results

    # A row failed only if its translation *starts with* one of the failure
    # messages the translation functions emit. A plain substring search for
    # "error" would also flag genuine translations that contain that word.
    failure_pattern = r"(?:Translation error:|Language '.*' not supported)"
    df_results['successful_translation'] = ~df_results['english_translation'].str.match(
        failure_pattern, na=False
    )

    is_native = df_results['script_type'] == 'Native'
    is_romanized = df_results['script_type'] == 'Romanized'
    total = len(df_results)
    success_total = df_results['successful_translation'].sum()

    print("\n=========== OVERALL SUMMARY ===========")
    print(f"Total samples tested: {total}")
    print(f"Languages detected: {df_results['detected_lang'].nunique()}")
    print(f"Native script samples: {is_native.sum()}")
    print(f"Romanized samples: {is_romanized.sum()}")
    print(f"Successfully translated: {success_total}")

    overall_success_rate = success_total / total * 100
    print(f"Overall success rate: {overall_success_rate:.1f}%")

    print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========")
    # Per-language analysis. Helper columns live on a temporary copy so the
    # caller's frame only gains the documented 'successful_translation'.
    per_row = df_results.assign(
        _native=is_native,
        _romanized=is_romanized,
        # confidence is stored as a formatted string; bad values become NaN.
        _confidence=pd.to_numeric(df_results['confidence'], errors='coerce'),
        _failed=~df_results['successful_translation'],
    )
    lang_summary = (
        per_row.groupby('detected_lang')
        .agg(
            total_samples=('original_text', 'count'),
            native_count=('_native', 'sum'),
            romanized_count=('_romanized', 'sum'),
            mean_confidence=('_confidence', 'mean'),
            success=('successful_translation', 'sum'),
            error_count=('_failed', 'sum'),
        )
        .reset_index()
        .sort_values('total_samples', ascending=False)
    )

    lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1)
    print(lang_summary)

    print("\n=========== TOP PERFORMING LANGUAGES ===========")
    top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False)
    if len(top_performers) > 0:
        print(top_performers[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with 90%+ success rate")

    print("\n=========== CHALLENGING LANGUAGES ===========")
    challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate')
    if len(challenging) > 0:
        print(challenging[['detected_lang', 'total_samples', 'success_rate']])
    else:
        print("No languages with <50% success rate")

    print("\n=========== ERROR ANALYSIS ===========")
    error_df = df_results[~df_results['successful_translation']]
    print(f"Total errors: {len(error_df)}")
    if len(error_df) > 0:
        print("\nError samples:")
        print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']])
    else:
        print("No errors found!")

    print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========")
    script_summary = df_results.groupby('script_type').agg(
        total_samples=('original_text', 'count'),
        successful=('successful_translation', 'sum'),
        success_rate=('successful_translation', 'mean'),
    )
    # The mean of the boolean flag is a fraction; report it as a percentage.
    script_summary['success_rate'] = script_summary['success_rate'] * 100
    script_summary = script_summary.round(1)
    print(script_summary)

    print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========")
    confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False)
    print("Top 10 most confident detections:")
    print(confidence_summary.head(10))

    return lang_summary, script_summary, error_df

# ===== HOW TO USE =====
print("✅ Detailed summary function defined")
print("\n📋 To run on your test results:")
print("   lang_summary, script_summary, error_df = detailed_translation_summary(df_results)")
print("   display(lang_summary)")
print("   display(error_df)")


lang_summary, script_summary, error_df = detailed_translation_summary(df_results)


# `display` only exists inside IPython/Jupyter; fall back to print so the
# file also runs as a plain script instead of raising NameError.
try:
    _show = display
except NameError:
    _show = print

_show(lang_summary)
_show(error_df)