MasriSwitch-Gemma3n-Transcriber is an automatic speech transcription model specialized for Egyptian Arabic with strong English code-switching capabilities.
This model is one of the very few publicly available systems explicitly optimized for:
The model is trained using:
MasriSwitch-Gemma3n-Transcriber is built on the Gemma3n conditional generation architecture and fine-tuned to understand natural Egyptian speech patterns, including mixed Arabic/English utterances commonly used in daily life, workplaces, and online content.
It is suitable for:
Use this model for:
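The example below shows how to load the model with the transformers library and transcribe a single audio file.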
```python
import torch
from transformers import AutoProcessor, Gemma3nForConditionalGeneration

MODEL_ID = "oddadmix/egyptian-code-switching-b4-g2-merged"


def load_model_and_processor(model_id=MODEL_ID, device=None):
    """Load the fine-tuned Gemma 3n model and its processor."""
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Loading model {model_id} to device {device}...")

    # Use bfloat16 and automatic device placement on GPU; fall back to defaults on CPU.
    model = Gemma3nForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16 if device == "cuda" else None,
        device_map="auto" if device == "cuda" else None,
    ).eval()

    # Safety net: if device_map did not place any parameters on the GPU, move the model there.
    if device == "cuda" and not any(p.device.type == "cuda" for p in model.parameters()):
        model.to("cuda")

    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor, device


def transcribe_file(model, processor, audio_path, max_new_tokens=128):
    """Transcribe one audio file and return the decoded text."""
    if not audio_path:
        raise ValueError("audio_path must point to an audio file")

    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are an assistant that transcribes speech accurately."}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "audio", "url": audio_path},
                {"type": "text", "text": "Please transcribe this audio."},
            ],
        },
    ]

    # The processor loads the audio, builds the chat prompt, and returns model-ready tensors.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )

    # Move inputs to the model's device and cast floating-point (audio) features to the
    # model dtype to avoid float32/bfloat16 mismatches.
    device = next(model.parameters()).device
    inputs = inputs.to(device, dtype=model.dtype)
    input_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        generated = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding for deterministic transcripts
        )

    # Keep only the newly generated tokens and decode them to text.
    gen_tokens = generated[0][input_len:]
    text = processor.decode(gen_tokens, skip_special_tokens=True)
    return text


if __name__ == "__main__":
    audio_path = "path/to/audio.wav"  # replace with your own recording
    model, processor, device = load_model_and_processor()
    transcription = transcribe_file(model, processor, audio_path, max_new_tokens=256)
    print("Transcription:", transcription)
```