Spaces:

iRecite
/

iRecite-MVP-API

Running

App Files Files Community

didodev committed 18 days ago

Commit

4ca6263

1 Parent(s): 468a7b7

Deploy iRecite MVP API (Docker + FastAPI)

Browse files

Files changed (18) hide show

.dockerignore +13 -0
Dockerfile +21 -0
app.py +129 -0
data/fatiha_canonical.json +43 -0
data/fatiha_canonical_fallback.json +332 -0
requirements.txt +15 -0
step10_word_segments_and_mapping.py +126 -0
step12_align_segments_wavlm.py +123 -0
step13_arabic_ctc_transcribe.py +40 -0
step14_align_text_to_canonical.py +113 -0
step15_global_word_alignment.py +140 -0
step16_ctc_word_timestamps.py +165 -0
step16b_token_interpolation_timestamps.py +108 -0
step17_make_api_response.py +88 -0
step5_wavlm_test.py +31 -0
step7_fallback_phonemes_and_madd.py +58 -0
step8_madd_signal.py +51 -0
step9_madd_feedback_json.py +140 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,13 @@

+.venv/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+env/
+venv/
+.uvicorn/
+uploads/*
+output/*
+sample.wav
+sample_trim.wav

Dockerfile ADDED Viewed

	@@ -0,0 +1,21 @@

+FROM python:3.11-slim
+# System deps (ffmpeg for audio conversion + git for some pip installs if needed)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Install Python deps
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application code
+COPY . /app
+# Hugging Face Spaces expects port 7860
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import os
+import re
+import shutil
+import subprocess
+from fastapi import FastAPI, UploadFile, File
+from fastapi.responses import JSONResponse
# ASGI application object exposed to the server as "app:app".
app = FastAPI(title="iRecite MVP API")

# Every path below is anchored at the directory that contains this file.
WORKDIR = os.path.dirname(os.path.abspath(__file__))

# Windows virtualenv interpreter location. run() substitutes sys.executable
# for "python", so nothing visible in this file reads this constant.
PYTHON = os.path.join(WORKDIR, ".venv", "Scripts", "python.exe")

# Upload staging directory, pipeline output directory, and the JSON document
# that /analyze loads once the pipeline has finished.
UPLOADS = os.path.join(WORKDIR, "uploads")
OUTPUT_DIR = os.path.join(WORKDIR, "output")
API_JSON = os.path.join(OUTPUT_DIR, "api_response.json")
+import sys
def run(cmd):
    """Execute *cmd* (an argv sequence) with WORKDIR as the working directory.

    A leading "python" (any letter case) is replaced by ``sys.executable`` so
    child scripts always run under the same interpreter as the server.
    ``subprocess.check_call`` raises ``CalledProcessError`` on a non-zero exit.
    """
    argv = cmd
    if argv:
        program, *rest = argv
        if program.lower() == "python":
            argv = [sys.executable, *rest]
    subprocess.check_call(argv, cwd=WORKDIR)
# One silencedetect event per match: ("start" | "end", timestamp text).
# The minus sign is optional because ffmpeg may report a slightly negative
# silence_start for silence that begins at the very first sample.
_SILENCE_EVENT = re.compile(r"silence_(start|end):\s*(-?[0-9.]+)")

# Slack (seconds) when deciding whether a silent stretch touches the start or
# the end of the file.
_EDGE_TOLERANCE_SEC = 0.05


def _parse_silence_intervals(log_text):
    """Turn silencedetect log text into ``[(start, end_or_None), ...]`` in order."""
    intervals = []
    open_start = None
    for kind, value in _SILENCE_EVENT.findall(log_text):
        try:
            stamp = float(value)
        except ValueError:
            # e.g. "1.2.3" satisfies the character class but is not a number
            continue
        if kind == "start":
            if open_start is not None:
                intervals.append((open_start, None))
            open_start = stamp
        else:
            # An end without a visible start is treated as silence from t=0.
            intervals.append((0.0 if open_start is None else open_start, stamp))
            open_start = None
    if open_start is not None:
        # Silence still open when the log ended: it runs to the end of file.
        intervals.append((open_start, None))
    return intervals


def _probe_duration(wav_path):
    """Return the container duration in seconds via ffprobe, or None if unknown."""
    probe = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=nw=1:nk=1", wav_path],
        cwd=WORKDIR,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    try:
        return float(probe.stdout.strip())
    except ValueError:
        return None


def detect_trim_times(wav_path: str):
    """
    Use ffmpeg silencedetect to get start/end of main speech.

    Only silence that touches the beginning of the file moves the start, and
    only silence that runs to the end of the file moves the end. Pauses inside
    the recording are ignored, so a pause between two phrases can no longer be
    mistaken for the leading or trailing silence.

    Returns (start_sec, end_sec). If detection fails, returns (0, full_duration),
    where full_duration is 0.0 when ffprobe cannot report it.
    """
    # "-f null -" discards the output; "-" avoids the Windows-only "NUL" name.
    detect = subprocess.run(
        ["ffmpeg", "-i", wav_path, "-af", "silencedetect=noise=-35dB:d=0.35", "-f", "null", "-"],
        cwd=WORKDIR,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        encoding="utf-8",
        errors="ignore"
    )
    intervals = _parse_silence_intervals(detect.stdout or "")
    full_dur = _probe_duration(wav_path)

    start = 0.0
    end = full_dur if full_dur is not None else 0.0
    if intervals:
        lead_start, lead_end = intervals[0]
        if lead_start <= _EDGE_TOLERANCE_SEC and lead_end is not None:
            start = max(0.0, lead_end)
        tail_start, tail_end = intervals[-1]
        runs_to_eof = tail_end is None or (
            full_dur is not None and tail_end >= full_dur - _EDGE_TOLERANCE_SEC
        )
        if runs_to_eof:
            end = max(0.0, tail_start)

    # Sanity checks
    if full_dur is not None:
        end = min(end, full_dur)
    if end <= start + 1.0:
        # fallback: don't trim
        return 0.0, full_dur if full_dur is not None else 0.0

    # small padding so the first/last phoneme is not clipped
    start = max(0.0, start - 0.10)
    end = end + 0.10
    if full_dur is not None:
        end = min(end, full_dur)
    return start, end
@app.post("/analyze")
async def analyze(file: UploadFile = File(...)):
    """Run the recitation pipeline on an uploaded audio file.

    The upload is stored under UPLOADS, converted to 16 kHz mono
    ``sample.wav``, trimmed to ``sample_trim.wav``, and then passed through the
    step scripts in order. Returns the parsed ``api_response.json`` with the
    trim window added under ``debug``, or a JSON error with status 500 when a
    step fails or the response file is not produced.

    NOTE(review): the subprocess calls block inside an ``async def`` handler and
    the intermediate files use fixed names, so requests are effectively
    serialized and concurrent uploads would overwrite each other.
    """
    import json

    os.makedirs(UPLOADS, exist_ok=True)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # The client controls file.filename: keep only its last path component so
    # "../" or an absolute path cannot escape UPLOADS, and cope with a missing
    # or degenerate name.
    client_name = (file.filename or "").replace("\\", "/")
    safe_name = os.path.basename(client_name)
    if safe_name in ("", ".", ".."):
        safe_name = "upload.bin"

    # Save upload
    upload_path = os.path.join(UPLOADS, safe_name)
    with open(upload_path, "wb") as f:
        shutil.copyfileobj(file.file, f)

    # Remove any response left by an earlier request so it can never be
    # returned as the result of this one.
    if os.path.exists(API_JSON):
        os.remove(API_JSON)

    sample_wav = os.path.join(WORKDIR, "sample.wav")
    sample_trim = os.path.join(WORKDIR, "sample_trim.wav")

    # Pipeline scripts, in execution order.
    pipeline = [
        "step7_fallback_phonemes_and_madd.py",  # ensures fallback json exists
        "step8_madd_signal.py",
        "step9_madd_feedback_json.py",
        "step13_arabic_ctc_transcribe.py",      # writes output/asr_raw.txt
        "step14_align_text_to_canonical.py",
        "step15_global_word_alignment.py",
        "step16b_token_interpolation_timestamps.py",
        "step17_make_api_response.py",
    ]

    try:
        # Convert to 16k mono wav
        run(["ffmpeg", "-y", "-i", upload_path, "-ac", "1", "-ar", "16000", sample_wav])

        # Auto trim -> sample_trim.wav
        start, end = detect_trim_times(sample_wav)
        if end and end > start:
            run(["ffmpeg", "-y", "-i", sample_wav, "-ss", f"{start:.2f}", "-to", f"{end:.2f}", "-ac", "1", "-ar", "16000", sample_trim])
        else:
            shutil.copy(sample_wav, sample_trim)

        for script in pipeline:
            run(["python", script])
    except subprocess.CalledProcessError as exc:
        return JSONResponse(
            {"error": "pipeline step failed", "returncode": exc.returncode},
            status_code=500,
        )

    if not os.path.exists(API_JSON):
        return JSONResponse({"error": "api_response.json not generated"}, status_code=500)

    with open(API_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)

    # include trim info for debugging
    data["debug"] = {"trim": {"start": round(start, 2), "end": round(end, 2)}}
    return data

data/fatiha_canonical.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "surah": "Al-Fatiha",
+  "surah_number": 1,
+  "riwayah": "Hafs",
+  "bismillah_included": true,
+  "ayahs": [
+    {
+      "ayah": 1,
+      "arabic": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ",
+      "words": ["بِسْمِ", "اللَّهِ", "الرَّحْمَٰنِ", "الرَّحِيمِ"]
+    },
+    {
+      "ayah": 2,
+      "arabic": "الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
+      "words": ["الْحَمْدُ", "لِلَّهِ", "رَبِّ", "الْعَالَمِينَ"]
+    },
+    {
+      "ayah": 3,
+      "arabic": "الرَّحْمَٰنِ الرَّحِيمِ",
+      "words": ["الرَّحْمَٰنِ", "الرَّحِيمِ"]
+    },
+    {
+      "ayah": 4,
+      "arabic": "مَالِكِ يَوْمِ الدِّينِ",
+      "words": ["مَالِكِ", "يَوْمِ", "الدِّينِ"]
+    },
+    {
+      "ayah": 5,
+      "arabic": "إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ",
+      "words": ["إِيَّاكَ", "نَعْبُدُ", "وَإِيَّاكَ", "نَسْتَعِينُ"]
+    },
+    {
+      "ayah": 6,
+      "arabic": "اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ",
+      "words": ["اهْدِنَا", "الصِّرَاطَ", "الْمُسْتَقِيمَ"]
+    },
+    {
+      "ayah": 7,
+      "arabic": "صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ",
+      "words": ["صِرَاطَ", "الَّذِينَ", "أَنْعَمْتَ", "عَلَيْهِمْ", "غَيْرِ", "الْمَغْضُوبِ", "عَلَيْهِمْ", "وَلَا", "الضَّالِّينَ"]
+    }
+  ]
+}

data/fatiha_canonical_fallback.json ADDED Viewed

	@@ -0,0 +1,332 @@

+{
+  "surah": "Al-Fatiha",
+  "surah_number": 1,
+  "riwayah": "Hafs",
+  "bismillah_included": true,
+  "ayahs": [
+    {
+      "ayah": 1,
+      "arabic": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ",
+      "words": [
+        "بِسْمِ",
+        "اللَّهِ",
+        "الرَّحْمَٰنِ",
+        "الرَّحِيمِ"
+      ],
+      "word_info": [
+        {
+          "word": "بِسْمِ",
+          "base": "بسم",
+          "phonemes_fallback": "bisomi",
+          "madd_positions_base_index": []
+        },
+        {
+          "word": "اللَّهِ",
+          "base": "الله",
+          "phonemes_fallback": ">al~ahi",
+          "madd_positions_base_index": [
+            0
+          ]
+        },
+        {
+          "word": "الرَّحْمَٰنِ",
+          "base": "الرحمن",
+          "phonemes_fallback": ">ar~aHomaٰni",
+          "madd_positions_base_index": [
+            0
+          ]
+        },
+        {
+          "word": "الرَّحِيمِ",
+          "base": "الرحيم",
+          "phonemes_fallback": ">ar~aHiymi",
+          "madd_positions_base_index": [
+            0,
+            4
+          ]
+        }
+      ]
+    },
+    {
+      "ayah": 2,
+      "arabic": "الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
+      "words": [
+        "الْحَمْدُ",
+        "لِلَّهِ",
+        "رَبِّ",
+        "الْعَالَمِينَ"
+      ],
+      "word_info": [
+        {
+          "word": "الْحَمْدُ",
+          "base": "الحمد",
+          "phonemes_fallback": ">aloHamodu",
+          "madd_positions_base_index": [
+            0
+          ]
+        },
+        {
+          "word": "لِلَّهِ",
+          "base": "لله",
+          "phonemes_fallback": "lilohi",
+          "madd_positions_base_index": []
+        },
+        {
+          "word": "رَبِّ",
+          "base": "رب",
+          "phonemes_fallback": "rab~i",
+          "madd_positions_base_index": []
+        },
+        {
+          "word": "الْعَالَمِينَ",
+          "base": "العالمين",
+          "phonemes_fallback": ">aloEaAlamiyna",
+          "madd_positions_base_index": [
+            0,
+            3,
+            6
+          ]
+        }
+      ]
+    },
+    {
+      "ayah": 3,
+      "arabic": "الرَّحْمَٰنِ الرَّحِيمِ",
+      "words": [
+        "الرَّحْمَٰنِ",
+        "الرَّحِيمِ"
+      ],
+      "word_info": [
+        {
+          "word": "الرَّحْمَٰنِ",
+          "base": "الرحمن",
+          "phonemes_fallback": ">ar~aHomaٰni",
+          "madd_positions_base_index": [
+            0
+          ]
+        },
+        {
+          "word": "الرَّحِيمِ",
+          "base": "الرحيم",
+          "phonemes_fallback": ">ar~aHiymi",
+          "madd_positions_base_index": [
+            0,
+            4
+          ]
+        }
+      ]
+    },
+    {
+      "ayah": 4,
+      "arabic": "مَالِكِ يَوْمِ الدِّينِ",
+      "words": [
+        "مَالِكِ",
+        "يَوْمِ",
+        "الدِّينِ"
+      ],
+      "word_info": [
+        {
+          "word": "مَالِكِ",
+          "base": "مالك",
+          "phonemes_fallback": "maAliki",
+          "madd_positions_base_index": [
+            1
+          ]
+        },
+        {
+          "word": "يَوْمِ",
+          "base": "يوم",
+          "phonemes_fallback": "yawomi",
+          "madd_positions_base_index": [
+            0,
+            1
+          ]
+        },
+        {
+          "word": "الدِّينِ",
+          "base": "الدين",
+          "phonemes_fallback": ">ad~iyni",
+          "madd_positions_base_index": [
+            0,
+            3
+          ]
+        }
+      ]
+    },
+    {
+      "ayah": 5,
+      "arabic": "إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ",
+      "words": [
+        "إِيَّاكَ",
+        "نَعْبُدُ",
+        "وَإِيَّاكَ",
+        "نَسْتَعِينُ"
+      ],
+      "word_info": [
+        {
+          "word": "إِيَّاكَ",
+          "base": "إياك",
+          "phonemes_fallback": "<iy~aAka",
+          "madd_positions_base_index": [
+            1,
+            2
+          ]
+        },
+        {
+          "word": "نَعْبُدُ",
+          "base": "نعبد",
+          "phonemes_fallback": "naEobudu",
+          "madd_positions_base_index": []
+        },
+        {
+          "word": "وَإِيَّاكَ",
+          "base": "وإياك",
+          "phonemes_fallback": "wa<iy~aAka",
+          "madd_positions_base_index": [
+            0,
+            2,
+            3
+          ]
+        },
+        {
+          "word": "نَسْتَعِينُ",
+          "base": "نستعين",
+          "phonemes_fallback": "nasotaEiynu",
+          "madd_positions_base_index": [
+            4
+          ]
+        }
+      ]
+    },
+    {
+      "ayah": 6,
+      "arabic": "اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ",
+      "words": [
+        "اهْدِنَا",
+        "الصِّرَاطَ",
+        "الْمُسْتَقِيمَ"
+      ],
+      "word_info": [
+        {
+          "word": "اهْدِنَا",
+          "base": "اهدنا",
+          "phonemes_fallback": "<ihodinaA",
+          "madd_positions_base_index": [
+            0,
+            4
+          ]
+        },
+        {
+          "word": "الصِّرَاطَ",
+          "base": "الصراط",
+          "phonemes_fallback": ">aS~iraATa",
+          "madd_positions_base_index": [
+            0,
+            4
+          ]
+        },
+        {
+          "word": "الْمُسْتَقِيمَ",
+          "base": "المستقيم",
+          "phonemes_fallback": ">alomusotaqiyma",
+          "madd_positions_base_index": [
+            0,
+            6
+          ]
+        }
+      ]
+    },
+    {
+      "ayah": 7,
+      "arabic": "صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ",
+      "words": [
+        "صِرَاطَ",
+        "الَّذِينَ",
+        "أَنْعَمْتَ",
+        "عَلَيْهِمْ",
+        "غَيْرِ",
+        "الْمَغْضُوبِ",
+        "عَلَيْهِمْ",
+        "وَلَا",
+        "الضَّالِّينَ"
+      ],
+      "word_info": [
+        {
+          "word": "صِرَاطَ",
+          "base": "صراط",
+          "phonemes_fallback": "SiraATa",
+          "madd_positions_base_index": [
+            2
+          ]
+        },
+        {
+          "word": "الَّذِينَ",
+          "base": "الذين",
+          "phonemes_fallback": ">al~a*iyna",
+          "madd_positions_base_index": [
+            0,
+            3
+          ]
+        },
+        {
+          "word": "أَنْعَمْتَ",
+          "base": "أنعمت",
+          "phonemes_fallback": ">anoEamota",
+          "madd_positions_base_index": []
+        },
+        {
+          "word": "عَلَيْهِمْ",
+          "base": "عليهم",
+          "phonemes_fallback": "Ealayohimo",
+          "madd_positions_base_index": [
+            2
+          ]
+        },
+        {
+          "word": "غَيْرِ",
+          "base": "غير",
+          "phonemes_fallback": "gayori",
+          "madd_positions_base_index": [
+            1
+          ]
+        },
+        {
+          "word": "الْمَغْضُوبِ",
+          "base": "المغضوب",
+          "phonemes_fallback": ">alomagoDuwbi",
+          "madd_positions_base_index": [
+            0,
+            5
+          ]
+        },
+        {
+          "word": "عَلَيْهِمْ",
+          "base": "عليهم",
+          "phonemes_fallback": "Ealayohimo",
+          "madd_positions_base_index": [
+            2
+          ]
+        },
+        {
+          "word": "وَلَا",
+          "base": "ولا",
+          "phonemes_fallback": "walaA",
+          "madd_positions_base_index": [
+            0,
+            2
+          ]
+        },
+        {
+          "word": "الضَّالِّينَ",
+          "base": "الضالين",
+          "phonemes_fallback": ">aD~aAl~iyna",
+          "madd_positions_base_index": [
+            0,
+            3,
+            5
+          ]
+        }
+      ]
+    }
+  ]
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+fastapi==0.128.0
+uvicorn==0.40.0
+python-multipart==0.0.21
+numpy
+librosa
+soundfile
+webrtcvad
+praat-parselmouth
+dtw-python
+torch
+transformers
+sentencepiece
+jiwer

step10_word_segments_and_mapping.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import json
+import wave
+import contextlib
+import numpy as np
+import webrtcvad
+import librosa
+from difflib import SequenceMatcher
+from arabic_phonemizer import ArabicPhonemizer
# Input recording, canonical word data, and the JSON report written by main().
AUDIO_PATH = "sample.wav"
CANON_PATH = "data/fatiha_canonical_fallback.json"
OUT_PATH = "output/word_mapping.json"

# VAD settings
VAD_MODE = 2   # 0-3 (higher = more aggressive)
FRAME_MS = 30  # 10, 20, or 30ms required
def read_wav_mono16k(path):
    """Load *path* as 16 kHz mono audio and return ``(pcm16, 16000)``.

    librosa yields float samples; the VAD needs int16 PCM, so the samples are
    scaled by 32767 and cast.
    """
    target_sr = 16000
    audio, _ = librosa.load(path, sr=target_sr, mono=True)
    # Clip first: a sample outside [-1, 1] would otherwise overflow the int16
    # cast and come out as a wrong (wrapped) value.
    pcm16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    return pcm16, target_sr
def frame_generator(pcm16, sr, frame_ms):
    """Yield consecutive, non-overlapping frames of ``frame_ms`` milliseconds.

    Args:
        pcm16: Sliceable sequence of samples.
        sr: Sample rate in Hz.
        frame_ms: Frame length in milliseconds.

    Yields:
        Slices of ``pcm16`` that are each exactly ``int(sr * frame_ms / 1000)``
        samples long. A trailing partial frame is dropped; a trailing frame
        that is exactly full is kept.

    Raises:
        ValueError: If the frame length works out to less than one sample
            (raised on first iteration), which would otherwise loop forever.
    """
    n = int(sr * frame_ms / 1000)
    if n <= 0:
        raise ValueError("frame length must be at least one sample")
    # "+ 1" keeps the final frame when the remaining samples are exactly n.
    for offset in range(0, len(pcm16) - n + 1, n):
        yield pcm16[offset:offset + n]
def vad_segments(pcm16, sr, frame_ms, mode):
    """Return voiced regions of *pcm16* as ``(start_sec, end_sec)`` tuples.

    Each frame from ``frame_generator`` is classified with a WebRTC VAD created
    with aggressiveness *mode*. Runs of voiced frames separated by at most two
    unvoiced frames are joined, and regions shorter than 0.10 s are discarded.
    Times are rounded to 3 decimals.
    """
    detector = webrtcvad.Vad(mode)
    flags = [
        detector.is_speech(frame.tobytes(), sr)
        for frame in frame_generator(pcm16, sr, frame_ms)
    ]

    # Pass 1: runs of voiced frames as (first, one_past_last) frame indices.
    runs = []
    run_start = None
    for idx, voiced in enumerate(flags):
        if voiced:
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            runs.append((run_start, idx))
            run_start = None
    if run_start is not None:
        runs.append((run_start, len(flags)))

    # Pass 2: join runs whose gap is at most 2 frames (~60ms at 30ms frames).
    joined = []
    for first, last in runs:
        if joined and first - joined[-1][1] <= 2:
            joined[-1][1] = last
        else:
            joined.append([first, last])

    # Pass 3: frame indices -> seconds, keeping only regions of >= 0.10 s.
    spans = []
    for first, last in joined:
        t0 = (first * frame_ms) / 1000.0
        t1 = (last * frame_ms) / 1000.0
        if (t1 - t0) < 0.10:
            continue
        spans.append((round(t0, 3), round(t1, 3)))
    return spans
def canonical_words(canon):
    """Flatten ``canon["ayahs"]`` into one ordered list of word records.

    Each record carries the ayah number plus the ``word`` and ``base`` fields
    of the corresponding ``word_info`` entry.
    """
    return [
        {"ayah": ayah["ayah"], "word": info["word"], "base": info["base"]}
        for ayah in canon["ayahs"]
        for info in ayah["word_info"]
    ]
def similarity(a, b):
    """Return the difflib ``SequenceMatcher`` ratio of *a* and *b* (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def main():
    """Segment AUDIO_PATH with VAD and map the segments to canonical words.

    Writes OUT_PATH with the VAD settings, the detected segments, and a
    sequential mapping of the first N segments onto the first N canonical
    words, then prints a short summary.
    """
    import os  # local import: this module does not import os at file level

    with open(CANON_PATH, "r", encoding="utf-8") as f:
        canon = json.load(f)
    canon_words = canonical_words(canon)

    # The original also built an ArabicPhonemizer here but never used it; the
    # mapping below needs no phonemization, so it is not constructed.
    pcm16, sr = read_wav_mono16k(AUDIO_PATH)
    segs = vad_segments(pcm16, sr, FRAME_MS, VAD_MODE)

    # There is no ASR at this step, so segments are paired with canonical
    # words purely by order.
    # MVP: we map N segments to first N canon words (still better than madd-only mapping)
    mapped = [
        {
            "segment_index": index,
            "timestamp": {"start": t0, "end": t1},
            "mapped_canonical": cw,
        }
        for index, ((t0, t1), cw) in enumerate(zip(segs, canon_words), start=1)
    ]

    out = {
        "audio_path": AUDIO_PATH,
        "vad": {"mode": VAD_MODE, "frame_ms": FRAME_MS},
        "segments": segs,
        "mapped": mapped,
        "note": "This is MVP word-like segmentation. Next step will replace sequential mapping with acoustic+phoneme alignment."
    }

    # Make sure the output directory exists when this step runs on its own.
    out_dir = os.path.dirname(OUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("VAD segments:", len(segs))
    if mapped:
        print("First mapping:", mapped[0])


if __name__ == "__main__":
    main()

step12_align_segments_wavlm.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import json
+import numpy as np
+import librosa
+import torch
+from dtw import dtw
+from transformers import AutoFeatureExtractor, AutoModel
+from arabic_phonemizer import ArabicPhonemizer
# Trimmed recording to embed, canonical word data, and the report main() writes.
AUDIO_PATH = "sample_trim.wav"
CANON_PATH = "data/fatiha_canonical_fallback.json"
OUT_PATH = "output/alignment_wavlm.json"

# Hugging Face model id loaded by wavlm_embeddings().
MODEL_ID = "microsoft/wavlm-base"
def wavlm_embeddings(audio_16k: np.ndarray, sr: int):
    """Return the model's last hidden state for *audio_16k* as a NumPy array.

    The feature extractor and model for MODEL_ID are loaded on every call; the
    forward pass runs without gradient tracking. Only the first batch item is
    returned.
    """
    extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
    encoder = AutoModel.from_pretrained(MODEL_ID)
    encoder.eval()

    batch = extractor(audio_16k, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        hidden = encoder(**batch).last_hidden_state

    # (frames, hidden)
    return hidden[0].cpu().numpy()
def mean_pool(emb: np.ndarray):
    """Average *emb* over its first axis."""
    pooled = np.mean(emb, axis=0)
    return pooled
def load_audio_segment(path, start_s, end_s, sr=16000):
    """Load the mono audio between *start_s* and *end_s* seconds, resampled to *sr*."""
    span = float(end_s - start_s)
    samples, _rate = librosa.load(
        path,
        sr=sr,
        mono=True,
        offset=float(start_s),
        duration=span,
    )
    return samples
def canonical_word_list(canon):
    """Flatten every ayah's ``word_info`` into one ordered list of word records.

    Each record holds the ayah number and the entry's ``word`` and ``base``.
    """
    flat = []
    for ayah in canon["ayahs"]:
        number = ayah["ayah"]
        flat.extend(
            {"ayah": number, "word": info["word"], "base": info["base"]}
            for info in ayah["word_info"]
        )
    return flat
def vad_segments_from_step8(feedback_path="output/feedback_madd.json"):
    """Return ``(start, end)`` pairs from the ``segments_detected`` list in *feedback_path*.

    Reuses the long segments already recorded in the feedback JSON instead of
    running detection again.
    """
    # "with" closes the handle promptly; the original left it to the garbage collector.
    with open(feedback_path, encoding="utf-8") as f:
        feedback = json.load(f)
    return [(seg["start"], seg["end"]) for seg in feedback["segments_detected"]]
def cosine(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a Python float.

    A small epsilon is added to each norm so an all-zero vector yields 0.0
    rather than a division by zero.
    """
    def unit(vec):
        return vec / (np.linalg.norm(vec) + 1e-9)

    return float(np.dot(unit(a), unit(b)))
def main():
    """Map each long segment from the feedback JSON to a canonical word index.

    The word is estimated purely from the segment's relative time position in
    AUDIO_PATH; WavLM is used only to derive a frames-per-second estimate and
    to skip segments that span fewer than two embedding frames. The result is
    written to OUT_PATH.

    Raises:
        ValueError: If AUDIO_PATH contains no samples.
    """
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)
    canon_words = canonical_word_list(canon)

    # No word-level audio prototypes exist yet, so this step keeps word order
    # and places each segment by time. A later step replaces this with
    # phoneme/CTC alignment.
    segs = vad_segments_from_step8()

    # Compute full-audio embedding frames once
    full_audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    total_sec = len(full_audio) / sr
    if total_sec <= 0:
        # Would otherwise surface later as a ZeroDivisionError.
        raise ValueError(f"{AUDIO_PATH} contains no audio samples")
    full_emb = wavlm_embeddings(full_audio, sr)

    # Map time -> frame index using the observed number of embedding frames.
    frames = full_emb.shape[0]
    fps = frames / total_sec

    W = 6  # half-width of the candidate window around the estimated index
    last_idx = len(canon_words) - 1

    results = []
    for i, (s, e) in enumerate(segs, 1):
        # Skip segments that cover fewer than two embedding frames.
        f0 = int(max(0, np.floor(s * fps)))
        f1 = int(min(frames, np.ceil(e * fps)))
        if f1 <= f0 + 1:
            continue

        # Estimate position in the surah by time ratio. The ratio is clamped
        # because the segment times are read from another file and may fall
        # outside this audio, which would index past the word list.
        t_mid = (s + e) / 2.0
        ratio = min(max(t_mid / total_sec, 0.0), 1.0)
        est_idx = int(ratio * last_idx)

        cand_range = range(max(0, est_idx - W), min(len(canon_words), est_idx + W + 1))

        # MVP: the estimated index itself is chosen; the window is only reported.
        chosen = est_idx
        results.append({
            "segment_index": i,
            "timestamp": {"start": round(float(s), 3), "end": round(float(e), 3)},
            "estimated_word_index": est_idx,
            "candidate_word_indices": list(cand_range),
            "mapped_word": canon_words[chosen],
            "note": "MVP time-based alignment using WavLM frame mapping. Next step replaces this with phoneme/CTC alignment."
        })

    out = {
        "audio_path": AUDIO_PATH,
        "total_sec": round(float(total_sec), 3),
        "wavlm": {"model_id": MODEL_ID, "frames": int(frames), "fps_est": round(float(fps), 2)},
        "num_canonical_words": len(canon_words),
        "segments_used": len(results),
        "alignment": results
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Segments aligned:", len(results))
    if results:
        print("Sample:", results[0])


if __name__ == "__main__":
    main()

step13_arabic_ctc_transcribe.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import os
+import torch
+import librosa
+from transformers import AutoProcessor, AutoModelForCTC
# Arabic wav2vec2 CTC model (CPU friendly but heavy)
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"

# Trimmed recording to transcribe, and the text file read by downstream steps.
AUDIO_PATH = "sample_trim.wav"
OUT_TXT = os.path.join("output", "asr_raw.txt")
def main():
    """Transcribe AUDIO_PATH with the CTC model and write the text to OUT_TXT.

    Decoding is greedy: the arg-max token per frame is passed to
    ``processor.batch_decode``. The stripped transcription is written as a
    single line and also printed.
    """
    os.makedirs("output", exist_ok=True)

    print("Loading:", MODEL_ID)
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    ctc_model = AutoModelForCTC.from_pretrained(MODEL_ID)
    ctc_model.eval()

    waveform, rate = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    print("Audio sec:", round(len(waveform)/rate, 2))

    batch = processor(waveform, sampling_rate=rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        frame_logits = ctc_model(**batch).logits

    token_ids = torch.argmax(frame_logits, dim=-1)
    decoded = processor.batch_decode(token_ids)
    text = decoded[0].strip()

    # Save to file for downstream steps
    with open(OUT_TXT, "w", encoding="utf-8") as handle:
        handle.write(f"{text}\n")

    print("\n--- RAW TRANSCRIPTION ---")
    print(text)
    print(f"\nOK ✅ wrote {OUT_TXT}")


if __name__ == "__main__":
    main()

step14_align_text_to_canonical.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import json
+import re
+from difflib import SequenceMatcher
# Canonical word list, the raw ASR transcription, and the report main() writes.
CANON_PATH = "data/fatiha_canonical.json"
ASR_TEXT_PATH = "output/asr_raw.txt"
OUT_PATH = "output/text_alignment.json"
# --- Normalization helpers ---
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")  # harakat etc.
TATWEEL = "\u0640"


def normalize_ar(s: str) -> str:
    """Reduce Arabic text to a bare form suitable for fuzzy comparison.

    Removes tatweel and the code points matched by ARABIC_DIACRITICS, folds
    the hamza/madda alef forms to plain alef, alef maqsura to ya and ta
    marbuta to ha, then collapses whitespace runs and trims the ends.
    """
    bare = ARABIC_DIACRITICS.sub("", s.replace(TATWEEL, ""))
    # normalize common letter variants
    for variant, plain in (("أ", "ا"), ("إ", "ا"), ("آ", "ا"), ("ى", "ي"), ("ة", "ه")):
        bare = bare.replace(variant, plain)
    collapsed = re.sub(r"\s+", " ", bare)
    return collapsed.strip()
def tokenize(s: str):
    """Split *s* into whitespace-separated tokens of Arabic-block characters.

    Every character outside U+0600-U+06FF that is not whitespace becomes a
    space first. Returns an empty list when nothing is left.
    """
    # keep Arabic letters and spaces only
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    collapsed = re.sub(r"\s+", " ", arabic_only).strip()
    if not collapsed:
        return []
    return collapsed.split(" ")
def sim(a, b) -> float:
    """Return the difflib ``SequenceMatcher`` ratio of *a* and *b* (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def _greedy_align(canon_words, asr_tokens, window=6, threshold=0.75):
    """Align each canonical word to its best ASR token inside a forward window."""
    aligned = []
    cursor = 0
    for cw in canon_words:
        candidates = range(cursor, min(len(asr_tokens), cursor + window))
        if not candidates:
            aligned.append({
                "canon": cw,
                "asr_token": None,
                "score": 0.0,
                "match": False
            })
            continue

        scores = [sim(cw["norm"], asr_tokens[k]) for k in candidates]
        best = max(scores)
        best_j = cursor + scores.index(best)  # first best wins, as before
        match = best >= threshold  # MVP threshold
        aligned.append({
            "canon": cw,
            "asr_token": asr_tokens[best_j],
            "score": round(float(best), 3),
            "match": bool(match)
        })
        # Consume ASR tokens only on a real match. Advancing on a weak
        # best-guess would skip tokens that belong to the following words
        # whenever a canonical word is missing from the transcription.
        if match:
            cursor = best_j + 1
    return aligned


def main():
    """Greedily align the ASR transcription to the canonical words.

    Reads CANON_PATH and ASR_TEXT_PATH, normalizes and tokenizes the ASR text,
    aligns every canonical word to the most similar token in a moving window,
    and writes the alignment plus match statistics to OUT_PATH.
    """
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)

    # ASR raw text (output/asr_raw.txt, produced by the CTC transcription step)
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()
    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)

    # Canonical tokens (word-level) from JSON
    canon_words = [
        {"ayah": ay["ayah"], "word": w, "norm": normalize_ar(w)}
        for ay in canon["ayahs"]
        for w in ay["words"]
    ]

    aligned = _greedy_align(canon_words, asr_tokens, window=6, threshold=0.75)

    # Summaries
    total = len(aligned)
    matches = sum(1 for a in aligned if a["match"])
    mismatches = total - matches
    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "matches": matches,
            "mismatches": mismatches,
            "match_rate": round(matches / total, 3) if total else 0.0
        },
        "alignment": aligned
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 5 alignments:")
    for a in aligned[:5]:
        print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])


if __name__ == "__main__":
    main()

step15_global_word_alignment.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import json
+import re
+from difflib import SequenceMatcher
# Canonical word list, the raw ASR transcription, and the report main() writes.
CANON_PATH = "data/fatiha_canonical.json"
ASR_TEXT_PATH = "output/asr_raw.txt"
OUT_PATH = "output/text_alignment_global.json"
# Code points stripped during normalization, and the tatweel (kashida) character.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
TATWEEL = "\u0640"


def normalize_ar(s: str) -> str:
    """Reduce Arabic text to a bare form suitable for fuzzy comparison.

    Removes tatweel and the code points matched by ARABIC_DIACRITICS, folds
    the hamza/madda alef forms to plain alef, alef maqsura to ya and ta
    marbuta to ha, then collapses whitespace runs and trims the ends.
    """
    text = s.replace(TATWEEL, "")
    text = ARABIC_DIACRITICS.sub("", text)
    replacements = (
        ("أ", "ا"),
        ("إ", "ا"),
        ("آ", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
    )
    for variant, plain in replacements:
        text = text.replace(variant, plain)
    return re.sub(r"\s+", " ", text).strip()
def tokenize(s: str):
    """Split *s* into whitespace-separated tokens of Arabic-block characters.

    Every character outside U+0600-U+06FF that is not whitespace becomes a
    space first. Returns an empty list when nothing is left.
    """
    cleaned = re.sub(r"\s+", " ", re.sub(r"[^\u0600-\u06FF\s]", " ", s)).strip()
    if cleaned:
        return cleaned.split(" ")
    return []
def sim(a, b) -> float:
    """Return the difflib ``SequenceMatcher`` ratio of *a* and *b* (0.0 to 1.0)."""
    ratio = SequenceMatcher(None, a, b).ratio()
    return ratio
def main():
    """Globally align the ASR tokens to the canonical words and write OUT_PATH.

    Uses a Needleman-Wunsch style dynamic program: pairing word i with token j
    scores ``(similarity - 0.75) * 2``, and skipping either a word or a token
    costs GAP. The output holds one entry per canonical word (skipped ASR
    tokens are dropped) plus match statistics.
    """
    with open(CANON_PATH, encoding="utf-8") as f:
        canon = json.load(f)
    with open(ASR_TEXT_PATH, encoding="utf-8") as f:
        raw = f.read().strip()
    raw_n = normalize_ar(raw)
    asr_tokens = tokenize(raw_n)

    canon_words = [
        {"ayah": ay["ayah"], "word": w, "norm": normalize_ar(w)}
        for ay in canon["ayahs"]
        for w in ay["words"]
    ]

    # --- Global alignment DP ---
    n = len(canon_words)
    m = len(asr_tokens)

    GAP = -0.45  # penalty for skipping a token/word

    # Similarity of every (word, token) pair, computed once and reused for
    # both the DP fill and the backtrack.
    sims = [[sim(cw["norm"], tok) for tok in asr_tokens] for cw in canon_words]

    def match_score(i, j):
        # reward similarity, centered around 0.75; > 0 is a good match
        return (sims[i][j] - 0.75) * 2.0

    # DP matrices
    dp = [[0.0] * (m + 1) for _ in range(n + 1)]
    bt = [[None] * (m + 1) for _ in range(n + 1)]  # backtrack: 'D' diag, 'U' up, 'L' left
    for i in range(1, n + 1):
        dp[i][0] = dp[i - 1][0] + GAP
        bt[i][0] = 'U'
    for j in range(1, m + 1):
        dp[0][j] = dp[0][j - 1] + GAP
        bt[0][j] = 'L'

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = dp[i - 1][j - 1] + match_score(i - 1, j - 1)
            up = dp[i - 1][j] + GAP
            left = dp[i][j - 1] + GAP
            best = max(diag, up, left)
            dp[i][j] = best
            # Ties prefer the diagonal, then up, then left.
            bt[i][j] = 'D' if best == diag else ('U' if best == up else 'L')

    # Backtrack to alignment pairs
    aligned = []
    i, j = n, m
    while i > 0 or j > 0:
        move = bt[i][j]
        if move == 'D':
            s = sims[i - 1][j - 1]
            aligned.append({
                "canon": canon_words[i - 1],
                "asr_token": asr_tokens[j - 1],
                "score": round(float(s), 3),
                "match": bool(s >= 0.72)
            })
            i -= 1
            j -= 1
        elif move == 'U':
            # canonical word with no ASR counterpart
            aligned.append({
                "canon": canon_words[i - 1],
                "asr_token": None,
                "score": 0.0,
                "match": False
            })
            i -= 1
        else:  # 'L'
            # ASR token skipped
            j -= 1
    aligned.reverse()

    total = len(canon_words)
    # Every entry carries a canonical word, so only the match flag is checked.
    matches = sum(1 for a in aligned if a["match"])
    mismatches = total - matches
    out = {
        "asr_raw": raw,
        "asr_normalized": raw_n,
        "stats": {
            "canonical_words": total,
            "asr_tokens": len(asr_tokens),
            "matches": matches,
            "mismatches": mismatches,
            "match_rate": round(matches / total, 3) if total else 0.0
        },
        "alignment": aligned
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Match rate:", out["stats"]["match_rate"])
    print("First 8 alignments:")
    for a in aligned[:8]:
        print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])


if __name__ == "__main__":
    main()

step16_ctc_word_timestamps.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import json
+import re
+import numpy as np
+import torch
+import librosa
+from transformers import AutoProcessor, AutoModelForCTC
+AUDIO_PATH = "sample_trim.wav"
+ALIGN_PATH = "output/text_alignment_global.json"
+OUT_PATH = "output/word_timestamps.json"
+MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
# Combining marks removed before text comparison: U+064B-U+0652, U+0670 and
# U+0653-U+0655.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Elongation character (kashida); purely typographic, so it is dropped.
TATWEEL = "\u0640"


def normalize_ar(s: str) -> str:
    """Return *s* reduced to a bare, comparison-friendly Arabic spelling.

    Steps, in order: drop tatweel, drop the marks matched by
    ``ARABIC_DIACRITICS``, fold hamza/madda alef forms to plain alef, fold
    alef maqsura to yeh and ta marbuta to heh, then collapse whitespace runs
    to a single space and trim both ends.
    """
    text = ARABIC_DIACRITICS.sub("", s.replace(TATWEEL, ""))
    # (variant, canonical) letter pairs, applied one after another.
    letter_folds = (
        ("أ", "ا"),
        ("إ", "ا"),
        ("آ", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
    )
    for variant, canonical in letter_folds:
        text = text.replace(variant, canonical)
    return re.sub(r"\s+", " ", text).strip()
def main():
    """Estimate per-word timestamps from the greedy CTC path of the ASR model.

    Reads the global text alignment from ``ALIGN_PATH``, runs ``MODEL_ID`` on
    ``AUDIO_PATH`` and, for every canonical word, looks for the letters of its
    aligned ASR token (in order) among the collapsed CTC tokens.  The result
    is written to ``OUT_PATH`` as JSON.  Words without an ASR token, or whose
    letters cannot be located, get a ``null`` timestamp.
    """
    # Load alignment; entries without a canonical word are ignored.
    with open(ALIGN_PATH, encoding="utf-8") as f:
        align = json.load(f)
    alignment = [a for a in align["alignment"] if a.get("canon")]

    # Load audio
    audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    total_sec = len(audio) / sr

    # Load CTC model
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForCTC.from_pretrained(MODEL_ID)
    model.eval()
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits[0]  # (T, V)
    pred_ids = torch.argmax(logits, dim=-1).cpu().numpy().tolist()

    # Invert vocab: id -> token
    vocab = processor.tokenizer.get_vocab()
    inv_vocab = {i: t for t, i in vocab.items()}
    blank_id = processor.tokenizer.pad_token_id
    if blank_id is None:
        # fallback: common wav2vec2 blank is vocab["<pad>"]
        blank_id = vocab.get("<pad>", None)

    # Collapse repeats, remove blanks, keep the frame index of each token's
    # first occurrence.
    collapsed = []
    prev = None
    for t_idx, tok_id in enumerate(pred_ids):
        if tok_id == prev:
            continue
        prev = tok_id
        if blank_id is not None and tok_id == blank_id:
            continue
        tok = inv_vocab.get(tok_id, "")
        if tok.strip() == "":
            continue
        collapsed.append((t_idx, tok))

    # Map CTC frame index -> seconds, treating the T frames as spanning the
    # full audio duration.
    T = logits.shape[0]

    def idx_to_time(i):
        return (i / T) * total_sec

    # Heuristic span search: scan the collapsed tokens from a start position
    # and take the first and last frame where the word's letters appear in
    # order.  Good enough for the MVP, not a real forced alignment.
    def find_span_for_word(word_norm, start_search_idx):
        """Return ((start_frame, end_frame), next_search_idx) or (None, start)."""
        if not word_norm:
            return None, start_search_idx
        target = word_norm.replace(" ", "")
        if target == "":
            return None, start_search_idx
        i = start_search_idx
        start_idx = None
        last_idx = None
        for ch in target:
            found = False
            while i < len(collapsed):
                t_idx, tok = collapsed[i]
                # tokens may be characters or pieces; match if character appears
                if ch in tok:
                    if start_idx is None:
                        start_idx = t_idx
                    last_idx = t_idx
                    i += 1
                    found = True
                    break
                i += 1
            if not found:
                # Leave the caller's pointer untouched on failure.
                return None, start_search_idx
        return (start_idx, last_idx), i

    out_rows = []
    search_ptr = 0
    for a in alignment:
        cw = a["canon"]
        tok = a["asr_token"]
        tok_norm = normalize_ar(tok) if tok else None
        if tok_norm:
            span, next_ptr = find_span_for_word(tok_norm, search_ptr)
        else:
            span, next_ptr = None, search_ptr
        if span is None:
            timestamp = None
        else:
            s_idx, e_idx = span
            timestamp = {
                "start": round(float(idx_to_time(s_idx)), 3),
                "end": round(float(idx_to_time(e_idx)), 3),
            }
            # advance pointer to keep word order monotonic
            search_ptr = next_ptr
        out_rows.append({
            "ayah": cw["ayah"],
            "word": cw["word"],
            "asr_token": tok,
            "score": a["score"],
            "match": a["match"],
            "timestamp": timestamp
        })

    out = {
        "audio_path": AUDIO_PATH,
        "model": MODEL_ID,
        "note": "CTC-based approximate word timestamps; upgrade later with forced alignment for higher accuracy.",
        "stats": {
            "words": len(out_rows),
            "timestamped": sum(1 for r in out_rows if r["timestamp"] is not None)
        },
        "words": out_rows
    }
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print("OK ✅ wrote", OUT_PATH)
    print("Timestamped:", out["stats"]["timestamped"], "/", out["stats"]["words"])
    if out_rows:  # an empty alignment has no sample row to show
        print("Sample:", out_rows[0])


if __name__ == "__main__":
    main()

step16b_token_interpolation_timestamps.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import json
+import re
+import librosa
+AUDIO_PATH = "sample_trim.wav"
+ALIGN_GLOBAL_PATH = "output/text_alignment_global.json"
+OUT_PATH = "output/word_timestamps_v2.json"
# Combining marks stripped before comparing text: U+064B-U+0652, U+0670 and
# U+0653-U+0655.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
# Kashida / elongation character; carries no lexical information.
TATWEEL = "\u0640"


def normalize_ar(s: str) -> str:
    """Normalize Arabic text so orthographic variants compare equal.

    Removes tatweel and the marks in ``ARABIC_DIACRITICS``, folds the hamza
    and madda alef forms to plain alef, alef maqsura to yeh and ta marbuta to
    heh, then squeezes whitespace runs to one space and trims the ends.
    """
    cleaned = ARABIC_DIACRITICS.sub("", s.replace(TATWEEL, ""))
    for variant, canonical in (
        ("أ", "ا"),
        ("إ", "ا"),
        ("آ", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
    ):
        cleaned = cleaned.replace(variant, canonical)
    squeezed = re.sub(r"\s+", " ", cleaned)
    return squeezed.strip()
def tokenize_ar_words(s: str):
    """Split *s* into words made only of characters in U+0600-U+06FF.

    Every character outside that block (other than whitespace) acts as a
    separator.  Returns an empty list when nothing remains.
    """
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
    # split() with no argument splits on whitespace runs and never yields
    # empty strings, which covers the collapse/strip/empty-input handling.
    return arabic_only.split()
def main():
    """Assign every canonical word a timestamp by even token-time interpolation.

    The audio duration is divided evenly across the ASR tokens found in
    ``asr_raw`` of ``ALIGN_GLOBAL_PATH``.  Each canonical word takes the slot
    of its aligned ASR token (searched forward only, so the mapping stays
    monotonic) or, failing that, a slot proportional to its position in the
    canonical sequence.  The result is written to ``OUT_PATH`` as JSON.
    """
    # Load audio duration
    audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
    total_sec = len(audio) / sr

    # Load global alignment (has asr_raw + alignment pairs)
    with open(ALIGN_GLOBAL_PATH, encoding="utf-8") as f:
        g = json.load(f)
    asr_tokens = tokenize_ar_words(normalize_ar(g["asr_raw"]))

    # Build token timeline: divide total audio time across ASR tokens evenly
    # (MVP approximation; later replace with real forced alignment).
    # N is at least 1 so there is always one slot to fall back on.
    N = max(1, len(asr_tokens))
    token_times = [
        (round((i / N) * total_sec, 3), round(((i + 1) / N) * total_sec, 3))
        for i in range(N)
    ]

    alignment = [a for a in g["alignment"] if a.get("canon")]
    n_words = max(1, len(alignment))
    out_words = []
    last_token_idx = 0
    for idx, a in enumerate(alignment):
        cw = a["canon"]
        tok = a["asr_token"]
        # Slot this word would get purely from its position in the canon.
        proportional = int((idx / n_words) * (N - 1))
        if tok is not None:
            tok_norm = normalize_ar(tok)
            # monotonic search: only search forward from last token index
            found = None
            for ti in range(last_token_idx, len(asr_tokens)):
                if asr_tokens[ti] == tok_norm:
                    found = ti
                    break
            if found is None:
                # Proportional fallback, kept monotonic.  last_token_idx can
                # equal N once the final token was consumed, so clamp to the
                # last valid slot instead of indexing past token_times.
                found = min(max(proportional, last_token_idx), N - 1)
            last_token_idx = found + 1
        else:
            # no matched token: proportional fallback
            found = proportional
        t0, t1 = token_times[found]
        out_words.append({
            "index": idx + 1,
            "ayah": cw["ayah"],
            "word": cw["word"],
            "asr_token": tok,
            "score": a["score"],
            "match": a["match"],
            "timestamp": {"start": t0, "end": t1}
        })

    out = {
        "audio_path": AUDIO_PATH,
        "method": "token-time interpolation (MVP)",
        "stats": {
            "canonical_words": len(out_words),
            "asr_tokens": len(asr_tokens),
            "timestamped": len(out_words)
        },
        "words": out_words
    }
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print("OK ✅ wrote", OUT_PATH)
    print("Words timestamped:", len(out_words), "/", len(out_words))
    if out_words:  # nothing to preview for an empty alignment
        print("First:", out_words[0])
        print("Last:", out_words[-1])


if __name__ == "__main__":
    main()

step17_make_api_response.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import json
WORDS_PATH = "output/word_timestamps_v2.json"
MADD_PATH = "output/feedback_madd.json"
CANON_FALLBACK_PATH = "data/fatiha_canonical_fallback.json"
OUT_PATH = "output/api_response.json"


def _read_json(path):
    """Load a UTF-8 JSON document, closing the file afterwards."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main():
    """Merge word timestamps and madd feedback into one API response file.

    Reads ``WORDS_PATH``, ``MADD_PATH`` and ``CANON_FALLBACK_PATH``, attaches
    the canonical madd positions to every word, collects words whose
    ``match`` flag is false as text issues, copies the madd results as madd
    issues, and writes the combined document to ``OUT_PATH``.
    """
    words_doc = _read_json(WORDS_PATH)
    madd_doc = _read_json(MADD_PATH)
    canon_fb = _read_json(CANON_FALLBACK_PATH)

    # Build quick lookup: (ayah, word) -> madd_positions.  A word repeated
    # within the same ayah keeps the positions of its last occurrence.
    madd_pos = {}
    for ay in canon_fb["ayahs"]:
        for wi in ay.get("word_info", []):
            madd_pos[(ay["ayah"], wi["word"])] = wi.get("madd_positions_base_index", [])

    # Word list for UI, plus the subset that failed text matching.
    ui_words = []
    mismatches = []
    for w in words_doc["words"]:
        ay = w["ayah"]
        word = w["word"]
        ui_words.append({
            "index": w["index"],
            "ayah": ay,
            "word": word,
            "timestamp": w["timestamp"],
            "match": w["match"],
            "score": w["score"],
            "madd_positions_base_index": madd_pos.get((ay, word), [])
        })
        if not w["match"]:
            mismatches.append({
                "ayah": ay,
                "word": word,
                "timestamp": w["timestamp"],
                "reason": "text_mismatch",
                "score": w["score"]
            })

    # Madd results already include timestamps; keep them as "issues"
    madd_issues = [
        {
            "type": "madd",
            "ayah": r["ayah"],
            "word": r["word"],
            "timestamp": r["timestamp"],
            "duration_sec": r["duration_sec"],
            "classification": r["classification"],
            "confidence": r["confidence"],
            "tip": r["tip"]
        }
        for r in madd_doc.get("results", [])
    ]

    out = {
        "surah": "Al-Fatiha",
        "audio_path": words_doc["audio_path"],
        "pipeline_version": "mvp-v1",
        "summary": {
            "words_total": len(ui_words),
            "text_mismatches": len(mismatches),
            "madd_issues": len(madd_issues)
        },
        "words": ui_words,
        "issues": {
            "text": mismatches,
            "madd": madd_issues
        },
        "notes": [
            "Word timestamps are MVP (token-time interpolation).",
            "Text alignment uses global DP alignment for robustness.",
            "Madd detection uses intensity-based long voiced segments; replace with phoneme-level alignment later."
        ]
    }
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print("OK ✅ wrote", OUT_PATH)
    print("Summary:", out["summary"])
    if out["issues"]["text"]:
        print("Example text mismatch:", out["issues"]["text"][0])
    if out["issues"]["madd"]:
        print("Example madd issue:", out["issues"]["madd"][0])


if __name__ == "__main__":
    main()

step5_wavlm_test.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import torch
+import librosa
+from transformers import AutoFeatureExtractor, AutoModel
+MODEL_ID = "microsoft/wavlm-base"
def load_audio(path: str, target_sr: int = 16000):
    """Load *path* via ``librosa.load`` as mono audio at *target_sr*.

    Returns the ``(audio, sample_rate)`` pair produced by librosa.
    """
    return librosa.load(path, sr=target_sr, mono=True)
def main():
    """Smoke test: run MODEL_ID on sample.wav and print the embedding shape."""
    print("Loading model:", MODEL_ID)
    extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
    wavlm = AutoModel.from_pretrained(MODEL_ID)
    wavlm.eval()

    waveform, rate = load_audio("sample.wav")
    duration_sec = round(len(waveform) / rate, 2)
    print("Audio length (sec):", duration_sec)

    batch = extractor(waveform, sampling_rate=rate, return_tensors="pt")
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden = wavlm(**batch).last_hidden_state  # presumably [batch, frames, hidden]
    print("OK ✅ WavLM ran on CPU")
    print("Embedding tensor shape:", tuple(hidden.shape))


if __name__ == "__main__":
    main()

step7_fallback_phonemes_and_madd.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import json
+import re
+from arabic_phonemizer import ArabicPhonemizer
# --- Helpers ---
# Script-level Madd heuristic (MVP): a word is Madd-eligible wherever one of
# the long-vowel carriers ا, و, ي, ى or the madd sign "ٓ" appears.
# NOTE(review): if the madd sign here is U+0653 it can never be reported,
# because ARABIC_DIACRITICS removes U+0653 before the scan — confirm intent.
MADD_CHARS = {"ا", "و", "ي", "ى", "ٓ"}
# Marks removed to obtain the base spelling: U+064B-U+0652, U+0670 and
# U+0653-U+0655.
ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")


def strip_diacritics(s: str) -> str:
    """Return *s* without the marks matched by ``ARABIC_DIACRITICS``."""
    return ARABIC_DIACRITICS.sub("", s)


def detect_madd_positions(word: str):
    """
    List the indices, within the diacritics-stripped form of *word*, whose
    character is in ``MADD_CHARS``.

    MVP heuristic; later replace with Quranic-Phonemizer (Tajweed-aware).
    """
    positions = []
    for pos, letter in enumerate(strip_diacritics(word)):
        if letter in MADD_CHARS:
            positions.append(pos)
    return positions
def main():
    """Annotate every canonical word with base form, phonemes and madd indices.

    Reads data/fatiha_canonical.json, adds a ``word_info`` list to each ayah
    and writes the enriched document to data/fatiha_canonical_fallback.json.
    """
    # One phonemizer instance is reused for every word.
    phonemizer = ArabicPhonemizer()

    path_in = "data/fatiha_canonical.json"
    with open(path_in, "r", encoding="utf-8") as src:
        data = json.load(src)

    for ay in data["ayahs"]:
        # NOTE(review): assumes this ArabicPhonemizer version exposes
        # .phonemize(text) — confirm against the installed package.
        ay["word_info"] = [
            {
                "word": w,
                "base": strip_diacritics(w),
                "phonemes_fallback": phonemizer.phonemize(w),
                "madd_positions_base_index": detect_madd_positions(w)
            }
            for w in ay["words"]
        ]

    path_out = "data/fatiha_canonical_fallback.json"
    with open(path_out, "w", encoding="utf-8") as dst:
        json.dump(data, dst, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", path_out)
    print("Sample ayah 1 word_info:")
    first_ayah = data["ayahs"][0]
    for item in first_ayah["word_info"]:
        print(" -", item["word"], "| base:", item["base"], "| madd idx:", item["madd_positions_base_index"], "| ph:", item["phonemes_fallback"])


if __name__ == "__main__":
    main()

step8_madd_signal.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import parselmouth
+import numpy as np
+AUDIO_PATH = "sample_trim.wav"
def main():
    """Print the intensity-based voiced segments of AUDIO_PATH and the long ones."""
    sound = parselmouth.Sound(AUDIO_PATH)
    print("Audio duration (sec):", round(sound.get_total_duration(), 2))

    # Intensity (energy over time), sampled every 10 ms.
    contour = sound.to_intensity(time_step=0.01)
    frame_times = contour.xs()
    levels = contour.values[0]

    # Frames louder than the 60th percentile count as "voiced-ish".
    loud = levels > np.percentile(levels, 60)

    # Padding with False on both sides makes every run of loud frames show up
    # as a (rise, fall) pair of flips; a fall at len(frames) means the run
    # reached the end of the recording.
    padded = np.concatenate(([False], loud, [False]))
    flips = np.flatnonzero(padded[1:] != padded[:-1])
    last_frame = len(frame_times) - 1
    segments = []
    for first, stop in flips.reshape(-1, 2):
        seg_start = frame_times[first]
        seg_end = frame_times[min(stop, last_frame)]
        if seg_end - seg_start >= 0.06:  # ignore tiny blips
            segments.append((seg_start, seg_end))

    # Print segments
    print("Candidate voiced segments:", len(segments))
    for i, (s, e) in enumerate(segments[:12], 1):
        print(f"{i:02d}. {s:.2f} -> {e:.2f}  (dur {e-s:.2f}s)")

    # Heuristic: a segment of at least 0.18 s is treated as a possible Madd.
    longish = [(s, e, e - s) for (s, e) in segments if (e - s) >= 0.18]
    print("\nLong segments (possible Madd candidates):", len(longish))
    for i, (s, e, d) in enumerate(longish[:12], 1):
        print(f"{i:02d}. {s:.2f} -> {e:.2f}  (dur {d:.2f}s)")


if __name__ == "__main__":
    main()

step9_madd_feedback_json.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import json
+import numpy as np
+import parselmouth
+AUDIO_PATH = "sample_trim.wav"
+CANON_PATH = "data/fatiha_canonical_fallback.json"
+OUT_PATH = "output/feedback_madd.json"
+# --- Heuristic thresholds (MVP) ---
+# Quranic madd lengths depend on rule; for MVP we just classify by duration.
+TOO_SHORT_SEC = 0.15
+OK_MAX_SEC = 0.35
+TOO_LONG_SEC = 0.35
def extract_long_voiced_segments(sound: "parselmouth.Sound"):
    """Return ``(start, end, duration)`` tuples for long loud stretches of *sound*.

    The intensity contour is sampled every 10 ms.  Frames above the 60th
    percentile are treated as voiced, consecutive voiced frames are merged
    into segments, segments shorter than 0.06 s are dropped, and only those
    lasting at least 0.18 s are returned as Madd candidates.
    """
    contour = sound.to_intensity(time_step=0.01)
    frame_times = contour.xs()
    levels = contour.values[0]
    loud = levels > np.percentile(levels, 60)

    # Padding with False on both sides makes every run of loud frames show up
    # as a (rise, fall) pair of flips; a fall at len(frames) means the run
    # reached the end of the recording, where the last frame time is used.
    padded = np.concatenate(([False], loud, [False]))
    flips = np.flatnonzero(padded[1:] != padded[:-1])
    last_frame = len(frame_times) - 1

    candidates = []
    for first, stop in flips.reshape(-1, 2):
        seg_start = float(frame_times[first])
        seg_end = float(frame_times[min(stop, last_frame)])
        if seg_end - seg_start >= 0.06:
            candidates.append((seg_start, seg_end))

    # Return only the longer ones as Madd candidates
    return [(s, e, e - s) for (s, e) in candidates if (e - s) >= 0.18]
def madd_words_in_order(canon):
    """
    Collect, in ayah/word order, every word whose
    ``madd_positions_base_index`` list is non-empty.

    Each returned dict carries the ayah number, the word, its base form, the
    madd positions and the fallback phonemes ("" when absent).
    """
    return [
        {
            "ayah": ayah["ayah"],
            "word": info["word"],
            "base": info["base"],
            "madd_positions_base_index": info["madd_positions_base_index"],
            "phonemes_fallback": info.get("phonemes_fallback", "")
        }
        for ayah in canon["ayahs"]
        for info in ayah["word_info"]
        if info.get("madd_positions_base_index")
    ]
def classify_duration(d):
    """Label a duration in seconds as "too_short", "ok" or "too_long".

    Below ``TOO_SHORT_SEC`` is too short; up to and including ``OK_MAX_SEC``
    is ok; anything else is too long.
    """
    if d < TOO_SHORT_SEC:
        return "too_short"
    return "ok" if d <= OK_MAX_SEC else "too_long"
def confidence_from_duration(d):
    """Return a crude confidence for the duration label of *d*.

    Inside the ok band the confidence is a flat 0.55.  Outside it starts at
    0.60 and grows with the distance from the band, capped at 0.95.
    """
    if d < TOO_SHORT_SEC:
        shortfall = TOO_SHORT_SEC - d
        return min(0.95, 0.60 + shortfall * 2.0)
    if d <= OK_MAX_SEC:
        return 0.55
    excess = d - OK_MAX_SEC
    return min(0.95, 0.60 + excess * 1.2)
def main():
    """Build the Madd feedback JSON for AUDIO_PATH and write it to OUT_PATH.

    Long voiced segments are paired, in order, with the Madd-eligible words
    of the canonical text; each pair is classified by segment duration.
    """
    # Canonical word info and the words that carry a madd position.
    with open(CANON_PATH, "r", encoding="utf-8") as src:
        canon = json.load(src)
    madd_targets = madd_words_in_order(canon)

    # Candidate segments from the recording.
    longish = extract_long_voiced_segments(parselmouth.Sound(AUDIO_PATH))

    feedback = {
        "surah": canon["surah"],
        "riwayah": canon["riwayah"],
        "rule": "Madd (MVP heuristic)",
        "audio_path": AUDIO_PATH,
        "notes": [
            "This MVP uses intensity-based voiced segments and maps long segments to Madd-eligible words in order.",
            "Replace with real forced alignment + Quranic-Phonemizer later for Tajweed-accurate placement."
        ],
        "segments_detected": [{"start": s, "end": e, "dur": d} for (s, e, d) in longish],
        "madd_targets": madd_targets,
        "results": []
    }

    # User-facing tips for the two problem labels; "ok" uses the default.
    tips = {
        "too_short": "Extend the vowel a bit more (madd).",
        "too_long": "Shorten the vowel slightly (avoid over-stretching)."
    }

    # Sequential mapping: zip stops at the shorter of the two lists.
    for position, ((s, e, d), tgt) in enumerate(zip(longish, madd_targets), 1):
        label = classify_duration(d)
        feedback["results"].append({
            "index": position,
            "ayah": tgt["ayah"],
            "word": tgt["word"],
            "timestamp": {"start": round(s, 3), "end": round(e, 3)},
            "duration_sec": round(d, 3),
            "classification": label,
            "confidence": float(round(confidence_from_duration(d), 3)),
            "tip": tips.get(label, "Madd length looks OK.")
        })

    with open(OUT_PATH, "w", encoding="utf-8") as dst:
        json.dump(feedback, dst, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Long segments:", len(longish))
    print("Madd target words:", len(madd_targets))
    print("Mapped results:", len(feedback["results"]))
    if feedback["results"]:
        print("Sample result:", feedback["results"][0])


if __name__ == "__main__":
    main()