# File size: 2,927 Bytes
# Revision: 4ca6263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import json

# All paths are relative, so they resolve against the current working directory.
# Input: word-level document; main() reads its "words" list and "audio_path".
WORDS_PATH = "output/word_timestamps_v2.json"
# Input: madd feedback document; main() reads its optional "results" list.
MADD_PATH = "output/feedback_madd.json"
# Input: canonical fallback data; main() reads the "word_info" entries of each item in "ayahs".
CANON_FALLBACK_PATH = "data/fatiha_canonical_fallback.json"
# Output: the merged JSON response written by main().
OUT_PATH = "output/api_response.json"

def _load_json(path):
    """Read *path* as UTF-8 JSON and return the decoded object."""
    # The context manager closes the handle even if decoding fails.
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def main():
    """Merge word timestamps and madd feedback into one JSON response file.

    Reads WORDS_PATH, MADD_PATH and CANON_FALLBACK_PATH, builds a single
    document (per-word list, text mismatches, madd issues, summary counts)
    and writes it to OUT_PATH as indented UTF-8 JSON, then prints a short
    summary to stdout.

    Raises:
        OSError: if an input file cannot be opened or the output cannot be
            written.
        json.JSONDecodeError: if an input file is not valid JSON.
        KeyError: if a required key is missing from an input document.
    """
    words_doc = _load_json(WORDS_PATH)
    madd_doc = _load_json(MADD_PATH)
    canon_fb = _load_json(CANON_FALLBACK_PATH)

    # Build quick lookup: (ayah, word) -> madd_positions
    # NOTE(review): if the same word text occurs twice within one ayah, the
    # later entry overwrites the earlier one -- confirm this is acceptable.
    madd_pos = {}
    for ay in canon_fb["ayahs"]:
        for wi in ay.get("word_info", []):
            madd_pos[(ay["ayah"], wi["word"])] = wi.get("madd_positions_base_index", [])

    # Word list for UI, plus one "text_mismatch" issue per non-matching word
    ui_words = []
    mismatches = []
    for w in words_doc["words"]:
        ay = w["ayah"]
        word = w["word"]
        ui_words.append({
            "index": w["index"],
            "ayah": ay,
            "word": word,
            "timestamp": w["timestamp"],
            "match": w["match"],
            "score": w["score"],
            # Words absent from the canonical fallback get no madd positions.
            "madd_positions_base_index": madd_pos.get((ay, word), []),
        })
        if not w["match"]:
            mismatches.append({
                "ayah": ay,
                "word": word,
                "timestamp": w["timestamp"],
                "reason": "text_mismatch",
                "score": w["score"],
            })

    # Madd results already include timestamps; keep them as "issues"
    madd_issues = [
        {
            "type": "madd",
            "ayah": r["ayah"],
            "word": r["word"],
            "timestamp": r["timestamp"],
            "duration_sec": r["duration_sec"],
            "classification": r["classification"],
            "confidence": r["confidence"],
            "tip": r["tip"],
        }
        for r in madd_doc.get("results", [])
    ]

    out = {
        "surah": "Al-Fatiha",
        "audio_path": words_doc["audio_path"],
        "pipeline_version": "mvp-v1",
        "summary": {
            "words_total": len(ui_words),
            "text_mismatches": len(mismatches),
            "madd_issues": len(madd_issues),
        },
        "words": ui_words,
        "issues": {
            "text": mismatches,
            "madd": madd_issues,
        },
        "notes": [
            "Word timestamps are MVP (token-time interpolation).",
            "Text alignment uses global DP alignment for robustness.",
            "Madd detection uses intensity-based long voiced segments; replace with phoneme-level alignment later.",
        ],
    }

    # Close (and therefore flush) the output deterministically instead of
    # relying on garbage collection of an anonymous file object.
    with open(OUT_PATH, "w", encoding="utf-8") as fh:
        json.dump(out, fh, ensure_ascii=False, indent=2)

    print("OK ✅ wrote", OUT_PATH)
    print("Summary:", out["summary"])
    if out["issues"]["text"]:
        print("Example text mismatch:", out["issues"]["text"][0])
    if out["issues"]["madd"]:
        print("Example madd issue:", out["issues"]["madd"][0])

# Run the merge only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()