didodev committed
Commit 4ca6263 · 1 Parent(s): 468a7b7

Deploy iRecite MVP API (Docker + FastAPI)

.dockerignore ADDED
@@ -0,0 +1,13 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+ env/
8
+ venv/
9
+ .uvicorn/
10
+ uploads/*
11
+ output/*
12
+ sample.wav
13
+ sample_trim.wav
Dockerfile ADDED
@@ -0,0 +1,21 @@
1
+ FROM python:3.11-slim
2
+
3
+ # System deps (ffmpeg for audio conversion + git for some pip installs if needed)
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ ffmpeg \
6
+ git \
7
+ && rm -rf /var/lib/apt/lists/*
8
+
9
+ WORKDIR /app
10
+
11
+ # Install Python deps
12
+ COPY requirements.txt /app/requirements.txt
13
+ RUN pip install --no-cache-dir -r requirements.txt
14
+
15
+ # Copy the application code
16
+ COPY . /app
17
+
18
+ # Hugging Face Spaces expects port 7860
19
+ EXPOSE 7860
20
+
21
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,129 @@
1
+ import os
2
+ import re
3
+ import shutil
4
+ import subprocess
5
+ from fastapi import FastAPI, UploadFile, File
6
+ from fastapi.responses import JSONResponse
7
+
8
+ app = FastAPI(title="iRecite MVP API")
9
+
10
+ WORKDIR = os.path.dirname(os.path.abspath(__file__))
11
+ PYTHON = os.path.join(WORKDIR, ".venv", "Scripts", "python.exe")  # legacy Windows venv path; unused, since run() switches to sys.executable
12
+ UPLOADS = os.path.join(WORKDIR, "uploads")
13
+ OUTPUT_DIR = os.path.join(WORKDIR, "output")
14
+ API_JSON = os.path.join(OUTPUT_DIR, "api_response.json")
15
+
16
+ import sys
17
+
18
+ def run(cmd):
19
+ # Always run child scripts with the same Python interpreter as the server
20
+ if cmd and cmd[0].lower() == "python":
21
+ cmd = [sys.executable] + cmd[1:]
22
+ subprocess.check_call(cmd, cwd=WORKDIR)
23
+
24
+ def detect_trim_times(wav_path: str):
25
+ """
26
+ Use ffmpeg silencedetect to get start/end of main speech.
27
+ Returns (start_sec, end_sec). If detection fails, returns (0, full_duration).
28
+ """
29
+ # Run silencedetect and capture output
30
+ p = subprocess.run(
31
+ ["ffmpeg", "-i", wav_path, "-af", "silencedetect=noise=-35dB:d=0.35", "-f", "null", os.devnull],
32
+ cwd=WORKDIR,
33
+ stdout=subprocess.PIPE,
34
+ stderr=subprocess.STDOUT,
35
+ text=True,
36
+ encoding="utf-8",
37
+ errors="ignore"
38
+ )
39
+ txt = p.stdout
40
+
41
+ # Find first "silence_end" near the beginning (speech start)
42
+ # and last "silence_start" near the end (speech end)
43
+ silence_end = None
44
+ silence_start_last = None
45
+
46
+ for line in txt.splitlines():
47
+ if "silence_end:" in line:
48
+ m = re.search(r"silence_end:\s*([0-9.]+)", line)
49
+ if m and silence_end is None:
50
+ silence_end = float(m.group(1))
51
+ if "silence_start:" in line:
52
+ m = re.search(r"silence_start:\s*([0-9.]+)", line)
53
+ if m:
54
+ silence_start_last = float(m.group(1))
55
+
56
+ # Get full duration using ffprobe
57
+ pr = subprocess.run(
58
+ ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=nw=1:nk=1", wav_path],
59
+ cwd=WORKDIR,
60
+ stdout=subprocess.PIPE,
61
+ stderr=subprocess.PIPE,
62
+ text=True
63
+ )
64
+ try:
65
+ full_dur = float(pr.stdout.strip())
66
+ except Exception:
67
+ full_dur = None
68
+
69
+ start = max(0.0, (silence_end if silence_end is not None else 0.0))
70
+ end = (silence_start_last if silence_start_last is not None else (full_dur if full_dur is not None else 0.0))
71
+
72
+ # Sanity checks
73
+ if full_dur is not None:
74
+ end = min(end, full_dur)
75
+ if end <= start + 1.0:
76
+ # fallback: don't trim
77
+ return 0.0, full_dur if full_dur is not None else 0.0
78
+
79
+ # small padding
80
+ start = max(0.0, start - 0.10)
81
+ end = end + 0.10
82
+ if full_dur is not None:
83
+ end = min(end, full_dur)
84
+
85
+ return start, end
86
+
87
+ @app.post("/analyze")
88
+ async def analyze(file: UploadFile = File(...)):
89
+ os.makedirs(UPLOADS, exist_ok=True)
90
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
91
+
92
+ # Save upload
93
+ upload_path = os.path.join(UPLOADS, file.filename)
94
+ with open(upload_path, "wb") as f:
95
+ shutil.copyfileobj(file.file, f)
96
+
97
+ # Convert to 16k mono wav
98
+ sample_wav = os.path.join(WORKDIR, "sample.wav")
99
+ run(["ffmpeg", "-y", "-i", upload_path, "-ac", "1", "-ar", "16000", sample_wav])
100
+
101
+ # Auto trim -> sample_trim.wav
102
+ sample_trim = os.path.join(WORKDIR, "sample_trim.wav")
103
+ start, end = detect_trim_times(sample_wav)
104
+ if end and end > start:
105
+ run(["ffmpeg", "-y", "-i", sample_wav, "-ss", f"{start:.2f}", "-to", f"{end:.2f}", "-ac", "1", "-ar", "16000", sample_trim])
106
+ else:
107
+ shutil.copy(sample_wav, sample_trim)
108
+
109
+ # Run pipeline (ordered)
110
+ run(["python", "step7_fallback_phonemes_and_madd.py"]) # ensures fallback json exists
111
+ run(["python", "step8_madd_signal.py"])
112
+ run(["python", "step9_madd_feedback_json.py"])
113
+ run(["python", "step13_arabic_ctc_transcribe.py"]) # now writes output/asr_raw.txt automatically
114
+ run(["python", "step14_align_text_to_canonical.py"])
115
+ run(["python", "step15_global_word_alignment.py"])
116
+ run(["python", "step16b_token_interpolation_timestamps.py"])
117
+ run(["python", "step17_make_api_response.py"])
118
+
119
+ if not os.path.exists(API_JSON):
120
+ return JSONResponse({"error": "api_response.json not generated"}, status_code=500)
121
+
122
+ import json
123
+ with open(API_JSON, "r", encoding="utf-8") as f:
124
+ data = json.load(f)
125
+
126
+ # include trim info for debugging
127
+ data["debug"] = {"trim": {"start": round(start, 2), "end": round(end, 2)}}
128
+
129
+ return data
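A minimal client-side sketch for exercising the new endpoint (assumes the container is running locally on port 7860 and that a test file such as my_recitation.wav exists; requests is a client-side dependency, not part of requirements.txt):

import requests

with open("my_recitation.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/analyze",
        files={"file": ("my_recitation.wav", f, "audio/wav")},  # matches the UploadFile parameter name
    )
resp.raise_for_status()
report = resp.json()
print(report["summary"])        # words_total / text_mismatches / madd_issues
print(report["debug"]["trim"])  # trim window chosen by detect_trim_times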
data/fatiha_canonical.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "surah": "Al-Fatiha",
3
+ "surah_number": 1,
4
+ "riwayah": "Hafs",
5
+ "bismillah_included": true,
6
+ "ayahs": [
7
+ {
8
+ "ayah": 1,
9
+ "arabic": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ",
10
+ "words": ["بِسْمِ", "اللَّهِ", "الرَّحْمَٰنِ", "الرَّحِيمِ"]
11
+ },
12
+ {
13
+ "ayah": 2,
14
+ "arabic": "الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
15
+ "words": ["الْحَمْدُ", "لِلَّهِ", "رَبِّ", "الْعَالَمِينَ"]
16
+ },
17
+ {
18
+ "ayah": 3,
19
+ "arabic": "الرَّحْمَٰنِ الرَّحِيمِ",
20
+ "words": ["الرَّحْمَٰنِ", "الرَّحِيمِ"]
21
+ },
22
+ {
23
+ "ayah": 4,
24
+ "arabic": "مَالِكِ يَوْمِ الدِّينِ",
25
+ "words": ["مَالِكِ", "يَوْمِ", "الدِّينِ"]
26
+ },
27
+ {
28
+ "ayah": 5,
29
+ "arabic": "إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ",
30
+ "words": ["إِيَّاكَ", "نَعْبُدُ", "وَإِيَّاكَ", "نَسْتَعِينُ"]
31
+ },
32
+ {
33
+ "ayah": 6,
34
+ "arabic": "اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ",
35
+ "words": ["اهْدِنَا", "الصِّرَاطَ", "الْمُسْتَقِيمَ"]
36
+ },
37
+ {
38
+ "ayah": 7,
39
+ "arabic": "صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ",
40
+ "words": ["صِرَاطَ", "الَّذِينَ", "أَنْعَمْتَ", "عَلَيْهِمْ", "غَيْرِ", "الْمَغْضُوبِ", "عَلَيْهِمْ", "وَلَا", "الضَّالِّينَ"]
41
+ }
42
+ ]
43
+ }
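For reference, a small sketch of how the later pipeline steps consume this file, flattening the ayahs into an ordered word list (counts assume the JSON above):

import json

with open("data/fatiha_canonical.json", encoding="utf-8") as f:
    canon = json.load(f)

# flatten into recitation order, the shape the alignment steps work with
words = [
    {"ayah": ay["ayah"], "word": w}
    for ay in canon["ayahs"]
    for w in ay["words"]
]
print(len(words))   # 29 words across the 7 ayahs (Hafs, bismillah included)
print(words[0])     # {'ayah': 1, 'word': 'بِسْمِ'}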
data/fatiha_canonical_fallback.json ADDED
@@ -0,0 +1,332 @@
1
+ {
2
+ "surah": "Al-Fatiha",
3
+ "surah_number": 1,
4
+ "riwayah": "Hafs",
5
+ "bismillah_included": true,
6
+ "ayahs": [
7
+ {
8
+ "ayah": 1,
9
+ "arabic": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ",
10
+ "words": [
11
+ "بِسْمِ",
12
+ "اللَّهِ",
13
+ "الرَّحْمَٰنِ",
14
+ "الرَّحِيمِ"
15
+ ],
16
+ "word_info": [
17
+ {
18
+ "word": "بِسْمِ",
19
+ "base": "بسم",
20
+ "phonemes_fallback": "bisomi",
21
+ "madd_positions_base_index": []
22
+ },
23
+ {
24
+ "word": "اللَّهِ",
25
+ "base": "الله",
26
+ "phonemes_fallback": ">al~ahi",
27
+ "madd_positions_base_index": [
28
+ 0
29
+ ]
30
+ },
31
+ {
32
+ "word": "الرَّحْمَٰنِ",
33
+ "base": "الرحمن",
34
+ "phonemes_fallback": ">ar~aHomaٰni",
35
+ "madd_positions_base_index": [
36
+ 0
37
+ ]
38
+ },
39
+ {
40
+ "word": "الرَّحِيمِ",
41
+ "base": "الرحيم",
42
+ "phonemes_fallback": ">ar~aHiymi",
43
+ "madd_positions_base_index": [
44
+ 0,
45
+ 4
46
+ ]
47
+ }
48
+ ]
49
+ },
50
+ {
51
+ "ayah": 2,
52
+ "arabic": "الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
53
+ "words": [
54
+ "الْحَمْدُ",
55
+ "لِلَّهِ",
56
+ "رَبِّ",
57
+ "الْعَالَمِينَ"
58
+ ],
59
+ "word_info": [
60
+ {
61
+ "word": "الْحَمْدُ",
62
+ "base": "الحمد",
63
+ "phonemes_fallback": ">aloHamodu",
64
+ "madd_positions_base_index": [
65
+ 0
66
+ ]
67
+ },
68
+ {
69
+ "word": "لِلَّهِ",
70
+ "base": "لله",
71
+ "phonemes_fallback": "lilohi",
72
+ "madd_positions_base_index": []
73
+ },
74
+ {
75
+ "word": "رَبِّ",
76
+ "base": "رب",
77
+ "phonemes_fallback": "rab~i",
78
+ "madd_positions_base_index": []
79
+ },
80
+ {
81
+ "word": "الْعَالَمِينَ",
82
+ "base": "العالمين",
83
+ "phonemes_fallback": ">aloEaAlamiyna",
84
+ "madd_positions_base_index": [
85
+ 0,
86
+ 3,
87
+ 6
88
+ ]
89
+ }
90
+ ]
91
+ },
92
+ {
93
+ "ayah": 3,
94
+ "arabic": "الرَّحْمَٰنِ الرَّحِيمِ",
95
+ "words": [
96
+ "الرَّحْمَٰنِ",
97
+ "الرَّحِيمِ"
98
+ ],
99
+ "word_info": [
100
+ {
101
+ "word": "الرَّحْمَٰنِ",
102
+ "base": "الرحمن",
103
+ "phonemes_fallback": ">ar~aHomaٰni",
104
+ "madd_positions_base_index": [
105
+ 0
106
+ ]
107
+ },
108
+ {
109
+ "word": "الرَّحِيمِ",
110
+ "base": "الرحيم",
111
+ "phonemes_fallback": ">ar~aHiymi",
112
+ "madd_positions_base_index": [
113
+ 0,
114
+ 4
115
+ ]
116
+ }
117
+ ]
118
+ },
119
+ {
120
+ "ayah": 4,
121
+ "arabic": "مَالِكِ يَوْمِ الدِّينِ",
122
+ "words": [
123
+ "مَالِكِ",
124
+ "يَوْمِ",
125
+ "الدِّينِ"
126
+ ],
127
+ "word_info": [
128
+ {
129
+ "word": "مَالِكِ",
130
+ "base": "مالك",
131
+ "phonemes_fallback": "maAliki",
132
+ "madd_positions_base_index": [
133
+ 1
134
+ ]
135
+ },
136
+ {
137
+ "word": "يَوْمِ",
138
+ "base": "يوم",
139
+ "phonemes_fallback": "yawomi",
140
+ "madd_positions_base_index": [
141
+ 0,
142
+ 1
143
+ ]
144
+ },
145
+ {
146
+ "word": "الدِّينِ",
147
+ "base": "الدين",
148
+ "phonemes_fallback": ">ad~iyni",
149
+ "madd_positions_base_index": [
150
+ 0,
151
+ 3
152
+ ]
153
+ }
154
+ ]
155
+ },
156
+ {
157
+ "ayah": 5,
158
+ "arabic": "إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ",
159
+ "words": [
160
+ "إِيَّاكَ",
161
+ "نَعْبُدُ",
162
+ "وَإِيَّاكَ",
163
+ "نَسْتَعِينُ"
164
+ ],
165
+ "word_info": [
166
+ {
167
+ "word": "إِيَّاكَ",
168
+ "base": "إياك",
169
+ "phonemes_fallback": "<iy~aAka",
170
+ "madd_positions_base_index": [
171
+ 1,
172
+ 2
173
+ ]
174
+ },
175
+ {
176
+ "word": "نَعْبُدُ",
177
+ "base": "نعبد",
178
+ "phonemes_fallback": "naEobudu",
179
+ "madd_positions_base_index": []
180
+ },
181
+ {
182
+ "word": "وَإِيَّاكَ",
183
+ "base": "وإياك",
184
+ "phonemes_fallback": "wa<iy~aAka",
185
+ "madd_positions_base_index": [
186
+ 0,
187
+ 2,
188
+ 3
189
+ ]
190
+ },
191
+ {
192
+ "word": "نَسْتَعِينُ",
193
+ "base": "نستعين",
194
+ "phonemes_fallback": "nasotaEiynu",
195
+ "madd_positions_base_index": [
196
+ 4
197
+ ]
198
+ }
199
+ ]
200
+ },
201
+ {
202
+ "ayah": 6,
203
+ "arabic": "اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ",
204
+ "words": [
205
+ "اهْدِنَا",
206
+ "الصِّرَاطَ",
207
+ "الْمُسْتَقِيمَ"
208
+ ],
209
+ "word_info": [
210
+ {
211
+ "word": "اهْدِنَا",
212
+ "base": "اهدنا",
213
+ "phonemes_fallback": "<ihodinaA",
214
+ "madd_positions_base_index": [
215
+ 0,
216
+ 4
217
+ ]
218
+ },
219
+ {
220
+ "word": "الصِّرَاطَ",
221
+ "base": "الصراط",
222
+ "phonemes_fallback": ">aS~iraATa",
223
+ "madd_positions_base_index": [
224
+ 0,
225
+ 4
226
+ ]
227
+ },
228
+ {
229
+ "word": "الْمُسْتَقِيمَ",
230
+ "base": "المستقيم",
231
+ "phonemes_fallback": ">alomusotaqiyma",
232
+ "madd_positions_base_index": [
233
+ 0,
234
+ 6
235
+ ]
236
+ }
237
+ ]
238
+ },
239
+ {
240
+ "ayah": 7,
241
+ "arabic": "صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ الْمَغْضُوبِ عَلَيْهِمْ وَلَا الضَّالِّينَ",
242
+ "words": [
243
+ "صِرَاطَ",
244
+ "الَّذِينَ",
245
+ "أَنْعَمْتَ",
246
+ "عَلَيْهِمْ",
247
+ "غَيْرِ",
248
+ "الْمَغْضُوبِ",
249
+ "عَلَيْهِمْ",
250
+ "وَلَا",
251
+ "الضَّالِّينَ"
252
+ ],
253
+ "word_info": [
254
+ {
255
+ "word": "صِرَاطَ",
256
+ "base": "صراط",
257
+ "phonemes_fallback": "SiraATa",
258
+ "madd_positions_base_index": [
259
+ 2
260
+ ]
261
+ },
262
+ {
263
+ "word": "الَّذِينَ",
264
+ "base": "الذين",
265
+ "phonemes_fallback": ">al~a*iyna",
266
+ "madd_positions_base_index": [
267
+ 0,
268
+ 3
269
+ ]
270
+ },
271
+ {
272
+ "word": "أَنْعَمْتَ",
273
+ "base": "أنعمت",
274
+ "phonemes_fallback": ">anoEamota",
275
+ "madd_positions_base_index": []
276
+ },
277
+ {
278
+ "word": "عَلَيْهِمْ",
279
+ "base": "عليهم",
280
+ "phonemes_fallback": "Ealayohimo",
281
+ "madd_positions_base_index": [
282
+ 2
283
+ ]
284
+ },
285
+ {
286
+ "word": "غَيْرِ",
287
+ "base": "غير",
288
+ "phonemes_fallback": "gayori",
289
+ "madd_positions_base_index": [
290
+ 1
291
+ ]
292
+ },
293
+ {
294
+ "word": "الْمَغْضُوبِ",
295
+ "base": "المغضوب",
296
+ "phonemes_fallback": ">alomagoDuwbi",
297
+ "madd_positions_base_index": [
298
+ 0,
299
+ 5
300
+ ]
301
+ },
302
+ {
303
+ "word": "عَلَيْهِمْ",
304
+ "base": "عليهم",
305
+ "phonemes_fallback": "Ealayohimo",
306
+ "madd_positions_base_index": [
307
+ 2
308
+ ]
309
+ },
310
+ {
311
+ "word": "وَلَا",
312
+ "base": "ولا",
313
+ "phonemes_fallback": "walaA",
314
+ "madd_positions_base_index": [
315
+ 0,
316
+ 2
317
+ ]
318
+ },
319
+ {
320
+ "word": "الضَّالِّينَ",
321
+ "base": "الضالين",
322
+ "phonemes_fallback": ">aD~aAl~iyna",
323
+ "madd_positions_base_index": [
324
+ 0,
325
+ 3,
326
+ 5
327
+ ]
328
+ }
329
+ ]
330
+ }
331
+ ]
332
+ }
requirements.txt ADDED
@@ -0,0 +1,15 @@
1
+ fastapi==0.128.0
2
+ uvicorn==0.40.0
3
+ python-multipart==0.0.21
4
+
5
+ numpy
6
+ librosa
7
+ soundfile
8
+ webrtcvad
9
+ praat-parselmouth
10
+ dtw-python
11
+
12
+ torch
13
+ transformers
14
+ sentencepiece
15
+ jiwer
step10_word_segments_and_mapping.py ADDED
@@ -0,0 +1,126 @@
1
+ import json
2
+ import wave
3
+ import contextlib
4
+ import numpy as np
5
+ import webrtcvad
6
+ import librosa
7
+ from difflib import SequenceMatcher
8
+ from arabic_phonemizer import ArabicPhonemizer
9
+
10
+ AUDIO_PATH = "sample.wav"
11
+ CANON_PATH = "data/fatiha_canonical_fallback.json"
12
+ OUT_PATH = "output/word_mapping.json"
13
+
14
+ # VAD settings
15
+ VAD_MODE = 2 # 0-3 (higher = more aggressive)
16
+ FRAME_MS = 30 # 10, 20, or 30ms required
17
+
18
+ def read_wav_mono16k(path):
19
+ # librosa loads float32; we need int16 pcm for VAD
20
+ audio, sr = librosa.load(path, sr=16000, mono=True)
21
+ pcm16 = (audio * 32767).astype(np.int16)
22
+ return pcm16, 16000
23
+
24
+ def frame_generator(pcm16, sr, frame_ms):
25
+ n = int(sr * frame_ms / 1000)
26
+ offset = 0
27
+ while offset + n < len(pcm16):
28
+ yield pcm16[offset:offset+n]
29
+ offset += n
30
+
31
+ def vad_segments(pcm16, sr, frame_ms, mode):
32
+ vad = webrtcvad.Vad(mode)
33
+ frames = list(frame_generator(pcm16, sr, frame_ms))
34
+ voiced_flags = [vad.is_speech(f.tobytes(), sr) for f in frames]
35
+
36
+ # Convert voiced_flags into segments in seconds
37
+ segments = []
38
+ in_seg = False
39
+ start_i = 0
40
+ for i, v in enumerate(voiced_flags):
41
+ if v and not in_seg:
42
+ in_seg = True
43
+ start_i = i
44
+ elif (not v) and in_seg:
45
+ in_seg = False
46
+ end_i = i
47
+ segments.append((start_i, end_i))
48
+ if in_seg:
49
+ segments.append((start_i, len(voiced_flags)))
50
+
51
+ # Merge segments that are too close
52
+ merged = []
53
+ for s, e in segments:
54
+ if not merged:
55
+ merged.append([s, e])
56
+ else:
57
+ prev_s, prev_e = merged[-1]
58
+ gap = s - prev_e
59
+ if gap <= 2: # ~60ms gap
60
+ merged[-1][1] = e
61
+ else:
62
+ merged.append([s, e])
63
+
64
+ # Convert to time
65
+ out = []
66
+ for s, e in merged:
67
+ t0 = (s * frame_ms) / 1000.0
68
+ t1 = (e * frame_ms) / 1000.0
69
+ if (t1 - t0) >= 0.10:
70
+ out.append((round(t0, 3), round(t1, 3)))
71
+ return out
72
+
73
+ def canonical_words(canon):
74
+ words = []
75
+ for ay in canon["ayahs"]:
76
+ for w in ay["word_info"]:
77
+ words.append({"ayah": ay["ayah"], "word": w["word"], "base": w["base"]})
78
+ return words
79
+
80
+ def similarity(a, b):
81
+ return SequenceMatcher(None, a, b).ratio()
82
+
83
+ def main():
84
+ with open(CANON_PATH, "r", encoding="utf-8") as f:
85
+ canon = json.load(f)
86
+
87
+ canon_words = canonical_words(canon)
88
+ ph = ArabicPhonemizer()
89
+
90
+ pcm16, sr = read_wav_mono16k(AUDIO_PATH)
91
+ segs = vad_segments(pcm16, sr, FRAME_MS, VAD_MODE)
92
+
93
+ # For each audio segment, phonemize its "best guess" by just extracting audio and using fallback:
94
+ # We don't have ASR here; so we approximate by mapping segments to canonical words in order
95
+ # using a greedy approach: advance through canon words and match by duration / count.
96
+ #
97
+ # MVP: we map N segments to first N canon words (still better than madd-only mapping)
98
+ mapped = []
99
+ n = min(len(segs), len(canon_words))
100
+ for i in range(n):
101
+ t0, t1 = segs[i]
102
+ cw = canon_words[i]
103
+ mapped.append({
104
+ "segment_index": i+1,
105
+ "timestamp": {"start": t0, "end": t1},
106
+ "mapped_canonical": cw
107
+ })
108
+
109
+ out = {
110
+ "audio_path": AUDIO_PATH,
111
+ "vad": {"mode": VAD_MODE, "frame_ms": FRAME_MS},
112
+ "segments": segs,
113
+ "mapped": mapped,
114
+ "note": "This is MVP word-like segmentation. Next step will replace sequential mapping with acoustic+phoneme alignment."
115
+ }
116
+
117
+ with open(OUT_PATH, "w", encoding="utf-8") as f:
118
+ json.dump(out, f, ensure_ascii=False, indent=2)
119
+
120
+ print("OK ✅ wrote", OUT_PATH)
121
+ print("VAD segments:", len(segs))
122
+ if mapped:
123
+ print("First mapping:", mapped[0])
124
+
125
+ if __name__ == "__main__":
126
+ main()
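A side note on the VAD settings above: webrtcvad only accepts 10, 20 or 30 ms frames of 16-bit mono PCM, so the frame size works out as in this quick sanity check (not part of the pipeline):

# 30 ms at 16 kHz -> 480 int16 samples -> 960 bytes per frame passed to vad.is_speech()
sr = 16000
frame_ms = 30
samples_per_frame = int(sr * frame_ms / 1000)
print(samples_per_frame, samples_per_frame * 2)   # 480 960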
step12_align_segments_wavlm.py ADDED
@@ -0,0 +1,123 @@
1
+ import json
2
+ import numpy as np
3
+ import librosa
4
+ import torch
5
+ from dtw import dtw
6
+ from transformers import AutoFeatureExtractor, AutoModel
7
+ from arabic_phonemizer import ArabicPhonemizer
8
+
9
+ AUDIO_PATH = "sample_trim.wav"
10
+ CANON_PATH = "data/fatiha_canonical_fallback.json"
11
+ OUT_PATH = "output/alignment_wavlm.json"
12
+
13
+ MODEL_ID = "microsoft/wavlm-base"
14
+
15
+ def wavlm_embeddings(audio_16k: np.ndarray, sr: int):
16
+ fe = AutoFeatureExtractor.from_pretrained(MODEL_ID)
17
+ model = AutoModel.from_pretrained(MODEL_ID)
18
+ model.eval()
19
+
20
+ inputs = fe(audio_16k, sampling_rate=sr, return_tensors="pt")
21
+ with torch.no_grad():
22
+ out = model(**inputs)
23
+ # (frames, hidden)
24
+ emb = out.last_hidden_state[0].cpu().numpy()
25
+ return emb
26
+
27
+ def mean_pool(emb: np.ndarray):
28
+ return emb.mean(axis=0)
29
+
30
+ def load_audio_segment(path, start_s, end_s, sr=16000):
31
+ audio, _ = librosa.load(path, sr=sr, mono=True, offset=float(start_s), duration=float(end_s - start_s))
32
+ return audio
33
+
34
+ def canonical_word_list(canon):
35
+ words = []
36
+ for ay in canon["ayahs"]:
37
+ for w in ay["word_info"]:
38
+ words.append({"ayah": ay["ayah"], "word": w["word"], "base": w["base"]})
39
+ return words
40
+
41
+ def vad_segments_from_step8(feedback_path="output/feedback_madd.json"):
42
+ # Use the long segments already detected in your feedback JSON
43
+ d = json.load(open(feedback_path, encoding="utf-8"))
44
+ segs = [(s["start"], s["end"]) for s in d["segments_detected"]]
45
+ return segs
46
+
47
+ def cosine(a, b):
48
+ a = a / (np.linalg.norm(a) + 1e-9)
49
+ b = b / (np.linalg.norm(b) + 1e-9)
50
+ return float(np.dot(a, b))
51
+
52
+ def main():
53
+ canon = json.load(open(CANON_PATH, encoding="utf-8"))
54
+ canon_words = canonical_word_list(canon)
55
+
56
+ # We will build "prototype embeddings" for each canonical word by phonemizing text
57
+ # For MVP we don't synthesize audio; instead we just keep word order and do local matching.
58
+ # Real version uses forced alignment / phoneme decoding.
59
+ #
60
+ # Here we do a practical improvement: map each detected long segment to a nearby word index
61
+ # based on its relative time position in the recitation.
62
+ segs = vad_segments_from_step8()
63
+
64
+ # Compute full-audio embedding frames once
65
+ full_audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
66
+ full_emb = wavlm_embeddings(full_audio, sr)
67
+
68
+ # Map time->frame index approximately
69
+ # WavLM frame rate is roughly 50 fps-ish after feature extraction; we estimate using emb length
70
+ total_sec = len(full_audio) / sr
71
+ frames = full_emb.shape[0]
72
+ fps = frames / total_sec
73
+
74
+ results = []
75
+ for i, (s, e) in enumerate(segs, 1):
76
+ # Take embedding slice for this time window
77
+ f0 = int(max(0, np.floor(s * fps)))
78
+ f1 = int(min(frames, np.ceil(e * fps)))
79
+ if f1 <= f0 + 1:
80
+ continue
81
+ seg_vec = mean_pool(full_emb[f0:f1])
82
+
83
+ # Estimate position in surah by time ratio, then search around that word index
84
+ t_mid = (s + e) / 2.0
85
+ ratio = t_mid / total_sec
86
+ est_idx = int(ratio * (len(canon_words) - 1))
87
+
88
+ # Search a window around estimated index
89
+ W = 6
90
+ cand_range = range(max(0, est_idx - W), min(len(canon_words), est_idx + W + 1))
91
+
92
+ # Score candidates: we don't have per-word audio prototypes, and comparing this
93
+ # segment's vector against nearby segment vectors would not help either.
94
+ # MVP choice: pick the estimated index and also report the search window.
95
+ # This step mainly builds the structure; the next step adds real phoneme/CTC alignment.
96
+ chosen = est_idx
97
+
98
+ results.append({
99
+ "segment_index": i,
100
+ "timestamp": {"start": round(float(s), 3), "end": round(float(e), 3)},
101
+ "estimated_word_index": est_idx,
102
+ "candidate_word_indices": list(cand_range),
103
+ "mapped_word": canon_words[chosen],
104
+ "note": "MVP time-based alignment using WavLM frame mapping. Next step replaces this with phoneme/CTC alignment."
105
+ })
106
+
107
+ out = {
108
+ "audio_path": AUDIO_PATH,
109
+ "total_sec": round(float(total_sec), 3),
110
+ "wavlm": {"model_id": MODEL_ID, "frames": int(frames), "fps_est": round(float(fps), 2)},
111
+ "num_canonical_words": len(canon_words),
112
+ "segments_used": len(results),
113
+ "alignment": results
114
+ }
115
+
116
+ json.dump(out, open(OUT_PATH, "w", encoding="utf-8"), ensure_ascii=False, indent=2)
117
+ print("OK ✅ wrote", OUT_PATH)
118
+ print("Segments aligned:", len(results))
119
+ if results:
120
+ print("Sample:", results[0])
121
+
122
+ if __name__ == "__main__":
123
+ main()
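The fps estimate above can be cross-checked against WavLM's convolutional front-end, which hops roughly 320 samples (about 20 ms at 16 kHz), i.e. about 50 frames per second. A rough sketch; the 320-sample hop is an assumption about the feature extractor, not read from the model config:

total_sec = 10.0
sr = 16000
hop = 320                              # assumed front-end stride (~20 ms)
frames = int(total_sec * sr / hop)
print(frames, frames / total_sec)      # 500 50.0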
step13_arabic_ctc_transcribe.py ADDED
@@ -0,0 +1,40 @@
1
+ import os
2
+ import torch
3
+ import librosa
4
+ from transformers import AutoProcessor, AutoModelForCTC
5
+
6
+ # Arabic wav2vec2 CTC model (CPU friendly but heavy)
7
+ MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
8
+
9
+ AUDIO_PATH = "sample_trim.wav"
10
+ OUT_TXT = os.path.join("output", "asr_raw.txt")
11
+
12
+ def main():
13
+ os.makedirs("output", exist_ok=True)
14
+
15
+ print("Loading:", MODEL_ID)
16
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
17
+ model = AutoModelForCTC.from_pretrained(MODEL_ID)
18
+ model.eval()
19
+
20
+ audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
21
+ print("Audio sec:", round(len(audio)/sr, 2))
22
+
23
+ inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
24
+
25
+ with torch.no_grad():
26
+ logits = model(**inputs).logits
27
+
28
+ pred_ids = torch.argmax(logits, dim=-1)
29
+ text = processor.batch_decode(pred_ids)[0].strip()
30
+
31
+ # Save to file for downstream steps
32
+ with open(OUT_TXT, "w", encoding="utf-8") as f:
33
+ f.write(text + "\n")
34
+
35
+ print("\n--- RAW TRANSCRIPTION ---")
36
+ print(text)
37
+ print(f"\nOK ✅ wrote {OUT_TXT}")
38
+
39
+ if __name__ == "__main__":
40
+ main()
step14_align_text_to_canonical.py ADDED
@@ -0,0 +1,113 @@
1
+ import json
2
+ import re
3
+ from difflib import SequenceMatcher
4
+
5
+ CANON_PATH = "data/fatiha_canonical.json"
6
+ ASR_TEXT_PATH = "output/asr_raw.txt"
7
+ OUT_PATH = "output/text_alignment.json"
8
+
9
+ # --- Normalization helpers ---
10
+ ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]") # harakat etc.
11
+ TATWEEL = "\u0640"
12
+
13
+ def normalize_ar(s: str) -> str:
14
+ s = s.replace(TATWEEL, "")
15
+ s = re.sub(ARABIC_DIACRITICS, "", s)
16
+ # normalize common variants
17
+ s = s.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
18
+ s = s.replace("ى", "ي")
19
+ s = s.replace("ة", "ه")
20
+ s = re.sub(r"\s+", " ", s).strip()
21
+ return s
22
+
23
+ def tokenize(s: str):
24
+ # keep Arabic letters and spaces only
25
+ s = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
26
+ s = re.sub(r"\s+", " ", s).strip()
27
+ return s.split(" ") if s else []
28
+
29
+ def sim(a, b) -> float:
30
+ return SequenceMatcher(None, a, b).ratio()
31
+
32
+ def main():
33
+ canon = json.load(open(CANON_PATH, encoding="utf-8"))
34
+
35
+ # Load ASR raw text (we will create it in 14.2)
36
+ raw = open(ASR_TEXT_PATH, encoding="utf-8").read().strip()
37
+ raw_n = normalize_ar(raw)
38
+
39
+ asr_tokens = tokenize(raw_n)
40
+
41
+ # Canonical tokens (word-level) from JSON
42
+ canon_words = []
43
+ for ay in canon["ayahs"]:
44
+ for w in ay["words"]:
45
+ canon_words.append({
46
+ "ayah": ay["ayah"],
47
+ "word": w,
48
+ "norm": normalize_ar(w)
49
+ })
50
+
51
+ # Greedy alignment: for each canonical word, find best match in a moving window of ASR tokens
52
+ aligned = []
53
+ j = 0
54
+ WINDOW = 6
55
+
56
+ for i, cw in enumerate(canon_words):
57
+ best = None
58
+ best_j = None
59
+ for k in range(j, min(len(asr_tokens), j + WINDOW)):
60
+ score = sim(cw["norm"], asr_tokens[k])
61
+ if (best is None) or (score > best):
62
+ best = score
63
+ best_j = k
64
+
65
+ if best is None:
66
+ aligned.append({
67
+ "canon": cw,
68
+ "asr_token": None,
69
+ "score": 0.0,
70
+ "match": False
71
+ })
72
+ continue
73
+
74
+ token = asr_tokens[best_j]
75
+ match = best >= 0.75 # MVP threshold
76
+
77
+ aligned.append({
78
+ "canon": cw,
79
+ "asr_token": token,
80
+ "score": round(float(best), 3),
81
+ "match": bool(match)
82
+ })
83
+
84
+ # advance pointer to keep order
85
+ j = best_j + 1
86
+
87
+ # Summaries
88
+ total = len(aligned)
89
+ matches = sum(1 for a in aligned if a["match"])
90
+ mismatches = total - matches
91
+
92
+ out = {
93
+ "asr_raw": raw,
94
+ "asr_normalized": raw_n,
95
+ "stats": {
96
+ "canonical_words": total,
97
+ "matches": matches,
98
+ "mismatches": mismatches,
99
+ "match_rate": round(matches / total, 3) if total else 0.0
100
+ },
101
+ "alignment": aligned
102
+ }
103
+
104
+ json.dump(out, open(OUT_PATH, "w", encoding="utf-8"), ensure_ascii=False, indent=2)
105
+
106
+ print("OK ✅ wrote", OUT_PATH)
107
+ print("Match rate:", out["stats"]["match_rate"])
108
+ print("First 5 alignments:")
109
+ for a in aligned[:5]:
110
+ print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])
111
+
112
+ if __name__ == "__main__":
113
+ main()
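A tiny illustration of the matching criterion used above: after normalization an exact token scores 1.0, while a token with one extra letter still clears the 0.75 MVP threshold (hypothetical tokens, not real ASR output):

from difflib import SequenceMatcher

print(SequenceMatcher(None, "الحمد", "الحمد").ratio())              # 1.0
print(round(SequenceMatcher(None, "الحمد", "الحمدو").ratio(), 3))   # 0.909 -> still a match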
step15_global_word_alignment.py ADDED
@@ -0,0 +1,140 @@
1
+ import json
2
+ import re
3
+ from difflib import SequenceMatcher
4
+
5
+ CANON_PATH = "data/fatiha_canonical.json"
6
+ ASR_TEXT_PATH = "output/asr_raw.txt"
7
+ OUT_PATH = "output/text_alignment_global.json"
8
+
9
+ ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
10
+ TATWEEL = "\u0640"
11
+
12
+ def normalize_ar(s: str) -> str:
13
+ s = s.replace(TATWEEL, "")
14
+ s = re.sub(ARABIC_DIACRITICS, "", s)
15
+ s = s.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
16
+ s = s.replace("ى", "ي")
17
+ s = s.replace("ة", "ه")
18
+ s = re.sub(r"\s+", " ", s).strip()
19
+ return s
20
+
21
+ def tokenize(s: str):
22
+ s = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
23
+ s = re.sub(r"\s+", " ", s).strip()
24
+ return s.split(" ") if s else []
25
+
26
+ def sim(a, b) -> float:
27
+ return SequenceMatcher(None, a, b).ratio()
28
+
29
+ def main():
30
+ canon = json.load(open(CANON_PATH, encoding="utf-8"))
31
+ raw = open(ASR_TEXT_PATH, encoding="utf-8").read().strip()
32
+ raw_n = normalize_ar(raw)
33
+
34
+ asr_tokens = tokenize(raw_n)
35
+
36
+ canon_words = []
37
+ for ay in canon["ayahs"]:
38
+ for w in ay["words"]:
39
+ canon_words.append({
40
+ "ayah": ay["ayah"],
41
+ "word": w,
42
+ "norm": normalize_ar(w)
43
+ })
44
+
45
+ # --- Global alignment DP ---
46
+ n = len(canon_words)
47
+ m = len(asr_tokens)
48
+
49
+ # scoring
50
+ GAP = -0.45 # penalty for skipping a token/word
51
+ def match_score(i, j):
52
+ # reward similarity, centered around 0.75
53
+ s = sim(canon_words[i]["norm"], asr_tokens[j])
54
+ return (s - 0.75) * 2.0 # >0 is good match
55
+
56
+ # DP matrices
57
+ dp = [[0.0]*(m+1) for _ in range(n+1)]
58
+ bt = [[None]*(m+1) for _ in range(n+1)] # backtrack: 'D' diag, 'U' up, 'L' left
59
+
60
+ for i in range(1, n+1):
61
+ dp[i][0] = dp[i-1][0] + GAP
62
+ bt[i][0] = 'U'
63
+ for j in range(1, m+1):
64
+ dp[0][j] = dp[0][j-1] + GAP
65
+ bt[0][j] = 'L'
66
+
67
+ for i in range(1, n+1):
68
+ for j in range(1, m+1):
69
+ diag = dp[i-1][j-1] + match_score(i-1, j-1)
70
+ up = dp[i-1][j] + GAP
71
+ left = dp[i][j-1] + GAP
72
+ best = max(diag, up, left)
73
+ dp[i][j] = best
74
+ bt[i][j] = 'D' if best == diag else ('U' if best == up else 'L')
75
+
76
+ # Backtrack to alignment pairs
77
+ aligned = []
78
+ i, j = n, m
79
+ while i > 0 or j > 0:
80
+ move = bt[i][j]
81
+ if move == 'D':
82
+ cw = canon_words[i-1]
83
+ tok = asr_tokens[j-1]
84
+ s = sim(cw["norm"], tok)
85
+ aligned.append({
86
+ "canon": cw,
87
+ "asr_token": tok,
88
+ "score": round(float(s), 3),
89
+ "match": bool(s >= 0.72)
90
+ })
91
+ i -= 1
92
+ j -= 1
93
+ elif move == 'U':
94
+ cw = canon_words[i-1]
95
+ aligned.append({
96
+ "canon": cw,
97
+ "asr_token": None,
98
+ "score": 0.0,
99
+ "match": False
100
+ })
101
+ i -= 1
102
+ else: # 'L'
103
+ # ASR token skipped
104
+ j -= 1
105
+
106
+ aligned.reverse()
107
+
108
+ total = len(canon_words)
109
+ matches = sum(1 for a in aligned if a["canon"] and a["match"])
110
+ mismatches = total - matches
111
+
112
+ out = {
113
+ "asr_raw": raw,
114
+ "asr_normalized": raw_n,
115
+ "stats": {
116
+ "canonical_words": total,
117
+ "asr_tokens": len(asr_tokens),
118
+ "matches": matches,
119
+ "mismatches": mismatches,
120
+ "match_rate": round(matches / total, 3) if total else 0.0
121
+ },
122
+ "alignment": aligned
123
+ }
124
+
125
+ json.dump(out, open(OUT_PATH, "w", encoding="utf-8"), ensure_ascii=False, indent=2)
126
+
127
+ print("OK ✅ wrote", OUT_PATH)
128
+ print("Match rate:", out["stats"]["match_rate"])
129
+ print("First 8 alignments:")
130
+ shown = 0
131
+ for a in aligned:
132
+ if a["canon"] is None:
133
+ continue
134
+ print("-", a["canon"]["word"], "=>", a["asr_token"], "score", a["score"], "match", a["match"])
135
+ shown += 1
136
+ if shown >= 8:
137
+ break
138
+
139
+ if __name__ == "__main__":
140
+ main()
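A self-contained toy run of the same Needleman-Wunsch-style scoring, on short made-up sequences, showing how the gap penalty and the similarity-minus-0.75 reward interact (hypothetical tokens):

from difflib import SequenceMatcher

canon = ["بسم", "الله", "الرحمن", "الرحيم"]
asr   = ["بسم", "اله", "الرحيم"]   # one noisy token, one word dropped

GAP = -0.45
def score(a, b):
    return (SequenceMatcher(None, a, b).ratio() - 0.75) * 2.0

n, m = len(canon), len(asr)
dp = [[0.0] * (m + 1) for _ in range(n + 1)]
for i in range(1, n + 1):
    dp[i][0] = dp[i - 1][0] + GAP
for j in range(1, m + 1):
    dp[0][j] = dp[0][j - 1] + GAP
for i in range(1, n + 1):
    for j in range(1, m + 1):
        dp[i][j] = max(dp[i - 1][j - 1] + score(canon[i - 1], asr[j - 1]),
                       dp[i - 1][j] + GAP,
                       dp[i][j - 1] + GAP)
print(round(dp[n][m], 3))   # global score; backtracking (as in main) recovers the aligned pairs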
step16_ctc_word_timestamps.py ADDED
@@ -0,0 +1,165 @@
1
+ import json
2
+ import re
3
+ import numpy as np
4
+ import torch
5
+ import librosa
6
+ from transformers import AutoProcessor, AutoModelForCTC
7
+
8
+ AUDIO_PATH = "sample_trim.wav"
9
+ ALIGN_PATH = "output/text_alignment_global.json"
10
+ OUT_PATH = "output/word_timestamps.json"
11
+
12
+ MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
13
+
14
+ ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
15
+ TATWEEL = "\u0640"
16
+
17
+ def normalize_ar(s: str) -> str:
18
+ s = s.replace(TATWEEL, "")
19
+ s = re.sub(ARABIC_DIACRITICS, "", s)
20
+ s = s.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
21
+ s = s.replace("ى", "ي")
22
+ s = s.replace("ة", "ه")
23
+ s = re.sub(r"\s+", " ", s).strip()
24
+ return s
25
+
26
+ def main():
27
+ # Load alignment
28
+ align = json.load(open(ALIGN_PATH, encoding="utf-8"))
29
+ alignment = [a for a in align["alignment"] if a.get("canon")]
30
+
31
+ # Load audio
32
+ audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
33
+ total_sec = len(audio) / sr
34
+
35
+ # Load CTC model
36
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
37
+ model = AutoModelForCTC.from_pretrained(MODEL_ID)
38
+ model.eval()
39
+
40
+ inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
41
+
42
+ with torch.no_grad():
43
+ logits = model(**inputs).logits[0] # (T, V)
44
+
45
+ pred_ids = torch.argmax(logits, dim=-1).cpu().numpy().tolist()
46
+
47
+ # Convert token IDs -> tokens
48
+ vocab = processor.tokenizer.get_vocab()
49
+ # invert vocab: id -> token
50
+ inv_vocab = {i: t for t, i in vocab.items()}
51
+
52
+ blank_id = processor.tokenizer.pad_token_id
53
+ if blank_id is None:
54
+ # fallback: common wav2vec2 blank is vocab["<pad>"]
55
+ blank_id = vocab.get("<pad>", None)
56
+
57
+ tokens = [inv_vocab[i] for i in pred_ids]
58
+
59
+ # Collapse repeats, remove blanks, keep time indices
60
+ collapsed = []
61
+ prev = None
62
+ for t_idx, tok_id in enumerate(pred_ids):
63
+ if tok_id == prev:
64
+ continue
65
+ prev = tok_id
66
+ if blank_id is not None and tok_id == blank_id:
67
+ continue
68
+ tok = inv_vocab.get(tok_id, "")
69
+ if tok.strip() == "":
70
+ continue
71
+ collapsed.append((t_idx, tok))
72
+
73
+ # Map CTC time index -> seconds
74
+ # time steps correspond to model frames spanning full audio
75
+ T = logits.shape[0]
76
+ def idx_to_time(i):
77
+ return (i / T) * total_sec
78
+
79
+ # Prepare normalized ASR tokens from alignment file (we use them to locate spans)
80
+ asr_tokens = []
81
+ for a in alignment:
82
+ if a["asr_token"] is None:
83
+ asr_tokens.append(None)
84
+ else:
85
+ asr_tokens.append(normalize_ar(a["asr_token"]))
86
+
87
+ # We will approximate word timestamps by scanning collapsed tokens and
88
+ # finding the earliest and latest CTC indices where the letters of the ASR token appear in order.
89
+ #
90
+ # This is a heuristic but works reasonably for MVP.
91
+ def find_span_for_word(word_norm, start_search_idx):
92
+ if not word_norm:
93
+ return None, start_search_idx
94
+ # remove spaces
95
+ target = word_norm.replace(" ", "")
96
+ if target == "":
97
+ return None, start_search_idx
98
+
99
+ i = start_search_idx
100
+ start_idx = None
101
+ last_idx = None
102
+
103
+ for ch in target:
104
+ found = False
105
+ while i < len(collapsed):
106
+ t_idx, tok = collapsed[i]
107
+ # tokens may be characters or pieces; match if character appears
108
+ if ch in tok:
109
+ if start_idx is None:
110
+ start_idx = t_idx
111
+ last_idx = t_idx
112
+ i += 1
113
+ found = True
114
+ break
115
+ i += 1
116
+ if not found:
117
+ return None, start_search_idx
118
+
119
+ return (start_idx, last_idx), i
120
+
121
+ out_rows = []
122
+ search_ptr = 0
123
+ for a in alignment:
124
+ cw = a["canon"]
125
+ tok = a["asr_token"]
126
+ tok_norm = normalize_ar(tok) if tok else None
127
+
128
+ span, search_ptr2 = find_span_for_word(tok_norm, search_ptr) if tok_norm else (None, search_ptr)
129
+ if span is None:
130
+ start_t = None
131
+ end_t = None
132
+ else:
133
+ s_idx, e_idx = span
134
+ start_t = round(float(idx_to_time(s_idx)), 3)
135
+ end_t = round(float(idx_to_time(e_idx)), 3)
136
+ # advance pointer to keep order
137
+ search_ptr = search_ptr2
138
+
139
+ out_rows.append({
140
+ "ayah": cw["ayah"],
141
+ "word": cw["word"],
142
+ "asr_token": tok,
143
+ "score": a["score"],
144
+ "match": a["match"],
145
+ "timestamp": None if start_t is None else {"start": start_t, "end": end_t}
146
+ })
147
+
148
+ out = {
149
+ "audio_path": AUDIO_PATH,
150
+ "model": MODEL_ID,
151
+ "note": "CTC-based approximate word timestamps; upgrade later with forced alignment for higher accuracy.",
152
+ "stats": {
153
+ "words": len(out_rows),
154
+ "timestamped": sum(1 for r in out_rows if r["timestamp"] is not None)
155
+ },
156
+ "words": out_rows
157
+ }
158
+
159
+ json.dump(out, open(OUT_PATH, "w", encoding="utf-8"), ensure_ascii=False, indent=2)
160
+ print("OK ✅ wrote", OUT_PATH)
161
+ print("Timestamped:", out["stats"]["timestamped"], "/", out["stats"]["words"])
162
+ print("Sample:", out_rows[0])
163
+
164
+ if __name__ == "__main__":
165
+ main()
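The collapse loop above follows the usual greedy CTC convention (merge repeated predictions, drop blanks) while remembering a frame index per surviving token; a toy run with made-up ids (blank id 0):

pred_ids = [0, 5, 5, 0, 7, 7, 7, 0, 5]
blank_id = 0
collapsed, prev = [], None
for t_idx, tok_id in enumerate(pred_ids):
    if tok_id == prev:          # merge repeated predictions
        continue
    prev = tok_id
    if tok_id == blank_id:      # drop CTC blanks
        continue
    collapsed.append((t_idx, tok_id))
print(collapsed)                # [(1, 5), (4, 7), (8, 5)]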
step16b_token_interpolation_timestamps.py ADDED
@@ -0,0 +1,108 @@
1
+ import json
2
+ import re
3
+ import librosa
4
+
5
+ AUDIO_PATH = "sample_trim.wav"
6
+ ALIGN_GLOBAL_PATH = "output/text_alignment_global.json"
7
+ OUT_PATH = "output/word_timestamps_v2.json"
8
+
9
+ ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
10
+ TATWEEL = "\u0640"
11
+
12
+ def normalize_ar(s: str) -> str:
13
+ s = s.replace(TATWEEL, "")
14
+ s = re.sub(ARABIC_DIACRITICS, "", s)
15
+ s = s.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
16
+ s = s.replace("ى", "ي")
17
+ s = s.replace("ة", "ه")
18
+ s = re.sub(r"\s+", " ", s).strip()
19
+ return s
20
+
21
+ def tokenize_ar_words(s: str):
22
+ s = re.sub(r"[^\u0600-\u06FF\s]", " ", s)
23
+ s = re.sub(r"\s+", " ", s).strip()
24
+ return s.split(" ") if s else []
25
+
26
+ def main():
27
+ # Load audio duration
28
+ audio, sr = librosa.load(AUDIO_PATH, sr=16000, mono=True)
29
+ total_sec = len(audio) / sr
30
+
31
+ # Load global alignment (has asr_raw + alignment pairs)
32
+ g = json.load(open(ALIGN_GLOBAL_PATH, encoding="utf-8"))
33
+ asr_raw = g["asr_raw"]
34
+ asr_norm = normalize_ar(asr_raw)
35
+ asr_tokens = tokenize_ar_words(asr_norm)
36
+
37
+ # Build token timeline: divide total audio time across ASR tokens evenly
38
+ # (MVP approximation; later replace with real forced alignment)
39
+ N = max(1, len(asr_tokens))
40
+ token_times = []
41
+ for i in range(N):
42
+ start = (i / N) * total_sec
43
+ end = ((i + 1) / N) * total_sec
44
+ token_times.append((round(start, 3), round(end, 3)))
45
+
46
+ # Now assign each canonical word the timestamp of its matched ASR token (if any),
47
+ # otherwise interpolate from its index in canonical sequence.
48
+ alignment = [a for a in g["alignment"] if a.get("canon")]
49
+
50
+ out_words = []
51
+ last_token_idx = 0
52
+ for idx, a in enumerate(alignment):
53
+ cw = a["canon"]
54
+ tok = a["asr_token"]
55
+
56
+ if tok is not None:
57
+ tok_norm = normalize_ar(tok)
58
+ # find token index in asr_tokens near expected position
59
+ # we use a forward search to keep monotonic mapping
60
+ # MVP: choose first exact match, else fallback to proportional index
61
+ # monotonic search: only search forward from last token index
62
+ found = None
63
+ for ti in range(last_token_idx, len(asr_tokens)):
64
+ if asr_tokens[ti] == tok_norm:
65
+ found = ti
66
+ break
67
+
68
+ if found is None:
69
+ # fallback: proportional but also monotonic
70
+ found = int((idx / max(1, len(alignment))) * (N - 1))
71
+ found = max(found, last_token_idx)
72
+
73
+ t0, t1 = token_times[found]
74
+ last_token_idx = found + 1
75
+ else:
76
+ # no matched token: proportional fallback
77
+ found = int((idx / max(1, len(alignment))) * (N - 1))
78
+ t0, t1 = token_times[found]
79
+
80
+ out_words.append({
81
+ "index": idx + 1,
82
+ "ayah": cw["ayah"],
83
+ "word": cw["word"],
84
+ "asr_token": tok,
85
+ "score": a["score"],
86
+ "match": a["match"],
87
+ "timestamp": {"start": t0, "end": t1}
88
+ })
89
+
90
+ out = {
91
+ "audio_path": AUDIO_PATH,
92
+ "method": "token-time interpolation (MVP)",
93
+ "stats": {
94
+ "canonical_words": len(out_words),
95
+ "asr_tokens": len(asr_tokens),
96
+ "timestamped": len(out_words)
97
+ },
98
+ "words": out_words
99
+ }
100
+
101
+ json.dump(out, open(OUT_PATH, "w", encoding="utf-8"), ensure_ascii=False, indent=2)
102
+ print("OK ✅ wrote", OUT_PATH)
103
+ print("Words timestamped:", len(out_words), "/", len(out_words))
104
+ print("First:", out_words[0])
105
+ print("Last:", out_words[-1])
106
+
107
+ if __name__ == "__main__":
108
+ main()
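A worked example of the even token-time split used above: ten ASR tokens over a 12.5 s clip each get a 1.25 s slot, and a canonical word inherits the slot of whichever token it was matched to (numbers are illustrative):

total_sec = 12.5
asr_tokens = ["tok"] * 10                  # stand-in tokens
N = max(1, len(asr_tokens))
token_times = [((i / N) * total_sec, ((i + 1) / N) * total_sec) for i in range(N)]
print(token_times[0])    # (0.0, 1.25)
print(token_times[-1])   # (11.25, 12.5)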
step17_make_api_response.py ADDED
@@ -0,0 +1,88 @@
1
+ import json
2
+
3
+ WORDS_PATH = "output/word_timestamps_v2.json"
4
+ MADD_PATH = "output/feedback_madd.json"
5
+ CANON_FALLBACK_PATH = "data/fatiha_canonical_fallback.json"
6
+ OUT_PATH = "output/api_response.json"
7
+
8
+ def main():
9
+ words_doc = json.load(open(WORDS_PATH, encoding="utf-8"))
10
+ madd_doc = json.load(open(MADD_PATH, encoding="utf-8"))
11
+ canon_fb = json.load(open(CANON_FALLBACK_PATH, encoding="utf-8"))
12
+
13
+ # Build quick lookup: (ayah, word) -> madd_positions
14
+ madd_pos = {}
15
+ for ay in canon_fb["ayahs"]:
16
+ for wi in ay.get("word_info", []):
17
+ madd_pos[(ay["ayah"], wi["word"])] = wi.get("madd_positions_base_index", [])
18
+
19
+ # Word list for UI
20
+ ui_words = []
21
+ mismatches = []
22
+ for w in words_doc["words"]:
23
+ ay = w["ayah"]
24
+ word = w["word"]
25
+ item = {
26
+ "index": w["index"],
27
+ "ayah": ay,
28
+ "word": word,
29
+ "timestamp": w["timestamp"],
30
+ "match": w["match"],
31
+ "score": w["score"],
32
+ "madd_positions_base_index": madd_pos.get((ay, word), [])
33
+ }
34
+ ui_words.append(item)
35
+ if not w["match"]:
36
+ mismatches.append({
37
+ "ayah": ay,
38
+ "word": word,
39
+ "timestamp": w["timestamp"],
40
+ "reason": "text_mismatch",
41
+ "score": w["score"]
42
+ })
43
+
44
+ # Madd results already include timestamps; keep them as "issues"
45
+ madd_issues = []
46
+ for r in madd_doc.get("results", []):
47
+ madd_issues.append({
48
+ "type": "madd",
49
+ "ayah": r["ayah"],
50
+ "word": r["word"],
51
+ "timestamp": r["timestamp"],
52
+ "duration_sec": r["duration_sec"],
53
+ "classification": r["classification"],
54
+ "confidence": r["confidence"],
55
+ "tip": r["tip"]
56
+ })
57
+
58
+ out = {
59
+ "surah": "Al-Fatiha",
60
+ "audio_path": words_doc["audio_path"],
61
+ "pipeline_version": "mvp-v1",
62
+ "summary": {
63
+ "words_total": len(ui_words),
64
+ "text_mismatches": len(mismatches),
65
+ "madd_issues": len(madd_issues)
66
+ },
67
+ "words": ui_words,
68
+ "issues": {
69
+ "text": mismatches,
70
+ "madd": madd_issues
71
+ },
72
+ "notes": [
73
+ "Word timestamps are MVP (token-time interpolation).",
74
+ "Text alignment uses global DP alignment for robustness.",
75
+ "Madd detection uses intensity-based long voiced segments; replace with phoneme-level alignment later."
76
+ ]
77
+ }
78
+
79
+ json.dump(out, open(OUT_PATH, "w", encoding="utf-8"), ensure_ascii=False, indent=2)
80
+ print("OK ✅ wrote", OUT_PATH)
81
+ print("Summary:", out["summary"])
82
+ if out["issues"]["text"]:
83
+ print("Example text mismatch:", out["issues"]["text"][0])
84
+ if out["issues"]["madd"]:
85
+ print("Example madd issue:", out["issues"]["madd"][0])
86
+
87
+ if __name__ == "__main__":
88
+ main()
step5_wavlm_test.py ADDED
@@ -0,0 +1,31 @@
1
+ import torch
2
+ import librosa
3
+ from transformers import AutoFeatureExtractor, AutoModel
4
+
5
+ MODEL_ID = "microsoft/wavlm-base"
6
+
7
+ def load_audio(path: str, target_sr: int = 16000):
8
+ audio, sr = librosa.load(path, sr=target_sr, mono=True)
9
+ return audio, sr
10
+
11
+ def main():
12
+ print("Loading model:", MODEL_ID)
13
+
14
+ feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
15
+ model = AutoModel.from_pretrained(MODEL_ID)
16
+ model.eval()
17
+
18
+ audio, sr = load_audio("sample.wav")
19
+ print("Audio length (sec):", round(len(audio) / sr, 2))
20
+
21
+ inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt")
22
+
23
+ with torch.no_grad():
24
+ out = model(**inputs)
25
+
26
+ x = out.last_hidden_state # [batch, frames, hidden]
27
+ print("OK ✅ WavLM ran on CPU")
28
+ print("Embedding tensor shape:", tuple(x.shape))
29
+
30
+ if __name__ == "__main__":
31
+ main()
step7_fallback_phonemes_and_madd.py ADDED
@@ -0,0 +1,58 @@
1
+ import json
2
+ import re
3
+ from arabic_phonemizer import ArabicPhonemizer
4
+
5
+ # --- Helpers ---
6
+ # Very simple Madd detection from script (MVP-level):
7
+ # We mark likely long vowels caused by: ا, و, ي, ى, and madd sign "ٓ"
8
+ MADD_CHARS = set(["ا", "و", "ي", "ى", "ٓ"])
9
+
10
+ ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]") # tanwin, harakat, etc.
11
+
12
+ def strip_diacritics(s: str) -> str:
13
+ return re.sub(ARABIC_DIACRITICS, "", s)
14
+
15
+ def detect_madd_positions(word: str):
16
+ """
17
+ Returns a list of indices in the *diacritics-stripped* word where Madd-ish characters appear.
18
+ MVP heuristic; later replace with Quranic-Phonemizer (Tajweed-aware).
19
+ """
20
+ base = strip_diacritics(word)
21
+ return [i for i, ch in enumerate(base) if ch in MADD_CHARS]
22
+
23
+ def main():
24
+ # Instantiate phonemizer once
25
+ ph = ArabicPhonemizer()
26
+
27
+ path_in = "data/fatiha_canonical.json"
28
+ with open(path_in, "r", encoding="utf-8") as f:
29
+ data = json.load(f)
30
+
31
+ for ay in data["ayahs"]:
32
+ ay_word_info = []
33
+ for w in ay["words"]:
34
+ base = strip_diacritics(w)
35
+
36
+ # ArabicPhonemizer API: use .phonemize(text)
37
+ # If your version differs, we’ll adapt after you run it.
38
+ phonemes = ph.phonemize(w)
39
+
40
+ ay_word_info.append({
41
+ "word": w,
42
+ "base": base,
43
+ "phonemes_fallback": phonemes,
44
+ "madd_positions_base_index": detect_madd_positions(w)
45
+ })
46
+ ay["word_info"] = ay_word_info
47
+
48
+ path_out = "data/fatiha_canonical_fallback.json"
49
+ with open(path_out, "w", encoding="utf-8") as f:
50
+ json.dump(data, f, ensure_ascii=False, indent=2)
51
+
52
+ print("OK ✅ wrote", path_out)
53
+ print("Sample ayah 1 word_info:")
54
+ for item in data["ayahs"][0]["word_info"]:
55
+ print(" -", item["word"], "| base:", item["base"], "| madd idx:", item["madd_positions_base_index"], "| ph:", item["phonemes_fallback"])
56
+
57
+ if __name__ == "__main__":
58
+ main()
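A quick check of the script-level madd heuristic on a single word (indices are into the diacritics-stripped form, matching detect_madd_positions above and the fallback JSON):

import re

ARABIC_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0653\u0654\u0655]")
MADD_CHARS = set(["ا", "و", "ي", "ى", "ٓ"])

word = "الرَّحِيمِ"
base = re.sub(ARABIC_DIACRITICS, "", word)                       # "الرحيم"
positions = [i for i, ch in enumerate(base) if ch in MADD_CHARS]
print(base, positions)                                           # الرحيم [0, 4]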
step8_madd_signal.py ADDED
@@ -0,0 +1,51 @@
1
+ import parselmouth
2
+ import numpy as np
3
+
4
+ AUDIO_PATH = "sample_trim.wav"
5
+
6
+ def main():
7
+ snd = parselmouth.Sound(AUDIO_PATH)
8
+
9
+ duration = snd.get_total_duration()
10
+ print("Audio duration (sec):", round(duration, 2))
11
+
12
+ # Intensity (energy over time)
13
+ intensity = snd.to_intensity(time_step=0.01)
14
+ times = intensity.xs()
15
+ vals = intensity.values[0]
16
+
17
+ # Simple segmentation: find "voiced-ish" regions by intensity threshold
18
+ thr = np.percentile(vals, 60) # adaptive threshold
19
+ voiced = vals > thr
20
+
21
+ # Convert boolean mask into segments [start, end]
22
+ segments = []
23
+ in_seg = False
24
+ start = None
25
+ for t, v in zip(times, voiced):
26
+ if v and not in_seg:
27
+ in_seg = True
28
+ start = t
29
+ elif (not v) and in_seg:
30
+ in_seg = False
31
+ end = t
32
+ if end - start >= 0.06: # ignore tiny blips
33
+ segments.append((start, end))
34
+ if in_seg and start is not None:
35
+ end = times[-1]
36
+ if end - start >= 0.06:
37
+ segments.append((start, end))
38
+
39
+ # Print segments
40
+ print("Candidate voiced segments:", len(segments))
41
+ for i, (s, e) in enumerate(segments[:12], 1):
42
+ print(f"{i:02d}. {s:.2f} -> {e:.2f} (dur {e-s:.2f}s)")
43
+
44
+ # Heuristic "madd-like" durations: anything > 0.18s is suspiciously long vowel
45
+ longish = [(s, e, e - s) for (s, e) in segments if (e - s) >= 0.18]
46
+ print("\nLong segments (possible Madd candidates):", len(longish))
47
+ for i, (s, e, d) in enumerate(longish[:12], 1):
48
+ print(f"{i:02d}. {s:.2f} -> {e:.2f} (dur {d:.2f}s)")
49
+
50
+ if __name__ == "__main__":
51
+ main()
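The adaptive threshold above keeps roughly the loudest 40% of intensity frames; a tiny numeric illustration with made-up dB values:

import numpy as np

vals = np.array([50, 52, 55, 60, 62, 65, 70, 72, 75, 80], dtype=float)
thr = np.percentile(vals, 60)
print(thr, int((vals > thr).sum()))   # 67.0 4 -> four frames counted as voiced-ish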
step9_madd_feedback_json.py ADDED
@@ -0,0 +1,140 @@
1
+ import json
2
+ import numpy as np
3
+ import parselmouth
4
+
5
+ AUDIO_PATH = "sample_trim.wav"
6
+ CANON_PATH = "data/fatiha_canonical_fallback.json"
7
+ OUT_PATH = "output/feedback_madd.json"
8
+
9
+ # --- Heuristic thresholds (MVP) ---
10
+ # Quranic madd lengths depend on rule; for MVP we just classify by duration.
11
+ TOO_SHORT_SEC = 0.15
12
+ OK_MAX_SEC = 0.35
13
+ TOO_LONG_SEC = 0.35  # durations above OK_MAX_SEC are classified as too_long
14
+
15
+ def extract_long_voiced_segments(sound: parselmouth.Sound):
16
+ intensity = sound.to_intensity(time_step=0.01)
17
+ times = intensity.xs()
18
+ vals = intensity.values[0]
19
+
20
+ thr = np.percentile(vals, 60)
21
+ voiced = vals > thr
22
+
23
+ segments = []
24
+ in_seg = False
25
+ start = None
26
+
27
+ for t, v in zip(times, voiced):
28
+ if v and not in_seg:
29
+ in_seg = True
30
+ start = float(t)
31
+ elif (not v) and in_seg:
32
+ in_seg = False
33
+ end = float(t)
34
+ if end - start >= 0.06:
35
+ segments.append((start, end))
36
+ if in_seg and start is not None:
37
+ end = float(times[-1])
38
+ if end - start >= 0.06:
39
+ segments.append((start, end))
40
+
41
+ # Return only the longer ones as Madd candidates
42
+ longish = [(s, e, e - s) for (s, e) in segments if (e - s) >= 0.18]
43
+ return longish
44
+
45
+ def madd_words_in_order(canon):
46
+ """
47
+ Returns list of dicts in recitation order where madd_positions exists.
48
+ """
49
+ items = []
50
+ for ay in canon["ayahs"]:
51
+ for w in ay["word_info"]:
52
+ if w.get("madd_positions_base_index"):
53
+ items.append({
54
+ "ayah": ay["ayah"],
55
+ "word": w["word"],
56
+ "base": w["base"],
57
+ "madd_positions_base_index": w["madd_positions_base_index"],
58
+ "phonemes_fallback": w.get("phonemes_fallback", "")
59
+ })
60
+ return items
61
+
62
+ def classify_duration(d):
63
+ if d < TOO_SHORT_SEC:
64
+ return "too_short"
65
+ if d <= OK_MAX_SEC:
66
+ return "ok"
67
+ return "too_long"
68
+
69
+ def confidence_from_duration(d):
70
+ # crude confidence: farther from ok band → higher confidence
71
+ if d < TOO_SHORT_SEC:
72
+ return min(0.95, 0.60 + (TOO_SHORT_SEC - d) * 2.0)
73
+ if d <= OK_MAX_SEC:
74
+ return 0.55
75
+ return min(0.95, 0.60 + (d - OK_MAX_SEC) * 1.2)
76
+
77
+ def main():
78
+ # Load canonical word info
79
+ with open(CANON_PATH, "r", encoding="utf-8") as f:
80
+ canon = json.load(f)
81
+
82
+ madd_targets = madd_words_in_order(canon)
83
+
84
+ # Load audio
85
+ snd = parselmouth.Sound(AUDIO_PATH)
86
+ longish = extract_long_voiced_segments(snd)
87
+
88
+ feedback = {
89
+ "surah": canon["surah"],
90
+ "riwayah": canon["riwayah"],
91
+ "rule": "Madd (MVP heuristic)",
92
+ "audio_path": AUDIO_PATH,
93
+ "notes": [
94
+ "This MVP uses intensity-based voiced segments and maps long segments to Madd-eligible words in order.",
95
+ "Replace with real forced alignment + Quranic-Phonemizer later for Tajweed-accurate placement."
96
+ ],
97
+ "segments_detected": [{"start": s, "end": e, "dur": d} for (s, e, d) in longish],
98
+ "madd_targets": madd_targets,
99
+ "results": []
100
+ }
101
+
102
+ # Map segments to madd targets sequentially
103
+ n = min(len(longish), len(madd_targets))
104
+ for i in range(n):
105
+ s, e, d = longish[i]
106
+ tgt = madd_targets[i]
107
+ label = classify_duration(d)
108
+ conf = float(round(confidence_from_duration(d), 3))
109
+
110
+ # Simple user-facing tip
111
+ if label == "too_short":
112
+ tip = "Extend the vowel a bit more (madd)."
113
+ elif label == "too_long":
114
+ tip = "Shorten the vowel slightly (avoid over-stretching)."
115
+ else:
116
+ tip = "Madd length looks OK."
117
+
118
+ feedback["results"].append({
119
+ "index": i + 1,
120
+ "ayah": tgt["ayah"],
121
+ "word": tgt["word"],
122
+ "timestamp": {"start": round(s, 3), "end": round(e, 3)},
123
+ "duration_sec": round(d, 3),
124
+ "classification": label,
125
+ "confidence": conf,
126
+ "tip": tip
127
+ })
128
+
129
+ with open(OUT_PATH, "w", encoding="utf-8") as f:
130
+ json.dump(feedback, f, ensure_ascii=False, indent=2)
131
+
132
+ print("OK ✅ wrote", OUT_PATH)
133
+ print("Long segments:", len(longish))
134
+ print("Madd target words:", len(madd_targets))
135
+ print("Mapped results:", len(feedback["results"]))
136
+ if feedback["results"]:
137
+ print("Sample result:", feedback["results"][0])
138
+
139
+ if __name__ == "__main__":
140
+ main()
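Finally, a spot check of the duration bands used by classify_duration (the durations here are hypothetical):

def classify_duration(d, too_short=0.15, ok_max=0.35):
    if d < too_short:
        return "too_short"
    if d <= ok_max:
        return "ok"
    return "too_long"

for d in (0.12, 0.25, 0.50):
    print(d, classify_duration(d))   # too_short, ok, too_long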