Hothaifa committed on
Commit 1bcf6e9 · verified
1 Parent(s): 816b46a

Update app.py

Files changed (1)
  1. app.py +236 -94
app.py CHANGED
@@ -99,8 +99,8 @@ def clean_visible_text(text: str) -> str:
     text = re.sub(r"https?://\S+|www\.\S+|%[0-9A-Fa-f]{2}", " ", text)
     return text.strip()
 
-START_CUES = ("الحمد لله", "أما بعد", "فالجواب", "الجواب", "الإجابة")
-END_CUES = ("والله أعلم", "والله تعالى أعلم")
+START_CUES = ("الحمد لله","أما بعد","فالجواب","الجواب","الإجابة","الإجابــة")
+END_CUES = ("والله أعلم","والله تعالى أعلم","وبالله التوفيق")
 
 def slice_to_answer_core(text: str) -> str:
     start_idx = min([text.find(p) for p in START_CUES if p in text] or [0])
@@ -120,7 +120,199 @@ def dynamic_snippet(text: str) -> str:
 
 def looks_religious_answer(text: str) -> bool:
     return any(k in text for k in START_CUES + END_CUES)
-
+# ===================== Google CSE (improved Colab version) =====================
+HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; HajeenBot/1.0)"}
+
+def normalize_for_score(s: str) -> str:
+    # reuse the same normalization logic used elsewhere in the app
+    return normalize_text(s or "")
+
+def call_google_cse(query, cx=CUSTOM_SEARCH_ENGINE_ID, key=GOOGLE_API_KEY, num=10, lr="lang_ar"):
+    if key in (None, "", "YOUR_API_KEY") or cx in (None, "", "YOUR_CSE_ID"):
+        raise RuntimeError("Google CSE keys are not configured")
+    url = "https://www.googleapis.com/customsearch/v1"
+    params = {"q": query, "cx": cx, "key": key, "num": num, "lr": lr, "safe": "off"}
+    resp = requests.get(url, params=params, timeout=25, headers=HEADERS)
+    resp.raise_for_status()
+    return resp.json().get("items", []) or []
+
+def get_soup(url, timeout=20):
+    try:
+        resp = requests.get(url, headers=HEADERS, timeout=timeout)
+        resp.encoding = resp.apparent_encoding or "utf-8"
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.text, "html.parser")
+        for tag in soup(["script","style","noscript","header","footer","nav","form","aside"]):
+            tag.extract()
+        return soup
+    except Exception:
+        return None
+
+def page_text(soup: BeautifulSoup) -> str:
+    text = soup.get_text(" ", strip=True)
+    text = re.sub(r"\u0640", "", text)  # remove tatweel (Arabic elongation)
+    text = re.sub(r"\s+", " ", text)
+    text = re.sub(r"https?://\S+|www\.\S+|%[0-9A-Fa-f]{2}", " ", text)
+    return text.strip()
+
+# --- question/answer cue slicing per domain ---
+def slice_by_cues(text: str) -> str:
+    low = text
+    start = 0
+    for cue in START_CUES:
+        i = low.find(cue)
+        if i != -1:
+            start = i
+            break
+    end = len(text)
+    for cue in END_CUES:
+        j = low.find(cue, start)
+        if j != -1:
+            end = j
+            break
+    core = text[start:end].strip()
+    return core if len(core.split()) > 15 else text.strip()
+
+def extract_islamweb_qa(text: str):
+    m = re.search(r"السؤال\s*[::]?\s*(.+?)\s*(?:الإجاب+ة|الإجابة|الجواب)\s*[::]?", text, flags=re.DOTALL)
+    q = (m.group(1).strip() if m else "")
+    ans_region = text[m.end():].strip() if m else ""
+    answer = slice_by_cues(ans_region or text)
+    return q, answer
+
+def extract_islamqa_qa(text: str):
+    m = re.search(r"السؤال\s*[::]?\s*(.+?)\s*(?:الجواب|الإجابة)\s*[::]?", text, flags=re.DOTALL)
+    q = (m.group(1).strip() if m else "")
+    ans_region = text[m.end():].strip() if m else ""
+    answer = slice_by_cues(ans_region or text)
+    return q, answer
+
+def extract_binbaz_qa(text: str):
+    m = re.search(r"(?:س|السؤال)\s*[::]?\s*(.+?)\s*(?:ج|الجواب)\s*[::]?", text, flags=re.DOTALL)
+    q = (m.group(1).strip() if m else "")
+    ans_region = text[m.end():].strip() if m else ""
+    answer = slice_by_cues(ans_region or text)
+    return q, answer
+
+def extract_alifta_qa(text: str):
+    m = re.search(r"السؤال\s*[::]?\s*(.+?)\s*(?:الجواب|الإجابة)\s*[::]?", text, flags=re.DOTALL)
+    q = (m.group(1).strip() if m else "")
+    ans_region = text[m.end():].strip() if m else ""
+    answer = slice_by_cues(ans_region or text)
+    return q, answer
+
+def extract_generic_qa(text: str):
+    m = re.search(r"(?:السؤال|س)\s*[::]?\s*(.+?)\s*(?:الجواب|الإجابة|ج)\s*[::]?", text, flags=re.DOTALL)
+    q = (m.group(1).strip() if m else "")
+    ans_region = text[m.end():].strip() if m else ""
+    answer = slice_by_cues(ans_region or text)
+    return q, answer
+
+DOMAIN_EXTRACTORS = {
+    "islamweb.net": extract_islamweb_qa,
+    "islamqa.info": extract_islamqa_qa,
+    "binbaz.org.sa": extract_binbaz_qa,
+    "alifta.gov.sa": extract_alifta_qa,
+}
+
+def choose_extractor(url):
+    host = urlparse(url).netloc.lower()
+    for dom, fn in DOMAIN_EXTRACTORS.items():
+        if dom in host:
+            return fn
+    return extract_generic_qa
+
+def structural_score(url: str, page_text_sample: str) -> int:
+    u = url.lower()
+    score = 0
+    if re.search(r"/fatwa/\d+", u) or "/answers/" in u or "/fatwas/" in u:
+        score += 30
+    if "/fatawa/" in u or "العرض الموضوعي" in page_text_sample or "بحث عن فتوى" in page_text_sample:
+        score -= 35
+    if page_text_sample.count(".. المزيد") > 1 or re.search(r"\b\d+\s+\d+\s+\d+\b", page_text_sample):
+        score -= 15
+    return score
+
+def score_candidate(user_query, cand_title, cand_url, page_question, page_answer, snippet):
+    nq_user = normalize_for_score(user_query)
+    nq_page_q = normalize_for_score(page_question)
+    nq_title = normalize_for_score(cand_title or "")
+    nq_snip = normalize_for_score(snippet or "")
+    wc = len((page_answer or "").split())
+
+    s_q = fuzz.token_set_ratio(nq_user, nq_page_q) if nq_page_q else 0
+    s_t = fuzz.partial_ratio(nq_user, nq_title) if nq_title else 0
+    s_s = fuzz.partial_ratio(nq_user, nq_snip) if nq_snip else 0
+
+    len_adj = 0
+    if wc < 40:
+        len_adj -= 15
+    elif wc > 5000:
+        len_adj -= 10
+
+    host = urlparse(cand_url).netloc.lower()
+    domain_bonus = 8 if any(d in host for d in ("islamweb.net","islamqa.info","binbaz.org.sa","alifta.gov.sa")) else 0
+
+    sample = (page_question + " " + page_answer)[:1200]
+    struct = structural_score(cand_url, sample)
+
+    score = int(0.80*s_q + 0.15*s_t + 0.05*s_s + len_adj + domain_bonus + struct)
+    return score, {"s_q": s_q, "s_title": s_t, "s_snip": s_s, "wc": wc, "struct": struct}
+
+def google_search_match_and_extract_full(
+    user_query: str,
+    domains=("islamweb.net","islamqa.info","binbaz.org.sa","alifta.gov.sa"),
+    num_results=10,
+    max_links=8,
+    sleep_between=0.8
+):
+    site_filter = " OR ".join(f"site:{d}" for d in domains)
+    query = f"{user_query} ({site_filter})"
+
+    items = call_google_cse(query, num=num_results)
+    if not items:
+        return {"query": user_query, "best": None, "candidates": []}
+
+    candidates = []
+    for item in items[:max_links]:
+        title = item.get("title","")
+        link = item.get("link","")
+        snippet = item.get("snippet","")
+
+        soup = get_soup(link)
+        if not soup:
+            page_q, page_a = "", snippet
+        else:
+            txt = page_text(soup)
+            extractor = choose_extractor(link)
+            try:
+                page_q, page_a = extractor(txt)
+            except Exception:
+                page_q, page_a = extract_generic_qa(txt)
+
+        score, meta = score_candidate(user_query, title, link, page_q, page_a, snippet)
+        entry = {
+            "title": title,
+            "url": link,
+            "snippet": snippet,
+            "page_question": page_q,
+            "answer": page_a,
+            "score": score,
+            **meta
+        }
+
+        # special truncation for Bin Baz: keep only the first 250 words
+        if "binbaz.org.sa" in urlparse(link).netloc.lower():
+            words = (entry["answer"] or "").split()
+            if len(words) > 250:
+                entry["answer"] = " ".join(words[:250]) + " …"
+
+        candidates.append(entry)
+        time.sleep(sleep_between)
+
+    candidates_sorted = sorted(candidates, key=lambda x: x["score"], reverse=True)
+    best = candidates_sorted[0] if candidates_sorted else None
+    return {"query": user_query, "best": best, "candidates": candidates_sorted}
 # ===================== 5) Load assets & FAISS (fatwas) =====================
 print("[SERVER-INFO] بدء تحميل الأصول...")
 DATA_FILE_ID = "1GMG6fVxhUuBEAHP91c8RAUdUJh5TxY5O"
@@ -175,92 +367,41 @@ def save_feedback(question: str, answer: str, useful: str, comment: str = ""):
     df_learned.to_csv(learned_data_path, index=False)
     return row
 
-# ===================== 8) Google Search (the final, robust version) =====================
-
-# --- helpers specific to the new search ---
-def choose_extractor(url):
-    host = urlparse(url).netloc.lower()
-    DOMAIN_EXTRACTORS = {
-        "islamweb.net": lambda text: (re.search(r"السؤال\s*[::]?\s*(.+?)\s*(?:الإجاب+ة|الإجابة|الجواب)", text, re.DOTALL), text),
-        "islamqa.info": lambda text: (re.search(r"السؤال\s*[::]?\s*(.+?)\s*(?:الجواب|الإجابة)", text, re.DOTALL), text),
-        "binbaz.org.sa": lambda text: (re.search(r"(?:س|السؤال)\s*[::]?\s*(.+?)\s*(?:ج|الجواب)", text, re.DOTALL), text),
-        "alifta.gov.sa": lambda text: (re.search(r"السؤال\s*[::]?\s*(.+?)\s*(?:الجواب|الإجابة)", text, re.DOTALL), text),
-    }
-    for dom, func in DOMAIN_EXTRACTORS.items():
-        if dom in host:
-            return func
-    # default extractor
-    return lambda text: (re.search(r"(?:السؤال|س)\s*[::]?\s*(.+?)\s*(?:الجواب|الإجابة|ج)", text, re.DOTALL), text)
-
-def get_page_text(url):
-    try:
-        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
-        resp.raise_for_status()
-        resp.encoding = resp.apparent_encoding or "utf-8"
-        soup = BeautifulSoup(resp.text, "html.parser")
-        for tag in soup(["script","style","noscript","header","footer","nav","form","aside"]):
-            tag.extract()
-        return clean_visible_text(soup.get_text(" ", strip=True))
-    except Exception:
-        return ""
-
-# --- main search function ---
 def google_search_fatwa(query: str):
-    if GOOGLE_API_KEY == "YOUR_API_KEY" or CUSTOM_SEARCH_ENGINE_ID == "YOUR_CSE_ID":
-        print("[GOOGLE-SEARCH] مفاتيح جوجل غير مضبوطة.")
-        return None
+    """
+    Wraps the improved version: searches the trusted sites, extracts a question/answer pair,
+    and returns the best result. Stores self-learning rows in learned_fatwas.csv as before.
+    """
     try:
-        service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
-        site_filter = " OR ".join(f"site:{d}" for d in ("islamweb.net","islamqa.info","binbaz.org.sa","alifta.gov.sa"))
-        full_query = f'{query} ({site_filter})'
-        res = service.cse().list(q=full_query, cx=CUSTOM_SEARCH_ENGINE_ID, num=8, lr="lang_ar", safe="off").execute()
-        items = res.get("items", [])
-        if not items: return None
-
-        candidates = []
-        for item in items:
-            url = item.get("link", "")
-            title = item.get("title", "")
-
-            page_content = get_page_text(url)
-            if not page_content: continue
-
-            extractor = choose_extractor(url)
-            match, text_region = extractor(page_content)
-
-            page_question = match.group(1).strip() if match else ""
-            answer_region = text_region[match.end():].strip() if match else text_region
-            final_answer = slice_to_answer_core(answer_region)  # apply the slicing helper to the answer region only
-
-            if len(final_answer.split()) < 40: continue  # ignore very short answers
-
-            # compute the similarity score
-            score = fuzz.token_set_ratio(normalize_text(query), normalize_text(page_question)) if page_question else 0
-
-            candidates.append({
-                "title": title,
-                "url": url,
-                "answer": final_answer,
-                "score": score
-            })
-
-        if not candidates: return None
-
-        best = max(candidates, key=lambda c: c["score"])
-
-        if best['score'] < 50:  # raise the minimum confidence threshold
-            print(f"[GOOGLE-SEARCH] أفضل نتيجة درجتها ضعيفة ({best['score']}). يتم تجاهلها.")
+        res = google_search_match_and_extract_full(query)
+        best = res.get("best")
+        if not best:
             return None
 
-        # no need to store the result for self-learning here; focus on accuracy first
-
+        answer_text = best["answer"] or best["snippet"] or ""
+        title = best.get("title","")
+        url = best.get("url","")
+        wc = len(answer_text.split())
+        score = int(best.get("score", 70))
+
+        # store a self-learning row
+        new_row = pd.DataFrame([{
+            "question": query,
+            "answer": answer_text,
+            "source_url": url,
+            "title": title,
+            "word_count": wc,
+            "score": score
+        }])
+        new_row.to_csv(learned_data_path, mode='a', header=not os.path.exists(learned_data_path), index=False)
+
         return {
             "question": query,
-            "answer": best["answer"],  # the sliced, clean answer
+            "answer": answer_text,
             "source": "بحث جوجل (فتوى موثقة)",
-            "source_url": best["url"],
-            "title": best["title"],
-            "score": best['score']
+            "source_url": url,
+            "title": title,
+            "score": score
         }
     except Exception as e:
         print(f"[GOOGLE-SEARCH-ERROR] {e}")
@@ -290,6 +431,18 @@ def safe_download(file_id, output_path):
         print(f"[SAFE-DOWNLOAD-INFO] الملف {output_path} موجود بالفعل، سيتم استخدامه.")
     else:
         raise e
+# --- Google Drive IDs & local paths (hadith) ---
+# --- Google Drive IDs & local paths (hadith) ---
+ID_BUKHARI = os.environ.get("ID_BUKHARI")
+ID_MUSLIM = os.environ.get("ID_MUSLIM")
+ID_MUSNAD = os.environ.get("ID_MUSNAD")
+
+PATHS = {
+    "bukhari": os.path.join(DATA_DIR, "sahih_bukhari_clean.csv"),
+    "muslim": os.path.join(DATA_DIR, "sahih_muslim_clean.csv"),
+    "musnad": os.path.join(DATA_DIR, "musnad_ahmed_clean.csv"),
+}
+
 @app.on_event("startup")
 async def startup_event():
     global df_main, df_learned, question_embeddings, index, tokenizer, model
@@ -400,18 +553,7 @@ def feedback(req: FeedbackRequest):
 
 # ===================== 11) ----- Hadith section (merged) ----- =====================
 
-# --- Google Drive IDs & local paths (hadith) ---
-# --- Google Drive IDs & local paths (hadith) ---
-ID_BUKHARI = os.environ.get("ID_BUKHARI")
-ID_MUSLIM = os.environ.get("ID_MUSLIM")
-ID_MUSNAD = os.environ.get("ID_MUSNAD")
-
-PATHS = {
-    "bukhari": os.path.join(DATA_DIR, "sahih_bukhari_clean.csv"),
-    "muslim": os.path.join(DATA_DIR, "sahih_muslim_clean.csv"),
-    "musnad": os.path.join(DATA_DIR, "musnad_ahmed_clean.csv"),
-}
-
+
 # --- Arabic normalization (hadith) ---
 def normalize_ar(s: str) -> str:
     if not isinstance(s, str):
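
For context, a minimal sketch of how the new search path added in this commit could be exercised, assuming the Space's app.py is importable as `app`, that GOOGLE_API_KEY and CUSTOM_SEARCH_ENGINE_ID are configured (otherwise call_google_cse raises RuntimeError), and using a hypothetical query string:

# Hypothetical usage sketch; `app` refers to this Space's app.py module (assumption).
from app import google_search_match_and_extract_full, google_search_fatwa

query = "ما حكم صيام يوم الشك؟"  # hypothetical example query

# Low-level call: returns the ranked candidate list plus the best match.
res = google_search_match_and_extract_full(query, num_results=5, max_links=3)
for cand in res["candidates"]:
    print(cand["score"], cand["url"])

# High-level wrapper used by the API: returns a dict with answer/source_url/score
# (or None when nothing is found) and appends a self-learning row to learned_fatwas.csv.
fatwa = google_search_fatwa(query)
if fatwa:
    print(fatwa["title"], fatwa["source_url"], fatwa["score"])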