# ================== imports ================== import gradio as gr import pandas as pd import numpy as np import os, tempfile, re from sentence_transformers import SentenceTransformer, util from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import gdown # ================== إعدادات ================== BOOKS_FILE = "book.xlsx" THESES_FILE = "theses.xlsx" DRIVE_BOOKS_ID = "1IVu5M_zHWg3wo-cbQm3FWhmDIafUSCVP" DRIVE_THESES_ID = "1hvZIGFG6h0kQY32bIuEWGRdXTZp84vhk" TOP_K = 20 TFIDF_TOP_N = 100 TFIDF_FALLBACK_MIN = 30 # ================== نموذج Semantic ================== model = SentenceTransformer("all-MiniLM-L6-v2") # ================== تحميل الملفات ================== def download_from_drive(file_id, output): if not os.path.exists(output): url = f"https://drive.google.com/uc?id={file_id}" gdown.download(url, output, quiet=True) download_from_drive(DRIVE_BOOKS_ID, BOOKS_FILE) download_from_drive(DRIVE_THESES_ID, THESES_FILE) # ================== تحميل البيانات ================== books_df = pd.read_excel(BOOKS_FILE).fillna("") theses_df = pd.read_excel(THESES_FILE).fillna("") books_df["المصدر"] = "كتاب" theses_df["المصدر"] = "رسالة" for df in [books_df, theses_df]: if "الموقع على الرف" in df.columns: df["الموقع على الرف"] = ( df["الموقع على الرف"] .astype(str) .str.strip() .replace(["nan", "NaN", "None", ""], "-") ) library_df = pd.concat([books_df, theses_df], ignore_index=True) # ================== الأعمدة الدلالية ================== def detect_semantic_columns(df): exclude = ["م", "سنة", "تاريخ", "رقم", "كود", "الموقع", "الرف"] include = ["عنوان", "العنوان", "مؤلف", "المؤلف", "موضوع", "ملخص", "وصف", "كلمات"] cols = [] for c in df.columns: s = str(c) if any(x in s for x in exclude): continue if any(x in s for x in include): cols.append(c) if not cols: cols = df.select_dtypes(include="object").columns.tolist() return cols semantic_cols = detect_semantic_columns(library_df) # ================== Normalize 
# ================== text normalization ==================
def normalize_text(text):
    """Normalize text for matching: lowercase, strip punctuation, unify
    common Arabic letter variants, collapse whitespace."""
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", " ", text)
    # Unify hamza-carrying alef forms and common spelling variants so that
    # different spellings of the same word still match.
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    text = text.replace("ى", "ي").replace("ة", "ه")
    return re.sub(r"\s+", " ", text).strip()


def row_to_text(row):
    """Concatenate a record's semantic columns into one normalized string."""
    return normalize_text(
        " ".join(str(row[c]) for c in semantic_cols if row[c])
    )


# ================== Embeddings ==================
# Pre-compute one normalized embedding per record at startup; search queries
# are compared against this matrix with cosine similarity.
embeddings = model.encode(
    library_df.apply(row_to_text, axis=1).tolist(),
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)
library_df["embedding"] = list(embeddings)


# ================== TF-IDF ==================
def tfidf_prefilter(df, query):
    """Cheap lexical pre-filter before semantic re-ranking.

    Ranks rows of `df` by TF-IDF cosine similarity to `query` and returns a
    copy of the top TFIDF_TOP_N rows with an extra `tfidf_score` column.
    """
    texts = df.apply(row_to_text, axis=1).tolist()
    vec = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
    mat = vec.fit_transform(texts)
    q_vec = vec.transform([normalize_text(query)])
    scores = cosine_similarity(q_vec, mat)[0]
    df = df.copy()
    df["tfidf_score"] = scores
    return df.sort_values("tfidf_score", ascending=False).head(TFIDF_TOP_N)


# ================== CSS ==================
# NOTE(review): defined but never passed to gr.Blocks(css=...) in the
# original source — kept as-is for compatibility.
CUSTOM_CSS = """ """


def safe(v):
    """Render a cell value, mapping NaN/empty to a dash."""
    return "-" if pd.isna(v) or str(v).strip() == "" else str(v)


def results_to_html(df):
    """Render search results as an HTML fragment.

    NOTE(review): the original HTML tags were lost in extraction; the fields
    and messages below match the source, the surrounding markup is a
    best-effort reconstruction — confirm against the deployed app.
    """
    if df.empty:
        return "<div class='no-results'>❌ لا توجد نتائج</div>"
    html = ""
    for _, r in df.iterrows():
        html += f"""
        <table class="result-card" dir="rtl">
            <tr><th>العنوان</th><td>{safe(r.get("العنوان"))}</td></tr>
            <tr><th>المؤلف</th><td>{safe(r.get("المؤلف"))}</td></tr>
            <tr><th>سنة النشر</th><td>{safe(r.get("سنة النشر"))}</td></tr>
            <tr><th>الموقع على الرف</th><td>{safe(r.get("الموقع على الرف"))}</td></tr>
        </table>
        """
    return html


# ================== search ==================
def local_search_df(query, mode, source_filter):
    """Run a search over the catalogue.

    Parameters: `query` (user text), `mode` ("نصي" for plain text search,
    otherwise semantic), `source_filter` ("الكل" / "كتاب" / "رسالة").
    Returns (html_fragment, results_dataframe).
    """
    if not query or not query.strip():
        return "<div class='warn'>⚠️ اكتب كلمة للبحث</div>", pd.DataFrame()

    df = library_df.copy()
    if source_filter != "الكل":
        df = df[df["المصدر"] == source_filter]
    if df.empty:
        return "<div class='no-results'>❌ لا توجد بيانات</div>", pd.DataFrame()

    if mode == "نصي":
        # Plain substring match on title/author. regex=False so user input
        # containing regex metacharacters (e.g. "(") cannot raise.
        mask = (
            df["العنوان"].str.contains(query, case=False, na=False, regex=False)
            | df["المؤلف"].str.contains(query, case=False, na=False, regex=False)
        )
        df = df[mask].head(TOP_K)
    else:
        # Semantic mode: TF-IDF pre-filter, then embedding re-rank.
        df_pref = tfidf_prefilter(df, query)
        if len(df_pref) < TFIDF_FALLBACK_MIN:
            # Too few lexical candidates — re-rank the whole filtered set.
            # copy(): avoid assigning "score" into a slice of library_df.
            df_pref = df.copy()
        q_emb = model.encode([normalize_text(query)], normalize_embeddings=True)
        mat = np.vstack(df_pref["embedding"].values)
        scores = util.cos_sim(q_emb, mat)[0].cpu().numpy()
        df_pref["score"] = scores
        df = df_pref.sort_values("score", ascending=False).head(TOP_K)

    return results_to_html(df), df


# ================== saving results ==================
def save_to_excel(df):
    """Write the last results to a temporary .xlsx and return its path.

    Internal columns (embedding/scores) are dropped. Guards against `df`
    being None (download clicked before any search).
    """
    if df is None:
        df = pd.DataFrame()
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    tmp.close()  # release the handle so to_excel can reopen it (Windows-safe)
    df.drop(columns=["embedding", "score", "tfidf_score"], errors="ignore") \
        .to_excel(tmp.name, index=False)
    return tmp.name


# ================== Gradio UI ==================
with gr.Blocks(title="نظام البحث الذكي بالمكتبة") as app:
    gr.Markdown("## 🔍 البحث الذكي في مقتنيات المكتبة")
    query = gr.Textbox(label="كلمة أو موضوع البحث")
    mode = gr.Radio(["نصي", "دلالي (Semantic)"], value="نصي", label="نوع البحث")
    source_filter = gr.Radio(["الكل", "كتاب", "رسالة"], value="الكل", label="المصدر")
    btn = gr.Button("🔎 بحث")
    out_html = gr.HTML()
    df_state = gr.State()  # holds the last results DataFrame for download
    file_out = gr.File(label="⬇️ تحميل النتائج")

    btn.click(local_search_df, [query, mode, source_filter], [out_html, df_state])
    gr.Button("📥 حفظ النتائج").click(save_to_excel, df_state, file_out)

app.launch()