# ================== imports ==================
import gradio as gr
import pandas as pd
import numpy as np
import os, tempfile, re
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gdown
# ================== Settings ==================
BOOKS_FILE = "book.xlsx"
THESES_FILE = "theses.xlsx"
DRIVE_BOOKS_ID = "1IVu5M_zHWg3wo-cbQm3FWhmDIafUSCVP"
DRIVE_THESES_ID = "1hvZIGFG6h0kQY32bIuEWGRdXTZp84vhk"
TOP_K = 20               # results returned per search
TFIDF_TOP_N = 100        # candidates kept by the TF-IDF prefilter
TFIDF_FALLBACK_MIN = 30  # below this, skip the prefilter and rank everything

# ================== Semantic model ==================
model = SentenceTransformer("all-MiniLM-L6-v2")

# ================== Downloading the data files ==================
def download_from_drive(file_id, output):
    if not os.path.exists(output):
        url = f"https://drive.google.com/uc?id={file_id}"
        gdown.download(url, output, quiet=True)

download_from_drive(DRIVE_BOOKS_ID, BOOKS_FILE)
download_from_drive(DRIVE_THESES_ID, THESES_FILE)
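
# Note (assumption about the Drive setup): gdown can only fetch these IDs if the
# files are shared as "anyone with the link"; if a download fails, the
# pd.read_excel calls below will raise because the .xlsx files are missing.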

# ================== Loading the data ==================
books_df = pd.read_excel(BOOKS_FILE).fillna("")
theses_df = pd.read_excel(THESES_FILE).fillna("")
books_df["المصدر"] = "كتاب"
theses_df["المصدر"] = "رسالة"

# Clean the shelf-location column so missing values render as "-"
for df in [books_df, theses_df]:
    if "الموقع على الرف" in df.columns:
        df["الموقع على الرف"] = (
            df["الموقع على الرف"]
            .astype(str)
            .str.strip()
            .replace(["nan", "NaN", "None", ""], "-")
        )

library_df = pd.concat([books_df, theses_df], ignore_index=True)

# ================== Semantic columns ==================
def detect_semantic_columns(df):
    exclude = ["م", "سنة", "تاريخ", "رقم", "كود", "الموقع", "الرف"]
    include = ["عنوان", "العنوان", "مؤلف", "المؤلف", "موضوع", "ملخص", "وصف", "كلمات"]
    cols = []
    for c in df.columns:
        s = str(c)
        if any(x in s for x in exclude):
            continue
        if any(x in s for x in include):
            cols.append(c)
    if not cols:
        cols = df.select_dtypes(include="object").columns.tolist()
    return cols

semantic_cols = detect_semantic_columns(library_df)
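
# With a typical catalogue sheet this keeps the free-text fields (title, author,
# subject, abstract, keywords) and drops serial numbers, years and shelf codes so
# they do not pollute the embeddings; if nothing matches, all string columns are used.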

# ================== Normalize ==================
def normalize_text(text):
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", " ", text)
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    text = text.replace("ى", "ي").replace("ة", "ه")
    return re.sub(r"\s+", " ", text).strip()

def row_to_text(row):
    return normalize_text(
        " ".join(str(row[c]) for c in semantic_cols if row[c])
    )
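
# Illustrative example: normalize_text("الأخلاق، والتربية") -> "الاخلاق والتربيه"
# (punctuation stripped, hamza forms unified to bare alef, taa marbuta mapped to haa).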

# ================== Embeddings ==================
embeddings = model.encode(
    library_df.apply(row_to_text, axis=1).tolist(),
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)
library_df["embedding"] = list(embeddings)

# ================== TF-IDF ==================
def tfidf_prefilter(df, query):
    texts = df.apply(row_to_text, axis=1).tolist()
    vec = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
    mat = vec.fit_transform(texts)
    q_vec = vec.transform([normalize_text(query)])
    scores = cosine_similarity(q_vec, mat)[0]
    df = df.copy()
    df["tfidf_score"] = scores
    return df.sort_values("tfidf_score", ascending=False).head(TFIDF_TOP_N)
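
# Retrieval is two-stage: a cheap lexical TF-IDF pass narrows the pool to
# TFIDF_TOP_N candidates, then the sentence-transformer re-ranks only those.
# Note the vectorizer is refit on every query, which is simple but scales with
# corpus size; fitting it once on library_df would be a natural optimisation.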

# ================== CSS ==================
CUSTOM_CSS = """
<style>
.card{border:1px solid #ddd;border-radius:8px;padding:10px;margin:12px 0;background:#fafafa;}
.card table{width:100%;border-collapse:collapse;direction:rtl;}
.card td{border:1px solid #ccc;padding:8px;text-align:center;vertical-align:middle;font-size:15px;}
.card td.label{width:30%;font-weight:bold;}
.row-title td{background:#e3f2fd;color:#0d47a1;font-weight:bold;}
.row-author td{background:#fce4ec;color:#880e4f;}
.row-year td{background:#e8f5e9;color:#1b5e20;}
.row-shelf td{background:#fff8e1;color:#e65100;font-weight:bold;font-size:16px;}
</style>
"""

def safe(v):
    return "-" if pd.isna(v) or str(v).strip() == "" else str(v)

def results_to_html(df):
    if df.empty:
        return "<p>❌ لا توجد نتائج</p>"
    html = ""
    for _, r in df.iterrows():
        html += f"""
        <div class="card">
          <table>
            <tr class="row-title">
              <td class="label">العنوان</td>
              <td>{safe(r.get("العنوان"))}</td>
            </tr>
            <tr class="row-author">
              <td class="label">المؤلف</td>
              <td>{safe(r.get("المؤلف"))}</td>
            </tr>
            <tr class="row-year">
              <td class="label">سنة النشر</td>
              <td>{safe(r.get("سنة النشر"))}</td>
            </tr>
            <tr class="row-shelf">
              <td class="label">الموقع على الرف</td>
              <td>{safe(r.get("الموقع على الرف"))}</td>
            </tr>
          </table>
        </div>
        """
    return CUSTOM_CSS + html

# ================== Search ==================
def local_search_df(query, mode, source_filter):
    if not query.strip():
        return "<p>⚠️ اكتب كلمة للبحث</p>", pd.DataFrame()

    df = library_df.copy()
    if source_filter != "الكل":
        df = df[df["المصدر"] == source_filter]
    if df.empty:
        return "<p>❌ لا توجد بيانات</p>", pd.DataFrame()

    if mode == "نصي":
        # Plain substring match; regex=False keeps queries with special
        # characters (e.g. parentheses) from raising a regex error.
        mask = (
            df["العنوان"].str.contains(query, case=False, na=False, regex=False) |
            df["المؤلف"].str.contains(query, case=False, na=False, regex=False)
        )
        df = df[mask].head(TOP_K)
    else:
        df_pref = tfidf_prefilter(df, query)
        if len(df_pref) < TFIDF_FALLBACK_MIN:
            # Too few candidates: rank the whole filtered set instead.
            # .copy() avoids writing the score column into a slice of library_df.
            df_pref = df.copy()
        q_emb = model.encode([normalize_text(query)], normalize_embeddings=True)
        mat = np.vstack(df_pref["embedding"].values)
        scores = util.cos_sim(q_emb, mat)[0].cpu().numpy()
        df_pref["score"] = scores
        df = df_pref.sort_values("score", ascending=False).head(TOP_K)

    return results_to_html(df), df
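
# The function returns (html, dataframe): the HTML feeds the gr.HTML output below,
# while the raw DataFrame is held in gr.State so the export button can write
# exactly the rows currently on screen.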

# ================== Exporting the results ==================
def save_to_excel(df):
    if df is None or df.empty:
        return None  # nothing to export until a search has run
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    df.drop(columns=["embedding", "score", "tfidf_score"], errors="ignore") \
      .to_excel(tmp.name, index=False)
    return tmp.name

# ================== Gradio interface ==================
with gr.Blocks(title="نظام البحث الذكي بالمكتبة") as app:
    gr.Markdown("## 🔍 البحث الذكي في مقتنيات المكتبة")

    query = gr.Textbox(label="كلمة أو موضوع البحث")
    mode = gr.Radio(["نصي", "دلالي (Semantic)"], value="نصي", label="نوع البحث")
    source_filter = gr.Radio(["الكل", "كتاب", "رسالة"], value="الكل", label="المصدر")

    btn = gr.Button("🔎 بحث")
    out_html = gr.HTML()
    df_state = gr.State()
    file_out = gr.File(label="⬇️ تحميل النتائج")

    btn.click(local_search_df, [query, mode, source_filter], [out_html, df_state])
    gr.Button("📥 حفظ النتائج").click(save_to_excel, df_state, file_out)

app.launch()