# ================== imports ==================
import gradio as gr
import pandas as pd
import numpy as np
import os, tempfile, re
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gdown
# ================== Settings ==================
BOOKS_FILE = "book.xlsx"
THESES_FILE = "theses.xlsx"
DRIVE_BOOKS_ID = "1IVu5M_zHWg3wo-cbQm3FWhmDIafUSCVP"
DRIVE_THESES_ID = "1hvZIGFG6h0kQY32bIuEWGRdXTZp84vhk"
TOP_K = 20
TFIDF_TOP_N = 100
TFIDF_FALLBACK_MIN = 30
# ================== Semantic model ==================
model = SentenceTransformer("all-MiniLM-L6-v2")
# ================== Download data files ==================
def download_from_drive(file_id, output):
    # Fetch the spreadsheet from Google Drive only if it is not already cached locally
    if not os.path.exists(output):
        url = f"https://drive.google.com/uc?id={file_id}"
        gdown.download(url, output, quiet=True)
download_from_drive(DRIVE_BOOKS_ID, BOOKS_FILE)
download_from_drive(DRIVE_THESES_ID, THESES_FILE)
# ================== Load data ==================
books_df = pd.read_excel(BOOKS_FILE).fillna("")
theses_df = pd.read_excel(THESES_FILE).fillna("")
books_df["المصدر"] = "كتاب"
theses_df["المصدر"] = "رسالة"
for df in [books_df, theses_df]:
    if "الموقع على الرف" in df.columns:
        df["الموقع على الرف"] = (
            df["الموقع على الرف"]
            .astype(str)
            .str.strip()
            .replace(["nan", "NaN", "None", ""], "-")
        )
library_df = pd.concat([books_df, theses_df], ignore_index=True)
# ================== Semantic columns ==================
def detect_semantic_columns(df):
    # Skip columns that look like serial numbers, dates, codes, or shelf locations;
    # keep columns that look like title/author/subject/abstract/keyword fields.
    exclude = ["سنة", "تاريخ", "رقم", "كود", "الموقع", "الرف"]
    include = ["عنوان", "العنوان", "مؤلف", "المؤلف", "موضوع", "ملخص", "وصف", "كلمات"]
    cols = []
    for c in df.columns:
        s = str(c)
        # "م" (serial-number column) is matched exactly; a substring test would also
        # exclude columns like "المؤلف" that merely contain the letter.
        if s.strip() == "م" or any(x in s for x in exclude):
            continue
        if any(x in s for x in include):
            cols.append(c)
    if not cols:
        # Fall back to every text (object) column if nothing matched
        cols = df.select_dtypes(include="object").columns.tolist()
    return cols
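# Note (illustrative): with typical library sheets this picks fields such as "العنوان" (title),
# "المؤلف" (author) and any subject/abstract columns, assuming those headers exist in the
# Excel files; otherwise it falls back to all text columns.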
semantic_cols = detect_semantic_columns(library_df)
# ================== Normalize ==================
def normalize_text(text):
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", " ", text)  # drop punctuation
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")  # unify alef variants
    text = text.replace("ى", "ي").replace("ة", "ه")  # unify alef maqsura / ta marbuta
    return re.sub(r"\s+", " ", text).strip()  # collapse whitespace
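# Illustrative example: normalize_text("مكتبة الأدب!") -> "مكتبه الادب"
# (punctuation stripped, hamza-carrying alef reduced to bare alef, ta marbuta mapped to ha).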
def row_to_text(row):
    return normalize_text(
        " ".join(str(row[c]) for c in semantic_cols if row[c])
    )
# ================== Embeddings ==================
embeddings = model.encode(
    library_df.apply(row_to_text, axis=1).tolist(),
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)
library_df["embedding"] = list(embeddings)
# ================== TF-IDF ==================
def tfidf_prefilter(df, query):
    # Rank candidate rows lexically and keep only the TFIDF_TOP_N best matches
    texts = df.apply(row_to_text, axis=1).tolist()
    vec = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
    mat = vec.fit_transform(texts)
    q_vec = vec.transform([normalize_text(query)])
    scores = cosine_similarity(q_vec, mat)[0]
    df = df.copy()
    df["tfidf_score"] = scores
    return df.sort_values("tfidf_score", ascending=False).head(TFIDF_TOP_N)
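# Retrieval is two-stage: the TF-IDF prefilter above narrows the corpus cheaply, then the
# sentence-transformer embeddings rerank the survivors in local_search_df below. If the
# prefilter keeps fewer than TFIDF_FALLBACK_MIN rows, the semantic search falls back to the
# whole filtered set instead.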
# ================== CSS ==================
CUSTOM_CSS = """
<style>
.card{border:1px solid #ddd;border-radius:8px;padding:10px;margin:12px 0;background:#fafafa;}
.card table{width:100%;border-collapse:collapse;direction:rtl;}
.card td{border:1px solid #ccc;padding:8px;text-align:center;vertical-align:middle;font-size:15px;}
.card td.label{width:30%;font-weight:bold;}
.row-title td{background:#e3f2fd;color:#0d47a1;font-weight:bold;}
.row-author td{background:#fce4ec;color:#880e4f;}
.row-year td{background:#e8f5e9;color:#1b5e20;}
.row-shelf td{background:#fff8e1;color:#e65100;font-weight:bold;font-size:16px;}
</style>
"""
def safe(v):
    return "-" if pd.isna(v) or str(v).strip() == "" else str(v)
def results_to_html(df):
    if df.empty:
        return "<p>❌ لا توجد نتائج</p>"
    html = ""
    for _, r in df.iterrows():
        html += f"""
        <div class="card">
          <table>
            <tr class="row-title">
              <td class="label">العنوان</td>
              <td>{safe(r.get("العنوان"))}</td>
            </tr>
            <tr class="row-author">
              <td class="label">المؤلف</td>
              <td>{safe(r.get("المؤلف"))}</td>
            </tr>
            <tr class="row-year">
              <td class="label">سنة النشر</td>
              <td>{safe(r.get("سنة النشر"))}</td>
            </tr>
            <tr class="row-shelf">
              <td class="label">الموقع على الرف</td>
              <td>{safe(r.get("الموقع على الرف"))}</td>
            </tr>
          </table>
        </div>
        """
    return CUSTOM_CSS + html
# ================== Search ==================
def local_search_df(query, mode, source_filter):
    if not query.strip():
        return "<p>⚠️ اكتب كلمة للبحث</p>", pd.DataFrame()
    df = library_df.copy()
    if source_filter != "الكل":
        df = df[df["المصدر"] == source_filter]
    if df.empty:
        return "<p>❌ لا توجد بيانات</p>", pd.DataFrame()
    if mode == "نصي":
        # Plain substring search over title and author (regex=False so that
        # special characters in the query are matched literally)
        mask = (
            df["العنوان"].str.contains(query, case=False, na=False, regex=False) |
            df["المؤلف"].str.contains(query, case=False, na=False, regex=False)
        )
        df = df[mask].head(TOP_K)
    else:
        # Semantic search: TF-IDF prefilter, then embedding-based reranking
        df_pref = tfidf_prefilter(df, query)
        if len(df_pref) < TFIDF_FALLBACK_MIN:
            df_pref = df
        q_emb = model.encode([normalize_text(query)], normalize_embeddings=True)
        mat = np.vstack(df_pref["embedding"].values)
        scores = util.cos_sim(q_emb, mat)[0].cpu().numpy()
        df_pref = df_pref.copy()  # avoid chained-assignment warnings on the fallback path
        df_pref["score"] = scores
        df = df_pref.sort_values("score", ascending=False).head(TOP_K)
    return results_to_html(df), df
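# Illustrative call (values match the Radio choices defined in the UI below):
#   html, hits = local_search_df("الذكاء الاصطناعي", "دلالي (Semantic)", "الكل")
# returns rendered HTML plus a DataFrame holding the TOP_K best-scoring rows.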
# ================== Save results ==================
def save_to_excel(df):
    # df comes from gr.State and is None until a search has been run
    if df is None or df.empty:
        return None
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    df.drop(columns=["embedding", "score", "tfidf_score"], errors="ignore") \
      .to_excel(tmp.name, index=False)
    return tmp.name
# ================== Gradio interface ==================
with gr.Blocks(title="نظام البحث الذكي بالمكتبة") as app:
    gr.Markdown("## 🔍 البحث الذكي في مقتنيات المكتبة")
    query = gr.Textbox(label="كلمة أو موضوع البحث")
    mode = gr.Radio(["نصي", "دلالي (Semantic)"], value="نصي", label="نوع البحث")
    source_filter = gr.Radio(["الكل", "كتاب", "رسالة"], value="الكل", label="المصدر")
    btn = gr.Button("🔎 بحث")
    out_html = gr.HTML()
    df_state = gr.State()
    file_out = gr.File(label="⬇️ تحميل النتائج")
    btn.click(local_search_df, [query, mode, source_filter], [out_html, df_state])
    gr.Button("📥 حفظ النتائج").click(save_to_excel, df_state, file_out)

app.launch()