Update app.py

app.py (CHANGED)
@@ -23,10 +23,7 @@ model = SentenceTransformer(MODEL_NAME)
 def download_from_drive(file_id, output):
     if not os.path.exists(output):
         url = f"https://drive.google.com/uc?id={file_id}"
-        try:
-            gdown.download(url, output, quiet=True)
-        except Exception:
-            pass
+        gdown.download(url, output, quiet=True)
 
 download_from_drive(DRIVE_BOOKS_ID, BOOKS_FILE)
 download_from_drive(DRIVE_THESES_ID, THESES_FILE)
@@ -39,18 +36,6 @@ def load_and_merge():
     books = pd.read_excel(BOOKS_FILE).fillna("")
     theses = pd.read_excel(THESES_FILE).fillna("")
 
-    # توحيد عمود العنوان
-    def normalize_title(df):
-        if "Title" not in df.columns:
-            if "العنوان" in df.columns:
-                df["Title"] = df["العنوان"].astype(str)
-            else:
-                df["Title"] = df.iloc[:, 0].astype(str)
-        return df
-
-    books = normalize_title(books)
-    theses = normalize_title(theses)
-
     # إضافة نوع المصدر
     books["المصدر"] = "كتاب"
     theses["المصدر"] = "رسالة"
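The concat step that actually merges the two catalogues sits outside this hunk. The following is a minimal sketch of the assumed shape of load_and_merge after this change; only the read/fillna/tag lines appear in the diff, and the file-name defaults and the pd.concat call are assumptions.

    import pandas as pd

    def load_and_merge(books_file="books.xlsx", theses_file="theses.xlsx"):
        # Placeholder file names; the Space uses its BOOKS_FILE / THESES_FILE constants.
        # Read both catalogues and replace NaN cells with empty strings.
        books = pd.read_excel(books_file).fillna("")
        theses = pd.read_excel(theses_file).fillna("")

        # Tag every row with its source type.
        books["المصدر"] = "كتاب"
        theses["المصدر"] = "رسالة"

        # Assumed: stack the two frames into one searchable catalogue.
        return pd.concat([books, theses], ignore_index=True)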
@@ -73,7 +58,7 @@ def build_or_load_embeddings(df, name):
     if len(emb) == len(df):
         return emb
 
-    texts = df["Title"].astype(str).tolist()
+    texts = df["العنوان"].astype(str).tolist()
     emb = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
     with open(path, "wb") as f:
         pickle.dump(emb, f)
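For context, the function this hunk touches caches the title embeddings on disk and rebuilds them only when the pickle is missing or stale. A self-contained sketch under assumed details follows; the model name and the cache-path naming are illustrative, not taken from the Space.

    import os
    import pickle

    from sentence_transformers import SentenceTransformer

    # Illustrative multilingual model; the Space loads its own MODEL_NAME.
    model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

    def build_or_load_embeddings(df, name):
        path = f"{name}_embeddings.pkl"  # hypothetical cache-file naming
        if os.path.exists(path):
            with open(path, "rb") as f:
                emb = pickle.load(f)
            # Reuse the cache only while it still matches the data size.
            if len(emb) == len(df):
                return emb

        texts = df["العنوان"].astype(str).tolist()
        emb = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
        with open(path, "wb") as f:
            pickle.dump(emb, f)
        return emb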
@@ -86,50 +71,25 @@ def results_to_html(df):
     if df.empty:
         return "<p>❌ لم يتم العثور على نتائج</p>"
 
-    #
-    cols = []
-    if "المؤلف" in df.columns:
-        cols.append("المؤلف")
-    elif "Author" in df.columns:
-        df["المؤلف"] = df["Author"]
-        cols.append("المؤلف")
-    else:
-        df["المؤلف"] = "-"
-        cols.append("المؤلف")
-
-    if "العنوان" in df.columns:
-        cols.append("العنوان")
-    else:
-        df["العنوان"] = df.get("Title", "-")
-        cols.append("العنوان")
-
-    if "سنة النشر" in df.columns:
-        cols.append("سنة النشر")
-    else:
-        df["سنة النشر"] = "-"
-        cols.append("سنة النشر")
-
-    if "الموقع على الرف" in df.columns:
-        cols.append("الموقع على الرف")
-    else:
-        df["الموقع على الرف"] = "-"
-        cols.append("الموقع على الرف")
-
-    if "المصدر" in df.columns:
-        cols.append("المصدر")
-    else:
-        df["المصدر"] = "-"
-        cols.append("المصدر")
-
-    df_display = df[cols]
+    # التأكد من وجود الأعمدة المطلوبة
+    for col in ["المؤلف", "العنوان", "سنة النشر", "الموقع على الرف", "المصدر"]:
+        if col not in df.columns:
+            df[col] = "-"
 
-    html_table = df_display.to_html(index=False, escape=False)
-    return html_table
+    display_cols = ["المؤلف", "العنوان", "سنة النشر", "الموقع على الرف", "المصدر"]
+    df_display = df[display_cols]
 
+    html_table = df_display.to_html(
+        index=False,
+        escape=False,
+        classes="table table-striped",
+        border=0
+    )
+    return html_table
 
 # ================== البحث ==================
 def local_search_df(query, mode, source_filter):
-    if not query:
+    if not query.strip():
         return "<p>⚠️ اكتب كلمة أو جملة للبحث</p>", pd.DataFrame()
 
     df_search = library_df.copy()
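A quick usage example of the column fallback introduced above: a row that lacks one of the five expected columns still renders, with "-" filled in. The data here is a toy row, not from the catalogue.

    import pandas as pd

    expected = ["المؤلف", "العنوان", "سنة النشر", "الموقع على الرف", "المصدر"]

    # Toy row that deliberately lacks "الموقع على الرف".
    df = pd.DataFrame([{
        "المؤلف": "مؤلف تجريبي",
        "العنوان": "عنوان تجريبي",
        "سنة النشر": "2020",
        "المصدر": "كتاب",
    }])

    # Same fallback as the new results_to_html: missing columns become "-".
    for col in expected:
        if col not in df.columns:
            df[col] = "-"

    print(df[expected].to_html(index=False, escape=False, border=0))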
@@ -138,25 +98,16 @@ def local_search_df(query, mode, source_filter):
     if source_filter != "الكل":
         df_search = df_search[df_search["المصدر"] == source_filter]
 
-    # ---- بحث نصي ----
+    # بحث نصي
     if mode == "نصي":
-        mask = df_search["Title"].str.contains(query, case=False, na=False)
-        df = df_search[mask]
-
-    # ---- بحث دلالي ----
+        df = df_search[df_search["العنوان"].str.contains(query, case=False, na=False)]
+    # بحث دلالي
     else:
         q_emb = model.encode([query], convert_to_numpy=True)
         scores = util.cos_sim(q_emb, library_embeddings)[0].cpu().numpy()
-        df_search = df_search.copy()
         df_search["score"] = scores
         df = df_search.sort_values("score", ascending=False)
 
-    if df.empty:
-        df = pd.DataFrame([{"نتيجة": "❌ لم يتم العثور على نتائج"}])
-
-    if "Title" in df.columns:
-        df = df.drop(columns=["Title"])
-
     return results_to_html(df), df
 
 # ================== حفظ النتائج Excel ==================
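The semantic branch reduces to a single cosine-similarity pass over the precomputed title embeddings. Below is a minimal standalone sketch with toy titles; the model name is illustrative, and the real app reuses its cached library_embeddings instead of re-encoding.

    import pandas as pd
    from sentence_transformers import SentenceTransformer, util

    # Illustrative model; the Space loads its own MODEL_NAME.
    model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

    library_df = pd.DataFrame({"العنوان": [
        "مقدمة في الذكاء الاصطناعي",
        "تاريخ الأدب العربي",
    ]})
    library_embeddings = model.encode(
        library_df["العنوان"].tolist(), convert_to_numpy=True
    )

    query = "تعلم الآلة"
    q_emb = model.encode([query], convert_to_numpy=True)

    # Cosine similarity between the query and every stored title.
    scores = util.cos_sim(q_emb, library_embeddings)[0].cpu().numpy()

    ranked = library_df.assign(score=scores).sort_values("score", ascending=False)
    print(ranked)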
@@ -169,12 +120,11 @@ def save_to_excel(df):
     return tmp.name
 
 # ================== الواجهة ==================
-with gr.Blocks(title="البحث الدلالي بالمكتبة") as app:
-    gr.Markdown("## 🔍 البحث بالمكتبة (ملف موحد)")
+IMAGE_URL = "https://drive.google.com/uc?id=1y1cbJbdXSrhkEM7bMDrAUKr0dTiHPe-y"
 
-
-    gr.
-
+with gr.Blocks(title="البحث الدلالي بالمكتبة") as app:
+    gr.Markdown("## 🔍 البحث في مقتنيات المكتبة")
+    gr.Image(IMAGE_URL, elem_id="header-image")
 
     query = gr.Textbox(label="اكتب كلمة أو موضوع البحث")
 
@@ -208,5 +158,4 @@ with gr.Blocks(title="البحث الدلالي بالمكتبة") as app:
         outputs=file_out
     )
 
-
 app.launch()
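The diff shows only the header portion of the Blocks UI. As a rough sketch, a search interface of this kind is typically wired as below; the component names, the demo search function, and the layout are assumptions, not the Space's actual code.

    import pandas as pd
    import gradio as gr

    def demo_search(query, mode, source_filter):
        # Stand-in for local_search_df: returns (HTML string, DataFrame).
        df = pd.DataFrame([{"العنوان": query, "المصدر": source_filter}])
        return df.to_html(index=False, escape=False), df

    with gr.Blocks(title="البحث الدلالي بالمكتبة") as demo:
        gr.Markdown("## 🔍 البحث في مقتنيات المكتبة")
        query = gr.Textbox(label="اكتب كلمة أو موضوع البحث")
        mode = gr.Radio(["نصي", "دلالي"], value="دلالي", label="نوع البحث")
        source = gr.Radio(["الكل", "كتاب", "رسالة"], value="الكل", label="المصدر")
        search_btn = gr.Button("بحث")
        results_html = gr.HTML()
        results_state = gr.State()

        search_btn.click(
            demo_search,
            inputs=[query, mode, source],
            outputs=[results_html, results_state],
        )

    demo.launch()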