aelsaeed committed on
Commit
be62a9f
·
verified ·
1 Parent(s): 7c9bc21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -27
app.py CHANGED
@@ -16,7 +16,8 @@ DRIVE_THESES_ID = "1K2Mtze6ZdvfKUsFMCOWlRBjDq-ZnJNrv"
16
  EMB_DIR = "embeddings"
17
  os.makedirs(EMB_DIR, exist_ok=True)
18
 
19
- MODEL_NAME = "all-MiniLM-L6-v2" # نموذج أخف وأسرع
 
20
  model = SentenceTransformer(MODEL_NAME)
21
 
22
  # ================== تحميل من Drive ==================
@@ -28,26 +29,19 @@ def download_from_drive(file_id, output):
28
  download_from_drive(DRIVE_BOOKS_ID, BOOKS_FILE)
29
  download_from_drive(DRIVE_THESES_ID, THESES_FILE)
30
 
31
- # ================== تحميل ودمج الملفات ==================
32
- def load_and_merge():
33
- if not os.path.exists(BOOKS_FILE) or not os.path.exists(THESES_FILE):
34
- raise FileNotFoundError("❌ تأكدي من وجود book.xlsx و theses.xlsx")
35
 
36
- books = pd.read_excel(BOOKS_FILE).fillna("")
37
- theses = pd.read_excel(THESES_FILE).fillna("")
 
38
 
39
- books["المصدر"] = "كتاب"
40
- theses["المصدر"] = "رسالة"
41
-
42
- merged = pd.concat([books, theses], ignore_index=True)
43
- return merged
44
-
45
- library_df = load_and_merge()
46
-
47
- # ================== Embeddings ==================
48
  def emb_path(name):
49
  return os.path.join(EMB_DIR, f"{name}.pkl")
50
 
 
51
  def build_or_load_embeddings(df, name):
52
  path = emb_path(name)
53
  if os.path.exists(path):
@@ -58,13 +52,21 @@ def build_or_load_embeddings(df, name):
58
  return emb
59
  except Exception:
60
  pass
61
- texts = df["العنوان"].astype(str).tolist()
 
 
 
62
  emb = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
63
  with open(path,"wb") as f:
64
  pickle.dump(emb,f)
65
  return emb
66
 
67
- library_embeddings = build_or_load_embeddings(library_df, "library")
 
 
 
 
 
68
 
69
  # ================== CSS ==================
70
  CUSTOM_CSS = """
@@ -74,25 +76,23 @@ CUSTOM_CSS = """
74
  .styled-table tr:nth-child(even){background-color:#f9f9f9;}
75
  .styled-table tr:nth-child(odd){background-color:#fff;}
76
  .styled-table th{background-color:#4da6ff;color:white;}
77
- a{color:#0066cc;text-decoration:none;}
78
- a:hover{text-decoration:underline;}
79
  </style>
80
  """
81
 
82
- # ================== عرض النتائج HTML ==================
83
  def results_to_html(df):
84
  if df.empty:
85
  return "<p>❌ لم يتم العثور على نتائج</p>"
86
 
87
- for col in ["المؤلف","العنوان","سنة النشر","الموقع على الرف","المصدر","score"]:
 
88
  if col not in df.columns:
89
  df[col] = "-"
90
 
91
  html_results = ""
92
  for _, row in df.iterrows():
93
- single_df = pd.DataFrame([row[["المؤلف","العنوان","سنة النشر","الموقع على الرف","المصدر","score"]]])
94
- html_results += single_df.to_html(index=False, escape=False, classes="styled-table", border=0)
95
-
96
  return CUSTOM_CSS + html_results
97
 
98
  # ================== البحث ==================
@@ -102,16 +102,19 @@ def local_search_df(query, mode, source_filter):
102
 
103
  df_search = library_df.copy()
104
 
 
105
  if source_filter != "الكل":
106
  df_search = df_search[df_search["المصدر"] == source_filter]
107
 
 
108
  if mode == "نصي":
109
  df = df_search[df_search["العنوان"].str.contains(query, case=False, na=False)]
 
110
  else:
111
  q_emb = model.encode([query], convert_to_numpy=True)
112
  scores = util.cos_sim(q_emb, library_embeddings)[0].cpu().numpy()
113
  df_search["score"] = scores
114
- df = df_search.sort_values("score", ascending=False).head(20) # أعلى 20 نتيجة
115
 
116
  return results_to_html(df), df
117
 
@@ -124,7 +127,7 @@ def save_to_excel(df):
124
  df.to_excel(tmp.name, index=False)
125
  return tmp.name
126
 
127
- # ================== الواجهة ==================
128
  IMAGE_URL = "https://drive.google.com/uc?id=1y1cbJbdXSrhkEM7bMDrAUKr0dTiHPe-y"
129
 
130
  with gr.Blocks(title="البحث الدلالي بالمكتبة") as app:
@@ -146,6 +149,7 @@ with gr.Blocks(title="البحث الدلالي بالمكتبة") as app:
146
  )
147
 
148
  btn_search = gr.Button("🔎 بحث")
 
149
  df_state = gr.State()
150
  output_html = gr.HTML()
151
  file_out = gr.File(label="⬇️ تحميل النتائج")
 
16
  EMB_DIR = "embeddings"
17
  os.makedirs(EMB_DIR, exist_ok=True)
18
 
19
+ # ================== تحميل نموذج Semantic ==================
20
+ MODEL_NAME = "all-MiniLM-L6-v2"
21
  model = SentenceTransformer(MODEL_NAME)
22
 
23
  # ================== تحميل من Drive ==================
 
29
  download_from_drive(DRIVE_BOOKS_ID, BOOKS_FILE)
30
  download_from_drive(DRIVE_THESES_ID, THESES_FILE)
31
 
32
+ # ================== تحميل الملفات ==================
33
+ books_df = pd.read_excel(BOOKS_FILE).fillna("")
34
+ theses_df = pd.read_excel(THESES_FILE).fillna("")
 
35
 
36
+ # إضافة نوع المصدر
37
+ books_df["المصدر"] = "كتاب"
38
+ theses_df["المصدر"] = "رسالة"
39
 
40
+ # ================== مسار الـ embeddings ==================
 
 
 
 
 
 
 
 
41
  def emb_path(name):
42
  return os.path.join(EMB_DIR, f"{name}.pkl")
43
 
44
+ # ================== بناء أو تحميل الـ embeddings ==================
45
  def build_or_load_embeddings(df, name):
46
  path = emb_path(name)
47
  if os.path.exists(path):
 
52
  return emb
53
  except Exception:
54
  pass
55
+ # تأكد من وجود عمود Title
56
+ if "Title" not in df.columns:
57
+ df["Title"] = df["العنوان"]
58
+ texts = df["Title"].astype(str).tolist()
59
  emb = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
60
  with open(path,"wb") as f:
61
  pickle.dump(emb,f)
62
  return emb
63
 
64
+ books_embeddings = build_or_load_embeddings(books_df,"books")
65
+ theses_embeddings = build_or_load_embeddings(theses_df,"theses")
66
+
67
+ # ================== دمج المكتبة ==================
68
+ library_df = pd.concat([books_df, theses_df], ignore_index=True)
69
+ library_embeddings = np.concatenate([books_embeddings, theses_embeddings], axis=0)
70
 
71
  # ================== CSS ==================
72
  CUSTOM_CSS = """
 
76
  .styled-table tr:nth-child(even){background-color:#f9f9f9;}
77
  .styled-table tr:nth-child(odd){background-color:#fff;}
78
  .styled-table th{background-color:#4da6ff;color:white;}
 
 
79
  </style>
80
  """
81
 
82
+ # ================== عرض النتائج HTML لكل نتيجة ==================
83
  def results_to_html(df):
84
  if df.empty:
85
  return "<p>❌ لم يتم العثور على نتائج</p>"
86
 
87
+ # التأكد من الأعمدة المطلوبة
88
+ for col in ["المؤلف","العنوان","سنة النشر","الموقع على الرف","المصدر"]:
89
  if col not in df.columns:
90
  df[col] = "-"
91
 
92
  html_results = ""
93
  for _, row in df.iterrows():
94
+ row_df = pd.DataFrame([row[["المؤلف","العنوان","سنة النشر","الموقع على الرف","المصدر"]]])
95
+ html_results += row_df.to_html(index=False, escape=False, classes="styled-table", border=0)
 
96
  return CUSTOM_CSS + html_results
97
 
98
  # ================== البحث ==================
 
102
 
103
  df_search = library_df.copy()
104
 
105
+ # فلترة حسب المصدر
106
  if source_filter != "الكل":
107
  df_search = df_search[df_search["المصدر"] == source_filter]
108
 
109
+ # بحث نصي
110
  if mode == "نصي":
111
  df = df_search[df_search["العنوان"].str.contains(query, case=False, na=False)]
112
+ # بحث دلالي
113
  else:
114
  q_emb = model.encode([query], convert_to_numpy=True)
115
  scores = util.cos_sim(q_emb, library_embeddings)[0].cpu().numpy()
116
  df_search["score"] = scores
117
+ df = df_search.sort_values("score", ascending=False)
118
 
119
  return results_to_html(df), df
120
 
 
127
  df.to_excel(tmp.name, index=False)
128
  return tmp.name
129
 
130
+ # ================== واجهة Gradio ==================
131
  IMAGE_URL = "https://drive.google.com/uc?id=1y1cbJbdXSrhkEM7bMDrAUKr0dTiHPe-y"
132
 
133
  with gr.Blocks(title="البحث الدلالي بالمكتبة") as app:
 
149
  )
150
 
151
  btn_search = gr.Button("🔎 بحث")
152
+
153
  df_state = gr.State()
154
  output_html = gr.HTML()
155
  file_out = gr.File(label="⬇️ تحميل النتائج")