abdelrahman-a99 committed
Commit efa9374 · 1 Parent(s): c1b0ea3

Separate rag_core.py (the core logic) from app.py (Gradio UI + API only), import the answer_query function from rag_core, and call it in on_ask

Files changed (3)
  1. app.py +14 -289
  2. rag_core.py +307 -0
  3. requirements.txt +0 -3
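
The practical effect of the split is that the pipeline can now be driven without starting Gradio at all. A minimal smoke test, assuming rag_core.py is on the import path and its index, embedder, and LLM globals initialized successfully at import time (the question string is just an example):

# Hypothetical smoke test of the new module boundary (not part of the commit).
# rag_core builds/loads its FAISS index and models at import time.
from rag_core import answer_query, TOP_K

ans, passages = answer_query("What is the attendance policy?", top_k=TOP_K)
print(ans)  # 1-2 sentence answer, or "Insufficient context"
for p in passages:
    print(p["id"], p.get("page_number"))  # retrieved QA passages with page numbers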
app.py CHANGED
@@ -1,301 +1,25 @@
- import os, json, glob, pickle, re
- from typing import List, Dict
- import faiss
- import torch
  import traceback
  import gradio as gr

- from langdetect import detect
- from unidecode import unidecode
-
- from sentence_transformers import SentenceTransformer
- from huggingface_hub import hf_hub_download
- from llama_cpp import Llama
-
- # ===============================
- # CONFIG
- # ===============================
- # Project paths (can be overridden with Space “Variables” if you like)
- DATA_DIR = os.getenv("DATA_DIR", "./data/pages")
- INDEX_PATH = os.getenv("INDEX_PATH", "./artifacts/policy.index")
- DOC_STORE_PATH = os.getenv("DOC_STORE_PATH", "./artifacts/policy_docs.pkl")
- ARTIFACT_DIR = os.path.dirname(INDEX_PATH) or "."
- os.makedirs(ARTIFACT_DIR, exist_ok=True)
-
- # Embeddings (multilingual e5; remember to prefix "query:" and "passage:")
- EMBED_MODEL = os.getenv("EMBED_MODEL", "intfloat/multilingual-e5-base")
-
- # LLM served on CPU via llama.cpp using a quantized GGUF of Qwen 3B Instruct
- GGUF_REPO_ID = os.getenv("GGUF_REPO_ID", "Qwen/Qwen2.5-3B-Instruct-GGUF")
- GGUF_FILENAME = os.getenv("GGUF_FILENAME", "qwen2.5-3b-instruct-q4_k_m.gguf")  # adjust if RAM is tight
-
- TOP_K = int(os.getenv("TOP_K", "5"))
- MAX_CTX_CHARS = int(os.getenv("MAX_CTX_CHARS", "5000"))
-
- N_CTX = int(os.getenv("N_CTX", "4096"))
- MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "140"))
-
- INSUFFICIENT_EN = "Insufficient context"
- INSUFFICIENT_AR = "لا تتوفر معلومة كافية"
-
- # ===============================
- # HELPERS
- # ===============================
- AR_REGEX = re.compile(r'[\u0600-\u06FF]')
-
- def detect_lang(text: str) -> str:
-     if AR_REGEX.search(text or ""):
-         return "ar"
-     try:
-         return "ar" if detect(text or "") == "ar" else "en"
-     except:
-         return "en"
-
- def normalize_q(text: str) -> str:
-     return re.sub(r'\s+', ' ', (text or "")).strip()
-
- def make_citation(d):
-     pg = d.get("page_number", "?")
-     sec = d.get("section", d.get("tag", "")) or ""
-     return f"p.{pg}" + (f" — {sec}" if sec else "")
-
- def truncate_ctx(s: str, limit: int = MAX_CTX_CHARS) -> str:
-     return s if len(s) <= limit else s[:limit] + "\n[...]"
-
- # ===============================
- # DATA LOADING & INDEXING
- # ===============================
- def load_policy_jsons(folder: str):
-     docs = []
-     files = sorted(glob.glob(os.path.join(folder, "*.json")))
-     for fp in files:
-         try:
-             with open(fp, "r", encoding="utf-8") as f:
-                 data = json.load(f)
-
-             page_num = data.get("page_number")
-             section = data.get("doc_title", {}).get("en", "") or data.get("doc_title", {}).get("ar", "")
-             qas = data.get("qas", [])
-
-             for qa in qas:
-                 cid = qa.get("canonical_id") or qa.get("id") or os.path.basename(fp)
-                 q_ar = normalize_q(qa.get("question", {}).get("ar", ""))
-                 q_en = normalize_q(qa.get("question", {}).get("en", ""))
-                 a_ar = normalize_q(qa.get("answer", {}).get("ar", ""))
-                 a_en = normalize_q(qa.get("answer", {}).get("en", ""))
-
-                 if q_ar or a_ar:
-                     docs.append({
-                         "id": cid + "::ar",
-                         "lang": "ar",
-                         "question": q_ar,
-                         "answer": a_ar,
-                         "page_number": page_num,
-                         "section": section,
-                         "source_file": fp
-                     })
-                 if q_en or a_en:
-                     docs.append({
-                         "id": cid + "::en",
-                         "lang": "en",
-                         "question": q_en,
-                         "answer": a_en,
-                         "page_number": page_num,
-                         "section": section,
-                         "source_file": fp
-                     })
-         except Exception as e:
-             print(f"Error reading {fp}: {e}")
-
-     print(f"Loaded {len(docs)} QA passages from {len(files)} files.")
-     return docs
-
- def passages_text(d):
-     q = d.get("question") or ""
-     a = d.get("answer") or ""
-     base = f"Q: {q}\nA: {a}\nSource: page {d.get('page_number','?')}"
-     return "passage: " + base
-
- def build_index(docs, embedder, index_path, doc_store_path):
-     if not docs:
-         raise ValueError("No documents found to index.")
-     texts = [passages_text(d) for d in docs]
-     emb = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True, batch_size=64)
-     faiss.normalize_L2(emb)
-     index = faiss.IndexFlatIP(embedder.get_sentence_embedding_dimension())
-     index.add(emb)
-     faiss.write_index(index, index_path)
-     with open(doc_store_path, "wb") as f:
-         pickle.dump(docs, f)
-     print(f"Index built: {len(docs)} items.")
-
- def load_index():
-     if not (os.path.exists(INDEX_PATH) and os.path.exists(DOC_STORE_PATH)):
-         if not os.path.isdir(DATA_DIR):
-             raise FileNotFoundError(f"DATA_DIR not found: {DATA_DIR}")
-         docs = load_policy_jsons(DATA_DIR)
-         if not docs:
-             raise FileNotFoundError(f"No JSON files found in {DATA_DIR}. Please add your page JSON files.")
-         print("Building index...")
-         embedder = SentenceTransformer(EMBED_MODEL, device="cpu")
-         build_index(docs, embedder, INDEX_PATH, DOC_STORE_PATH)
-
-     index = faiss.read_index(INDEX_PATH)
-     with open(DOC_STORE_PATH, "rb") as f:
-         docs = pickle.load(f)
-     return index, docs
-
- try:
-     INDEX, DOCS = load_index()
- except Exception as e:
-     print("Failed to load/build FAISS index:", e)
-     INDEX, DOCS = None, []
-
- try:
-     EMBEDDER = SentenceTransformer(EMBED_MODEL, device="cpu")
- except Exception as e:
-     print("Failed to load embedder:", e)
-     EMBEDDER = None
-
- # ===============================
- # LLM (llama.cpp CPU) setup
- # ===============================
- def get_llm() -> Llama:
-     # Download the GGUF quantized model locally into ./models
-     local_path = hf_hub_download(
-         repo_id=GGUF_REPO_ID,
-         filename=GGUF_FILENAME,
-         local_dir="./models",
-         local_dir_use_symlinks=False
-     )
-     # Keep context moderate for free-CPU memory
-     return Llama(
-         model_path=local_path,
-         n_threads=max(2, os.cpu_count() or 2),
-         n_ctx=N_CTX,
-         chat_format="qwen",  # llama.cpp supports qwen2/qwen2.5 chat template
-         verbose=False
-     )
-
- try:
-     LLM = get_llm()
- except Exception as e:
-     print("Failed to init LLM:", e)
-     LLM = None
-
- # ===============================
- # RETRIEVAL + GENERATION
- # ===============================
- def retrieve(query_text: str, top_k: int = TOP_K, lang_hint: str = None):
-     q_emb = EMBEDDER.encode(["query: " + (query_text or "")], convert_to_numpy=True)
-     faiss.normalize_L2(q_emb)
-     D, I = INDEX.search(q_emb, top_k * 2)  # pull more, filter by language
-     lang = lang_hint or detect_lang(query_text or "")
-     same_lang, others = [], []
-     for i in I[0]:
-         if i < 0 or i >= len(DOCS):
-             continue
-         d = DOCS[i]
-         (same_lang if d.get("lang") == lang else others).append(d)
-     out = same_lang[:top_k]
-     if not out:
-         return out
-     if len(out) < top_k:
-         out.extend(others[:top_k - len(out)])
-     return out[:top_k]
-
- def build_messages(user_q: str, passages: List[Dict]):
-     lang = detect_lang(user_q or "")
-
-     sys_en = (
-         "You are NU-CS Policy Assistant. Answer ONLY using the provided context. "
-         "If the requested person/course/section is NOT present verbatim in the context, "
-         f"reply EXACTLY: \"{INSUFFICIENT_EN}\". "
-         "Include short page citations like (p.12). Answer in the user's language."
-     )
-     sys_ar = (
-         "أنت مساعد سياسات برنامج علوم الحاسب بجامعة النيل. أجب فقط من السياق المقدم. "
-         f"إذا لم يظهر الاسم/المقرر المطلوب نصًا داخل السياق فأجِب نصًا: \"{INSUFFICIENT_AR}\". "
-         "ضمّن إشارة صفحة موجزة مثل (ص.12). أجب بلغة المستخدم."
-     )
-
-     sys = sys_ar if lang == "ar" else sys_en
-
-     seen = set()
-     blocks = []
-     for d in passages:
-         key = (d.get("lang"), d.get("question"), d.get("answer"), d.get("page_number"))
-         if key in seen:
-             continue
-         seen.add(key)
-         cite = make_citation(d)
-         q = d.get("question") or ""
-         a = d.get("answer") or ""
-         if d.get("lang") == "ar":
-             blocks.append(f"س: {q}\nج: {a}\nالمصدر: {cite}")
-         else:
-             blocks.append(f"Q: {q}\nA: {a}\nSource: {cite}")
-     ctx = truncate_ctx("\n\n---\n\n".join(blocks))
-
-     if lang == "ar":
-         user = (
-             "أجب في جملة أو جملتين فقط بالاعتماد على السياق التالي. "
-             "إن لم يكن الجواب موجودًا في السياق فأجِب نصًا: \"لا تتوفر معلومة كافية\".\n\n"
-             f"السؤال: {user_q}\n\nالسياق:\n{ctx}"
-         )
-     else:
-         user = (
-             "Answer in 1–2 sentences using ONLY the context below. "
-             "If the answer isn’t in the context, reply EXACTLY: \"Insufficient context\".\n\n"
-             f"Question: {user_q}\n\nContext:\n{ctx}"
-         )
-     return [{"role": "system", "content": sys}, {"role": "user", "content": user}]
-
- def llm_generate(messages, max_new_tokens=MAX_NEW_TOKENS) -> str:
-     out = LLM.create_chat_completion(
-         messages=messages,
-         temperature=0.0,
-         max_tokens=max_new_tokens,
-         repeat_penalty=1.15,
-         stop=None,
-     )
-     try:
-         return out["choices"][0]["message"]["content"].strip()
-     except Exception:
-         return INSUFFICIENT_EN
-
- def answer_query(user_q: str, top_k: int = TOP_K):
-     if INDEX is None or EMBEDDER is None or LLM is None:
-         return INSUFFICIENT_EN, []
-
-     passages = retrieve(user_q, top_k=top_k, lang_hint=detect_lang(user_q or ""))
-     msgs = build_messages(user_q, passages)
-     resp = llm_generate(msgs, max_new_tokens=140)
-     return resp, passages
+ from rag_core import answer_query, TOP_K, INSUFFICIENT_EN

  # ===============================
  # GRADIO UI
  # ===============================
- def format_passages(passages: List[Dict]) -> str:
-     lines = []
-     for i, d in enumerate(passages, 1):
-         cite = make_citation(d)
-         q = d.get("question") or ""
-         a = d.get("answer") or ""
-         lang = d.get("lang", "")
-         lines.append(f"{i}. [{lang}] {cite}\nQ: {q}\nA: {a}\n")
-     return "\n\n".join(lines) if lines else "(no passages)"
-
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown("# NU-CS Policy RAG — Qwen 3B (CPU, GGUF via llama.cpp)")
-     gr.Markdown("Put your 51 JSON files in **./data/pages/** and (re)start the Space. It will build the FAISS index automatically.")
+     gr.Markdown(
+         "Put your page JSON files in **./data/pages/** and (re)start the Space. "
+         "It will build the FAISS index automatically."
+     )

      with gr.Row():
-         inp = gr.Textbox(label="Your question (AR/EN)", placeholder="مثال: ما هي سياسة الحضور؟ | Example: What is the attendance policy?")
-     with gr.Row():
-         topk = gr.Slider(1, 10, value=5, step=1, label="Top-K passages")
+         inp = gr.Textbox(
+             label="Your question (AR/EN)",
+             placeholder="مثال: ما هي سياسة الحضور؟ | Example: What is the attendance policy?",
+         )
      with gr.Row():
-         btn = gr.Button("Ask")
+         topk = gr.Slider(1, 10, value=TOP_K, step=1, label="Top-K passages")
      with gr.Row():
          out = gr.Textbox(label="Answer")
      with gr.Row():
@@ -303,18 +27,19 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:

      def on_ask(q, k):
          try:
-             k = int(float(k)) if k is not None else 5
+             k = int(float(k)) if k is not None else TOP_K

              if not q or not q.strip():
                  return "Please enter a question.", []

              ans, passages = answer_query(q.strip(), k)
-
              return ans, passages

          except Exception as e:
-             return f"ERROR: {e}", {"error": str(e)}
+             # full traceback goes to the debug JSON; keep the answer field user-friendly
+             return f"ERROR: {e}", {"error": traceback.format_exc()}

+     btn = gr.Button("Ask")
      btn.click(on_ask, inputs=[inp, topk], outputs=[out, ctx], api_name="answer")
      inp.submit(on_ask, inputs=[inp, topk], outputs=[out, ctx])
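
Since btn.click still registers api_name="answer", the endpoint remains callable outside the browser. A sketch using gradio_client (the Space id below is a placeholder, not taken from the commit; outputs arrive as a tuple because on_ask returns two values):

# Hypothetical API call against the running Space; replace the Space id.
from gradio_client import Client

client = Client("abdelrahman-a99/your-space-name")  # placeholder Space id
answer, passages = client.predict(
    "ما هي سياسة الحضور؟",  # question (AR/EN)
    5,                       # Top-K passages slider value
    api_name="/answer",
)
print(answer)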
rag_core.py ADDED
@@ -0,0 +1,307 @@
+ import os, json, glob, pickle, re
+ from typing import List, Dict
+ import faiss
+ from langdetect import detect
+ from sentence_transformers import SentenceTransformer
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+
+ # ===============================
+ # CONFIG
+ # ===============================
+ DATA_DIR = os.getenv("DATA_DIR", "./data/pages")
+ INDEX_PATH = os.getenv("INDEX_PATH", "./artifacts/policy.index")
+ DOC_STORE_PATH = os.getenv("DOC_STORE_PATH", "./artifacts/policy_docs.pkl")
+ ARTIFACT_DIR = os.path.dirname(INDEX_PATH) or "."
+ os.makedirs(ARTIFACT_DIR, exist_ok=True)
+
+ EMBED_MODEL = os.getenv("EMBED_MODEL", "intfloat/multilingual-e5-base")
+
+ GGUF_REPO_ID = os.getenv("GGUF_REPO_ID", "Qwen/Qwen2.5-3B-Instruct-GGUF")
+ GGUF_FILENAME = os.getenv("GGUF_FILENAME", "qwen2.5-3b-instruct-q4_k_m.gguf")
+
+ TOP_K = int(os.getenv("TOP_K", "5"))
+ MAX_CTX_CHARS = int(os.getenv("MAX_CTX_CHARS", "5000"))
+
+ N_CTX = int(os.getenv("N_CTX", "4096"))
+ MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "140"))
+
+ INSUFFICIENT_EN = "Insufficient context"
+ INSUFFICIENT_AR = "لا تتوفر معلومة كافية"
+
+ # ===============================
+ # HELPERS
+ # ===============================
+ AR_REGEX = re.compile(r'[\u0600-\u06FF]')
+
+
+ def detect_lang(text: str) -> str:
+     if AR_REGEX.search(text or ""):
+         return "ar"
+     try:
+         return "ar" if detect(text or "") == "ar" else "en"
+     except Exception:
+         return "en"
+
+
+ def normalize_q(text: str) -> str:
+     return re.sub(r"\s+", " ", (text or "")).strip()
+
+
+ def make_citation(d: Dict) -> str:
+     pg = d.get("page_number", "?")
+     sec = d.get("section", d.get("tag", "")) or ""
+     return f"p.{pg}" + (f" — {sec}" if sec else "")
+
+
+ def truncate_ctx(s: str, limit: int = MAX_CTX_CHARS) -> str:
+     return s if len(s) <= limit else s[:limit] + "\n[...]"
+
+
+ # ===============================
+ # DATA LOADING & INDEXING
+ # ===============================
+ def load_policy_jsons(folder: str):
+     docs = []
+     files = sorted(glob.glob(os.path.join(folder, "*.json")))
+     for fp in files:
+         try:
+             with open(fp, "r", encoding="utf-8") as f:
+                 data = json.load(f)
+
+             page_num = data.get("page_number")
+             section = (
+                 data.get("doc_title", {}).get("en", "")
+                 or data.get("doc_title", {}).get("ar", "")
+             )
+             qas = data.get("qas", [])
+
+             for qa in qas:
+                 cid = qa.get("canonical_id") or qa.get("id") or os.path.basename(fp)
+                 q_ar = normalize_q(qa.get("question", {}).get("ar", ""))
+                 q_en = normalize_q(qa.get("question", {}).get("en", ""))
+                 a_ar = normalize_q(qa.get("answer", {}).get("ar", ""))
+                 a_en = normalize_q(qa.get("answer", {}).get("en", ""))
+
+                 if q_ar or a_ar:
+                     docs.append(
+                         {
+                             "id": cid + "::ar",
+                             "lang": "ar",
+                             "question": q_ar,
+                             "answer": a_ar,
+                             "page_number": page_num,
+                             "section": section,
+                             "source_file": fp,
+                         }
+                     )
+                 if q_en or a_en:
+                     docs.append(
+                         {
+                             "id": cid + "::en",
+                             "lang": "en",
+                             "question": q_en,
+                             "answer": a_en,
+                             "page_number": page_num,
+                             "section": section,
+                             "source_file": fp,
+                         }
+                     )
+         except Exception as e:
+             print(f"Error reading {fp}: {e}")
+
+     print(f"Loaded {len(docs)} QA passages from {len(files)} files.")
+     return docs
+
+
+ def passages_text(d: Dict) -> str:
+     q = d.get("question") or ""
+     a = d.get("answer") or ""
+     base = f"Q: {q}\nA: {a}\nSource: page {d.get('page_number', '?')}"
+     return "passage: " + base
+
+
+ def build_index(docs, embedder, index_path, doc_store_path):
+     if not docs:
+         raise ValueError("No documents found to index.")
+     texts = [passages_text(d) for d in docs]
+     emb = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True, batch_size=64)
+     faiss.normalize_L2(emb)
+     index = faiss.IndexFlatIP(embedder.get_sentence_embedding_dimension())
+     index.add(emb)
+     faiss.write_index(index, index_path)
+     with open(doc_store_path, "wb") as f:
+         pickle.dump(docs, f)
+     print(f"Index built: {len(docs)} items.")
+
+
+ def load_index():
+     if not (os.path.exists(INDEX_PATH) and os.path.exists(DOC_STORE_PATH)):
+         if not os.path.isdir(DATA_DIR):
+             raise FileNotFoundError(f"DATA_DIR not found: {DATA_DIR}")
+         docs = load_policy_jsons(DATA_DIR)
+         if not docs:
+             raise FileNotFoundError(
+                 f"No JSON files found in {DATA_DIR}. Please add your page JSON files."
+             )
+         print("Building index...")
+         embedder = SentenceTransformer(EMBED_MODEL, device="cpu")
+         build_index(docs, embedder, INDEX_PATH, DOC_STORE_PATH)
+
+     index = faiss.read_index(INDEX_PATH)
+     with open(DOC_STORE_PATH, "rb") as f:
+         docs = pickle.load(f)
+     return index, docs
+
+
+ # Global initialization (for Spaces)
+ try:
+     INDEX, DOCS = load_index()
+ except Exception as e:
+     print("Failed to load/build FAISS index:", e)
+     INDEX, DOCS = None, []
+
+ try:
+     EMBEDDER = SentenceTransformer(EMBED_MODEL, device="cpu")
+ except Exception as e:
+     print("Failed to load embedder:", e)
+     EMBEDDER = None
+
+
+ # ===============================
+ # LLM (llama.cpp CPU) setup
+ # ===============================
+ def get_llm() -> Llama:
+     local_path = hf_hub_download(
+         repo_id=GGUF_REPO_ID,
+         filename=GGUF_FILENAME,
+         local_dir="./models",
+         local_dir_use_symlinks=False,
+     )
+     return Llama(
+         model_path=local_path,
+         n_threads=max(2, os.cpu_count() or 2),
+         n_ctx=N_CTX,
+         chat_format="qwen",
+         verbose=False,
+     )
+
+
+ try:
+     LLM = get_llm()
+ except Exception as e:
+     print("Failed to init LLM:", e)
+     LLM = None
+
+
+ # ===============================
+ # RETRIEVAL + GENERATION
+ # ===============================
+ def retrieve(query_text: str, top_k: int = TOP_K, lang_hint: str = None):
+     if EMBEDDER is None or INDEX is None:
+         return []
+
+     q_emb = EMBEDDER.encode(
+         ["query: " + (query_text or "")],
+         convert_to_numpy=True,
+     )
+     faiss.normalize_L2(q_emb)
+     D, I = INDEX.search(q_emb, top_k * 2)  # pull more, filter by language
+     lang = lang_hint or detect_lang(query_text or "")
+
+     same_lang, others = [], []
+     for i in I[0]:
+         if i < 0 or i >= len(DOCS):
+             continue
+         d = DOCS[i]
+         (same_lang if d.get("lang") == lang else others).append(d)
+
+     out = same_lang[:top_k]
+     if len(out) < top_k:
+         out.extend(others[: top_k - len(out)])
+     return out[:top_k]
+
+
+ def build_messages(user_q: str, passages: List[Dict]):
+     lang = detect_lang(user_q or "")
+
+     sys_en = (
+         "You are NU-CS Policy Assistant. Answer ONLY using the provided context. "
+         "If the requested person/course/section is NOT present verbatim in the context, "
+         f"reply EXACTLY: \"{INSUFFICIENT_EN}\". "
+         "Include short page citations like (p.12). Answer in the user's language."
+     )
+     sys_ar = (
+         "أنت مساعد سياسات برنامج علوم الحاسب بجامعة النيل. أجب فقط من السياق المقدم. "
+         f"إذا لم يظهر الاسم/المقرر المطلوب نصًا داخل السياق فأجِب نصًا: \"{INSUFFICIENT_AR}\". "
+         "ضمّن إشارة صفحة موجزة مثل (ص.12). أجب بلغة المستخدم."
+     )
+
+     sys = sys_ar if lang == "ar" else sys_en
+
+     seen = set()
+     blocks = []
+     for d in passages:
+         key = (d.get("lang"), d.get("question"), d.get("answer"), d.get("page_number"))
+         if key in seen:
+             continue
+         seen.add(key)
+         cite = make_citation(d)
+         q = d.get("question") or ""
+         a = d.get("answer") or ""
+         if d.get("lang") == "ar":
+             blocks.append(f"س: {q}\nج: {a}\nالمصدر: {cite}")
+         else:
+             blocks.append(f"Q: {q}\nA: {a}\nSource: {cite}")
+
+     ctx = truncate_ctx("\n\n---\n\n".join(blocks))
+
+     if lang == "ar":
+         user = (
+             f"أجب في جملة أو جملتين فقط بالاعتماد على السياق التالي. "
+             f"إن لم يكن الجواب موجودًا في السياق فأجِب نصًا: \"{INSUFFICIENT_AR}\".\n\n"
+             f"السؤال: {user_q}\n\nالسياق:\n{ctx}"
+         )
+     else:
+         user = (
+             f"Answer in 1–2 sentences using ONLY the context below. "
+             f"If the answer isn’t in the context, reply EXACTLY: \"{INSUFFICIENT_EN}\".\n\n"
+             f"Question: {user_q}\n\nContext:\n{ctx}"
+         )
+
+     return [{"role": "system", "content": sys}, {"role": "user", "content": user}]
+
+
+ def llm_generate(messages, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
+     if LLM is None:
+         return INSUFFICIENT_EN
+     out = LLM.create_chat_completion(
+         messages=messages,
+         temperature=0.0,
+         max_tokens=max_new_tokens,
+         repeat_penalty=1.15,
+         stop=None,
+     )
+     try:
+         return out["choices"][0]["message"]["content"].strip()
+     except Exception:
+         return INSUFFICIENT_EN
+
+
+ def answer_query(user_q: str, top_k: int = TOP_K):
+     if INDEX is None or EMBEDDER is None or LLM is None:
+         lang = detect_lang(user_q or "")
+         msg = INSUFFICIENT_AR if lang == "ar" else INSUFFICIENT_EN
+         return msg, []
+
+     lang = detect_lang(user_q or "")
+     passages = retrieve(user_q, top_k=top_k, lang_hint=lang)
+
+     # If retrieval found nothing, don't waste tokens on the LLM
+     if not passages:
+         msg = INSUFFICIENT_AR if lang == "ar" else INSUFFICIENT_EN
+         return msg, []
+
+     msgs = build_messages(user_q, passages)
+     resp = llm_generate(msgs)
+     return resp, passages
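
Two conventions in rag_core.py are easy to miss: e5-family embedders expect the "query: " / "passage: " prefixes (added by passages_text and retrieve), and calling faiss.normalize_L2 before an inner-product index makes IndexFlatIP rank by cosine similarity. A self-contained toy illustration of that retrieval pattern, using the same libraries (the passage text is invented for the example):

# Toy version of the build_index()/retrieve() pattern:
# L2-normalized vectors + IndexFlatIP == cosine-similarity search.
import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-base", device="cpu")

passages = ["passage: Q: What is the attendance policy?\nA: (example answer text)"]
emb = model.encode(passages, convert_to_numpy=True)
faiss.normalize_L2(emb)  # unit vectors, so inner product == cosine

index = faiss.IndexFlatIP(model.get_sentence_embedding_dimension())
index.add(emb)

q = model.encode(["query: attendance policy"], convert_to_numpy=True)
faiss.normalize_L2(q)
scores, ids = index.search(q, 1)
print(scores[0][0], ids[0][0])  # cosine score in [-1, 1], passage index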
requirements.txt CHANGED
@@ -2,10 +2,7 @@ faiss-cpu==1.8.0.post1
  sentence-transformers==3.0.1
  torch==2.3.1
  huggingface_hub==0.24.5
-
- # llama.cpp CPU bindings
  llama-cpp-python==0.2.90
-
  tqdm==4.66.4
  langdetect==1.0.9
  unidecode==1.3.8