import io

import numpy as np
import streamlit as st
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# -------------------- Config -------------------- #
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_NAME = "google/gemma-2b-it"  # you can change this later


# -------------------- Model loaders (cached) -------------------- #
@st.cache_resource(show_spinner=True)
def load_embedder():
    return SentenceTransformer(EMBEDDING_MODEL_NAME)


@st.cache_resource(show_spinner=True)
def load_llm_pipeline():
    """
    Load a text-generation pipeline for the LLM.
    Using device_map="auto" will place the model on a GPU if one is available.
    """
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_NAME,
        device_map="auto",
    )
    gen_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=False,  # greedy decoding; temperature/top_p only apply when sampling
    )
    return gen_pipe


# -------------------- Helpers -------------------- #
def extract_text_from_pdf(file) -> str:
    """Extract all text from an uploaded PDF file."""
    pdf_reader = PdfReader(file)
    all_text = []
    for page in pdf_reader.pages:
        text = page.extract_text()
        if text:
            all_text.append(text)
    return "\n".join(all_text)


def chunk_text(text, chunk_size=800, overlap=200):
    """Split long text into overlapping chunks (by words)."""
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks


def embed_texts(texts, embedder: SentenceTransformer):
    """Get embeddings for a list of texts."""
    if not texts:
        return np.array([])
    embeddings = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
    return embeddings.astype("float32")


def cosine_sim_matrix(matrix, vector):
    """Cosine similarity between each row of matrix and a single vector."""
    if matrix.size == 0:
        return np.array([])
    matrix_norm = matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-10)
    vector_norm = vector / (np.linalg.norm(vector) + 1e-10)
    return np.dot(matrix_norm, vector_norm)


def retrieve_relevant_chunks(question, chunks, chunk_embeddings, embedder, top_k=4):
    """Find the top_k most relevant chunks for the question."""
    if len(chunks) == 0:
        return []
    q_emb = embed_texts([question], embedder)[0]
    sims = cosine_sim_matrix(chunk_embeddings, q_emb)
    top_idx = np.argsort(sims)[::-1][:top_k]
    return [chunks[i] for i in top_idx]


def build_prompt(question, context_chunks):
    context = "\n\n---\n\n".join(context_chunks)
    system_instruction = (
        "You are a helpful assistant that answers questions "
        "using ONLY the information provided in the document context.\n"
        "If the answer is not in the context, say that you cannot find it in the document."
    )
    prompt = (
        f"{system_instruction}\n\n"
        f"Document context:\n{context}\n\n"
        f"Question: {question}\n\n"
        f"Answer:"
    )
    return prompt


def answer_question(question, chunks, llm_pipe):
    """Call the LLM with the question + retrieved context."""
    prompt = build_prompt(question, chunks)
    # For most HF instruction models, a plain prompt works fine.
    outputs = llm_pipe(
        prompt,
        num_return_sequences=1,
        truncation=True,
    )
    text = outputs[0]["generated_text"]
    # Try to remove the prompt part if the model echoes it
    if prompt in text:
        text = text.split(prompt, 1)[-1].strip()
    return text.strip()


# -------------------- Streamlit UI -------------------- #
st.set_page_config(page_title="Chat with your PDF (HuggingFace)", layout="wide")

st.title("📄 Chat with your PDF (HuggingFace RAG)")
st.markdown(
    """
    Upload a PDF, let the app index it, and then ask questions.
    The model will answer based only on the document content (RAG).
    """
)

with st.sidebar:
    st.header("1. Upload and process PDF")
    uploaded_pdf = st.file_uploader("Choose a PDF file", type=["pdf"])
    process_button = st.button("Process Document")

# Session state to keep doc data
if "chunks" not in st.session_state:
    st.session_state.chunks = []
    st.session_state.embeddings = None

# Load models (cached after the first run)
with st.spinner("Loading models (first time only)..."):
    embedder = load_embedder()
    llm_pipe = load_llm_pipeline()

# Step 1: Process PDF
if process_button:
    if uploaded_pdf is None:
        st.sidebar.error("Please upload a PDF first.")
    else:
        with st.spinner("Reading and indexing your PDF..."):
            pdf_bytes = io.BytesIO(uploaded_pdf.read())
            text = extract_text_from_pdf(pdf_bytes)
            if not text.strip():
                st.error("Could not extract any text from this PDF.")
            else:
                chunks = chunk_text(text)
                embeddings = embed_texts(chunks, embedder)
                st.session_state.chunks = chunks
                st.session_state.embeddings = embeddings
                st.success(f"Done! Indexed {len(chunks)} chunks from the PDF.")

# Step 2: Ask questions
st.header("2. Ask questions about your document")
question = st.text_input("Type your question here")

if st.button("Get answer"):
    if not st.session_state.chunks:
        st.error("Please upload and process a PDF first.")
    elif not question.strip():
        st.error("Please type a question.")
    else:
        with st.spinner("Thinking with your document..."):
            relevant_chunks = retrieve_relevant_chunks(
                question,
                st.session_state.chunks,
                st.session_state.embeddings,
                embedder,
                top_k=4,
            )
            answer = answer_question(question, relevant_chunks, llm_pipe)

        st.subheader("Answer")
        st.write(answer)

        with st.expander("Show relevant excerpts from the PDF"):
            for i, ch in enumerate(relevant_chunks, start=1):
                st.markdown(f"**Chunk {i}:**")
                st.write(ch)
                st.markdown("---")
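
# -------------------- Running the app (notes) -------------------- #
# A minimal sketch of how this script is typically launched; the filename
# app.py is an assumption, not something fixed by the code above:
#
#   pip install streamlit pypdf sentence-transformers transformers accelerate torch
#   streamlit run app.py
#
# accelerate is needed because the model is loaded with device_map="auto".
# google/gemma-2b-it is a gated model on the Hugging Face Hub, so you may
# need to accept its license and authenticate (e.g. `huggingface-cli login`)
# before the first download.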