Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import io | |
| import PyPDF2 | |
| from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor, pipeline | |
| from gtts import gTTS | |
| from PIL import Image | |
| import fitz # PyMuPDF | |
| # Function to extract text from a PDF | |
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; the app passes
            the object returned by ``st.file_uploader``.

    Returns:
        A single string with the text of all pages joined in page order.
        Pages for which ``extract_text()`` returns ``None`` contribute an
        empty string.
    """
    # The same upload object is handed to other readers in this app, so start
    # from the beginning and leave the stream rewound afterwards; otherwise the
    # next ``read()`` on it would return no bytes.
    can_seek = getattr(pdf_file, "seekable", None)
    rewindable = callable(can_seek) and can_seek()
    if rewindable:
        pdf_file.seek(0)
    pdf_bytes = pdf_file.read()
    if rewindable:
        pdf_file.seek(0)

    pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
    # join() avoids rebuilding the string once per page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
| # Function to generate discussion points (summarization) | |
def generate_discussion_points(text):
    """Summarize ``text`` into discussion points.

    Args:
        text: Plain text to summarize (in this app, the text extracted from
            the uploaded PDF).

    Returns:
        The ``summary_text`` string of the first result produced by the
        ``facebook/bart-large-cnn`` summarization pipeline.
    """
    # NOTE(review): the pipeline (and its model) is rebuilt on every call;
    # consider caching it if repeated clicks turn out to be slow.
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    # truncation=True clips inputs that exceed the model's maximum input
    # length; without it, the text of a long PDF fails inside the model
    # instead of being summarized.
    summary = summarizer(
        text,
        max_length=600,
        min_length=300,
        do_sample=False,
        truncation=True,
    )
    return summary[0]["summary_text"]
| # Function to convert text to speech | |
def text_to_speech(text):
    """Synthesize ``text`` as English speech and save it as an MP3.

    Args:
        text: The text to be spoken.

    The audio is written to the relative path ``discussion_points.mp3``;
    nothing is returned.
    """
    gTTS(text=text, lang="en").save("discussion_points.mp3")
| # Function for document question answering | |
def answer_questions(pdf_file, question):
    """Answer ``question`` once per page of a PDF using Pix2Struct DocVQA.

    Each page is rendered to an image and passed, together with the question,
    to the ``google/pix2struct-docvqa-large`` model.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` that holds the
            PDF bytes.
        question: The question to ask about every page.

    Returns:
        A list of decoded answer strings, one per page in page order. An
        empty list is returned when ``question`` is empty or only whitespace.
    """
    # A blank question cannot yield a useful answer; skip loading the model.
    if not question or not question.strip():
        return []

    # The caller may already have read this stream (text extraction runs
    # first in the app); rewind so we do not hand PyMuPDF an empty buffer.
    can_seek = getattr(pdf_file, "seekable", None)
    if callable(can_seek) and can_seek():
        pdf_file.seek(0)

    # Open PDF using PyMuPDF
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        processor = Pix2StructProcessor.from_pretrained(
            "google/pix2struct-docvqa-large"
        )
        model = Pix2StructForConditionalGeneration.from_pretrained(
            "google/pix2struct-docvqa-large"
        )
        answers = []
        for page in doc:
            # Convert page to an image.
            # NOTE(review): "RGB" assumes a 3-channel pixmap without alpha -
            # confirm this matches get_pixmap()'s defaults.
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            # Process the image for Q&A
            inputs = processor(images=img, text=question, return_tensors="pt")
            outputs = model.generate(**inputs)
            answers.append(processor.decode(outputs[0], skip_special_tokens=True))
    finally:
        # Release the document even if model loading or inference fails.
        doc.close()
    return answers
| # Streamlit app | |
# Page layout: upload a PDF, show its text, optionally summarize it (with
# audio playback), and answer free-form questions about each page.
st.title("PDF Analysis Tool: Text, Summarization, and Q&A")
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    text = extract_text_from_pdf(uploaded_file)
    st.subheader("Extracted Text")
    st.write(text)

    # Generate and display discussion points
    st.subheader("Generated Discussion Points")
    if st.button("Generate Discussion Points"):
        discussion_points = generate_discussion_points(text)
        st.write(discussion_points)
        text_to_speech(discussion_points)
        # Play the audio; the context manager closes the file handle, which
        # the previous bare open() never did.
        with open("discussion_points.mp3", "rb") as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format="audio/mp3")

    # Q&A Section
    st.subheader("Document Question Answering")
    question = st.text_input("Ask a question about the document:")
    if question:
        # Text extraction above has already read the upload; rewind it so the
        # Q&A step receives the full PDF bytes rather than an empty stream.
        uploaded_file.seek(0)
        answers = answer_questions(uploaded_file, question)
        st.write("Answers:")
        for page_num, answer in enumerate(answers, 1):
            st.write(f"Page {page_num}: {answer}")