File size: 1,307 Bytes
119e740
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import gradio as gr
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor


def convert_pdf_to_text(pdf_file):
    """Extract the text of a single PDF, wrapped in ``---`` delimiter lines.

    Args:
        pdf_file: Object exposing a ``name`` attribute that holds the path
            of the PDF on disk.

    Returns:
        A string consisting of a ``---`` line, a header naming the file,
        the concatenated text of every page, and a closing ``---`` line.

    Raises:
        ValueError: If the file name does not end with ``.pdf``
            (compared case-insensitively).
    """
    path = pdf_file.name
    print(path)
    # Compare case-insensitively so names such as "REPORT.PDF" are accepted.
    if not path.lower().endswith(".pdf"):
        raise ValueError("Invalid file format. Please upload PDF files only.")

    with open(path, "rb") as file:
        pdf_reader = PdfReader(file)
        # `or ""` keeps the join safe if a page yields no text (None).
        body = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    return f"\n---\nfile name: {path}\n content: \n{body}\n---\n"


def pdf_to_text(pdf_files):
    """Convert several PDFs to text in parallel and join the results.

    Args:
        pdf_files: Iterable of file objects accepted by
            ``convert_pdf_to_text``; may be ``None`` or empty.

    Returns:
        The per-file texts joined with newlines, in the same order as
        ``pdf_files``. An empty string when there is nothing to convert.

    Raises:
        ValueError: Propagated from ``convert_pdf_to_text`` when a file
            does not have a PDF extension.
    """
    # Nothing to convert: covers an empty list as well as None (no upload),
    # which would otherwise fail inside executor.map with a TypeError.
    if not pdf_files:
        return ""

    # Threads let the per-file reads overlap; map() preserves input order.
    with ThreadPoolExecutor() as executor:
        return "\n".join(executor.map(convert_pdf_to_text, pdf_files))


# Gradio UI definition: wires the multi-file upload widget to pdf_to_text and
# shows the returned string in a text output.
iface = gr.Interface(
    fn=pdf_to_text,
    # file_count="multiple" means pdf_to_text receives a collection of files;
    # convert_pdf_to_text reads each one's `.name` attribute as a path.
    # NOTE(review): `gr.inputs` appears to be the legacy component namespace
    # in newer Gradio releases — confirm against the pinned Gradio version
    # before upgrading.
    inputs=gr.inputs.File(
        type="file", label="Upload a PDF file", file_count="multiple"),
    outputs="text",
    title="PDF to Text Converter",
    description="Upload PDF files and get their content in text format.",
)

# Start the web app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()