# File size: 6,030 Bytes
# 0f922c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import streamlit as st
from PIL import Image
import fitz  # PyMuPDF
import numpy as np
import tempfile
import os
import time
import io
import json
import torch
import cv2

# Import OCR engines
import ocr_engines

# The LLM helper module is optional: record whether it could be imported so
# the rest of the script can switch its LLM features on or off.
try:
    import llm_processor
except ImportError:
    llm_available = False  # LLM features will be disabled
else:
    llm_available = True

# Create results folder if it doesn't exist. exist_ok=True makes this a
# single call and avoids the race between os.path.exists() and os.makedirs().
os.makedirs("results", exist_ok=True)

# Streamlit application
st.title("OCRInsight")

# Sidebar
st.sidebar.header("Settings")


# Function to save text to file
def save_text_to_file(attributes_of_output, all_ocr_text, filename):
    """Append one delimited record to ``filename`` and report success in the UI.

    The record is framed by separator lines of 75 dashes and contains two
    labelled sections: "Attributes of Output" followed by "OCR Result".

    Args:
        attributes_of_output: Text written under the attributes heading.
        all_ocr_text: Text written under the OCR result heading.
        filename: Path of the file to append to (opened as UTF-8).
    """
    separator = "\n" + "-" * 75 + "\n"
    pieces = (
        separator,
        "Attributes of Output:\n",
        attributes_of_output,
        "\nOCR Result:\n",
        all_ocr_text,
        separator,
    )
    # Append mode: records already present in the file are kept.
    with open(filename, "a", encoding="utf-8") as out_file:
        out_file.writelines(pieces)
    st.success(f"{filename} saved successfully!")


# Device selection and whether results should also be written to disk
device = st.sidebar.radio("Select Device", ["CPU", "GPU (CUDA)"])
save_output = st.sidebar.checkbox("Save Outputs")

# Display name -> language code passed on to the OCR engines
language_codes = {
    "Türkçe": "tr",
    "English": "en",
    "Français": "fr",
    "Deutsch": "de",
    "Español": "es",
}

# Language selection (options are the display names, in declaration order)
language = st.sidebar.selectbox("Select Language", list(language_codes))

# OCR model selection
available_ocr_models = ["EasyOCR", "DocTR", "Tesseract", "PaddleOCR"]
default_ocr_models = ["EasyOCR"]
ocr_models = st.sidebar.multiselect(
    "Select OCR Models", available_ocr_models, default_ocr_models
)

# LLM model selection
llm_choices = ["Only OCR Mode", "llama3.1", "llama3", "gemma2"]
llm_model = st.sidebar.selectbox("Select LLM Model", llm_choices)

# Extra controls appear only when an LLM is chosen; if the LLM module could
# not be imported, warn and fall back to OCR-only mode instead.
if llm_model != "Only OCR Mode":
    if llm_available:
        user_command = st.sidebar.text_input("Enter command:", "")

        task_type = st.sidebar.radio("Select task type:", ["Summarize", "Generate"])
    else:
        st.sidebar.warning(
            "LLM features are not available. Please install 'ollama' to enable LLM processing."
        )
        llm_model = "Only OCR Mode"

# Fall back to CPU when CUDA was requested but torch cannot see a GPU
if device == "GPU (CUDA)":
    if not torch.cuda.is_available():
        st.sidebar.warning("GPU (CUDA) not available. Switching to CPU.")
        device = "CPU"

# Initialize OCR models for the chosen language and device
selected_language_code = language_codes[language]
ocr_readers = ocr_engines.initialize_ocr_models(
    ocr_models, selected_language_code, device
)

# File upload
accepted_file_types = ["pdf", "png", "jpg", "jpeg"]
uploaded_file = st.file_uploader(
    "Upload File (PDF, Image)", type=accepted_file_types
)

# Create results folder if it doesn't exist. exist_ok=True makes the call a
# no-op when the folder is already present and avoids the race between
# os.path.exists() and os.makedirs().
os.makedirs("results", exist_ok=True)

# Main flow: turn the upload into page images, run every selected OCR model
# on each page, optionally save the results, and optionally post-process the
# combined text with the selected LLM.
if uploaded_file is not None:
    start_time = time.time()

    # Convert the upload into a list of PIL images, one per page.
    if uploaded_file.type == "application/pdf":
        # The context manager closes the document even if rendering a page
        # raises, so the PDF handle is not leaked.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as pdf_document:
            images = [
                Image.open(io.BytesIO(page.get_pixmap().tobytes("png")))
                for page in pdf_document
            ]
    else:
        images = [Image.open(uploaded_file)]
    total_pages = len(images)

    # Collect per-page chunks per model and join once at the end instead of
    # growing a string with += on every page.
    page_chunks = {model_name: [] for model_name in ocr_models}
    language_code = language_codes[language]  # invariant across pages/models

    for page_num, image in enumerate(images, start=1):
        st.image(image, caption=f"Page {page_num}/{total_pages}", use_column_width=True)

        # Perform OCR with each selected model
        for model_name in ocr_models:
            text = ocr_engines.perform_ocr(
                model_name, ocr_readers, image, language_code
            )
            page_chunks[model_name].append(
                f"--- Page {page_num} ({model_name}) ---\n{text}\n\n"
            )

            st.subheader(f"OCR Result ({model_name}) - Page {page_num}/{total_pages}:")
            st.text(text)

    # OCR text for each model, all pages concatenated in page order
    all_ocr_texts = {
        model_name: "".join(chunks) for model_name, chunks in page_chunks.items()
    }

    end_time = time.time()
    process_time = end_time - start_time

    st.info(f"Processing time: {process_time:.2f} seconds")

    # Save OCR outputs if selected
    if save_output:
        attributes_of_output = {
            "Model Names": ocr_models,
            "Language": language,
            "Device": device,
            "Process Time": process_time,
        }
        # Same metadata for every model, so serialize it once.
        attributes_json = json.dumps(attributes_of_output, ensure_ascii=False)
        for model_name, ocr_text in all_ocr_texts.items():
            # os.path.join builds a clean, platform-appropriate path instead
            # of a hand-written "results//..." string.
            filename = os.path.join("results", f"ocr_output_{model_name}.txt")
            save_text_to_file(attributes_json, ocr_text, filename)

    # LLM processing (runs only on the rerun triggered by the sidebar button)
    if (
        llm_model != "Only OCR Mode"
        and llm_available
        and st.sidebar.button("Start LLM Processing")
    ):
        st.subheader("LLM Processing Result:")

        # Combine all OCR texts
        combined_ocr_text = "\n".join(all_ocr_texts.values())

        # Prepare the prompt based on the task type
        if task_type == "Summarize":
            prompt = f"Please summarize the following text. Command: {user_command}\n\nText: {combined_ocr_text}"
        else:  # "Generate"
            prompt = f"Please generate new text based on the following text. Command: {user_command}\n\nText: {combined_ocr_text}"

        llm_output = llm_processor.process_with_llm(llm_model, prompt)

        # Display the result
        st.write(f"Processing completed using '{llm_model}' model.")
        st.text_area("LLM Output:", value=llm_output, height=300)

        # Save LLM output if selected
        if save_output:
            # NOTE(review): this file is written to the working directory,
            # not the "results" folder used for OCR outputs - confirm intent.
            filename = "llm_output.txt"
            save_text_to_file(llm_output, "", filename)

# The former trailing
# `elif llm_model != "Only OCR Mode" and not llm_available:` warning branch
# was unreachable: the sidebar setup already resets llm_model to
# "Only OCR Mode" (and warns) whenever the LLM module is unavailable.

st.sidebar.info(f"Selected device: {device}")