Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import fitz # PyMuPDF for reading PDFs
|
| 3 |
import numpy as np
|
| 4 |
-
from bokeh.plotting import figure, output_file, save
|
| 5 |
-
from bokeh.models import HoverTool, ColumnDataSource
|
| 6 |
-
import umap
|
| 7 |
import pandas as pd
|
| 8 |
-
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
|
| 9 |
-
from sentence_transformers import SentenceTransformer
|
| 10 |
-
import tempfile
|
| 11 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# Set up logging
|
| 14 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
@@ -19,74 +18,76 @@ logging.info("Model loaded successfully.")
|
|
| 19 |
|
| 20 |
def process_pdf(pdf_path):
|
| 21 |
logging.info(f"Processing PDF: {pdf_path}")
|
| 22 |
-
# Open the PDF
|
| 23 |
doc = fitz.open(pdf_path)
|
| 24 |
texts = [page.get_text() for page in doc]
|
| 25 |
-
|
| 26 |
return " ".join(texts)
|
| 27 |
|
| 28 |
def create_embeddings(text):
|
| 29 |
-
|
| 30 |
sentences = text.split(". ") # A simple split; consider a more robust sentence splitter
|
| 31 |
embeddings = model.encode(sentences)
|
| 32 |
-
|
| 33 |
return embeddings, sentences
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
def generate_plotly_figure(query, pdf_file):
|
| 39 |
logging.info("Generating plot with Plotly.")
|
| 40 |
-
# Generate embeddings for the query
|
| 41 |
query_embedding = model.encode([query])[0]
|
| 42 |
-
|
| 43 |
-
# Process the PDF and create embeddings
|
| 44 |
text = process_pdf(pdf_file.name)
|
| 45 |
embeddings, sentences = create_embeddings(text)
|
| 46 |
-
|
| 47 |
-
logging.info("Data prepared for UMAP.")
|
| 48 |
-
# Prepare the data for UMAP and visualization
|
| 49 |
all_embeddings = np.vstack([embeddings, query_embedding])
|
| 50 |
all_sentences = sentences + [query]
|
| 51 |
-
|
| 52 |
-
# UMAP transformation
|
| 53 |
umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
|
| 54 |
umap_embeddings = umap_transform.fit_transform(all_embeddings)
|
| 55 |
|
| 56 |
-
|
| 57 |
-
#
|
| 58 |
-
distances = cosine_similarity([query_embedding], embeddings)[0]
|
| 59 |
-
closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed
|
| 60 |
|
| 61 |
-
|
| 62 |
-
colors
|
| 63 |
-
colors.append('red') # Query point in red
|
| 64 |
|
| 65 |
-
# Add the scatter plot for sentences and query
|
| 66 |
fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
|
| 67 |
marker=dict(color=colors[:-1]), text=all_sentences[:-1],
|
| 68 |
name='Sentences'))
|
| 69 |
-
|
| 70 |
-
# Add the scatter plot for the query point
|
| 71 |
fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
|
| 72 |
marker=dict(color='red'), text=[query], name='Query'))
|
| 73 |
-
|
| 74 |
-
fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
|
| 75 |
|
| 76 |
logging.info("Plotly figure created successfully.")
|
| 77 |
return fig
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
| 81 |
logging.info("Returning Plotly figure.")
|
| 82 |
return fig
|
|
|
|
| 83 |
iface = gr.Interface(
|
| 84 |
fn=gradio_interface,
|
| 85 |
-
inputs=[
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
title="PDF Content Visualizer",
|
| 88 |
-
description="
|
|
|
|
|
|
|
|
|
|
| 89 |
)
|
| 90 |
|
| 91 |
if __name__ == "__main__":
|
| 92 |
-
iface.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import fitz # PyMuPDF for reading PDFs
|
| 3 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
| 4 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
| 5 |
import logging
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
|
| 8 |
+
from sklearn.metrics.pairwise import linear_kernel as dot_similarity # For dot product
|
| 9 |
+
import umap
|
| 10 |
+
import plotly.graph_objects as go
|
| 11 |
|
| 12 |
# Set up logging
|
| 13 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
| 18 |
|
| 19 |
def process_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages, in page order, joined by single spaces.
    """
    logging.info(f"Processing PDF: {pdf_path}")
    # Open the document as a context manager so it is closed even when text
    # extraction raises; previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        texts = [page.get_text() for page in doc]
    logging.info("PDF processed successfully.")
    return " ".join(texts)
|
| 25 |
|
| 26 |
def create_embeddings(text):
    """Split ``text`` into sentence-like pieces and embed each piece.

    Args:
        text: The full document text.

    Returns:
        A ``(embeddings, sentences)`` tuple: the result of ``model.encode``
        on the pieces, and the list of pieces themselves, in the same order.
    """
    logging.info("Creating embeddings.")
    # Naive splitter on ". "; abbreviations and line breaks are not handled.
    # NOTE(review): `model` is defined outside the visible part of the file,
    # presumably a SentenceTransformer -- confirm.
    pieces = text.split(". ")
    vectors = model.encode(pieces)
    logging.info("Embeddings created successfully.")
    return vectors, pieces
|
| 32 |
|
| 33 |
+
def calculate_distances(embeddings, query_embedding, metric):
    """Return the distance from each sentence embedding to the query.

    For every metric a smaller value means "closer to the query", so callers
    can rank with a plain ascending sort.

    Args:
        embeddings: One row per sentence embedding.
        query_embedding: The query vector; it is wrapped into a one-row list
            before being handed to the pairwise function.
        metric: One of ``"cosine"``, ``"euclidean"``, ``"manhattan"`` or
            ``"dot"``.

    Returns:
        A flattened 1-D array holding one distance per row of ``embeddings``.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    query_row = [query_embedding]
    if metric == "cosine":
        # Turn similarity into a distance.
        distances = 1 - cosine_similarity(embeddings, query_row)
    elif metric == "euclidean":
        distances = euclidean_distances(embeddings, query_row)
    elif metric == "manhattan":
        distances = manhattan_distances(embeddings, query_row)
    elif metric == "dot":
        # Negated for consistency with other metrics: a larger dot product
        # becomes a smaller "distance".
        distances = -dot_similarity(embeddings, query_row)
    else:
        # Without this branch an unknown metric left `distances` unbound and
        # surfaced as an UnboundLocalError on the return line.
        raise ValueError(
            f"Unsupported distance metric: {metric!r}. "
            "Expected one of 'cosine', 'euclidean', 'manhattan', 'dot'."
        )
    return distances.flatten()
|
| 43 |
|
| 44 |
+
def generate_plotly_figure(query, pdf_file, metric, top_k=5):
    """Build a 2-D UMAP scatter plot of a PDF's sentences and the query.

    Sentences are embedded together with the query, projected to two
    dimensions with UMAP, and drawn as a Plotly scatter. The ``top_k``
    sentences closest to the query under ``metric`` are drawn in green, the
    remaining sentences in blue, and the query itself in red.

    Args:
        query: The query text.
        pdf_file: The uploaded PDF, either an object exposing the file path
            as ``.name`` or a plain path string.
        metric: Distance metric name forwarded to ``calculate_distances``.
        top_k: How many closest sentences to highlight. Defaults to 5.

    Returns:
        A ``plotly.graph_objects.Figure`` with a "Sentences" trace and a
        "Query" trace.

    Raises:
        ValueError: If ``pdf_file`` is ``None`` (nothing was uploaded).
    """
    # Fail fast with a readable message instead of an AttributeError on
    # `None.name` further down.
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    logging.info("Generating plot with Plotly.")
    query_embedding = model.encode([query])[0]

    # Accept both an upload object (path in `.name`) and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = process_pdf(pdf_path)
    embeddings, sentences = create_embeddings(text)

    # Project sentences and query together so they share one 2-D space; the
    # query is stacked last and therefore ends up as the final row.
    all_embeddings = np.vstack([embeddings, query_embedding])
    umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
    umap_embeddings = umap_transform.fit_transform(all_embeddings)

    # Rank in the original embedding space, not in the 2-D projection.
    distances = calculate_distances(embeddings, query_embedding, metric)
    closest_indices = set(np.argsort(distances)[:top_k].tolist())
    colors = ['green' if i in closest_indices else 'blue' for i in range(len(sentences))]

    fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
                                    marker=dict(color=colors), text=sentences,
                                    name='Sentences'))
    fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
                             marker=dict(color='red'), text=[query], name='Query'))
    fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")

    logging.info("Plotly figure created successfully.")
    return fig
|
| 70 |
+
|
| 71 |
+
def gradio_interface(pdf_file, query, metric):
    """Gradio callback: plot the uploaded PDF's sentences against the query.

    Args:
        pdf_file: The uploaded PDF as delivered by the file input.
        query: The query text.
        metric: Selected distance metric name, or ``None`` when no option
            was selected; ``None`` falls back to ``"cosine"``.

    Returns:
        The Plotly figure produced by ``generate_plotly_figure``.
    """
    # With no radio option selected the metric arrives as None; the previous
    # string concatenation then raised TypeError before any work was done.
    if metric is None:
        metric = "cosine"
    logging.info("Gradio interface called with metric: %s", metric)
    fig = generate_plotly_figure(query, pdf_file, metric)
    logging.info("Returning Plotly figure.")
    return fig
|
| 76 |
+
|
| 77 |
# Gradio UI: a PDF upload, a free-text query and a metric selector feed
# `gradio_interface`, whose Plotly figure is rendered in a plot output.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Query"),
        # Preselect a metric so a submit without touching the radio does not
        # send None to the callback.
        gr.Radio(
            choices=["cosine", "euclidean", "manhattan", "dot"],
            value="cosine",
            label="Distance Metric",
        ),
    ],
    outputs=gr.Plot(),
    title="PDF Content Visualizer",
    description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
to explore different aspects of textual similarity."""
)

if __name__ == "__main__":
    iface.launch()
|