Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -32,53 +32,6 @@ def create_embeddings(text):
|
|
| 32 |
print("Embeddings created successfully.")
|
| 33 |
return embeddings, sentences
|
| 34 |
|
| 35 |
-
def generate_plot_bokeh(query, pdf_file):
|
| 36 |
-
logging.info("Generating plot.")
|
| 37 |
-
# Generate embeddings for the query
|
| 38 |
-
query_embedding = model.encode([query])[0]
|
| 39 |
-
|
| 40 |
-
# Process the PDF and create embeddings
|
| 41 |
-
text = process_pdf(pdf_file.name)
|
| 42 |
-
embeddings, sentences = create_embeddings(text)
|
| 43 |
-
|
| 44 |
-
logging.info("Data prepared for UMAP.")
|
| 45 |
-
# Prepare the data for UMAP and visualization
|
| 46 |
-
all_embeddings = np.vstack([embeddings, query_embedding])
|
| 47 |
-
all_sentences = sentences + [query]
|
| 48 |
-
|
| 49 |
-
# UMAP transformation
|
| 50 |
-
umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
|
| 51 |
-
umap_embeddings = umap_transform.fit_transform(all_embeddings)
|
| 52 |
-
|
| 53 |
-
logging.info("UMAP transformation completed.")
|
| 54 |
-
# Find the closest sentences to the query
|
| 55 |
-
distances = cosine_similarity([query_embedding], embeddings)[0]
|
| 56 |
-
closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed
|
| 57 |
-
|
| 58 |
-
# Prepare data for plotting
|
| 59 |
-
data = {
|
| 60 |
-
'x': umap_embeddings[:-1, 0], # Exclude the query point itself
|
| 61 |
-
'y': umap_embeddings[:-1, 1], # Exclude the query point itself
|
| 62 |
-
'content': all_sentences[:-1], # Exclude the query sentence itself
|
| 63 |
-
'color': ['red' if i in closest_indices else 'blue' for i in range(len(sentences))],
|
| 64 |
-
}
|
| 65 |
-
source = ColumnDataSource(data)
|
| 66 |
-
|
| 67 |
-
# Create the Bokeh plot
|
| 68 |
-
p = figure(title="UMAP Projection of Sentences", width=700, height=700)
|
| 69 |
-
p.scatter('x', 'y', color='color', source=source)
|
| 70 |
-
|
| 71 |
-
hover = HoverTool(tooltips=[("Content", "@content")])
|
| 72 |
-
p.add_tools(hover)
|
| 73 |
-
|
| 74 |
-
logging.info("Plot created successfully.")
|
| 75 |
-
# Save the plot to an HTML file
|
| 76 |
-
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".html")
|
| 77 |
-
logging.info(f"temp file is {temp_file.name}")
|
| 78 |
-
output_file(temp_file.name)
|
| 79 |
-
save(p)
|
| 80 |
-
logging.info("Plot saved to file.")
|
| 81 |
-
return temp_file.name
|
| 82 |
import plotly.express as px
|
| 83 |
import plotly.graph_objects as go
|
| 84 |
|
|
@@ -106,16 +59,22 @@ def generate_plotly_figure(query, pdf_file):
|
|
| 106 |
closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed
|
| 107 |
|
| 108 |
# Prepare data for plotting
|
| 109 |
-
colors = ['
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
|
| 115 |
|
| 116 |
logging.info("Plotly figure created successfully.")
|
| 117 |
return fig
|
| 118 |
-
|
| 119 |
def gradio_interface(pdf_file, query):
|
| 120 |
logging.info("Gradio interface called.")
|
| 121 |
fig = generate_plotly_figure(query, pdf_file)
|
|
|
|
| 32 |
print("Embeddings created successfully.")
|
| 33 |
return embeddings, sentences
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
import plotly.express as px
|
| 36 |
import plotly.graph_objects as go
|
| 37 |
|
|
|
|
| 59 |
closest_indices = distances.argsort()[-5:][::-1] # Adjust the number as needed
|
| 60 |
|
| 61 |
# Prepare data for plotting
|
| 62 |
+
colors = ['green' if i in closest_indices else 'blue' for i in range(len(sentences))] # Target points in green
|
| 63 |
+
colors.append('red') # Query point in red
|
| 64 |
+
|
| 65 |
+
# Add the scatter plot for sentences and query
|
| 66 |
+
fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
|
| 67 |
+
marker=dict(color=colors[:-1]), text=all_sentences[:-1],
|
| 68 |
+
name='Sentences'))
|
| 69 |
+
|
| 70 |
+
# Add the scatter plot for the query point
|
| 71 |
+
fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
|
| 72 |
+
marker=dict(color='red'), text=[query], name='Query'))
|
| 73 |
|
| 74 |
fig.update_layout(title="UMAP Projection of Sentences", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
|
| 75 |
|
| 76 |
logging.info("Plotly figure created successfully.")
|
| 77 |
return fig
|
|
|
|
| 78 |
def gradio_interface(pdf_file, query):
|
| 79 |
logging.info("Gradio interface called.")
|
| 80 |
fig = generate_plotly_figure(query, pdf_file)
|