"""Build a knowledge graph from a document corpus and serve an interactive pyvis view via Gradio."""
import logging
import os
import random
from pathlib import Path

import gradio as gr
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pyvis.network import Network

from helpers.df_helpers import documents2Dataframe, df2Graph, graph2Df
# Constants
CHUNK_SIZE = 1500  # characters per chunk fed to the text splitter
CHUNK_OVERLAP = 150  # characters shared between consecutive chunks
WEIGHT_MULTIPLIER = 4  # base weight for extracted edges; also divisor when scaling pyvis edge weights
COLOR_PALETTE = "hls"  # seaborn palette used for community colors
GRAPH_OUTPUT_DIRECTORY = "./docs/index.html"  # NOTE(review): defined but never used in this file — confirm
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def colors2Community(communities) -> pd.DataFrame:
    """Assign one distinct color to every node, grouped by community.

    Fixes a bug in the original: ``zip(community, palette)`` paired each node
    *within* a community with a different palette entry and silently dropped
    any nodes beyond ``len(palette)``. Intended behavior is one color per
    community, shared by all of that community's nodes.

    Args:
        communities: Sequence of node collections, one per community.

    Returns:
        DataFrame with columns ``node``, ``color``, ``group`` (1-based).
    """
    # One palette entry per community; shuffle so neighboring groups differ visually.
    palette = sns.color_palette(COLOR_PALETTE, len(communities)).as_hex()
    random.shuffle(palette)
    rows = [{"node": node, "color": palette[group], "group": group + 1}
            for group, community in enumerate(communities)
            for node in community]
    return pd.DataFrame(rows)
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Derive 'contextual proximity' edges between nodes sharing a chunk.

    Two nodes that appear in the same ``chunk_id`` more than once are linked.
    Returns a DataFrame with columns node_1, node_2, chunk_id, count, edge.
    """
    # Long form: one row per (chunk_id, node) occurrence.
    long_form = pd.melt(
        df,
        id_vars=["chunk_id"],
        value_vars=["node_1", "node_2"],
        value_name="node",
    ).drop(columns=["variable"])
    # Self-join within each chunk yields every ordered node pair.
    paired = pd.merge(long_form, long_form, on="chunk_id", suffixes=("_1", "_2"))
    paired = paired[paired["node_1"] != paired["node_2"]].reset_index(drop=True)
    # Collapse duplicate pairs: concatenate chunk ids and count co-occurrences.
    grouped = (
        paired.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    grouped.columns = ["node_1", "node_2", "chunk_id", "count"]
    grouped.dropna(subset=["node_1", "node_2"], inplace=True)
    # Pairs seen only once are noise, not proximity.
    grouped = grouped[grouped["count"] != 1]
    grouped["edge"] = "contextual proximity"
    return grouped
def load_documents(input_dir):
    """Load every document found under *input_dir*, showing a progress bar."""
    return DirectoryLoader(input_dir, show_progress=True).load()
def split_documents(documents):
    """Split *documents* into overlapping character chunks (CHUNK_SIZE / CHUNK_OVERLAP)."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)
def save_dataframes(df, dfg1, output_dir):
    """Persist the chunk and graph dataframes as '|'-separated CSVs.

    Improvements over the original: uses ``Path.mkdir`` instead of mixing
    ``os.makedirs`` with pathlib, and coerces *output_dir* so a plain string
    argument also works (the original's ``/`` operator required a Path).

    Args:
        df: Chunk dataframe, written to ``chunks.csv``.
        dfg1: Graph-edge dataframe, written to ``graph.csv``.
        output_dir: Target directory (str or Path); created if missing.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    dfg1.to_csv(output_dir / "graph.csv", sep="|", index=False)
    df.to_csv(output_dir / "chunks.csv", sep="|", index=False)
def load_dataframes(output_dir):
    """Read back the chunk and graph CSVs written by save_dataframes.

    Returns:
        Tuple ``(chunks_df, graph_df)``.
    """
    chunks = pd.read_csv(output_dir / "chunks.csv", sep="|")
    graph = pd.read_csv(output_dir / "graph.csv", sep="|")
    return chunks, graph
def build_graph(dfg):
    """Build an undirected networkx graph from the edge dataframe.

    Each row becomes an edge whose hover 'title' is the edge label and whose
    weight is ``count / WEIGHT_MULTIPLIER``.
    """
    graph = nx.Graph()
    graph.add_nodes_from(pd.concat([dfg["node_1"], dfg["node_2"]], axis=0).unique())
    for _, edge_row in dfg.iterrows():
        graph.add_edge(
            edge_row["node_1"],
            edge_row["node_2"],
            title=edge_row["edge"],
            weight=edge_row["count"] / WEIGHT_MULTIPLIER,
        )
    return graph
def visualize_graph(G, communities):
    """Color nodes by community, render with pyvis, and wrap the HTML in an iframe.

    Mutates *G* in place: each node gets group/color attributes and a size
    equal to its degree.
    """
    community_colors = colors2Community(communities)
    for _, node_row in community_colors.iterrows():
        G.nodes[node_row["node"]].update(
            group=node_row["group"],
            color=node_row["color"],
            size=G.degree[node_row["node"]],
        )
    net = Network(notebook=False, cdn_resources="remote", height="900px", width="100%", select_menu=True)
    net.from_nx(G)
    net.force_atlas_2based(central_gravity=0.015, gravity=-31)
    net.show_buttons(filter_=["physics"])
    # Single quotes would terminate the srcdoc attribute below, so swap them out.
    html = net.generate_html().replace("'", "\"")
    return f"""<iframe style="width: 100%; height: 600px; margin:0 auto"
name="result" allow="midi; geolocation; microphone; camera;
display-capture; encrypted-media;" sandbox="allow-modals allow-forms
allow-scripts allow-same-origin allow-popups
allow-top-navigation-by-user-activation allow-downloads" allowfullscreen
allowpaymentrequest frameborder="0" srcdoc='{html}'></iframe>"""
def process_pdfs(input_dir, output_dir, regenerate=False):
    """End-to-end pipeline: documents -> concept graph -> community-colored HTML.

    Args:
        input_dir: Directory of source documents.
        output_dir: Directory for cached chunk/graph CSVs.
        regenerate: When True, re-read the documents and re-extract the graph
            via the LLM; otherwise load the cached CSVs.

    Returns:
        An HTML iframe string embedding the interactive graph.
    """
    if regenerate:
        docs = load_documents(input_dir)
        pages = split_documents(docs)
        df = documents2Dataframe(pages)
        concepts = df2Graph(df, model='zephyr:latest')
        dfg1 = graph2Df(concepts)
        save_dataframes(df, dfg1, output_dir)
    else:
        df, dfg1 = load_dataframes(output_dir)
    # Drop incomplete edges and give every extracted edge a base weight.
    dfg1.replace("", np.nan, inplace=True)
    dfg1.dropna(subset=["node_1", "node_2", "edge"], inplace=True)
    dfg1["count"] = WEIGHT_MULTIPLIER
    # Merge LLM-extracted edges with chunk co-occurrence edges.
    dfg2 = contextual_proximity(dfg1)
    dfg = (
        pd.concat([dfg1, dfg2], axis=0)
        .groupby(["node_1", "node_2"])
        .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
        .reset_index()
    )
    G = build_graph(dfg)
    # Advance Girvan-Newman twice: use the second-level community split.
    communities_generator = nx.community.girvan_newman(G)
    next(communities_generator)
    second_level = next(communities_generator)
    communities = sorted(map(sorted, second_level))
    logger.info(f"Number of Communities = {len(communities)}")
    logger.info(communities)
    return visualize_graph(G, communities)
def main():
    """Build the knowledge graph for the 'cureus' corpus and serve it via Gradio."""
    data_dir = "cureus"
    input_dir = Path(f"./data_input/{data_dir}")
    output_dir = Path(f"./data_output/{data_dir}")
    html = process_pdfs(input_dir, output_dir, regenerate=False)
    app = gr.Interface(
        fn=lambda: html,
        inputs=None,
        outputs=gr.HTML(),
        title="Text to knowledge graph",
        allow_flagging='never',
    )
    app.launch()


if __name__ == "__main__":
    main()