File size: 10,949 Bytes
acd7cf4
 
f1eedd1
acd7cf4
 
 
d02622b
43d27f2
f1eedd1
d2a63cc
acd7cf4
fb9c306
f1eedd1
acd7cf4
3a3b216
acd7cf4
 
3a3b216
bccd595
3a3b216
283e483
799ac7c
d02622b
acd7cf4
799ac7c
acd7cf4
3a3b216
fb9c306
 
0b9d8c7
acd7cf4
 
 
fb9c306
acd7cf4
d02622b
 
 
 
 
 
 
 
 
 
 
 
 
 
817f16e
fb9c306
817f16e
d02622b
 
fb9c306
d02622b
fb9c306
f1eedd1
 
 
acd7cf4
 
 
0b9d8c7
 
acd7cf4
 
 
 
 
 
 
f1eedd1
 
 
f29e862
 
 
 
fb9c306
817f16e
bda6eda
acd7cf4
283e483
 
 
 
acd7cf4
d02622b
 
 
f1eedd1
3a3b216
f1eedd1
3a3b216
f1eedd1
3a3b216
f29e862
acd7cf4
f29e862
 
 
0b9d8c7
f29e862
 
 
 
acd7cf4
f29e862
bccd595
f29e862
 
283e483
 
 
9e67c3b
 
283e483
 
 
 
 
 
 
 
9e67c3b
bccd595
 
 
 
 
 
 
 
283e483
bccd595
 
9e67c3b
bccd595
 
 
 
 
 
 
 
9e67c3b
 
 
 
f1eedd1
283e483
f1eedd1
 
 
 
 
 
9e67c3b
f1eedd1
 
 
bccd595
f1eedd1
 
bccd595
 
 
 
 
 
 
 
 
 
f1eedd1
9e67c3b
 
 
acd7cf4
f1eedd1
acd7cf4
e25b548
3a3b216
817f16e
e25b548
 
9e67c3b
e25b548
 
 
 
 
5f219fc
e25b548
 
9e67c3b
e25b548
 
 
 
 
 
9e67c3b
 
 
 
fb9c306
f1eedd1
3a3b216
817f16e
f1eedd1
 
 
 
817f16e
fb9c306
 
 
 
 
8e67692
fb9c306
acd7cf4
817f16e
d02622b
 
 
 
 
 
817f16e
fb9c306
 
 
 
 
1189434
fb9c306
d02622b
9e67c3b
 
acd7cf4
d02622b
 
 
 
 
 
f1eedd1
3a3b216
f1eedd1
799ac7c
0b9d8c7
 
 
 
799ac7c
9e67c3b
f1eedd1
 
283e483
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e67c3b
 
 
 
283e483
f1eedd1
 
 
 
 
 
 
 
fb9c306
799ac7c
 
2a0edfe
 
 
 
fb9c306
 
799ac7c
 
 
 
 
9e67c3b
 
acd7cf4
3a3b216
 
9e67c3b
 
 
 
 
 
acd7cf4
 
f1eedd1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import os
import time
from typing import Dict

import gradio as gr

from graphgen.bases import BaseLLMWrapper
from graphgen.bases.datatypes import Chunk
from graphgen.engine import op
from graphgen.models import (
    JsonKVStorage,
    JsonListStorage,
    MetaJsonKVStorage,
    NetworkXStorage,
    OpenAIClient,
    Tokenizer,
)
from graphgen.operators import (
    build_kg,
    chunk_documents,
    extract_info,
    generate_qas,
    init_llm,
    judge_statement,
    partition_kg,
    quiz,
    read_files,
    search_all,
)
from graphgen.utils import async_to_sync_method, compute_mm_hash, logger

# Absolute path of the parent of the directory that contains this file.
# It is the base of the default ``working_dir`` (``<sys_path>/cache``).
sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))


class GraphGen:
    def __init__(
        self,
        unique_id: int = None,
        working_dir: str = os.path.join(sys_path, "cache"),
        tokenizer_instance: Tokenizer = None,
        synthesizer_llm_client: OpenAIClient = None,
        trainee_llm_client: OpenAIClient = None,
        progress_bar: gr.Progress = None,
    ):
        """
        set up the LLM clients and every storage used by the pipeline

        :param unique_id: id of this run, used as the name of the output
            directory ``<working_dir>/data/graphgen/<unique_id>``; when omitted
            the current unix timestamp (in seconds) is used
        :param working_dir: root directory handed to all storages
        :param tokenizer_instance: tokenizer to use; when omitted one is built
            from the ``TOKENIZER_MODEL`` environment variable
        :param synthesizer_llm_client: synthesizer client; when omitted
            ``init_llm("synthesizer")`` is called
        :param trainee_llm_client: trainee client, stored as given (may be None)
        :param progress_bar: gradio progress object forwarded to the operators
        """
        # Resolve the id at call time. A ``int(time.time())`` default in the
        # signature is evaluated only once, when the ``def`` statement runs, so
        # every default-constructed instance would share the same id (and thus
        # the same output directory).
        if unique_id is None:
            unique_id = int(time.time())
        self.unique_id: int = unique_id
        self.working_dir: str = working_dir

        # llm
        self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer(
            model_name=os.getenv("TOKENIZER_MODEL")
        )

        self.synthesizer_llm_client: BaseLLMWrapper = (
            synthesizer_llm_client or init_llm("synthesizer")
        )
        self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client

        # storages that live directly under working_dir
        self.meta_storage: MetaJsonKVStorage = MetaJsonKVStorage(
            self.working_dir, namespace="_meta"
        )
        self.full_docs_storage: JsonKVStorage = JsonKVStorage(
            self.working_dir, namespace="full_docs"
        )
        self.chunks_storage: JsonKVStorage = JsonKVStorage(
            self.working_dir, namespace="chunks"
        )
        self.graph_storage: NetworkXStorage = NetworkXStorage(
            self.working_dir, namespace="graph"
        )
        self.rephrase_storage: JsonKVStorage = JsonKVStorage(
            self.working_dir, namespace="rephrase"
        )
        self.partition_storage: JsonListStorage = JsonListStorage(
            self.working_dir, namespace="partition"
        )

        # storages that live in the per-run output directory
        output_dir = os.path.join(
            self.working_dir, "data", "graphgen", f"{self.unique_id}"
        )
        self.search_storage: JsonKVStorage = JsonKVStorage(
            output_dir,
            namespace="search",
        )
        self.qa_storage: JsonListStorage = JsonListStorage(
            output_dir,
            namespace="qa",
        )
        self.extract_storage: JsonKVStorage = JsonKVStorage(
            output_dir,
            namespace="extraction",
        )

        # webui
        self.progress_bar: gr.Progress = progress_bar

    @op("read", deps=[])
    @async_to_sync_method
    async def read(self, read_config: Dict):
        """
        read files from input sources and store the documents that are new

        :param read_config: keyword arguments forwarded to ``read_files``;
            ``cache_dir`` is supplied by this method
        """
        doc_stream = read_files(**read_config, cache_dir=self.working_dir)

        # ids come from compute_mm_hash; a later document with the same id
        # replaces an earlier one inside this batch
        batch = {compute_mm_hash(doc, prefix="doc-"): doc for doc in doc_stream}

        # TODO: configurable whether to use coreference resolution

        # Filter BEFORE writing. As it is used throughout this class,
        # filter_keys yields the keys that are not stored yet; upserting the
        # whole batch first made that result empty, so the warning below fired
        # on every call even when new documents had just been added.
        _add_doc_keys = self.full_docs_storage.filter_keys(list(batch.keys()))
        new_docs = {k: v for k, v in batch.items() if k in _add_doc_keys}
        if not new_docs:
            logger.warning("All documents are already in the storage")
            return
        self.full_docs_storage.upsert(new_docs)
        self.full_docs_storage.index_done_callback()

    @op("chunk", deps=["read"])
    @async_to_sync_method
    async def chunk(self, chunk_config: Dict):
        """
        split the documents that meta storage reports as new into chunks and
        persist the chunks that pass the chunk storage key filter

        :param chunk_config: extra keyword arguments forwarded to
            ``chunk_documents``
        """
        pending_docs = self.meta_storage.get_new_data(self.full_docs_storage)
        if len(pending_docs) == 0:
            logger.warning("All documents are already in the storage")
            return

        produced = await chunk_documents(
            pending_docs,
            self.tokenizer_instance,
            self.progress_bar,
            **chunk_config,
        )

        # keep only the chunks whose keys are returned by the storage filter
        accepted_keys = self.chunks_storage.filter_keys(list(produced.keys()))
        fresh_chunks = {}
        for chunk_id, chunk_data in produced.items():
            if chunk_id in accepted_keys:
                fresh_chunks[chunk_id] = chunk_data

        if len(fresh_chunks) == 0:
            logger.warning("All chunks are already in the storage")
            return

        # persist the chunks first, then record the documents as done in meta
        chunk_store = self.chunks_storage
        chunk_store.upsert(fresh_chunks)
        chunk_store.index_done_callback()

        meta = self.meta_storage
        meta.mark_done(self.full_docs_storage)
        meta.index_done_callback()

    @op("build_kg", deps=["chunk"])
    @async_to_sync_method
    async def build_kg(self):
        """
        build the knowledge graph from the chunks that meta storage reports as new

        :return: whatever the ``build_kg`` operator returns when it is truthy;
            None when there are no new chunks or nothing was extracted
        """
        # 1) ask meta storage which chunks have not been handled yet
        pending_chunks = self.meta_storage.get_new_data(self.chunks_storage)
        if len(pending_chunks) == 0:
            logger.warning("All chunks are already in the storage")
            return

        logger.info("[New Chunks] inserting %d chunks", len(pending_chunks))

        # 2) run the operator on those chunks (the name resolves to the
        #    module-level ``build_kg`` import, not to this method)
        chunk_objects = [
            Chunk.from_dict(chunk_id, payload)
            for chunk_id, payload in pending_chunks.items()
        ]
        extracted = await build_kg(
            llm_client=self.synthesizer_llm_client,
            kg_instance=self.graph_storage,
            chunks=chunk_objects,
            progress_bar=self.progress_bar,
        )
        if not extracted:
            logger.warning("No entities or relations extracted from text chunks")
            return

        # 3) flush the graph, then record the chunks as done in meta
        self.graph_storage.index_done_callback()
        meta = self.meta_storage
        meta.mark_done(self.chunks_storage)
        meta.index_done_callback()

        return extracted

    @op("search", deps=["read"])
    @async_to_sync_method
    async def search(self, search_config: Dict):
        """
        run ``search_all`` on the documents that meta storage reports as new and
        persist the results that pass the search storage key filter

        :param search_config: search settings; ``data_sources`` is read here
            for logging and the whole dict is handed to ``search_all``
        """
        source_names = ", ".join(search_config["data_sources"])
        logger.info("[Search] %s ...", source_names)

        seed_docs = self.meta_storage.get_new_data(self.full_docs_storage)
        if len(seed_docs) == 0:
            logger.warning("All documents are already been searched")
            return

        found = await search_all(
            seed_data=seed_docs,
            search_config=search_config,
        )

        # keep only the results whose keys are returned by the storage filter
        accepted_keys = self.search_storage.filter_keys(list(found.keys()))
        fresh_results = {}
        for result_key, result_value in found.items():
            if result_key in accepted_keys:
                fresh_results[result_key] = result_value

        if len(fresh_results) == 0:
            logger.warning("All search results are already in the storage")
            return

        result_store = self.search_storage
        result_store.upsert(fresh_results)
        result_store.index_done_callback()

        # NOTE(review): chunk() marks the same full_docs_storage entry in meta;
        # confirm that running both ops on one instance is intended.
        meta = self.meta_storage
        meta.mark_done(self.full_docs_storage)
        meta.index_done_callback()

    @op("quiz_and_judge", deps=["build_kg"])
    @async_to_sync_method
    async def quiz_and_judge(self, quiz_and_judge_config: Dict):
        """
        run ``quiz`` with the synthesizer client, then ``judge_statement`` with
        the trainee client

        When no trainee client is set, the synthesizer client is shut down and a
        trainee client is created with ``init_llm("trainee")``. After judging,
        the trainee client is shut down, ``self.trainee_llm_client`` is reset to
        None and the synthesizer client is restarted.

        :param quiz_and_judge_config: must contain ``quiz_samples`` (passed to
            ``quiz``) and ``re_judge`` (passed to ``judge_statement``)
        """
        # This warning is emitted on every call, whether or not a trainee
        # client was provided.
        logger.warning(
            "Quiz and Judge operation needs trainee LLM client."
            " Make sure to provide one."
        )
        max_samples = quiz_and_judge_config["quiz_samples"]
        await quiz(
            self.synthesizer_llm_client,
            self.graph_storage,
            self.rephrase_storage,
            max_samples,
            progress_bar=self.progress_bar,
        )

        # TODO: assert trainee_llm_client is valid before judge
        if not self.trainee_llm_client:
            # TODO: shutdown existing synthesizer_llm_client properly
            logger.info("No trainee LLM client provided, initializing a new one.")
            # The synthesizer is shut down before the trainee is created.
            # NOTE(review): presumably so both models are not loaded at the
            # same time -- confirm.
            self.synthesizer_llm_client.shutdown()
            self.trainee_llm_client = init_llm("trainee")

        re_judge = quiz_and_judge_config["re_judge"]
        _update_relations = await judge_statement(
            self.trainee_llm_client,
            self.graph_storage,
            self.rephrase_storage,
            re_judge,
            progress_bar=self.progress_bar,
        )

        # Flush both the rephrase storage and the object returned by
        # judge_statement.
        self.rephrase_storage.index_done_callback()
        _update_relations.index_done_callback()

        logger.info("Shutting down trainee LLM client.")
        self.trainee_llm_client.shutdown()
        self.trainee_llm_client = None
        logger.info("Restarting synthesizer LLM client.")
        # NOTE(review): restart() runs unconditionally, i.e. also when a trainee
        # client was supplied and the synthesizer was never shut down above --
        # confirm this is intended. There is also no try/finally, so an
        # exception in judge_statement skips the shutdown/restart below it.
        self.synthesizer_llm_client.restart()

    @op("partition", deps=["build_kg"])
    @async_to_sync_method
    async def partition(self, partition_config: Dict):
        """
        partition the knowledge graph and keep the result in partition storage

        :param partition_config: settings handed to ``partition_kg`` unchanged
        :return: the batches produced by ``partition_kg``
        """
        partitioned = await partition_kg(
            self.graph_storage,
            self.chunks_storage,
            self.tokenizer_instance,
            partition_config,
        )
        # only upsert is called here; no index_done_callback follows
        self.partition_storage.upsert(partitioned)
        return partitioned

    @op("extract", deps=["chunk"])
    @async_to_sync_method
    async def extract(self, extract_config: Dict):
        """
        run ``extract_info`` over the chunk storage and persist what it returns

        :param extract_config: settings handed to ``extract_info`` unchanged
        """
        logger.info("Extracting information from given chunks...")

        extracted = await extract_info(
            self.synthesizer_llm_client,
            self.chunks_storage,
            extract_config,
            progress_bar=self.progress_bar,
        )
        if not extracted:
            logger.warning("No information extracted")
            return

        target = self.extract_storage
        target.upsert(extracted)
        target.index_done_callback()

        # NOTE(review): build_kg() marks the same chunks_storage entry in meta;
        # confirm that running both ops on one instance is intended.
        meta = self.meta_storage
        meta.mark_done(self.chunks_storage)
        meta.index_done_callback()

    @op("generate", deps=["partition"])
    @async_to_sync_method
    async def generate(self, generate_config: Dict):
        """
        generate QA pairs from the stored partitions and persist them

        :param generate_config: settings handed to ``generate_qas`` unchanged
        """
        # 1) take the partitions currently held by partition storage
        partitions = self.partition_storage.data
        if not partitions:
            logger.warning("No partitions found for QA generation")
            return

        # 2) let the synthesizer produce QA pairs for them
        qa_pairs = await generate_qas(
            self.synthesizer_llm_client,
            partitions,
            generate_config,
            progress_bar=self.progress_bar,
        )
        if not qa_pairs:
            logger.warning("No QA pairs generated")
            return

        # 3) write the QA pairs to the output storage
        qa_store = self.qa_storage
        qa_store.upsert(qa_pairs)
        qa_store.index_done_callback()

    @async_to_sync_method
    async def clear(self):
        """
        empty the document, chunk, search, graph, rephrase, partition, QA and
        extraction storages of this instance
        """
        self.full_docs_storage.drop()
        self.chunks_storage.drop()
        self.search_storage.drop()
        self.graph_storage.clear()
        self.rephrase_storage.drop()
        # Partitions must go too: generate() reads partition_storage.data, so
        # leftovers would be turned into QA pairs for a graph that no longer
        # exists.
        self.partition_storage.drop()
        self.qa_storage.drop()
        # Extraction results sit next to the search/QA outputs and are cleared
        # with them so the "all caches" message below holds.
        self.extract_storage.drop()

        # NOTE(review): meta_storage is not reset here. If its done-markers
        # outlive the dropped data, re-reading the same inputs may be reported
        # as "already in the storage" -- confirm and reset it if so.
        logger.info("All caches are cleared")

    # TODO: add data filtering step here in the future
    # graph_gen.filter(filter_config=config["filter"])