github-actions[bot] commited on
Commit
dee1edd
·
1 Parent(s): b7519b4

Auto-sync from demo at Wed Nov 26 09:31:54 UTC 2025

Browse files
app.py CHANGED
@@ -101,12 +101,15 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
101
  pipeline = [
102
  {
103
  "name": "read",
 
104
  "params": {
105
  "input_file": params.upload_file,
106
  },
107
  },
108
  {
109
  "name": "chunk",
 
 
110
  "params": {
111
  "chunk_size": params.chunk_size,
112
  "chunk_overlap": params.chunk_overlap,
@@ -114,6 +117,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
114
  },
115
  {
116
  "name": "build_kg",
 
 
117
  },
118
  ]
119
 
@@ -121,6 +126,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
121
  pipeline.append(
122
  {
123
  "name": "quiz_and_judge",
 
 
124
  "params": {"quiz_samples": params.quiz_samples, "re_judge": True},
125
  }
126
  )
@@ -128,6 +135,7 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
128
  {
129
  "name": "partition",
130
  "deps": ["quiz_and_judge"],
 
131
  "params": {
132
  "method": params.partition_method,
133
  "method_params": partition_params,
@@ -138,6 +146,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
138
  pipeline.append(
139
  {
140
  "name": "partition",
 
 
141
  "params": {
142
  "method": params.partition_method,
143
  "method_params": partition_params,
@@ -147,6 +157,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
147
  pipeline.append(
148
  {
149
  "name": "generate",
 
 
150
  "params": {
151
  "method": params.mode,
152
  "data_format": params.data_format,
 
101
  pipeline = [
102
  {
103
  "name": "read",
104
+ "op_key": "read",
105
  "params": {
106
  "input_file": params.upload_file,
107
  },
108
  },
109
  {
110
  "name": "chunk",
111
+ "deps": ["read"],
112
+ "op_key": "chunk",
113
  "params": {
114
  "chunk_size": params.chunk_size,
115
  "chunk_overlap": params.chunk_overlap,
 
117
  },
118
  {
119
  "name": "build_kg",
120
+ "deps": ["chunk"],
121
+ "op_key": "build_kg",
122
  },
123
  ]
124
 
 
126
  pipeline.append(
127
  {
128
  "name": "quiz_and_judge",
129
+ "deps": ["build_kg"],
130
+ "op_key": "quiz_and_judge",
131
  "params": {"quiz_samples": params.quiz_samples, "re_judge": True},
132
  }
133
  )
 
135
  {
136
  "name": "partition",
137
  "deps": ["quiz_and_judge"],
138
+ "op_key": "partition",
139
  "params": {
140
  "method": params.partition_method,
141
  "method_params": partition_params,
 
146
  pipeline.append(
147
  {
148
  "name": "partition",
149
+ "deps": ["build_kg"],
150
+ "op_key": "partition",
151
  "params": {
152
  "method": params.partition_method,
153
  "method_params": partition_params,
 
157
  pipeline.append(
158
  {
159
  "name": "generate",
160
+ "deps": ["partition"],
161
+ "op_key": "generate",
162
  "params": {
163
  "method": params.mode,
164
  "data_format": params.data_format,
graphgen/configs/aggregated_config.yaml CHANGED
@@ -1,22 +1,30 @@
1
  pipeline:
2
- - name: read
 
3
  params:
4
  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5
 
6
- - name: chunk
 
 
7
  params:
8
  chunk_size: 1024 # chunk size for text splitting
9
  chunk_overlap: 100 # chunk overlap for text splitting
10
 
11
- - name: build_kg
 
 
12
 
13
- - name: quiz_and_judge
 
 
14
  params:
15
  quiz_samples: 2 # number of quiz samples to generate
16
  re_judge: false # whether to re-judge the existing quiz samples
17
 
18
- - name: partition
19
- deps: [quiz_and_judge] # ece depends on quiz_and_judge steps
 
20
  params:
21
  method: ece # ece is a custom partition method based on comprehension loss
22
  method_params:
@@ -25,7 +33,9 @@ pipeline:
25
  max_tokens_per_community: 10240 # max tokens per community
26
  unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
27
 
28
- - name: generate
 
 
29
  params:
30
  method: aggregated # atomic, aggregated, multi_hop, cot, vqa
31
  data_format: ChatML # Alpaca, Sharegpt, ChatML
 
1
  pipeline:
2
+ - name: read_step # step name is unique in the pipeline, and can be referenced by other steps
3
+ op_key: read
4
  params:
5
  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
6
 
7
+ - name: chunk_step
8
+ op_key: chunk
9
+ deps: [read_step] # chunk_step depends on read_step
10
  params:
11
  chunk_size: 1024 # chunk size for text splitting
12
  chunk_overlap: 100 # chunk overlap for text splitting
13
 
14
+ - name: build_kg_step
15
+ op_key: build_kg
16
+ deps: [chunk_step] # build_kg_step depends on chunk_step
17
 
18
+ - name: quiz_and_judge_step
19
+ op_key: quiz_and_judge
20
+ deps: [build_kg_step] # quiz_and_judge depends on build_kg_step
21
  params:
22
  quiz_samples: 2 # number of quiz samples to generate
23
  re_judge: false # whether to re-judge the existing quiz samples
24
 
25
+ - name: partition_step
26
+ op_key: partition
27
+ deps: [quiz_and_judge_step] # partition_step depends on quiz_and_judge_step
28
  params:
29
  method: ece # ece is a custom partition method based on comprehension loss
30
  method_params:
 
33
  max_tokens_per_community: 10240 # max tokens per community
34
  unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
35
 
36
+ - name: generate_step
37
+ op_key: generate
38
+ deps: [partition_step] # generate_step depends on partition_step
39
  params:
40
  method: aggregated # atomic, aggregated, multi_hop, cot, vqa
41
  data_format: ChatML # Alpaca, Sharegpt, ChatML
graphgen/configs/atomic_config.yaml CHANGED
@@ -1,21 +1,31 @@
1
  pipeline:
2
- - name: read
 
3
  params:
4
  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
5
 
6
- - name: chunk
 
 
7
  params:
8
  chunk_size: 1024 # chunk size for text splitting
9
  chunk_overlap: 100 # chunk overlap for text splitting
10
 
11
- - name: build_kg
 
 
12
 
13
- - name: partition
 
 
14
  params:
15
  method: dfs # partition method, support: dfs, bfs, ece, leiden
16
  method_params:
17
  max_units_per_community: 1 # atomic partition, one node or edge per community
18
- - name: generate
 
 
 
19
  params:
20
  method: atomic # atomic, aggregated, multi_hop, cot, vqa
21
  data_format: Alpaca # Alpaca, Sharegpt, ChatML
 
1
  pipeline:
2
+ - name: read_step
3
+ op_key: read
4
  params:
5
  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
6
 
7
+ - name: chunk_step
8
+ op_key: chunk
9
+ deps: [read_step] # chunk_step depends on read_step
10
  params:
11
  chunk_size: 1024 # chunk size for text splitting
12
  chunk_overlap: 100 # chunk overlap for text splitting
13
 
14
+ - name: build_kg_step
15
+ op_key: build_kg
16
+ deps: [chunk_step] # build_kg depends on chunk_step
17
 
18
+ - name: partition_step
19
+ op_key: partition
20
+ deps: [build_kg] # partition_step depends on build_kg
21
  params:
22
  method: dfs # partition method, support: dfs, bfs, ece, leiden
23
  method_params:
24
  max_units_per_community: 1 # atomic partition, one node or edge per community
25
+
26
+ - name: generate_step
27
+ op_key: generate
28
+ deps: [partition_step] # generate_step depends on partition_step
29
  params:
30
  method: atomic # atomic, aggregated, multi_hop, cot, vqa
31
  data_format: Alpaca # Alpaca, Sharegpt, ChatML
graphgen/configs/cot_config.yaml CHANGED
@@ -1,16 +1,23 @@
1
  pipeline:
2
- - name: read
 
3
  params:
4
  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5
 
6
- - name: chunk
 
 
7
  params:
8
  chunk_size: 1024 # chunk size for text splitting
9
  chunk_overlap: 100 # chunk overlap for text splitting
10
 
11
- - name: build_kg
 
 
12
 
13
- - name: partition
 
 
14
  params:
15
  method: leiden # leiden is a partitioner detection algorithm
16
  method_params:
@@ -18,7 +25,9 @@ pipeline:
18
  use_lcc: false # whether to use the largest connected component
19
  random_seed: 42 # random seed for partitioning
20
 
21
- - name: generate
 
 
22
  params:
23
  method: cot # atomic, aggregated, multi_hop, cot, vqa
24
  data_format: Sharegpt # Alpaca, Sharegpt, ChatML
 
1
  pipeline:
2
+ - name: read_step
3
+ op_key: read
4
  params:
5
  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
6
 
7
+ - name: chunk_step
8
+ op_key: chunk
9
+ deps: [read_step] # chunk_step depends on read_step
10
  params:
11
  chunk_size: 1024 # chunk size for text splitting
12
  chunk_overlap: 100 # chunk overlap for text splitting
13
 
14
+ - name: build_kg_step
15
+ op_key: build_kg
16
+ deps: [chunk_step] # build_kg depends on chunk_step
17
 
18
+ - name: partition_step
19
+ op_key: partition
20
+ deps: [build_kg_step] # partition_step depends on build_kg
21
  params:
22
  method: leiden # leiden is a partitioner detection algorithm
23
  method_params:
 
25
  use_lcc: false # whether to use the largest connected component
26
  random_seed: 42 # random seed for partitioning
27
 
28
+ - name: generate_step
29
+ op_key: generate
30
+ deps: [partition_step] # generate_step depends on partition_step
31
  params:
32
  method: cot # atomic, aggregated, multi_hop, cot, vqa
33
  data_format: Sharegpt # Alpaca, Sharegpt, ChatML
graphgen/configs/multi_hop_config.yaml CHANGED
@@ -1,16 +1,23 @@
1
  pipeline:
2
- - name: read
 
3
  params:
4
  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5
 
6
- - name: chunk
 
 
7
  params:
8
  chunk_size: 1024 # chunk size for text splitting
9
  chunk_overlap: 100 # chunk overlap for text splitting
10
 
11
- - name: build_kg
 
 
12
 
13
- - name: partition
 
 
14
  params:
15
  method: ece # ece is a custom partition method based on comprehension loss
16
  method_params:
@@ -19,7 +26,9 @@ pipeline:
19
  max_tokens_per_community: 10240 # max tokens per community
20
  unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
21
 
22
- - name: generate
 
 
23
  params:
24
  method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
25
  data_format: ChatML # Alpaca, Sharegpt, ChatML
 
1
  pipeline:
2
+ - name: read_step
3
+ op_key: read
4
  params:
5
  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
6
 
7
+ - name: chunk_step
8
+ op_key: chunk
9
+ deps: [read_step] # chunk_step depends on read_step
10
  params:
11
  chunk_size: 1024 # chunk size for text splitting
12
  chunk_overlap: 100 # chunk overlap for text splitting
13
 
14
+ - name: build_kg_step
15
+ op_key: build_kg
16
+ deps: [chunk_step] # build_kg_step depends on chunk_step
17
 
18
+ - name: partition_step
19
+ op_key: partition
20
+ deps: [build_kg_step] # partition_step depends on build_kg_step
21
  params:
22
  method: ece # ece is a custom partition method based on comprehension loss
23
  method_params:
 
26
  max_tokens_per_community: 10240 # max tokens per community
27
  unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
28
 
29
+ - name: generate_step
30
+ op_key: generate
31
+ deps: [partition_step] # generate_step depends on partition_step
32
  params:
33
  method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
34
  data_format: ChatML # Alpaca, Sharegpt, ChatML
graphgen/configs/schema_guided_extraction_config.yaml CHANGED
@@ -1,15 +1,20 @@
1
  pipeline:
2
- - name: read
 
3
  params:
4
  input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5
 
6
- - name: chunk
 
 
7
  params:
8
  chunk_size: 20480
9
  chunk_overlap: 2000
10
  separators: []
11
 
12
- - name: extract
 
 
13
  params:
14
  method: schema_guided # extraction method, support: schema_guided
15
  schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
 
1
  pipeline:
2
+ - name: read_step
3
+ op_key: read
4
  params:
5
  input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
6
 
7
+ - name: chunk_step
8
+ op_key: chunk
9
+ deps: [read_step] # chunk_step depends on read_step
10
  params:
11
  chunk_size: 20480
12
  chunk_overlap: 2000
13
  separators: []
14
 
15
+ - name: extract_step
16
+ op_key: extract
17
+ deps: [chunk_step] # extract_step depends on chunk_step
18
  params:
19
  method: schema_guided # extraction method, support: schema_guided
20
  schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
graphgen/configs/search_config.yaml CHANGED
@@ -1,9 +1,12 @@
1
  pipeline:
2
- - name: read
 
3
  params:
4
  input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5
 
6
- - name: search
 
 
7
  params:
8
  data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot
9
  uniprot_params:
 
1
  pipeline:
2
+ - name: read_step
3
+ op_key: read
4
  params:
5
  input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
6
 
7
+ - name: search_step
8
+ op_key: search
9
+ deps: [read_step] # search_step depends on read_step
10
  params:
11
  data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot
12
  uniprot_params:
graphgen/configs/vqa_config.yaml CHANGED
@@ -1,23 +1,32 @@
1
  pipeline:
2
- - name: read
 
3
  params:
4
  input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5
 
6
- - name: chunk
 
 
7
  params:
8
  chunk_size: 1024 # chunk size for text splitting
9
  chunk_overlap: 100 # chunk overlap for text splitting
10
 
11
- - name: build_kg
 
 
12
 
13
- - name: partition
 
 
14
  params:
15
  method: anchor_bfs # partition method
16
  method_params:
17
  anchor_type: image # node type to select anchor nodes
18
  max_units_per_community: 10 # atomic partition, one node or edge per community
19
 
20
- - name: generate
 
 
21
  params:
22
  method: vqa # atomic, aggregated, multi_hop, cot, vqa
23
  data_format: ChatML # Alpaca, Sharegpt, ChatML
 
1
  pipeline:
2
+ - name: read_step
3
+ op_key: read
4
  params:
5
  input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
6
 
7
+ - name: chunk_step
8
+ op_key: chunk
9
+ deps: [read_step] # chunk_step depends on read_step
10
  params:
11
  chunk_size: 1024 # chunk size for text splitting
12
  chunk_overlap: 100 # chunk overlap for text splitting
13
 
14
+ - name: build_kg_step
15
+ op_key: build_kg
16
+ deps: [chunk_step] # build_kg depends on chunk_step
17
 
18
+ - name: partition_step
19
+ op_key: partition
20
+ deps: [build_kg_step] # partition_step depends on build_kg_step
21
  params:
22
  method: anchor_bfs # partition method
23
  method_params:
24
  anchor_type: image # node type to select anchor nodes
25
  max_units_per_community: 10 # atomic partition, one node or edge per community
26
 
27
+ - name: generate_step
28
+ op_key: generate
29
+ deps: [partition_step] # generate_step depends on partition_step
30
  params:
31
  method: vqa # atomic, aggregated, multi_hop, cot, vqa
32
  data_format: ChatML # Alpaca, Sharegpt, ChatML
graphgen/engine.py CHANGED
@@ -4,7 +4,6 @@ orchestration engine for GraphGen
4
 
5
  import threading
6
  import traceback
7
- from functools import wraps
8
  from typing import Any, Callable, List
9
 
10
 
@@ -27,25 +26,12 @@ class OpNode:
27
  self.name, self.deps, self.func = name, deps, func
28
 
29
 
30
- def op(name: str, deps=None):
31
- deps = deps or []
32
-
33
- def decorator(func):
34
- @wraps(func)
35
- def _wrapper(*args, **kwargs):
36
- return func(*args, **kwargs)
37
-
38
- _wrapper.op_node = OpNode(name, deps, lambda self, ctx: func(self, **ctx))
39
- return _wrapper
40
-
41
- return decorator
42
-
43
-
44
  class Engine:
45
  def __init__(self, max_workers: int = 4):
46
  self.max_workers = max_workers
47
 
48
  def run(self, ops: List[OpNode], ctx: Context):
 
49
  name2op = {operation.name: operation for operation in ops}
50
 
51
  # topological sort
@@ -81,7 +67,7 @@ class Engine:
81
  return
82
  try:
83
  name2op[n].func(name2op[n], ctx)
84
- except Exception: # pylint: disable=broad-except
85
  exc[n] = traceback.format_exc()
86
  done[n].set()
87
 
@@ -96,6 +82,20 @@ class Engine:
96
  + "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
97
  )
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  def collect_ops(config: dict, graph_gen) -> List[OpNode]:
101
  """
@@ -106,16 +106,20 @@ def collect_ops(config: dict, graph_gen) -> List[OpNode]:
106
  ops: List[OpNode] = []
107
  for stage in config["pipeline"]:
108
  name = stage["name"]
109
- method = getattr(graph_gen, name)
110
- op_node = method.op_node
111
-
112
- # if there are runtime dependencies, override them
113
- runtime_deps = stage.get("deps", op_node.deps)
114
- op_node.deps = runtime_deps
115
 
116
  if "params" in stage:
117
- op_node.func = lambda self, ctx, m=method, sc=stage: m(sc.get("params", {}))
 
 
 
118
  else:
119
- op_node.func = lambda self, ctx, m=method: m()
 
 
 
 
120
  ops.append(op_node)
121
  return ops
 
4
 
5
  import threading
6
  import traceback
 
7
  from typing import Any, Callable, List
8
 
9
 
 
26
  self.name, self.deps, self.func = name, deps, func
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  class Engine:
30
  def __init__(self, max_workers: int = 4):
31
  self.max_workers = max_workers
32
 
33
  def run(self, ops: List[OpNode], ctx: Context):
34
+ self._validate(ops)
35
  name2op = {operation.name: operation for operation in ops}
36
 
37
  # topological sort
 
67
  return
68
  try:
69
  name2op[n].func(name2op[n], ctx)
70
+ except Exception:
71
  exc[n] = traceback.format_exc()
72
  done[n].set()
73
 
 
82
  + "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
83
  )
84
 
85
+ @staticmethod
86
+ def _validate(ops: List[OpNode]):
87
+ name_set = set()
88
+ for op in ops:
89
+ if op.name in name_set:
90
+ raise ValueError(f"Duplicate operation name: {op.name}")
91
+ name_set.add(op.name)
92
+ for op in ops:
93
+ for dep in op.deps:
94
+ if dep not in name_set:
95
+ raise ValueError(
96
+ f"Operation {op.name} has unknown dependency: {dep}"
97
+ )
98
+
99
 
100
  def collect_ops(config: dict, graph_gen) -> List[OpNode]:
101
  """
 
106
  ops: List[OpNode] = []
107
  for stage in config["pipeline"]:
108
  name = stage["name"]
109
+ method_name = stage.get("op_key")
110
+ method = getattr(graph_gen, method_name)
111
+ deps = stage.get("deps", [])
 
 
 
112
 
113
  if "params" in stage:
114
+
115
+ def func(self, ctx, _method=method, _params=stage.get("params", {})):
116
+ return _method(_params)
117
+
118
  else:
119
+
120
+ def func(self, ctx, _method=method):
121
+ return _method()
122
+
123
+ op_node = OpNode(name=name, deps=deps, func=func)
124
  ops.append(op_node)
125
  return ops
graphgen/graphgen.py CHANGED
@@ -6,7 +6,6 @@ import gradio as gr
6
 
7
  from graphgen.bases import BaseLLMWrapper
8
  from graphgen.bases.datatypes import Chunk
9
- from graphgen.engine import op
10
  from graphgen.models import (
11
  JsonKVStorage,
12
  JsonListStorage,
@@ -89,7 +88,6 @@ class GraphGen:
89
  # webui
90
  self.progress_bar: gr.Progress = progress_bar
91
 
92
- @op("read", deps=[])
93
  @async_to_sync_method
94
  async def read(self, read_config: Dict):
95
  """
@@ -116,7 +114,6 @@ class GraphGen:
116
  self.full_docs_storage.upsert(new_docs)
117
  self.full_docs_storage.index_done_callback()
118
 
119
- @op("chunk", deps=["read"])
120
  @async_to_sync_method
121
  async def chunk(self, chunk_config: Dict):
122
  """
@@ -149,7 +146,6 @@ class GraphGen:
149
  self.meta_storage.mark_done(self.full_docs_storage)
150
  self.meta_storage.index_done_callback()
151
 
152
- @op("build_kg", deps=["chunk"])
153
  @async_to_sync_method
154
  async def build_kg(self):
155
  """
@@ -180,7 +176,6 @@ class GraphGen:
180
 
181
  return _add_entities_and_relations
182
 
183
- @op("search", deps=["read"])
184
  @async_to_sync_method
185
  async def search(self, search_config: Dict):
186
  logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
@@ -206,7 +201,6 @@ class GraphGen:
206
  self.meta_storage.mark_done(self.full_docs_storage)
207
  self.meta_storage.index_done_callback()
208
 
209
- @op("quiz_and_judge", deps=["build_kg"])
210
  @async_to_sync_method
211
  async def quiz_and_judge(self, quiz_and_judge_config: Dict):
212
  logger.warning(
@@ -247,7 +241,6 @@ class GraphGen:
247
  logger.info("Restarting synthesizer LLM client.")
248
  self.synthesizer_llm_client.restart()
249
 
250
- @op("partition", deps=["build_kg"])
251
  @async_to_sync_method
252
  async def partition(self, partition_config: Dict):
253
  batches = await partition_kg(
@@ -259,7 +252,6 @@ class GraphGen:
259
  self.partition_storage.upsert(batches)
260
  return batches
261
 
262
- @op("extract", deps=["chunk"])
263
  @async_to_sync_method
264
  async def extract(self, extract_config: Dict):
265
  logger.info("Extracting information from given chunks...")
@@ -279,7 +271,6 @@ class GraphGen:
279
  self.meta_storage.mark_done(self.chunks_storage)
280
  self.meta_storage.index_done_callback()
281
 
282
- @op("generate", deps=["partition"])
283
  @async_to_sync_method
284
  async def generate(self, generate_config: Dict):
285
 
 
6
 
7
  from graphgen.bases import BaseLLMWrapper
8
  from graphgen.bases.datatypes import Chunk
 
9
  from graphgen.models import (
10
  JsonKVStorage,
11
  JsonListStorage,
 
88
  # webui
89
  self.progress_bar: gr.Progress = progress_bar
90
 
 
91
  @async_to_sync_method
92
  async def read(self, read_config: Dict):
93
  """
 
114
  self.full_docs_storage.upsert(new_docs)
115
  self.full_docs_storage.index_done_callback()
116
 
 
117
  @async_to_sync_method
118
  async def chunk(self, chunk_config: Dict):
119
  """
 
146
  self.meta_storage.mark_done(self.full_docs_storage)
147
  self.meta_storage.index_done_callback()
148
 
 
149
  @async_to_sync_method
150
  async def build_kg(self):
151
  """
 
176
 
177
  return _add_entities_and_relations
178
 
 
179
  @async_to_sync_method
180
  async def search(self, search_config: Dict):
181
  logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
 
201
  self.meta_storage.mark_done(self.full_docs_storage)
202
  self.meta_storage.index_done_callback()
203
 
 
204
  @async_to_sync_method
205
  async def quiz_and_judge(self, quiz_and_judge_config: Dict):
206
  logger.warning(
 
241
  logger.info("Restarting synthesizer LLM client.")
242
  self.synthesizer_llm_client.restart()
243
 
 
244
  @async_to_sync_method
245
  async def partition(self, partition_config: Dict):
246
  batches = await partition_kg(
 
252
  self.partition_storage.upsert(batches)
253
  return batches
254
 
 
255
  @async_to_sync_method
256
  async def extract(self, extract_config: Dict):
257
  logger.info("Extracting information from given chunks...")
 
271
  self.meta_storage.mark_done(self.chunks_storage)
272
  self.meta_storage.index_done_callback()
273
 
 
274
  @async_to_sync_method
275
  async def generate(self, generate_config: Dict):
276
 
graphgen/operators/extract/extract_info.py CHANGED
@@ -31,7 +31,7 @@ async def extract_info(
31
  else:
32
  raise ValueError(f"Unsupported extraction method: {method}")
33
 
34
- chunks = await chunk_storage.get_all()
35
  chunks = [{k: v} for k, v in chunks.items()]
36
  logger.info("Start extracting information from %d chunks", len(chunks))
37
 
 
31
  else:
32
  raise ValueError(f"Unsupported extraction method: {method}")
33
 
34
+ chunks = chunk_storage.get_all()
35
  chunks = [{k: v} for k, v in chunks.items()]
36
  logger.info("Start extracting information from %d chunks", len(chunks))
37
 
webui/app.py CHANGED
@@ -101,12 +101,15 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
101
  pipeline = [
102
  {
103
  "name": "read",
 
104
  "params": {
105
  "input_file": params.upload_file,
106
  },
107
  },
108
  {
109
  "name": "chunk",
 
 
110
  "params": {
111
  "chunk_size": params.chunk_size,
112
  "chunk_overlap": params.chunk_overlap,
@@ -114,6 +117,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
114
  },
115
  {
116
  "name": "build_kg",
 
 
117
  },
118
  ]
119
 
@@ -121,6 +126,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
121
  pipeline.append(
122
  {
123
  "name": "quiz_and_judge",
 
 
124
  "params": {"quiz_samples": params.quiz_samples, "re_judge": True},
125
  }
126
  )
@@ -128,6 +135,7 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
128
  {
129
  "name": "partition",
130
  "deps": ["quiz_and_judge"],
 
131
  "params": {
132
  "method": params.partition_method,
133
  "method_params": partition_params,
@@ -138,6 +146,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
138
  pipeline.append(
139
  {
140
  "name": "partition",
 
 
141
  "params": {
142
  "method": params.partition_method,
143
  "method_params": partition_params,
@@ -147,6 +157,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
147
  pipeline.append(
148
  {
149
  "name": "generate",
 
 
150
  "params": {
151
  "method": params.mode,
152
  "data_format": params.data_format,
 
101
  pipeline = [
102
  {
103
  "name": "read",
104
+ "op_key": "read",
105
  "params": {
106
  "input_file": params.upload_file,
107
  },
108
  },
109
  {
110
  "name": "chunk",
111
+ "deps": ["read"],
112
+ "op_key": "chunk",
113
  "params": {
114
  "chunk_size": params.chunk_size,
115
  "chunk_overlap": params.chunk_overlap,
 
117
  },
118
  {
119
  "name": "build_kg",
120
+ "deps": ["chunk"],
121
+ "op_key": "build_kg",
122
  },
123
  ]
124
 
 
126
  pipeline.append(
127
  {
128
  "name": "quiz_and_judge",
129
+ "deps": ["build_kg"],
130
+ "op_key": "quiz_and_judge",
131
  "params": {"quiz_samples": params.quiz_samples, "re_judge": True},
132
  }
133
  )
 
135
  {
136
  "name": "partition",
137
  "deps": ["quiz_and_judge"],
138
+ "op_key": "partition",
139
  "params": {
140
  "method": params.partition_method,
141
  "method_params": partition_params,
 
146
  pipeline.append(
147
  {
148
  "name": "partition",
149
+ "deps": ["build_kg"],
150
+ "op_key": "partition",
151
  "params": {
152
  "method": params.partition_method,
153
  "method_params": partition_params,
 
157
  pipeline.append(
158
  {
159
  "name": "generate",
160
+ "deps": ["partition"],
161
+ "op_key": "generate",
162
  "params": {
163
  "method": params.mode,
164
  "data_format": params.data_format,