Spaces:
Running
Running
github-actions[bot]
commited on
Commit
·
dee1edd
1
Parent(s):
b7519b4
Auto-sync from demo at Wed Nov 26 09:31:54 UTC 2025
Browse files- app.py +12 -0
- graphgen/configs/aggregated_config.yaml +17 -7
- graphgen/configs/atomic_config.yaml +15 -5
- graphgen/configs/cot_config.yaml +14 -5
- graphgen/configs/multi_hop_config.yaml +14 -5
- graphgen/configs/schema_guided_extraction_config.yaml +8 -3
- graphgen/configs/search_config.yaml +5 -2
- graphgen/configs/vqa_config.yaml +14 -5
- graphgen/engine.py +28 -24
- graphgen/graphgen.py +0 -9
- graphgen/operators/extract/extract_info.py +1 -1
- webui/app.py +12 -0
app.py
CHANGED
|
@@ -101,12 +101,15 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 101 |
pipeline = [
|
| 102 |
{
|
| 103 |
"name": "read",
|
|
|
|
| 104 |
"params": {
|
| 105 |
"input_file": params.upload_file,
|
| 106 |
},
|
| 107 |
},
|
| 108 |
{
|
| 109 |
"name": "chunk",
|
|
|
|
|
|
|
| 110 |
"params": {
|
| 111 |
"chunk_size": params.chunk_size,
|
| 112 |
"chunk_overlap": params.chunk_overlap,
|
|
@@ -114,6 +117,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 114 |
},
|
| 115 |
{
|
| 116 |
"name": "build_kg",
|
|
|
|
|
|
|
| 117 |
},
|
| 118 |
]
|
| 119 |
|
|
@@ -121,6 +126,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 121 |
pipeline.append(
|
| 122 |
{
|
| 123 |
"name": "quiz_and_judge",
|
|
|
|
|
|
|
| 124 |
"params": {"quiz_samples": params.quiz_samples, "re_judge": True},
|
| 125 |
}
|
| 126 |
)
|
|
@@ -128,6 +135,7 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 128 |
{
|
| 129 |
"name": "partition",
|
| 130 |
"deps": ["quiz_and_judge"],
|
|
|
|
| 131 |
"params": {
|
| 132 |
"method": params.partition_method,
|
| 133 |
"method_params": partition_params,
|
|
@@ -138,6 +146,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 138 |
pipeline.append(
|
| 139 |
{
|
| 140 |
"name": "partition",
|
|
|
|
|
|
|
| 141 |
"params": {
|
| 142 |
"method": params.partition_method,
|
| 143 |
"method_params": partition_params,
|
|
@@ -147,6 +157,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 147 |
pipeline.append(
|
| 148 |
{
|
| 149 |
"name": "generate",
|
|
|
|
|
|
|
| 150 |
"params": {
|
| 151 |
"method": params.mode,
|
| 152 |
"data_format": params.data_format,
|
|
|
|
| 101 |
pipeline = [
|
| 102 |
{
|
| 103 |
"name": "read",
|
| 104 |
+
"op_key": "read",
|
| 105 |
"params": {
|
| 106 |
"input_file": params.upload_file,
|
| 107 |
},
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"name": "chunk",
|
| 111 |
+
"deps": ["read"],
|
| 112 |
+
"op_key": "chunk",
|
| 113 |
"params": {
|
| 114 |
"chunk_size": params.chunk_size,
|
| 115 |
"chunk_overlap": params.chunk_overlap,
|
|
|
|
| 117 |
},
|
| 118 |
{
|
| 119 |
"name": "build_kg",
|
| 120 |
+
"deps": ["chunk"],
|
| 121 |
+
"op_key": "build_kg",
|
| 122 |
},
|
| 123 |
]
|
| 124 |
|
|
|
|
| 126 |
pipeline.append(
|
| 127 |
{
|
| 128 |
"name": "quiz_and_judge",
|
| 129 |
+
"deps": ["build_kg"],
|
| 130 |
+
"op_key": "quiz_and_judge",
|
| 131 |
"params": {"quiz_samples": params.quiz_samples, "re_judge": True},
|
| 132 |
}
|
| 133 |
)
|
|
|
|
| 135 |
{
|
| 136 |
"name": "partition",
|
| 137 |
"deps": ["quiz_and_judge"],
|
| 138 |
+
"op_key": "partition",
|
| 139 |
"params": {
|
| 140 |
"method": params.partition_method,
|
| 141 |
"method_params": partition_params,
|
|
|
|
| 146 |
pipeline.append(
|
| 147 |
{
|
| 148 |
"name": "partition",
|
| 149 |
+
"deps": ["build_kg"],
|
| 150 |
+
"op_key": "partition",
|
| 151 |
"params": {
|
| 152 |
"method": params.partition_method,
|
| 153 |
"method_params": partition_params,
|
|
|
|
| 157 |
pipeline.append(
|
| 158 |
{
|
| 159 |
"name": "generate",
|
| 160 |
+
"deps": ["partition"],
|
| 161 |
+
"op_key": "generate",
|
| 162 |
"params": {
|
| 163 |
"method": params.mode,
|
| 164 |
"data_format": params.data_format,
|
graphgen/configs/aggregated_config.yaml
CHANGED
|
@@ -1,22 +1,30 @@
|
|
| 1 |
pipeline:
|
| 2 |
-
- name:
|
|
|
|
| 3 |
params:
|
| 4 |
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 5 |
|
| 6 |
-
- name:
|
|
|
|
|
|
|
| 7 |
params:
|
| 8 |
chunk_size: 1024 # chunk size for text splitting
|
| 9 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 10 |
|
| 11 |
-
- name:
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
- name:
|
|
|
|
|
|
|
| 14 |
params:
|
| 15 |
quiz_samples: 2 # number of quiz samples to generate
|
| 16 |
re_judge: false # whether to re-judge the existing quiz samples
|
| 17 |
|
| 18 |
-
- name:
|
| 19 |
-
|
|
|
|
| 20 |
params:
|
| 21 |
method: ece # ece is a custom partition method based on comprehension loss
|
| 22 |
method_params:
|
|
@@ -25,7 +33,9 @@ pipeline:
|
|
| 25 |
max_tokens_per_community: 10240 # max tokens per community
|
| 26 |
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
|
| 27 |
|
| 28 |
-
- name:
|
|
|
|
|
|
|
| 29 |
params:
|
| 30 |
method: aggregated # atomic, aggregated, multi_hop, cot, vqa
|
| 31 |
data_format: ChatML # Alpaca, Sharegpt, ChatML
|
|
|
|
| 1 |
pipeline:
|
| 2 |
+
- name: read_step # step name is unique in the pipeline, and can be referenced by other steps
|
| 3 |
+
op_key: read
|
| 4 |
params:
|
| 5 |
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 6 |
|
| 7 |
+
- name: chunk_step
|
| 8 |
+
op_key: chunk
|
| 9 |
+
deps: [read_step] # chunk_step depends on read_step
|
| 10 |
params:
|
| 11 |
chunk_size: 1024 # chunk size for text splitting
|
| 12 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 13 |
|
| 14 |
+
- name: build_kg_step
|
| 15 |
+
op_key: build_kg
|
| 16 |
+
deps: [chunk_step] # build_kg_step depends on chunk_step
|
| 17 |
|
| 18 |
+
- name: quiz_and_judge_step
|
| 19 |
+
op_key: quiz_and_judge
|
| 20 |
+
deps: [build_kg_step] # quiz_and_judge depends on build_kg_step
|
| 21 |
params:
|
| 22 |
quiz_samples: 2 # number of quiz samples to generate
|
| 23 |
re_judge: false # whether to re-judge the existing quiz samples
|
| 24 |
|
| 25 |
+
- name: partition_step
|
| 26 |
+
op_key: partition
|
| 27 |
+
deps: [quiz_and_judge_step] # partition_step depends on quiz_and_judge_step
|
| 28 |
params:
|
| 29 |
method: ece # ece is a custom partition method based on comprehension loss
|
| 30 |
method_params:
|
|
|
|
| 33 |
max_tokens_per_community: 10240 # max tokens per community
|
| 34 |
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
|
| 35 |
|
| 36 |
+
- name: generate_step
|
| 37 |
+
op_key: generate
|
| 38 |
+
deps: [partition_step] # generate_step depends on partition_step
|
| 39 |
params:
|
| 40 |
method: aggregated # atomic, aggregated, multi_hop, cot, vqa
|
| 41 |
data_format: ChatML # Alpaca, Sharegpt, ChatML
|
graphgen/configs/atomic_config.yaml
CHANGED
|
@@ -1,21 +1,31 @@
|
|
| 1 |
pipeline:
|
| 2 |
-
- name:
|
|
|
|
| 3 |
params:
|
| 4 |
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
|
| 5 |
|
| 6 |
-
- name:
|
|
|
|
|
|
|
| 7 |
params:
|
| 8 |
chunk_size: 1024 # chunk size for text splitting
|
| 9 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 10 |
|
| 11 |
-
- name:
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
- name:
|
|
|
|
|
|
|
| 14 |
params:
|
| 15 |
method: dfs # partition method, support: dfs, bfs, ece, leiden
|
| 16 |
method_params:
|
| 17 |
max_units_per_community: 1 # atomic partition, one node or edge per community
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
| 19 |
params:
|
| 20 |
method: atomic # atomic, aggregated, multi_hop, cot, vqa
|
| 21 |
data_format: Alpaca # Alpaca, Sharegpt, ChatML
|
|
|
|
| 1 |
pipeline:
|
| 2 |
+
- name: read_step
|
| 3 |
+
op_key: read
|
| 4 |
params:
|
| 5 |
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
|
| 6 |
|
| 7 |
+
- name: chunk_step
|
| 8 |
+
op_key: chunk
|
| 9 |
+
deps: [read_step] # chunk_step depends on read_step
|
| 10 |
params:
|
| 11 |
chunk_size: 1024 # chunk size for text splitting
|
| 12 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 13 |
|
| 14 |
+
- name: build_kg_step
|
| 15 |
+
op_key: build_kg
|
| 16 |
+
deps: [chunk_step] # build_kg depends on chunk_step
|
| 17 |
|
| 18 |
+
- name: partition_step
|
| 19 |
+
op_key: partition
|
| 20 |
+
deps: [build_kg] # partition_step depends on build_kg
|
| 21 |
params:
|
| 22 |
method: dfs # partition method, support: dfs, bfs, ece, leiden
|
| 23 |
method_params:
|
| 24 |
max_units_per_community: 1 # atomic partition, one node or edge per community
|
| 25 |
+
|
| 26 |
+
- name: generate_step
|
| 27 |
+
op_key: generate
|
| 28 |
+
deps: [partition_step] # generate_step depends on partition_step
|
| 29 |
params:
|
| 30 |
method: atomic # atomic, aggregated, multi_hop, cot, vqa
|
| 31 |
data_format: Alpaca # Alpaca, Sharegpt, ChatML
|
graphgen/configs/cot_config.yaml
CHANGED
|
@@ -1,16 +1,23 @@
|
|
| 1 |
pipeline:
|
| 2 |
-
- name:
|
|
|
|
| 3 |
params:
|
| 4 |
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 5 |
|
| 6 |
-
- name:
|
|
|
|
|
|
|
| 7 |
params:
|
| 8 |
chunk_size: 1024 # chunk size for text splitting
|
| 9 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 10 |
|
| 11 |
-
- name:
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
- name:
|
|
|
|
|
|
|
| 14 |
params:
|
| 15 |
method: leiden # leiden is a partitioner detection algorithm
|
| 16 |
method_params:
|
|
@@ -18,7 +25,9 @@ pipeline:
|
|
| 18 |
use_lcc: false # whether to use the largest connected component
|
| 19 |
random_seed: 42 # random seed for partitioning
|
| 20 |
|
| 21 |
-
- name:
|
|
|
|
|
|
|
| 22 |
params:
|
| 23 |
method: cot # atomic, aggregated, multi_hop, cot, vqa
|
| 24 |
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
|
|
|
|
| 1 |
pipeline:
|
| 2 |
+
- name: read_step
|
| 3 |
+
op_key: read
|
| 4 |
params:
|
| 5 |
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 6 |
|
| 7 |
+
- name: chunk_step
|
| 8 |
+
op_key: chunk
|
| 9 |
+
deps: [read_step] # chunk_step depends on read_step
|
| 10 |
params:
|
| 11 |
chunk_size: 1024 # chunk size for text splitting
|
| 12 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 13 |
|
| 14 |
+
- name: build_kg_step
|
| 15 |
+
op_key: build_kg
|
| 16 |
+
deps: [chunk_step] # build_kg depends on chunk_step
|
| 17 |
|
| 18 |
+
- name: partition_step
|
| 19 |
+
op_key: partition
|
| 20 |
+
deps: [build_kg_step] # partition_step depends on build_kg
|
| 21 |
params:
|
| 22 |
method: leiden # leiden is a partitioner detection algorithm
|
| 23 |
method_params:
|
|
|
|
| 25 |
use_lcc: false # whether to use the largest connected component
|
| 26 |
random_seed: 42 # random seed for partitioning
|
| 27 |
|
| 28 |
+
- name: generate_step
|
| 29 |
+
op_key: generate
|
| 30 |
+
deps: [partition_step] # generate_step depends on partition_step
|
| 31 |
params:
|
| 32 |
method: cot # atomic, aggregated, multi_hop, cot, vqa
|
| 33 |
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
|
graphgen/configs/multi_hop_config.yaml
CHANGED
|
@@ -1,16 +1,23 @@
|
|
| 1 |
pipeline:
|
| 2 |
-
- name:
|
|
|
|
| 3 |
params:
|
| 4 |
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 5 |
|
| 6 |
-
- name:
|
|
|
|
|
|
|
| 7 |
params:
|
| 8 |
chunk_size: 1024 # chunk size for text splitting
|
| 9 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 10 |
|
| 11 |
-
- name:
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
- name:
|
|
|
|
|
|
|
| 14 |
params:
|
| 15 |
method: ece # ece is a custom partition method based on comprehension loss
|
| 16 |
method_params:
|
|
@@ -19,7 +26,9 @@ pipeline:
|
|
| 19 |
max_tokens_per_community: 10240 # max tokens per community
|
| 20 |
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
|
| 21 |
|
| 22 |
-
- name:
|
|
|
|
|
|
|
| 23 |
params:
|
| 24 |
method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
|
| 25 |
data_format: ChatML # Alpaca, Sharegpt, ChatML
|
|
|
|
| 1 |
pipeline:
|
| 2 |
+
- name: read_step
|
| 3 |
+
op_key: read
|
| 4 |
params:
|
| 5 |
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 6 |
|
| 7 |
+
- name: chunk_step
|
| 8 |
+
op_key: chunk
|
| 9 |
+
deps: [read_step] # chunk_step depends on read_step
|
| 10 |
params:
|
| 11 |
chunk_size: 1024 # chunk size for text splitting
|
| 12 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 13 |
|
| 14 |
+
- name: build_kg_step
|
| 15 |
+
op_key: build_kg
|
| 16 |
+
deps: [chunk_step] # build_kg_step depends on chunk_step
|
| 17 |
|
| 18 |
+
- name: partition_step
|
| 19 |
+
op_key: partition
|
| 20 |
+
deps: [build_kg_step] # partition_step depends on build_kg_step
|
| 21 |
params:
|
| 22 |
method: ece # ece is a custom partition method based on comprehension loss
|
| 23 |
method_params:
|
|
|
|
| 26 |
max_tokens_per_community: 10240 # max tokens per community
|
| 27 |
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
|
| 28 |
|
| 29 |
+
- name: generate_step
|
| 30 |
+
op_key: generate
|
| 31 |
+
deps: [partition_step] # generate_step depends on partition_step
|
| 32 |
params:
|
| 33 |
method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
|
| 34 |
data_format: ChatML # Alpaca, Sharegpt, ChatML
|
graphgen/configs/schema_guided_extraction_config.yaml
CHANGED
|
@@ -1,15 +1,20 @@
|
|
| 1 |
pipeline:
|
| 2 |
-
- name:
|
|
|
|
| 3 |
params:
|
| 4 |
input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 5 |
|
| 6 |
-
- name:
|
|
|
|
|
|
|
| 7 |
params:
|
| 8 |
chunk_size: 20480
|
| 9 |
chunk_overlap: 2000
|
| 10 |
separators: []
|
| 11 |
|
| 12 |
-
- name:
|
|
|
|
|
|
|
| 13 |
params:
|
| 14 |
method: schema_guided # extraction method, support: schema_guided
|
| 15 |
schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
|
|
|
|
| 1 |
pipeline:
|
| 2 |
+
- name: read_step
|
| 3 |
+
op_key: read
|
| 4 |
params:
|
| 5 |
input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 6 |
|
| 7 |
+
- name: chunk_step
|
| 8 |
+
op_key: chunk
|
| 9 |
+
deps: [read_step] # chunk_step depends on read_step
|
| 10 |
params:
|
| 11 |
chunk_size: 20480
|
| 12 |
chunk_overlap: 2000
|
| 13 |
separators: []
|
| 14 |
|
| 15 |
+
- name: extract_step
|
| 16 |
+
op_key: extract
|
| 17 |
+
deps: [chunk_step] # extract_step depends on chunk_step
|
| 18 |
params:
|
| 19 |
method: schema_guided # extraction method, support: schema_guided
|
| 20 |
schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
|
graphgen/configs/search_config.yaml
CHANGED
|
@@ -1,9 +1,12 @@
|
|
| 1 |
pipeline:
|
| 2 |
-
- name:
|
|
|
|
| 3 |
params:
|
| 4 |
input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 5 |
|
| 6 |
-
- name:
|
|
|
|
|
|
|
| 7 |
params:
|
| 8 |
data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot
|
| 9 |
uniprot_params:
|
|
|
|
| 1 |
pipeline:
|
| 2 |
+
- name: read_step
|
| 3 |
+
op_key: read
|
| 4 |
params:
|
| 5 |
input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 6 |
|
| 7 |
+
- name: search_step
|
| 8 |
+
op_key: search
|
| 9 |
+
deps: [read_step] # search_step depends on read_step
|
| 10 |
params:
|
| 11 |
data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot
|
| 12 |
uniprot_params:
|
graphgen/configs/vqa_config.yaml
CHANGED
|
@@ -1,23 +1,32 @@
|
|
| 1 |
pipeline:
|
| 2 |
-
- name:
|
|
|
|
| 3 |
params:
|
| 4 |
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 5 |
|
| 6 |
-
- name:
|
|
|
|
|
|
|
| 7 |
params:
|
| 8 |
chunk_size: 1024 # chunk size for text splitting
|
| 9 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 10 |
|
| 11 |
-
- name:
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
- name:
|
|
|
|
|
|
|
| 14 |
params:
|
| 15 |
method: anchor_bfs # partition method
|
| 16 |
method_params:
|
| 17 |
anchor_type: image # node type to select anchor nodes
|
| 18 |
max_units_per_community: 10 # atomic partition, one node or edge per community
|
| 19 |
|
| 20 |
-
- name:
|
|
|
|
|
|
|
| 21 |
params:
|
| 22 |
method: vqa # atomic, aggregated, multi_hop, cot, vqa
|
| 23 |
data_format: ChatML # Alpaca, Sharegpt, ChatML
|
|
|
|
| 1 |
pipeline:
|
| 2 |
+
- name: read_step
|
| 3 |
+
op_key: read
|
| 4 |
params:
|
| 5 |
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
|
| 6 |
|
| 7 |
+
- name: chunk_step
|
| 8 |
+
op_key: chunk
|
| 9 |
+
deps: [read_step] # chunk_step depends on read_step
|
| 10 |
params:
|
| 11 |
chunk_size: 1024 # chunk size for text splitting
|
| 12 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 13 |
|
| 14 |
+
- name: build_kg_step
|
| 15 |
+
op_key: build_kg
|
| 16 |
+
deps: [chunk_step] # build_kg depends on chunk_step
|
| 17 |
|
| 18 |
+
- name: partition_step
|
| 19 |
+
op_key: partition
|
| 20 |
+
deps: [build_kg_step] # partition_step depends on build_kg_step
|
| 21 |
params:
|
| 22 |
method: anchor_bfs # partition method
|
| 23 |
method_params:
|
| 24 |
anchor_type: image # node type to select anchor nodes
|
| 25 |
max_units_per_community: 10 # atomic partition, one node or edge per community
|
| 26 |
|
| 27 |
+
- name: generate_step
|
| 28 |
+
op_key: generate
|
| 29 |
+
deps: [partition_step] # generate_step depends on partition_step
|
| 30 |
params:
|
| 31 |
method: vqa # atomic, aggregated, multi_hop, cot, vqa
|
| 32 |
data_format: ChatML # Alpaca, Sharegpt, ChatML
|
graphgen/engine.py
CHANGED
|
@@ -4,7 +4,6 @@ orchestration engine for GraphGen
|
|
| 4 |
|
| 5 |
import threading
|
| 6 |
import traceback
|
| 7 |
-
from functools import wraps
|
| 8 |
from typing import Any, Callable, List
|
| 9 |
|
| 10 |
|
|
@@ -27,25 +26,12 @@ class OpNode:
|
|
| 27 |
self.name, self.deps, self.func = name, deps, func
|
| 28 |
|
| 29 |
|
| 30 |
-
def op(name: str, deps=None):
|
| 31 |
-
deps = deps or []
|
| 32 |
-
|
| 33 |
-
def decorator(func):
|
| 34 |
-
@wraps(func)
|
| 35 |
-
def _wrapper(*args, **kwargs):
|
| 36 |
-
return func(*args, **kwargs)
|
| 37 |
-
|
| 38 |
-
_wrapper.op_node = OpNode(name, deps, lambda self, ctx: func(self, **ctx))
|
| 39 |
-
return _wrapper
|
| 40 |
-
|
| 41 |
-
return decorator
|
| 42 |
-
|
| 43 |
-
|
| 44 |
class Engine:
|
| 45 |
def __init__(self, max_workers: int = 4):
|
| 46 |
self.max_workers = max_workers
|
| 47 |
|
| 48 |
def run(self, ops: List[OpNode], ctx: Context):
|
|
|
|
| 49 |
name2op = {operation.name: operation for operation in ops}
|
| 50 |
|
| 51 |
# topological sort
|
|
@@ -81,7 +67,7 @@ class Engine:
|
|
| 81 |
return
|
| 82 |
try:
|
| 83 |
name2op[n].func(name2op[n], ctx)
|
| 84 |
-
except Exception:
|
| 85 |
exc[n] = traceback.format_exc()
|
| 86 |
done[n].set()
|
| 87 |
|
|
@@ -96,6 +82,20 @@ class Engine:
|
|
| 96 |
+ "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
|
| 97 |
)
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
def collect_ops(config: dict, graph_gen) -> List[OpNode]:
|
| 101 |
"""
|
|
@@ -106,16 +106,20 @@ def collect_ops(config: dict, graph_gen) -> List[OpNode]:
|
|
| 106 |
ops: List[OpNode] = []
|
| 107 |
for stage in config["pipeline"]:
|
| 108 |
name = stage["name"]
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
# if there are runtime dependencies, override them
|
| 113 |
-
runtime_deps = stage.get("deps", op_node.deps)
|
| 114 |
-
op_node.deps = runtime_deps
|
| 115 |
|
| 116 |
if "params" in stage:
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
| 118 |
else:
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
ops.append(op_node)
|
| 121 |
return ops
|
|
|
|
| 4 |
|
| 5 |
import threading
|
| 6 |
import traceback
|
|
|
|
| 7 |
from typing import Any, Callable, List
|
| 8 |
|
| 9 |
|
|
|
|
| 26 |
self.name, self.deps, self.func = name, deps, func
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
class Engine:
|
| 30 |
def __init__(self, max_workers: int = 4):
|
| 31 |
self.max_workers = max_workers
|
| 32 |
|
| 33 |
def run(self, ops: List[OpNode], ctx: Context):
|
| 34 |
+
self._validate(ops)
|
| 35 |
name2op = {operation.name: operation for operation in ops}
|
| 36 |
|
| 37 |
# topological sort
|
|
|
|
| 67 |
return
|
| 68 |
try:
|
| 69 |
name2op[n].func(name2op[n], ctx)
|
| 70 |
+
except Exception:
|
| 71 |
exc[n] = traceback.format_exc()
|
| 72 |
done[n].set()
|
| 73 |
|
|
|
|
| 82 |
+ "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
|
| 83 |
)
|
| 84 |
|
| 85 |
+
@staticmethod
|
| 86 |
+
def _validate(ops: List[OpNode]):
|
| 87 |
+
name_set = set()
|
| 88 |
+
for op in ops:
|
| 89 |
+
if op.name in name_set:
|
| 90 |
+
raise ValueError(f"Duplicate operation name: {op.name}")
|
| 91 |
+
name_set.add(op.name)
|
| 92 |
+
for op in ops:
|
| 93 |
+
for dep in op.deps:
|
| 94 |
+
if dep not in name_set:
|
| 95 |
+
raise ValueError(
|
| 96 |
+
f"Operation {op.name} has unknown dependency: {dep}"
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
|
| 100 |
def collect_ops(config: dict, graph_gen) -> List[OpNode]:
|
| 101 |
"""
|
|
|
|
| 106 |
ops: List[OpNode] = []
|
| 107 |
for stage in config["pipeline"]:
|
| 108 |
name = stage["name"]
|
| 109 |
+
method_name = stage.get("op_key")
|
| 110 |
+
method = getattr(graph_gen, method_name)
|
| 111 |
+
deps = stage.get("deps", [])
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
if "params" in stage:
|
| 114 |
+
|
| 115 |
+
def func(self, ctx, _method=method, _params=stage.get("params", {})):
|
| 116 |
+
return _method(_params)
|
| 117 |
+
|
| 118 |
else:
|
| 119 |
+
|
| 120 |
+
def func(self, ctx, _method=method):
|
| 121 |
+
return _method()
|
| 122 |
+
|
| 123 |
+
op_node = OpNode(name=name, deps=deps, func=func)
|
| 124 |
ops.append(op_node)
|
| 125 |
return ops
|
graphgen/graphgen.py
CHANGED
|
@@ -6,7 +6,6 @@ import gradio as gr
|
|
| 6 |
|
| 7 |
from graphgen.bases import BaseLLMWrapper
|
| 8 |
from graphgen.bases.datatypes import Chunk
|
| 9 |
-
from graphgen.engine import op
|
| 10 |
from graphgen.models import (
|
| 11 |
JsonKVStorage,
|
| 12 |
JsonListStorage,
|
|
@@ -89,7 +88,6 @@ class GraphGen:
|
|
| 89 |
# webui
|
| 90 |
self.progress_bar: gr.Progress = progress_bar
|
| 91 |
|
| 92 |
-
@op("read", deps=[])
|
| 93 |
@async_to_sync_method
|
| 94 |
async def read(self, read_config: Dict):
|
| 95 |
"""
|
|
@@ -116,7 +114,6 @@ class GraphGen:
|
|
| 116 |
self.full_docs_storage.upsert(new_docs)
|
| 117 |
self.full_docs_storage.index_done_callback()
|
| 118 |
|
| 119 |
-
@op("chunk", deps=["read"])
|
| 120 |
@async_to_sync_method
|
| 121 |
async def chunk(self, chunk_config: Dict):
|
| 122 |
"""
|
|
@@ -149,7 +146,6 @@ class GraphGen:
|
|
| 149 |
self.meta_storage.mark_done(self.full_docs_storage)
|
| 150 |
self.meta_storage.index_done_callback()
|
| 151 |
|
| 152 |
-
@op("build_kg", deps=["chunk"])
|
| 153 |
@async_to_sync_method
|
| 154 |
async def build_kg(self):
|
| 155 |
"""
|
|
@@ -180,7 +176,6 @@ class GraphGen:
|
|
| 180 |
|
| 181 |
return _add_entities_and_relations
|
| 182 |
|
| 183 |
-
@op("search", deps=["read"])
|
| 184 |
@async_to_sync_method
|
| 185 |
async def search(self, search_config: Dict):
|
| 186 |
logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
|
|
@@ -206,7 +201,6 @@ class GraphGen:
|
|
| 206 |
self.meta_storage.mark_done(self.full_docs_storage)
|
| 207 |
self.meta_storage.index_done_callback()
|
| 208 |
|
| 209 |
-
@op("quiz_and_judge", deps=["build_kg"])
|
| 210 |
@async_to_sync_method
|
| 211 |
async def quiz_and_judge(self, quiz_and_judge_config: Dict):
|
| 212 |
logger.warning(
|
|
@@ -247,7 +241,6 @@ class GraphGen:
|
|
| 247 |
logger.info("Restarting synthesizer LLM client.")
|
| 248 |
self.synthesizer_llm_client.restart()
|
| 249 |
|
| 250 |
-
@op("partition", deps=["build_kg"])
|
| 251 |
@async_to_sync_method
|
| 252 |
async def partition(self, partition_config: Dict):
|
| 253 |
batches = await partition_kg(
|
|
@@ -259,7 +252,6 @@ class GraphGen:
|
|
| 259 |
self.partition_storage.upsert(batches)
|
| 260 |
return batches
|
| 261 |
|
| 262 |
-
@op("extract", deps=["chunk"])
|
| 263 |
@async_to_sync_method
|
| 264 |
async def extract(self, extract_config: Dict):
|
| 265 |
logger.info("Extracting information from given chunks...")
|
|
@@ -279,7 +271,6 @@ class GraphGen:
|
|
| 279 |
self.meta_storage.mark_done(self.chunks_storage)
|
| 280 |
self.meta_storage.index_done_callback()
|
| 281 |
|
| 282 |
-
@op("generate", deps=["partition"])
|
| 283 |
@async_to_sync_method
|
| 284 |
async def generate(self, generate_config: Dict):
|
| 285 |
|
|
|
|
| 6 |
|
| 7 |
from graphgen.bases import BaseLLMWrapper
|
| 8 |
from graphgen.bases.datatypes import Chunk
|
|
|
|
| 9 |
from graphgen.models import (
|
| 10 |
JsonKVStorage,
|
| 11 |
JsonListStorage,
|
|
|
|
| 88 |
# webui
|
| 89 |
self.progress_bar: gr.Progress = progress_bar
|
| 90 |
|
|
|
|
| 91 |
@async_to_sync_method
|
| 92 |
async def read(self, read_config: Dict):
|
| 93 |
"""
|
|
|
|
| 114 |
self.full_docs_storage.upsert(new_docs)
|
| 115 |
self.full_docs_storage.index_done_callback()
|
| 116 |
|
|
|
|
| 117 |
@async_to_sync_method
|
| 118 |
async def chunk(self, chunk_config: Dict):
|
| 119 |
"""
|
|
|
|
| 146 |
self.meta_storage.mark_done(self.full_docs_storage)
|
| 147 |
self.meta_storage.index_done_callback()
|
| 148 |
|
|
|
|
| 149 |
@async_to_sync_method
|
| 150 |
async def build_kg(self):
|
| 151 |
"""
|
|
|
|
| 176 |
|
| 177 |
return _add_entities_and_relations
|
| 178 |
|
|
|
|
| 179 |
@async_to_sync_method
|
| 180 |
async def search(self, search_config: Dict):
|
| 181 |
logger.info("[Search] %s ...", ", ".join(search_config["data_sources"]))
|
|
|
|
| 201 |
self.meta_storage.mark_done(self.full_docs_storage)
|
| 202 |
self.meta_storage.index_done_callback()
|
| 203 |
|
|
|
|
| 204 |
@async_to_sync_method
|
| 205 |
async def quiz_and_judge(self, quiz_and_judge_config: Dict):
|
| 206 |
logger.warning(
|
|
|
|
| 241 |
logger.info("Restarting synthesizer LLM client.")
|
| 242 |
self.synthesizer_llm_client.restart()
|
| 243 |
|
|
|
|
| 244 |
@async_to_sync_method
|
| 245 |
async def partition(self, partition_config: Dict):
|
| 246 |
batches = await partition_kg(
|
|
|
|
| 252 |
self.partition_storage.upsert(batches)
|
| 253 |
return batches
|
| 254 |
|
|
|
|
| 255 |
@async_to_sync_method
|
| 256 |
async def extract(self, extract_config: Dict):
|
| 257 |
logger.info("Extracting information from given chunks...")
|
|
|
|
| 271 |
self.meta_storage.mark_done(self.chunks_storage)
|
| 272 |
self.meta_storage.index_done_callback()
|
| 273 |
|
|
|
|
| 274 |
@async_to_sync_method
|
| 275 |
async def generate(self, generate_config: Dict):
|
| 276 |
|
graphgen/operators/extract/extract_info.py
CHANGED
|
@@ -31,7 +31,7 @@ async def extract_info(
|
|
| 31 |
else:
|
| 32 |
raise ValueError(f"Unsupported extraction method: {method}")
|
| 33 |
|
| 34 |
-
chunks =
|
| 35 |
chunks = [{k: v} for k, v in chunks.items()]
|
| 36 |
logger.info("Start extracting information from %d chunks", len(chunks))
|
| 37 |
|
|
|
|
| 31 |
else:
|
| 32 |
raise ValueError(f"Unsupported extraction method: {method}")
|
| 33 |
|
| 34 |
+
chunks = chunk_storage.get_all()
|
| 35 |
chunks = [{k: v} for k, v in chunks.items()]
|
| 36 |
logger.info("Start extracting information from %d chunks", len(chunks))
|
| 37 |
|
webui/app.py
CHANGED
|
@@ -101,12 +101,15 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 101 |
pipeline = [
|
| 102 |
{
|
| 103 |
"name": "read",
|
|
|
|
| 104 |
"params": {
|
| 105 |
"input_file": params.upload_file,
|
| 106 |
},
|
| 107 |
},
|
| 108 |
{
|
| 109 |
"name": "chunk",
|
|
|
|
|
|
|
| 110 |
"params": {
|
| 111 |
"chunk_size": params.chunk_size,
|
| 112 |
"chunk_overlap": params.chunk_overlap,
|
|
@@ -114,6 +117,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 114 |
},
|
| 115 |
{
|
| 116 |
"name": "build_kg",
|
|
|
|
|
|
|
| 117 |
},
|
| 118 |
]
|
| 119 |
|
|
@@ -121,6 +126,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 121 |
pipeline.append(
|
| 122 |
{
|
| 123 |
"name": "quiz_and_judge",
|
|
|
|
|
|
|
| 124 |
"params": {"quiz_samples": params.quiz_samples, "re_judge": True},
|
| 125 |
}
|
| 126 |
)
|
|
@@ -128,6 +135,7 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 128 |
{
|
| 129 |
"name": "partition",
|
| 130 |
"deps": ["quiz_and_judge"],
|
|
|
|
| 131 |
"params": {
|
| 132 |
"method": params.partition_method,
|
| 133 |
"method_params": partition_params,
|
|
@@ -138,6 +146,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 138 |
pipeline.append(
|
| 139 |
{
|
| 140 |
"name": "partition",
|
|
|
|
|
|
|
| 141 |
"params": {
|
| 142 |
"method": params.partition_method,
|
| 143 |
"method_params": partition_params,
|
|
@@ -147,6 +157,8 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
|
|
| 147 |
pipeline.append(
|
| 148 |
{
|
| 149 |
"name": "generate",
|
|
|
|
|
|
|
| 150 |
"params": {
|
| 151 |
"method": params.mode,
|
| 152 |
"data_format": params.data_format,
|
|
|
|
| 101 |
pipeline = [
|
| 102 |
{
|
| 103 |
"name": "read",
|
| 104 |
+
"op_key": "read",
|
| 105 |
"params": {
|
| 106 |
"input_file": params.upload_file,
|
| 107 |
},
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"name": "chunk",
|
| 111 |
+
"deps": ["read"],
|
| 112 |
+
"op_key": "chunk",
|
| 113 |
"params": {
|
| 114 |
"chunk_size": params.chunk_size,
|
| 115 |
"chunk_overlap": params.chunk_overlap,
|
|
|
|
| 117 |
},
|
| 118 |
{
|
| 119 |
"name": "build_kg",
|
| 120 |
+
"deps": ["chunk"],
|
| 121 |
+
"op_key": "build_kg",
|
| 122 |
},
|
| 123 |
]
|
| 124 |
|
|
|
|
| 126 |
pipeline.append(
|
| 127 |
{
|
| 128 |
"name": "quiz_and_judge",
|
| 129 |
+
"deps": ["build_kg"],
|
| 130 |
+
"op_key": "quiz_and_judge",
|
| 131 |
"params": {"quiz_samples": params.quiz_samples, "re_judge": True},
|
| 132 |
}
|
| 133 |
)
|
|
|
|
| 135 |
{
|
| 136 |
"name": "partition",
|
| 137 |
"deps": ["quiz_and_judge"],
|
| 138 |
+
"op_key": "partition",
|
| 139 |
"params": {
|
| 140 |
"method": params.partition_method,
|
| 141 |
"method_params": partition_params,
|
|
|
|
| 146 |
pipeline.append(
|
| 147 |
{
|
| 148 |
"name": "partition",
|
| 149 |
+
"deps": ["build_kg"],
|
| 150 |
+
"op_key": "partition",
|
| 151 |
"params": {
|
| 152 |
"method": params.partition_method,
|
| 153 |
"method_params": partition_params,
|
|
|
|
| 157 |
pipeline.append(
|
| 158 |
{
|
| 159 |
"name": "generate",
|
| 160 |
+
"deps": ["partition"],
|
| 161 |
+
"op_key": "generate",
|
| 162 |
"params": {
|
| 163 |
"method": params.mode,
|
| 164 |
"data_format": params.data_format,
|