Commit 2a0edfe by github-actions[bot] (parent: 799ac7c)

Auto-sync from demo at Thu Oct 16 11:36:22 UTC 2025

app.py CHANGED
@@ -74,11 +74,32 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
     def sum_tokens(client):
         return sum(u["total_tokens"] for u in client.token_usage)
 
+    method = params.partition_method
+    if method == "dfs":
+        partition_params = {
+            "max_units_per_community": params.dfs_max_units,
+        }
+    elif method == "bfs":
+        partition_params = {
+            "max_units_per_community": params.bfs_max_units,
+        }
+    elif method == "leiden":
+        partition_params = {
+            "max_size": params.leiden_max_size,
+            "use_lcc": params.leiden_use_lcc,
+            "random_seed": params.leiden_random_seed,
+        }
+    else:  # ece
+        partition_params = {
+            "max_units_per_community": params.ece_max_units,
+            "min_units_per_community": params.ece_min_units,
+            "max_tokens_per_community": params.ece_max_tokens,
+            "unit_sampling": params.ece_unit_sampling,
+        }
+
     config = {
         "if_trainee_model": params.if_trainee_model,
-        "read": {
-            "input_file": params.input_file,
-        },
+        "read": {"input_file": params.upload_file},
         "split": {
             "chunk_size": params.chunk_size,
             "chunk_overlap": params.chunk_overlap,
@@ -89,21 +110,12 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
             "quiz_samples": params.quiz_samples,
         },
         "partition": {
-            "method": "ece",
-            "method_params": {
-                "bidirectional": params.bidirectional,
-                "expand_method": params.expand_method,
-                "max_extra_edges": params.max_extra_edges,
-                "max_tokens": params.max_tokens,
-                "max_depth": params.max_depth,
-                "edge_sampling": params.edge_sampling,
-                "isolated_node_strategy": params.isolated_node_strategy,
-                "loss_strategy": params.loss_strategy,
-            },
+            "method": params.partition_method,
+            "method_params": partition_params,
         },
         "generate": {
-            "mode": params.output_data_type,
-            "data_format": params.output_data_format,
+            "mode": params.mode,
+            "data_format": params.data_format,
         },
     }
 
@@ -141,10 +153,7 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
     graph_gen.insert(read_config=config["read"], split_config=config["split"])
 
     if config["if_trainee_model"]:
-        # Quiz and Judge
         graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
-    else:
-        config["partition"]["method_params"]["edge_sampling"] = "random"
 
     graph_gen.generate(
         partition_config=config["partition"],
@@ -245,13 +254,7 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
     ):
         lang_btn.render()
 
-    gr.Markdown(
-        value="# "
-        + _("Title")
-        + "\n\n"
-        + "### [GraphGen](https://github.com/open-sciencelab/GraphGen) "
-        + _("Intro")
-    )
+    gr.Markdown(value=_("Title") + _("Intro"))
 
     if_trainee_model = gr.Checkbox(
         label=_("Use Trainee Model"), value=False, interactive=True
@@ -295,106 +298,13 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
             visible=if_trainee_model.value is True,
         )
 
-        with gr.Accordion(label=_("Generation Config"), open=False):
-            chunk_size = gr.Slider(
-                label="Chunk Size",
-                minimum=256,
-                maximum=4096,
-                value=1024,
-                step=256,
-                interactive=True,
-            )
-            chunk_overlap = gr.Slider(
-                label="Chunk Overlap",
-                minimum=0,
-                maximum=500,
-                value=100,
-                step=100,
-                interactive=True,
-            )
-            output_data_type = gr.Radio(
-                choices=["atomic", "multi_hop", "aggregated"],
-                label="Output Data Type",
-                value="aggregated",
-                interactive=True,
-            )
-            output_data_format = gr.Radio(
-                choices=["Alpaca", "Sharegpt", "ChatML"],
-                label="Output Data Format",
-                value="Alpaca",
-                interactive=True,
-            )
-            quiz_samples = gr.Number(
-                label="Quiz Samples",
-                value=2,
-                minimum=1,
-                interactive=True,
-                visible=if_trainee_model.value is True,
-            )
-            bidirectional = gr.Checkbox(
-                label="Bidirectional", value=True, interactive=True
-            )
-
-            expand_method = gr.Radio(
-                choices=["max_width", "max_tokens"],
-                label="Expand Method",
-                value="max_tokens",
-                interactive=True,
-            )
-            max_extra_edges = gr.Slider(
-                minimum=1,
-                maximum=10,
-                value=5,
-                label="Max Extra Edges",
-                step=1,
-                interactive=True,
-                visible=expand_method.value == "max_width",
-            )
-            max_tokens = gr.Slider(
-                minimum=64,
-                maximum=1024,
-                value=256,
-                label="Max Tokens",
-                step=64,
-                interactive=True,
-                visible=(expand_method.value != "max_width"),
-            )
-
-            max_depth = gr.Slider(
-                minimum=1,
-                maximum=5,
-                value=2,
-                label="Max Depth",
-                step=1,
-                interactive=True,
-            )
-            edge_sampling = gr.Radio(
-                choices=["max_loss", "min_loss", "random"],
-                label="Edge Sampling",
-                value="max_loss",
-                interactive=True,
-                visible=if_trainee_model.value is True,
-            )
-            isolated_node_strategy = gr.Radio(
-                choices=["add", "ignore"],
-                label="Isolated Node Strategy",
-                value="ignore",
-                interactive=True,
-            )
-            loss_strategy = gr.Radio(
-                choices=["only_edge", "both"],
-                label="Loss Strategy",
-                value="only_edge",
-                interactive=True,
-            )
-
         with gr.Row(equal_height=True):
             with gr.Column(scale=3):
                 api_key = gr.Textbox(
                     label=_("SiliconFlow Token"),
                     type="password",
                     value="",
-                    info="https://cloud.siliconflow.cn/account/ak",
+                    info=_("SiliconFlow Token Info"),
                 )
             with gr.Column(scale=1):
                 test_connection_btn = gr.Button(_("Test Connection"))
@@ -437,6 +347,177 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
                 elem_id="preview_df",
             )
 
+        with gr.Accordion(label=_("Split Config"), open=False):
+            gr.Markdown(value=_("Split Config Info"))
+            with gr.Row(equal_height=True):
+                with gr.Column(scale=1):
+                    chunk_size = gr.Slider(
+                        label=_("Chunk Size"),
+                        minimum=256,
+                        maximum=4096,
+                        value=1024,
+                        step=256,
+                        interactive=True,
+                        info=_("Chunk Size Info"),
+                    )
+                with gr.Column(scale=1):
+                    chunk_overlap = gr.Slider(
+                        label=_("Chunk Overlap"),
+                        minimum=0,
+                        maximum=500,
+                        value=100,
+                        step=100,
+                        interactive=True,
+                        info=_("Chunk Overlap Info"),
+                    )
+
+        with gr.Accordion(
+            label=_("Quiz & Judge Config"), open=False, visible=False
+        ) as quiz_accordion:
+            gr.Markdown(value=_("Quiz & Judge Config Info"))
+            quiz_samples = gr.Number(
+                label=_("Quiz Samples"),
+                value=2,
+                minimum=1,
+                interactive=True,
+                info=_("Quiz Samples Info"),
+            )
+
+        with gr.Accordion(label=_("Partition Config"), open=False):
+            gr.Markdown(value=_("Partition Config Info"))
+
+            partition_method = gr.Dropdown(
+                label=_("Partition Method"),
+                choices=["dfs", "bfs", "ece", "leiden"],
+                value="ece",
+                interactive=True,
+                info=_("Which algorithm to use for graph partitioning."),
+            )
+
+            # DFS method parameters
+            with gr.Group(visible=False) as dfs_group:
+                gr.Markdown(_("DFS intro"))
+                dfs_max_units = gr.Slider(
+                    label=_("Max Units Per Community"),
+                    minimum=1,
+                    maximum=100,
+                    value=5,
+                    step=1,
+                    interactive=True,
+                    info=_("Max Units Per Community Info"),
+                )
+            # BFS method parameters
+            with gr.Group(visible=False) as bfs_group:
+                gr.Markdown(_("BFS intro"))
+                bfs_max_units = gr.Slider(
+                    label=_("Max Units Per Community"),
+                    minimum=1,
+                    maximum=100,
+                    value=5,
+                    step=1,
+                    interactive=True,
+                    info=_("Max Units Per Community Info"),
+                )
+
+            # Leiden method parameters
+            with gr.Group(visible=False) as leiden_group:
+                gr.Markdown(_("Leiden intro"))
+                leiden_max_size = gr.Slider(
+                    label=_("Maximum Size of Communities"),
+                    minimum=1,
+                    maximum=100,
+                    value=20,
+                    step=1,
+                    interactive=True,
+                    info=_("Maximum Size of Communities Info"),
+                )
+                leiden_use_lcc = gr.Checkbox(
+                    label=_("Use Largest Connected Component"),
+                    value=False,
+                    interactive=True,
+                    info=_("Use Largest Connected Component Info"),
+                )
+                leiden_random_seed = gr.Number(
+                    label=_("Random Seed"),
+                    value=42,
+                    precision=0,
+                    interactive=True,
+                    info=_("Random Seed Info"),
+                )
+
+            # ECE method parameters
+            with gr.Group(visible=True) as ece_group:
+                gr.Markdown(_("ECE intro"))
+                ece_max_units = gr.Slider(
+                    label=_("Max Units Per Community"),
+                    minimum=1,
+                    maximum=100,
+                    value=20,
+                    step=1,
+                    interactive=True,
+                    info=_("Max Units Per Community Info"),
+                )
+                ece_min_units = gr.Slider(
+                    label=_("Min Units Per Community"),
+                    minimum=1,
+                    maximum=100,
+                    value=3,
+                    step=1,
+                    interactive=True,
+                    info=_("Min Units Per Community Info"),
+                )
+                ece_max_tokens = gr.Slider(
+                    label=_("Max Tokens Per Community"),
+                    minimum=512,
+                    maximum=20_480,
+                    value=10_240,
+                    step=512,
+                    interactive=True,
+                    info=_("Max Tokens Per Community Info"),
+                )
+                ece_unit_sampling = gr.Radio(
+                    label=_("Unit Sampling Strategy"),
+                    choices=["random", "max_loss", "min_loss"],
+                    value="random",
+                    interactive=True,
+                    info=_("Unit Sampling Strategy Info"),
+                )
+
+            def toggle_partition_params(method):
+                dfs = method == "dfs"
+                bfs = method == "bfs"
+                leiden = method == "leiden"
+                ece = method == "ece"
+                return (
+                    gr.update(visible=dfs),  # dfs_group
+                    gr.update(visible=bfs),  # bfs_group
+                    gr.update(visible=leiden),  # leiden_group
+                    gr.update(visible=ece),  # ece_group
+                )
+
+            partition_method.change(
+                fn=toggle_partition_params,
+                inputs=partition_method,
+                outputs=[dfs_group, bfs_group, leiden_group, ece_group],
+            )
+
+        with gr.Accordion(label=_("Generation Config"), open=False):
+            gr.Markdown(value=_("Generation Config Info"))
+            mode = gr.Radio(
+                choices=["atomic", "multi_hop", "aggregated", "CoT"],
+                label=_("Mode"),
+                value="aggregated",
+                interactive=True,
+                info=_("Mode Info"),
+            )
+            data_format = gr.Radio(
+                choices=["Alpaca", "Sharegpt", "ChatML"],
+                label=_("Output Data Format"),
+                value="Alpaca",
+                interactive=True,
+                info=_("Output Data Format Info"),
+            )
+
     with gr.Blocks():
         token_counter = gr.DataFrame(
             label="Token Stats",
@@ -468,7 +549,7 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
             label="TPM",
             minimum=5000,
             maximum=5000000,
-            value=100000,
+            value=50000,
             step=1000,
             interactive=True,
             visible=True,
@@ -498,24 +579,14 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
        outputs=[],
    )
 
-    expand_method.change(
-        lambda method: (
-            gr.update(visible=method == "max_width"),
-            gr.update(visible=method != "max_width"),
-        ),
-        inputs=expand_method,
-        outputs=[max_extra_edges, max_tokens],
-    )
-
    if_trainee_model.change(
-        lambda use_trainee: [gr.update(visible=use_trainee)] * 5,
+        lambda use_trainee: [gr.update(visible=use_trainee)] * 4,
        inputs=if_trainee_model,
        outputs=[
            trainee_url,
            trainee_model,
-            quiz_samples,
-            edge_sampling,
            trainee_api_key,
+            quiz_accordion,
        ],
    )
 
@@ -538,59 +609,35 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
 
    submit_btn.click(
        lambda *args: run_graphgen(
-            WebuiParams(
-                if_trainee_model=args[0],
-                input_file=args[1],
-                tokenizer=args[2],
-                output_data_type=args[3],
-                output_data_format=args[4],
-                bidirectional=args[5],
-                expand_method=args[6],
-                max_extra_edges=args[7],
-                max_tokens=args[8],
-                max_depth=args[9],
-                edge_sampling=args[10],
-                isolated_node_strategy=args[11],
-                loss_strategy=args[12],
-                synthesizer_url=args[13],
-                synthesizer_model=args[14],
-                trainee_model=args[15],
-                api_key=args[16],
-                chunk_size=args[17],
-                chunk_overlap=args[18],
-                rpm=args[19],
-                tpm=args[20],
-                quiz_samples=args[21],
-                trainee_url=args[22],
-                trainee_api_key=args[23],
-                token_counter=args[24],
-            )
+            WebuiParams(**dict(zip(WebuiParams.__annotations__, args)))
        ),
        inputs=[
            if_trainee_model,
            upload_file,
            tokenizer,
-            output_data_type,
-            output_data_format,
-            bidirectional,
-            expand_method,
-            max_extra_edges,
-            max_tokens,
-            max_depth,
-            edge_sampling,
-            isolated_node_strategy,
-            loss_strategy,
-            synthesizer_url,
            synthesizer_model,
+            synthesizer_url,
            trainee_model,
+            trainee_url,
            api_key,
+            trainee_api_key,
            chunk_size,
            chunk_overlap,
+            quiz_samples,
+            partition_method,
+            dfs_max_units,
+            bfs_max_units,
+            leiden_max_size,
+            leiden_use_lcc,
+            leiden_random_seed,
+            ece_max_units,
+            ece_min_units,
+            ece_max_tokens,
+            ece_unit_sampling,
+            mode,
+            data_format,
            rpm,
            tpm,
-            quiz_samples,
-            trainee_url,
-            trainee_api_key,
            token_counter,
        ],
        outputs=[output, token_counter],
@@ -599,4 +646,4 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
 
 if __name__ == "__main__":
     demo.queue(api_open=False, default_concurrency_limit=2)
-    demo.launch(server_name="0.0.0.0")
+    demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
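
The new partition UI above hinges on one Gradio pattern: a Dropdown `.change` handler returns one `gr.update(visible=...)` per parameter group, in the same order as the `outputs` list, so exactly one of the four groups is shown at a time. A minimal, self-contained sketch of that pattern (component names here are illustrative, not the demo's):

import gradio as gr

METHODS = ["dfs", "bfs", "ece", "leiden"]

with gr.Blocks() as sketch:
    method = gr.Dropdown(choices=METHODS, value="ece", label="Partition Method")
    groups = {}
    for name in METHODS:
        # One gr.Group per method; only the default ("ece") starts visible.
        with gr.Group(visible=(name == "ece")) as grp:
            gr.Markdown(f"Parameters for {name}")
        groups[name] = grp

    def toggle(selected):
        # One gr.update per output component, in the same order as `outputs`.
        return [gr.update(visible=(name == selected)) for name in METHODS]

    method.change(fn=toggle, inputs=method, outputs=[groups[n] for n in METHODS])

if __name__ == "__main__":
    sketch.launch()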
graphgen/configs/aggregated_config.yaml CHANGED
@@ -16,7 +16,7 @@ partition: # graph partition configuration
   max_units_per_community: 20 # max nodes and edges per community
   min_units_per_community: 5 # min nodes and edges per community
   max_tokens_per_community: 10240 # max tokens per community
-  unit_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+  unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
 generate:
   mode: aggregated # atomic, aggregated, multi_hop, cot
   data_format: ChatML # Alpaca, Sharegpt, ChatML
graphgen/configs/multi_hop_config.yaml CHANGED
@@ -16,7 +16,7 @@ partition: # graph partition configuration
   max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
   min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
   max_tokens_per_community: 10240 # max tokens per community
-  unit_sampling: random # edge sampling strategy, support: random, max_loss, min_loss
+  unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
 generate:
   mode: multi_hop # strategy for generating multi-hop QA pairs
   data_format: ChatML # Alpaca, Sharegpt, ChatML
graphgen/graphgen.py CHANGED
@@ -237,7 +237,10 @@ class GraphGen:
 
         # Step 2: generate QA pairs
         results = await generate_qas(
-            self.synthesizer_llm_client, batches, generate_config
+            self.synthesizer_llm_client,
+            batches,
+            generate_config,
+            progress_bar=self.progress_bar,
         )
 
         if not results:
graphgen/models/kg_builder/light_rag_kg_builder.py CHANGED
@@ -42,7 +42,7 @@ class LightRAGKGBuilder(BaseKGBuilder):
 
         # step 2: initial glean
         final_result = await self.llm_client.generate_answer(hint_prompt)
-        logger.debug("First extraction result: %s", final_result)
+        logger.info("First extraction result: %s", final_result)
 
         # step3: iterative refinement
         history = pack_history_conversations(hint_prompt, final_result)
@@ -57,7 +57,7 @@ class LightRAGKGBuilder(BaseKGBuilder):
         glean_result = await self.llm_client.generate_answer(
             text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history
         )
-        logger.debug("Loop %s glean: %s", loop_idx + 1, glean_result)
+        logger.info("Loop %s glean: %s", loop_idx + 1, glean_result)
 
         history += pack_history_conversations(
             KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result
graphgen/models/partitioner/ece_partitioner.py CHANGED
@@ -17,8 +17,8 @@ EDGE_UNIT: str = "e"
 class ECEPartitioner(BFSPartitioner):
     """
     ECE partitioner that partitions the graph into communities based on Expected Calibration Error (ECE).
-    We calculate ECE for edges in KG (represented as 'comprehension loss')
-    and group edges with similar ECE values into the same community.
+    We calculate ECE for units in KG (represented as 'comprehension loss')
+    and group units with similar ECE values into the same community.
     1. Select a sampling strategy.
     2. Choose a unit based on the sampling strategy.
     2. Expand the community using BFS.
graphgen/operators/generate/generate_qas.py CHANGED
@@ -18,12 +18,14 @@ async def generate_qas(
         ]
     ],
     generation_config: dict,
+    progress_bar=None,
 ) -> list[dict[str, Any]]:
     """
     Generate question-answer pairs based on nodes and edges.
     :param llm_client: LLM client
     :param batches
     :param generation_config
+    :param progress_bar
     :return: QA pairs
     """
     mode = generation_config["mode"]
@@ -45,6 +47,7 @@ async def generate_qas(
         batches,
         desc="[4/4]Generating QAs",
         unit="batch",
+        progress_bar=progress_bar,
     )
 
     # format
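
Together with the graphgen.py hunk above, this threads an optional progress callback down the call chain (GraphGen → generate_qas → run_concurrent); the `progress_bar=None` default keeps the library usable outside the web UI. A sketch of the same optional-callback pattern (names below are illustrative, not the project's actual API):

import asyncio
from typing import Callable, Optional

ProgressFn = Optional[Callable[[float], None]]

async def worker(i: int) -> int:
    await asyncio.sleep(0.01)
    return i * i

async def generate_all(n: int, progress_bar: ProgressFn = None) -> list:
    results = []
    for i in range(n):
        results.append(await worker(i))
        if progress_bar is not None:  # only report when a callback was passed
            progress_bar((i + 1) / n)
    return results

if __name__ == "__main__":
    # CLI usage: no progress callback at all.
    print(asyncio.run(generate_all(5)))
    # Web-UI usage: pass gr.Progress() (or any callable) through the chain.
    print(asyncio.run(generate_all(5, progress_bar=lambda p: print(f"{p:.0%}"))))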
graphgen/utils/run_concurrent.py CHANGED
@@ -10,6 +10,77 @@ T = TypeVar("T")
 R = TypeVar("R")
 
 
+# async def run_concurrent(
+#     coro_fn: Callable[[T], Awaitable[R]],
+#     items: List[T],
+#     *,
+#     desc: str = "processing",
+#     unit: str = "item",
+#     progress_bar: Optional[gr.Progress] = None,
+# ) -> List[R]:
+#     tasks = [asyncio.create_task(coro_fn(it)) for it in items]
+#
+#     results = []
+#     async for future in tqdm_async(
+#         tasks, desc=desc, unit=unit
+#     ):
+#         try:
+#             result = await future
+#             results.append(result)
+#         except Exception as e:  # pylint: disable=broad-except
+#             logger.exception("Task failed: %s", e)
+#
+#         if progress_bar is not None:
+#             progress_bar((len(results)) / len(items), desc=desc)
+#
+#     if progress_bar is not None:
+#         progress_bar(1.0, desc=desc)
+#     return results
+
+# results = await tqdm_async.gather(*tasks, desc=desc, unit=unit)
+#
+# ok_results = []
+# for idx, res in enumerate(results):
+#     if isinstance(res, Exception):
+#         logger.exception("Task failed: %s", res)
+#         if progress_bar:
+#             progress_bar((idx + 1) / len(items), desc=desc)
+#         continue
+#     ok_results.append(res)
+#     if progress_bar:
+#         progress_bar((idx + 1) / len(items), desc=desc)
+#
+# if progress_bar:
+#     progress_bar(1.0, desc=desc)
+# return ok_results
+
+# async def run_concurrent(
+#     coro_fn: Callable[[T], Awaitable[R]],
+#     items: List[T],
+#     *,
+#     desc: str = "processing",
+#     unit: str = "item",
+#     progress_bar: Optional[gr.Progress] = None,
+# ) -> List[R]:
+#     tasks = [asyncio.create_task(coro_fn(it)) for it in items]
+#
+#     results = []
+#     # Update the progress bar synchronously to avoid async conflicts
+#     for i, task in enumerate(asyncio.as_completed(tasks)):
+#         try:
+#             result = await task
+#             results.append(result)
+#             # Update the progress bar synchronously
+#             if progress_bar is not None:
+#                 # Update progress in a synchronous context
+#                 progress_bar((i + 1) / len(items), desc=desc)
+#         except Exception as e:
+#             logger.exception("Task failed: %s", e)
+#             results.append(e)
+#
+#     return results
+
+
 async def run_concurrent(
     coro_fn: Callable[[T], Awaitable[R]],
     items: List[T],
@@ -20,19 +91,36 @@ async def run_concurrent(
 ) -> List[R]:
     tasks = [asyncio.create_task(coro_fn(it)) for it in items]
 
-    results = await tqdm_async.gather(*tasks, desc=desc, unit=unit)
-
-    ok_results = []
-    for idx, res in enumerate(results):
-        if isinstance(res, Exception):
-            logger.exception("Task failed: %s", res)
-            if progress_bar:
-                progress_bar((idx + 1) / len(items), desc=desc)
-            continue
-        ok_results.append(res)
-        if progress_bar:
-            progress_bar((idx + 1) / len(items), desc=desc)
-
-    if progress_bar:
-        progress_bar(1.0, desc=desc)
-    return ok_results
+    completed_count = 0
+    results = []
+
+    pbar = tqdm_async(total=len(items), desc=desc, unit=unit)
+
+    if progress_bar is not None:
+        progress_bar(0.0, desc=f"{desc} (0/{len(items)})")
+
+    for future in asyncio.as_completed(tasks):
+        try:
+            result = await future
+            results.append(result)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.exception("Task failed: %s", e)
+            # even if failed, record it to keep results consistent with tasks
+            results.append(e)
+
+        completed_count += 1
+        pbar.update(1)
+
+        if progress_bar is not None:
+            progress = completed_count / len(items)
+            progress_bar(progress, desc=f"{desc} ({completed_count}/{len(items)})")
+
+    pbar.close()
+
+    if progress_bar is not None:
+        progress_bar(1.0, desc=f"{desc} (completed)")
+
+    # filter out exceptions
+    results = [res for res in results if not isinstance(res, Exception)]
+
+    return results
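
The rewrite above swaps `tqdm_async.gather` for an `asyncio.as_completed` loop so both the terminal bar and the optional Gradio bar can tick as each task finishes; note that this means results arrive in completion order, not submission order. A minimal standalone demonstration of that trade-off:

import asyncio
import random

async def job(name: str) -> str:
    await asyncio.sleep(random.random())
    return name

async def main() -> None:
    tasks = [asyncio.create_task(job(n)) for n in ["a", "b", "c", "d"]]
    done = 0
    for future in asyncio.as_completed(tasks):
        result = await future  # yields whichever task finishes next
        done += 1
        print(f"[{done}/{len(tasks)}] finished: {result}")  # completion order

asyncio.run(main())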
webui/app.py CHANGED

(Identical to the app.py diff above; the root app.py is an auto-synced copy of this file.)
webui/base.py CHANGED
@@ -9,27 +9,29 @@ class WebuiParams:
     """
 
     if_trainee_model: bool
-    input_file: str
+    upload_file: Any  # gr.File
     tokenizer: str
-    output_data_type: str
-    output_data_format: str
-    bidirectional: bool
-    expand_method: str
-    max_extra_edges: int
-    max_tokens: int
-    max_depth: int
-    edge_sampling: str
-    isolated_node_strategy: str
-    loss_strategy: str
-    synthesizer_url: str
     synthesizer_model: str
+    synthesizer_url: str
     trainee_model: str
+    trainee_url: str
     api_key: str
+    trainee_api_key: str
     chunk_size: int
     chunk_overlap: int
+    quiz_samples: int
+    partition_method: str
+    dfs_max_units: int
+    bfs_max_units: int
+    leiden_max_size: int
+    leiden_use_lcc: bool
+    leiden_random_seed: int
+    ece_max_units: int
+    ece_min_units: int
+    ece_max_tokens: int
+    ece_unit_sampling: str
+    mode: str
+    data_format: str
     rpm: int
     tpm: int
-    quiz_samples: int
-    trainee_url: str
-    trainee_api_key: str
     token_counter: Any
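
This reordering is not cosmetic: `submit_btn.click` now builds the params object via `WebuiParams(**dict(zip(WebuiParams.__annotations__, args)))`, so the field declaration order here must match the `inputs=[...]` order in app.py exactly; if the two drift apart, values are silently assigned to the wrong fields. A sketch of the mapping trick (the tiny dataclass is a stand-in, not the real WebuiParams):

from dataclasses import dataclass

@dataclass
class Params:  # stand-in for WebuiParams
    if_trainee_model: bool
    tokenizer: str
    chunk_size: int

# __annotations__ preserves declaration order, so positional args must be
# supplied in exactly the order the fields are declared.
args = (True, "cl100k_base", 1024)  # same order as the Gradio inputs list
params = Params(**dict(zip(Params.__annotations__, args)))
print(params)  # Params(if_trainee_model=True, tokenizer='cl100k_base', chunk_size=1024)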
webui/translation.json CHANGED
@@ -1,32 +1,62 @@
 {
   "en": {
-    "Title": "✨Easy-to-use LLM Training Data Generation Framework",
-    "\n\n": "\n\n",
-    "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ",
-    "Intro": "is a framework for synthetic data generation guided by knowledge graphs, designed to tackle challenges for knowledge-intensive QA generation. \n\nBy uploading your text chunks (such as knowledge in agriculture, healthcare, or marine science) and filling in the LLM API key, you can generate the training data required by **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)** and **[xtuner](https://github.com/InternLM/xtuner)** online. We will automatically delete user information after completion.",
-    "# ": "# ",
-    "Use Trainee Model": "Use Trainee Model to identify knowledge blind spots, please keep disable for SiliconCloud",
+    "Title": "# ✨Easy-to-use LLM Training Data Generation Framework✨\n\n",
+    "Intro": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) is a framework for synthetic data generation guided by knowledge graphs, designed to tackle challenges for knowledge-intensive QA generation. \n\nBy uploading your text chunks (such as knowledge in agriculture, healthcare, or marine science) and filling in the LLM API key, you can generate the training data required by **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)** and **[xtuner](https://github.com/InternLM/xtuner)** online. We will automatically delete user information after completion.",
+    "Use Trainee Model": "Use Trainee Model to identify knowledge blind spots, please keep disabled for SiliconCloud",
     "Synthesizer URL Info": "Base URL for the Synthesizer Model API, use SiliconFlow as default",
     "Synthesizer Model Info": "Model for constructing KGs and generating QAs",
     "Trainee URL Info": "Base URL for the Trainee Model API, use SiliconFlow as default",
     "Trainee Model Info": "Model for training",
     "SiliconFlow Token for Trainee Model": "SiliconFlow API Key for Trainee Model",
     "Model Config": "Model Configuration",
-    "Generation Config": "Generation Config",
-    "API Config": "API Config",
-    "### ": "### ",
+    "SiliconFlow Token Info": "Get SiliconFlow API Key at \"https://cloud.siliconflow.cn/account/ak\", efficiently and stably use LLM interfaces",
     "SiliconFlow Token": "SiliconFlow API Key",
     "Upload File": "Upload File",
     "Example Files": "Example Files",
-    "Output File": "Output File",
-    "File Preview": "File Preview"
+    "File Preview": "File Preview",
+    "Split Config Info": "If the input text is a long text without chunks, the system will split the text into appropriate paragraphs based on the following parameters.",
+    "Chunk Size Info": "Split the long text according to this value. Too short will lead to incomplete knowledge, and too long will lead to LLM input being too long",
+    "Chunk Size": "chunk_size(Chunk Size)",
+    "Chunk Overlap Info": "The overlapping part between two adjacent chunks, which can help maintain context continuity",
+    "Chunk Overlap": "chunk_overlap(Chunk Overlap)",
+    "Split Config": "Split Config",
+    "Quiz & Judge Config Info": "Synthesizer Model generates quiz questions based on each knowledge unit in the knowledge graph to assess the Trainee Model's understanding of the knowledge and obtain comprehension loss.",
+    "Quiz Samples Info": "Configure how many quiz questions to generate for each knowledge unit",
+    "Quiz Samples": "quiz_samples(Quiz Samples)",
+    "Quiz & Judge Config": "Quiz & Judge Config",
+    "Partition Config Info": "Partition the knowledge graph into multiple communities (subgraphs), each community is the smallest unit for generating QAs. Appropriate partitioning methods can improve relevance and diversity.",
+    "Which algorithm to use for graph partitioning.": "Which algorithm to use for graph partitioning.",
+    "Partition Method": "method(Partition Method)",
+    "DFS intro": "The DFS partitioning method uses a depth-first search algorithm to traverse the knowledge graph, starting from one unit and exploring as deeply as possible along connected units until a preset community size is reached or there are no more unvisited units. It then starts a new community from another unvisited unit, repeating this process until all units are assigned to communities.",
+    "Max Units Per Community Info": "The maximum number of knowledge units (nodes) allowed in each community. If a community exceeds this limit, it will be further partitioned. A unit refers to a node in the knowledge graph, which can be an entity or a relation.",
+    "Max Units Per Community": "max_units_per_community(Max Units Per Community)",
+    "BFS intro": "The BFS partitioning method uses a breadth-first search algorithm to traverse the knowledge graph, starting from one unit and exploring all its neighboring units before moving on to the neighbors' neighbors. This process continues until a preset community size is reached or there are no more unvisited units. It then starts a new community from another unvisited unit, repeating this process until all units are assigned to communities.",
+    "Leiden intro": "The Leiden partitioning method is a community detection algorithm based on modularity optimization, designed to identify tightly connected subgraphs within a graph. The algorithm iteratively optimizes the assignment of nodes to communities, maximizing the density of connections within communities while minimizing connections between communities. The Leiden algorithm can effectively handle large-scale graph data and typically produces higher-quality community partitions compared to other community detection algorithms, such as the Louvain algorithm.",
+    "Maximum Size of Communities Info": "The maximum number of nodes allowed in a community. If a community exceeds this limit, it will be further partitioned.",
+    "Maximum Size of Communities": "max_size(Maximum Size of Communities)",
+    "Use Largest Connected Component Info": "The largest connected component refers to the largest subset of nodes in a graph where there is a path connecting any two nodes. When this option is enabled, the partitioning algorithm will only consider the largest connected component of the knowledge graph for community partitioning, ignoring other smaller connected components. This helps ensure that the generated communities have higher connectivity and relevance.",
+    "Use Largest Connected Component": "use_lcc(Use Largest Connected Component)",
+    "Random Seed Info": "The random seed changes the initial state of the graph partitioning, thereby affecting the partitioning results. By setting different random seeds, different community partitioning schemes can be generated, which helps improve the diversity of generated QAs.",
+    "Random Seed": "random_seed(Random Seed)",
+    "ECE intro": "ECE is an original graph partitioning method based on the principle of model calibration. It evaluates the performance of each unit under the current model by computing its calibration error (referred to as the comprehension loss) and partitions the graph according to this comprehension error.",
+    "Min Units Per Community Info": "Limit the minimum number of nodes allowed in each community. If a community has fewer nodes than this limit, it will be discarded.",
+    "Min Units Per Community": "min_units_per_community(Min Units Per Community)",
+    "Max Tokens Per Community Info": "The maximum number of tokens allowed in each community. If a community exceeds this limit, it will be further partitioned.",
+    "Max Tokens Per Community": "max_tokens_per_community(Max Tokens Per Community)",
+    "Unit Sampling Strategy Info": "Unit sampling strategy determines how to select units from candidate units when constructing communities. Unit sampling strategies include random, max_loss, and min_loss. random means selecting units randomly, max_loss means prioritizing units with higher comprehension loss, and min_loss means prioritizing units with lower comprehension loss.\n\n(Note: Only when the Trainee Model is activated and evaluated will there be comprehension loss, allowing the use of max_loss and min_loss strategies; otherwise, only the random strategy can be used.)",
+    "Unit Sampling Strategy": "unit_sampling(Unit Sampling Strategy)",
+    "Partition Config": "Knowledge Graph Partition Config",
+    "Generation Config Info": "Generation configuration includes generation mode and output data format.",
+    "Mode Info": "Includes various generation modes such as atomic, aggregated, multi-hop, and chain-of-thought, suitable for tasks of different complexity.",
+    "Mode": "mode(Mode)",
+    "Output Data Format Info": "Includes various output formats such as Alpaca, Sharegpt, and ChatML.",
+    "Output Data Format": "data_format(Output Data Format)",
+    "Generation Config": "Generation Config",
+    "Output File": "Output File"
   },
   "zh": {
-    "Title": "✨开箱即用的LLM训练数据生成框架✨",
-    "\n\n": "\n\n",
-    "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ",
-    "Intro": "是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答。\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。",
-    "# ": "# ",
+    "Title": "# ✨开箱即用的LLM训练数据生成框架✨\n\n",
+    "Intro": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) 是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答。\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。",
     "Use Trainee Model": "使用Trainee Model来识别知识盲区,使用硅基流动时请保持禁用",
     "Synthesizer URL Info": "调用合成模型API的URL,默认使用硅基流动",
     "Synthesizer Model Info": "用于构建知识图谱和生成问答的模型",
@@ -34,13 +64,49 @@
     "Trainee Model Info": "用于训练的模型",
     "SiliconFlow Token for Trainee Model": "SiliconFlow Token for Trainee Model",
     "Model Config": "模型配置",
-    "Generation Config": "生成配置",
-    "API Config": "API Config",
-    "### ": "### ",
+    "SiliconFlow Token Info": "在 \"https://cloud.siliconflow.cn/account/ak\" 获取硅基流动 API 秘钥, 使用高效稳定的 LLM 接口",
     "SiliconFlow Token": "硅基流动 API 秘钥",
     "Upload File": "上传文件",
     "Example Files": "示例文件",
-    "Output File": "输出文件",
-    "File Preview": "文件预览"
+    "File Preview": "文件预览",
+    "Split Config Info": "如果输入文本是未分块的长文本,系统会根据以下参数将文本分成合适的段落。",
+    "Chunk Size Info": "按照该值将分割长文本,太短会导致知识不完整,太长会导致 LLM 输入过长",
+    "Chunk Size": "chunk_size(分割大小)",
+    "Chunk Overlap Info": "两个相邻块之间的重叠部分,有助于保持上下文的连续性",
+    "Chunk Overlap": "chunk_overlap(分割重叠大小)",
+    "Split Config": "文本分割配置",
+    "Quiz & Judge Config Info": "合成模型根据知识图谱中的每个知识单元,生成判断题,用于评估学生模型对知识的理解程度,得到理解误差。",
+    "Quiz Samples Info": "配置每个知识单元生成多少判断题",
+    "Quiz Samples": "quiz_samples(Quiz Samples)",
+    "Quiz & Judge Config": "测试与评判配置",
+    "Partition Config Info": "将知识图谱划分为多个社区(子图),每个社区是生成问答的最小单位。合适的分区方法可以提高关联性和多样性。",
+    "Which algorithm to use for graph partitioning.": "选择用于图划分的算法。",
+    "Partition Method": "method(划分方法)",
+    "DFS intro": "DFS划分方法使用深度优先搜索算法遍历知识图谱,从一个单元开始,沿着与之连接的单元深入探索,直到达到预设的社区大小或没有更多未访问的单元为止。然后,它会从另一个未访问的单元开始新的社区,重复这一过程,直到所有单元都被分配到社区中。",
+    "Max Units Per Community Info": "每个社区允许的知识单元(节点)的最大数量。如果一个社区超过这个限制,它将被进一步划分。一个单元指的是知识图谱中的一个节点,可以是实体或关系。",
+    "Max Units Per Community": "max_units_per_community(每个社区的最大单元数)",
+    "BFS intro": "BFS划分方法使用广度优先搜索算法遍历知识图谱,从一个单元开始,探索所有与之直接连接的单元,然后再从这些单元出发,继续探索它们的直接连接单元。这个过程会持续直到达到预设的社区大小或没有更多未访问的单元为止。然后,它会从另一个未访问的单元开始新的社区,重复这一过程,直到所有单元都被分配到社区中。",
+    "Leiden intro": "Leiden划分方法是一种基于模块度优化的社区检测算法,旨在识别图中的紧密连接子图。该算法通过迭代地优化节点的社区分配,最大化社区内的连接密度,同时最小化社区间的连接。Leiden算法能够有效处理大规模图数据,并且通常比其他社区检测算法(如Louvain算法)产生更高质量的社区划分结果。",
+    "Maximum Size of Communities Info": "一个社区中允许的最大节点数量。如果一个社区的节点数超过这个限制,它将被进一步划分。",
+    "Maximum Size of Communities": "max_size(社区的最大尺寸)",
+    "Use Largest Connected Component Info": "最大连通分量是指在图中节点之间存在路径连接的最大子集。启用此选项后,划分算法将仅考虑知识图谱中的最大连通分量进行社区划分,忽略其他较小的连通分量。这有助于确保生成的社区具有更高的连通性和相关性。",
+    "Use Largest Connected Component": "use_lcc(使用最大连通分量)",
+    "Random Seed Info": "随机种子改变图划分的初始状态,从而影响划分结果。通过设置不同的随机种子,可以生成不同的社区划分方案,有助于提高生成问答的多样性。",
+    "Random Seed": "random_seed(随机种子)",
+    "ECE intro": "ECE是一种基于模型校准原理的原创图划分方法。ECE通过计算单元的校准误差来评估其在当前模型下的表现(记为理解误差),并根据理解误差对图进行划分。",
+    "Min Units Per Community Info": "限制每个社区中允许的最小节点数量。如果一个社区的节点数少于这个限制,它将被舍弃。",
+    "Min Units Per Community": "min_units_per_community(每个社区的最小单元数)",
+    "Max Tokens Per Community Info": "每个社区允许的最大Token数量。如果一个社区的Token数超过这个限制,它将被进一步划分。",
+    "Max Tokens Per Community": "max_tokens_per_community(每个社区的最大Token数)",
+    "Unit Sampling Strategy Info": "单元采样策略决定在构建社区的时候如何从候选单元中选择单元。单元采样策略包括 random, max_loss, min_loss。 random表示随机选择单元,max_loss表示优先选择理解误差较大的单元,min_loss表示优先选择理解误差较小的单元。\n\n(注意:只有当学生模型启动时,经过评测后,才会有理解误差,才能使用 max_loss 和 min_loss 策略,否则只能使用 random 策略)",
+    "Unit Sampling Strategy": "unit_sampling(单元采样策略)",
+    "Partition Config": "知识图谱分区配置",
+    "Generation Config Info": "生成配置包括生成模式和输出数据格式。",
+    "Mode Info": "包括原子、聚合、多跳、思维链等多种生成模式,适用于不同复杂度的任务。",
+    "Mode": "mode(生成模式)",
+    "Output Data Format Info": "包括 Alpaca, Sharegpt, ChatML等多种输出格式。",
+    "Output Data Format": "data_format(输出数据格式)",
+    "Generation Config": "生成配置",
+    "Output File": "输出文件"
   }
 }