Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
28d211c
1
Parent(s):
43d27f2
Auto-sync from demo at Fri Sep 26 08:35:55 UTC 2025
Browse files
- app.py +63 -46
- graphgen/configs/aggregated_config.yaml +3 -3
- graphgen/configs/atomic_config.yaml +3 -3
- graphgen/configs/cot_config.yaml +3 -3
- graphgen/configs/multi_hop_config.yaml +3 -3
- webui/app.py +63 -46
- webui/translation.json +12 -4
- webui/utils/__init__.py +3 -0
- webui/{cache_utils.py → utils/cache.py} +2 -1
- webui/{count_tokens.py → utils/count_tokens.py} +0 -0
- webui/utils/preview_file.py +29 -0
app.py
CHANGED
|
@@ -13,11 +13,10 @@ from graphgen.models import OpenAIModel, Tokenizer
|
|
| 13 |
from graphgen.models.llm.limitter import RPM, TPM
|
| 14 |
from graphgen.utils import set_logger
|
| 15 |
from webui.base import WebuiParams
|
| 16 |
-
from webui.cache_utils import cleanup_workspace, setup_workspace
|
| 17 |
-
from webui.count_tokens import count_tokens
|
| 18 |
from webui.i18n import Translate
|
| 19 |
from webui.i18n import gettext as _
|
| 20 |
from webui.test_api import test_api_connection
|
|
|
|
| 21 |
|
| 22 |
root_dir = files("webui").parent
|
| 23 |
sys.path.append(root_dir)
|
|
@@ -391,6 +390,58 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
|
|
| 391 |
with gr.Column(scale=1):
|
| 392 |
test_connection_btn = gr.Button(_("Test Connection"))
|
| 393 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
with gr.Blocks():
|
| 395 |
with gr.Row(equal_height=True):
|
| 396 |
with gr.Column():
|
|
@@ -415,46 +466,12 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
|
|
| 415 |
)
|
| 416 |
|
| 417 |
with gr.Blocks():
|
| 418 |
-
with gr.
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
interactive=True,
|
| 425 |
-
)
|
| 426 |
-
examples_dir = os.path.join(root_dir, "webui", "examples")
|
| 427 |
-
gr.Examples(
|
| 428 |
-
examples=[
|
| 429 |
-
[os.path.join(examples_dir, "txt_demo.txt")],
|
| 430 |
-
[os.path.join(examples_dir, "jsonl_demo.jsonl")],
|
| 431 |
-
[os.path.join(examples_dir, "json_demo.json")],
|
| 432 |
-
[os.path.join(examples_dir, "csv_demo.csv")],
|
| 433 |
-
],
|
| 434 |
-
inputs=upload_file,
|
| 435 |
-
label=_("Example Files"),
|
| 436 |
-
examples_per_page=4,
|
| 437 |
-
)
|
| 438 |
-
with gr.Column(scale=1):
|
| 439 |
-
output = gr.File(
|
| 440 |
-
label="Output(See Github FAQ)",
|
| 441 |
-
file_count="single",
|
| 442 |
-
interactive=False,
|
| 443 |
-
)
|
| 444 |
-
|
| 445 |
-
with gr.Blocks():
|
| 446 |
-
token_counter = gr.DataFrame(
|
| 447 |
-
label="Token Stats",
|
| 448 |
-
headers=[
|
| 449 |
-
"Source Text Token Count",
|
| 450 |
-
"Estimated Token Usage",
|
| 451 |
-
"Token Used",
|
| 452 |
-
],
|
| 453 |
-
datatype="str",
|
| 454 |
-
interactive=False,
|
| 455 |
-
visible=False,
|
| 456 |
-
wrap=True,
|
| 457 |
-
)
|
| 458 |
|
| 459 |
submit_btn = gr.Button(_("Run GraphGen"))
|
| 460 |
|
|
@@ -494,13 +511,13 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
|
|
| 494 |
)
|
| 495 |
|
| 496 |
upload_file.change(
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
outputs=
|
| 500 |
).then(
|
| 501 |
count_tokens,
|
| 502 |
inputs=[upload_file, tokenizer, token_counter],
|
| 503 |
-
outputs=
|
| 504 |
)
|
| 505 |
|
| 506 |
# run GraphGen
|
|
|
|
| 13 |
from graphgen.models.llm.limitter import RPM, TPM
|
| 14 |
from graphgen.utils import set_logger
|
| 15 |
from webui.base import WebuiParams
|
|
|
|
|
|
|
| 16 |
from webui.i18n import Translate
|
| 17 |
from webui.i18n import gettext as _
|
| 18 |
from webui.test_api import test_api_connection
|
| 19 |
+
from webui.utils import cleanup_workspace, count_tokens, preview_file, setup_workspace
|
| 20 |
|
| 21 |
root_dir = files("webui").parent
|
| 22 |
sys.path.append(root_dir)
|
|
|
|
| 390 |
with gr.Column(scale=1):
|
| 391 |
test_connection_btn = gr.Button(_("Test Connection"))
|
| 392 |
|
| 393 |
+
with gr.Row(equal_height=True):
|
| 394 |
+
with gr.Column(scale=1):
|
| 395 |
+
with gr.Blocks():
|
| 396 |
+
with gr.Row(equal_height=True):
|
| 397 |
+
with gr.Column(scale=1):
|
| 398 |
+
upload_file = gr.File(
|
| 399 |
+
label=_("Upload File"),
|
| 400 |
+
file_count="single",
|
| 401 |
+
file_types=[".txt", ".json", ".jsonl", ".csv"],
|
| 402 |
+
interactive=True,
|
| 403 |
+
)
|
| 404 |
+
examples_dir = os.path.join(root_dir, "webui", "examples")
|
| 405 |
+
gr.Examples(
|
| 406 |
+
examples=[
|
| 407 |
+
[os.path.join(examples_dir, "txt_demo.txt")],
|
| 408 |
+
[os.path.join(examples_dir, "jsonl_demo.jsonl")],
|
| 409 |
+
[os.path.join(examples_dir, "json_demo.json")],
|
| 410 |
+
[os.path.join(examples_dir, "csv_demo.csv")],
|
| 411 |
+
],
|
| 412 |
+
inputs=upload_file,
|
| 413 |
+
label=_("Example Files"),
|
| 414 |
+
examples_per_page=4,
|
| 415 |
+
)
|
| 416 |
+
with gr.Column(scale=1):
|
| 417 |
+
with gr.Blocks():
|
| 418 |
+
preview_code = gr.Code(
|
| 419 |
+
label=_("File Preview"),
|
| 420 |
+
interactive=False,
|
| 421 |
+
visible=True,
|
| 422 |
+
elem_id="preview_code",
|
| 423 |
+
)
|
| 424 |
+
preview_df = gr.DataFrame(
|
| 425 |
+
label=_("File Preview"),
|
| 426 |
+
interactive=False,
|
| 427 |
+
visible=False,
|
| 428 |
+
elem_id="preview_df",
|
| 429 |
+
)
|
| 430 |
+
|
| 431 |
+
with gr.Blocks():
|
| 432 |
+
token_counter = gr.DataFrame(
|
| 433 |
+
label="Token Stats",
|
| 434 |
+
headers=[
|
| 435 |
+
"Source Text Token Count",
|
| 436 |
+
"Estimated Token Usage",
|
| 437 |
+
"Token Used",
|
| 438 |
+
],
|
| 439 |
+
datatype="str",
|
| 440 |
+
interactive=False,
|
| 441 |
+
visible=False,
|
| 442 |
+
wrap=True,
|
| 443 |
+
)
|
| 444 |
+
|
| 445 |
with gr.Blocks():
|
| 446 |
with gr.Row(equal_height=True):
|
| 447 |
with gr.Column():
|
|
|
|
| 466 |
)
|
| 467 |
|
| 468 |
with gr.Blocks():
|
| 469 |
+
with gr.Column(scale=1):
|
| 470 |
+
output = gr.File(
|
| 471 |
+
label=_("Output File"),
|
| 472 |
+
file_count="single",
|
| 473 |
+
interactive=False,
|
| 474 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
|
| 476 |
submit_btn = gr.Button(_("Run GraphGen"))
|
| 477 |
|
|
|
|
| 511 |
)
|
| 512 |
|
| 513 |
upload_file.change(
|
| 514 |
+
preview_file, inputs=upload_file, outputs=[preview_code, preview_df]
|
| 515 |
+
).then(
|
| 516 |
+
lambda x: gr.update(visible=True), inputs=upload_file, outputs=token_counter
|
| 517 |
).then(
|
| 518 |
count_tokens,
|
| 519 |
inputs=[upload_file, tokenizer, token_counter],
|
| 520 |
+
outputs=token_counter,
|
| 521 |
)
|
| 522 |
|
| 523 |
# run GraphGen
|
graphgen/configs/aggregated_config.yaml
CHANGED
|
@@ -3,12 +3,12 @@ read:
|
|
| 3 |
split:
|
| 4 |
chunk_size: 1024 # chunk size for text splitting
|
| 5 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 6 |
-
output_data_type: aggregated # atomic, aggregated, multi_hop, cot
|
| 7 |
-
output_data_format: ChatML # Alpaca, Sharegpt, ChatML
|
| 8 |
-
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
|
| 9 |
search: # web search configuration
|
| 10 |
enabled: false # whether to enable web search
|
| 11 |
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
|
|
|
|
|
|
|
|
|
|
| 12 |
quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
|
| 13 |
enabled: true
|
| 14 |
quiz_samples: 2 # number of quiz samples to generate
|
|
|
|
| 3 |
split:
|
| 4 |
chunk_size: 1024 # chunk size for text splitting
|
| 5 |
chunk_overlap: 100 # chunk overlap for text splitting
|
|
|
|
|
|
|
|
|
|
| 6 |
search: # web search configuration
|
| 7 |
enabled: false # whether to enable web search
|
| 8 |
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
|
| 9 |
+
output_data_type: aggregated # atomic, aggregated, multi_hop, cot
|
| 10 |
+
output_data_format: ChatML # Alpaca, Sharegpt, ChatML
|
| 11 |
+
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
|
| 12 |
quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
|
| 13 |
enabled: true
|
| 14 |
quiz_samples: 2 # number of quiz samples to generate
|
graphgen/configs/atomic_config.yaml
CHANGED
|
@@ -3,12 +3,12 @@ read:
|
|
| 3 |
split:
|
| 4 |
chunk_size: 1024 # chunk size for text splitting
|
| 5 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 6 |
-
output_data_type: atomic # atomic, aggregated, multi_hop, cot
|
| 7 |
-
output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
|
| 8 |
-
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
|
| 9 |
search: # web search configuration
|
| 10 |
enabled: false # whether to enable web search
|
| 11 |
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
|
|
|
|
|
|
|
|
|
|
| 12 |
quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
|
| 13 |
enabled: true
|
| 14 |
quiz_samples: 2 # number of quiz samples to generate
|
|
|
|
| 3 |
split:
|
| 4 |
chunk_size: 1024 # chunk size for text splitting
|
| 5 |
chunk_overlap: 100 # chunk overlap for text splitting
|
|
|
|
|
|
|
|
|
|
| 6 |
search: # web search configuration
|
| 7 |
enabled: false # whether to enable web search
|
| 8 |
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
|
| 9 |
+
output_data_type: atomic # atomic, aggregated, multi_hop, cot
|
| 10 |
+
output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
|
| 11 |
+
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
|
| 12 |
quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
|
| 13 |
enabled: true
|
| 14 |
quiz_samples: 2 # number of quiz samples to generate
|
graphgen/configs/cot_config.yaml
CHANGED
|
@@ -3,12 +3,12 @@ read:
|
|
| 3 |
split:
|
| 4 |
chunk_size: 1024 # chunk size for text splitting
|
| 5 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 6 |
-
output_data_type: cot # atomic, aggregated, multi_hop, cot
|
| 7 |
-
output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
|
| 8 |
-
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
|
| 9 |
search: # web search configuration
|
| 10 |
enabled: false # whether to enable web search
|
| 11 |
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
|
|
|
|
|
|
|
|
|
|
| 12 |
method_params:
|
| 13 |
method: leiden
|
| 14 |
max_size: 20 # Maximum size of communities
|
|
|
|
| 3 |
split:
|
| 4 |
chunk_size: 1024 # chunk size for text splitting
|
| 5 |
chunk_overlap: 100 # chunk overlap for text splitting
|
|
|
|
|
|
|
|
|
|
| 6 |
search: # web search configuration
|
| 7 |
enabled: false # whether to enable web search
|
| 8 |
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
|
| 9 |
+
output_data_type: cot # atomic, aggregated, multi_hop, cot
|
| 10 |
+
output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
|
| 11 |
+
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
|
| 12 |
method_params:
|
| 13 |
method: leiden
|
| 14 |
max_size: 20 # Maximum size of communities
|
graphgen/configs/multi_hop_config.yaml
CHANGED
|
@@ -3,12 +3,12 @@ read:
|
|
| 3 |
split:
|
| 4 |
chunk_size: 1024 # chunk size for text splitting
|
| 5 |
chunk_overlap: 100 # chunk overlap for text splitting
|
| 6 |
-
output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
|
| 7 |
-
output_data_format: ChatML # Alpaca, Sharegpt, ChatML
|
| 8 |
-
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
|
| 9 |
search: # web search configuration
|
| 10 |
enabled: false # whether to enable web search
|
| 11 |
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
|
|
|
|
|
|
|
|
|
|
| 12 |
quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
|
| 13 |
enabled: false
|
| 14 |
quiz_samples: 2 # number of quiz samples to generate
|
|
|
|
| 3 |
split:
|
| 4 |
chunk_size: 1024 # chunk size for text splitting
|
| 5 |
chunk_overlap: 100 # chunk overlap for text splitting
|
|
|
|
|
|
|
|
|
|
| 6 |
search: # web search configuration
|
| 7 |
enabled: false # whether to enable web search
|
| 8 |
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
|
| 9 |
+
output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
|
| 10 |
+
output_data_format: ChatML # Alpaca, Sharegpt, ChatML
|
| 11 |
+
tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
|
| 12 |
quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
|
| 13 |
enabled: false
|
| 14 |
quiz_samples: 2 # number of quiz samples to generate
|
webui/app.py
CHANGED
|
@@ -13,11 +13,10 @@ from graphgen.models import OpenAIModel, Tokenizer
|
|
| 13 |
from graphgen.models.llm.limitter import RPM, TPM
|
| 14 |
from graphgen.utils import set_logger
|
| 15 |
from webui.base import WebuiParams
|
| 16 |
-
from webui.cache_utils import cleanup_workspace, setup_workspace
|
| 17 |
-
from webui.count_tokens import count_tokens
|
| 18 |
from webui.i18n import Translate
|
| 19 |
from webui.i18n import gettext as _
|
| 20 |
from webui.test_api import test_api_connection
|
|
|
|
| 21 |
|
| 22 |
root_dir = files("webui").parent
|
| 23 |
sys.path.append(root_dir)
|
|
@@ -391,6 +390,58 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
|
|
| 391 |
with gr.Column(scale=1):
|
| 392 |
test_connection_btn = gr.Button(_("Test Connection"))
|
| 393 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
with gr.Blocks():
|
| 395 |
with gr.Row(equal_height=True):
|
| 396 |
with gr.Column():
|
|
@@ -415,46 +466,12 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
|
|
| 415 |
)
|
| 416 |
|
| 417 |
with gr.Blocks():
|
| 418 |
-
with gr.
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
interactive=True,
|
| 425 |
-
)
|
| 426 |
-
examples_dir = os.path.join(root_dir, "webui", "examples")
|
| 427 |
-
gr.Examples(
|
| 428 |
-
examples=[
|
| 429 |
-
[os.path.join(examples_dir, "txt_demo.txt")],
|
| 430 |
-
[os.path.join(examples_dir, "jsonl_demo.jsonl")],
|
| 431 |
-
[os.path.join(examples_dir, "json_demo.json")],
|
| 432 |
-
[os.path.join(examples_dir, "csv_demo.csv")],
|
| 433 |
-
],
|
| 434 |
-
inputs=upload_file,
|
| 435 |
-
label=_("Example Files"),
|
| 436 |
-
examples_per_page=4,
|
| 437 |
-
)
|
| 438 |
-
with gr.Column(scale=1):
|
| 439 |
-
output = gr.File(
|
| 440 |
-
label="Output(See Github FAQ)",
|
| 441 |
-
file_count="single",
|
| 442 |
-
interactive=False,
|
| 443 |
-
)
|
| 444 |
-
|
| 445 |
-
with gr.Blocks():
|
| 446 |
-
token_counter = gr.DataFrame(
|
| 447 |
-
label="Token Stats",
|
| 448 |
-
headers=[
|
| 449 |
-
"Source Text Token Count",
|
| 450 |
-
"Estimated Token Usage",
|
| 451 |
-
"Token Used",
|
| 452 |
-
],
|
| 453 |
-
datatype="str",
|
| 454 |
-
interactive=False,
|
| 455 |
-
visible=False,
|
| 456 |
-
wrap=True,
|
| 457 |
-
)
|
| 458 |
|
| 459 |
submit_btn = gr.Button(_("Run GraphGen"))
|
| 460 |
|
|
@@ -494,13 +511,13 @@ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
|
|
| 494 |
)
|
| 495 |
|
| 496 |
upload_file.change(
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
outputs=
|
| 500 |
).then(
|
| 501 |
count_tokens,
|
| 502 |
inputs=[upload_file, tokenizer, token_counter],
|
| 503 |
-
outputs=
|
| 504 |
)
|
| 505 |
|
| 506 |
# run GraphGen
|
|
|
|
| 13 |
from graphgen.models.llm.limitter import RPM, TPM
|
| 14 |
from graphgen.utils import set_logger
|
| 15 |
from webui.base import WebuiParams
|
|
|
|
|
|
|
| 16 |
from webui.i18n import Translate
|
| 17 |
from webui.i18n import gettext as _
|
| 18 |
from webui.test_api import test_api_connection
|
| 19 |
+
from webui.utils import cleanup_workspace, count_tokens, preview_file, setup_workspace
|
| 20 |
|
| 21 |
root_dir = files("webui").parent
|
| 22 |
sys.path.append(root_dir)
|
|
|
|
| 390 |
with gr.Column(scale=1):
|
| 391 |
test_connection_btn = gr.Button(_("Test Connection"))
|
| 392 |
|
| 393 |
+
with gr.Row(equal_height=True):
|
| 394 |
+
with gr.Column(scale=1):
|
| 395 |
+
with gr.Blocks():
|
| 396 |
+
with gr.Row(equal_height=True):
|
| 397 |
+
with gr.Column(scale=1):
|
| 398 |
+
upload_file = gr.File(
|
| 399 |
+
label=_("Upload File"),
|
| 400 |
+
file_count="single",
|
| 401 |
+
file_types=[".txt", ".json", ".jsonl", ".csv"],
|
| 402 |
+
interactive=True,
|
| 403 |
+
)
|
| 404 |
+
examples_dir = os.path.join(root_dir, "webui", "examples")
|
| 405 |
+
gr.Examples(
|
| 406 |
+
examples=[
|
| 407 |
+
[os.path.join(examples_dir, "txt_demo.txt")],
|
| 408 |
+
[os.path.join(examples_dir, "jsonl_demo.jsonl")],
|
| 409 |
+
[os.path.join(examples_dir, "json_demo.json")],
|
| 410 |
+
[os.path.join(examples_dir, "csv_demo.csv")],
|
| 411 |
+
],
|
| 412 |
+
inputs=upload_file,
|
| 413 |
+
label=_("Example Files"),
|
| 414 |
+
examples_per_page=4,
|
| 415 |
+
)
|
| 416 |
+
with gr.Column(scale=1):
|
| 417 |
+
with gr.Blocks():
|
| 418 |
+
preview_code = gr.Code(
|
| 419 |
+
label=_("File Preview"),
|
| 420 |
+
interactive=False,
|
| 421 |
+
visible=True,
|
| 422 |
+
elem_id="preview_code",
|
| 423 |
+
)
|
| 424 |
+
preview_df = gr.DataFrame(
|
| 425 |
+
label=_("File Preview"),
|
| 426 |
+
interactive=False,
|
| 427 |
+
visible=False,
|
| 428 |
+
elem_id="preview_df",
|
| 429 |
+
)
|
| 430 |
+
|
| 431 |
+
with gr.Blocks():
|
| 432 |
+
token_counter = gr.DataFrame(
|
| 433 |
+
label="Token Stats",
|
| 434 |
+
headers=[
|
| 435 |
+
"Source Text Token Count",
|
| 436 |
+
"Estimated Token Usage",
|
| 437 |
+
"Token Used",
|
| 438 |
+
],
|
| 439 |
+
datatype="str",
|
| 440 |
+
interactive=False,
|
| 441 |
+
visible=False,
|
| 442 |
+
wrap=True,
|
| 443 |
+
)
|
| 444 |
+
|
| 445 |
with gr.Blocks():
|
| 446 |
with gr.Row(equal_height=True):
|
| 447 |
with gr.Column():
|
|
|
|
| 466 |
)
|
| 467 |
|
| 468 |
with gr.Blocks():
|
| 469 |
+
with gr.Column(scale=1):
|
| 470 |
+
output = gr.File(
|
| 471 |
+
label=_("Output File"),
|
| 472 |
+
file_count="single",
|
| 473 |
+
interactive=False,
|
| 474 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
|
| 476 |
submit_btn = gr.Button(_("Run GraphGen"))
|
| 477 |
|
|
|
|
| 511 |
)
|
| 512 |
|
| 513 |
upload_file.change(
|
| 514 |
+
preview_file, inputs=upload_file, outputs=[preview_code, preview_df]
|
| 515 |
+
).then(
|
| 516 |
+
lambda x: gr.update(visible=True), inputs=upload_file, outputs=token_counter
|
| 517 |
).then(
|
| 518 |
count_tokens,
|
| 519 |
inputs=[upload_file, tokenizer, token_counter],
|
| 520 |
+
outputs=token_counter,
|
| 521 |
)
|
| 522 |
|
| 523 |
# run GraphGen
|
webui/translation.json
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"en": {
|
| 3 |
"Title": "✨Easy-to-use LLM Training Data Generation Framework✨",
|
|
|
|
| 4 |
"### [GraphGen](https://github.com/open-sciencelab/GraphGen) ": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ",
|
| 5 |
"Intro": "is a framework for synthetic data generation guided by knowledge graphs, designed to tackle challenges for knowledge-intensive QA generation. \n\nBy uploading your text chunks (such as knowledge in agriculture, healthcare, or marine science) and filling in the LLM API key, you can generate the training data required by **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)** and **[xtuner](https://github.com/InternLM/xtuner)** online. We will automatically delete user information after completion.",
|
|
|
|
| 6 |
"Use Trainee Model": "Use Trainee Model to identify knowledge blind spots, please keep disable for SiliconCloud",
|
| 7 |
"Synthesizer URL Info": "Base URL for the Synthesizer Model API, use SiliconFlow as default",
|
| 8 |
"Synthesizer Model Info": "Model for constructing KGs and generating QAs",
|
|
@@ -11,16 +13,20 @@
|
|
| 11 |
"SiliconFlow Token for Trainee Model": "SiliconFlow API Key for Trainee Model",
|
| 12 |
"Model Config": "Model Configuration",
|
| 13 |
"Generation Config": "Generation Config",
|
|
|
|
|
|
|
| 14 |
"SiliconFlow Token": "SiliconFlow API Key",
|
| 15 |
-
"Test Connection": "Test Connection",
|
| 16 |
"Upload File": "Upload File",
|
| 17 |
"Example Files": "Example Files",
|
| 18 |
-
"
|
|
|
|
| 19 |
},
|
| 20 |
"zh": {
|
| 21 |
"Title": "✨开箱即用的LLM训练数据生成框架✨",
|
|
|
|
| 22 |
"### [GraphGen](https://github.com/open-sciencelab/GraphGen) ": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ",
|
| 23 |
"Intro": "是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答。\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。",
|
|
|
|
| 24 |
"Use Trainee Model": "使用Trainee Model来识别知识盲区,使用硅基流动时请保持禁用",
|
| 25 |
"Synthesizer URL Info": "调用合成模型API的URL,默认使用硅基流动",
|
| 26 |
"Synthesizer Model Info": "用于构建知识图谱和生成问答的模型",
|
|
@@ -29,10 +35,12 @@
|
|
| 29 |
"SiliconFlow Token for Trainee Model": "SiliconFlow Token for Trainee Model",
|
| 30 |
"Model Config": "模型配置",
|
| 31 |
"Generation Config": "生成配置",
|
|
|
|
|
|
|
| 32 |
"SiliconFlow Token": "SiliconFlow Token",
|
| 33 |
-
"Test Connection": "测试接口",
|
| 34 |
"Upload File": "上传文件",
|
| 35 |
"Example Files": "示例文件",
|
| 36 |
-
"
|
|
|
|
| 37 |
}
|
| 38 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"en": {
|
| 3 |
"Title": "✨Easy-to-use LLM Training Data Generation Framework✨",
|
| 4 |
+
"\n\n": "\n\n",
|
| 5 |
"### [GraphGen](https://github.com/open-sciencelab/GraphGen) ": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ",
|
| 6 |
"Intro": "is a framework for synthetic data generation guided by knowledge graphs, designed to tackle challenges for knowledge-intensive QA generation. \n\nBy uploading your text chunks (such as knowledge in agriculture, healthcare, or marine science) and filling in the LLM API key, you can generate the training data required by **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)** and **[xtuner](https://github.com/InternLM/xtuner)** online. We will automatically delete user information after completion.",
|
| 7 |
+
"# ": "# ",
|
| 8 |
"Use Trainee Model": "Use Trainee Model to identify knowledge blind spots, please keep disable for SiliconCloud",
|
| 9 |
"Synthesizer URL Info": "Base URL for the Synthesizer Model API, use SiliconFlow as default",
|
| 10 |
"Synthesizer Model Info": "Model for constructing KGs and generating QAs",
|
|
|
|
| 13 |
"SiliconFlow Token for Trainee Model": "SiliconFlow API Key for Trainee Model",
|
| 14 |
"Model Config": "Model Configuration",
|
| 15 |
"Generation Config": "Generation Config",
|
| 16 |
+
"API Config": "API Config",
|
| 17 |
+
"### ": "### ",
|
| 18 |
"SiliconFlow Token": "SiliconFlow API Key",
|
|
|
|
| 19 |
"Upload File": "Upload File",
|
| 20 |
"Example Files": "Example Files",
|
| 21 |
+
"Output File": "Output File",
|
| 22 |
+
"File Preview": "File Preview"
|
| 23 |
},
|
| 24 |
"zh": {
|
| 25 |
"Title": "✨开箱即用的LLM训练数据生成框架✨",
|
| 26 |
+
"\n\n": "\n\n",
|
| 27 |
"### [GraphGen](https://github.com/open-sciencelab/GraphGen) ": "### [GraphGen](https://github.com/open-sciencelab/GraphGen) ",
|
| 28 |
"Intro": "是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答。\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。",
|
| 29 |
+
"# ": "# ",
|
| 30 |
"Use Trainee Model": "使用Trainee Model来识别知识盲区,使用硅基流动时请保持禁用",
|
| 31 |
"Synthesizer URL Info": "调用合成模型API的URL,默认使用硅基流动",
|
| 32 |
"Synthesizer Model Info": "用于构建知识图谱和生成问答的模型",
|
|
|
|
| 35 |
"SiliconFlow Token for Trainee Model": "SiliconFlow Token for Trainee Model",
|
| 36 |
"Model Config": "模型配置",
|
| 37 |
"Generation Config": "生成配置",
|
| 38 |
+
"API Config": "API Config",
|
| 39 |
+
"### ": "### ",
|
| 40 |
"SiliconFlow Token": "SiliconFlow Token",
|
|
|
|
| 41 |
"Upload File": "上传文件",
|
| 42 |
"Example Files": "示例文件",
|
| 43 |
+
"Output File": "输出文件",
|
| 44 |
+
"File Preview": "文件预览"
|
| 45 |
}
|
| 46 |
}
|
webui/utils/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .cache import cleanup_workspace, setup_workspace
|
| 2 |
+
from .count_tokens import count_tokens
|
| 3 |
+
from .preview_file import preview_file
|
webui/{cache_utils.py → utils/cache.py}
RENAMED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
-
import uuid
|
| 3 |
import shutil
|
|
|
|
|
|
|
| 4 |
|
| 5 |
def setup_workspace(folder):
|
| 6 |
request_id = str(uuid.uuid4())
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import shutil
|
| 3 |
+
import uuid
|
| 4 |
+
|
| 5 |
|
| 6 |
def setup_workspace(folder):
|
| 7 |
request_id = str(uuid.uuid4())
|
webui/{count_tokens.py → utils/count_tokens.py}
RENAMED
|
File without changes
|
webui/utils/preview_file.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import codecs
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def preview_file(file):
    """Build the pair of Gradio updates that preview an uploaded file.

    The first update carries a text preview (with an optional syntax
    ``language``), the second a tabular preview; exactly one of the two is
    made visible, or neither when there is no file.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string. ``None`` means nothing is
            uploaded.

    Returns:
        A ``(text_update, table_update)`` tuple of ``gr.update`` results:
        * no file: both hidden;
        * ``.csv``: the first 10 rows as a DataFrame in the table update;
        * anything else: up to 5000 characters of UTF-8 text in the text
          update, highlighted as JSON unless the extension is ``.txt``;
        * any failure: the error message shown in the text update.
    """
    if file is None:
        return gr.update(visible=False), gr.update(visible=False)

    # Tolerate both an upload object (path in ``.name``) and a bare path.
    path = getattr(file, "name", file)
    ext = os.path.splitext(path)[1].lower()
    limit = 5000

    try:
        if ext == ".csv":
            df = pd.read_csv(path, nrows=10)
            return gr.update(visible=False), gr.update(value=df, visible=True)

        # The built-in text reader counts *characters* in read(); the codecs
        # stream reader treats its size argument as a byte hint, which cut
        # multi-byte text short without ever reaching the truncation check.
        # newline="" keeps line endings untranslated.
        with open(path, "r", encoding="utf-8", newline="") as f:
            # Read one extra character so a file of exactly `limit`
            # characters is not reported as truncated.
            text = f.read(limit + 1)
        if len(text) > limit:
            text = text[:limit] + "\n\n... (truncated at 5000 chars)"
        return (
            gr.update(
                value=text, visible=True, language="json" if ext != ".txt" else None
            ),
            gr.update(visible=False),
        )
    except Exception as e:  # pylint: disable=broad-except
        # Best-effort preview: surface the failure in the UI instead of
        # breaking the upload event chain.
        return (
            gr.update(value=f"Preview failed: {e}", visible=True, language=None),
            gr.update(visible=False),
        )
|