Spaces:
Runtime error
Runtime error
add save dataset
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import io
|
|
|
|
| 2 |
import re
|
| 3 |
import time
|
| 4 |
from itertools import islice
|
|
@@ -10,16 +11,18 @@ from typing import Callable, Iterable, Iterator, Optional, TypeVar
|
|
| 10 |
import gradio as gr
|
| 11 |
import pandas as pd
|
| 12 |
import requests.exceptions
|
| 13 |
-
from huggingface_hub import InferenceClient
|
| 14 |
|
| 15 |
|
| 16 |
model_id = "microsoft/Phi-3-mini-4k-instruct"
|
| 17 |
client = InferenceClient(model_id)
|
|
|
|
| 18 |
|
| 19 |
MAX_TOTAL_NB_ITEMS = 100 # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
|
| 20 |
MAX_NB_ITEMS_PER_GENERATION_CALL = 10
|
| 21 |
NUM_ROWS = 100
|
| 22 |
NUM_VARIANTS = 10
|
|
|
|
| 23 |
URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
|
| 24 |
|
| 25 |
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
|
|
@@ -54,8 +57,6 @@ LONG_RARITIES = [
|
|
| 54 |
"very nice but still plausible",
|
| 55 |
]
|
| 56 |
|
| 57 |
-
landing_page_query = "various datasets on many different subjects and topics, from classification to language modeling, from science to sport to finance to news"
|
| 58 |
-
|
| 59 |
landing_page_datasets_generated_text = """
|
| 60 |
1. NewsEventsPredict (classification, media, trend)
|
| 61 |
2. FinancialForecast (economy, stocks, regression)
|
|
@@ -71,6 +72,29 @@ landing_page_datasets_generated_text = """
|
|
| 71 |
default_output = landing_page_datasets_generated_text.strip().split("\n")
|
| 72 |
assert len(default_output) == MAX_NB_ITEMS_PER_GENERATION_CALL
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
css = """
|
| 75 |
a {
|
| 76 |
color: var(--body-text-color);
|
|
@@ -145,35 +169,27 @@ a {
|
|
| 145 |
color: transparent;
|
| 146 |
background-clip: text;
|
| 147 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
"""
|
| 149 |
|
| 150 |
|
| 151 |
with gr.Blocks(css=css) as demo:
|
| 152 |
generated_texts_state = gr.State((landing_page_datasets_generated_text,))
|
| 153 |
-
with gr.Row():
|
| 154 |
-
with gr.Column(scale=4, min_width=0):
|
| 155 |
-
pass
|
| 156 |
-
with gr.Column(scale=10):
|
| 157 |
-
gr.Markdown(
|
| 158 |
-
"# 🤗 Infinite Dataset Hub ♾️\n\n"
|
| 159 |
-
"An endless catalog of datasets, created just for you.\n\n"
|
| 160 |
-
)
|
| 161 |
-
with gr.Column(scale=4, min_width=0):
|
| 162 |
-
pass
|
| 163 |
with gr.Column() as search_page:
|
| 164 |
with gr.Row():
|
| 165 |
-
with gr.Column(scale=4, min_width=0):
|
| 166 |
-
pass
|
| 167 |
with gr.Column(scale=10):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
with gr.Row():
|
| 169 |
search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets, get infinite results", show_label=False, container=False, scale=9)
|
| 170 |
search_button = gr.Button("🔍", variant="primary", scale=1)
|
| 171 |
-
with gr.Column(scale=4, min_width=0):
|
| 172 |
-
pass
|
| 173 |
-
with gr.Row():
|
| 174 |
-
with gr.Column(scale=4, min_width=0):
|
| 175 |
-
pass
|
| 176 |
-
with gr.Column(scale=10):
|
| 177 |
button_groups: list[gr.Group] = []
|
| 178 |
buttons: list[gr.Button] = []
|
| 179 |
for i in range(MAX_TOTAL_NB_ITEMS):
|
|
@@ -195,20 +211,28 @@ with gr.Blocks(css=css) as demo:
|
|
| 195 |
|
| 196 |
load_more_datasets = gr.Button("Load more datasets") # TODO: disable when reaching end of page
|
| 197 |
gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
|
| 198 |
-
with gr.Column(scale=4, min_width=
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
with gr.Column(visible=False) as dataset_page:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
dataset_title = gr.Markdown()
|
| 202 |
gr.Markdown("_Note: This is an AI-generated dataset so its content may be inaccurate or false_")
|
| 203 |
dataset_content = gr.Markdown()
|
| 204 |
generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
|
| 205 |
dataset_dataframe = gr.DataFrame(visible=False, interactive=False, wrap=True)
|
| 206 |
save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
|
|
|
|
| 207 |
dataset_share_button = gr.Button("Share Dataset URL")
|
| 208 |
dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
|
| 209 |
back_button = gr.Button("< Back", size="sm")
|
| 210 |
-
|
| 211 |
-
app_state = gr.State({})
|
| 212 |
|
| 213 |
###################################
|
| 214 |
#
|
|
@@ -254,7 +278,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 254 |
|
| 255 |
def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
|
| 256 |
search_query = search_query or ""
|
| 257 |
-
search_query = search_query[:1000] if search_query.strip() else
|
| 258 |
generated_text = ""
|
| 259 |
current_line = ""
|
| 260 |
for token in stream_reponse(
|
|
@@ -273,7 +297,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 273 |
|
| 274 |
def gen_dataset_content(search_query: str, dataset_name: str, tags: str) -> Iterator[str]:
|
| 275 |
search_query = search_query or ""
|
| 276 |
-
search_query = search_query[:1000] if search_query.strip() else
|
| 277 |
generated_text = ""
|
| 278 |
for token in stream_reponse(GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
|
| 279 |
search_query=search_query,
|
|
@@ -418,7 +442,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 418 |
|
| 419 |
|
| 420 |
def _search_datasets(search_query):
|
| 421 |
-
yield {generated_texts_state: []
|
| 422 |
yield {
|
| 423 |
button_group: gr.Group(elem_classes="buttonsGroup insivibleButtonGroup")
|
| 424 |
for button_group in button_groups[MAX_NB_ITEMS_PER_GENERATION_CALL:]
|
|
@@ -453,12 +477,12 @@ with gr.Blocks(css=css) as demo:
|
|
| 453 |
current_item_idx += 1
|
| 454 |
|
| 455 |
|
| 456 |
-
@search_button.click(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state
|
| 457 |
def search_dataset_from_search_button(search_query):
|
| 458 |
yield from _search_datasets(search_query)
|
| 459 |
|
| 460 |
|
| 461 |
-
@search_bar.submit(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state
|
| 462 |
def search_dataset_from_search_bar(search_query):
|
| 463 |
yield from _search_datasets(search_query)
|
| 464 |
|
|
@@ -497,20 +521,16 @@ with gr.Blocks(css=css) as demo:
|
|
| 497 |
dataset_title: f"# {dataset_name}\n\n tags: {tags}",
|
| 498 |
dataset_share_textbox: gr.Textbox(visible=False),
|
| 499 |
dataset_dataframe: gr.DataFrame(visible=False),
|
| 500 |
-
generate_full_dataset_button: gr.Button(
|
| 501 |
save_dataset_button: gr.Button(visible=False),
|
| 502 |
-
|
| 503 |
-
"search_query": search_query,
|
| 504 |
-
"dataset_name": dataset_name,
|
| 505 |
-
"tags": tags
|
| 506 |
-
}
|
| 507 |
}
|
| 508 |
for generated_text in gen_dataset_content(search_query=search_query, dataset_name=dataset_name, tags=tags):
|
| 509 |
yield {dataset_content: generated_text}
|
| 510 |
|
| 511 |
|
| 512 |
show_dataset_inputs = [search_bar, *buttons]
|
| 513 |
-
show_dataset_outputs = [
|
| 514 |
scroll_to_top_js = """
|
| 515 |
function (...args) {
|
| 516 |
console.log(args);
|
|
@@ -537,8 +557,8 @@ with gr.Blocks(css=css) as demo:
|
|
| 537 |
return gr.Column(visible=True), gr.Column(visible=False)
|
| 538 |
|
| 539 |
|
| 540 |
-
@generate_full_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar], outputs=[dataset_dataframe, generate_full_dataset_button, save_dataset_button])
|
| 541 |
-
def generate_full_dataset(title, content, search_query):
|
| 542 |
dataset_name, tags = title.strip("# ").split("\ntags:", 1)
|
| 543 |
dataset_name, tags = dataset_name.strip(), tags.strip()
|
| 544 |
csv_header, preview_df = parse_preview_df(content)
|
|
@@ -556,8 +576,8 @@ with gr.Blocks(css=css) as demo:
|
|
| 556 |
output[:len(preview_df)] = [{"idx": i, **x} for i, x in enumerate(preview_df.to_dict(orient="records"))]
|
| 557 |
yield {
|
| 558 |
dataset_dataframe: gr.DataFrame(pd.DataFrame([{"idx": i, **x} for i, x in enumerate(output) if x]), visible=True),
|
| 559 |
-
generate_full_dataset_button: gr.Button(
|
| 560 |
-
save_dataset_button: gr.Button(visible=True, interactive=False)
|
| 561 |
}
|
| 562 |
kwargs_iterable = [
|
| 563 |
{
|
|
@@ -573,24 +593,48 @@ with gr.Blocks(css=css) as demo:
|
|
| 573 |
]
|
| 574 |
for _ in iflatmap_unordered(generate_partial_dataset, kwargs_iterable=kwargs_iterable):
|
| 575 |
yield {dataset_dataframe: pd.DataFrame([{"idx": i, **{column_name: x.get(column_name) for column_name in columns}} for i, x in enumerate(output) if x])}
|
| 576 |
-
yield {save_dataset_button: gr.Button(
|
| 577 |
print(f"Generated {dataset_name}!")
|
| 578 |
|
| 579 |
|
| 580 |
-
@save_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, dataset_dataframe])
|
| 581 |
-
def save_dataset(title, content, search_query, df):
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
return gr.Textbox(
|
| 588 |
-
f"{URL}?q={
|
| 589 |
visible=True,
|
| 590 |
)
|
| 591 |
|
| 592 |
-
@demo.load(outputs=
|
| 593 |
-
def load_app(request: gr.Request):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
query_params = dict(request.query_params)
|
| 595 |
if "dataset" in query_params:
|
| 596 |
yield from _show_dataset(
|
|
|
|
| 1 |
import io
|
| 2 |
+
import os
|
| 3 |
import re
|
| 4 |
import time
|
| 5 |
from itertools import islice
|
|
|
|
| 11 |
import gradio as gr
|
| 12 |
import pandas as pd
|
| 13 |
import requests.exceptions
|
| 14 |
+
from huggingface_hub import InferenceClient, create_repo, whoami, DatasetCard
|
| 15 |
|
| 16 |
|
| 17 |
model_id = "microsoft/Phi-3-mini-4k-instruct"
|
| 18 |
client = InferenceClient(model_id)
|
| 19 |
+
save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
|
| 20 |
|
| 21 |
MAX_TOTAL_NB_ITEMS = 100 # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
|
| 22 |
MAX_NB_ITEMS_PER_GENERATION_CALL = 10
|
| 23 |
NUM_ROWS = 100
|
| 24 |
NUM_VARIANTS = 10
|
| 25 |
+
NAMESPACE = "infinite-dataset-hub"
|
| 26 |
URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
|
| 27 |
|
| 28 |
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
|
|
|
|
| 57 |
"very nice but still plausible",
|
| 58 |
]
|
| 59 |
|
|
|
|
|
|
|
| 60 |
landing_page_datasets_generated_text = """
|
| 61 |
1. NewsEventsPredict (classification, media, trend)
|
| 62 |
2. FinancialForecast (economy, stocks, regression)
|
|
|
|
| 72 |
default_output = landing_page_datasets_generated_text.strip().split("\n")
|
| 73 |
assert len(default_output) == MAX_NB_ITEMS_PER_GENERATION_CALL
|
| 74 |
|
| 75 |
+
DATASET_CARD_CONTENT = """
|
| 76 |
+
---
|
| 77 |
+
license: mit
|
| 78 |
+
tags:
|
| 79 |
+
- infinite-dataset-hub
|
| 80 |
+
- synthetic
|
| 81 |
+
---
|
| 82 |
+
|
| 83 |
+
{title}
|
| 84 |
+
|
| 85 |
+
_Note: This is an AI-generated dataset so its content may be inaccurate or false_
|
| 86 |
+
|
| 87 |
+
{content}
|
| 88 |
+
|
| 89 |
+
**Source of the data:**
|
| 90 |
+
|
| 91 |
+
The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':
|
| 92 |
+
|
| 93 |
+
- **Dataset Generation Page**: {dataset_url}
|
| 94 |
+
- **Model**: https://huggingface.co/{model_id}
|
| 95 |
+
- **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
|
| 96 |
+
"""
|
| 97 |
+
|
| 98 |
css = """
|
| 99 |
a {
|
| 100 |
color: var(--body-text-color);
|
|
|
|
| 169 |
color: transparent;
|
| 170 |
background-clip: text;
|
| 171 |
}
|
| 172 |
+
.settings {
|
| 173 |
+
background: transparent;
|
| 174 |
+
}
|
| 175 |
+
.settings button span {
|
| 176 |
+
color: var(--body-text-color-subdued);
|
| 177 |
+
}
|
| 178 |
"""
|
| 179 |
|
| 180 |
|
| 181 |
with gr.Blocks(css=css) as demo:
|
| 182 |
generated_texts_state = gr.State((landing_page_datasets_generated_text,))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
with gr.Column() as search_page:
|
| 184 |
with gr.Row():
|
|
|
|
|
|
|
| 185 |
with gr.Column(scale=10):
|
| 186 |
+
gr.Markdown(
|
| 187 |
+
"# 🤗 Infinite Dataset Hub ♾️\n\n"
|
| 188 |
+
"An endless catalog of datasets, created just for you.\n\n"
|
| 189 |
+
)
|
| 190 |
with gr.Row():
|
| 191 |
search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets, get infinite results", show_label=False, container=False, scale=9)
|
| 192 |
search_button = gr.Button("🔍", variant="primary", scale=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
button_groups: list[gr.Group] = []
|
| 194 |
buttons: list[gr.Button] = []
|
| 195 |
for i in range(MAX_TOTAL_NB_ITEMS):
|
|
|
|
| 211 |
|
| 212 |
load_more_datasets = gr.Button("Load more datasets") # TODO: disable when reaching end of page
|
| 213 |
gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
|
| 214 |
+
with gr.Column(scale=4, min_width="200px"):
|
| 215 |
+
with gr.Accordion("Settings", open=False, elem_classes="settings"):
|
| 216 |
+
gr.Markdown("Save datasets to your account")
|
| 217 |
+
gr.LoginButton()
|
| 218 |
+
select_namespace_dropdown = gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, label="Select user or organization", visible=False)
|
| 219 |
+
gr.Markdown("Save datasets as public or private datasets")
|
| 220 |
+
visibility_radio = gr.Radio(["public", "private"], value="public", container=False, interactive=False)
|
| 221 |
with gr.Column(visible=False) as dataset_page:
|
| 222 |
+
gr.Markdown(
|
| 223 |
+
"# 🤗 Infinite Dataset Hub ♾️\n\n"
|
| 224 |
+
"An endless catalog of datasets, created just for you.\n\n"
|
| 225 |
+
)
|
| 226 |
dataset_title = gr.Markdown()
|
| 227 |
gr.Markdown("_Note: This is an AI-generated dataset so its content may be inaccurate or false_")
|
| 228 |
dataset_content = gr.Markdown()
|
| 229 |
generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
|
| 230 |
dataset_dataframe = gr.DataFrame(visible=False, interactive=False, wrap=True)
|
| 231 |
save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
|
| 232 |
+
open_dataset_message = gr.Markdown("", visible=False)
|
| 233 |
dataset_share_button = gr.Button("Share Dataset URL")
|
| 234 |
dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
|
| 235 |
back_button = gr.Button("< Back", size="sm")
|
|
|
|
|
|
|
| 236 |
|
| 237 |
###################################
|
| 238 |
#
|
|
|
|
| 278 |
|
| 279 |
def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
|
| 280 |
search_query = search_query or ""
|
| 281 |
+
search_query = search_query[:1000] if search_query.strip() else ""
|
| 282 |
generated_text = ""
|
| 283 |
current_line = ""
|
| 284 |
for token in stream_reponse(
|
|
|
|
| 297 |
|
| 298 |
def gen_dataset_content(search_query: str, dataset_name: str, tags: str) -> Iterator[str]:
|
| 299 |
search_query = search_query or ""
|
| 300 |
+
search_query = search_query[:1000] if search_query.strip() else ""
|
| 301 |
generated_text = ""
|
| 302 |
for token in stream_reponse(GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
|
| 303 |
search_query=search_query,
|
|
|
|
| 442 |
|
| 443 |
|
| 444 |
def _search_datasets(search_query):
|
| 445 |
+
yield {generated_texts_state: []}
|
| 446 |
yield {
|
| 447 |
button_group: gr.Group(elem_classes="buttonsGroup insivibleButtonGroup")
|
| 448 |
for button_group in button_groups[MAX_NB_ITEMS_PER_GENERATION_CALL:]
|
|
|
|
| 477 |
current_item_idx += 1
|
| 478 |
|
| 479 |
|
| 480 |
+
@search_button.click(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state])
def search_dataset_from_search_button(search_query):
    """Handle a click on the search button.

    Receives the current search bar text and re-yields every update
    produced by ``_search_datasets`` for the button groups, the buttons
    and ``generated_texts_state``.
    """
    yield from _search_datasets(search_query)
|
| 483 |
|
| 484 |
|
| 485 |
+
@search_bar.submit(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state])
def search_dataset_from_search_bar(search_query):
    """Handle a submit event on the search bar.

    Receives the current search bar text and re-yields every update
    produced by ``_search_datasets`` for the button groups, the buttons
    and ``generated_texts_state``.
    """
    yield from _search_datasets(search_query)
|
| 488 |
|
|
|
|
| 521 |
dataset_title: f"# {dataset_name}\n\n tags: {tags}",
|
| 522 |
dataset_share_textbox: gr.Textbox(visible=False),
|
| 523 |
dataset_dataframe: gr.DataFrame(visible=False),
|
| 524 |
+
generate_full_dataset_button: gr.Button(interactive=True),
|
| 525 |
save_dataset_button: gr.Button(visible=False),
|
| 526 |
+
open_dataset_message: gr.Markdown(visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
}
|
| 528 |
for generated_text in gen_dataset_content(search_query=search_query, dataset_name=dataset_name, tags=tags):
|
| 529 |
yield {dataset_content: generated_text}
|
| 530 |
|
| 531 |
|
| 532 |
show_dataset_inputs = [search_bar, *buttons]
|
| 533 |
+
show_dataset_outputs = [search_page, dataset_page, dataset_title, dataset_content, generate_full_dataset_button, dataset_dataframe, save_dataset_button, open_dataset_message, dataset_share_textbox]
|
| 534 |
scroll_to_top_js = """
|
| 535 |
function (...args) {
|
| 536 |
console.log(args);
|
|
|
|
| 557 |
return gr.Column(visible=True), gr.Column(visible=False)
|
| 558 |
|
| 559 |
|
| 560 |
+
@generate_full_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, select_namespace_dropdown, visibility_radio], outputs=[dataset_dataframe, generate_full_dataset_button, save_dataset_button])
|
| 561 |
+
def generate_full_dataset(title, content, search_query, namespace, visability):
|
| 562 |
dataset_name, tags = title.strip("# ").split("\ntags:", 1)
|
| 563 |
dataset_name, tags = dataset_name.strip(), tags.strip()
|
| 564 |
csv_header, preview_df = parse_preview_df(content)
|
|
|
|
| 576 |
output[:len(preview_df)] = [{"idx": i, **x} for i, x in enumerate(preview_df.to_dict(orient="records"))]
|
| 577 |
yield {
|
| 578 |
dataset_dataframe: gr.DataFrame(pd.DataFrame([{"idx": i, **x} for i, x in enumerate(output) if x]), visible=True),
|
| 579 |
+
generate_full_dataset_button: gr.Button(interactive=False),
|
| 580 |
+
save_dataset_button: gr.Button(f"💾 Save Dataset {namespace}/{dataset_name}" + (" (private)" if visability != "public" else ""), visible=True, interactive=False)
|
| 581 |
}
|
| 582 |
kwargs_iterable = [
|
| 583 |
{
|
|
|
|
| 593 |
]
|
| 594 |
for _ in iflatmap_unordered(generate_partial_dataset, kwargs_iterable=kwargs_iterable):
|
| 595 |
yield {dataset_dataframe: pd.DataFrame([{"idx": i, **{column_name: x.get(column_name) for column_name in columns}} for i, x in enumerate(output) if x])}
|
| 596 |
+
yield {save_dataset_button: gr.Button(interactive=True)}
|
| 597 |
print(f"Generated {dataset_name}!")
|
| 598 |
|
| 599 |
|
| 600 |
+
@save_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, dataset_dataframe, select_namespace_dropdown, visibility_radio], outputs=[save_dataset_button, open_dataset_message])
def save_dataset(title: str, content: str, search_query: str, df: pd.DataFrame, namespace: str, visability: str, oauth_token: Optional[gr.OAuthToken]):
    """Save the generated dataset and its dataset card to the Hugging Face Hub.

    Args:
        title: Markdown title of the dataset page, expected to look like
            ``"# <dataset name>\\n\\ntags: <tags>"``.
        content: Markdown preview/description of the dataset; embedded in the card.
        search_query: Search query that led to this dataset (may be empty).
        df: The full generated dataset, uploaded as ``data.csv``.
        namespace: User or organization that will own the dataset repository.
        visability: ``"public"`` creates a public repository; any other value
            creates a private one.
        oauth_token: Token of the logged-in user. When absent, the token from
            ``save_dataset_hf_token`` is used instead.

    Yields:
        Component updates: first the save button is disabled, then the
        success message linking to the saved dataset is shown.

    Raises:
        ValueError: If ``title`` does not contain a ``"\\ntags:"`` section.
    """
    # Function-scope import so the handler does not depend on file-level imports.
    from urllib.parse import urlencode

    # Same normalisation as the gen_* functions: treat a missing query as empty.
    search_query = search_query or ""
    dataset_name, tags = title.strip("# ").split("\ntags:", 1)
    dataset_name, tags = dataset_name.strip(), tags.strip()
    token = oauth_token.token if oauth_token else save_dataset_hf_token
    repo_id = f"{namespace}/{dataset_name}"
    # urlencode escapes '&', '#', '=', '+', '%' and non-ASCII characters (spaces
    # still become '+'), so the link stays valid whatever the query/name/tags contain.
    query_string = urlencode({"q": search_query, "dataset": dataset_name, "tags": tags})
    dataset_url = f"{URL}?{query_string}"
    gr.Info("Saving dataset...")
    # Disable the button while uploading to avoid concurrent saves of the same dataset.
    yield {save_dataset_button: gr.Button(interactive=False)}
    create_repo(repo_id=repo_id, repo_type="dataset", private=visability!="public", exist_ok=True, token=token)
    df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
    DatasetCard(DATASET_CARD_CONTENT.format(title=title, content=content, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
    gr.Info(f"✅ Dataset saved at {repo_id}")
    additional_message = "PS: You can also save datasets under your account in the Settings ;)"
    yield {open_dataset_message: gr.Markdown(f"# 🎉 Yay ! Your dataset has been saved to [{repo_id}](https://huggingface.co/datasets/{repo_id}) !\n\nDataset link: [https://huggingface.co/datasets/{repo_id}](https://huggingface.co/datasets/{repo_id})\n\n{additional_message}", visible=True)}
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
@dataset_share_button.click(inputs=[dataset_title, search_bar], outputs=[dataset_share_textbox])
def show_dataset_url(title, search_query):
    """Reveal the share textbox filled with a URL that reopens this dataset page.

    Args:
        title: Markdown title of the dataset page, expected to look like
            ``"# <dataset name>\\n\\ntags: <tags>"``.
        search_query: Search query that led to this dataset (may be empty).

    Returns:
        A visible ``gr.Textbox`` update whose value is the shareable URL.

    Raises:
        ValueError: If ``title`` does not contain a ``"\\ntags:"`` section.
    """
    # Function-scope import so the handler does not depend on file-level imports.
    from urllib.parse import urlencode

    dataset_name, tags = title.strip("# ").split("\ntags:", 1)
    dataset_name, tags = dataset_name.strip(), tags.strip()
    # urlencode escapes '&', '#', '=', '+', '%' and non-ASCII characters (spaces
    # still become '+'), so the shared link round-trips through the query string.
    query_string = urlencode({"q": search_query or "", "dataset": dataset_name, "tags": tags})
    return gr.Textbox(
        f"{URL}?{query_string}",
        visible=True,
    )
|
| 625 |
|
| 626 |
+
@demo.load(outputs=show_dataset_outputs + button_groups + buttons + [generated_texts_state] + [select_namespace_dropdown, visibility_radio])
|
| 627 |
+
def load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
|
| 628 |
+
if oauth_token:
|
| 629 |
+
user_info = whoami(oauth_token.token)
|
| 630 |
+
yield {
|
| 631 |
+
select_namespace_dropdown: gr.Dropdown(
|
| 632 |
+
choices=[user_info["name"]] + [org_info["name"] for org_info in user_info["orgs"]],
|
| 633 |
+
value=user_info["name"],
|
| 634 |
+
visible=True,
|
| 635 |
+
),
|
| 636 |
+
visibility_radio: gr.Radio(interactive=True),
|
| 637 |
+
}
|
| 638 |
query_params = dict(request.query_params)
|
| 639 |
if "dataset" in query_params:
|
| 640 |
yield from _show_dataset(
|