Spaces:
Runtime error
Runtime error
add share link + use locally
Browse files
app.py
CHANGED
|
@@ -1,20 +1,25 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
|
| 3 |
import duckdb
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
import requests
|
| 7 |
-
from duckdb import DuckDBPyRelation
|
| 8 |
from duckdb.typing import DuckDBPyType
|
| 9 |
from huggingface_hub import HfApi
|
| 10 |
|
|
|
|
| 11 |
Table = DuckDBPyRelation
|
| 12 |
Dtype = DuckDBPyType
|
| 13 |
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
|
| 14 |
-
|
| 15 |
-
|
|
|
|
| 16 |
NUM_TRENDING_DATASETS = 10
|
| 17 |
NUM_USER_DATASETS = 10
|
|
|
|
|
|
|
| 18 |
css = """
|
| 19 |
.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
|
| 20 |
background: var(--body-background-fill);
|
|
@@ -23,32 +28,53 @@ css = """
|
|
| 23 |
padding: var(--size-4) 0 !important;
|
| 24 |
max-width: 98% !important;
|
| 25 |
}
|
|
|
|
|
|
|
|
|
|
| 26 |
"""
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
| 31 |
|
| 32 |
-
def
|
| 33 |
-
query = ", ".join(
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
def
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
with gr.Blocks(css=css) as demo:
|
| 41 |
-
|
|
|
|
| 42 |
with gr.Row():
|
| 43 |
with gr.Column():
|
| 44 |
gr.Markdown("# <p style='text-align:center;'>π€ (WIP) Hugging Face Dataset Spreadsheets π</p>\n\n<p style='text-align:center;'>Edit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)")
|
| 45 |
with gr.Group():
|
| 46 |
-
with gr.
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
def show_subset_dropdown(dataset: str):
|
| 54 |
if dataset and "/" not in dataset.strip().strip("/"):
|
|
@@ -64,60 +90,103 @@ with gr.Blocks(css=css) as demo:
|
|
| 64 |
split = (splits or [""])[0]
|
| 65 |
return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
|
| 66 |
|
| 67 |
-
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
|
| 68 |
pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
|
| 69 |
-
if dataset and subset and split and pattern:
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
else:
|
| 72 |
-
|
| 73 |
-
|
| 74 |
|
| 75 |
-
@demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe])
|
| 76 |
-
def _fetch_datasets(
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
if
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
subsets, loading_codes = show_subset_dropdown(dataset)
|
|
|
|
| 83 |
splits = show_split_dropdown(subsets["value"], loading_codes)
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
return {
|
| 86 |
dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
|
| 87 |
loading_codes_json: loading_codes,
|
| 88 |
subset_dropdown: gr.Dropdown(**subsets),
|
| 89 |
split_dropdown: gr.Dropdown(**splits),
|
|
|
|
| 90 |
dataframe: gr.DataFrame(**input_dataframe),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
}
|
| 92 |
|
| 93 |
-
@dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, dataframe])
|
| 94 |
-
def _show_subset_dropdown(dataset: str):
|
| 95 |
subsets, loading_codes = show_subset_dropdown(dataset)
|
| 96 |
splits = show_split_dropdown(subsets["value"], loading_codes)
|
| 97 |
-
|
|
|
|
| 98 |
return {
|
| 99 |
loading_codes_json: loading_codes,
|
| 100 |
subset_dropdown: gr.Dropdown(**subsets),
|
| 101 |
split_dropdown: gr.Dropdown(**splits),
|
|
|
|
| 102 |
dataframe: gr.DataFrame(**input_dataframe),
|
| 103 |
}
|
| 104 |
|
| 105 |
-
@subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, dataframe])
|
| 106 |
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
|
| 107 |
splits = show_split_dropdown(subset, loading_codes)
|
| 108 |
-
|
|
|
|
| 109 |
return {
|
| 110 |
split_dropdown: gr.Dropdown(**splits),
|
|
|
|
| 111 |
dataframe: gr.DataFrame(**input_dataframe),
|
| 112 |
}
|
| 113 |
|
| 114 |
-
@split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[dataframe])
|
| 115 |
def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
|
| 116 |
-
|
|
|
|
| 117 |
return {
|
|
|
|
| 118 |
dataframe: gr.DataFrame(**input_dataframe),
|
| 119 |
}
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
if __name__ == "__main__":
|
| 123 |
demo.launch()
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from uuid import uuid4
|
| 3 |
|
| 4 |
import duckdb
|
| 5 |
import gradio as gr
|
| 6 |
import pandas as pd
|
| 7 |
import requests
|
| 8 |
+
from duckdb import DuckDBPyConnection, DuckDBPyRelation
|
| 9 |
from duckdb.typing import DuckDBPyType
|
| 10 |
from huggingface_hub import HfApi
|
| 11 |
|
| 12 |
+
# Short aliases for the DuckDB types used in annotations in this file.
Connection = DuckDBPyConnection
Table = DuckDBPyRelation
Dtype = DuckDBPyType

READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")

# In-memory database backing the placeholder table below.
memory_con = duckdb.connect(":memory:")
# Placeholder relation: 10 rows with four all-null columns.
empty_tbl = memory_con.sql(
    "SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)"
)

# Number of rows fetched from the edited dataset for display.
PAGE_SIZE = 5
NUM_TRENDING_DATASETS = 10
NUM_USER_DATASETS = 10

# Directory holding one DuckDB file per session; also used as the name of the
# URL query parameter that carries the session id.
SESSIONS_DIR = "s"
# Base URL used to build the share link and the "use locally" snippets.
URL = "https://huggingface.co/spaces/lhoestq/dataset-spreadsheets"
|
| 23 |
css = """
|
| 24 |
.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
|
| 25 |
background: var(--body-background-fill);
|
|
|
|
| 28 |
padding: var(--size-4) 0 !important;
|
| 29 |
max-width: 98% !important;
|
| 30 |
}
|
| 31 |
+
.cell-menu-button {
|
| 32 |
+
z-index: -1;
|
| 33 |
+
}
|
| 34 |
"""
|
| 35 |
|
| 36 |
+
def to_json_df(con: Connection, tbl: Table) -> pd.DataFrame:
    """Return ``tbl`` as a pandas DataFrame whose cells are JSON text.

    Every value is wrapped in a one-element list, cast to JSON, and element 0
    is cast back to VARCHAR. Cells whose text is exactly ``'null'`` become SQL
    NULL through ``nullif``. Output columns keep the names of ``tbl``.

    Column names are written as double-quoted SQL identifiers (embedded double
    quotes doubled) so that names containing spaces, punctuation or reserved
    words do not break the generated query.

    Args:
        con: DuckDB connection the query is run on.
        tbl: Relation to convert. The SQL text refers to it by the name
            ``tbl`` — presumably resolved by DuckDB from this local variable,
            so the parameter name must not change.

    Returns:
        A DataFrame with one text column per column of ``tbl``.
    """

    def quote(name: str) -> str:
        # Standard SQL identifier quoting: wrap in "..." and double any ".
        return '"' + name.replace('"', '""') + '"'

    query = ", ".join(
        f"nullif(([{quote(col)}]::JSON)[0]::VARCHAR, 'null') AS {quote(col)}"
        for col in tbl.columns
    )
    # Keep the con.sql() call in this frame: the query references `tbl`.
    return con.sql(f"SELECT {query} FROM tbl").df()
|
| 40 |
|
| 41 |
+
def from_json_df(con: Connection, df: pd.DataFrame, columns: list[str], dtypes: list[Dtype]) -> Table:
    """Build a typed relation from a DataFrame of JSON-text cells.

    For each ``(column, dtype)`` pair the cell is cast to JSON and then to
    ``dtype``; NULL cells stay NULL. For ``VARCHAR`` columns the first and last
    character of the cast result are sliced off (``[2:-2]``) to drop the
    surrounding double quotes.

    Column names are written as double-quoted SQL identifiers (embedded double
    quotes doubled) so that names containing spaces, punctuation or reserved
    words do not break the generated query.

    Args:
        con: DuckDB connection the query is run on.
        df: DataFrame holding JSON text. The SQL text refers to it by the name
            ``df`` — presumably resolved by DuckDB from this local variable,
            so the parameter name must not change.
        columns: Column names to read from ``df``.
        dtypes: Target DuckDB types, paired positionally with ``columns``.

    Returns:
        The relation produced by ``con.sql`` for the generated SELECT.
    """

    def quote(name: str) -> str:
        # Standard SQL identifier quoting: wrap in "..." and double any ".
        return '"' + name.replace('"', '""') + '"'

    def cast_expr(col: str, dtype: Dtype) -> str:
        ident = quote(col)
        type_name = str(dtype)
        # remove double quotes at the start and end
        # NOTE(review): slicing does not unescape JSON escapes inside the
        # string (e.g. \" or \n) — confirm whether that matters here.
        trim = "[2:-2]" if type_name == "VARCHAR" else ""
        return f"if({ident} IS null, null, ({ident}::JSON::{type_name}){trim}) AS {ident}"

    query = ", ".join(cast_expr(col, dtype) for col, dtype in zip(columns, dtypes))
    # Keep the con.sql() call in this frame: the query references `df`.
    return con.sql(f"SELECT {query} FROM df")
|
| 47 |
|
| 48 |
+
def setup_edits(con: Connection, dataset: str, pattern: str) -> None:
    """Create the ``dataset`` view, ``edits`` table and ``edited_dataset`` view.

    All three statements use ``IF NOT EXISTS``, so calling this again on a
    connection that already has them leaves the existing objects in place.

    * ``dataset`` selects everything from ``hf://datasets/{dataset}/{pattern}``.
    * ``edits`` has an integer primary key ``rowid`` plus one column per
      dataset column, with the same types.
    * ``edited_dataset`` positionally joins ``dataset`` with the edits laid out
      per row id and takes the edited value when it is not NULL.

    The dataset path is escaped for use inside a single-quoted SQL literal and
    column names are double-quoted, because the dataset name can originate
    from a URL query parameter and column names are arbitrary.

    Args:
        con: DuckDB connection on which the objects are created.
        dataset: Dataset repository id.
        pattern: File pattern inside the repository.
    """

    def quote(name: str) -> str:
        # Standard SQL identifier quoting: wrap in "..." and double any ".
        return '"' + name.replace('"', '""') + '"'

    # Double single quotes so the path cannot terminate the SQL string literal.
    source = f"hf://datasets/{dataset}/{pattern}".replace("'", "''")
    con.sql(f"CREATE VIEW IF NOT EXISTS dataset AS SELECT * FROM '{source}'")

    # A zero-row query is enough to read the schema.
    empty_dataset_tbl = con.sql("SELECT * FROM dataset LIMIT 0;")
    columns = empty_dataset_tbl.columns
    dtypes = empty_dataset_tbl.dtypes

    column_defs = ", ".join(f"{quote(col)} {dtype}" for col, dtype in zip(columns, dtypes))
    con.sql(f"CREATE TABLE IF NOT EXISTS edits(rowid INTEGER PRIMARY KEY, {column_defs})")

    merged_columns = ", ".join(
        f"ifnull(edits_per_rowid.{quote(col)}, dataset.{quote(col)}) AS {quote(col)}"
        for col in columns
    )
    con.sql(
        "CREATE VIEW IF NOT EXISTS edited_dataset AS "
        "WITH edits_per_rowid AS (SELECT * FROM (SELECT unnest(range(max(rowid) + 1)) AS rowid FROM edits) LEFT JOIN edits USING (rowid) ORDER BY rowid) "
        f"SELECT {merged_columns} FROM dataset POSITIONAL JOIN edits_per_rowid"
    )
|
| 59 |
+
gr.set_static_paths(paths=[SESSIONS_DIR + "/"])
|
| 60 |
|
| 61 |
with gr.Blocks(css=css) as demo:
|
| 62 |
+
session_state = gr.BrowserState()
|
| 63 |
+
loading_codes_json = gr.JSON([], visible=False)
|
| 64 |
with gr.Row():
|
| 65 |
with gr.Column():
|
| 66 |
gr.Markdown("# <p style='text-align:center;'>π€ (WIP) Hugging Face Dataset Spreadsheets π</p>\n\n<p style='text-align:center;'>Edit any dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)")
|
| 67 |
with gr.Group():
|
| 68 |
+
with gr.Tab("Select Dataset"):
|
| 69 |
+
with gr.Row():
|
| 70 |
+
dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
|
| 71 |
+
subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
|
| 72 |
+
split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
|
| 73 |
+
with gr.Tab("Share Link"):
|
| 74 |
+
share_link_textbox = gr.Textbox(label="Copy the link to the Spreadsheet:", show_copy_button=True, interactive=False)
|
| 75 |
+
with gr.Tab("Use Locally"):
|
| 76 |
+
use_locally_markdown = gr.Markdown()
|
| 77 |
+
dataframe = gr.DataFrame(to_json_df(memory_con, empty_tbl), interactive=True, wrap=True)
|
| 78 |
|
| 79 |
def show_subset_dropdown(dataset: str):
|
| 80 |
if dataset and "/" not in dataset.strip().strip("/"):
|
|
|
|
| 90 |
split = (splits or [""])[0]
|
| 91 |
return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
|
| 92 |
|
| 93 |
+
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str):
    """Return the ``gr.DataFrame`` kwargs for the first page of a session.

    Looks up the file pattern for ``subset``/``split`` in ``loading_codes``,
    opens (creating if needed) the session's DuckDB file under
    ``SESSIONS_DIR``, sets up the edit tables, and reads ``PAGE_SIZE`` rows of
    ``edited_dataset``. If any input is missing, the split is unknown, or the
    session id contains a path component, the empty placeholder table is
    returned instead.

    Args:
        dataset: Dataset repository id.
        subset: Config name to look up in ``loading_codes``.
        split: Split name to look up inside the matching config.
        loading_codes: Entries with ``"config_name"`` and
            ``"arguments" -> "splits"`` keys.
        session: Session id; used as the DuckDB file name (without extension).

    Returns:
        ``dict(value=<DataFrame of JSON text>)``.
    """
    # First matching config wins; an unknown split yields None instead of a KeyError.
    pattern = next(
        (
            loading_code["arguments"]["splits"].get(split)
            for loading_code in loading_codes
            if loading_code["config_name"] == subset
        ),
        None,
    )
    # The session id can come from the URL or browser storage: refuse anything
    # that is not a bare file name so it cannot escape SESSIONS_DIR.
    session_is_safe = bool(session) and os.path.basename(session) == session
    if not (session_is_safe and dataset and subset and split and pattern):
        return dict(value=to_json_df(memory_con, empty_tbl))

    duckdb_file = session + ".duckdb"
    os.makedirs(SESSIONS_DIR, exist_ok=True)
    con = duckdb.connect(os.path.join(SESSIONS_DIR, duckdb_file))
    try:
        setup_edits(con, dataset, pattern)
        # Uncomment to have one edit for testing
        # con.sql("INSERT OR REPLACE INTO edits SELECT 2 AS rowid, * FROM dataset LIMIT 1")
        tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE}")
        # to_json_df materialises a pandas DataFrame, so closing afterwards is safe.
        return dict(value=to_json_df(con, tbl))
    finally:
        # Release the connection instead of leaking one per call.
        con.close()
|
| 106 |
+
|
| 107 |
|
| 108 |
+
@demo.load(inputs=session_state, outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe, session_state, share_link_textbox, use_locally_markdown])
def _fetch_datasets(session: str | None, request: gr.Request):
    """Populate the UI on page load and resolve the session to use.

    The session id is taken from the ``SESSIONS_DIR`` query parameter, falling
    back to the value stored in the browser. A session id has the form
    ``namespace--name--subset--split--<uuid>``. It is discarded when it cannot
    be split into exactly five fields, or when a ``dataset`` query parameter
    names a different dataset. Without a usable session, the dataset is the
    ``dataset`` query parameter or the first trending dataset, and a new
    session id is generated.

    Args:
        session: Session id stored in the browser, if any.
        request: Incoming request; only its query parameters are read.

    Returns:
        A mapping from output component to its new value.
    """
    datasets = list(HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
    # A session passed in the URL takes precedence over the stored one.
    session = request.query_params.get(SESSIONS_DIR) or session
    subset = split = None
    if session:
        try:
            namespace, dataset_name, subset, split, _ = session.split("--")
        except (AttributeError, ValueError):
            # Not a string, or not exactly five "--"-separated fields: ignore
            # it rather than failing the whole page load.
            session = None
    if session:
        dataset = namespace + "/" + dataset_name
        if "dataset" in request.query_params and request.query_params["dataset"] != dataset:
            # An explicit dataset in the URL overrides a session for another dataset.
            session = None
            dataset = request.query_params["dataset"]
    else:
        dataset = request.query_params.get("dataset") or datasets[0].id

    subsets, loading_codes = show_subset_dropdown(dataset)
    subsets["value"] = subset if session else subsets["value"]
    splits = show_split_dropdown(subsets["value"], loading_codes)
    splits["value"] = split if session else splits["value"]
    session = session if isinstance(session, str) else f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
    input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
    return {
        dataset_dropdown: gr.Dropdown(choices=[ds.id for ds in datasets], value=dataset),
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
        session_state: session,
        dataframe: gr.DataFrame(**input_dataframe),
        share_link_textbox: f"{URL}?{SESSIONS_DIR}={session}",
        use_locally_markdown: (
            # The closing quote must end the URL, before "AS db".
            f"""In DuckDB:\n\n```sql\nATTACH '{URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db;\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
            f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
        )
    }
|
| 139 |
|
| 140 |
+
# session_state is listed as an input so that the number of inputs matches the
# handler's two parameters (session, dataset).
@dataset_dropdown.select(inputs=[session_state, dataset_dropdown], outputs=[session_state, loading_codes_json, subset_dropdown, split_dropdown, dataframe])
def _show_subset_dropdown(session: str | None, dataset: str):
    """Handle a dataset selection: refresh subset/split choices and the table.

    A new session id is always generated for the newly selected dataset; the
    incoming ``session`` value is not reused.

    Args:
        session: Current session id (ignored, replaced by a new one).
        dataset: Dataset repository id picked in the dropdown.

    Returns:
        A mapping from output component to its new value.
    """
    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    session = f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
    input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session)
    return {
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
        session_state: session,
        dataframe: gr.DataFrame(**input_dataframe),
    }
|
| 153 |
|
| 154 |
+
@subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[session_state, split_dropdown, dataframe])
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
    """Handle a subset selection: refresh the split choices and the table.

    Starts a new session id built from the dataset, the subset, the default
    split for that subset, and a fresh UUID.
    """
    split_kwargs = show_split_dropdown(subset, loading_codes)
    chosen_split = split_kwargs["value"]
    new_session = f"{dataset.replace('/', '--')}--{subset}--{chosen_split}--{uuid4()}"
    table_kwargs = show_input_dataframe(dataset, subset, chosen_split, loading_codes, new_session)

    updates = {}
    updates[split_dropdown] = gr.Dropdown(**split_kwargs)
    updates[session_state] = new_session
    updates[dataframe] = gr.DataFrame(**table_kwargs)
    return updates
|
| 164 |
|
| 165 |
+
@split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[session_state, dataframe])
def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
    """Handle a split selection: start a new session and reload the table.

    Returns a mapping from output component to its new value.
    """
    new_session = f"{dataset.replace('/', '--')}--{subset}--{split}--{uuid4()}"
    table_kwargs = show_input_dataframe(dataset, subset, split, loading_codes, new_session)

    updates = {}
    updates[session_state] = new_session
    updates[dataframe] = gr.DataFrame(**table_kwargs)
    return updates
|
| 173 |
+
|
| 174 |
+
@dataframe.input(inputs=[dataframe, session_state, dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json])
def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict]):
    """Persist the currently displayed rows as edits in the session database.

    The rows of ``df`` are converted back to the column types of the ``edits``
    table and written with ``INSERT OR REPLACE`` under row ids
    ``0 .. len(df) - 1``. Nothing is written when any input is missing, the
    split is unknown, or the session id contains a path component.

    Args:
        df: Current content of the dataframe component (JSON text cells).
        session: Session id; used as the DuckDB file name (without extension).
        dataset: Dataset repository id.
        subset: Config name to look up in ``loading_codes``.
        split: Split name to look up inside the matching config.
        loading_codes: Entries with ``"config_name"`` and
            ``"arguments" -> "splits"`` keys.
    """
    # First matching config wins; an unknown split (the dropdown accepts custom
    # values) yields None instead of a KeyError.
    pattern = next(
        (
            loading_code["arguments"]["splits"].get(split)
            for loading_code in loading_codes
            if loading_code["config_name"] == subset
        ),
        None,
    )
    # The session id is stored client-side: refuse anything that is not a bare
    # file name so it cannot escape SESSIONS_DIR.
    session_is_safe = bool(session) and os.path.basename(session) == session
    if not (session_is_safe and dataset and subset and split and pattern):
        return

    duckdb_file = session + ".duckdb"
    os.makedirs(SESSIONS_DIR, exist_ok=True)
    con = duckdb.connect(os.path.join(SESSIONS_DIR, duckdb_file))
    try:
        setup_edits(con, dataset, pattern)
        # Zero-row query: only the schema (minus rowid) is needed.
        empty_dataset_tbl = con.sql("SELECT * EXCLUDE (rowid) FROM edits LIMIT 0;")
        columns = empty_dataset_tbl.columns
        dtypes = empty_dataset_tbl.dtypes
        # The INSERT below references this relation by the name `tbl`.
        tbl = from_json_df(con, df, columns=columns, dtypes=dtypes)
        # TODO add edits for page > 1
        con.sql(f"INSERT OR REPLACE INTO edits SELECT * FROM (SELECT unnest(range({len(df)})) AS rowid) POSITIONAL JOIN tbl")
    finally:
        # Release the connection instead of leaking one per edit.
        con.close()
    print(f"Saved {dataset} edits")
|
| 189 |
+
|
| 190 |
|
| 191 |
if __name__ == "__main__":
|
| 192 |
demo.launch()
|