Spaces:
Sleeping
Sleeping
edbeeching
committed on
Commit
·
8854100
1
Parent(s):
b6d1901
bugfixes and improved validation
Browse files
app.py
CHANGED
|
@@ -8,6 +8,13 @@ from enum import Enum
|
|
| 8 |
from datasets import get_dataset_infos
|
| 9 |
from transformers import AutoConfig
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
class GenerationStatus(Enum):
|
| 13 |
PENDING = "PENDING"
|
|
@@ -27,6 +34,7 @@ class GenerationRequest:
|
|
| 27 |
input_dataset_name: str
|
| 28 |
input_dataset_config: str
|
| 29 |
input_dataset_split: str
|
|
|
|
| 30 |
prompt_column: str
|
| 31 |
model_name_or_path: str
|
| 32 |
model_revision: str
|
|
@@ -55,7 +63,7 @@ def validate_request(request: GenerationRequest):
|
|
| 55 |
raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(input_dataset_info.splits.keys())}")
|
| 56 |
|
| 57 |
# check that the number of samples is less than MAX_SAMPLES
|
| 58 |
-
if input_dataset_info.splits[request.input_dataset_split].
|
| 59 |
raise Exception(f"Dataset split {request.input_dataset_split} in dataset {request.input_dataset_name} exceeds max sample limit of {MAX_SAMPLES}.")
|
| 60 |
|
| 61 |
# check the prompt column exists in the dataset
|
|
@@ -67,6 +75,7 @@ def validate_request(request: GenerationRequest):
|
|
| 67 |
try:
|
| 68 |
model_config = AutoConfig.from_pretrained(request.model_name_or_path, revision=request.model_revision, token=request.model_token)
|
| 69 |
except Exception as e:
|
|
|
|
| 70 |
raise Exception(f"Model {request.model_name_or_path} revision {request.model_revision} does not exist or cannot be accessed with the provided token.")
|
| 71 |
|
| 72 |
# check the model max position embeddings is greater than the requested max tokens and less than MAX_TOKENS
|
|
@@ -91,36 +100,42 @@ def validate_request(request: GenerationRequest):
|
|
| 91 |
def add_request_to_db(request: GenerationRequest):
|
| 92 |
url: str = os.getenv("SUPABASE_URL")
|
| 93 |
key: str = os.getenv("SUPABASE_KEY")
|
| 94 |
-
options: ClientOptions = {
|
| 95 |
-
"schema": "public"
|
| 96 |
-
}
|
| 97 |
-
supabase: Client = create_client(url, key, options)
|
| 98 |
-
|
| 99 |
-
data = {
|
| 100 |
-
"status": request.status.value,
|
| 101 |
-
"input_dataset_name": request.input_dataset_name,
|
| 102 |
-
"input_dataset_config": request.input_dataset_config,
|
| 103 |
-
"input_dataset_split": request.input_dataset_split,
|
| 104 |
-
"prompt_column": request.prompt_column,
|
| 105 |
-
"model_name_or_path": request.model_name_or_path,
|
| 106 |
-
"model_revision": request.model_revision,
|
| 107 |
-
"model_token": request.model_token,
|
| 108 |
-
"system_prompt": request.system_prompt,
|
| 109 |
-
"max_tokens": request.max_tokens,
|
| 110 |
-
"temperature": request.temperature,
|
| 111 |
-
"top_k": request.top_k,
|
| 112 |
-
"top_p": request.top_p,
|
| 113 |
-
"input_dataset_token": request.input_dataset_token,
|
| 114 |
-
"output_dataset_token": request.output_dataset_token,
|
| 115 |
-
"username": request.username,
|
| 116 |
-
"email": request.email
|
| 117 |
-
}
|
| 118 |
-
|
| 119 |
-
response = supabase.table("generation-requests").insert(data).execute()
|
| 120 |
-
if response.status_code != 201:
|
| 121 |
-
raise Exception(f"Failed to add request to database: {response.data}")
|
| 122 |
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
def create_gradio_interface():
|
|
@@ -130,11 +145,7 @@ def create_gradio_interface():
|
|
| 130 |
gr.Markdown("# Synthetic Data Generation Request")
|
| 131 |
with gr.Row():
|
| 132 |
gr.Markdown("""
|
| 133 |
-
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models.
|
| 134 |
-
|
| 135 |
-
Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
| 136 |
-
|
| 137 |
-
|
| 138 |
""")
|
| 139 |
with gr.Group():
|
| 140 |
with gr.Row():
|
|
@@ -153,8 +164,6 @@ def create_gradio_interface():
|
|
| 153 |
- Model must be accessible (public or with valid token)
|
| 154 |
- Maximum 10,000 samples per dataset
|
| 155 |
- Maximum of 32k generation tokens
|
| 156 |
-
|
| 157 |
-
**Note:** Generation requests are processed asynchronously. You will be notified via email when your request is complete.
|
| 158 |
""")
|
| 159 |
|
| 160 |
with gr.Row():
|
|
@@ -184,7 +193,7 @@ def create_gradio_interface():
|
|
| 184 |
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
|
| 185 |
with gr.Row():
|
| 186 |
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 187 |
-
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.
|
| 188 |
with gr.Column():
|
| 189 |
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
|
| 190 |
|
|
@@ -202,15 +211,16 @@ def create_gradio_interface():
|
|
| 202 |
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 203 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 204 |
|
| 205 |
-
def submit_request(
|
| 206 |
-
max_tok, temp, top_k_val, top_p_val,
|
| 207 |
try:
|
| 208 |
request = GenerationRequest(
|
| 209 |
id="", # Will be generated when adding to the database
|
| 210 |
status=GenerationStatus.PENDING,
|
| 211 |
-
input_dataset_name=
|
| 212 |
input_dataset_split=input_split,
|
| 213 |
input_dataset_config=input_dataset_config,
|
|
|
|
| 214 |
prompt_column=prompt_col,
|
| 215 |
model_name_or_path=model_name,
|
| 216 |
model_revision=model_rev,
|
|
@@ -220,7 +230,6 @@ def create_gradio_interface():
|
|
| 220 |
temperature=temp,
|
| 221 |
top_k=int(top_k_val),
|
| 222 |
top_p=top_p_val,
|
| 223 |
-
output_dataset_name=output_ds,
|
| 224 |
input_dataset_token=input_dataset_token if input_dataset_token else None,
|
| 225 |
output_dataset_token=output_dataset_token,
|
| 226 |
username=user,
|
|
@@ -237,9 +246,9 @@ def create_gradio_interface():
|
|
| 237 |
|
| 238 |
submit_btn.click(
|
| 239 |
submit_request,
|
| 240 |
-
inputs=[input_dataset_name, input_dataset_split, prompt_column, model_name_or_path,
|
| 241 |
model_revision, model_token, system_prompt, max_tokens, temperature, top_k, top_p,
|
| 242 |
-
|
| 243 |
outputs=output_status
|
| 244 |
)
|
| 245 |
|
|
|
|
| 8 |
from datasets import get_dataset_infos
|
| 9 |
from transformers import AutoConfig
|
| 10 |
|
| 11 |
+
"""
|
| 12 |
+
Still TODO:
|
| 13 |
+
- validate the user is PRO
|
| 14 |
+
- check the output dataset token is valid
|
| 15 |
+
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
|
| 19 |
class GenerationStatus(Enum):
|
| 20 |
PENDING = "PENDING"
|
|
|
|
| 34 |
input_dataset_name: str
|
| 35 |
input_dataset_config: str
|
| 36 |
input_dataset_split: str
|
| 37 |
+
output_dataset_name: str
|
| 38 |
prompt_column: str
|
| 39 |
model_name_or_path: str
|
| 40 |
model_revision: str
|
|
|
|
| 63 |
raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(input_dataset_info.splits.keys())}")
|
| 64 |
|
| 65 |
# check that the number of samples is less than MAX_SAMPLES
|
| 66 |
+
if input_dataset_info.splits[request.input_dataset_split].num_examples > MAX_SAMPLES:
|
| 67 |
raise Exception(f"Dataset split {request.input_dataset_split} in dataset {request.input_dataset_name} exceeds max sample limit of {MAX_SAMPLES}.")
|
| 68 |
|
| 69 |
# check the prompt column exists in the dataset
|
|
|
|
| 75 |
try:
|
| 76 |
model_config = AutoConfig.from_pretrained(request.model_name_or_path, revision=request.model_revision, token=request.model_token)
|
| 77 |
except Exception as e:
|
| 78 |
+
print(e)
|
| 79 |
raise Exception(f"Model {request.model_name_or_path} revision {request.model_revision} does not exist or cannot be accessed with the provided token.")
|
| 80 |
|
| 81 |
# check the model max position embeddings is greater than the requested max tokens and less than MAX_TOKENS
|
|
|
|
| 100 |
def add_request_to_db(request: GenerationRequest):
    """Insert a generation request into the Supabase "gen-requests" table.

    Connection credentials are read from the SUPABASE_URL and SUPABASE_KEY
    environment variables.

    Args:
        request: the validated GenerationRequest to persist.

    Raises:
        Exception: if the client cannot be created or the insert fails.
            The original error is chained as the cause so failures are
            debuggable (the previous version discarded it).
    """
    url: str = os.getenv("SUPABASE_URL")
    key: str = os.getenv("SUPABASE_KEY")

    try:
        supabase: Client = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            ),
        )

        # NOTE(review): user tokens and email are stored verbatim in the
        # table — confirm access to "gen-requests" is restricted.
        data = {
            "status": request.status.value,
            "input_dataset_name": request.input_dataset_name,
            "input_dataset_config": request.input_dataset_config,
            "input_dataset_split": request.input_dataset_split,
            "output_dataset_name": request.output_dataset_name,
            "prompt_column": request.prompt_column,
            "model_name_or_path": request.model_name_or_path,
            "model_revision": request.model_revision,
            "model_token": request.model_token,
            "system_prompt": request.system_prompt,
            "max_tokens": request.max_tokens,
            "temperature": request.temperature,
            "top_k": request.top_k,
            "top_p": request.top_p,
            "input_dataset_token": request.input_dataset_token,
            "output_dataset_token": request.output_dataset_token,
            "username": request.username,
            "email": request.email,
        }

        supabase.table("gen-requests").insert(data).execute()
    except Exception as e:
        # Chain the underlying error instead of swallowing it: `e` was
        # previously captured but never used, hiding the real failure.
        raise Exception("Failed to add request to database") from e
|
| 139 |
|
| 140 |
|
| 141 |
def create_gradio_interface():
|
|
|
|
| 145 |
gr.Markdown("# Synthetic Data Generation Request")
|
| 146 |
with gr.Row():
|
| 147 |
gr.Markdown("""
|
| 148 |
+
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
""")
|
| 150 |
with gr.Group():
|
| 151 |
with gr.Row():
|
|
|
|
| 164 |
- Model must be accessible (public or with valid token)
|
| 165 |
- Maximum 10,000 samples per dataset
|
| 166 |
- Maximum of 32k generation tokens
|
|
|
|
|
|
|
| 167 |
""")
|
| 168 |
|
| 169 |
with gr.Row():
|
|
|
|
| 193 |
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
|
| 194 |
with gr.Row():
|
| 195 |
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 196 |
+
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
|
| 197 |
with gr.Column():
|
| 198 |
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
|
| 199 |
|
|
|
|
| 211 |
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 212 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 213 |
|
| 214 |
+
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, model_token, sys_prompt,
|
| 215 |
+
max_tok, temp, top_k_val, top_p_val, user, email_addr, input_dataset_token, output_dataset_token):
|
| 216 |
try:
|
| 217 |
request = GenerationRequest(
|
| 218 |
id="", # Will be generated when adding to the database
|
| 219 |
status=GenerationStatus.PENDING,
|
| 220 |
+
input_dataset_name=input_dataset_name,
|
| 221 |
input_dataset_split=input_split,
|
| 222 |
input_dataset_config=input_dataset_config,
|
| 223 |
+
output_dataset_name=output_dataset_name,
|
| 224 |
prompt_column=prompt_col,
|
| 225 |
model_name_or_path=model_name,
|
| 226 |
model_revision=model_rev,
|
|
|
|
| 230 |
temperature=temp,
|
| 231 |
top_k=int(top_k_val),
|
| 232 |
top_p=top_p_val,
|
|
|
|
| 233 |
input_dataset_token=input_dataset_token if input_dataset_token else None,
|
| 234 |
output_dataset_token=output_dataset_token,
|
| 235 |
username=user,
|
|
|
|
| 246 |
|
| 247 |
submit_btn.click(
|
| 248 |
submit_request,
|
| 249 |
+
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|
| 250 |
model_revision, model_token, system_prompt, max_tokens, temperature, top_k, top_p,
|
| 251 |
+
username, email, input_dataset_token, output_dataset_token],
|
| 252 |
outputs=output_status
|
| 253 |
)
|
| 254 |
|