Spaces:

loubnabnl
/

the-stack-bot

Runtime error

App Files Files Community

loubnabnl HF Staff committed on Dec 29, 2022

Commit

72c2877

1 Parent(s): 2171b06

update code

Browse files

Files changed (3) hide show

app.py +38 -35
languages.json → utils/languages.json +3 -3
utils/table_contents.md +9 -0

app.py CHANGED Viewed

@@ -2,19 +2,23 @@ import json
 import requests
 import streamlit as st
 st.title("The Stack Bot 🤖")
 intro = """
 The Stack Bot is a tool to help you get started with tools developed in [BigCode](https://huggingface.co/bigcode),
 such as [The Stack](https://huggingface.co/bigcode/the-stack) dataset and [SantaCoder](https://huggingface.co/bigcode/santacoder) model.
-We show information about existing programming languages and models trained on them. If you trained a model on The Stack, let us know so we feature your model! 🚀
 """
 st.markdown(intro, unsafe_allow_html=True)
 @st.cache()
 def load_languages():
-    with open("languages.json", "r") as f:
         languages = json.load(f)
     return languages
@@ -22,7 +26,11 @@ def how_to_load(language):
     text = f"""
     ```python
     from datasets import load_dataset
-    dataset = load_dataset("bigcode/the-stack", data_dir=f"data/{language}, split="train")
     ```
     """
     st.markdown(text)
@@ -34,43 +42,37 @@ def load_model(values, language):
         You can also train your own model on The Stack using the instructions below 🚀"""
         st.write(text)
         if st.button("Fine-tune your own model", key=4):
-            st.write("Code available at [GitHub link] + add preview + example of time & required hardware estimation")
     else:
-        text = f"""{model} is a model that was trained on the {language} from The Stack. Here's how to use it:"""
         code = f"""
         ```python
         from transformers import AutoModelForCausalLM, AutoTokenizer
-        device = "cuda" # for GPU usage or "cpu" for CPU usage
         tokenizer = AutoTokenizer.from_pretrained({model})
-        model = AutoModelForCausalLM.from_pretrained({model}, trust_remote_code=True).to(device)
-        inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
         outputs = model.generate(inputs)
         print(tokenizer.decode(outputs[0]))
         ```
         """
         st.write(text)
         st.markdown(code)
-        st.write("The scores of this model are the following:")
-        for key, value in values["scores"].items():
-                st.write(f"{key}: {value}")
 def generate_code(
    demo, gen_prompt, max_new_tokens=40, temperature=0.2, seed=0
 ):
     # call space using its API endpoint
-    try:
-        url = (
-            f"https://hf.space/embed/{demo.lower()}/+/api/predict/"
-        )
-        r = requests.post(
-            url=url, json={"data": [gen_prompt, max_new_tokens, temperature, seed]}
-        )
-        generated_text = r.json()["data"][0]
-    except:
-        generated_text = ""
     return generated_text
 def init_nested_buttons():
@@ -86,9 +88,9 @@ def init_nested_buttons():
 languages = load_languages()
-col1, col2 = st.columns([1, 2])
 with col1:
-    selected_language = st.selectbox("Languages of The Stack", list(languages.keys()), key=1)
 st.write(f"Here's how you can load the {selected_language.capitalize()} subset of The Stack:")
 code = how_to_load(selected_language)
@@ -101,21 +103,22 @@ if st.session_state["Models trained on dataset"]:
     load_model(languages[selected_language], selected_language)
     if languages[selected_language]["model"] and languages[selected_language]["gradio_demo"]:
-        st.write(f"Here's a demo to try the model, for more flxibilty you can use the original at [Gradio demo](hf.co/{languages[selected_language]['gradio_demo']})")
         gen_prompt = st.text_area(
         "Generate code with prompt:",
-        value="# print hello world",
         height=100,
         ).strip()
         if st.button("Generate code"):
             st.session_state["Generate code"] = not st.session_state["Generate code"]
         if st.session_state["Generate code"]:
-            generated_text = generate_code(
-                demo=languages[selected_language]["gradio_demo"],
-                gen_prompt=gen_prompt,
-            )
-            if not generated_text:
-                st.write(f"Error: could not generate code. Make sure the Gradio demo at hf.co/{languages[selected_language]['gradio_demo']} works.")
-            else:
-                st.code(generated_text)

 import requests
 import streamlit as st
+st.set_page_config(layout="wide")
+with open("utils/table_contents.md", "r") as f:
+    contents = f.read()
+st.sidebar.markdown(contents)
 st.title("The Stack Bot 🤖")
 intro = """
 The Stack Bot is a tool to help you get started with tools developed in [BigCode](https://huggingface.co/bigcode),
 such as [The Stack](https://huggingface.co/bigcode/the-stack) dataset and [SantaCoder](https://huggingface.co/bigcode/santacoder) model.
 """
 st.markdown(intro, unsafe_allow_html=True)
 @st.cache()
 def load_languages():
+    with open("utils/languages.json", "r") as f:
         languages = json.load(f)
     return languages
     text = f"""
     ```python
     from datasets import load_dataset
+    dataset = load_dataset("bigcode/the-stack", data_dir="data/{language}", split="train")
+    # print first element
+    print(dataset[0])
     ```
     """
     st.markdown(text)
         You can also train your own model on The Stack using the instructions below 🚀"""
         st.write(text)
         if st.button("Fine-tune your own model", key=4):
+            st.write("Code available at [GitHub link] + add preview")
     else:
+        text = f"""{model} is a model that was trained on the {language.capitalize()} subset of The Stack. Here's how to use it:"""
         code = f"""
         ```python
         from transformers import AutoModelForCausalLM, AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained({model})
+        model = AutoModelForCausalLM.from_pretrained({model}, trust_remote_code=True)
+        inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt")
         outputs = model.generate(inputs)
         print(tokenizer.decode(outputs[0]))
         ```
         """
         st.write(text)
         st.markdown(code)
+        st.write(f"The scores of this model are the following: {values['scores']}")
 def generate_code(
    demo, gen_prompt, max_new_tokens=40, temperature=0.2, seed=0
 ):
     # call space using its API endpoint
+    #try:
+    url = (
+        f"{demo}/run/predict/"
+    )
+    r = requests.post(
+        url=url, json={"data": [gen_prompt, max_new_tokens, temperature, seed]}
+    )
+    generated_text = r.json()["data"][0]
     return generated_text
 def init_nested_buttons():
 languages = load_languages()
+col1, col2 = st.columns([1, 1.5])
 with col1:
+    selected_language = st.selectbox("Select one of 358 languages in The Stack", list(languages.keys()), key=1)
 st.write(f"Here's how you can load the {selected_language.capitalize()} subset of The Stack:")
 code = how_to_load(selected_language)
     load_model(languages[selected_language], selected_language)
     if languages[selected_language]["model"] and languages[selected_language]["gradio_demo"]:
+        st.write(f"Here's a demo to try the model, for more flexibilty you can use the [Gradio demo]({languages[selected_language]['gradio_demo']}).")
         gen_prompt = st.text_area(
         "Generate code with prompt:",
+        value="# Implement a function to print hello world",
         height=100,
         ).strip()
         if st.button("Generate code"):
             st.session_state["Generate code"] = not st.session_state["Generate code"]
         if st.session_state["Generate code"]:
+            with st.spinner("Generating code..."):
+                generated_text = generate_code(
+                    demo=languages[selected_language]["gradio_demo"],
+                    gen_prompt=gen_prompt,
+                )
+                if not generated_text:
+                    st.markdown(f"Error: could not generate code. Make sure the Gradio demo at [{languages[selected_language]['gradio_demo']}]({languages[selected_language]['gradio_demo']}) works.")
+                else:
+                    st.code(generated_text)

languages.json → utils/languages.json RENAMED Viewed

@@ -1,6 +1,6 @@
-{"python": {"num_examples": 10, "model": "bigcode/santacoder", "scores": {"HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "bigcode/santacoder-demo"},
-"java": {"num_examples": 10, "model": "bigcode/santacoder", "scores": { "HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "bigcode/santacoder-demo"},
-"javascript": {"num_examples": 10, "model": "bigcode/santacoder", "scores": { "HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "bigcode/santacoder-demo"},
 "typescript": {"num_examples": 10, "model": ""},
 "go": {"num_examples": 10, "model": ""},
 "php": {"num_examples": 10, "model": ""},

+{"python": {"num_examples": 10, "model": "bigcode/santacoder", "scores": {"HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "https://loubnabnl-santa-demo.hf.space"},
+"java": {"num_examples": 10, "model": "bigcode/santacoder", "scores": { "HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "https://loubnabnl-santa-demo.hf.space"},
+"javascript": {"num_examples": 10, "model": "bigcode/santacoder", "scores": { "HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "https://loubnabnl-santa-demo.hf.space"},
 "typescript": {"num_examples": 10, "model": ""},
 "go": {"num_examples": 10, "model": ""},
 "php": {"num_examples": 10, "model": ""},

utils/table_contents.md ADDED Viewed

	@@ -0,0 +1,9 @@

+### 📖 Table of contents 📖
+1 - [The Stack](https://huggingface.co/bigcode/the-stack) exploration
+2 - Models trained on The Stack (e.g. [SantaCoder](https://huggingface.co/bigcode/santacoder))
+3 - Demos for code generation
+If you trained a model on The Stack, let us know so we can feature it! 🚀