File size: 2,941 Bytes
8d96c36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

import datasets
import pandas as pd
import gradio as gr
import csv
from collections import defaultdict
import random

# Introductory blurb for the data browser; the text starts and ends with a newline.
INTRO = "\nThe table below demonstrates a sample paragraph from the dev split of BOUQuET.\n"
# Label of the "Domain" dropdown option that applies no domain filter.
ALL = "All"


def data_browse_tab():
    """Build the "Data samples" tab of the BOUQuET data browser.

    Loads the ``dev`` split of the ``sentence_level`` configuration of
    ``facebook/bouquet``, splits it into one frame per source language, and
    creates the Gradio controls (source/target language, domain, paragraph ID,
    a sampling button) together with a table showing the selected paragraph.

    Must be called inside an active ``gr.Blocks`` context; returns nothing.
    """
    # Load the data
    ds = datasets.load_dataset("facebook/bouquet", "sentence_level", split="dev")
    long_df = ds.to_pandas()
    # One frame per source language; the original target columns are dropped
    # because the target text is re-attached from another language's frame.
    lang2df = {
        lang: part.drop(columns=["tgt_text", "tgt_lang"]).reset_index(drop=True)
        for lang, part in long_df.groupby("src_lang")
    }
    eng_df = lang2df["eng_Latn"]
    langs = sorted(lang2df)
    domains = sorted(set(eng_df["domain"]))
    paragraph_ids = sorted(set(eng_df["par_id"]))
    domain2par_ids = {
        domain: sorted(set(group["par_id"]))
        for domain, group in eng_df.groupby("domain")
    }
    columns = ["domain", "uniq_id", "orig_text", "src_text", "tgt_text", "tags", "register"]

    def par_ids_for(domain):
        """Return the paragraph IDs selectable for ``domain``.

        ``ALL``, an empty value, or an unknown domain yields every paragraph.
        """
        return domain2par_ids.get(domain, paragraph_ids)

    def select_frame(src_lang, tgt_lang, par_id):
        """Return the rows of paragraph ``par_id`` with source and target text."""
        src_df = lang2df[src_lang]
        # Filter first so only the selected paragraph is copied.
        par = src_df[src_df["par_id"].eq(par_id)].copy()
        # Assignment aligns on the (reset) row index.
        # NOTE(review): this assumes every language frame lists the sentences
        # in the same order - confirm against the dataset.
        par["tgt_text"] = lang2df[tgt_lang]["src_text"]
        # TODO: add 'par_comment' in a text field below
        return par[columns]

    def select_data(src_lang, tgt_lang, par_id):
        """Event handler: refresh the table for the current selection."""
        return gr.update(value=select_frame(src_lang, tgt_lang, par_id), wrap=True)

    def sample_par_id(domain):
        """Pick a random paragraph ID among those offered for ``domain``."""
        return random.choice(par_ids_for(domain))

    def change_domain(domain, par_id):
        """Restrict the paragraph choices to ``domain``.

        The current paragraph is kept when it belongs to the domain; otherwise
        a random paragraph of that domain is selected.
        """
        par_ids = par_ids_for(domain)
        if par_id not in par_ids:
            par_id = random.choice(par_ids)
        print(f"par_id: {par_id} is one of {par_ids}")
        return gr.Dropdown(choices=par_ids, value=par_id)

    with gr.Tab("Data samples"):
        gr.Markdown("# BOUQuET data browser")
        # Define the controls
        with gr.Row():
            gr_src_lang = gr.Dropdown(langs, label="Source lang", value=random.choice(langs))
            gr_tgt_lang = gr.Dropdown(langs, label="Target lang", value=random.choice(langs))
            gr_domain = gr.Dropdown([ALL] + domains, label="Domain", value=ALL)
            gr_par_id = gr.Dropdown(paragraph_ids, label="Paragraph ID", value=random.choice(paragraph_ids))
            inputs = [gr_src_lang, gr_tgt_lang, gr_par_id]
            gr_sample_btn = gr.Button(value="Sample a paragraph")
            # Sample within the selected domain so the new value is always one
            # of the choices currently offered by the paragraph dropdown.
            gr_sample_btn.click(fn=sample_par_id, inputs=gr_domain, outputs=gr_par_id)

        # Define the data: the initial value is a plain DataFrame, not a
        # gr.update payload (updates are only meaningful as event outputs).
        df_initial = select_frame(*[inp.value for inp in inputs])
        gr_df = gr.Dataframe(
            df_initial,
            wrap=True,
            show_fullscreen_button=True,
            column_widths=["10%", "5%", "20%", "20%", "20%", "15%", "6%"],
            elem_classes=["small-font"],
        )
        # Interactivity
        for inp in inputs:
            inp.change(fn=select_data, inputs=inputs, outputs=gr_df)

        gr_domain.change(fn=change_domain, inputs=[gr_domain, gr_par_id], outputs=[gr_par_id])