import gradio as gr

from utils.compression_util import get_compression_leaderboard
from utils.compression_util import common_corpuses

with gr.Blocks() as demo:
    # A disabled "Convertor" panel for converting between file size and token count.
    # gr.Markdown("## Convertor")
    # with gr.Accordion("Convertor", open=False):
    #     gr.Markdown("Tokenize {} corpus")
    #     with gr.Row(elem_classes="no-border"):
    #         gr.Button("File Size", min_width=50)
    #         file_size = gr.Textbox(
    #             show_label=False,
    #             min_width=50,
    #             # elem_classes="textbox-as-text"
    #         )
    #         gr.Dropdown(
    #             choices=['MB', 'GB', 'TB'],
    #             show_label=False,
    #             min_width=15,
    #             # elem_classes="textbox-as-text"
    #         )
    #         # gr.Markdown('<br>')
    #         # gr.HTML('<br>')
    #         gr.Button(
    #             "≈",
    #             min_width=10,
    #             elem_classes="button-white h2-font"
    #         )
    #         gr.Button(
    #             "Tokens",
    #             min_width=50
    #         )
    #         gr.Textbox(
    #             show_label=False,
    #             min_width=50
    #         )
    #         gr.Dropdown(
    #             ['million', 'billion', 'trillion'],
    #             show_label=False,
    #             min_width=15,
    #             elem_classes="button-white"
    #         )

    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select corpus and measure of compression rate ...", open=True):
        # file size 💽 🖴, tokens 🧮
        with gr.Row():
            compress_rate_corpus = gr.Dropdown(
                common_corpuses,  # , "code"
                value=["cc100-en", "cc100-zh-Hans"],
                label="corpus",
                multiselect=True,
                # info=""
            )
            # unit of file_size: gigabyte, terabyte
            # unit of token_num: million, billion, trillion
            compress_rate_unit = gr.Radio(
                ["b_tokens/g_bytes", "t_tokens/t_bytes"],
                value="b_tokens/g_bytes",
                label="measure",
            )
        gr.Markdown(
            "- `b_tokens/g_bytes` measures how many billion tokens are produced per gigabyte of corpus. \n"
            "- `t_tokens/t_bytes` measures how many trillion tokens are produced per terabyte of corpus. \n"
            "- `n_chars/n_tokens` measures the average number of characters per token in the tokenized corpus. \n\n"
            "All of the above measures depend on the corpus. You can reproduce this "
            "procedure at [github](https://github.com/xu-song/tokenizer-arena/)."
        )

    gr.Markdown("## 🏆 Compression Rate Leaderboard")
    search_bar = gr.Textbox(
        placeholder="🔍 Search tokenizers (e.g., 'llama') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe()

    # Event wiring: recompute the leaderboard whenever the corpus selection,
    # the measure, or the search query changes.
    compress_rate_corpus.change(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit],
        outputs=compress_rate_table
    )
    compress_rate_unit.change(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit],
        outputs=compress_rate_table
    )
    # file_size.change(
    #     get_all_compress_rate,
    #     outputs=compress_rate_table
    # )
    search_bar.submit(
        get_compression_leaderboard,
        inputs=[
            compress_rate_corpus,
            compress_rate_unit,
            search_bar,
        ],
        outputs=compress_rate_table
    )

    # Populate the table once when the page loads.
    demo.load(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit],
        outputs=compress_rate_table
    )

if __name__ == "__main__":
    demo.launch()
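
# ---------------------------------------------------------------------------
# A minimal sketch of how the three measures above can be derived from raw
# counts. This is an illustration only, not the app's actual implementation:
# the real computation happens inside
# utils.compression_util.get_compression_leaderboard, and the function name
# and arguments below are hypothetical.
# ---------------------------------------------------------------------------
def compression_measures(n_tokens: int, n_bytes: int, n_chars: int) -> dict:
    """Compute compression measures for one tokenizer on one corpus.

    n_tokens: token count produced by the tokenizer on the corpus
    n_bytes:  raw corpus size in bytes
    n_chars:  character count of the raw corpus
    """
    return {
        # billion tokens per gigabyte; fewer tokens per byte = better compression
        "b_tokens/g_bytes": (n_tokens / 1e9) / (n_bytes / 1e9),
        # trillion tokens per terabyte; numerically equal to the measure above,
        # just expressed at a different scale
        "t_tokens/t_bytes": (n_tokens / 1e12) / (n_bytes / 1e12),
        # average number of characters covered by each token
        "n_chars/n_tokens": n_chars / n_tokens,
    }

# Example: a tokenizer that produces 0.25e9 tokens on a 1 GB corpus containing
# 0.9e9 characters scores b_tokens/g_bytes = 0.25 and n_chars/n_tokens = 3.6.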