| | import gradio as gr |
| | import pandas as pd |
| |
|
| | from dataset import get_dataframe |
| | from markdown import COLUMN_DESC_MARKDOWN, GUIDELINES, PANEL_MARKDOWN |
| |
|
# Load the contamination table once, when the module is executed. The filter
# helpers and the UI below all read this module-level frame.
df = get_dataframe()
| |
|
| |
|
| | def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes): |
| | """ |
| | Filter the dataframe based on the provided evaluation dataset, contaminated source, and checkboxes. |
| | |
| | Args: |
| | dataframe (pandas.DataFrame): The input dataframe to filter. |
| | eval_dataset (str): The evaluation dataset to filter by. |
| | cont_source (str): The contaminated source to filter by. |
| | checkboxes (list): The checkboxes to filter by. |
| | |
| | Returns: |
| | pandas.DataFrame: The filtered dataframe. |
| | """ |
| | if isinstance(eval_dataset, str): |
| | dataframe = dataframe[ |
| | dataframe["Evaluation Dataset"].str.contains(f"(?i){eval_dataset}") |
| | ] |
| | if isinstance(cont_source, str): |
| | dataframe = dataframe[ |
| | dataframe["Contaminated Source"].str.contains(f"(?i){cont_source}") |
| | ] |
| | if isinstance(checkboxes, list) and "Exclude model-based evidences" in checkboxes: |
| | dataframe = dataframe[dataframe["Approach"] != "model-based"] |
| | if isinstance(checkboxes, list) and "Show only contaminated" in checkboxes: |
| | dataframe = dataframe[ |
| | (dataframe["Train Split"] > 0.0) |
| | | (dataframe["Development Split"] > 0.0) |
| | | (dataframe["Test Split"] > 0.0) |
| | ] |
| |
|
| | dataframe = dataframe.sort_values("Test Split", ascending=False) |
| |
|
| | return dataframe.style.format( |
| | { |
| | "Train Split": "{:.1%}", |
| | "Development Split": "{:.1%}", |
| | "Test Split": "{:.1%}", |
| | }, |
| | na_rep="Unknown", |
| | ) |
| |
|
| |
|
def filter_dataframe_corpus(*args, **kwargs) -> pd.DataFrame:
    """
    Apply the shared filters to the corpus-contamination entries only.

    Rows of the module-level ``df`` whose "Model or corpus" value is "corpus"
    are selected, that column is removed, and the remainder is handed to
    ``filter_dataframe``.

    Args:
        *args: Positional arguments forwarded unchanged to
            ``filter_dataframe`` after the dataframe.
        **kwargs: Keyword arguments forwarded unchanged to
            ``filter_dataframe``.

    Returns:
        Whatever ``filter_dataframe`` returns for the corpus-only rows.
    """
    is_corpus = df["Model or corpus"] == "corpus"
    corpus_rows = df[is_corpus].drop(columns=["Model or corpus"])
    return filter_dataframe(corpus_rows, *args, **kwargs)
| |
|
| |
|
def filter_dataframe_model(*args, **kwargs) -> pd.DataFrame:
    """
    Apply the shared filters to the model-contamination entries only.

    Rows of the module-level ``df`` whose "Model or corpus" value is "model"
    are selected, that column is removed, and the remainder is handed to
    ``filter_dataframe``.

    Args:
        *args: Positional arguments forwarded unchanged to
            ``filter_dataframe`` after the dataframe.
        **kwargs: Keyword arguments forwarded unchanged to
            ``filter_dataframe``.

    Returns:
        Whatever ``filter_dataframe`` returns for the model-only rows.
    """
    is_model = df["Model or corpus"] == "model"
    model_rows = df[is_model].drop(columns=["Model or corpus"])
    return filter_dataframe(model_rows, *args, **kwargs)
| |
|
| |
|
# Application-wide Gradio theme: the built-in Soft theme with emerald/cyan
# hues, medium text, large spacing and a few block/section-header overrides.
theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="cyan",
    text_size="md",
    spacing_size="lg",
    # Font fallback chain: Poppins first, then generic sans-serif families so
    # the page still renders sensibly if the web font cannot be loaded.
    # (Listing the same Google font several times adds no fallback.)
    font=[
        gr.themes.GoogleFont("Poppins"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
).set(
    block_background_fill="*neutral_50",
    block_background_fill_dark="*neutral_950",
    section_header_text_size="*text_lg",
    section_header_text_weight="800",
)
| |
|
| |
|
# Build the UI: intro text, a collapsible column legend, the contribution
# count, and three tabs (corpus contamination, model contamination,
# contribution guidelines).
# NOTE(review): the leading characters of the title look like a mis-decoded
# emoji -- confirm the file encoding before changing the string.
with gr.Blocks(
    theme=theme,
    title="π¨ Data Contamination Database",
    analytics_enabled=False,
    fill_height=True,
) as demo:
    gr.Markdown(PANEL_MARKDOWN)
    with gr.Accordion("Column descriptions (See details)", open=False) as accordion:
        gr.Markdown(COLUMN_DESC_MARKDOWN)

    # Counts every row of the unfiltered table (corpus and model entries).
    gr.Markdown(f"### Total contributions: {len(df)}")

    with gr.Tab("Corpus contamination") as tab_corpus:
        with gr.Row(variant="compact"):
            # Free-text search fields.
            with gr.Column():
                eval_dataset_corpus = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_corpora = gr.Textbox(
                    placeholder="Pre-training corpora",
                    label="Pre-training corpora",
                    value="",
                )
            # Search options; the labels must match the strings checked in
            # filter_dataframe.
            with gr.Column():
                checkboxes_corpus = gr.CheckboxGroup(
                    ["Exclude model-based evidences", "Show only contaminated"],
                    label="Search options",
                    value=[],
                )

                # presumably sits in the options column -- verify nesting
                filter_corpus_btn = gr.Button("Filter")

        # The component objects themselves (not their values) are passed for
        # the initial table, so the isinstance checks in filter_dataframe skip
        # every filter and the table starts out unfiltered.
        # NOTE(review): headers come from the full df, which still contains
        # "Model or corpus", while the displayed value has that column
        # dropped -- confirm headers and the datatype list line up.
        corpus_dataframe = gr.DataFrame(
            value=filter_dataframe_corpus(
                eval_dataset_corpus, cont_corpora, checkboxes_corpus
            ),
            headers=df.columns.to_list(),
            datatype=[
                "markdown",
                "markdown",
                "number",
                "number",
                "number",
                "str",
                "markdown",
                "markdown",
            ],
        )

    with gr.Tab("Model contamination") as tab_model:
        with gr.Row(variant="compact"):
            # Free-text search fields.
            with gr.Column():
                eval_dataset_model = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_model = gr.Textbox(
                    placeholder="Model", label="Pre-trained model", value=""
                )
            # Same search options as the corpus tab.
            with gr.Column():
                checkboxes_model = gr.CheckboxGroup(
                    ["Exclude model-based evidences", "Show only contaminated"],
                    label="Search options",
                    value=[],
                )

                # presumably sits in the options column -- verify nesting
                filter_model_btn = gr.Button("Filter")

        # Same initial-value approach as the corpus table: component objects
        # are passed, so no filter is applied on first render.
        model_dataframe = gr.DataFrame(
            value=filter_dataframe_model(
                eval_dataset_model, cont_model, checkboxes_model
            ),
            headers=df.columns.to_list(),
            datatype=[
                "markdown",
                "markdown",
                "number",
                "number",
                "number",
                "str",
                "markdown",
                "markdown",
            ],
        )

    # Each Filter button re-runs its filter function with the current values
    # of that tab's inputs and replaces the tab's table with the result.
    filter_corpus_btn.click(
        filter_dataframe_corpus,
        inputs=[eval_dataset_corpus, cont_corpora, checkboxes_corpus],
        outputs=corpus_dataframe,
    )
    filter_model_btn.click(
        filter_dataframe_model,
        inputs=[eval_dataset_model, cont_model, checkboxes_model],
        outputs=model_dataframe,
    )

    with gr.Tab("Contribution Guidelines") as tab_guidelines:
        gr.Markdown(GUIDELINES)
| |
|
| |
|
# Launch the Gradio app defined above; this runs whenever the module is
# executed, since the call is not guarded.
demo.launch()
| |
|