jasonshaoshun committed
Commit: 29701ab · 1 parent: f65df62

debug

Files changed:
- src/display/utils.py  +27 -4
- src/leaderboard/read_evals.py  +114 -48
- src/populate.py  +20 -1
src/display/utils.py
CHANGED

@@ -140,7 +140,30 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 #         ColumnContent(col_name, "number", True)
 #     ])

-# In utils.py, modify auto_eval_column_dict_mib_causalgraph:
+# # In utils.py, modify auto_eval_column_dict_mib_causalgraph:
+# auto_eval_column_dict_mib_causalgraph = []
+
+# # Method name column
+# auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
+
+# # For each model-task-intervention-counterfactual combination
+# for task in TasksMib_Causalgraph:
+#     for model in ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"]: # exact model names
+#         for layer in task.value.layers:
+#             for intervention in task.value.interventions:
+#                 for counterfactual in task.value.counterfactuals:
+#                     # Match the exact format from the data
+#                     col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}".lower()
+#                     auto_eval_column_dict_mib_causalgraph.append([
+#                         col_name,
+#                         ColumnContent,
+#                         ColumnContent(col_name, "number", True)
+#                     ])
+
+
+
+
+
 auto_eval_column_dict_mib_causalgraph = []

 # Method name column
@@ -148,12 +171,12 @@ auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnCon

 # For each model-task-intervention-counterfactual combination
 for task in TasksMib_Causalgraph:
-    for model in ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"]: # exact model names
+    for model in task.value.models:  # Use exact model names from JSON
+        model_name = model  # Don't convert to lowercase
         for layer in task.value.layers:
             for intervention in task.value.interventions:
                 for counterfactual in task.value.counterfactuals:
-                    # Match the exact format from the data
-                    col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}".lower()
+                    col_name = f"{model_name}_layer{layer}_{intervention}_{counterfactual}"
                     auto_eval_column_dict_mib_causalgraph.append([
                         col_name,
                         ColumnContent,
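For context, a minimal self-contained sketch of what the new column-name loop in utils.py generates. The task attributes below (models, layers, interventions, counterfactuals) are hypothetical stand-ins for whatever TasksMib_Causalgraph actually defines; only the f-string key format is taken from the diff.

# Hypothetical task definition; real values come from TasksMib_Causalgraph in the Space.
from dataclasses import dataclass, field
from typing import List

@dataclass
class ExampleTask:
    models: List[str] = field(default_factory=lambda: ["Qwen2ForCausalLM"])
    layers: List[int] = field(default_factory=lambda: [0, 1])
    interventions: List[str] = field(default_factory=lambda: ["output_token"])
    counterfactuals: List[str] = field(default_factory=lambda: ["symbol_counterfactual"])

task = ExampleTask()
columns = [
    f"{model}_layer{layer}_{intervention}_{counterfactual}"  # exact casing, no .lower()
    for model in task.models
    for layer in task.layers
    for intervention in task.interventions
    for counterfactual in task.counterfactuals
]
print(columns)
# ['Qwen2ForCausalLM_layer0_output_token_symbol_counterfactual',
#  'Qwen2ForCausalLM_layer1_output_token_symbol_counterfactual']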
src/leaderboard/read_evals.py
CHANGED

@@ -182,52 +182,52 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->



-@dataclass
-class EvalResult_MIB_CAUSALGRAPH:
-    """Represents one full evaluation for a method in MIB causalgraph."""
-    eval_name: str
-    method_name: str
-    results: Dict
-
-    def init_from_json_file(self, json_filepath):
-        """Inits results from the method result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        method_name = data.get("method_name")
-        results = {}
+# @dataclass
+# class EvalResult_MIB_CAUSALGRAPH:
+#     """Represents one full evaluation for a method in MIB causalgraph."""
+#     eval_name: str
+#     method_name: str
+#     results: Dict
+
+#     def init_from_json_file(self, json_filepath):
+#         """Inits results from the method result file"""
+#         with open(json_filepath) as fp:
+#             data = json.load(fp)
+
+#         method_name = data.get("method_name")
+#         results = {}

-        # Get results for each model
-        for model_result in data.get("results", []):
-            model_id = model_result.get("model_id", "")  # Will be one of the three models
-            task_scores = model_result.get("task_scores", {})
+#         # Get results for each model
+#         for model_result in data.get("results", []):
+#             model_id = model_result.get("model_id", "")  # Will be one of the three models
+#             task_scores = model_result.get("task_scores", {})

-            # Process MCQA task scores
-            mcqa_scores = {}
-            for layer_data in task_scores.get("MCQA", []):
-                layer = layer_data.get("layer")
-                layer_scores = layer_data.get("layer_scores", [])
+#             # Process MCQA task scores
+#             mcqa_scores = {}
+#             for layer_data in task_scores.get("MCQA", []):
+#                 layer = layer_data.get("layer")
+#                 layer_scores = layer_data.get("layer_scores", [])

-                # Store scores for each intervention and counterfactual
-                for intervention_data in layer_scores:
-                    intervention = intervention_data["intervention"][0]
-                    counterfactual_scores = intervention_data["counterfactual_scores"]
+#                 # Store scores for each intervention and counterfactual
+#                 for intervention_data in layer_scores:
+#                     intervention = intervention_data["intervention"][0]
+#                     counterfactual_scores = intervention_data["counterfactual_scores"]

-                    for cf_score in counterfactual_scores:
-                        counterfactual = cf_score["counterfactual"][0]
-                        score = cf_score["score"]
+#                     for cf_score in counterfactual_scores:
+#                         counterfactual = cf_score["counterfactual"][0]
+#                         score = cf_score["score"]

-                        # Create key for this combination
-                        key = f"layer{layer}_{intervention}_{counterfactual}"
-                        mcqa_scores[key] = score
+#                         # Create key for this combination
+#                         key = f"layer{layer}_{intervention}_{counterfactual}"
+#                         mcqa_scores[key] = score

-            results[model_id] = mcqa_scores
+#             results[model_id] = mcqa_scores

-        return EvalResult_MIB_CAUSALGRAPH(
-            eval_name=method_name,
-            method_name=method_name,
-            results=results
-        )
+#         return EvalResult_MIB_CAUSALGRAPH(
+#             eval_name=method_name,
+#             method_name=method_name,
+#             results=results
+#         )

 # def to_dict(self):
 #     """Converts the Eval Result to a dict for dataframe display"""
@@ -308,24 +308,90 @@ class EvalResult_MIB_CAUSALGRAPH:
 #             data_dict[col_name] = intervention_data['score']

 #     return data_dict
+
+
+    # def to_dict(self):
+    #     """Converts the Eval Result to a dict for dataframe display"""
+    #     data_dict = {
+    #         "eval_name": self.eval_name,
+    #         "Method": self.method_name,
+    #     }
+
+    #     # Process each model's results
+    #     for model_id, scores in self.results.items():
+    #         model_name = model_id.lower()
+    #         # The scores are already in the format we want
+    #         for key, value in scores.items():
+    #             col_name = f"{model_name}_{key}"
+    #             data_dict[col_name] = value
+
+    #     return data_dict
+
+
+
+
+
+
+
+
+
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    eval_name: str
+    method_name: str
+    results: Dict
+
+    def init_from_json_file(self, json_filepath):
+        """Inits results from the method result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        method_name = data.get("method_name")
+        results = {}
+
+        # Process each model's results
+        for model_result in data.get("results", []):
+            model_id = model_result.get("model_id", "")
+            task_scores = model_result.get("task_scores", {})
+
+            # Process MCQA scores
+            for layer_data in task_scores.get("MCQA", []):
+                layer = layer_data.get("layer")
+                for score_data in layer_data.get("layer_scores", []):
+                    intervention = score_data["intervention"][0]
+                    for cf_score in score_data["counterfactual_scores"]:
+                        counterfactual = cf_score["counterfactual"][0]
+                        score = cf_score["score"]
+
+                        # Create key matching the expected column format
+                        key = f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
+                        results[key] = score
+
+        return EvalResult_MIB_CAUSALGRAPH(
+            eval_name=method_name,
+            method_name=method_name,
+            results=results
+        )
+
     def to_dict(self):
         """Converts the Eval Result to a dict for dataframe display"""
         data_dict = {
             "eval_name": self.eval_name,
             "Method": self.method_name,
         }
-
-        # Process each model's results
-        for model_id, scores in self.results.items():
-            model_name = model_id.lower()
-            # The scores are already in the format we want
-            for key, value in scores.items():
-                col_name = f"{model_name}_{key}"
-                data_dict[col_name] = value
-
+
+        # Add all results directly
+        data_dict.update(self.results)
+
         return data_dict


+
+
+
+
+
+
+
 # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
 #     """Extract evaluation results for MIB causalgraph"""
 #     model_result_filepaths = []
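To make the parsing change concrete, the following standalone sketch shows the result-JSON shape that the new init_from_json_file walks and the flat score dict it produces. The method name, model id, and score values are invented for illustration; the nesting and the key format mirror the diff above.

example = {
    "method_name": "example_method",
    "results": [
        {
            "model_id": "Qwen2ForCausalLM",
            "task_scores": {
                "MCQA": [
                    {
                        "layer": 0,
                        "layer_scores": [
                            {
                                "intervention": ["output_token"],
                                "counterfactual_scores": [
                                    {"counterfactual": ["symbol_counterfactual"], "score": 0.5}
                                ],
                            }
                        ],
                    }
                ]
            },
        }
    ],
}

# Same flattening as the new dataclass: one key per model/layer/intervention/counterfactual.
results = {}
for model_result in example.get("results", []):
    model_id = model_result.get("model_id", "")
    for layer_data in model_result.get("task_scores", {}).get("MCQA", []):
        layer = layer_data.get("layer")
        for score_data in layer_data.get("layer_scores", []):
            intervention = score_data["intervention"][0]
            for cf_score in score_data["counterfactual_scores"]:
                key = f"{model_id}_layer{layer}_{intervention}_{cf_score['counterfactual'][0]}"
                results[key] = cf_score["score"]

print(results)
# {'Qwen2ForCausalLM_layer0_output_token_symbol_counterfactual': 0.5}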
src/populate.py
CHANGED

@@ -221,6 +221,25 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
 #     # Only return detailed_df for display
 #     return detailed_df

+# def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+#     print(f"results_path is {results_path}, requests_path is {requests_path}")
+#     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
+
+#     # Convert each result to dict format for detailed df
+#     all_data_json = [v.to_dict() for v in raw_data]
+#     detailed_df = pd.DataFrame.from_records(all_data_json)
+#     print("Columns in detailed_df:", detailed_df.columns.tolist())  # Print actual columns
+
+#     # Create aggregated df
+#     aggregated_df = aggregate_methods(detailed_df)
+#     print("Columns in aggregated_df:", aggregated_df.columns.tolist())
+
+#     # Create intervention-averaged df
+#     intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
+#     print("Columns in intervention_averaged_df:", intervention_averaged_df.columns.tolist())
+
+#     return detailed_df, aggregated_df, intervention_averaged_df
+
 def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
@@ -228,7 +247,7 @@ def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, co
     # Convert each result to dict format for detailed df
     all_data_json = [v.to_dict() for v in raw_data]
     detailed_df = pd.DataFrame.from_records(all_data_json)
-    print("Columns in detailed_df:", detailed_df.columns.tolist())
+    print("Columns in detailed_df:", detailed_df.columns.tolist())

     # Create aggregated df
     aggregated_df = aggregate_methods(detailed_df)
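Finally, a small sketch of how the flattened to_dict() records end up in the detailed dataframe. The method names and the score column are made up, and aggregate_methods / create_intervention_averaged_df (defined elsewhere in src/populate.py) are not reproduced here.

import pandas as pd

# Each EvalResult_MIB_CAUSALGRAPH.to_dict() now yields one flat record per method.
records = [
    {"eval_name": "method_a", "Method": "method_a",
     "Qwen2ForCausalLM_layer0_output_token_symbol_counterfactual": 0.50},
    {"eval_name": "method_b", "Method": "method_b",
     "Qwen2ForCausalLM_layer0_output_token_symbol_counterfactual": 0.72},
]

detailed_df = pd.DataFrame.from_records(records)
print("Columns in detailed_df:", detailed_df.columns.tolist())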