jasonshaoshun committed
Commit: 29701ab · 1 parent: f65df62

debug

Files changed:
- src/display/utils.py  +27 -4
- src/leaderboard/read_evals.py  +114 -48
- src/populate.py  +20 -1
src/display/utils.py
CHANGED

@@ -140,7 +140,30 @@ BENCHMARK_COLS_MIB_CAUSALGRAPH = []
 #         ColumnContent(col_name, "number", True)
 #     ])

-# In utils.py, modify auto_eval_column_dict_mib_causalgraph:
+# # In utils.py, modify auto_eval_column_dict_mib_causalgraph:
+# auto_eval_column_dict_mib_causalgraph = []
+
+# # Method name column
+# auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
+
+# # For each model-task-intervention-counterfactual combination
+# for task in TasksMib_Causalgraph:
+#     for model in ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"]: # exact model names
+#         for layer in task.value.layers:
+#             for intervention in task.value.interventions:
+#                 for counterfactual in task.value.counterfactuals:
+#                     # Match the exact format from the data
+#                     col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}".lower()
+#                     auto_eval_column_dict_mib_causalgraph.append([
+#                         col_name,
+#                         ColumnContent,
+#                         ColumnContent(col_name, "number", True)
+#                     ])
+
+
+
+
+
 auto_eval_column_dict_mib_causalgraph = []

 # Method name column
@@ -148,12 +171,12 @@ auto_eval_column_dict_mib_causalgraph.append(["method", ColumnContent, ColumnCon

 # For each model-task-intervention-counterfactual combination
 for task in TasksMib_Causalgraph:
-    for model in ["qwen2forcausallm", "gemma2forcausallm", "llamaforcausallm"]: # exact model names
+    for model in task.value.models:  # Use exact model names from JSON
+        model_name = model  # Don't convert to lowercase
         for layer in task.value.layers:
             for intervention in task.value.interventions:
                 for counterfactual in task.value.counterfactuals:
-                    # Match the exact format from the data
-                    col_name = f"{model}_layer{layer}_{intervention}_{counterfactual}".lower()
+                    col_name = f"{model_name}_layer{layer}_{intervention}_{counterfactual}"
                     auto_eval_column_dict_mib_causalgraph.append([
                         col_name,
                         ColumnContent,
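For context, a minimal self-contained sketch of what the new column-name loop in utils.py generates. The task attributes below (models, layers, interventions, counterfactuals) are hypothetical stand-ins for whatever TasksMib_Causalgraph actually defines; only the f-string key format is taken from the diff.

# Hypothetical task definition; real values come from TasksMib_Causalgraph in the Space.
from dataclasses import dataclass, field
from typing import List

@dataclass
class ExampleTask:
    models: List[str] = field(default_factory=lambda: ["Qwen2ForCausalLM"])
    layers: List[int] = field(default_factory=lambda: [0, 1])
    interventions: List[str] = field(default_factory=lambda: ["output_token"])
    counterfactuals: List[str] = field(default_factory=lambda: ["symbol_counterfactual"])

task = ExampleTask()
columns = [
    f"{model}_layer{layer}_{intervention}_{counterfactual}"  # exact casing, no .lower()
    for model in task.models
    for layer in task.layers
    for intervention in task.interventions
    for counterfactual in task.counterfactuals
]
print(columns)
# ['Qwen2ForCausalLM_layer0_output_token_symbol_counterfactual',
#  'Qwen2ForCausalLM_layer1_output_token_symbol_counterfactual']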
src/leaderboard/read_evals.py
CHANGED

@@ -182,52 +182,52 @@ def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) ->



-@dataclass
-class EvalResult_MIB_CAUSALGRAPH:
-    """Represents one full evaluation for a method in MIB causalgraph."""
-    eval_name: str
-    method_name: str
-    results: Dict
-
-    def init_from_json_file(self, json_filepath):
-        """Inits results from the method result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        method_name = data.get("method_name")
-        results = {}
+# @dataclass
+# class EvalResult_MIB_CAUSALGRAPH:
+#     """Represents one full evaluation for a method in MIB causalgraph."""
+#     eval_name: str
+#     method_name: str
+#     results: Dict
+
+#     def init_from_json_file(self, json_filepath):
+#         """Inits results from the method result file"""
+#         with open(json_filepath) as fp:
+#             data = json.load(fp)
+
+#         method_name = data.get("method_name")
+#         results = {}

-        # Get results for each model
-        for model_result in data.get("results", []):
-            model_id = model_result.get("model_id", "")  # Will be one of the three models
-            task_scores = model_result.get("task_scores", {})
+#         # Get results for each model
+#         for model_result in data.get("results", []):
+#             model_id = model_result.get("model_id", "")  # Will be one of the three models
+#             task_scores = model_result.get("task_scores", {})

-            # Process MCQA task scores
-            mcqa_scores = {}
-            for layer_data in task_scores.get("MCQA", []):
-                layer = layer_data.get("layer")
-                layer_scores = layer_data.get("layer_scores", [])
+#             # Process MCQA task scores
+#             mcqa_scores = {}
+#             for layer_data in task_scores.get("MCQA", []):
+#                 layer = layer_data.get("layer")
+#                 layer_scores = layer_data.get("layer_scores", [])

-                # Store scores for each intervention and counterfactual
-                for intervention_data in layer_scores:
-                    intervention = intervention_data["intervention"][0]
-                    counterfactual_scores = intervention_data["counterfactual_scores"]
+#                 # Store scores for each intervention and counterfactual
+#                 for intervention_data in layer_scores:
+#                     intervention = intervention_data["intervention"][0]
+#                     counterfactual_scores = intervention_data["counterfactual_scores"]

-                    for cf_score in counterfactual_scores:
-                        counterfactual = cf_score["counterfactual"][0]
-                        score = cf_score["score"]
+#                     for cf_score in counterfactual_scores:
+#                         counterfactual = cf_score["counterfactual"][0]
+#                         score = cf_score["score"]

-                        # Create key for this combination
-                        key = f"layer{layer}_{intervention}_{counterfactual}"
-                        mcqa_scores[key] = score
+#                         # Create key for this combination
+#                         key = f"layer{layer}_{intervention}_{counterfactual}"
+#                         mcqa_scores[key] = score

-            results[model_id] = mcqa_scores
+#             results[model_id] = mcqa_scores

-        return EvalResult_MIB_CAUSALGRAPH(
-            eval_name=method_name,
-            method_name=method_name,
-            results=results
-        )
+#         return EvalResult_MIB_CAUSALGRAPH(
+#             eval_name=method_name,
+#             method_name=method_name,
+#             results=results
+#         )

 # def to_dict(self):
 #     """Converts the Eval Result to a dict for dataframe display"""
@@ -308,24 +308,90 @@ class EvalResult_MIB_CAUSALGRAPH:
 #             data_dict[col_name] = intervention_data['score']

 #     return data_dict
+
+
+    # def to_dict(self):
+    #     """Converts the Eval Result to a dict for dataframe display"""
+    #     data_dict = {
+    #         "eval_name": self.eval_name,
+    #         "Method": self.method_name,
+    #     }
+
+    #     # Process each model's results
+    #     for model_id, scores in self.results.items():
+    #         model_name = model_id.lower()
+    #         # The scores are already in the format we want
+    #         for key, value in scores.items():
+    #             col_name = f"{model_name}_{key}"
+    #             data_dict[col_name] = value
+
+    #     return data_dict
+
+
+
+
+
+
+
+
+
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    eval_name: str
+    method_name: str
+    results: Dict
+
+    def init_from_json_file(self, json_filepath):
+        """Inits results from the method result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        method_name = data.get("method_name")
+        results = {}
+
+        # Process each model's results
+        for model_result in data.get("results", []):
+            model_id = model_result.get("model_id", "")
+            task_scores = model_result.get("task_scores", {})
+
+            # Process MCQA scores
+            for layer_data in task_scores.get("MCQA", []):
+                layer = layer_data.get("layer")
+                for score_data in layer_data.get("layer_scores", []):
+                    intervention = score_data["intervention"][0]
+                    for cf_score in score_data["counterfactual_scores"]:
+                        counterfactual = cf_score["counterfactual"][0]
+                        score = cf_score["score"]
+
+                        # Create key matching the expected column format
+                        key = f"{model_id}_layer{layer}_{intervention}_{counterfactual}"
+                        results[key] = score
+
+        return EvalResult_MIB_CAUSALGRAPH(
+            eval_name=method_name,
+            method_name=method_name,
+            results=results
+        )
+
     def to_dict(self):
         """Converts the Eval Result to a dict for dataframe display"""
         data_dict = {
             "eval_name": self.eval_name,
             "Method": self.method_name,
         }
-
-        # Process each model's results
-        for model_id, scores in self.results.items():
-            model_name = model_id.lower()
-            # The scores are already in the format we want
-            for key, value in scores.items():
-                col_name = f"{model_name}_{key}"
-                data_dict[col_name] = value
-
+
+        # Add all results directly
+        data_dict.update(self.results)
+
         return data_dict


+
+
+
+
+
+
+
 # def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
 #     """Extract evaluation results for MIB causalgraph"""
 #     model_result_filepaths = []
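To make the parsing change concrete, the following standalone sketch shows the result-JSON shape that the new init_from_json_file walks and the flat score dict it produces. The method name, model id, and score values are invented for illustration; the nesting and the key format mirror the diff above.

example = {
    "method_name": "example_method",
    "results": [
        {
            "model_id": "Qwen2ForCausalLM",
            "task_scores": {
                "MCQA": [
                    {
                        "layer": 0,
                        "layer_scores": [
                            {
                                "intervention": ["output_token"],
                                "counterfactual_scores": [
                                    {"counterfactual": ["symbol_counterfactual"], "score": 0.5}
                                ],
                            }
                        ],
                    }
                ]
            },
        }
    ],
}

# Same flattening as the new dataclass: one key per model/layer/intervention/counterfactual.
results = {}
for model_result in example.get("results", []):
    model_id = model_result.get("model_id", "")
    for layer_data in model_result.get("task_scores", {}).get("MCQA", []):
        layer = layer_data.get("layer")
        for score_data in layer_data.get("layer_scores", []):
            intervention = score_data["intervention"][0]
            for cf_score in score_data["counterfactual_scores"]:
                key = f"{model_id}_layer{layer}_{intervention}_{cf_score['counterfactual'][0]}"
                results[key] = cf_score["score"]

print(results)
# {'Qwen2ForCausalLM_layer0_output_token_symbol_counterfactual': 0.5}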
src/populate.py
CHANGED

@@ -221,6 +221,25 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
 #     # Only return detailed_df for display
 #     return detailed_df

+# def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+#     print(f"results_path is {results_path}, requests_path is {requests_path}")
+#     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
+
+#     # Convert each result to dict format for detailed df
+#     all_data_json = [v.to_dict() for v in raw_data]
+#     detailed_df = pd.DataFrame.from_records(all_data_json)
+#     print("Columns in detailed_df:", detailed_df.columns.tolist())  # Print actual columns
+
+#     # Create aggregated df
+#     aggregated_df = aggregate_methods(detailed_df)
+#     print("Columns in aggregated_df:", aggregated_df.columns.tolist())
+
+#     # Create intervention-averaged df
+#     intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
+#     print("Columns in intervention_averaged_df:", intervention_averaged_df.columns.tolist())
+
+#     return detailed_df, aggregated_df, intervention_averaged_df
+
 def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
@@ -228,7 +247,7 @@ def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, co
     # Convert each result to dict format for detailed df
     all_data_json = [v.to_dict() for v in raw_data]
     detailed_df = pd.DataFrame.from_records(all_data_json)
-    print("Columns in detailed_df:", detailed_df.columns.tolist())
+    print("Columns in detailed_df:", detailed_df.columns.tolist())

     # Create aggregated df
     aggregated_df = aggregate_methods(detailed_df)
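Finally, a small sketch of how the flattened to_dict() records end up in the detailed dataframe. The method names and the score column are made up, and aggregate_methods / create_intervention_averaged_df (defined elsewhere in src/populate.py) are not reproduced here.

import pandas as pd

# Each EvalResult_MIB_CAUSALGRAPH.to_dict() now yields one flat record per method.
records = [
    {"eval_name": "method_a", "Method": "method_a",
     "Qwen2ForCausalLM_layer0_output_token_symbol_counterfactual": 0.50},
    {"eval_name": "method_b", "Method": "method_b",
     "Qwen2ForCausalLM_layer0_output_token_symbol_counterfactual": 0.72},
]

detailed_df = pd.DataFrame.from_records(records)
print("Columns in detailed_df:", detailed_df.columns.tolist())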