gaoqiong / lm-evaluation-harness · Commits · fa9330f9
"vscode:/vscode.git/clone" did not exist on "79b57a3f6ffbfba181b904f3c571d7acc45ed494"
Commit fa9330f9, authored Oct 09, 2025 by Baber

log failures

Parent: a0096a09
Showing 1 changed file with 74 additions and 21 deletions.

run_mrl_evals.py  (+74, -21)
run_mrl_evals.py  (view file @ fa9330f9)
@@ -283,26 +283,57 @@ def has_results(model: str, output_path: str) -> bool:
     return False
 
 
-def save_failed_models(results: list[dict[str, str]], output_path: str):
-    """Save failed models to both text and JSON files for later debugging."""
-    failed_results = [r for r in results if r["status"] != "success"]
-    if not failed_results:
-        return
+def save_failed_model_incremental(result: dict[str, str], output_path: str):
+    """Incrementally append a failed model to the failure tracking files.
+
+    This function is called immediately when a model fails, ensuring the failure
+    is recorded even if the script crashes or is interrupted.
+
+    Args:
+        result: The result dictionary for a failed model
+        output_path: Base output directory
+    """
+    if result["status"] == "success":
+        return  # Only save failures
 
     output_dir = Path(output_path)
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    # Save simple text file with just model names
+    # Append to simple text file with just model name
     failed_txt = output_dir / "failed_models.txt"
-    with open(failed_txt, "w") as f:
-        for r in failed_results:
-            f.write(f"{r['model']}\n")
+    with open(failed_txt, "a") as f:
+        f.write(f"{result['model']}\n")
+        f.flush()  # Ensure it's written to disk immediately
 
-    # Save detailed JSON file with full error information
+    # For JSON, we need to read, append, and rewrite the entire file
     failed_json = output_dir / "failed_models.json"
+    failed_results = []
+
+    # Read existing failures if file exists
+    if failed_json.exists():
+        try:
+            with open(failed_json, "r") as f:
+                failed_results = json.load(f)
+        except (json.JSONDecodeError, FileNotFoundError):
+            failed_results = []
+
+    # Append new failure and write back
+    failed_results.append(result)
     with open(failed_json, "w") as f:
         json.dump(failed_results, f, indent=2)
+        f.flush()  # Ensure it's written to disk immediately
+
+
+def save_failed_models(results: list[dict[str, str]], output_path: str):
+    """Print summary of failed models (actual saving happens incrementally now)."""
+    failed_results = [r for r in results if r["status"] != "success"]
+    if not failed_results:
+        return
+
+    output_dir = Path(output_path)
+    failed_txt = output_dir / "failed_models.txt"
+    failed_json = output_dir / "failed_models.json"
 
     print("\n📝 Failed models saved to:")
     print(f"  - {failed_txt} (simple list)")
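For later debugging, the two files written above can be read back to drive a retry pass. The helper below is an illustrative sketch, not part of this commit: the file names and the "model" key come from the diff, while load_failed_models itself is a hypothetical name.

import json
from pathlib import Path


def load_failed_models(output_path: str) -> list[str]:
    """Read back the failure files written by save_failed_model_incremental (sketch)."""
    output_dir = Path(output_path)
    failed_json = output_dir / "failed_models.json"
    failed_txt = output_dir / "failed_models.txt"

    # Prefer the JSON file, which carries full error details per entry.
    if failed_json.exists():
        try:
            with open(failed_json, "r") as f:
                return [entry["model"] for entry in json.load(f)]
        except (json.JSONDecodeError, KeyError):
            pass  # fall back to the plain text list below

    # Fall back to the simple one-model-per-line text file.
    if failed_txt.exists():
        with open(failed_txt, "r") as f:
            return [line.strip() for line in f if line.strip()]

    return []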
@@ -330,6 +361,7 @@ def gpu_worker(
     results: list[dict[str, Any]],
     results_lock: threading.Lock,
     pbar: tqdm,
+    output_path: str,
 ):
     """Worker function that processes models from a queue on a specific GPU.
@@ -339,6 +371,7 @@ def gpu_worker(
         results: Shared list to store results (protected by results_lock)
         results_lock: Lock for thread-safe access to results list
        pbar: Progress bar to update
+        output_path: Output path for saving failed models incrementally
     """
     success_count = 0
     failed_count = 0
@@ -355,13 +388,15 @@ def gpu_worker(
             # Run evaluation on this GPU
             result = run_evaluation(model, gpu_id)
 
-            # Thread-safe append to results
+            # Thread-safe append to results and save failures immediately
             with results_lock:
                 results.append(result)
                 if result["status"] == "success":
                     success_count += 1
                 else:
                     failed_count += 1
+                    # Save failed model immediately to disk
+                    save_failed_model_incremental(result, output_path)
 
             # Update progress bar
             pbar.set_postfix(
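Because save_failed_model_incremental rewrites failed_models.json with a read-append-write cycle, the hunk above calls it while results_lock is still held, so two GPU workers can never interleave those rewrites. A stripped-down, self-contained sketch of the same locking pattern follows; append_failure and record are stand-in names, not code from the script.

import json
import threading
from pathlib import Path

results: list[dict] = []
results_lock = threading.Lock()


def append_failure(result: dict, output_path: str) -> None:
    # Stand-in for save_failed_model_incremental: read, append, rewrite.
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    failed_json = output_dir / "failed_models.json"
    existing = json.loads(failed_json.read_text()) if failed_json.exists() else []
    existing.append(result)
    failed_json.write_text(json.dumps(existing, indent=2))


def record(result: dict, output_path: str) -> None:
    # Hold the lock across both the shared-list append and the file rewrite,
    # mirroring the structure of the hunk above.
    with results_lock:
        results.append(result)
        if result["status"] != "success":
            append_failure(result, output_path)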
@@ -375,16 +410,17 @@ def gpu_worker(
             except Exception as e:
                 print(f"[GPU {gpu_id}] Unexpected error processing {model}: {e}")
                 with results_lock:
-                    results.append(
-                        {
-                            "model": model,
-                            "gpu_id": gpu_id,
-                            "status": "exception",
-                            "error": str(e),
-                            "timestamp": datetime.now().isoformat(),
-                        }
-                    )
+                    result = {
+                        "model": model,
+                        "gpu_id": gpu_id,
+                        "status": "exception",
+                        "error": str(e),
+                        "timestamp": datetime.now().isoformat(),
+                    }
+                    results.append(result)
                     failed_count += 1
+                    # Save failed model immediately to disk
+                    save_failed_model_incremental(result, output_path)
 
                 pbar.update(1)
 
             finally:
                 # Mark task as done
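Both the success and the exception paths now funnel the same dictionary shape into results and into save_failed_model_incremental. The TypedDict below is an illustrative summary of that shape, inferred only from the keys visible in the hunks above (the name EvalResult is hypothetical, and successful entries produced by run_evaluation may carry additional keys).

from typing import TypedDict


class EvalResult(TypedDict):
    model: str      # model identifier passed to run_evaluation
    gpu_id: int     # GPU the worker was pinned to
    status: str     # "success", "exception", or another failure status
    error: str      # stringified exception on the failure paths
    timestamp: str  # datetime.now().isoformat() at the time of recording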
@@ -445,6 +481,17 @@ def main():
         print("No models to evaluate. Exiting.")
         return 0
 
+    # Clear previous failure tracking files to start fresh
+    # (Incremental saving will recreate them as failures occur)
+    output_dir = Path(OUTPUT_PATH)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    failed_txt = output_dir / "failed_models.txt"
+    failed_json = output_dir / "failed_models.json"
+    if failed_txt.exists():
+        failed_txt.unlink()
+    if failed_json.exists():
+        failed_json.unlink()
+
     # Create a queue and populate it with models
     model_queue = queue.Queue()
     for model in models_to_run:
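A small optional simplification: Path.unlink accepts missing_ok=True on Python 3.8 and later, which would collapse each exists()/unlink() pair above into one call. An equivalent sketch, with an illustrative OUTPUT_PATH value standing in for the script's existing constant:

from pathlib import Path

OUTPUT_PATH = "results"  # illustrative; the script defines its own OUTPUT_PATH

output_dir = Path(OUTPUT_PATH)
output_dir.mkdir(parents=True, exist_ok=True)
(output_dir / "failed_models.txt").unlink(missing_ok=True)
(output_dir / "failed_models.json").unlink(missing_ok=True)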
@@ -461,7 +508,13 @@ def main():
         # Submit one worker per GPU
         futures = [
             executor.submit(
-                gpu_worker, gpu_id, model_queue, results, results_lock, pbar
+                gpu_worker,
+                gpu_id,
+                model_queue,
+                results,
+                results_lock,
+                pbar,
+                OUTPUT_PATH,
             )
             for gpu_id in range(NUM_GPUS)
         ]
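For context on what OUTPUT_PATH is being threaded into: the script submits one gpu_worker per GPU, all pulling models from a shared queue.Queue via ThreadPoolExecutor. A self-contained toy version of that pattern is sketched below; toy_worker, the model names, and NUM_GPUS = 2 are illustrative and not taken from the script.

import queue
import threading
from concurrent.futures import ThreadPoolExecutor

NUM_GPUS = 2  # illustrative; the real script configures this itself


def toy_worker(gpu_id: int, model_queue: queue.Queue, results: list, lock: threading.Lock) -> None:
    # Pull models until the queue is empty; each worker stays pinned to one gpu_id.
    while True:
        try:
            model = model_queue.get_nowait()
        except queue.Empty:
            return
        try:
            with lock:
                results.append({"model": model, "gpu_id": gpu_id, "status": "success"})
        finally:
            model_queue.task_done()


model_queue: queue.Queue = queue.Queue()
for model in ["model-a", "model-b", "model-c"]:
    model_queue.put(model)

results: list = []
lock = threading.Lock()
with ThreadPoolExecutor(max_workers=NUM_GPUS) as executor:
    futures = [
        executor.submit(toy_worker, gpu_id, model_queue, results, lock)
        for gpu_id in range(NUM_GPUS)
    ]
print(results)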