gaoqiong / lm-evaluation-harness · Commits

Commit a0096a09, authored Oct 09, 2025 by Baber
Parent: 1412e0c6

    fix gpu queue

Showing 1 changed file with 95 additions and 43 deletions:

run_mrl_evals.py  (+95, -43)
@@ -8,8 +8,10 @@ As each GPU finishes evaluating a model, it automatically picks up the next one
 import argparse
 import json
+import queue
 import subprocess
 import sys
+import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from pathlib import Path
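The two new imports support the reworked dispatch logic below: queue provides the thread-safe work queue that the GPU workers drain, and threading provides the lock that guards the shared results list. A minimal illustration of the stdlib idiom the new worker relies on (the model names here are made up, not from the repo):

    import queue

    q = queue.Queue()
    for item in ["model-a", "model-b", "model-c"]:  # illustrative names
        q.put(item)

    while True:
        try:
            item = q.get_nowait()   # non-blocking; raises queue.Empty once drained
        except queue.Empty:
            break
        print(f"processing {item}")
        q.task_done()               # pairs with queue.join(); harmless otherwise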
@@ -322,6 +324,73 @@ def load_failed_models(output_path: str) -> list[str]:
     return models
 
 
+def gpu_worker(
+    gpu_id: int,
+    model_queue: queue.Queue,
+    results: list[dict[str, Any]],
+    results_lock: threading.Lock,
+    pbar: tqdm,
+):
+    """Worker function that processes models from a queue on a specific GPU.
+
+    Args:
+        gpu_id: The GPU ID (0, 1, 2, ...) this worker should use
+        model_queue: Queue containing models to process
+        results: Shared list to store results (protected by results_lock)
+        results_lock: Lock for thread-safe access to results list
+        pbar: Progress bar to update
+    """
+    success_count = 0
+    failed_count = 0
+
+    while True:
+        try:
+            # Get next model from queue (non-blocking)
+            model = model_queue.get_nowait()
+        except queue.Empty:
+            # No more models to process
+            break
+
+        try:
+            # Run evaluation on this GPU
+            result = run_evaluation(model, gpu_id)
+
+            # Thread-safe append to results
+            with results_lock:
+                results.append(result)
+                if result["status"] == "success":
+                    success_count += 1
+                else:
+                    failed_count += 1
+
+            # Update progress bar
+            pbar.set_postfix(
+                {
+                    "✓": sum(1 for r in results if r["status"] == "success"),
+                    "✗": sum(1 for r in results if r["status"] != "success"),
+                }
+            )
+            pbar.update(1)
+        except Exception as e:
+            print(f"[GPU {gpu_id}] Unexpected error processing {model}: {e}")
+            with results_lock:
+                results.append(
+                    {
+                        "model": model,
+                        "gpu_id": gpu_id,
+                        "status": "exception",
+                        "error": str(e),
+                        "timestamp": datetime.now().isoformat(),
+                    }
+                )
+                failed_count += 1
+            pbar.update(1)
+        finally:
+            # Mark task as done
+            model_queue.task_done()
+
+
 def main():
     """Main execution function."""
     # Parse command-line arguments
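For orientation, here is a minimal, self-contained sketch of the worker pattern this hunk adds, with run_evaluation stubbed out and an illustrative GPU count and model list (none of these values come from the repository). It exercises the same loop as gpu_worker above: each worker drains the shared queue, appends to the results list under a lock, and ticks the progress bar.

    import queue
    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    from tqdm import tqdm

    NUM_GPUS = 2  # illustrative

    def run_evaluation(model: str, gpu_id: int) -> dict:
        # Stub standing in for the real evaluation call in run_mrl_evals.py.
        return {"model": model, "gpu_id": gpu_id, "status": "success"}

    def worker(gpu_id, model_queue, results, results_lock, pbar):
        # Same shape as gpu_worker above, minus the postfix bookkeeping.
        while True:
            try:
                model = model_queue.get_nowait()
            except queue.Empty:
                break
            try:
                result = run_evaluation(model, gpu_id)
                with results_lock:
                    results.append(result)
                pbar.update(1)
            finally:
                model_queue.task_done()

    models = ["model-a", "model-b", "model-c"]  # illustrative
    model_queue = queue.Queue()
    for m in models:
        model_queue.put(m)

    results, results_lock = [], threading.Lock()
    with tqdm(total=len(models), desc="Evaluating models", unit="model") as pbar:
        with ThreadPoolExecutor(max_workers=NUM_GPUS) as executor:
            futures = [
                executor.submit(worker, gpu, model_queue, results, results_lock, pbar)
                for gpu in range(NUM_GPUS)
            ]
            for future in as_completed(futures):
                future.result()

    print(results)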
@@ -376,59 +445,42 @@ def main():
         print("No models to evaluate. Exiting.")
         return 0
 
-    # Create a queue of (model, gpu_id) pairs
-    # We cycle through GPUs as we assign models
-    model_gpu_pairs = [
-        (model, gpu_id % NUM_GPUS)
-        for gpu_id, model in enumerate(models_to_run)
-    ]
+    # Create a queue and populate it with models
+    model_queue = queue.Queue()
+    for model in models_to_run:
+        model_queue.put(model)
 
+    # Shared data structures
     results = []
-    success_count = 0
-    failed_count = 0
+    results_lock = threading.Lock()
 
-    # Use ThreadPoolExecutor to run evaluations in parallel
-    # max_workers = NUM_GPUS ensures we don't oversubscribe GPUs
-    with ThreadPoolExecutor(max_workers=NUM_GPUS) as executor:
-        # Submit all jobs
-        future_to_model = {
-            executor.submit(run_evaluation, model, gpu_id): (model, gpu_id)
-            for model, gpu_id in model_gpu_pairs
-        }
-
-        # Process completed jobs as they finish with a progress bar
-        with tqdm(total=len(models_to_run), desc="Evaluating models", unit="model") as pbar:
-            for future in as_completed(future_to_model):
-                model, gpu_id = future_to_model[future]
-                try:
-                    result = future.result()
-                    results.append(result)
-                    if result["status"] == "success":
-                        success_count += 1
-                    else:
-                        failed_count += 1
-                except Exception as e:
-                    print(f"Unexpected error processing {model}: {e}")
-                    results.append(
-                        {
-                            "model": model,
-                            "gpu_id": gpu_id,
-                            "status": "exception",
-                            "error": str(e),
-                        }
-                    )
-                    failed_count += 1
-
-                # Update progress bar with current statistics
-                pbar.set_postfix({"✓": success_count, "✗": failed_count})
-                pbar.update(1)
+    # Use ThreadPoolExecutor with one worker per GPU
+    # Each worker will dynamically pull models from the queue
+    with tqdm(total=len(models_to_run), desc="Evaluating models", unit="model") as pbar:
+        with ThreadPoolExecutor(max_workers=NUM_GPUS) as executor:
+            # Submit one worker per GPU
+            futures = [
+                executor.submit(
+                    gpu_worker, gpu_id, model_queue, results, results_lock, pbar
+                )
+                for gpu_id in range(NUM_GPUS)
+            ]
+
+            # Wait for all workers to complete
+            for future in as_completed(futures):
+                try:
+                    future.result()  # This will raise any exceptions from the worker
+                except Exception as e:
+                    print(f"Worker thread error: {e}")
 
     # Print summary
     print("\n" + "=" * 80)
     print("EVALUATION SUMMARY")
     print("=" * 80)
+    success_count = sum(1 for r in results if r["status"] == "success")
+    failed_count = len(results) - success_count
     print(f"Total models: {len(models_to_run)}")
     print(f"Successful: {success_count}")
     print(f"Failed: {failed_count}")
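The substance of the fix: before this commit, every (model, gpu_id) pair was fixed up front by round-robin and all jobs were submitted to the pool at once, so the pool's worker threads were not actually tied to GPUs and two evaluations could end up dispatched to the same GPU concurrently while another GPU sat idle. Now each of the NUM_GPUS worker threads owns one GPU and pulls the next model from a shared queue.Queue as soon as it finishes, matching the module docstring quoted in the first hunk header. get_nowait() is safe here because the queue is fully populated before any worker is submitted, so queue.Empty genuinely means "no work left" rather than "work not yet produced". run_evaluation itself is not shown in this diff; presumably it confines the evaluation subprocess to the worker's GPU, for example by setting CUDA_VISIBLE_DEVICES, roughly like this hypothetical helper (not from the commit):

    import os
    import subprocess

    def run_on_gpu(cmd: list[str], gpu_id: int) -> subprocess.CompletedProcess:
        # Hypothetical sketch only: restrict a child process to a single GPU.
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        return subprocess.run(cmd, env=env, capture_output=True, text=True)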