Unverified commit bb6047db, authored by Artem Perevedentsev, committed by GitHub
Browse files

[Model][Perf] Enable checkpoints prefetching for Lustre FS by default (#39422)


Signed-off-by: Artem Perevedentsev <aperevedents@nvidia.com>
parent 467d3247
......@@ -763,31 +763,24 @@ def np_cache_weights_iterator(
yield name, torch.from_numpy(param)
def _checkpoints_fit_in_ram(files: list[str], threshold: float = 0.9) -> bool:
"""Return True if total size of *files* fits within *threshold* of available RAM."""
def _get_checkpoints_size_bytes(files: list[str]) -> int:
"""Return the total size of the checkpoint files in bytes."""
if not files:
return True
return 0
return sum(os.path.getsize(f) for f in files)
def _get_available_ram_bytes() -> int:
"""Return the available RAM in bytes."""
import psutil
total_size = sum(os.path.getsize(f) for f in files)
available_ram = psutil.virtual_memory().available
fits = total_size <= threshold * available_ram
if not fits:
logger.warning(
"NFS detected but checkpoint total size (%.2f GiB) exceeds "
"%.0f%% of available RAM (%.2f GiB). Skipping prefetching checkpoints.",
total_size / (1024**3),
threshold * 100,
available_ram / (1024**3),
)
return fits
return psutil.virtual_memory().available
def _is_nfs_path(files: list[str]) -> bool:
"""Check whether the first file in *files* resides on an NFS
filesystem (Linux only)."""
def _get_fs_type(files: list[str]) -> str:
"""Get the filesystem type of the first file in *files* (Linux only)."""
if not files:
return False
return ""
try:
# Only the first file is checked — all checkpoint shards reside
# in the same directory and therefore on the same filesystem.
......@@ -810,12 +803,11 @@ def _is_nfs_path(files: list[str]) -> bool:
) and len(mount_point) > len(best_mount):
best_mount = mount_point
best_fstype = fstype
return best_fstype in ("nfs", "nfs4")
return best_fstype
except Exception:
# /proc/mounts is Linux-specific; on other OSes (or if the read
# fails for any reason) we fall back to "not NFS" rather than
# crashing model loading.
return False
# fails for any reason) we fall back to an empty string.
return ""
def _prefetch_checkpoint(file_path: str) -> None:
......@@ -901,11 +893,63 @@ def safetensors_weights_iterator(
sorted_files = sorted(hf_weights_files, key=_natural_sort_key)
should_prefetch = safetensors_load_strategy == "prefetch" or (
safetensors_load_strategy is None
and _is_nfs_path(sorted_files)
and _checkpoints_fit_in_ram(sorted_files)
fs_type = _get_fs_type(sorted_files)
is_net_fs = fs_type in ("nfs", "nfs4", "lustre")
total_bytes = _get_checkpoints_size_bytes(sorted_files)
avail_bytes = _get_available_ram_bytes()
ram_threshold_pct = 90
fits_in_ram = total_bytes <= (ram_threshold_pct / 100.0) * avail_bytes
fs_name = fs_type.upper() if fs_type else "unknown"
logger.info_once(
"Filesystem type for checkpoints: %s. Checkpoint size: %.2f GiB. "
"Available RAM: %.2f GiB.",
fs_name,
total_bytes / 1024**3,
avail_bytes / 1024**3,
)
should_prefetch = safetensors_load_strategy == "prefetch"
if safetensors_load_strategy is None:
if is_net_fs and fits_in_ram:
should_prefetch = True
elif is_net_fs and not fits_in_ram:
logger.warning_once(
"Network filesystem (%s) detected but checkpoint total size "
"(%.2f GiB) exceeds %d%% of available RAM (%.2f GiB). "
"Skipping auto-prefetch.",
fs_name,
total_bytes / 1024**3,
ram_threshold_pct,
avail_bytes / 1024**3,
)
elif not is_net_fs and fits_in_ram:
logger.info_once(
"Auto-prefetch is disabled because the filesystem (%s) is not a "
"recognized network FS (NFS/Lustre). If you want to force "
"prefetching, start vLLM with --safetensors-load-strategy=prefetch.",
fs_name,
)
elif not is_net_fs and not fits_in_ram:
logger.info_once(
"Auto-prefetch is disabled because the filesystem (%s) is not a "
"recognized network FS (NFS/Lustre) and the checkpoint size "
"(%.2f GiB) exceeds %d%% of available RAM (%.2f GiB).",
fs_name,
total_bytes / 1024**3,
ram_threshold_pct,
avail_bytes / 1024**3,
)
elif should_prefetch and not fits_in_ram:
logger.warning_once(
"safetensors_load_strategy='prefetch' was explicitly specified, but "
"checkpoint total size (%.2f GiB) exceeds %d%% of available RAM "
"(%.2f GiB). This may cause out-of-memory errors.",
total_bytes / 1024**3,
ram_threshold_pct,
avail_bytes / 1024**3,
)
if should_prefetch:
_prefetch_all_checkpoints(sorted_files)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment