Unverified commit 00b814ba, authored by lif and committed by GitHub
Browse files

[V0 Deprecation] Remove unused swap_space parameter (#36216)


Signed-off-by: majiayu000 <1835304752@qq.com>
Co-authored-by: mcelrath
parent ee8a2951
......@@ -447,7 +447,6 @@ class EngineArgs:
)
disable_sliding_window: bool = ModelConfig.disable_sliding_window
disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
swap_space: float = CacheConfig.swap_space
offload_backend: str = OffloadConfig.offload_backend
cpu_offload_gb: float = UVAOffloadConfig.cpu_offload_gb
cpu_offload_params: set[str] = get_field(UVAOffloadConfig, "cpu_offload_params")
......@@ -961,7 +960,6 @@ class EngineArgs:
cache_group.add_argument(
"--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
)
cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"])
cache_group.add_argument(
"--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
......@@ -1526,7 +1524,6 @@ class EngineArgs:
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
swap_space=self.swap_space,
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
is_attention_free=model_config.is_attention_free,
num_gpu_blocks_override=self.num_gpu_blocks_override,
......
......@@ -164,12 +164,6 @@ class LLM:
compared with using gpu_memory_utilization. Note that
kv_cache_memory_bytes (when not-None) ignores
gpu_memory_utilization
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
This can be used for temporarily storing the states of the requests
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Noting that `best_of` is only supported in V0. Otherwise, too small
values may cause out-of-memory (OOM) errors.
cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
the model weights. This virtually increases the GPU memory space
you can use to hold the model weights, at the cost of CPU-GPU data
......@@ -240,7 +234,6 @@ class LLM:
chat_template: Path | str | None = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: float = 4,
cpu_offload_gb: float = 0,
offload_group_size: int = 0,
offload_num_in_group: int = 1,
......@@ -265,6 +258,17 @@ class LLM:
) -> None:
"""LLM constructor."""
if "swap_space" in kwargs:
kwargs.pop("swap_space")
import warnings
warnings.warn(
"The 'swap_space' parameter is deprecated and ignored. "
"It will be removed in a future version.",
DeprecationWarning,
stacklevel=2,
)
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
......@@ -353,7 +357,6 @@ class LLM:
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
kv_cache_memory_bytes=kv_cache_memory_bytes,
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
offload_group_size=offload_group_size,
offload_num_in_group=offload_num_in_group,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment