Unverified Commit 708f4ff4 authored by Lianmin Zheng, committed by GitHub

Rename max_micro_batch_size -> pp_max_micro_batch_size (#11279)

parent e2daeb35
@@ -136,7 +136,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
| `--device` | The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified. | None |
| `--tp-size` | The tensor parallelism size. | 1 |
| `--pp-size` | The pipeline parallelism size. | 1 |
-| `--max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None |
+| `--pp-max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None |
| `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher. | 1 |
| `--stream-output` | Whether to output as a sequence of disjoint segments. | False |
| `--random-seed` | The random seed. | None |
......
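
With this rename, the pipeline-parallel micro batch cap is now set via `--pp-max-micro-batch-size`. A minimal launch sketch, assuming the usual `python -m sglang.launch_server` entrypoint and a placeholder model path (neither is part of this commit):

```python
import subprocess

# Launch a pipeline-parallel server with the renamed flag (sketch only;
# the model path below is a placeholder, not taken from this commit).
cmd = [
    "python", "-m", "sglang.launch_server",
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",
    "--pp-size", "2",
    "--pp-max-micro-batch-size", "8",  # formerly --max-micro-batch-size
]
subprocess.run(cmd, check=True)
```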
@@ -494,7 +494,7 @@ async def get_load():
# example usage:
-# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
@app.api_route("/set_internal_state", methods=["POST", "PUT"])
async def set_internal_state(obj: SetInternalStateReq, request: Request):
    res = await _global_state.tokenizer_manager.set_internal_state(obj)
......
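
The same runtime update can be issued from Python rather than curl. A sketch assuming a server already listening on localhost:30000 and the `requests` package installed:

```python
import requests

# Mirrors the curl example above: update pp_max_micro_batch_size on a running server.
resp = requests.post(
    "http://localhost:30000/set_internal_state",
    json={"server_args": {"pp_max_micro_batch_size": 8}},
)
print(resp.status_code, resp.text)
```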
@@ -97,7 +97,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
    "ep_num_redundant_experts",
    "enable_nan_detection",
    "flashinfer_mla_disable_ragged",
-    "max_micro_batch_size",
+    "pp_max_micro_batch_size",
    "disable_shared_experts_fusion",
    "sampling_backend",
    "speculative_accept_threshold_single",
......
@@ -464,8 +464,8 @@ class Scheduler(
            _,
            _,
        ) = self.tp_worker.get_worker_info()
-        if global_server_args_dict["max_micro_batch_size"] is None:
-            global_server_args_dict["max_micro_batch_size"] = max(
+        if global_server_args_dict["pp_max_micro_batch_size"] is None:
+            global_server_args_dict["pp_max_micro_batch_size"] = max(
                self.max_running_requests // server_args.pp_size, 1
            )
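
When the flag is left unset, the scheduler derives a default of `max(max_running_requests // pp_size, 1)`, as shown in the hunk above. A standalone restatement of that arithmetic (the function name is illustrative, not from the codebase):

```python
def default_pp_max_micro_batch_size(max_running_requests: int, pp_size: int) -> int:
    # Split the running-request budget evenly across pipeline stages,
    # never dropping below one request per micro batch.
    return max(max_running_requests // pp_size, 1)

# e.g. 128 running requests over 4 pipeline stages -> 32 requests per micro batch
assert default_pp_max_micro_batch_size(128, 4) == 32
assert default_pp_max_micro_batch_size(3, 8) == 1  # floored at 1
```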
@@ -1802,7 +1802,7 @@ class Scheduler(
        return ret

    def get_num_allocatable_reqs(self, running_bs):
-        res = global_server_args_dict["max_micro_batch_size"] - running_bs
+        res = global_server_args_dict["pp_max_micro_batch_size"] - running_bs
        if self.pp_size > 1:
            res = min(res, self.req_to_token_pool.available_size())
        return res
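
`get_num_allocatable_reqs` reads the renamed key to decide how many new requests can still join a micro batch. A simplified standalone sketch, where `pool_available` stands in for `self.req_to_token_pool.available_size()`:

```python
def num_allocatable_reqs(
    pp_max_micro_batch_size: int, running_bs: int, pp_size: int, pool_available: int
) -> int:
    # Headroom left in the current micro batch ...
    res = pp_max_micro_batch_size - running_bs
    # ... further capped by free req-to-token slots when pipeline parallelism is on.
    if pp_size > 1:
        res = min(res, pool_available)
    return res
```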
@@ -2510,7 +2510,7 @@ class Scheduler(
        server_args_dict = recv_req.server_args
        args_allow_update = set(
            [
-                "max_micro_batch_size",
+                "pp_max_micro_batch_size",
                "speculative_accept_threshold_single",
                "speculative_accept_threshold_acc",
            ]
@@ -2521,7 +2521,7 @@ class Scheduler(
                logging.warning(f"Updating {k} is not supported.")
                if_success = False
                break
-            elif k == "max_micro_batch_size" and (
+            elif k == "pp_max_micro_batch_size" and (
                v > self.max_running_requests // self.pp_size or v < 1
            ):
                logging.warning(
......
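
The handler above only accepts an update when the new value lies between 1 and `max_running_requests // pp_size`; anything outside that range triggers the warning. A one-function restatement of the bound check (the function name is illustrative):

```python
def pp_max_micro_batch_size_is_valid(v: int, max_running_requests: int, pp_size: int) -> bool:
    # Accept exactly the values the scheduler does not warn about.
    return 1 <= v <= max_running_requests // pp_size
```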
@@ -205,7 +205,7 @@ class ServerArgs:
    device: Optional[str] = None
    tp_size: int = 1
    pp_size: int = 1
-    max_micro_batch_size: Optional[int] = None
+    pp_max_micro_batch_size: Optional[int] = None
    stream_interval: int = 1
    stream_output: bool = False
    random_seed: Optional[int] = None
@@ -1599,9 +1599,9 @@ class ServerArgs:
            help="The pipeline parallelism size.",
        )
        parser.add_argument(
-            "--max-micro-batch-size",
+            "--pp-max-micro-batch-size",
            type=int,
-            default=ServerArgs.max_micro_batch_size,
+            default=ServerArgs.pp_max_micro_batch_size,
            help="The maximum micro batch size in pipeline parallelism.",
        )
        parser.add_argument(
......
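
Since argparse converts dashes to underscores, the renamed `--pp-max-micro-batch-size` flag maps directly onto the new `pp_max_micro_batch_size` field. A trimmed sketch of that mapping (not the real ServerArgs class):

```python
import argparse
from dataclasses import dataclass
from typing import Optional

@dataclass
class ServerArgsSketch:
    pp_size: int = 1
    pp_max_micro_batch_size: Optional[int] = None

parser = argparse.ArgumentParser()
parser.add_argument("--pp-size", type=int, default=ServerArgsSketch.pp_size)
parser.add_argument(
    "--pp-max-micro-batch-size",
    type=int,
    default=ServerArgsSketch.pp_max_micro_batch_size,
    help="The maximum micro batch size in pipeline parallelism.",
)
args = parser.parse_args(["--pp-size", "2", "--pp-max-micro-batch-size", "8"])
print(args.pp_max_micro_batch_size)  # -> 8
```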