Unverified Commit 708f4ff4 authored by Lianmin Zheng, committed by GitHub

Rename max_micro_batch_size -> pp_max_micro_batch_size (#11279)

parent e2daeb35
@@ -136,7 +136,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | `--device` | The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified. | None |
 | `--tp-size` | The tensor parallelism size. | 1 |
 | `--pp-size` | The pipeline parallelism size. | 1 |
-| `--max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None |
+| `--pp-max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None |
 | `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher. | 1 |
 | `--stream-output` | Whether to output as a sequence of disjoint segments. | False |
 | `--random-seed` | The random seed. | None |
...
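For reference, a usage sketch of the renamed flag at launch time. This is a hedged example: the model path below is a placeholder, and `python -m sglang.launch_server` is assumed to be the usual entry point.

import subprocess

# Launch the server with pipeline parallelism and the renamed flag
# (formerly --max-micro-batch-size).
subprocess.run([
    "python", "-m", "sglang.launch_server",
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    "--tp-size", "1",
    "--pp-size", "2",
    "--pp-max-micro-batch-size", "8",
])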
@@ -494,7 +494,7 @@ async def get_load():
 # example usage:
-# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
...
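The same call as the curl example above, sketched with the `requests` library; it assumes a server is already listening on localhost:30000, and the exact response payload is not shown in this diff.

import requests

# Update the renamed key at runtime via /set_internal_state.
resp = requests.post(
    "http://localhost:30000/set_internal_state",
    json={"server_args": {"pp_max_micro_batch_size": 8}},
)
print(resp.status_code, resp.text)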
@@ -97,7 +97,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "ep_num_redundant_experts",
     "enable_nan_detection",
     "flashinfer_mla_disable_ragged",
-    "max_micro_batch_size",
+    "pp_max_micro_batch_size",
     "disable_shared_experts_fusion",
     "sampling_backend",
     "speculative_accept_threshold_single",
...
@@ -464,8 +464,8 @@ class Scheduler(
             _,
             _,
         ) = self.tp_worker.get_worker_info()
-        if global_server_args_dict["max_micro_batch_size"] is None:
-            global_server_args_dict["max_micro_batch_size"] = max(
+        if global_server_args_dict["pp_max_micro_batch_size"] is None:
+            global_server_args_dict["pp_max_micro_batch_size"] = max(
                 self.max_running_requests // server_args.pp_size, 1
             )
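A minimal standalone sketch of the default applied above: when `pp_max_micro_batch_size` is left unset, the scheduler falls back to `max_running_requests // pp_size`, floored at 1. The helper name below is illustrative, not part of the codebase.

def default_pp_max_micro_batch_size(max_running_requests: int, pp_size: int) -> int:
    # At least one request slot per pipeline micro batch.
    return max(max_running_requests // pp_size, 1)

assert default_pp_max_micro_batch_size(256, 4) == 64
assert default_pp_max_micro_batch_size(3, 8) == 1  # floored at 1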
@@ -1802,7 +1802,7 @@ class Scheduler(
         return ret

     def get_num_allocatable_reqs(self, running_bs):
-        res = global_server_args_dict["max_micro_batch_size"] - running_bs
+        res = global_server_args_dict["pp_max_micro_batch_size"] - running_bs
         if self.pp_size > 1:
             res = min(res, self.req_to_token_pool.available_size())
         return res
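A standalone sketch of `get_num_allocatable_reqs` above (illustrative helper, not the actual method): the admission budget is the remaining micro-batch capacity, further capped by the request-to-token pool when pipeline parallelism is enabled.

def num_allocatable_reqs(pp_max_micro_batch_size: int, running_bs: int,
                         pp_size: int, pool_available_size: int) -> int:
    res = pp_max_micro_batch_size - running_bs
    if pp_size > 1:
        res = min(res, pool_available_size)
    return res

assert num_allocatable_reqs(64, 50, 1, 0) == 14  # no PP: pool cap not applied
assert num_allocatable_reqs(64, 50, 2, 8) == 8   # PP: pool is the binding limit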
@@ -2510,7 +2510,7 @@ class Scheduler(
         server_args_dict = recv_req.server_args
         args_allow_update = set(
             [
-                "max_micro_batch_size",
+                "pp_max_micro_batch_size",
                 "speculative_accept_threshold_single",
                 "speculative_accept_threshold_acc",
             ]
@@ -2521,7 +2521,7 @@ class Scheduler(
                 logging.warning(f"Updating {k} is not supported.")
                 if_success = False
                 break
-            elif k == "max_micro_batch_size" and (
+            elif k == "pp_max_micro_batch_size" and (
                 v > self.max_running_requests // self.pp_size or v < 1
             ):
                 logging.warning(
...
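A sketch of the runtime-update guard above: a new `pp_max_micro_batch_size` is rejected unless it lies in `[1, max_running_requests // pp_size]`. The function name is illustrative, and only the bound check visible in this diff is modeled.

def is_valid_pp_max_micro_batch_size(v: int, max_running_requests: int, pp_size: int) -> bool:
    # Same bounds as the scheduler check: not above the per-stage budget, not below 1.
    return 1 <= v <= max_running_requests // pp_size

assert is_valid_pp_max_micro_batch_size(8, 64, 4)       # 8 <= 64 // 4
assert not is_valid_pp_max_micro_batch_size(32, 64, 4)  # 32 > 16, rejected
assert not is_valid_pp_max_micro_batch_size(0, 64, 4)   # below 1, rejected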
@@ -205,7 +205,7 @@ class ServerArgs:
     device: Optional[str] = None
     tp_size: int = 1
     pp_size: int = 1
-    max_micro_batch_size: Optional[int] = None
+    pp_max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
@@ -1599,9 +1599,9 @@ class ServerArgs:
             help="The pipeline parallelism size.",
         )
         parser.add_argument(
-            "--max-micro-batch-size",
+            "--pp-max-micro-batch-size",
             type=int,
-            default=ServerArgs.max_micro_batch_size,
+            default=ServerArgs.pp_max_micro_batch_size,
             help="The maximum micro batch size in pipeline parallelism.",
         )
         parser.add_argument(
...
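A self-contained sketch of the argparse wiring above, using a throwaway parser rather than the real `ServerArgs` machinery, to show how the renamed flag parses.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--pp-max-micro-batch-size",
    type=int,
    default=None,  # matches ServerArgs.pp_max_micro_batch_size
    help="The maximum micro batch size in pipeline parallelism.",
)
args = parser.parse_args(["--pp-max-micro-batch-size", "8"])
assert args.pp_max_micro_batch_size == 8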