help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
)
parser.add_argument(
...
...
@@ -311,10 +316,18 @@ class ServerArgs:
help="The nccl init address of multi-node server.",
)
parser.add_argument(
"--nnodes",type=int,default=1,help="The number of nodes."
"--nnodes",type=int,default=ServerArgs.nnodes,help="The number of nodes."