help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
)
)
parser.add_argument(
parser.add_argument(
...
@@ -316,18 +311,10 @@ class ServerArgs:
...
@@ -316,18 +311,10 @@ class ServerArgs:
help="The nccl init address of multi-node server.",
help="The nccl init address of multi-node server.",
)
)
parser.add_argument(
parser.add_argument(
"--nnodes",type=int,default=ServerArgs.nnodes,help="The number of nodes."
"--nnodes",type=int,default=1,help="The number of nodes."