help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
)
)
parser.add_argument(
"--cpu-offload-gb",
type=int,
default=ServerArgs.cpu_offload_gb,
help="How many GBs of RAM to reserve for CPU offloading.",
)
parser.add_argument(
parser.add_argument(
"--page-size",
"--page-size",
type=int,
type=int,
...
@@ -1683,6 +1683,38 @@ class ServerArgs:
...
@@ -1683,6 +1683,38 @@ class ServerArgs:
help="The type of heavy channels in double sparsity attention",
help="The type of heavy channels in double sparsity attention",
)
)
# Offloading
parser.add_argument(
"--cpu-offload-gb",
type=int,
default=ServerArgs.cpu_offload_gb,
help="How many GBs of RAM to reserve for CPU offloading.",
)
parser.add_argument(
"--offload-group-size",
type=int,
default=ServerArgs.offload_group_size,
help="Number of layers per group in offloading.",
)
parser.add_argument(
"--offload-num-in-group",
type=int,
default=ServerArgs.offload_num_in_group,
help="Number of layers to be offloaded within a group.",