Unverified Commit ee56782b authored by William Arnold, committed by GitHub
Browse files

fix: standardize all planner ttft/itl units to float ms and fix docs (#3673)


Signed-off-by: William Arnold <7565007+Aphoh@users.noreply.github.com>
parent d3cdb91d
......@@ -87,8 +87,8 @@ def create_profiler_parser() -> argparse.Namespace:
sla:
isl: Int (target input sequence length, default: 3000)
osl: Int (target output sequence length, default: 500)
ttft: Int (target Time To First Token in ms, default: 50)
itl: Int (target Inter Token Latency in ms, default: 10)
ttft: Float (target Time To First Token in milliseconds, default: 50)
itl: Float (target Inter Token Latency in milliseconds, default: 10)
planner: (planner-bypass arguments, use hyphens or underscores)
i.e., planner-min-endpoint: 2 # or planner_min_endpoint: 2 (both work)
"""
......@@ -179,15 +179,15 @@ def create_profiler_parser() -> argparse.Namespace:
)
parser.add_argument(
"--ttft",
type=int,
default=config.get("sla", {}).get("ttft", 50),
help="target Time To First Token in ms",
type=float,
default=config.get("sla", {}).get("ttft", 50.0),
help="target Time To First Token (float, in milliseconds)",
)
parser.add_argument(
"--itl",
type=int,
default=config.get("sla", {}).get("itl", 10),
help="target Inter Token Latency in ms",
type=float,
default=config.get("sla", {}).get("itl", 10.0),
help="target Inter Token Latency (float, in milliseconds)",
)
# arguments used for interpolating TTFT and ITL under different ISL/OSL
......
......@@ -89,8 +89,8 @@ class SLAPlannerDefaults(BasePlannerDefaults):
profile_results_dir = "profiling_results"
isl = 3000 # in number of tokens
osl = 150 # in number of tokens
ttft = 0.5 # in seconds
itl = 0.05 # in seconds
ttft = 500.0 # in milliseconds
itl = 50.0 # in milliseconds
load_predictor = "arima" # ["constant", "arima", "prophet"]
load_prediction_window_size = 50 # predict load using how many recent load samples
no_correction = False # disable correction factor, might be useful under some conditions like long cold start time
......
......@@ -51,9 +51,7 @@ class PrefillInterpolator:
try:
with np.load(prefill_npz_fn) as raw_data:
self.prefill_isl = raw_data["prefill_isl"]
self.prefill_ttft = (
raw_data["prefill_ttft"] / 1000
) # convert ms to s
self.prefill_ttft = raw_data["prefill_ttft"] # in milliseconds
self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
except FileNotFoundError:
logger.error(
......@@ -64,7 +62,7 @@ class PrefillInterpolator:
elif raw_data:
self.prefill_isl = raw_data["prefill_isl"]
self.prefill_ttft = raw_data["prefill_ttft"] / 1000 # convert ms to s
self.prefill_ttft = raw_data["prefill_ttft"] # in milliseconds
self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
else:
raise ValueError("Either profile_results_dir or raw_data must be provided")
......@@ -150,7 +148,7 @@ class DecodeInterpolator:
method="nearest",
)
self.itl_interpolator[nan_mask] = itl_nearest[nan_mask]
self.itl_interpolator /= 1000 # convert ms to s
# ITL values are in milliseconds
self.thpt_interpolator = scipy.interpolate.griddata(
(self.x_kv_usage, self.y_context_length),
......@@ -230,12 +228,12 @@ if __name__ == "__main__":
parser.add_argument("--profile_results_dir", type=str, required=True)
parser.add_argument("--isl", type=int, default=3000)
parser.add_argument("--osl", type=int, default=150)
parser.add_argument("--ttft", type=float, default=0.1, help="in s")
parser.add_argument("--itl", type=float, default=0.01, help="in s")
parser.add_argument("--ttft", type=float, default=100.0, help="in milliseconds")
parser.add_argument("--itl", type=float, default=10.0, help="in milliseconds")
args = parser.parse_args()
print(f"ISL={args.isl}, OSL={args.osl}")
print(f"TTFT={args.ttft}s, ITL={args.itl}s")
print(f"TTFT={args.ttft}ms, ITL={args.itl}ms")
print(f"Using profile results from {args.profile_results_dir}")
print("")
......@@ -248,11 +246,11 @@ if __name__ == "__main__":
if est_ttft <= args.ttft:
print(
f"\tEstimated TTFT={est_ttft:.3f}s <= target TTFT={args.ttft:.3f}s. Requests can queue {args.ttft - est_ttft:.3f}s maximally while meeting TTFT SLA."
f"\tEstimated TTFT={est_ttft:.2f}ms <= target TTFT={args.ttft:.2f}ms. Requests can queue {args.ttft - est_ttft:.2f}ms maximally while meeting TTFT SLA."
)
else:
print(
f"\tEstimated TTFT={est_ttft:.3f}s > target TTFT={args.ttft:.3f}s. Cannot meet TTFT SLA."
f"\tEstimated TTFT={est_ttft:.2f}ms > target TTFT={args.ttft:.2f}ms. Cannot meet TTFT SLA."
)
print(
......@@ -274,12 +272,12 @@ if __name__ == "__main__":
) = decode_interpolator.find_best_throughput_per_gpu(args.itl, context_length)
if est_itl <= args.itl:
print(
f"\tEstimated ITL={est_itl:.4f}s <= target ITL={args.itl:.4f}s at {est_kv_usage*100:.2f}% active kv usage."
f"\tEstimated ITL={est_itl:.2f}ms <= target ITL={args.itl:.2f}ms at {est_kv_usage*100:.2f}% active kv usage."
)
print(
f"\tEstimated throughput: {est_thpt_per_gpu:.2f} token/s/gpu. Request rate at {est_thpt_per_gpu / args.osl:.2f} requests/s will saturate one GPU."
)
else:
print(
f"\tEstimated ITL={est_itl:.4f}s > target ITL={args.itl:.4f}s. Cannot meet ITL SLA."
f"\tEstimated ITL={est_itl:.2f}ms > target ITL={args.itl:.2f}ms. Cannot meet ITL SLA."
)
......@@ -90,10 +90,13 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
"--ttft",
type=float,
default=SLAPlannerDefaults.ttft,
help="Time to first token",
help="Time to first token (float, in milliseconds)",
)
parser.add_argument(
"--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency"
"--itl",
type=float,
default=SLAPlannerDefaults.itl,
help="Inter-token latency (float, in milliseconds)",
)
parser.add_argument(
"--load-predictor",
......
......@@ -249,13 +249,20 @@ class Planner:
self.num_p_workers_gauge.set(len(self.p_endpoints))
self.num_d_workers_gauge.set(len(self.d_endpoints))
self.last_metrics.ttft = self.prometheus_api_client.get_avg_time_to_first_token(
f"{self.args.adjustment_interval}s",
self.model_name,
# Prometheus returns seconds, convert to milliseconds
self.last_metrics.ttft = (
self.prometheus_api_client.get_avg_time_to_first_token(
f"{self.args.adjustment_interval}s",
self.model_name,
)
* 1000
)
self.last_metrics.itl = self.prometheus_api_client.get_avg_inter_token_latency(
f"{self.args.adjustment_interval}s",
self.model_name,
self.last_metrics.itl = (
self.prometheus_api_client.get_avg_inter_token_latency(
f"{self.args.adjustment_interval}s",
self.model_name,
)
* 1000
)
self.last_metrics.num_req = self.prometheus_api_client.get_avg_request_count(
f"{self.args.adjustment_interval}s",
......@@ -284,7 +291,7 @@ class Planner:
f"Observed num_req: {self.last_metrics.num_req:.2f} isl: {self.last_metrics.isl:.2f} osl: {self.last_metrics.osl:.2f}"
)
logger.info(
f"Observed ttft: {self.last_metrics.ttft:.3f}s itl: {self.last_metrics.itl:.3f}s"
f"Observed ttft: {self.last_metrics.ttft:.2f}ms itl: {self.last_metrics.itl:.2f}ms"
)
self.num_req_predictor.add_data_point(self.last_metrics.num_req)
......
......@@ -119,9 +119,9 @@ spec:
- --osl
- "150" # average OSL is 150 tokens
- --ttft
- "200" # target TTFT is 200ms
- "200" # target TTFT is 200ms (float, in milliseconds)
- --itl
- "20" # target ITL is 20ms
- "20" # target ITL is 20ms (float, in milliseconds)
- --backend
- <vllm/sglang>
```
......@@ -290,8 +290,8 @@ python3 -m benchmarks.profiler.profile_sla \
--aic-backend-version 0.20.0 \
--isl 3000 \
--osl 150 \
--ttft 0.2 \
--itl 0.02
--ttft 200 \
--itl 20
```
The output will be written to `./profiling_results/` and can be used directly with SLA planner deployment. Note that `--ttft` and `--itl` are float values in milliseconds (here, a 200 ms TTFT target and a 20 ms ITL target).
......@@ -206,7 +206,7 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
```
New adjustment interval started!
Observed num_req: X.XXX isl: X.XXX osl: X.XXX
Observed ttft: X.XXXs itl: X.XXXs
Observed ttft: X.XXms itl: X.XXms
Number of prefill workers: 1, number of decode workers: 1
```
......
......@@ -34,34 +34,34 @@ python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
--profile_results_dir <path_to_profile_results> \
--isl <ISL> \
--osl <OSL> \
--ttft <TTFT(s)> \
--itl <ITL(s)>
--ttft <TTFT(ms)> \
--itl <ITL(ms)>
```
The script will perform the interpolation based on ISL, OSL, and TTFT and ITL SLAs and advise the load that can saturate the engine.
For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200,
For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 (target TTFT=200ms, ITL=10ms):
```bash
python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
--profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
--isl 3000 \
--osl 300 \
--ttft 0.2 \
--itl 0.01
--ttft 200 \
--itl 10
# output:
ISL=3000, OSL=300
TTFT=0.1s, ITL=0.01s
TTFT=200.0ms, ITL=10.0ms
Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/
Interpolating prefill performance ...
Estimated TTFT=0.060s <= target TTFT=0.200s. Requests can queue 0.140s maximally while meeting TTFT SLA.
Estimated TTFT=60.00ms <= target TTFT=200.00ms. Requests can queue 140.00ms maximally while meeting TTFT SLA.
Estimated throughput: 49481.09 tokens/s/gpu. Request rate at 16.49 requests/s will saturate one GPU.
Interpolating decode performance ...
Average context length: isl + osl/2 = 3150.
Estimated ITL=0.0097s <= target ITL=0.0100s at 16.16% active kv usage.
Estimated ITL=9.70ms <= target ITL=10.00ms at 16.16% active kv usage.
Estimated throughput: 4555.68 token/s/gpu. Request rate at 15.19 requests/s will saturate one GPU.
```
......@@ -111,8 +111,8 @@ For example, to dry run SLA planner for the previous FP8 8B on H200 using the ge
```bash
python components/planner/test/planner_sla_dryrun.py \
--ttft 0.2 \
--itl 0.01 \
--ttft 200 \
--itl 10 \
--adjustment-interval 60 \
--profile-results-dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
--dataset rr-5-45_i3000o300.jsonl \
......
......@@ -87,8 +87,8 @@ spec:
python3 -m planner_sla
--environment=kubernetes
--backend=vllm
--ttft 0.2
--itl 0.01
--ttft 200
--itl 10
--profile-results-dir /workspace/tests/planner/profiling_results/H200_TP1P_TP1D/
--adjustment-interval=60
--prometheus-port=9085
......
......@@ -57,8 +57,8 @@ spec:
--adjustment-interval=60
--profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
--prometheus-port=9085
--ttft=0.1
--itl=0.01
--ttft=100
--itl=10
--load-predictor=constant
--no-correction
VllmDecodeWorker:
......
......@@ -49,8 +49,8 @@ def planner():
args.decode_engine_num_gpu = 1
args.min_endpoint = 1
args.max_gpu_budget = 10
args.ttft = 80 # ms
args.itl = 10 # ms
args.ttft = 80.0 # ms
args.itl = 10.0 # ms
args.backend = "vllm"
args.no_operation = True # Don't actually scale
args.no_correction = False # Allow correction factors
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment