fix: standardize all planner ttft/itl units to float ms and fix docs (#3673)

Signed-off-by: William Arnold <7565007+Aphoh@users.noreply.github.com>

fix: standardize all planner ttft/itl units to float ms and fix docs (#3673)
Signed-off-by: William Arnold <7565007+Aphoh@users.noreply.github.com>
ee56782b · William Arnold · GitHub · d3cdb91d · ee56782b · ee56782b
Unverified Commit ee56782b authored Oct 17, 2025 by William Arnold Committed by GitHub Oct 17, 2025
11 changed files
--- a/benchmarks/profiler/utils/profiler_argparse.py
+++ b/benchmarks/profiler/utils/profiler_argparse.py
@@ -87,8 +87,8 @@ def create_profiler_parser() -> argparse.Namespace:
        sla:
            isl: Int (target input sequence length, default: 3000)
            osl: Int (target output sequence length, default: 500)
-            ttft: Int (target Time To First Token in ms, default: 50)
-            itl: Int (target Inter Token Latency in ms, default: 10)
+            ttft: Float (target Time To First Token in milliseconds, default: 50.0)
+            itl: Float (target Inter Token Latency in milliseconds, default: 10.0)
        planner: (planner-bypass arguments, use hyphens or underscores)
            i.e., planner-min-endpoint: 2  # or planner_min_endpoint: 2 (both work)
    """
@@ -179,15 +179,15 @@ def create_profiler_parser() -> argparse.Namespace:
    )
    parser.add_argument(
        "--ttft",
-        type=int,
-        default=config.get("sla", {}).get("ttft", 50),
-        help="target Time To First Token in ms",
+        type=float,
+        default=config.get("sla", {}).get("ttft", 50.0),
+        help="target Time To First Token (float, in milliseconds)",
    )
    parser.add_argument(
        "--itl",
-        type=int,
-        default=config.get("sla", {}).get("itl", 10),
-        help="target Inter Token Latency in ms",
+        type=float,
+        default=config.get("sla", {}).get("itl", 10.0),
+        help="target Inter Token Latency (float, in milliseconds)",
    )

    # arguments used for interpolating TTFT and ITL under different ISL/OSL

--- a/components/src/dynamo/planner/defaults.py
+++ b/components/src/dynamo/planner/defaults.py
@@ -89,8 +89,8 @@ class SLAPlannerDefaults(BasePlannerDefaults):
    profile_results_dir = "profiling_results"
    isl = 3000  # in number of tokens
    osl = 150  # in number of tokens
-    ttft = 0.5  # in seconds
-    itl = 0.05  # in seconds
+    ttft = 500.0  # in milliseconds
+    itl = 50.0  # in milliseconds
    load_predictor = "arima"  # ["constant", "arima", "prophet"]
    load_prediction_window_size = 50  # predict load using how many recent load samples
    no_correction = False  # disable correction factor, might be useful under some conditions like long cold start time

--- a/components/src/dynamo/planner/utils/perf_interpolation.py
+++ b/components/src/dynamo/planner/utils/perf_interpolation.py
@@ -51,9 +51,7 @@ class PrefillInterpolator:
            try:
                with np.load(prefill_npz_fn) as raw_data:
                    self.prefill_isl = raw_data["prefill_isl"]
-                    self.prefill_ttft = (
-                        raw_data["prefill_ttft"] / 1000
-                    )  # convert ms to s
+                    self.prefill_ttft = raw_data["prefill_ttft"]  # in milliseconds
                    self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
            except FileNotFoundError:
                logger.error(
@@ -64,7 +62,7 @@ class PrefillInterpolator:

        elif raw_data:
            self.prefill_isl = raw_data["prefill_isl"]
-            self.prefill_ttft = raw_data["prefill_ttft"] / 1000  # convert ms to s
+            self.prefill_ttft = raw_data["prefill_ttft"]  # in milliseconds
            self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
        else:
            raise ValueError("Either profile_results_dir or raw_data must be provided")
@@ -150,7 +148,7 @@ class DecodeInterpolator:
                method="nearest",
            )
            self.itl_interpolator[nan_mask] = itl_nearest[nan_mask]
-        self.itl_interpolator /= 1000  # convert ms to s
+        # ITL values are in milliseconds

        self.thpt_interpolator = scipy.interpolate.griddata(
            (self.x_kv_usage, self.y_context_length),
@@ -230,12 +228,12 @@ if __name__ == "__main__":
    parser.add_argument("--profile_results_dir", type=str, required=True)
    parser.add_argument("--isl", type=int, default=3000)
    parser.add_argument("--osl", type=int, default=150)
-    parser.add_argument("--ttft", type=float, default=0.1, help="in s")
-    parser.add_argument("--itl", type=float, default=0.01, help="in s")
+    parser.add_argument("--ttft", type=float, default=100.0, help="in milliseconds")
+    parser.add_argument("--itl", type=float, default=10.0, help="in milliseconds")
    args = parser.parse_args()

    print(f"ISL={args.isl}, OSL={args.osl}")
-    print(f"TTFT={args.ttft}s, ITL={args.itl}s")
+    print(f"TTFT={args.ttft}ms, ITL={args.itl}ms")
    print(f"Using profile results from {args.profile_results_dir}")
    print("")

@@ -248,11 +246,11 @@ if __name__ == "__main__":

    if est_ttft <= args.ttft:
        print(
-            f"\tEstimated TTFT={est_ttft:.3f}s <= target TTFT={args.ttft:.3f}s. Requests can queue {args.ttft - est_ttft:.3f}s maximally while meeting TTFT SLA."
+            f"\tEstimated TTFT={est_ttft:.2f}ms <= target TTFT={args.ttft:.2f}ms. Requests can queue {args.ttft - est_ttft:.2f}ms maximally while meeting TTFT SLA."
        )
    else:
        print(
-            f"\tEstimated TTFT={est_ttft:.3f}s > target TTFT={args.ttft:.3f}s. Cannot meet TTFT SLA."
+            f"\tEstimated TTFT={est_ttft:.2f}ms > target TTFT={args.ttft:.2f}ms. Cannot meet TTFT SLA."
        )

    print(
@@ -274,12 +272,12 @@ if __name__ == "__main__":
    ) = decode_interpolator.find_best_throughput_per_gpu(args.itl, context_length)
    if est_itl <= args.itl:
        print(
-            f"\tEstimated ITL={est_itl:.4f}s <= target ITL={args.itl:.4f}s at {est_kv_usage*100:.2f}% active kv usage."
+            f"\tEstimated ITL={est_itl:.2f}ms <= target ITL={args.itl:.2f}ms at {est_kv_usage*100:.2f}% active kv usage."
        )
        print(
            f"\tEstimated throughput: {est_thpt_per_gpu:.2f} token/s/gpu. Request rate at {est_thpt_per_gpu / args.osl:.2f} requests/s will saturate one GPU."
        )
    else:
        print(
-            f"\tEstimated ITL={est_itl:.4f}s > target ITL={args.itl:.4f}s. Cannot meet ITL SLA."
+            f"\tEstimated ITL={est_itl:.2f}ms > target ITL={args.itl:.2f}ms. Cannot meet ITL SLA."
        )
--- a/components/src/dynamo/planner/utils/planner_argparse.py
+++ b/components/src/dynamo/planner/utils/planner_argparse.py
@@ -90,10 +90,13 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
        "--ttft",
        type=float,
        default=SLAPlannerDefaults.ttft,
-        help="Time to first token",
+        help="Time to first token (float, in milliseconds)",
    )
    parser.add_argument(
-        "--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency"
+        "--itl",
+        type=float,
+        default=SLAPlannerDefaults.itl,
+        help="Inter-token latency (float, in milliseconds)",
    )
    parser.add_argument(
        "--load-predictor",

--- a/components/src/dynamo/planner/utils/planner_core.py
+++ b/components/src/dynamo/planner/utils/planner_core.py
@@ -249,13 +249,20 @@ class Planner:
            self.num_p_workers_gauge.set(len(self.p_endpoints))
            self.num_d_workers_gauge.set(len(self.d_endpoints))

-        self.last_metrics.ttft = self.prometheus_api_client.get_avg_time_to_first_token(
-            f"{self.args.adjustment_interval}s",
-            self.model_name,
+        # Prometheus returns seconds, convert to milliseconds
+        self.last_metrics.ttft = (
+            self.prometheus_api_client.get_avg_time_to_first_token(
+                f"{self.args.adjustment_interval}s",
+                self.model_name,
+            )
+            * 1000
        )
-        self.last_metrics.itl = self.prometheus_api_client.get_avg_inter_token_latency(
-            f"{self.args.adjustment_interval}s",
-            self.model_name,
+        self.last_metrics.itl = (
+            self.prometheus_api_client.get_avg_inter_token_latency(
+                f"{self.args.adjustment_interval}s",
+                self.model_name,
+            )
+            * 1000
        )
        self.last_metrics.num_req = self.prometheus_api_client.get_avg_request_count(
            f"{self.args.adjustment_interval}s",
@@ -284,7 +291,7 @@ class Planner:
            f"Observed num_req: {self.last_metrics.num_req:.2f} isl: {self.last_metrics.isl:.2f} osl: {self.last_metrics.osl:.2f}"
        )
        logger.info(
-            f"Observed ttft: {self.last_metrics.ttft:.3f}s itl: {self.last_metrics.itl:.3f}s"
+            f"Observed ttft: {self.last_metrics.ttft:.2f}ms itl: {self.last_metrics.itl:.2f}ms"
        )

        self.num_req_predictor.add_data_point(self.last_metrics.num_req)

--- a/docs/benchmarks/pre_deployment_profiling.md
+++ b/docs/benchmarks/pre_deployment_profiling.md
@@ -119,9 +119,9 @@ spec:
            - --osl
            - "150" # average OSL is 150 tokens
            - --ttft
-            - "200" # target TTFT is 200ms
+            - "200" # target TTFT is 200ms (float, in milliseconds)
            - --itl
-            - "20" # target ITL is 20ms
+            - "20" # target ITL is 20ms (float, in milliseconds)
            - --backend
            - <vllm/sglang>
 ```
@@ -290,8 +290,8 @@ python3 -m benchmarks.profiler.profile_sla \
   --aic-backend-version 0.20.0 \
   --isl 3000 \
   --osl 150 \
-   --ttft 0.2 \
-   --itl 0.02
+   --ttft 200 \
+   --itl 20  # TTFT and ITL targets are floats, in milliseconds
 ```

 The output will be written to `./profiling_results/` and can be used directly with SLA planner deployment.
--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -206,7 +206,7 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
 ```
 New adjustment interval started!
 Observed num_req: X.XXX isl: X.XXX osl: X.XXX
-Observed ttft: X.XXXs itl: X.XXXs
+Observed ttft: X.XXms itl: X.XXms
 Number of prefill workers: 1, number of decode workers: 1
 ```


--- a/tests/planner/README.md
+++ b/tests/planner/README.md
@@ -34,34 +34,34 @@ python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
  --profile_results_dir <path_to_profile_results> \
  --isl <ISL> \
  --osl <OSL> \
-  --ttft <TTFT(s)> \
-  --itl <ITL(s)>
+  --ttft <TTFT(ms)> \
+  --itl <ITL(ms)>
 ```

 The script will perform the interpolation based on ISL, OSL, and TTFT and ITL SLAs and advise the load that can saturate the engine.

-For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200,
+For example, to test the interpolator for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200 (target TTFT=200ms, ITL=10ms):

 ```bash
 python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
  --profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
  --isl 3000 \
  --osl 300 \
-  --ttft 0.2 \
-  --itl 0.01
+  --ttft 200 \
+  --itl 10

 # output:
 ISL=3000, OSL=300
-TTFT=0.1s, ITL=0.01s
+TTFT=200.0ms, ITL=10.0ms
 Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/

 Interpolating prefill performance ...
-        Estimated TTFT=0.060s <= target TTFT=0.200s. Requests can queue 0.140s maximally while meeting TTFT SLA.
+        Estimated TTFT=60.00ms <= target TTFT=200.00ms. Requests can queue 140.00ms maximally while meeting TTFT SLA.
        Estimated throughput: 49481.09 tokens/s/gpu. Request rate at 16.49 requests/s will saturate one GPU.

 Interpolating decode performance ...
        Average context length: isl + osl/2 = 3150.
-        Estimated ITL=0.0097s <= target ITL=0.0100s at 16.16% active kv usage.
+        Estimated ITL=9.70ms <= target ITL=10.00ms at 16.16% active kv usage.
        Estimated throughput: 4555.68 token/s/gpu. Request rate at 15.19 requests/s will saturate one GPU.
 ```

@@ -111,8 +111,8 @@ For example, to dry run SLA planner for the previous FP8 8B on H200 using the ge

 ```bash
 python components/planner/test/planner_sla_dryrun.py \
-    --ttft 0.2 \
-    --itl 0.01 \
+    --ttft 200 \
+    --itl 10 \
    --adjustment-interval 60 \
    --profile-results-dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
    --dataset rr-5-45_i3000o300.jsonl \

--- a/tests/planner/perf_test_configs/disagg_8b_planner.yaml
+++ b/tests/planner/perf_test_configs/disagg_8b_planner.yaml
@@ -87,8 +87,8 @@ spec:
              python3 -m planner_sla
              --environment=kubernetes
              --backend=vllm
-              --ttft 0.2
-              --itl 0.01
+              --ttft 200
+              --itl 10
              --profile-results-dir /workspace/tests/planner/profiling_results/H200_TP1P_TP1D/
              --adjustment-interval=60
              --prometheus-port=9085

--- a/tests/planner/scaling/disagg_planner.yaml
+++ b/tests/planner/scaling/disagg_planner.yaml
@@ -57,8 +57,8 @@ spec:
              --adjustment-interval=60
              --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
              --prometheus-port=9085
-              --ttft=0.1
-              --itl=0.01
+              --ttft=100
+              --itl=10
              --load-predictor=constant
              --no-correction
    VllmDecodeWorker:

--- a/tests/planner/test_replica_calculation.py
+++ b/tests/planner/test_replica_calculation.py
@@ -49,8 +49,8 @@ def planner():
    args.decode_engine_num_gpu = 1
    args.min_endpoint = 1
    args.max_gpu_budget = 10
-    args.ttft = 80  # ms
-    args.itl = 10  # ms
+    args.ttft = 80.0  # ms
+    args.itl = 10.0  # ms
    args.backend = "vllm"
    args.no_operation = True  # Don't actually scale
    args.no_correction = False  # Allow correction factors