test: add unit test for sla planner's interpolator (#2505)

0376e72f · Hongkuan Zhou · GitHub · 62978595 · 0376e72f · 0376e72f
Unverified Commit 0376e72f authored Aug 18, 2025 by Hongkuan Zhou Committed by GitHub Aug 18, 2025
3 changed files
--- a/components/planner/src/dynamo/planner/utils/perf_interpolation.py
+++ b/components/planner/src/dynamo/planner/utils/perf_interpolation.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
 import scipy
@@ -30,7 +31,7 @@ class PrefillInterpolator:
        with np.load(prefill_npz_fn) as raw_data:
            self.prefill_isl = raw_data["prefill_isl"]
-            self.prefill_ttft = raw_data["prefill_ttft"]
+            self.prefill_ttft = raw_data["prefill_ttft"] / 1000  # convert ms to s
            self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
        self.min_isl = min(self.prefill_isl)
@@ -143,7 +144,9 @@ class DecodeInterpolator:
        ix, iy = self.compute_idx(concurrency, context_length)
        return self.thpt_interpolator[iy, ix]
-    def find_best_throughput_per_gpu(self, itl: float, context_length: float) -> float:
+    def find_best_throughput_per_gpu(
+        self, itl: float, context_length: float
+    ) -> tuple[float, float, float]:
        # find the max kv_load that has itl <= target itl
        # here we cannot use binary search as interpolated itl might not be monotonic
        iy = int(
@@ -157,5 +160,71 @@ class DecodeInterpolator:
        for ix in range(self.resolution - 1, -1, -1):
            if self.itl_interpolator[iy, ix] <= itl:
-                return self.thpt_interpolator[iy, ix]
+                return (
-        return self.thpt_interpolator[iy, 0]
+                    self.thpt_interpolator[iy, ix],
+                    self.itl_interpolator[iy, ix],
+                    self.xi[ix],
+                )
+        return self.thpt_interpolator[iy, 0], self.itl_interpolator[iy, 0], self.xi[0]
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--profile_results_dir", type=str, required=True)
+    parser.add_argument("--isl", type=int, default=3000)
+    parser.add_argument("--osl", type=int, default=150)
+    parser.add_argument("--ttft", type=float, default=0.1, help="in s")
+    parser.add_argument("--itl", type=float, default=0.01, help="in s")
+    args = parser.parse_args()
+    print(f"ISL={args.isl}, OSL={args.osl}")
+    print(f"TTFT={args.ttft}s, ITL={args.itl}s")
+    print(f"Using profile results from {args.profile_results_dir}")
+    print("")
+    # first interpolate prefill
+    print("Interpolating prefill performance ...")
+    prefill_interpolator = PrefillInterpolator(args.profile_results_dir)
+    est_ttft = prefill_interpolator.interpolate_ttft(args.isl)
+    est_thpt_per_gpu = prefill_interpolator.interpolate_thpt_per_gpu(args.isl)
+    if est_ttft <= args.ttft:
+        print(
+            f"\tEstimated TTFT={est_ttft:.3f}s <= target TTFT={args.ttft:.3f}s. Requests can queue {args.ttft - est_ttft:.3f}s maximally while meeting TTFT SLA."
+        )
+    else:
+        print(
+            f"\tEstimated TTFT={est_ttft:.3f}s > target TTFT={args.ttft:.3f}s. Cannot meet TTFT SLA."
+        )
+    print(
+        f"\tEstimated throughput: {est_thpt_per_gpu:.2f} tokens/s/gpu. Request rate at {est_thpt_per_gpu / args.isl:.2f} requests/s will saturate one GPU."
+    )
+    print("")
+    # then interpolate decode
+    decode_interpolator = DecodeInterpolator(args.profile_results_dir)
+    print("Interpolating decode performance ...")
+    context_length = args.isl + args.osl // 2
+    print(f"\tAverage context length: isl + osl/2 = {context_length}.")
+    (
+        est_thpt_per_gpu,
+        est_itl,
+        est_kv_usage,
+    ) = decode_interpolator.find_best_throughput_per_gpu(args.itl, context_length)
+    if est_itl <= args.itl:
+        print(
+            f"\tEstimated ITL={est_itl:.4f}s <= target ITL={args.itl:.4f}s at {est_kv_usage*100:.2f}% active kv usage."
+        )
+        print(
+            f"\tEstimated throughput: {est_thpt_per_gpu:.2f} token/s/gpu. Request rate at {est_thpt_per_gpu / args.osl:.2f} requests/s will saturate one GPU."
+        )
+    else:
+        print(
+            f"\tEstimated ITL={est_itl:.4f}s > target ITL={args.itl:.4f}s. Cannot meet ITL SLA."
+        )
--- a/components/planner/src/dynamo/planner/utils/planner_core.py
+++ b/components/planner/src/dynamo/planner/utils/planner_core.py
@@ -278,11 +278,13 @@ class Planner:
            else:
                corrected_itl = self.args.itl / self.d_correction_factor
            # 2. reversely find out what is best throughput/gpu that can achieve corrected_itl under the predicted context length
-            pred_decode_thpt_per_gpu = (
+            (
-                self.decode_interpolator.find_best_throughput_per_gpu(
+                pred_decode_thpt_per_gpu,
+                _,
+                _,
+            ) = self.decode_interpolator.find_best_throughput_per_gpu(
                itl=corrected_itl, context_length=next_isl + next_osl / 2
            )
-            )
            # 3. compute number of decode replicas needed
            next_num_d = math.ceil(
                next_num_req

--- a/tests/planner/README.md
+++ b/tests/planner/README.md
@@ -22,3 +22,42 @@ Use the pre-configured test deployment with sample profiling data, we provide th
 ### Option B: Use Your Own Profiling Results
 1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/architecture/pre_deployment_profiling.md) for detailed instructions.
+## Interpolator Testing
+The SLA planner uses two interpolators to estimate prefill and decode performance. You can test the interpolators with the following command:
+```bash
+python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
+  --profile_results_dir <path_to_profile_results> \
+  --isl <ISL> \
+  --osl <OSL> \
+  --ttft <TTFT(s)> \
+  --itl <ITL(s)>
+```
+The script interpolates prefill and decode performance for the given ISL and OSL, checks the estimates against the TTFT and ITL SLAs, and reports the request rate that will saturate one GPU.
+For example, to test the interpolators for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200:
+```bash
+python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
+  --profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
+  --isl 3000 \
+  --osl 150 \
+  --ttft 0.1 \
+  --itl 0.01
+> ISL=3000, OSL=150
+> TTFT=0.1s, ITL=0.01s
+> Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/
+>
+> Interpolating prefill performance ...
+>         Estimated TTFT=0.027s <= target TTFT=0.100s. Requests can queue 0.073s maximally while meeting TTFT SLA.
+>         Estimated throughput: 110893.48 tokens/s/gpu. Request rate at 36.96 requests/s will saturate one GPU.
+>
+> Interpolating decode performance ...
+>         Average context length: isl + osl/2 = 3075.
+>         Estimated ITL=0.0098s <= target ITL=0.0100s at 33.33% active kv usage.
+>         Estimated throughput: 10226.60 token/s/gpu. Request rate at 68.18 requests/s will saturate one GPU.
+```
\ No newline at end of file