xdb4_94051 / vllm · Commits

Commit acbed3ef (unverified)
Use monotonic time where appropriate (#1249)
Authored Oct 02, 2023 by Antoni Baum · committed via GitHub on Oct 02, 2023
Parent: 66d18a7f
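The motivation for the change: `time.time()` reads the wall clock, which NTP or an administrator can move forwards or backwards, so the difference between two readings is not a reliable duration. `time.monotonic()` and `time.perf_counter()` are guaranteed never to go backwards, at the cost of an arbitrary, meaningless origin, which makes them the right clocks for measuring intervals. A minimal illustration, independent of vLLM:

```python
import time

# time.time() is wall-clock: it can jump when the system clock is adjusted,
# so (t1 - t0) may be wrong, or even negative, across an adjustment.
# time.monotonic() and time.perf_counter() never go backwards; their
# absolute values are meaningless, but their differences are reliable.

start = time.perf_counter()
time.sleep(0.1)
elapsed = time.perf_counter() - start
print(f"elapsed: {elapsed:.4f} s")  # ~0.1 s even if the wall clock jumps
```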
Showing 7 changed files with 18 additions and 17 deletions (+18 -17).
benchmarks/benchmark_latency.py        +2 -2
benchmarks/benchmark_serving.py        +4 -4
benchmarks/benchmark_throughput.py     +4 -4
vllm/core/scheduler.py                 +1 -1
vllm/engine/async_llm_engine.py        +2 -1
vllm/engine/llm_engine.py              +3 -3
vllm/entrypoints/openai/api_server.py  +2 -2
benchmarks/benchmark_latency.py

@@ -40,13 +40,13 @@ def main(args: argparse.Namespace):
     def run_to_completion(profile: bool = False):
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
+        start_time = time.perf_counter()
         llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                      sampling_params=sampling_params,
                      use_tqdm=False)
-        end_time = time.time()
+        end_time = time.perf_counter()
         latency = end_time - start_time
         if profile:
             torch.cuda.cudart().cudaProfilerStop()
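The pattern above, one `perf_counter()` reading on either side of `llm.generate(...)`, generalizes to any single-call latency measurement. A minimal sketch of that pattern (the `time_call` helper is hypothetical, not part of vLLM):

```python
import time
from statistics import mean

def time_call(fn, *args, **kwargs):
    """Return (result, latency in seconds) using a monotonic, high-resolution clock."""
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - start

# Repeat and aggregate, as run_to_completion's callers do across iterations.
latencies = [time_call(time.sleep, 0.01)[1] for _ in range(10)]
print(f"avg latency: {mean(latencies):.4f} s")
```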
benchmarks/benchmark_serving.py

@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()
     headers = {"User-Agent": "Benchmark Client"}
     if backend == "vllm":

@@ -148,7 +148,7 @@ async def send_request(
         if "error" not in output:
             break

-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))

@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer,
                               trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
                           args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
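In `benchmark_serving.py` the same swap protects per-request latencies captured concurrently: any wall-clock step during the run would previously have skewed whichever requests were in flight at that moment. A self-contained sketch of the measurement shape (the `timed_request` coroutine and its sleep stand in for the real HTTP round trip):

```python
import asyncio
import time

REQUEST_LATENCY: list[float] = []  # the benchmark stores (prompt_len, output_len, latency)

async def timed_request(delay: float) -> None:
    start = time.perf_counter()
    await asyncio.sleep(delay)  # hypothetical stand-in for the HTTP call
    REQUEST_LATENCY.append(time.perf_counter() - start)

async def main() -> None:
    # Many requests in flight at once, each timed independently.
    await asyncio.gather(*(timed_request(0.05) for _ in range(8)))

asyncio.run(main())
print(f"mean latency: {sum(REQUEST_LATENCY) / len(REQUEST_LATENCY):.4f} s")
```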
benchmarks/benchmark_throughput.py

@@ -93,10 +93,10 @@ def run_vllm(
             sampling_params=sampling_params,
         )

-    start = time.time()
+    start = time.perf_counter()
     # FIXME(woosuk): Do not use internal method.
     llm._run_engine(use_tqdm=True)
-    end = time.time()
+    end = time.perf_counter()
     return end - start

@@ -118,7 +118,7 @@ def run_hf(
     llm = llm.cuda()

     pbar = tqdm(total=len(requests))
-    start = time.time()
+    start = time.perf_counter()
     batch: List[str] = []
     max_prompt_len = 0
     max_output_len = 0

@@ -156,7 +156,7 @@ def run_hf(
             batch = []
             max_prompt_len = 0
             max_output_len = 0
-    end = time.time()
+    end = time.perf_counter()
     return end - start
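Here the elapsed time feeds a throughput figure (requests/s, and tokens/s elsewhere in the file), so a backwards wall-clock step could previously have produced an inflated, or even negative, denominator. The shape of the computation, with hypothetical stand-in work:

```python
import time

def run_benchmark(num_requests: int) -> float:
    # Mirrors run_vllm/run_hf: one perf_counter reading before the batch,
    # one after; the difference is immune to wall-clock adjustments.
    start = time.perf_counter()
    for _ in range(num_requests):
        time.sleep(0.001)  # hypothetical per-request work
    return time.perf_counter() - start

n = 100
elapsed = run_benchmark(n)
print(f"Throughput: {n / elapsed:.2f} requests/s")
```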
vllm/core/scheduler.py

@@ -121,7 +121,7 @@ class Scheduler:
         blocks_to_copy: Dict[int, List[int]] = {}

         # Fix the current time.
-        now = time.time()
+        now = time.monotonic()

         # Join waiting sequences if possible.
         if not self.swapped:
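The scheduler compares `now` against per-request arrival times to decide what to run next, so both readings must come from the same non-decreasing clock; mixing `time.time()` arrivals with a `time.monotonic()` `now` (or vice versa) would compare values from unrelated origins. A toy sketch of the invariant (names are illustrative, not vLLM's):

```python
import time
from dataclasses import dataclass, field

@dataclass
class PendingRequest:
    request_id: str
    # Recorded with the same clock the scheduler reads later, so
    # now - arrival_time is always a valid, non-negative waiting time.
    arrival_time: float = field(default_factory=time.monotonic)

def longest_waiting_first(waiting: list[PendingRequest]) -> list[PendingRequest]:
    now = time.monotonic()
    return sorted(waiting, key=lambda r: now - r.arrival_time, reverse=True)
```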
vllm/engine/async_llm_engine.py

@@ -417,7 +417,8 @@ class AsyncLLMEngine:
             request.
         """
         # Preprocess the request.
-        arrival_time = time.time()
+        # This should not be used for logging, as it is monotonic time.
+        arrival_time = time.monotonic()

         try:
             stream = await self.add_request(request_id,
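The comment added here is the key caveat: a monotonic reading counts from an arbitrary origin, so it cannot be rendered as a date or correlated across machines. When both a duration and a log-friendly timestamp are needed, one option is to record both clocks at arrival; a hypothetical illustration, not vLLM code:

```python
import time

class RequestRecord:
    """Illustrative only: keep one clock per purpose."""

    def __init__(self) -> None:
        self.arrival_monotonic = time.monotonic()  # for durations; never goes backwards
        self.arrival_wall = time.time()            # for logs only; human-readable epoch time

    def waiting_time(self) -> float:
        # Always computed monotonic-to-monotonic.
        return time.monotonic() - self.arrival_monotonic
```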
vllm/engine/llm_engine.py

@@ -256,10 +256,10 @@ class LLMEngine:
             prompt_token_ids: The token IDs of the prompt. If None, we
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
-                the current time.
+                the current monotonic time.
         """
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
         if prompt_token_ids is None:
             assert prompt is not None
             prompt_token_ids = self.tokenizer.encode(prompt)

@@ -568,7 +568,7 @@ class LLMEngine:
         prompt_run: bool,
         num_batched_tokens: int,
     ) -> None:
-        now = time.time()
+        now = time.monotonic()
         # Log the number of batched input tokens.
         if prompt_run:
             self.num_prompt_tokens.append((now, num_batched_tokens))
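`_log_system_stats` appends `(now, num_batched_tokens)` pairs and later derives windowed throughput from them, so a consistent monotonic timebase keeps the window arithmetic valid. A sketch of that sliding-window pattern (the window length and helper names are hypothetical):

```python
import time
from collections import deque

WINDOW_S = 10.0  # hypothetical stats window
num_prompt_tokens: deque[tuple[float, int]] = deque()

def record_batch(num_batched_tokens: int) -> None:
    now = time.monotonic()
    num_prompt_tokens.append((now, num_batched_tokens))
    # Evict entries that fell out of the window; valid because the
    # timestamps are non-decreasing.
    while num_prompt_tokens and now - num_prompt_tokens[0][0] > WINDOW_S:
        num_prompt_tokens.popleft()

def prompt_tokens_per_second() -> float:
    if len(num_prompt_tokens) < 2:
        return 0.0
    span = num_prompt_tokens[-1][0] - num_prompt_tokens[0][0]
    return sum(n for _, n in num_prompt_tokens) / span if span > 0 else 0.0
```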
vllm/entrypoints/openai/api_server.py

@@ -210,7 +210,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
     model_name = request.model
     request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,

@@ -411,7 +411,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret

-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
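One caveat on this last file: unlike the benchmark and scheduler changes, the `created` field of an OpenAI-style response is conventionally a Unix timestamp (seconds since the epoch), while `time.monotonic()` counts from an arbitrary origin, often system boot. The two values are not interchangeable:

```python
import time

print(int(time.time()))       # e.g. 1696204800 -- seconds since the Unix epoch
print(int(time.monotonic()))  # e.g. 8423 -- seconds since an arbitrary origin,
                              # not convertible to a calendar date
```

Clients that parse `created` as epoch seconds would therefore see implausibly small values from the monotonic variant.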