[Frontend] add tok/s speed metric to llm class when using tqdm (#4400)

Co-authored-by: Michael Goin <michael@neuralmagic.com>

[Frontend] add tok/s speed metric to llm class when using tqdm (#4400)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
16bc0a09 · Mahmoud Ashraf · GitHub · e288df06 · 16bc0a09
Unverified commit 16bc0a09, authored May 09, 2024 by Mahmoud Ashraf; committed by GitHub May 08, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 4 deletions

vllm/entrypoints/llm.py +12 -4

No files found.
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -238,17 +238,25 @@ class LLM:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
-            pbar = tqdm(total=num_requests,
-                        desc="Processed prompts",
-                        dynamic_ncols=True)
+            pbar = tqdm(
+                total=num_requests,
+                desc="Processed prompts",
+                dynamic_ncols=True,
+                postfix=f"Generation Speed: {0:.2f} toks/s",
+            )
        # Run the engine.
        outputs: List[RequestOutput] = []
+        total_toks = 0
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
+                        total_toks += (sum(
+                            len(stp.token_ids) for stp in output.outputs))
+                        spd = total_toks / pbar.format_dict["elapsed"]
+                        pbar.postfix = f"Generation Speed: {spd:.2f} toks/s"
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
@@ -256,4 +264,4 @@ class LLM:
        # This is necessary because some requests may be finished earlier than
        # its previous requests.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
-        return outputs
\ No newline at end of file
+        return outputs