f"Capture draft cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem-after_mem):.2f} GB."
f"Capture draft cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. mem usage={(before_mem-after_mem):.2f}GB. avail mem={after_mem:.2f}GB."
)
# Capture extend
...
...
@@ -269,7 +281,7 @@ class EAGLEWorker(TpModelWorker):
f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem-after_mem):.2f} GB."
f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. mem usage={(before_mem-after_mem):.2f}GB. avail mem={after_mem:.2f}GB."
)
@property
...
...
@@ -290,7 +302,6 @@ class EAGLEWorker(TpModelWorker):
A tuple of the final logit output of the target model, next tokens accepted,
the batch id (used for overlap schedule), and number of accepted tokens.