f"Capture draft cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem-after_mem):.2f} GB."
f"Capture draft cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. mem usage={(before_mem-after_mem):.2f}GB. avail mem={after_mem:.2f}GB."
)
)
# Capture extend
# Capture extend
...
@@ -269,7 +281,7 @@ class EAGLEWorker(TpModelWorker):
...
@@ -269,7 +281,7 @@ class EAGLEWorker(TpModelWorker):
f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem-after_mem):.2f} GB."
f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. mem usage={(before_mem-after_mem):.2f}GB. avail mem={after_mem:.2f}GB."
)
)
@property
@property
...
@@ -290,7 +302,6 @@ class EAGLEWorker(TpModelWorker):
...
@@ -290,7 +302,6 @@ class EAGLEWorker(TpModelWorker):
A tuple of the final logit output of the target model, next tokens accepted,
A tuple of the final logit output of the target model, next tokens accepted,
the batch id (used for overlap schedule), and number of accepted tokens.
the batch id (used for overlap schedule), and number of accepted tokens.