f"Capture draft cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. mem usage={(before_mem-after_mem):.2f}GB. avail mem={after_mem:.2f}GB."
f"Capture draft cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem-after_mem):.2f} GB."
)
)
# Capture extend
# Capture extend
...
@@ -281,7 +269,7 @@ class EAGLEWorker(TpModelWorker):
...
@@ -281,7 +269,7 @@ class EAGLEWorker(TpModelWorker):
f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. mem usage={(before_mem-after_mem):.2f}GB. avail mem={after_mem:.2f}GB."
f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter()-tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem-after_mem):.2f} GB."
)
)
@property
@property
...
@@ -302,6 +290,7 @@ class EAGLEWorker(TpModelWorker):
...
@@ -302,6 +290,7 @@ class EAGLEWorker(TpModelWorker):
A tuple of the final logit output of the target model, next tokens accepted,
A tuple of the final logit output of the target model, next tokens accepted,
the batch id (used for overlap schedule), and number of accepted tokens.
the batch id (used for overlap schedule), and number of accepted tokens.