Unverified commit b724afe3, authored by Shanshan Shen, committed by GitHub
Browse files

[V1][Structured Output] Clear xgrammar compiler object when engine core shut...


[V1][Structured Output] Clear xgrammar compiler object when engine core shut down to avoid nanobind leaked warning (#16954)
Signed-off-by: shen-shanshan <467638484@qq.com>
parent 21f4f1c9
......@@ -253,6 +253,7 @@ class EngineCore:
return engine_core_outputs
def shutdown(self):
    """Clear the structured-output backend, then shut down the executor.

    The structured-output manager is asked to clear its backend first.
    The model executor is shut down afterwards even if that cleanup
    raises, so a failure in backend teardown cannot leave the executor
    running. Any exception from the cleanup still propagates to the
    caller once the executor has been handled.
    """
    try:
        self.structured_output_manager.clear_backend()
    finally:
        # Runs on both the success and the failure path of the cleanup.
        if self.model_executor:
            self.model_executor.shutdown()
......
......@@ -107,3 +107,7 @@ class StructuredOutputManager:
# np.ndarray, because that is much more efficient for serialization
# and deserialization when sending this to the GPU workers.
return bitmask_tensor.numpy()
def clear_backend(self) -> None:
    """Ask the current backend, if one is set, to run its cleanup."""
    backend = self.backend
    if backend is None:
        # No backend is set, so there is nothing to destroy.
        return
    backend.destroy()
......@@ -108,6 +108,9 @@ class GuidanceBackend(StructuredOutputBackend):
return llguidance_torch.allocate_token_bitmask(
max_num_seqs, self.ll_tokenizer.vocab_size)
def destroy(self):
    """Cleanup hook; intentionally does nothing for this backend."""
    return None
@dataclass
class GuidanceGrammar(StructuredOutputGrammar):
......
......@@ -87,3 +87,9 @@ class StructuredOutputBackend(ABC):
max_num_seqs (int): The maximum number of sequences for which
to allocate the bitmask.
"""
@abstractmethod
def destroy(self):
    """Perform whatever cleanup the concrete backend requires.

    Abstract: each backend implementation supplies its own version.
    """
......@@ -124,6 +124,9 @@ class XgrammarBackend(StructuredOutputBackend):
def allocate_token_bitmask(self, max_num_seqs: int):
return xgr.allocate_token_bitmask(max_num_seqs, self.vocab_size)
def destroy(self):
    """Drop this backend's reference to its compiler object.

    Safe to call more than once: a repeated call, or a call on an
    instance whose ``compiler`` attribute was never set, is a no-op
    rather than raising ``AttributeError``.
    """
    if hasattr(self, "compiler"):
        del self.compiler
@dataclass
class XgrammarGrammar(StructuredOutputGrammar):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment