# Check if the grammar is ready in the grammar queue
if self.grammar_queue:
...
...
@@ -1185,7 +1342,12 @@ class Scheduler(
return None
running_bs = len(self.running_batch.reqs)
if running_bs >= self.max_running_requests:
# Ignore the check if self.chunked_req is not None.
# In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
# as the space for the chunked request has just been released.
# In the PP case, a chunked req can start in one microbatch and end in another, so the max_running_requests limit per microbatch should not be strict.
# Instead, we should always allow the chunked request to be added; otherwise, there will be a memory leak.
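# Illustrative sketch, not part of the hunk above: one way to act on these comments
# is to skip the batch-full check whenever a chunked request is pending, so the
# chunked request can always be re-added and its memory is eventually released.
# The `batch_is_full` attribute is an assumption here; the exact condition used by
# the scheduler may differ.
#
#     if self.chunked_req is None and running_bs >= self.max_running_requests:
#         self.running_batch.batch_is_full = True
#         return None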