Commit f2f0289f authored by OlivierDehaene

feat(server): empty cache on errors

parent 67347950
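
The change moves CUDA cache cleanup onto the error path: when a request fails (typically with a CUDA out-of-memory error), the caching allocator's unused blocks are released so the failed batch does not keep GPU memory reserved for later requests. A minimal, standalone sketch of that pattern using only PyTorch (the `guarded` decorator is illustrative and not part of this commit):

import functools

import torch


def guarded(fn):
    """Run fn; if it raises, drop cached CUDA blocks before re-raising."""

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception:
            if torch.cuda.is_available():
                # Hand cached, unused blocks back to the driver so a failed
                # (often OOM) batch does not starve subsequent requests.
                torch.cuda.empty_cache()
            raise

    return wrapper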
+import torch
 import grpc

 from google.rpc import status_pb2, code_pb2
...
@@ -22,6 +23,9 @@ class ExceptionInterceptor(AsyncServerInterceptor):
         method_name = method_name.split("/")[-1]
         logger.exception(f"Method {method_name} encountered an error.")

+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
         await context.abort_with_status(
             rpc_status.to_status(
                 status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))
...
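
For context, the interceptor above only sees failures if it is registered when the gRPC asyncio server is created. A hedged sketch of that wiring (the socket path is a placeholder and the servicer registration is elided; neither is part of this commit):

from grpc import aio


async def serve(interceptor):
    # Every RPC passes through the interceptor, so an unhandled error in any
    # handler reaches the empty_cache() call added above before the abort.
    server = aio.server(interceptors=[interceptor])
    # generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(...) would go here.
    server.add_insecure_port("unix:///tmp/text-generation.sock")  # placeholder address
    await server.start()
    await server.wait_for_termination()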
@@ -639,7 +639,6 @@ class FlashCausalLMBatch(Batch):
         for b in batches:
             b.block_tables = None
             del b
-        torch.cuda.empty_cache()

         return FlashCausalLMBatch(
             batch_id=batches[0].batch_id,
@@ -733,7 +732,6 @@ class FlashCausalLM(Model):
                 f"You need to decrease `--max-batch-total-tokens` or `--max-batch-prefill-tokens`"
             ) from e
         del batch
-        torch.cuda.empty_cache()

     def decode(self, generated_ids: Union[torch.Tensor, List[int]]) -> str:
         return self.tokenizer.decode(
@@ -790,7 +788,6 @@ class FlashCausalLM(Model):
             )
         except Exception as e:
             del batch
-            torch.cuda.empty_cache()
             raise e

         if prefill:
...
@@ -51,6 +51,9 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         filtered_batch = batch.filter(request.request_ids)
         self.cache.set(filtered_batch)

+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
         return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())

     async def Warmup(self, request, context):
@@ -58,6 +61,10 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
             request.batch, self.model.tokenizer, self.model.dtype, self.model.device
         )
         self.model.warmup(batch, request.max_total_tokens)
+
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
         return generate_pb2.WarmupResponse()

     async def Prefill(self, request, context):
@@ -89,6 +96,8 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         if len(batches) > 1:
             batch = self.model.batch_type.concatenate(batches)
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
         else:
             batch = batches[0]
...
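
As a note on what torch.cuda.empty_cache() does at these points: it releases blocks the caching allocator holds but no live tensor uses, which shows up as reserved memory shrinking back toward allocated memory. A small illustrative check, not part of this commit:

import torch

if torch.cuda.is_available():
    x = torch.empty(1024, 1024, device="cuda")
    del x
    # The freed block stays cached by the allocator for reuse.
    print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
    torch.cuda.empty_cache()
    # After empty_cache() the reserved pool shrinks back toward what is allocated.
    print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved())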