Commit ea790031 authored by Jesse Gross's avatar Jesse Gross Committed by Jesse Gross
Browse files

kvcache: Skip computing causal mask for worst case graph reservation

Computing an attention mask for a large context and max batch is
expensive - over 100ms. Models like Gemma3 that have multiple types
of caches and custom attention masks need to do this 4 times, so this
adds approximately 500ms to startup time when using 128k context

When we are reserving the worst case graph, we don't need the mask,
only its shape, so we can skip this.
parent 9239a254
...@@ -30,6 +30,11 @@ type Causal struct { ...@@ -30,6 +30,11 @@ type Causal struct {
// ** current forward pass ** // ** current forward pass **
// curReserve indicates that this forward pass is only for
// memory reservation and we should not update our metadata
// based on it.
curReserve bool
// the active layer for Get and Put // the active layer for Get and Put
curLayer int curLayer int
...@@ -159,12 +164,13 @@ func (c *Causal) Close() { ...@@ -159,12 +164,13 @@ func (c *Causal) Close() {
} }
func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error { func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) error {
c.curReserve = reserve
c.curBatchSize = len(batch.Positions) c.curBatchSize = len(batch.Positions)
c.curSequences = batch.Sequences c.curSequences = batch.Sequences
c.curPositions = batch.Positions c.curPositions = batch.Positions
c.opts.Except = nil c.opts.Except = nil
if !reserve { if !c.curReserve {
c.updateSlidingWindow() c.updateSlidingWindow()
var err error var err error
...@@ -304,6 +310,11 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor { ...@@ -304,6 +310,11 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1 c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
length := c.curCellRange.max - c.curCellRange.min + 1 length := c.curCellRange.max - c.curCellRange.min + 1
if c.curReserve {
return ctx.Input().Empty(c.config.MaskDType, length, batchSize)
}
mask := make([]float32, batchSize*length) mask := make([]float32, batchSize*length)
for i := range c.curBatchSize { for i := range c.curBatchSize {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment