norm / vllm · Commits

Commit 04e5acc0 (Unverified)
Authored Mar 06, 2023 by Woosuk Kwon, committed via GitHub on Mar 06, 2023
Parent: 3e9f991d

Fix a bug in 1D input shape (#5)
3 changed files, with 11 additions and 6 deletions (+11 / -6):

    cacheflow/models/attention.py       +8 -3
    cacheflow/models/input_metadata.py  +1 -1
    server.py                           +2 -2
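What the fix is about: with the 1D input layout, all prompt tokens and all generation tokens of a scheduling step are packed into a single flattened token dimension, so the number of prompt tokens is generally smaller than the total number of valid tokens in the batch. The hunks below slice the prompt portion out of that flat batch before the prompt attention runs, and derive the token counts from the tensors themselves. A minimal sketch of the packing assumed here (shapes and values are illustrative; only prompt_lens, slot_mapping, and the slicing pattern come from the diff):

    import torch

    # Hypothetical flattened 1D batch: two prompts followed by three
    # single-token generation requests, packed along one token dimension.
    prompt_lens = [4, 3]                     # tokens per prompt
    num_generation_tokens = 3                # one token per running sequence
    num_prompt_tokens = sum(prompt_lens)     # 7
    num_valid_tokens = num_prompt_tokens + num_generation_tokens  # 10

    hidden_size = 8
    query = torch.randn(num_valid_tokens, hidden_size)
    output = torch.empty_like(query)

    # The prompt attention only sees the leading slice of the flat batch;
    # the cached-KV (generation) attention handles the remaining rows.
    prompt_query = query[:num_prompt_tokens]
    generation_query = query[num_prompt_tokens:]
    assert prompt_query.shape[0] + generation_query.shape[0] == num_valid_tokens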
cacheflow/models/attention.py

@@ -47,9 +47,8 @@ class OPTCacheFlowAttention(nn.Module):
             max_s=max_prompt_len,
             causal=True,
         )[0]
-        num_tokens = prefix_sum[-1]
         # FIXME(woosuk): Unnecessary copy. Optimize this.
-        output[:num_tokens].copy_(out, non_blocking=True)
+        output.copy_(out, non_blocking=True)

     def single_query_cached_kv_attention(
         self,
...

@@ -108,8 +107,14 @@ class OPTCacheFlowAttention(nn.Module):
         # Compute the attention op for prompts.
         if input_metadata.num_prompts > 0:
+            num_prompt_tokens = sum(input_metadata.prompt_lens)
             self.multi_query_kv_attention(
-                output, query, key, value, input_metadata.prompt_lens)
+                output[:num_prompt_tokens],
+                query[:num_prompt_tokens],
+                key[:num_prompt_tokens],
+                value[:num_prompt_tokens],
+                input_metadata.prompt_lens,
+            )

         # Wait until the cache op is done.
         if cache_event is not None:
...
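The second hunk is where the shape bug shows up: query, key, value, and output cover every valid token in the flattened batch (prompt tokens plus one token per generation sequence), while multi_query_kv_attention should only operate on the prompt tokens. Passing the num_prompt_tokens slices keeps the prompt attention from reading or overwriting the generation rows, which in turn lets the callee's first hunk do a plain full-tensor copy. A rough sketch of the sliced call under that assumption (the stand-in callee and all shapes are illustrative, not the real FlashAttention kernel):

    import torch

    def multi_query_kv_attention(output, query, key, value, prompt_lens):
        # Illustrative stand-in: write an attention-shaped result into the
        # slice it was given. The real code runs FlashAttention here.
        output.copy_(query)

    num_prompt_tokens = 7     # sum(prompt_lens) in the real code
    num_valid_tokens = 10     # prompt tokens + generation tokens
    hidden = 8
    query = torch.randn(num_valid_tokens, hidden)
    key = torch.randn_like(query)
    value = torch.randn_like(query)
    output = torch.zeros_like(query)

    # After the fix: only the prompt rows are passed in, so the callee's
    # full-tensor copy_ cannot clobber the generation rows.
    multi_query_kv_attention(
        output[:num_prompt_tokens],
        query[:num_prompt_tokens],
        key[:num_prompt_tokens],
        value[:num_prompt_tokens],
        prompt_lens=[4, 3],
    )
    assert output[num_prompt_tokens:].abs().sum() == 0  # generation rows untouched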
cacheflow/models/input_metadata.py

@@ -24,7 +24,7 @@ class InputMetadata:
         self.num_prompts = len(prompt_lens)
         self.num_generation_tokens = context_lens.shape[0]
-        self.num_valid_tokens = len(slot_mapping)
+        self.num_valid_tokens = slot_mapping.shape[0]
         if block_tables.numel() > 0:
             self.max_num_blocks_per_seq = block_tables.shape[1]
         else:
...
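This one-liner makes num_valid_tokens come from the tensor's shape, consistent with num_generation_tokens just above it, which is already derived from context_lens.shape[0]. For a 1D tensor the two expressions agree, so the point is uniform bookkeeping over the flattened token dimension rather than a behavioral change for that case. A small sketch of the relationship being relied on (assuming slot_mapping is a 1D torch tensor, as the surrounding code suggests; the values are illustrative):

    import torch

    # slot_mapping: one cache-slot index per valid token in the flat batch.
    slot_mapping = torch.tensor([0, 1, 2, 3, 16, 17, 18, 32, 48, 64])

    num_valid_tokens = slot_mapping.shape[0]             # what the fix uses
    assert num_valid_tokens == len(slot_mapping) == 10   # equal for a 1D tensor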
server.py

@@ -57,11 +57,11 @@ def main():
         'UC Berkeley is',
         'The future of cloud computing is',
     ]
-    for prompt in test_inputs:
-        frontend.query(prompt)

     # FIXME
     while True:
+        if test_inputs:
+            frontend.query(test_inputs.pop())
         scheduler.step()
         if not scheduler.pending and not scheduler.running:
             break
...
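In the test driver, the prompts are no longer all queued before the loop; instead one prompt is popped from test_inputs and handed to the frontend on each iteration, so new requests keep arriving while earlier ones are still being scheduled. A rough sketch of the resulting loop shape, with stand-in Frontend and Scheduler classes (the real ones live in cacheflow; only the loop structure mirrors the diff):

    class Frontend:
        """Stand-in: turns a prompt string into a pending request."""
        def __init__(self, scheduler):
            self.scheduler = scheduler

        def query(self, prompt):
            self.scheduler.pending.append(prompt)

    class Scheduler:
        """Stand-in: admits one pending request per step, then finishes it."""
        def __init__(self):
            self.pending, self.running = [], []

        def step(self):
            if self.pending:
                self.running.append(self.pending.pop(0))
            elif self.running:
                print("finished:", self.running.pop(0))

    scheduler = Scheduler()
    frontend = Frontend(scheduler)
    test_inputs = ['UC Berkeley is', 'The future of cloud computing is']

    # Same shape as the fixed loop in server.py: feed one prompt per step.
    while True:
        if test_inputs:
            frontend.query(test_inputs.pop())
        scheduler.step()
        if not scheduler.pending and not scheduler.running:
            break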