Update DTK to 24.04.1 and modify README

6a583c2f · chenych · 7d576a9a · 6a583c2f · 6a583c2f · 6a583c2f
Commit 6a583c2f authored Aug 21, 2024 by chenych
20 changed files
--- a/vllm/vllm/__pycache__/utils.cpython-310.pyc
+++ b/vllm/vllm/__pycache__/utils.cpython-310.pyc
--- a/vllm/vllm/attention/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/vllm/attention/__pycache__/__init__.cpython-310.pyc
--- a/vllm/vllm/attention/__pycache__/layer.cpython-310.pyc
+++ b/vllm/vllm/attention/__pycache__/layer.cpython-310.pyc
--- a/vllm/vllm/attention/__pycache__/selector.cpython-310.pyc
+++ b/vllm/vllm/attention/__pycache__/selector.cpython-310.pyc
--- a/vllm/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/vllm/attention/backends/__pycache__/__init__.cpython-310.pyc
--- a/vllm/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc
+++ b/vllm/vllm/attention/backends/__pycache__/abstract.cpython-310.pyc
--- a/vllm/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc
+++ b/vllm/vllm/attention/backends/__pycache__/flash_attn.cpython-310.pyc
--- a/vllm/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/vllm/attention/ops/__pycache__/__init__.cpython-310.pyc
--- a/vllm/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc
+++ b/vllm/vllm/attention/ops/__pycache__/paged_attn.cpython-310.pyc
--- a/vllm/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc
+++ b/vllm/vllm/attention/ops/__pycache__/prefix_prefill.cpython-310.pyc
--- a/vllm/vllm/config.py
+++ b/vllm/vllm/config.py
@@ -292,7 +292,7 @@ class CacheConfig:
        num_gpu_blocks_override: Optional[int] = None,
        sliding_window: Optional[int] = None,
        enable_prefix_caching: bool = False,
-        max_num_seqs: int = 1,
+        max_num_seqs: int = 128,
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization

--- a/vllm/vllm/core/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/vllm/core/__pycache__/__init__.cpython-310.pyc
--- a/vllm/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc
+++ b/vllm/vllm/core/__pycache__/block_manager_v1.cpython-310.pyc
--- a/vllm/vllm/core/__pycache__/evictor.cpython-310.pyc
+++ b/vllm/vllm/core/__pycache__/evictor.cpython-310.pyc
--- a/vllm/vllm/core/__pycache__/interfaces.cpython-310.pyc
+++ b/vllm/vllm/core/__pycache__/interfaces.cpython-310.pyc
--- a/vllm/vllm/core/__pycache__/policy.cpython-310.pyc
+++ b/vllm/vllm/core/__pycache__/policy.cpython-310.pyc
--- a/vllm/vllm/core/__pycache__/scheduler.cpython-310.pyc
+++ b/vllm/vllm/core/__pycache__/scheduler.cpython-310.pyc
--- a/vllm/vllm/core/scheduler.py
+++ b/vllm/vllm/core/scheduler.py
@@ -4,7 +4,7 @@ from collections import deque
 from dataclasses import dataclass, field
 from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig, ModelConfig, ParallelConfig
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.core.policy import Policy, PolicyFactory
 from vllm.logger import init_logger
@@ -239,12 +239,18 @@ class Scheduler:
    def __init__(
        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        cache_config: CacheConfig,
        lora_config: Optional[LoRAConfig],
    ) -> None:
+        self.model_config = model_config
+        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.cache_config = cache_config
+        self.num_layers = model_config.get_num_layers(parallel_config)
        # Note for LoRA scheduling: the current policy is extremely
        # simple and NOT fair. It can lead to starvation of some
        # LoRAs. This should be improved in the future.
@@ -898,12 +904,19 @@ class Scheduler:
            seq_data: Dict[int, SequenceData] = {}
            # seq_id -> physical block numbers
            block_tables: Dict[int, List[int]] = {}
+            lf1_caches = [[] for _ in range(self.num_layers)]
+            lf2_caches = [[] for _ in range(self.num_layers)]
            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                seq_id = seq.seq_id
                seq_data[seq_id] = seq.data
                block_tables[seq_id] = self.block_manager.get_block_table(seq)
                self.block_manager.access_all_blocks_in_seq(seq, now)
+                if self.model_config.hf_config.model_type == 'yuan':
+                    for l in range(self.num_layers):
+                        lf1_caches[l].append(seq.lf1_caches[l])
+                        lf2_caches[l].append(seq.lf2_caches[l])
            common_computed_block_nums = (
                self.block_manager.get_common_computed_block_ids(
@@ -928,6 +941,8 @@ class Scheduler:
                # `multi_modal_data` will be None.
                multi_modal_data=seq_group.multi_modal_data
                if scheduler_outputs.num_prefill_groups > 0 else None,
+                lf1_caches=lf1_caches,
+                lf2_caches=lf2_caches,
            )
            seq_group_metadata_list.append(seq_group_metadata)

--- a/vllm/vllm/distributed/__pycache__/__init__.cpython-310.pyc
+++ b/vllm/vllm/distributed/__pycache__/__init__.cpython-310.pyc
--- a/vllm/vllm/distributed/__pycache__/communication_op.cpython-310.pyc
+++ b/vllm/vllm/distributed/__pycache__/communication_op.cpython-310.pyc