[CI/Build] drop support for Python 3.8 EOL (#8464)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

[CI/Build] drop support for Python 3.8 EOL (#8464)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
21063c11 · Aaron Pham · GitHub · 4be3a451 · 21063c11 · 21063c11
Unverified commit 21063c11, authored Nov 06, 2024 by Aaron Pham; committed by GitHub on Nov 06, 2024.
20 changed files
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -413,12 +413,10 @@ class _CorrectnessTestHelper:
    def generate_probs_for_test(
        self, draft_and_target_probs_equal: bool
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        draft_probs, target_probs = [
+        draft_probs, target_probs = (F.softmax(
-            F.softmax(
            torch.rand(self.vocab_size, dtype=torch.float32),
            dim=-1,
-            ) for _ in range(2)
+        ) for _ in range(2))
-        ]
        num_reference_probs = 100
        reference_probs = F.softmax(

--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -29,7 +29,7 @@ def test_trace_function_call():
    cur_dir = os.path.dirname(__file__)
    enable_trace_function_call(path, cur_dir)
    f1(1)
-    with open(path, 'r') as f:
+    with open(path) as f:
        content = f.read()
    assert "f1" in content

--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -93,10 +93,10 @@ def test_mistral_edge_case(tokenizer, truth):
 def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
    if "mistral" in tokenizer_name:
        yield (
-            bool(True) if request.param else
+            True if request.param else
            pytest.skip("mistral doesn't support skip_special_tokens=False"))
    else:
-        yield bool(True) if request.param else bool(False)
+        yield bool(request.param)
 @pytest.mark.parametrize("truth", TRUTH)

--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@@ -46,7 +46,7 @@ if __name__ == "__main__":
    args = parser.parse_args()
-    with open(args.json_trace, "r") as f:
+    with open(args.json_trace) as f:
        profile_data = json.load(f)
    if args.table == "summary":

--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -434,7 +434,7 @@ def main(
                f"{', Sparsity ' + sparsity if sparsity else ''}")
    profile_json = None
-    with open(json_trace, "r") as f:
+    with open(json_trace) as f:
        profile_json = json.load(f)
    assert profile_json is not None

--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@@ -81,7 +81,7 @@ class Target:
        # Allow for modest floating-point errors
        epsilon = 0.000002
        if (self.weighted_duration > self.Duration() + epsilon):
-            print('%s > %s?' % (self.weighted_duration, self.Duration()))
+            print('{} > {}?'.format(self.weighted_duration, self.Duration()))
        assert (self.weighted_duration <= self.Duration() + epsilon)
        return self.weighted_duration
@@ -104,7 +104,7 @@ def ReadTargets(log, show_all):
    The result is a list of Target objects."""
    header = log.readline()
    assert header == '# ninja log v5\n', \
-           'unrecognized ninja log version %r' % header
+           'unrecognized ninja log version {!r}'.format(header)
    targets_dict = {}
    last_end_seen = 0.0
    for line in log:
@@ -254,8 +254,8 @@ def SummarizeEntries(entries, extra_step_types):
    # Warn if the sum of weighted times is off by more than half a second.
    if abs(length - weighted_total) > 500:
        print('Warning: Possible corrupt ninja log, results may be '
-              'untrustworthy. Length = %.3f, weighted total = %.3f' %
+              'untrustworthy. Length = {:.3f}, weighted total = {:.3f}'.format(
-              (length, weighted_total))
+                  length, weighted_total))
    entries_by_ext = defaultdict(list)
    for target in entries:
@@ -263,16 +263,17 @@ def SummarizeEntries(entries, extra_step_types):
        entries_by_ext[extension].append(target)
    for key, values in entries_by_ext.items():
-        print('    Longest build steps for %s:' % key)
+        print('    Longest build steps for {}:'.format(key))
        values.sort(key=lambda x: x.WeightedDuration())
        for target in values[-long_count:]:
-            print('      %8.1f weighted s to build %s (%.1f s elapsed time)' %
+            print(
-                  (target.WeightedDuration(), target.DescribeTargets(),
+                '      {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'.
+                format(target.WeightedDuration(), target.DescribeTargets(),
                       target.Duration()))
-    print('    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
+    print('    {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
-          'parallelism)' %
+          'parallelism)'.format(length, total_cpu_time,
-          (length, total_cpu_time, total_cpu_time * 1.0 / length))
+                                total_cpu_time * 1.0 / length))
    print('    %d build steps completed, average of %1.2f/s' %
          (len(entries), len(entries) / (length)))
@@ -298,11 +299,12 @@ def main():
        long_ext_count += len(args.step_types.split(';'))
    try:
-        with open(log_file, 'r') as log:
+        with open(log_file) as log:
            entries = ReadTargets(log, False)
            SummarizeEntries(entries, args.step_types)
-    except IOError:
+    except OSError:
-        print('Log file %r not found, no build summary created.' % log_file)
+        print('Log file {!r} not found, no build summary created.'.format(
+            log_file))
        return errno.ENOENT

--- a/use_existing_torch.py
+++ b/use_existing_torch.py
@@ -4,7 +4,7 @@ requires_files = glob.glob('requirements*.txt')
 requires_files += ["pyproject.toml"]
 for file in requires_files:
    print(f">>> cleaning {file}")
-    with open(file, 'r') as f:
+    with open(file) as f:
        lines = f.readlines()
    if "torch" in "".join(lines).lower():
        print("removed:")

--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@@ -192,10 +192,8 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
        attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen]
        q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1)
-        k2, v2 = [
+        k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
-            self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
+                  for x in [k, v])
-            for x in [k, v]
-        ]
        spda_output = torch.nn.functional.scaled_dot_product_attention(
            q2, k2, v2, attn_mask=attn_mask, scale=sm_scale)
        return self.transpose_and_unpad(spda_output, cu_seqlens)

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -668,9 +668,10 @@ class ModelConfig:
    @property
    def is_encoder_decoder_model(self) -> bool:
        """Extract the HF encoder/decoder model flag."""
-        return getattr(self.hf_config, "is_encoder_decoder", False) or (
+        return getattr(
-            (hasattr(self.hf_config, "text_config") and getattr(
+            self.hf_config, "is_encoder_decoder",
-                self.hf_config.text_config, "is_encoder_decoder", False)))
+            False) or (hasattr(self.hf_config, "text_config") and getattr(
+                self.hf_config.text_config, "is_encoder_decoder", False))
    @property
    def is_multimodal_model(self) -> bool:

--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@@ -52,7 +52,7 @@ class Evictor(ABC):
        pass
-class BlockMetaData():
+class BlockMetaData:
    """Data structure for storing key data describe cached block, so that
    evitor could use to make its decision which one to choose for eviction

--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -240,7 +240,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
    if is_distributed:
        get_world_group().barrier()
    logger.info("reading GPU P2P access cache from %s", path)
-    with open(path, "r") as f:
+    with open(path) as f:
        cache = json.load(f)
    _gpu_p2p_access_cache = cache
    return _gpu_p2p_access_cache[f"{src}->{tgt}"]

--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -812,7 +812,7 @@ class AsyncLLMEngine(EngineClient):
    async def run_engine_loop(engine_ref: ReferenceType):
        """We use a weakref to the engine so that the running loop
        doesn't prevent the engine being garbage collected."""
-        engine: Optional["AsyncLLMEngine"] = engine_ref()
+        engine: Optional[AsyncLLMEngine] = engine_ref()
        if not engine:
            return

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1541,8 +1541,8 @@ class LLMEngine:
                seq_group.state.remaining_steps != ref_remaining_steps
                for seq_group in seq_group_metadata_list[1:]
        ]):
-            raise AssertionError(("All running sequence groups should "
+            raise AssertionError("All running sequence groups should "
-                                  "have the same remaining steps."))
+                                 "have the same remaining steps.")
        return ref_remaining_steps > 0

--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -77,7 +77,7 @@ class StatLoggerBase(ABC):
        self.num_generation_tokens: List[int] = []
        self.last_local_log = time.time()
        self.local_interval = local_interval
-        self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
+        self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None
    @abstractmethod
    def log(self, stats: Stats) -> None:

--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -63,7 +63,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
            single_step_process_prompt_logprob(self, seq_group, output)
    @staticmethod
-    @functools.lru_cache()
+    @functools.lru_cache
    def _log_prompt_logprob_unsupported_warning_once():
        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
        # If the feature combo become valid

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -362,7 +362,7 @@ def load_chat_template(
    if chat_template is None:
        return None
    try:
-        with open(chat_template, "r") as f:
+        with open(chat_template) as f:
            resolved_chat_template = f.read()
    except OSError as e:
        if isinstance(chat_template, Path):

--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -120,7 +120,7 @@ async def read_file(path_or_url: str) -> str:
                   session.get(path_or_url) as resp:
            return await resp.text()
    else:
-        with open(path_or_url, "r", encoding="utf-8") as f:
+        with open(path_or_url, encoding="utf-8") as f:
            return f.read()

--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -32,7 +32,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
    uses_ray: bool = True
    def _init_executor(self) -> None:
-        self.forward_dag: Optional["ray.dag.CompiledDAG"] = None
+        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
        # If the env var is set, it uses the Ray's compiled DAG API
        # which optimizes the control plane overhead.
        # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.

--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -67,8 +67,7 @@ def _configure_vllm_root_logger() -> None:
            raise RuntimeError(
                "Could not load logging config. File does not exist: %s",
                VLLM_LOGGING_CONFIG_PATH)
-        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8",
+        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
-                  mode="r") as file:
            custom_config = json.loads(file.read())
        if not isinstance(custom_config, dict):

--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -343,7 +343,7 @@ class LoRAModelManager(AdapterModelManager):
            # text modules (e.g. ChatGLM)
            and hasattr(self.model, "get_mm_mapping"))
        self.packed_modules: Dict[str, List[str]] = {}
-        self.modules: Dict[str, "BaseLayerWithLoRA"] = {}
+        self.modules: Dict[str, BaseLayerWithLoRA] = {}
        # Dict instead of a Set for compatibility with LRUCache.
        self._last_mapping: Optional[LoRAMapping] = None
        self._create_lora_modules()
@@ -548,7 +548,7 @@ class LoRAModelManager(AdapterModelManager):
            else:
                parts = module_name.split(".")
                replacements = self.packed_modules_mapping[parts[-1]]
-                subloras: List[Optional["LoRALayerWeights"]] = []
+                subloras: List[Optional[LoRALayerWeights]] = []
                for i, r in enumerate(replacements):
                    lora = LoRALayerWeights.create_dummy_lora_weights(
                        module_name + "." + r,