"""Minimal smoke test for compactor-vllm (no speculative decoding).

Parses command-line options, builds an ``LLM`` from an ``LLMConfig``, runs a
single chat prompt through ``LLM.generate_chat`` with the selected compression
settings, prints the first output and shuts the engine down.
"""

import argparse
import inspect
import logging
import os
import sys
from pathlib import Path
from typing import Any, Callable, Dict


def _maybe_add_src_to_path() -> None:
    """Prepend the repository ``src`` directory to ``sys.path`` when present.

    The directory is resolved as ``<parent of this file's directory>/src`` and
    is only inserted if it exists and is not already on ``sys.path``.
    """
    # Allow running without `pip install -e .` by pointing to `compactor-vllm/src`.
    here = Path(__file__).resolve()
    repo_root = here.parents[1]
    src_dir = repo_root / "src"
    if src_dir.is_dir() and str(src_dir) not in sys.path:
        sys.path.insert(0, str(src_dir))


# Must run before the project imports below so they resolve from a source checkout.
_maybe_add_src_to_path()

from compactor_vllm import LLM, LLMConfig, SamplingParams  # noqa: E402
from compactor_vllm.compression import (  # noqa: E402
    BatchCompressionParams,
    CompressionMethod,
    SequenceCompressionParams,
)
from compactor_vllm.config.engine_config import AttentionBackend  # noqa: E402

logger = logging.getLogger(__name__)


def _parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    Several defaults are read from environment variables (``MODEL``, ``TP``,
    ``NCCL_PORT``, ``GPU_MEMORY_UTILIZATION``) when they are set.

    Returns:
        The parsed arguments.
    """
    parser = argparse.ArgumentParser(
        description="Minimal smoke test for compactor-vllm (no speculative decoding)."
    )

    # --- Model / engine options -------------------------------------------
    parser.add_argument(
        "--model",
        type=str,
        default=os.environ.get("MODEL", "/mnt/data/llm-models/Qwen3-8B"),
        help="Local model directory or HF id. In the container this is usually a local dir.",
    )
    parser.add_argument(
        "--tp",
        type=int,
        default=int(os.environ.get("TP", "1")),
        help="Tensor parallel size (world size).",
    )
    parser.add_argument(
        "--nccl-port",
        type=int,
        default=int(os.environ.get("NCCL_PORT", "1218")),
        help="TCP port for torch.distributed init (only used for NCCL init_method=tcp://localhost:).",
    )
    parser.add_argument("--max-model-len", type=int, default=2048)
    parser.add_argument("--max-num-seqs", type=int, default=2)
    parser.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=float(os.environ.get("GPU_MEMORY_UTILIZATION", "0.9")),
        help="Fraction of total GPU memory used for KV cache + activations.",
    )
    parser.add_argument(
        "--attention-backend",
        type=str,
        default="compactor_triton",
        choices=[b.name.lower() for b in AttentionBackend],
    )

    # --- Compression options ----------------------------------------------
    parser.add_argument(
        "--compression-method",
        type=str,
        default="compactor",
        choices=[m.name.lower() for m in CompressionMethod],
    )
    parser.add_argument(
        "--compression-ratio",
        type=float,
        default=0.8,
        # `%%` is required because argparse %-formats help strings.
        help="Sequence-level compression ratio (e.g. 0.8 keeps 80%% of tokens).",
    )
    parser.add_argument("--chunk-size", type=int, default=512)
    parser.add_argument(
        "--no-chunked-compression",
        dest="do_chunked_compression",
        action="store_false",
    )
    parser.set_defaults(do_chunked_compression=True)

    # --- Prompt / sampling options ----------------------------------------
    parser.add_argument("--prompt", type=str, default="用一句话介绍你自己,给我讲一个故事,200字左右。")
    parser.add_argument("--max-new-tokens", type=int, default=64)
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="0.0 = greedy decoding (recommended for smoke tests).",
    )

    # --- Chat-template options (paired on/off flags sharing one dest) -----
    parser.add_argument(
        "--tokenizer-enable-thinking",
        dest="tokenizer_enable_thinking",
        action="store_true",
        help="Pass enable_thinking=True to tokenizer.apply_chat_template (if supported).",
    )
    parser.add_argument(
        "--no-tokenizer-enable-thinking",
        dest="tokenizer_enable_thinking",
        action="store_false",
        help="Pass enable_thinking=False to tokenizer.apply_chat_template (if supported).",
    )
    parser.set_defaults(tokenizer_enable_thinking=False)

    parser.add_argument(
        "--tokenizer-add-generation-prompt",
        dest="tokenizer_add_generation_prompt",
        action="store_true",
        help="Pass add_generation_prompt=True to tokenizer.apply_chat_template (if supported).",
    )
    parser.add_argument(
        "--no-tokenizer-add-generation-prompt",
        dest="tokenizer_add_generation_prompt",
        action="store_false",
        help="Pass add_generation_prompt=False to tokenizer.apply_chat_template (if supported).",
    )
    parser.set_defaults(tokenizer_add_generation_prompt=True)

    parser.add_argument(
        "--tokenizer-continue-final-message",
        dest="tokenizer_continue_final_message",
        action="store_true",
        help="Pass continue_final_message=True to tokenizer.apply_chat_template (if supported).",
    )
    parser.add_argument(
        "--no-tokenizer-continue-final-message",
        dest="tokenizer_continue_final_message",
        action="store_false",
        help="Pass continue_final_message=False to tokenizer.apply_chat_template (if supported).",
    )
    parser.set_defaults(tokenizer_continue_final_message=False)

    # --- Output decoding / logging ----------------------------------------
    parser.add_argument(
        "--skip-special-tokens",
        dest="skip_special_tokens",
        action="store_true",
        help="Skip special tokens in output decoding (recommended).",
    )
    parser.add_argument(
        "--no-skip-special-tokens",
        dest="skip_special_tokens",
        action="store_false",
        help="Keep special tokens in output decoding (e.g. <|im_end|>).",
    )
    parser.set_defaults(skip_special_tokens=True)

    parser.add_argument(
        "--log-level",
        type=str,
        default="INFO",
        choices=["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"],
    )
    return parser.parse_args()


def _filter_supported_kwargs(
    func: Callable[..., Any], kwargs: Dict[str, Any]
) -> Dict[str, Any]:
    """Return the subset of ``kwargs`` that ``func`` can accept.

    If ``func`` declares a ``**kwargs`` catch-all, every entry is kept: names
    that are not explicit parameters (for example chat-template variables such
    as ``enable_thinking``) are still accepted by such a callable. If the
    signature cannot be inspected, ``kwargs`` is returned unfiltered.

    Args:
        func: Callable whose signature is inspected.
        kwargs: Candidate keyword arguments.

    Returns:
        A new dict; ``kwargs`` itself is never mutated.
    """
    try:
        params = inspect.signature(func).parameters
    except (TypeError, ValueError) as exc:
        # Best effort: some callables expose no introspectable signature.
        logger.debug("Could not inspect %r; passing kwargs unfiltered: %s", func, exc)
        return dict(kwargs)

    accepts_var_keyword = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )
    if accepts_var_keyword:
        return dict(kwargs)
    return {k: v for k, v in kwargs.items() if k in params}


def main() -> None:
    """Run one chat generation end-to-end and print the first output."""
    args = _parse_args()
    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    # CLI choices are the lower-cased enum member names, so upper-case to look up.
    attention_backend = AttentionBackend[args.attention_backend.upper()]
    compression_method = CompressionMethod[args.compression_method.upper()]

    cfg = LLMConfig(
        model=args.model,
        path=args.model,
        tensor_parallel_size=args.tp,
        nccl_port=args.nccl_port,
        max_model_len=args.max_model_len,
        max_num_seqs=args.max_num_seqs,
        gpu_memory_utilization=args.gpu_memory_utilization,
        enforce_eager=True,
        attention_backend=attention_backend,
        show_progress_bar=False,
    )
    llm = LLM(cfg)

    # Always shut the engine down, even if tokenization or generation fails.
    try:
        tokenizer_kwargs = {
            "add_generation_prompt": args.tokenizer_add_generation_prompt,
            "enable_thinking": args.tokenizer_enable_thinking,
            "continue_final_message": args.tokenizer_continue_final_message,
        }
        if (
            tokenizer_kwargs["add_generation_prompt"]
            and tokenizer_kwargs["continue_final_message"]
        ):
            # HF tokenizer API rejects these being simultaneously True.
            logger.warning(
                "add_generation_prompt and continue_final_message are both set; "
                "forcing continue_final_message=False."
            )
            tokenizer_kwargs["continue_final_message"] = False

        # Be defensive: only pass kwargs supported by this tokenizer build.
        tokenizer_kwargs = _filter_supported_kwargs(
            llm.tokenizer.apply_chat_template, tokenizer_kwargs
        )

        outs = llm.generate_chat(
            [[{"role": "user", "content": args.prompt}]],
            sampling_params=SamplingParams(
                temperature=args.temperature,
                max_new_tokens=args.max_new_tokens,
            ),
            batch_compression_params=BatchCompressionParams(
                compression_method=compression_method,
                do_chunked_compression=args.do_chunked_compression,
                chunk_size=args.chunk_size,
            ),
            per_sequence_compression_params=SequenceCompressionParams(
                compression_ratio=args.compression_ratio,
            ),
            tokenizer_kwargs=tokenizer_kwargs,
            detokenizer_kwargs={"skip_special_tokens": args.skip_special_tokens},
        )
        print(outs[0])
    finally:
        llm.exit()


if __name__ == "__main__":
    main()