#!/usr/bin/env python3
"""
Example: ModelOpt Quantization and Export with SGLang

This example demonstrates the streamlined workflow for quantizing a model with
ModelOpt and automatically exporting it for deployment with SGLang.
"""

import argparse
import os
from typing import Optional

import torch

import sglang as sgl
from sglang.srt.configs.device_config import DeviceConfig
from sglang.srt.configs.load_config import LoadConfig
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.distributed.parallel_state import (
    init_distributed_environment,
    initialize_model_parallel,
)
from sglang.srt.model_loader.loader import get_model_loader


def _validate_export(export_dir: str) -> bool:
    """Validate that an exported model directory contains the expected files."""
    import glob

    required_files = ["config.json", "tokenizer_config.json"]

    if not os.path.exists(export_dir):
        return False

    # Check required files
    for file in required_files:
        if not os.path.exists(os.path.join(export_dir, file)):
            return False

    # Check for model files using pattern matching to handle sharded models
    model_patterns = [
        "model*.safetensors",
        "pytorch_model*.bin",
    ]
    has_model_file = False
    for pattern in model_patterns:
        matching_files = glob.glob(os.path.join(export_dir, pattern))
        if matching_files:
            has_model_file = True
            break

    return has_model_file


def _get_export_info(export_dir: str) -> Optional[dict]:
    """Get information about an exported model."""
    import json

    if not _validate_export(export_dir):
        return None

    try:
        config_path = os.path.join(export_dir, "config.json")
        with open(config_path, "r") as f:
            config = json.load(f)

        return {
            "model_type": config.get("model_type", "unknown"),
            "architectures": config.get("architectures", []),
            "quantization_config": config.get("quantization_config", {}),
            "export_dir": export_dir,
        }
    except Exception:
        return None
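
# Illustrative only: a hypothetical return value from _get_export_info for an
# FP8 export. The field values below are made up for documentation purposes,
# not taken from a real run:
#
#     {
#         "model_type": "llama",
#         "architectures": ["LlamaForCausalLM"],
#         "quantization_config": {"quant_method": "modelopt"},
#         "export_dir": "./quantized_model",
#     }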

def quantize_and_export_model(
    model_path: str,
    export_dir: str,
    quantization_method: str = "modelopt_fp8",
    checkpoint_save_path: Optional[str] = None,
    device: str = "cuda",
) -> None:
    """
    Quantize a model with ModelOpt and export it for SGLang deployment.

    Args:
        model_path: Path to the original model
        export_dir: Directory to export the quantized model
        quantization_method: Quantization method ("modelopt_fp8" or "modelopt_fp4")
        checkpoint_save_path: Optional path to save ModelOpt checkpoint
        device: Device to use for quantization
    """
    print("🚀 Starting ModelOpt quantization and export workflow")
    print(f"📥 Input model: {model_path}")
    print(f"📤 Export directory: {export_dir}")
    print(f"⚙️ Quantization method: {quantization_method}")

    # Initialize minimal distributed environment for single GPU quantization
    if not torch.distributed.is_initialized():
        print("🔧 Initializing distributed environment...")

        # Set up environment variables for single-process distributed
        os.environ["RANK"] = "0"
        os.environ["WORLD_SIZE"] = "1"
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"  # Use a different port than tests
        os.environ["LOCAL_RANK"] = "0"

        init_distributed_environment(
            world_size=1,
            rank=0,
            local_rank=0,
            backend="nccl" if device == "cuda" else "gloo",
        )
        initialize_model_parallel(
            tensor_model_parallel_size=1,
            pipeline_model_parallel_size=1,
        )

    # Configure model loading with ModelOpt quantization and export
    model_config = ModelConfig(
        model_path=model_path,
        quantization=quantization_method,  # Use unified quantization flag
        trust_remote_code=True,
    )

    load_config = LoadConfig(
        modelopt_checkpoint_save_path=checkpoint_save_path,
        modelopt_export_path=export_dir,
    )

    device_config = DeviceConfig(device=device)

    # Load and quantize the model (export happens automatically)
    print("🔄 Loading and quantizing model...")
    model_loader = get_model_loader(load_config, model_config)

    try:
        model_loader.load_model(
            model_config=model_config,
            device_config=device_config,
        )
        print("✅ Model quantized successfully!")

        # Validate the export
        if _validate_export(export_dir):
            print("✅ Export validation passed!")
            info = _get_export_info(export_dir)
            if info:
                print("📋 Model info:")
                print(f" - Type: {info['model_type']}")
                print(f" - Architecture: {info['architectures']}")
                print(f" - Quantization: {info['quantization_config']}")
        else:
            print("❌ Export validation failed!")
            return

    except Exception as e:
        print(f"❌ Quantization failed: {e}")
        return

    print("\n🎉 Workflow completed successfully!")
    print(f"📁 Quantized model exported to: {export_dir}")
    print("\n🚀 To use the exported model:")
    print(
        f" python -m sglang.launch_server --model-path {export_dir} --quantization modelopt"
    )
    print("\n # Or in Python:")
    print(" import sglang as sgl")
    print(f" llm = sgl.Engine(model_path='{export_dir}', quantization='modelopt')")
    print(" # Note: 'modelopt' auto-detects FP4/FP8 from model config")
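
# Optional sketch (not part of the original workflow): once the exported model
# is served with `python -m sglang.launch_server --model-path <export_dir>
# --quantization modelopt`, it can be queried over HTTP. The /generate endpoint
# and payload shape below follow the SGLang server documentation, but this
# helper (query_server_example) and its defaults are illustrative assumptions;
# adjust them to your deployment. Requires the `requests` package.
def query_server_example(
    prompt: str = "Hello, how are you?",
    host: str = "127.0.0.1",
    port: int = 30000,
) -> None:
    """Send a single prompt to a running SGLang server (illustrative sketch)."""
    import requests

    response = requests.post(
        f"http://{host}:{port}/generate",
        json={
            "text": prompt,
            "sampling_params": {
                "temperature": 0.8,
                "top_p": 0.95,
                "max_new_tokens": 100,
            },
        },
        timeout=120,
    )
    response.raise_for_status()
    print(response.json().get("text", ""))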

def deploy_exported_model(
    export_dir: str,
    host: str = "127.0.0.1",
    port: int = 30000,
) -> None:
    """
    Deploy an exported ModelOpt quantized model with SGLang.

    Args:
        export_dir: Directory containing the exported model
        host: Host to bind the server to
        port: Port to bind the server to
    """
    print(f"🚀 Deploying exported model from: {export_dir}")

    # Validate export first
    if not _validate_export(export_dir):
        print("❌ Invalid export directory!")
        return

    try:
        # Launch SGLang engine with the exported model
        # Using generic "modelopt" for auto-detection of FP4/FP8
        llm = sgl.Engine(
            model_path=export_dir,
            quantization="modelopt",
            host=host,
            port=port,
        )

        print("✅ Model deployed successfully!")
        print(f"🌐 Server running at http://{host}:{port}")

        # Example inference
        prompts = ["Hello, how are you?", "What is the capital of France?"]
        sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 100}

        print("\n🧪 Running example inference...")
        outputs = llm.generate(prompts, sampling_params)

        for i, output in enumerate(outputs):
            print(f"Prompt {i+1}: {prompts[i]}")
            print(f"Output: {output['text']}")
            print()

    except Exception as e:
        print(f"❌ Deployment failed: {e}")


def main():
    parser = argparse.ArgumentParser(
        description="ModelOpt Quantization and Export with SGLang",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Quantize and export a model (recommended workflow)
  python modelopt_quantize_and_export.py quantize \\
      --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 \\
      --export-dir ./quantized_model \\
      --quantization-method modelopt_fp8

  # Deploy a pre-exported model
  python modelopt_quantize_and_export.py deploy \\
      --export-dir ./quantized_model
        """,
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Quantize command
    quantize_parser = subparsers.add_parser(
        "quantize", help="Quantize and export a model"
    )
    quantize_parser.add_argument(
        "--model-path", required=True, help="Path to the model to quantize"
    )
    quantize_parser.add_argument(
        "--export-dir", required=True, help="Directory to export the quantized model"
    )
    quantize_parser.add_argument(
        "--quantization-method",
        choices=["modelopt_fp8", "modelopt_fp4"],
        default="modelopt_fp8",
        help="Quantization method to use",
    )
    quantize_parser.add_argument(
        "--checkpoint-save-path", help="Optional path to save ModelOpt checkpoint"
    )
    quantize_parser.add_argument(
        "--device", default="cuda", help="Device to use for quantization"
    )

    # TODO: Quantize-and-serve command removed due to compatibility issues
    # Use the separate quantize-then-deploy workflow instead

    # Deploy command
    deploy_parser = subparsers.add_parser("deploy", help="Deploy an exported model")
    deploy_parser.add_argument(
        "--export-dir", required=True, help="Directory containing the exported model"
    )
    deploy_parser.add_argument(
        "--host", default="127.0.0.1", help="Host to bind the server to"
    )
    deploy_parser.add_argument(
        "--port", type=int, default=30000, help="Port to bind the server to"
    )

    args = parser.parse_args()

    if args.command == "quantize":
        quantize_and_export_model(
            model_path=args.model_path,
            export_dir=args.export_dir,
            quantization_method=args.quantization_method,
            checkpoint_save_path=args.checkpoint_save_path,
            device=args.device,
        )
    elif args.command == "deploy":
        deploy_exported_model(
            export_dir=args.export_dir,
            host=args.host,
            port=args.port,
        )
    else:
        parser.print_help()


if __name__ == "__main__":
    main()