# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Registry of known model configurations, keyed by model name.
#
# Many entries share identical option lists and template arguments, so the
# repeated pieces are produced by the small private factories below. Every
# factory returns a *new* list/dict on each call: no two entries share a
# mutable object, so mutating one entry never affects another.
#
# Values of the form "{args.<name>}" are placeholders stored verbatim here.
# NOTE(review): presumably the consumer substitutes them (str.format style)
# with parsed command-line arguments -- confirm against the caller.

# Template locations that are referenced by absolute path.
_MOCK_TEMPLATE_DIR = (
    "/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock"
)
_DISAGG_CONTEXT_TEMPLATE = (
    "/workspace/examples/disaggregated_serving/tensorrtllm_templates/context"
)
_DISAGG_GENERATE_TEMPLATE = (
    "/workspace/examples/disaggregated_serving/tensorrtllm_templates/generate"
)

# Option/value pairs whose relative order differs between entries.
# Tuples are immutable, so sharing them between entries is safe.
_MULTIPLE_PROFILES = ("--multiple_profiles", "enable")
_REDUCE_FUSION = ("--reduce_fusion", "{args.reduce_fusion}")


def _flags(*pairs):
    """Flatten ``(option, value)`` pairs into one flat list of tokens.

    The order of ``pairs`` is preserved and a new list is returned per call.
    """
    return [token for pair in pairs for token in pair]


def _fp8_quantize_convert():
    """Return the ``convert`` list of the entries that use ``quantization/quantize.py``."""
    return ["quantization/quantize.py"] + _flags(
        ("--dtype", "float16"),
        ("--qformat", "fp8"),
        ("--calib_size", "512"),
        ("--kv_cache_dtype", "fp8"),
    )


def _checkpoint_convert():
    """Return the ``convert`` list of the entries that use ``llama/convert_checkpoint.py``."""
    return ["llama/convert_checkpoint.py"] + _flags(("--dtype", "float16"))


def _plugin_build():
    """Return the ``build`` list shared by the ``convert_checkpoint.py`` based entries."""
    return _flags(
        ("--remove_input_padding", "enable"),
        ("--gpt_attention_plugin", "float16"),
        ("--context_fmha", "enable"),
        ("--gemm_plugin", "float16"),
        ("--paged_kv_cache", "enable"),
    )


def _fp8_build(max_seq_len, *trailing_pairs):
    """Return a ``build`` list with the common fp8 prefix plus ``trailing_pairs``.

    Args:
        max_seq_len: Value (a string) emitted after ``--max_seq_len``.
        *trailing_pairs: ``(option, value)`` pairs appended, in the given order,
            after the common prefix. The order is entry specific, which is why
            it is supplied by the caller instead of being fixed here.
    """
    return _flags(
        ("--gpt_attention_plugin", "float16"),
        ("--workers", "{args.tp_size}"),
        ("--max_seq_len", max_seq_len),
        ("--use_fused_mlp", "enable"),
        *trailing_pairs,
    )


def _standard_templates():
    """Return the six-item template list used by the non-disaggregated llama entries.

    Items are either a plain string or a two-item tuple; both forms are kept
    exactly as they appear in the individual entries.
    """
    return [
        "preprocessing",
        "postprocessing",
        "ensemble",
        ("tensorrt_llm", "context"),
        ("tensorrt_llm", "generate"),
        "tensorrt_llm",
    ]


def _trtllm_template_arguments(decoupled_mode="True", participant_ids="0", extra=None):
    """Return the ``template_arguments`` mapping shared by the llama entries.

    Args:
        decoupled_mode: String stored under ``"decoupled_mode"``.
        participant_ids: String stored under ``"participant_ids"``.
        extra: Optional mapping whose items are inserted directly after
            ``"triton_backend"`` so the resulting key order matches the
            entry that needs the additional keys.

    Returns:
        A new dict; callers may mutate it without affecting other entries.
    """
    arguments = {
        "triton_max_batch_size": "{args.max_batch_size}",
        "decoupled_mode": decoupled_mode,
        "preprocessing_instance_count": "{args.preprocessing_instance_count}",
        "postprocessing_instance_count": "{args.postprocessing_instance_count}",
        "triton_backend": "tensorrtllm",
    }
    if extra:
        arguments.update(extra)
    arguments.update(
        {
            "max_beam_width": "1",
            "engine_dir": "{args.tensorrtllm_engine}",
            "exclude_input_in_output": "True",
            "enable_kv_cache_reuse": "False",
            "batching_strategy": "inflight_fused_batching",
            "max_queue_delay_microseconds": "0",
            "max_queue_size": "0",
            "participant_ids": participant_ids,
            "tokenizer_dir": "{args.hf_download}",
            "encoder_input_features_data_type": "TYPE_FP16",
        }
    )
    return arguments


KNOWN_MODELS = {
    # Mock entry: its last three templates are (directory, name) tuples that
    # point at the mock template directory.
    "mock": {
        "hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "download_patterns": ["*.json"],
        "max_num_tokens": 2048,
        "max_batch_size": 512,
        "templates": ["preprocessing", "postprocessing", "ensemble"]
        + [
            (_MOCK_TEMPLATE_DIR, template_name)
            for template_name in ("context", "generate", "tensorrt_llm")
        ],
        "template_arguments": {
            "tokenizer_dir": "{args.hf_download}",
            "triton_max_batch_size": "{args.max_batch_size}",
            "preprocessing_instance_count": "{args.preprocessing_instance_count}",
            "postprocessing_instance_count": "{args.postprocessing_instance_count}",
            "context_token_latency_ms": "0.1",
            "generate_token_latency_ms": "0.5",
        },
    },
    # Llama 3.1 entries.
    "llama-3.1-70b-instruct": {
        "hf_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
        "download_model_name": "llama-3.1-70b-instruct",
        "convert": _fp8_quantize_convert(),
        # Unlike the other fp8 entries this build list carries no "--workers"
        # option, so it is spelled out instead of using _fp8_build().
        "build": _flags(
            ("--gpt_attention_plugin", "float16"),
            ("--max_seq_len", "131072"),
            ("--use_fused_mlp", "enable"),
            ("--reduce_fusion", "disable"),
            _MULTIPLE_PROFILES,
            ("--use_paged_context_fmha", "enable"),
        ),
        "max_num_tokens": 2048,
        "max_batch_size": 512,
        "templates": _standard_templates(),
        "template_arguments": _trtllm_template_arguments(
            participant_ids="{args.participant_ids}",
            extra={"enable_chunked_context": "{args.enable_chunked_context}"},
        ),
    },
    "llama-3.1-8b-instruct": {
        "hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "download_model_name": "llama-3.1-8b-instruct",
        "convert": _checkpoint_convert(),
        "build": _plugin_build(),
        "max_num_tokens": 16384,
        "max_batch_size": 64,
        "templates": _standard_templates(),
        "template_arguments": _trtllm_template_arguments(),
    },
    # Llama 3 8B entries. The "-generate" and "-context" variants share one
    # model_repo_name and one download_model_name.
    "llama-3-8b-instruct-generate": {
        "hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
        "max_batch_size": 256,
        "model_repo_name": "llama-3-8b-instruct-disaggregated",
        "download_model_name": "llama-3-8b-instruct",
        "convert": _fp8_quantize_convert(),
        "build": _fp8_build("1024", _MULTIPLE_PROFILES),
        "max_num_tokens": 256,
        "templates": [
            ("tensorrt_llm", "generate"),
            "postprocessing",
        ],
        "template_arguments": _trtllm_template_arguments(),
    },
    "llama-3-8b-instruct-context": {
        "hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
        "max_batch_size": 256,
        "model_repo_name": "llama-3-8b-instruct-disaggregated",
        "download_model_name": "llama-3-8b-instruct",
        "convert": _fp8_quantize_convert(),
        "build": _fp8_build("8192", _MULTIPLE_PROFILES),
        "max_num_tokens": 8192,
        "templates": [
            _DISAGG_CONTEXT_TEMPLATE,
            "preprocessing",
        ],
        # NOTE(review): the only entry with decoupled_mode "False" (the 70B
        # context entry uses "True") -- confirm this difference is intended.
        "template_arguments": _trtllm_template_arguments(decoupled_mode="False"),
    },
    "llama-3-8b-instruct": {
        "hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
        "convert": _fp8_quantize_convert(),
        # "--multiple_profiles" precedes "--reduce_fusion" here; the 70B
        # entries list the two options the other way round.
        "build": _fp8_build("8192", _MULTIPLE_PROFILES, _REDUCE_FUSION),
        "max_num_tokens": 16384,
        "max_batch_size": 512,
        "templates": _standard_templates(),
        "template_arguments": _trtllm_template_arguments(),
    },
    # NOTE(review): this is the only entry without "max_num_tokens" -- confirm
    # that consumers tolerate the missing key.
    "llama-3-8b-instruct-default": {
        "hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
        "download_model_name": "llama-3-8b-instruct",
        "convert": _checkpoint_convert(),
        "build": _plugin_build(),
        "max_batch_size": 64,
        "templates": _standard_templates(),
        "template_arguments": _trtllm_template_arguments(),
    },
    # Llama 3 70B entries.
    # NOTE(review): "disaggegated" in model_repo_name below looks like a
    # misspelling of "disaggregated". It is kept byte-for-byte because the
    # value is runtime data that other code or docs may depend on -- confirm
    # before renaming.
    "llama-3-70b-instruct-context": {
        "hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
        "download_model_name": "llama-3-70b-instruct",
        "model_repo_name": "llama-3-70b-disaggegated",
        "max_batch_size": 128,
        "convert": _fp8_quantize_convert(),
        "build": _fp8_build("8192", _REDUCE_FUSION, _MULTIPLE_PROFILES),
        "max_num_tokens": 8192,
        "templates": [
            "preprocessing",
            _DISAGG_CONTEXT_TEMPLATE,
        ],
        "template_arguments": _trtllm_template_arguments(
            participant_ids="{args.participant_ids}",
        ),
    },
    "llama-3-70b-instruct-generate": {
        "hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
        "download_model_name": "llama-3-70b-instruct",
        "model_repo_name": "llama-3-70b-disaggegated",  # sic, see note above
        "max_batch_size": 128,
        "convert": _fp8_quantize_convert(),
        "build": _fp8_build("1024", _REDUCE_FUSION, _MULTIPLE_PROFILES),
        "max_num_tokens": 128,
        "templates": [
            "postprocessing",
            _DISAGG_GENERATE_TEMPLATE,
        ],
        "template_arguments": _trtllm_template_arguments(
            participant_ids="{args.participant_ids}",
        ),
    },
    "llama-3-70b-instruct": {
        "hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
        "max_batch_size": 512,
        "convert": _fp8_quantize_convert(),
        "build": _fp8_build("8192", _REDUCE_FUSION, _MULTIPLE_PROFILES),
        "max_num_tokens": 16384,
        # Only four templates: no ("tensorrt_llm", "context") / ("tensorrt_llm",
        # "generate") tuples, unlike _standard_templates().
        "templates": [
            "preprocessing",
            "postprocessing",
            "ensemble",
            "tensorrt_llm",
        ],
        "template_arguments": _trtllm_template_arguments(
            participant_ids="{args.participant_ids}",
        ),
    },
}