# multimodal-llava.yaml — functional-test workload definition for the
# multimodal LLaVA model (Megatron-LM CI).
# Workload metadata: a "basic" test definition in schema format_version 1.
type: basic
format_version: 1
maintainers: [mcore]
loggers: [stdout]
# Launcher configuration. NOTE(review): `type:slurm` (no space) is parsed by
# YAML as a single literal key, not as a nested `type: slurm` mapping —
# presumably the consuming harness's launcher-selector syntax; confirm before
# "fixing" the missing space.
launchers:
  type:slurm:
    # One Slurm task per GPU; '{gpus}' is a template placeholder filled in
    # from each `products` entry below (quoted so it stays a string scalar).
    ntasks_per_node: '{gpus}'
# Job template: one concrete test job is generated per combination expanded
# from `products`; single-brace `{...}` tokens are template placeholders.
spec:
  name: '{test_case}'
  model: multimodal-llava
  build: mcore-pyt-{environment}
  nodes: 1
  gpus: 8  # default; each `products` entry overrides this via its `gpus` list
  platforms: dgx_a100
  time_limit: 1800  # presumably seconds (30 min) — TODO confirm harness units
  scope: null  # no default scope; supplied per-product (`scope: [mr]`) below
  # Bash entry point, `|-` strips the trailing newline. Doubled braces
  # `${{ARGUMENTS[@]}}` presumably escape to a literal `${ARGUMENTS[@]}` after
  # template substitution — verify against the renderer. Do not add lines
  # inside this scalar: they would become part of the executed script.
  script: |-
    ls
    cd /opt/megatron-lm

    ARGUMENTS=(
        "DATA_PATH='-'"
        "DATA_CACHE_PATH='-'"
        "OUTPUT_PATH={assets_dir}"
        "TENSORBOARD_PATH={assets_dir}/tensorboard"
        "CHECKPOINT_PATH=/workspace/checkpoints"
        "TRAINING_SCRIPT_PATH=pretrain_vlm.py"
        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
        "N_REPEAT={n_repeat}"
    )

    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}

# Expansion matrix: each entry is crossed with `spec` to produce concrete
# jobs (one per environment x test_case combination in that entry).
products:
  # 8-GPU cases (names end in 1N8G: 1 node, 8 GPUs).
  - environment: [lts, dev]
    scope: [mr]
    n_repeat: [5]
    gpus: [8]
    test_case:
      - multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
      - multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G
  # 7-GPU cases (names end in 1N7G) — same scope/repeat, reduced GPU count.
  - environment: [lts, dev]
    scope: [mr]
    n_repeat: [5]
    gpus: [7]
    test_case:
      - multimodal_llava_mr_mcore_te_tp4_pp1_etp3_dgx_a100_1N7G
      - multimodal_llava_mr_mcore_te_tp4_pp1_resume_torch_etp3_dgx_a100_1N7G