Unverified commit 641b1ee7 authored by Hongxin Liu, committed by GitHub

[devops] remove post commit ci (#5566)

* [devops] remove post commit ci

* [misc] run pre-commit on all files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent 341263df
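The changes below are formatting-only: the post-commit CI job is removed and the repository's pre-commit hooks were run once over every file. As a minimal sketch (assuming pre-commit is installed locally and configured by the repo's .pre-commit-config.yaml), the same pass can be reproduced like this:

# Minimal sketch: reproduce the formatting pass locally (assumes the `pre-commit` CLI is installed).
import subprocess

# "pre-commit run --all-files" applies every configured hook (formatters, import sorters,
# end-of-file fixers, ...) to the whole tree, which is what produced the diffs below.
subprocess.run(["pre-commit", "run", "--all-files"], check=False)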
@@ -121,4 +121,4 @@ class RandomDataset(Dataset):
             "input_ids": self.input_ids[idx],
             "attention_mask": self.attention_mask[idx],
             "labels": self.input_ids[idx],
-        }
\ No newline at end of file
+        }
@@ -270,9 +270,7 @@ def main():
         ) as pbar:
             for step in pbar:
                 if use_pipeline:
-                    outputs = booster.execute_pipeline(
-                        dataloader_iter, model, _criterion, optimizer, return_loss=True
-                    )
+                    outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
                     loss = outputs["loss"]
                 else:
                     batch = next(dataloader_iter)
...
@@ -285,9 +285,7 @@ def main():
         ) as pbar:
             for step in pbar:
                 if use_pipeline:
-                    outputs = booster.execute_pipeline(
-                        dataloader_iter, model, _criterion, optimizer, return_loss=True
-                    )
+                    outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
                     loss = outputs["loss"]
                 else:
                     batch = next(dataloader_iter)
...
@@ -50,7 +50,6 @@ def all_reduce_mean(x: float, world_size: int) -> float:


-
 class Timer:
     def __init__(self) -> None:
         self.start_time: Optional[float] = None
         self.duration: float = 0.0
@@ -112,7 +111,7 @@ class PerformanceEvaluator:
         batch_size, seq_len = input_ids.shape

         self.num_samples += batch_size
-        self.flop += (batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint)))
+        self.flop += batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint))

     def on_fit_end(self) -> None:
         avg_duration = all_reduce_mean(self.timer.duration, self.world_size)
@@ -122,5 +121,6 @@ class PerformanceEvaluator:
         if dist.get_rank() == 0:
             print(
                 f"num_samples: {self.num_samples}, dp_world_size: {self.dp_world_size}, flop: {self.flop}, avg_duration: {avg_duration}, "
-                f"avg_throughput: {avg_throughput}")
+                f"avg_throughput: {avg_throughput}"
+            )
             print(f"Throughput: {avg_throughput:.2f} samples/sec, TFLOPS per GPU: {avg_tflops_per_gpu:.2f}")
@@ -16,17 +16,15 @@ def inference(args):
     tokenizer = T5Tokenizer.from_pretrained("google/umt5-small")
     if args.model == "test":
         config = LlamaConfig.from_pretrained("hpcai-tech/openmoe-base")
-        set_openmoe_args(config,
-                         num_experts=config.num_experts,
-                         moe_layer_interval=config.moe_layer_interval,
-                         enable_kernel=True)
+        set_openmoe_args(
+            config, num_experts=config.num_experts, moe_layer_interval=config.moe_layer_interval, enable_kernel=True
+        )
         model = OpenMoeForCausalLM(config)
     else:
         config = LlamaConfig.from_pretrained(f"hpcai-tech/openmoe-{args.model}")
-        set_openmoe_args(config,
-                         num_experts=config.num_experts,
-                         moe_layer_interval=config.moe_layer_interval,
-                         enable_kernel=False)
+        set_openmoe_args(
+            config, num_experts=config.num_experts, moe_layer_interval=config.moe_layer_interval, enable_kernel=False
+        )
         model = OpenMoeForCausalLM.from_pretrained(f"hpcai-tech/openmoe-{args.model}", config=config)
     model = model.eval().bfloat16()
     model = model.to(torch.cuda.current_device())
...
@@ -172,9 +172,9 @@ def make_state_dict(converted_params):

 def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path):
     """Replaces the params in model witht the T5X converted params."""
     variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
-    converted = convert_t5x_to_pytorch(variables,
-                                       num_layers=config.num_hidden_layers,
-                                       moe_interval=config.moe_layer_interval)
+    converted = convert_t5x_to_pytorch(
+        variables, num_layers=config.num_hidden_layers, moe_interval=config.moe_layer_interval
+    )
     state_dict = make_state_dict(converted)
     model.load_state_dict(state_dict, strict=True)
@@ -203,11 +203,9 @@ def convert_t5x_checkpoint_to_pytorch(t5x_checkpoint_path, config_file, pytorch_
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Converts a native T5X checkpoint into a PyTorch checkpoint.")
     # Required parameters
-    parser.add_argument("--t5x_checkpoint_path",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="Path to the T5X checkpoint.")
+    parser.add_argument(
+        "--t5x_checkpoint_path", default=None, type=str, required=True, help="Path to the T5X checkpoint."
+    )
     parser.add_argument(
         "--config_file",
         default=None,
@@ -215,10 +213,8 @@ if __name__ == "__main__":
         required=True,
         help="The config json file corresponding to the pre-trained T5 model.\nThis specifies the model architecture.",
     )
-    parser.add_argument("--pytorch_dump_path",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="Path to the output PyTorch model.")
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
     args = parser.parse_args()
     convert_t5x_checkpoint_to_pytorch(args.t5x_checkpoint_path, args.config_file, args.pytorch_dump_path)
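For reference, the converter above can also be driven directly from Python instead of the CLI. This is a hypothetical usage sketch: the paths are placeholders, the script name in the comment is assumed, and it presumes convert_t5x_checkpoint_to_pytorch is importable from the converter module:

# Equivalent to the CLI form (script name assumed):
#   python convert_t5x_checkpoint.py --t5x_checkpoint_path ... --config_file ... --pytorch_dump_path ...
convert_t5x_checkpoint_to_pytorch(
    "/path/to/t5x/checkpoint_dir",  # --t5x_checkpoint_path
    "/path/to/config.json",         # --config_file
    "/path/to/pytorch_model.bin",   # --pytorch_dump_path
)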
@@ -41,9 +41,7 @@ def train_epoch(epoch, model, optimizer, _criterion, lr_scheduler, dataloader, b
         # Forward pass
         for _ in pbar:
             if use_pipeline:
-                outputs = booster.execute_pipeline(
-                    dataloader, model, _criterion, optimizer, return_loss=True
-                )
+                outputs = booster.execute_pipeline(dataloader, model, _criterion, optimizer, return_loss=True)
                 # Backward and optimize
                 if is_pp_last_stage:
                     loss = outputs["loss"]
...
 from .cpu_adam_arm import CpuAdamArmExtension
 from .cpu_adam_x86 import CpuAdamX86Extension
-__all__ = ['CpuAdamArmExtension', 'CpuAdamX86Extension']
+__all__ = ["CpuAdamArmExtension", "CpuAdamX86Extension"]

 from .layernorm_cuda import LayerNormCudaExtension
-__all__ = ["LayerNormCudaExtension"]
\ No newline at end of file
+__all__ = ["LayerNormCudaExtension"]

 from .moe_cuda import MoeCudaExtension
-__all__ = ['MoeCudaExtension']
\ No newline at end of file
+__all__ = ["MoeCudaExtension"]

 from .fused_optimizer_cuda import FusedOptimizerCudaExtension
-__all__ = ['FusedOptimizerCudaExtension']
\ No newline at end of file
+__all__ = ["FusedOptimizerCudaExtension"]

 from .scaled_masked_softmax_cuda import ScaledMaskedSoftmaxCudaExtension
 from .scaled_upper_triangle_masked_softmax_cuda import ScaledUpperTriangleMaskedSoftmaxCudaExtension
-__all__ = ['ScaledMaskedSoftmaxCudaExtension', 'ScaledUpperTriangleMaskedSoftmaxCudaExtension']
\ No newline at end of file
+__all__ = ["ScaledMaskedSoftmaxCudaExtension", "ScaledUpperTriangleMaskedSoftmaxCudaExtension"]
 import os

 from . import custom, diffusers, timm, torchaudio, torchvision, transformers
 from .executor import run_fwd, run_fwd_bwd
 from .registry import model_zoo

 # We pick a subset of models for fast testing in order to reduce the total testing time
 COMMON_MODELS = [
-    'custom_hanging_param_model',
-    'custom_nested_model',
-    'custom_repeated_computed_layers',
-    'custom_simple_net',
-    'diffusers_clip_text_model',
-    'diffusers_auto_encoder_kl',
-    'diffusers_unet2d_model',
-    'timm_densenet',
-    'timm_resnet',
-    'timm_swin_transformer',
-    'torchaudio_wav2vec2_base',
-    'torchaudio_conformer',
-    'transformers_bert_for_masked_lm',
-    'transformers_bloom_for_causal_lm',
-    'transformers_falcon_for_causal_lm',
-    'transformers_chatglm_for_conditional_generation',
-    'transformers_llama_for_casual_lm',
-    'transformers_vit_for_masked_image_modeling',
-    'transformers_mistral_for_casual_lm'
+    "custom_hanging_param_model",
+    "custom_nested_model",
+    "custom_repeated_computed_layers",
+    "custom_simple_net",
+    "diffusers_clip_text_model",
+    "diffusers_auto_encoder_kl",
+    "diffusers_unet2d_model",
+    "timm_densenet",
+    "timm_resnet",
+    "timm_swin_transformer",
+    "torchaudio_wav2vec2_base",
+    "torchaudio_conformer",
+    "transformers_bert_for_masked_lm",
+    "transformers_bloom_for_causal_lm",
+    "transformers_falcon_for_causal_lm",
+    "transformers_chatglm_for_conditional_generation",
+    "transformers_llama_for_casual_lm",
+    "transformers_vit_for_masked_image_modeling",
+    "transformers_mistral_for_casual_lm",
 ]
-IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1'
-__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST']
+IS_FAST_TEST = os.environ.get("FAST_TEST", "0") == "1"
+
+__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", "COMMON_MODELS", "IS_FAST_TEST"]
@@ -102,4 +102,4 @@ class ModelZooRegistry(dict):
         return new_dict


-model_zoo = ModelZooRegistry()
\ No newline at end of file
+model_zoo = ModelZooRegistry()
@@ -2,6 +2,7 @@ import torch
 from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
 from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel

 from ..registry import ModelAttribute, model_zoo

+
 # ================================
...
@@ -74,9 +74,7 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
     data = data_gen_fn()
     model.train()
     if booster.plugin.stage_manager is not None:
-        booster.execute_pipeline(
-            _preprocess_data(data), model, _criterion, optimizer, return_loss=True
-        )
+        booster.execute_pipeline(_preprocess_data(data), model, _criterion, optimizer, return_loss=True)
     else:
         output = model(**_preprocess_data(data))
         loss = criterion(output)
@@ -108,9 +106,7 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
     data_for_shard = data_gen_fn()
     data_for_origin = data_gen_fn()
     if booster.plugin.stage_manager is not None:
-        booster.execute_pipeline(
-            _preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True
-        )
+        booster.execute_pipeline(_preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True)
         booster.execute_pipeline(
             _preprocess_data(data_for_origin),
             new_model,
...
@@ -113,6 +113,7 @@ def check_torch_fsdp_ckpt():
         full_osd = FSDP.full_optim_state_dict(optimizer.unwrap_model().unwrap(), optim=optimizer)

         import copy
+
         sharded_osd = copy.deepcopy(full_osd)

         run_model()
...
-import math
-import time
-import numpy as np
 import pytest
 import torch
-import torch.nn as nn
-import transformers
 from packaging import version

 try:
-    import triton
-    import triton.language as tl
     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
@@ -22,6 +14,7 @@ try:
     from exllama_kernels import prepare_buffers, set_tuning_params

     from colossalai.inference.quant.gptq import CaiQuantLinear
+
     HAS_AUTO_GPTQ = True
 except:
     HAS_AUTO_GPTQ = False
@@ -32,13 +25,14 @@ import warnings
 HAS_GPTQ_CUDA = False
 try:
     from colossalai.kernel.op_builder.gptq import GPTQBuilder
+
     gptq_cuda = GPTQBuilder().load()
     HAS_GPTQ_CUDA = True
 except ImportError:
-    warnings.warn('CUDA gptq is not installed')
+    warnings.warn("CUDA gptq is not installed")
     HAS_GPTQ_CUDA = False

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")

 max_inner_outer_dim = 1
 max_input_len = 1
@@ -64,9 +58,9 @@ def init_buffer(cai_linear, use_act_order=False):
         max_input_len = 4096

     # The temp_state buffer is required to reorder X in the act-order case.
     # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
-    gptq_temp_state_buffer = torch.zeros((max_input_len, max_inner_outer_dim),
-                                         dtype=torch.float16,
-                                         device=torch.cuda.current_device())
+    gptq_temp_state_buffer = torch.zeros(
+        (max_input_len, max_inner_outer_dim), dtype=torch.float16, device=torch.cuda.current_device()
+    )
     gptq_temp_dq_buffer = torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=torch.cuda.current_device())
     gptq_cuda.prepare_buffers(torch.device(torch.cuda.current_device()), gptq_temp_state_buffer, gptq_temp_dq_buffer)
@@ -77,10 +71,11 @@ def init_buffer(cai_linear, use_act_order=False):
     gptq_cuda.set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON or not HAS_AUTO_GPTQ,
-                    reason="triton requires cuda version to be higher than 11.4 or not install auto-gptq")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON or not HAS_AUTO_GPTQ,
+    reason="triton requires cuda version to be higher than 11.4 or not install auto-gptq",
+)
 def test_gptq_linear():
     infeature = 1024
     outfeature = 1024
     group_size = 128

@@ -120,7 +115,7 @@ def test_gptq_linear():
     max_input_len = 2048
     buffers = {
         "temp_state": torch.zeros((max_input_len, max_inner_outer_dim), dtype=torch.float16, device=device),
-        "temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device)
+        "temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device),
     }

     prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"])
@@ -146,5 +141,4 @@ def test_gptq_linear():


-
 if __name__ == "__main__":
     test_gptq_linear()
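The HAS_TRITON / HAS_AUTO_GPTQ / HAS_GPTQ_CUDA flags above follow a common optional-dependency pattern: probe each import once at module load, then let pytest skip the test when a dependency or a new-enough CUDA is missing. A generic sketch of the pattern, not tied to this file's exact imports:

# Generic sketch of the optional-dependency guard used above.
import pytest

try:
    import triton  # noqa: F401
    HAS_TRITON = True
except ImportError:
    HAS_TRITON = False

@pytest.mark.skipif(not HAS_TRITON, reason="triton is not installed")
def test_needs_triton():
    assert HAS_TRITON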
@@ -24,4 +24,4 @@ def test_torchvision_models_lazy_init(subset, default_device):


 if __name__ == "__main__":
-    test_torchvision_models_lazy_init("transformers", "cpu")
\ No newline at end of file
+    test_torchvision_models_lazy_init("transformers", "cpu")
-import torch
 import pytest
+import torch

 from colossalai.nn.optimizer import CPUAdam, HybridAdam
 from colossalai.testing import clear_cache_before_run, parameterize
@@ -17,6 +17,7 @@ def check_params_equal(model, torch_model):
     for p, torch_p in zip(model.parameters(), torch_model.parameters()):
         assert torch.allclose(p, torch_p, atol=1e-3), f"diff: {torch.abs(p - torch_p)}"

+
 # TODO Something wrong with ci when running this test.
 @pytest.mark.skip(reason="skip because of something wrong with CI")
 @clear_cache_before_run()
...