Unverified commit 641b1ee7 authored by Hongxin Liu, committed by GitHub

[devops] remove post commit ci (#5566)

* [devops] remove post commit ci

* [misc] run pre-commit on all files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent 341263df
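The changes below are formatting-only: the post-commit CI job is removed and the repository's pre-commit hooks were run once over every file. As a minimal sketch (assuming pre-commit is installed locally and configured by the repo's .pre-commit-config.yaml), the same pass can be reproduced like this:

# Minimal sketch: reproduce the formatting pass locally (assumes the `pre-commit` CLI is installed).
import subprocess

# "pre-commit run --all-files" applies every configured hook (formatters, import sorters,
# end-of-file fixers, ...) to the whole tree, which is what produced the diffs below.
subprocess.run(["pre-commit", "run", "--all-files"], check=False)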
@@ -121,4 +121,4 @@ class RandomDataset(Dataset):
             "input_ids": self.input_ids[idx],
             "attention_mask": self.attention_mask[idx],
             "labels": self.input_ids[idx],
-        }
\ No newline at end of file
+        }
@@ -270,9 +270,7 @@ def main():
         ) as pbar:
             for step in pbar:
                 if use_pipeline:
-                    outputs = booster.execute_pipeline(
-                        dataloader_iter, model, _criterion, optimizer, return_loss=True
-                    )
+                    outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
                     loss = outputs["loss"]
                 else:
                     batch = next(dataloader_iter)
...
@@ -285,9 +285,7 @@ def main():
         ) as pbar:
             for step in pbar:
                 if use_pipeline:
-                    outputs = booster.execute_pipeline(
-                        dataloader_iter, model, _criterion, optimizer, return_loss=True
-                    )
+                    outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
                     loss = outputs["loss"]
                 else:
                     batch = next(dataloader_iter)
...
@@ -50,7 +50,6 @@ def all_reduce_mean(x: float, world_size: int) -> float:


-
 class Timer:
     def __init__(self) -> None:
         self.start_time: Optional[float] = None
         self.duration: float = 0.0
@@ -112,7 +111,7 @@ class PerformanceEvaluator:
         batch_size, seq_len = input_ids.shape

         self.num_samples += batch_size
-        self.flop += (batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint)))
+        self.flop += batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint))

     def on_fit_end(self) -> None:
         avg_duration = all_reduce_mean(self.timer.duration, self.world_size)
@@ -122,5 +121,6 @@ class PerformanceEvaluator:
         if dist.get_rank() == 0:
             print(
                 f"num_samples: {self.num_samples}, dp_world_size: {self.dp_world_size}, flop: {self.flop}, avg_duration: {avg_duration}, "
-                f"avg_throughput: {avg_throughput}")
+                f"avg_throughput: {avg_throughput}"
+            )
             print(f"Throughput: {avg_throughput:.2f} samples/sec, TFLOPS per GPU: {avg_tflops_per_gpu:.2f}")
@@ -16,17 +16,15 @@ def inference(args):
     tokenizer = T5Tokenizer.from_pretrained("google/umt5-small")
     if args.model == "test":
         config = LlamaConfig.from_pretrained("hpcai-tech/openmoe-base")
-        set_openmoe_args(config,
-                         num_experts=config.num_experts,
-                         moe_layer_interval=config.moe_layer_interval,
-                         enable_kernel=True)
+        set_openmoe_args(
+            config, num_experts=config.num_experts, moe_layer_interval=config.moe_layer_interval, enable_kernel=True
+        )
         model = OpenMoeForCausalLM(config)
     else:
         config = LlamaConfig.from_pretrained(f"hpcai-tech/openmoe-{args.model}")
-        set_openmoe_args(config,
-                         num_experts=config.num_experts,
-                         moe_layer_interval=config.moe_layer_interval,
-                         enable_kernel=False)
+        set_openmoe_args(
+            config, num_experts=config.num_experts, moe_layer_interval=config.moe_layer_interval, enable_kernel=False
+        )
         model = OpenMoeForCausalLM.from_pretrained(f"hpcai-tech/openmoe-{args.model}", config=config)
     model = model.eval().bfloat16()
     model = model.to(torch.cuda.current_device())
...
@@ -172,9 +172,9 @@ def make_state_dict(converted_params):

 def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path):
     """Replaces the params in model witht the T5X converted params."""
     variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
-    converted = convert_t5x_to_pytorch(variables,
-                                       num_layers=config.num_hidden_layers,
-                                       moe_interval=config.moe_layer_interval)
+    converted = convert_t5x_to_pytorch(
+        variables, num_layers=config.num_hidden_layers, moe_interval=config.moe_layer_interval
+    )
     state_dict = make_state_dict(converted)
     model.load_state_dict(state_dict, strict=True)
@@ -203,11 +203,9 @@ def convert_t5x_checkpoint_to_pytorch(t5x_checkpoint_path, config_file, pytorch_
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Converts a native T5X checkpoint into a PyTorch checkpoint.")
     # Required parameters
-    parser.add_argument("--t5x_checkpoint_path",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="Path to the T5X checkpoint.")
+    parser.add_argument(
+        "--t5x_checkpoint_path", default=None, type=str, required=True, help="Path to the T5X checkpoint."
+    )
     parser.add_argument(
         "--config_file",
         default=None,
@@ -215,10 +213,8 @@ if __name__ == "__main__":
         required=True,
         help="The config json file corresponding to the pre-trained T5 model.\nThis specifies the model architecture.",
     )
-    parser.add_argument("--pytorch_dump_path",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="Path to the output PyTorch model.")
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
     args = parser.parse_args()
     convert_t5x_checkpoint_to_pytorch(args.t5x_checkpoint_path, args.config_file, args.pytorch_dump_path)
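For reference, the converter above can also be driven directly from Python instead of the CLI. This is a hypothetical usage sketch: the paths are placeholders, the script name in the comment is assumed, and it presumes convert_t5x_checkpoint_to_pytorch is importable from the converter module:

# Equivalent to the CLI form (script name assumed):
#   python convert_t5x_checkpoint.py --t5x_checkpoint_path ... --config_file ... --pytorch_dump_path ...
convert_t5x_checkpoint_to_pytorch(
    "/path/to/t5x/checkpoint_dir",  # --t5x_checkpoint_path
    "/path/to/config.json",         # --config_file
    "/path/to/pytorch_model.bin",   # --pytorch_dump_path
)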
@@ -41,9 +41,7 @@ def train_epoch(epoch, model, optimizer, _criterion, lr_scheduler, dataloader, b
         # Forward pass
         for _ in pbar:
             if use_pipeline:
-                outputs = booster.execute_pipeline(
-                    dataloader, model, _criterion, optimizer, return_loss=True
-                )
+                outputs = booster.execute_pipeline(dataloader, model, _criterion, optimizer, return_loss=True)
                 # Backward and optimize
                 if is_pp_last_stage:
                     loss = outputs["loss"]
...
 from .cpu_adam_arm import CpuAdamArmExtension
 from .cpu_adam_x86 import CpuAdamX86Extension
-__all__ = ['CpuAdamArmExtension', 'CpuAdamX86Extension']
+__all__ = ["CpuAdamArmExtension", "CpuAdamX86Extension"]

 from .layernorm_cuda import LayerNormCudaExtension
-__all__ = ["LayerNormCudaExtension"]
\ No newline at end of file
+__all__ = ["LayerNormCudaExtension"]

 from .moe_cuda import MoeCudaExtension
-__all__ = ['MoeCudaExtension']
\ No newline at end of file
+__all__ = ["MoeCudaExtension"]

 from .fused_optimizer_cuda import FusedOptimizerCudaExtension
-__all__ = ['FusedOptimizerCudaExtension']
\ No newline at end of file
+__all__ = ["FusedOptimizerCudaExtension"]

 from .scaled_masked_softmax_cuda import ScaledMaskedSoftmaxCudaExtension
 from .scaled_upper_triangle_masked_softmax_cuda import ScaledUpperTriangleMaskedSoftmaxCudaExtension
-__all__ = ['ScaledMaskedSoftmaxCudaExtension', 'ScaledUpperTriangleMaskedSoftmaxCudaExtension']
\ No newline at end of file
+__all__ = ["ScaledMaskedSoftmaxCudaExtension", "ScaledUpperTriangleMaskedSoftmaxCudaExtension"]
 import os

 from . import custom, diffusers, timm, torchaudio, torchvision, transformers
 from .executor import run_fwd, run_fwd_bwd
 from .registry import model_zoo

 # We pick a subset of models for fast testing in order to reduce the total testing time
 COMMON_MODELS = [
-    'custom_hanging_param_model',
-    'custom_nested_model',
-    'custom_repeated_computed_layers',
-    'custom_simple_net',
-    'diffusers_clip_text_model',
-    'diffusers_auto_encoder_kl',
-    'diffusers_unet2d_model',
-    'timm_densenet',
-    'timm_resnet',
-    'timm_swin_transformer',
-    'torchaudio_wav2vec2_base',
-    'torchaudio_conformer',
-    'transformers_bert_for_masked_lm',
-    'transformers_bloom_for_causal_lm',
-    'transformers_falcon_for_causal_lm',
-    'transformers_chatglm_for_conditional_generation',
-    'transformers_llama_for_casual_lm',
-    'transformers_vit_for_masked_image_modeling',
-    'transformers_mistral_for_casual_lm'
+    "custom_hanging_param_model",
+    "custom_nested_model",
+    "custom_repeated_computed_layers",
+    "custom_simple_net",
+    "diffusers_clip_text_model",
+    "diffusers_auto_encoder_kl",
+    "diffusers_unet2d_model",
+    "timm_densenet",
+    "timm_resnet",
+    "timm_swin_transformer",
+    "torchaudio_wav2vec2_base",
+    "torchaudio_conformer",
+    "transformers_bert_for_masked_lm",
+    "transformers_bloom_for_causal_lm",
+    "transformers_falcon_for_causal_lm",
+    "transformers_chatglm_for_conditional_generation",
+    "transformers_llama_for_casual_lm",
+    "transformers_vit_for_masked_image_modeling",
+    "transformers_mistral_for_casual_lm",
 ]
-IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1'
-__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST']
+IS_FAST_TEST = os.environ.get("FAST_TEST", "0") == "1"
+
+__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", "COMMON_MODELS", "IS_FAST_TEST"]
@@ -102,4 +102,4 @@ class ModelZooRegistry(dict):
         return new_dict


-model_zoo = ModelZooRegistry()
\ No newline at end of file
+model_zoo = ModelZooRegistry()
@@ -2,6 +2,7 @@ import torch
 from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
 from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel

 from ..registry import ModelAttribute, model_zoo

+
 # ================================
...
@@ -74,9 +74,7 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
     data = data_gen_fn()
     model.train()
     if booster.plugin.stage_manager is not None:
-        booster.execute_pipeline(
-            _preprocess_data(data), model, _criterion, optimizer, return_loss=True
-        )
+        booster.execute_pipeline(_preprocess_data(data), model, _criterion, optimizer, return_loss=True)
     else:
         output = model(**_preprocess_data(data))
         loss = criterion(output)
@@ -108,9 +106,7 @@ def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_conf
     data_for_shard = data_gen_fn()
     data_for_origin = data_gen_fn()
     if booster.plugin.stage_manager is not None:
-        booster.execute_pipeline(
-            _preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True
-        )
+        booster.execute_pipeline(_preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True)
         booster.execute_pipeline(
             _preprocess_data(data_for_origin),
             new_model,
...
@@ -113,6 +113,7 @@ def check_torch_fsdp_ckpt():
         full_osd = FSDP.full_optim_state_dict(optimizer.unwrap_model().unwrap(), optim=optimizer)

         import copy
+
         sharded_osd = copy.deepcopy(full_osd)

         run_model()
...
-import math
-import time
-import numpy as np
 import pytest
 import torch
-import torch.nn as nn
-import transformers
 from packaging import version

 try:
-    import triton
-    import triton.language as tl
     HAS_TRITON = True
 except ImportError:
     HAS_TRITON = False
@@ -22,6 +14,7 @@ try:
     from exllama_kernels import prepare_buffers, set_tuning_params

     from colossalai.inference.quant.gptq import CaiQuantLinear
+
     HAS_AUTO_GPTQ = True
 except:
     HAS_AUTO_GPTQ = False
@@ -32,13 +25,14 @@ import warnings
 HAS_GPTQ_CUDA = False
 try:
     from colossalai.kernel.op_builder.gptq import GPTQBuilder
+
     gptq_cuda = GPTQBuilder().load()
     HAS_GPTQ_CUDA = True
 except ImportError:
-    warnings.warn('CUDA gptq is not installed')
+    warnings.warn("CUDA gptq is not installed")
     HAS_GPTQ_CUDA = False

-TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4')
+TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")

 max_inner_outer_dim = 1
 max_input_len = 1
@@ -64,9 +58,9 @@ def init_buffer(cai_linear, use_act_order=False):
         max_input_len = 4096

     # The temp_state buffer is required to reorder X in the act-order case.
     # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
-    gptq_temp_state_buffer = torch.zeros((max_input_len, max_inner_outer_dim),
-                                         dtype=torch.float16,
-                                         device=torch.cuda.current_device())
+    gptq_temp_state_buffer = torch.zeros(
+        (max_input_len, max_inner_outer_dim), dtype=torch.float16, device=torch.cuda.current_device()
+    )
     gptq_temp_dq_buffer = torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=torch.cuda.current_device())
     gptq_cuda.prepare_buffers(torch.device(torch.cuda.current_device()), gptq_temp_state_buffer, gptq_temp_dq_buffer)
@@ -77,10 +71,11 @@ def init_buffer(cai_linear, use_act_order=False):
     gptq_cuda.set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)


-@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON or not HAS_AUTO_GPTQ,
-                    reason="triton requires cuda version to be higher than 11.4 or not install auto-gptq")
+@pytest.mark.skipif(
+    not TRITON_CUDA_SUPPORT or not HAS_TRITON or not HAS_AUTO_GPTQ,
+    reason="triton requires cuda version to be higher than 11.4 or not install auto-gptq",
+)
 def test_gptq_linear():
     infeature = 1024
     outfeature = 1024
     group_size = 128

@@ -120,7 +115,7 @@ def test_gptq_linear():
     max_input_len = 2048
     buffers = {
         "temp_state": torch.zeros((max_input_len, max_inner_outer_dim), dtype=torch.float16, device=device),
-        "temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device)
+        "temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device),
     }

     prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"])
@@ -146,5 +141,4 @@ def test_gptq_linear():


-
 if __name__ == "__main__":
     test_gptq_linear()
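The HAS_TRITON / HAS_AUTO_GPTQ / HAS_GPTQ_CUDA flags above follow a common optional-dependency pattern: probe each import once at module load, then let pytest skip the test when a dependency or a new-enough CUDA is missing. A generic sketch of the pattern, not tied to this file's exact imports:

# Generic sketch of the optional-dependency guard used above.
import pytest

try:
    import triton  # noqa: F401
    HAS_TRITON = True
except ImportError:
    HAS_TRITON = False

@pytest.mark.skipif(not HAS_TRITON, reason="triton is not installed")
def test_needs_triton():
    assert HAS_TRITON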
@@ -24,4 +24,4 @@ def test_torchvision_models_lazy_init(subset, default_device):


 if __name__ == "__main__":
-    test_torchvision_models_lazy_init("transformers", "cpu")
\ No newline at end of file
+    test_torchvision_models_lazy_init("transformers", "cpu")
-import torch
 import pytest
+import torch

 from colossalai.nn.optimizer import CPUAdam, HybridAdam
 from colossalai.testing import clear_cache_before_run, parameterize
@@ -17,6 +17,7 @@ def check_params_equal(model, torch_model):
     for p, torch_p in zip(model.parameters(), torch_model.parameters()):
         assert torch.allclose(p, torch_p, atol=1e-3), f"diff: {torch.abs(p - torch_p)}"

+
 # TODO Something wrong with ci when running this test.
 @pytest.mark.skip(reason="skip because of something wrong with CI")
 @clear_cache_before_run()
...