Commit 112bf76b authored by chenzk

v1.0
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
import argparse
import os
from vita.model.builder import load_pretrained_model
from vita.util.mm_utils import get_model_name_from_path
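# Merges a LoRA adapter into its base model and saves a standalone checkpoint:
# load_pretrained_model is expected to apply the adapter at --model-path on top of
# --model-base, and the merged weights plus tokenizer are written to --save-model-path.
# Example invocation (script and path names are placeholders):
#   python merge_lora.py --model-path <lora_ckpt> --model-base <base_ckpt> \
#       --model-type mixtral-8x7b --save-model-path <merged_out>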
def merge_lora(args):
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path, args.model_base, model_name, args.model_type
)
model.save_pretrained(args.save_model_path)
tokenizer.save_pretrained(args.save_model_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, required=True)
parser.add_argument("--model-base", type=str, required=True)
parser.add_argument("--model-type", type=str, required=True)
parser.add_argument("--save-model-path", type=str, required=True)
args = parser.parse_args()
merge_lora(args)
#!/bin/bash
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s2-pretrain_video
mkdir -p ${OUTPUT_DIR_FT}
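# Stage-2 video pretraining on a single machine (8 GPUs) with DeepSpeed ZeRO-3 offload.
# It initializes the multimodal projector from the stage-1 output
# (llava-s1-pretrain_mlp_video/mm_projector.bin). Assumed usage: bash <this_script> <output_dir>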
deepspeed --include localhost:0,1,2,3,4,5,6,7 vita/train/train.py \
--deepspeed ./script/deepspeed/ds_config_zero3_offload2.json \
--model_name_or_path Mixtral-8x7B_modVocab/mg2hg \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--pretrain_mm_mlp_adapter ${OUTPUT_DIR}/llava-s1-pretrain_mlp_video/mm_projector.bin \
--mm_projector_type mlp2x_gelu \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter False \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 6200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log.txt && echo "Done."
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
#export NCCL_IB_SL=3
#export NCCL_CHECKS_DISABLE=1
export NCCL_P2P_DISABLE=0
#export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_DEBUG=INFO
INDEX=5
MASTER_ADDR="10.206.0.199"
# communication on taiji platform
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
--nnodes 6 \
--node_rank $INDEX \
--master_addr $MASTER_ADDR \
--master_port 9999
"
export NCCL_TIMEOUT=25200
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s3-finetune_task
mkdir -p ${OUTPUT_DIR_FT}
torchrun $DISTRIBUTED_ARGS vita/train/train.py \
--deepspeed ./script/deepspeed/zero3.json \
--model_name_or_path VITA_ckpt \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--mm_projector_type mlp2x_gelu \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 2 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 9100 \
--ddp_timeout 25200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log_node_$INDEX.txt && echo "Done."
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
#export NCCL_IB_SL=3
#export NCCL_CHECKS_DISABLE=1
export NCCL_P2P_DISABLE=0
#export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_DEBUG=INFO
INDEX=5
# MASTER_ADDR="10.206.0.199"
MASTER_ADDR="0.0.0.0"
# communication on taiji platform
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr $MASTER_ADDR \
--master_port 9999
"
export NCCL_TIMEOUT=25200
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s3-finetune_task
mkdir -p ${OUTPUT_DIR_FT}
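# Single-node variant: DISTRIBUTED_ARGS above is kept only for reference; the torchrun command
# below hardcodes one node with 8 local processes, so the rendezvous defaults to the local host.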
# torchrun $DISTRIBUTED_ARGS vita/train/train.py \
torchrun --nproc-per-node=8 --nnodes=1 vita/train/train.py \
--deepspeed ./script/deepspeed/zero3.json \
--model_name_or_path VITA/VITA_ckpt \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--mm_projector_type mlp2x_gelu \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 2 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 False \
--model_max_length 9100 \
--ddp_timeout 25200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log_node_$INDEX.txt && echo "Done."
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
#export NCCL_IB_SL=3
#export NCCL_CHECKS_DISABLE=1
export NCCL_P2P_DISABLE=0
#export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_DEBUG=INFO
INDEX=0
MASTER_ADDR="172.17.0.5"
# communication on taiji platform
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
--nnodes 4 \
--node_rank $INDEX \
--master_addr $MASTER_ADDR \
--master_port 9999
"
export NCCL_TIMEOUT=25200
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s2-pretrain_video
mkdir -p ${OUTPUT_DIR_FT}
torchrun $DISTRIBUTED_ARGS vita/train/train.py \
--deepspeed ./script/deepspeed/ds_config_zero3_offload.json \
--model_name_or_path Mixtral-8x7B_modVocab/mg2hg \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--pretrain_mm_mlp_adapter ${OUTPUT_DIR}/llava-s1-pretrain_mlp_video/mm_projector.bin \
--mm_projector_type mlp2x_gelu \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 10 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 2 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 6200 \
--ddp_timeout 25200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log_node_$INDEX.txt && echo "Done."
#!/bin/bash
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s1-pretrain_mlp_video
mkdir -p ${OUTPUT_DIR_FT}
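# Stage-1: pretrain only the vision-to-LLM MLP projector (--tune_mm_mlp_adapter True, which in
# LLaVA-style training keeps the rest of the model frozen), hence the higher 5e-4 learning rate.
# Assumed usage: bash <this_script> <output_dir>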
deepspeed --include localhost:0,1,2,3,4,5,6,7 vita/train/train.py \
--deepspeed ./script/deepspeed/zero3.json \
--model_name_or_path Mixtral-8x7B_modVocab/mg2hg \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--mm_projector_type mlp2x_gelu \
--tune_mm_mlp_adapter True \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--learning_rate 5e-4 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 6200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log.txt && echo "Done."
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
#export NCCL_IB_SL=3
#export NCCL_CHECKS_DISABLE=1
export NCCL_P2P_DISABLE=0
#export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_DEBUG=INFO
INDEX=3
MASTER_ADDR="172.17.0.5"
# communication on taiji platform
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
--nnodes 4 \
--node_rank $INDEX \
--master_addr $MASTER_ADDR \
--master_port 9999
"
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s1-pretrain_mlp_video
mkdir -p ${OUTPUT_DIR_FT}
torchrun $DISTRIBUTED_ARGS vita/train/train.py \
--deepspeed ./script/deepspeed/zero3.json \
--model_name_or_path Mixtral-8x7B_modVocab/mg2hg \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--mm_projector_type mlp2x_gelu \
--tune_mm_mlp_adapter True \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--learning_rate 5e-4 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 6200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log_node_$INDEX.txt && echo "Done."
export PYTHONPATH=./
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
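# expandable_segments mitigates CUDA allocator fragmentation during long multimodal training runs.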
OUTPUT_DIR=outputs/vita_video_audio
bash script/train/finetuneTask_nodes.sh ${OUTPUT_DIR} # original multi-node launch
# bash script/train/finetuneTask_nodes_singlenode.sh ${OUTPUT_DIR} # single-node trial run
import argparse
import os
import time
import numpy as np
import torch
from PIL import Image
from decord import VideoReader, cpu
from vita.constants import (
DEFAULT_AUDIO_TOKEN,
DEFAULT_IMAGE_TOKEN,
DEFAULT_VIDEO_TOKEN,
IGNORE_INDEX,
IMAGE_TOKEN_INDEX,
MAX_IMAGE_LENGTH,
)
from vita.conversation import SeparatorStyle, conv_templates
from vita.model.builder import load_pretrained_model
from vita.util.data_utils_video_audio_neg_patch import dynamic_preprocess
from vita.util.mm_utils import (
KeywordsStoppingCriteria,
get_model_name_from_path,
tokenizer_image_audio_token,
tokenizer_image_token,
)
from vita.util.utils import disable_torch_init
import soundfile as sf
def _get_rawvideo_dec(
video_path,
image_processor,
max_frames=MAX_IMAGE_LENGTH,
min_frames=4,
image_resolution=384,
video_framerate=1,
s=None,
e=None,
image_aspect_ratio="pad",
):
# speed up video decode via decord.
if s is None:
start_time, end_time = None, None
else:
start_time = int(s)
end_time = int(e)
start_time = start_time if start_time >= 0.0 else 0.0
end_time = end_time if end_time >= 0.0 else 0.0
if start_time > end_time:
start_time, end_time = end_time, start_time
elif start_time == end_time:
end_time = start_time + 1
if os.path.exists(video_path):
vreader = VideoReader(video_path, ctx=cpu(0))
else:
print(video_path)
raise FileNotFoundError
fps = vreader.get_avg_fps()
f_start = 0 if start_time is None else int(start_time * fps)
f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
num_frames = f_end - f_start + 1
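    # Sample frames at roughly `video_framerate` fps, then uniformly downsample to at most
    # `max_frames` frames (or uniformly resample up to `min_frames` if too few were found).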
if num_frames > 0:
# T x 3 x H x W
sample_fps = int(video_framerate)
t_stride = int(round(float(fps) / sample_fps))
all_pos = list(range(f_start, f_end + 1, t_stride))
if len(all_pos) > max_frames:
sample_pos = [
all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)
]
elif len(all_pos) < min_frames:
sample_pos = [
all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=min_frames, dtype=int)
]
else:
sample_pos = all_pos
patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
if image_aspect_ratio == "pad":
def expand2square(pil_img, background_color):
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result
patch_images = [
expand2square(i, tuple(int(x * 255) for x in image_processor.image_mean))
for i in patch_images
]
patch_images = [
image_processor.preprocess(i, return_tensors="pt")["pixel_values"][0]
for i in patch_images
]
else:
patch_images = [
image_processor.preprocess(i, return_tensors="pt")["pixel_values"][0]
for i in patch_images
]
patch_images = torch.stack(patch_images)
slice_len = patch_images.shape[0]
return patch_images, slice_len
else:
print("video path: {} error.".format(video_path))
if __name__ == "__main__":
# Initialize the parser
parser = argparse.ArgumentParser(description="Process model and video paths.")
# Add arguments
parser.add_argument("--model_path", type=str, required=True, help="Path to the model directory")
parser.add_argument("--model_base", type=str, default=None)
parser.add_argument("--video_path", type=str, default=None)
parser.add_argument("--image_path", type=str, default=None)
parser.add_argument("--audio_path", type=str, default=None)
parser.add_argument("--model_type", type=str, default="mixtral-8x7b")
parser.add_argument("--conv_mode", type=str, default="mixtral_two")
parser.add_argument("--question", type=str, default="")
# Parse the arguments
args = parser.parse_args()
# Assign arguments to variables
model_path = args.model_path
model_base = args.model_base
video_path = args.video_path
image_path = args.image_path
audio_path = args.audio_path
qs = args.question
    assert (audio_path is None) != (qs == ""), "Exactly one of --audio_path or --question must be provided"
conv_mode = args.conv_mode
# The number of visual tokens varies with the length of the video. "max_frames" is the maximum number of frames.
    # When the video is long, it is uniformly downsampled so that at most "max_frames" frames are kept.
max_frames = MAX_IMAGE_LENGTH # 100
# The number of frames retained per second in the video.
video_framerate = 1
    # Sampling parameters
temperature = 0.01
top_p = None
num_beams = 1
disable_torch_init()
model_path = os.path.expanduser(model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path, model_base, model_name, args.model_type
)
model.resize_token_embeddings(len(tokenizer))
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
vision_tower.load_model()
image_processor = vision_tower.image_processor
audio_encoder = model.get_audio_encoder()
# audio_encoder.to(device="cuda", dtype=torch.float16)
audio_encoder.to(dtype=torch.float16)
audio_processor = audio_encoder.audio_processor
model.eval()
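    # Prepare the audio input: real features when --audio_path is given, otherwise a dummy
    # 400x80 zero mel-spectrogram so the audio branch still receives a tensor.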
if audio_path is not None:
        audio, audio_for_llm_lens = audio_processor.process(audio_path)
# audio, fs = sf.read(os.path.join(audio_path))
# audio = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
audio_length = audio.shape[0]
audio = torch.unsqueeze(audio, dim=0)
audio_length = torch.unsqueeze(torch.tensor(audio_length), dim=0)
audios = dict()
audios["audios"] = audio.half().cuda()
audios["lengths"] = audio_length.half().cuda()
else:
audio = torch.zeros(400, 80)
audio_length = audio.shape[0]
audio = torch.unsqueeze(audio, dim=0)
audio_length = torch.unsqueeze(torch.tensor(audio_length), dim=0)
audios = dict()
audios["audios"] = audio.half().cuda()
audios["lengths"] = audio_length.half().cuda()
# audios = None
# Check if the video exists
if video_path is not None:
video_frames, slice_len = _get_rawvideo_dec(
video_path,
image_processor,
max_frames=max_frames,
video_framerate=video_framerate,
image_aspect_ratio=getattr(model.config, "image_aspect_ratio", None),
)
image_tensor = video_frames.half().cuda()
if audio_path:
qs = DEFAULT_IMAGE_TOKEN * slice_len + "\n" + qs + DEFAULT_AUDIO_TOKEN
else:
qs = DEFAULT_IMAGE_TOKEN * slice_len + "\n" + qs
modality = "video"
elif image_path is not None:
image = Image.open(image_path).convert("RGB")
image, p_num = dynamic_preprocess(
image, min_num=1, max_num=12, image_size=448, use_thumbnail=True
)
assert len(p_num) == 1
image_tensor = model.process_images(image, model.config).to(
dtype=model.dtype, device="cuda"
)
if audio_path:
qs = DEFAULT_IMAGE_TOKEN * p_num[0] + "\n" + qs + DEFAULT_AUDIO_TOKEN
else:
qs = DEFAULT_IMAGE_TOKEN * p_num[0] + "\n" + qs
modality = "image"
else:
image_tensor = torch.zeros((1, 3, 448, 448)).to(dtype=model.dtype, device="cuda")
if audio_path:
qs = qs + DEFAULT_AUDIO_TOKEN
modality = "lang"
conv = conv_templates[conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt(modality)
if audio_path:
input_ids = (
tokenizer_image_audio_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
.unsqueeze(0)
.cuda()
)
else:
input_ids = (
tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
.unsqueeze(0)
.cuda()
)
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
start_time = time.time()
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image_tensor,
audios=audios,
do_sample=False,
temperature=temperature,
top_p=top_p,
num_beams=num_beams,
output_scores=True,
return_dict_in_generate=True,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria],
)
infer_time = time.time() - start_time
output_ids = output_ids.sequences
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids")
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=False)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[: -len(stop_str)]
outputs = outputs.strip()
print(outputs)
print(f"Time consume: {infer_time}")
{
"_name_or_path": "model_weights/Mixtral-8x7B_New/mg2hg",
"architectures": [
"MixtralForConditionalGeneration"
],
"auto_map": {
"AutoConfig": "configuration_mixtral_multimodal.MixtralMultiModalConfig",
"AutoModel": "modeling_mixtral_multimodal.MixtralForConditionalGeneration"
},
"ignore_index": -100,
"model_type": "mixtral_multimodal",
"projector_hidden_act": "gelu",
"audio_projector_hidden_act": "gelu",
"image_token_index": 51000,
"audio_token_index": 51001,
"text_config": {
"architectures": [
"MixtralForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 32768,
"model_type": "mixtral",
"num_attention_heads": 32,
"num_experts_per_tok": 2,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"num_local_experts": 8,
"output_router_logits": false,
"rms_norm_eps": 1e-05,
"rope_theta": 1000000.0,
"router_aux_loss_coef": 0.02,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.41.1",
"use_cache": false,
"vocab_size": 51760
},
"vision_config": {
"architectures": [
"InternVisionModel"
],
"auto_map": {
"AutoConfig": "configuration_intern_vit.InternVisionConfig",
"AutoModel": "modeling_intern_vit.InternVisionModel"
},
"attention_dropout": 0.0,
"drop_path_rate": 0.1,
"dropout": 0.0,
"hidden_act": "gelu",
"hidden_size": 1024,
"image_size": 448,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-06,
"model_type": "intern_vit_6b",
"norm_type": "layer_norm",
"num_attention_heads": 16,
"num_channels": 3,
"num_hidden_layers": 24,
"patch_size": 14,
"qk_normalization": false,
"qkv_bias": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.37.2",
"use_flash_attn": true
},
"audio_config":{
"_name_or_path": "whale_audio_mini",
"architectures": [
"WhaleAudioModel"
],
"attention_dropout": 0.0,
"auto_map": {
"AutoConfig": "configuration_whale.WhaleConfig",
"AutoFeatureExtractor": "processor_whale.WhaleFeatureExtractor",
"AutoModel": "modeling_whale.WhaleAudioModel"
},
"concat_after": false,
"dropout": 0.1,
"hidden_act": "relu",
"hidden_size": 1024,
"initializer_factor": 0.1,
"initializer_range": 0.02,
"input_dim": 80,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 5000,
"model_type": "whale",
"norm_type": "layer_norm",
"normalize_before": true,
"num_attention_heads": 16,
"num_channels": 1,
"num_hidden_layers": 24,
"positional_dropout": 0.1,
"qk_normalization": false,
"qkv_bias": false,
"torch_dtype": "float32",
"transformers_version": "4.42.4",
"use_flash_attn": false,
"use_relative_pe": true
},
"downsample_ratio": 0.5,
"dynamic_image_size": true,
"max_dynamic_patch": 12,
"min_dynamic_patch": 1,
"vision_feature_layer": -1,
"use_thumbnail": true,
"tokenizer_model_max_length": 4600,
"tokenizer_padding_side": "right",
"vocab_size": 51760
}
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class InternVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
instantiate a vision encoder according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_channels (`int`, *optional*, defaults to 3):
Number of color channels in the input images (e.g., 3 for RGB).
patch_size (`int`, *optional*, defaults to 14):
The size (resolution) of each patch.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
qkv_bias (`bool`, *optional*, defaults to `False`):
Whether to add a bias to the queries and values in the self-attention layers.
hidden_size (`int`, *optional*, defaults to 3200):
Dimensionality of the encoder layers and the pooler layer.
num_attention_heads (`int`, *optional*, defaults to 25):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 12800):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
qk_normalization (`bool`, *optional*, defaults to `True`):
Whether to normalize the queries and keys in the self-attention layers.
num_hidden_layers (`int`, *optional*, defaults to 48):
Number of hidden layers in the Transformer encoder.
use_flash_attn (`bool`, *optional*, defaults to `True`):
Whether to use flash attention mechanism.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
drop_path_rate (`float`, *optional*, defaults to 0.0):
Dropout rate for stochastic depth.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 0.1):
A factor for layer scale.
"""
model_type = 'intern_vit_6b'
def __init__(
self,
num_channels=3,
patch_size=14,
image_size=224,
qkv_bias=False,
hidden_size=3200,
num_attention_heads=25,
intermediate_size=12800,
qk_normalization=True,
num_hidden_layers=48,
use_flash_attn=True,
hidden_act='gelu',
norm_type='rms_norm',
layer_norm_eps=1e-6,
dropout=0.0,
drop_path_rate=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=0.1,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.dropout = dropout
self.drop_path_rate = drop_path_rate
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.norm_type = norm_type
self.qkv_bias = qkv_bias
self.qk_normalization = qk_normalization
self.use_flash_attn = use_flash_attn
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if 'vision_config' in config_dict:
config_dict = config_dict['vision_config']
if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
)
return cls.from_dict(config_dict, **kwargs)
# coding=utf-8
# Copyright 2024 The Vita team. All rights reserved.
"""Multi-modal Mixtral model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
from transformers.models.auto import CONFIG_MAPPING
from .configuration_intern_vit import InternVisionConfig
from .configuration_whale import WhaleConfig
class MixtralMultiModalConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a `MixtralMultiModal` model. It is used to instantiate a
MixtralMultiModal model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a model with the specified default parameters.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `None`):
The config object or dictionary of the vision backbone.
text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `None`):
The config object or dictionary of the text backbone.
audio_config (`Union[AutoConfig, dict]`, *optional*, defaults to `None`):
The config object or dictionary of the audio backbone.
ignore_index (`int`, *optional*, defaults to -100):
The ignore index for the loss function.
image_token_index (`int`, *optional*, defaults to 32000):
The image token index to encode the image prompt.
projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
The activation function used by the multimodal projector.
vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
The feature selection strategy used to select the vision feature from the vision backbone.
Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
If `"full"`, the full vision features are used.
vision_feature_layer (`int`, *optional*, defaults to -2):
The index of the layer to select the vision feature.
vision_downsample_ratio (`float`, *optional*, defaults to 0.5):
The downsample ratio for the vision features.
dynamic_image_size (`bool`, *optional*, defaults to `True`):
Whether to use dynamic image sizes.
max_dynamic_patch (`int`, *optional*, defaults to 12):
The maximum number of dynamic patches.
min_dynamic_patch (`int`, *optional*, defaults to 1):
The minimum number of dynamic patches.
use_thumbnail (`bool`, *optional*, defaults to `True`):
Whether to use thumbnails.
audio_token_index (`int`, *optional*, defaults to 32000):
The audio token index to encode the audio prompt.
audio_projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
The activation function used by the audio projector.
audio_projector_kernel_size (`int`, *optional*, defaults to 5):
The kernel size used by the audio projector.
audio_downsample_ratio (`float`, *optional*, defaults to 0.125):
The downsample ratio for the audio features.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
Example:
```python
>>> from transformers import MixtralMultiModalConfig, MixtralMultiModalModel
>>> # Initializing a MixtralMultiModal configuration
>>> configuration = MixtralMultiModalConfig()
>>> # Initializing a model from the configuration
>>> model = MixtralMultiModalModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "mixtral_multimodal"
is_composition = False
def __init__(
self,
vision_config=None,
text_config=None,
audio_config=None,
ignore_index=-100,
image_token_index=32000,
projector_hidden_act="gelu",
vision_feature_select_strategy="default",
vision_feature_layer=-2,
vision_downsample_ratio=0.5,
dynamic_image_size=True,
max_dynamic_patch=12,
min_dynamic_patch=1,
use_thumbnail=True,
audio_token_index=32000,
audio_projector_hidden_act="gelu",
audio_projector_kernel_size=5,
audio_downsample_ratio=0.125,
tie_word_embeddings=False,
**kwargs,
):
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
if vision_feature_select_strategy not in ["default", "full"]:
raise ValueError(
"vision_feature_select_strategy should be one of 'default', 'full'."
f"Got: {vision_feature_select_strategy}"
)
self.vision_feature_select_strategy = vision_feature_select_strategy
self.vision_feature_layer = vision_feature_layer
self.vision_downsample_ratio = vision_downsample_ratio
self.dynamic_image_size = dynamic_image_size
self.max_dynamic_patch = max_dynamic_patch
self.min_dynamic_patch = min_dynamic_patch
self.use_thumbnail = use_thumbnail
self.audio_token_index = audio_token_index
self.audio_projector_hidden_act = audio_projector_hidden_act
self.audio_projector_kernel_size = audio_projector_kernel_size
self.audio_downsample_ratio = audio_downsample_ratio
if isinstance(vision_config, dict):
vision_config["model_type"] = (
vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
)
vision_config = InternVisionConfig(**vision_config)
elif vision_config is None:
vision_config = CONFIG_MAPPING["clip_vision_model"](
intermediate_size=4096,
hidden_size=1024,
patch_size=14,
image_size=336,
num_hidden_layers=24,
num_attention_heads=16,
vocab_size=32000,
projection_dim=768,
)
self.vision_config = vision_config
if isinstance(text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["llama"]()
self.text_config = text_config
if isinstance(audio_config, dict):
audio_config["model_type"] = (
audio_config["model_type"] if "model_type" in audio_config else "clip_vision_model"
)
audio_config = WhaleConfig(**audio_config)
self.audio_config = audio_config
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
# --------------------------------------------------------
# Copyright (c)
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class WhaleConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a `Whale` model. It is used to instantiate a
Whale model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a model with the specified default parameters.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
input_dim (`int`, *optional*, defaults to 80):
The input dimension of the model.
num_channels (`int`, *optional*, defaults to 1):
The number of input channels.
qkv_bias (`bool`, *optional*, defaults to `False`):
Whether to add a bias to the query, key, value projections.
hidden_size (`int`, *optional*, defaults to 1024):
The size of the hidden layers.
num_attention_heads (`int`, *optional*, defaults to 25):
The number of attention heads.
max_position_embeddings (`int`, *optional*, defaults to 5000):
The maximum number of position embeddings.
intermediate_size (`int`, *optional*, defaults to 4096):
The size of the intermediate (feed-forward) layer.
qk_normalization (`bool`, *optional*, defaults to `True`):
Whether to apply normalization to the query and key projections.
num_hidden_layers (`int`, *optional*, defaults to 48):
The number of hidden layers in the model.
use_flash_attn (`bool`, *optional*, defaults to `True`):
Whether to use flash attention.
hidden_act (`str`, *optional*, defaults to `'relu'`):
The activation function to use in the hidden layers.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon value for layer normalization.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the hidden layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the attention layers.
positional_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the positional encodings.
normalize_before (`bool`, *optional*, defaults to `True`):
Whether to apply normalization before the attention and feed-forward layers.
concat_after (`bool`, *optional*, defaults to `True`):
Whether to concatenate the attention output with the input before the feed-forward layer.
use_relative_pe (`bool`, *optional*, defaults to `True`):
Whether to use relative position encodings.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 0.1):
A factor for initializing the weights.
Example:
```python
>>> from transformers import WhaleConfig, WhaleModel
>>> # Initializing a Whale configuration
>>> configuration = WhaleConfig()
>>> # Initializing a model from the configuration
>>> model = WhaleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = 'whale'
def __init__(
self,
input_dim=80,
num_channels=1,
qkv_bias=False,
hidden_size=1024,
num_attention_heads=25,
max_position_embeddings=5000,
intermediate_size=4096,
qk_normalization=True,
num_hidden_layers=48,
use_flash_attn=True,
hidden_act='relu',
layer_norm_eps=1e-6,
dropout=0.0,
attention_dropout=0.0,
positional_dropout=0.0,
normalize_before=True,
concat_after=True,
use_relative_pe=True,
initializer_range=0.02,
initializer_factor=0.1,
**kwargs,
):
super().__init__(**kwargs)
self.input_dim = input_dim
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.dropout = dropout
self.num_channels = num_channels
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.positional_dropout = positional_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
self.qk_normalization = qk_normalization
self.use_flash_attn = use_flash_attn
self.normalize_before = normalize_before
self.concat_after = concat_after
self.max_position_embeddings = max_position_embeddings
self.use_relative_pe = use_relative_pe
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if 'audio_config' in config_dict:
config_dict = config_dict['audio_config']
if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
)
return cls.from_dict(config_dict, **kwargs)
{
"_name_or_path": "whale_audio_mini",
"auto_map": {
"AutoFeatureExtractor": "processor_whale.WhaleFeatureExtractor"
}
}
{
"auto_map": {
"AutoFeatureExtractor": "processor_whale.WhaleFeatureExtractor"
},
"cmvn_istds": [
0.3329853392031094,
0.304457074540202,
0.29546332871219255,
0.3016748868710893,
0.30146666620931134,
0.29722031038819924,
0.2917111074466677,
0.28384523520067434,
0.28523771805804266,
0.2901033423245173,
0.29489059636859316,
0.29547348893246006,
0.29620672776488977,
0.2956320049313563,
0.29470081603763376,
0.2963227174569345,
0.2976606657847016,
0.3003491761055444,
0.30211458429446625,
0.30174903334867953,
0.30016818970986625,
0.3033775994936976,
0.30356758993511046,
0.30603640492289896,
0.30671985841954447,
0.3069228273662989,
0.30770085196779645,
0.3067656201661381,
0.3055203785780098,
0.30690623983421333,
0.30723413937044297,
0.3088911065771803,
0.3091251382267279,
0.30986769010126597,
0.31000059868830204,
0.30963732259143195,
0.3093967488140671,
0.30918562772813507,
0.30968744324817,
0.3085437993502015,
0.309308051573859,
0.3087313674687873,
0.30814804868295664,
0.30722053416625006,
0.30732656820194293,
0.3064066246045986,
0.30390658225471334,
0.302131011830547,
0.3014575331911756,
0.301449202764865,
0.30039048343978525,
0.29975195531574894,
0.2993214974016792,
0.29809597194189,
0.2950458103872353,
0.29250998818879875,
0.29285432953044965,
0.2928594451679315,
0.2922642564293608,
0.2934287968886421,
0.2929937863211079,
0.2921845930953747,
0.2917417094543235,
0.28991734472060865,
0.2888153105794442,
0.2870270977983177,
0.2843282542200158,
0.2827033299131669,
0.28035104778082265,
0.2782082983359874,
0.27589950120001683,
0.27325842201376005,
0.27104919439201897,
0.2688075817805597,
0.26814315263564775,
0.26998556725462286,
0.269346874312791,
0.2673887565870066,
0.2683233739448121,
0.2702135698992237
],
"cmvn_means": [
11.837255115918403,
12.473204615847946,
13.416767619583318,
14.077409846458519,
14.692713667734644,
15.134646755356338,
15.425053998320841,
15.520304088736482,
15.664980906057181,
15.682885368714361,
15.83134095973795,
15.901056812316575,
16.043105914428832,
16.141928413478638,
16.146063740161384,
16.17268368755442,
16.13231180127601,
16.065540090344545,
16.170683092860383,
15.998216926090535,
15.867837768614727,
16.081028935225024,
15.90913485828459,
16.032066529724602,
15.94857810175373,
16.03539817911192,
15.919972463810511,
16.012130517613077,
15.93573072975294,
15.914797286475908,
15.949416173227279,
15.914241247262952,
15.9205949984345,
15.979177455555364,
15.986889776762691,
16.04603056604172,
16.110854420018935,
16.11681722403251,
16.129875546992444,
16.085759281189265,
16.134709075491045,
16.09818475127177,
16.202892094198077,
16.195676195628295,
16.265984774543206,
16.368600951439756,
16.48524192770144,
16.53072364237602,
16.58613266332892,
16.682058026108336,
16.643586991407417,
16.62329213337083,
16.638263919106894,
16.703993486441295,
16.75845666749587,
16.818435528248443,
16.88729840520967,
16.89038585593233,
16.816687157527294,
16.731004380992307,
16.674947603018126,
16.562815703508104,
16.50694580056838,
16.427151307327705,
16.33695716109585,
16.22435176840036,
16.122595445956836,
16.074572001519112,
16.045862034568927,
15.997705599309137,
15.955502796282088,
15.925529416522258,
15.884868619147634,
15.847951054825177,
15.812488364237238,
15.791251105720136,
15.698867196814575,
15.451057143452907,
15.043111236177761,
14.453490694177178
],
"cmvn_preload": true,
"dither": 1.0,
"do_ceptral_normalize": true,
"feature_extractor_type": "WhaleFeatureExtractor",
"feature_size": 80,
"frame_length": 25,
"frame_shift": 10,
"normalize_means": true,
"normalize_vars": true,
"num_mel_bins": 80,
"padding_side": "right",
"padding_value": 0.0,
"return_attention_mask": true,
"sampling_rate": 16000
}
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from timm.models.layers import DropPath
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (BaseModelOutput,
BaseModelOutputWithPooling)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from .configuration_intern_vit import InternVisionConfig
try:
from .flash_attention import FlashAttention
has_flash_attn = True
except:
print('FlashAttention is not installed.')
has_flash_attn = False
logger = logging.get_logger(__name__)
class InternRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
try:
from apex.normalization import FusedRMSNorm
InternRMSNorm = FusedRMSNorm # noqa
logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
except ImportError:
# using the normal InternRMSNorm
pass
except Exception:
logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
pass
NORM2FN = {
'rms_norm': InternRMSNorm,
'layer_norm': nn.LayerNorm,
}
class InternVisionEmbeddings(nn.Module):
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.class_embedding = nn.Parameter(
torch.randn(1, 1, self.embed_dim),
)
self.patch_embedding = nn.Conv2d(
in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
)
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
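    # Bicubically resizes the learned patch-position grid so the encoder can handle feature
    # maps whose height/width differ from the pretraining (image_size // patch_size) grid.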
def _get_pos_embed(self, pos_embed, H, W):
target_dtype = pos_embed.dtype
pos_embed = pos_embed.float().reshape(
1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False).\
reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
return pos_embed
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
batch_size, _, height, width = patch_embeds.shape
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
position_embedding = torch.cat([
self.position_embedding[:, :1, :],
self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
], dim=1)
embeddings = embeddings + position_embedding.to(target_dtype)
return embeddings
class InternAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.use_flash_attn = config.use_flash_attn and has_flash_attn
if config.use_flash_attn and not has_flash_attn:
print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
f' {self.num_heads}).'
)
self.scale = self.head_dim ** -0.5
self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
self.attn_drop = nn.Dropout(config.attention_dropout)
self.proj_drop = nn.Dropout(config.dropout)
self.qk_normalization = config.qk_normalization
if self.qk_normalization:
self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
if self.use_flash_attn:
self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
self.proj = nn.Linear(self.embed_dim, self.embed_dim)
def _naive_attn(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
if self.qk_normalization:
B_, H_, N_, D_ = q.shape
q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
attn = ((q * self.scale) @ k.transpose(-2, -1))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
qkv = self.qkv(x)
qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
if self.qk_normalization:
q, k, v = qkv.unbind(2)
q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
qkv = torch.stack([q, k, v], dim=2)
context, _ = self.inner_attn(
qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
)
outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
outs = self.proj_drop(outs)
return outs
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
return x
class InternMLP(nn.Module):
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.act = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class InternVisionEncoderLayer(nn.Module):
def __init__(self, config: InternVisionConfig, drop_path_rate: float):
super().__init__()
self.embed_dim = config.hidden_size
self.intermediate_size = config.intermediate_size
self.norm_type = config.norm_type
self.attn = InternAttention(config)
self.mlp = InternMLP(config)
self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
def forward(
self,
hidden_states: torch.Tensor,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
"""
Args:
hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
"""
hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
return hidden_states
class InternVisionEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InternEncoderLayer`].
Args:
config (`InternConfig`):
The corresponding vision configuration for the `InternEncoder`.
"""
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
# stochastic depth decay rule
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
self.layers = nn.ModuleList([
InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
self.gradient_checkpointing = True
def forward(
self,
inputs_embeds,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = torch.utils.checkpoint.checkpoint(
encoder_layer,
hidden_states)
else:
layer_outputs = encoder_layer(
hidden_states,
)
hidden_states = layer_outputs
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states
)
class InternVisionModel(PreTrainedModel):
main_input_name = 'pixel_values'
config_class = InternVisionConfig
_no_split_modules = ['InternVisionEncoderLayer']
def __init__(self, config: InternVisionConfig):
super().__init__(config)
self.config = config
self.embeddings = InternVisionEmbeddings(config)
self.encoder = InternVisionEncoder(config)
def resize_pos_embeddings(self, old_size, new_size, patch_size):
pos_emb = self.embeddings.position_embedding
_, num_positions, embed_dim = pos_emb.shape
cls_emb = pos_emb[:, :1, :]
pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
self.embeddings.position_embedding = nn.Parameter(pos_emb)
self.embeddings.image_size = new_size
logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
def get_input_embeddings(self):
return self.embeddings
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_embeds: Optional[torch.FloatTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None and pixel_embeds is None:
raise ValueError('You have to specify pixel_values or pixel_embeds')
if pixel_embeds is not None:
hidden_states = pixel_embeds
else:
if len(pixel_values.shape) == 4:
hidden_states = self.embeddings(pixel_values)
else:
raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs.last_hidden_state
pooled_output = last_hidden_state[:, 0, :]
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# coding=utf-8
# Copyright 2023 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Llava model."""
from dataclasses import dataclass
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from transformers import PreTrainedModel
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache
from transformers.modeling_outputs import (BaseModelOutput,
BaseModelOutputWithPooling)
from transformers.utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from transformers.models.auto import AutoModel, AutoModelForCausalLM
from .configuration_whale import WhaleConfig
from einops import rearrange
import torch.nn.functional as F
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "WhaleConfig"
# try:
# from .flash_attention import FlashAttention
# has_flash_attn = True
# except:
# print('FlashAttention is not installed.')
# has_flash_attn = False
has_flash_attn = False
class WhaleConv2dSubsampling4(nn.Module):
"""Convolutional 2D subsampling (to 1/4 length).
Args:
idim (int): Input dimension.
odim (int): Output dimension.
dropout_rate (float): Dropout rate.
"""
def __init__(self, config: WhaleConfig):
"""Construct an Conv2dSubsampling4 object."""
super().__init__()
self.config = config
self.in_channels = config.num_channels
self.hidden_size = config.hidden_size
self.input_dim = config.input_dim
self.conv_in = nn.Sequential(
nn.Conv2d(
in_channels=self.in_channels, out_channels=self.hidden_size, kernel_size=3, stride=2
),
nn.ReLU(),
nn.Conv2d(
in_channels=self.hidden_size, out_channels=self.hidden_size, kernel_size=3, stride=2
),
nn.ReLU(),
)
self.intermediate_size = self.hidden_size * (((self.input_dim - 1) // 2 - 1) // 2)
self.out = nn.Linear(self.intermediate_size, self.hidden_size)
# The right context for every conv layer is computed by:
# (kernel_size - 1) * frame_rate_of_this_layer
self.subsampling_rate = 4
# 6 = (3 - 1) * 1 + (3 - 1) * 2
self.right_context = 6
    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Subsample x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 4.
            torch.Tensor: Subsampled mask (#batch, time'),
                where time' = time // 4.
        """
x = x.unsqueeze(1) # (b, c=1, t, f)
x = self.conv_in(x)
b, c, t, f = x.size()
x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
return x, x_mask[:, 2::2][:, 2::2]
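# Shape walkthrough for the subsampling above (assuming input_dim=80 fbank
# features and hidden_size=512; illustrative numbers only):
#   x: (B, T, 80) -> unsqueeze -> (B, 1, T, 80)
#   conv1 (kernel 3, stride 2): (B, 512, (T-1)//2, 39)   since (80-1)//2 = 39
#   conv2 (kernel 3, stride 2): (B, 512, ((T-1)//2-1)//2, 19)
#   flatten the frequency axis and project: (B, ~T//4, 512),
#   with intermediate_size = 512 * 19 = 9728 feeding the final Linear.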
class WhalePositionalEncoding(torch.nn.Module):
"""Positional encoding.
:param int d_model: embedding dim
:param float dropout_rate: dropout rate
:param int max_len: maximum input length
PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
"""
def __init__(self, config: WhaleConfig):
"""Construct an PositionalEncoding object."""
super().__init__()
self.d_model = config.hidden_size
self.xscale = math.sqrt(self.d_model)
self.dropout = torch.nn.Dropout(p=config.dropout)
self.max_len = config.max_position_embeddings
self.pe = torch.zeros(self.max_len, self.d_model)
position = torch.arange(0, self.max_len,
dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.d_model, 2, dtype=torch.float32) *
-(math.log(10000.0) / self.d_model))
self.pe[:, 0::2] = torch.sin(position * div_term)
self.pe[:, 1::2] = torch.cos(position * div_term)
self.pe = self.pe.unsqueeze(0)
def forward(self,
x: torch.Tensor,
offset: int = 0):
"""Add positional encoding.
Args:
x (torch.Tensor): Input. Its shape is (batch, time, ...)
offset (int): position offset
Returns:
torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
torch.Tensor: for compatibility to RelPositionalEncoding
"""
assert offset + x.size(1) < self.max_len
self.pe = self.pe.to(x.device)
pos_emb = self.pe[:, offset:offset + x.size(1)]
x = x * self.xscale + pos_emb
return self.dropout(x), self.dropout(pos_emb)
    def position_encoding(self, offset: int, size: int):
        """Get the positional encoding in a streaming fashion.
        Note: in the non-streaming case dropout is applied only once over the
        whole utterance, whereas in a streaming scenario this function is
        called several times with increasing input size, so dropout is
        applied several times as well.
        Args:
            offset (int): start offset
            size (int): required size of position encoding
        Returns:
            torch.Tensor: Corresponding encoding
        """
assert offset + size < self.max_len
return self.dropout(self.pe[:, offset:offset + size])
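# Worked example of the sinusoid table above (hypothetical d_model = 4):
#   div_term = exp([0, 2] * -ln(10000) / 4) = [1.0, 0.01]
#   pe[1]    = [sin(1 * 1.0), cos(1 * 1.0), sin(1 * 0.01), cos(1 * 0.01)]
#            ~= [0.8415, 0.5403, 0.0100, 1.0000]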
class RelPositionalEncoding(WhalePositionalEncoding):
"""Relative positional encoding module.
See : Appendix B in https://arxiv.org/abs/1901.02860
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int): Maximum input length.
"""
def __init__(self, config: WhaleConfig):
"""Initialize class."""
super().__init__(config)
self.hidden_size = config.hidden_size
# self.chunk_size = chunk_size
# self.left_chunks = left_chunks
# self.full_chunk_size = (self.left_chunks + 1) * self.chunk_size
self.div_term = torch.exp(
torch.arange(0, self.hidden_size, 2, dtype=torch.float32) *
-(math.log(10000.0) / self.hidden_size))
self.max_length = config.max_position_embeddings
# self.max_len = self.chunk_size * (max_len // self.chunk_size) - self.full_chunk_size
@torch.jit.export
def forward(self,
x: torch.Tensor,
offset: int = 0):
"""Compute positional encoding.
Args:
x (torch.Tensor): Input tensor (batch, time, `*`).
Returns:
torch.Tensor: Encoded tensor (batch, time, `*`).
torch.Tensor: Positional embedding tensor (1, time, `*`).
"""
self.pe = self.pe.to(x.device)
x = x * self.xscale
pos_emb = self.pe[:, offset:offset + x.size(1)]
return self.dropout(x), self.dropout(pos_emb)
class WhaleAudioEmbeddings(nn.Module):
def __init__(self, config: WhaleConfig):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.embed_dim = config.hidden_size
self.dropout_rate = config.dropout
self.input_dim = config.input_dim
self.embedding = nn.Sequential(
nn.Linear(config.hidden_size, self.embed_dim),
nn.LayerNorm(self.embed_dim),
nn.Dropout(self.dropout_rate),
nn.ReLU()
)
self.positional_embedding = RelPositionalEncoding(config)
def forward(self, input_features: torch.Tensor) -> torch.Tensor:
hidden_states = self.embedding(input_features)
hidden_states, pos_embeds = self.positional_embedding(hidden_states)
return hidden_states, pos_embeds
class WhaleAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: WhaleConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.use_flash_attn = config.use_flash_attn and has_flash_attn
if config.use_flash_attn and not has_flash_attn:
print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
f' {self.num_heads}).'
)
self.scale = self.head_dim ** -0.5
self.linear_q = nn.Linear(self.embed_dim, self.embed_dim)
self.linear_k = nn.Linear(self.embed_dim, self.embed_dim)
self.linear_v = nn.Linear(self.embed_dim, self.embed_dim)
self.linear_out = nn.Linear(self.embed_dim, self.embed_dim)
self.attn_drop = nn.Dropout(config.attention_dropout)
self.qk_normalization = config.qk_normalization
if self.qk_normalization:
self.q_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.k_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
if self.use_flash_attn:
self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
self.linear_out = nn.Linear(self.embed_dim, self.embed_dim)
self.use_relative_pe = config.use_relative_pe
if self.use_relative_pe:
self.linear_pos = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
# these two learnable bias are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
nn.init.xavier_uniform_(self.pos_bias_u)
nn.init.xavier_uniform_(self.pos_bias_v)
def _naive_attn(self, x, attention_mask=None, pos_embeds=None):
B, N, C = x.shape
q = self.linear_q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
k = self.linear_k(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
v = self.linear_v(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
if self.qk_normalization:
B_, H_, N_, D_ = q.shape
q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
if self.use_relative_pe:
q = q.transpose(1, 2)
batch_size = pos_embeds.size(0)
p = self.linear_pos(pos_embeds.to(q.dtype)).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
query_with_bias_u = (q + self.pos_bias_u.to(q.device)).transpose(1, 2)
query_with_bias_v = (q + self.pos_bias_v.to(q.device)).transpose(1, 2)
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
matrix_ac = torch.matmul(query_with_bias_u, k.transpose(-2, -1))
# compute matrix b and matrix d
matrix_bd = torch.matmul(query_with_bias_v, p.transpose(-2, -1))
attn = (matrix_ac + matrix_bd) * self.scale
else:
attn = ((q * self.scale) @ k.transpose(-2, -1))
if attention_mask is not None:
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attn = attn.masked_fill(~attention_mask.bool(), float("-inf"))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.linear_out(x)
return x
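    # In the relative-position branch above, the attention score decomposes as
    # (q + u) @ k^T + (q + v) @ p^T (Transformer-XL, Sec. 3.3), where u and v are
    # pos_bias_u / pos_bias_v, p is the projected positional embedding, and the
    # sum is scaled by head_dim ** -0.5 before the softmax.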
    def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
        # Only reachable when FlashAttention is installed; fuse the separate
        # q/k/v projections into the (b, s, 3, h, d) layout it expects.
        q = rearrange(self.linear_q(x), 'b s (h d) -> b s h d', h=self.num_heads)
        k = rearrange(self.linear_k(x), 'b s (h d) -> b s h d', h=self.num_heads)
        v = rearrange(self.linear_v(x), 'b s (h d) -> b s h d', h=self.num_heads)
        if self.qk_normalization:
            q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
            k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
        qkv = torch.stack([q, k, v], dim=2)
        context, _ = self.inner_attn(
            qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
        )
        return self.linear_out(rearrange(context, 'b s h d -> b s (h d)'))
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor = None,
pos_embeds: torch.Tensor = None
) -> torch.Tensor:
x = self._naive_attn(hidden_states, attention_mask, pos_embeds) if not self.use_flash_attn else self._flash_attn(hidden_states)
return x
class WhaleMLP(nn.Module):
def __init__(self, config: WhaleConfig):
super().__init__()
self.config = config
self.act = ACT2FN[config.hidden_act]
self.w_1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.w_2 = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.dropout)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.w_1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.w_2(hidden_states)
return hidden_states
class WhaleAudioEncoderLayer(nn.Module):
def __init__(self, config: WhaleConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.intermediate_size = config.intermediate_size
self.dropout_rate = config.dropout
self.normalize_before = config.normalize_before
self.concat_after = config.concat_after
self.attn = WhaleAttention(config)
self.feed_forward = WhaleMLP(config)
self.norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.dropout)
if self.concat_after:
self.concat_linear = nn.Linear(self.embed_dim * 2, self.embed_dim)
else:
self.concat_linear = nn.Identity()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
pos_emb: torch.Tensor,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
"""
Args:
hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
"""
residual = hidden_states
if self.normalize_before:
hidden_states = self.norm1(hidden_states)
if self.concat_after:
hidden_states = torch.cat(
[hidden_states, self.attn(hidden_states, attention_mask, pos_emb)],
dim=-1
)
hidden_states = self.concat_linear(hidden_states) + residual
else:
hidden_states = self.dropout(self.attn(hidden_states, attention_mask, pos_emb)) + residual
if not self.normalize_before:
hidden_states = self.norm1(hidden_states)
residual = hidden_states
if self.normalize_before:
hidden_states = self.norm2(hidden_states)
hidden_states = self.dropout(self.feed_forward(hidden_states)) + residual
if not self.normalize_before:
hidden_states = self.norm2(hidden_states)
return hidden_states
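# With normalize_before=True the layer above uses the pre-norm residual layout
# (x + Attn(LN(x)) followed by x + FFN(LN(x))); otherwise LayerNorm is applied
# after each residual sum. With concat_after=True, the attention output is
# concatenated with its input and projected back to embed_dim by concat_linear
# instead of being passed through dropout.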
class WhaleAudioEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InternEncoderLayer`].
Args:
config (`InternConfig`):
The corresponding vision configuration for the `InternEncoder`.
"""
def __init__(self, config: WhaleConfig):
super().__init__()
self.config = config
# stochastic depth decay rule
self.layers = nn.ModuleList([
WhaleAudioEncoderLayer(config) for idx in range(config.num_hidden_layers)])
self.gradient_checkpointing = True
self.normalize_before = config.normalize_before
if self.normalize_before:
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.FloatTensor] = None,
pos_embeds: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = torch.utils.checkpoint.checkpoint(
encoder_layer,
hidden_states,
attention_mask,
pos_embeds,
)
else:
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
pos_embeds,
)
hidden_states = layer_outputs
if self.normalize_before:
hidden_states = self.layer_norm(hidden_states)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states
)
class WhaleAudioModel(PreTrainedModel):
    main_input_name = 'input_features'
config_class = WhaleConfig
_no_split_modules = ['WhaleAudioEncoderLayer']
def __init__(self, config: WhaleConfig):
super().__init__(config)
self.config = config
# self.embeddings = InternVisionEmbeddings(config)
self.subsampling = WhaleConv2dSubsampling4(config)
self.embeddings = WhaleAudioEmbeddings(config)
self.encoder = WhaleAudioEncoder(config)
def get_input_embeddings(self):
return self.embeddings
def forward(
self,
input_features: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_embeds: Optional[torch.FloatTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_features is None and pixel_embeds is None:
            raise ValueError('You have to specify input_features or pixel_embeds')
if pixel_embeds is not None:
hidden_states = pixel_embeds
else:
if len(input_features.shape) == 3:
input_features, attention_mask = self.subsampling(input_features, attention_mask)
hidden_states, pos_embeds = self.embeddings(input_features)
else:
                raise ValueError(f'wrong input_features size: {input_features.shape}')
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
pos_embeds=pos_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs.last_hidden_state
pooled_output = last_hidden_state[:, 0, :]
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
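# Data-flow sketch for WhaleAudioModel.forward (hypothetical shapes, assuming
# 80-dim fbank inputs and hidden_size=512):
#   input_features (B, T, 80), attention_mask (B, T)
#     -> WhaleConv2dSubsampling4: (B, T', 512) with T' ~= T // 4, mask (B, T')
#     -> WhaleAudioEmbeddings: Linear + LayerNorm + Dropout + ReLU, plus
#        relative positional embeddings
#     -> WhaleAudioEncoder: config.num_hidden_layers pre-norm Transformer layers
#   last_hidden_state (B, T', 512); pooler_output is the first frame.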
{
"crop_size": 448,
"do_center_crop": true,
"do_normalize": true,
"do_resize": true,
"feature_extractor_type": "CLIPFeatureExtractor",
"image_mean": [
0.485,
0.456,
0.406
],
"image_std": [
0.229,
0.224,
0.225
],
"resample": 3,
"size": 448
}
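The JSON above follows the CLIP image-processor schema ("resample": 3 is PIL's BICUBIC). A rough torchvision equivalent, shown for illustration only with the values copied from the config (not part of the original pipeline):
from torchvision import transforms
from torchvision.transforms import InterpolationMode

vit_preprocess = transforms.Compose([
    transforms.Resize(448, interpolation=InterpolationMode.BICUBIC),  # shortest edge -> 448
    transforms.CenterCrop(448),
    transforms.ToTensor(),                                            # scales pixels to [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])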
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for Whale
"""
from typing import List, Optional, Union
import numpy as np
import os
import json
from transformers.audio_utils import mel_filter_bank, spectrogram, window_function
from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
from transformers.feature_extraction_utils import BatchFeature
from transformers.utils import PaddingStrategy, TensorType, is_speech_available, logging
if is_speech_available():
import torch
import torchaudio
import torchaudio.compliance.kaldi as ta_kaldi
logger = logging.get_logger(__name__)
class WhaleFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a WhaleFeatureExtractor for extracting features from raw speech.
This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
This class extracts mel-filter bank features from raw speech using TorchAudio if installed or using numpy
otherwise, and applies utterance-level cepstral mean and variance normalization (CMVN) to the extracted features.
Args:
feature_size (`int`, *optional*, defaults to 80):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized, expressed in hertz (Hz).
num_mel_bins (`int`, *optional*, defaults to 80):
Number of Mel-frequency bins.
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding vectors.
frame_length (`int`, *optional*, defaults to 25):
The length of each frame in milliseconds.
frame_shift (`int`, *optional*, defaults to 10):
The shift between consecutive frames in milliseconds.
dither (`float`, *optional*, defaults to 1.0):
The amount of dithering (random noise) to apply to the signal.
do_ceptral_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features.
normalize_means (`bool`, *optional*, defaults to `True`):
Whether or not to zero-mean normalize the extracted features.
normalize_vars (`bool`, *optional*, defaults to `True`):
Whether or not to unit-variance normalize the extracted features.
cmvn_preload (`bool`, *optional*, defaults to `True`):
Whether or not to preload CMVN statistics from a file.
cmvn_file (`str`, *optional*, defaults to ""):
Path to the file containing precomputed CMVN statistics.
cmvn_means (`list` of `float`, *optional*, defaults to `None`):
Precomputed means for CMVN.
cmvn_istds (`list` of `float`, *optional*, defaults to `None`):
Precomputed inverse standard deviations for CMVN.
"""
model_input_names = ["input_features", "attention_mask"]
def __init__(
self,
feature_size=80,
sampling_rate=16000,
num_mel_bins=80,
padding_value=0.0,
frame_length=25,
frame_shift=10,
dither=1.0,
do_ceptral_normalize=True,
normalize_means=True,
normalize_vars=True,
cmvn_preload=True,
cmvn_file="",
cmvn_means=None,
cmvn_istds=None,
**kwargs,
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.num_mel_bins = num_mel_bins
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.frame_length = frame_length
self.frame_shift = frame_shift
self.dither = dither
self.do_ceptral_normalize = do_ceptral_normalize
self.normalize_means = normalize_means
self.normalize_vars = normalize_vars
self.return_attention_mask = True
self.cmvn_preload = cmvn_preload
self.cmvn_file = cmvn_file
self.cmvn_means = cmvn_means
self.cmvn_istds = cmvn_istds
if self.cmvn_preload:
if self.cmvn_means is not None and self.cmvn_istds is not None:
self.cmvn_means = np.array(self.cmvn_means, dtype=np.float32)
self.cmvn_istds = np.array(self.cmvn_istds, dtype=np.float32)
else:
if self.cmvn_file is None or self.cmvn_file == "":
raise ValueError(f"cmvn_file should be a valid file if cmvn_preload is set True, but we get {self.cmvn_file}.")
                if not os.path.exists(self.cmvn_file):
raise ValueError(f"file {self.cmvn_file} is not found.")
self.cmvn_means, self.cmvn_istds = self._load_json_cmvn(self.cmvn_file)
if not is_speech_available():
mel_filters = mel_filter_bank(
num_frequency_bins=256,
num_mel_filters=self.num_mel_bins,
min_frequency=20,
max_frequency=sampling_rate // 2,
sampling_rate=sampling_rate,
norm=None,
mel_scale="kaldi",
triangularize_in_mel_space=True,
)
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
self.window = window_function(400, "povey", periodic=False)
def _load_json_cmvn(self, json_cmvn_file):
""" Load the json format cmvn stats file and calculate cmvn
Args:
json_cmvn_file: cmvn stats file in json format
Returns:
a numpy array of [means, vars]
"""
with open(json_cmvn_file) as f:
cmvn_stats = json.load(f)
means = np.array(cmvn_stats['mean_stat'])
variances = np.array(cmvn_stats['var_stat'])
count = cmvn_stats['frame_num']
epsilon = 1.0e-6
means = means / count
variances = variances / count - means ** 2
variances[variances < epsilon] = epsilon
istds = 1.0 / np.sqrt(variances)
return means, istds
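    # Worked example for _load_json_cmvn (hypothetical single-dimension stats):
    #   mean_stat = 100.0, var_stat = 260.0, frame_num = 50
    #   mean = 100 / 50 = 2.0
    #   var  = 260 / 50 - 2.0 ** 2 = 1.2
    #   istd = 1 / sqrt(1.2) ~= 0.913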
def _extract_fbank_features(
self,
waveform: np.ndarray,
) -> np.ndarray:
"""
Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
and hence the waveform should not be normalized before feature extraction.
"""
waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers
        if is_speech_available():
            if not isinstance(waveform, torch.Tensor):
                waveform = torch.from_numpy(waveform)
            if waveform.dim() == 1:
                # ta_kaldi.fbank expects a (channels, time) tensor
                waveform = waveform.unsqueeze(0)
features = ta_kaldi.fbank(
waveform,
num_mel_bins=self.num_mel_bins,
sample_frequency=self.sampling_rate,
frame_length=self.frame_length,
frame_shift=self.frame_shift,
dither=self.dither,
energy_floor=0.0,
)
features = features.numpy()
else:
waveform = np.squeeze(waveform)
features = spectrogram(
waveform,
self.window,
frame_length=400,
hop_length=160,
fft_length=512,
power=2.0,
center=False,
preemphasis=0.97,
mel_filters=self.mel_filters,
log_mel="log",
mel_floor=1.192092955078125e-07,
remove_dc_offset=True,
).T
return features
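    # Note: at the default 16 kHz sampling rate, frame_length=25 ms and
    # frame_shift=10 ms correspond to 400-sample windows with a 160-sample hop,
    # matching the hard-coded values in the numpy fallback above.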
@staticmethod
def utterance_cmvn(
x: np.ndarray,
input_length: int,
normalize_means: Optional[bool] = True,
normalize_vars: Optional[bool] = True,
padding_value: float = 0.0,
cmvn_means: Optional[np.ndarray] = None,
cmvn_istds: Optional[np.ndarray] = None,
) -> np.ndarray:
# make sure we normalize float32 arrays
if normalize_means:
mean = cmvn_means if cmvn_means is not None else x[:input_length].mean(axis=0)
x = np.subtract(x, mean)
if normalize_vars:
istd = cmvn_istds if cmvn_istds is not None else 1 / x[:input_length].std(axis=0)
x = np.multiply(x, istd)
if input_length < x.shape[0]:
x[input_length:] = padding_value
# make sure array is in float32
x = x.astype(np.float32)
return x
def normalize(
self, input_features: List[np.ndarray], attention_mask: Optional[np.ndarray] = None
) -> List[np.ndarray]:
lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
return [
self.utterance_cmvn(
x,
n,
self.normalize_means,
self.normalize_vars,
self.padding_value,
self.cmvn_means if self.cmvn_preload else None,
self.cmvn_istds if self.cmvn_preload else None,
)
for x, n in zip(input_features, lengths)
]
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
**kwargs,
) -> BatchFeature:
"""
Main method to featurize and prepare for the model one or several sequence(s).
Args:
raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
stereo, i.e. single float per timestep.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`):
Activates truncation to cut input sequences longer than *max_length* to *max_length*.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific feature_extractor's default.
[What are attention masks?](../glossary#attention-mask)
<Tip>
For Speech2TextTransformer models, `attention_mask` should always be passed for batched inference, to
avoid subtle bugs.
</Tip>
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors.
padding_value (`float`, defaults to 0.0):
The value that is used to fill the padding values / vectors.
"""
if sampling_rate is not None:
if sampling_rate != self.sampling_rate:
logger.warning(
f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
f" {self.sampling_rate} and not {sampling_rate}."
)
                if is_speech_available():
                    # torchaudio's Resample operates on tensors, so convert list or
                    # numpy inputs to float32 tensors before resampling.
                    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=self.sampling_rate)
                    if isinstance(raw_speech, (list, tuple)):
                        raw_speech = [
                            resampler(torch.as_tensor(np.asarray(speech, dtype=np.float32))).numpy()
                            for speech in raw_speech
                        ]
                    else:
                        raw_speech = resampler(torch.as_tensor(np.asarray(raw_speech, dtype=np.float32))).numpy()
                    logger.warning(
                        f"Resampling the input audio to match the model's sampling rate of {self.sampling_rate}."
                    )
else:
logger.warning(
"It is strongly recommended to pass the `sampling_rate` argument to this function. "
"Failing to do so can result in silent errors that might be hard to debug."
)
is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
if is_batched_numpy and len(raw_speech.shape) > 2:
raise ValueError(f"Only mono-channel audio is supported for input to {self}")
is_batched = is_batched_numpy or (
isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
)
if is_batched:
raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
elif not is_batched and not isinstance(raw_speech, np.ndarray):
raw_speech = np.asarray(raw_speech, dtype=np.float32)
elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
raw_speech = raw_speech.astype(np.float32)
# always return batch
if not is_batched:
raw_speech = [raw_speech]
# extract fbank features
features = [self._extract_fbank_features(waveform) for waveform in raw_speech]
# convert into correct format for padding
encoded_inputs = BatchFeature({"input_features": features})
padded_inputs = self.pad(
encoded_inputs,
padding=padding,
max_length=max_length,
truncation=truncation,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
**kwargs,
)
# make sure list is in array format
input_features = padded_inputs.get("input_features")
if isinstance(input_features[0], list):
padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
attention_mask = padded_inputs.get("attention_mask")
if attention_mask is not None:
padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
# Utterance-level cepstral mean and variance normalization
if self.do_ceptral_normalize:
attention_mask = (
np.array(attention_mask, dtype=np.int32)
if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
else None
)
padded_inputs["input_features"] = self.normalize(
padded_inputs["input_features"], attention_mask=attention_mask
)
if return_tensors is not None:
padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
return padded_inputs
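# Usage sketch (illustrative only): featurize one second of dummy 16 kHz mono
# audio, skipping the pretrained CMVN statistics by passing cmvn_preload=False
# so that per-utterance statistics are used instead.
if __name__ == "__main__":
    _extractor = WhaleFeatureExtractor(cmvn_preload=False)
    _dummy_audio = np.random.randn(16000).astype(np.float32)  # 1 s of noise
    _batch = _extractor(_dummy_audio, sampling_rate=16000, padding=True, return_tensors="np")
    print(_batch["input_features"].shape)  # (1, num_frames, 80)
    print(_batch["attention_mask"].shape)  # (1, num_frames)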