Commit 112bf76b authored by chenzk

v1.0
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
import argparse
import os
from vita.model.builder import load_pretrained_model
from vita.util.mm_utils import get_model_name_from_path
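# Merges a LoRA adapter into its base model and saves a standalone checkpoint:
# load_pretrained_model is expected to apply the adapter at --model-path on top of
# --model-base, and the merged weights plus tokenizer are written to --save-model-path.
# Example invocation (script and path names are placeholders):
#   python merge_lora.py --model-path <lora_ckpt> --model-base <base_ckpt> \
#       --model-type mixtral-8x7b --save-model-path <merged_out>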
def merge_lora(args):
model_path = os.path.expanduser(args.model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path, args.model_base, model_name, args.model_type
)
model.save_pretrained(args.save_model_path)
tokenizer.save_pretrained(args.save_model_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, required=True)
parser.add_argument("--model-base", type=str, required=True)
parser.add_argument("--model-type", type=str, required=True)
parser.add_argument("--save-model-path", type=str, required=True)
args = parser.parse_args()
merge_lora(args)
#!/bin/bash
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s2-pretrain_video
mkdir -p ${OUTPUT_DIR_FT}
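# Stage-2 video pretraining on a single machine (8 GPUs) with DeepSpeed ZeRO-3 offload.
# It initializes the multimodal projector from the stage-1 output
# (llava-s1-pretrain_mlp_video/mm_projector.bin). Assumed usage: bash <this_script> <output_dir>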
deepspeed --include localhost:0,1,2,3,4,5,6,7 vita/train/train.py \
--deepspeed ./script/deepspeed/ds_config_zero3_offload2.json \
--model_name_or_path Mixtral-8x7B_modVocab/mg2hg \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--pretrain_mm_mlp_adapter ${OUTPUT_DIR}/llava-s1-pretrain_mlp_video/mm_projector.bin \
--mm_projector_type mlp2x_gelu \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter False \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 6200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log.txt && echo "Done."
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
#export NCCL_IB_SL=3
#export NCCL_CHECKS_DISABLE=1
export NCCL_P2P_DISABLE=0
#export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_DEBUG=INFO
INDEX=5
MASTER_ADDR="10.206.0.199"
# communication on taiji platform
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
--nnodes 6 \
--node_rank $INDEX \
--master_addr $MASTER_ADDR \
--master_port 9999
"
export NCCL_TIMEOUT=25200
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s3-finetune_task
mkdir -p ${OUTPUT_DIR_FT}
torchrun $DISTRIBUTED_ARGS vita/train/train.py \
--deepspeed ./script/deepspeed/zero3.json \
--model_name_or_path VITA_ckpt \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--mm_projector_type mlp2x_gelu \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 2 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 9100 \
--ddp_timeout 25200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log_node_$INDEX.txt && echo "Done."
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
#export NCCL_IB_SL=3
#export NCCL_CHECKS_DISABLE=1
export NCCL_P2P_DISABLE=0
#export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_DEBUG=INFO
INDEX=5
# MASTER_ADDR="10.206.0.199"
MASTER_ADDR="0.0.0.0"
# communication on taiji platform
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr $MASTER_ADDR \
--master_port 9999
"
export NCCL_TIMEOUT=25200
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s3-finetune_task
mkdir -p ${OUTPUT_DIR_FT}
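# Single-node variant: DISTRIBUTED_ARGS above is kept only for reference; the torchrun command
# below hardcodes one node with 8 local processes, so the rendezvous defaults to the local host.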
# torchrun $DISTRIBUTED_ARGS vita/train/train.py \
torchrun --nproc-per-node=8 --nnodes=1 vita/train/train.py \
--deepspeed ./script/deepspeed/zero3.json \
--model_name_or_path VITA/VITA_ckpt \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--mm_projector_type mlp2x_gelu \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 2 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 False \
--model_max_length 9100 \
--ddp_timeout 25200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log_node_$INDEX.txt && echo "Done."
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
#export NCCL_IB_SL=3
#export NCCL_CHECKS_DISABLE=1
export NCCL_P2P_DISABLE=0
#export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_DEBUG=INFO
INDEX=0
MASTER_ADDR="172.17.0.5"
# communication on taiji platform
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
--nnodes 4 \
--node_rank $INDEX \
--master_addr $MASTER_ADDR \
--master_port 9999
"
export NCCL_TIMEOUT=25200
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s2-pretrain_video
mkdir -p ${OUTPUT_DIR_FT}
torchrun $DISTRIBUTED_ARGS vita/train/train.py \
--deepspeed ./script/deepspeed/ds_config_zero3_offload.json \
--model_name_or_path Mixtral-8x7B_modVocab/mg2hg \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--pretrain_mm_mlp_adapter ${OUTPUT_DIR}/llava-s1-pretrain_mlp_video/mm_projector.bin \
--mm_projector_type mlp2x_gelu \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 10 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 2 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 6200 \
--ddp_timeout 25200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log_node_$INDEX.txt && echo "Done."
#!/bin/bash
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s1-pretrain_mlp_video
mkdir -p ${OUTPUT_DIR_FT}
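# Stage-1: pretrain only the vision-to-LLM MLP projector (--tune_mm_mlp_adapter True, which in
# LLaVA-style training keeps the rest of the model frozen), hence the higher 5e-4 learning rate.
# Assumed usage: bash <this_script> <output_dir>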
deepspeed --include localhost:0,1,2,3,4,5,6,7 vita/train/train.py \
--deepspeed ./script/deepspeed/zero3.json \
--model_name_or_path Mixtral-8x7B_modVocab/mg2hg \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--mm_projector_type mlp2x_gelu \
--tune_mm_mlp_adapter True \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--learning_rate 5e-4 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 6200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log.txt && echo "Done."
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
#export NCCL_IB_HCA=mlx5_2:1,mlx5_2:1
#export NCCL_IB_SL=3
#export NCCL_CHECKS_DISABLE=1
export NCCL_P2P_DISABLE=0
#export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_DEBUG=INFO
INDEX=3
MASTER_ADDR="172.17.0.5"
# communication on taiji platform
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
--nnodes 4 \
--node_rank $INDEX \
--master_addr $MASTER_ADDR \
--master_port 9999
"
MODEL_TYPE=mixtral-8x7b
OUTPUT_DIR=$1
OUTPUT_DIR_FT=${OUTPUT_DIR}/llava-s1-pretrain_mlp_video
mkdir -p ${OUTPUT_DIR_FT}
torchrun $DISTRIBUTED_ARGS vita/train/train.py \
--deepspeed ./script/deepspeed/zero3.json \
--model_name_or_path Mixtral-8x7B_modVocab/mg2hg \
--model_type $MODEL_TYPE \
--version mixtral_two \
--dataset_use Pretrain_video \
--vision_tower InternViT-300M-448px \
--mm_projector_type mlp2x_gelu \
--tune_mm_mlp_adapter True \
--audio_encoder audio-encoder-2wh_zh_en_audioset_Mixtral-8x7B_New-base-tunning \
--freeze_audio_encoder True \
--freeze_audio_encoder_adapter True \
--image_aspect_ratio square \
--group_by_modality_length False \
--bf16 True \
--output_dir ${OUTPUT_DIR_FT} \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--learning_rate 5e-4 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 6200 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to none \
2>&1 | tee -a ${OUTPUT_DIR_FT}/log_node_$INDEX.txt && echo "Done."
export PYTHONPATH=./
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
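# expandable_segments mitigates CUDA allocator fragmentation during long multimodal training runs.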
OUTPUT_DIR=outputs/vita_video_audio
bash script/train/finetuneTask_nodes.sh ${OUTPUT_DIR} # original multi-node launch
# bash script/train/finetuneTask_nodes_singlenode.sh ${OUTPUT_DIR} # single-node trial run
import argparse
import os
import time
import numpy as np
import torch
from PIL import Image
from decord import VideoReader, cpu
from vita.constants import (
DEFAULT_AUDIO_TOKEN,
DEFAULT_IMAGE_TOKEN,
DEFAULT_VIDEO_TOKEN,
IGNORE_INDEX,
IMAGE_TOKEN_INDEX,
MAX_IMAGE_LENGTH,
)
from vita.conversation import SeparatorStyle, conv_templates
from vita.model.builder import load_pretrained_model
from vita.util.data_utils_video_audio_neg_patch import dynamic_preprocess
from vita.util.mm_utils import (
KeywordsStoppingCriteria,
get_model_name_from_path,
tokenizer_image_audio_token,
tokenizer_image_token,
)
from vita.util.utils import disable_torch_init
import soundfile as sf
def _get_rawvideo_dec(
video_path,
image_processor,
max_frames=MAX_IMAGE_LENGTH,
min_frames=4,
image_resolution=384,
video_framerate=1,
s=None,
e=None,
image_aspect_ratio="pad",
):
# speed up video decode via decord.
if s is None:
start_time, end_time = None, None
else:
start_time = int(s)
end_time = int(e)
start_time = start_time if start_time >= 0.0 else 0.0
end_time = end_time if end_time >= 0.0 else 0.0
if start_time > end_time:
start_time, end_time = end_time, start_time
elif start_time == end_time:
end_time = start_time + 1
if os.path.exists(video_path):
vreader = VideoReader(video_path, ctx=cpu(0))
else:
print(video_path)
raise FileNotFoundError
fps = vreader.get_avg_fps()
f_start = 0 if start_time is None else int(start_time * fps)
f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
num_frames = f_end - f_start + 1
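    # Sample frames at roughly `video_framerate` fps, then uniformly downsample to at most
    # `max_frames` frames (or uniformly resample up to `min_frames` if too few were found).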
if num_frames > 0:
# T x 3 x H x W
sample_fps = int(video_framerate)
t_stride = int(round(float(fps) / sample_fps))
all_pos = list(range(f_start, f_end + 1, t_stride))
if len(all_pos) > max_frames:
sample_pos = [
all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)
]
elif len(all_pos) < min_frames:
sample_pos = [
all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=min_frames, dtype=int)
]
else:
sample_pos = all_pos
patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
if image_aspect_ratio == "pad":
def expand2square(pil_img, background_color):
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result
patch_images = [
expand2square(i, tuple(int(x * 255) for x in image_processor.image_mean))
for i in patch_images
]
patch_images = [
image_processor.preprocess(i, return_tensors="pt")["pixel_values"][0]
for i in patch_images
]
else:
patch_images = [
image_processor.preprocess(i, return_tensors="pt")["pixel_values"][0]
for i in patch_images
]
patch_images = torch.stack(patch_images)
slice_len = patch_images.shape[0]
return patch_images, slice_len
else:
print("video path: {} error.".format(video_path))
if __name__ == "__main__":
# Initialize the parser
parser = argparse.ArgumentParser(description="Process model and video paths.")
# Add arguments
parser.add_argument("--model_path", type=str, required=True, help="Path to the model directory")
parser.add_argument("--model_base", type=str, default=None)
parser.add_argument("--video_path", type=str, default=None)
parser.add_argument("--image_path", type=str, default=None)
parser.add_argument("--audio_path", type=str, default=None)
parser.add_argument("--model_type", type=str, default="mixtral-8x7b")
parser.add_argument("--conv_mode", type=str, default="mixtral_two")
parser.add_argument("--question", type=str, default="")
# Parse the arguments
args = parser.parse_args()
# Assign arguments to variables
model_path = args.model_path
model_base = args.model_base
video_path = args.video_path
image_path = args.image_path
audio_path = args.audio_path
qs = args.question
    assert (audio_path is None) != (qs == ""), "Exactly one of --audio_path or --question must be provided"
conv_mode = args.conv_mode
# The number of visual tokens varies with the length of the video. "max_frames" is the maximum number of frames.
    # When the video is long, it is uniformly downsampled so that at most "max_frames" frames are kept.
max_frames = MAX_IMAGE_LENGTH # 100
# The number of frames retained per second in the video.
video_framerate = 1
    # Sampling parameters
temperature = 0.01
top_p = None
num_beams = 1
disable_torch_init()
model_path = os.path.expanduser(model_path)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path, model_base, model_name, args.model_type
)
model.resize_token_embeddings(len(tokenizer))
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
vision_tower.load_model()
image_processor = vision_tower.image_processor
audio_encoder = model.get_audio_encoder()
# audio_encoder.to(device="cuda", dtype=torch.float16)
audio_encoder.to(dtype=torch.float16)
audio_processor = audio_encoder.audio_processor
model.eval()
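    # Prepare the audio input: real features when --audio_path is given, otherwise a dummy
    # 400x80 zero mel-spectrogram so the audio branch still receives a tensor.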
if audio_path is not None:
        audio, audio_for_llm_lens = audio_processor.process(audio_path)
# audio, fs = sf.read(os.path.join(audio_path))
# audio = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
audio_length = audio.shape[0]
audio = torch.unsqueeze(audio, dim=0)
audio_length = torch.unsqueeze(torch.tensor(audio_length), dim=0)
audios = dict()
audios["audios"] = audio.half().cuda()
audios["lengths"] = audio_length.half().cuda()
else:
audio = torch.zeros(400, 80)
audio_length = audio.shape[0]
audio = torch.unsqueeze(audio, dim=0)
audio_length = torch.unsqueeze(torch.tensor(audio_length), dim=0)
audios = dict()
audios["audios"] = audio.half().cuda()
audios["lengths"] = audio_length.half().cuda()
# audios = None
# Check if the video exists
if video_path is not None:
video_frames, slice_len = _get_rawvideo_dec(
video_path,
image_processor,
max_frames=max_frames,
video_framerate=video_framerate,
image_aspect_ratio=getattr(model.config, "image_aspect_ratio", None),
)
image_tensor = video_frames.half().cuda()
if audio_path:
qs = DEFAULT_IMAGE_TOKEN * slice_len + "\n" + qs + DEFAULT_AUDIO_TOKEN
else:
qs = DEFAULT_IMAGE_TOKEN * slice_len + "\n" + qs
modality = "video"
elif image_path is not None:
image = Image.open(image_path).convert("RGB")
image, p_num = dynamic_preprocess(
image, min_num=1, max_num=12, image_size=448, use_thumbnail=True
)
assert len(p_num) == 1
image_tensor = model.process_images(image, model.config).to(
dtype=model.dtype, device="cuda"
)
if audio_path:
qs = DEFAULT_IMAGE_TOKEN * p_num[0] + "\n" + qs + DEFAULT_AUDIO_TOKEN
else:
qs = DEFAULT_IMAGE_TOKEN * p_num[0] + "\n" + qs
modality = "image"
else:
image_tensor = torch.zeros((1, 3, 448, 448)).to(dtype=model.dtype, device="cuda")
if audio_path:
qs = qs + DEFAULT_AUDIO_TOKEN
modality = "lang"
conv = conv_templates[conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt(modality)
if audio_path:
input_ids = (
tokenizer_image_audio_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
.unsqueeze(0)
.cuda()
)
else:
input_ids = (
tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
.unsqueeze(0)
.cuda()
)
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
start_time = time.time()
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image_tensor,
audios=audios,
do_sample=False,
temperature=temperature,
top_p=top_p,
num_beams=num_beams,
output_scores=True,
return_dict_in_generate=True,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria],
)
infer_time = time.time() - start_time
output_ids = output_ids.sequences
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids")
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=False)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[: -len(stop_str)]
outputs = outputs.strip()
print(outputs)
print(f"Time consume: {infer_time}")
{
"_name_or_path": "model_weights/Mixtral-8x7B_New/mg2hg",
"architectures": [
"MixtralForConditionalGeneration"
],
"auto_map": {
"AutoConfig": "configuration_mixtral_multimodal.MixtralMultiModalConfig",
"AutoModel": "modeling_mixtral_multimodal.MixtralForConditionalGeneration"
},
"ignore_index": -100,
"model_type": "mixtral_multimodal",
"projector_hidden_act": "gelu",
"audio_projector_hidden_act": "gelu",
"image_token_index": 51000,
"audio_token_index": 51001,
"text_config": {
"architectures": [
"MixtralForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 14336,
"max_position_embeddings": 32768,
"model_type": "mixtral",
"num_attention_heads": 32,
"num_experts_per_tok": 2,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"num_local_experts": 8,
"output_router_logits": false,
"rms_norm_eps": 1e-05,
"rope_theta": 1000000.0,
"router_aux_loss_coef": 0.02,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.41.1",
"use_cache": false,
"vocab_size": 51760
},
"vision_config": {
"architectures": [
"InternVisionModel"
],
"auto_map": {
"AutoConfig": "configuration_intern_vit.InternVisionConfig",
"AutoModel": "modeling_intern_vit.InternVisionModel"
},
"attention_dropout": 0.0,
"drop_path_rate": 0.1,
"dropout": 0.0,
"hidden_act": "gelu",
"hidden_size": 1024,
"image_size": 448,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-06,
"model_type": "intern_vit_6b",
"norm_type": "layer_norm",
"num_attention_heads": 16,
"num_channels": 3,
"num_hidden_layers": 24,
"patch_size": 14,
"qk_normalization": false,
"qkv_bias": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.37.2",
"use_flash_attn": true
},
"audio_config":{
"_name_or_path": "whale_audio_mini",
"architectures": [
"WhaleAudioModel"
],
"attention_dropout": 0.0,
"auto_map": {
"AutoConfig": "configuration_whale.WhaleConfig",
"AutoFeatureExtractor": "processor_whale.WhaleFeatureExtractor",
"AutoModel": "modeling_whale.WhaleAudioModel"
},
"concat_after": false,
"dropout": 0.1,
"hidden_act": "relu",
"hidden_size": 1024,
"initializer_factor": 0.1,
"initializer_range": 0.02,
"input_dim": 80,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 5000,
"model_type": "whale",
"norm_type": "layer_norm",
"normalize_before": true,
"num_attention_heads": 16,
"num_channels": 1,
"num_hidden_layers": 24,
"positional_dropout": 0.1,
"qk_normalization": false,
"qkv_bias": false,
"torch_dtype": "float32",
"transformers_version": "4.42.4",
"use_flash_attn": false,
"use_relative_pe": true
},
"downsample_ratio": 0.5,
"dynamic_image_size": true,
"max_dynamic_patch": 12,
"min_dynamic_patch": 1,
"vision_feature_layer": -1,
"use_thumbnail": true,
"tokenizer_model_max_length": 4600,
"tokenizer_padding_side": "right",
"vocab_size": 51760
}
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class InternVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
instantiate a vision encoder according to the specified arguments, defining the model architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_channels (`int`, *optional*, defaults to 3):
Number of color channels in the input images (e.g., 3 for RGB).
patch_size (`int`, *optional*, defaults to 14):
The size (resolution) of each patch.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
qkv_bias (`bool`, *optional*, defaults to `False`):
Whether to add a bias to the queries and values in the self-attention layers.
hidden_size (`int`, *optional*, defaults to 3200):
Dimensionality of the encoder layers and the pooler layer.
num_attention_heads (`int`, *optional*, defaults to 25):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 12800):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
qk_normalization (`bool`, *optional*, defaults to `True`):
Whether to normalize the queries and keys in the self-attention layers.
num_hidden_layers (`int`, *optional*, defaults to 48):
Number of hidden layers in the Transformer encoder.
use_flash_attn (`bool`, *optional*, defaults to `True`):
Whether to use flash attention mechanism.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
drop_path_rate (`float`, *optional*, defaults to 0.0):
Dropout rate for stochastic depth.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 0.1):
A factor for layer scale.
"""
model_type = 'intern_vit_6b'
def __init__(
self,
num_channels=3,
patch_size=14,
image_size=224,
qkv_bias=False,
hidden_size=3200,
num_attention_heads=25,
intermediate_size=12800,
qk_normalization=True,
num_hidden_layers=48,
use_flash_attn=True,
hidden_act='gelu',
norm_type='rms_norm',
layer_norm_eps=1e-6,
dropout=0.0,
drop_path_rate=0.0,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=0.1,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.dropout = dropout
self.drop_path_rate = drop_path_rate
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.norm_type = norm_type
self.qkv_bias = qkv_bias
self.qk_normalization = qk_normalization
self.use_flash_attn = use_flash_attn
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if 'vision_config' in config_dict:
config_dict = config_dict['vision_config']
if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
)
return cls.from_dict(config_dict, **kwargs)
# coding=utf-8
# Copyright 2024 The Vita team. All rights reserved.
"""Multi-modal Mixtral model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
from transformers.models.auto import CONFIG_MAPPING
from .configuration_intern_vit import InternVisionConfig
from .configuration_whale import WhaleConfig
class MixtralMultiModalConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a `MixtralMultiModal` model. It is used to instantiate a
MixtralMultiModal model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a model with the specified default parameters.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `None`):
The config object or dictionary of the vision backbone.
text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `None`):
The config object or dictionary of the text backbone.
audio_config (`Union[AutoConfig, dict]`, *optional*, defaults to `None`):
The config object or dictionary of the audio backbone.
ignore_index (`int`, *optional*, defaults to -100):
The ignore index for the loss function.
image_token_index (`int`, *optional*, defaults to 32000):
The image token index to encode the image prompt.
projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
The activation function used by the multimodal projector.
vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
The feature selection strategy used to select the vision feature from the vision backbone.
Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
If `"full"`, the full vision features are used.
vision_feature_layer (`int`, *optional*, defaults to -2):
The index of the layer to select the vision feature.
vision_downsample_ratio (`float`, *optional*, defaults to 0.5):
The downsample ratio for the vision features.
dynamic_image_size (`bool`, *optional*, defaults to `True`):
Whether to use dynamic image sizes.
max_dynamic_patch (`int`, *optional*, defaults to 12):
The maximum number of dynamic patches.
min_dynamic_patch (`int`, *optional*, defaults to 1):
The minimum number of dynamic patches.
use_thumbnail (`bool`, *optional*, defaults to `True`):
Whether to use thumbnails.
audio_token_index (`int`, *optional*, defaults to 32000):
The audio token index to encode the audio prompt.
audio_projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
The activation function used by the audio projector.
audio_projector_kernel_size (`int`, *optional*, defaults to 5):
The kernel size used by the audio projector.
audio_downsample_ratio (`float`, *optional*, defaults to 0.125):
The downsample ratio for the audio features.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
Example:
```python
>>> from transformers import MixtralMultiModalConfig, MixtralMultiModalModel
>>> # Initializing a MixtralMultiModal configuration
>>> configuration = MixtralMultiModalConfig()
>>> # Initializing a model from the configuration
>>> model = MixtralMultiModalModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "mixtral_multimodal"
is_composition = False
def __init__(
self,
vision_config=None,
text_config=None,
audio_config=None,
ignore_index=-100,
image_token_index=32000,
projector_hidden_act="gelu",
vision_feature_select_strategy="default",
vision_feature_layer=-2,
vision_downsample_ratio=0.5,
dynamic_image_size=True,
max_dynamic_patch=12,
min_dynamic_patch=1,
use_thumbnail=True,
audio_token_index=32000,
audio_projector_hidden_act="gelu",
audio_projector_kernel_size=5,
audio_downsample_ratio=0.125,
tie_word_embeddings=False,
**kwargs,
):
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
if vision_feature_select_strategy not in ["default", "full"]:
raise ValueError(
"vision_feature_select_strategy should be one of 'default', 'full'."
f"Got: {vision_feature_select_strategy}"
)
self.vision_feature_select_strategy = vision_feature_select_strategy
self.vision_feature_layer = vision_feature_layer
self.vision_downsample_ratio = vision_downsample_ratio
self.dynamic_image_size = dynamic_image_size
self.max_dynamic_patch = max_dynamic_patch
self.min_dynamic_patch = min_dynamic_patch
self.use_thumbnail = use_thumbnail
self.audio_token_index = audio_token_index
self.audio_projector_hidden_act = audio_projector_hidden_act
self.audio_projector_kernel_size = audio_projector_kernel_size
self.audio_downsample_ratio = audio_downsample_ratio
if isinstance(vision_config, dict):
vision_config["model_type"] = (
vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
)
vision_config = InternVisionConfig(**vision_config)
elif vision_config is None:
vision_config = CONFIG_MAPPING["clip_vision_model"](
intermediate_size=4096,
hidden_size=1024,
patch_size=14,
image_size=336,
num_hidden_layers=24,
num_attention_heads=16,
vocab_size=32000,
projection_dim=768,
)
self.vision_config = vision_config
if isinstance(text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["llama"]()
self.text_config = text_config
if isinstance(audio_config, dict):
audio_config["model_type"] = (
audio_config["model_type"] if "model_type" in audio_config else "clip_vision_model"
)
audio_config = WhaleConfig(**audio_config)
self.audio_config = audio_config
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
# --------------------------------------------------------
# Copyright (c)
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Union
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class WhaleConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a `Whale` model. It is used to instantiate a
Whale model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a model with the specified default parameters.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
input_dim (`int`, *optional*, defaults to 80):
The input dimension of the model.
num_channels (`int`, *optional*, defaults to 1):
The number of input channels.
qkv_bias (`bool`, *optional*, defaults to `False`):
Whether to add a bias to the query, key, value projections.
hidden_size (`int`, *optional*, defaults to 1024):
The size of the hidden layers.
num_attention_heads (`int`, *optional*, defaults to 25):
The number of attention heads.
max_position_embeddings (`int`, *optional*, defaults to 5000):
The maximum number of position embeddings.
intermediate_size (`int`, *optional*, defaults to 4096):
The size of the intermediate (feed-forward) layer.
qk_normalization (`bool`, *optional*, defaults to `True`):
Whether to apply normalization to the query and key projections.
num_hidden_layers (`int`, *optional*, defaults to 48):
The number of hidden layers in the model.
use_flash_attn (`bool`, *optional*, defaults to `True`):
Whether to use flash attention.
hidden_act (`str`, *optional*, defaults to `'relu'`):
The activation function to use in the hidden layers.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon value for layer normalization.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the hidden layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the attention layers.
positional_dropout (`float`, *optional*, defaults to 0.0):
The dropout probability for the positional encodings.
normalize_before (`bool`, *optional*, defaults to `True`):
Whether to apply normalization before the attention and feed-forward layers.
concat_after (`bool`, *optional*, defaults to `True`):
Whether to concatenate the attention output with the input before the feed-forward layer.
use_relative_pe (`bool`, *optional*, defaults to `True`):
Whether to use relative position encodings.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 0.1):
A factor for initializing the weights.
Example:
```python
>>> from transformers import WhaleConfig, WhaleModel
>>> # Initializing a Whale configuration
>>> configuration = WhaleConfig()
>>> # Initializing a model from the configuration
>>> model = WhaleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = 'whale'
def __init__(
self,
input_dim=80,
num_channels=1,
qkv_bias=False,
hidden_size=1024,
num_attention_heads=25,
max_position_embeddings=5000,
intermediate_size=4096,
qk_normalization=True,
num_hidden_layers=48,
use_flash_attn=True,
hidden_act='relu',
layer_norm_eps=1e-6,
dropout=0.0,
attention_dropout=0.0,
positional_dropout=0.0,
normalize_before=True,
concat_after=True,
use_relative_pe=True,
initializer_range=0.02,
initializer_factor=0.1,
**kwargs,
):
super().__init__(**kwargs)
self.input_dim = input_dim
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.dropout = dropout
self.num_channels = num_channels
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.positional_dropout = positional_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.qkv_bias = qkv_bias
self.qk_normalization = qk_normalization
self.use_flash_attn = use_flash_attn
self.normalize_before = normalize_before
self.concat_after = concat_after
self.max_position_embeddings = max_position_embeddings
self.use_relative_pe = use_relative_pe
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if 'audio_config' in config_dict:
config_dict = config_dict['audio_config']
if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
)
return cls.from_dict(config_dict, **kwargs)
{
"_name_or_path": "whale_audio_mini",
"auto_map": {
"AutoFeatureExtractor": "processor_whale.WhaleFeatureExtractor"
}
}
{
"auto_map": {
"AutoFeatureExtractor": "processor_whale.WhaleFeatureExtractor"
},
"cmvn_istds": [
0.3329853392031094,
0.304457074540202,
0.29546332871219255,
0.3016748868710893,
0.30146666620931134,
0.29722031038819924,
0.2917111074466677,
0.28384523520067434,
0.28523771805804266,
0.2901033423245173,
0.29489059636859316,
0.29547348893246006,
0.29620672776488977,
0.2956320049313563,
0.29470081603763376,
0.2963227174569345,
0.2976606657847016,
0.3003491761055444,
0.30211458429446625,
0.30174903334867953,
0.30016818970986625,
0.3033775994936976,
0.30356758993511046,
0.30603640492289896,
0.30671985841954447,
0.3069228273662989,
0.30770085196779645,
0.3067656201661381,
0.3055203785780098,
0.30690623983421333,
0.30723413937044297,
0.3088911065771803,
0.3091251382267279,
0.30986769010126597,
0.31000059868830204,
0.30963732259143195,
0.3093967488140671,
0.30918562772813507,
0.30968744324817,
0.3085437993502015,
0.309308051573859,
0.3087313674687873,
0.30814804868295664,
0.30722053416625006,
0.30732656820194293,
0.3064066246045986,
0.30390658225471334,
0.302131011830547,
0.3014575331911756,
0.301449202764865,
0.30039048343978525,
0.29975195531574894,
0.2993214974016792,
0.29809597194189,
0.2950458103872353,
0.29250998818879875,
0.29285432953044965,
0.2928594451679315,
0.2922642564293608,
0.2934287968886421,
0.2929937863211079,
0.2921845930953747,
0.2917417094543235,
0.28991734472060865,
0.2888153105794442,
0.2870270977983177,
0.2843282542200158,
0.2827033299131669,
0.28035104778082265,
0.2782082983359874,
0.27589950120001683,
0.27325842201376005,
0.27104919439201897,
0.2688075817805597,
0.26814315263564775,
0.26998556725462286,
0.269346874312791,
0.2673887565870066,
0.2683233739448121,
0.2702135698992237
],
"cmvn_means": [
11.837255115918403,
12.473204615847946,
13.416767619583318,
14.077409846458519,
14.692713667734644,
15.134646755356338,
15.425053998320841,
15.520304088736482,
15.664980906057181,
15.682885368714361,
15.83134095973795,
15.901056812316575,
16.043105914428832,
16.141928413478638,
16.146063740161384,
16.17268368755442,
16.13231180127601,
16.065540090344545,
16.170683092860383,
15.998216926090535,
15.867837768614727,
16.081028935225024,
15.90913485828459,
16.032066529724602,
15.94857810175373,
16.03539817911192,
15.919972463810511,
16.012130517613077,
15.93573072975294,
15.914797286475908,
15.949416173227279,
15.914241247262952,
15.9205949984345,
15.979177455555364,
15.986889776762691,
16.04603056604172,
16.110854420018935,
16.11681722403251,
16.129875546992444,
16.085759281189265,
16.134709075491045,
16.09818475127177,
16.202892094198077,
16.195676195628295,
16.265984774543206,
16.368600951439756,
16.48524192770144,
16.53072364237602,
16.58613266332892,
16.682058026108336,
16.643586991407417,
16.62329213337083,
16.638263919106894,
16.703993486441295,
16.75845666749587,
16.818435528248443,
16.88729840520967,
16.89038585593233,
16.816687157527294,
16.731004380992307,
16.674947603018126,
16.562815703508104,
16.50694580056838,
16.427151307327705,
16.33695716109585,
16.22435176840036,
16.122595445956836,
16.074572001519112,
16.045862034568927,
15.997705599309137,
15.955502796282088,
15.925529416522258,
15.884868619147634,
15.847951054825177,
15.812488364237238,
15.791251105720136,
15.698867196814575,
15.451057143452907,
15.043111236177761,
14.453490694177178
],
"cmvn_preload": true,
"dither": 1.0,
"do_ceptral_normalize": true,
"feature_extractor_type": "WhaleFeatureExtractor",
"feature_size": 80,
"frame_length": 25,
"frame_shift": 10,
"normalize_means": true,
"normalize_vars": true,
"num_mel_bins": 80,
"padding_side": "right",
"padding_value": 0.0,
"return_attention_mask": true,
"sampling_rate": 16000
}
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from timm.models.layers import DropPath
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (BaseModelOutput,
BaseModelOutputWithPooling)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from .configuration_intern_vit import InternVisionConfig
try:
from .flash_attention import FlashAttention
has_flash_attn = True
except:
print('FlashAttention is not installed.')
has_flash_attn = False
logger = logging.get_logger(__name__)
class InternRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
try:
from apex.normalization import FusedRMSNorm
InternRMSNorm = FusedRMSNorm # noqa
logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
except ImportError:
# using the normal InternRMSNorm
pass
except Exception:
logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
pass
NORM2FN = {
'rms_norm': InternRMSNorm,
'layer_norm': nn.LayerNorm,
}
class InternVisionEmbeddings(nn.Module):
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.class_embedding = nn.Parameter(
torch.randn(1, 1, self.embed_dim),
)
self.patch_embedding = nn.Conv2d(
in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
)
self.num_patches = (self.image_size // self.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
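    # Bicubically resizes the learned patch-position grid so the encoder can handle feature
    # maps whose height/width differ from the pretraining (image_size // patch_size) grid.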
def _get_pos_embed(self, pos_embed, H, W):
target_dtype = pos_embed.dtype
pos_embed = pos_embed.float().reshape(
1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False).\
reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
return pos_embed
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
batch_size, _, height, width = patch_embeds.shape
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
position_embedding = torch.cat([
self.position_embedding[:, :1, :],
self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
], dim=1)
embeddings = embeddings + position_embedding.to(target_dtype)
return embeddings
class InternAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.use_flash_attn = config.use_flash_attn and has_flash_attn
if config.use_flash_attn and not has_flash_attn:
print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
f' {self.num_heads}).'
)
self.scale = self.head_dim ** -0.5
self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
self.attn_drop = nn.Dropout(config.attention_dropout)
self.proj_drop = nn.Dropout(config.dropout)
self.qk_normalization = config.qk_normalization
if self.qk_normalization:
self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
if self.use_flash_attn:
self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
self.proj = nn.Linear(self.embed_dim, self.embed_dim)
def _naive_attn(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
if self.qk_normalization:
B_, H_, N_, D_ = q.shape
q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
attn = ((q * self.scale) @ k.transpose(-2, -1))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
qkv = self.qkv(x)
qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
if self.qk_normalization:
q, k, v = qkv.unbind(2)
q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
qkv = torch.stack([q, k, v], dim=2)
context, _ = self.inner_attn(
qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
)
outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
outs = self.proj_drop(outs)
return outs
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
return x
class InternMLP(nn.Module):
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
self.act = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
class InternVisionEncoderLayer(nn.Module):
def __init__(self, config: InternVisionConfig, drop_path_rate: float):
super().__init__()
self.embed_dim = config.hidden_size
self.intermediate_size = config.intermediate_size
self.norm_type = config.norm_type
self.attn = InternAttention(config)
self.mlp = InternMLP(config)
self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
def forward(
self,
hidden_states: torch.Tensor,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
"""
Args:
hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
"""
hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
return hidden_states
class InternVisionEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InternEncoderLayer`].
Args:
config (`InternConfig`):
The corresponding vision configuration for the `InternEncoder`.
"""
def __init__(self, config: InternVisionConfig):
super().__init__()
self.config = config
# stochastic depth decay rule
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
self.layers = nn.ModuleList([
InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
self.gradient_checkpointing = True
def forward(
self,
inputs_embeds,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = torch.utils.checkpoint.checkpoint(
encoder_layer,
hidden_states)
else:
layer_outputs = encoder_layer(
hidden_states,
)
hidden_states = layer_outputs
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states
)
class InternVisionModel(PreTrainedModel):
main_input_name = 'pixel_values'
config_class = InternVisionConfig
_no_split_modules = ['InternVisionEncoderLayer']
def __init__(self, config: InternVisionConfig):
super().__init__(config)
self.config = config
self.embeddings = InternVisionEmbeddings(config)
self.encoder = InternVisionEncoder(config)
def resize_pos_embeddings(self, old_size, new_size, patch_size):
pos_emb = self.embeddings.position_embedding
_, num_positions, embed_dim = pos_emb.shape
cls_emb = pos_emb[:, :1, :]
pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
self.embeddings.position_embedding = nn.Parameter(pos_emb)
self.embeddings.image_size = new_size
logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
def get_input_embeddings(self):
return self.embeddings
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_embeds: Optional[torch.FloatTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None and pixel_embeds is None:
raise ValueError('You have to specify pixel_values or pixel_embeds')
if pixel_embeds is not None:
hidden_states = pixel_embeds
else:
if len(pixel_values.shape) == 4:
hidden_states = self.embeddings(pixel_values)
else:
raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs.last_hidden_state
pooled_output = last_hidden_state[:, 0, :]
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# coding=utf-8
# Copyright 2023 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Llava model."""
from dataclasses import dataclass
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from transformers import PreTrainedModel
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache
from transformers.modeling_outputs import (BaseModelOutput,
BaseModelOutputWithPooling)
from transformers.utils import (
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from transformers.models.auto import AutoModel, AutoModelForCausalLM
from .configuration_whale import WhaleConfig
from einops import rearrange
import torch.nn.functional as F
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "WhaleConfig"
# try:
# from .flash_attention import FlashAttention
# has_flash_attn = True
# except:
# print('FlashAttention is not installed.')
# has_flash_attn = False
has_flash_attn = False
class WhaleConv2dSubsampling4(nn.Module):
"""Convolutional 2D subsampling (to 1/4 length).
Args:
idim (int): Input dimension.
odim (int): Output dimension.
dropout_rate (float): Dropout rate.
"""
def __init__(self, config: WhaleConfig):
"""Construct an Conv2dSubsampling4 object."""
super().__init__()
self.config = config
self.in_channels = config.num_channels
self.hidden_size = config.hidden_size
self.input_dim = config.input_dim
self.conv_in = nn.Sequential(
nn.Conv2d(
in_channels=self.in_channels, out_channels=self.hidden_size, kernel_size=3, stride=2
),
nn.ReLU(),
nn.Conv2d(
in_channels=self.hidden_size, out_channels=self.hidden_size, kernel_size=3, stride=2
),
nn.ReLU(),
)
self.intermediate_size = self.hidden_size * (((self.input_dim - 1) // 2 - 1) // 2)
self.out = nn.Linear(self.intermediate_size, self.hidden_size)
# The right context for every conv layer is computed by:
# (kernel_size - 1) * frame_rate_of_this_layer
self.subsampling_rate = 4
# 6 = (3 - 1) * 1 + (3 - 1) * 2
self.right_context = 6
    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Subsample x.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, time).
        Returns:
            torch.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 4.
            torch.Tensor: Subsampled mask (#batch, time'),
                where time' = time // 4.
        """
x = x.unsqueeze(1) # (b, c=1, t, f)
x = self.conv_in(x)
b, c, t, f = x.size()
x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
return x, x_mask[:, 2::2][:, 2::2]
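# Shape walkthrough for the subsampling above (assuming input_dim=80 fbank
# features and hidden_size=512; illustrative numbers only):
#   x: (B, T, 80) -> unsqueeze -> (B, 1, T, 80)
#   conv1 (kernel 3, stride 2): (B, 512, (T-1)//2, 39)   since (80-1)//2 = 39
#   conv2 (kernel 3, stride 2): (B, 512, ((T-1)//2-1)//2, 19)
#   flatten the frequency axis and project: (B, ~T//4, 512),
#   with intermediate_size = 512 * 19 = 9728 feeding the final Linear.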
class WhalePositionalEncoding(torch.nn.Module):
"""Positional encoding.
:param int d_model: embedding dim
:param float dropout_rate: dropout rate
:param int max_len: maximum input length
PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
"""
def __init__(self, config: WhaleConfig):
"""Construct an PositionalEncoding object."""
super().__init__()
self.d_model = config.hidden_size
self.xscale = math.sqrt(self.d_model)
self.dropout = torch.nn.Dropout(p=config.dropout)
self.max_len = config.max_position_embeddings
self.pe = torch.zeros(self.max_len, self.d_model)
position = torch.arange(0, self.max_len,
dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.d_model, 2, dtype=torch.float32) *
-(math.log(10000.0) / self.d_model))
self.pe[:, 0::2] = torch.sin(position * div_term)
self.pe[:, 1::2] = torch.cos(position * div_term)
self.pe = self.pe.unsqueeze(0)
def forward(self,
x: torch.Tensor,
offset: int = 0):
"""Add positional encoding.
Args:
x (torch.Tensor): Input. Its shape is (batch, time, ...)
offset (int): position offset
Returns:
torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
torch.Tensor: for compatibility to RelPositionalEncoding
"""
assert offset + x.size(1) < self.max_len
self.pe = self.pe.to(x.device)
pos_emb = self.pe[:, offset:offset + x.size(1)]
x = x * self.xscale + pos_emb
return self.dropout(x), self.dropout(pos_emb)
    def position_encoding(self, offset: int, size: int):
        """Get the positional encoding in a streaming fashion.
        Note: in the non-streaming case dropout is applied only once over the
        whole utterance, whereas in a streaming scenario this function is
        called several times with increasing input size, so dropout is
        applied several times as well.
        Args:
            offset (int): start offset
            size (int): required size of position encoding
        Returns:
            torch.Tensor: Corresponding encoding
        """
assert offset + size < self.max_len
return self.dropout(self.pe[:, offset:offset + size])
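# Worked example of the sinusoid table above (hypothetical d_model = 4):
#   div_term = exp([0, 2] * -ln(10000) / 4) = [1.0, 0.01]
#   pe[1]    = [sin(1 * 1.0), cos(1 * 1.0), sin(1 * 0.01), cos(1 * 0.01)]
#            ~= [0.8415, 0.5403, 0.0100, 1.0000]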
class RelPositionalEncoding(WhalePositionalEncoding):
"""Relative positional encoding module.
See : Appendix B in https://arxiv.org/abs/1901.02860
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int): Maximum input length.
"""
def __init__(self, config: WhaleConfig):
"""Initialize class."""
super().__init__(config)
self.hidden_size = config.hidden_size
# self.chunk_size = chunk_size
# self.left_chunks = left_chunks
# self.full_chunk_size = (self.left_chunks + 1) * self.chunk_size
self.div_term = torch.exp(
torch.arange(0, self.hidden_size, 2, dtype=torch.float32) *
-(math.log(10000.0) / self.hidden_size))
self.max_length = config.max_position_embeddings
# self.max_len = self.chunk_size * (max_len // self.chunk_size) - self.full_chunk_size
@torch.jit.export
def forward(self,
x: torch.Tensor,
offset: int = 0):
"""Compute positional encoding.
Args:
x (torch.Tensor): Input tensor (batch, time, `*`).
Returns:
torch.Tensor: Encoded tensor (batch, time, `*`).
torch.Tensor: Positional embedding tensor (1, time, `*`).
"""
self.pe = self.pe.to(x.device)
x = x * self.xscale
pos_emb = self.pe[:, offset:offset + x.size(1)]
return self.dropout(x), self.dropout(pos_emb)
class WhaleAudioEmbeddings(nn.Module):
def __init__(self, config: WhaleConfig):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.embed_dim = config.hidden_size
self.dropout_rate = config.dropout
self.input_dim = config.input_dim
self.embedding = nn.Sequential(
nn.Linear(config.hidden_size, self.embed_dim),
nn.LayerNorm(self.embed_dim),
nn.Dropout(self.dropout_rate),
nn.ReLU()
)
self.positional_embedding = RelPositionalEncoding(config)
def forward(self, input_features: torch.Tensor) -> torch.Tensor:
hidden_states = self.embedding(input_features)
hidden_states, pos_embeds = self.positional_embedding(hidden_states)
return hidden_states, pos_embeds
class WhaleAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: WhaleConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.use_flash_attn = config.use_flash_attn and has_flash_attn
if config.use_flash_attn and not has_flash_attn:
print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
f' {self.num_heads}).'
)
self.scale = self.head_dim ** -0.5
self.linear_q = nn.Linear(self.embed_dim, self.embed_dim)
self.linear_k = nn.Linear(self.embed_dim, self.embed_dim)
self.linear_v = nn.Linear(self.embed_dim, self.embed_dim)
self.linear_out = nn.Linear(self.embed_dim, self.embed_dim)
self.attn_drop = nn.Dropout(config.attention_dropout)
self.qk_normalization = config.qk_normalization
if self.qk_normalization:
self.q_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.k_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
if self.use_flash_attn:
self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
self.linear_out = nn.Linear(self.embed_dim, self.embed_dim)
self.use_relative_pe = config.use_relative_pe
if self.use_relative_pe:
self.linear_pos = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
# these two learnable bias are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim))
nn.init.xavier_uniform_(self.pos_bias_u)
nn.init.xavier_uniform_(self.pos_bias_v)
def _naive_attn(self, x, attention_mask=None, pos_embeds=None):
B, N, C = x.shape
q = self.linear_q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
k = self.linear_k(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
v = self.linear_v(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
if self.qk_normalization:
B_, H_, N_, D_ = q.shape
q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
if self.use_relative_pe:
q = q.transpose(1, 2)
batch_size = pos_embeds.size(0)
p = self.linear_pos(pos_embeds.to(q.dtype)).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
query_with_bias_u = (q + self.pos_bias_u.to(q.device)).transpose(1, 2)
query_with_bias_v = (q + self.pos_bias_v.to(q.device)).transpose(1, 2)
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
matrix_ac = torch.matmul(query_with_bias_u, k.transpose(-2, -1))
# compute matrix b and matrix d
matrix_bd = torch.matmul(query_with_bias_v, p.transpose(-2, -1))
attn = (matrix_ac + matrix_bd) * self.scale
else:
attn = ((q * self.scale) @ k.transpose(-2, -1))
if attention_mask is not None:
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attn = attn.masked_fill(~attention_mask.bool(), float("-inf"))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.linear_out(x)
return x
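    # In the relative-position branch above, the attention score decomposes as
    # (q + u) @ k^T + (q + v) @ p^T (Transformer-XL, Sec. 3.3), where u and v are
    # pos_bias_u / pos_bias_v, p is the projected positional embedding, and the
    # sum is scaled by head_dim ** -0.5 before the softmax.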
    def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
        # Only reachable when FlashAttention is installed; fuse the separate
        # q/k/v projections into the (b, s, 3, h, d) layout it expects.
        q = rearrange(self.linear_q(x), 'b s (h d) -> b s h d', h=self.num_heads)
        k = rearrange(self.linear_k(x), 'b s (h d) -> b s h d', h=self.num_heads)
        v = rearrange(self.linear_v(x), 'b s (h d) -> b s h d', h=self.num_heads)
        if self.qk_normalization:
            q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
            k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
        qkv = torch.stack([q, k, v], dim=2)
        context, _ = self.inner_attn(
            qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
        )
        return self.linear_out(rearrange(context, 'b s h d -> b s (h d)'))
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor = None,
pos_embeds: torch.Tensor = None
) -> torch.Tensor:
x = self._naive_attn(hidden_states, attention_mask, pos_embeds) if not self.use_flash_attn else self._flash_attn(hidden_states)
return x
class WhaleMLP(nn.Module):
def __init__(self, config: WhaleConfig):
super().__init__()
self.config = config
self.act = ACT2FN[config.hidden_act]
self.w_1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.w_2 = nn.Linear(config.intermediate_size, config.hidden_size)
self.dropout = nn.Dropout(config.dropout)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.w_1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.w_2(hidden_states)
return hidden_states
class WhaleAudioEncoderLayer(nn.Module):
def __init__(self, config: WhaleConfig):
super().__init__()
self.embed_dim = config.hidden_size
self.intermediate_size = config.intermediate_size
self.dropout_rate = config.dropout
self.normalize_before = config.normalize_before
self.concat_after = config.concat_after
self.attn = WhaleAttention(config)
self.feed_forward = WhaleMLP(config)
self.norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.dropout)
if self.concat_after:
self.concat_linear = nn.Linear(self.embed_dim * 2, self.embed_dim)
else:
self.concat_linear = nn.Identity()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
pos_emb: torch.Tensor,
) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
"""
Args:
hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
"""
residual = hidden_states
if self.normalize_before:
hidden_states = self.norm1(hidden_states)
if self.concat_after:
hidden_states = torch.cat(
[hidden_states, self.attn(hidden_states, attention_mask, pos_emb)],
dim=-1
)
hidden_states = self.concat_linear(hidden_states) + residual
else:
hidden_states = self.dropout(self.attn(hidden_states, attention_mask, pos_emb)) + residual
if not self.normalize_before:
hidden_states = self.norm1(hidden_states)
residual = hidden_states
if self.normalize_before:
hidden_states = self.norm2(hidden_states)
hidden_states = self.dropout(self.feed_forward(hidden_states)) + residual
if not self.normalize_before:
hidden_states = self.norm2(hidden_states)
return hidden_states
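# With normalize_before=True the layer above uses the pre-norm residual layout
# (x + Attn(LN(x)) followed by x + FFN(LN(x))); otherwise LayerNorm is applied
# after each residual sum. With concat_after=True, the attention output is
# concatenated with its input and projected back to embed_dim by concat_linear
# instead of being passed through dropout.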
class WhaleAudioEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InternEncoderLayer`].
Args:
config (`InternConfig`):
The corresponding vision configuration for the `InternEncoder`.
"""
def __init__(self, config: WhaleConfig):
super().__init__()
self.config = config
# stochastic depth decay rule
self.layers = nn.ModuleList([
WhaleAudioEncoderLayer(config) for idx in range(config.num_hidden_layers)])
self.gradient_checkpointing = True
self.normalize_before = config.normalize_before
if self.normalize_before:
self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.FloatTensor] = None,
pos_embeds: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Embedded representation of the inputs. Should be float, not int tokens.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
hidden_states = inputs_embeds
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
layer_outputs = torch.utils.checkpoint.checkpoint(
encoder_layer,
hidden_states,
attention_mask,
pos_embeds,
)
else:
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
pos_embeds,
)
hidden_states = layer_outputs
if self.normalize_before:
hidden_states = self.layer_norm(hidden_states)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states
)
class WhaleAudioModel(PreTrainedModel):
    main_input_name = 'input_features'
config_class = WhaleConfig
_no_split_modules = ['WhaleAudioEncoderLayer']
def __init__(self, config: WhaleConfig):
super().__init__(config)
self.config = config
# self.embeddings = InternVisionEmbeddings(config)
self.subsampling = WhaleConv2dSubsampling4(config)
self.embeddings = WhaleAudioEmbeddings(config)
self.encoder = WhaleAudioEncoder(config)
def get_input_embeddings(self):
return self.embeddings
def forward(
self,
input_features: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_embeds: Optional[torch.FloatTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_features is None and pixel_embeds is None:
            raise ValueError('You have to specify input_features or pixel_embeds')
if pixel_embeds is not None:
hidden_states = pixel_embeds
else:
if len(input_features.shape) == 3:
input_features, attention_mask = self.subsampling(input_features, attention_mask)
hidden_states, pos_embeds = self.embeddings(input_features)
else:
                raise ValueError(f'wrong input_features size: {input_features.shape}')
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
pos_embeds=pos_embeds,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs.last_hidden_state
pooled_output = last_hidden_state[:, 0, :]
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
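# Data-flow sketch for WhaleAudioModel.forward (hypothetical shapes, assuming
# 80-dim fbank inputs and hidden_size=512):
#   input_features (B, T, 80), attention_mask (B, T)
#     -> WhaleConv2dSubsampling4: (B, T', 512) with T' ~= T // 4, mask (B, T')
#     -> WhaleAudioEmbeddings: Linear + LayerNorm + Dropout + ReLU, plus
#        relative positional embeddings
#     -> WhaleAudioEncoder: config.num_hidden_layers pre-norm Transformer layers
#   last_hidden_state (B, T', 512); pooler_output is the first frame.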
{
"crop_size": 448,
"do_center_crop": true,
"do_normalize": true,
"do_resize": true,
"feature_extractor_type": "CLIPFeatureExtractor",
"image_mean": [
0.485,
0.456,
0.406
],
"image_std": [
0.229,
0.224,
0.225
],
"resample": 3,
"size": 448
}
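The JSON above follows the CLIP image-processor schema ("resample": 3 is PIL's BICUBIC). A rough torchvision equivalent, shown for illustration only with the values copied from the config (not part of the original pipeline):
from torchvision import transforms
from torchvision.transforms import InterpolationMode

vit_preprocess = transforms.Compose([
    transforms.Resize(448, interpolation=InterpolationMode.BICUBIC),  # shortest edge -> 448
    transforms.CenterCrop(448),
    transforms.ToTensor(),                                            # scales pixels to [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])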
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for Whale
"""
from typing import List, Optional, Union
import numpy as np
import os
import json
from transformers.audio_utils import mel_filter_bank, spectrogram, window_function
from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
from transformers.feature_extraction_utils import BatchFeature
from transformers.utils import PaddingStrategy, TensorType, is_speech_available, logging
if is_speech_available():
import torch
import torchaudio
import torchaudio.compliance.kaldi as ta_kaldi
logger = logging.get_logger(__name__)
class WhaleFeatureExtractor(SequenceFeatureExtractor):
r"""
Constructs a WhaleFeatureExtractor for extracting features from raw speech.
This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the main methods. Users
should refer to this superclass for more information regarding those methods.
This class extracts mel-filter bank features from raw speech using TorchAudio if installed or using numpy
otherwise, and applies utterance-level cepstral mean and variance normalization (CMVN) to the extracted features.
Args:
feature_size (`int`, *optional*, defaults to 80):
The feature dimension of the extracted features.
sampling_rate (`int`, *optional*, defaults to 16000):
The sampling rate at which the audio files should be digitalized, expressed in hertz (Hz).
num_mel_bins (`int`, *optional*, defaults to 80):
Number of Mel-frequency bins.
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding vectors.
frame_length (`int`, *optional*, defaults to 25):
The length of each frame in milliseconds.
frame_shift (`int`, *optional*, defaults to 10):
The shift between consecutive frames in milliseconds.
dither (`float`, *optional*, defaults to 1.0):
The amount of dithering (random noise) to apply to the signal.
do_ceptral_normalize (`bool`, *optional*, defaults to `True`):
Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features.
normalize_means (`bool`, *optional*, defaults to `True`):
Whether or not to zero-mean normalize the extracted features.
normalize_vars (`bool`, *optional*, defaults to `True`):
Whether or not to unit-variance normalize the extracted features.
cmvn_preload (`bool`, *optional*, defaults to `True`):
Whether or not to preload CMVN statistics from a file.
cmvn_file (`str`, *optional*, defaults to ""):
Path to the file containing precomputed CMVN statistics.
cmvn_means (`list` of `float`, *optional*, defaults to `None`):
Precomputed means for CMVN.
cmvn_istds (`list` of `float`, *optional*, defaults to `None`):
Precomputed inverse standard deviations for CMVN.
"""
model_input_names = ["input_features", "attention_mask"]
def __init__(
self,
feature_size=80,
sampling_rate=16000,
num_mel_bins=80,
padding_value=0.0,
frame_length=25,
frame_shift=10,
dither=1.0,
do_ceptral_normalize=True,
normalize_means=True,
normalize_vars=True,
cmvn_preload=True,
cmvn_file="",
cmvn_means=None,
cmvn_istds=None,
**kwargs,
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.num_mel_bins = num_mel_bins
self.sampling_rate = sampling_rate
self.padding_value = padding_value
self.frame_length = frame_length
self.frame_shift = frame_shift
self.dither = dither
self.do_ceptral_normalize = do_ceptral_normalize
self.normalize_means = normalize_means
self.normalize_vars = normalize_vars
self.return_attention_mask = True
self.cmvn_preload = cmvn_preload
self.cmvn_file = cmvn_file
self.cmvn_means = cmvn_means
self.cmvn_istds = cmvn_istds
if self.cmvn_preload:
if self.cmvn_means is not None and self.cmvn_istds is not None:
self.cmvn_means = np.array(self.cmvn_means, dtype=np.float32)
self.cmvn_istds = np.array(self.cmvn_istds, dtype=np.float32)
else:
if self.cmvn_file is None or self.cmvn_file == "":
raise ValueError(f"cmvn_file should be a valid file if cmvn_preload is set True, but we get {self.cmvn_file}.")
                if not os.path.exists(self.cmvn_file):
raise ValueError(f"file {self.cmvn_file} is not found.")
self.cmvn_means, self.cmvn_istds = self._load_json_cmvn(self.cmvn_file)
if not is_speech_available():
mel_filters = mel_filter_bank(
num_frequency_bins=256,
num_mel_filters=self.num_mel_bins,
min_frequency=20,
max_frequency=sampling_rate // 2,
sampling_rate=sampling_rate,
norm=None,
mel_scale="kaldi",
triangularize_in_mel_space=True,
)
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
self.window = window_function(400, "povey", periodic=False)
def _load_json_cmvn(self, json_cmvn_file):
""" Load the json format cmvn stats file and calculate cmvn
Args:
json_cmvn_file: cmvn stats file in json format
Returns:
a numpy array of [means, vars]
"""
with open(json_cmvn_file) as f:
cmvn_stats = json.load(f)
means = np.array(cmvn_stats['mean_stat'])
variances = np.array(cmvn_stats['var_stat'])
count = cmvn_stats['frame_num']
epsilon = 1.0e-6
means = means / count
variances = variances / count - means ** 2
variances[variances < epsilon] = epsilon
istds = 1.0 / np.sqrt(variances)
return means, istds
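    # Worked example for _load_json_cmvn (hypothetical single-dimension stats):
    #   mean_stat = 100.0, var_stat = 260.0, frame_num = 50
    #   mean = 100 / 50 = 2.0
    #   var  = 260 / 50 - 2.0 ** 2 = 1.2
    #   istd = 1 / sqrt(1.2) ~= 0.913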
def _extract_fbank_features(
self,
waveform: np.ndarray,
) -> np.ndarray:
"""
Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
and hence the waveform should not be normalized before feature extraction.
"""
waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers
        if is_speech_available():
            if not isinstance(waveform, torch.Tensor):
                waveform = torch.from_numpy(waveform)
            if waveform.dim() == 1:
                # ta_kaldi.fbank expects a (channels, time) tensor
                waveform = waveform.unsqueeze(0)
features = ta_kaldi.fbank(
waveform,
num_mel_bins=self.num_mel_bins,
sample_frequency=self.sampling_rate,
frame_length=self.frame_length,
frame_shift=self.frame_shift,
dither=self.dither,
energy_floor=0.0,
)
features = features.numpy()
else:
waveform = np.squeeze(waveform)
features = spectrogram(
waveform,
self.window,
frame_length=400,
hop_length=160,
fft_length=512,
power=2.0,
center=False,
preemphasis=0.97,
mel_filters=self.mel_filters,
log_mel="log",
mel_floor=1.192092955078125e-07,
remove_dc_offset=True,
).T
return features
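    # Note: at the default 16 kHz sampling rate, frame_length=25 ms and
    # frame_shift=10 ms correspond to 400-sample windows with a 160-sample hop,
    # matching the hard-coded values in the numpy fallback above.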
@staticmethod
def utterance_cmvn(
x: np.ndarray,
input_length: int,
normalize_means: Optional[bool] = True,
normalize_vars: Optional[bool] = True,
padding_value: float = 0.0,
cmvn_means: Optional[np.ndarray] = None,
cmvn_istds: Optional[np.ndarray] = None,
) -> np.ndarray:
# make sure we normalize float32 arrays
if normalize_means:
mean = cmvn_means if cmvn_means is not None else x[:input_length].mean(axis=0)
x = np.subtract(x, mean)
if normalize_vars:
istd = cmvn_istds if cmvn_istds is not None else 1 / x[:input_length].std(axis=0)
x = np.multiply(x, istd)
if input_length < x.shape[0]:
x[input_length:] = padding_value
# make sure array is in float32
x = x.astype(np.float32)
return x
def normalize(
self, input_features: List[np.ndarray], attention_mask: Optional[np.ndarray] = None
) -> List[np.ndarray]:
lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
return [
self.utterance_cmvn(
x,
n,
self.normalize_means,
self.normalize_vars,
self.padding_value,
self.cmvn_means if self.cmvn_preload else None,
self.cmvn_istds if self.cmvn_preload else None,
)
for x, n in zip(input_features, lengths)
]
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
truncation: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
**kwargs,
) -> BatchFeature:
"""
Main method to featurize and prepare for the model one or several sequence(s).
Args:
raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
stereo, i.e. single float per timestep.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`):
Activates truncation to cut input sequences longer than *max_length* to *max_length*.
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific feature_extractor's default.
[What are attention masks?](../glossary#attention-mask)
<Tip>
For Speech2TextTransformer models, `attention_mask` should always be passed for batched inference, to
avoid subtle bugs.
</Tip>
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
sampling_rate (`int`, *optional*):
The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
`sampling_rate` at the forward call to prevent silent errors.
padding_value (`float`, defaults to 0.0):
The value that is used to fill the padding values / vectors.
"""
if sampling_rate is not None:
if sampling_rate != self.sampling_rate:
logger.warning(
f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
f" {self.sampling_rate} and not {sampling_rate}."
)
                if is_speech_available():
                    # torchaudio's Resample operates on tensors, so convert list or
                    # numpy inputs to float32 tensors before resampling.
                    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=self.sampling_rate)
                    if isinstance(raw_speech, (list, tuple)):
                        raw_speech = [
                            resampler(torch.as_tensor(np.asarray(speech, dtype=np.float32))).numpy()
                            for speech in raw_speech
                        ]
                    else:
                        raw_speech = resampler(torch.as_tensor(np.asarray(raw_speech, dtype=np.float32))).numpy()
                    logger.warning(
                        f"Resampling the input audio to match the model's sampling rate of {self.sampling_rate}."
                    )
else:
logger.warning(
"It is strongly recommended to pass the `sampling_rate` argument to this function. "
"Failing to do so can result in silent errors that might be hard to debug."
)
is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
if is_batched_numpy and len(raw_speech.shape) > 2:
raise ValueError(f"Only mono-channel audio is supported for input to {self}")
is_batched = is_batched_numpy or (
isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
)
if is_batched:
raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
elif not is_batched and not isinstance(raw_speech, np.ndarray):
raw_speech = np.asarray(raw_speech, dtype=np.float32)
elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
raw_speech = raw_speech.astype(np.float32)
# always return batch
if not is_batched:
raw_speech = [raw_speech]
# extract fbank features
features = [self._extract_fbank_features(waveform) for waveform in raw_speech]
# convert into correct format for padding
encoded_inputs = BatchFeature({"input_features": features})
padded_inputs = self.pad(
encoded_inputs,
padding=padding,
max_length=max_length,
truncation=truncation,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
**kwargs,
)
# make sure list is in array format
input_features = padded_inputs.get("input_features")
if isinstance(input_features[0], list):
padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
attention_mask = padded_inputs.get("attention_mask")
if attention_mask is not None:
padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
# Utterance-level cepstral mean and variance normalization
if self.do_ceptral_normalize:
attention_mask = (
np.array(attention_mask, dtype=np.int32)
if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
else None
)
padded_inputs["input_features"] = self.normalize(
padded_inputs["input_features"], attention_mask=attention_mask
)
if return_tensors is not None:
padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
return padded_inputs
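# Usage sketch (illustrative only): featurize one second of dummy 16 kHz mono
# audio, skipping the pretrained CMVN statistics by passing cmvn_preload=False
# so that per-utterance statistics are used instead.
if __name__ == "__main__":
    _extractor = WhaleFeatureExtractor(cmvn_preload=False)
    _dummy_audio = np.random.randn(16000).astype(np.float32)  # 1 s of noise
    _batch = _extractor(_dummy_audio, sampling_rate=16000, padding=True, return_tensors="np")
    print(_batch["input_features"].shape)  # (1, num_frames, 80)
    print(_batch["attention_mask"].shape)  # (1, num_frames)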