Initial commit

be78f35d · wanglch · be78f35d · be78f35d · be78f35d · be78f35d
Commit be78f35d authored Jan 15, 2025 by wanglch
11 changed files
--- a/eval_audio/whisper_normalizer/basic.py
+++ b/eval_audio/whisper_normalizer/basic.py
+import re
+import unicodedata
+
+import regex
+
+# Non-ASCII letters that are not separated by "NFKD" normalization, mapped by
+# hand to an ASCII replacement. Looked up per character by
+# remove_symbols_and_diacritics.
+ADDITIONAL_DIACRITICS = {
+    "œ": "oe",
+    "Œ": "OE",
+    "ø": "o",
+    "Ø": "O",
+    "æ": "ae",
+    "Æ": "AE",
+    "ß": "ss",
+    "ẞ": "SS",
+    "đ": "d",
+    "Đ": "D",
+    "ð": "d",
+    "Ð": "D",
+    "þ": "th",
+    "Þ": "th",  # NOTE(review): only uppercase key mapped to lowercase output - confirm intended
+    "ł": "l",
+    "Ł": "L",
+}
+
+
+def remove_symbols_and_diacritics(s: str, keep=""):
+    """
+    Strip diacritics and blank out markers, symbols and punctuation.
+
+    The input is NFKD-normalized and then handled one character at a time:
+    characters contained in `keep` pass through unchanged, characters listed
+    in ADDITIONAL_DIACRITICS are swapped for their manual mapping, combining
+    marks (category 'Mn') are dropped, any other character whose category
+    starts with 'M', 'S' or 'P' becomes a space, and the rest is kept as is.
+    """
+    pieces = []
+    for ch in unicodedata.normalize("NFKD", s):
+        if ch in keep:
+            pieces.append(ch)
+        elif ch in ADDITIONAL_DIACRITICS:
+            pieces.append(ADDITIONAL_DIACRITICS[ch])
+        else:
+            category = unicodedata.category(ch)
+            if category == "Mn":
+                # combining mark: contributes nothing to the output
+                continue
+            pieces.append(" " if category[0] in "MSP" else ch)
+    return "".join(pieces)
+
+
+def remove_symbols(s: str):
+    """
+    Blank out markers, symbols and punctuation while leaving diacritics alone.
+
+    The input is NFKC-normalized; every character whose Unicode category
+    starts with 'M', 'S' or 'P' is replaced by a single space.
+    """
+
+    def _blank_or_keep(ch):
+        return " " if unicodedata.category(ch)[0] in "MSP" else ch
+
+    return "".join(map(_blank_or_keep, unicodedata.normalize("NFKC", s)))
+
+
+class BasicTextNormalizer:
+    """
+    Callable that lower-cases text, drops bracketed and parenthesized spans,
+    cleans symbols and collapses whitespace.
+
+    With `remove_diacritics` set, cleaning is done by
+    remove_symbols_and_diacritics, otherwise by remove_symbols. With
+    `split_letters` set, the cleaned text is re-joined with a space between
+    every extended grapheme cluster.
+    """
+
+    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
+        if remove_diacritics:
+            self.clean = remove_symbols_and_diacritics
+        else:
+            self.clean = remove_symbols
+        self.split_letters = split_letters
+
+    def __call__(self, s: str):
+        lowered = s.lower()
+        # drop anything enclosed in <...> or [...]
+        without_brackets = re.sub(r"[<\[][^>\]]*[>\]]", "", lowered)
+        # drop anything enclosed in (...)
+        without_parens = re.sub(r"\(([^)]+?)\)", "", without_brackets)
+        cleaned = self.clean(without_parens).lower()
+
+        if self.split_letters:
+            graphemes = regex.findall(r"\X", cleaned, regex.U)
+            cleaned = " ".join(graphemes)
+
+        # squeeze every run of whitespace down to a single space
+        return re.sub(r"\s+", " ", cleaned)
--- a/eval_audio/whisper_normalizer/english.json
+++ b/eval_audio/whisper_normalizer/english.json
--- a/eval_audio/whisper_normalizer/english.py
+++ b/eval_audio/whisper_normalizer/english.py
--- a/finetune/accelerate_configs/deepspeed_z1.yaml
+++ b/finetune/accelerate_configs/deepspeed_z1.yaml
+# Accelerate launch configuration: DeepSpeed ZeRO stage 1, bf16 mixed precision.
+# As written it describes 2 machines and 16 processes in total.
+# NOTE(review): finetune/run.sh passes --main_process_ip, --main_process_port,
+# --machine_rank, --num_machines and --num_processes next to this file,
+# presumably overriding the values below - confirm.
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  zero3_init_flag: false
+  zero_stage: 1
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_process_ip: localhost
+main_process_port: 9999
+main_training_function: main
+mixed_precision: bf16
+num_machines: 2
+num_processes: 16
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/finetune/accelerate_configs/deepspeed_z2.yaml
+++ b/finetune/accelerate_configs/deepspeed_z2.yaml
+# Accelerate launch configuration: DeepSpeed ZeRO stage 2, bf16 mixed precision,
+# with optimizer and parameter offload both set to `none`.
+# As written it describes 2 machines and 16 processes in total.
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_process_ip: localhost
+main_process_port: 9999
+main_training_function: main
+mixed_precision: bf16
+num_machines: 2
+num_processes: 16
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/finetune/accelerate_configs/deepspeed_z3.yaml
+++ b/finetune/accelerate_configs/deepspeed_z3.yaml
+# Accelerate launch configuration: DeepSpeed ZeRO stage 3, bf16 mixed precision,
+# with optimizer state and parameters offloaded to CPU.
+# As written it describes 1 machine and 8 processes.
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  offload_optimizer_device: cpu
+  offload_param_device: cpu
+  zero3_init_flag: true
+  zero3_save_16bit_model: false
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_process_ip: localhost
+# 9999 matches the ZeRO-1 and ZeRO-2 configs; the previous value 999 is a
+# privileged port (< 1024) that a non-root process cannot bind.
+main_process_port: 9999
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
\ No newline at end of file
--- a/finetune/run.py
+++ b/finetune/run.py
+import json
+import os
+import sys
+import math
+import random
+import logging
+import argparse
+import numpy as np
+from pathlib import Path
+from tqdm.auto import tqdm
+from collections import defaultdict
+
+import librosa
+from io import BytesIO
+from urllib.request import urlopen
+
+from peft import get_peft_model
+from peft import LoraConfig, TaskType
+
+import torch
+from torch.utils.data import DataLoader
+from datasets import IterableDataset
+
+from accelerate.utils import set_seed
+from accelerate.logging import get_logger
+from accelerate import Accelerator, DistributedType
+
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    Qwen2AudioForConditionalGeneration,
+    SchedulerType,
+    get_scheduler,
+)
+
+
+logger = get_logger(__name__)
+
+
+def parse_args():
+    """
+    Build the command-line parser for the fine-tuning script and parse argv.
+
+    Returns the `argparse.Namespace` produced by `parser.parse_args()`.
+    """
+    # (flag, keyword arguments for add_argument), registered in this order.
+    option_specs = [
+        ("--model_name_or_path", {
+            "type": str,
+            "default": "qwen2-audio",
+            "help": "Path to pretrained model or model identifier from huggingface.co/models",
+        }),
+        ("--output_dir", {
+            "type": str,
+            "default": "output",
+            "help": "The output directory where the model predictions and checkpoints will be written.",
+        }),
+        ("--seed", {"type": int, "default": 42, "help": "Random seed"}),
+        ("--learning_rate", {
+            "type": float,
+            "default": 5e-5,
+            "help": "The initial learning rate for AdamW.",
+        }),
+        ("--weight_decay", {
+            "type": float,
+            "default": 0.0,
+            "help": "Weight decay for AdamW.",
+        }),
+        ("--per_device_train_batch_size", {
+            "type": int,
+            "default": 1,
+            "help": "Batch size per GPU/TPU core/CPU for training.",
+        }),
+        ("--gradient_accumulation_steps", {
+            "type": int,
+            "default": 1,
+            "help": "Number of updates steps to accumulate before performing a backward/update pass.",
+        }),
+        ("--steps_per_print", {
+            "type": int,
+            "default": 1,
+            "help": "Number of steps before printing the loss.",
+        }),
+        ("--trust_remote_code", {
+            "action": "store_true",
+            "help": "Trust remote code for the model and tokenizer.",
+        }),
+        ("--low_cpu_mem_usage", {
+            "action": "store_true",
+            "help": "Use low CPU memory usage for the model.",
+        }),
+        ("--flash_attention", {
+            "action": "store_true",
+            "help": "Use FlashAttention for the model.",
+        }),
+        ("--max_train_steps", {
+            "type": int,
+            "default": 1000,
+            "help": "Total number of training steps to perform.",
+        }),
+        ("--num_warmup_steps", {
+            "type": int,
+            "default": 0,
+            "help": "Number of steps for the warmup in the lr scheduler.",
+        }),
+        ("--lr_scheduler_type", {
+            "type": SchedulerType,
+            "default": SchedulerType.LINEAR,
+            "help": "The learning rate scheduler type to use.",
+        }),
+        ("--save_interval", {
+            "type": int,
+            "default": 100,
+            "help": "Number of steps before saving the model.",
+        }),
+        ("--gradient_checkpointing", {
+            "action": "store_true",
+            "help": "Use gradient checkpointing to save memory.",
+        }),
+        ("--lora", {
+            "action": "store_true",
+            "help": "Use lora to finetune.",
+        }),
+    ]
+
+    parser = argparse.ArgumentParser()
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+    return parser.parse_args()
+
+
+def toy_data():
+    """
+    Endless generator of toy training samples.
+
+    Two fixed multi-turn conversations are built once; each iteration draws
+    one `random.random()` value and yields `{"conversations": ...}` holding
+    the first conversation when the draw is below 0.5, else the second.
+    """
+
+    def audio_part(url):
+        return {"type": "audio", "audio_url": url}
+
+    def text_part(text):
+        return {"type": "text", "text": text}
+
+    def turn(role, content):
+        return {"role": role, "content": content}
+
+    # Audio-only dialogue: two audio prompts with one assistant reply between them.
+    conversation = [
+        turn("system", "You are a helpful assistant."),
+        turn("user", [
+            audio_part("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"),
+        ]),
+        turn("assistant", "Yes, the speaker is female and in her twenties."),
+        turn("user", [
+            audio_part("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"),
+        ]),
+    ]
+    # Mixed audio/text dialogue.
+    conversation1 = [
+        turn("system", "You are a helpful assistant."),
+        turn("user", [
+            audio_part("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"),
+            text_part("What's that sound?"),
+        ]),
+        turn("assistant", "It is the sound of glass shattering."),
+        turn("user", [
+            text_part("What can you do when you hear that?"),
+        ]),
+        turn(
+            "assistant",
+            "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property.",
+        ),
+        turn("user", [
+            audio_part("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"),
+            text_part("What does the person say?"),
+        ]),
+    ]
+
+    while True:
+        chosen = conversation if random.random() < 0.5 else conversation1
+        yield {"conversations": chosen}
+
+
+def main(args):
+    """
+    Run the fine-tuning loop for Qwen2-Audio on the `toy_data` stream.
+
+    Creates the `Accelerator`, copies the batch-size / accumulation /
+    print-interval settings into the DeepSpeed config when a DeepSpeed plugin
+    is active, loads config, processor and model, optionally wraps the model
+    with LoRA and enables gradient checkpointing, builds dataloader, optimizer
+    and scheduler, then trains until `args.max_train_steps` optimizer steps
+    have completed. A checkpoint is written to
+    `<output_dir>/checkpoint_<step>` every `args.save_interval` completed
+    steps.
+
+    Args:
+        args: Namespace returned by `parse_args()`.
+    """
+    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
+    accelerator_log_kwargs = {"dispatch_batches": False}
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        **accelerator_log_kwargs
+    )
+
+    def load_audio(url, sampling_rate):
+        """Download `url` and decode it to a waveform at `sampling_rate`."""
+        # Close the HTTP response as soon as the payload has been read.
+        with urlopen(url) as response:
+            payload = response.read()
+        return librosa.load(BytesIO(payload), sr=sampling_rate)[0]
+
+    def init_dataloader(processor):
+        """Build the training DataLoader over the processed toy conversations."""
+
+        def _func(batch):
+            # copy from `https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct`
+            conversations = batch["conversations"]
+
+            # `tokenize=False` makes the template return text, which is what
+            # the processor call below expects for its `text` argument.
+            text = [
+                processor.apply_chat_template(
+                    conversation,
+                    add_generation_prompt=False,
+                    tokenize=False
+                )
+                for conversation in conversations
+            ]
+
+            # Collect the audio clips in order and remember how many belong to
+            # each conversation so the stacked features can be split again.
+            audios, audio_num_for_each_conversation = [], []
+            for conversation in conversations:
+                audio_num = 0
+                for message in conversation:
+                    if isinstance(message["content"], list):
+                        for ele in message["content"]:
+                            if ele["type"] == "audio":
+                                audios.append(
+                                    load_audio(
+                                        ele['audio_url'],
+                                        processor.feature_extractor.sampling_rate
+                                    )
+                                )
+                                audio_num += 1
+                audio_num_for_each_conversation.append(audio_num)
+
+            inputs = processor(
+                text=text,
+                audios=audios if audios else None,
+                return_tensors="pt",
+                padding=True
+            )
+
+            # Split the tensors for each conversation, make sure the dataset is iterable
+            inputs["feature_attention_mask"] = list(
+                torch.split(
+                    inputs["feature_attention_mask"],
+                    audio_num_for_each_conversation,
+                    dim=0
+                )
+            )
+            inputs["input_features"] = list(
+                torch.split(
+                    inputs["input_features"],
+                    audio_num_for_each_conversation,
+                    dim=0
+                )
+            )
+            # Qwen2AudioForConditionalGeneration will automatically shift the input_ids for you
+            inputs["labels"] = inputs["input_ids"]
+            return inputs
+
+        # Emitted once here rather than inside `_func`, which runs for every mapped batch.
+        logger.warning(
+            "We automatically learn from all tokens except for `audio` in the conversation. If you want to learn about a specific `role` or `content`, please modify the code accordingly."
+        )
+
+        # Load dataset
+        dataset = IterableDataset.from_generator(toy_data)
+        dataset = dataset.map(
+            _func,
+            batched=True,
+            remove_columns=["conversations"],
+            batch_size=2
+        )
+
+        def collate_fn(batch):
+            # Gather each key across instances; list-valued entries are
+            # flattened so every collected value is a single tensor.
+            flatten_batch = defaultdict(list)
+            for k in batch[0]:
+                for instance in batch:
+                    if isinstance(instance[k], list):
+                        flatten_batch[k] += instance[k]
+                    else:
+                        flatten_batch[k].append(instance[k])
+            # Audio features are concatenated along dim 0, everything else is stacked.
+            return {
+                k: torch.cat(v, dim=0)
+                if k in ["feature_attention_mask", "input_features"] else torch.stack(v)
+                for k, v in flatten_batch.items()
+            }
+
+        dataloader = DataLoader(
+            dataset,
+            batch_size=args.per_device_train_batch_size,
+            num_workers=0,
+            collate_fn=collate_fn,
+        )
+        return dataloader
+
+    # Mirror the CLI settings into the DeepSpeed config. The plugin is None
+    # when the script is launched without DeepSpeed, so guard the access.
+    deepspeed_plugin = accelerator.state.deepspeed_plugin
+    if deepspeed_plugin is not None:
+        deepspeed_plugin.deepspeed_config[
+            'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size
+        deepspeed_plugin.deepspeed_config[
+            'gradient_accumulation_steps'] = args.gradient_accumulation_steps
+        deepspeed_plugin.deepspeed_config[
+            'steps_per_print'] = args.steps_per_print
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Create the output directory on the main process only.
+    if accelerator.is_main_process:
+        os.makedirs(args.output_dir, exist_ok=True)
+    accelerator.wait_for_everyone()
+
+    config = AutoConfig.from_pretrained(
+        args.model_name_or_path,
+        trust_remote_code=args.trust_remote_code,
+    )
+    processor = AutoProcessor.from_pretrained(args.model_name_or_path)
+
+    model = Qwen2AudioForConditionalGeneration.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        low_cpu_mem_usage=args.low_cpu_mem_usage,
+        trust_remote_code=args.trust_remote_code,
+        # Qwen2AudioForConditionalGeneration can not support `flash_attention` but we keep it here for demonstration
+        attn_implementation="flash_attention_2" if args.flash_attention else None,
+        torch_dtype=config.torch_dtype
+    )
+
+    if args.lora:
+        logger.info("Use lora to finetune...")
+        peft_config = LoraConfig(
+            task_type=TaskType.CAUSAL_LM,
+            inference_mode=False,
+            r=8,
+            lora_alpha=32,
+            lora_dropout=0.1,
+            init_lora_weights="gaussian",
+            target_modules=["q_proj", "k_proj", "v_proj"]
+        )
+        model.enable_input_require_grads()
+        model = get_peft_model(model, peft_config)
+        model.print_trainable_parameters()
+
+    if args.gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+
+    # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
+    # on a small vocab and want a smaller embedding size, remove this test.
+    embedding_size = model.get_input_embeddings().weight.shape[0]
+    if len(processor.tokenizer) > embedding_size:
+        model.resize_token_embeddings(len(processor.tokenizer))
+
+    # Prepare the dataloader
+    train_dataloader = init_dataloader(processor)
+
+    # Optimizer
+    # Split weights in two groups, one with weight decay and the other not.
+    no_decay = ["bias", "layer_norm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = torch.optim.AdamW(
+        optimizer_grouped_parameters,
+        lr=args.learning_rate
+    )
+
+    # Scheduler and math around the number of training steps.
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes
+    )
+
+    # Prepare everything with our `accelerator`.
+    model, optimizer, lr_scheduler, train_dataloader = accelerator.prepare(
+        model, optimizer, lr_scheduler, train_dataloader
+    )
+
+    # Train!
+    total_batch_size = args.per_device_train_batch_size * \
+        accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(
+        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
+    logger.info(
+        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(
+        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    logger.info(f"  Num processes: {accelerator.num_processes}")
+    logger.info(f"  Process index: {accelerator.process_index}")
+
+    completed_steps = 0
+
+    for batch in train_dataloader:
+        model.train()
+
+        with accelerator.accumulate(model):
+            # Move the batch to the device (should be done by the accelerator)
+            for k, v in batch.items():
+                if isinstance(v, torch.Tensor) and v.device == torch.device("cpu"):
+                    batch[k] = v.to(accelerator.device)
+
+            outputs = model(**batch)
+            loss = outputs.loss
+            # We keep track of the loss at each step
+            local_loss = loss.detach().float()
+            logger.info(
+                f"Steps = {completed_steps + 1}, Local loss = {local_loss}...")
+            accelerator.backward(loss)
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+        # Checks if the accelerator has performed an optimization step behind the scenes
+        if accelerator.sync_gradients:
+            completed_steps += 1
+
+            # Save only right after a completed optimizer step. Checking on
+            # every micro-batch would write `checkpoint_0` and re-save the
+            # same step repeatedly while gradients are still accumulating.
+            if args.output_dir is not None and completed_steps % args.save_interval == 0:
+                accelerator.wait_for_everyone()
+                output_dir = os.path.join(
+                    args.output_dir,
+                    f"checkpoint_{completed_steps}"
+                )
+                unwrapped_model = accelerator.unwrap_model(model)
+                unwrapped_model.save_pretrained(
+                    output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
+                )
+                if accelerator.is_main_process:
+                    processor.save_pretrained(output_dir)
+
+        if completed_steps >= args.max_train_steps:
+            return
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the CLI options and hand them to the training routine.
+    main(parse_args())
--- a/finetune/run.sh
+++ b/finetune/run.sh
+# Launch settings for a single node; the values are forwarded to
+# `accelerate launch` below.
+export GPUS_PER_NODE=8
+export NCCL_IB_QPS_PER_CONNECTION=8
+export WORLD_SIZE=1
+export MASTER_ADDR=localhost
+export MASTER_PORT=29500
+export RANK=0
+
+# Only test deepspeed_z1.yaml but it should be the same for other configs
+# Expansions are quoted so values and extra arguments containing spaces or
+# glob characters reach run.py unchanged.
+accelerate launch \
+    --config_file accelerate_configs/deepspeed_z1.yaml \
+    --main_process_ip "$MASTER_ADDR" \
+    --main_process_port "$MASTER_PORT" \
+    --machine_rank "$RANK" \
+    --num_machines "$WORLD_SIZE" \
+    --num_processes "$((WORLD_SIZE * GPUS_PER_NODE))" \
+    run.py \
+    --model_name_or_path Qwen/Qwen2-Audio-7B-Instruct \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 1 \
+    --learning_rate 3e-5 \
+    --max_train_steps 20000 \
+    --trust_remote_code \
+    --save_interval 5 \
+    --gradient_checkpointing \
+    --lora \
+    "$@"
\ No newline at end of file
--- a/icon.png
+++ b/icon.png
--- a/model.properties
+++ b/model.properties
+# Unique model identifier
+modelCode = 1092
+# Model name
+# NOTE(review): the name and description below refer to mplug-docowl / OCR,
+# while the scripts in this commit load Qwen2-Audio - looks copied from
+# another model; confirm the intended values.
+modelName=mplug-docowl_pytorch                    
+# Model description
+modelDescription=多模态OCR大模型，端侧可用
+# Application scenarios
+appScenario=推理,OCR,金融,教育,政府,科研,交通,广媒
+# Framework type
+frameType=pytorch
--- a/qwen2_audio_inference.py
+++ b/qwen2_audio_inference.py
+from io import BytesIO
+from urllib.request import urlopen
+import librosa
+from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
+
+# Local checkpoint directory, shared by the processor and the model.
+MODEL_PATH = "/home/wanglch/Qwen2-Audio/Qwen2-Audio-7B-Instruct/"
+
+processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
+model = Qwen2AudioForConditionalGeneration.from_pretrained(MODEL_PATH, trust_remote_code=True, device_map="auto")
+
+# Two audio-only user turns with one assistant reply in between; the model is
+# asked to answer the last user turn.
+conversation = [
+    {"role": "user", "content": [
+        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
+    ]},
+    {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
+    {"role": "user", "content": [
+        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
+    ]},
+]
+text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+
+# Download and decode every audio clip referenced in the conversation, in order.
+audios = []
+for message in conversation:
+    if isinstance(message["content"], list):
+        for ele in message["content"]:
+            if ele["type"] == "audio":
+                # Close the HTTP response once the payload has been read.
+                with urlopen(ele['audio_url']) as response:
+                    payload = response.read()
+                audios.append(librosa.load(
+                    BytesIO(payload),
+                    sr=processor.feature_extractor.sampling_rate)[0]
+                )
+
+inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
+# Move every tensor in the batch to the model's device. Assigning to
+# `inputs.input_ids` only sets an attribute on the batch object and leaves the
+# entries that `**inputs` unpacks untouched.
+inputs = inputs.to(model.device)
+
+generate_ids = model.generate(**inputs, max_length=256)
+# Keep only the newly generated tokens (drop the prompt prefix).
+generate_ids = generate_ids[:, inputs.input_ids.size(1):]
+
+response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+print("Qwen_Audio Output:", response)