Initial commit

58d33d4c · wanglch · 58d33d4c · 58d33d4c · 58d33d4c · 58d33d4c
Commit 58d33d4c authored Nov 13, 2024 by wanglch
8 changed files
--- a/mplug_docowl/utils.py
+++ b/mplug_docowl/utils.py
+import datetime
+import logging
+import logging.handlers
+import os
+import sys
+import requests
+from mplug_docowl.constants import LOGDIR
+# Canned error text for server-side failures (presumably surfaced to end users
+# by the serving code -- the consumers are not visible in this file).
+server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+# Canned text for inputs rejected by content moderation (consumers not visible here).
+moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
+# Process-wide file handler; created lazily by build_logger() on its first call.
+handler = None
+def build_logger(logger_name, logger_filename):
+    """
+    Return the logger named ``logger_name`` and route stdout/stderr into logging.
+
+    Side effects:
+      * applies a common formatter to the first root handler (calling
+        ``logging.basicConfig`` if the root logger has no handlers yet);
+      * replaces ``sys.stdout`` and ``sys.stderr`` with ``StreamToLogger``
+        wrappers bound to the "stdout" (INFO) and "stderr" (ERROR) loggers;
+      * on the first call, creates a daily-rotating file handler under
+        ``LOGDIR`` and attaches it to every logger registered so far.
+
+    Args:
+        logger_name: Name passed to ``logging.getLogger``.
+        logger_filename: File name joined onto ``LOGDIR`` for the shared file
+            handler. Only the value from the first call is used, because the
+            handler is created once and cached in the module-level ``handler``.
+
+    Returns:
+        The ``logging.Logger`` for ``logger_name``, set to INFO level.
+    """
+    global handler
+    formatter = logging.Formatter(
+        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    # Set the format of root handlers
+    if not logging.getLogger().handlers:
+        logging.basicConfig(level=logging.INFO)
+    logging.getLogger().handlers[0].setFormatter(formatter)
+    # Redirect stdout and stderr to loggers
+    stdout_logger = logging.getLogger("stdout")
+    stdout_logger.setLevel(logging.INFO)
+    sl = StreamToLogger(stdout_logger, logging.INFO)
+    sys.stdout = sl
+    stderr_logger = logging.getLogger("stderr")
+    stderr_logger.setLevel(logging.ERROR)
+    sl = StreamToLogger(stderr_logger, logging.ERROR)
+    sys.stderr = sl
+    # Get logger
+    logger = logging.getLogger(logger_name)
+    logger.setLevel(logging.INFO)
+    # Create the shared file handler once and attach it to all known loggers
+    if handler is None:
+        os.makedirs(LOGDIR, exist_ok=True)
+        filename = os.path.join(LOGDIR, logger_filename)
+        # Explicit UTF-8 so non-ASCII log text does not depend on the locale.
+        handler = logging.handlers.TimedRotatingFileHandler(
+            filename, when='D', utc=True, encoding='utf-8')
+        handler.setFormatter(formatter)
+        for item in logging.root.manager.loggerDict.values():
+            # loggerDict also holds PlaceHolder objects; skip those.
+            if isinstance(item, logging.Logger) and handler not in item.handlers:
+                item.addHandler(handler)
+    # A logger first requested on a later call was not registered when the
+    # loop above ran, so attach the shared handler to it explicitly.
+    if handler not in logger.handlers:
+        logger.addHandler(handler)
+    return logger
+class StreamToLogger(object):
+    """
+    Fake file-like stream object that redirects writes to a logger instance.
+
+    Written text is buffered until a newline arrives; each complete line is
+    emitted as one log record at ``log_level``. Attributes not defined on this
+    class are looked up on the stream that was ``sys.stdout`` when the
+    instance was created.
+    """
+    def __init__(self, logger, log_level=logging.INFO):
+        # Stream installed as sys.stdout at construction time; __getattr__
+        # forwards unknown attribute lookups to it.
+        self.terminal = sys.stdout
+        self.logger = logger
+        self.log_level = log_level
+        # Trailing partial line waiting for its newline.
+        self.linebuf = ''
+    def __getattr__(self, attr):
+        # Only reached when normal lookup fails. If 'terminal' itself is
+        # missing (instance created without __init__, e.g. by copy/pickle),
+        # delegating would recurse without bound, so fail fast instead.
+        if attr == 'terminal':
+            raise AttributeError(attr)
+        return getattr(self.terminal, attr)
+    def write(self, buf):
+        """Log each complete line contained in ``buf``.
+
+        Returns:
+            ``len(buf)``, matching the text-stream ``write`` contract so
+            callers that check the written count keep working.
+        """
+        temp_linebuf = self.linebuf + buf
+        self.linebuf = ''
+        for line in temp_linebuf.splitlines(True):
+            # From the io.TextIOWrapper docs:
+            #   On output, if newline is None, any '\n' characters written
+            #   are translated to the system default line separator.
+            # By default sys.stdout.write() expects '\n' newlines and then
+            # translates them so this is still cross platform.
+            if line[-1] == '\n':
+                self.logger.log(self.log_level, line.rstrip())
+            else:
+                # Incomplete line: keep it until the rest is written.
+                self.linebuf += line
+        return len(buf)
+    def flush(self):
+        """Emit any buffered partial line as a log record and clear the buffer."""
+        if self.linebuf != '':
+            self.logger.log(self.log_level, self.linebuf.rstrip())
+        self.linebuf = ''
+def disable_torch_init():
+    """
+    Turn ``reset_parameters`` into a no-op on ``torch.nn.Linear`` and
+    ``torch.nn.LayerNorm`` so that constructing these layers skips their
+    default weight initialization.
+    """
+    import torch
+    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
+        # Patch the class itself so every instance created afterwards is affected.
+        layer_cls.reset_parameters = lambda self: None
+def violates_moderation(text):
+    """
+    Check whether the text violates OpenAI moderation API.
+
+    Args:
+        text: The string to check. Newlines are removed before sending.
+
+    Returns:
+        The ``flagged`` value of the first moderation result, or ``False``
+        when the request fails or the response does not have the expected
+        shape (best-effort: failures are treated as "not flagged").
+
+    Raises:
+        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
+    """
+    import json
+    url = "https://api.openai.com/v1/moderations"
+    headers = {"Content-Type": "application/json",
+               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
+    text = text.replace("\n", "")
+    # Serialize with json.dumps so quotes, backslashes and control characters
+    # in the text are escaped; hand-built JSON breaks on such input.
+    data = json.dumps({"input": text}).encode("utf-8")
+    try:
+        ret = requests.post(url, headers=headers, data=data, timeout=5)
+        flagged = ret.json()["results"][0]["flagged"]
+    except requests.exceptions.RequestException:
+        # Network failure, timeout, etc.
+        flagged = False
+    except (KeyError, IndexError, TypeError, ValueError):
+        # Error payload, empty result list, or a body that is not valid JSON.
+        flagged = False
+    return flagged
+def pretty_print_semaphore(semaphore):
+    """Return a short description of ``semaphore``, or "None" if it is None.
+
+    Reads the object's ``_value`` attribute and calls its ``locked()`` method.
+    """
+    if semaphore is not None:
+        current_value = semaphore._value
+        is_locked = semaphore.locked()
+        return f"Semaphore(value={current_value}, locked={is_locked})"
+    return "None"
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+torch
+transformers==4.34.0
+deepspeed
+ninja
+accelerate
+tokenizers
+icecream
+requests
+uvicorn
+fastapi
+markdown2
+gradio==3.35.2
+gradio_client==0.2.9
+pydantic
+httpx==0.24.0
+tensorboardX
+tensorboard
+shortuuid
+sentencepiece==0.1.99
+peft
+opencv-python
+shortuuid
+textdistance
+editdistance
+pycocoevalcap
+httpx==0.24.0
+jsonlines
+uvicorn
+einops==0.6.1
+einops-exts==0.0.4
+timm==0.6.13
+word2number
\ No newline at end of file
--- a/saves/DocOwl1.5/train_lora_multi_dcu/runs/Nov06_14-50-54_K100-AI07/events.out.tfevents.1730875922.K100-AI07.5644.0
+++ b/saves/DocOwl1.5/train_lora_multi_dcu/runs/Nov06_14-50-54_K100-AI07/events.out.tfevents.1730875922.K100-AI07.5644.0
--- a/scripts/finetune_docowl.sh
+++ b/scripts/finetune_docowl.sh
+#!/bin/bash
+# Fine-tune DocOwl1.5 with torchrun and the DeepSpeed config in ./scripts/zero2.json.
+#
+# Distributed settings are read from MASTER_ADDR / MASTER_PORT / WORLD_SIZE / RANK
+# when MASTER_ADDR is set; otherwise single-node defaults are used.
+if [ -n "${MASTER_ADDR:-}" ]; then
+    # Settings supplied by the environment: print them for the job log.
+    echo "$MASTER_ADDR"
+    echo "$MASTER_PORT"
+    echo "$WORLD_SIZE"
+    echo "$RANK"
+else
+    MASTER_ADDR=127.0.0.1
+    # Port of the form 2XY15 with two random digits X and Y.
+    MASTER_PORT=2$(($RANDOM % 10))$(($RANDOM % 10))15
+    WORLD_SIZE=1
+    RANK=0
+fi
+# Change for multinode config
+NNODES=${WORLD_SIZE}
+NODE_RANK=${RANK}
+# One process per GPU: count the lines nvidia-smi prints (one per device).
+GPUS_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+# GPUS_PER_NODE=1
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
+echo "$DISTRIBUTED_ARGS"
+# change LOAD to your local path of DocOwl1.5-stage1
+LOAD='./mPLUG/DocOwl1.5-stage1'
+# batch size = per_device_train_batch_size x GPUS_PER_NODE x NNODES x gradient_accumulation_steps
+DATA_FILE=./DocDownstream-1.0/train.jsonl
+# $DISTRIBUTED_ARGS is intentionally unquoted so it splits into separate arguments.
+torchrun $DISTRIBUTED_ARGS mplug_docowl/train/train_docowl.py \
+    --deepspeed ./scripts/zero2.json \
+    --model_name_or_path "$LOAD" \
+    --version v1 \
+    --data_path "$DATA_FILE" \
+    --image_folder './DocDownstream-1.0/' \
+    --image_size 448 \
+    --crop_anchors 'grid_9' \
+    --add_global_img True \
+    --add_textual_crop_indicator True \
+    --bf16 True \
+    --output_dir ./checkpoints/docowl1.5 \
+    --num_train_epochs 3 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 500 \
+    --save_total_limit 4 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 3600 \
+    --gradient_checkpointing True \
+    --tune_vision2text True \
+    --freeze_vision_model True \
+    --freeze_backbone False \
+    --dataloader_num_workers 4 \
+    --lazy_preprocess True \
+    --report_to tensorboard
\ No newline at end of file
--- a/scripts/zero2.json
+++ b/scripts/zero2.json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "train_micro_batch_size_per_gpu": "auto",
+    "train_batch_size": "auto",
+    "gradient_accumulation_steps": "auto",
+    "zero_optimization": {
+        "stage": 2,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto"
+    }
+}
\ No newline at end of file
--- a/scripts/zero3.json
+++ b/scripts/zero3.json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "train_micro_batch_size_per_gpu": "auto",
+    "train_batch_size": "auto",
+    "gradient_accumulation_steps": "auto",
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_gather_16bit_weights_on_model_save": true
+    }
+}
\ No newline at end of file
--- a/scripts/zero3_offload.json
+++ b/scripts/zero3_offload.json
+{
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": "auto",
+      "eps": "auto",
+      "weight_decay": "auto"
+    }
+  },
+  "scheduler": {
+    "type": "WarmupLR",
+    "params": {
+      "warmup_min_lr": "auto",
+      "warmup_max_lr": "auto",
+      "warmup_num_steps": "auto"
+    }
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 1e9,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "gather_16bit_weights_on_model_save": true
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "steps_per_print": 1e5,
+  "wall_clock_breakdown": false
+}
\ No newline at end of file
--- a/对话问答.png
+++ b/对话问答.png