llama_fastchat_pytorch

cc338b7c · zhaoying1 · cc338b7c · cc338b7c · cc338b7c · cc338b7c
Commit cc338b7c authored Sep 07, 2023 by zhaoying1
20 changed files
--- a/FastChat-main/fastchat/train/train_mem.py
+++ b/FastChat-main/fastchat/train/train_mem.py
+# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
+# Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
+
+# Need to call this before importing transformers.
+from fastchat.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
+
+# Apply the FlashAttention monkey patch at import time, ahead of the training import below.
+replace_llama_attn_with_flash_attn()
+
+# Deliberately imported after the patch call above; do not hoist it to the top of the file
+# (presumably this module imports transformers -- TODO confirm).
+from fastchat.train.train import train
+
+# Script entry point: run the training routine imported above.
+if __name__ == "__main__":
+    train()
--- a/FastChat-main/fastchat/utils.py
+++ b/FastChat-main/fastchat/utils.py
+import datetime
+import logging
+import logging.handlers
+import os
+import sys
+
+import requests
+
+from fastchat.constants import LOGDIR
+
+server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
+
+handler = None
+
+
+def build_logger(logger_name, logger_filename):
+    global handler
+
+    formatter = logging.Formatter(
+        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    # Set the format of root handlers
+    if not logging.getLogger().handlers:
+        logging.basicConfig(level=logging.INFO, encoding='utf-8')
+    logging.getLogger().handlers[0].setFormatter(formatter)
+
+    # Redirect stdout and stderr to loggers
+    stdout_logger = logging.getLogger("stdout")
+    stdout_logger.setLevel(logging.INFO)
+    sl = StreamToLogger(stdout_logger, logging.INFO)
+    sys.stdout = sl
+
+    stderr_logger = logging.getLogger("stderr")
+    stderr_logger.setLevel(logging.ERROR)
+    sl = StreamToLogger(stderr_logger, logging.ERROR)
+    sys.stderr = sl
+
+    # Get logger
+    logger = logging.getLogger(logger_name)
+    logger.setLevel(logging.INFO)
+
+    # Add a file handler for all loggers
+    if handler is None:
+        os.makedirs(LOGDIR, exist_ok=True)
+        filename = os.path.join(LOGDIR, logger_filename)
+        handler = logging.handlers.TimedRotatingFileHandler(
+            filename, when='D', utc=True)
+        handler.setFormatter(formatter)
+
+        for name, item in logging.root.manager.loggerDict.items():
+            if isinstance(item, logging.Logger):
+                item.addHandler(handler)
+
+    return logger
+
+
+class StreamToLogger(object):
+    """
+    Fake file-like stream object that redirects writes to a logger instance.
+    """
+    def __init__(self, logger, log_level=logging.INFO):
+        self.terminal = sys.stdout
+        self.logger = logger
+        self.log_level = log_level
+        self.linebuf = ''
+
+    def __getattr__(self, attr):
+        return getattr(self.terminal, attr)
+
+    def write(self, buf):
+        temp_linebuf = self.linebuf + buf
+        self.linebuf = ''
+        for line in temp_linebuf.splitlines(True):
+            # From the io.TextIOWrapper docs:
+            #   On output, if newline is None, any '\n' characters written
+            #   are translated to the system default line separator.
+            # By default sys.stdout.write() expects '\n' newlines and then
+            # translates them so this is still cross platform.
+            if line[-1] == '\n':
+                encoded_message = line.encode('utf-8', 'ignore').decode('utf-8')
+                self.logger.log(self.log_level, encoded_message.rstrip())
+            else:
+                self.linebuf += line
+
+    def flush(self):
+        if self.linebuf != '':
+            encoded_message = self.linebuf.encode('utf-8', 'ignore').decode('utf-8')
+            self.logger.log(self.log_level, encoded_message.rstrip())
+        self.linebuf = ''
+
+
+def disable_torch_init():
+    """
+    Disable the redundant torch default initialization to accelerate model creation.
+    """
+    import torch
+    setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
+    setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
+
+
+def violates_moderation(text):
+    """
+    Check whether the text violates OpenAI moderation API.
+    """
+    url = "https://api.openai.com/v1/moderations"
+    headers = {"Content-Type": "application/json",
+               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
+    text = text.replace("\n", "")
+    data = "{" + '"input": ' + f'"{text}"' + "}"
+    data = data.encode("utf-8")
+    try:
+        ret = requests.post(url, headers=headers, data=data, timeout=5)
+        flagged = ret.json()["results"][0]["flagged"]
+    except requests.exceptions.RequestException as e:
+        flagged = False
+    except KeyError as e:
+        flagged = False
+
+    return flagged
+
+
+def pretty_print_semaphore(semaphore):
+    if semaphore is None:
+        return "None"
+    return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
--- a/FastChat-main/playground/data/alpaca-data-conversation.json
+++ b/FastChat-main/playground/data/alpaca-data-conversation.json
--- a/FastChat-main/pyproject.toml
+++ b/FastChat-main/pyproject.toml
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "fschat"
+version = "0.1.9"
+description = "An open platform for training, serving, and evaluating large language model based chatbots."
+readme = "README.md"
+requires-python = ">=3.8"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+]
+dependencies = [
+    "accelerate", "fastapi", "gradio==3.23", "markdown2[all]", "numpy",
+    "requests", "sentencepiece", "tokenizers>=0.12.1",
+    "torch", "uvicorn", "wandb", "prompt_toolkit>=3.0.0", "rich>=10.0.0"
+]
+
+[project.urls]
+"Homepage" = "https://github.com/lm-sys/fastchat"
+"Bug Tracker" = "https://github.com/lm-sys/fastchat/issues"
+
+[tool.setuptools.packages.find]
+exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
+
+[tool.wheel]
+exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
--- a/FastChat-main/scripts/serving/controller.yaml
+++ b/FastChat-main/scripts/serving/controller.yaml
+resources:
+    cloud: gcp
+    region: us-central1
+
+num_nodes: 1
+
+workdir: .
+
+file_mounts:
+  ~/chatlogs:
+    name: skypilot-chatbot-logs
+    store: gcs
+    mode: MOUNT
+
+setup: |
+  conda activate chatbot
+  if [ $? -eq 0 ]; then
+    echo 'conda env exists'
+  else
+    # Setup the environment
+    conda create -n chatbot python=3.10 -y
+    conda activate chatbot
+    pip3 install -e .
+  fi
+
+run: |
+  conda activate chatbot
+  python3 -m fastchat.serve.controller --host 0.0.0.0 --port 21001 &
+  python3 -m fastchat.serve.gradio_web_server --share
--- a/FastChat-main/scripts/serving/model_worker.yaml
+++ b/FastChat-main/scripts/serving/model_worker.yaml
+resources:
+  accelerators: A100:1
+  cloud: gcp
+  region: us-central1
+
+num_nodes: 1
+
+workdir: .
+
+file_mounts:
+  /artifacts:
+    name: skypilot-chatbot
+    store: gcs
+    mode: MOUNT
+
+  ~/chatlogs:
+    name: skypilot-chatbot-logs
+    store: gcs
+    mode: MOUNT
+
+setup: |
+  conda activate chatbot
+  if [ $? -eq 0 ]; then
+    echo 'conda env exists'
+  else
+    # Setup the environment
+    conda create -n chatbot python=3.10 -y
+    conda activate chatbot
+
+    pip3 install -e .
+
+    # Install pytorch
+    pip install torch==1.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
+
+    # Install huggingface with the LLaMA commit
+    pip install git+https://github.com/huggingface/transformers
+
+    # Install alpaca
+    git clone https://github.com/tatsu-lab/stanford_alpaca.git
+    cd stanford_alpaca
+    pip install -r requirements.txt
+    cd -
+  fi
+
+  ln -s /artifacts/chatbot/13b/ckpt/ ~/alpaca-13b
+
+run: |
+  conda activate chatbot
+  WORKER_IP=$(hostname -I | cut -d' ' -f1)
+  CONTROLLER_PORT=21001
+  WORKER_PORT=21002
+  python3 -m fastchat.serve.model_worker \
+    --model ~/alpaca-13b \
+    --controller-address http://${CONTROLLER_IP}:${CONTROLLER_PORT} \
+    --worker-address http://${WORKER_IP}:${WORKER_PORT} \
+    --host 0.0.0.0 \
+    --port ${WORKER_PORT}
--- a/FastChat-main/scripts/sync_local_checkpoint.sh
+++ b/FastChat-main/scripts/sync_local_checkpoint.sh
+#!/bin/bash
+
+local_path=$1
+remote_path=$2
+MAX_NUM_CKPT=3
+
+# This script is used to periodically copy local checkpoint to mounted storage
+while true; do
+    local_last_ckpt=$(ls ${local_path} | grep checkpoint- | grep -E '[0-9]+' | sort -t'-' -k1,1 -k2,2n | tail -1)
+    remote_last_ckpt=$(ls ${remote_path} | grep checkpoint- | grep -E '[0-9]+' | sort -t'-' -k1,1 -k2,2n | tail -1)
+    echo "local_last_ckpt: ${local_last_ckpt}"
+    echo "remote_last_ckpt: ${remote_last_ckpt}"
+    if [ "${local_last_ckpt}" != "${remote_last_ckpt}" ]; then
+        mkdir -p ${remote_path}/${local_last_ckpt}
+        gsutil -m rsync -r ${local_path}/${local_last_ckpt}/ ${remote_path}/${local_last_ckpt}
+
+        # Keep only the last MAX_NUM_CKPT checkpoints
+        num_local_ckpt=$(ls ${local_path} | grep checkpoint- | wc -l)
+        echo "num_local_ckpt: ${num_local_ckpt}"
+        if [ ${num_local_ckpt} -gt $MAX_NUM_CKPT ]; then
+            for ckpt in $(ls ${local_path} | grep checkpoint- | grep -E '[0-9]+' | sort -t'-' -k1,1 -k2,2n | head -n-${MAX_NUM_CKPT}); do
+                rm -rf ${local_path}/${ckpt}
+            done
+        fi
+    fi
+
+    sleep 600
+done
--- a/FastChat-main/scripts/train-alpaca.yaml
+++ b/FastChat-main/scripts/train-alpaca.yaml
+resources:
+  accelerators: A100-80GB:4
+  disk_size: 1000
+
+num_nodes: 1
+
+file_mounts:
+  /artifacts:
+    name: skypilot-chatbot # Change to your own bucket
+    store: gcs
+    mode: MOUNT
+  /data/alpaca-data-conversation.json: chatserver/data/example/alpaca-data-conversation.json
+  # /llama:
+  #   name: llama-ckpts # Change to the bucket that contains the LLaMA weights
+  #   store: gcs
+  #   mode: MOUNT
+
+workdir: .
+
+setup: |
+  # Setup the environment
+  conda create -n chatbot python=3.10 -y
+  conda activate chatbot
+
+  # Install pytorch
+  pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
+
+  # Install huggingface with the LLaMA commit
+  cd ~
+  git clone https://github.com/huggingface/transformers.git
+  cd transformers
+  git checkout 41a2f3529c6b56866c317031375ffd3e7b8bea01
+  pip install .
+  cd ~/sky_workdir
+
+  # Install fastchat
+  pip install -e .
+
+  mkdir -p /artifacts/llama-hf/llama-${MODEL_SIZE}B
+  if [ ! -f /artifacts/llama-hf/llama-${MODEL_SIZE}B/complete ]; then
+    mkdir -p ~/llama-${MODEL_SIZE}b
+    gsutil -m rsync -r /llama/${MODEL_SIZE}b/ ~/llama-${MODEL_SIZE}b
+    cd ~/transformers
+    python src/transformers/models/llama/convert_llama_weights_to_hf.py \
+      --input_dir $HOME/llama-${MODEL_SIZE}b \
+      --model_size ${MODEL_SIZE}B \
+      --output_dir ~/hf-output || exit 1
+    mv ~/hf-output/tokenizer/* ~/hf-output/llama-${MODEL_SIZE}b
+    gsutil -m rsync -r ~/hf-output/llama-${MODEL_SIZE}b/ /artifacts/llama-hf/llama-${MODEL_SIZE}B
+    touch /artifacts/llama-hf/llama-${MODEL_SIZE}B/complete
+  else
+    mkdir -p ~/hf-output/llama-${MODEL_SIZE}b
+    gsutil -m cp -r /artifacts/llama-hf/llama-${MODEL_SIZE}B/* ~/hf-output/llama-${MODEL_SIZE}b
+  fi
+
+run: |
+  conda activate chatbot
+  SEQ_LEN=${SEQ_LEN:-512}
+  echo "Training with seq_len=${SEQ_LEN} and model_size=${MODEL_SIZE}B"
+  PER_DEVICE_BATCH_SIZE=$((2048 / $SEQ_LEN))
+  NUM_NODES=`echo "$SKYPILOT_NODE_IPS" | wc -l`
+  HOST_ADDR=`echo "$SKYPILOT_NODE_IPS" | head -n1`
+
+  # Hack copy it once to make it faster later
+  mkdir -p ~/.checkpoints
+  CKPT_PATH=/artifacts/chatbot/${MODEL_SIZE}b/alpaca-${SEQ_LEN}
+  last_ckpt=$(ls ${CKPT_PATH} | grep -E '[0-9]+' | sort -n | tail -1)
+  gsutil -m rsync -r ${CKPT_PATH}/${last_ckpt}/ ~/.checkpoints 
+  
+  torchrun \
+    --nnodes=$NUM_NODES \
+    --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
+    --master_port=12375 \
+    --master_addr=$HOST_ADDR \
+    --node_rank=${SKYPILOT_NODE_RANK} \
+    fastchat/train/train.py \
+    --model_name_or_path ~/hf-output/llama-${MODEL_SIZE}b \
+    --data_path /data/alpaca-data-conversation.json \
+    --bf16 True \
+    --output_dir $CKPT_PATH \
+    --num_train_epochs 3 \
+    --per_device_train_batch_size $PER_DEVICE_BATCH_SIZE \
+    --per_device_eval_batch_size $PER_DEVICE_BATCH_SIZE \
+    --gradient_accumulation_steps $((128 * 512 / $SEQ_LEN / $PER_DEVICE_BATCH_SIZE / $NUM_NODES / $SKYPILOT_NUM_GPUS_PER_NODE)) \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 2000 \
+    --save_total_limit 1 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --fsdp "full_shard auto_wrap" \
+    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
+    --tf32 True \
+    --model_max_length ${SEQ_LEN}
+
+
+envs:
+  MODEL_SIZE: 7
+  SEQ_LEN: 512
--- a/FastChat-main/scripts/train-vicuna.yaml
+++ b/FastChat-main/scripts/train-vicuna.yaml
+resources:
+  accelerators: A100-80GB:8
+  disk_size: 1000
+  use_spot: true
+
+num_nodes: 1
+
+file_mounts:
+  /artifacts:
+    name: skypilot-chatbot # Change to your own bucket
+    store: gcs
+    mode: MOUNT
+  /data:
+    name: model-weights # Change to your own bucket
+    store: gcs
+    mode: MOUNT
+  # /llama:
+  #   name: llama-ckpts # Change to the bucket that contains the LLaMA weights
+  #   store: gcs
+  #   mode: MOUNT
+
+workdir: .
+
+setup: |
+  # Setup the environment
+  conda create -n chatbot python=3.10 -y
+  conda activate chatbot
+
+  # Install pytorch
+  pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
+
+  # Install huggingface with the LLaMA commit
+  cd ~
+  git clone https://github.com/huggingface/transformers.git
+  cd transformers
+  git checkout 41a2f3529c6b56866c317031375ffd3e7b8bea01
+  pip install .
+  cd ~/sky_workdir
+
+  # Install fastchat
+  pip install -e .
+  pip install flash-attn
+
+  mkdir -p /artifacts/llama-hf/llama-${MODEL_SIZE}B
+  if [ ! -f /artifacts/llama-hf/llama-${MODEL_SIZE}B/complete ]; then
+    mkdir -p ~/llama-${MODEL_SIZE}b
+    gsutil -m rsync -r /llama/${MODEL_SIZE}b/ ~/llama-${MODEL_SIZE}b
+    cd ~/transformers
+    python src/transformers/models/llama/convert_llama_weights_to_hf.py \
+      --input_dir $HOME/llama-${MODEL_SIZE}b \
+      --model_size ${MODEL_SIZE}B \
+      --output_dir ~/hf-output || exit 1
+    mv ~/hf-output/tokenizer/* ~/hf-output/llama-${MODEL_SIZE}b
+    gsutil -m rsync -r ~/hf-output/llama-${MODEL_SIZE}b/ /artifacts/llama-hf/llama-${MODEL_SIZE}B
+    touch /artifacts/llama-hf/llama-${MODEL_SIZE}B/complete
+  else
+    mkdir -p ~/hf-output/llama-${MODEL_SIZE}b
+    gsutil -m cp -r /artifacts/llama-hf/llama-${MODEL_SIZE}B/* ~/hf-output/llama-${MODEL_SIZE}b
+  fi
+
+run: |
+  conda activate chatbot
+  SEQ_LEN=${SEQ_LEN:-512}
+  GC_SCALE=${GC_SCALE:-1}
+  DATE=${DATE:-20230303}
+  USE_FLASH_ATTN=${USE_FLASH_ATTN:-0}
+  if [ $USE_FLASH_ATTN -eq 1 ]; then
+    TRAIN_SCRIPT=fastchat/train/train_mem.py
+    USE_FLASH_SUFFIX="-flash"
+  else
+    TRAIN_SCRIPT=fastchat/train/train.py
+    USE_FLASH_SUFFIX=""
+  fi
+  echo "Training with seq_len=${SEQ_LEN} and gc_scale=${GC_SCALE}"
+  PER_DEVICE_BATCH_SIZE=$((2048 * $GC_SCALE / $SEQ_LEN))
+  NUM_NODES=`echo "$SKYPILOT_NODE_IPS" | wc -l`
+  HOST_ADDR=`echo "$SKYPILOT_NODE_IPS" | head -n1`
+
+  # Do the periodic syncing manually, to avoid the degradation of
+  # the training for saving checkpoints.
+  mkdir -p ~/.checkpoints
+  LOCAL_CKPT_PATH=~/.checkpoints
+  CKPT_PATH=/artifacts/chatbot/${MODEL_SIZE}b/sharegpt-${DATE}-seq-${SEQ_LEN}${USE_FLASH_SUFFIX}
+  last_ckpt=$(ls ${CKPT_PATH} | grep -E '[0-9]+' | sort -t'-' -k1,1 -k2,2n | tail -1)
+  mkdir -p ~/.checkpoints/${last_ckpt}
+  gsutil -m rsync -r ${CKPT_PATH}/${last_ckpt}/ ~/.checkpoints/${last_ckpt}
+
+  bash scripts/sync_local_checkpoint.sh ${LOCAL_CKPT_PATH} ${CKPT_PATH} > sync.log 2>&1 &
+  
+  torchrun \
+    --nnodes=$NUM_NODES \
+    --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
+    --master_port=12375 \
+    --master_addr=$HOST_ADDR \
+    --node_rank=${SKYPILOT_NODE_RANK} \
+    $TRAIN_SCRIPT \
+    --model_name_or_path ~/hf-output/llama-${MODEL_SIZE}b \
+    --data_path /data/sharegpt/sharegpt_20230322_clean_lang_split.json \
+    --bf16 True \
+    --output_dir $LOCAL_CKPT_PATH \
+    --num_train_epochs 3 \
+    --per_device_train_batch_size $PER_DEVICE_BATCH_SIZE \
+    --per_device_eval_batch_size $PER_DEVICE_BATCH_SIZE \
+    --gradient_accumulation_steps $((128 * 512 / $SEQ_LEN / $PER_DEVICE_BATCH_SIZE / $NUM_NODES / $SKYPILOT_NUM_GPUS_PER_NODE)) \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 1200 \
+    --save_total_limit 10 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --fsdp "full_shard auto_wrap" \
+    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
+    --tf32 True \
+    --model_max_length ${SEQ_LEN} \
+    --gradient_checkpointing True \
+    --lazy_preprocess True
+
+  # Sync any files not in the checkpoint-* folders
+  gsutil -m rsync -r -x 'checkpoint-*' $LOCAL_CKPT_PATH/ $CKPT_PATH/
+
+
+envs:
+  MODEL_SIZE: 13
+  SEQ_LEN: 2048
+  GC_SCALE: 4
+  DATE: 20230322
+  USE_FLASH_ATTN: 1
--- a/FastChat-main/scripts/train_7b_4x40g_a100.sh
+++ b/FastChat-main/scripts/train_7b_4x40g_a100.sh
+torchrun --nproc_per_node=4 --master_port=20001 fastchat/train/alpaca_train.py \
+    --model_name_or_path /home/haozhang/model_weights/hf-llama-7b \
+    --data_path /home/haozhang/datasets/alpaca_data.json \
+    --bf16 True \
+    --output_dir output \
+    --num_train_epochs 3 \
+    --per_device_train_batch_size 4 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 8 \
+    --evaluation_strategy "no" \
+    --save_strategy "steps" \
+    --save_steps 2000 \
+    --save_total_limit 1 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --fsdp "full_shard auto_wrap" \
+    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
+    --tf32 True \
+    --gradient_checkpointing True
--- a/FastChat-main/scripts/upload_pypi.sh
+++ b/FastChat-main/scripts/upload_pypi.sh
+rm -rf dist
+python3 -m build
+python3 -m twine upload dist/*
--- a/FastChat-main/setup.cfg
+++ b/FastChat-main/setup.cfg
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
+# Open and Efficient Foundation Language Models(LLAMA)
+
+
+
+## 模型介绍
+LLaMA 是一个基础语言模型的集合，参数规模从 7B 到 65B。这些模型在数万亿 tokens 上训练，表明仅使用公开可用的数据集即可训练出最先进的模型，而无需依赖专有或不可获取的数据集。特别是，LLaMA-13B 在大多数基准测试中优于 GPT-3 (175B)，LLaMA-65B 与最好的模型 Chinchilla-70B 和 PaLM-540B 具有竞争力。
+## 模型结构
+LLaMA 网络基于 Transformer 架构，并采用了此后提出、已用于不同模型（例如 PaLM）的多项改进。以下是与原始架构的主要区别：
+
+**预归一化**。为了提高训练稳定性，对每个transformer 子层的输入进行归一化，而不是对输出进行归一化。使用 RMSNorm 归一化函数。
+
+**SwiGLU 激活函数 [PaLM]**。使用 SwiGLU 激活函数替换 ReLU 非线性以提高性能。使用 $\frac{2}{3} \cdot 4d$ 的维度而不是 PaLM 中的 $4d$。
+
+**旋转嵌入**。移除了绝对位置嵌入，改为在网络的每一层添加旋转位置嵌入 (RoPE)。
+
+以下是llama-13B的主要网络参数配置：
+
+```
+"hidden_act": "silu", 
+"hidden_size": 5120, 
+"intermediate_size": 13824, 
+"initializer_range": 0.02, 
+"max_sequence_length": 2048, 
+"model_type": "llama", 
+"num_attention_heads": 40, 
+"num_hidden_layers": 40, 
+"rms_norm_eps": 1e-06, 
+"torch_dtype": "float16", 
+"vocab_size": 32000
+```
+
+## 数据集
+我们在Fastchat目录下集成了英文对话数据集供用户快速验证：
+
+    ./FastChat-main/playground/data/alpaca-data-conversation.json
+
+
+## LLAMA-13B微调（slurm）
+
+### 环境配置
+
+要求DCU集群Slurm环境正常。
+
+依赖开发者社区torch1.10，deepspeed 0.6.3，apex0.1（可选）：https://developer.hpccube.com/tool/
+
+推荐用户使用预编译好的python3.8包来快速建立python3虚拟环境：
+
+    cp -r slurm/* ./
+    根据当前系统更改env.sh中相关路径
+    virtualenv -p /python_bin_path/python3 --system-site-packages venv_torch3.8
+    source env.sh	#进入venv_torch3.8虚拟环境
+    
+    pip3 install --upgrade pip -i https://mirrors.aliyun.com/pypi/simple	#更新pip
+    cd FastChat-main
+    pip3 install -e .
+    cd ../transformers-main
+    pip3 install -e .
+    cd ..
+    pip3 install  torch-1.10.0a0+git2040069.dtk2210-cp38-cp38-manylinux2014_x86_64.whl
+    pip3 install  deepspeed-0.6.3+1b2721a.dtk2210-cp38-cp38-manylinux2014_x86_64.whl
+    pip3 install  apex-0.1+gitdb7007a.dtk2210-cp38-cp38-manylinux2014_x86_64.whl	#可选
+
+### 训练
+
+该训练脚本需要8节点，每节点4张DCU-Z100-16G。
+
+并行配置采用zero3，使用fp16精度微调。如需启用apex的adamw_apex_fused优化器，请将./FastChat-main/fastchat/train/train.py第55行的优化器改为adamw_apex_fused。deepspeed config.json如下：
+
+```
+{
+  "train_micro_batch_size_per_gpu": 1,
+  "gradient_accumulation_steps":4,
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "initial_scale_power": 16,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "cpu_offload": false,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients" : true
+  }
+}
+```
+
+进入登陆节点，微调命令：
+
+    source submit_job.sh
+    tail -f log/xxx.out.log	#查看输出log
+    tail -f log/xxx.err.log	#查看错误log
+
+
+
+## LLAMA-13B微调（无slurm，使用mpi）
+
+### 环境配置
+
+2节点16卡Z100L裸金属节点，要求dtk22.10.1环境正常。mpirun文件夹下包含预编译好的openmpi库mpi4.tar.gz，可直接使用：
+
+```
+cp -r mpirun/* ./
+根据当前系统更改env.sh中相关路径
+cd FastChat-main
+pip3 install -e .
+cd ../transformers-main
+pip3 install -e .
+cd ..
+pip3 install  torch-1.10.0a0+git2040069.dtk2210-cp38-cp38-manylinux2014_x86_64.whl
+pip3 install  deepspeed-0.6.3+1b2721a.dtk2210-cp38-cp38-manylinux2014_x86_64.whl
+pip3 install  apex-0.1+gitdb7007a.dtk2210-cp38-cp38-manylinux2014_x86_64.whl	#可选
+```
+
+### 训练
+
+该训练脚本需要2节点，每节点8张DCU-Z100L-32G。
+
+并行配置采用zero3，使用fp16精度微调。如需启用apex的adamw_apex_fused优化器，请将./FastChat-main/fastchat/train/train.py第55行的优化器改为adamw_apex_fused。deepspeed config.json如下：
+
+```
+{
+  "train_micro_batch_size_per_gpu": 4,
+  "gradient_accumulation_steps":16,
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "initial_scale_power": 16,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "cpu_offload": false,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients" : true
+  }
+}
+```
+
+进入节点1，根据环境修改hostfile，并保证两节点的文件路径一致、配置相同。将mpi_job.sh中`--mca btl_tcp_if_include enp97s0f1`里的enp97s0f1改为节点IP所在的网卡名（可通过`ip a`命令查看）；numa绑定可根据当前节点拓扑调整。微调命令：
+
+```
+source mpi_job.sh
+```
+
+### 模型精度
+
+训练数据：[./FastChat-main/playground/data/alpaca-data-conversation.json](./FastChat-main/playground/data/alpaca-data-conversation.json)
+
+使用的GPGPU：16张DCU-Z100L-32G。
+
+模型精度（max_sequence_length: 2048）：
+
+| 卡数 | 分布式工具 | 收敛性 |
+| :------: | :------: |:------: |
+| 16 | deepspeed | total_loss: 0.62/150 steps |
+
+## 源码仓库及问题反馈
+
+- https://developer.hpccube.com/codes/modelzoo/llama_torch
+
+## 参考
+
+* https://huggingface.co/decapoda-research/llama-13b-hf
+* https://github.com/lm-sys/FastChat
\ No newline at end of file
--- a/model.properties
+++ b/model.properties
+# 模型名称
+modelName=LLAMA_torch
+# 模型描述
+modelDescription=基于Pytorch框架的llama-13b
+# 应用场景(多个标签以英文逗号分割)
+appScenario=训练,推理,train,inference,nlp,智能聊天助手
+# 框架类型(多个标签以英文逗号分割)
+frameType=Pytorch,Transformers,Deepspeed
--- a/mpirun/ds_config.json
+++ b/mpirun/ds_config.json
+{
+  "train_micro_batch_size_per_gpu": 4,
+  "gradient_accumulation_steps":16,
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "initial_scale_power": 16,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "cpu_offload": false,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients" : true
+  }
+}
+
--- a/mpirun/env.sh
+++ b/mpirun/env.sh
+#!/bin/bash
+# Environment for the mpirun-based run: points the executable, library and
+# header search paths at the DTK installation rooted at ROCM_PATH.
+# Change ROCM_PATH to match the local installation.
+
+export ROCM_PATH=/opt/dtk-22.10.1
+export ROCM_SOURCE_DIR=${ROCM_PATH}
+echo "$ROCM_PATH"
+export HIP_PATH=${ROCM_PATH}/hip
+export AMDGPU_TARGETS="gfx900;gfx906"
+export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hcc/bin:${ROCM_PATH}/hip/bin:$PATH
+
+export LD_LIBRARY_PATH=${ROCM_PATH}/lib:${ROCM_PATH}/lib64:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=${ROCM_PATH}/hip/lib:${ROCM_PATH}/llvm/lib:${ROCM_PATH}/opencl/lib/x86_64:$LD_LIBRARY_PATH
+# The OpenCL include directory is taken from ROCM_PATH like the OpenCL lib
+# directory above; a bare /opencl/include would point at the filesystem root.
+export C_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/hip/include/hip:${ROCM_PATH}/llvm/include:${ROCM_PATH}/opencl/include:${ROCM_PATH}/include/rocrand:${ROCM_PATH}/include/hiprand
+export CPLUS_INCLUDE_PATH=${ROCM_PATH}/include:${ROCM_PATH}/hip/include/hip:${ROCM_PATH}/llvm/include:${ROCM_PATH}/opencl/include:${ROCM_PATH}/include/rocrand:${ROCM_PATH}/include/hiprand
+export PATH=${ROCM_PATH}/miopen/bin:${ROCM_PATH}/rocblas/bin:${ROCM_PATH}/hipsparse/bin:$PATH
+export LD_LIBRARY_PATH=${ROCM_PATH}/miopen/lib:${ROCM_PATH}/rocblas/lib:$LD_LIBRARY_PATH
+export MIOPEN_SYSTEM_DB_PATH=${ROCM_PATH}/miopen/share/miopen/db/
+# System library directory for both the runtime and the link-time search path.
+export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/usr/lib64:$LIBRARY_PATH
+
+export RCCL_PATH=$ROCM_PATH/rccl
+export NCCL_PATH=$ROCM_PATH/rccl
+export LD_LIBRARY_PATH=$RCCL_PATH/lib:$LD_LIBRARY_PATH
+
+export MIOPEN_FIND_MODE=3
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_P2P_LEVEL=5
+
+export NCCL_GDR_FLUSH_DISABLE=1
+export NCCL_NET_GDR_LEVEL=SYS
+export RCCL_NCHANNELS=2
+
+export NCCL_DEBUG=INFO
+export MODEL_SIZE=13
+export SEQ_LEN=2048
+export GC_SCALE=4
+export DATE=20230322
+export USE_FLASH_ATTN=0
+
+#export HIP_LOG_LEVEL=5
+#export HIP_MODULE_MASK=0xffffffff
+#export HIP_MEMORY_BLOCKING=1
+#export HIP_LAUNCH_BLOCKING=1
+
+export HIP_CLANG_PATH=/opt/dtk-22.10.1/llvm/bin
+
+export HSA_PATH=/opt/dtk-22.10.1/hsa
+export AOMP=/opt/dtk-22.10.1/llvm
+export LD_LIBRARY_PATH=/opt/dtk-22.10.1/rccl/lib:/usr/lib64:/opt/dtk-22.10.1/miopen/lib:/opt/dtk-22.10.1/rocblas/lib:/opt/dtk-22.10.1/hip/lib:/opt/dtk-22.10.1/llvm/lib:/opt/dtk-22.10.1/opencl/lib/x86_64:/opt/dtk-22.10.1/lib:/opt/dtk-22.10.1/lib64:/opt/dtk-22.10.1/rccl/lib:/usr/lib64:/opt/dtk-22.10.1/miopen/lib:/opt/dtk-22.10.1/rocblas/lib:/opt/dtk-22.10.1/hip/lib:/opt/dtk-22.10.1/llvm/lib:/opt/dtk-22.10.1/opencl/lib/x86_64:/opt/dtk-22.10.1/lib:/opt/dtk-22.10.1/lib64:/opt/dtk-22.10.1/roctracer/lib:/opt/dtk-22.10.1/rocthrust/lib:/opt/dtk-22.10.1/rocsparse/lib:/opt/dtk-22.10.1/rocsolver/lib:/opt/dtk-22.10.1/rocrand/lib:/opt/dtk-22.10.1/rocprofiler/lib:/opt/dtk-22.10.1/rocprim/lib:/opt/dtk-22.10.1/dtk-22.10.1_smi/lib:/opt/dtk-22.10.1/rocfft/lib:/opt/dtk-22.10.1/rocblas/lib:/opt/dtk-22.10.1/rocalution/lib:/opt/dtk-22.10.1/rccl/lib:/opt/dtk-22.10.1/opencl/lib:/opt/dtk-22.10.1/oam/lib:/opt/dtk-22.10.1/migraphx/lib:/opt/dtk-22.10.1/miopengemm/lib:/opt/dtk-22.10.1/miopen/lib:/opt/dtk-22.10.1/llvm/lib-debug/src/openmp/libomptarget/plugins/remote/lib:/opt/dtk-22.10.1/llvm/lib/clang/14.0.0/lib:/opt/dtk-22.10.1/llvm/lib:/opt/dtk-22.10.1/hsa/lib:/opt/dtk-22.10.1/hipsparse/lib:/opt/dtk-22.10.1/hipsolver/lib:/opt/dtk-22.10.1/hiprand/lib:/opt/dtk-22.10.1/hipfft/lib:/opt/dtk-22.10.1/hipcub/lib:/opt/dtk-22.10.1/hipblas-clients/lib:/opt/dtk-22.10.1/hipblas/lib:/opt/dtk-22.10.1/hip/lib:/opt/dtk-22.10.1/lib:/opt/dtk-22.10.1/lib64:/opt/mpi/lib:/usr/local/lib/:/usr/local/lib64/:/usr/lib64/
+export PATH=/opt/dtk-22.10.1/miopen/bin:/opt/dtk-22.10.1/rocblas/bin:/opt/dtk-22.10.1/hipsparse/bin:/opt/dtk-22.10.1/bin:/opt/dtk-22.10.1/llvm/bin:/opt/dtk-22.10.1/hcc/bin:/opt/dtk-22.10.1/hip/bin:/opt/dtk-22.10.1/miopen/bin:/opt/dtk-22.10.1/rocblas/bin:/opt/dtk-22.10.1/hipsparse/bin:/opt/dtk-22.10.1/bin:/opt/dtk-22.10.1/llvm/bin:/opt/dtk-22.10.1/hcc/bin:/opt/dtk-22.10.1/hip/bin:/opt/dtk-22.10.1/libexec/rocprofiler:/opt/dtk-22.10.1/libexec/dtk-22.10.1_smi:/opt/dtk-22.10.1/rocprofiler/bin:/opt/dtk-22.10.1/opencl/bin:/opt/dtk-22.10.1/miopen/bin:/opt/dtk-22.10.1/llvm/lib/clang/14.0.0/bin:/opt/dtk-22.10.1/llvm/bin:/opt/dtk-22.10.1/hip/bin:/opt/dtk-22.10.1/bin:/opt/mpi/bin:/root/anaconda3/bin:/root/anaconda3/condabin:/usr/lib64/qt-3.3/bin:/root/perl5/bin:/opt/dtk-22.10.1/bin:/opt/dtk-22.10.1/hip/bin:/opt/dtk-22.10.1/llvm/bin:/opt/dtk-22.10.1/llvm/lib/clang/14.0.0/bin:/opt/dtk-22.10.1/miopen/bin:/opt/dtk-22.10.1/opencl/bin:/opt/dtk-22.10.1/rocprofiler/bin:/opt/dtk-22.10.1/libexec/dtk-22.10.1_smi:/opt/dtk-22.10.1/libexec/rocprofiler:/opt/rh/devtoolset-7/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/root/bin
+export ROCM_ROOT=/opt/dtk-22.10.1
+export ROCBLAS_TENSILE_LIBPATH=/opt/dtk-22.10.1/lib/rocblas/library
+export HIP_ROCCLR_HOME=/opt/dtk-22.10.1/hip
+export HIP_LIB_PATH=/opt/dtk-22.10.1/hip/lib
+export DEVICE_LIB_PATH=/opt/dtk-22.10.1/amdgcn/bitcode
+
+#export ROCBLAS_LAYER=3
+
--- a/mpirun/hostfile
+++ b/mpirun/hostfile
+10.0.21.163 slots=8
+10.0.21.116 slots=8
\ No newline at end of file
--- a/mpirun/mpi4.tar.gz
+++ b/mpirun/mpi4.tar.gz
--- a/mpirun/mpi_job.sh
+++ b/mpirun/mpi_job.sh
+source env.sh
+hostfile=./hostfile
+
+np=$(cat $hostfile|sort|uniq |wc -l)
+np=$(($np*8))
+
+which mpirun
+mpirun -np $np --allow-run-as-root --hostfile hostfile --bind-to none --mca btl_tcp_if_include enp97s0f1 mpi_single.sh 8
+echo "END TIME: $(date)"
+
+
+