"...lm-evaluation-harness.git" did not exist on "68687d6df1ac20e60d547efb739ccc2d48a2a72f"
Commit 0bf5e500 authored by Tri Dao

Release training code

parent 9bc63d1e
# https://wandb.ai
wandb:
  _target_: pytorch_lightning.loggers.wandb.WandbLogger
  project: attention
  name: ${name}
  save_dir: "."
  mode: online # set offline to store all logs only locally
  id: ${oc.select:name} # pass correct id to resume experiment!
  # entity: "" # set to name of your wandb team or just remove it
  log_model: False
  prefix: ""
  job_type: "train"
  group: ""
  tags: []
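For reference, a minimal sketch (not the repo's code) of how Hydra turns the logger node above into a Lightning logger. The inline values stand in for interpolations such as ${name} and ${oc.select:name}, which only resolve once the full config is composed; mode is switched to offline so the snippet runs without a wandb account (requires pytorch_lightning and wandb installed):

from hydra.utils import instantiate
from omegaconf import OmegaConf

# Inline stand-in for the composed logger node, with values copied from the YAML above.
node = OmegaConf.create({
    "_target_": "pytorch_lightning.loggers.wandb.WandbLogger",
    "project": "attention",
    "name": "my-run",    # would normally come from ${name}
    "save_dir": ".",
    "mode": "offline",   # "online" in the config; offline needs no wandb login
    "job_type": "train",
})
wandb_logger = instantiate(node)  # same as calling WandbLogger(project="attention", ...)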
# @package eval.metrics
acc:
  _target_: src.metrics.accuracy.AccuracyMine
# @package eval.metrics
acc:
  _target_: torchmetrics.Accuracy
  ignore_index: -100
# @package eval.metrics
acctop5:
  _target_: src.metrics.accuracy.AccuracyMine
  top_k: 5
# @package eval.metrics
mse:
  _target_: torchmetrics.MeanSquaredError
# @package eval.metrics
num-tokens:
  _target_: src.metrics.num_tokens.NumTokens
# @package eval.metrics
ppl:
  _target_: src.metrics.perplexity.Perplexity
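Each file above contributes one entry to the eval.metrics group. A rough sketch (not the repo's training loop) of how such a group is typically instantiated into metric objects; the repo-specific src.metrics.* targets (AccuracyMine, NumTokens, Perplexity) would be handled the same way:

from hydra.utils import instantiate
from omegaconf import OmegaConf

# Stand-in for the composed eval.metrics node, using only a stock torchmetrics class.
metrics_cfg = OmegaConf.create({
    "mse": {"_target_": "torchmetrics.MeanSquaredError"},
})
metrics = {name: instantiate(node) for name, node in metrics_cfg.items()}
print(metrics["mse"])  # MeanSquaredError()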
# @package _global_
# run in debug mode with:
# `python run.py mode=debug`
defaults:
- override /trainer: debug.yaml
debug_mode: True
hydra:
  # sets level of all command line loggers to 'DEBUG'
  verbose: True
  # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
  # sets level of only chosen command line loggers to 'DEBUG'
  # verbose: [src.train, src.utils.utils]
  # sets output paths for all file logs to 'logs/debug/'
  run:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/${now:%Y-%m-%d}/${now:%H-%M-%S}
  sweep:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/multirun_${now:%Y-%m-%d_%H-%M-%S}
    subdir: ${hydra.job.num}
# disable rich config printing, since it will be already printed by hydra when `verbose: True`
print_config: False
# @package _global_
# default running mode
default_mode: True
hydra:
  # default output paths for all file logs
  run:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/runs/${now:%Y-%m-%d}/${now:%H-%M-%S}
  sweep:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/multiruns/${now:%Y-%m-%d_%H-%M-%S}
    subdir: ${hydra.job.num}
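The ${oc.env:RESULT_DIR,...} pattern in these paths falls back to a default when the RESULT_DIR environment variable is unset. A small sketch of that resolver (the work_dir value is illustrative):

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "work_dir": "/tmp/project",  # illustrative value
    "log_dir": "${oc.env:RESULT_DIR,${work_dir}/logs}",
})
print(cfg.log_dir)  # "/tmp/project/logs" unless RESULT_DIR is set in the environment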
# @package _global_
# run in experiment mode with:
# `python run.py mode=exp name=experiment_name`
experiment_mode: True
# allows for custom naming of the experiment
name: ???
hydra:
  # sets output paths for all file logs to 'logs/experiments/${name}'
  run:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/experiments/${name}
  sweep:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/experiments/${name}
    subdir: ${hydra.job.num}
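The name: ??? entry is OmegaConf's marker for a mandatory value: the run fails if it is read before being supplied (e.g. via name=experiment_name on the command line). A minimal sketch:

from omegaconf import OmegaConf
from omegaconf.errors import MissingMandatoryValue

cfg = OmegaConf.create({"name": "???"})  # "???" is interpreted as MISSING
try:
    _ = cfg.name
except MissingMandatoryValue:
    print("supply it on the command line, e.g. python run.py mode=exp name=my_experiment")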
# @package _global_
# Run the Pytorch profiler
trainer:
  profiler:
    _target_: pytorch_lightning.profilers.PyTorchProfiler
    dirpath: ${hydra.run.dir}
    schedule:
      _target_: torch.profiler.schedule
      wait: 5
      warmup: 5
      active: 5
    use_cuda: True
  max_steps: 20
logger:
  wandb:
    mode: disabled
callbacks:
  model_checkpoint: null
  model_checkpoint_progress: null
  early_stopping: null
hydra:
  # sets output paths for all file logs to 'logs/profile/'
  run:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/profile/${now:%Y-%m-%d}/${now:%H-%M-%S}
  sweep:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/profile/multirun_${now:%Y-%m-%d_%H-%M-%S}
    subdir: ${hydra.job.num}
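The profiler settings above are forwarded to torch.profiler. Outside of Lightning, a bare-bones equivalent with the same schedule would look roughly like this (the matmul is a stand-in for a training step):

import torch
from torch.profiler import ProfilerActivity, profile, schedule

prof_schedule = schedule(wait=5, warmup=5, active=5)  # same values as the YAML above
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)          # what use_cuda: True asks for

with profile(activities=activities, schedule=prof_schedule) as prof:
    for step in range(20):                            # max_steps: 20 in the config
        torch.randn(64, 64) @ torch.randn(64, 64)     # stand-in for one training step
        prof.step()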
# @package _global_
# Smoke test: disable logging and model checkpointing
logger:
  wandb:
    mode: disabled
callbacks:
  model_checkpoint: null
  model_checkpoint_progress: null
hydra:
  # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
  # sets level of only chosen command line loggers to 'DEBUG'
  # verbose: [src.train, src.utils.utils]
  # sets output paths for all file logs to 'logs/debug/'
  run:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/${now:%Y-%m-%d}/${now:%H-%M-%S}
  sweep:
    dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/multirun_${now:%Y-%m-%d_%H-%M-%S}
    subdir: ${hydra.job.num}
defaults:
- _self_
- gpt2model: gpt2-small
_target_: transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel
_recursive_: True
config:
  _target_: transformers.GPT2Config
  # Mistral's config: https://github.com/stanford-crfm/mistral/blob/main/conf/models/gpt2-small.yaml
  # However, reorder_and_upcast_attn slows things down
  reorder_and_upcast_attn: false
  scale_attn_by_inverse_layer_idx: true
  n_positions: ${datamodule.max_length}
defaults:
- _self_
- gpt2model: gpt2-small
_target_: flash_attn.models.gpt.GPTLMHeadModel
_recursive_: True
config:
  _target_: transformers.GPT2Config
  # Mistral's config: https://github.com/stanford-crfm/mistral/blob/main/conf/models/mistral-small.yaml
  # However, reorder_and_upcast_attn slows things down
  reorder_and_upcast_attn: false
  scale_attn_by_inverse_layer_idx: true
  n_positions: ${datamodule.max_length}
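Both model configs above build a transformers.GPT2Config and differ only in which LM-head class consumes it: the Hugging Face GPT2LMHeadModel versus flash_attn's GPTLMHeadModel. A hedged sketch with the gpt2-small sizes filled in by hand (the flash_attn constructor is an assumption and may differ across versions):

from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(
    n_embd=768, n_head=12, n_layer=12,  # gpt2-small values, see the gpt2model configs below
    n_positions=1024,                   # stands in for ${datamodule.max_length}
    reorder_and_upcast_attn=False,
    scale_attn_by_inverse_layer_idx=True,
)
model = GPT2LMHeadModel(config)

# flash_attn variant (assumed interface, same config object):
# from flash_attn.models.gpt import GPTLMHeadModel
# model = GPTLMHeadModel(config)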
# @package _global_
model:
  config:
    n_embd: 1280
    n_head: 20
    n_layer: 36
# @package _global_
model:
  config:
    n_embd: 1024
    n_head: 16
    n_layer: 24
# @package _global_
model:
  config:
    n_embd: 768
    n_head: 12
    n_layer: 12
# @package _global_
model:
  config:
    n_embd: 1600
    n_head: 25
    n_layer: 48
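For orientation, rough parameter-count arithmetic for the four sizes above (a sketch, not from the repo): each transformer block carries about 12 * n_embd^2 weights and the embeddings add roughly (n_vocab + n_ctx) * n_embd more:

# Biases and LayerNorm weights are ignored, so the counts are approximate.
def approx_params(n_layer, n_embd, n_vocab=50257, n_ctx=1024):
    return 12 * n_layer * n_embd ** 2 + (n_vocab + n_ctx) * n_embd

sizes = {
    "gpt2-small":  (768, 12, 12),
    "gpt2-medium": (1024, 16, 24),
    "gpt2-large":  (1280, 20, 36),
    "gpt2-xlarge": (1600, 25, 48),
}
for name, (n_embd, n_head, n_layer) in sizes.items():
    print(f"{name}: ~{approx_params(n_layer, n_embd) / 1e6:.0f}M parameters")
# roughly 124M / 354M / 773M / 1557M, matching the usual GPT-2 size names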
# @package train.optimizer
_target_: torch.optim.Adam
# @package train.optimizer
_target_: apex.contrib.optimizers.distributed_fused_adam.DistributedFusedAdam
adam_w_mode: True
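Neither optimizer config stores a params entry; a common Hydra pattern (sketched here, not necessarily the repo's exact code) is to pass the model parameters at instantiation time:

import torch.nn as nn
from hydra.utils import instantiate
from omegaconf import OmegaConf

model = nn.Linear(8, 8)  # stand-in model
opt_cfg = OmegaConf.create({"_target_": "torch.optim.Adam", "lr": 6e-4})  # lr is illustrative
optimizer = instantiate(opt_cfg, params=model.parameters())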