"conda/vscode:/vscode.git/clone" did not exist on "73b5146e54dcb6eff02205446c20a50075f14b21"
Commit 26e59280 authored by wanglch's avatar wanglch
Browse files

Initial commit

parents
Pipeline #2674 failed with stages
in 0 seconds
[flake8]
ignore = E501, F403, C901, W504, W605, E251, E122, E126, E127, E722, W503, E128, E741, E731, E701
select = E1, E3, E502, E7, E9, W1, W5, W6
max-line-length = 180
exclude = *.egg/*,build,dist,detection/configs/*
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
.DS_Store
data_process/
internvl_chat/work_dirs/
internvl_chat/unittest/
internvl_chat/data/
Husky2/*
*distillation*
batchscript-*
results/
[isort]
line_length = 180
multi_line_output = 0
extra_standard_library = setuptools
known_third_party = PIL,asynctest,cityscapesscripts,cv2,gather_models,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,pytorch_sphinx_theme,requests,scipy,seaborn,six,terminaltables,torch,ts,yaml
no_lines_before = STDLIB,LOCALFOLDER
default_section = THIRDPARTY
[yapf]
BASED_ON_STYLE = pep8
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true
SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true
[codespell]
skip = *.ipynb
quiet-level = 3
ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood
exclude: ^internvl_chat_llava/
repos:
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
- id: fix-encoding-pragma
args: ["--remove"]
- id: mixed-line-ending
args: ["--fix=lf"]
- repo: https://github.com/executablebooks/mdformat
rev: 0.7.9
hooks:
- id: mdformat
args: ["--number"]
additional_dependencies:
- mdformat-openmmlab
- mdformat_frontmatter
- linkify-it-py
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04
## 🛠️ Installation
- Clone this repository:
```bash
git clone https://github.com/OpenGVLab/InternVL.git
```
- Create a conda virtual environment and activate it:
```bash
conda create -n internvl python=3.9 -y
conda activate internvl
```
- Install dependencies using `requirements.txt`:
```bash
pip install -r requirements.txt
```
By default, our `requirements.txt` file includes the following dependencies:
- `-r requirements/internvl_chat.txt`
- `-r requirements/streamlit_demo.txt`
- `-r requirements/classification.txt`
- `-r requirements/segmentation.txt`
The `clip_benchmark.txt` is **not** included in the default installation. If you require the `clip_benchmark` functionality, please install it manually by running the following command:
```bash
pip install -r requirements/clip_benchmark.txt
```
### Additional Instructions
- Install `flash-attn==2.3.6`:
```bash
pip install flash-attn==2.3.6 --no-build-isolation
```
Alternatively, you can compile it from source:
```bash
git clone https://github.com/Dao-AILab/flash-attention.git
cd flash-attention
git checkout v2.3.6
python setup.py install
```
- Install `mmcv-full==1.6.2` (optional, for `segmentation`):
```bash
pip install -U openmim
mim install mmcv-full==1.6.2
```
- Install `apex` (optional, for `segmentation`):
```bash
git clone https://github.com/NVIDIA/apex.git
cd apex
git checkout 2386a912164b0c5cfcd8be7a2b890fbac5607c82 # https://github.com/NVIDIA/apex/issues/1735
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
```
If you encounter `ModuleNotFoundError: No module named 'fused_layer_norm_cuda'`, it means apex's CUDA extensions were not installed successfully. You can uninstall apex, and the code will fall back to the PyTorch implementation of RMSNorm. Alternatively, if you prefer to use apex, try adding a few lines to `setup.py` and then recompiling.
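For reference, here is a minimal sketch of that fallback pattern (module paths assumed; not the repository's exact code):

```python
import torch
import torch.nn as nn

# Prefer apex's fused kernel when its CUDA extensions are available;
# otherwise fall back to a plain-PyTorch RMSNorm.
try:
    from apex.normalization import FusedRMSNorm as RMSNorm  # requires the --cuda_ext build
except ImportError:
    class RMSNorm(nn.Module):
        def __init__(self, hidden_size, eps=1e-6):
            super().__init__()
            self.weight = nn.Parameter(torch.ones(hidden_size))
            self.eps = eps

        def forward(self, x):
            variance = x.pow(2).mean(-1, keepdim=True)
            return self.weight * x * torch.rsqrt(variance + self.eps)
```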
<img src=https://github.com/OpenGVLab/InternVL/assets/23737120/c04a989c-8024-49fa-b62c-2da623e63729 width=50%>
MIT License
Copyright (c) 2023 OpenGVLab
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# InternVL3
## Paper
[InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks](https://arxiv.org/abs/2312.14238)
## Model Architecture
InternVL3 follows the "ViT-MLP-LLM" paradigm adopted by its predecessors. To reduce computational cost, the researchers initialize the model from pretrained ViT and LLM components.
- Vision Encoder: available in two configurations, InternViT-300M and InternViT-6B.
- Language Model: uses pretrained base models (without instruction tuning) from the Qwen2.5 series and InternLM3-8B.
- High-resolution processing: following InternVL2.5, InternVL3 applies a pixel unshuffle operation that reduces the number of visual tokens for each 448x448 image tile to one quarter of the original count, improving scalability to high-resolution images (see the sketch after the architecture figure below).
- Variable Visual Position Encoding (V2PE): InternVL3 integrates V2PE ([42]), which handles longer multimodal contexts by using smaller, more flexible position increments for visual tokens. Concretely, the position increment for text tokens remains 1, while the increment δ for visual tokens is smaller than 1. During training, δ is randomly sampled from a predefined set of small fractional values (e.g. 1/2, 1/4, ..., 1/256), and at inference it is chosen flexibly according to the sequence length. When δ=1, V2PE degenerates to the conventional position encoding used in InternVL2.5. A minimal sketch of the increment rule follows this list.
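Below is an illustrative sketch of the V2PE increment rule (a toy example, not the repository's implementation): text tokens advance the position counter by 1, while visual tokens advance it by a fractional δ.

```python
# Toy V2PE position assignment: positions grow by 1 for text tokens and
# by a fractional delta (< 1) for visual tokens.
def v2pe_positions(token_is_visual, delta=0.25):
    pos, positions = 0.0, []
    for is_visual in token_is_visual:
        positions.append(pos)
        pos += delta if is_visual else 1.0
    return positions

print(v2pe_positions([False, True, True, True, True, False]))
# [0.0, 1.0, 1.25, 1.5, 1.75, 2.0] -- with delta=1 this reduces to standard increments
```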
<div align=center>
<img src="./images/arch.png"/>
</div>
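The pixel unshuffle step mentioned above can be illustrated as follows (assumed shapes for a 448x448 tile with 14x14 patches; a sketch, not the repository's exact code):

```python
import torch
import torch.nn.functional as F

# Pixel unshuffle trades spatial resolution for channels: a 32x32 grid of
# visual tokens becomes a 16x16 grid, i.e. 1024 -> 256 tokens (one quarter).
tokens = torch.randn(1, 1024, 3200)                      # 32x32 patch grid, hidden dim 3200
grid = tokens.view(1, 32, 32, 3200).permute(0, 3, 1, 2)  # (B, C, H, W)
merged = F.pixel_unshuffle(grid, downscale_factor=2)     # (B, 4C, H/2, W/2)
merged = merged.permute(0, 2, 3, 1).reshape(1, 256, 4 * 3200)
print(merged.shape)  # torch.Size([1, 256, 12800])
```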
## Algorithm
ViT, proposed by a team at Google, applies the Transformer to image classification. ViT splits the input image into patches (16x16), projects each patch into a fixed-length vector, and feeds the sequence into a Transformer; the subsequent encoder operates exactly as in the original Transformer. Because the task is image classification, a special token is added to the input sequence, and the output corresponding to that token serves as the final class prediction (a patch-embedding sketch follows the figure below).
<div align=center>
<img src="./images/theory.png"/>
</div>
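A minimal patch-embedding sketch (assumed ViT-Base sizes, not InternViT-6B itself) of the patchify-and-project step described above:

```python
import torch
import torch.nn as nn

# A 224x224 image becomes 196 16x16 patches, each projected to a 768-dim
# vector; a learnable [CLS] token is prepended, and its final hidden state
# would be used for classification.
img = torch.randn(1, 3, 224, 224)
proj = nn.Conv2d(3, 768, kernel_size=16, stride=16)     # patchify + linear projection
patches = proj(img).flatten(2).transpose(1, 2)          # (1, 196, 768)
cls_token = nn.Parameter(torch.zeros(1, 1, 768))
tokens = torch.cat([cls_token.expand(1, -1, -1), patches], dim=1)
print(tokens.shape)  # torch.Size([1, 197, 768])
```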
## Environment Setup
### Docker (Option 1)
Running with Docker is recommended. The address for pulling the Docker image from [SourceFind](https://www.sourcefind.cn/#/service-details) and the usage steps are given below:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04
docker run -it --shm-size=1024G -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal --network=host --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name internvl3 <your IMAGE ID> bash # replace <your IMAGE ID> with the ID of the image pulled above
git clone http://developer.sourcefind.cn/codes/modelzoo/internvl3_pytorch.git
cd /path/your_code_data/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
```
Tips: the DTK driver, Python, torch, and other DCU-related tool versions above must match one another exactly.
### Dockerfile (Option 2)
How to use the Dockerfile:
```
git clone http://developer.sourcefind.cn/codes/modelzoo/internvl3_pytorch.git
docker build -t internvl:latest .
docker run --shm-size 500g --network=host --name=internvl3 --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash
cd /path/your_code_data/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
```
### Anaconda (Option 3)
Detailed steps for local setup and compilation are provided here, for example:
The special deep learning libraries required by the DCU GPUs for this project can be downloaded and installed from the [Guanghe](https://developer.hpccube.com/tool/) developer community.
```
DTK driver: dtk24.04.3
python: 3.10
torch: 2.3.0
transformers>=4.48.0
```
`Tips: the DTK driver, Python, torch, and other DCU-related tool versions above must match one another exactly`
Install the other non-deep-learning libraries according to `requirements.txt`:
```
git clone http://developer.sourcefind.cn/codes/modelzoo/internvl3_pytorch.git
cd /path/your_code_data/
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
```
## Dataset
## Training
## Inference
### Single Node, Single GPU
```
python internvl3_inference.py
```
### Single Node, Multiple GPUs
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python internvl3_inference.py
```
## Results
<div align=left>
<img src="./images/result.png"/>
</div>
### Accuracy
## Application Scenarios
### Algorithm Category
`Conversational QA`
### Key Application Industries
`Research, education, government, finance`
## Pretrained Weights
HF/GitHub download: [OpenGVLab/InternVL3](https://huggingface.co/collections/OpenGVLab/internvl3-67f7f690be79c2fe9d74fe9d)
ModelScope download: [OpenGVLab/InternVL3](https://www.modelscope.cn/collections/InternVL3-5d0bdc54b7d84e)
## Source Repository and Issue Reporting
- https://developer.sourcefind.cn/codes/modelzoo/internvl3_pytorch
## References
- https://github.com/OpenGVLab/InternVL
# InternViT-6B for Image Classification
This folder contains the implementation of the InternViT-6B for image classification, which corresponds to Section 4.2.1 of our [InternVL 1.0 paper](https://arxiv.org/pdf/2312.14238).
The codebase for this part is derived from [InternImage](https://github.com/OpenGVLab/InternImage), with some code references to [EVA](https://github.com/baaivision/EVA/tree/master) and [DINOv2](https://github.com/facebookresearch/dinov2). Thanks for their great work.
In this part, we validate the visual perception capabilities of InternViT-6B, the core component of InternVL 1.0.
We evaluate the quality of visual representation produced by InternViT-6B using the ImageNet-1K dataset. Following common practices, we adopt the linear probing evaluation, i.e. training a linear classifier while keeping the backbone frozen. In addition to the ImageNet-1K validation set,
we also report performance metrics on several ImageNet variants, to benchmark the domain generalization capability.
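As an illustration of linear probing, here is a self-contained sketch with a toy stand-in backbone (not InternViT-6B itself):

```python
import torch
import torch.nn as nn

# Freeze the backbone and train only a linear classifier on its features.
backbone = nn.Sequential(nn.Flatten(), nn.Linear(3 * 224 * 224, 512))  # stand-in feature extractor
for p in backbone.parameters():
    p.requires_grad = False

classifier = nn.Linear(512, 1000)                     # the only trainable module
optimizer = torch.optim.SGD(classifier.parameters(), lr=0.1)

x, y = torch.randn(4, 3, 224, 224), torch.randint(0, 1000, (4,))
with torch.no_grad():
    feats = backbone(x)                               # frozen features
loss = nn.functional.cross_entropy(classifier(feats), y)
loss.backward()
optimizer.step()
```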
InternViT-6B follows the structure of vanilla ViT, and its hyperparameters are listed in the table below.
<img width="558" alt="image" src="https://github.com/OpenGVLab/InternVL/assets/23737120/e6bb0151-ab2f-4436-982f-6c68c5a69bc4">
## 🛠️ Installation
Follow the [installation guide](../INSTALLATION.md) to perform installations.
## 📦 Data Preparation
> Please prepare the dataset according to your needs.
- `ImageNet-1K`: We use the standard ImageNet dataset, you can download it from [http://image-net.org/](http://image-net.org/).
- `ImageNet-A`: Download it from [https://people.eecs.berkeley.edu/~hendrycks/imagenet-a.tar](https://people.eecs.berkeley.edu/~hendrycks/imagenet-a.tar).
- `ImageNet-R`: Download it from [https://people.eecs.berkeley.edu/~hendrycks/imagenet-r.tar](https://people.eecs.berkeley.edu/~hendrycks/imagenet-r.tar).
- `ImageNetV2`: Download it from [https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenetv2-matched-frequency.tar.gz](https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenetv2-matched-frequency.tar.gz).
- `ImageNet-Sketch`: Download it using `gdown`.
```shell
# GDown is needed to download the dataset.
# Please install it via `pip install gdown`
gdown --id 1Mj0i5HBthqH1p_yeXzsg22gZduvgoNeA
```
First, please prepare the `ImageNet-1K`, `ImageNet-A`, `ImageNet-R`, `ImageNetV2`, and `ImageNet-Sketch` datasets following the directory structure outlined below.
```bash
$ tree data
data
├── imagenet-1k
│ ├── train
│ ├── n01498041
│ └── ...
│ └── val
│ ├── ILSVRC2012_val_00000001.JPEG
│ └── ...
├── imagenet-a
│ ├── n01498041
│ └── ...
├── imagenet-r
│ ├── n01443537
│ └── ...
├── imagenet-sketch
│ ├── n01440764
│ └── ...
└── imagenetv2
└── ImageNetV2-matched-frequency
```
Then, unzip the `train.txt.zip` and `val.txt.zip` in `meta_data/`.
```shell
cd meta_data/
unzip train.txt.zip
unzip val.txt.zip
```
## 📦 Model Preparation
| model name | type | download | size |
| ---------------------------- | ------- | ---------------------------------------------------------------------------------------------- | :-----: |
| intern_vit_6b_224px.pth | pytorch | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL/blob/main/intern_vit_6b_224px.pth) | 12 GB |
| intern_vit_6b_224px_head.pth | pytorch | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL/blob/main/intern_vit_6b_224px_head.pth) | 25.7 MB |
Please download the above model weights and place them in the `pretrained/` folder.
```sh
cd pretrained
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/intern_vit_6b_224px.pth
wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/intern_vit_6b_224px_head.pth
```
The directory structure is:
```sh
pretrained
├── intern_vit_6b_224px_head.pth
└── intern_vit_6b_224px.pth
```
## 🔍 Linear Probing on ImageNet-1K
> **Warning**: Please install `apex` before training (see [installation guide](../INSTALLATION.md#additional-instructions) for details).
To train a linear classifier for `InternViT-6B` on ImageNet with 8 GPUs, run:
```bash
python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 main.py --cfg configs/intern_vit_6b_1k_224.yaml
# or manage jobs with slurm
GPUS=8 sh train_in1k.sh <partition> <job-name> configs/intern_vit_6b_1k_224.yaml --launcher slurm
```
Note: it is normal for the following message to appear during training; it can be safely ignored:
> \_IncompatibleKeys(missing_keys=\[\], unexpected_keys=\['clip_projector.norm1_q.weight', 'clip_projector.norm1_q.bias', 'clip_projector.norm1_k.weight', 'clip_projector.norm1_k.bias', 'clip_projector.norm1_v.weight', 'clip_projector.norm1_v.bias', 'clip_projector.cross_attn.q_bias', 'clip_projector.cross_attn.k_bias', 'clip_projector.cross_attn.v_bias', 'clip_projector.cross_attn.q.weight', 'clip_projector.cross_attn.k.weight', 'clip_projector.cross_attn.v.weight', 'clip_projector.cross_attn.proj.weight', 'clip_projector.cross_attn.proj.bias'\])
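This message comes from loading the backbone checkpoint non-strictly: the CLIP-projector weights in the checkpoint have no counterpart in the classification model. A toy illustration of the mechanism (not the repository's loading code):

```python
import torch
import torch.nn as nn

# With strict=False, checkpoint entries that have no counterpart in the
# model are reported as unexpected_keys instead of raising an error.
model = nn.Linear(4, 2)
ckpt = dict(model.state_dict())
ckpt['clip_projector.norm1_q.weight'] = torch.zeros(4)  # extra (projector) weight
msg = model.load_state_dict(ckpt, strict=False)
print(msg)  # _IncompatibleKeys(missing_keys=[], unexpected_keys=['clip_projector.norm1_q.weight'])
```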
## 📊 Evaluation
> **Warning**: Please install `apex` before evaluation (see [installation guide](../INSTALLATION.md#additional-instructions) for details).
| model name | IN-1K | IN-ReaL | IN-V2 | IN-A | IN-R | IN-Sketch | download |
| -------------------------------------------------------------- | :---: | :-----: | :---: | :--: | :--: | :-------: | :--------------------------------------------------------------------------------------------------------------------------------------------------: |
| [intern_vit_6b_1k_224.yaml](configs/intern_vit_6b_1k_224.yaml) | 88.2 | 90.4 | 79.9 | 77.5 | 89.8 | 69.1 | [ckpt](https://huggingface.co/OpenGVLab/InternVL/resolve/main/intern_vit_6b_224px_head.pth) \| [log](./work_dirs/intern_vit_6b_1k_224/log_rank0.txt) |
<details>
<summary>Evaluate InternViT-6B on <b>ImageNet-1K val</b> with 8 GPUs (click to expand).</summary>
```bash
python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 main.py --eval \
--cfg configs/intern_vit_6b_1k_224.yaml --resume pretrained/intern_vit_6b_224px_head.pth
# or manage jobs with slurm
GPUS=8 sh train_in1k.sh <partition> <job-name> configs/intern_vit_6b_1k_224.yaml --eval \
--resume pretrained/intern_vit_6b_224px_head.pth --launcher slurm
```
Expected results:
```
* Acc@1 88.230 Acc@5 98.474
Accuracy of the network on the 50000 test images: 88.2%
```
</details>
<details>
<summary>Evaluate InternViT-6B on <b>ImageNet-ReaL</b> with 1 GPU (click to expand).</summary>
**Note: ImageNet-ReaL now only supports single-GPU testing.**
```bash
python -m torch.distributed.launch --nproc_per_node 1 --master_port 12345 main.py --eval \
--cfg configs/intern_vit_6b_1k_224_test_imagenet_real.yaml --resume pretrained/intern_vit_6b_224px_head.pth
# or manage jobs with slurm
GPUS=1 GPUS_PER_NODE=1 sh train_in1k.sh <partition> <job-name> configs/intern_vit_6b_1k_224_test_imagenet_real.yaml --eval \
--resume pretrained/intern_vit_6b_224px_head.pth --launcher slurm
```
Expected results:
```
* ReaL Acc@1 90.437 Acc@5 98.567 loss 0.605
ReaL Accuracy of the network on the 50000 test images: 90.4%
```
</details>
<details>
<summary>Evaluate InternViT-6B on <b>ImageNetV2</b> with 8 GPUs (click to expand).</summary>
```bash
python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 main.py --eval \
--cfg configs/intern_vit_6b_1k_224_test_imagenetv2.yaml --resume pretrained/intern_vit_6b_224px_head.pth
# or manage jobs with slurm
GPUS=8 sh train_in1k.sh <partition> <job-name> configs/intern_vit_6b_1k_224_test_imagenetv2.yaml --eval \
--resume pretrained/intern_vit_6b_224px_head.pth --launcher slurm
```
Expected results:
```
* Acc@1 79.940 Acc@5 95.340
Accuracy of the network on the 10000 test images: 79.9%
```
</details>
<details>
<summary>Evaluate InternViT-6B on <b>ImageNet-A</b> with 8 GPUs (click to expand).</summary>
```bash
python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 main.py --eval \
--cfg configs/intern_vit_6b_1k_224_test_imagenet_a.yaml --resume pretrained/intern_vit_6b_224px_head.pth
# or manage jobs with slurm
GPUS=8 sh train_in1k.sh <partition> <job-name> configs/intern_vit_6b_1k_224_test_imagenet_a.yaml --eval \
--resume pretrained/intern_vit_6b_224px_head.pth --launcher slurm
```
Expected results:
```
* Acc@1 77.479 Acc@5 92.737
Accuracy of the network on the 7500 test images: 77.5%
```
</details>
<details>
<summary>Evaluate InternViT-6B on <b>ImageNet-R</b> with 8 GPUs (click to expand).</summary>
```bash
python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 main.py --eval \
--cfg configs/intern_vit_6b_1k_224_test_imagenet_r.yaml --resume pretrained/intern_vit_6b_224px_head.pth
# or manage jobs with slurm
GPUS=8 sh train_in1k.sh <partition> <job-name> configs/intern_vit_6b_1k_224_test_imagenet_r.yaml --eval \
--resume pretrained/intern_vit_6b_224px_head.pth --launcher slurm
```
Expected results:
```
* Acc@1 89.777 Acc@5 97.023
Accuracy of the network on the 30000 test images: 89.8%
```
</details>
<details>
<summary>Evaluate InternViT-6B on <b>ImageNet-Sketch</b> with 8 GPUs (click to expand).</summary>
```bash
python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 main.py --eval \
--cfg configs/intern_vit_6b_1k_224_test_imagenet_sketch.yaml --resume pretrained/intern_vit_6b_224px_head.pth
# or manage jobs with slurm
GPUS=8 sh train_in1k.sh <partition> <job-name> configs/intern_vit_6b_1k_224_test_imagenet_sketch.yaml --eval \
--resume pretrained/intern_vit_6b_224px_head.pth --launcher slurm
```
Expected results:
```
* Acc@1 69.117 Acc@5 88.341
Accuracy of the network on the 50889 test images: 69.1%
```
</details>
# --------------------------------------------------------
# InternVL
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
import yaml
from yacs.config import CfgNode as CN
_C = CN()
# Base config files
_C.BASE = ['']
# -----------------------------------------------------------------------------
# Data settings
# -----------------------------------------------------------------------------
_C.DATA = CN()
# Batch size for a single GPU, could be overwritten by command line argument
_C.DATA.BATCH_SIZE = 128
# Path to dataset, could be overwritten by command line argument
_C.DATA.DATA_PATH = ''
# Dataset name
_C.DATA.DATASET = 'imagenet'
# Input image size
_C.DATA.IMG_SIZE = 224
# Interpolation to resize image (random, bilinear, bicubic)
_C.DATA.INTERPOLATION = 'bicubic'
# Use zipped dataset instead of folder dataset
# could be overwritten by command line argument
_C.DATA.ZIP_MODE = False
# Cache Data in Memory, could be overwritten by command line argument
_C.DATA.CACHE_MODE = 'part'
# Pin CPU memory in DataLoader for (sometimes) more efficient transfer to GPU.
_C.DATA.PIN_MEMORY = True
# Number of data loading threads
_C.DATA.NUM_WORKERS = 8
# Load data to memory
_C.DATA.IMG_ON_MEMORY = False
# Name of the build_transform function
_C.DATA.TRANSFORM = 'build_transform'
# -----------------------------------------------------------------------------
# Model settings
# -----------------------------------------------------------------------------
_C.MODEL = CN()
# Model type
_C.MODEL.TYPE = 'intern_vit_6b'
# Model name
_C.MODEL.NAME = 'intern_vit_6b'
# Pretrained weight from checkpoint, could be imagenet22k pretrained weight
# could be overwritten by command line argument
_C.MODEL.PRETRAINED = ''
# Checkpoint to resume, could be overwritten by command line argument
_C.MODEL.RESUME = ''
# Number of classes, overwritten in data preparation
_C.MODEL.NUM_CLASSES = 1000
# Dropout rate
_C.MODEL.DROP_RATE = 0.0
# Drop path rate
_C.MODEL.DROP_PATH_RATE = 0.1
# Drop path type
_C.MODEL.DROP_PATH_TYPE = 'linear' # linear, uniform
# Label Smoothing
_C.MODEL.LABEL_SMOOTHING = 0.1
# INTERN_VIT_6B parameters
_C.MODEL.INTERN_VIT_6B = CN()
_C.MODEL.INTERN_VIT_6B.PATCH_SIZE = 14
_C.MODEL.INTERN_VIT_6B.PRETRAIN_SIZE = 224
_C.MODEL.INTERN_VIT_6B.QKV_BIAS = False
_C.MODEL.INTERN_VIT_6B.EMBED_DIM = 3200
_C.MODEL.INTERN_VIT_6B.NUM_HEADS = 25
_C.MODEL.INTERN_VIT_6B.MLP_RATIO = 4
_C.MODEL.INTERN_VIT_6B.INIT_VALUES = 0.1
_C.MODEL.INTERN_VIT_6B.QK_NORMALIZATION = True
_C.MODEL.INTERN_VIT_6B.DEPTH = 48
_C.MODEL.INTERN_VIT_6B.USE_FLASH_ATTN = True
_C.MODEL.INTERN_VIT_6B.FREEZE_VIT = True
_C.MODEL.INTERN_VIT_6B.PRETRAINED = None
_C.MODEL.INTERN_VIT_6B.CLS_TARGET = 'cls_patch_concat'
_C.MODEL.INTERN_VIT_6B.NORM_TYPE = 'rms'
# CLIP_VIT parameters
_C.MODEL.CLIP_VIT = CN()
_C.MODEL.CLIP_VIT.PATCH_SIZE = 14
_C.MODEL.CLIP_VIT.PRETRAIN_SIZE = 336
_C.MODEL.CLIP_VIT.EMBED_DIM = 1024
_C.MODEL.CLIP_VIT.NUM_HEADS = 16
_C.MODEL.CLIP_VIT.MLP_RATIO = 4
_C.MODEL.CLIP_VIT.DEPTH = 24
_C.MODEL.CLIP_VIT.FREEZE_VIT = True
_C.MODEL.CLIP_VIT.PRETRAINED = 'openai/clip-vit-large-patch14-336'
_C.MODEL.CLIP_VIT.CLS_TARGET = 'cls_patch_concat'
# -----------------------------------------------------------------------------
# Training settings
# -----------------------------------------------------------------------------
_C.TRAIN = CN()
_C.TRAIN.START_EPOCH = 0
_C.TRAIN.EPOCHS = 300
_C.TRAIN.WARMUP_EPOCHS = 20
_C.TRAIN.WEIGHT_DECAY = 0.05
_C.TRAIN.BASE_LR = 5e-4
_C.TRAIN.WARMUP_LR = 5e-7
_C.TRAIN.MIN_LR = 5e-6
# Clip gradient norm
_C.TRAIN.CLIP_GRAD = 5.0
# Auto resume from latest checkpoint
_C.TRAIN.AUTO_RESUME = True
# Gradient accumulation steps
# could be overwritten by command line argument
_C.TRAIN.ACCUMULATION_STEPS = 0
# Whether to use gradient checkpointing to save memory
# could be overwritten by command line argument
_C.TRAIN.USE_CHECKPOINT = False
# LR scheduler
_C.TRAIN.LR_SCHEDULER = CN()
_C.TRAIN.LR_SCHEDULER.NAME = 'cosine'
# Epoch interval to decay LR, used in StepLRScheduler
_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30
# LR decay rate, used in StepLRScheduler
_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1
# Optimizer
_C.TRAIN.OPTIMIZER = CN()
_C.TRAIN.OPTIMIZER.NAME = 'adamw'
# Optimizer Epsilon
_C.TRAIN.OPTIMIZER.EPS = 1e-8
# Optimizer Betas
_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999)
# SGD momentum
_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9
# ZeRO
_C.TRAIN.OPTIMIZER.USE_ZERO = False
# freeze backbone
_C.TRAIN.OPTIMIZER.FREEZE_BACKBONE = None
# dcn lr
_C.TRAIN.OPTIMIZER.DCN_LR_MUL = None
# EMA
_C.TRAIN.EMA = CN()
_C.TRAIN.EMA.ENABLE = False
_C.TRAIN.EMA.DECAY = 0.9998
# LR_LAYER_DECAY
_C.TRAIN.LR_LAYER_DECAY = False
_C.TRAIN.LR_LAYER_DECAY_RATIO = 0.875
# FT head init weights
_C.TRAIN.RAND_INIT_FT_HEAD = False
# -----------------------------------------------------------------------------
# Augmentation settings
# -----------------------------------------------------------------------------
_C.AUG = CN()
# Color jitter factor
_C.AUG.COLOR_JITTER = 0.4
# Use AutoAugment policy. "v0" or "original"
_C.AUG.AUTO_AUGMENT = 'rand-m9-mstd0.5-inc1'
# Random erase prob
_C.AUG.REPROB = 0.25
# Random erase mode
_C.AUG.REMODE = 'pixel'
# Random erase count
_C.AUG.RECOUNT = 1
# Mixup alpha, mixup enabled if > 0
_C.AUG.MIXUP = 0.8
# Cutmix alpha, cutmix enabled if > 0
_C.AUG.CUTMIX = 1.0
# Cutmix min/max ratio, overrides alpha and enables cutmix if set
_C.AUG.CUTMIX_MINMAX = None
# Probability of performing mixup or cutmix when either/both is enabled
_C.AUG.MIXUP_PROB = 1.0
# Probability of switching to cutmix when both mixup and cutmix enabled
_C.AUG.MIXUP_SWITCH_PROB = 0.5
# How to apply mixup/cutmix params. Per "batch", "pair", or "elem"
_C.AUG.MIXUP_MODE = 'batch'
# RandomResizedCrop
_C.AUG.RANDOM_RESIZED_CROP = False
_C.AUG.MEAN = (0.485, 0.456, 0.406)
_C.AUG.STD = (0.229, 0.224, 0.225)
# -----------------------------------------------------------------------------
# Testing settings
# -----------------------------------------------------------------------------
_C.TEST = CN()
# Whether to use center crop when testing
_C.TEST.CROP = True
# Whether to use SequentialSampler as validation sampler
_C.TEST.SEQUENTIAL = False
# -----------------------------------------------------------------------------
# Misc
# -----------------------------------------------------------------------------
# Mixed precision opt level, if O0, no amp is used ('O0', 'O1', 'O2')
# overwritten by command line argument
_C.AMP_OPT_LEVEL = ''
# Path to output folder, overwritten by command line argument
_C.OUTPUT = ''
# Tag of experiment, overwritten by command line argument
_C.TAG = 'default'
# Frequency to save checkpoint
_C.SAVE_FREQ = 1
# Frequency to log info
_C.PRINT_FREQ = 10
# eval freq
_C.EVAL_FREQ = 1
# Fixed random seed
_C.SEED = 0
# Perform evaluation only, overwritten by command line argument
_C.EVAL_MODE = False
# Test throughput only, overwritten by command line argument
_C.THROUGHPUT_MODE = False
# local rank for DistributedDataParallel, given by command line argument
_C.LOCAL_RANK = 0
_C.EVAL_22K_TO_1K = False
_C.AMP_TYPE = 'float16'
def _update_config_from_file(config, cfg_file):
config.defrost()
with open(cfg_file, 'r') as f:
yaml_cfg = yaml.load(f, Loader=yaml.FullLoader)
for cfg in yaml_cfg.setdefault('BASE', ['']):
if cfg:
_update_config_from_file(
config, os.path.join(os.path.dirname(cfg_file), cfg))
print('=> merge config from {}'.format(cfg_file))
config.merge_from_file(cfg_file)
config.freeze()
def update_config(config, args):
_update_config_from_file(config, args.cfg)
config.defrost()
if hasattr(args, 'opts') and args.opts:
config.merge_from_list(args.opts)
# merge from specific arguments
if hasattr(args, 'batch_size') and args.batch_size:
config.DATA.BATCH_SIZE = args.batch_size
if hasattr(args, 'dataset') and args.dataset:
config.DATA.DATASET = args.dataset
if hasattr(args, 'data_path') and args.data_path:
config.DATA.DATA_PATH = args.data_path
if hasattr(args, 'zip') and args.zip:
config.DATA.ZIP_MODE = True
if hasattr(args, 'cache_mode') and args.cache_mode:
config.DATA.CACHE_MODE = args.cache_mode
if hasattr(args, 'pretrained') and args.pretrained:
config.MODEL.PRETRAINED = args.pretrained
if hasattr(args, 'resume') and args.resume:
config.MODEL.RESUME = args.resume
if hasattr(args, 'accumulation_steps') and args.accumulation_steps:
config.TRAIN.ACCUMULATION_STEPS = args.accumulation_steps
if hasattr(args, 'use_checkpoint') and args.use_checkpoint:
config.TRAIN.USE_CHECKPOINT = True
if hasattr(args, 'amp_opt_level') and args.amp_opt_level:
config.AMP_OPT_LEVEL = args.amp_opt_level
if hasattr(args, 'output') and args.output:
config.OUTPUT = args.output
if hasattr(args, 'tag') and args.tag:
config.TAG = args.tag
if hasattr(args, 'eval') and args.eval:
config.EVAL_MODE = True
if hasattr(args, 'throughput') and args.throughput:
config.THROUGHPUT_MODE = True
if hasattr(args, 'save_ckpt_num') and args.save_ckpt_num:
config.SAVE_CKPT_NUM = args.save_ckpt_num
if hasattr(args, 'use_zero') and args.use_zero:
config.TRAIN.OPTIMIZER.USE_ZERO = True
# set local rank for distributed training
if hasattr(args, 'local_rank') and args.local_rank:
config.LOCAL_RANK = args.local_rank
# output folder
config.MODEL.NAME = args.cfg.split('/')[-1].replace('.yaml', '')
config.OUTPUT = os.path.join(config.OUTPUT, config.MODEL.NAME)
# config.OUTPUT = os.path.join(config.OUTPUT, config.MODEL.NAME, config.TAG)
config.freeze()
def get_config(args):
"""Get a yacs CfgNode object with default values."""
# Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern
config = _C.clone()
update_config(config, args)
return config
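# Usage sketch (illustrative, not part of the original file): build the
# merged config from a YAML path supplied via an argparse namespace; any
# attribute missing from `args` is simply skipped by the hasattr checks.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', required=True, help='path to a YAML config file')
    args, _ = parser.parse_known_args()
    config = get_config(args)
    print('=> model name: {}'.format(config.MODEL.NAME))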
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenet-1k'
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
DATASET: 'imagenet_a'
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenet-a'
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
DATASET: 'imagenet_r'
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenet-r'
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
DATASET: 'imagenet-real'
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenet-1k'
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
DATASET: 'imagenet_sketch'
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenet-sketch'
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
DATASET: 'imagenetv2'
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenetv2'
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenet-1k'
IMG_SIZE: 448
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
DATASET: 'imagenet_a'
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenet-a'
IMG_SIZE: 448
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
DATASET: 'imagenet_r'
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenet-r'
IMG_SIZE: 448
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'
DATA:
IMG_ON_MEMORY: False
BATCH_SIZE: 16 # single GPU batch size
DATASET: 'imagenet-real'
TRANSFORM: 'build_transform_for_linear_probe'
DATA_PATH: './data/imagenet-1k'
IMG_SIZE: 448
MODEL:
TYPE: intern_vit_6b
DROP_PATH_RATE: 0.0
INTERN_VIT_6B:
FREEZE_VIT: True
PATCH_SIZE: 14
PRETRAIN_SIZE: 224
QKV_BIAS: False
EMBED_DIM: 3200
NUM_HEADS: 25
MLP_RATIO: 4
INIT_VALUES: 0.1
QK_NORMALIZATION: True
DEPTH: 48
USE_FLASH_ATTN: True
PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
CLS_TARGET: 'attention_pooling'
TRAIN:
EMA:
ENABLE: True
DECAY: 0.998
EPOCHS: 10
WARMUP_EPOCHS: 1
WEIGHT_DECAY: 0.0
BASE_LR: 0.1 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: false
OPTIMIZER:
NAME: 'sgd'