"magic_pdf/git@developer.sourcefind.cn:wangsen/mineru.git" did not exist on "272014c42b95428dbcc3c69eaf82beb4f35b0eb5"
Commit 6f43e8fa authored by mashun1's avatar mashun1
Browse files

open_clip

parents
Pipeline #1689 canceled with stages
*.py linguist-language=python
*.ipynb linguist-documentation
name: Continuous integration
on:
  push:
    branches:
      - main
    paths-ignore:
      - '**.md'
      - 'CITATION.cff'
      - 'LICENSE'
      - '.gitignore'
      - 'docs/**'
  pull_request:
    branches:
      - main
    paths-ignore:
      - '**.md'
      - 'CITATION.cff'
      - 'LICENSE'
      - '.gitignore'
      - 'docs/**'
  workflow_dispatch:
    inputs:
      manual_revision_reference:
        required: false
        type: string
      manual_revision_test:
        required: false
        type: string
env:
  REVISION_REFERENCE: v2.8.2
  # 9d31b2ec4df6d8228f370ff20c8267ec6ba39383 earliest compatible v2.7.0 + pretrained_hf param
jobs:
  Tests:
    strategy:
      matrix:
        os: [ ubuntu-latest ] #, macos-latest ]
        python: [ 3.8 ]
        job_num: [ 4 ]
        job: [ 1, 2, 3, 4 ]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          ref: ${{ inputs.manual_revision_test }}
      - name: Set up Python ${{ matrix.python }}
        id: pythonsetup
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python }}
      - name: Venv cache
        id: venv-cache
        uses: actions/cache@v3
        with:
          path: .env
          key: venv-${{ matrix.os }}-${{ steps.pythonsetup.outputs.python-version }}-${{ hashFiles('requirements*') }}
      - name: Pytest durations cache
        uses: actions/cache@v3
        with:
          path: .test_durations
          key: test_durations-${{ matrix.os }}-${{ steps.pythonsetup.outputs.python-version }}-${{ matrix.job }}-${{ github.run_id }}
          restore-keys: test_durations-0-
      - name: Setup
        if: steps.venv-cache.outputs.cache-hit != 'true'
        run: |
          python3 -m venv .env
          source .env/bin/activate
          pip install -e .[test]
      - name: Prepare test data
        run: |
          source .env/bin/activate
          python -m pytest \
            --quiet --co \
            --splitting-algorithm least_duration \
            --splits ${{ matrix.job_num }} \
            --group ${{ matrix.job }} \
            -m regression_test \
            tests \
            | head -n -2 | grep -Po 'test_inference_with_data\[\K[^]]*(?=-False]|-True])' \
            > models_gh_runner.txt
          if [ -n "${{ inputs.manual_revision_reference }}" ]; then
            REVISION_REFERENCE=${{ inputs.manual_revision_reference }}
          fi
          python tests/util_test.py \
            --save_model_list models_gh_runner.txt \
            --model_list models_gh_runner.txt \
            --git_revision $REVISION_REFERENCE
      - name: Unit tests
        run: |
          source .env/bin/activate
          if [[ -f .test_durations ]]
          then
            cp .test_durations durations_1
            mv .test_durations durations_2
          fi
          python -m pytest \
            -x -s -v \
            --splitting-algorithm least_duration \
            --splits ${{ matrix.job_num }} \
            --group ${{ matrix.job }} \
            --store-durations \
            --durations-path durations_1 \
            --clean-durations \
            -m "not regression_test" \
            tests
          OPEN_CLIP_TEST_REG_MODELS=models_gh_runner.txt python -m pytest \
            -x -s -v \
            --store-durations \
            --durations-path durations_2 \
            --clean-durations \
            -m "regression_test" \
            tests
          jq -s -S 'add' durations_* > .test_durations
      - name: Collect pytest durations
        uses: actions/upload-artifact@v3
        with:
          name: pytest_durations_${{ matrix.os }}-${{ matrix.python }}-${{ matrix.job }}
          path: .test_durations
  Collect:
    needs: Tests
    runs-on: ubuntu-latest
    steps:
      - name: Cache
        uses: actions/cache@v3
        with:
          path: .test_durations
          key: test_durations-0-${{ github.run_id }}
      - name: Collect
        uses: actions/download-artifact@v3
        with:
          path: artifacts
      - name: Consolidate
        run: |
          jq -n -S \
            'reduce (inputs | to_entries[]) as {$key, $value} ({}; .[$key] += $value)' \
            artifacts/pytest_durations_*/.test_durations > .test_durations
name: Clear cache
on:
  workflow_dispatch:
permissions:
  actions: write
jobs:
  clear-cache:
    runs-on: ubuntu-latest
    steps:
      - name: Clear cache
        uses: actions/github-script@v6
        with:
          script: |
            const caches = await github.rest.actions.getActionsCacheList({
              owner: context.repo.owner,
              repo: context.repo.repo,
            })
            for (const cache of caches.data.actions_caches) {
              console.log(cache)
              await github.rest.actions.deleteActionsCacheById({
                owner: context.repo.owner,
                repo: context.repo.repo,
                cache_id: cache.id,
              })
            }
name: Release
on:
  push:
    branches:
      - main
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions-ecosystem/action-regex-match@v2
        id: regex-match
        with:
          text: ${{ github.event.head_commit.message }}
          regex: '^Release ([^ ]+)'
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel twine build
      - name: Release
        if: ${{ steps.regex-match.outputs.match != '' }}
        uses: softprops/action-gh-release@v1
        with:
          tag_name: v${{ steps.regex-match.outputs.group1 }}
      - name: Build and publish
        if: ${{ steps.regex-match.outputs.match != '' }}
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          python -m build
          twine upload dist/*
**/logs/
**/wandb/
models/
features/
results/
tests/data/
*.pt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.pdm-build/
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
sync.sh
gpu1sync.sh
.idea
*.pdf
**/._*
**/*DS_*
**.jsonl
src/sbatch
src/misc
.vscode
src/debug
core.*
# Allow
!src/evaluation/misc/results_dbs/*
# self
scripts/single_train_test.sh
pretrained_models/
logs/
dist/
datasets/
cff-version: 1.1.0
message: If you use this software, please cite it as below.
authors:
  - family-names: Ilharco
    given-names: Gabriel
  - family-names: Wortsman
    given-names: Mitchell
  - family-names: Wightman
    given-names: Ross
  - family-names: Gordon
    given-names: Cade
  - family-names: Carlini
    given-names: Nicholas
  - family-names: Taori
    given-names: Rohan
  - family-names: Dave
    given-names: Achal
  - family-names: Shankar
    given-names: Vaishaal
  - family-names: Namkoong
    given-names: Hongseok
  - family-names: Miller
    given-names: John
  - family-names: Hajishirzi
    given-names: Hannaneh
  - family-names: Farhadi
    given-names: Ali
  - family-names: Schmidt
    given-names: Ludwig
title: OpenCLIP
version: v0.1
doi: 10.5281/zenodo.5143773
date-released: 2021-07-28
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
## 2.24.0
* Fix missing space in error message
* use model flag for normalizing embeddings
* init logit_bias for non siglip pretrained models
* Fix logit_bias load_checkpoint addition
* Make CoCa model match CLIP models for logit scale/bias init
* Fix missing return of "logit_bias" in CoCa.forward
* Add NLLB-CLIP with SigLIP models
* Add get_logits method and NLLB tokenizer
* Remove the empty file src/open_clip/generation_utils.py
* Update params.py: "BatchNorm" -> "LayerNorm" in the description string for "--lock-text-freeze-layer-norm"
## 2.23.0
* Add CLIPA-v2 models
* Add SigLIP models
* Add MetaCLIP models
* Add NLLB-CLIP models
* CLIPA train code
* Minor changes/fixes
* Remove protobuf version limit
* Stop checking model name when loading CoCa models
* Log native wandb step
* Use bool instead of long masks
## 2.21.0
* Add SigLIP loss + training support
* Add more DataComp models (B/16, B/32 and B/32@256)
* Update default num workers
* Update CoCa generation for `transformers>=4.31`
* PyTorch 2.0 `state_dict()` compatibility fix for compiled models
* Fix padding in `ResizeMaxSize`
* Convert JIT model on state dict load for `pretrained='filename…'`
* Other minor changes and fixes (typos, README, dependencies, CI)
## 2.20.0
* Add EVA models
* Support serial worker training
* Fix Python 3.7 compatibility
## 2.19.0
* Add DataComp models
## 2.18.0
* Enable int8 inference without `.weight` attribute
## 2.17.2
* Update push_to_hf_hub
## 2.17.0
* Add int8 support
* Update notebook demo
* Refactor zero-shot classification code
## 2.16.2
* Fixes for context_length and vocab_size attributes
## 2.16.1
* Fixes for context_length and vocab_size attributes
* Fix --train-num-samples logic
* Add HF BERT configs for PubMed CLIP model
## 2.16.0
* Add improved g-14 weights
* Update protobuf version
## 2.15.0
* Add convnext_xxlarge weights
* Fixed import in readme
* Add samples per second per gpu logging
* Fix slurm example
## 2.14.0
* Move dataset mixtures logic to shard level
* Fix CoCa accum-grad training
* Safer transformers import guard
* get_labels refactoring
## 2.13.0
* Add support for dataset mixtures with different sampling weights
* Make transformers optional again
## 2.12.0
* Updated convnext configs for consistency
* Added input_patchnorm option
* Clean and improve CoCa generation
* Support model distillation
* Add ConvNeXt-Large 320x320 fine-tune weights
## 2.11.1
* Make transformers optional
* Add MSCOCO CoCa finetunes to pretrained models
## 2.11.0
* coca support and weights
* ConvNeXt-Large weights
## 2.10.1
* `hf-hub:org/model_id` support for loading models w/ config and weights in Hugging Face Hub
## 2.10.0
* Added a ViT-bigG-14 model.
* Added an up-to-date example slurm script for large training jobs.
* Added an option to sync logs and checkpoints to S3 during training.
* New options for LR schedulers, constant and constant with cooldown
* Fix wandb autoresuming when resume is not set
* ConvNeXt `base` & `base_w` pretrained models added
* `timm-` model prefix removed from configs
* `timm` augmentation + regularization (dropout / drop-path) supported
## 2.9.3
* Fix wandb collapsing multiple parallel runs into a single one
## 2.9.2
* Fix braceexpand memory explosion for complex webdataset urls
## 2.9.1
* Fix release
## 2.9.0
* Add training feature to auto-resume from the latest checkpoint on restart via `--resume latest`
* Allow webp in webdataset
* Fix logging for number of samples when using gradient accumulation
* Add model configs for convnext xxlarge
## 2.8.2
* wrapped patchdropout in a torch.nn.Module
## 2.8.1
* relax protobuf dependency
* override the default patch dropout value in 'vision_cfg'
## 2.8.0
* better support for HF models
* add support for gradient accumulation
* CI fixes
* add support for patch dropout
* add convnext configs
## 2.7.0
* add multilingual H/14 xlm roberta large
## 2.6.1
* fix setup.py _read_reqs
## 2.6.0
* Make openclip training usable from pypi.
* Add xlm roberta large vit h 14 config.
## 2.5.0
* pretrained B/32 xlm roberta base: first multilingual clip trained on laion5B
* pretrained B/32 roberta base: first clip trained using an HF text encoder
## 2.4.1
* Add missing hf_tokenizer_name in CLIPTextCfg.
## 2.4.0
* Fix #211, missing RN50x64 config. Fix type of dropout param for ResNet models
* Bring back LayerNorm impl that casts to input for non bf16/fp16
* zero_shot.py: set correct tokenizer based on args
* training/params.py: remove hf params and get them from model config
## 2.3.1
* Implement grad checkpointing for hf model.
* custom_text: True if hf_model_name is set
* Disable hf tokenizer parallelism
## 2.3.0
* Generalizable Text Transformer with HuggingFace Models (@iejMac)
## 2.2.0
* Support for custom text tower
* Add checksum verification for pretrained model weights
## 2.1.0
* Many additions, including SOTA models, a bfloat16 option, better loading, and better metrics
## 1.2.0
* ViT-B/32 trained on Laion2B-en
* add missing openai RN50x64 model
## 1.1.1
* ViT-B/16+
* Add grad checkpointing support
* more robust data loader
Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman,
Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar,
John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi,
Ludwig Schmidt
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
include src/open_clip/bpe_simple_vocab_16e6.txt.gz
include src/open_clip/model_configs/*.json
install: ## [Local development] Upgrade pip, install requirements, install package.
	python -m pip install -U pip
	python -m pip install -e .
install-training:
	python -m pip install -r requirements-training.txt
install-test: ## [Local development] Install test requirements
	python -m pip install -r requirements-test.txt
test: ## [Local development] Run unit tests
	python -m pytest -x -s -v tests
# open-clip
## Paper
**Reproducible scaling laws for contrastive language-image learning**
* https://arxiv.org/pdf/2212.07143
## Model architecture
The CLIP model has two main components: a text encoder and an image encoder. The text encoder is a `Transformer`; the image encoder is either a `ResNet` or a `Vision Transformer (ViT)`.
![alt text](readme_imgs/model.png)
## Algorithm
CLIP jointly trains an image encoder and a text encoder by maximizing the `text-image` matching score. A minimal sketch of this contrastive objective is shown after the figure below.
![alt text](readme_imgs/alg.png)
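The sketch below is a minimal PyTorch illustration of the symmetric contrastive loss described above: image and text embeddings are normalized, pairwise similarities are scaled, and a cross-entropy loss is applied in both directions with the matching pairs on the diagonal. The feature dimension, batch size, and random embeddings are placeholders for illustration only; open_clip's training code has its own loss implementation.
```python
import torch
import torch.nn.functional as F

def clip_contrastive_loss(image_features, text_features, logit_scale):
    # Normalize embeddings so the dot product is a cosine similarity.
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)
    # Pairwise similarity logits between every image and every caption in the batch.
    logits_per_image = logit_scale * image_features @ text_features.T
    logits_per_text = logits_per_image.T
    # The i-th image matches the i-th caption, so the targets are the diagonal indices.
    targets = torch.arange(image_features.shape[0], device=image_features.device)
    return (F.cross_entropy(logits_per_image, targets) +
            F.cross_entropy(logits_per_text, targets)) / 2

# Toy batch of 8 image/text embedding pairs (placeholder values).
img = torch.randn(8, 512)
txt = torch.randn(8, 512)
print(clip_contrastive_loss(img, txt, logit_scale=100.0))
```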
## Environment setup
### Docker (option 1)
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
docker run --shm-size 50g --network=host --name=clip --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to this project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash
pip install timm --no-deps
pip install -r requirements.txt
pip install build
python -m build && pip install dist/open_clip_torch-2.26.1-py3-none-any.whl
### Dockerfile (option 2)
docker build -t <IMAGE_NAME>:<TAG> .
docker run --shm-size 50g --network=host --name=clip --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <absolute path to this project>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash
pip install timm --no-deps
pip install -r requirements.txt
pip install build
python -m build && pip install dist/open_clip_torch-2.26.1-py3-none-any.whl
### Anaconda (option 3)
1. The DCU-specific deep learning libraries required by this project can be downloaded and installed from the 光合 developer community:
https://developer.hpccube.com/tool/
DTK driver: dtk24.04.1
Python: python3.10
torch: 2.1.0
torchvision: 0.16.0
Note: the DTK driver, Python, torch, and other DCU-related tool versions listed above must match each other exactly.
2. Install the remaining, non-DCU-specific libraries from requirements.txt:
pip install timm --no-deps
pip install -r requirements.txt
pip install build
python -m build && pip install dist/open_clip_torch-2.26.1-py3-none-any.whl
## Datasets
### CLIP
#### Downloading data
To download the full dataset, see [img2dataset](https://github.com/rom1504/img2dataset).
Alternative datasets are available through the [SCNet high-speed download channel](http://113.200.138.88:18080/aidatasets/):
- Training set: [cc3m wds](http://113.200.138.88:18080/aidatasets/pixparse/cc3m-wds)
- Validation set: [imagenet2012-val](http://113.200.138.88:18080/aidatasets/project-dependency/imagenet-2012/). After downloading, move `valprep.sh` from `extra_utils` into the data directory and run `bash valprep.sh` there to sort the images into class folders.
#### Data preparation
You can train either on the `.tar` shards downloaded directly with `img2dataset` or on a dataset you prepare yourself. A custom dataset can be stored in `csv` format as `image_path caption` pairs, as in the table below; this project provides a helper script for the conversion at `extra_utils/convert_to_csv.py`, and a small illustrative sketch follows the table.
|filepath|title|
|:-------:|:-------:|
|<path/to/image>|a white cat|
Note: for more data-processing options and usage, see the [official README](README_official.md).
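For illustration, here is a small, hedged sketch that builds such a CSV with Python's standard library. The directory layout (one `.txt` caption file next to each `.jpg`) is an assumption made only for this example; for real use, prefer the repository's `extra_utils/convert_to_csv.py`.
```python
import csv
from pathlib import Path

# Hypothetical layout: each image sits next to a .txt file holding its caption.
image_dir = Path("/path/to/images")

with open("train_data.csv", "w", newline="") as f:
    writer = csv.writer(f)  # default "," separator matches --csv-separator ","
    writer.writerow(["filepath", "title"])  # headers expected by --csv-img-key / --csv-caption-key
    for image_path in sorted(image_dir.glob("*.jpg")):
        caption = image_path.with_suffix(".txt").read_text().strip()
        writer.writerow([str(image_path), caption])
```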
### CoCa
Running `extra_utils/coca_data.py` downloads and processes the data automatically. Alternatively, download the dataset through the [SCNet high-speed download channel](http://113.200.138.88:18080/aidatasets/project-dependency/mscoco2014), update `root_path` in the script, and run it to process the data.
## Training
### CLIP
#### Single GPU
python -m open_clip_train.main \
--save-frequency 1 \
--zeroshot-frequency 1 \
--report-to tensorboard \
--train-data="/path/to/train_data.csv" \
--val-data="/path/to/validation_data.csv" \
--csv-img-key filepath \
--csv-caption-key title \
--imagenet-val=/path/to/imagenet/root/val/ \
--warmup 10000 \
--batch-size=128 \
--lr=1e-3 \
--wd=0.1 \
--epochs=30 \
--workers=8 \
--model RN50 \
--csv-separator "," \
--precision amp_bf16
Note: `imagenet-val` selects ImageNet as the validation set; remove it if you do not need it.
#### Multi-GPU
##### Single node
cd open_clip/src
torchrun --nproc_per_node 4 -m open_clip_train.main \
--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar' \
--train-num-samples 10968539 \
--dataset-type webdataset \
--batch-size 320 \
--precision amp_bf16 \
--workers 4 \
--imagenet-val /data/imagenet/validation/
##### Multi-node
cd open_clip/src
torchrun --nproc_per_node=4 \
--rdzv_endpoint=$HOST_NODE_ADDR \
-m open_clip_train.main \
--train-data '/data/cc12m/cc12m-train-{0000..2175}.tar' \
--train-num-samples 10968539 \
--dataset-type webdataset \
--batch-size 320 \
--precision amp_bf16 \
--workers 4 \
--imagenet-val /data/imagenet/validation/
Note: see [README_official.md](README_official.md) for more options. Training with `fp16` produces `nan` losses, so use `bf16` instead.
### CoCa
```python
python -m open_clip_train.main \
--dataset-type "csv" \
--train-data /path/to/your.csv \
--warmup 1000 \
--batch-size 32 \
--lr 1e-5 \
--wd 0.1 \
--epochs 1 \
--workers 3 \
--model "coca_ViT-L-14" \
--coca-contrastive-loss-weight 0 \
--coca-caption-loss-weight 1 \
--log-every-n-steps 100
```
## Inference
### CLIP
# Running without arguments uses the defaults and downloads the model automatically
# export HF_ENDPOINT=https://hf-mirror.com
python custom_tests/simple_test.py --model_name <xxx> --pretrained <either a local path such as /path/to/open_clip_pytorch_model.bin, or a remote pretrained tag such as laion2b_s34b_b79k>
### CoCa
# Only image_path is required; everything else falls back to the defaults
# export HF_ENDPOINT=https://hf-mirror.com
python custom_tests/coca_simple_test.py <path/to/image>
Note: the model can be changed with `--model_name <xxx> --pretrained <either a local path /path/to/open_clip_pytorch_model.bin, or a remote pretrained tag such as mscoco_finetuned_laion2B-s13B-b90k>`
## Results
`python custom_test.py`
CLIP:
|a diagram|a dog|a cat|
|:---:|:---:|:---:|
|0.995|0.004|0.001|
CoCa:
Input: <img src="readme_imgs/girl.png" style="zoom:50%;">
Output: a beautiful young woman with a long black ponytail.
### Accuracy
Accuracy on K100_AI is consistent with L20.
## Application scenarios
### Algorithm category
`Image classification`
### Key application industries
`E-commerce, illustration, transportation`
## Pretrained weights
After downloading a model, place it in the `pretrained_models` folder (which you need to create yourself).
The full model list can be printed with `python -c "import open_clip; print(open_clip.list_pretrained())"`. Links for commonly used pretrained weights are given below, and a short loading example is sketched after the table.
|model|link|
|:---:|:------:|
|CLIP-ViT-B-32-laion2B-s34B-b79K|[original link](https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K)/[SCNet high-speed download](http://113.200.138.88:18080/aimodels/CLIP-ViT-B-32-laion2B-s34B-b79K)|
|CLIP-ViT-B-16-laion2B-s34B-b88K|[original link](https://huggingface.co/laion/CLIP-ViT-B-16-laion2B-s34B-b88K)/[SCNet high-speed download](http://113.200.138.88:18080/aimodels/CLIP-ViT-B-16-laion2B-s34B-b88K)|
|CLIP-ViT-H-14-laion2B-s32B-b79K|[original link](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)/[SCNet high-speed download](http://113.200.138.88:18080/aimodels/CLIP-ViT-H-14-laion2B-s32B-b79K)|
|CLIP-ViT-bigG-14-laion2B-39B-b160k|[original link](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)/[SCNet high-speed download](http://113.200.138.88:18080/aimodels/CLIP-ViT-bigG-14-laion2B-39B-b160k)|
|CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup|[original link](https://huggingface.co/laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup)/[SCNet high-speed download](http://113.200.138.88:18080/aimodels/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup)|
|bioclip|[original link](https://huggingface.co/imageomics/bioclip)/[SCNet high-speed download](http://113.200.138.88:18080/aimodels/imageomics/bioclip)|
|mscoco_finetuned_CoCa-ViT-L-14-laion2B-s13B-b90k|[original link](https://hf-mirror.com/laion/mscoco_finetuned_CoCa-ViT-L-14-laion2B-s13B-b90k/tree/main)/[SCNet high-speed download](http://113.200.138.88:18080/aimodels/mscoco_finetuned_CoCa-ViT-L-14-laion2B-s13B-b90k)|
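As a quick sanity check that a downloaded checkpoint is picked up, the sketch below loads weights from a local file, relying on the fact (used in the inference commands above) that `--pretrained` / `pretrained=` accepts a local path in place of a tag. The directory and file names under `pretrained_models/` are assumptions; adjust them to your actual download.
```python
import open_clip

# Assumed local layout; adjust to wherever the downloaded weights were placed.
checkpoint = "pretrained_models/CLIP-ViT-B-32-laion2B-s34B-b79K/open_clip_pytorch_model.bin"

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32",              # architecture must match the downloaded checkpoint
    pretrained=checkpoint,   # a local file path instead of a tag like "laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")
model.eval()
```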
## Source repository and issue reporting
* https://developer.hpccube.com/codes/modelzoo/clip_pytorch
## References
* https://github.com/mlfoundations/open_clip/
import open_clip
import torch
from PIL import Image

if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--model_name", type=str, default="coca_ViT-L-14")
    parser.add_argument("--pretrained", type=str, default="mscoco_finetuned_laion2B-s13B-b90k")
    parser.add_argument("--image_path", type=str)
    args = parser.parse_args()

    model, _, transform = open_clip.create_model_and_transforms(
        model_name=args.model_name,
        pretrained=args.pretrained
    )
    model.eval()  # switch out of train mode for inference
    model.cuda()

    im = Image.open(args.image_path).convert("RGB")
    im = transform(im).unsqueeze(0).cuda()

    with torch.no_grad(), torch.cuda.amp.autocast():
        generated = model.generate(im)

    print(open_clip.decode(generated[0]).split("<end_of_text>")[0].replace("<start_of_text>", ""))
import torch
from PIL import Image
import open_clip

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="ViT-B-32", type=str)
    parser.add_argument("--pretrained", default="laion2b_s34b_b79k", type=str)
    args = parser.parse_args()

    model, _, preprocess = open_clip.create_model_and_transforms(args.model_name, pretrained=args.pretrained)
    model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
    tokenizer = open_clip.get_tokenizer(args.model_name)
    model.cuda()

    # Inputs must live on the same device as the model.
    image = preprocess(Image.open("docs/CLIP.png")).unsqueeze(0).cuda()
    text = tokenizer(["a diagram", "a dog", "a cat"]).cuda()

    with torch.no_grad(), torch.cuda.amp.autocast():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]
As we describe in more detail below, CLIP models in a medium accuracy regime already allow us to draw conclusions about the robustness of larger CLIP models since the models follow reliable scaling laws.
[Cherti et al., 2022](https://arxiv.org/abs/2212.07143) and [Gadre et al., 2023](https://arxiv.org/abs/2304.14108) show additional discussions about the scaling behavior of CLIP models.
## Scaling trends
The plot below shows how zero-shot performance of CLIP models varies as we scale the number of samples used for training. Zero-shot performance increases steadily for both ImageNet and [ImageNetV2](https://arxiv.org/abs/1902.10811), and is far from saturated at ~15M samples.
<img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/scaling.png" width="700">
## Why are low-accuracy CLIP models interesting?
**TL;DR:** CLIP models have high effective robustness, even at small scales.
CLIP models are particularly intriguing because they are more robust to natural distribution shifts (see Section 3.3 in the [CLIP paper](https://arxiv.org/abs/2103.00020)).
This phenomenon is illustrated by the figure below, with ImageNet accuracy on the x-axis
and [ImageNetV2](https://arxiv.org/abs/1902.10811) (a reproduction of the ImageNet validation set with distribution shift) accuracy on the y-axis.
Standard training denotes training on the ImageNet train set and the CLIP zero-shot models
are shown as stars.
![CLIP scatter plot](https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/effective_robustness.png)
As observed by [Taori et al., 2020](https://arxiv.org/abs/2007.00644) and [Miller et al., 2021](https://arxiv.org/abs/2107.04649), the in-distribution
and out-of-distribution accuracies of models trained on ImageNet follow a predictable linear trend (the red line in the above plot). *Effective robustness*
quantifies robustness as accuracy beyond this baseline, i.e., how far a model lies above the red line. Ideally a model would not suffer from distribution shift and fall on the y = x line ([trained human labelers are within a percentage point of the y = x line](http://proceedings.mlr.press/v119/shankar20c.html)).
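As a concrete, hedged illustration of that definition, the sketch below estimates effective robustness from a model's ImageNet/ImageNetV2 accuracies and a linear baseline fitted to standard ImageNet models in logit space; the baseline points and the choice of logit transform are placeholders, not the actual fit behind the plot above.
```python
import numpy as np

def effective_robustness(id_acc, ood_acc, baseline_id, baseline_ood):
    """OOD accuracy above the linear trend fitted to standard ImageNet models.

    Accuracies are fractions in (0, 1); the fit is done in logit space, in the
    spirit of Taori et al., 2020 (the exact transform behind the plot may differ).
    """
    logit = lambda p: np.log(np.asarray(p) / (1 - np.asarray(p)))
    slope, intercept = np.polyfit(logit(baseline_id), logit(baseline_ood), deg=1)
    predicted_ood = 1.0 / (1.0 + np.exp(-(slope * logit(id_acc) + intercept)))
    return ood_acc - predicted_ood

# Placeholder baseline points for ImageNet-trained models (not real measurements).
baseline_id = [0.60, 0.70, 0.76, 0.80]
baseline_ood = [0.47, 0.57, 0.64, 0.69]
print(effective_robustness(id_acc=0.63, ood_acc=0.56,
                           baseline_id=baseline_id, baseline_ood=baseline_ood))
```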
Even though the CLIP models trained with
this codebase achieve much lower accuracy than those trained by OpenAI, our models still lie on the same
trend of improved effective robustness (the purple line). Therefore, we can study what makes
CLIP robust without requiring industrial-scale compute.
For more information on effective robustness, please see:
- [Recht et al., 2019](https://arxiv.org/abs/1902.10811).
- [Taori et al., 2020](https://arxiv.org/abs/2007.00644).
- [Miller et al., 2021](https://arxiv.org/abs/2107.04649).
To know more about the factors that contribute to CLIP's robustness refer to [Fang et al., 2022](https://arxiv.org/abs/2205.01397).
## Pretrained model results
We evaluate the full collection of available models on a suite of 38 datasets in a zero-shot setting (i.e., without fine-tuning), following [Gadre et al., 2023](https://arxiv.org/abs/2304.14108).
Click below to see the full results.
- [Full results (English)](openclip_results.csv)
- [Classification-only results](openclip_classification_results.csv)
- [Retrieval results](openclip_retrieval_results.csv)
- [Multilingual retrieval results](openclip_multilingual_retrieval_results.csv)
## Pretrained model details
Below are details for several of our pretrained models.
### LAION-400M - https://laion.ai/laion-400-open-dataset
We ran experiments in an attempt to reproduce OpenAI's ViT results with the comparably sized (and open) LAION-400M dataset. Trained
weights can be found in release [v0.2](https://github.com/mlfoundations/open_clip/releases/tag/v0.2-weights).
The LAION400M weights have been trained on the JUWELS supercomputer (see acknowledgements section below).
#### ViT-B/32 224x224
We replicate OpenAI's results on ViT-B/32, reaching a top-1 ImageNet-1k zero-shot accuracy of 62.96%.
<img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_clip_zeroshot.png" width="700">
**Zero-shot comparison (courtesy of Andreas Fürst)**
<img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_openai_compare_b32.jpg" width="700">
ViT-B/32 was trained with 128 A100 (40 GB) GPUs for ~36 hours, 4600 GPU-hours. The per-GPU batch size was 256 for a global batch size of 32768. 256 is much lower than it could have been (~320-384) because the batch size was fixed before switching to 'local' contrastive loss.
#### ViT-B/16 224x224
The B/16 LAION400M training reached a top-1 ImageNet-1k zero-shot validation score of 67.07.
<img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_clip_zeroshot_b16.png" width="700">
This was the first major train session using the updated webdataset 0.2.x code. A bug was found that prevented shards from being shuffled properly between nodes/workers each epoch. This was fixed part way through training (epoch 26) but likely had an impact.
ViT-B/16 was trained with 176 A100 (40 GB) GPUs for ~61 hours, 10700 GPU-hours. Batch size per GPU was 192 for a global batch size of 33792.
#### ViT-B/16+ 240x240
The B/16+ 240x240 LAION400M training reached a top-1 ImageNet-1k zero-shot validation score of 69.21.
This model is the same depth as the B/16, but increases the
- vision width from 768 -> 896
- text width from 512 -> 640
- the resolution 224x224 -> 240x240 (196 -> 225 tokens)
<img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_clip_zeroshot_b16_plus_240.png" width="700">
Unlike the B/16 run above, this model was a clean run with no dataset shuffling issues.
ViT-B/16+ was trained with 224 A100 (40 GB) GPUs for ~61 hours, 13620 GPU-hours. Batch size per GPU was 160 for a global batch size of 35840.
#### ViT-L/14 224x224
The L/14 LAION-400M training reached a top-1 ImageNet-1k zero-shot validation score of 72.77.
<img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion_clip_zeroshot_l14.png" width="700">
ViT-L/14 was trained with 400 A100 (40 GB) GPUs for ~127 hours, 50800 GPU-hours. Batch size per GPU was 96 for a global batch size of 38400. Grad checkpointing was enabled.
### LAION-2B (en) - https://laion.ai/laion-5b-a-new-era-of-open-large-scale-multi-modal-datasets/
A ~2B sample subset of LAION-5B with english captions (https://huggingface.co/datasets/laion/laion2B-en)
#### ViT-B/32 224x224
A ViT-B/32 trained on LAION-2B, reaching a top-1 ImageNet-1k zero-shot accuracy of 65.62%.
<img src="https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/laion2b_clip_zeroshot_b32.png" width="700">
ViT-B/32 was trained with 112 A100 (40 GB) GPUs. The per-GPU batch size was 416 for a global batch size of 46592. Compute generously provided by [stability.ai](https://stability.ai/).
A second iteration of B/32 was trained on stability.ai cluster with a larger global batch size and learning rate, hitting 66.6% top-1. See https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K
#### ViT-L/14 224x224
A ViT-L/14 with a 75.3% top-1 ImageNet-1k zero-shot was trained on JUWELS Booster. See model details here https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K
These weights use a different dataset mean and std than others. Instead of using the OpenAI mean & std, inception style normalization `[-1, 1]` is used via a mean and std of `[0.5, 0.5, 0.5]`. This is handled automatically if using `open_clip.create_model_and_transforms` from pretrained weights.
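A hedged sketch of the two normalization paths follows: loading from the pretrained tag applies the correct mean/std automatically, while `image_mean`/`image_std` can be passed explicitly when pointing at a local checkpoint. These keyword arguments exist in recent open_clip releases, but verify them against your installed version; the local path below is hypothetical.
```python
import open_clip

# Loading by pretrained tag: the matching (inception-style) normalization is
# applied automatically to the returned preprocess transform.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-L-14", pretrained="laion2b_s32b_b82k"
)

# Loading a locally converted checkpoint: the inception-style mean/std can be
# passed explicitly so the transforms match the training normalization.
model2, _, preprocess2 = open_clip.create_model_and_transforms(
    "ViT-L-14",
    pretrained="/path/to/local/checkpoint.bin",  # hypothetical path
    image_mean=(0.5, 0.5, 0.5),
    image_std=(0.5, 0.5, 0.5),
)
```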
#### ViT-H/14 224x224
A ViT-H/14 with a 78.0% top-1 ImageNet-1k zero-shot was trained on JUWELS Booster. See model details here https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
#### ViT-g/14 224x224
A ViT-g/14 with a 76.6% top-1 ImageNet-1k zero-shot was trained on JUWELS Booster. See model details here https://huggingface.co/laion/CLIP-ViT-g-14-laion2B-s12B-b42K
This model was trained with a shorter schedule than other LAION-2B models, with 12B samples seen instead of 32B+. It matches LAION-400M training in samples seen. Many zero-shot results are lower as a result, but despite this it performs very well on some OOD zero-shot and retrieval tasks.
#### ViT-B/32 roberta base
A ViT-B/32 with roberta base encoder with a 61.7% top-1 ImageNet-1k zero-shot was trained on stability. See model details here https://huggingface.co/laion/CLIP-ViT-B-32-roberta-base-laion2B-s12B-b32k
This is the first openclip model using a HF text tower. It has better performance on a range of tasks compared to the standard text encoder, see [metrics](https://huggingface.co/laion/CLIP-ViT-B-32-roberta-base-laion2B-s12B-b32k/blob/main/unknown.png)
#### ViT-B/32 xlm roberta base
A ViT-B/32 with xlm roberta base encoder with a 62.33% top-1 ImageNet-1k zero-shot was trained on stability. See model details here https://huggingface.co/laion/CLIP-ViT-B-32-xlm-roberta-base-laion5B-s13B-b90k
This is the first openclip model trained on the full laion5B dataset; hence the first multilingual clip trained with openclip. It has better performance on a range of tasks compared to the standard text encoder, see [metrics](https://huggingface.co/laion/CLIP-ViT-B-32-xlm-roberta-base-laion5B-s13B-b90k/blob/main/metrics.png)
A preliminary multilingual evaluation was run: 43% on imagenet1k italian (vs 21% for english B/32), 37% for imagenet1k japanese (vs 1% for english B/32 and 50% for B/16 clip japanese). It shows the multilingual property is indeed there as expected. Larger models will get even better performance.
#### ViT-H/14 xlm roberta large
A ViT-H/14 with xlm roberta large encoder with a 77.0% (vs 78% for the english equivalent) top-1 ImageNet-1k zero-shot was trained on stability. See model details here https://huggingface.co/laion/CLIP-ViT-H-14-frozen-xlm-roberta-large-laion5B-s13B-b90k
This model was trained following the [LiT](https://arxiv.org/abs/2111.07991) methodology: the image tower was frozen (initialized from english openclip ViT-H/14), the text tower was initialized from [xlm roberta large](https://huggingface.co/xlm-roberta-large) and unfrozen. This reduced training cost by a 3x factor.
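For intuition, here is a minimal plain-PyTorch sketch of LiT-style locking, assuming the standard open_clip `CLIP` layout where the image encoder is exposed as `model.visual`; it only illustrates the idea of freezing the image tower while leaving the text tower trainable, and is not the configuration actually used for this run.
```python
import open_clip

model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k")

# LiT: keep the (pretrained) image tower frozen...
for param in model.visual.parameters():
    param.requires_grad = False

# ...and train only the remaining (text-side) parameters.
trainable = [p for p in model.parameters() if p.requires_grad]
print(f"trainable parameters: {sum(p.numel() for p in trainable):,}")
```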
See full english [metrics](https://huggingface.co/laion/CLIP-ViT-H-14-frozen-xlm-roberta-large-laion5B-s13B-b90k/resolve/main/results_xlm_roberta_large.png)
On zero shot classification on imagenet with translated prompts this model reaches:
- 56% in italian (vs 21% for https://github.com/clip-italian/clip-italian)
- 53% in japanese (vs 54.6% for https://github.com/rinnakk/japanese-clip)
- 55.7% in chinese (to be compared with https://github.com/OFA-Sys/Chinese-CLIP)
#### YFCC-15M
Below are checkpoints of models trained on YFCC-15M, along with their zero-shot top-1 accuracies on ImageNet and ImageNetV2. These models were trained using 8 GPUs and the same hyperparameters described in the "Sample running code" section, with the exception of `lr=5e-4` and `epochs=32`.
- [ResNet-50](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt) (32.7% / 27.9%)
- [ResNet-101](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt) (34.8% / 30.0%)
#### CC12M - https://github.com/google-research-datasets/conceptual-12m
- [ResNet-50](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt) (36.45%)
### CommonPool and DataComp models
As part of [DataComp](https://github.com/mlfoundations/datacomp), we trained models on CommonPool using various data filtering strategies.
The best performing models are specified below for the xlarge scale, see our paper [DataComp: In search of the next generation of multimodal datasets](https://arxiv.org/abs/2304.14108) for more details.
Additional models and more information can be found at [/docs/datacomp_models.md](/docs/datacomp_models.md).
- `datacomp_xl_s13b_b90k`: A ViT-L/14 trained on DataComp-1B for 12.8B steps and batch size 90k. Achieves 79.2% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K.
- `commonpool_xl_clip_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using CLIP scores, for 12.8B steps and batch size 90k. Achieves 76.4% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.clip-s13B-b90K.
- `commonpool_xl_laion_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using the LAION-2B filtering scheme, for 12.8B steps and batch size 90k. Achieves 75.5% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.laion-s13B-b90K.
- `commonpool_xl_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL without any filtering, for 12.8B steps and batch size 90k. Achieves 72.3% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL-s13B-b90K.
If you use models trained on DataComp-1B or CommonPool variations, please consider citing the following:
```bibtex
@article{datacomp,
title={DataComp: In search of the next generation of multimodal datasets},
author={Samir Yitzhak Gadre, Gabriel Ilharco, Alex Fang, Jonathan Hayase, Georgios Smyrnis, Thao Nguyen, Ryan Marten, Mitchell Wortsman, Dhruba Ghosh, Jieyu Zhang, Eyal Orgad, Rahim Entezari, Giannis Daras, Sarah Pratt, Vivek Ramanujan, Yonatan Bitton, Kalyani Marathe, Stephen Mussmann, Richard Vencu, Mehdi Cherti, Ranjay Krishna, Pang Wei Koh, Olga Saukh, Alexander Ratner, Shuran Song, Hannaneh Hajishirzi, Ali Farhadi, Romain Beaumont, Sewoong Oh, Alex Dimakis, Jenia Jitsev, Yair Carmon, Vaishaal Shankar, Ludwig Schmidt},
journal={arXiv preprint arXiv:2304.14108},
year={2023}
}
```
### MetaCLIP
MetaCLIP models are described in the paper [Demystifying CLIP Data](https://arxiv.org/abs/2309.16671).
These models were developed by Hu Xu, Saining Xie, Xiaoqing Ellen Tan, Po-Yao Huang, Russell Howes, Vasu Sharma, Shang-Wen Li, Gargi Ghosh, Luke Zettlemoyer and Christoph Feichtenhofer from Meta, New York University and the University of Washington.
Models are licensed under CC-BY-NC.
More details are available at https://github.com/facebookresearch/MetaCLIP.
If you use MetaCLIP models, please cite the following:
```bibtex
@inproceedings{xu2023metaclip,
title={Demystifying CLIP Data},
author={Hu Xu and Saining Xie and Xiaoqing Ellen Tan and Po-Yao Huang and Russell Howes and Vasu Sharma and Shang-Wen Li and Gargi Ghosh and Luke Zettlemoyer and Christoph Feichtenhofer},
journal={arXiv preprint arXiv:2309.16671},
year={2023}
}
```
### EVA-CLIP
EVA-CLIP models are described in the paper [EVA-CLIP: Improved Training Techniques for CLIP at Scale](https://arxiv.org/abs/2303.15389).
These models were developed by Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang and Yue Cao from BAAI and HUST.
Models are licensed under the MIT License.
More details are available at https://github.com/baaivision/EVA/tree/master/EVA-CLIP.
If you use EVA models, please cite the following:
```bibtex
@article{EVA-CLIP,
title={EVA-CLIP: Improved Training Techniques for CLIP at Scale},
author={Sun, Quan and Fang, Yuxin and Wu, Ledell and Wang, Xinlong and Cao, Yue},
journal={arXiv preprint arXiv:2303.15389},
year={2023}
}
```
### NLLB-CLIP
NLLB-CLIP models are described in the paper [NLLB-CLIP - train performant multilingual image retrieval model on a budget](https://arxiv.org/abs/2309.01859) by Alexander Visheratin.
The model was trained following the [LiT](https://arxiv.org/abs/2111.07991) methodology: the image tower was frozen, the text tower was initialized from the [NLLB](https://arxiv.org/abs/2207.04672) encoder and unfrozen.
The model was trained on the [LAION-COCO-NLLB](https://huggingface.co/datasets/visheratin/laion-coco-nllb) dataset.
The first version of the model (`nllb-clip`) described in the paper was trained using the OpenAI CLIP image encoder.
The second version of the model (`nllb-clip-siglip`) was trained using the [SigLIP](https://arxiv.org/abs/2303.15343) image encoder.
Models are licensed under CC-BY-NC.
If you use NLLB-CLIP models, please cite the following:
```bibtex
@article{visheratin2023nllb,
title={NLLB-CLIP--train performant multilingual image retrieval model on a budget},
author={Visheratin, Alexander},
journal={arXiv preprint arXiv:2309.01859},
year={2023}
}
```
### CLIPA
CLIPA models are described in the following papers by Xianhang Li, Zeyu Wang, Cihang Xie from UC Santa Cruz:
- [An Inverse Scaling Law for CLIP Training](https://arxiv.org/abs/2305.07017)
- [CLIPA-v2: Scaling CLIP Training with 81.1% Zero-shot ImageNet Accuracy within a $10,000 Budget; An Extra $4,000 Unlocks 81.8% Accuracy](https://arxiv.org/abs/2306.15658)
Models are licensed under Apache 2.0.
More details are available at https://github.com/UCSC-VLAA/CLIPA and [here](clipa.md).
If you use CLIPA models, please cite the following:
```bibtex
@inproceedings{li2023clipa,
title={An Inverse Scaling Law for CLIP Training},
author={Xianhang Li and Zeyu Wang and Cihang Xie},
booktitle={NeurIPS},
year={2023},
}
```
```bibtex
@article{li2023clipav2,
title={CLIPA-v2: Scaling CLIP Training with 81.1% Zero-shot ImageNet Accuracy within a $10,000 Budget; An Extra $4,000 Unlocks 81.8% Accuracy},
author={Xianhang Li and Zeyu Wang and Cihang Xie},
journal={arXiv preprint arXiv:2306.15658},
year={2023},
}
```
### SigLIP
SigLIP models are described in the paper [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343).
These models were developed by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer from Google Deepmind.
Models are licensed under the Apache 2 license.
More details are available at https://github.com/google-research/big_vision.
If you use SigLIP models, please cite the following:
```bibtex
@article{zhai2023sigmoid,
title={Sigmoid loss for language image pre-training},
author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
journal={arXiv preprint arXiv:2303.15343},
year={2023}
}
```
### DFN
Data Filtering Network models are described in https://arxiv.org/abs/2309.17425.
These models were developed by Alex Fang, Albin Madappally Jose, Amit Jain, Ludwig Schmidt, Alexander Toshev and Vaishaal Shankar from Apple and the University of Washington.
Models are licensed under the following: https://huggingface.co/apple/DFN5B-CLIP-ViT-H-14-384/blob/main/LICENSE.
If you use DFN models, please cite the following:
```bibtex
@article{fang2023data,
title={Data Filtering Networks},
author={Fang, Alex and Jose, Albin Madappally and Jain, Amit and Schmidt, Ludwig and Toshev, Alexander and Shankar, Vaishaal},
journal={arXiv preprint arXiv:2309.17425},
year={2023}
}
```