Commit 6f43e8fa authored by mashun1

open_clip
torchrun --nproc_per_node 8 -m open_clip_train.main \
--save-frequency 1 \
--save-most-recent \
--zeroshot-frequency 1 \
--train-data '/path/to/laion-400m' \
--dataset-type webdataset \
--lr "2.56e-5" \
--beta1 0.9 \
--beta2 0.95 \
--warmup 3072 \
--wd 0.2 \
--batch-size 1024 \
--aug-cfg scale='(0.4, 1.0)' \
--epochs 1 \
--train-num-samples 131072000 \
--workers 6 \
--model ViT-B-16-CL16 \
--pretrained '/path/to/ckpt' \
--precision 'amp_bf16' \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--grad-checkpointing \
--log-every-n-steps 256 \
--seed 0 \
--logs ./logs/ \
--imagenet-val '/path/to/imagenet/val'
torchrun --nproc_per_node 8 -m open_clip_train.main \
--save-frequency 1 \
--save-most-recent \
--zeroshot-frequency 1 \
--train-data '/path/to/laion-400m' \
--dataset-type webdataset \
--lr "2.048e-3" \
--beta1 0.9 \
--beta2 0.95 \
--warmup 782 \
--wd 0.2 \
--batch-size 8192 \
--aug-cfg scale='(0.4, 1.0)' \
--epochs 6 \
--workers 6 \
--model ViT-B-16-CL16 \
--precision 'amp_bf16' \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--force-image-size 112 \
--grad-checkpointing \
--log-every-n-steps 32 \
--seed 0 \
--logs ./logs/ \
--imagenet-val '/path/to/imagenet/val'
torchrun --nproc_per_node 8 -m open_clip_train.main \
--save-frequency 1 \
--save-most-recent \
--zeroshot-frequency 1 \
--train-data '/path/to/laion-400m' \
--dataset-type webdataset \
--lr "2.24e-5" \
--beta1 0.9 \
--beta2 0.95 \
--warmup 3571 \
--wd 0.2 \
--batch-size 896 \
--aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
--epochs 1 \
--train-num-samples 131072000 \
--workers 6 \
--model ViT-L-16-CL16-GAP \
--pretrained '/path/to/ckpt' \
--precision 'amp_bf16' \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--grad-checkpointing \
--log-every-n-steps 293 \
--seed 0 \
--logs ./logs/ \
--imagenet-val '/path/to/imagenet/val'
torchrun --nproc_per_node 8 -m open_clip_train.main \
--save-frequency 1 \
--save-most-recent \
--zeroshot-frequency 1 \
--train-data '/path/to/laion-400m' \
--dataset-type webdataset \
--lr "1.024e-3" \
--beta1 0.9 \
--beta2 0.95 \
--warmup 1563 \
--wd 0.2 \
--batch-size 4096 \
--aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
--epochs 6 \
--workers 6 \
--model ViT-L-16-CL16-GAP \
--precision 'amp_bf16' \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--force-image-size 64 \
--grad-checkpointing \
--log-every-n-steps 64 \
--seed 0 \
--logs ./logs/ \
--imagenet-val '/path/to/imagenet/val'
torchrun --nproc_per_node 8 -m open_clip_train.main \
--save-frequency 1 \
--save-most-recent \
--zeroshot-frequency 1 \
--train-data '/path/to/laion-400m' \
--dataset-type webdataset \
--lr "2.24e-5" \
--beta1 0.9 \
--beta2 0.95 \
--warmup 3571 \
--wd 0.2 \
--batch-size 896 \
--aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
--epochs 1 \
--train-num-samples 131072000 \
--workers 6 \
--model ViT-L-16-CL32-GAP \
--pretrained '/path/to/ckpt' \
--precision 'amp_bf16' \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--grad-checkpointing \
--log-every-n-steps 293 \
--seed 0 \
--logs ./logs/ \
--imagenet-val '/path/to/imagenet/val'
torchrun --nproc_per_node 8 -m open_clip_train.main \
--save-frequency 1 \
--save-most-recent \
--zeroshot-frequency 1 \
--train-data '/path/to/laion-400m' \
--dataset-type webdataset \
--lr "1.024e-3" \
--beta1 0.9 \
--beta2 0.95 \
--warmup 1563 \
--wd 0.2 \
--batch-size 4096 \
--aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
--epochs 6 \
--workers 6 \
--model ViT-L-16-CL8-Syntax-GAP \
--precision 'amp_bf16' \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--force-image-size 96 \
--grad-checkpointing \
--log-every-n-steps 64 \
--seed 0 \
--logs ./logs/ \
--imagenet-val '/path/to/imagenet/val'
# This script has not been tested; use it at your own discretion.
# The original experiment was run on a TPU v3-256 pod.
# This example script assumes 8 GPUs, each with ample memory. Tune the batch size, warmup, and learning rate accordingly if your machine setup differs (see the sketch in the comments just below).
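# A hedged sketch, not from the original authors: across these scripts the learning rate appears
# to scale linearly with the global batch size (per-GPU batch x GPU count), and the warmup looks
# like a roughly fixed number of samples rather than steps. For example, with 4 GPUs at the same
# per-GPU batch size of 4096:
#   GLOBAL_BATCH=$((4 * 4096))   # 16384 instead of 8 * 4096 = 32768
#   # lr:     5.12e-5 * 16384 / 32768 = 2.56e-5
#   # warmup: keep ~26M warmup samples -> 800 * 32768 / 16384 = 1600 steps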
torchrun --nproc_per_node 8 -m open_clip_train.main \
--save-frequency 1 \
--save-most-recent \
--zeroshot-frequency 1 \
--train-data '/path/to/laion2b_or_datacomp1b' \
--train-num-samples 131072000 \
--dataset-type webdataset \
--lr "5.12e-5" \
--beta1 0.9 \
--beta2 0.95 \
--warmup 800 \
--wd 0.2 \
--batch-size 4096 \
--aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
--epochs 4 \
--workers 6 \
--model ViT-H-14-CL32-GAP \
--pretrained '/path/to/pretrain84_ckpt' \
--precision 'amp_bf16' \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--force-image-size 224 \
--force-patch-dropout 0.3 \
--grad-checkpointing \
--log-every-n-steps 64 \
--seed 0 \
--logs ./logs/ \
--imagenet-val '/path/to/imagenet/val'
# This script has not been tested; use it at your own discretion.
# The original experiment was run on a TPU v3-256 pod.
# This example script assumes 8 GPUs, each with ample memory. Tune the batch size, warmup, and learning rate accordingly if your machine setup differs.
torchrun --nproc_per_node 8 -m open_clip_train.main \
--save-frequency 1 \
--save-most-recent \
--zeroshot-frequency 1 \
--train-data '/path/to/laion2b_or_datacomp1b' \
--train-num-samples 400000000 \
--dataset-type webdataset \
--lr "2.048e-3" \
--beta1 0.9 \
--beta2 0.95 \
--warmup 3200 \
--wd 0.2 \
--batch-size 8192 \
--aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
--epochs 32 \
--workers 6 \
--model ViT-H-14-CL8-Syntax-GAP \
--precision 'amp_bf16' \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--force-image-size 84 \
--grad-checkpointing \
--log-every-n-steps 32 \
--seed 0 \
--logs ./logs/ \
--imagenet-val '/path/to/imagenet/val'
# This script has not been tested; use it at your own discretion.
# The original experiment was run on a TPU v3-256 pod.
# This example script assumes 8 GPUs, each with ample memory. Tune the batch size, warmup, and learning rate accordingly if your machine setup differs.
torchrun --nproc_per_node 8 -m open_clip_train.main \
--save-frequency 1 \
--save-most-recent \
--zeroshot-frequency 1 \
--train-data '/path/to/laion2b_or_datacomp1b' \
--train-num-samples 131072000 \
--dataset-type webdataset \
--lr "6.4e-6" \
--beta1 0.9 \
--beta2 0.95 \
--warmup 1600 \
--wd 0.2 \
--batch-size 2048 \
--aug-cfg scale='(0.4, 1.0)' color_jitter='(0.32, 0.32, 0.32, 0.08)' color_jitter_prob=0.8 gray_scale_prob=0.2 \
--epochs 1 \
--workers 6 \
--model ViT-H-14-CL32-GAP \
--pretrained '/path/to/finetune224_ckpt' \
--precision 'amp_bf16' \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--force-image-size 336 \
--force-patch-dropout 0.4 \
--grad-checkpointing \
--log-every-n-steps 64 \
--seed 0 \
--logs ./logs/ \
--imagenet-val '/path/to/imagenet/val'
#!/bin/bash
#SBATCH --partition=g40423
#SBATCH --job-name=testopenclip
#SBATCH --nodes 30
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=12
#SBATCH --output=%x_%j.out
#SBATCH --comment=laion
#SBATCH --open-mode=append
#SBATCH --exclusive
module load openmpi
module load cuda/11.7
export MASTER_ADDR=`hostname`
export MASTER_PORT=12802
export NCCL_PROTO=simple
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1
export NCCL_DEBUG=info
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
cd /admin/home-mitchellw/open_clip/src
export PYTHONPATH="$PYTHONPATH:/admin/home-mitchellw/open_clip/src"
EXP_NAME="test-B-32-laion5b-lr1e-3-bs90k"
srun --comment laion --cpu_bind=v --accel-bind=gn python -m open_clip_train.main \
--save-frequency 1 \
--train-data="pipe:aws s3 cp s3://s-datasets/laion5b/{laion2B-data/{000000..231349}.tar,laion2B-multi-data/{000000..226687}.tar,laion1B-nolang-data/{000000..127231}.tar} -" \
--train-num-samples 135646078 \
--dataset-type webdataset \
--dataset-resampled \
--warmup 2000 \
--batch-size=375 \
--epochs=97 \
--lr 1e-3 \
--workers=8 \
--report-to wandb \
--name ${EXP_NAME} \
--logs /scratch/logs/ \
--model ViT-B-32 \
--seed 0 \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--grad-checkpointing \
--precision amp_bfloat16 \
--wandb-project-name open_clip6 \
--resume "latest" \
--remote-sync s3://s-laion/mitchellw/logs
from clip_benchmark.datasets.builder import build_dataset
import pandas as pd
import os
root_path = "datasets" # set this to something meaningful
ds = build_dataset("mscoco_captions", root=root_path, split="val", task="captioning") # this downloads the dataset if it is not there already
coco = ds.coco
imgs = coco.loadImgs(coco.getImgIds())
future_df = {"filepath":[], "title":[]}
for img in imgs:
    caps = coco.imgToAnns[img["id"]]
    for cap in caps:
        future_df["filepath"].append(img["file_name"])
        future_df["title"].append(cap["caption"])
pd.DataFrame.from_dict(future_df).to_csv(
    os.path.join(root_path, "train2014.csv"), index=False, sep="\t"
)
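# The snippet above writes a tab-separated CSV with "filepath" and "title" columns, which matches
# what open_clip_train's csv dataset type expects. A minimal, untested sketch of training on the
# resulting file (COCO's file_name values are relative, so they may first need to be joined with
# the image directory):
torchrun --nproc_per_node 1 -m open_clip_train.main \
    --train-data datasets/train2014.csv \
    --dataset-type csv \
    --csv-separator $'\t' \
    --csv-img-key filepath \
    --csv-caption-key title \
    --model ViT-B-32 \
    --batch-size 64 \
    --epochs 1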
import pandas as pd
from pathlib import Path
from typing import Optional
def convert_to_csv(data_root: str,
                   image_subfolder: str = "",
                   text_subfolder: str = "",
                   save_path: str = None,
                   counts: Optional[int] = 10000):
    if not save_path.endswith(".csv"):
        save_path += ".csv"
    data_root = Path(data_root)
    images_root = data_root / image_subfolder
    texts_root = data_root / text_subfolder
    image_files = [
        *images_root.glob("*.png"), *images_root.glob("*.jpg"),
        *images_root.glob("*.jpeg"), *images_root.glob("*.bmp")
    ]
    text_files = [*texts_root.glob("*.txt")]
    # pair images and texts by their filename stem
    image_files = {image_file.stem: image_file for image_file in image_files}
    text_files = {text_file.stem: text_file for text_file in text_files}
    keys = (image_files.keys() & text_files.keys())
    keys = list(keys)[:counts]
    text_files = {k: v for k, v in text_files.items() if k in keys}
    image_files = {k: v for k, v in image_files.items() if k in keys}
    results = []
    for key in keys:
        with open(text_files[key], "r") as f:
            title = f.read().strip()
        results.append({"filepath": image_files[key], "title": title})
    df = pd.DataFrame(results)
    df.to_csv(save_path, index=False)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_root", type=str, help="directory containing the images and texts subfolders")
    parser.add_argument("--image_subfolder", type=str, default="")
    parser.add_argument("--text_subfolder", type=str, default="")
    parser.add_argument("--save_path", type=str, help="path to save the CSV file")
    args = parser.parse_args()
    convert_to_csv(args.data_root, args.image_subfolder, args.text_subfolder, args.save_path)
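# Assuming the script above is saved as convert_to_csv.py (the filename is not given here), a
# sketch of running it over a folder of paired images and texts. Note it writes a comma-separated
# file, so pass --csv-separator ',' if you train on the result with --dataset-type csv:
python convert_to_csv.py \
    --data_root /path/to/data \
    --image_subfolder images \
    --text_subfolder texts \
    --save_path /path/to/pairs.csv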
# Unique model identifier
modelCode=999
# Model name
modelName=open_clip_pytorch
# Model description
modelDescription=Multimodal image classification
# Application scenarios
appScenario=training,inference,image classification,e-commerce,painting,transportation
# Framework type
frameType=Pytorch
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
[project]
name = "open_clip_torch"
# NOTE for full list of authors see https://github.com/mlfoundations/open_clip?tab=readme-ov-file#citing
# below covers most active / recent maintainers
authors = [
{name = "Ross Wightman", email = "ross@huggingface.co"},
{name = "Gabriel Ilharco"},
{name = "Mitchell Wortsman"},
{name = "Romain Beaumont"},
]
description = "Open reproduction of contrastive language-image pretraining (CLIP) and related."
readme = "README.md"
requires-python = ">=3.8"
keywords = ["pytorch", "clip", "image-text", "language-image", "multimodal"]
license = {text = "MIT"}
classifiers = [
'Development Status :: 4 - Beta',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
]
dependencies = [
'torch>=1.9.0',
'torchvision',
'regex',
'ftfy',
'tqdm',
'huggingface-hub',
'timm',
"transformers[sentencepiece]"
]
dynamic = ["version"]
[project.optional-dependencies]
training = [
'torch>=2.0',
'webdataset>=0.2.5',
'pandas',
'transformers[sentencepiece]',
'timm>=1.0.7',
'fsspec',
]
test = [
'pytest-split',
'pytest',
'open_clip_torch[training]'
]
[project.urls]
homepage = "https://github.com/mlfoundations/open_clip"
repository = "https://github.com/mlfoundations/open_clip"
[tool.pdm.version]
source = "file"
path = "src/open_clip/version.py"
[tool.pdm.build]
excludes = ["./**/.git", "./**/logs/*"]
package-dir = "src"
includes = ["src/open_clip", "src/open_clip_train"]
[tool.pytest.ini_options]
testpaths = ['tests']
markers = [
'regression_test'
]
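# With the optional-dependency groups defined above, an (unverified) editable install from a
# checkout of this repo would pull in the extras like so:
pip install -e '.[training]'   # adds webdataset, pandas, timm, etc. for training
pip install -e '.[test]'       # adds pytest and pytest-split on top of the training extras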
[pytest]
markers =
regression_test