print("Nvidia APEX normalization not installed, using PyTorch LayerNorm")
try:
    import xformers.ops as xops
except ImportError:
    xops = None
    # print("Please 'pip install xformers'")
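# If xformers is unavailable, xops stays None; the attention modules defined
# downstream are then expected to fall back to the standard PyTorch
# implementation (the xattn flags below should be left False in that case).
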
@dataclass
class CLIPVisionCfg:
    layers: Union[Tuple[int, int, int, int], int] = 12
    width: int = 768
    head_width: int = 64
    mlp_ratio: float = 4.0
    patch_size: int = 16
    image_size: Union[Tuple[int, int], int] = 224
    ls_init_value: Optional[float] = None  # layer scale initial value
    patch_dropout: float = 0.  # fraction of patches to drop during training (0 disables; 0.5 to 0.75 recommended in the paper for optimal results)
    global_average_pool: bool = False  # global average pool the last embedding layer instead of using the CLS token (https://arxiv.org/abs/2205.01580)
    drop_path_rate: Optional[float] = None  # stochastic depth drop path rate
    timm_model_name: Optional[str] = None  # a valid timm model name overrides layers, width, patch_size
    timm_model_pretrained: bool = False  # use (ImageNet) pretrained weights for named model
    timm_pool: str = 'avg'  # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
    timm_proj: str = 'linear'  # linear projection for timm model output ('linear', 'mlp', '')
    timm_proj_bias: bool = False  # enable bias in the final projection
    eva_model_name: Optional[str] = None  # a valid EVA model name overrides layers, width, patch_size
    qkv_bias: bool = True
    fusedLN: bool = False
    xattn: bool = False
    postnorm: bool = False
    rope: bool = False
    pt_hw_seq_len: int = 16  # 224/14
    intp_freq: bool = False
    naiveswiglu: bool = False
    subln: bool = False
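
# Illustrative sketch (assumed values, not taken from a shipped config): the
# defaults above describe a ViT-B/16-style tower; larger towers override them
# via a config dict, which _build_vision_tower below unpacks into CLIPVisionCfg:
#
#   cfg = CLIPVisionCfg(**{"layers": 24, "width": 1024, "patch_size": 14,
#                          "xattn": True, "fusedLN": True})
#   num_heads = cfg.width // cfg.head_width  # 1024 // 64 = 16 attention heads
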
@dataclass
class CLIPTextCfg:
    context_length: int = 77
    vocab_size: int = 49408
    width: int = 512
    heads: int = 8
    layers: int = 12
    ls_init_value: Optional[float] = None  # layer scale initial value
    hf_model_name: Optional[str] = None
    hf_tokenizer_name: Optional[str] = None
    hf_model_pretrained: bool = True
    proj: str = 'mlp'
    pooler_type: str = 'mean_pooler'
    masked_language_modeling: bool = False
    fusedLN: bool = False
    xattn: bool = False
    attn_mask: bool = True
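
# Illustrative sketch: with hf_model_name left as None, the built-in text
# transformer defined by context_length/vocab_size/width/heads/layers is used;
# setting it delegates to a Hugging Face backbone instead ("roberta-base" is
# just an example name here, not a default of this file):
#
#   text_cfg = CLIPTextCfg(hf_model_name="roberta-base",
#                          hf_tokenizer_name="roberta-base",
#                          proj="mlp", pooler_type="mean_pooler")
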
def get_cast_dtype(precision: str) -> Optional[torch.dtype]:
    cast_dtype = None
    if precision == 'bf16':
        cast_dtype = torch.bfloat16
    elif precision == 'fp16':
        cast_dtype = torch.float16
    return cast_dtype
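
# Example: any precision string other than 'bf16'/'fp16' (e.g. 'fp32') yields
# None, meaning tensors are left in their default dtype:
#
#   assert get_cast_dtype('bf16') is torch.bfloat16
#   assert get_cast_dtype('fp32') is None
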
def _build_vision_tower(
        embed_dim: int,
        vision_cfg: CLIPVisionCfg,
        quick_gelu: bool = False,
        cast_dtype: Optional[torch.dtype] = None
):
    if isinstance(vision_cfg, dict):
        vision_cfg = CLIPVisionCfg(**vision_cfg)

    # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
    # memory-efficient in recent PyTorch releases (>= 1.10).
    # NOTE: timm models always use native GELU regardless of the quick_gelu flag.