ls_init_value: Optional[float] = None  # layer scale initial value
patch_dropout: float = 0.  # fraction of patches to drop out during training (0 = disabled); 0.5 to 0.75 recommended in the paper for best results
attentional_pool: bool = False  # whether to use attentional pooler in the last embedding layer (overrides pool_type)
attn_pooler_queries: int = 256  # n_queries for attentional pooler
attn_pooler_heads: int = 8  # n_heads for attentional pooler
no_ln_pre: bool = False  # disable pre-transformer LayerNorm
pos_embed_type: str = 'learnable'  # positional embedding type
final_ln_after_pool: bool = False  # apply final LayerNorm after pooling
pool_type: str = 'tok'  # how to pool output tokens ('tok' pools the class token)
output_tokens: bool = False  # also return the unpooled token sequence
act_kwargs: Optional[dict] = None  # extra kwargs for the activation layer
norm_kwargs: Optional[dict] = None  # extra kwargs for the normalization layer
timm_model_name: Optional[str] = None  # a valid model name overrides layers, width, patch_size
timm_model_pretrained: bool = False  # use (ImageNet) pretrained weights for named model
timm_pool: str = 'avg'  # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
timm_proj: str = 'linear'  # linear projection for timm model output ('linear', 'mlp', '')
timm_proj_bias: bool = False  # enable bias in final projection
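# A minimal usage sketch, not an authoritative example: it assumes these fields
# belong to a vision-tower config dataclass named CLIPVisionCfg (as in open_clip)
# and that any fields not shown above keep their defaults. Kept in comment form
# so it cannot execute inside the class body.
#
#     # Built-in ViT tower with patch dropout and attentional pooling:
#     vit_cfg = CLIPVisionCfg(
#         patch_dropout=0.5,       # drop half the patches during training
#         attentional_pool=True,   # overrides pool_type
#         attn_pooler_queries=256,
#         attn_pooler_heads=8,
#     )
#
#     # timm backbone instead of the built-in ViT; the model name overrides
#     # layers/width/patch_size ('convnext_base' is just an example choice):
#     timm_cfg = CLIPVisionCfg(
#         timm_model_name='convnext_base',
#         timm_model_pretrained=True,
#         timm_pool='avg',
#         timm_proj='linear',
#     )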