# Layer scale initial value (None is the default).
ls_init_value: Optional[float] = None
# Fraction of patches to drop during training. 0 means disabled (no patches
# dropped); 0.5 to 0.75 is recommended in the paper for optimal results.
patch_dropout: float = 0.
# Whether to global-average-pool the last embedding layer instead of using
# the CLS token (https://arxiv.org/abs/2205.01580).
global_average_pool: bool = False
# A valid timm model name overrides layers, width, patch_size.
# Annotated Optional because the default is None (no timm model selected).
timm_model_name: Optional[str] = None
# Use (ImageNet) pretrained weights for the named timm model.
timm_model_pretrained: bool = False
# Feature pooling for the timm model ('abs_attn', 'rot_attn', 'avg', '').
timm_pool: str = 'avg'
# Projection for the timm model output ('linear', 'mlp', '').
timm_proj: str = 'linear'
# Enable bias on the final projection.
timm_proj_bias: bool = False