# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""MaxViT image classification configuration definition."""

import dataclasses
from typing import Optional, Tuple

import tensorflow as tf

from official.modeling import hyperparams
from official.vision.configs import backbones


@dataclasses.dataclass
class MaxViT(hyperparams.Config):
  """MaxViT config."""
  model_name: str = 'maxvit-tiny'

  # These configs are derived from `model_name` by default.
  # Explicitly set values override the defaults.
  stem_hsize: Optional[Tuple[int, ...]] = None
  block_type: Optional[Tuple[str, ...]] = None
  num_blocks: Optional[Tuple[int, ...]] = None
  hidden_size: Optional[Tuple[int, ...]] = None

  # Specific to the multi-axis attention in MaxViT.
  # Note that `window_size` and `grid_size` must evenly divide all feature map
  # sizes along the entire network. For example, when training ImageNet
  # classification at 224x224, setting both to 7 is almost the only choice.
  # When training COCO object detection at 896x896, 28 is suggested: following
  # Swin Transformer, the window size should scale with the feature size.
  # 14 or 7 also work.
  window_size: int = 7  # Window size for the block attention module.
  grid_size: int = 7  # Grid size for the sparse global grid attention module.

  # Transformer specific.
  head_size: int = 32
  dropatt: Optional[float] = None
  dropout: Optional[float] = None
  rel_attn_type: str = '2d_multi_head'
  num_heads: Optional[int] = None
  # A string of `current_window_size/ckpt_window_size` for finetuning from a
  # checkpoint trained with `ckpt_window_size`.
  scale_ratio: Optional[str] = None
  ln_epsilon: float = 1e-5
  ln_dtype: Optional[tf.DType] = None

  # Conv specific.
  downsample_loc: str = 'depth_conv'
  kernel_size: int = 3
  se_ratio: float = 0.25
  dropcnn: Optional[float] = None
  # Only channels_last is supported for now.
  data_format: str = 'channels_last'
  norm_type: str = 'sync_batch_norm'

  # Shared.
  add_pos_enc: bool = False
  pool_type: str = '2d:avg'
  pool_stride: int = 2
  expansion_rate: int = 4
  # Stochastic depth keep probability for the residual connections. A smaller
  # value means stronger regularization. If annealing is enabled, it decays
  # linearly from 1.0 to this value with the depth of each layer.
  survival_prob: Optional[float] = None  # In [0, 1].
  survival_prob_anneal: bool = True

  kernel_initializer: str = 'glorot_uniform'
  bias_initializer: str = 'zeros'

  # For the classification head; should equal the last `hidden_size` of the
  # backbone.
  representation_size: Optional[int] = None
  # Only effective when representation_size > 0.
  add_gap_layer_norm: bool = True


@dataclasses.dataclass
class Backbone(backbones.Backbone):
  """Configuration for backbones."""
  type: Optional[str] = 'maxvit'
  maxvit: MaxViT = dataclasses.field(default_factory=MaxViT)
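

# Illustrative usage (a minimal sketch, not part of the exported configs):
# composing these dataclasses for a 896x896 detection input, per the
# divisibility note on `window_size` and `grid_size` above. This assumes the
# plain dataclass keyword construction and the `as_dict()` method that
# `hyperparams.Config` subclasses expose elsewhere in the Model Garden.
if __name__ == '__main__':
  # With a stride-32 final stage, a 896x896 input yields a 28x28 feature map,
  # so a window/grid size of 28 (or 14, or 7) divides every stage's features.
  detection_backbone = Backbone(
      type='maxvit',
      maxvit=MaxViT(
          model_name='maxvit-tiny',
          window_size=28,
          grid_size=28,
          # Finetuning from a checkpoint trained with window_size=7 uses the
          # documented `current_window_size/ckpt_window_size` string format.
          scale_ratio='28/7',
      ),
  )
  print(detection_backbone.as_dict())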