# cbgs_bevfusion.yaml
# (last committed by chenshi3; source-viewer banner and line-number gutter removed)
# Object categories used by this config. The list has 10 entries, which matches
# MODEL.DENSE_HEAD.NUM_CLASSES further down; the order is kept as-is because
# consumers may map class indices to positions in this list (verify in loader).
CLASS_NAMES:
  - car
  - truck
  - construction_vehicle
  - bus
  - trailer
  - barrier
  - motorcycle
  - bicycle
  - pedestrian
  - traffic_cone

# Dataset, augmentation and preprocessing settings.
# NOTE(review): keys given here are presumably layered over the file named by
# _BASE_CONFIG_ — confirm against the config loader.
DATA_CONFIG:
  _BASE_CONFIG_: cfgs/dataset_configs/nuscenes_dataset.yaml

  # NOTE(review): presumably [x_min, y_min, z_min, x_max, y_max, z_max] — confirm.
  POINT_CLOUD_RANGE: [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]

  CAMERA_CONFIG:
    USE_CAMERA: true
    IMAGE:
      # Same two numbers as MODEL.VTRANSFORM.IMAGE_SIZE.
      FINAL_DIM: [256, 704]
      RESIZE_LIM_TRAIN: [0.38, 0.55]
      # Both bounds are equal, so the test-time resize limit is a single value.
      RESIZE_LIM_TEST: [0.48, 0.48]

  DATA_AUGMENTOR:
    DISABLE_AUG_LIST: ['placeholder']
    AUG_CONFIG_LIST:
      - NAME: random_world_flip
        # Quotes are kept on purpose: a bare `y` is a boolean for some
        # YAML 1.1 parsers.
        ALONG_AXIS_LIST: ['x', 'y']

      - NAME: random_world_rotation
        # 0.78539816 is pi/4; presumably radians — confirm.
        WORLD_ROT_ANGLE: [-0.78539816, 0.78539816]

      - NAME: random_world_scaling
        WORLD_SCALE_RANGE: [0.9, 1.1]

      - NAME: random_world_translation
        NOISE_TRANSLATE_STD: [0.5, 0.5, 0.5]

      - NAME: imgaug
        ROT_LIM: [-5.4, 5.4]
        RAND_FLIP: true

  # Entries are a sequence; their order is preserved from the original file.
  DATA_PROCESSOR:
    - NAME: mask_points_and_boxes_outside_range
      REMOVE_OUTSIDE_BOXES: true

    - NAME: shuffle_points
      SHUFFLE_ENABLED:
        train: true
        test: true

    - NAME: transform_points_to_voxels
      # With the +/-54.0 horizontal extent above, 108 / 0.075 = 1440 cells per
      # horizontal axis; (3.0 - -5.0) / 0.2 = 40 cells along the third axis
      # (assuming the three entries follow the same axis order as the range).
      VOXEL_SIZE: [0.075, 0.075, 0.2]
      MAX_POINTS_PER_VOXEL: 10
      MAX_NUMBER_OF_VOXELS:
        train: 120000
        test: 160000

    - NAME: image_calibrate

    - NAME: image_normalize
      # Lower-case keys are intentional and left untouched.
      # NOTE(review): values match the commonly used ImageNet channel
      # statistics; channel order (RGB vs BGR) is not visible here — confirm.
      mean: [0.485, 0.456, 0.406]
      std: [0.229, 0.224, 0.225]


# Network definition: one sub-mapping per module, each selected by its NAME.
MODEL:
  NAME: BevFusion

  VFE:
    NAME: MeanVFE

  BACKBONE_3D:
    NAME: VoxelResBackBone8x
    USE_BIAS: false

  MAP_TO_BEV:
    NAME: HeightCompression
    NUM_BEV_FEATURES: 256

  IMAGE_BACKBONE:
    NAME: SwinTransformer
    EMBED_DIMS: 96
    DEPTHS: [2, 2, 6, 2]
    NUM_HEADS: [3, 6, 12, 24]
    WINDOW_SIZE: 7
    MLP_RATIO: 4
    DROP_RATE: 0.0
    ATTN_DROP_RATE: 0.0
    DROP_PATH_RATE: 0.2
    PATCH_NORM: true
    OUT_INDICES: [1, 2, 3]
    WITH_CP: false
    CONVERT_WEIGHTS: true
    INIT_CFG:
      type: Pretrained
      # Relative file name; where it is resolved from is decided by the loader.
      checkpoint: swint-nuimages-pretrained.pth

  NECK:
    NAME: GeneralizedLSSFPN
    # 192 / 384 / 768 = 96 * 2 / 4 / 8, i.e. EMBED_DIMS scaled by 2**i for the
    # OUT_INDICES 1, 2, 3 above (presumably the backbone stage widths — confirm).
    IN_CHANNELS: [192, 384, 768]
    OUT_CHANNELS: 256
    START_LEVEL: 0
    END_LEVEL: -1
    NUM_OUTS: 3

  VTRANSFORM:
    NAME: DepthLSSTransform
    IMAGE_SIZE: [256, 704]
    # Equals NECK.OUT_CHANNELS.
    IN_CHANNEL: 256
    OUT_CHANNEL: 80
    # 32 x 88 is IMAGE_SIZE divided by 8 on both axes.
    FEATURE_SIZE: [32, 88]
    # NOTE(review): the *BOUND triples look like [lower, upper, step] — confirm.
    XBOUND: [-54.0, 54.0, 0.3]
    YBOUND: [-54.0, 54.0, 0.3]
    ZBOUND: [-10.0, 10.0, 20.0]
    DBOUND: [1.0, 60.0, 0.5]
    DOWNSAMPLE: 2

  FUSER:
    NAME: ConvFuser
    # 336 = 80 (VTRANSFORM.OUT_CHANNEL) + 256 (MAP_TO_BEV.NUM_BEV_FEATURES);
    # presumably the two BEV feature maps are concatenated — confirm.
    IN_CHANNEL: 336
    OUT_CHANNEL: 256

  BACKBONE_2D:
    NAME: BaseBEVBackbone
    LAYER_NUMS: [5, 5]
    LAYER_STRIDES: [1, 2]
    NUM_FILTERS: [128, 256]
    UPSAMPLE_STRIDES: [1, 2]
    NUM_UPSAMPLE_FILTERS: [256, 256]
    USE_CONV_FOR_NO_STRIDE: true

  DENSE_HEAD:
    CLASS_AGNOSTIC: false
    NAME: TransFusionHead

    USE_BIAS_BEFORE_NORM: false

    NUM_PROPOSALS: 200
    HIDDEN_CHANNEL: 128
    # Matches the 10 entries of the top-level CLASS_NAMES list.
    NUM_CLASSES: 10
    NUM_HEADS: 8
    NMS_KERNEL_SIZE: 3
    FFN_CHANNEL: 256
    DROPOUT: 0.1
    BN_MOMENTUM: 0.1
    ACTIVATION: relu

    NUM_HM_CONV: 2
    SEPARATE_HEAD_CFG:
      HEAD_ORDER: ['center', 'height', 'dim', 'rot', 'vel']
      # One entry per name in HEAD_ORDER, listed in the same order.
      # out_channels sum to 2 + 1 + 3 + 2 + 2 = 10.
      HEAD_DICT:
        center:
          out_channels: 2
          num_conv: 2
        height:
          out_channels: 1
          num_conv: 2
        dim:
          out_channels: 3
          num_conv: 2
        rot:
          out_channels: 2
          num_conv: 2
        vel:
          out_channels: 2
          num_conv: 2

    TARGET_ASSIGNER_CONFIG:
      FEATURE_MAP_STRIDE: 8
      DATASET: nuScenes
      GAUSSIAN_OVERLAP: 0.1
      MIN_RADIUS: 2
      HUNGARIAN_ASSIGNER:
        cls_cost:
          gamma: 2.0
          alpha: 0.25
          weight: 0.15
        reg_cost:
          weight: 0.25
        iou_cost:
          weight: 0.25

    LOSS_CONFIG:
      LOSS_WEIGHTS:
        cls_weight: 1.0
        bbox_weight: 0.25
        hm_weight: 1.0
        # 10 weights — the same count as the summed HEAD_DICT out_channels;
        # the last two entries are down-weighted to 0.2.
        code_weights: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
      LOSS_CLS:
        use_sigmoid: true
        gamma: 2.0
        alpha: 0.25

    # Head-level post-processing; distinct from MODEL.POST_PROCESSING below
    # (different nesting level, different SCORE_THRESH).
    POST_PROCESSING:
      SCORE_THRESH: 0.0
      # +/-61.2 is wider than the +/-54.0 DATA_CONFIG.POINT_CLOUD_RANGE.
      POST_CENTER_RANGE: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]

  # Model-level post-processing.
  POST_PROCESSING:
    RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
    SCORE_THRESH: 0.1
    OUTPUT_RAW_SCORE: false

    # NOTE(review): the value is `kitti` although the dataset config and
    # TARGET_ASSIGNER_CONFIG.DATASET refer to nuScenes — confirm which
    # evaluator actually reads this key.
    EVAL_METRIC: kitti



# Training schedule and optimizer hyper-parameters.
OPTIMIZATION:
  # Run length.
  BATCH_SIZE_PER_GPU: 3
  NUM_EPOCHS: 6

  # Optimizer selection and its base hyper-parameters.
  OPTIMIZER: adam_cosineanneal
  LR: 0.0001
  WEIGHT_DECAY: 0.01
  MOMENTUM: 0.9
  BETAS: [0.9, 0.999]

  # Schedule shape parameters.
  MOMS: [0.9, 0.8052631]
  PCT_START: 0.4
  WARMUP_ITER: 500

  # NOTE(review): both steps (35, 45) lie beyond NUM_EPOCHS (6); presumably this
  # list is only read by step-decay schedulers and not by the optimizer chosen
  # above — confirm before relying on it.
  DECAY_STEP_LIST: [35, 45]
  LR_WARMUP: false
  WARMUP_EPOCH: 1

  GRAD_NORM_CLIP: 35

  LOSS_SCALE_FP16: 32