# Config for the "SED" meta-architecture with a CLIP backbone
# (convnext_large_d_320) using 768x768 absolute crops.
#
# NOTE(review): the nesting below was reconstructed from key order and naming;
# confirm the placement of keys against the project's config defaults
# (in particular WEIGHTS, PROMPT_ENSEMBLE_TYPE, SIZE_DIVISIBILITY, FORMAT and
# DATASET_MAPPER_NAME).

# Presumably resolved by the config loader as a parent config whose values
# this file overrides - confirm the path is relative to this file.
_BASE_: config.yaml

MODEL:
  META_ARCHITECTURE: "SED"
  BACKBONE:
    NAME: "CLIP"
  # Empty string: no checkpoint path is given here.
  WEIGHTS: ""
  ENC:
    CLIP_MODEL_NAME: "convnext_large_d_320"
    CLIP_PRETRAINED_WEIGHTS: "laion2b_s29b_b131k_ft_soup"
    EMBED_DIM: 768
  # Three per-channel values each; presumably ordered to match INPUT.FORMAT
  # ("RGB") - TODO confirm.
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  SEM_SEG_HEAD:
    NAME: "SEDHead"
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    IGNORE_VALUE: 255
    NUM_CLASSES: 171
    # Train and test read the same class-name file.
    TRAIN_CLASS_JSON: "datasets/coco.json"
    TEST_CLASS_JSON: "datasets/coco.json"
    CLIP_PRETRAINED: "Convnext-L"
    PROMPT_DEPTH: 0
    PROMPT_LENGTH: 0
    TEXT_GUIDANCE_DIM: 0
    TEXT_GUIDANCE_PROJ_DIM: 0
    APPEARANCE_GUIDANCE_DIM: 1536
    APPEARANCE_GUIDANCE_PROJ_DIM: 128
    # The three DECODER_* lists below all have length 3; presumably one entry
    # per decoder stage - confirm against the head implementation.
    DECODER_DIMS: [64, 32, 16]
    DECODER_GUIDANCE_DIMS: [768, 384, 192]
    DECODER_GUIDANCE_PROJ_DIMS: [32, 16, 8]
    NUM_LAYERS: 2
    NUM_HEADS: 4
    HIDDEN_DIMS: 128
    POOLING_SIZES: [2, 2]
    FEATURE_RESOLUTION: [24, 24]
    WINDOW_SIZES: 12
    ATTENTION_TYPE: "linear"
    CNEXT_TYPE: "V1"
    KERNEL_SIZE: [9, 9, 9]
    CLIP_FINETUNE: "full"
  PROMPT_ENSEMBLE_TYPE: "imagenet"

INPUT:
  # Parenthesised values are plain YAML scalars (strings), not sequences.
  # Presumably the config loader evaluates them into Python tuples, so they
  # are kept verbatim rather than rewritten as [..] lists - confirm.
  MIN_SIZE_TRAIN: (768, )
  MAX_SIZE_TRAIN: 2666
  MIN_SIZE_TRAIN_SAMPLING: "choice"
  MIN_SIZE_TEST: 640
  CROP:
    ENABLED: True
    TYPE: "absolute"
    SIZE: (768, 768)
  SIZE_DIVISIBILITY: 768
  FORMAT: "RGB"
  DATASET_MAPPER_NAME: "mask_former_semantic"

SOLVER:
  IMS_PER_BATCH: 4
  LR_SCHEDULER_NAME: WarmupCosineLR
  BASE_LR: 0.0002
  MAX_ITER: 80000
  # NOTE(review): presumably scale factors applied to BASE_LR for the
  # backbone and CLIP parameter groups (0.0 would then disable backbone
  # updates) - confirm against the optimizer builder.
  BACKBONE_MULTIPLIER: 0.0
  CLIP_MULTIPLIER: 0.01

TEST:
  # With MAX_ITER 80000 this is 8 evaluation points if the period is counted
  # in iterations - TODO confirm the unit.
  EVAL_PERIOD: 10000
  FAST_INFERENCE: False
  TOPK: 1