Commit 72f5785f authored by huaerkl

v1.0
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: DSAligner
  bert_name: bert-base-uncased
  meta_processor: MSRVTTMetaProcessor
  test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTTextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/mtm/vlm/vtt/checkpoint_last.pt
model:
  model_cls: MMFusionJoint
  mm_encoder_cls: MMBertForJoint
  use_seg_emb: true
eval:
  save_path: runs/mtm/vlm/vtt/eval
metric: RetrievalMetric
predictor: RetrievalPredictor
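These predict configs are plain YAML, so the fields above can be inspected directly. Below is a minimal sketch using PyYAML; the helper name and local file path are illustrative stand-ins, since MMPT resolves these files through its own launcher rather than this function.

```python
# Minimal sketch: read one of the predict configs above with PyYAML and
# pull out the fields an evaluation loop needs. `load_test_config` and
# the file path are hypothetical, not part of MMPT.
import yaml

def load_test_config(path: str) -> dict:
    with open(path) as f:
        return yaml.safe_load(f)

cfg = load_test_config("test_vtt.yaml")       # assumed local copy of the block above
print(cfg["dataset"]["test_path"])            # data/msrvtt/MSRVTT_JSFUSION_test.csv
print(cfg["fairseq"]["common_eval"]["path"])  # checkpoint to score
print(cfg["metric"], cfg["predictor"])        # RetrievalMetric RetrievalPredictor
```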
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: MSRVTTQAAligner
  bert_name: bert-base-uncased
  meta_processor: MSRVTTQAMetaProcessor
  test_path: data/msrvtt-qa/MSR_MC_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTQATextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/mtm/vlm/vttqa/checkpoint_last.pt
model:
  model_cls: MMFusionJoint
  mm_encoder_cls: MMBertForJoint
  use_seg_emb: true
eval:
  save_path: runs/mtm/vlm/vttqa/eval
metric: QAMetric
predictor: QAPredictor
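The QAMetric/QAPredictor pair evaluates MSR-VTT multiple-choice QA, where each test video is paired with candidate captions. A hedged sketch of how such a predictor plausibly scores candidates; the tensors are placeholders, not MMPT's encoder outputs.

```python
# Hedged sketch of multiple-choice scoring: pick the candidate caption
# whose embedding is most similar to the video embedding. Placeholder
# tensors stand in for the actual MMPT encoders.
import torch

def predict_choice(video_emb: torch.Tensor, cand_embs: torch.Tensor) -> int:
    """video_emb: (dim,); cand_embs: (num_candidates, dim)."""
    scores = cand_embs @ video_emb   # dot-product similarity per candidate
    return int(scores.argmax())

video_emb = torch.randn(768)
cand_embs = torch.randn(5, 768)      # the MC test pairs each video with 5 captions
print(predict_choice(video_emb, cand_embs))
```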
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: YoucookVideoProcessor
  aligner: DSAligner
  bert_name: bert-base-uncased
  meta_processor: YoucookMetaProcessor
  test_path: data/youcook/youcook_val.pkl
  trainval_annotation: data/youcook/youcookii_annotations_trainval.json
  use_annotation_text: true
  vfeat_dir: data/feat/feat_youcook_s3d
  text_processor: TextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/mtm/vlm/youcook/checkpoint_last.pt
model:
  model_cls: MMFusionJoint
  mm_encoder_cls: MMBertForJoint
  use_seg_emb: true
eval:
  save_path: runs/mtm/vlm/youcook/eval
metric: RetrievalMetric
predictor: RetrievalPredictor
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: YoucookVideoProcessor
  aligner: DSNLGAligner
  bert_name: bert-base-uncased
  meta_processor: YoucookNLGMetaProcessor
  test_path: data/youcook/val_list.txt
  trainval_annotation: data/youcook/youcookii_annotations_trainval.json
  vfeat_dir: data/feat/feat_youcook_s3d
  text_processor: NLGTextProcessor
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/mtm/vlm/youcookcap/checkpoint_best.pt
model:
  model_cls: MMFusionNLG
  mm_encoder_cls: MMBertForNLG
  max_decode_length: 24
  use_seg_emb: true
eval:
  save_path: runs/mtm/vlm/youcookcap/eval
metric: NLGMetric
predictor: NLGPredictor
gen_param:
  num_beams: 5
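`max_decode_length: 24` and `gen_param.num_beams: 5` are standard generation knobs: captions are decoded with 5-way beam search, capped at 24 tokens. As an illustration of what they control, here is a sketch using a Hugging Face seq2seq model as a stand-in, since MMFusionNLG's decoder is not shown in this commit.

```python
# Illustration of the two generation knobs above, using a Hugging Face
# model as a stand-in for MMFusionNLG (which decodes from video + text).
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

inputs = tok("summarize: melt the butter in a pan", return_tensors="pt")
out = model.generate(
    **inputs,
    num_beams=5,    # gen_param.num_beams
    max_length=24,  # model.max_decode_length
)
print(tok.decode(out[0], skip_special_tokens=True))
```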
---
dataset:
  video_processor: VideoProcessor
  bert_name: bert-base-uncased
  meta_processor: MSRVTTMetaProcessor
  train_path: data/msrvtt/MSRVTT_train.csv
  jsfusion_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  full_test_path: data/msrvtt/MSRVTT_FULL_test.csv
  dup: 20
  val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTTextProcessor
  json_path: data/msrvtt/MSRVTT_data.json
  aligner: DSAligner
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 256
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 10
  checkpoint:
    restore_file: runs/mtm/vlm/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/mtm/vlm/vtt
task_type: sweep_small
model:
  model_cls: MMFusionJoint
  mm_encoder_cls: MMBertForJoint
  use_seg_emb: true
loss:
  loss_cls: T2VContraLoss
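`T2VContraLoss` names a text-to-video contrastive objective. Below is a minimal sketch of that loss family, assuming the usual in-batch formulation; MMPT's exact implementation may differ. The `V2TContraLoss` used in the vttqa config that follows would simply score in the opposite direction.

```python
# Minimal in-batch contrastive sketch consistent with the T2VContraLoss
# name: each text should rank its own video above the other videos in
# the batch. An assumption about the loss family, not MMPT's exact code.
import torch
import torch.nn.functional as F

def t2v_contra_loss(text_emb: torch.Tensor, video_emb: torch.Tensor) -> torch.Tensor:
    """text_emb, video_emb: (batch, dim); row i of each is a matched pair."""
    logits = text_emb @ video_emb.t()   # (batch, batch) similarity matrix
    targets = torch.arange(logits.size(0), device=logits.device)
    # text -> video direction; V2TContraLoss would use logits.t() instead
    return F.cross_entropy(logits, targets)
```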
---
dataset:
  video_processor: VideoProcessor
  bert_name: bert-base-uncased
  meta_processor: MSRVTTMetaProcessor
  train_path: data/msrvtt/MSRVTT_train.csv
  dup: 20
  val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTTextProcessor
  json_path: data/msrvtt/MSRVTT_data.json
  aligner: DSAligner
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 128
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 5
  checkpoint:
    restore_file: runs/mtm/vlm/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/mtm/vlm/vttqa
task_type: sweep_small
model:
  model_cls: MMFusionJoint
  mm_encoder_cls: MMBertForJoint
  use_seg_emb: true
loss:
  loss_cls: V2TContraLoss
---
dataset:
  video_processor: YoucookVideoProcessor
  bert_name: bert-base-uncased
  meta_processor: YoucookMetaProcessor
  train_path: data/youcook/youcook_train.pkl
  val_path: data/youcook/youcook_val.pkl
  trainval_annotation: data/youcook/youcookii_annotations_trainval.json
  use_annotation_text: true
  vfeat_dir: data/feat/feat_youcook_s3d
  text_processor: TextProcessor
  aligner: DSAligner
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 128
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 10
  checkpoint:
    restore_file: runs/mtm/vlm/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/mtm/vlm/youcook
task_type: sweep_small
model:
  model_cls: MMFusionJoint
  mm_encoder_cls: MMBertForJoint
  use_seg_emb: true
loss:
  loss_cls: T2VContraLoss
---
dataset:
  video_processor: YoucookVideoProcessor
  bert_name: bert-base-uncased
  meta_processor: YoucookNLGMetaProcessor
  train_path: data/youcook/train_list.txt
  val_path: data/youcook/val_list.txt
  trainval_annotation: data/youcook/youcookii_annotations_trainval.json
  vfeat_dir: data/feat/feat_youcook_s3d
  text_processor: NLGTextProcessor
  aligner: DSNLGAligner
  max_video_len: 32
  max_len: 96
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 128
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 10
  checkpoint:
    restore_file: runs/mtm/vlm/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/mtm/vlm/youcookcap
task_type: sweep_small
model:
  model_cls: MMFusionNLG
  mm_encoder_cls: MMBertForNLG
  use_seg_emb: true
loss:
  loss_cls: NLGLoss
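All of the finetuning configs above request `lr_scheduler: polynomial_decay` with `warmup_updates: 122` and a very large `total_num_update`, so the schedule is effectively a short linear warmup followed by a nearly flat decay. A sketch of that shape follows; fairseq's scheduler has more options (end learning rate, power), defaulted here.

```python
# Sketch of polynomial decay with linear warmup, matching the numbers in
# the finetuning configs above (lr 5e-5, warmup_updates 122,
# total_num_update 1000000). Shape only; not fairseq's exact code.
def polynomial_decay_lr(step: int, base_lr: float = 5.0e-05,
                        warmup: int = 122, total: int = 1_000_000,
                        end_lr: float = 0.0, power: float = 1.0) -> float:
    if step < warmup:
        return base_lr * step / max(1, warmup)    # linear warmup
    frac = (total - step) / (total - warmup)      # fraction of updates left
    return (base_lr - end_lr) * frac ** power + end_lr

print(polynomial_decay_lr(61))       # ~2.5e-05, halfway through warmup
print(polynomial_decay_lr(100_000))  # ~4.5e-05, decay is nearly flat
```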
---
includes: projects/retri/videoretri.yaml
project_dir: retri/videoclip
task_group:
  pretrain:
    model:
      model_cls: MMFusionSeparate
      mm_encoder_cls:
      video_encoder_cls: MMBertForEncoder
      text_encoder_cls: BertModel
      num_hidden_video_layers: 6
---
dataset:
  video_processor: VideoProcessor
  bert_name: bert-base-uncased
  meta_processor: COINActionSegmentationMetaProcessor
  train_path: data/coin/COIN.json
  val_path: data/coin/COIN.json
  vfeat_dir: data/feat/feat_coin_s3d
  text_processor: COINActionSegmentationTextProcessor
  aligner: COINActionSegmentationAligner
  num_iso_layer: 12
  sliding_window: 8
  sliding_window_size: 32
  max_video_len: 32
  max_len: 96
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 1
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 8
  checkpoint:
    restore_file: runs/retri/videoclip/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/retri/videoclip/coin
task_type: sweep_big
model:
  model_cls: MMFusionSeparateActionSegmentation
  mm_encoder_cls: null
  video_encoder_cls: MMBertForTokenClassification
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
loss:
  loss_cls: CrossEntropy
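`sliding_window: 8` and `sliding_window_size: 32` describe how long COIN videos are cut into overlapping 32-feature windows with stride 8 for segmentation. A sketch of that slicing; illustrative only, not MMPT's dataloader.

```python
# Illustrative sliding-window slicing: size-32 windows with stride 8, so
# most positions are covered by several windows whose per-frame logits
# can be aggregated. Not MMPT's actual dataloader.
def sliding_windows(num_frames: int, size: int = 32, stride: int = 8):
    starts = range(0, max(1, num_frames - size + stride), stride)
    return [(s, min(s + size, num_frames)) for s in starts]

print(sliding_windows(70))
# [(0, 32), (8, 40), (16, 48), (24, 56), (32, 64), (40, 70)]
```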
---
dataset:
  video_processor: CrossTaskVideoProcessor
  bert_name: bert-base-uncased
  meta_processor: CrossTaskMetaProcessor
  train_path: data/crosstask/crosstask_release/videos.csv
  train_csv_path: data/crosstask/crosstask_release/videos.csv
  val_path: data/crosstask/crosstask_release/videos_val.csv
  val_csv_path: data/crosstask/crosstask_release/videos_val.csv
  primary_path: data/crosstask/crosstask_release/tasks_primary.txt
  related_path: data/crosstask/crosstask_release/tasks_related.txt
  vfeat_dir: data/feat/feat_crosstask_s3d
  annotation_path: data/crosstask/crosstask_release/annotations
  n_train: 30
  text_processor: CrossTaskTextProcessor
  aligner: CrossTaskAligner
  num_iso_layer: 12
  sliding_window: 16
  sliding_window_size: 32
  max_video_len: 32
  max_len: 96
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 1
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 5
  checkpoint:
    restore_file: runs/retri/videoclip/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/retri/videoclip/crosstask
task_type: sweep_small
model:
  model_cls: MMFusionSeparateActionLocalization
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
loss:
  loss_cls: BCE
---
dataset:
  video_processor: ShardedVideoRetriVideoProcessor
  bert_name: bert-base-uncased
  meta_processor: ShardedHow2VideoRetriMetaProcessor
  train_path: data/how2/how2_s3d_train.lst
  val_path: data/how2/how2_s3d_val.lst
  vfeat_dir: data/feat/feat_how2_s3d_shard_small
  text_processor: ShardedVideoRetriTextProcessor
  tfeat_dir: data/feat/feat_how2_s3d_shard_small/raw_caption_dedup.bert-base-uncased.
  aligner: VideoRetriOverlappedAligner
  subsampling: 1
  sampled_min_len: 8
  sampled_max_len: 64
  max_video_len: 32
  max_len: 96
  lazy_vfeat_mask: true
  mfm_probability: 0.15
  mlm_probability: 0.15
  mm_prob: 0.5
  sampled_video_min_len: 3
  sampled_video_max_len: 32
  num_video_per_batch: 32
  clip_per_video: 16
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 1
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 1000
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 25
  checkpoint:
    save_dir: runs/retri/videoclip
    save_interval_updates: 1024
    keep_interval_updates: 2
    keep_last_epochs: 30
task_type: sweep_big
slurm_config: big
eval:
  save_path: runs/retri/videoclip
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
loss:
  loss_cls: MMContraLoss
task: VideoRetriTask
retri_epoch: 1
vectorpool_cls: VideoVectorPool
retriever_cls: VectorRetriever
num_cands: 64
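`MMContraLoss` is the pretraining objective here; by its name it plausibly combines both retrieval directions over the in-batch clip-text pairs. A hedged sketch of such a symmetric contrastive loss, extending the one-directional sketch shown earlier; this is an assumption about the loss family, not MMPT's exact code.

```python
# Symmetric video<->text contrastive sketch consistent with the
# MMContraLoss name: sum of text->video and video->text cross-entropy
# over in-batch similarities. An assumption, not MMPT's exact code.
import torch
import torch.nn.functional as F

def mm_contra_loss(text_emb: torch.Tensor, video_emb: torch.Tensor) -> torch.Tensor:
    """text_emb, video_emb: (batch, dim); row i of each is a matched pair."""
    logits = text_emb @ video_emb.t()
    targets = torch.arange(logits.size(0), device=logits.device)
    return F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets)
```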
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: COINActionSegmentationAligner
  bert_name: bert-base-uncased
  test_path: data/coin/COIN.json
  meta_processor: COINActionSegmentationMetaProcessor
  vfeat_dir: data/feat/feat_coin_s3d
  text_processor: COINActionSegmentationTextProcessor
  num_iso_layer: 12
  sliding_window: 16
  sliding_window_size: 32
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 1
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/coin/checkpoint_best.pt
model:
  model_cls: MMFusionSeparateActionSegmentation
  mm_encoder_cls: null
  video_encoder_cls: MMBertForTokenClassification
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/coin/eval
metric: COINActionSegmentationMetric
predictor: COINPredictor
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: COINActionSegmentationAligner
  bert_name: bert-base-uncased
  test_path: data/coin/COIN.json
  meta_processor: COINActionSegmentationMetaProcessor
  vfeat_dir: data/feat/feat_coin_s3d
  text_processor: COINActionSegmentationTextProcessor
  num_iso_layer: 12
  sliding_window: 16
  sliding_window_size: 32
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 1
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/checkpoint_best.pt
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/coin_zs/eval
metric: COINActionSegmentationMetric
predictor: COINZSPredictor
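Unlike the previous block, this one evaluates the pretrained checkpoint directly (`runs/retri/videoclip/checkpoint_best.pt`, no COIN finetuning) via `COINZSPredictor`. A hedged sketch of the zero-shot protocol that name suggests: score each video window against text embeddings of the step labels and take the argmax.

```python
# Hedged sketch of zero-shot action segmentation: label every sliding
# window with its nearest step-label text embedding. Placeholder tensors
# stand in for the MMPT encoders; only the scoring rule is illustrated.
import torch

def zero_shot_labels(window_embs: torch.Tensor, label_embs: torch.Tensor) -> torch.Tensor:
    """window_embs: (num_windows, dim); label_embs: (num_labels, dim)."""
    scores = window_embs @ label_embs.t()  # (num_windows, num_labels)
    return scores.argmax(dim=1)            # one predicted label per window
```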
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: CrossTaskVideoProcessor
  aligner: CrossTaskAligner
  bert_name: bert-base-uncased
  meta_processor: CrossTaskMetaProcessor
  test_path: data/crosstask/crosstask_release/videos_val.csv
  train_csv_path: data/crosstask/crosstask_release/videos.csv
  val_path: data/crosstask/crosstask_release/videos_val.csv
  val_csv_path: data/crosstask/crosstask_release/videos_val.csv
  primary_path: data/crosstask/crosstask_release/tasks_primary.txt
  related_path: data/crosstask/crosstask_release/tasks_related.txt
  vfeat_dir: data/feat/feat_crosstask_s3d
  annotation_path: data/crosstask/crosstask_release/annotations
  n_train: 30
  text_processor: CrossTaskTextProcessor
  num_iso_layer: 12
  sliding_window: 16
  sliding_window_size: 32
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 1
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/crosstask/checkpoint_best.pt
model:
  model_cls: MMFusionSeparateActionLocalization
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/crosstask/eval
metric: CrossTaskMetric
predictor: CrossTaskPredictor
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: CrossTaskVideoProcessor
  aligner: CrossTaskAligner
  bert_name: bert-base-uncased
  meta_processor: CrossTaskMetaProcessor
  test_path: data/crosstask/crosstask_release/videos_val.csv
  train_csv_path: data/crosstask/crosstask_release/videos.csv
  val_path: data/crosstask/crosstask_release/videos_val.csv
  val_csv_path: data/crosstask/crosstask_release/videos_val.csv
  primary_path: data/crosstask/crosstask_release/tasks_primary.txt
  related_path: data/crosstask/crosstask_release/tasks_related.txt
  vfeat_dir: data/feat/feat_crosstask_s3d
  annotation_path: data/crosstask/crosstask_release/annotations
  n_train: 30
  text_processor: CrossTaskTextProcessor
  num_iso_layer: 12
  sliding_window: 16
  sliding_window_size: 32
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 1
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/checkpoint_best.pt
model:
  model_cls: MMFusionSeparateActionLocalization
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/crosstask_zs/eval
metric: CrossTaskMetric
predictor: CrossTaskPredictor
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: DiDeMoAligner
  bert_name: bert-base-uncased
  meta_processor: DiDeMoMetaProcessor
  test_path: data/didemo/test_data.json
  vfeat_dir: data/feat/feat_didemo_s3d
  text_processor: DiDeMoTextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/checkpoint_best.pt
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/didemo_zs/eval
metric: DiDeMoMetric
predictor: DiDeMoPredictor
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: DSAligner
  bert_name: bert-base-uncased
  meta_processor: MSRVTTMetaProcessor
  test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTTextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/vtt/checkpoint_last.pt
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/vtt/eval
metric: RetrievalMetric
predictor: RetrievalPredictor
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: DSAligner
  bert_name: bert-base-uncased
  meta_processor: MSRVTTMetaProcessor
  test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTTextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/checkpoint_best.pt
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/vtt_zs/eval
metric: RetrievalMetric
predictor: RetrievalPredictor
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: MSRVTTQAAligner
  bert_name: bert-base-uncased
  meta_processor: MSRVTTQAMetaProcessor
  test_path: data/msrvtt-qa/MSR_MC_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTQATextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/vttqa/checkpoint_last.pt
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/vttqa/eval
metric: QAMetric
predictor: QAPredictor
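Several of the blocks above end in `RetrievalMetric`; for JSFusion-style 1k-video test sets that conventionally means text-to-video R@1/R@5/R@10 and median rank over a similarity matrix whose ground-truth pairs lie on the diagonal. A sketch of that metric family follows; an assumption about what `RetrievalMetric` reports, not MMPT's exact implementation.

```python
# Sketch of standard text->video retrieval metrics over a similarity
# matrix whose ground-truth pairs lie on the diagonal. An assumption
# about what RetrievalMetric reports, not MMPT's exact code.
import numpy as np

def retrieval_metrics(sim: np.ndarray) -> dict:
    """sim: (num_texts, num_videos); sim[i, i] is the true pair."""
    order = (-sim).argsort(axis=1)                     # best match first
    ranks = np.where(order == np.arange(len(sim))[:, None])[1]
    out = {f"R@{k}": float((ranks < k).mean()) for k in (1, 5, 10)}
    out["MedR"] = float(np.median(ranks) + 1)          # 1-based median rank
    return out

print(retrieval_metrics(np.eye(10)))  # perfect retrieval: R@1 == 1.0
```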