Commit 72f5785f authored by huaerkl

v1.0
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: MSRVTTQAAligner
  bert_name: bert-base-uncased
  meta_processor: MSRVTTQAMetaProcessor
  test_path: data/msrvtt-qa/MSR_MC_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTQATextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/checkpoint_best.pt
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/vttqa_zs/eval
metric: QAMetric
predictor: QAPredictor
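The string-valued fields above (`video_processor`, `text_processor`, `metric`, `predictor`, ...) name Python classes that the MMPT runner looks up and instantiates. A minimal sketch of that pattern, assuming a hypothetical registry (`PROCESSORS` and the toy classes below are illustrative, not MMPT's actual loader):

```python
# Illustrative sketch of config-driven class lookup. The registry and the
# constructor signatures are hypothetical stand-ins for MMPT's own classes.
class VideoProcessor:
    def __init__(self, vfeat_dir):
        self.vfeat_dir = vfeat_dir

class MSRVTTQATextProcessor:
    def __init__(self, bert_name):
        self.bert_name = bert_name

PROCESSORS = {
    "VideoProcessor": VideoProcessor,
    "MSRVTTQATextProcessor": MSRVTTQATextProcessor,
}

def build_from_config(dataset_cfg):
    """Resolve class names from a `dataset` section into instances."""
    video_proc = PROCESSORS[dataset_cfg["video_processor"]](dataset_cfg["vfeat_dir"])
    text_proc = PROCESSORS[dataset_cfg["text_processor"]](dataset_cfg["bert_name"])
    return video_proc, text_proc

video_proc, text_proc = build_from_config({
    "video_processor": "VideoProcessor",
    "vfeat_dir": "data/feat/feat_vtt_s3d",
    "text_processor": "MSRVTTQATextProcessor",
    "bert_name": "bert-base-uncased",
})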
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: YoucookVideoProcessor
  aligner: DSAligner
  bert_name: bert-base-uncased
  meta_processor: YoucookMetaProcessor
  test_path: data/youcook/youcook_val.pkl
  trainval_annotation: data/youcook/youcookii_annotations_trainval.json
  use_annotation_text: true
  vfeat_dir: data/feat/feat_youcook_s3d
  text_processor: TextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/youcook/checkpoint_last.pt
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/youcook/eval
metric: RetrievalMetric
predictor: RetrievalPredictor
---
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: YoucookVideoProcessor
  aligner: DSAligner
  bert_name: bert-base-uncased
  meta_processor: YoucookMetaProcessor
  test_path: data/youcook/youcook_val.pkl
  trainval_annotation: data/youcook/youcookii_annotations_trainval.json
  use_annotation_text: true
  vfeat_dir: data/feat/feat_youcook_s3d
  text_processor: TextProcessor
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
  common_eval:
    path: runs/retri/videoclip/checkpoint_best.pt
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/retri/videoclip/youcook_zs/eval
metric: RetrievalMetric
predictor: RetrievalPredictor
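`RetrievalMetric` scores text-to-video retrieval. A minimal sketch of the standard recall@k / median-rank computation from a text-by-video similarity matrix (illustrative only; the actual metric class may differ in detail):

```python
import numpy as np

def retrieval_metrics(sim):
    """sim[i, j]: similarity of text query i to video j. The matching video
    for query i is assumed to sit at index i (diagonal ground truth)."""
    order = np.argsort(-sim, axis=1)  # videos sorted best-first per query
    ranks = np.array([np.where(order[i] == i)[0][0] for i in range(sim.shape[0])])
    return {
        "R@1": float(np.mean(ranks < 1)),
        "R@5": float(np.mean(ranks < 5)),
        "R@10": float(np.mean(ranks < 10)),
        "MedR": float(np.median(ranks) + 1),  # 1-based median rank
    }

print(retrieval_metrics(np.random.rand(100, 100)))
```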
---
dataset:
  video_processor: VideoProcessor
  bert_name: bert-base-uncased
  meta_processor: MSRVTTMetaProcessor
  train_path: data/msrvtt/MSRVTT_train.csv
  jsfusion_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  full_test_path: data/msrvtt/MSRVTT_FULL_test.csv
  dup: 20
  val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTTextProcessor
  json_path: data/msrvtt/MSRVTT_data.json
  aligner: DSAligner
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 224
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 10
  checkpoint:
    restore_file: runs/retri/videoclip/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/retri/videoclip/vtt
task_type: sweep_small
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
loss:
  loss_cls: T2VContraLoss
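`loss_cls: T2VContraLoss` selects a text-to-video contrastive objective for fine-tuning. A minimal sketch of the usual InfoNCE form, assuming in-batch negatives (MMPT's actual loss may differ in normalization and temperature); `V2TContraLoss` in the next config is the same objective with the softmax taken over the transposed similarity matrix, i.e. the video-to-text direction:

```python
import torch
import torch.nn.functional as F

def t2v_contra_loss(text_emb, video_emb, temperature=1.0):
    """InfoNCE over in-batch negatives: for each text, the video at the same
    batch index is the positive and all other videos are negatives.
    Sketch only; not MMPT's exact implementation."""
    sim = text_emb @ video_emb.t() / temperature   # [B, B] similarities
    targets = torch.arange(sim.size(0))            # positives on the diagonal
    return F.cross_entropy(sim, targets)

# V2T is the same loss computed on sim.t() (softmax over texts per video).
text_emb = F.normalize(torch.randn(8, 768), dim=-1)
video_emb = F.normalize(torch.randn(8, 768), dim=-1)
print(t2v_contra_loss(text_emb, video_emb))
```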
---
dataset:
  video_processor: VideoProcessor
  bert_name: bert-base-uncased
  meta_processor: MSRVTTMetaProcessor
  train_path: data/msrvtt/MSRVTT_train.csv
  dup: 20
  val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv
  vfeat_dir: data/feat/feat_vtt_s3d
  text_processor: MSRVTTTextProcessor
  json_path: data/msrvtt/MSRVTT_data.json
  aligner: DSAligner
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 128
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 5
  checkpoint:
    restore_file: runs/retri/videoclip/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/retri/videoclip/vttqa
task_type: sweep_small
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
loss:
  loss_cls: V2TContraLoss
---
dataset:
  video_processor: YoucookVideoProcessor
  bert_name: bert-base-uncased
  meta_processor: YoucookMetaProcessor
  train_path: data/youcook/youcook_train.pkl
  val_path: data/youcook/youcook_val.pkl
  trainval_annotation: data/youcook/youcookii_annotations_trainval.json
  use_annotation_text: true
  vfeat_dir: data/feat/feat_youcook_s3d
  text_processor: TextProcessor
  aligner: DSAligner
  num_iso_layer: 12
  max_video_len: 32
  max_len: 96
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
    fp16: true
  dataset:
    num_workers: 4
    batch_size: 128
  optimization:
    lr:
    - 5.0e-05
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000
    warmup_updates: 122
    weight_decay: 0.0
    ddp_backend: no_c10d
    max_epoch: 10
  checkpoint:
    restore_file: runs/retri/videoclip/checkpoint_best.pt
    reset_optimizer: true
    reset_dataloader: true
    reset_meters: true
    save_dir: runs/retri/videoclip/youcook
task_type: sweep_small
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls: null
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
loss:
  loss_cls: T2VContraLoss
---
includes: projects/mfmmlm.yaml
project_dir: retri/videoretri
run_task:
  - how2.yaml
task_group:
  pretrain:
    task: VideoRetriTask
    retri_epoch: 1
    vectorpool_cls: VideoVectorPool
    retriever_cls: VectorRetriever
    num_cands: 64
    dataset:
      train_path: data/how2/how2_s3d_train.lst
      meta_processor: ShardedHow2VideoRetriMetaProcessor
      video_processor: ShardedVideoRetriVideoProcessor
      text_processor: ShardedVideoRetriTextProcessor
      aligner: VideoRetriOverlappedAligner
      sampled_video_min_len: 3
      sampled_video_max_len: 32
      sampled_min_len: 8
      sampled_max_len: 64
      num_video_per_batch: 32
      # do not use subsampling as it changes fairseq batch_size.
      subsampling: 1  # disable subsampling
      clip_per_video: 16
    fairseq:
      dataset:
        batch_size: 1
      optimization:
        max_epoch: 25
    model:
      model_cls: MMFusionShare
      mm_encoder_cls: MMBertForEncoder
    loss:
      loss_cls: MMContraLoss
  finetune:
    task_list: [vtt_videoclip.yaml, youcook_videoclip.yaml, vttqa_videoclip.yaml, crosstask_videoclip.yaml, coin_videoclip.yaml]
  test:
    task_list:
      - test_youcook_zs.yaml
      - test_vtt_zs.yaml
      - test_vttqa_zs.yaml
      - test_crosstask_zs_videoclip.yaml
      - test_coin_zs.yaml
      - test_didemo_zs.yaml
      - test_youcook_videoclip.yaml
      - test_vtt_videoclip.yaml
      - test_vttqa_videoclip.yaml
      - test_crosstask_videoclip.yaml
      - test_coin_videoclip.yaml
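This project file stitches the per-task yamls together: `pretrain` runs the retrieval-augmented `VideoRetriTask` over the sharded how2 feature lists, `finetune` sweeps the five downstream fine-tuning configs, and `test` evaluates both zero-shot (`*_zs`) and fine-tuned checkpoints. A project yaml like this is typically driven through MMPT's `locallaunch.py` (e.g. `python locallaunch.py projects/retri/videoclip.yaml ...`; the exact flags are an assumption about the surrounding tooling). The sketch below only inspects the config with OmegaConf:

```python
from omegaconf import OmegaConf

# Load the project yaml and list what each task group would run.
# The path assumes an MMPT-style checkout; adjust to your layout.
config = OmegaConf.load("projects/retri/videoclip.yaml")

print("pretrain task:", config.task_group.pretrain.task)
print("finetune yamls:", list(config.task_group.finetune.task_list))
print("test yamls:", list(config.task_group.test.task_list))
```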
---
includes: projects/task/ft.yaml
task_type: sweep_big
dataset:
  meta_processor: COINActionSegmentationMetaProcessor
  train_path: data/coin/COIN.json
  val_path: data/coin/COIN.json
  vfeat_dir: data/feat/feat_coin_s3d
  video_processor: VideoProcessor
  text_processor: COINActionSegmentationTextProcessor
  aligner: COINActionSegmentationAligner
  num_iso_layer: 12
  sliding_window: 8
  sliding_window_size: 32
model:
  model_cls: MMFusionActionSegmentation
  mm_encoder_cls: MMBertForTokenClassification
loss:
  loss_cls: CrossEntropy
fairseq:
  dataset:
    batch_size: 1
  optimization:
    max_epoch: 8
  checkpoint:
    save_dir: runs/task/coin
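For COIN action segmentation the video is processed in overlapping windows: `sliding_window_size: 32` features per window, advanced by `sliding_window: 8`. A small sketch of the window arithmetic (illustrative; the actual aligner also handles padding and overlap-averaging of logits):

```python
def window_starts(video_len, sliding_window, sliding_window_size):
    """Start offsets of overlapping windows covering `video_len` features."""
    starts = list(range(0, max(video_len - sliding_window_size, 0) + 1, sliding_window))
    # Add a final window so the tail of the video is still covered.
    if starts[-1] + sliding_window_size < video_len:
        starts.append(video_len - sliding_window_size)
    return starts

# A 100-feature video with stride 8 and window 32:
print(window_starts(100, 8, 32))   # [0, 8, 16, ..., 64, 68]
```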
---
includes: projects/task/coin.yaml
model:
  model_cls: MMFusionSeparateActionSegmentation
  mm_encoder_cls:
  video_encoder_cls: MMBertForTokenClassification
  text_encoder_cls: BertModel  # dummy, not used.
  num_hidden_video_layers: 6
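`includes` chains configs: this file includes `coin.yaml`, which includes `ft.yaml`, which includes `default.yaml` (both shown further below), and a file's own keys override what it includes. A minimal sketch of recursive include resolution with OmegaConf merging (a hypothetical helper assuming MMPT-like semantics; the real loader may resolve overrides differently):

```python
from omegaconf import OmegaConf

def load_with_includes(path):
    """Recursively resolve `includes:` so that the including file's keys
    override the included file's keys. Hypothetical helper, not MMPT's."""
    config = OmegaConf.load(path)
    if "includes" in config:
        base = load_with_includes(config.pop("includes"))
        config = OmegaConf.merge(base, config)  # child wins on conflicts
    return config

# e.g. coin_videoclip.yaml -> coin.yaml -> ft.yaml -> default.yaml
# config = load_with_includes("projects/task/coin_videoclip.yaml")
```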
---
includes: projects/task/ft.yaml
dataset:
  meta_processor: CrossTaskMetaProcessor
  train_path: data/crosstask/crosstask_release/videos.csv  # dummy
  train_csv_path: data/crosstask/crosstask_release/videos.csv
  val_path: data/crosstask/crosstask_release/videos_val.csv  # dummy
  val_csv_path: data/crosstask/crosstask_release/videos_val.csv
  primary_path: data/crosstask/crosstask_release/tasks_primary.txt
  related_path: data/crosstask/crosstask_release/tasks_related.txt
  vfeat_dir: data/feat/feat_crosstask_s3d
  annotation_path: data/crosstask/crosstask_release/annotations
  n_train: 30
  video_processor: CrossTaskVideoProcessor
  text_processor: CrossTaskTextProcessor
  aligner: CrossTaskAligner
  num_iso_layer: 12
  sliding_window: 16
  sliding_window_size: 32
model:
  model_cls: MMFusionActionLocalization
  mm_encoder_cls: MMBertForJoint
loss:
  loss_cls: BCE
fairseq:
  dataset:
    batch_size: 1
  optimization:
    max_epoch: 5
  checkpoint:
    save_dir: runs/task/crosstask
    restore_file: runs/task/checkpoint11.pt  # for VLM
---
includes: projects/task/crosstask.yaml
model:
  model_cls: MMFusionSeparateActionLocalization
  mm_encoder_cls:
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel  # dummy, not used.
  num_hidden_video_layers: 6
fairseq:
  checkpoint:
    restore_file: runs/task/checkpoint_best.pt  # overwrite the default of VLM.
---
# This yaml cannot be run alone; use a task yaml such as `how2.yaml` or `vtt.yaml` for training.
dataset:
  video_processor: VideoProcessor
  bert_name: bert-base-uncased
fairseq:
  common:
    tensorboard_logdir: run
    log_interval: 1000
  dataset:
    num_workers: 4
  optimization:
    lr: [ 0.00005 ]
    clip_norm: 2.0
    optimizer: adam
    adam_betas: (0.9, 0.98)
    lr_scheduler: polynomial_decay
    total_num_update: 1000000  # backward compatible on fairseq 1.0.0a0+af0389f for reproducibility.
    warmup_updates: 1000
    weight_decay: 0.0
    ddp_backend: no_c10d
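`polynomial_decay` warms the learning rate up linearly for `warmup_updates` steps, then decays it toward an end value over `total_num_update` steps. A sketch of the schedule, assuming fairseq's defaults of `end_lr=0` and `power=1` (i.e. linear decay):

```python
def polynomial_decay_lr(step, peak_lr=5e-05, warmup_updates=1000,
                        total_num_update=1000000, end_lr=0.0, power=1.0):
    """Linear warmup followed by polynomial decay, mirroring fairseq's
    polynomial_decay scheduler under the assumed defaults."""
    if step < warmup_updates:
        return peak_lr * step / warmup_updates
    if step >= total_num_update:
        return end_lr
    remaining = 1 - (step - warmup_updates) / (total_num_update - warmup_updates)
    return end_lr + (peak_lr - end_lr) * remaining ** power

for step in (0, 500, 1000, 500000, 1000000):
    print(step, polynomial_decay_lr(step))
```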
---
includes: projects/task/default.yaml
# All derived configs are run by fairseq-train.
task_type: sweep_small
fairseq:
  optimization:
    warmup_updates: 122  # copied from the RoBERTa GLUE recipe: https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.glue.md
  checkpoint:
    # save_interval_updates: 512
    # borrowed from the RoBERTa script.
    restore_file: runs/task/checkpoint_best.pt
    reset_optimizer: True
    reset_dataloader: True
    reset_meters: True
---
includes: projects/task/default.yaml
task_type: sweep_big
slurm_config: big
dataset:
  meta_processor: ShardedHow2MetaProcessor
  train_path: data/how2/how2_s3d_train.lst
  val_path: data/how2/how2_s3d_val.lst
  video_processor: ShardedVideoProcessor
  vfeat_dir: data/feat/feat_how2_s3d_shard_small
  text_processor: ShardedTextProcessor
  tfeat_dir: data/feat/feat_how2_s3d_shard_small/raw_caption_dedup.bert-base-uncased.
  aligner: FixedLenAligner
# disable direct running of this yaml
eval:
  save_path: runs/task
fairseq:
  checkpoint:
    save_dir: runs/task
    save_interval_updates: 1024
    keep_interval_updates: 2
    keep_last_epochs: 30
---
# This yaml cannot be run alone: implement a test_${dataset}.yaml that includes it.
slurm_config: big
task_type: local_predict
dataset:
  split: test
  video_processor: VideoProcessor
  aligner: DSAligner
  bert_name: bert-base-uncased
fairseq:
  dataset:
    batch_size: 256
    valid_subset: test
    num_workers: 2
---
includes: projects/task/test.yaml
dataset:
  split: test
  test_path: data/coin/COIN.json
  meta_processor: COINActionSegmentationMetaProcessor
  vfeat_dir: data/feat/feat_coin_s3d
  video_processor: VideoProcessor
  text_processor: COINActionSegmentationTextProcessor
  aligner: COINActionSegmentationAligner
  num_iso_layer: 12
  sliding_window: 16
  sliding_window_size: 32
model:
  model_cls: MMFusionActionSegmentation
  mm_encoder_cls: MMBertForTokenClassification
eval:
  save_path: runs/task/coin/eval
fairseq:
  dataset:
    batch_size: 1
  common_eval:
    path: runs/task/coin/checkpoint_best.pt
metric: COINActionSegmentationMetric
predictor: COINPredictor
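`COINActionSegmentationMetric` evaluates dense per-frame predictions; frame accuracy (the fraction of feature frames whose predicted step label matches the annotation) is the standard COIN segmentation score. A minimal sketch under that assumption (illustrative; the actual metric class may report more than this):

```python
import numpy as np

def frame_accuracy(pred_labels, gold_labels):
    """Fraction of frames whose predicted action/step label is correct."""
    pred = np.asarray(pred_labels)
    gold = np.asarray(gold_labels)
    return float((pred == gold).mean())

# 0 commonly denotes the background / "no step" class in such labelings.
print(frame_accuracy([0, 3, 3, 7, 0], [0, 3, 4, 7, 0]))  # 0.8
```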
---
includes: projects/task/test_coin.yaml
model:
  model_cls: MMFusionSeparateActionSegmentation
  mm_encoder_cls:
  video_encoder_cls: MMBertForTokenClassification
  text_encoder_cls: BertModel  # dummy, not used.
  num_hidden_video_layers: 6
---
includes: projects/task/test_coin.yaml
model:
  model_cls: MMFusionSeparate
  mm_encoder_cls:
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel
  num_hidden_video_layers: 6
eval:
  save_path: runs/task/coin_zs/eval
fairseq:
  common_eval:
    path: runs/task/checkpoint_best.pt
predictor: COINZSPredictor
---
includes: projects/task/test.yaml
dataset:
  split: test
  meta_processor: CrossTaskMetaProcessor
  test_path: data/crosstask/crosstask_release/videos_val.csv
  train_csv_path: data/crosstask/crosstask_release/videos.csv
  val_path: data/crosstask/crosstask_release/videos_val.csv  # dummy
  val_csv_path: data/crosstask/crosstask_release/videos_val.csv
  primary_path: data/crosstask/crosstask_release/tasks_primary.txt
  related_path: data/crosstask/crosstask_release/tasks_related.txt
  vfeat_dir: data/feat/feat_crosstask_s3d
  annotation_path: data/crosstask/crosstask_release/annotations
  n_train: 30
  video_processor: CrossTaskVideoProcessor
  text_processor: CrossTaskTextProcessor
  aligner: CrossTaskAligner
  num_iso_layer: 12
  sliding_window: 16
  sliding_window_size: 32
model:
  model_cls: MMFusionActionLocalization
  mm_encoder_cls: MMBertForJoint
eval:
  save_path: runs/task/crosstask/eval
fairseq:
  # read the code to find the checkpoint argument.
  dataset:
    batch_size: 1
  common_eval:
    path: runs/task/crosstask/checkpoint_best.pt
metric: CrossTaskMetric
predictor: CrossTaskPredictor
---
includes: projects/task/test_crosstask.yaml
model:
  model_cls: MMFusionSeparateActionLocalization
  mm_encoder_cls:
  video_encoder_cls: MMBertForEncoder
  text_encoder_cls: BertModel  # dummy, not used.
  num_hidden_video_layers: 6