From 1a79eae34d9e5ea2e984b1036def004376913878 Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Thu, 24 Feb 2022 21:10:01 -0800 Subject: [PATCH 01/54] longformer --- official/projects/longformer/README.md | 30 + .../longformer/experiments/glue_mnli.yaml | 47 + .../experiments/glue_mnli_allenai.yaml | 48 + .../experiments/pretraining_512.yaml | 64 + official/projects/longformer/longformer.py | 67 + .../longformer/longformer_attention.py | 1013 +++++++++++ .../longformer/longformer_attention_test.py | 256 +++ .../projects/longformer/longformer_encoder.py | 394 ++++ .../longformer/longformer_encoder_block.py | 358 ++++ .../longformer/longformer_encoder_test.py | 84 + .../longformer/longformer_experiments.py | 116 ++ ...ence_prediction_with_checkpoint_convert.py | 389 ++++ official/projects/longformer/train.py | 69 + ...form_longformer_tokenized_into_tfrecord.py | 1592 +++++++++++++++++ ...ters_from_pretrained_pytorch_checkpoint.py | 9 + .../utils/longformer_tokenizer_to_tfrecord.py | 93 + 16 files changed, 4629 insertions(+) create mode 100644 official/projects/longformer/README.md create mode 100644 official/projects/longformer/experiments/glue_mnli.yaml create mode 100644 official/projects/longformer/experiments/glue_mnli_allenai.yaml create mode 100644 official/projects/longformer/experiments/pretraining_512.yaml create mode 100644 official/projects/longformer/longformer.py create mode 100644 official/projects/longformer/longformer_attention.py create mode 100644 official/projects/longformer/longformer_attention_test.py create mode 100644 official/projects/longformer/longformer_encoder.py create mode 100644 official/projects/longformer/longformer_encoder_block.py create mode 100644 official/projects/longformer/longformer_encoder_test.py create mode 100644 official/projects/longformer/longformer_experiments.py create mode 100644 official/projects/longformer/sentence_prediction_with_checkpoint_convert.py create mode 100644 official/projects/longformer/train.py create mode 100644 official/projects/longformer/transform_longformer_tokenized_into_tfrecord.py create mode 100644 official/projects/longformer/utils/get_parameters_from_pretrained_pytorch_checkpoint.py create mode 100644 official/projects/longformer/utils/longformer_tokenizer_to_tfrecord.py diff --git a/official/projects/longformer/README.md b/official/projects/longformer/README.md new file mode 100644 index 000000000..48a97745f --- /dev/null +++ b/official/projects/longformer/README.md @@ -0,0 +1,30 @@ +# Longformer: The Long-Document Transformer + +## Modifications from Huggingface's Implementation +All models require a `global_attention_size` to be specified in the config, +which applies global attention to the first `global_attention_size` tokens of every sentence. +Per-sentence global attention sizes are not supported. +This restriction allows running on TPUs, where tensor sizes have to be statically determined. + +`_get_global_attn_indices` in `longformer_attention.py` shows how the new global attention indices are specified. +All `tf.cond` calls were changed to plain `if` conditions, since global attention is now specified up front. + +`sentence_prediction_with_checkpoint_convert.py` now contains an `initial_parameters_from_pk` parameter that +specifies a `.pk` file containing all pre-trained weights from a PyTorch Longformer, which can be loaded into the +TF model. +The `.pk` file can be generated with `utils/get_parameters_from_pretrained_pytorch_checkpoint.py`.
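As an illustration of the behavior described above, here is a minimal sketch of the fixed-size global attention mask this setting implies. It is not code from this patch; the helper name and signature are invented for illustration:

```python
import tensorflow as tf


def build_global_attention_mask(batch_size, seq_length, global_attention_size):
  """Marks the first `global_attention_size` tokens of every example as global.

  Because the same number of leading tokens is global for every example, the
  mask shape is static, which is what makes this formulation TPU-friendly.
  """
  positions = tf.range(seq_length)                           # [seq_length]
  is_global = positions < global_attention_size              # [seq_length], bool
  return tf.tile(is_global[tf.newaxis, :], [batch_size, 1])  # [batch, seq_length]


# With global_attention_size=1 (as in the GLUE configs in this patch), only the
# first token (e.g. the CLS/<s> token) receives global attention.
mask = build_global_attention_mask(batch_size=2, seq_length=8, global_attention_size=1)
```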
+There is also a `longformer_tokenizer_to_tfrecord.py` that transforms PyTorch Longformer tokenized data into TFRecords. + +## Running +```bash +python utils/get_parameters_from_pretrained_pytorch_checkpoint.py +TRAIN_DATA=task.train_data.input_path=gs://model-garden-ucsd-zihan/longformer_allenai_mnli_train.tf_record,task.validation_data.input_path=gs://model-garden-ucsd-zihan/longformer_allenai_mnli_eval.tf_record +PYTHONPATH=/path/to/model/garden \ + python3 train.py \ + --experiment=longformer/glue \ + --config_file=experiments/glue_mnli_allenai.yaml \ + --params_override="${TRAIN_DATA},runtime.distribution_strategy=tpu,task.initial_parameters_from_pk=allenai_longformer-base-4096.pk" \ + --tpu=local \ + --model_dir=/path/to/outputdir \ + --mode=train_and_eval +``` diff --git a/official/projects/longformer/experiments/glue_mnli.yaml b/official/projects/longformer/experiments/glue_mnli.yaml new file mode 100644 index 000000000..7c5540cfe --- /dev/null +++ b/official/projects/longformer/experiments/glue_mnli.yaml @@ -0,0 +1,47 @@ +task: + hub_module_url: '' + model: + num_classes: 3 + encoder: + type: any + any: + max_position_embeddings: 512 + attention_window: [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32] + global_attention_size: 1 + metric_type: 'accuracy' + train_data: + drop_remainder: true + global_batch_size: 32 + input_path: TODO + is_training: true + seq_length: 128 + validation_data: + drop_remainder: true + global_batch_size: 32 + input_path: TODO + is_training: false + seq_length: 128 +trainer: + checkpoint_interval: 1000 + continuous_eval_timeout: 7200 + optimizer_config: + learning_rate: + polynomial: + decay_steps: 61359 + end_learning_rate: 0.0 + initial_learning_rate: 3.0e-05 + power: 1.0 + type: polynomial + optimizer: + type: adamw + warmup: + polynomial: + power: 1 + warmup_steps: 6136 + type: polynomial + steps_per_loop: 100 + summary_interval: 100 + # Training data size 392,702 examples, 5 epochs. + train_steps: 61359 + validation_interval: 2000 + validation_steps: 307 diff --git a/official/projects/longformer/experiments/glue_mnli_allenai.yaml b/official/projects/longformer/experiments/glue_mnli_allenai.yaml new file mode 100644 index 000000000..c3495786d --- /dev/null +++ b/official/projects/longformer/experiments/glue_mnli_allenai.yaml @@ -0,0 +1,48 @@ +task: + hub_module_url: '' + model: + num_classes: 3 + encoder: + type: any + any: + max_position_embeddings: 4098 + attention_window: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128] + global_attention_size: 1 + vocab_size: 50265 + metric_type: 'accuracy' + train_data: + drop_remainder: true + global_batch_size: 32 + input_path: TODO + is_training: true + seq_length: 512 + validation_data: + drop_remainder: true + global_batch_size: 32 + input_path: TODO + is_training: false + seq_length: 512 +trainer: + checkpoint_interval: 1000 + continuous_eval_timeout: 7200 + optimizer_config: + learning_rate: + polynomial: + decay_steps: 61359 + end_learning_rate: 0.0 + initial_learning_rate: 3.0e-05 + power: 1.0 + type: polynomial + optimizer: + type: adamw + warmup: + polynomial: + power: 1 + warmup_steps: 6136 + type: polynomial + steps_per_loop: 1000 + summary_interval: 1000 + # Training data size 392,702 examples, 5 epochs.
+ train_steps: 61359 + validation_interval: 2000 + validation_steps: 307 diff --git a/official/projects/longformer/experiments/pretraining_512.yaml b/official/projects/longformer/experiments/pretraining_512.yaml new file mode 100644 index 000000000..c535c79c3 --- /dev/null +++ b/official/projects/longformer/experiments/pretraining_512.yaml @@ -0,0 +1,64 @@ +task: + init_checkpoint: '' + model: + cls_heads: [{activation: tanh, cls_token_idx: 0, dropout_rate: 0.1, inner_dim: 768, name: next_sentence, num_classes: 2}] + encoder: + type: any + any: + attention_dropout_rate: 0.1 + dropout_rate: 0.1 + embedding_size: 768 + hidden_activation: gelu + hidden_size: 768 + initializer_range: 0.02 + intermediate_size: 3072 + max_position_embeddings: 512 + num_attention_heads: 12 + num_layers: 12 + type_vocab_size: 2 + vocab_size: 30522 + attention_window: [ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 ] + global_attention_size: 1 + train_data: + drop_remainder: true + global_batch_size: 256 + input_path: gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00000-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00001-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00002-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00003-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00004-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00005-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00006-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00007-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00008-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00009-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00010-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00011-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00012-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00013-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00014-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00015-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00016-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00017-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00018-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00019-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00020-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00021-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00022-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00023-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00024-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00025-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00026-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wiki
pedia.tfrecord-00027-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00028-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00029-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00030-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00031-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00032-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00033-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00034-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00035-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00036-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00037-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00038-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00039-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00040-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00041-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00042-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00043-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00044-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00045-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00046-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00047-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00048-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00049-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00050-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00051-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00052-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00053-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00054-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00055-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00056-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00057-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00058-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00059-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00060-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00061-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00062-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00063-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00064-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00065-of-00500,gs://tf_model_garden/nlp/data
/research_data/bert_pretrain/wikipedia.tfrecord-00066-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00067-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00068-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00069-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00070-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00071-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00072-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00073-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00074-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00075-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00076-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00077-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00078-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00079-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00080-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00081-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00082-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00083-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00084-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00085-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00086-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00087-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00088-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00089-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00090-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00091-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00092-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00093-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00094-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00095-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00096-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00097-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00098-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00099-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00100-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00101-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00102-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00103-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00104-of-00
500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00105-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00106-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00107-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00108-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00109-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00110-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00111-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00112-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00113-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00114-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00115-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00116-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00117-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00118-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00119-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00120-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00121-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00122-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00123-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00124-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00125-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00126-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00127-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00128-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00129-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00130-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00131-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00132-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00133-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00134-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00135-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00136-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00137-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00138-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00139-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00140-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00141-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00142-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretra
in/wikipedia.tfrecord-00143-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00144-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00145-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00146-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00147-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00148-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00149-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00150-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00151-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00152-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00153-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00154-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00155-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00156-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00157-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00158-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00159-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00160-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00161-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00162-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00163-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00164-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00165-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00166-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00167-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00168-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00169-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00170-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00171-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00172-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00173-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00174-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00175-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00176-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00177-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00178-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00179-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00180-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00181-of-00500,gs://tf_model_garden/n
lp/data/research_data/bert_pretrain/wikipedia.tfrecord-00182-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00183-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00184-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00185-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00186-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00187-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00188-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00189-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00190-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00191-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00192-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00193-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00194-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00195-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00196-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00197-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00198-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00199-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00200-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00201-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00202-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00203-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00204-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00205-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00206-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00207-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00208-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00209-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00210-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00211-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00212-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00213-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00214-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00215-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00216-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00217-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00218-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00219-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-0022
0-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00221-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00222-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00223-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00224-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00225-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00226-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00227-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00228-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00229-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00230-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00231-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00232-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00233-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00234-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00235-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00236-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00237-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00238-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00239-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00240-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00241-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00242-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00243-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00244-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00245-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00246-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00247-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00248-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00249-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00250-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00251-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00252-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00253-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00254-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00255-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00256-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00257-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00258-of-00500,gs://tf_model_garden/nlp/data/research_data/bert
_pretrain/wikipedia.tfrecord-00259-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00260-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00261-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00262-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00263-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00264-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00265-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00266-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00267-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00268-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00269-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00270-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00271-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00272-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00273-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00274-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00275-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00276-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00277-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00278-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00279-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00280-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00281-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00282-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00283-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00284-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00285-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00286-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00287-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00288-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00289-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00290-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00291-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00292-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00293-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00294-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00295-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00296-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00297-of-00500,gs://tf_model_g
arden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00298-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00299-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00300-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00301-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00302-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00303-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00304-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00305-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00306-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00307-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00308-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00309-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00310-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00311-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00312-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00313-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00314-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00315-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00316-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00317-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00318-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00319-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00320-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00321-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00322-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00323-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00324-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00325-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00326-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00327-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00328-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00329-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00330-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00331-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00332-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00333-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00334-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00335-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfreco
rd-00336-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00337-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00338-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00339-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00340-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00341-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00342-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00343-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00344-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00345-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00346-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00347-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00348-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00349-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00350-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00351-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00352-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00353-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00354-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00355-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00356-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00357-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00358-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00359-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00360-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00361-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00362-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00363-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00364-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00365-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00366-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00367-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00368-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00369-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00370-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00371-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00372-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00373-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00374-of-00500,gs://tf_model_garden/nlp/data/research_da
ta/bert_pretrain/wikipedia.tfrecord-00375-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00376-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00377-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00378-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00379-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00380-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00381-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00382-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00383-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00384-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00385-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00386-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00387-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00388-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00389-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00390-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00391-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00392-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00393-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00394-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00395-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00396-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00397-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00398-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00399-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00400-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00401-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00402-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00403-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00404-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00405-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00406-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00407-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00408-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00409-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00410-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00411-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00412-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00413-of-00500,gs://tf_
model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00414-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00415-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00416-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00417-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00418-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00419-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00420-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00421-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00422-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00423-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00424-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00425-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00426-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00427-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00428-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00429-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00430-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00431-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00432-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00433-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00434-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00435-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00436-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00437-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00438-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00439-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00440-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00441-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00442-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00443-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00444-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00445-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00446-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00447-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00448-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00449-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00450-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00451-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia
.tfrecord-00452-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00453-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00454-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00455-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00456-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00457-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00458-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00459-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00460-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00461-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00462-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00463-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00464-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00465-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00466-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00467-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00468-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00469-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00470-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00471-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00472-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00473-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00474-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00475-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00476-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00477-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00478-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00479-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00480-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00481-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00482-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00483-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00484-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00485-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00486-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00487-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00488-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00489-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00490-of-00500,gs://tf_model_garden/nlp/data/rese
arch_data/bert_pretrain/wikipedia.tfrecord-00491-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00492-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00493-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00494-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00495-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00496-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00497-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00498-of-00500,gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00499-of-00500 + + is_training: true + max_predictions_per_seq: 76 + seq_length: 512 + use_next_sentence_label: true + use_position_id: false + validation_data: + drop_remainder: true + global_batch_size: 256 + input_path: TODO + is_training: false + max_predictions_per_seq: 76 + seq_length: 512 + use_next_sentence_label: true + use_position_id: false +trainer: + checkpoint_interval: 20000 + max_to_keep: 5 + optimizer_config: + learning_rate: + polynomial: + cycle: false + decay_steps: 1000000 + end_learning_rate: 0.0 + initial_learning_rate: 0.0001 + power: 1.0 + type: polynomial + optimizer: + type: adamw + warmup: + polynomial: + power: 1 + warmup_steps: 10000 + type: polynomial + steps_per_loop: 50 + summary_interval: 50 + train_steps: 1000000 + validation_interval: 1000 + validation_steps: 64 diff --git a/official/projects/longformer/longformer.py b/official/projects/longformer/longformer.py new file mode 100644 index 000000000..1237eee1b --- /dev/null +++ b/official/projects/longformer/longformer.py @@ -0,0 +1,67 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Longformer model configurations and instantiation methods.""" +import dataclasses + +import gin +import tensorflow as tf + +from official.modeling import tf_utils +from official.modeling.hyperparams import base_config +from official.nlp.configs import encoders +from official.projects.longformer.longformer_encoder import LongformerEncoder +from typing import List + +@dataclasses.dataclass +class LongformerEncoderConfig(encoders.BertEncoderConfig): + '''Extra paramerters for Longformer configs + Args: + attention_window: list of ints representing the window size for each layer. + global_attention_size: the size of global attention used for each token. + ''' + attention_window: List[int] = dataclasses.field(default_factory=list) + global_attention_size: int = 0 + +@gin.configurable +@base_config.bind(LongformerEncoderConfig) +def get_encoder(encoder_cfg: LongformerEncoderConfig): + """Gets a 'LongformerEncoder' object. + + Args: + encoder_cfg: A 'LongformerEncoderConfig'. + + Returns: + A encoder object. 
+ """ + encoder = LongformerEncoder( + attention_window=encoder_cfg.attention_window, + global_attention_size=encoder_cfg.global_attention_size, + vocab_size=encoder_cfg.vocab_size, + hidden_size=encoder_cfg.hidden_size, + num_layers=encoder_cfg.num_layers, + num_attention_heads=encoder_cfg.num_attention_heads, + intermediate_size=encoder_cfg.intermediate_size, + activation=tf_utils.get_activation(encoder_cfg.hidden_activation), + dropout_rate=encoder_cfg.dropout_rate, + attention_dropout_rate=encoder_cfg.attention_dropout_rate, + max_sequence_length=encoder_cfg.max_position_embeddings, + type_vocab_size=encoder_cfg.type_vocab_size, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + output_range=encoder_cfg.output_range, + embedding_width=encoder_cfg.embedding_size, + norm_first=encoder_cfg.norm_first + ) + return encoder diff --git a/official/projects/longformer/longformer_attention.py b/official/projects/longformer/longformer_attention.py new file mode 100644 index 000000000..43e2ec0fc --- /dev/null +++ b/official/projects/longformer/longformer_attention.py @@ -0,0 +1,1013 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Longformer attention block. Modified From huggingface/transformers +""" + +# pylint: disable=g-classes-have-attributes + +import collections +import math +import string + +import tensorflow as tf + +import numpy as np +from keras import constraints +from keras import initializers +from keras import regularizers +from keras.engine.base_layer import Layer +from keras.layers import core +from keras.layers import einsum_dense +from keras.utils import tf_utils +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util.tf_export import keras_export +from typing import Dict, List, Optional, Union + +def shape_list(tensor: tf.Tensor) -> List[int]: + """ + Deal with dynamic shape in tensorflow cleanly. + + Args: + tensor (:obj:`tf.Tensor`): The tensor we want the shape of. + + Returns: + :obj:`List[int]`: The shape of the tensor as a list. + """ + dynamic = tf.shape(tensor) + + if tensor.shape == tf.TensorShape(None): + return dynamic + + static = tensor.shape.as_list() + + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + +_CHR_IDX = string.ascii_lowercase + +def _build_attention_equation(rank, attn_axes): + """Builds einsum equations for the attention computation. + Query, key, value inputs after projection are expected to have the shape as: + `(bs, <non-attention dims>, <attention dims>, num_heads, channels)`. + `bs` and `<non-attention dims>` are treated as `<batch dims>`. + The attention operations can be generalized: + (1) Query-key dot product: + `(<batch dims>, <query attention dims>, num_heads, channels), (<batch dims>, + <key attention dims>, num_heads, channels) -> (<batch dims>, + num_heads, <query attention dims>, <key attention dims>)` + (2) Combination: + `(<batch dims>, num_heads, <query attention dims>, <key attention dims>), + (<batch dims>, <value attention dims>, num_heads, channels) -> (<batch dims>, + <query attention dims>, num_heads, channels)` + Args: + rank: Rank of query, key, value tensors. + attn_axes: List/tuple of axes, `[-1, rank)`, + that attention will be applied to.
+ Returns: + Einsum equations. + """ + target_notation = _CHR_IDX[:rank] + # `batch_dims` includes the head dim. + batch_dims = tuple(np.delete(range(rank), attn_axes + (rank - 1,))) + letter_offset = rank + source_notation = "" + for i in range(rank): + if i in batch_dims or i == rank - 1: + source_notation += target_notation[i] + else: + source_notation += _CHR_IDX[letter_offset] + letter_offset += 1 + + product_notation = "".join([target_notation[i] for i in batch_dims] + + [target_notation[i] for i in attn_axes] + + [source_notation[i] for i in attn_axes]) + dot_product_equation = "%s,%s->%s" % (source_notation, target_notation, + product_notation) + attn_scores_rank = len(product_notation) + combine_equation = "%s,%s->%s" % (product_notation, source_notation, + target_notation) + return dot_product_equation, combine_equation, attn_scores_rank + + +def _build_proj_equation(free_dims, bound_dims, output_dims): + """Builds an einsum equation for projections inside multi-head attention.""" + input_str = "" + kernel_str = "" + output_str = "" + bias_axes = "" + letter_offset = 0 + for i in range(free_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + output_str += char + + letter_offset += free_dims + for i in range(bound_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + kernel_str += char + + letter_offset += bound_dims + for i in range(output_dims): + char = _CHR_IDX[i + letter_offset] + kernel_str += char + output_str += char + bias_axes += char + equation = "%s,%s->%s" % (input_str, kernel_str, output_str) + + return equation, bias_axes, len(output_str) + + +def _get_output_shape(output_rank, known_last_dims): + return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims) + +@tf.keras.utils.register_keras_serializable(package="Text") +class LongformerAttention(tf.keras.layers.MultiHeadAttention): + def __init__(self, + attention_window, + layer_id, + global_attention_size, + **kwargs): + super().__init__(**kwargs) + self._layer_id = layer_id + _attention_window = attention_window + assert ( + _attention_window % 2 == 0 + ), f"`attention_window` for layer {self._layer_id} has to be an even value. Given {attention_window}" + assert ( + _attention_window > 0 + ), f"`attention_window` for layer {self._layer_id} has to be positive. Given {attention_window}" + self._one_sided_attn_window_size = _attention_window // 2 + self.global_attention_size = global_attention_size + + def _build_from_signature(self, query, value, key=None): + """Builds layers and variables. + Once the method is called, self._built_from_signature will be set to True. + Args: + query: Query tensor or TensorShape. + value: Value tensor or TensorShape. + key: Key tensor or TensorShape. 
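+      Note: `call` always passes the same `hidden_states` tensor as query, key
+      and value, so in practice the three shapes are identical.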
+ """ + self._built_from_signature = True + if hasattr(query, "shape"): + self._query_shape = tf.TensorShape(query.shape) + else: + self._query_shape = tf.TensorShape(query) + if hasattr(value, "shape"): + self._value_shape = tf.TensorShape(value.shape) + else: + self._value_shape = tf.TensorShape(value) + if key is None: + self._key_shape = self._value_shape + elif hasattr(key, "shape"): + self._key_shape = tf.TensorShape(key.shape) + else: + self._key_shape = tf.TensorShape(key) + + common_kwargs = dict( + kernel_initializer=self._kernel_initializer, + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + # Any setup work performed only once should happen in an `init_scope` + # to avoid creating symbolic Tensors that will later pollute any eager + # operations. + with tf_utils.maybe_init_scope(self): + free_dims = self._query_shape.rank - 1 + einsum_equation, bias_axes, output_rank = _build_proj_equation( + free_dims, bound_dims=1, output_dims=2) + self._query_dense = einsum_dense.EinsumDense( + einsum_equation, + output_shape=_get_output_shape(output_rank - 1, + [self._num_heads, self._key_dim]), + bias_axes=bias_axes if self._use_bias else None, + name="query", + **common_kwargs) + self._global_query_dense = einsum_dense.EinsumDense( + einsum_equation, + output_shape=_get_output_shape(output_rank - 1, + [self._num_heads, self._key_dim]), + bias_axes=bias_axes if self._use_bias else None, + name="global_query", + **common_kwargs) + einsum_equation, bias_axes, output_rank = _build_proj_equation( + self._key_shape.rank - 1, bound_dims=1, output_dims=2) + self._key_dense = einsum_dense.EinsumDense( + einsum_equation, + output_shape=_get_output_shape(output_rank - 1, + [self._num_heads, self._key_dim]), + bias_axes=bias_axes if self._use_bias else None, + name="key", + **common_kwargs) + self._global_key_dense = einsum_dense.EinsumDense( + einsum_equation, + output_shape=_get_output_shape(output_rank - 1, + [self._num_heads, self._key_dim]), + bias_axes=bias_axes if self._use_bias else None, + name="global_key", + **common_kwargs) + einsum_equation, bias_axes, output_rank = _build_proj_equation( + self._value_shape.rank - 1, bound_dims=1, output_dims=2) + self._value_dense = einsum_dense.EinsumDense( + einsum_equation, + output_shape=_get_output_shape(output_rank - 1, + [self._num_heads, self._value_dim]), + bias_axes=bias_axes if self._use_bias else None, + name="value", + **common_kwargs) + self._global_value_dense = einsum_dense.EinsumDense( + einsum_equation, + output_shape=_get_output_shape(output_rank - 1, + [self._num_heads, self._value_dim]), + bias_axes=bias_axes if self._use_bias else None, + name="global_value", + **common_kwargs) + + # Builds the attention computations for multi-head dot product attention. + # These computations could be wrapped into the keras attention layer once + # it support mult-head einsum computations. 
+ self._build_attention(output_rank) + self._global_dropout_layer = core.Dropout(rate=self._dropout) + # self._output_dense = self._make_output_dense( + # free_dims, common_kwargs, "attention_output") + self._output_dense = tf.keras.layers.Dense( + units=self._num_heads * self._key_dim, name="dense", + **common_kwargs + ) + + def call(self, + hidden_states, + attention_mask=None, + is_index_masked=None, + is_index_global_attn=None, + is_global_attn=None, + training=None): + """Applies Longformer attention to `hidden_states`. + This method computes sliding-window (local) attention for every token and, + when `global_attention_size > 0`, additionally computes global attention for + the leading tokens of each sequence. + Args: + hidden_states: Input `Tensor` of shape `(B, T, hidden_dim)`. + attention_mask: Additive attention mask of shape `(B, T, 1, 1)` with zeros at + positions taking part in local attention and large negative values elsewhere. + is_index_masked: Boolean `Tensor` of shape `(B, T)` marking padded positions. + is_index_global_attn: Boolean `Tensor` of shape `(B, T)` marking tokens that + use global attention. + is_global_attn: Python boolean, whether any token uses global attention. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (doing nothing). + Returns: + attention_output: The attention output `Tensor` of shape + `(B, T, num_heads * key_dim)`. + """ + if not self._built_from_signature: + self._build_from_signature(query=hidden_states, value=hidden_states, key=hidden_states) + + # N = `num_attention_heads` + # H = `size_per_head` + # `query` = [B, T, N, H] + query = self._query_dense(hidden_states) + + # `key` = [B, S, N, H] + key = self._key_dense(hidden_states) + + # `value` = [B, S, N, H] + value = self._value_dense(hidden_states) + + # Note: Applying scalar multiply at the smaller end of einsum improves + # XLA performance, but may introduce slight numeric differences in + # the Transformer attention head. 
+ query = tf.multiply(query, 1.0 / math.sqrt(float(self._key_dim))) # (B, T, N, key_dim) + batch_size, seq_len, num_heads, head_dim = shape_list(query) + + # attn_probs = (batch_size, seq_len, num_heads, window*2+1) + attn_scores = self._sliding_chunks_query_key_matmul( + query, key, self._one_sided_attn_window_size + ) + + # diagonal mask with zeros everywhere and -inf inplace of padding + diagonal_mask = self._sliding_chunks_query_key_matmul( + tf.ones(shape_list(attention_mask)), + attention_mask, + self._one_sided_attn_window_size, + ) + + # pad local attention probs + attn_scores += diagonal_mask + + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(attn_scores), + [batch_size, seq_len, self._num_heads, self._one_sided_attn_window_size * 2 + 1], + message=f"attn_probs should be of size ({batch_size}, {seq_len}, {num_heads}, {self._one_sided_attn_window_size * 2 + 1}), but is of size {shape_list(attn_scores)}", + ) + + # compute global attn indices required through out forward fn + ( + max_num_global_attn_indices, + is_index_global_attn_nonzero, + is_local_index_global_attn_nonzero, + is_local_index_no_global_attn_nonzero, + ) = self._get_global_attn_indices(is_index_global_attn, self.global_attention_size) + # this function is only relevant for global attention + if self.global_attention_size > 0: + attn_scores = self._concat_with_global_key_attn_probs( + attn_scores=attn_scores, + query_vectors=query, + key_vectors=key, + max_num_global_attn_indices=max_num_global_attn_indices, + is_index_global_attn_nonzero=is_index_global_attn_nonzero, + is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, + is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, + ) + else: + pass + + attn_probs = tf.nn.softmax(attn_scores, axis=-1) + + # softmax sometimes inserts NaN if all positions are masked, replace them with 0 + # Make sure to create a mask with the proper shape: + # if is_global_attn==True => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1] + # if is_global_attn==False => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + 1] + if self.global_attention_size > 0: + masked_index = tf.tile( + is_index_masked[:, :, None, None], + (1, 1, self._num_heads, self._one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1), + ) + else: + masked_index = tf.tile( + is_index_masked[:, :, None, None], + (1, 1, self._num_heads, self._one_sided_attn_window_size * 2 + 1), + ) + + attn_probs = tf.where( + masked_index, + tf.zeros(shape_list(masked_index), dtype=attn_probs.dtype), + attn_probs, + ) + + layer_head_mask = None + if layer_head_mask is not None: + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(layer_head_mask), + [self._num_heads], + message=f"Head mask for a single layer should be of size {(self._num_heads)}, but is {shape_list(layer_head_mask)}", + ) + + attn_probs = tf.reshape(layer_head_mask, (1, 1, -1, 1)) * attn_probs + + # apply dropout + attn_probs = self._dropout_layer(attn_probs, training=training) + value_vectors = tf.reshape(value, (batch_size, seq_len, self._num_heads, self._key_dim)) # TODO: _key_dim == _value_dim + + # if global attention, compute sum of global and local attn + if self.global_attention_size > 0: + attn_output = self._compute_attn_output_with_global_indices( + value_vectors=value_vectors, + attn_probs=attn_probs, + max_num_global_attn_indices=max_num_global_attn_indices, + 
is_index_global_attn_nonzero=is_index_global_attn_nonzero, + is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, + ) + else: + attn_output = self._sliding_chunks_matmul_attn_probs_value( + attn_probs, value_vectors, self._one_sided_attn_window_size + ) + + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(attn_output), + [batch_size, seq_len, self._num_heads, head_dim], + message="Unexpected size", + ) + + attn_output = tf.reshape(attn_output, (batch_size, seq_len, self._num_heads * self._key_dim)) # FIXME + + # compute value for global attention and overwrite to attention output + # TODO: remove the redundant computation + if self.global_attention_size > 0: + attn_output, global_attn_probs = self._compute_global_attn_output_from_hidden( + attn_output=attn_output, + hidden_states=hidden_states, + max_num_global_attn_indices=max_num_global_attn_indices, + layer_head_mask=layer_head_mask, + is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, + is_index_global_attn_nonzero=is_index_global_attn_nonzero, + is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, + is_index_masked=is_index_masked, + training=training, + ) + else: + global_attn_probs = tf.zeros((batch_size, self._num_heads, max_num_global_attn_indices, seq_len)) + + # make sure that local attention probabilities are set to 0 for indices of global attn + # Make sure to create a mask with the proper shape: + # if is_global_attn==True => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1] + # if is_global_attn==False => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + 1] + if self.global_attention_size > 0: + masked_global_attn_index = tf.tile( + is_index_global_attn[:, :, None, None], + (1, 1, self._num_heads, self._one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1), + ) + else: + masked_global_attn_index = tf.tile( + is_index_global_attn[:, :, None, None], + (1, 1, self._num_heads, self._one_sided_attn_window_size * 2 + 1), + ) + + attn_probs = tf.where( + masked_global_attn_index, + tf.zeros(shape_list(masked_global_attn_index), dtype=attn_probs.dtype), + attn_probs, + ) + + # we can return extra information here + attention_output = attn_output # (attn_output, attn_probs, global_attn_probs) + + return attention_output + + def get_config(self): + config = { + "layer_id": self._layer_id, + "attention_window": self._one_sided_attn_window_size, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _sliding_chunks_query_key_matmul(self, query, key, window_overlap): + """ + Matrix multiplication of query and key tensors using with a sliding window attention pattern. This + implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an + overlap of size window_overlap + """ + batch_size, seq_len, num_heads, head_dim = shape_list(query) + + if tf.executing_eagerly(): + tf.debugging.assert_equal( + seq_len % (window_overlap * 2), + 0, + message=f"Sequence length should be multiple of {window_overlap * 2}. 
Given {seq_len}", + ) + tf.debugging.assert_equal( + shape_list(query), + shape_list(key), + message=f"Shape of query and key should be equal, but got query: {shape_list(query)} and key: {shape_list(key)}", + ) + + chunks_count = seq_len // window_overlap - 1 + + # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size window_overlap * 2 + query = tf.reshape( + tf.transpose(query, (0, 2, 1, 3)), + (batch_size * num_heads, seq_len, head_dim), + ) + key = tf.reshape(tf.transpose(key, (0, 2, 1, 3)), (batch_size * num_heads, seq_len, head_dim)) + chunked_query = self._chunk(query, window_overlap) + chunked_key = self._chunk(key, window_overlap) + + # matrix multiplication + # bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim + # bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim + # bcxy: batch_size * num_heads x chunks x 2window_overlap x 2window_overlap + chunked_query = tf.cast(chunked_query, dtype=chunked_key.dtype) + chunked_attention_scores = tf.einsum("bcxd,bcyd->bcxy", chunked_query, chunked_key) # multiply + + # convert diagonals into columns + paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 1], [0, 0]]) + diagonal_chunked_attention_scores = self._pad_and_transpose_last_two_dims(chunked_attention_scores, paddings) + + # allocate space for the overall attention matrix where the chunks are combined. The last dimension + # has (window_overlap * 2 + 1) columns. The first (window_overlap) columns are the window_overlap lower triangles (attention from a word to + # window_overlap previous words). The following column is attention score from each word to itself, then + # followed by window_overlap columns for the upper triangle. + + # copy parts from diagonal_chunked_attention_scores into the combined matrix of attentions + # - copying the main diagonal and the upper triangle + # TODO: This code is most likely not very efficient and should be improved + diagonal_attn_scores_up_triang = tf.concat( + [ + diagonal_chunked_attention_scores[:, :, :window_overlap, : window_overlap + 1], + diagonal_chunked_attention_scores[:, -1:, window_overlap:, : window_overlap + 1], + ], + axis=1, + ) + + # - copying the lower triangle + diagonal_attn_scores_low_triang = tf.concat( + [ + tf.zeros( + (batch_size * num_heads, 1, window_overlap, window_overlap), + dtype=diagonal_chunked_attention_scores.dtype, + ), + diagonal_chunked_attention_scores[:, :, -(window_overlap + 1): -1, window_overlap + 1:], + ], + axis=1, + ) + diagonal_attn_scores_first_chunk = tf.concat( + [ + tf.roll( + diagonal_chunked_attention_scores, + shift=[1, window_overlap], + axis=[2, 3], + )[:, :, :window_overlap, :window_overlap], + tf.zeros( + (batch_size * num_heads, 1, window_overlap, window_overlap), + dtype=diagonal_chunked_attention_scores.dtype, + ), + ], + axis=1, + ) + first_chunk_mask = ( + tf.tile( + tf.range(chunks_count + 1)[None, :, None, None], + (batch_size * num_heads, 1, window_overlap, window_overlap), + ) + < 1 + ) + #first_chunk_mask = tf.repeat(first_chunk_mask, batch_size * num_heads, axis=0) + diagonal_attn_scores_low_triang = tf.where( + first_chunk_mask, + diagonal_attn_scores_first_chunk, + diagonal_attn_scores_low_triang, + ) + + # merging upper and lower triangle + diagonal_attention_scores = tf.concat( + [diagonal_attn_scores_low_triang, diagonal_attn_scores_up_triang], axis=-1 + ) + + # separate batch_size and num_heads dimensions again + diagonal_attention_scores = tf.transpose( + tf.reshape( + diagonal_attention_scores, + 
(batch_size, num_heads, seq_len, 2 * window_overlap + 1), + ), + (0, 2, 1, 3), + ) + + diagonal_attention_scores = self._mask_invalid_locations(diagonal_attention_scores, window_overlap) + + return diagonal_attention_scores + + @staticmethod + def _mask_invalid_locations(input_tensor, window_overlap): + # create correct upper triangle bool mask + mask_2d_upper = tf.reverse( + tf.linalg.band_part(tf.ones(shape=(window_overlap, window_overlap + 1)), -1, 0), + axis=[0], + ) + + # pad to full matrix + padding = tf.convert_to_tensor( + [[0, shape_list(input_tensor)[1] - window_overlap], [0, shape_list(input_tensor)[3] - window_overlap - 1]] + ) + + # create lower mask + mask_2d = tf.pad(mask_2d_upper, padding) + + # combine with upper mask + mask_2d = mask_2d + tf.reverse(mask_2d, axis=[0, 1]) + + # broadcast to full matrix + mask_4d = tf.tile(mask_2d[None, :, None, :], (shape_list(input_tensor)[0], 1, 1, 1)) + + # inf tensor used for masking + inf_tensor = -float("inf") * tf.ones_like(input_tensor) + + # mask + input_tensor = tf.where(tf.math.greater(mask_4d, 0), inf_tensor, input_tensor) + + return input_tensor + + def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap): + """ + Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor will be of the + same shape as `attn_probs` + """ + + batch_size, seq_len, num_heads, head_dim = shape_list(value) + + if tf.executing_eagerly(): + tf.debugging.assert_equal( + seq_len % (window_overlap * 2), + 0, + message="Seq_len has to be multiple of 2 * window_overlap", + ) + tf.debugging.assert_equal( + shape_list(attn_probs)[:3], + shape_list(value)[:3], + message="value and attn_probs must have same dims (except head_dim)", + ) + tf.debugging.assert_equal( + shape_list(attn_probs)[3], + 2 * window_overlap + 1, + message="attn_probs last dim has to be 2 * window_overlap + 1", + ) + + chunks_count = seq_len // window_overlap - 1 + + # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size 2 window overlap + chunked_attn_probs = tf.reshape( + tf.transpose(attn_probs, (0, 2, 1, 3)), + ( + batch_size * num_heads, + seq_len // window_overlap, + window_overlap, + 2 * window_overlap + 1, + ), + ) + + # group batch_size and num_heads dimensions into one + value = tf.reshape( + tf.transpose(value, (0, 2, 1, 3)), + (batch_size * num_heads, seq_len, head_dim), + ) + + # pad seq_len with w at the beginning of the sequence and another window overlap at the end + paddings = tf.convert_to_tensor([[0, 0], [window_overlap, window_overlap], [0, 0]]) + padded_value = tf.pad(value, paddings, constant_values=-1) + + # chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap + frame_size = 3 * window_overlap * head_dim + frame_hop_size = (shape_list(padded_value)[1] * head_dim - frame_size) // chunks_count + chunked_value = tf.signal.frame( + tf.reshape(padded_value, (batch_size * num_heads, -1)), + frame_size, + frame_hop_size, + ) + chunked_value = tf.reshape( + chunked_value, + (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim), + ) + + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(chunked_value), + [batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim], + message="Chunked value has the wrong shape", + ) + + chunked_attn_probs = self._pad_and_diagonalize(chunked_attn_probs) + context = tf.einsum("bcwd,bcdh->bcwh", chunked_attn_probs, chunked_value) + context = tf.transpose( 
+ tf.reshape(context, (batch_size, num_heads, seq_len, head_dim)), + (0, 2, 1, 3), + ) + + return context + + @staticmethod + def _pad_and_transpose_last_two_dims(hidden_states_padded, paddings): + """Pads rows and then flips rows and columns.""" + hidden_states_padded = tf.pad( + hidden_states_padded, paddings + ) # padding value is not important because it will be overwritten + batch_size, chunk_size, seq_length, hidden_dim = shape_list(hidden_states_padded) + hidden_states_padded = tf.reshape(hidden_states_padded, (batch_size, chunk_size, hidden_dim, seq_length)) + + return hidden_states_padded + + @staticmethod + def _pad_and_diagonalize(chunked_hidden_states): + """ + Shifts every row 1 step right, converting columns into diagonals. + + Example:: + + chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492, + -1.8348, 0.7672, 0.2986, 0.0285, + -0.7584, 0.4206, -0.0405, 0.1599, + 2.0514, -1.1600, 0.5372, 0.2629 ] + window_overlap = num_rows = 4 + (pad & diagonalize) => + [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000 + 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000 + 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000 + 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ] + """ + total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states) + paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]]) + chunked_hidden_states = tf.pad( + chunked_hidden_states, paddings + ) # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten + chunked_hidden_states = tf.reshape( + chunked_hidden_states, (total_num_heads, num_chunks, -1) + ) # total_num_heads x num_chunks x window_overlap * (hidden_dim + window_overlap + 1) + chunked_hidden_states = chunked_hidden_states[ + :, :, :-window_overlap + ] # total_num_heads x num_chunks x window_overlap * (hidden_dim + window_overlap) + chunked_hidden_states = tf.reshape( + chunked_hidden_states, + (total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim), + ) # total_num_heads x num_chunks x window_overlap x (hidden_dim + window_overlap) + chunked_hidden_states = chunked_hidden_states[:, :, :, :-1] + + return chunked_hidden_states + + @staticmethod + def _chunk(hidden_states, window_overlap): + """Converts into overlapping chunks. Chunk size = 2w, overlap size = w.""" + batch_size, seq_length, hidden_dim = shape_list(hidden_states) + num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1 + + # define frame size and frame stride (similar to convolution) + frame_hop_size = window_overlap * hidden_dim + frame_size = 2 * frame_hop_size + hidden_states = tf.reshape(hidden_states, (batch_size, seq_length * hidden_dim)) + + # chunk with overlap + chunked_hidden_states = tf.signal.frame(hidden_states, frame_size, frame_hop_size) + + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(chunked_hidden_states), + [batch_size, num_output_chunks, frame_size], + message=f"Make sure chunking is correctly applied. 
`Chunked hidden states should have output dimension {[batch_size, frame_size, num_output_chunks]}, but got {shape_list(chunked_hidden_states)}.", + ) + + chunked_hidden_states = tf.reshape( + chunked_hidden_states, + (batch_size, num_output_chunks, 2 * window_overlap, hidden_dim), + ) + + return chunked_hidden_states + + @staticmethod + def _get_global_attn_indices(is_index_global_attn, global_attention_size): + """compute global attn indices required throughout forward pass""" + # All global attention size are fixed through global_attention_size + + batch_size, seq_len = shape_list(is_index_global_attn) + + max_num_global_attn_indices = global_attention_size + + row_indices = tf.range(batch_size) + row_indices = tf.repeat(tf.expand_dims(row_indices, axis=0), repeats=[global_attention_size], axis=0) + row_indices = tf.reshape(row_indices, (batch_size * global_attention_size, 1)) + + col_indices = tf.range(global_attention_size) + col_indices = tf.repeat(tf.expand_dims(col_indices, axis=1), repeats=[batch_size], axis=0) + + is_index_global_attn_nonzero = tf.concat((row_indices, col_indices), axis=1) + + # this is actually same as `is_index_global_attn_nonzero`, since we assume all global attention are the same size + is_local_index_global_attn_nonzero = tf.concat((row_indices, col_indices), axis=1) + + # empty tensor + is_local_index_no_global_attn_nonzero = tf.reshape(tf.expand_dims(tf.range(0), axis=1), (0, 2)) + return ( + max_num_global_attn_indices, + is_index_global_attn_nonzero, + is_local_index_global_attn_nonzero, + is_local_index_no_global_attn_nonzero, + ) + + def _concat_with_global_key_attn_probs( + self, + attn_scores, + key_vectors, + query_vectors, + max_num_global_attn_indices, + is_index_global_attn_nonzero, + is_local_index_global_attn_nonzero, + is_local_index_no_global_attn_nonzero, + ): + batch_size = shape_list(key_vectors)[0] + + # select global key vectors + global_key_vectors = tf.gather_nd(key_vectors, is_index_global_attn_nonzero) + + # create only global key vectors + key_vectors_only_global = tf.scatter_nd( + is_local_index_global_attn_nonzero, + global_key_vectors, + shape=( + batch_size, + max_num_global_attn_indices, + self._num_heads, + self._key_dim, + ), + ) + + # (batch_size, seq_len, num_heads, max_num_global_attn_indices) + attn_probs_from_global_key = tf.einsum("blhd,bshd->blhs", query_vectors, key_vectors_only_global) + + # (batch_size, max_num_global_attn_indices, seq_len, num_heads) + attn_probs_from_global_key_trans = tf.transpose(attn_probs_from_global_key, (0, 3, 1, 2)) + mask_shape = (shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple( + shape_list(attn_probs_from_global_key_trans)[-2:] + ) + mask = tf.ones(mask_shape) * -10000.0 + mask = tf.cast(mask, dtype=attn_probs_from_global_key_trans.dtype) + + # scatter mask + attn_probs_from_global_key_trans = tf.tensor_scatter_nd_update( + attn_probs_from_global_key_trans, + is_local_index_no_global_attn_nonzero, + mask, + ) + + # (batch_size, seq_len, num_heads, max_num_global_attn_indices) + attn_probs_from_global_key = tf.transpose(attn_probs_from_global_key_trans, (0, 2, 3, 1)) + + # concat to attn_probs + # (batch_size, seq_len, num_heads, extra attention count + 2*window+1) + attn_scores = tf.concat((attn_probs_from_global_key, attn_scores), axis=-1) + return attn_scores + + def _compute_attn_output_with_global_indices( + self, + value_vectors, + attn_probs, + max_num_global_attn_indices, + is_index_global_attn_nonzero, + is_local_index_global_attn_nonzero, + ): + batch_size = 
shape_list(attn_probs)[0] + + # cut local attn probs to global only + attn_probs_only_global = attn_probs[:, :, :, :max_num_global_attn_indices] + # attn_probs_only_global = tf.slice(attn_probs, [0, 0, 0, 0], shape_list(attn_probs)[: -1] + [max_num_global_attn_indices]) + + # select global value vectors + global_value_vectors = tf.gather_nd(value_vectors, is_index_global_attn_nonzero) + + # create only global value vectors + value_vectors_only_global = tf.scatter_nd( + is_local_index_global_attn_nonzero, + global_value_vectors, + shape=( + batch_size, + max_num_global_attn_indices, + self._num_heads, + self._key_dim, + ), + ) + + # compute attn output only global + attn_output_only_global = tf.einsum("blhs,bshd->blhd", attn_probs_only_global, value_vectors_only_global) + # reshape attn probs + attn_probs_without_global = attn_probs[:, :, :, max_num_global_attn_indices:] + # attn_probs_without_global = tf.slice(attn_probs, [0, 0, 0, max_num_global_attn_indices], shape_list(attn_probs)[: -1] + [shape_list(attn_probs)[-1] - max_num_global_attn_indices]) + + # compute attn output with global + attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value( + attn_probs_without_global, value_vectors, self._one_sided_attn_window_size + ) + + return attn_output_only_global + attn_output_without_global + + def _compute_global_attn_output_from_hidden( + self, + attn_output, + hidden_states, + max_num_global_attn_indices, + layer_head_mask, + is_local_index_global_attn_nonzero, + is_index_global_attn_nonzero, + is_local_index_no_global_attn_nonzero, + is_index_masked, + training, + ): + batch_size, seq_len = shape_list(hidden_states)[:2] + + # prepare global hidden states + global_attn_hidden_states = tf.gather_nd(hidden_states, is_index_global_attn_nonzero) + global_attn_hidden_states = tf.scatter_nd( + is_local_index_global_attn_nonzero, + global_attn_hidden_states, + shape=(batch_size, max_num_global_attn_indices, self._num_heads * self._key_dim), + ) + + # global key, query, value + global_query_vectors_only_global = self._global_query_dense(global_attn_hidden_states) + global_key_vectors = self._global_key_dense(hidden_states) + global_value_vectors = self._global_value_dense(hidden_states) + + # normalize + global_query_vectors_only_global /= tf.math.sqrt( + tf.cast(self._key_dim, dtype=global_query_vectors_only_global.dtype) + ) + global_query_vectors_only_global = self.reshape_and_transpose(global_query_vectors_only_global, batch_size) + global_key_vectors = self.reshape_and_transpose(global_key_vectors, batch_size) + global_value_vectors = self.reshape_and_transpose(global_value_vectors, batch_size) + + # compute attn scores + global_attn_scores = tf.matmul(global_query_vectors_only_global, global_key_vectors, transpose_b=True) + + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(global_attn_scores), + [batch_size * self._num_heads, max_num_global_attn_indices, seq_len], + message=f"global_attn_scores have the wrong size. 
Size should be {(batch_size * self._num_heads, max_num_global_attn_indices, seq_len)}, but is {shape_list(global_attn_scores)}.", + ) + + global_attn_scores = tf.reshape( + global_attn_scores, + (batch_size, self._num_heads, max_num_global_attn_indices, seq_len), + ) + global_attn_scores_trans = tf.transpose(global_attn_scores, (0, 2, 1, 3)) + mask_shape = (shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple( + shape_list(global_attn_scores_trans)[-2:] + ) + global_attn_mask = tf.ones(mask_shape) * -10000.0 + global_attn_mask = tf.cast(global_attn_mask, dtype=global_attn_scores_trans.dtype) + + # scatter mask + global_attn_scores_trans = tf.tensor_scatter_nd_update( + global_attn_scores_trans, + is_local_index_no_global_attn_nonzero, + global_attn_mask, + ) + global_attn_scores = tf.transpose(global_attn_scores_trans, (0, 2, 1, 3)) + + # mask global attn scores + attn_mask = tf.tile(is_index_masked[:, None, None, :], (1, shape_list(global_attn_scores)[1], 1, 1)) + global_attn_scores = tf.where(attn_mask, -10000.0, global_attn_scores) + global_attn_scores = tf.reshape( + global_attn_scores, + (batch_size * self._num_heads, max_num_global_attn_indices, seq_len), + ) + + # compute global attn probs + global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1) + + # apply layer head masking + if layer_head_mask is not None: + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(layer_head_mask), + [self._num_heads], + message=f"Head mask for a single layer should be of size {(self._num_heads)}, but is {shape_list(layer_head_mask)}", + ) + global_attn_probs_float = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( + global_attn_probs_float, (batch_size, self._num_heads, max_num_global_attn_indices, seq_len) + ) + global_attn_probs_float = tf.reshape( + global_attn_probs_float, (batch_size * self._num_heads, max_num_global_attn_indices, seq_len) + ) + + # dropout + global_attn_probs = self._global_dropout_layer(global_attn_probs_float, training=training) + + # global attn output + global_attn_output = tf.matmul(global_attn_probs, global_value_vectors) + + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(global_attn_output), + [batch_size * self._num_heads, max_num_global_attn_indices, self._key_dim], + message=f"global_attn_output tensor has the wrong size. 
Size should be {(batch_size * self._num_heads, max_num_global_attn_indices, self._key_dim)}, but is {shape_list(global_attn_output)}.", + ) + + global_attn_output = tf.reshape( + global_attn_output, + (batch_size, self._num_heads, max_num_global_attn_indices, self._key_dim), + ) + + # get only non zero global attn output + nonzero_global_attn_output = tf.gather_nd( + tf.transpose(global_attn_output, (0, 2, 1, 3)), + is_local_index_global_attn_nonzero, + ) + nonzero_global_attn_output = tf.reshape( + nonzero_global_attn_output, + (shape_list(is_local_index_global_attn_nonzero)[0], -1), + ) + + # overwrite values with global attention + attn_output = tf.tensor_scatter_nd_update( + attn_output, is_index_global_attn_nonzero, nonzero_global_attn_output + ) + + global_attn_probs = tf.reshape( + global_attn_probs, (batch_size, self._num_heads, max_num_global_attn_indices, seq_len) + ) + + attn_output = self._output_dense(attn_output) + + return attn_output, global_attn_probs + + def reshape_and_transpose(self, vector, batch_size): + return tf.reshape( + tf.transpose( + tf.reshape(vector, (batch_size, -1, self._num_heads, self._key_dim)), + (0, 2, 1, 3), + ), + (batch_size * self._num_heads, -1, self._key_dim), + ) diff --git a/official/projects/longformer/longformer_attention_test.py b/official/projects/longformer/longformer_attention_test.py new file mode 100644 index 000000000..992437b0e --- /dev/null +++ b/official/projects/longformer/longformer_attention_test.py @@ -0,0 +1,256 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the attention layer.""" + +import numpy as np +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from official.projects.longformer import longformer_attention + + +def _create_mock_attention_data( + num_heads, + key_dim, + value_dim, + q_seq_length, + kv_seq_length, + batch_size, + include_mask=False): + """Creates mock testing data. + + Args: + num_heads: `int`, Number of attention heads. + key_dim: `int`, Size of query head. + value_dim: `int`, Size of key, value dim. + seq_length: `int`, Sequence length of the input. + batch_size: `int`, the batch size. + include_mask: optional `bool`, whether or not to include mask data. + + Returns: + A dictionary with `str` as keys and `Tensor` as values. 
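+  Example (illustrative):
+    data = _create_mock_attention_data(
+        num_heads=2, key_dim=4, value_dim=4, q_seq_length=8,
+        kv_seq_length=8, batch_size=1, include_mask=True)
+    # data["query"].shape == (1, 8, 4)
+    # data["attention_mask"].shape == (1, 2, 8, 8)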
+ """ + query_shape = (batch_size, q_seq_length, key_dim) + value_shape = (batch_size, kv_seq_length, value_dim) + + data = dict( + query=tf.random.normal(shape=query_shape), + value=tf.random.normal(shape=value_shape), + key=tf.random.normal(shape=value_shape)) + + total_seq_length = kv_seq_length + + if include_mask: + mask_shape = (batch_size, num_heads, q_seq_length, total_seq_length) + mask_data = np.random.randint(2, size=mask_shape).astype("float32") + mask_data = dict(attention_mask=mask_data) + data.update(mask_data) + + return data + + +@keras_parameterized.run_all_keras_modes +class LongformerAttentionTest(keras_parameterized.TestCase): + def _get_hidden_states(self): + return tf.convert_to_tensor( + [ + [ + [ + 4.98332758e-01, + 2.69175139e00, + -7.08081422e-03, + 1.04915401e00, + -1.83476661e00, + 7.67220476e-01, + 2.98580543e-01, + 2.84803992e-02, + ], + [ + -7.58357372e-01, + 4.20635998e-01, + -4.04739919e-02, + 1.59924145e-01, + 2.05135748e00, + -1.15997978e00, + 5.37166397e-01, + 2.62873606e-01, + ], + [ + -1.69438001e00, + 4.17574660e-01, + -1.49196962e00, + -1.76483717e00, + -1.94566312e-01, + -1.71183858e00, + 7.72903565e-01, + -1.11557056e00, + ], + [ + 5.44028163e-01, + 2.05466114e-01, + -3.63045868e-01, + 2.41865062e-01, + 3.20348382e-01, + -9.05611176e-01, + -1.92690727e-01, + -1.19917547e00, + ], + ] + ], + dtype=tf.float32, + ) + + def test_diagonalize(self): + hidden_states = self._get_hidden_states() + hidden_states = tf.reshape(hidden_states, (1, 8, 4)) # set seq length = 8, hidden dim = 4 + chunked_hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) + window_overlap_size = longformer_attention.shape_list(chunked_hidden_states)[2] + self.assertTrue(window_overlap_size == 4) + + padded_hidden_states = longformer_attention.LongformerAttention._pad_and_diagonalize(chunked_hidden_states) + + self.assertTrue( + longformer_attention.shape_list(padded_hidden_states)[-1] == longformer_attention.shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1 + ) + + # first row => [0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000] + tf.debugging.assert_near(padded_hidden_states[0, 0, 0, :4], chunked_hidden_states[0, 0, 0], rtol=1e-3) + tf.debugging.assert_near(padded_hidden_states[0, 0, 0, 4:], tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3) + + # last row => [0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629] + tf.debugging.assert_near(padded_hidden_states[0, 0, -1, 3:], chunked_hidden_states[0, 0, -1], rtol=1e-3) + tf.debugging.assert_near( + padded_hidden_states[0, 0, -1, :3], tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3 + ) + + def test_pad_and_transpose_last_two_dims(self): + hidden_states = self._get_hidden_states() + self.assertTrue(longformer_attention.shape_list(hidden_states), [1, 8, 4]) + + # pad along seq length dim + paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]], dtype=tf.dtypes.int32) + + hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) + padded_hidden_states = longformer_attention.LongformerAttention._pad_and_transpose_last_two_dims(hidden_states, paddings) + self.assertTrue(longformer_attention.shape_list(padded_hidden_states) == [1, 1, 8, 5]) + + expected_added_dim = tf.zeros((5,), dtype=tf.dtypes.float32) + tf.debugging.assert_near(expected_added_dim, padded_hidden_states[0, 0, -1, :], rtol=1e-6) + tf.debugging.assert_near( + hidden_states[0, 0, -1, :], tf.reshape(padded_hidden_states, (1, -1))[0, 24:32], rtol=1e-6 + ) + + def 
test_mask_invalid_locations(self): + hidden_states = self._get_hidden_states() + batch_size = 1 + seq_length = 8 + hidden_size = 4 + hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size)) + hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) + + hid_states_1 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states, 1) + hid_states_2 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states, 2) + hid_states_3 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states[:, :, :, :3], 2) + hid_states_4 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states[:, :, 2:, :], 2) + + self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_1), tf.dtypes.int32)) == 8) + self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_2), tf.dtypes.int32)) == 24) + self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_3), tf.dtypes.int32)) == 24) + self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_4), tf.dtypes.int32)) == 12) + + def test_chunk(self): + hidden_states = self._get_hidden_states() + batch_size = 1 + seq_length = 8 + hidden_size = 4 + hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size)) + + chunked_hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) + + # expected slices across chunk and seq length dim + expected_slice_along_seq_length = tf.convert_to_tensor([0.4983, -0.7584, -1.6944], dtype=tf.dtypes.float32) + expected_slice_along_chunk = tf.convert_to_tensor([0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.dtypes.float32) + + self.assertTrue(longformer_attention.shape_list(chunked_hidden_states) == [1, 3, 4, 4]) + tf.debugging.assert_near(chunked_hidden_states[0, :, 0, 0], expected_slice_along_seq_length, rtol=1e-3) + tf.debugging.assert_near(chunked_hidden_states[0, 0, :, 0], expected_slice_along_chunk, rtol=1e-3) + + def test_layer_local_attn(self): + hidden_states = self._get_hidden_states() + batch_size, seq_length, hidden_size = hidden_states.shape + layer = longformer_attention.LongformerAttention( + num_heads=2, + key_dim=4, + value_dim=4, + layer_id=0, + attention_window=4, + global_attention_size=0, + ) + + attention_mask = tf.zeros((batch_size, seq_length), dtype=tf.dtypes.float32) + is_index_global_attn = tf.math.greater(attention_mask, 1) + is_global_attn = tf.math.reduce_any(is_index_global_attn) + + attention_mask = tf.where(tf.range(4)[None, :, None, None] > 1, -10000.0, attention_mask[:, :, None, None]) + is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0) + + output_hidden_states = layer( + hidden_states=hidden_states, attention_mask=attention_mask, + is_index_masked=is_index_masked, is_index_global_attn=is_index_global_attn, is_global_attn=is_global_attn, + )[0] + + self.assertTrue(output_hidden_states.shape, (1, 4, 8)) + + def test_layer_global_attn(self): + layer = longformer_attention.LongformerAttention( + num_heads=2, + key_dim=4, + value_dim=4, + layer_id=0, + attention_window=4, + global_attention_size=1, + ) + hidden_states = self._get_hidden_states() + + hidden_states = tf.concat([self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0) + batch_size, seq_length, hidden_size = hidden_states.shape + + # create attn mask + attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32) + attention_mask_2 = tf.zeros((1, 1, 1, seq_length), 
dtype=tf.dtypes.float32) + + attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] == 0, 10000.0, attention_mask_1) + attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 2, -10000.0, attention_mask_1) + attention_mask_2 = tf.where(tf.range(4)[None, :, None, None] == 0, 10000.0, attention_mask_2) + attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0) + + is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0) + is_index_global_attn = tf.math.greater(attention_mask[:, :, 0, 0], 0) + is_global_attn = tf.math.reduce_any(is_index_global_attn) + + output_hidden_states = layer( + hidden_states=hidden_states, attention_mask=-tf.math.abs(attention_mask), + is_index_masked=is_index_masked, is_index_global_attn=is_index_global_attn, is_global_attn=is_global_attn, + )[0] + + self.assertTrue(output_hidden_states.shape, (2, 4, 8)) + + +if __name__ == "__main__": + np.random.seed(0) + tf.random.set_seed(0) + tf.test.main() + diff --git a/official/projects/longformer/longformer_encoder.py b/official/projects/longformer/longformer_encoder.py new file mode 100644 index 000000000..11ad01646 --- /dev/null +++ b/official/projects/longformer/longformer_encoder.py @@ -0,0 +1,394 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Longformer encoder. Modified From huggingface/transformers +""" + +# pylint: disable=g-classes-have-attributes + +from typing import Any, Callable, Optional, Union, List +from absl import logging +import tensorflow as tf + +from official.nlp.modeling import layers +from official.projects.longformer.longformer_encoder_block import LongformerEncoderBlock + +def shape_list(tensor: tf.Tensor) -> List[int]: + """ + Deal with dynamic shape in tensorflow cleanly. + + Args: + tensor (:obj:`tf.Tensor`): The tensor we want the shape of. + + Returns: + :obj:`List[int]`: The shape of the tensor as a list. + """ + dynamic = tf.shape(tensor) + + if tensor.shape == tf.TensorShape(None): + return dynamic + + static = tensor.shape.as_list() + + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + +_Initializer = Union[str, tf.keras.initializers.Initializer] +_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True) + + +# Transferred from huggingface.longformer.TFLongformerMainLayer & TFLongformerEncoder +class LongformerEncoder(tf.keras.layers.Layer): + """Bi-directional Transformer-based encoder network. + + This network implements a bi-directional Transformer-based encoder as + described in "BERT: Pre-training of Deep Bidirectional Transformers for + Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the + embedding lookups and transformer layers, but not the masked language model + or classification task networks. + + The default values for this object are taken from the BERT-Base implementation + in "BERT: Pre-training of Deep Bidirectional Transformers for Language + Understanding". 
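+  In addition to the standard BERT encoder arguments listed below, this encoder
+  takes three Longformer-specific arguments: `attention_window` (the per-layer
+  sliding-window sizes, or a single size shared by all layers),
+  `global_attention_size` (the number of leading tokens that attend globally)
+  and `pad_token_id` (the token id used when inputs are padded up to a multiple
+  of the attention window).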
+ + Args: + vocab_size: The size of the token vocabulary. + hidden_size: The size of the transformer hidden layers. + num_layers: The number of transformer layers. + num_attention_heads: The number of attention heads for each transformer. The + hidden size must be divisible by the number of attention heads. + max_sequence_length: The maximum sequence length that this encoder can + consume. If None, max_sequence_length uses the value from sequence length. + This determines the variable shape for positional embeddings. + type_vocab_size: The number of types that the 'type_ids' input can take. + inner_dim: The output dimension of the first Dense layer in a two-layer + feedforward network for each transformer. + inner_activation: The activation for the first Dense layer in a two-layer + feedforward network for each transformer. + output_dropout: Dropout probability for the post-attention and output + dropout. + attention_dropout: The dropout rate to use for the attention layers within + the transformer layers. + initializer: The initialzer to use for all weights in this encoder. + output_range: The sequence output range, [0, output_range), by slicing the + target sequence of the last transformer layer. `None` means the entire + target sequence will attend to the source sequence, which yields the full + output. + embedding_width: The width of the word embeddings. If the embedding width is + not equal to hidden size, embedding parameters will be factorized into two + matrices in the shape of ['vocab_size', 'embedding_width'] and + ['embedding_width', 'hidden_size'] ('embedding_width' is usually much + smaller than 'hidden_size'). + embedding_layer: An optional Layer instance which will be called to generate + embeddings for the input word IDs. + norm_first: Whether to normalize inputs to attention and intermediate dense + layers. If set False, output of attention and intermediate dense layers is + normalized. + """ + + def __init__( + self, + vocab_size: int, + attention_window: Union[List[int], int] = 512, + global_attention_size: int = 0, + pad_token_id: int = 1, + hidden_size: int = 768, + num_layers: int = 12, + num_attention_heads: int = 12, + max_sequence_length: int = 512, + type_vocab_size: int = 16, + inner_dim: int = 3072, + inner_activation: Callable[..., Any] = _approx_gelu, + output_dropout: float = 0.1, + attention_dropout: float = 0.1, + initializer: _Initializer = tf.keras.initializers.TruncatedNormal( + stddev=0.02), + output_range: Optional[int] = None, + embedding_width: Optional[int] = None, + embedding_layer: Optional[tf.keras.layers.Layer] = None, + norm_first: bool = False, + **kwargs): + # Pops kwargs that are used in V1 implementation. 
+ if 'dict_outputs' in kwargs: + kwargs.pop('dict_outputs') + if 'return_all_encoder_outputs' in kwargs: + kwargs.pop('return_all_encoder_outputs') + if 'intermediate_size' in kwargs: + inner_dim = kwargs.pop('intermediate_size') + if 'activation' in kwargs: + inner_activation = kwargs.pop('activation') + if 'dropout_rate' in kwargs: + output_dropout = kwargs.pop('dropout_rate') + if 'attention_dropout_rate' in kwargs: + attention_dropout = kwargs.pop('attention_dropout_rate') + super().__init__(**kwargs) + # Longformer + self._attention_window = attention_window + self.global_attention_size = global_attention_size + self._pad_token_id = pad_token_id + + activation = tf.keras.activations.get(inner_activation) + initializer = tf.keras.initializers.get(initializer) + + if embedding_width is None: + embedding_width = hidden_size + + if embedding_layer is None: + self._embedding_layer = layers.OnDeviceEmbedding( + vocab_size=vocab_size, + embedding_width=embedding_width, + initializer=initializer, + name='word_embeddings') + else: + self._embedding_layer = embedding_layer + + self._position_embedding_layer = layers.PositionEmbedding( + initializer=initializer, + max_length=max_sequence_length, + name='position_embedding') + + self._type_embedding_layer = layers.OnDeviceEmbedding( + vocab_size=type_vocab_size, + embedding_width=embedding_width, + initializer=initializer, + use_one_hot=True, + name='type_embeddings') + + self._embedding_norm_layer = tf.keras.layers.LayerNormalization( + name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32) + + self._embedding_dropout = tf.keras.layers.Dropout( + rate=output_dropout, name='embedding_dropout') + + # We project the 'embedding' output to 'hidden_size' if it is not already + # 'hidden_size'. 
+ self._embedding_projection = None + if embedding_width != hidden_size: + self._embedding_projection = tf.keras.layers.experimental.EinsumDense( + '...x,xy->...y', + output_shape=hidden_size, + bias_axes='y', + kernel_initializer=initializer, + name='embedding_projection') + + self._transformer_layers = [] + self._attention_mask_layer = layers.SelfAttentionMask( + name='self_attention_mask') + for i in range(num_layers): + layer = LongformerEncoderBlock( + global_attention_size=global_attention_size, + num_attention_heads=num_attention_heads, + inner_dim=inner_dim, + inner_activation=inner_activation, + # Longformer, instead of passing a list of attention_window, pass a value to sub-block + attention_window=attention_window if isinstance(attention_window, int) else attention_window[i], + layer_id=i, + output_dropout=output_dropout, + attention_dropout=attention_dropout, + norm_first=norm_first, + output_range=output_range if i == num_layers - 1 else None, + kernel_initializer=initializer, + name='transformer/layer_%d' % i) + self._transformer_layers.append(layer) + + self._pooler_layer = tf.keras.layers.Dense( + units=hidden_size, + activation='tanh', + kernel_initializer=initializer, + name='pooler_transform') + + self._config = { + 'vocab_size': vocab_size, + 'hidden_size': hidden_size, + 'num_layers': num_layers, + 'num_attention_heads': num_attention_heads, + 'max_sequence_length': max_sequence_length, + 'type_vocab_size': type_vocab_size, + 'inner_dim': inner_dim, + 'inner_activation': tf.keras.activations.serialize(activation), + 'output_dropout': output_dropout, + 'attention_dropout': attention_dropout, + 'initializer': tf.keras.initializers.serialize(initializer), + 'output_range': output_range, + 'embedding_width': embedding_width, + 'embedding_layer': embedding_layer, + 'norm_first': norm_first, + # Longformer + 'attention_window': attention_window, + 'pad_token_id': pad_token_id, + } + self.inputs = dict( + input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32), + input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32), + input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32)) + + def call(self, inputs): + word_embeddings = None + if isinstance(inputs, dict): + word_ids = inputs.get('input_word_ids') # input_ids + mask = inputs.get('input_mask') # attention_mask + type_ids = inputs.get('input_type_ids') # token_type_ids + word_embeddings = inputs.get('input_word_embeddings', None) # input_embeds + else: + raise ValueError('Unexpected inputs type to %s.' % self.__class__) + + ( + padding_len, + word_ids, + mask, + type_ids, + word_embeddings, + ) = self._pad_to_window_size( + word_ids=word_ids, + mask=mask, + type_ids=type_ids, + word_embeddings=word_embeddings, + pad_token_id=self._pad_token_id + ) + + if word_embeddings is None: + word_embeddings = self._embedding_layer(word_ids) + # absolute position embeddings. 
+ position_embeddings = self._position_embedding_layer(word_embeddings) + type_embeddings = self._type_embedding_layer(type_ids) + + embeddings = word_embeddings + position_embeddings + type_embeddings + embeddings = self._embedding_norm_layer(embeddings) + embeddings = self._embedding_dropout(embeddings) + + if self._embedding_projection is not None: + embeddings = self._embedding_projection(embeddings) + + batch_size, seq_len = shape_list(mask) + # create masks with fixed len global_attention_size + mask = tf.transpose(tf.concat(values=[tf.ones((self.global_attention_size, batch_size), tf.int32) * 2, + tf.transpose(mask)[self.global_attention_size:]], axis=0)) + + is_index_masked = tf.math.less(mask, 1) + + is_index_global_attn = tf.transpose(tf.concat(values=[ + tf.ones((self.global_attention_size, batch_size), tf.bool), tf.zeros((seq_len - self.global_attention_size, batch_size), tf.bool) + ], axis=0)) + is_global_attn = self.global_attention_size > 0 + + # Longformer + attention_mask = mask + extended_attention_mask = tf.reshape( + attention_mask, (tf.shape(mask)[0], tf.shape(mask)[1], 1, 1) + ) + attention_mask = tf.cast(tf.math.abs(1 - extended_attention_mask), tf.dtypes.float32) * -10000.0 + + encoder_outputs = [] + x = embeddings + # TFLongformerEncoder + for i, layer in enumerate(self._transformer_layers): + x = layer([ + x, + attention_mask, + is_index_masked, + is_index_global_attn, + is_global_attn]) + encoder_outputs.append(x) + + last_encoder_output = encoder_outputs[-1] + if padding_len > 0: + last_encoder_output = last_encoder_output[:, :-padding_len] + first_token_tensor = last_encoder_output[:, 0, :] + pooled_output = self._pooler_layer(first_token_tensor) + + return dict( + sequence_output=last_encoder_output, + pooled_output=pooled_output, + encoder_outputs=encoder_outputs) + + def get_embedding_table(self): + return self._embedding_layer.embeddings + + def get_embedding_layer(self): + return self._embedding_layer + + def get_config(self): + return dict(self._config) + + @property + def transformer_layers(self): + """List of Transformer layers in the encoder.""" + return self._transformer_layers + + @property + def pooler_layer(self): + """The pooler dense layer after the transformer layers.""" + return self._pooler_layer + + @classmethod + def from_config(cls, config, custom_objects=None): + if 'embedding_layer' in config and config['embedding_layer'] is not None: + warn_string = ( + 'You are reloading a model that was saved with a ' + 'potentially-shared embedding layer object. If you contine to ' + 'train this model, the embedding layer will no longer be shared. ' + 'To work around this, load the model outside of the Keras API.') + print('WARNING: ' + warn_string) + logging.warn(warn_string) + + return cls(**config) + + def _pad_to_window_size( + self, + word_ids, # input_ids + mask, # attention_mask + type_ids, # token_type_ids + word_embeddings, # inputs_embeds + pad_token_id, # pad_token_id + ): + """A helper function to pad tokens and mask to work with implementation of Longformer selfattention.""" + # padding + attention_window = ( + self._attention_window if isinstance(self._attention_window, int) else max(self._attention_window) + ) + + assert attention_window % 2 == 0, f"`attention_window` should be an even value. 
Given {attention_window}" + + # input_shape = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds) + input_shape = word_ids.shape if word_ids is not None else word_embeddings.shape + batch_size, seq_len = input_shape[:2] + + if seq_len is not None: + padding_len = (attention_window - seq_len % attention_window) % attention_window + else: + padding_len = 0 + + paddings = tf.convert_to_tensor([[0, 0], [0, padding_len]]) + + if word_ids is not None: + word_ids = tf.pad(word_ids, paddings, constant_values=pad_token_id) + + if word_embeddings is not None: + def pad_embeddings(): + word_ids_padding = tf.fill((batch_size, padding_len), self.pad_token_id) + word_embeddings_padding = self._embedding_layer(word_ids_padding) + return tf.concat([word_embeddings, word_embeddings_padding], axis=-2) + + word_embeddings = tf.cond(tf.math.greater(padding_len, 0), pad_embeddings, lambda: word_embeddings) + + mask = tf.pad(mask, paddings, constant_values=False) # no attention on the padding tokens + token_type_ids = tf.pad(type_ids, paddings, constant_values=0) # pad with token_type_id = 0 + + return ( + padding_len, + word_ids, + mask, + token_type_ids, + word_embeddings,) diff --git a/official/projects/longformer/longformer_encoder_block.py b/official/projects/longformer/longformer_encoder_block.py new file mode 100644 index 000000000..2fb78c888 --- /dev/null +++ b/official/projects/longformer/longformer_encoder_block.py @@ -0,0 +1,358 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Longformer attention layer. Modified From huggingface/transformers +""" + +import tensorflow as tf +from official.projects.longformer.longformer_attention import LongformerAttention + +@tf.keras.utils.register_keras_serializable(package="Text") +class LongformerEncoderBlock(tf.keras.layers.Layer): + """TransformerEncoderBlock layer. + + This layer implements the Transformer Encoder from + "Attention Is All You Need". (https://arxiv.org/abs/1706.03762), + which combines a `tf.keras.layers.MultiHeadAttention` layer with a + two-layer feedforward network. + + References: + [Attention Is All You Need](https://arxiv.org/abs/1706.03762) + [BERT: Pre-training of Deep Bidirectional Transformers for Language + Understanding](https://arxiv.org/abs/1810.04805) + """ + + def __init__(self, + global_attention_size, + num_attention_heads, + inner_dim, + inner_activation, + # Longformer + attention_window, + layer_id=0, + output_range=None, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + use_bias=True, + norm_first=False, + norm_epsilon=1e-12, + output_dropout=0.0, + attention_dropout=0.0, + inner_dropout=0.0, + attention_initializer=None, + attention_axes=None, + **kwargs): + """Initializes `TransformerEncoderBlock`. + + Args: + num_attention_heads: Number of attention heads. 
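+      global_attention_size: Number of tokens at the start of each sequence
+        that use global attention; the same count is assumed for every
+        example so that tensor shapes stay static.
+      attention_window: Size of the sliding local-attention window used by
+        this block.
+      layer_id: Index of this block within the encoder stack.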
+ inner_dim: The output dimension of the first Dense layer in a two-layer + feedforward network. + inner_activation: The activation for the first Dense layer in a two-layer + feedforward network. + output_range: the sequence output range, [0, output_range) for slicing the + target sequence. `None` means the target sequence is not sliced. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. + kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. + activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. + use_bias: Whether to enable use_bias in attention layer. If set False, + use_bias in attention layer is disabled. + norm_first: Whether to normalize inputs to attention and intermediate + dense layers. If set False, output of attention and intermediate dense + layers is normalized. + norm_epsilon: Epsilon value to initialize normalization layers. + output_dropout: Dropout probability for the post-attention and output + dropout. + attention_dropout: Dropout probability for within the attention layer. + inner_dropout: Dropout probability for the first Dense layer in a + two-layer feedforward network. + attention_initializer: Initializer for kernels of attention layers. If set + `None`, attention layers use kernel_initializer as initializer for + kernel. + attention_axes: axes over which the attention is applied. `None` means + attention over all axes, but batch, heads, and features. + **kwargs: keyword arguments/ + """ + super().__init__(**kwargs) + + self.global_attention_size = global_attention_size + self._num_heads = num_attention_heads + self._inner_dim = inner_dim + self._inner_activation = inner_activation + # Longformer + self._attention_window = attention_window + self._layer_id = layer_id + self._attention_dropout = attention_dropout + self._attention_dropout_rate = attention_dropout + self._output_dropout = output_dropout + self._output_dropout_rate = output_dropout + self._output_range = output_range + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + self._use_bias = use_bias + self._norm_first = norm_first + self._norm_epsilon = norm_epsilon + self._inner_dropout = inner_dropout + if attention_initializer: + self._attention_initializer = tf.keras.initializers.get( + attention_initializer) + else: + self._attention_initializer = self._kernel_initializer + self._attention_axes = attention_axes + + def build(self, input_shape): + if isinstance(input_shape, tf.TensorShape): + input_tensor_shape = input_shape + elif isinstance(input_shape, (list, tuple)): + input_tensor_shape = tf.TensorShape(input_shape[0]) + else: + raise ValueError( + "The type of input shape argument is not supported, got: %s" % + type(input_shape)) + einsum_equation = "abc,cd->abd" + if len(input_tensor_shape.as_list()) > 3: + einsum_equation = "...bc,cd->...bd" + hidden_size = input_tensor_shape[-1] + if hidden_size % 
self._num_heads != 0: + raise ValueError( + "The input size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, self._num_heads)) + self._attention_head_size = int(hidden_size // self._num_heads) + common_kwargs = dict( + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + # TFLongformerSelfAttention + TFLongformerSelfOutput.dense + self._attention_layer = LongformerAttention( + # Longformer + layer_id=self._layer_id, + global_attention_size=self.global_attention_size, + attention_window=self._attention_window, + num_heads=self._num_heads, + key_dim=self._attention_head_size, + dropout=self._attention_dropout, + use_bias=self._use_bias, + kernel_initializer=self._attention_initializer, + attention_axes=self._attention_axes, + name="self_attention", + **common_kwargs) + # TFLongformerSelfOutput.dropout + self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout) + # Use float32 in layernorm for numeric stability. + # It is probably safe in mixed_float16, but we haven't validated this yet. + # TFLongformerSelfOutput.Layernorm + self._attention_layer_norm = ( + tf.keras.layers.LayerNormalization( + name="self_attention_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype=tf.float32)) + # TFLongformerIntermediate + # TFLongformerIntermediate.dense + self._intermediate_dense = tf.keras.layers.experimental.EinsumDense( + einsum_equation, + output_shape=(None, self._inner_dim), + bias_axes="d", + kernel_initializer=self._kernel_initializer, + name="intermediate", + **common_kwargs) + policy = tf.keras.mixed_precision.global_policy() + if policy.name == "mixed_bfloat16": + # bfloat16 causes BERT with the LAMB optimizer to not converge + # as well, so we use float32. + # TODO(b/154538392): Investigate this. + policy = tf.float32 + # TFLongformerIntermediate.intermediate_act_fn + self._intermediate_activation_layer = tf.keras.layers.Activation( + self._inner_activation, dtype=policy) + # ??? + self._inner_dropout_layer = tf.keras.layers.Dropout( + rate=self._inner_dropout) + # TFLongformerOutput + # TFLongformerOutput.dense + self._output_dense = tf.keras.layers.experimental.EinsumDense( + einsum_equation, + output_shape=(None, hidden_size), + bias_axes="d", + name="output", + kernel_initializer=self._kernel_initializer, + **common_kwargs) + # TFLongformerOutput.dropout + self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout) + # Use float32 in layernorm for numeric stability. 
+ # TFLongformerOutput.layernorm + self._output_layer_norm = tf.keras.layers.LayerNormalization( + name="output_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype=tf.float32) + + super(LongformerEncoderBlock, self).build(input_shape) + + def get_config(self): + config = { + "num_attention_heads": + self._num_heads, + "inner_dim": + self._inner_dim, + "inner_activation": + self._inner_activation, + "output_dropout": + self._output_dropout_rate, + "attention_dropout": + self._attention_dropout_rate, + "output_range": + self._output_range, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint), + "use_bias": + self._use_bias, + "norm_first": + self._norm_first, + "norm_epsilon": + self._norm_epsilon, + "inner_dropout": + self._inner_dropout, + "attention_initializer": + tf.keras.initializers.serialize(self._attention_initializer), + "attention_axes": self._attention_axes, + } + base_config = super(LongformerEncoderBlock, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, inputs): + """Transformer self-attention encoder block call. + + Args: + inputs: a single tensor or a list of tensors. + `input tensor` as the single sequence of embeddings. + [`input tensor`, `attention mask`] to have the additional attention + mask. + [`query tensor`, `key value tensor`, `attention mask`] to have separate + input streams for the query, and key/value to the multi-head + attention. + + Returns: + An output tensor with the same dimensions as input/query tensor. 
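+
+    Note: for this Longformer block, `inputs` is typically the five-element
+    list `[hidden_states, attention_mask, is_index_masked,
+    is_index_global_attn, is_global_attn]` assembled by `LongformerEncoder`.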
+ """ + if isinstance(inputs, (list, tuple)): + if len(inputs) == 5: + ( + input_tensor, + attention_mask, + is_index_masked, + is_index_global_attn, + is_global_attn + ) = inputs + key_value = None + elif len(inputs) == 6: + assert False # No key_value + else: + raise ValueError("Unexpected inputs to %s with length at %d" % + (self.__class__, len(inputs))) + else: + input_tensor = inputs + attention_mask = None + is_index_masked = None + is_index_global_attn = None + is_global_attn = None + key_value = None + + if self._output_range: + if self._norm_first: + source_tensor = input_tensor[:, 0:self._output_range, :] + input_tensor = self._attention_layer_norm(input_tensor) + if key_value is not None: + key_value = self._attention_layer_norm(key_value) + target_tensor = input_tensor[:, 0:self._output_range, :] + if attention_mask is not None: + attention_mask = attention_mask[:, 0:self._output_range, :] + if is_index_masked is not None: + is_index_masked = is_index_masked[:, 0:self._output_range] + if is_index_global_attn is not None: + is_index_global_attn = is_index_global_attn[:, 0:self._output_range] + else: + if self._norm_first: + source_tensor = input_tensor + input_tensor = self._attention_layer_norm(input_tensor) + if key_value is not None: + key_value = self._attention_layer_norm(key_value) + target_tensor = input_tensor + + if key_value is None: + key_value = input_tensor + # attention_output = self._attention_layer( + # query=target_tensor, value=key_value, attention_mask=attention_mask) + attention_output = self._attention_layer( + hidden_states=target_tensor, + attention_mask=attention_mask, + is_index_masked=is_index_masked, + is_index_global_attn=is_index_global_attn, + is_global_attn=is_global_attn + ) + # TFLongformerAttention.TFLongformerSelfOutput.* - {.dense} + attention_output = self._attention_dropout(attention_output) + if self._norm_first: + attention_output = source_tensor + attention_output + else: + attention_output = self._attention_layer_norm(target_tensor + + attention_output) + if self._norm_first: + source_attention_output = attention_output + attention_output = self._output_layer_norm(attention_output) + # TFLongformerIntermediate + inner_output = self._intermediate_dense(attention_output) + inner_output = self._intermediate_activation_layer(inner_output) + inner_output = self._inner_dropout_layer(inner_output) + # TFLongformerOutput + layer_output = self._output_dense(inner_output) + layer_output = self._output_dropout(layer_output) + + if self._norm_first: + return source_attention_output + layer_output + + # During mixed precision training, layer norm output is always fp32 for now. + # Casts fp32 for the subsequent add. + layer_output = tf.cast(layer_output, tf.float32) + return self._output_layer_norm(layer_output + attention_output) diff --git a/official/projects/longformer/longformer_encoder_test.py b/official/projects/longformer/longformer_encoder_test.py new file mode 100644 index 000000000..afa90b775 --- /dev/null +++ b/official/projects/longformer/longformer_encoder_test.py @@ -0,0 +1,84 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for official.nlp.projects.bigbird.encoder.""" + +import numpy as np +import tensorflow as tf +from absl.testing import parameterized +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from tensorflow.python.distribute import combinations + +from official.projects.longformer.longformer_encoder import LongformerEncoder + +@keras_parameterized.run_all_keras_modes +class LongformerEncoderTest(keras_parameterized.TestCase): + @combinations.generate(combinations.combine( + attention_window=[32, 128], global_attention_size=[0, 1, 2])) + def test_encoder(self, attention_window, global_attention_size): + sequence_length = 128 + batch_size = 2 + vocab_size = 1024 + hidden_size=256 + network = LongformerEncoder( + global_attention_size=global_attention_size, + vocab_size=vocab_size, + attention_window=attention_window, + hidden_size=hidden_size, + num_layers=1, + num_attention_heads=4, + max_sequence_length=512) + word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length), dtype=np.int32) + mask_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32) + type_id_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32) + inputs = { + 'input_word_ids': word_id_data, + 'input_mask': mask_data, + 'input_type_ids': type_id_data, + } + outputs = network(inputs) + self.assertEqual(outputs["sequence_output"].shape, + (batch_size, sequence_length, hidden_size)) + + @combinations.generate(combinations.combine( + norm_first=[True, False], global_attention_size=[0, 1, 2])) + def test_norm_first(self, norm_first, global_attention_size): + sequence_length = 128 + batch_size = 2 + vocab_size = 1024 + hidden_size = 256 + network = LongformerEncoder( + global_attention_size=global_attention_size, + vocab_size=vocab_size, + attention_window=32, + hidden_size=hidden_size, + num_layers=1, + num_attention_heads=4, + max_sequence_length=512, + norm_first=norm_first) + word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length), dtype=np.int32) + mask_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32) + type_id_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32) + inputs = { + 'input_word_ids': word_id_data, + 'input_mask': mask_data, + 'input_type_ids': type_id_data, + } + outputs = network(inputs) + self.assertEqual(outputs["sequence_output"].shape, + (batch_size, sequence_length, hidden_size)) + + +if __name__ == "__main__": + tf.test.main() \ No newline at end of file diff --git a/official/projects/longformer/longformer_experiments.py b/official/projects/longformer/longformer_experiments.py new file mode 100644 index 000000000..09e5cc010 --- /dev/null +++ b/official/projects/longformer/longformer_experiments.py @@ -0,0 +1,116 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Longformer experiments +""" +# pylint: disable=g-doc-return-or-yield,line-too-long +import dataclasses +from official.core import config_definitions as cfg +from official.core import exp_factory +from official.modeling import optimization +from official.nlp.data import pretrain_dataloader +from official.nlp.tasks import masked_lm +from official.nlp.data import sentence_prediction_dataloader +from official.nlp.configs import bert +from official.nlp.configs import encoders +import official.projects.longformer.sentence_prediction_with_load as sentence_prediction + +from official.projects.longformer.longformer import LongformerEncoderConfig + +AdamWeightDecay = optimization.AdamWeightDecayConfig +PolynomialLr = optimization.PolynomialLrConfig +PolynomialWarmupConfig = optimization.PolynomialWarmupConfig + +@dataclasses.dataclass +class LongformerOptimizationConfig(optimization.OptimizationConfig): + optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig( + type="adamw", + adamw=AdamWeightDecay( + weight_decay_rate=0.01, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], + epsilon=1e-6)) + learning_rate: optimization.LrConfig = optimization.LrConfig( + type="polynomial", + polynomial=PolynomialLr( + initial_learning_rate=1e-4, + decay_steps=1000000, + end_learning_rate=0.0)) + warmup: optimization.WarmupConfig = optimization.WarmupConfig( + type="polynomial", polynomial=PolynomialWarmupConfig(warmup_steps=10000)) + +@exp_factory.register_config_factory('longformer/pretraining') +def longformer_pretraining() -> cfg.ExperimentConfig: + """BERT pretraining experiment.""" + config = cfg.ExperimentConfig( + runtime=cfg.RuntimeConfig(enable_xla=True), + task=masked_lm.MaskedLMConfig( + model=bert.PretrainerConfig( + encoder=encoders.EncoderConfig( + type="any", any=LongformerEncoderConfig()), + cls_heads=[ + bert.ClsHeadConfig( + inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence') + ] + ), + train_data=pretrain_dataloader.BertPretrainDataConfig(use_v2_feature_names=True), + validation_data=pretrain_dataloader.BertPretrainDataConfig(use_v2_feature_names=True, + is_training=False)), + trainer=cfg.TrainerConfig( + optimizer_config=LongformerOptimizationConfig(), train_steps=1000000), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) + return config + +@exp_factory.register_config_factory('longformer/glue') +def longformer_glue() -> cfg.ExperimentConfig: + config = cfg.ExperimentConfig( + task=sentence_prediction.SentencePredictionConfig( + model=sentence_prediction.ModelConfig( + encoder=encoders.EncoderConfig( + type="any", any=LongformerEncoderConfig())), + train_data=sentence_prediction_dataloader + .SentencePredictionDataConfig(), + validation_data=sentence_prediction_dataloader + .SentencePredictionDataConfig( + is_training=False, drop_remainder=False)), + trainer=cfg.TrainerConfig( + optimizer_config=optimization.OptimizationConfig({ + 'optimizer': { + 'type': 'adamw', + 'adamw': { + 'weight_decay_rate': + 0.01, + 'exclude_from_weight_decay': + ['LayerNorm', 'layer_norm', 
'bias'], + } + }, + 'learning_rate': { + 'type': 'polynomial', + 'polynomial': { + 'initial_learning_rate': 3e-5, + 'end_learning_rate': 0.0, + } + }, + 'warmup': { + 'type': 'polynomial' + } + })), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) + return config diff --git a/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py b/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py new file mode 100644 index 000000000..fdecee0b9 --- /dev/null +++ b/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py @@ -0,0 +1,389 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sentence prediction (classification) task.""" +import dataclasses +from typing import List, Union, Optional + +from absl import logging +import numpy as np +import orbit +from scipy import stats +from sklearn import metrics as sklearn_metrics +import tensorflow as tf + +from official.core import base_task +from official.core import config_definitions as cfg +from official.core import task_factory +from official.modeling import tf_utils +from official.modeling.hyperparams import base_config +from official.nlp.configs import encoders +from official.nlp.data import data_loader_factory +from official.nlp.modeling import models +from official.nlp.tasks import utils + +import pickle + +METRIC_TYPES = frozenset( + ['accuracy', 'matthews_corrcoef', 'pearson_spearman_corr']) + + +@dataclasses.dataclass +class ModelConfig(base_config.Config): + """A classifier/regressor configuration.""" + num_classes: int = 0 + use_encoder_pooler: bool = False + encoder: encoders.EncoderConfig = encoders.EncoderConfig() + + +@dataclasses.dataclass +class SentencePredictionConfig(cfg.TaskConfig): + """The model config.""" + # At most one of `init_checkpoint` and `hub_module_url` can + # be specified. + init_checkpoint: str = '' + init_cls_pooler: bool = False + initial_parameters_from_pk: str = '' + hub_module_url: str = '' + metric_type: str = 'accuracy' + # Defines the concrete model config at instantiation time. 
+ model: ModelConfig = ModelConfig() + train_data: cfg.DataConfig = cfg.DataConfig() + validation_data: cfg.DataConfig = cfg.DataConfig() + + +@task_factory.register_task_cls(SentencePredictionConfig) +class SentencePredictionTask(base_task.Task): + """Task object for sentence_prediction.""" + + def __init__(self, params: cfg.TaskConfig, logging_dir=None, name=None): + super().__init__(params, logging_dir, name=name) + if params.metric_type not in METRIC_TYPES: + raise ValueError('Invalid metric_type: {}'.format(params.metric_type)) + self.metric_type = params.metric_type + if hasattr(params.train_data, 'label_field'): + self.label_field = params.train_data.label_field + else: + self.label_field = 'label_ids' + + def build_model(self): + if self.task_config.hub_module_url and self.task_config.init_checkpoint: + raise ValueError('At most one of `hub_module_url` and ' + '`init_checkpoint` can be specified.') + if self.task_config.hub_module_url: + encoder_network = utils.get_encoder_from_hub( + self.task_config.hub_module_url) + else: + encoder_network = encoders.build_encoder(self.task_config.model.encoder) + encoder_cfg = self.task_config.model.encoder.get() + if self.task_config.model.encoder.type == 'xlnet': + return models.XLNetClassifier( + network=encoder_network, + num_classes=self.task_config.model.num_classes, + initializer=tf.keras.initializers.RandomNormal( + stddev=encoder_cfg.initializer_range)) + else: + return models.BertClassifier( + network=encoder_network, + num_classes=self.task_config.model.num_classes, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + use_encoder_pooler=self.task_config.model.use_encoder_pooler) + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + label_ids = labels[self.label_field] + if self.task_config.model.num_classes == 1: + loss = tf.keras.losses.mean_squared_error(label_ids, model_outputs) + else: + loss = tf.keras.losses.sparse_categorical_crossentropy( + label_ids, tf.cast(model_outputs, tf.float32), from_logits=True) + + if aux_losses: + loss += tf.add_n(aux_losses) + return tf_utils.safe_mean(loss) + + def build_inputs(self, params, input_context=None): + """Returns tf.data.Dataset for sentence_prediction task.""" + if params.input_path == 'dummy': + + def dummy_data(_): + dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32) + x = dict( + input_word_ids=dummy_ids, + input_mask=dummy_ids, + input_type_ids=dummy_ids) + + if self.task_config.model.num_classes == 1: + y = tf.zeros((1,), dtype=tf.float32) + else: + y = tf.zeros((1, 1), dtype=tf.int32) + x[self.label_field] = y + return x + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + + return data_loader_factory.get_data_loader(params).load(input_context) + + def build_metrics(self, training=None): + del training + if self.task_config.model.num_classes == 1: + metrics = [tf.keras.metrics.MeanSquaredError()] + elif self.task_config.model.num_classes == 2: + metrics = [ + tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy'), + tf.keras.metrics.AUC(name='auc', curve='PR'), + ] + else: + metrics = [ + tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy'), + ] + return metrics + + def process_metrics(self, metrics, labels, model_outputs): + for metric in metrics: + if metric.name == 'auc': + # Convert the logit to probability and extract the probability of True.. 
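+        # Assumes binary classification; class index 1 is treated as the
+        # positive class.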
+ metric.update_state( + labels[self.label_field], + tf.expand_dims(tf.nn.softmax(model_outputs)[:, 1], axis=1)) + if metric.name == 'cls_accuracy': + metric.update_state(labels[self.label_field], model_outputs) + + def process_compiled_metrics(self, compiled_metrics, labels, model_outputs): + compiled_metrics.update_state(labels[self.label_field], model_outputs) + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): + if self.metric_type == 'accuracy': + return super(SentencePredictionTask, + self).validation_step(inputs, model, metrics) + features, labels = inputs, inputs + outputs = self.inference_step(features, model) + loss = self.build_losses( + labels=labels, model_outputs=outputs, aux_losses=model.losses) + logs = {self.loss: loss} + if self.metric_type == 'matthews_corrcoef': + logs.update({ + 'sentence_prediction': # Ensure one prediction along batch dimension. + tf.expand_dims(tf.math.argmax(outputs, axis=1), axis=1), + 'labels': + labels[self.label_field], + }) + if self.metric_type == 'pearson_spearman_corr': + logs.update({ + 'sentence_prediction': outputs, + 'labels': labels[self.label_field], + }) + return logs + + def aggregate_logs(self, state=None, step_outputs=None): + if self.metric_type == 'accuracy': + return None + if state is None: + state = {'sentence_prediction': [], 'labels': []} + state['sentence_prediction'].append( + np.concatenate([v.numpy() for v in step_outputs['sentence_prediction']], + axis=0)) + state['labels'].append( + np.concatenate([v.numpy() for v in step_outputs['labels']], axis=0)) + return state + + def reduce_aggregated_logs(self, aggregated_logs, global_step=None): + if self.metric_type == 'accuracy': + return None + elif self.metric_type == 'matthews_corrcoef': + preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0) + preds = np.reshape(preds, -1) + labels = np.concatenate(aggregated_logs['labels'], axis=0) + labels = np.reshape(labels, -1) + return { + self.metric_type: sklearn_metrics.matthews_corrcoef(preds, labels) + } + elif self.metric_type == 'pearson_spearman_corr': + preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0) + preds = np.reshape(preds, -1) + labels = np.concatenate(aggregated_logs['labels'], axis=0) + labels = np.reshape(labels, -1) + pearson_corr = stats.pearsonr(preds, labels)[0] + spearman_corr = stats.spearmanr(preds, labels)[0] + corr_metric = (pearson_corr + spearman_corr) / 2 + return {self.metric_type: corr_metric} + + def initialize(self, model): + """Load a pretrained checkpoint (if exists) and then train from iter 0.""" + ckpt_dir_or_file = self.task_config.init_checkpoint + if self.task_config.initial_parameters_from_pk: + num_layers = self.task_config.model.encoder.num_layers + num_attention_heads = self.task_config.model.encoder.num_attention_heads + hidden_size = self.task_config.model.encoder.hidden_size + inner_dim = self.task_config.model.encoder.inner_dim + head_size = hidden_size / num_attention_heads + assert head_size * num_attention_heads == hidden_size + + encoder = model.checkpoint_items['encoder'] + allenai_model = pickle.load(open(self.task_config.initial_parameters_from_pk, "rb")) + encoder._embedding_layer.set_weights( + [allenai_model["embeddings.word_embeddings.weight"]] + ) + encoder._embedding_norm_layer.set_weights( + [allenai_model["embeddings.LayerNorm.weight"], + allenai_model["embeddings.LayerNorm.bias"]] + ) + encoder._type_embedding_layer.set_weights( + [np.repeat( + allenai_model["embeddings.token_type_embeddings.weight"], + 2, 
+ axis=0 + )] + ) + encoder._position_embedding_layer.set_weights( + [allenai_model["embeddings.position_embeddings.weight"]] + ) + encoder._pooler_layer.set_weights( + [allenai_model["pooler.dense.weight"], + allenai_model["pooler.dense.bias"]] + ) + for layer_num in range(num_layers): + encoder._transformer_layers[layer_num]._attention_layer._global_key_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), + allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.bias"].reshape((num_attention_heads, head_size))] + ) + encoder._transformer_layers[layer_num]._attention_layer._global_query_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.attention.self.query_global.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), + allenai_model[f"encoder.layer.{layer_num}.attention.self.query_global.bias"].reshape((num_attention_heads, head_size))] + ) + encoder._transformer_layers[layer_num]._attention_layer._global_value_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.attention.self.value_global.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), + allenai_model[f"encoder.layer.{layer_num}.attention.self.value_global.bias"].reshape((num_attention_heads, head_size))] + ) + encoder._transformer_layers[layer_num]._attention_layer._key_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.attention.self.key.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), + allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.bias"].reshape((num_attention_heads, head_size))] + ) + encoder._transformer_layers[layer_num]._attention_layer._query_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.attention.self.query.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), + allenai_model[f"encoder.layer.{layer_num}.attention.self.query.bias"].reshape((num_attention_heads, head_size))] + ) + encoder._transformer_layers[layer_num]._attention_layer._value_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.attention.self.value.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), + allenai_model[f"encoder.layer.{layer_num}.attention.self.value.bias"].reshape((num_attention_heads, head_size))] + ) + encoder._transformer_layers[layer_num]._attention_layer._output_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.attention.output.dense.weight"].T, + allenai_model[f"encoder.layer.{layer_num}.attention.output.dense.bias"]] + ) + encoder._transformer_layers[layer_num]._attention_layer_norm.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.attention.output.LayerNorm.weight"], + allenai_model[f"encoder.layer.{layer_num}.attention.output.LayerNorm.bias"]] + ) + encoder._transformer_layers[layer_num]._intermediate_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.weight"].T, + allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.bias"]] + ) + encoder._transformer_layers[layer_num]._output_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.output.dense.weight"].T, + allenai_model[f"encoder.layer.{layer_num}.output.dense.bias"]] + ) + encoder._transformer_layers[layer_num]._output_layer_norm.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.weight"], + allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.bias"]] + ) + if not ckpt_dir_or_file: + return + if 
tf.io.gfile.isdir(ckpt_dir_or_file): + ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) + + pretrain2finetune_mapping = { + 'encoder': model.checkpoint_items['encoder'], + } + if self.task_config.init_cls_pooler: + # This option is valid when use_encoder_pooler is false. + pretrain2finetune_mapping[ + 'next_sentence.pooler_dense'] = model.checkpoint_items[ + 'sentence_prediction.pooler_dense'] + ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping) + status = ckpt.read(ckpt_dir_or_file) + status.expect_partial().assert_existing_objects_matched() + logging.info('Finished loading pretrained checkpoint from %s', + ckpt_dir_or_file) + + +def predict(task: SentencePredictionTask, + params: cfg.DataConfig, + model: tf.keras.Model, + params_aug: Optional[cfg.DataConfig] = None, + test_time_aug_wgt: float = 0.3) -> List[Union[int, float]]: + """Predicts on the input data. + + Args: + task: A `SentencePredictionTask` object. + params: A `cfg.DataConfig` object. + model: A keras.Model. + params_aug: A `cfg.DataConfig` object for augmented data. + test_time_aug_wgt: Test time augmentation weight. The prediction score will + use (1. - test_time_aug_wgt) original prediction plus test_time_aug_wgt + augmented prediction. + + Returns: + A list of predictions with length of `num_examples`. For regression task, + each element in the list is the predicted score; for classification task, + each element is the predicted class id. + """ + + def predict_step(inputs): + """Replicated prediction calculation.""" + x = inputs + example_id = x.pop('example_id') + outputs = task.inference_step(x, model) + return dict(example_id=example_id, predictions=outputs) + + def aggregate_fn(state, outputs): + """Concatenates model's outputs.""" + if state is None: + state = [] + + for per_replica_example_id, per_replica_batch_predictions in zip( + outputs['example_id'], outputs['predictions']): + state.extend(zip(per_replica_example_id, per_replica_batch_predictions)) + return state + + dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(), + task.build_inputs, params) + outputs = utils.predict(predict_step, aggregate_fn, dataset) + + # When running on TPU POD, the order of output cannot be maintained, + # so we need to sort by example_id. + outputs = sorted(outputs, key=lambda x: x[0]) + is_regression = task.task_config.model.num_classes == 1 + if params_aug is not None: + dataset_aug = orbit.utils.make_distributed_dataset( + tf.distribute.get_strategy(), task.build_inputs, params_aug) + outputs_aug = utils.predict(predict_step, aggregate_fn, dataset_aug) + outputs_aug = sorted(outputs_aug, key=lambda x: x[0]) + if is_regression: + return [(1. - test_time_aug_wgt) * x[1] + test_time_aug_wgt * y[1] + for x, y in zip(outputs, outputs_aug)] + else: + return [ + tf.argmax( + (1. - test_time_aug_wgt) * x[1] + test_time_aug_wgt * y[1], + axis=-1) for x, y in zip(outputs, outputs_aug) + ] + if is_regression: + return [x[1] for x in outputs] + else: + return [tf.argmax(x[1], axis=-1) for x in outputs] diff --git a/official/projects/longformer/train.py b/official/projects/longformer/train.py new file mode 100644 index 000000000..91e8b516e --- /dev/null +++ b/official/projects/longformer/train.py @@ -0,0 +1,69 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A customized training library for the specific task.""" + +from absl import app +from absl import flags +import gin + +from official.common import distribute_utils +from official.common import flags as tfm_flags +from official.core import task_factory +from official.core import train_lib +from official.core import train_utils +from official.modeling import performance +from official.projects.longformer import longformer_experiments + +FLAGS = flags.FLAGS + + +def main(_): + gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params) + params = train_utils.parse_configuration(FLAGS) + model_dir = FLAGS.model_dir + if 'train' in FLAGS.mode: + # Pure eval modes do not output yaml files. Otherwise continuous eval job + # may race against the train job for writing the same file. + train_utils.serialize_config(params, model_dir) + + # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16' + # can have significant impact on model speeds by utilizing float16 in case of + # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when + # dtype is float16 + if params.runtime.mixed_precision_dtype: + performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype) + distribution_strategy = distribute_utils.get_distribution_strategy( + distribution_strategy=params.runtime.distribution_strategy, + all_reduce_alg=params.runtime.all_reduce_alg, + num_gpus=params.runtime.num_gpus, + tpu_address=params.runtime.tpu, + **params.runtime.model_parallelism()) + + with distribution_strategy.scope(): + task = task_factory.get_task(params.task, logging_dir=model_dir) + + train_lib.run_experiment( + distribution_strategy=distribution_strategy, + task=task, + mode=FLAGS.mode, + params=params, + model_dir=model_dir) + + train_utils.save_gin_config(FLAGS.mode, model_dir) + + +if __name__ == '__main__': + tfm_flags.define_flags() + app.run(main) diff --git a/official/projects/longformer/transform_longformer_tokenized_into_tfrecord.py b/official/projects/longformer/transform_longformer_tokenized_into_tfrecord.py new file mode 100644 index 000000000..0ba9dcf9a --- /dev/null +++ b/official/projects/longformer/transform_longformer_tokenized_into_tfrecord.py @@ -0,0 +1,1592 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""BERT library to process data for classification task.""" + +import collections +import csv +import importlib +import json +import os + +from absl import logging +import tensorflow as tf +import tensorflow_datasets as tfds + +from official.nlp.bert import tokenization + + +class InputExample(object): + """A single training/test example for simple seq regression/classification.""" + + def __init__(self, + guid, + text_a, + text_b=None, + label=None, + weight=None, + example_id=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string for classification, float for regression. The + label of the example. This should be specified for train and dev + examples, but not for test examples. + weight: (Optional) float. The weight of the example to be used during + training. + example_id: (Optional) int. The int identification number of example in + the corpus. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + self.weight = weight + self.example_id = example_id + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True, + weight=None, + example_id=None): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + self.weight = weight + self.example_id = example_id + + +class DataProcessor(object): + """Base class for converters for seq regression/classification datasets.""" + + def __init__(self, process_text_fn=tokenization.convert_to_unicode): + self.process_text_fn = process_text_fn + self.is_regression = False + self.label_type = None + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @staticmethod + def get_processor_name(): + """Gets the string identifier of the processor.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.io.gfile.GFile(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + @classmethod + def _read_jsonl(cls, input_file): + """Reads a json line file.""" + with tf.io.gfile.GFile(input_file, "r") as f: + lines = [] + for json_str in f: + lines.append(json.loads(json_str)) + return lines + + def featurize_example(self, *kargs, **kwargs): + """Converts a single `InputExample` into a single `InputFeatures`.""" + return convert_single_example(*kargs, **kwargs) + + +class DefaultGLUEDataProcessor(DataProcessor): + """Processor for the SuperGLUE dataset.""" + + def get_train_examples(self, data_dir): + """See base class.""" + return 
self._create_examples_tfds("train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples_tfds("validation") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples_tfds("test") + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + raise NotImplementedError() + + +class AxProcessor(DataProcessor): + """Processor for the AX dataset (GLUE diagnostics dataset).""" + + def get_train_examples(self, data_dir): + """See base class.""" + train_mnli_dataset = tfds.load( + "glue/mnli", split="train", try_gcs=True).as_numpy_iterator() + return self._create_examples_tfds(train_mnli_dataset, "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + val_mnli_dataset = tfds.load( + "glue/mnli", split="validation_matched", + try_gcs=True).as_numpy_iterator() + return self._create_examples_tfds(val_mnli_dataset, "validation") + + def get_test_examples(self, data_dir): + """See base class.""" + test_ax_dataset = tfds.load( + "glue/ax", split="test", try_gcs=True).as_numpy_iterator() + return self._create_examples_tfds(test_ax_dataset, "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "AX" + + def _create_examples_tfds(self, dataset, set_type): + """Creates examples for the training/dev/test sets.""" + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = "contradiction" + text_a = self.process_text_fn(example["hypothesis"]) + text_b = self.process_text_fn(example["premise"]) + if set_type != "test": + label = self.get_labels()[example["label"]] + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label, + weight=None)) + return examples + + +class ColaProcessor(DefaultGLUEDataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "COLA" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "glue/cola", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = "0" + text_a = self.process_text_fn(example["sentence"]) + if set_type != "test": + label = str(example["label"]) + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=None, label=label, weight=None)) + return examples + + +class ImdbProcessor(DataProcessor): + """Processor for the IMDb dataset.""" + + def get_labels(self): + return ["neg", "pos"] + + def get_train_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, "train")) + + def get_dev_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, "test")) + + @staticmethod + def get_processor_name(): + """See base class.""" + return "IMDB" + + def _create_examples(self, data_dir): + """Creates examples.""" + examples = [] + for label in ["neg", "pos"]: + cur_dir = os.path.join(data_dir, label) + for filename in tf.io.gfile.listdir(cur_dir): + if not filename.endswith("txt"): + continue + + if len(examples) % 1000 == 0: + logging.info("Loading dev example %d", len(examples)) + + path = os.path.join(cur_dir, filename) + with tf.io.gfile.GFile(path, "r") 
as f:
+          text = f.read().strip().replace("<br />
", " ") + examples.append( + InputExample( + guid="unused_id", text_a=text, text_b=None, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def __init__(self, + mnli_type="matched", + process_text_fn=tokenization.convert_to_unicode): + super(MnliProcessor, self).__init__(process_text_fn) + self.dataset = tfds.load("glue/mnli", try_gcs=True) + if mnli_type not in ("matched", "mismatched"): + raise ValueError("Invalid `mnli_type`: %s" % mnli_type) + self.mnli_type = mnli_type + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples_tfds("train") + + def get_dev_examples(self, data_dir): + """See base class.""" + if self.mnli_type == "matched": + return self._create_examples_tfds("validation_matched") + else: + return self._create_examples_tfds("validation_mismatched") + + def get_test_examples(self, data_dir): + """See base class.""" + if self.mnli_type == "matched": + return self._create_examples_tfds("test_matched") + else: + return self._create_examples_tfds("test_mismatched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "MNLI" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "glue/mnli", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = "contradiction" + text_a = self.process_text_fn(example["hypothesis"]) + text_b = self.process_text_fn(example["premise"]) + if set_type != "test": + label = self.get_labels()[example["label"]] + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label, + weight=None)) + return examples + + +class MrpcProcessor(DefaultGLUEDataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "MRPC" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "glue/mrpc", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = "0" + text_a = self.process_text_fn(example["sentence1"]) + text_b = self.process_text_fn(example["sentence2"]) + if set_type != "test": + label = str(example["label"]) + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label, + weight=None)) + return examples + + +class PawsxProcessor(DataProcessor): + """Processor for the PAWS-X data set.""" + supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"] + + def __init__(self, + language="en", + process_text_fn=tokenization.convert_to_unicode): + super(PawsxProcessor, self).__init__(process_text_fn) + if language == "all": + self.languages = PawsxProcessor.supported_languages + elif language not in PawsxProcessor.supported_languages: + raise ValueError("language %s is not supported for PAWS-X task." % + language) + else: + self.languages = [language] + + def get_train_examples(self, data_dir): + """See base class.""" + lines = [] + for language in self.languages: + if language == "en": + train_tsv = "train.tsv" + else: + train_tsv = "translated_train.tsv" + # Skips the header. 
+ lines.extend( + self._read_tsv(os.path.join(data_dir, language, train_tsv))[1:]) + + examples = [] + for i, line in enumerate(lines): + guid = "train-%d" % i + text_a = self.process_text_fn(line[1]) + text_b = self.process_text_fn(line[2]) + label = self.process_text_fn(line[3]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = [] + for lang in PawsxProcessor.supported_languages: + lines.extend( + self._read_tsv(os.path.join(data_dir, lang, "dev_2k.tsv"))[1:]) + + examples = [] + for i, line in enumerate(lines): + guid = "dev-%d" % i + text_a = self.process_text_fn(line[1]) + text_b = self.process_text_fn(line[2]) + label = self.process_text_fn(line[3]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_test_examples(self, data_dir): + """See base class.""" + examples_by_lang = {k: [] for k in self.supported_languages} + for lang in self.supported_languages: + lines = self._read_tsv(os.path.join(data_dir, lang, "test_2k.tsv"))[1:] + for i, line in enumerate(lines): + guid = "test-%d" % i + text_a = self.process_text_fn(line[1]) + text_b = self.process_text_fn(line[2]) + label = self.process_text_fn(line[3]) + examples_by_lang[lang].append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples_by_lang + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "XTREME-PAWS-X" + + +class QnliProcessor(DefaultGLUEDataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "QNLI" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "glue/qnli", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = "entailment" + text_a = self.process_text_fn(example["question"]) + text_b = self.process_text_fn(example["sentence"]) + if set_type != "test": + label = self.get_labels()[example["label"]] + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label, + weight=None)) + return examples + + +class QqpProcessor(DefaultGLUEDataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "QQP" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "glue/qqp", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = "0" + text_a = self.process_text_fn(example["question1"]) + text_b = self.process_text_fn(example["question2"]) + if set_type != "test": + label = str(example["label"]) + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label, + weight=None)) + return examples + + +class RteProcessor(DefaultGLUEDataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_labels(self): + """See base class.""" + # All datasets are converted to 
2-class split, where for 3-class datasets we + # collapse neutral and contradiction into not_entailment. + return ["entailment", "not_entailment"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "RTE" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "glue/rte", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = "entailment" + text_a = self.process_text_fn(example["sentence1"]) + text_b = self.process_text_fn(example["sentence2"]) + if set_type != "test": + label = self.get_labels()[example["label"]] + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label, + weight=None)) + return examples + + +class SstProcessor(DefaultGLUEDataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "SST-2" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "glue/sst2", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = "0" + text_a = self.process_text_fn(example["sentence"]) + if set_type != "test": + label = str(example["label"]) + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=None, label=label, weight=None)) + return examples + + +class StsBProcessor(DefaultGLUEDataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def __init__(self, process_text_fn=tokenization.convert_to_unicode): + super(StsBProcessor, self).__init__(process_text_fn=process_text_fn) + self.is_regression = True + self.label_type = float + self._labels = None + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "glue/stsb", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = 0.0 + text_a = self.process_text_fn(example["sentence1"]) + text_b = self.process_text_fn(example["sentence2"]) + if set_type != "test": + label = self.label_type(example["label"]) + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label, + weight=None)) + return examples + + def get_labels(self): + """See base class.""" + return self._labels + + @staticmethod + def get_processor_name(): + """See base class.""" + return "STS-B" + + +class TfdsProcessor(DataProcessor): + """Processor for generic text classification and regression TFDS data set. + + The TFDS parameters are expected to be provided in the tfds_params string, in + a comma-separated list of parameter assignments. 
+ Examples: + tfds_params="dataset=scicite,text_key=string" + tfds_params="dataset=imdb_reviews,test_split=,dev_split=test" + tfds_params="dataset=glue/cola,text_key=sentence" + tfds_params="dataset=glue/sst2,text_key=sentence" + tfds_params="dataset=glue/qnli,text_key=question,text_b_key=sentence" + tfds_params="dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2" + tfds_params="dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2," + "is_regression=true,label_type=float" + tfds_params="dataset=snli,text_key=premise,text_b_key=hypothesis," + "skip_label=-1" + Possible parameters (please refer to the documentation of Tensorflow Datasets + (TFDS) for the meaning of individual parameters): + dataset: Required dataset name (potentially with subset and version number). + data_dir: Optional TFDS source root directory. + module_import: Optional Dataset module to import. + train_split: Name of the train split (defaults to `train`). + dev_split: Name of the dev split (defaults to `validation`). + test_split: Name of the test split (defaults to `test`). + text_key: Key of the text_a feature (defaults to `text`). + text_b_key: Key of the second text feature if available. + label_key: Key of the label feature (defaults to `label`). + test_text_key: Key of the text feature to use in test set. + test_text_b_key: Key of the second text feature to use in test set. + test_label: String to be used as the label for all test examples. + label_type: Type of the label key (defaults to `int`). + weight_key: Key of the float sample weight (is not used if not provided). + is_regression: Whether the task is a regression problem (defaults to False). + skip_label: Skip examples with given label (defaults to None). + """ + + def __init__(self, + tfds_params, + process_text_fn=tokenization.convert_to_unicode): + super(TfdsProcessor, self).__init__(process_text_fn) + self._process_tfds_params_str(tfds_params) + if self.module_import: + importlib.import_module(self.module_import) + + self.dataset, info = tfds.load( + self.dataset_name, data_dir=self.data_dir, with_info=True) + if self.is_regression: + self._labels = None + else: + self._labels = list(range(info.features[self.label_key].num_classes)) + + def _process_tfds_params_str(self, params_str): + """Extracts TFDS parameters from a comma-separated assignements string.""" + dtype_map = {"int": int, "float": float} + cast_str_to_bool = lambda s: s.lower() not in ["false", "0"] + + tuples = [x.split("=") for x in params_str.split(",")] + d = {k.strip(): v.strip() for k, v in tuples} + self.dataset_name = d["dataset"] # Required. 
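+    # For example, the string
+    # "dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2" parses to
+    # dataset_name="glue/mrpc", text_key="sentence1" and
+    # text_b_key="sentence2"; every field not mentioned keeps its default below.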
+ self.data_dir = d.get("data_dir", None) + self.module_import = d.get("module_import", None) + self.train_split = d.get("train_split", "train") + self.dev_split = d.get("dev_split", "validation") + self.test_split = d.get("test_split", "test") + self.text_key = d.get("text_key", "text") + self.text_b_key = d.get("text_b_key", None) + self.label_key = d.get("label_key", "label") + self.test_text_key = d.get("test_text_key", self.text_key) + self.test_text_b_key = d.get("test_text_b_key", self.text_b_key) + self.test_label = d.get("test_label", "test_example") + self.label_type = dtype_map[d.get("label_type", "int")] + self.is_regression = cast_str_to_bool(d.get("is_regression", "False")) + self.weight_key = d.get("weight_key", None) + self.skip_label = d.get("skip_label", None) + if self.skip_label is not None: + self.skip_label = self.label_type(self.skip_label) + + def get_train_examples(self, data_dir): + assert data_dir is None + return self._create_examples(self.train_split, "train") + + def get_dev_examples(self, data_dir): + assert data_dir is None + return self._create_examples(self.dev_split, "dev") + + def get_test_examples(self, data_dir): + assert data_dir is None + return self._create_examples(self.test_split, "test") + + def get_labels(self): + return self._labels + + def get_processor_name(self): + return "TFDS_" + self.dataset_name + + def _create_examples(self, split_name, set_type): + """Creates examples for the training/dev/test sets.""" + if split_name not in self.dataset: + raise ValueError("Split {} not available.".format(split_name)) + dataset = self.dataset[split_name].as_numpy_iterator() + examples = [] + text_b, weight = None, None + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = self.process_text_fn(example[self.test_text_key]) + if self.test_text_b_key: + text_b = self.process_text_fn(example[self.test_text_b_key]) + label = self.test_label + else: + text_a = self.process_text_fn(example[self.text_key]) + if self.text_b_key: + text_b = self.process_text_fn(example[self.text_b_key]) + label = self.label_type(example[self.label_key]) + if self.skip_label is not None and label == self.skip_label: + continue + if self.weight_key: + weight = float(example[self.weight_key]) + examples.append( + InputExample( + guid=guid, + text_a=text_a, + text_b=text_b, + label=label, + weight=weight)) + return examples + + +class WnliProcessor(DefaultGLUEDataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "WNLI" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "glue/wnli", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for i, example in enumerate(dataset): + guid = "%s-%s" % (set_type, i) + label = "0" + text_a = self.process_text_fn(example["sentence1"]) + text_b = self.process_text_fn(example["sentence2"]) + if set_type != "test": + label = str(example["label"]) + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label, + weight=None)) + return examples + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + supported_languages = [ + "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", + "ur", "vi", "zh" + ] + + def __init__(self, + language="en", + 
process_text_fn=tokenization.convert_to_unicode): + super(XnliProcessor, self).__init__(process_text_fn) + if language == "all": + self.languages = XnliProcessor.supported_languages + elif language not in XnliProcessor.supported_languages: + raise ValueError("language %s is not supported for XNLI task." % language) + else: + self.languages = [language] + + def get_train_examples(self, data_dir): + """See base class.""" + lines = [] + for language in self.languages: + # Skips the header. + lines.extend( + self._read_tsv( + os.path.join(data_dir, "multinli", + "multinli.train.%s.tsv" % language))[1:]) + + examples = [] + for i, line in enumerate(lines): + guid = "train-%d" % i + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = self.process_text_fn(line[2]) + if label == self.process_text_fn("contradictory"): + label = self.process_text_fn("contradiction") + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) + examples = [] + for i, line in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % i + text_a = self.process_text_fn(line[6]) + text_b = self.process_text_fn(line[7]) + label = self.process_text_fn(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_test_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv")) + examples_by_lang = {k: [] for k in XnliProcessor.supported_languages} + for i, line in enumerate(lines): + if i == 0: + continue + guid = "test-%d" % i + language = self.process_text_fn(line[0]) + text_a = self.process_text_fn(line[6]) + text_b = self.process_text_fn(line[7]) + label = self.process_text_fn(line[1]) + examples_by_lang[language].append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples_by_lang + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "XNLI" + + +class XtremePawsxProcessor(DataProcessor): + """Processor for the XTREME PAWS-X data set.""" + supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"] + + def __init__(self, + process_text_fn=tokenization.convert_to_unicode, + translated_data_dir=None, + only_use_en_dev=True): + """See base class. + + Args: + process_text_fn: See base class. + translated_data_dir: If specified, will also include translated data in + the training and testing data. + only_use_en_dev: If True, only use english dev data. Otherwise, use dev + data from all languages. 
+ """ + super(XtremePawsxProcessor, self).__init__(process_text_fn) + self.translated_data_dir = translated_data_dir + self.only_use_en_dev = only_use_en_dev + + def get_train_examples(self, data_dir): + """See base class.""" + examples = [] + if self.translated_data_dir is None: + lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv")) + for i, line in enumerate(lines): + guid = "train-%d" % i + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = self.process_text_fn(line[2]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + else: + for lang in self.supported_languages: + lines = self._read_tsv( + os.path.join(self.translated_data_dir, "translate-train", + f"en-{lang}-translated.tsv")) + for i, line in enumerate(lines): + guid = f"train-{lang}-{i}" + text_a = self.process_text_fn(line[2]) + text_b = self.process_text_fn(line[3]) + label = self.process_text_fn(line[4]) + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + examples = [] + if self.only_use_en_dev: + lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv")) + for i, line in enumerate(lines): + guid = "dev-%d" % i + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = self.process_text_fn(line[2]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + else: + for lang in self.supported_languages: + lines = self._read_tsv(os.path.join(data_dir, f"dev-{lang}.tsv")) + for i, line in enumerate(lines): + guid = f"dev-{lang}-{i}" + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = self.process_text_fn(line[2]) + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_test_examples(self, data_dir): + """See base class.""" + examples_by_lang = {} + for lang in self.supported_languages: + examples_by_lang[lang] = [] + lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv")) + for i, line in enumerate(lines): + guid = f"test-{lang}-{i}" + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = "0" + examples_by_lang[lang].append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + if self.translated_data_dir is not None: + for lang in self.supported_languages: + if lang == "en": + continue + examples_by_lang[f"{lang}-en"] = [] + lines = self._read_tsv( + os.path.join(self.translated_data_dir, "translate-test", + f"test-{lang}-en-translated.tsv")) + for i, line in enumerate(lines): + guid = f"test-{lang}-en-{i}" + text_a = self.process_text_fn(line[2]) + text_b = self.process_text_fn(line[3]) + label = "0" + examples_by_lang[f"{lang}-en"].append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples_by_lang + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "XTREME-PAWS-X" + + +class XtremeXnliProcessor(DataProcessor): + """Processor for the XTREME XNLI data set.""" + supported_languages = [ + "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", + "ur", "vi", "zh" + ] + + def __init__(self, + process_text_fn=tokenization.convert_to_unicode, + translated_data_dir=None, + only_use_en_dev=True): + """See base class. 
+ + Args: + process_text_fn: See base class. + translated_data_dir: If specified, will also include translated data in + the training data. + only_use_en_dev: If True, only use english dev data. Otherwise, use dev + data from all languages. + """ + super(XtremeXnliProcessor, self).__init__(process_text_fn) + self.translated_data_dir = translated_data_dir + self.only_use_en_dev = only_use_en_dev + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv")) + + examples = [] + if self.translated_data_dir is None: + for i, line in enumerate(lines): + guid = "train-%d" % i + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = self.process_text_fn(line[2]) + if label == self.process_text_fn("contradictory"): + label = self.process_text_fn("contradiction") + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + else: + for lang in self.supported_languages: + lines = self._read_tsv( + os.path.join(self.translated_data_dir, "translate-train", + f"en-{lang}-translated.tsv")) + for i, line in enumerate(lines): + guid = f"train-{lang}-{i}" + text_a = self.process_text_fn(line[2]) + text_b = self.process_text_fn(line[3]) + label = self.process_text_fn(line[4]) + if label == self.process_text_fn("contradictory"): + label = self.process_text_fn("contradiction") + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + examples = [] + if self.only_use_en_dev: + lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv")) + for i, line in enumerate(lines): + guid = "dev-%d" % i + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = self.process_text_fn(line[2]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + else: + for lang in self.supported_languages: + lines = self._read_tsv(os.path.join(data_dir, f"dev-{lang}.tsv")) + for i, line in enumerate(lines): + guid = f"dev-{lang}-{i}" + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = self.process_text_fn(line[2]) + if label == self.process_text_fn("contradictory"): + label = self.process_text_fn("contradiction") + examples.append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_test_examples(self, data_dir): + """See base class.""" + examples_by_lang = {} + for lang in self.supported_languages: + examples_by_lang[lang] = [] + lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv")) + for i, line in enumerate(lines): + guid = f"test-{lang}-{i}" + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = "contradiction" + examples_by_lang[lang].append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + if self.translated_data_dir is not None: + for lang in self.supported_languages: + if lang == "en": + continue + examples_by_lang[f"{lang}-en"] = [] + lines = self._read_tsv( + os.path.join(self.translated_data_dir, "translate-test", + f"test-{lang}-en-translated.tsv")) + for i, line in enumerate(lines): + guid = f"test-{lang}-en-{i}" + text_a = self.process_text_fn(line[2]) + text_b = self.process_text_fn(line[3]) + label = "contradiction" + examples_by_lang[f"{lang}-en"].append( + InputExample( + guid=guid, text_a=text_a, text_b=text_b, label=label)) + return 
examples_by_lang + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "XTREME-XNLI" + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + label_map = {} + if label_list: + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + seg_id_a = 0 + seg_id_b = 1 + seg_id_cls = 0 + seg_id_pad = 0 + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(seg_id_cls) + for token in tokens_a: + tokens.append(token) + segment_ids.append(seg_id_a) + tokens.append("[SEP]") + segment_ids.append(seg_id_a) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(seg_id_b) + tokens.append("[SEP]") + segment_ids.append(seg_id_b) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
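+  # For illustration, with max_seq_length = 8 and tokens
+  # ["[CLS]", "the", "dog", "[SEP]"], the padding loop below produces
+  #   input_ids:   [id([CLS]), id(the), id(dog), id([SEP]), 0, 0, 0, 0]
+  #   input_mask:  [1, 1, 1, 1, 0, 0, 0, 0]
+  #   segment_ids: [0, 0, 0, 0, 0, 0, 0, 0]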
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(seg_id_pad) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] if label_map else example.label + if ex_index < 5: + logging.info("*** Example ***") + logging.info("guid: %s", (example.guid)) + logging.info("tokens: %s", + " ".join([tokenization.printable_text(x) for x in tokens])) + logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logging.info("label: %s (id = %s)", example.label, str(label_id)) + logging.info("weight: %s", example.weight) + logging.info("example_id: %s", example.example_id) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True, + weight=example.weight, + example_id=example.example_id) + + return feature + + +class AXgProcessor(DataProcessor): + """Processor for the AXg dataset (SuperGLUE diagnostics dataset).""" + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_jsonl(os.path.join(data_dir, "AX-g.jsonl")), "test") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "AXg" + + def _create_examples(self, lines, set_type): + """Creates examples for the training/dev/test sets.""" + examples = [] + for line in lines: + guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"]))) + text_a = self.process_text_fn(line["premise"]) + text_b = self.process_text_fn(line["hypothesis"]) + label = self.process_text_fn(line["label"]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class BoolQProcessor(DefaultGLUEDataProcessor): + """Processor for the BoolQ dataset (SuperGLUE diagnostics dataset).""" + + def get_labels(self): + """See base class.""" + return ["True", "False"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "BoolQ" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "super_glue/boolq", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for example in dataset: + guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"]))) + text_a = self.process_text_fn(example["question"]) + text_b = self.process_text_fn(example["passage"]) + label = "False" + if set_type != "test": + label = self.get_labels()[example["label"]] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class CBProcessor(DefaultGLUEDataProcessor): + """Processor for the CB dataset (SuperGLUE diagnostics dataset).""" + + def get_labels(self): + """See base class.""" + return ["entailment", "neutral", "contradiction"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "CB" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + dataset = tfds.load( + "super_glue/cb", split=set_type, try_gcs=True).as_numpy_iterator() + examples = [] + for example in dataset: + guid = "%s-%s" % (set_type, 
self.process_text_fn(str(example["idx"]))) + text_a = self.process_text_fn(example["premise"]) + text_b = self.process_text_fn(example["hypothesis"]) + label = "entailment" + if set_type != "test": + label = self.get_labels()[example["label"]] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class SuperGLUERTEProcessor(DefaultGLUEDataProcessor): + """Processor for the RTE dataset (SuperGLUE version).""" + + def get_labels(self): + """See base class.""" + # All datasets are converted to 2-class split, where for 3-class datasets we + # collapse neutral and contradiction into not_entailment. + return ["entailment", "not_entailment"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "RTESuperGLUE" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + examples = [] + dataset = tfds.load( + "super_glue/rte", split=set_type, try_gcs=True).as_numpy_iterator() + for example in dataset: + guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"]))) + text_a = self.process_text_fn(example["premise"]) + text_b = self.process_text_fn(example["hypothesis"]) + label = "entailment" + if set_type != "test": + label = self.get_labels()[example["label"]] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WiCInputExample(InputExample): + """Processor for the WiC dataset (SuperGLUE version).""" + + def __init__(self, + guid, + text_a, + text_b=None, + label=None, + word=None, + weight=None, + example_id=None): + """A single training/test example for simple seq regression/classification.""" + super(WiCInputExample, self).__init__(guid, text_a, text_b, label, weight, + example_id) + self.word = word + + +class WiCProcessor(DefaultGLUEDataProcessor): + """Processor for the RTE dataset (SuperGLUE version).""" + + def get_labels(self): + """Not used.""" + return [] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "RTESuperGLUE" + + def _create_examples_tfds(self, set_type): + """Creates examples for the training/dev/test sets.""" + examples = [] + dataset = tfds.load( + "super_glue/wic", split=set_type, try_gcs=True).as_numpy_iterator() + for example in dataset: + guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"]))) + text_a = self.process_text_fn(example["sentence1"]) + text_b = self.process_text_fn(example["sentence2"]) + word = self.process_text_fn(example["word"]) + label = 0 + if set_type != "test": + label = example["label"] + examples.append( + WiCInputExample( + guid=guid, text_a=text_a, text_b=text_b, word=word, label=label)) + return examples + + def featurize_example(self, ex_index, example, label_list, max_seq_length, + tokenizer): + """Here we concate sentence1, sentence2, word together with [SEP] tokens.""" + del label_list + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = tokenizer.tokenize(example.text_b) + tokens_word = tokenizer.tokenize(example.word) + + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP], [SEP] with "- 4" + # Here we only pop out the first two sentence tokens. 
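+    # The layout built below is:
+    #   tokens:      [CLS] sent1 ... [SEP] sent2 ... [SEP] word ... [SEP]
+    #   segment_ids:   0     0          0     1         1     2         2
+    # so only the two sentences are ever truncated; the target word is kept.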
+ _truncate_seq_pair(tokens_a, tokens_b, + max_seq_length - 4 - len(tokens_word)) + + seg_id_a = 0 + seg_id_b = 1 + seg_id_c = 2 + seg_id_cls = 0 + seg_id_pad = 0 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(seg_id_cls) + for token in tokens_a: + tokens.append(token) + segment_ids.append(seg_id_a) + tokens.append("[SEP]") + segment_ids.append(seg_id_a) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(seg_id_b) + + tokens.append("[SEP]") + segment_ids.append(seg_id_b) + + for token in tokens_word: + tokens.append(token) + segment_ids.append(seg_id_c) + + tokens.append("[SEP]") + segment_ids.append(seg_id_c) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(seg_id_pad) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = example.label + if ex_index < 5: + logging.info("*** Example ***") + logging.info("guid: %s", (example.guid)) + logging.info("tokens: %s", + " ".join([tokenization.printable_text(x) for x in tokens])) + logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logging.info("label: %s (id = %s)", example.label, str(label_id)) + logging.info("weight: %s", example.weight) + logging.info("example_id: %s", example.example_id) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True, + weight=example.weight, + example_id=example.example_id) + + return feature + + +def file_based_convert_examples_to_features(examples, + label_list, + max_seq_length, + tokenizer, + output_file, + label_type=None, + featurize_fn=None): + """Convert a set of `InputExample`s to a TFRecord file.""" + + tf.io.gfile.makedirs(os.path.dirname(output_file)) + writer = tf.io.TFRecordWriter(output_file) + + for ex_index, example in enumerate(examples): + if ex_index % 10000 == 0: + logging.info("Writing example %d of %d", ex_index, len(examples)) + + if featurize_fn: + feature = featurize_fn(ex_index, example, label_list, max_seq_length, + tokenizer) + else: + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + def create_float_feature(values): + f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + if label_type is not None and label_type == float: + features["label_ids"] = create_float_feature([feature.label_id]) + elif feature.label_id is not None: + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + if feature.weight is not None: + features["weight"] = create_float_feature([feature.weight]) + if 
feature.example_id is not None: + features["example_id"] = create_int_feature([feature.example_id]) + else: + features["example_id"] = create_int_feature([ex_index]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def generate_tf_record_from_data_file(processor, + data_dir, + tokenizer, + train_data_output_path=None, + eval_data_output_path=None, + test_data_output_path=None, + max_seq_length=128): + """Generates and saves training data into a tf record file. + + Args: + processor: Input processor object to be used for generating data. Subclass + of `DataProcessor`. + data_dir: Directory that contains train/eval/test data to process. + tokenizer: The tokenizer to be applied on the data. + train_data_output_path: Output to which processed tf record for training + will be saved. + eval_data_output_path: Output to which processed tf record for evaluation + will be saved. + test_data_output_path: Output to which processed tf record for testing + will be saved. Must be a pattern template with {} if processor has + language specific test data. + max_seq_length: Maximum sequence length of the to be generated + training/eval data. + + Returns: + A dictionary containing input meta data. 
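+
+    For example, a classification task typically yields a dictionary like
+    {"processor_type": "MNLI", "train_data_size": 392702,
+    "max_seq_length": 128, "task_type": "bert_classification",
+    "num_labels": 3, "eval_data_size": 9815}; regression tasks instead set
+    "task_type" to "bert_regression" and add a "label_type" entry.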
+  """
+  assert train_data_output_path or eval_data_output_path
+
+  label_list = processor.get_labels()
+  label_type = getattr(processor, "label_type", None)
+  is_regression = getattr(processor, "is_regression", False)
+  has_sample_weights = getattr(processor, "weight_key", False)
+
+  num_training_data = 0
+  if train_data_output_path:
+    train_input_data_examples = processor.get_train_examples(data_dir)
+    file_based_convert_examples_to_features(train_input_data_examples,
+                                            label_list, max_seq_length,
+                                            tokenizer, train_data_output_path,
+                                            label_type,
+                                            processor.featurize_example)
+    num_training_data = len(train_input_data_examples)
+
+  if eval_data_output_path:
+    eval_input_data_examples = processor.get_dev_examples(data_dir)
+    file_based_convert_examples_to_features(eval_input_data_examples,
+                                            label_list, max_seq_length,
+                                            tokenizer, eval_data_output_path,
+                                            label_type,
+                                            processor.featurize_example)
+
+  meta_data = {
+      "processor_type": processor.get_processor_name(),
+      "train_data_size": num_training_data,
+      "max_seq_length": max_seq_length,
+  }
+
+  if test_data_output_path:
+    test_input_data_examples = processor.get_test_examples(data_dir)
+    if isinstance(test_input_data_examples, dict):
+      for language, examples in test_input_data_examples.items():
+        file_based_convert_examples_to_features(
+            examples, label_list, max_seq_length, tokenizer,
+            test_data_output_path.format(language), label_type,
+            processor.featurize_example)
+        meta_data["test_{}_data_size".format(language)] = len(examples)
+    else:
+      file_based_convert_examples_to_features(test_input_data_examples,
+                                              label_list, max_seq_length,
+                                              tokenizer, test_data_output_path,
+                                              label_type,
+                                              processor.featurize_example)
+      meta_data["test_data_size"] = len(test_input_data_examples)
+
+  if is_regression:
+    meta_data["task_type"] = "bert_regression"
+    meta_data["label_type"] = {int: "int", float: "float"}[label_type]
+  else:
+    meta_data["task_type"] = "bert_classification"
+    meta_data["num_labels"] = len(processor.get_labels())
+  if has_sample_weights:
+    meta_data["has_sample_weights"] = True
+
+  if eval_data_output_path:
+    meta_data["eval_data_size"] = len(eval_input_data_examples)
+
+  return meta_data
diff --git a/official/projects/longformer/utils/get_parameters_from_pretrained_pytorch_checkpoint.py b/official/projects/longformer/utils/get_parameters_from_pretrained_pytorch_checkpoint.py
new file mode 100644
index 000000000..d646364b5
--- /dev/null
+++ b/official/projects/longformer/utils/get_parameters_from_pretrained_pytorch_checkpoint.py
@@ -0,0 +1,9 @@
+import transformers
+pretrained_lm = "allenai/longformer-base-4096"
+
+model = transformers.AutoModel.from_pretrained(pretrained_lm)
+
+import pickle
+pickle.dump({
+    n: p.data.numpy()
+for n, p in model.named_parameters()}, open(f"{pretrained_lm.replace('/', '_')}.pk", "wb"))
\ No newline at end of file
diff --git a/official/projects/longformer/utils/longformer_tokenizer_to_tfrecord.py b/official/projects/longformer/utils/longformer_tokenizer_to_tfrecord.py
new file mode 100644
index 000000000..75b2666c2
--- /dev/null
+++ b/official/projects/longformer/utils/longformer_tokenizer_to_tfrecord.py
@@ -0,0 +1,93 @@
+import collections
+import os
+
+import tensorflow as tf
+import transformers
+import datasets
+
+pretrained_lm = "allenai/longformer-base-4096"
+task_name = "mnli"
+save_path = "./"
+
+raw_datasets = datasets.load_dataset("glue", task_name, cache_dir=None)
+label_list = raw_datasets["train"].features["label"].names
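+# For the GLUE MNLI config this resolves to
+# ["entailment", "neutral", "contradiction"], i.e. a 3-way classification task.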
+num_labels = len(label_list) + +tokenizer = transformers.AutoTokenizer.from_pretrained( + pretrained_lm, + use_fast=True, +) + +task_to_keys = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} + +sentence1_key, sentence2_key = task_to_keys[task_name] +padding = "max_length" + +# make sure this is the same with model input size. +max_seq_length = 512 + + +def preprocess_function(examples): + # Tokenize the texts + args = ( + (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) + ) + result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) + return result + +raw_datasets = raw_datasets.map( + preprocess_function, + batched=True, + desc="Running tokenizer on dataset", +) + +train_dataset = raw_datasets["train"] +eval_dataset = raw_datasets["validation_matched" if task_name == "mnli" else "validation"] + +print("train_dataset", train_dataset[0]) + +print("eval_dataset", eval_dataset[0]) + +def file_based_convert_examples_to_features(examples, + output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + tf.io.gfile.makedirs(os.path.dirname(output_file)) + writer = tf.io.TFRecordWriter(output_file) + + for ex_index, example in enumerate(examples): + if ex_index % 10000 == 0: + print(f"Writing example {ex_index} of {len(examples)}") + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + def create_float_feature(values): + f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(example["input_ids"]) + features["input_mask"] = create_int_feature(example["attention_mask"]) + features["segment_ids"] = create_int_feature([0] * len(example["attention_mask"])) + features["label_ids"] = create_int_feature([example["label"]]) + features["is_real_example"] = create_int_feature([1]) + features["example_id"] = create_int_feature([example["idx"]]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + +file_based_convert_examples_to_features(train_dataset, os.path.join(save_path, f"{pretrained_lm.replace('/', '_')}_train.tf_record")) +file_based_convert_examples_to_features(eval_dataset, os.path.join(save_path, f"{pretrained_lm.replace('/', '_')}_eval.tf_record")) + -- GitLab From 78a2d252f261a2558b37abe50943bcfd74fa0210 Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Thu, 24 Feb 2022 21:16:38 -0800 Subject: [PATCH 02/54] add more readme --- official/projects/longformer/README.md | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/official/projects/longformer/README.md b/official/projects/longformer/README.md index 48a97745f..2ca4112dd 100644 --- a/official/projects/longformer/README.md +++ b/official/projects/longformer/README.md @@ -15,9 +15,23 @@ tf model. The pk file can be generated from `utils/get_parameters_from_pretrained_pytorch_checkpoint.py`. There is also a `longformer_tokenizer_to_tfrecord.py` that transformers pytorch longformer tokenized data to tf_records. 
-## Running
+## Steps to Fine-tune on MNLI
+#### Prepare the pre-trained checkpoint
+Option 1. Copy our saved checkpoint of `allenai/longformer-base-4096` from cloud storage:
+```bash
+gsutil cp gs://model-garden-ucsd-zihan/allenai.pk allenai_longformer-base-4096.pk
+```
+Option 2. Create it directly:
+```bash
+python3 utils/get_parameters_from_pretrained_pytorch_checkpoint.py
+```
+#### [Optional] Prepare the input files
+```bash
+python3 utils/longformer_tokenizer_to_tfrecord.py
+```
+#### Training
+Here we use the MNLI training data that we uploaded to cloud storage; you can replace it with the input files you generated in the previous step.
 ```bash
-python utils/get_parameters_from_pretrained_pytorch_checkpoint.py
 TRAIN_DATA=task.train_data.input_path=gs://model-garden-ucsd-zihan/longformer_allenai_mnli_train.tf_record,task.validation_data.input_path=gs://model-garden-ucsd-zihan/longformer_allenai_mnli_eval.tf_record
 PYTHONPATH=/path/to/model/garden \
   python3 train.py \
   --experiment=longformer/glue \
   --config_file=experiments/glue_mnli_allenai.yaml \
   --params_override="${TRAIN_DATA},runtime.distribution_strategy=tpu,task.initial_parameters_from_pk=allenai_longformer-base-4096.pk" \
   --tpu=local \
   --model_dir=/path/to/outputdir \
   --mode=train_and_eval
 ```
+Training should take an hour or two and reach roughly 86% accuracy on MNLI.
\ No newline at end of file
-- 
GitLab


From cadde4ee2bd6ecb25252dfa61c58f140838053d9 Mon Sep 17 00:00:00 2001
From: Zihan Wang
Date: Thu, 24 Feb 2022 21:21:47 -0800
Subject: [PATCH 03/54] remove one file

---
 ...form_longformer_tokenized_into_tfrecord.py | 1592 -----------------
 1 file changed, 1592 deletions(-)
 delete mode 100644 official/projects/longformer/transform_longformer_tokenized_into_tfrecord.py

diff --git a/official/projects/longformer/transform_longformer_tokenized_into_tfrecord.py b/official/projects/longformer/transform_longformer_tokenized_into_tfrecord.py
deleted file mode 100644
index 0ba9dcf9a..000000000
--- a/official/projects/longformer/transform_longformer_tokenized_into_tfrecord.py
+++ /dev/null
@@ -1,1592 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""BERT library to process data for classification task."""
-
-import collections
-import csv
-import importlib
-import json
-import os
-
-from absl import logging
-import tensorflow as tf
-import tensorflow_datasets as tfds
-
-from official.nlp.bert import tokenization
-
-
-class InputExample(object):
-  """A single training/test example for simple seq regression/classification."""
-
-  def __init__(self,
-               guid,
-               text_a,
-               text_b=None,
-               label=None,
-               weight=None,
-               example_id=None):
-    """Constructs a InputExample.
-
-    Args:
-      guid: Unique id for the example.
-      text_a: string. The untokenized text of the first sequence. For single
-        sequence tasks, only this sequence must be specified.
-      text_b: (Optional) string. The untokenized text of the second sequence.
-        Only must be specified for sequence pair tasks.
-      label: (Optional) string for classification, float for regression. The
-        label of the example. This should be specified for train and dev
-        examples, but not for test examples.
-      weight: (Optional) float.
The weight of the example to be used during - training. - example_id: (Optional) int. The int identification number of example in - the corpus. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - self.weight = weight - self.example_id = example_id - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - input_ids, - input_mask, - segment_ids, - label_id, - is_real_example=True, - weight=None, - example_id=None): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - self.is_real_example = is_real_example - self.weight = weight - self.example_id = example_id - - -class DataProcessor(object): - """Base class for converters for seq regression/classification datasets.""" - - def __init__(self, process_text_fn=tokenization.convert_to_unicode): - self.process_text_fn = process_text_fn - self.is_regression = False - self.label_type = None - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_test_examples(self, data_dir): - """Gets a collection of `InputExample`s for prediction.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @staticmethod - def get_processor_name(): - """Gets the string identifier of the processor.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with tf.io.gfile.GFile(input_file, "r") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines - - @classmethod - def _read_jsonl(cls, input_file): - """Reads a json line file.""" - with tf.io.gfile.GFile(input_file, "r") as f: - lines = [] - for json_str in f: - lines.append(json.loads(json_str)) - return lines - - def featurize_example(self, *kargs, **kwargs): - """Converts a single `InputExample` into a single `InputFeatures`.""" - return convert_single_example(*kargs, **kwargs) - - -class DefaultGLUEDataProcessor(DataProcessor): - """Processor for the SuperGLUE dataset.""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples_tfds("train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples_tfds("validation") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples_tfds("test") - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - raise NotImplementedError() - - -class AxProcessor(DataProcessor): - """Processor for the AX dataset (GLUE diagnostics dataset).""" - - def get_train_examples(self, data_dir): - """See base class.""" - train_mnli_dataset = tfds.load( - "glue/mnli", split="train", try_gcs=True).as_numpy_iterator() - return self._create_examples_tfds(train_mnli_dataset, "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - val_mnli_dataset = tfds.load( - "glue/mnli", split="validation_matched", - try_gcs=True).as_numpy_iterator() - return self._create_examples_tfds(val_mnli_dataset, "validation") - - def get_test_examples(self, data_dir): - """See base class.""" - 
test_ax_dataset = tfds.load( - "glue/ax", split="test", try_gcs=True).as_numpy_iterator() - return self._create_examples_tfds(test_ax_dataset, "test") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "AX" - - def _create_examples_tfds(self, dataset, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = "contradiction" - text_a = self.process_text_fn(example["hypothesis"]) - text_b = self.process_text_fn(example["premise"]) - if set_type != "test": - label = self.get_labels()[example["label"]] - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label, - weight=None)) - return examples - - -class ColaProcessor(DefaultGLUEDataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "COLA" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "glue/cola", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = "0" - text_a = self.process_text_fn(example["sentence"]) - if set_type != "test": - label = str(example["label"]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=None, label=label, weight=None)) - return examples - - -class ImdbProcessor(DataProcessor): - """Processor for the IMDb dataset.""" - - def get_labels(self): - return ["neg", "pos"] - - def get_train_examples(self, data_dir): - return self._create_examples(os.path.join(data_dir, "train")) - - def get_dev_examples(self, data_dir): - return self._create_examples(os.path.join(data_dir, "test")) - - @staticmethod - def get_processor_name(): - """See base class.""" - return "IMDB" - - def _create_examples(self, data_dir): - """Creates examples.""" - examples = [] - for label in ["neg", "pos"]: - cur_dir = os.path.join(data_dir, label) - for filename in tf.io.gfile.listdir(cur_dir): - if not filename.endswith("txt"): - continue - - if len(examples) % 1000 == 0: - logging.info("Loading dev example %d", len(examples)) - - path = os.path.join(cur_dir, filename) - with tf.io.gfile.GFile(path, "r") as f: - text = f.read().strip().replace("
", " ") - examples.append( - InputExample( - guid="unused_id", text_a=text, text_b=None, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def __init__(self, - mnli_type="matched", - process_text_fn=tokenization.convert_to_unicode): - super(MnliProcessor, self).__init__(process_text_fn) - self.dataset = tfds.load("glue/mnli", try_gcs=True) - if mnli_type not in ("matched", "mismatched"): - raise ValueError("Invalid `mnli_type`: %s" % mnli_type) - self.mnli_type = mnli_type - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples_tfds("train") - - def get_dev_examples(self, data_dir): - """See base class.""" - if self.mnli_type == "matched": - return self._create_examples_tfds("validation_matched") - else: - return self._create_examples_tfds("validation_mismatched") - - def get_test_examples(self, data_dir): - """See base class.""" - if self.mnli_type == "matched": - return self._create_examples_tfds("test_matched") - else: - return self._create_examples_tfds("test_mismatched") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "MNLI" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "glue/mnli", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = "contradiction" - text_a = self.process_text_fn(example["hypothesis"]) - text_b = self.process_text_fn(example["premise"]) - if set_type != "test": - label = self.get_labels()[example["label"]] - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label, - weight=None)) - return examples - - -class MrpcProcessor(DefaultGLUEDataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "MRPC" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "glue/mrpc", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = "0" - text_a = self.process_text_fn(example["sentence1"]) - text_b = self.process_text_fn(example["sentence2"]) - if set_type != "test": - label = str(example["label"]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label, - weight=None)) - return examples - - -class PawsxProcessor(DataProcessor): - """Processor for the PAWS-X data set.""" - supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"] - - def __init__(self, - language="en", - process_text_fn=tokenization.convert_to_unicode): - super(PawsxProcessor, self).__init__(process_text_fn) - if language == "all": - self.languages = PawsxProcessor.supported_languages - elif language not in PawsxProcessor.supported_languages: - raise ValueError("language %s is not supported for PAWS-X task." % - language) - else: - self.languages = [language] - - def get_train_examples(self, data_dir): - """See base class.""" - lines = [] - for language in self.languages: - if language == "en": - train_tsv = "train.tsv" - else: - train_tsv = "translated_train.tsv" - # Skips the header. 
- lines.extend( - self._read_tsv(os.path.join(data_dir, language, train_tsv))[1:]) - - examples = [] - for i, line in enumerate(lines): - guid = "train-%d" % i - text_a = self.process_text_fn(line[1]) - text_b = self.process_text_fn(line[2]) - label = self.process_text_fn(line[3]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_dev_examples(self, data_dir): - """See base class.""" - lines = [] - for lang in PawsxProcessor.supported_languages: - lines.extend( - self._read_tsv(os.path.join(data_dir, lang, "dev_2k.tsv"))[1:]) - - examples = [] - for i, line in enumerate(lines): - guid = "dev-%d" % i - text_a = self.process_text_fn(line[1]) - text_b = self.process_text_fn(line[2]) - label = self.process_text_fn(line[3]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_test_examples(self, data_dir): - """See base class.""" - examples_by_lang = {k: [] for k in self.supported_languages} - for lang in self.supported_languages: - lines = self._read_tsv(os.path.join(data_dir, lang, "test_2k.tsv"))[1:] - for i, line in enumerate(lines): - guid = "test-%d" % i - text_a = self.process_text_fn(line[1]) - text_b = self.process_text_fn(line[2]) - label = self.process_text_fn(line[3]) - examples_by_lang[lang].append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples_by_lang - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "XTREME-PAWS-X" - - -class QnliProcessor(DefaultGLUEDataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "QNLI" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "glue/qnli", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = "entailment" - text_a = self.process_text_fn(example["question"]) - text_b = self.process_text_fn(example["sentence"]) - if set_type != "test": - label = self.get_labels()[example["label"]] - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label, - weight=None)) - return examples - - -class QqpProcessor(DefaultGLUEDataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "QQP" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "glue/qqp", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = "0" - text_a = self.process_text_fn(example["question1"]) - text_b = self.process_text_fn(example["question2"]) - if set_type != "test": - label = str(example["label"]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label, - weight=None)) - return examples - - -class RteProcessor(DefaultGLUEDataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_labels(self): - """See base class.""" - # All datasets are converted to 
2-class split, where for 3-class datasets we - # collapse neutral and contradiction into not_entailment. - return ["entailment", "not_entailment"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "RTE" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "glue/rte", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = "entailment" - text_a = self.process_text_fn(example["sentence1"]) - text_b = self.process_text_fn(example["sentence2"]) - if set_type != "test": - label = self.get_labels()[example["label"]] - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label, - weight=None)) - return examples - - -class SstProcessor(DefaultGLUEDataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "SST-2" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "glue/sst2", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = "0" - text_a = self.process_text_fn(example["sentence"]) - if set_type != "test": - label = str(example["label"]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=None, label=label, weight=None)) - return examples - - -class StsBProcessor(DefaultGLUEDataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def __init__(self, process_text_fn=tokenization.convert_to_unicode): - super(StsBProcessor, self).__init__(process_text_fn=process_text_fn) - self.is_regression = True - self.label_type = float - self._labels = None - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "glue/stsb", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = 0.0 - text_a = self.process_text_fn(example["sentence1"]) - text_b = self.process_text_fn(example["sentence2"]) - if set_type != "test": - label = self.label_type(example["label"]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label, - weight=None)) - return examples - - def get_labels(self): - """See base class.""" - return self._labels - - @staticmethod - def get_processor_name(): - """See base class.""" - return "STS-B" - - -class TfdsProcessor(DataProcessor): - """Processor for generic text classification and regression TFDS data set. - - The TFDS parameters are expected to be provided in the tfds_params string, in - a comma-separated list of parameter assignments. 
- Examples: - tfds_params="dataset=scicite,text_key=string" - tfds_params="dataset=imdb_reviews,test_split=,dev_split=test" - tfds_params="dataset=glue/cola,text_key=sentence" - tfds_params="dataset=glue/sst2,text_key=sentence" - tfds_params="dataset=glue/qnli,text_key=question,text_b_key=sentence" - tfds_params="dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2" - tfds_params="dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2," - "is_regression=true,label_type=float" - tfds_params="dataset=snli,text_key=premise,text_b_key=hypothesis," - "skip_label=-1" - Possible parameters (please refer to the documentation of Tensorflow Datasets - (TFDS) for the meaning of individual parameters): - dataset: Required dataset name (potentially with subset and version number). - data_dir: Optional TFDS source root directory. - module_import: Optional Dataset module to import. - train_split: Name of the train split (defaults to `train`). - dev_split: Name of the dev split (defaults to `validation`). - test_split: Name of the test split (defaults to `test`). - text_key: Key of the text_a feature (defaults to `text`). - text_b_key: Key of the second text feature if available. - label_key: Key of the label feature (defaults to `label`). - test_text_key: Key of the text feature to use in test set. - test_text_b_key: Key of the second text feature to use in test set. - test_label: String to be used as the label for all test examples. - label_type: Type of the label key (defaults to `int`). - weight_key: Key of the float sample weight (is not used if not provided). - is_regression: Whether the task is a regression problem (defaults to False). - skip_label: Skip examples with given label (defaults to None). - """ - - def __init__(self, - tfds_params, - process_text_fn=tokenization.convert_to_unicode): - super(TfdsProcessor, self).__init__(process_text_fn) - self._process_tfds_params_str(tfds_params) - if self.module_import: - importlib.import_module(self.module_import) - - self.dataset, info = tfds.load( - self.dataset_name, data_dir=self.data_dir, with_info=True) - if self.is_regression: - self._labels = None - else: - self._labels = list(range(info.features[self.label_key].num_classes)) - - def _process_tfds_params_str(self, params_str): - """Extracts TFDS parameters from a comma-separated assignements string.""" - dtype_map = {"int": int, "float": float} - cast_str_to_bool = lambda s: s.lower() not in ["false", "0"] - - tuples = [x.split("=") for x in params_str.split(",")] - d = {k.strip(): v.strip() for k, v in tuples} - self.dataset_name = d["dataset"] # Required. 
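
For reference, the comma-separated `tfds_params` strings shown in the docstring above reduce to a plain dict; below is a minimal standalone sketch that mirrors the `split(",")` / `split("=")` logic of `_process_tfds_params_str` for a few of the defaults. The helper name `parse_tfds_params` is invented for the illustration and is not part of the library.

```python
# Illustrative only: a standalone rerun of the comma/equals parsing used by
# _process_tfds_params_str above. parse_tfds_params is not a library symbol.
def parse_tfds_params(params_str):
  dtype_map = {"int": int, "float": float}
  tuples = [x.split("=") for x in params_str.split(",")]
  d = {k.strip(): v.strip() for k, v in tuples}
  return {
      "dataset": d["dataset"],  # Required.
      "text_key": d.get("text_key", "text"),
      "text_b_key": d.get("text_b_key", None),
      "label_type": dtype_map[d.get("label_type", "int")],
      "is_regression": d.get("is_regression", "False").lower() not in ["false", "0"],
  }


print(parse_tfds_params("dataset=glue/sst2,text_key=sentence"))
# {'dataset': 'glue/sst2', 'text_key': 'sentence', 'text_b_key': None,
#  'label_type': <class 'int'>, 'is_regression': False}
```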
- self.data_dir = d.get("data_dir", None) - self.module_import = d.get("module_import", None) - self.train_split = d.get("train_split", "train") - self.dev_split = d.get("dev_split", "validation") - self.test_split = d.get("test_split", "test") - self.text_key = d.get("text_key", "text") - self.text_b_key = d.get("text_b_key", None) - self.label_key = d.get("label_key", "label") - self.test_text_key = d.get("test_text_key", self.text_key) - self.test_text_b_key = d.get("test_text_b_key", self.text_b_key) - self.test_label = d.get("test_label", "test_example") - self.label_type = dtype_map[d.get("label_type", "int")] - self.is_regression = cast_str_to_bool(d.get("is_regression", "False")) - self.weight_key = d.get("weight_key", None) - self.skip_label = d.get("skip_label", None) - if self.skip_label is not None: - self.skip_label = self.label_type(self.skip_label) - - def get_train_examples(self, data_dir): - assert data_dir is None - return self._create_examples(self.train_split, "train") - - def get_dev_examples(self, data_dir): - assert data_dir is None - return self._create_examples(self.dev_split, "dev") - - def get_test_examples(self, data_dir): - assert data_dir is None - return self._create_examples(self.test_split, "test") - - def get_labels(self): - return self._labels - - def get_processor_name(self): - return "TFDS_" + self.dataset_name - - def _create_examples(self, split_name, set_type): - """Creates examples for the training/dev/test sets.""" - if split_name not in self.dataset: - raise ValueError("Split {} not available.".format(split_name)) - dataset = self.dataset[split_name].as_numpy_iterator() - examples = [] - text_b, weight = None, None - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - if set_type == "test": - text_a = self.process_text_fn(example[self.test_text_key]) - if self.test_text_b_key: - text_b = self.process_text_fn(example[self.test_text_b_key]) - label = self.test_label - else: - text_a = self.process_text_fn(example[self.text_key]) - if self.text_b_key: - text_b = self.process_text_fn(example[self.text_b_key]) - label = self.label_type(example[self.label_key]) - if self.skip_label is not None and label == self.skip_label: - continue - if self.weight_key: - weight = float(example[self.weight_key]) - examples.append( - InputExample( - guid=guid, - text_a=text_a, - text_b=text_b, - label=label, - weight=weight)) - return examples - - -class WnliProcessor(DefaultGLUEDataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "WNLI" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "glue/wnli", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - label = "0" - text_a = self.process_text_fn(example["sentence1"]) - text_b = self.process_text_fn(example["sentence2"]) - if set_type != "test": - label = str(example["label"]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label, - weight=None)) - return examples - - -class XnliProcessor(DataProcessor): - """Processor for the XNLI data set.""" - supported_languages = [ - "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", - "ur", "vi", "zh" - ] - - def __init__(self, - language="en", - 
process_text_fn=tokenization.convert_to_unicode): - super(XnliProcessor, self).__init__(process_text_fn) - if language == "all": - self.languages = XnliProcessor.supported_languages - elif language not in XnliProcessor.supported_languages: - raise ValueError("language %s is not supported for XNLI task." % language) - else: - self.languages = [language] - - def get_train_examples(self, data_dir): - """See base class.""" - lines = [] - for language in self.languages: - # Skips the header. - lines.extend( - self._read_tsv( - os.path.join(data_dir, "multinli", - "multinli.train.%s.tsv" % language))[1:]) - - examples = [] - for i, line in enumerate(lines): - guid = "train-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - if label == self.process_text_fn("contradictory"): - label = self.process_text_fn("contradiction") - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_dev_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) - examples = [] - for i, line in enumerate(lines): - if i == 0: - continue - guid = "dev-%d" % i - text_a = self.process_text_fn(line[6]) - text_b = self.process_text_fn(line[7]) - label = self.process_text_fn(line[1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_test_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv")) - examples_by_lang = {k: [] for k in XnliProcessor.supported_languages} - for i, line in enumerate(lines): - if i == 0: - continue - guid = "test-%d" % i - language = self.process_text_fn(line[0]) - text_a = self.process_text_fn(line[6]) - text_b = self.process_text_fn(line[7]) - label = self.process_text_fn(line[1]) - examples_by_lang[language].append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples_by_lang - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "XNLI" - - -class XtremePawsxProcessor(DataProcessor): - """Processor for the XTREME PAWS-X data set.""" - supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"] - - def __init__(self, - process_text_fn=tokenization.convert_to_unicode, - translated_data_dir=None, - only_use_en_dev=True): - """See base class. - - Args: - process_text_fn: See base class. - translated_data_dir: If specified, will also include translated data in - the training and testing data. - only_use_en_dev: If True, only use english dev data. Otherwise, use dev - data from all languages. 
- """ - super(XtremePawsxProcessor, self).__init__(process_text_fn) - self.translated_data_dir = translated_data_dir - self.only_use_en_dev = only_use_en_dev - - def get_train_examples(self, data_dir): - """See base class.""" - examples = [] - if self.translated_data_dir is None: - lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv")) - for i, line in enumerate(lines): - guid = "train-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - else: - for lang in self.supported_languages: - lines = self._read_tsv( - os.path.join(self.translated_data_dir, "translate-train", - f"en-{lang}-translated.tsv")) - for i, line in enumerate(lines): - guid = f"train-{lang}-{i}" - text_a = self.process_text_fn(line[2]) - text_b = self.process_text_fn(line[3]) - label = self.process_text_fn(line[4]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_dev_examples(self, data_dir): - """See base class.""" - examples = [] - if self.only_use_en_dev: - lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv")) - for i, line in enumerate(lines): - guid = "dev-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - else: - for lang in self.supported_languages: - lines = self._read_tsv(os.path.join(data_dir, f"dev-{lang}.tsv")) - for i, line in enumerate(lines): - guid = f"dev-{lang}-{i}" - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_test_examples(self, data_dir): - """See base class.""" - examples_by_lang = {} - for lang in self.supported_languages: - examples_by_lang[lang] = [] - lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv")) - for i, line in enumerate(lines): - guid = f"test-{lang}-{i}" - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = "0" - examples_by_lang[lang].append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - if self.translated_data_dir is not None: - for lang in self.supported_languages: - if lang == "en": - continue - examples_by_lang[f"{lang}-en"] = [] - lines = self._read_tsv( - os.path.join(self.translated_data_dir, "translate-test", - f"test-{lang}-en-translated.tsv")) - for i, line in enumerate(lines): - guid = f"test-{lang}-en-{i}" - text_a = self.process_text_fn(line[2]) - text_b = self.process_text_fn(line[3]) - label = "0" - examples_by_lang[f"{lang}-en"].append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples_by_lang - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "XTREME-PAWS-X" - - -class XtremeXnliProcessor(DataProcessor): - """Processor for the XTREME XNLI data set.""" - supported_languages = [ - "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", - "ur", "vi", "zh" - ] - - def __init__(self, - process_text_fn=tokenization.convert_to_unicode, - translated_data_dir=None, - only_use_en_dev=True): - """See base class. 
- - Args: - process_text_fn: See base class. - translated_data_dir: If specified, will also include translated data in - the training data. - only_use_en_dev: If True, only use english dev data. Otherwise, use dev - data from all languages. - """ - super(XtremeXnliProcessor, self).__init__(process_text_fn) - self.translated_data_dir = translated_data_dir - self.only_use_en_dev = only_use_en_dev - - def get_train_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv")) - - examples = [] - if self.translated_data_dir is None: - for i, line in enumerate(lines): - guid = "train-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - if label == self.process_text_fn("contradictory"): - label = self.process_text_fn("contradiction") - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - else: - for lang in self.supported_languages: - lines = self._read_tsv( - os.path.join(self.translated_data_dir, "translate-train", - f"en-{lang}-translated.tsv")) - for i, line in enumerate(lines): - guid = f"train-{lang}-{i}" - text_a = self.process_text_fn(line[2]) - text_b = self.process_text_fn(line[3]) - label = self.process_text_fn(line[4]) - if label == self.process_text_fn("contradictory"): - label = self.process_text_fn("contradiction") - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_dev_examples(self, data_dir): - """See base class.""" - examples = [] - if self.only_use_en_dev: - lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv")) - for i, line in enumerate(lines): - guid = "dev-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - else: - for lang in self.supported_languages: - lines = self._read_tsv(os.path.join(data_dir, f"dev-{lang}.tsv")) - for i, line in enumerate(lines): - guid = f"dev-{lang}-{i}" - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - if label == self.process_text_fn("contradictory"): - label = self.process_text_fn("contradiction") - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_test_examples(self, data_dir): - """See base class.""" - examples_by_lang = {} - for lang in self.supported_languages: - examples_by_lang[lang] = [] - lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv")) - for i, line in enumerate(lines): - guid = f"test-{lang}-{i}" - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = "contradiction" - examples_by_lang[lang].append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - if self.translated_data_dir is not None: - for lang in self.supported_languages: - if lang == "en": - continue - examples_by_lang[f"{lang}-en"] = [] - lines = self._read_tsv( - os.path.join(self.translated_data_dir, "translate-test", - f"test-{lang}-en-translated.tsv")) - for i, line in enumerate(lines): - guid = f"test-{lang}-en-{i}" - text_a = self.process_text_fn(line[2]) - text_b = self.process_text_fn(line[3]) - label = "contradiction" - examples_by_lang[f"{lang}-en"].append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return 
examples_by_lang - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "XTREME-XNLI" - - -def convert_single_example(ex_index, example, label_list, max_seq_length, - tokenizer): - """Converts a single `InputExample` into a single `InputFeatures`.""" - label_map = {} - if label_list: - for (i, label) in enumerate(label_list): - label_map[label] = i - - tokens_a = tokenizer.tokenize(example.text_a) - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] - - seg_id_a = 0 - seg_id_b = 1 - seg_id_cls = 0 - seg_id_pad = 0 - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(seg_id_cls) - for token in tokens_a: - tokens.append(token) - segment_ids.append(seg_id_a) - tokens.append("[SEP]") - segment_ids.append(seg_id_a) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - segment_ids.append(seg_id_b) - tokens.append("[SEP]") - segment_ids.append(seg_id_b) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. 
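
To make the packing convention above concrete, here is a small self-contained sketch of the `[CLS]`/`[SEP]` layout and the zero-padding loop that follows below; the toy token list and vocabulary are invented for the example and stand in for a real WordPiece tokenizer.

```python
# Toy illustration of the [CLS]/[SEP] packing and zero-padding performed by
# convert_single_example. The vocabulary here is invented for the example.
max_seq_length = 12
tokens_a = ["the", "dog", "is", "hairy", "."]
tokens_b = ["it", "barks", "."]

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

vocab = {tok: i for i, tok in enumerate(sorted(set(tokens)), start=1)}
input_ids = [vocab[t] for t in tokens]
input_mask = [1] * len(input_ids)  # 1 = real token, 0 = padding

while len(input_ids) < max_seq_length:  # zero-pad, as in the loop below
  input_ids.append(0)
  input_mask.append(0)
  segment_ids.append(0)

assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length
```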
- while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(seg_id_pad) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - label_id = label_map[example.label] if label_map else example.label - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s", (example.guid)) - logging.info("tokens: %s", - " ".join([tokenization.printable_text(x) for x in tokens])) - logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) - logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) - logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) - logging.info("label: %s (id = %s)", example.label, str(label_id)) - logging.info("weight: %s", example.weight) - logging.info("example_id: %s", example.example_id) - - feature = InputFeatures( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id, - is_real_example=True, - weight=example.weight, - example_id=example.example_id) - - return feature - - -class AXgProcessor(DataProcessor): - """Processor for the AXg dataset (SuperGLUE diagnostics dataset).""" - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "AX-g.jsonl")), "test") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "AXg" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for line in lines: - guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"]))) - text_a = self.process_text_fn(line["premise"]) - text_b = self.process_text_fn(line["hypothesis"]) - label = self.process_text_fn(line["label"]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class BoolQProcessor(DefaultGLUEDataProcessor): - """Processor for the BoolQ dataset (SuperGLUE diagnostics dataset).""" - - def get_labels(self): - """See base class.""" - return ["True", "False"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "BoolQ" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "super_glue/boolq", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for example in dataset: - guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"]))) - text_a = self.process_text_fn(example["question"]) - text_b = self.process_text_fn(example["passage"]) - label = "False" - if set_type != "test": - label = self.get_labels()[example["label"]] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class CBProcessor(DefaultGLUEDataProcessor): - """Processor for the CB dataset (SuperGLUE diagnostics dataset).""" - - def get_labels(self): - """See base class.""" - return ["entailment", "neutral", "contradiction"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "CB" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - dataset = tfds.load( - "super_glue/cb", split=set_type, try_gcs=True).as_numpy_iterator() - examples = [] - for example in dataset: - guid = "%s-%s" % (set_type, 
self.process_text_fn(str(example["idx"]))) - text_a = self.process_text_fn(example["premise"]) - text_b = self.process_text_fn(example["hypothesis"]) - label = "entailment" - if set_type != "test": - label = self.get_labels()[example["label"]] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class SuperGLUERTEProcessor(DefaultGLUEDataProcessor): - """Processor for the RTE dataset (SuperGLUE version).""" - - def get_labels(self): - """See base class.""" - # All datasets are converted to 2-class split, where for 3-class datasets we - # collapse neutral and contradiction into not_entailment. - return ["entailment", "not_entailment"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "RTESuperGLUE" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - dataset = tfds.load( - "super_glue/rte", split=set_type, try_gcs=True).as_numpy_iterator() - for example in dataset: - guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"]))) - text_a = self.process_text_fn(example["premise"]) - text_b = self.process_text_fn(example["hypothesis"]) - label = "entailment" - if set_type != "test": - label = self.get_labels()[example["label"]] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WiCInputExample(InputExample): - """Processor for the WiC dataset (SuperGLUE version).""" - - def __init__(self, - guid, - text_a, - text_b=None, - label=None, - word=None, - weight=None, - example_id=None): - """A single training/test example for simple seq regression/classification.""" - super(WiCInputExample, self).__init__(guid, text_a, text_b, label, weight, - example_id) - self.word = word - - -class WiCProcessor(DefaultGLUEDataProcessor): - """Processor for the RTE dataset (SuperGLUE version).""" - - def get_labels(self): - """Not used.""" - return [] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "RTESuperGLUE" - - def _create_examples_tfds(self, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - dataset = tfds.load( - "super_glue/wic", split=set_type, try_gcs=True).as_numpy_iterator() - for example in dataset: - guid = "%s-%s" % (set_type, self.process_text_fn(str(example["idx"]))) - text_a = self.process_text_fn(example["sentence1"]) - text_b = self.process_text_fn(example["sentence2"]) - word = self.process_text_fn(example["word"]) - label = 0 - if set_type != "test": - label = example["label"] - examples.append( - WiCInputExample( - guid=guid, text_a=text_a, text_b=text_b, word=word, label=label)) - return examples - - def featurize_example(self, ex_index, example, label_list, max_seq_length, - tokenizer): - """Here we concate sentence1, sentence2, word together with [SEP] tokens.""" - del label_list - tokens_a = tokenizer.tokenize(example.text_a) - tokens_b = tokenizer.tokenize(example.text_b) - tokens_word = tokenizer.tokenize(example.word) - - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP], [SEP] with "- 4" - # Here we only pop out the first two sentence tokens. 
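
The `_truncate_seq_pair` call that follows relies on the longest-first heuristic implemented further down in this file; a minimal standalone sketch of that behaviour (the helper is renamed here to keep the example self-contained):

```python
# Sketch of the longest-first truncation used by _truncate_seq_pair: keep
# dropping the last token of whichever sequence is currently longer.
def truncate_pair(tokens_a, tokens_b, max_length):
  while len(tokens_a) + len(tokens_b) > max_length:
    longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
    longer.pop()


a = ["tok%d" % i for i in range(8)]
b = ["tok%d" % i for i in range(3)]
truncate_pair(a, b, max_length=7)
print(len(a), len(b))  # 4 3
```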
- _truncate_seq_pair(tokens_a, tokens_b, - max_seq_length - 4 - len(tokens_word)) - - seg_id_a = 0 - seg_id_b = 1 - seg_id_c = 2 - seg_id_cls = 0 - seg_id_pad = 0 - - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(seg_id_cls) - for token in tokens_a: - tokens.append(token) - segment_ids.append(seg_id_a) - tokens.append("[SEP]") - segment_ids.append(seg_id_a) - - for token in tokens_b: - tokens.append(token) - segment_ids.append(seg_id_b) - - tokens.append("[SEP]") - segment_ids.append(seg_id_b) - - for token in tokens_word: - tokens.append(token) - segment_ids.append(seg_id_c) - - tokens.append("[SEP]") - segment_ids.append(seg_id_c) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(seg_id_pad) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - label_id = example.label - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s", (example.guid)) - logging.info("tokens: %s", - " ".join([tokenization.printable_text(x) for x in tokens])) - logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) - logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) - logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) - logging.info("label: %s (id = %s)", example.label, str(label_id)) - logging.info("weight: %s", example.weight) - logging.info("example_id: %s", example.example_id) - - feature = InputFeatures( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id, - is_real_example=True, - weight=example.weight, - example_id=example.example_id) - - return feature - - -def file_based_convert_examples_to_features(examples, - label_list, - max_seq_length, - tokenizer, - output_file, - label_type=None, - featurize_fn=None): - """Convert a set of `InputExample`s to a TFRecord file.""" - - tf.io.gfile.makedirs(os.path.dirname(output_file)) - writer = tf.io.TFRecordWriter(output_file) - - for ex_index, example in enumerate(examples): - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d", ex_index, len(examples)) - - if featurize_fn: - feature = featurize_fn(ex_index, example, label_list, max_seq_length, - tokenizer) - else: - feature = convert_single_example(ex_index, example, label_list, - max_seq_length, tokenizer) - - def create_int_feature(values): - f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return f - - def create_float_feature(values): - f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) - return f - - features = collections.OrderedDict() - features["input_ids"] = create_int_feature(feature.input_ids) - features["input_mask"] = create_int_feature(feature.input_mask) - features["segment_ids"] = create_int_feature(feature.segment_ids) - if label_type is not None and label_type == float: - features["label_ids"] = create_float_feature([feature.label_id]) - elif feature.label_id is not None: - features["label_ids"] = create_int_feature([feature.label_id]) - features["is_real_example"] = create_int_feature( - [int(feature.is_real_example)]) - if feature.weight is not None: - features["weight"] = create_float_feature([feature.weight]) - if 
feature.example_id is not None: - features["example_id"] = create_int_feature([feature.example_id]) - else: - features["example_id"] = create_int_feature([ex_index]) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - writer.write(tf_example.SerializeToString()) - writer.close() - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def generate_tf_record_from_data_file(processor, - data_dir, - tokenizer, - train_data_output_path=None, - eval_data_output_path=None, - test_data_output_path=None, - max_seq_length=128): - """Generates and saves training data into a tf record file. - - Args: - processor: Input processor object to be used for generating data. Subclass - of `DataProcessor`. - data_dir: Directory that contains train/eval/test data to process. - tokenizer: The tokenizer to be applied on the data. - train_data_output_path: Output to which processed tf record for training - will be saved. - eval_data_output_path: Output to which processed tf record for evaluation - will be saved. - test_data_output_path: Output to which processed tf record for testing - will be saved. Must be a pattern template with {} if processor has - language specific test data. - max_seq_length: Maximum sequence length of the to be generated - training/eval data. - - Returns: - A dictionary containing input meta data. 
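
As an illustration of that returned dictionary, a two-class classification processor such as QNLI would produce roughly the following (the numeric sizes are placeholders, not real dataset counts):

```python
# Placeholder example of the meta data assembled below for a classification
# task; the sizes are made up for illustration only.
meta_data = {
    "processor_type": "QNLI",
    "train_data_size": 100000,
    "max_seq_length": 128,
    "task_type": "bert_classification",
    "num_labels": 2,
    "eval_data_size": 5000,
}
```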
- """ - assert train_data_output_path or eval_data_output_path - - label_list = processor.get_labels() - label_type = getattr(processor, "label_type", None) - is_regression = getattr(processor, "is_regression", False) - has_sample_weights = getattr(processor, "weight_key", False) - - num_training_data = 0 - if train_data_output_path: - train_input_data_examples = processor.get_train_examples(data_dir) - file_based_convert_examples_to_features(train_input_data_examples, - label_list, max_seq_length, - tokenizer, train_data_output_path, - label_type, - processor.featurize_example) - num_training_data = len(train_input_data_examples) - - if eval_data_output_path: - eval_input_data_examples = processor.get_dev_examples(data_dir) - file_based_convert_examples_to_features(eval_input_data_examples, - label_list, max_seq_length, - tokenizer, eval_data_output_path, - label_type, - processor.featurize_example) - - meta_data = { - "processor_type": processor.get_processor_name(), - "train_data_size": num_training_data, - "max_seq_length": max_seq_length, - } - - if test_data_output_path: - test_input_data_examples = processor.get_test_examples(data_dir) - if isinstance(test_input_data_examples, dict): - for language, examples in test_input_data_examples.items(): - file_based_convert_examples_to_features( - examples, label_list, max_seq_length, tokenizer, - test_data_output_path.format(language), label_type, - processor.featurize_example) - meta_data["test_{}_data_size".format(language)] = len(examples) - else: - file_based_convert_examples_to_features(test_input_data_examples, - label_list, max_seq_length, - tokenizer, test_data_output_path, - label_type, - processor.featurize_example) - meta_data["test_data_size"] = len(test_input_data_examples) - - if is_regression: - meta_data["task_type"] = "bert_regression" - meta_data["label_type"] = {int: "int", float: "float"}[label_type] - else: - meta_data["task_type"] = "bert_classification" - meta_data["num_labels"] = len(processor.get_labels()) - if has_sample_weights: - meta_data["has_sample_weights"] = True - - if eval_data_output_path: - meta_data["eval_data_size"] = len(eval_input_data_examples) - - return meta_data -- GitLab From 8c430b98f87e50ddc96e119d9585a2aaa7f96f0b Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Fri, 25 Feb 2022 14:51:59 -0800 Subject: [PATCH 04/54] fix docstrings --- official/projects/longformer/longformer.py | 2 + .../projects/longformer/longformer_encoder.py | 43 ++++++++----------- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/official/projects/longformer/longformer.py b/official/projects/longformer/longformer.py index 1237eee1b..c657da29e 100644 --- a/official/projects/longformer/longformer.py +++ b/official/projects/longformer/longformer.py @@ -30,9 +30,11 @@ class LongformerEncoderConfig(encoders.BertEncoderConfig): Args: attention_window: list of ints representing the window size for each layer. global_attention_size: the size of global attention used for each token. 
+ pad_token_id: the token id for the pad token ''' attention_window: List[int] = dataclasses.field(default_factory=list) global_attention_size: int = 0 + pad_token_id: int = 1 @gin.configurable @base_config.bind(LongformerEncoderConfig) diff --git a/official/projects/longformer/longformer_encoder.py b/official/projects/longformer/longformer_encoder.py index 11ad01646..5e46596e0 100644 --- a/official/projects/longformer/longformer_encoder.py +++ b/official/projects/longformer/longformer_encoder.py @@ -65,6 +65,9 @@ class LongformerEncoder(tf.keras.layers.Layer): Args: vocab_size: The size of the token vocabulary. + attention_window: list of ints representing the window size for each layer. + global_attention_size: the size of global attention used for each token. + pad_token_id: the token id for the pad token hidden_size: The size of the transformer hidden layers. num_layers: The number of transformer layers. num_attention_heads: The number of attention heads for each transformer. The @@ -120,23 +123,10 @@ class LongformerEncoder(tf.keras.layers.Layer): embedding_layer: Optional[tf.keras.layers.Layer] = None, norm_first: bool = False, **kwargs): - # Pops kwargs that are used in V1 implementation. - if 'dict_outputs' in kwargs: - kwargs.pop('dict_outputs') - if 'return_all_encoder_outputs' in kwargs: - kwargs.pop('return_all_encoder_outputs') - if 'intermediate_size' in kwargs: - inner_dim = kwargs.pop('intermediate_size') - if 'activation' in kwargs: - inner_activation = kwargs.pop('activation') - if 'dropout_rate' in kwargs: - output_dropout = kwargs.pop('dropout_rate') - if 'attention_dropout_rate' in kwargs: - attention_dropout = kwargs.pop('attention_dropout_rate') super().__init__(**kwargs) - # Longformer + # Longformer args self._attention_window = attention_window - self.global_attention_size = global_attention_size + self._global_attention_size = global_attention_size self._pad_token_id = pad_token_id activation = tf.keras.activations.get(inner_activation) @@ -227,6 +217,7 @@ class LongformerEncoder(tf.keras.layers.Layer): 'norm_first': norm_first, # Longformer 'attention_window': attention_window, + 'global_attention_size': global_attention_size, 'pad_token_id': pad_token_id, } self.inputs = dict( @@ -273,15 +264,16 @@ class LongformerEncoder(tf.keras.layers.Layer): batch_size, seq_len = shape_list(mask) # create masks with fixed len global_attention_size - mask = tf.transpose(tf.concat(values=[tf.ones((self.global_attention_size, batch_size), tf.int32) * 2, - tf.transpose(mask)[self.global_attention_size:]], axis=0)) + mask = tf.transpose(tf.concat(values=[tf.ones((self._global_attention_size, batch_size), tf.int32) * 2, + tf.transpose(mask)[self._global_attention_size:]], axis=0)) is_index_masked = tf.math.less(mask, 1) is_index_global_attn = tf.transpose(tf.concat(values=[ - tf.ones((self.global_attention_size, batch_size), tf.bool), tf.zeros((seq_len - self.global_attention_size, batch_size), tf.bool) + tf.ones((self._global_attention_size, batch_size), tf.bool), tf.zeros((seq_len - self._global_attention_size, + batch_size), tf.bool) ], axis=0)) - is_global_attn = self.global_attention_size > 0 + is_global_attn = self._global_attention_size > 0 # Longformer attention_mask = mask @@ -347,11 +339,11 @@ class LongformerEncoder(tf.keras.layers.Layer): def _pad_to_window_size( self, - word_ids, # input_ids - mask, # attention_mask - type_ids, # token_type_ids - word_embeddings, # inputs_embeds - pad_token_id, # pad_token_id + word_ids, + mask, + type_ids, + word_embeddings, + 
pad_token_id, ): """A helper function to pad tokens and mask to work with implementation of Longformer selfattention.""" # padding @@ -361,8 +353,7 @@ class LongformerEncoder(tf.keras.layers.Layer): assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}" - # input_shape = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds) - input_shape = word_ids.shape if word_ids is not None else word_embeddings.shape + input_shape = shape_list(word_ids) if word_ids is not None else shape_list(word_embeddings) batch_size, seq_len = input_shape[:2] if seq_len is not None: -- GitLab From dc588495bd69a862bc2e6dd4340b646809999630 Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Fri, 25 Feb 2022 15:57:18 -0800 Subject: [PATCH 05/54] use tf-utils.get_shape_list --- .../longformer/longformer_attention.py | 109 ++++++++---------- .../longformer/longformer_attention_test.py | 11 +- .../projects/longformer/longformer_encoder.py | 23 +--- 3 files changed, 54 insertions(+), 89 deletions(-) diff --git a/official/projects/longformer/longformer_attention.py b/official/projects/longformer/longformer_attention.py index 43e2ec0fc..1c32e6f51 100644 --- a/official/projects/longformer/longformer_attention.py +++ b/official/projects/longformer/longformer_attention.py @@ -34,28 +34,9 @@ from keras.layers import einsum_dense from keras.utils import tf_utils from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export +from official.modeling.tf_utils import get_shape_list from typing import Dict, List, Optional, Union -def shape_list(tensor: tf.Tensor) -> List[int]: - """ - Deal with dynamic shape in tensorflow cleanly. - - Args: - tensor (:obj:`tf.Tensor`): The tensor we want the shape of. - - Returns: - :obj:`List[int]`: The shape of the tensor as a list. - """ - dynamic = tf.shape(tensor) - - if tensor.shape == tf.TensorShape(None): - return dynamic - - static = tensor.shape.as_list() - - return [dynamic[i] if s is None else s for i, s in enumerate(static)] - - _CHR_IDX = string.ascii_lowercase def _build_attention_equation(rank, attn_axes): @@ -292,7 +273,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): # XLA performance, but may introduce slight numeric differences in # the Transformer attention head. 
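
This patch replaces the local `shape_list` helper (removed at the top of this file) with `get_shape_list` from `official.modeling.tf_utils`. Both are expected to return static dimensions where they are known and dynamic `tf.shape` entries where they are not; the sketch below reuses the removed helper's logic to show that behaviour, since this mixed static/dynamic return value is what the call sites here depend on.

```python
import tensorflow as tf


def shape_list(tensor):
  # Same logic as the helper removed above: static dims where known,
  # dynamic tf.shape() entries where the static dim is None.
  dynamic = tf.shape(tensor)
  if tensor.shape == tf.TensorShape(None):
    return dynamic
  static = tensor.shape.as_list()
  return [dynamic[i] if s is None else s for i, s in enumerate(static)]


@tf.function(input_signature=[tf.TensorSpec([None, 16, 8], tf.float32)])
def flatten(x):
  batch_size, seq_len, hidden = shape_list(x)
  # batch_size is a scalar Tensor (unknown when tracing); seq_len and
  # hidden are plain Python ints (16 and 8).
  return tf.reshape(x, (batch_size, seq_len * hidden))


print(flatten(tf.zeros((2, 16, 8))).shape)  # (2, 128)
```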
query = tf.multiply(query, 1.0 / math.sqrt(float(self._key_dim))) # (B, T, N, key_dim) - batch_size, seq_len, num_heads, head_dim = shape_list(query) + batch_size, seq_len, num_heads, head_dim = get_shape_list(query) # attn_probs = (batch_size, seq_len, num_heads, window*2+1) attn_scores = self._sliding_chunks_query_key_matmul( @@ -301,7 +282,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): # diagonal mask with zeros everywhere and -inf inplace of padding diagonal_mask = self._sliding_chunks_query_key_matmul( - tf.ones(shape_list(attention_mask)), + tf.ones(get_shape_list(attention_mask)), attention_mask, self._one_sided_attn_window_size, ) @@ -311,9 +292,9 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if tf.executing_eagerly(): tf.debugging.assert_equal( - shape_list(attn_scores), + get_shape_list(attn_scores), [batch_size, seq_len, self._num_heads, self._one_sided_attn_window_size * 2 + 1], - message=f"attn_probs should be of size ({batch_size}, {seq_len}, {num_heads}, {self._one_sided_attn_window_size * 2 + 1}), but is of size {shape_list(attn_scores)}", + message=f"attn_probs should be of size ({batch_size}, {seq_len}, {num_heads}, {self._one_sided_attn_window_size * 2 + 1}), but is of size {get_shape_list(attn_scores)}", ) # compute global attn indices required through out forward fn @@ -356,7 +337,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): attn_probs = tf.where( masked_index, - tf.zeros(shape_list(masked_index), dtype=attn_probs.dtype), + tf.zeros(get_shape_list(masked_index), dtype=attn_probs.dtype), attn_probs, ) @@ -364,9 +345,9 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if layer_head_mask is not None: if tf.executing_eagerly(): tf.debugging.assert_equal( - shape_list(layer_head_mask), + get_shape_list(layer_head_mask), [self._num_heads], - message=f"Head mask for a single layer should be of size {(self._num_heads)}, but is {shape_list(layer_head_mask)}", + message=f"Head mask for a single layer should be of size {(self._num_heads)}, but is {get_shape_list(layer_head_mask)}", ) attn_probs = tf.reshape(layer_head_mask, (1, 1, -1, 1)) * attn_probs @@ -391,7 +372,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if tf.executing_eagerly(): tf.debugging.assert_equal( - shape_list(attn_output), + get_shape_list(attn_output), [batch_size, seq_len, self._num_heads, head_dim], message="Unexpected size", ) @@ -432,7 +413,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): attn_probs = tf.where( masked_global_attn_index, - tf.zeros(shape_list(masked_global_attn_index), dtype=attn_probs.dtype), + tf.zeros(get_shape_list(masked_global_attn_index), dtype=attn_probs.dtype), attn_probs, ) @@ -455,7 +436,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an overlap of size window_overlap """ - batch_size, seq_len, num_heads, head_dim = shape_list(query) + batch_size, seq_len, num_heads, head_dim = get_shape_list(query) if tf.executing_eagerly(): tf.debugging.assert_equal( @@ -464,9 +445,9 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): message=f"Sequence length should be multiple of {window_overlap * 2}. 
Given {seq_len}", ) tf.debugging.assert_equal( - shape_list(query), - shape_list(key), - message=f"Shape of query and key should be equal, but got query: {shape_list(query)} and key: {shape_list(key)}", + get_shape_list(query), + get_shape_list(key), + message=f"Shape of query and key should be equal, but got query: {get_shape_list(query)} and key: {get_shape_list(key)}", ) chunks_count = seq_len // window_overlap - 1 @@ -574,7 +555,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): # pad to full matrix padding = tf.convert_to_tensor( - [[0, shape_list(input_tensor)[1] - window_overlap], [0, shape_list(input_tensor)[3] - window_overlap - 1]] + [[0, get_shape_list(input_tensor)[1] - window_overlap], [0, get_shape_list(input_tensor)[3] - window_overlap - 1]] ) # create lower mask @@ -584,7 +565,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): mask_2d = mask_2d + tf.reverse(mask_2d, axis=[0, 1]) # broadcast to full matrix - mask_4d = tf.tile(mask_2d[None, :, None, :], (shape_list(input_tensor)[0], 1, 1, 1)) + mask_4d = tf.tile(mask_2d[None, :, None, :], (get_shape_list(input_tensor)[0], 1, 1, 1)) # inf tensor used for masking inf_tensor = -float("inf") * tf.ones_like(input_tensor) @@ -600,7 +581,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): same shape as `attn_probs` """ - batch_size, seq_len, num_heads, head_dim = shape_list(value) + batch_size, seq_len, num_heads, head_dim = get_shape_list(value) if tf.executing_eagerly(): tf.debugging.assert_equal( @@ -609,12 +590,12 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): message="Seq_len has to be multiple of 2 * window_overlap", ) tf.debugging.assert_equal( - shape_list(attn_probs)[:3], - shape_list(value)[:3], + get_shape_list(attn_probs)[:3], + get_shape_list(value)[:3], message="value and attn_probs must have same dims (except head_dim)", ) tf.debugging.assert_equal( - shape_list(attn_probs)[3], + get_shape_list(attn_probs)[3], 2 * window_overlap + 1, message="attn_probs last dim has to be 2 * window_overlap + 1", ) @@ -644,7 +625,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): # chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap frame_size = 3 * window_overlap * head_dim - frame_hop_size = (shape_list(padded_value)[1] * head_dim - frame_size) // chunks_count + frame_hop_size = (get_shape_list(padded_value)[1] * head_dim - frame_size) // chunks_count chunked_value = tf.signal.frame( tf.reshape(padded_value, (batch_size * num_heads, -1)), frame_size, @@ -657,7 +638,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if tf.executing_eagerly(): tf.debugging.assert_equal( - shape_list(chunked_value), + get_shape_list(chunked_value), [batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim], message="Chunked value has the wrong shape", ) @@ -677,7 +658,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): hidden_states_padded = tf.pad( hidden_states_padded, paddings ) # padding value is not important because it will be overwritten - batch_size, chunk_size, seq_length, hidden_dim = shape_list(hidden_states_padded) + batch_size, chunk_size, seq_length, hidden_dim = get_shape_list(hidden_states_padded) hidden_states_padded = tf.reshape(hidden_states_padded, (batch_size, chunk_size, hidden_dim, seq_length)) return hidden_states_padded @@ -700,7 +681,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 
0.0000 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ] """ - total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states) + total_num_heads, num_chunks, window_overlap, hidden_dim = get_shape_list(chunked_hidden_states) paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]]) chunked_hidden_states = tf.pad( chunked_hidden_states, paddings @@ -722,7 +703,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): @staticmethod def _chunk(hidden_states, window_overlap): """convert into overlapping chunks. Chunk size = 2w, overlap size = w""" - batch_size, seq_length, hidden_dim = shape_list(hidden_states) + batch_size, seq_length, hidden_dim = get_shape_list(hidden_states) num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1 # define frame size and frame stride (similar to convolution) @@ -735,9 +716,9 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if tf.executing_eagerly(): tf.debugging.assert_equal( - shape_list(chunked_hidden_states), + get_shape_list(chunked_hidden_states), [batch_size, num_output_chunks, frame_size], - message=f"Make sure chunking is correctly applied. `Chunked hidden states should have output dimension {[batch_size, frame_size, num_output_chunks]}, but got {shape_list(chunked_hidden_states)}.", + message=f"Make sure chunking is correctly applied. `Chunked hidden states should have output dimension {[batch_size, frame_size, num_output_chunks]}, but got {get_shape_list(chunked_hidden_states)}.", ) chunked_hidden_states = tf.reshape( @@ -752,7 +733,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): """compute global attn indices required throughout forward pass""" # All global attention size are fixed through global_attention_size - batch_size, seq_len = shape_list(is_index_global_attn) + batch_size, seq_len = get_shape_list(is_index_global_attn) max_num_global_attn_indices = global_attention_size @@ -787,7 +768,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): is_local_index_global_attn_nonzero, is_local_index_no_global_attn_nonzero, ): - batch_size = shape_list(key_vectors)[0] + batch_size = get_shape_list(key_vectors)[0] # select global key vectors global_key_vectors = tf.gather_nd(key_vectors, is_index_global_attn_nonzero) @@ -809,8 +790,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): # (batch_size, max_num_global_attn_indices, seq_len, num_heads) attn_probs_from_global_key_trans = tf.transpose(attn_probs_from_global_key, (0, 3, 1, 2)) - mask_shape = (shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple( - shape_list(attn_probs_from_global_key_trans)[-2:] + mask_shape = (get_shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple( + get_shape_list(attn_probs_from_global_key_trans)[-2:] ) mask = tf.ones(mask_shape) * -10000.0 mask = tf.cast(mask, dtype=attn_probs_from_global_key_trans.dtype) @@ -838,11 +819,11 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): is_index_global_attn_nonzero, is_local_index_global_attn_nonzero, ): - batch_size = shape_list(attn_probs)[0] + batch_size = get_shape_list(attn_probs)[0] # cut local attn probs to global only attn_probs_only_global = attn_probs[:, :, :, :max_num_global_attn_indices] - # attn_probs_only_global = tf.slice(attn_probs, [0, 0, 0, 0], shape_list(attn_probs)[: -1] + [max_num_global_attn_indices]) + # attn_probs_only_global = tf.slice(attn_probs, [0, 0, 0, 0], get_shape_list(attn_probs)[: -1] + [max_num_global_attn_indices]) # 
select global value vectors global_value_vectors = tf.gather_nd(value_vectors, is_index_global_attn_nonzero) @@ -863,7 +844,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): attn_output_only_global = tf.einsum("blhs,bshd->blhd", attn_probs_only_global, value_vectors_only_global) # reshape attn probs attn_probs_without_global = attn_probs[:, :, :, max_num_global_attn_indices:] - # attn_probs_without_global = tf.slice(attn_probs, [0, 0, 0, max_num_global_attn_indices], shape_list(attn_probs)[: -1] + [shape_list(attn_probs)[-1] - max_num_global_attn_indices]) + # attn_probs_without_global = tf.slice(attn_probs, [0, 0, 0, max_num_global_attn_indices], get_shape_list(attn_probs)[: -1] + [get_shape_list(attn_probs)[-1] - max_num_global_attn_indices]) # compute attn output with global attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value( @@ -884,7 +865,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): is_index_masked, training, ): - batch_size, seq_len = shape_list(hidden_states)[:2] + batch_size, seq_len = get_shape_list(hidden_states)[:2] # prepare global hidden states global_attn_hidden_states = tf.gather_nd(hidden_states, is_index_global_attn_nonzero) @@ -912,9 +893,9 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if tf.executing_eagerly(): tf.debugging.assert_equal( - shape_list(global_attn_scores), + get_shape_list(global_attn_scores), [batch_size * self._num_heads, max_num_global_attn_indices, seq_len], - message=f"global_attn_scores have the wrong size. Size should be {(batch_size * self._num_heads, max_num_global_attn_indices, seq_len)}, but is {shape_list(global_attn_scores)}.", + message=f"global_attn_scores have the wrong size. Size should be {(batch_size * self._num_heads, max_num_global_attn_indices, seq_len)}, but is {get_shape_list(global_attn_scores)}.", ) global_attn_scores = tf.reshape( @@ -922,8 +903,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): (batch_size, self._num_heads, max_num_global_attn_indices, seq_len), ) global_attn_scores_trans = tf.transpose(global_attn_scores, (0, 2, 1, 3)) - mask_shape = (shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple( - shape_list(global_attn_scores_trans)[-2:] + mask_shape = (get_shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple( + get_shape_list(global_attn_scores_trans)[-2:] ) global_attn_mask = tf.ones(mask_shape) * -10000.0 global_attn_mask = tf.cast(global_attn_mask, dtype=global_attn_scores_trans.dtype) @@ -937,7 +918,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): global_attn_scores = tf.transpose(global_attn_scores_trans, (0, 2, 1, 3)) # mask global attn scores - attn_mask = tf.tile(is_index_masked[:, None, None, :], (1, shape_list(global_attn_scores)[1], 1, 1)) + attn_mask = tf.tile(is_index_masked[:, None, None, :], (1, get_shape_list(global_attn_scores)[1], 1, 1)) global_attn_scores = tf.where(attn_mask, -10000.0, global_attn_scores) global_attn_scores = tf.reshape( global_attn_scores, @@ -951,9 +932,9 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if layer_head_mask is not None: if tf.executing_eagerly(): tf.debugging.assert_equal( - shape_list(layer_head_mask), + get_shape_list(layer_head_mask), [self._num_heads], - message=f"Head mask for a single layer should be of size {(self._num_heads)}, but is {shape_list(layer_head_mask)}", + message=f"Head mask for a single layer should be of size {(self._num_heads)}, but is {get_shape_list(layer_head_mask)}", ) 
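
Because this port fixes global attention to the first `global_attention_size` tokens of every example, the index tensors threaded through these global-attention helpers reduce to a simple prefix mask; a simplified standalone sketch follows (the real code builds the equivalent tensors with transposes and `tf.gather_nd`).

```python
import tensorflow as tf

batch_size, seq_len, global_attention_size = 2, 8, 2

# Prefix mask: True for the first global_attention_size positions.
is_index_global_attn = tf.concat(
    [tf.ones((batch_size, global_attention_size), tf.bool),
     tf.zeros((batch_size, seq_len - global_attention_size), tf.bool)],
    axis=1)

# (batch, position) pairs of global tokens, analogous to
# is_index_global_attn_nonzero in the code above.
print(tf.where(is_index_global_attn).numpy().tolist())
# [[0, 0], [0, 1], [1, 0], [1, 1]]
```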
global_attn_probs_float = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( global_attn_probs_float, (batch_size, self._num_heads, max_num_global_attn_indices, seq_len) @@ -970,9 +951,9 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if tf.executing_eagerly(): tf.debugging.assert_equal( - shape_list(global_attn_output), + get_shape_list(global_attn_output), [batch_size * self._num_heads, max_num_global_attn_indices, self._key_dim], - message=f"global_attn_output tensor has the wrong size. Size should be {(batch_size * self._num_heads, max_num_global_attn_indices, self._key_dim)}, but is {shape_list(global_attn_output)}.", + message=f"global_attn_output tensor has the wrong size. Size should be {(batch_size * self._num_heads, max_num_global_attn_indices, self._key_dim)}, but is {get_shape_list(global_attn_output)}.", ) global_attn_output = tf.reshape( @@ -987,7 +968,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): ) nonzero_global_attn_output = tf.reshape( nonzero_global_attn_output, - (shape_list(is_local_index_global_attn_nonzero)[0], -1), + (get_shape_list(is_local_index_global_attn_nonzero)[0], -1), ) # overwrite values with global attention diff --git a/official/projects/longformer/longformer_attention_test.py b/official/projects/longformer/longformer_attention_test.py index 992437b0e..656d8eb6d 100644 --- a/official/projects/longformer/longformer_attention_test.py +++ b/official/projects/longformer/longformer_attention_test.py @@ -20,6 +20,7 @@ import tensorflow as tf from tensorflow.python.distribute import combinations from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import from official.projects.longformer import longformer_attention +from official.modeling.tf_utils import get_shape_list def _create_mock_attention_data( @@ -117,13 +118,13 @@ class LongformerAttentionTest(keras_parameterized.TestCase): hidden_states = self._get_hidden_states() hidden_states = tf.reshape(hidden_states, (1, 8, 4)) # set seq length = 8, hidden dim = 4 chunked_hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) - window_overlap_size = longformer_attention.shape_list(chunked_hidden_states)[2] + window_overlap_size = get_shape_list(chunked_hidden_states)[2] self.assertTrue(window_overlap_size == 4) padded_hidden_states = longformer_attention.LongformerAttention._pad_and_diagonalize(chunked_hidden_states) self.assertTrue( - longformer_attention.shape_list(padded_hidden_states)[-1] == longformer_attention.shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1 + get_shape_list(padded_hidden_states)[-1] == get_shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1 ) # first row => [0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000] @@ -138,14 +139,14 @@ class LongformerAttentionTest(keras_parameterized.TestCase): def test_pad_and_transpose_last_two_dims(self): hidden_states = self._get_hidden_states() - self.assertTrue(longformer_attention.shape_list(hidden_states), [1, 8, 4]) + self.assertTrue(get_shape_list(hidden_states), [1, 8, 4]) # pad along seq length dim paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]], dtype=tf.dtypes.int32) hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) padded_hidden_states = longformer_attention.LongformerAttention._pad_and_transpose_last_two_dims(hidden_states, paddings) - self.assertTrue(longformer_attention.shape_list(padded_hidden_states) == [1, 1, 8, 5]) + 
self.assertTrue(get_shape_list(padded_hidden_states) == [1, 1, 8, 5]) expected_added_dim = tf.zeros((5,), dtype=tf.dtypes.float32) tf.debugging.assert_near(expected_added_dim, padded_hidden_states[0, 0, -1, :], rtol=1e-6) @@ -184,7 +185,7 @@ class LongformerAttentionTest(keras_parameterized.TestCase): expected_slice_along_seq_length = tf.convert_to_tensor([0.4983, -0.7584, -1.6944], dtype=tf.dtypes.float32) expected_slice_along_chunk = tf.convert_to_tensor([0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.dtypes.float32) - self.assertTrue(longformer_attention.shape_list(chunked_hidden_states) == [1, 3, 4, 4]) + self.assertTrue(get_shape_list(chunked_hidden_states) == [1, 3, 4, 4]) tf.debugging.assert_near(chunked_hidden_states[0, :, 0, 0], expected_slice_along_seq_length, rtol=1e-3) tf.debugging.assert_near(chunked_hidden_states[0, 0, :, 0], expected_slice_along_chunk, rtol=1e-3) diff --git a/official/projects/longformer/longformer_encoder.py b/official/projects/longformer/longformer_encoder.py index 5e46596e0..4764a0761 100644 --- a/official/projects/longformer/longformer_encoder.py +++ b/official/projects/longformer/longformer_encoder.py @@ -24,25 +24,8 @@ import tensorflow as tf from official.nlp.modeling import layers from official.projects.longformer.longformer_encoder_block import LongformerEncoderBlock +from official.modeling.tf_utils import get_shape_list -def shape_list(tensor: tf.Tensor) -> List[int]: - """ - Deal with dynamic shape in tensorflow cleanly. - - Args: - tensor (:obj:`tf.Tensor`): The tensor we want the shape of. - - Returns: - :obj:`List[int]`: The shape of the tensor as a list. - """ - dynamic = tf.shape(tensor) - - if tensor.shape == tf.TensorShape(None): - return dynamic - - static = tensor.shape.as_list() - - return [dynamic[i] if s is None else s for i, s in enumerate(static)] _Initializer = Union[str, tf.keras.initializers.Initializer] @@ -262,7 +245,7 @@ class LongformerEncoder(tf.keras.layers.Layer): if self._embedding_projection is not None: embeddings = self._embedding_projection(embeddings) - batch_size, seq_len = shape_list(mask) + batch_size, seq_len = get_shape_list(mask) # create masks with fixed len global_attention_size mask = tf.transpose(tf.concat(values=[tf.ones((self._global_attention_size, batch_size), tf.int32) * 2, tf.transpose(mask)[self._global_attention_size:]], axis=0)) @@ -353,7 +336,7 @@ class LongformerEncoder(tf.keras.layers.Layer): assert attention_window % 2 == 0, f"`attention_window` should be an even value. 
Given {attention_window}" - input_shape = shape_list(word_ids) if word_ids is not None else shape_list(word_embeddings) + input_shape = get_shape_list(word_ids) if word_ids is not None else get_shape_list(word_embeddings) batch_size, seq_len = input_shape[:2] if seq_len is not None: -- GitLab From f296795268d1bf34af0d7ebbf2654dd65c653ba4 Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Fri, 25 Feb 2022 17:19:00 -0800 Subject: [PATCH 06/54] fix experiment name --- official/projects/longformer/longformer_experiments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official/projects/longformer/longformer_experiments.py b/official/projects/longformer/longformer_experiments.py index 09e5cc010..cc85baa0a 100644 --- a/official/projects/longformer/longformer_experiments.py +++ b/official/projects/longformer/longformer_experiments.py @@ -25,7 +25,7 @@ from official.nlp.tasks import masked_lm from official.nlp.data import sentence_prediction_dataloader from official.nlp.configs import bert from official.nlp.configs import encoders -import official.projects.longformer.sentence_prediction_with_load as sentence_prediction +import official.projects.longformer.sentence_prediction_with_checkpoint_convert as sentence_prediction from official.projects.longformer.longformer import LongformerEncoderConfig -- GitLab From fd1528b1fe63c5bf7e09185ead0cf4279b8e14ae Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Fri, 25 Feb 2022 17:25:48 -0800 Subject: [PATCH 07/54] add back v1 arguments --- official/projects/longformer/longformer_encoder.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/official/projects/longformer/longformer_encoder.py b/official/projects/longformer/longformer_encoder.py index 4764a0761..e403495d3 100644 --- a/official/projects/longformer/longformer_encoder.py +++ b/official/projects/longformer/longformer_encoder.py @@ -106,6 +106,19 @@ class LongformerEncoder(tf.keras.layers.Layer): embedding_layer: Optional[tf.keras.layers.Layer] = None, norm_first: bool = False, **kwargs): + # Pops kwargs that are used in V1 implementation. 
+ if 'dict_outputs' in kwargs: + kwargs.pop('dict_outputs') + if 'return_all_encoder_outputs' in kwargs: + kwargs.pop('return_all_encoder_outputs') + if 'intermediate_size' in kwargs: + inner_dim = kwargs.pop('intermediate_size') + if 'activation' in kwargs: + inner_activation = kwargs.pop('activation') + if 'dropout_rate' in kwargs: + output_dropout = kwargs.pop('dropout_rate') + if 'attention_dropout_rate' in kwargs: + attention_dropout = kwargs.pop('attention_dropout_rate') super().__init__(**kwargs) # Longformer args self._attention_window = attention_window -- GitLab From 5ad1d93fce323b88d50394e8b7a07f7bb53c390d Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Fri, 25 Feb 2022 20:16:30 -0800 Subject: [PATCH 08/54] fix bug --- .../sentence_prediction_with_checkpoint_convert.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py b/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py index fdecee0b9..8f91584bf 100644 --- a/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py +++ b/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py @@ -227,11 +227,10 @@ class SentencePredictionTask(base_task.Task): """Load a pretrained checkpoint (if exists) and then train from iter 0.""" ckpt_dir_or_file = self.task_config.init_checkpoint if self.task_config.initial_parameters_from_pk: - num_layers = self.task_config.model.encoder.num_layers - num_attention_heads = self.task_config.model.encoder.num_attention_heads - hidden_size = self.task_config.model.encoder.hidden_size - inner_dim = self.task_config.model.encoder.inner_dim - head_size = hidden_size / num_attention_heads + num_layers = self.task_config.model.encoder.any.num_layers + num_attention_heads = self.task_config.model.encoder.any.num_attention_heads + hidden_size = self.task_config.model.encoder.any.hidden_size + head_size = hidden_size // num_attention_heads assert head_size * num_attention_heads == hidden_size encoder = model.checkpoint_items['encoder'] -- GitLab From f2adc5ef988fe54706d088cc0cd2865bbe976b77 Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Fri, 25 Feb 2022 22:38:43 -0800 Subject: [PATCH 09/54] fix argument passing --- official/projects/longformer/longformer.py | 8 ++++---- official/projects/longformer/longformer_encoder.py | 13 ------------- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/official/projects/longformer/longformer.py b/official/projects/longformer/longformer.py index c657da29e..5cccbe37c 100644 --- a/official/projects/longformer/longformer.py +++ b/official/projects/longformer/longformer.py @@ -54,10 +54,10 @@ def get_encoder(encoder_cfg: LongformerEncoderConfig): hidden_size=encoder_cfg.hidden_size, num_layers=encoder_cfg.num_layers, num_attention_heads=encoder_cfg.num_attention_heads, - intermediate_size=encoder_cfg.intermediate_size, - activation=tf_utils.get_activation(encoder_cfg.hidden_activation), - dropout_rate=encoder_cfg.dropout_rate, - attention_dropout_rate=encoder_cfg.attention_dropout_rate, + inner_dim=encoder_cfg.intermediate_size, + inner_activation=tf_utils.get_activation(encoder_cfg.hidden_activation), + output_dropout=encoder_cfg.dropout_rate, + attention_dropout=encoder_cfg.attention_dropout_rate, max_sequence_length=encoder_cfg.max_position_embeddings, type_vocab_size=encoder_cfg.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( diff --git a/official/projects/longformer/longformer_encoder.py 
b/official/projects/longformer/longformer_encoder.py
index e403495d3..4764a0761 100644
--- a/official/projects/longformer/longformer_encoder.py
+++ b/official/projects/longformer/longformer_encoder.py
@@ -106,19 +106,6 @@ class LongformerEncoder(tf.keras.layers.Layer):
       embedding_layer: Optional[tf.keras.layers.Layer] = None,
       norm_first: bool = False,
       **kwargs):
-    # Pops kwargs that are used in V1 implementation.
-    if 'dict_outputs' in kwargs:
-      kwargs.pop('dict_outputs')
-    if 'return_all_encoder_outputs' in kwargs:
-      kwargs.pop('return_all_encoder_outputs')
-    if 'intermediate_size' in kwargs:
-      inner_dim = kwargs.pop('intermediate_size')
-    if 'activation' in kwargs:
-      inner_activation = kwargs.pop('activation')
-    if 'dropout_rate' in kwargs:
-      output_dropout = kwargs.pop('dropout_rate')
-    if 'attention_dropout_rate' in kwargs:
-      attention_dropout = kwargs.pop('attention_dropout_rate')
     super().__init__(**kwargs)
     # Longformer args
     self._attention_window = attention_window
-- GitLab
From d75ec8ba580bc01834637ffa25c32ee248e00e11 Mon Sep 17 00:00:00 2001
From: Zihan Wang
Date: Tue, 1 Mar 2022 17:08:03 -0800
Subject: [PATCH 10/54] make init checkpoint a separate function.

---
 official/projects/longformer/README.md | 17 +-
 .../longformer/longformer_experiments.py | 3 +-
 ...ence_prediction_with_checkpoint_convert.py | 388 ------------------
 ...ert_pretrained_pytorch_checkpoint_to_tf.py | 176 ++++++++
 ...ters_from_pretrained_pytorch_checkpoint.py | 9 -
 5 files changed, 186 insertions(+), 407 deletions(-)
 delete mode 100644 official/projects/longformer/sentence_prediction_with_checkpoint_convert.py
 create mode 100644 official/projects/longformer/utils/convert_pretrained_pytorch_checkpoint_to_tf.py
 delete mode 100644 official/projects/longformer/utils/get_parameters_from_pretrained_pytorch_checkpoint.py

diff --git a/official/projects/longformer/README.md b/official/projects/longformer/README.md
index 2ca4112dd..73241a6ee 100644
--- a/official/projects/longformer/README.md
+++ b/official/projects/longformer/README.md
@@ -9,21 +9,19 @@ This setting allows running on TPUs where tensor sizes have to be determined.
 `_get_global_attn_indices` in `longformer_attention.py` contains how the new global attention indices are specified.
 Changed all `tf.cond` to if confiditions, since global attention is specified in the start now.
-`sentence_prediction_with_checkpoint_convert.py` now contains a `initial_parameters_from_pk` parameter that
-specified a pk file containing all pre-trained weights from a pytorch longformer, which can be loaded into the
-tf model.
-The pk file can be generated from `utils/get_parameters_from_pretrained_pytorch_checkpoint.py`.
-There is also a `longformer_tokenizer_to_tfrecord.py` that transformers pytorch longformer tokenized data to tf_records.
+To load weights from a pre-trained huggingface longformer, run `utils/convert_pretrained_pytorch_checkpoint_to_tf.py`
+to create a checkpoint.
+There is also a `utils/longformer_tokenizer_to_tfrecord.py` that transforms pytorch longformer tokenized data to tf_records.
 ## Steps to Fine-tune on MNLI
 #### Prepare the pre-trained checkpoint
 Option 1. Use our saved checkpoint of `allenai/longformer-base-4096` stored in cloud storage
 ```bash
-gsutil cp gs://model-garden-ucsd-zihan/allenai.pk allenai_longformer-base-4096.pk
+gsutil cp -r gs://model-garden-ucsd-zihan/longformer-4096 .
 ```
 Option 2.
Create it directly ```bash -python3 utils/get_parameters_from_pretrained_pytorch_checkpoint.py +python3 utils/convert_pretrained_pytorch_checkpoint_to_tf.py ``` #### [Optional] Prepare the input file ```bash @@ -33,13 +31,14 @@ python3 longformer_tokenizer_to_tfrecord.py Here, we use the training data of MNLI that were uploaded to the cloud storage, you can replace it with the input files you generated. ```bash TRAIN_DATA=task.train_data.input_path=gs://model-garden-ucsd-zihan/longformer_allenai_mnli_train.tf_record,task.validation_data.input_path=gs://model-garden-ucsd-zihan/longformer_allenai_mnli_eval.tf_record +INIT_CHECKPOINT=longformer-4096/longformer PYTHONPATH=/path/to/model/garden \ python3 train.py \ --experiment=longformer/glue \ --config_file=experiments/glue_mnli_allenai.yaml \ - --params_override="${TRAIN_DATA},runtime.distribution_strategy=tpu,task.initial_parameters_from_pk=allenai_longformer-base-4096.pk" \ + --params_override="${TRAIN_DATA},runtime.distribution_strategy=tpu,task.init_checkpoint=${INIT_CHECKPOINT}" \ --tpu=local \ --model_dir=/path/to/outputdir \ --mode=train_and_eval ``` -This should take an hour or two to run, and give a performance of ~86. \ No newline at end of file +This should take ~ 3 hours to run, and give a performance of ~86. \ No newline at end of file diff --git a/official/projects/longformer/longformer_experiments.py b/official/projects/longformer/longformer_experiments.py index cc85baa0a..49307acaf 100644 --- a/official/projects/longformer/longformer_experiments.py +++ b/official/projects/longformer/longformer_experiments.py @@ -22,10 +22,11 @@ from official.core import exp_factory from official.modeling import optimization from official.nlp.data import pretrain_dataloader from official.nlp.tasks import masked_lm +from official.nlp.tasks import sentence_prediction from official.nlp.data import sentence_prediction_dataloader from official.nlp.configs import bert from official.nlp.configs import encoders -import official.projects.longformer.sentence_prediction_with_checkpoint_convert as sentence_prediction +# import official.projects.longformer.sentence_prediction_with_checkpoint_convert as sentence_prediction from official.projects.longformer.longformer import LongformerEncoderConfig diff --git a/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py b/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py deleted file mode 100644 index 8f91584bf..000000000 --- a/official/projects/longformer/sentence_prediction_with_checkpoint_convert.py +++ /dev/null @@ -1,388 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Sentence prediction (classification) task.""" -import dataclasses -from typing import List, Union, Optional - -from absl import logging -import numpy as np -import orbit -from scipy import stats -from sklearn import metrics as sklearn_metrics -import tensorflow as tf - -from official.core import base_task -from official.core import config_definitions as cfg -from official.core import task_factory -from official.modeling import tf_utils -from official.modeling.hyperparams import base_config -from official.nlp.configs import encoders -from official.nlp.data import data_loader_factory -from official.nlp.modeling import models -from official.nlp.tasks import utils - -import pickle - -METRIC_TYPES = frozenset( - ['accuracy', 'matthews_corrcoef', 'pearson_spearman_corr']) - - -@dataclasses.dataclass -class ModelConfig(base_config.Config): - """A classifier/regressor configuration.""" - num_classes: int = 0 - use_encoder_pooler: bool = False - encoder: encoders.EncoderConfig = encoders.EncoderConfig() - - -@dataclasses.dataclass -class SentencePredictionConfig(cfg.TaskConfig): - """The model config.""" - # At most one of `init_checkpoint` and `hub_module_url` can - # be specified. - init_checkpoint: str = '' - init_cls_pooler: bool = False - initial_parameters_from_pk: str = '' - hub_module_url: str = '' - metric_type: str = 'accuracy' - # Defines the concrete model config at instantiation time. - model: ModelConfig = ModelConfig() - train_data: cfg.DataConfig = cfg.DataConfig() - validation_data: cfg.DataConfig = cfg.DataConfig() - - -@task_factory.register_task_cls(SentencePredictionConfig) -class SentencePredictionTask(base_task.Task): - """Task object for sentence_prediction.""" - - def __init__(self, params: cfg.TaskConfig, logging_dir=None, name=None): - super().__init__(params, logging_dir, name=name) - if params.metric_type not in METRIC_TYPES: - raise ValueError('Invalid metric_type: {}'.format(params.metric_type)) - self.metric_type = params.metric_type - if hasattr(params.train_data, 'label_field'): - self.label_field = params.train_data.label_field - else: - self.label_field = 'label_ids' - - def build_model(self): - if self.task_config.hub_module_url and self.task_config.init_checkpoint: - raise ValueError('At most one of `hub_module_url` and ' - '`init_checkpoint` can be specified.') - if self.task_config.hub_module_url: - encoder_network = utils.get_encoder_from_hub( - self.task_config.hub_module_url) - else: - encoder_network = encoders.build_encoder(self.task_config.model.encoder) - encoder_cfg = self.task_config.model.encoder.get() - if self.task_config.model.encoder.type == 'xlnet': - return models.XLNetClassifier( - network=encoder_network, - num_classes=self.task_config.model.num_classes, - initializer=tf.keras.initializers.RandomNormal( - stddev=encoder_cfg.initializer_range)) - else: - return models.BertClassifier( - network=encoder_network, - num_classes=self.task_config.model.num_classes, - initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - use_encoder_pooler=self.task_config.model.use_encoder_pooler) - - def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: - label_ids = labels[self.label_field] - if self.task_config.model.num_classes == 1: - loss = tf.keras.losses.mean_squared_error(label_ids, model_outputs) - else: - loss = tf.keras.losses.sparse_categorical_crossentropy( - label_ids, tf.cast(model_outputs, tf.float32), from_logits=True) - - if aux_losses: - loss += tf.add_n(aux_losses) - return 
tf_utils.safe_mean(loss) - - def build_inputs(self, params, input_context=None): - """Returns tf.data.Dataset for sentence_prediction task.""" - if params.input_path == 'dummy': - - def dummy_data(_): - dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32) - x = dict( - input_word_ids=dummy_ids, - input_mask=dummy_ids, - input_type_ids=dummy_ids) - - if self.task_config.model.num_classes == 1: - y = tf.zeros((1,), dtype=tf.float32) - else: - y = tf.zeros((1, 1), dtype=tf.int32) - x[self.label_field] = y - return x - - dataset = tf.data.Dataset.range(1) - dataset = dataset.repeat() - dataset = dataset.map( - dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) - return dataset - - return data_loader_factory.get_data_loader(params).load(input_context) - - def build_metrics(self, training=None): - del training - if self.task_config.model.num_classes == 1: - metrics = [tf.keras.metrics.MeanSquaredError()] - elif self.task_config.model.num_classes == 2: - metrics = [ - tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy'), - tf.keras.metrics.AUC(name='auc', curve='PR'), - ] - else: - metrics = [ - tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy'), - ] - return metrics - - def process_metrics(self, metrics, labels, model_outputs): - for metric in metrics: - if metric.name == 'auc': - # Convert the logit to probability and extract the probability of True.. - metric.update_state( - labels[self.label_field], - tf.expand_dims(tf.nn.softmax(model_outputs)[:, 1], axis=1)) - if metric.name == 'cls_accuracy': - metric.update_state(labels[self.label_field], model_outputs) - - def process_compiled_metrics(self, compiled_metrics, labels, model_outputs): - compiled_metrics.update_state(labels[self.label_field], model_outputs) - - def validation_step(self, inputs, model: tf.keras.Model, metrics=None): - if self.metric_type == 'accuracy': - return super(SentencePredictionTask, - self).validation_step(inputs, model, metrics) - features, labels = inputs, inputs - outputs = self.inference_step(features, model) - loss = self.build_losses( - labels=labels, model_outputs=outputs, aux_losses=model.losses) - logs = {self.loss: loss} - if self.metric_type == 'matthews_corrcoef': - logs.update({ - 'sentence_prediction': # Ensure one prediction along batch dimension. 
- tf.expand_dims(tf.math.argmax(outputs, axis=1), axis=1), - 'labels': - labels[self.label_field], - }) - if self.metric_type == 'pearson_spearman_corr': - logs.update({ - 'sentence_prediction': outputs, - 'labels': labels[self.label_field], - }) - return logs - - def aggregate_logs(self, state=None, step_outputs=None): - if self.metric_type == 'accuracy': - return None - if state is None: - state = {'sentence_prediction': [], 'labels': []} - state['sentence_prediction'].append( - np.concatenate([v.numpy() for v in step_outputs['sentence_prediction']], - axis=0)) - state['labels'].append( - np.concatenate([v.numpy() for v in step_outputs['labels']], axis=0)) - return state - - def reduce_aggregated_logs(self, aggregated_logs, global_step=None): - if self.metric_type == 'accuracy': - return None - elif self.metric_type == 'matthews_corrcoef': - preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0) - preds = np.reshape(preds, -1) - labels = np.concatenate(aggregated_logs['labels'], axis=0) - labels = np.reshape(labels, -1) - return { - self.metric_type: sklearn_metrics.matthews_corrcoef(preds, labels) - } - elif self.metric_type == 'pearson_spearman_corr': - preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0) - preds = np.reshape(preds, -1) - labels = np.concatenate(aggregated_logs['labels'], axis=0) - labels = np.reshape(labels, -1) - pearson_corr = stats.pearsonr(preds, labels)[0] - spearman_corr = stats.spearmanr(preds, labels)[0] - corr_metric = (pearson_corr + spearman_corr) / 2 - return {self.metric_type: corr_metric} - - def initialize(self, model): - """Load a pretrained checkpoint (if exists) and then train from iter 0.""" - ckpt_dir_or_file = self.task_config.init_checkpoint - if self.task_config.initial_parameters_from_pk: - num_layers = self.task_config.model.encoder.any.num_layers - num_attention_heads = self.task_config.model.encoder.any.num_attention_heads - hidden_size = self.task_config.model.encoder.any.hidden_size - head_size = hidden_size // num_attention_heads - assert head_size * num_attention_heads == hidden_size - - encoder = model.checkpoint_items['encoder'] - allenai_model = pickle.load(open(self.task_config.initial_parameters_from_pk, "rb")) - encoder._embedding_layer.set_weights( - [allenai_model["embeddings.word_embeddings.weight"]] - ) - encoder._embedding_norm_layer.set_weights( - [allenai_model["embeddings.LayerNorm.weight"], - allenai_model["embeddings.LayerNorm.bias"]] - ) - encoder._type_embedding_layer.set_weights( - [np.repeat( - allenai_model["embeddings.token_type_embeddings.weight"], - 2, - axis=0 - )] - ) - encoder._position_embedding_layer.set_weights( - [allenai_model["embeddings.position_embeddings.weight"]] - ) - encoder._pooler_layer.set_weights( - [allenai_model["pooler.dense.weight"], - allenai_model["pooler.dense.bias"]] - ) - for layer_num in range(num_layers): - encoder._transformer_layers[layer_num]._attention_layer._global_key_dense.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), - allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.bias"].reshape((num_attention_heads, head_size))] - ) - encoder._transformer_layers[layer_num]._attention_layer._global_query_dense.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.attention.self.query_global.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), - 
allenai_model[f"encoder.layer.{layer_num}.attention.self.query_global.bias"].reshape((num_attention_heads, head_size))] - ) - encoder._transformer_layers[layer_num]._attention_layer._global_value_dense.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.attention.self.value_global.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), - allenai_model[f"encoder.layer.{layer_num}.attention.self.value_global.bias"].reshape((num_attention_heads, head_size))] - ) - encoder._transformer_layers[layer_num]._attention_layer._key_dense.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.attention.self.key.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), - allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.bias"].reshape((num_attention_heads, head_size))] - ) - encoder._transformer_layers[layer_num]._attention_layer._query_dense.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.attention.self.query.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), - allenai_model[f"encoder.layer.{layer_num}.attention.self.query.bias"].reshape((num_attention_heads, head_size))] - ) - encoder._transformer_layers[layer_num]._attention_layer._value_dense.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.attention.self.value.weight"].T.reshape((hidden_size, num_attention_heads, head_size)), - allenai_model[f"encoder.layer.{layer_num}.attention.self.value.bias"].reshape((num_attention_heads, head_size))] - ) - encoder._transformer_layers[layer_num]._attention_layer._output_dense.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.attention.output.dense.weight"].T, - allenai_model[f"encoder.layer.{layer_num}.attention.output.dense.bias"]] - ) - encoder._transformer_layers[layer_num]._attention_layer_norm.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.attention.output.LayerNorm.weight"], - allenai_model[f"encoder.layer.{layer_num}.attention.output.LayerNorm.bias"]] - ) - encoder._transformer_layers[layer_num]._intermediate_dense.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.weight"].T, - allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.bias"]] - ) - encoder._transformer_layers[layer_num]._output_dense.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.output.dense.weight"].T, - allenai_model[f"encoder.layer.{layer_num}.output.dense.bias"]] - ) - encoder._transformer_layers[layer_num]._output_layer_norm.set_weights( - [allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.weight"], - allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.bias"]] - ) - if not ckpt_dir_or_file: - return - if tf.io.gfile.isdir(ckpt_dir_or_file): - ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) - - pretrain2finetune_mapping = { - 'encoder': model.checkpoint_items['encoder'], - } - if self.task_config.init_cls_pooler: - # This option is valid when use_encoder_pooler is false. 
- pretrain2finetune_mapping[ - 'next_sentence.pooler_dense'] = model.checkpoint_items[ - 'sentence_prediction.pooler_dense'] - ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - logging.info('Finished loading pretrained checkpoint from %s', - ckpt_dir_or_file) - - -def predict(task: SentencePredictionTask, - params: cfg.DataConfig, - model: tf.keras.Model, - params_aug: Optional[cfg.DataConfig] = None, - test_time_aug_wgt: float = 0.3) -> List[Union[int, float]]: - """Predicts on the input data. - - Args: - task: A `SentencePredictionTask` object. - params: A `cfg.DataConfig` object. - model: A keras.Model. - params_aug: A `cfg.DataConfig` object for augmented data. - test_time_aug_wgt: Test time augmentation weight. The prediction score will - use (1. - test_time_aug_wgt) original prediction plus test_time_aug_wgt - augmented prediction. - - Returns: - A list of predictions with length of `num_examples`. For regression task, - each element in the list is the predicted score; for classification task, - each element is the predicted class id. - """ - - def predict_step(inputs): - """Replicated prediction calculation.""" - x = inputs - example_id = x.pop('example_id') - outputs = task.inference_step(x, model) - return dict(example_id=example_id, predictions=outputs) - - def aggregate_fn(state, outputs): - """Concatenates model's outputs.""" - if state is None: - state = [] - - for per_replica_example_id, per_replica_batch_predictions in zip( - outputs['example_id'], outputs['predictions']): - state.extend(zip(per_replica_example_id, per_replica_batch_predictions)) - return state - - dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(), - task.build_inputs, params) - outputs = utils.predict(predict_step, aggregate_fn, dataset) - - # When running on TPU POD, the order of output cannot be maintained, - # so we need to sort by example_id. - outputs = sorted(outputs, key=lambda x: x[0]) - is_regression = task.task_config.model.num_classes == 1 - if params_aug is not None: - dataset_aug = orbit.utils.make_distributed_dataset( - tf.distribute.get_strategy(), task.build_inputs, params_aug) - outputs_aug = utils.predict(predict_step, aggregate_fn, dataset_aug) - outputs_aug = sorted(outputs_aug, key=lambda x: x[0]) - if is_regression: - return [(1. - test_time_aug_wgt) * x[1] + test_time_aug_wgt * y[1] - for x, y in zip(outputs, outputs_aug)] - else: - return [ - tf.argmax( - (1. - test_time_aug_wgt) * x[1] + test_time_aug_wgt * y[1], - axis=-1) for x, y in zip(outputs, outputs_aug) - ] - if is_regression: - return [x[1] for x in outputs] - else: - return [tf.argmax(x[1], axis=-1) for x in outputs] diff --git a/official/projects/longformer/utils/convert_pretrained_pytorch_checkpoint_to_tf.py b/official/projects/longformer/utils/convert_pretrained_pytorch_checkpoint_to_tf.py new file mode 100644 index 000000000..01d448005 --- /dev/null +++ b/official/projects/longformer/utils/convert_pretrained_pytorch_checkpoint_to_tf.py @@ -0,0 +1,176 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Converts pre-trained pytorch checkpoint into a tf encoder checkpoint +""" + +import os + +from absl import app + +import tensorflow as tf +from official.modeling import tf_utils +import transformers +import numpy as np +from official.projects.longformer.longformer_encoder import LongformerEncoder +from official.projects.longformer.longformer import LongformerEncoderConfig + +def _get_pytorch_longformer_model(): + pretrained_lm = "allenai/longformer-base-4096" + + model = transformers.AutoModel.from_pretrained(pretrained_lm) + + return { + n: p.data.numpy() for n, p in model.named_parameters() + } + +def _create_longformer_model(): + encoder_cfg = LongformerEncoderConfig + encoder_cfg.vocab_size = 50265 + encoder_cfg.max_position_embeddings = 4098 + encoder_cfg.attention_window = [2] * encoder_cfg.num_layers + encoder_cfg.global_attention_size = 1 + encoder = LongformerEncoder( + attention_window=encoder_cfg.attention_window, + global_attention_size=encoder_cfg.global_attention_size, + vocab_size=encoder_cfg.vocab_size, + hidden_size=encoder_cfg.hidden_size, + num_layers=encoder_cfg.num_layers, + num_attention_heads=encoder_cfg.num_attention_heads, + inner_dim=encoder_cfg.intermediate_size, + inner_activation=tf_utils.get_activation(encoder_cfg.hidden_activation), + output_dropout=encoder_cfg.dropout_rate, + attention_dropout=encoder_cfg.attention_dropout_rate, + max_sequence_length=encoder_cfg.max_position_embeddings, + type_vocab_size=encoder_cfg.type_vocab_size, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + output_range=encoder_cfg.output_range, + embedding_width=encoder_cfg.embedding_size, + norm_first=encoder_cfg.norm_first + ) + return encoder + + +def convert(encoder, allenai_model): + num_layers = encoder._config["num_layers"] + num_attention_heads = encoder._config["num_attention_heads"] + hidden_size = encoder._config["hidden_size"] + head_size = hidden_size // num_attention_heads + assert head_size * num_attention_heads == hidden_size + encoder._embedding_layer.set_weights( + [allenai_model["embeddings.word_embeddings.weight"]] + ) + encoder._embedding_norm_layer.set_weights( + [allenai_model["embeddings.LayerNorm.weight"], + allenai_model["embeddings.LayerNorm.bias"]] + ) + encoder._type_embedding_layer.set_weights( + [np.repeat( + allenai_model["embeddings.token_type_embeddings.weight"], + 2, + axis=0 + )] + ) + encoder._position_embedding_layer.set_weights( + [allenai_model["embeddings.position_embeddings.weight"]] + ) + encoder._pooler_layer.set_weights( + [allenai_model["pooler.dense.weight"], + allenai_model["pooler.dense.bias"]] + ) + for layer_num in range(num_layers): + encoder._transformer_layers[layer_num]._attention_layer._global_key_dense.set_weights( + [allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.weight"].T.reshape( + (hidden_size, num_attention_heads, head_size)), + allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.bias"].reshape( + (num_attention_heads, head_size))] + ) + 
encoder._transformer_layers[layer_num]._attention_layer._global_query_dense.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.attention.self.query_global.weight"].T.reshape(
+            (hidden_size, num_attention_heads, head_size)),
+         allenai_model[f"encoder.layer.{layer_num}.attention.self.query_global.bias"].reshape(
+            (num_attention_heads, head_size))]
+    )
+    encoder._transformer_layers[layer_num]._attention_layer._global_value_dense.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.attention.self.value_global.weight"].T.reshape(
+            (hidden_size, num_attention_heads, head_size)),
+         allenai_model[f"encoder.layer.{layer_num}.attention.self.value_global.bias"].reshape(
+            (num_attention_heads, head_size))]
+    )
+    encoder._transformer_layers[layer_num]._attention_layer._key_dense.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.attention.self.key.weight"].T.reshape(
+            (hidden_size, num_attention_heads, head_size)),
+         allenai_model[f"encoder.layer.{layer_num}.attention.self.key.bias"].reshape(
+            (num_attention_heads, head_size))]
+    )
+    encoder._transformer_layers[layer_num]._attention_layer._query_dense.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.attention.self.query.weight"].T.reshape(
+            (hidden_size, num_attention_heads, head_size)),
+         allenai_model[f"encoder.layer.{layer_num}.attention.self.query.bias"].reshape((num_attention_heads, head_size))]
+    )
+    encoder._transformer_layers[layer_num]._attention_layer._value_dense.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.attention.self.value.weight"].T.reshape(
+            (hidden_size, num_attention_heads, head_size)),
+         allenai_model[f"encoder.layer.{layer_num}.attention.self.value.bias"].reshape((num_attention_heads, head_size))]
+    )
+    encoder._transformer_layers[layer_num]._attention_layer._output_dense.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.attention.output.dense.weight"].T,
+         allenai_model[f"encoder.layer.{layer_num}.attention.output.dense.bias"]]
+    )
+    encoder._transformer_layers[layer_num]._attention_layer_norm.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.attention.output.LayerNorm.weight"],
+         allenai_model[f"encoder.layer.{layer_num}.attention.output.LayerNorm.bias"]]
+    )
+    encoder._transformer_layers[layer_num]._intermediate_dense.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.weight"].T,
+         allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.bias"]]
+    )
+    encoder._transformer_layers[layer_num]._output_dense.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.output.dense.weight"].T,
+         allenai_model[f"encoder.layer.{layer_num}.output.dense.bias"]]
+    )
+    encoder._transformer_layers[layer_num]._output_layer_norm.set_weights(
+        [allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.weight"],
+         allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.bias"]]
+    )
+
+def convert_checkpoint(output_path):
+  output_dir, _ = os.path.split(output_path)
+  tf.io.gfile.makedirs(output_dir)
+
+  encoder = _create_longformer_model()
+  allenai_model = _get_pytorch_longformer_model()
+  sequence_length = 128
+  batch_size = 2
+  word_id_data = np.random.randint(10, size=(batch_size, sequence_length), dtype=np.int32)
+  mask_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32)
+  type_id_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32)
+  inputs = {
+      'input_word_ids': word_id_data,
+      'input_mask': mask_data,
+      'input_type_ids': type_id_data,
+  }
+  encoder(inputs)
+  convert(encoder,
allenai_model) + tf.train.Checkpoint(encoder=encoder).write(output_path) + + +def main(argv): + convert_checkpoint("longformer-4096/longformer") + + +if __name__ == "__main__": + app.run(main) diff --git a/official/projects/longformer/utils/get_parameters_from_pretrained_pytorch_checkpoint.py b/official/projects/longformer/utils/get_parameters_from_pretrained_pytorch_checkpoint.py deleted file mode 100644 index d646364b5..000000000 --- a/official/projects/longformer/utils/get_parameters_from_pretrained_pytorch_checkpoint.py +++ /dev/null @@ -1,9 +0,0 @@ -import transformers -pretrained_lm = "allenai/longformer-base-4096" - -model = transformers.AutoModel.from_pretrained(pretrained_lm) - -import pickle -pickle.dump({ - n: p.data.numpy() -for n, p in model.named_parameters()}, open(f"{pretrained_lm.replace('/', '_')}.pk", "wb")) \ No newline at end of file -- GitLab From 32867f40629b3b28637215e72da8d0f902eddda4 Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Tue, 1 Mar 2022 17:13:51 -0800 Subject: [PATCH 11/54] add empty line --- official/projects/longformer/longformer_encoder_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/official/projects/longformer/longformer_encoder_test.py b/official/projects/longformer/longformer_encoder_test.py index afa90b775..e8a1bf78f 100644 --- a/official/projects/longformer/longformer_encoder_test.py +++ b/official/projects/longformer/longformer_encoder_test.py @@ -24,6 +24,7 @@ from official.projects.longformer.longformer_encoder import LongformerEncoder @keras_parameterized.run_all_keras_modes class LongformerEncoderTest(keras_parameterized.TestCase): + @combinations.generate(combinations.combine( attention_window=[32, 128], global_attention_size=[0, 1, 2])) def test_encoder(self, attention_window, global_attention_size): -- GitLab From 09e6e71c68ad432efe29ef88f5aa9296fa40cc9a Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Wed, 2 Mar 2022 09:42:00 -0800 Subject: [PATCH 12/54] lint --- .../longformer/longformer_attention.py | 431 +++++++++++------- .../longformer/longformer_attention_test.py | 158 ++++--- .../projects/longformer/longformer_encoder.py | 292 ++++++------ .../longformer/longformer_encoder_block.py | 258 +++++------ .../longformer/longformer_encoder_test.py | 87 ++-- .../longformer/longformer_experiments.py | 136 +++--- official/projects/longformer/train.py | 24 +- 7 files changed, 755 insertions(+), 631 deletions(-) diff --git a/official/projects/longformer/longformer_attention.py b/official/projects/longformer/longformer_attention.py index 1c32e6f51..46e9181bd 100644 --- a/official/projects/longformer/longformer_attention.py +++ b/official/projects/longformer/longformer_attention.py @@ -18,27 +18,20 @@ Longformer attention block. Modified From huggingface/transformers # pylint: disable=g-classes-have-attributes -import collections import math import string import tensorflow as tf import numpy as np -from keras import constraints -from keras import initializers -from keras import regularizers -from keras.engine.base_layer import Layer from keras.layers import core from keras.layers import einsum_dense from keras.utils import tf_utils -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.util.tf_export import keras_export from official.modeling.tf_utils import get_shape_list -from typing import Dict, List, Optional, Union _CHR_IDX = string.ascii_lowercase + def _build_attention_equation(rank, attn_axes): """Builds einsum equations for the attention computation. 
Query, key, value inputs after projection are expected to have the shape as: @@ -64,7 +57,7 @@ def _build_attention_equation(rank, attn_axes): # `batch_dims` includes the head dim. batch_dims = tuple(np.delete(range(rank), attn_axes + (rank - 1,))) letter_offset = rank - source_notation = "" + source_notation = '' for i in range(rank): if i in batch_dims or i == rank - 1: source_notation += target_notation[i] @@ -72,23 +65,21 @@ def _build_attention_equation(rank, attn_axes): source_notation += _CHR_IDX[letter_offset] letter_offset += 1 - product_notation = "".join([target_notation[i] for i in batch_dims] + + product_notation = ''.join([target_notation[i] for i in batch_dims] + [target_notation[i] for i in attn_axes] + [source_notation[i] for i in attn_axes]) - dot_product_equation = "%s,%s->%s" % (source_notation, target_notation, - product_notation) + dot_product_equation = f'{source_notation},{target_notation}->{product_notation}' attn_scores_rank = len(product_notation) - combine_equation = "%s,%s->%s" % (product_notation, source_notation, - target_notation) + combine_equation = f'{product_notation},{source_notation}->{target_notation}' return dot_product_equation, combine_equation, attn_scores_rank def _build_proj_equation(free_dims, bound_dims, output_dims): """Builds an einsum equation for projections inside multi-head attention.""" - input_str = "" - kernel_str = "" - output_str = "" - bias_axes = "" + input_str = '' + kernel_str = '' + output_str = '' + bias_axes = '' letter_offset = 0 for i in range(free_dims): char = _CHR_IDX[i + letter_offset] @@ -107,7 +98,7 @@ def _build_proj_equation(free_dims, bound_dims, output_dims): kernel_str += char output_str += char bias_axes += char - equation = "%s,%s->%s" % (input_str, kernel_str, output_str) + equation = f'{input_str},{kernel_str}->{output_str}' return equation, bias_axes, len(output_str) @@ -115,8 +106,17 @@ def _build_proj_equation(free_dims, bound_dims, output_dims): def _get_output_shape(output_rank, known_last_dims): return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims) + @tf.keras.utils.register_keras_serializable(package="Text") class LongformerAttention(tf.keras.layers.MultiHeadAttention): + """LongformerAttention + + Args: + attention_window: int representing the window size for attention. + layer_id: int of the id of the layer. + global_attention_size: the size of global attention used for each token. + """ + def __init__(self, attention_window, layer_id, @@ -124,14 +124,16 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): **kwargs): super().__init__(**kwargs) self._layer_id = layer_id - _attention_window = attention_window + self._attention_window = attention_window assert ( - _attention_window % 2 == 0 - ), f"`attention_window` for layer {self._layer_id} has to be an even value. Given {attention_window}" + self._attention_window % 2 == 0 + ), f"`attention_window` for layer {self._layer_id} has to be an even " \ + f"value. Given {self.attention_window}" assert ( - _attention_window > 0 - ), f"`attention_window` for layer {self._layer_id} has to be positive. Given {attention_window}" - self._one_sided_attn_window_size = _attention_window // 2 + self._attention_window > 0 + ), f"`attention_window` for layer {self._layer_id} has to be positive. 
" \ + f"Given {self.attention_window}" + self._one_sided_attn_window_size = self._attention_window // 2 self.global_attention_size = global_attention_size def _build_from_signature(self, query, value, key=None): @@ -228,16 +230,15 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): # self._output_dense = self._make_output_dense( # free_dims, common_kwargs, "attention_output") self._output_dense = tf.keras.layers.Dense( - units=self._num_heads * self._key_dim, name="dense", - **common_kwargs - ) + units=self._num_heads * self._key_dim, name="dense", + **common_kwargs + ) def call(self, hidden_states, attention_mask=None, is_index_masked=None, is_index_global_attn=None, - is_global_attn=None, training=None): """Applies Dot-product attention with query, key, value tensors. This function defines the computation inside `call` with projected @@ -256,7 +257,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): attention_scores: Multi-headed attention weights. """ if not self._built_from_signature: - self._build_from_signature(query=hidden_states, value=hidden_states, key=hidden_states) + self._build_from_signature(query=hidden_states, value=hidden_states, + key=hidden_states) # N = `num_attention_heads` # H = `size_per_head` @@ -272,7 +274,7 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): # Note: Applying scalar multiply at the smaller end of einsum improves # XLA performance, but may introduce slight numeric differences in # the Transformer attention head. - query = tf.multiply(query, 1.0 / math.sqrt(float(self._key_dim))) # (B, T, N, key_dim) + query = tf.multiply(query, 1.0 / math.sqrt(float(self._key_dim))) batch_size, seq_len, num_heads, head_dim = get_shape_list(query) # attn_probs = (batch_size, seq_len, num_heads, window*2+1) @@ -293,8 +295,12 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if tf.executing_eagerly(): tf.debugging.assert_equal( get_shape_list(attn_scores), - [batch_size, seq_len, self._num_heads, self._one_sided_attn_window_size * 2 + 1], - message=f"attn_probs should be of size ({batch_size}, {seq_len}, {num_heads}, {self._one_sided_attn_window_size * 2 + 1}), but is of size {get_shape_list(attn_scores)}", + [batch_size, seq_len, self._num_heads, + self._one_sided_attn_window_size * 2 + 1], + message=f"attn_probs should be of size " + f"({batch_size}, {seq_len}, {num_heads}, " + f"{self._one_sided_attn_window_size * 2 + 1})," + f" but is of size {get_shape_list(attn_scores)}", ) # compute global attn indices required through out forward fn @@ -303,7 +309,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): is_index_global_attn_nonzero, is_local_index_global_attn_nonzero, is_local_index_no_global_attn_nonzero, - ) = self._get_global_attn_indices(is_index_global_attn, self.global_attention_size) + ) = self._get_global_attn_indices(is_index_global_attn, + self.global_attention_size) # this function is only relevant for global attention if self.global_attention_size > 0: attn_scores = self._concat_with_global_key_attn_probs( @@ -320,14 +327,18 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): attn_probs = tf.nn.softmax(attn_scores, axis=-1) - # softmax sometimes inserts NaN if all positions are masked, replace them with 0 + # softmax sometimes inserts NaN if all positions are masked, + # replace them with 0 # Make sure to create a mask with the proper shape: - # if is_global_attn==True => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + 
max_num_global_attn_indices + 1] - # if is_global_attn==False => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + 1] + # if is_global_attn==True => [batch_size, seq_len, self.num_heads, + # self.one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1] + # if is_global_attn==False => [batch_size, seq_len, self.num_heads, + # self.one_sided_attn_window_size * 2 + 1] if self.global_attention_size > 0: masked_index = tf.tile( is_index_masked[:, :, None, None], - (1, 1, self._num_heads, self._one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1), + (1, 1, self._num_heads, + self._one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1), ) else: masked_index = tf.tile( @@ -347,14 +358,17 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): tf.debugging.assert_equal( get_shape_list(layer_head_mask), [self._num_heads], - message=f"Head mask for a single layer should be of size {(self._num_heads)}, but is {get_shape_list(layer_head_mask)}", + message=f"Head mask for a single layer should be of size " + f"{(self._num_heads)}, but is " + f"{get_shape_list(layer_head_mask)}", ) attn_probs = tf.reshape(layer_head_mask, (1, 1, -1, 1)) * attn_probs # apply dropout attn_probs = self._dropout_layer(attn_probs, training=training) - value_vectors = tf.reshape(value, (batch_size, seq_len, self._num_heads, self._key_dim)) # TODO: _key_dim == _value_dim + value_vectors = tf.reshape(value, (batch_size, seq_len, self._num_heads, + self._key_dim)) # if global attention, compute sum of global and local attn if self.global_attention_size > 0: @@ -377,33 +391,35 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): message="Unexpected size", ) - attn_output = tf.reshape(attn_output, (batch_size, seq_len, self._num_heads * self._key_dim)) # FIXME + attn_output = tf.reshape(attn_output, ( + batch_size, seq_len, self._num_heads * self._key_dim)) # FIXME # compute value for global attention and overwrite to attention output # TODO: remove the redundant computation if self.global_attention_size > 0: - attn_output, global_attn_probs = self._compute_global_attn_output_from_hidden( - attn_output=attn_output, - hidden_states=hidden_states, - max_num_global_attn_indices=max_num_global_attn_indices, - layer_head_mask=layer_head_mask, - is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, - is_index_global_attn_nonzero=is_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, - is_index_masked=is_index_masked, - training=training, - ) + attn_output, global_attn_probs = \ + self._compute_global_attn_output_from_hidden( + attn_output=attn_output, + hidden_states=hidden_states, + max_num_global_attn_indices=max_num_global_attn_indices, + layer_head_mask=layer_head_mask, + is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero, + is_index_global_attn_nonzero=is_index_global_attn_nonzero, + is_local_index_no_global_attn_nonzero=is_local_index_no_global_attn_nonzero, + is_index_masked=is_index_masked, + training=training, + ) else: - global_attn_probs = tf.zeros((batch_size, self._num_heads, max_num_global_attn_indices, seq_len)) + global_attn_probs = tf.zeros( + (batch_size, self._num_heads, max_num_global_attn_indices, seq_len)) - # make sure that local attention probabilities are set to 0 for indices of global attn - # Make sure to create a mask with the proper shape: - # if is_global_attn==True => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 
2 + max_num_global_attn_indices + 1] - # if is_global_attn==False => [batch_size, seq_len, self.num_heads, self.one_sided_attn_window_size * 2 + 1] + # make sure that local attention probabilities are set to 0 for indices of + # global attn if self.global_attention_size > 0: masked_global_attn_index = tf.tile( is_index_global_attn[:, :, None, None], - (1, 1, self._num_heads, self._one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1), + (1, 1, self._num_heads, + self._one_sided_attn_window_size * 2 + max_num_global_attn_indices + 1), ) else: masked_global_attn_index = tf.tile( @@ -413,28 +429,30 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): attn_probs = tf.where( masked_global_attn_index, - tf.zeros(get_shape_list(masked_global_attn_index), dtype=attn_probs.dtype), + tf.zeros(get_shape_list(masked_global_attn_index), + dtype=attn_probs.dtype), attn_probs, ) # we can return extra information here - attention_output = attn_output # (attn_output, attn_probs, global_attn_probs) + attention_output = attn_output # (attn_output, attn_probs, global_attn_probs) return attention_output def get_config(self): config = { - "layer_id": self._layer_id, - "attention_window": self._one_sided_attn_window_size, + "layer_id": self._layer_id, + "attention_window": self._one_sided_attn_window_size, } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def _sliding_chunks_query_key_matmul(self, query, key, window_overlap): """ - Matrix multiplication of query and key tensors using with a sliding window attention pattern. This - implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an - overlap of size window_overlap + Matrix multiplication of query and key tensors using with a sliding window + attention pattern. This implementation splits the input into overlapping + chunks of size 2w (e.g. 512 for pretrained Longformer) with an overlap of + size window_overlap """ batch_size, seq_len, num_heads, head_dim = get_shape_list(query) @@ -442,22 +460,26 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): tf.debugging.assert_equal( seq_len % (window_overlap * 2), 0, - message=f"Sequence length should be multiple of {window_overlap * 2}. Given {seq_len}", + message=f"Sequence length should be multiple of {window_overlap * 2}. 
" + f"Given {seq_len}", ) tf.debugging.assert_equal( get_shape_list(query), get_shape_list(key), - message=f"Shape of query and key should be equal, but got query: {get_shape_list(query)} and key: {get_shape_list(key)}", + message=f"Shape of query and key should be equal, but got query: " + f"{get_shape_list(query)} and key: {get_shape_list(key)}", ) chunks_count = seq_len // window_overlap - 1 - # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size window_overlap * 2 + # group batch_size and num_heads dimensions into one, + # then chunk seq_len into chunks of size window_overlap * 2 query = tf.reshape( tf.transpose(query, (0, 2, 1, 3)), (batch_size * num_heads, seq_len, head_dim), ) - key = tf.reshape(tf.transpose(key, (0, 2, 1, 3)), (batch_size * num_heads, seq_len, head_dim)) + key = tf.reshape(tf.transpose(key, (0, 2, 1, 3)), + (batch_size * num_heads, seq_len, head_dim)) chunked_query = self._chunk(query, window_overlap) chunked_key = self._chunk(key, window_overlap) @@ -466,24 +488,31 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): # bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim # bcxy: batch_size * num_heads x chunks x 2window_overlap x 2window_overlap chunked_query = tf.cast(chunked_query, dtype=chunked_key.dtype) - chunked_attention_scores = tf.einsum("bcxd,bcyd->bcxy", chunked_query, chunked_key) # multiply + chunked_attention_scores = tf.einsum("bcxd,bcyd->bcxy", chunked_query, + chunked_key) # multiply # convert diagonals into columns paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 1], [0, 0]]) - diagonal_chunked_attention_scores = self._pad_and_transpose_last_two_dims(chunked_attention_scores, paddings) - - # allocate space for the overall attention matrix where the chunks are combined. The last dimension - # has (window_overlap * 2 + 1) columns. The first (window_overlap) columns are the window_overlap lower triangles (attention from a word to - # window_overlap previous words). The following column is attention score from each word to itself, then + diagonal_chunked_attention_scores = self._pad_and_transpose_last_two_dims( + chunked_attention_scores, paddings) + + # allocate space for the overall attention matrix where the chunks are + # combined. The last dimension + # has (window_overlap * 2 + 1) columns. The first (window_overlap) columns + # are the window_overlap lower triangles (attention from a word to + # window_overlap previous words). The following column is attention score + # from each word to itself, then # followed by window_overlap columns for the upper triangle. 
- # copy parts from diagonal_chunked_attention_scores into the combined matrix of attentions - # - copying the main diagonal and the upper triangle + # copy parts from diagonal_chunked_attention_scores into the combined matrix + # of attentions - copying the main diagonal and the upper triangle # TODO: This code is most likely not very efficient and should be improved diagonal_attn_scores_up_triang = tf.concat( [ - diagonal_chunked_attention_scores[:, :, :window_overlap, : window_overlap + 1], - diagonal_chunked_attention_scores[:, -1:, window_overlap:, : window_overlap + 1], + diagonal_chunked_attention_scores[:, :, :window_overlap, + : window_overlap + 1], + diagonal_chunked_attention_scores[:, -1:, window_overlap:, + : window_overlap + 1], ], axis=1, ) @@ -495,7 +524,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): (batch_size * num_heads, 1, window_overlap, window_overlap), dtype=diagonal_chunked_attention_scores.dtype, ), - diagonal_chunked_attention_scores[:, :, -(window_overlap + 1): -1, window_overlap + 1:], + diagonal_chunked_attention_scores[:, :, -(window_overlap + 1): -1, + window_overlap + 1:], ], axis=1, ) @@ -514,13 +544,13 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): axis=1, ) first_chunk_mask = ( - tf.tile( - tf.range(chunks_count + 1)[None, :, None, None], - (batch_size * num_heads, 1, window_overlap, window_overlap), - ) - < 1 + tf.tile( + tf.range(chunks_count + 1)[None, :, None, None], + (batch_size * num_heads, 1, window_overlap, window_overlap), + ) + < 1 ) - #first_chunk_mask = tf.repeat(first_chunk_mask, batch_size * num_heads, axis=0) + diagonal_attn_scores_low_triang = tf.where( first_chunk_mask, diagonal_attn_scores_first_chunk, @@ -541,7 +571,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): (0, 2, 1, 3), ) - diagonal_attention_scores = self._mask_invalid_locations(diagonal_attention_scores, window_overlap) + diagonal_attention_scores = self._mask_invalid_locations( + diagonal_attention_scores, window_overlap) return diagonal_attention_scores @@ -549,13 +580,15 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): def _mask_invalid_locations(input_tensor, window_overlap): # create correct upper triangle bool mask mask_2d_upper = tf.reverse( - tf.linalg.band_part(tf.ones(shape=(window_overlap, window_overlap + 1)), -1, 0), + tf.linalg.band_part(tf.ones(shape=(window_overlap, window_overlap + 1)), + -1, 0), axis=[0], ) # pad to full matrix padding = tf.convert_to_tensor( - [[0, get_shape_list(input_tensor)[1] - window_overlap], [0, get_shape_list(input_tensor)[3] - window_overlap - 1]] + [[0, get_shape_list(input_tensor)[1] - window_overlap], + [0, get_shape_list(input_tensor)[3] - window_overlap - 1]] ) # create lower mask @@ -565,20 +598,23 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): mask_2d = mask_2d + tf.reverse(mask_2d, axis=[0, 1]) # broadcast to full matrix - mask_4d = tf.tile(mask_2d[None, :, None, :], (get_shape_list(input_tensor)[0], 1, 1, 1)) + mask_4d = tf.tile(mask_2d[None, :, None, :], + (get_shape_list(input_tensor)[0], 1, 1, 1)) # inf tensor used for masking inf_tensor = -float("inf") * tf.ones_like(input_tensor) # mask - input_tensor = tf.where(tf.math.greater(mask_4d, 0), inf_tensor, input_tensor) + input_tensor = tf.where(tf.math.greater(mask_4d, 0), inf_tensor, + input_tensor) return input_tensor - def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap): + def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, 
+ window_overlap): """ - Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor will be of the - same shape as `attn_probs` + Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. + Returned tensor will be of the same shape as `attn_probs` """ batch_size, seq_len, num_heads, head_dim = get_shape_list(value) @@ -602,7 +638,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): chunks_count = seq_len // window_overlap - 1 - # group batch_size and num_heads dimensions into one, then chunk seq_len into chunks of size 2 window overlap + # group batch_size and num_heads dimensions into one, then chunk seq_len + # into chunks of size 2 window overlap chunked_attn_probs = tf.reshape( tf.transpose(attn_probs, (0, 2, 1, 3)), ( @@ -619,13 +656,17 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): (batch_size * num_heads, seq_len, head_dim), ) - # pad seq_len with w at the beginning of the sequence and another window overlap at the end - paddings = tf.convert_to_tensor([[0, 0], [window_overlap, window_overlap], [0, 0]]) + # pad seq_len with w at the beginning of the sequence and another window + # overlap at the end + paddings = tf.convert_to_tensor( + [[0, 0], [window_overlap, window_overlap], [0, 0]]) padded_value = tf.pad(value, paddings, constant_values=-1) - # chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap + # chunk padded_value into chunks of size 3 window overlap and an overlap of + # size window overlap frame_size = 3 * window_overlap * head_dim - frame_hop_size = (get_shape_list(padded_value)[1] * head_dim - frame_size) // chunks_count + frame_hop_size = (get_shape_list(padded_value)[ + 1] * head_dim - frame_size) // chunks_count chunked_value = tf.signal.frame( tf.reshape(padded_value, (batch_size * num_heads, -1)), frame_size, @@ -639,7 +680,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if tf.executing_eagerly(): tf.debugging.assert_equal( get_shape_list(chunked_value), - [batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim], + [batch_size * num_heads, chunks_count + 1, 3 * window_overlap, + head_dim], message="Chunked value has the wrong shape", ) @@ -658,8 +700,10 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): hidden_states_padded = tf.pad( hidden_states_padded, paddings ) # padding value is not important because it will be overwritten - batch_size, chunk_size, seq_length, hidden_dim = get_shape_list(hidden_states_padded) - hidden_states_padded = tf.reshape(hidden_states_padded, (batch_size, chunk_size, hidden_dim, seq_length)) + batch_size, chunk_size, seq_length, hidden_dim = get_shape_list( + hidden_states_padded) + hidden_states_padded = tf.reshape(hidden_states_padded, ( + batch_size, chunk_size, hidden_dim, seq_length)) return hidden_states_padded @@ -681,21 +725,27 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ] """ - total_num_heads, num_chunks, window_overlap, hidden_dim = get_shape_list(chunked_hidden_states) - paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]]) + total_num_heads, num_chunks, window_overlap, hidden_dim = get_shape_list( + chunked_hidden_states) + paddings = tf.convert_to_tensor( + [[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]]) + chunked_hidden_states = tf.pad( chunked_hidden_states, paddings - ) # 
total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten + ) + chunked_hidden_states = tf.reshape( chunked_hidden_states, (total_num_heads, num_chunks, -1) - ) # total_num_heads x num_chunks x window_overlapL+window_overlapwindow_overlap+window_overlap + ) chunked_hidden_states = chunked_hidden_states[ :, :, :-window_overlap - ] # total_num_heads x num_chunks x window_overlapL+window_overlapwindow_overlap + ] chunked_hidden_states = tf.reshape( chunked_hidden_states, - (total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim), - ) # total_num_heads x num_chunks, window_overlap x hidden_dim+window_overlap + ( + total_num_heads, num_chunks, window_overlap, + window_overlap + hidden_dim), + ) chunked_hidden_states = chunked_hidden_states[:, :, :, :-1] return chunked_hidden_states @@ -709,16 +759,21 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): # define frame size and frame stride (similar to convolution) frame_hop_size = window_overlap * hidden_dim frame_size = 2 * frame_hop_size - hidden_states = tf.reshape(hidden_states, (batch_size, seq_length * hidden_dim)) + hidden_states = tf.reshape(hidden_states, + (batch_size, seq_length * hidden_dim)) # chunk with overlap - chunked_hidden_states = tf.signal.frame(hidden_states, frame_size, frame_hop_size) + chunked_hidden_states = tf.signal.frame(hidden_states, frame_size, + frame_hop_size) if tf.executing_eagerly(): tf.debugging.assert_equal( get_shape_list(chunked_hidden_states), [batch_size, num_output_chunks, frame_size], - message=f"Make sure chunking is correctly applied. `Chunked hidden states should have output dimension {[batch_size, frame_size, num_output_chunks]}, but got {get_shape_list(chunked_hidden_states)}.", + message=f"Make sure chunking is correctly applied. 
`Chunked hidden " + f"states should have output dimension" + f" {[batch_size, frame_size, num_output_chunks]}, but got " + f"{get_shape_list(chunked_hidden_states)}.", ) chunked_hidden_states = tf.reshape( @@ -738,19 +793,25 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): max_num_global_attn_indices = global_attention_size row_indices = tf.range(batch_size) - row_indices = tf.repeat(tf.expand_dims(row_indices, axis=0), repeats=[global_attention_size], axis=0) - row_indices = tf.reshape(row_indices, (batch_size * global_attention_size, 1)) + row_indices = tf.repeat(tf.expand_dims(row_indices, axis=0), + repeats=[global_attention_size], axis=0) + row_indices = tf.reshape(row_indices, + (batch_size * global_attention_size, 1)) col_indices = tf.range(global_attention_size) - col_indices = tf.repeat(tf.expand_dims(col_indices, axis=1), repeats=[batch_size], axis=0) + col_indices = tf.repeat(tf.expand_dims(col_indices, axis=1), + repeats=[batch_size], axis=0) is_index_global_attn_nonzero = tf.concat((row_indices, col_indices), axis=1) - # this is actually same as `is_index_global_attn_nonzero`, since we assume all global attention are the same size - is_local_index_global_attn_nonzero = tf.concat((row_indices, col_indices), axis=1) + # this is actually same as `is_index_global_attn_nonzero`, + # since we assume all global attention are the same size + is_local_index_global_attn_nonzero = tf.concat((row_indices, col_indices), + axis=1) # empty tensor - is_local_index_no_global_attn_nonzero = tf.reshape(tf.expand_dims(tf.range(0), axis=1), (0, 2)) + is_local_index_no_global_attn_nonzero = tf.reshape( + tf.expand_dims(tf.range(0), axis=1), (0, 2)) return ( max_num_global_attn_indices, is_index_global_attn_nonzero, @@ -759,14 +820,14 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): ) def _concat_with_global_key_attn_probs( - self, - attn_scores, - key_vectors, - query_vectors, - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, + self, + attn_scores, + key_vectors, + query_vectors, + max_num_global_attn_indices, + is_index_global_attn_nonzero, + is_local_index_global_attn_nonzero, + is_local_index_no_global_attn_nonzero, ): batch_size = get_shape_list(key_vectors)[0] @@ -786,11 +847,14 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): ) # (batch_size, seq_len, num_heads, max_num_global_attn_indices) - attn_probs_from_global_key = tf.einsum("blhd,bshd->blhs", query_vectors, key_vectors_only_global) + attn_probs_from_global_key = tf.einsum("blhd,bshd->blhs", query_vectors, + key_vectors_only_global) # (batch_size, max_num_global_attn_indices, seq_len, num_heads) - attn_probs_from_global_key_trans = tf.transpose(attn_probs_from_global_key, (0, 3, 1, 2)) - mask_shape = (get_shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple( + attn_probs_from_global_key_trans = tf.transpose(attn_probs_from_global_key, + (0, 3, 1, 2)) + mask_shape = (get_shape_list(is_local_index_no_global_attn_nonzero)[ + 0],) + tuple( get_shape_list(attn_probs_from_global_key_trans)[-2:] ) mask = tf.ones(mask_shape) * -10000.0 @@ -804,7 +868,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): ) # (batch_size, seq_len, num_heads, max_num_global_attn_indices) - attn_probs_from_global_key = tf.transpose(attn_probs_from_global_key_trans, (0, 2, 3, 1)) + attn_probs_from_global_key = tf.transpose(attn_probs_from_global_key_trans, + (0, 2, 3, 1)) # concat to attn_probs # 
(batch_size, seq_len, num_heads, extra attention count + 2*window+1) @@ -812,21 +877,21 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): return attn_scores def _compute_attn_output_with_global_indices( - self, - value_vectors, - attn_probs, - max_num_global_attn_indices, - is_index_global_attn_nonzero, - is_local_index_global_attn_nonzero, + self, + value_vectors, + attn_probs, + max_num_global_attn_indices, + is_index_global_attn_nonzero, + is_local_index_global_attn_nonzero, ): batch_size = get_shape_list(attn_probs)[0] # cut local attn probs to global only attn_probs_only_global = attn_probs[:, :, :, :max_num_global_attn_indices] - # attn_probs_only_global = tf.slice(attn_probs, [0, 0, 0, 0], get_shape_list(attn_probs)[: -1] + [max_num_global_attn_indices]) # select global value vectors - global_value_vectors = tf.gather_nd(value_vectors, is_index_global_attn_nonzero) + global_value_vectors = tf.gather_nd(value_vectors, + is_index_global_attn_nonzero) # create only global value vectors value_vectors_only_global = tf.scatter_nd( @@ -841,10 +906,12 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): ) # compute attn output only global - attn_output_only_global = tf.einsum("blhs,bshd->blhd", attn_probs_only_global, value_vectors_only_global) + attn_output_only_global = tf.einsum("blhs,bshd->blhd", + attn_probs_only_global, + value_vectors_only_global) # reshape attn probs - attn_probs_without_global = attn_probs[:, :, :, max_num_global_attn_indices:] - # attn_probs_without_global = tf.slice(attn_probs, [0, 0, 0, max_num_global_attn_indices], get_shape_list(attn_probs)[: -1] + [get_shape_list(attn_probs)[-1] - max_num_global_attn_indices]) + attn_probs_without_global = attn_probs[:, :, :, + max_num_global_attn_indices:] # compute attn output with global attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value( @@ -854,29 +921,33 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): return attn_output_only_global + attn_output_without_global def _compute_global_attn_output_from_hidden( - self, - attn_output, - hidden_states, - max_num_global_attn_indices, - layer_head_mask, - is_local_index_global_attn_nonzero, - is_index_global_attn_nonzero, - is_local_index_no_global_attn_nonzero, - is_index_masked, - training, + self, + attn_output, + hidden_states, + max_num_global_attn_indices, + layer_head_mask, + is_local_index_global_attn_nonzero, + is_index_global_attn_nonzero, + is_local_index_no_global_attn_nonzero, + is_index_masked, + training, ): batch_size, seq_len = get_shape_list(hidden_states)[:2] # prepare global hidden states - global_attn_hidden_states = tf.gather_nd(hidden_states, is_index_global_attn_nonzero) + global_attn_hidden_states = tf.gather_nd(hidden_states, + is_index_global_attn_nonzero) global_attn_hidden_states = tf.scatter_nd( is_local_index_global_attn_nonzero, global_attn_hidden_states, - shape=(batch_size, max_num_global_attn_indices, self._num_heads * self._key_dim), + shape=( + batch_size, max_num_global_attn_indices, + self._num_heads * self._key_dim), ) # global key, query, value - global_query_vectors_only_global = self._global_query_dense(global_attn_hidden_states) + global_query_vectors_only_global = self._global_query_dense( + global_attn_hidden_states) global_key_vectors = self._global_key_dense(hidden_states) global_value_vectors = self._global_value_dense(hidden_states) @@ -884,18 +955,24 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): global_query_vectors_only_global /= 
tf.math.sqrt( tf.cast(self._key_dim, dtype=global_query_vectors_only_global.dtype) ) - global_query_vectors_only_global = self.reshape_and_transpose(global_query_vectors_only_global, batch_size) - global_key_vectors = self.reshape_and_transpose(global_key_vectors, batch_size) - global_value_vectors = self.reshape_and_transpose(global_value_vectors, batch_size) + global_query_vectors_only_global = self.reshape_and_transpose( + global_query_vectors_only_global, batch_size) + global_key_vectors = self.reshape_and_transpose(global_key_vectors, + batch_size) + global_value_vectors = self.reshape_and_transpose(global_value_vectors, + batch_size) # compute attn scores - global_attn_scores = tf.matmul(global_query_vectors_only_global, global_key_vectors, transpose_b=True) + global_attn_scores = tf.matmul(global_query_vectors_only_global, + global_key_vectors, transpose_b=True) if tf.executing_eagerly(): tf.debugging.assert_equal( get_shape_list(global_attn_scores), [batch_size * self._num_heads, max_num_global_attn_indices, seq_len], - message=f"global_attn_scores have the wrong size. Size should be {(batch_size * self._num_heads, max_num_global_attn_indices, seq_len)}, but is {get_shape_list(global_attn_scores)}.", + message=f"global_attn_scores have the wrong size. Size should be" + f"{(batch_size * self._num_heads, max_num_global_attn_indices, seq_len)}, " + f"but is {get_shape_list(global_attn_scores)}.", ) global_attn_scores = tf.reshape( @@ -903,11 +980,13 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): (batch_size, self._num_heads, max_num_global_attn_indices, seq_len), ) global_attn_scores_trans = tf.transpose(global_attn_scores, (0, 2, 1, 3)) - mask_shape = (get_shape_list(is_local_index_no_global_attn_nonzero)[0],) + tuple( + mask_shape = (get_shape_list(is_local_index_no_global_attn_nonzero)[ + 0],) + tuple( get_shape_list(global_attn_scores_trans)[-2:] ) global_attn_mask = tf.ones(mask_shape) * -10000.0 - global_attn_mask = tf.cast(global_attn_mask, dtype=global_attn_scores_trans.dtype) + global_attn_mask = tf.cast(global_attn_mask, + dtype=global_attn_scores_trans.dtype) # scatter mask global_attn_scores_trans = tf.tensor_scatter_nd_update( @@ -916,9 +995,10 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): global_attn_mask, ) global_attn_scores = tf.transpose(global_attn_scores_trans, (0, 2, 1, 3)) - + # mask global attn scores - attn_mask = tf.tile(is_index_masked[:, None, None, :], (1, get_shape_list(global_attn_scores)[1], 1, 1)) + attn_mask = tf.tile(is_index_masked[:, None, None, :], + (1, get_shape_list(global_attn_scores)[1], 1, 1)) global_attn_scores = tf.where(attn_mask, -10000.0, global_attn_scores) global_attn_scores = tf.reshape( global_attn_scores, @@ -934,17 +1014,22 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): tf.debugging.assert_equal( get_shape_list(layer_head_mask), [self._num_heads], - message=f"Head mask for a single layer should be of size {(self._num_heads)}, but is {get_shape_list(layer_head_mask)}", + message=f"Head mask for a single layer should be of size " + f"{(self._num_heads)}, but is {get_shape_list(layer_head_mask)}", ) - global_attn_probs_float = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( - global_attn_probs_float, (batch_size, self._num_heads, max_num_global_attn_indices, seq_len) + global_attn_probs_float = tf.reshape(layer_head_mask, + (1, -1, 1, 1)) * tf.reshape( + global_attn_probs_float, + (batch_size, self._num_heads, max_num_global_attn_indices, seq_len) ) 
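For reference, a minimal standalone sketch (not part of the patch) of the shape flow in this global-attention path; the toy sizes and all variable names below are illustrative only:

```python
import tensorflow as tf

# Assumed toy sizes for illustration: batch=2, heads=2, key_dim=4,
# seq_len=8, and one global-attention token per example.
bsz, heads, dim, seq, n_glob = 2, 2, 4, 8, 1
q_glob = tf.random.normal((bsz * heads, n_glob, dim))  # global queries only
k_all = tf.random.normal((bsz * heads, seq, dim))      # keys over the full sequence
v_all = tf.random.normal((bsz * heads, seq, dim))      # values over the full sequence

scores = tf.matmul(q_glob, k_all, transpose_b=True)    # (bsz*heads, n_glob, seq)
probs = tf.nn.softmax(scores, axis=-1)                  # rows sum to 1 over seq
out = tf.matmul(probs, v_all)                           # (bsz*heads, n_glob, dim)
assert out.shape == (bsz * heads, n_glob, dim)
```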
global_attn_probs_float = tf.reshape( - global_attn_probs_float, (batch_size * self._num_heads, max_num_global_attn_indices, seq_len) + global_attn_probs_float, + (batch_size * self._num_heads, max_num_global_attn_indices, seq_len) ) # dropout - global_attn_probs = self._global_dropout_layer(global_attn_probs_float, training=training) + global_attn_probs = self._global_dropout_layer(global_attn_probs_float, + training=training) # global attn output global_attn_output = tf.matmul(global_attn_probs, global_value_vectors) @@ -952,8 +1037,11 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): if tf.executing_eagerly(): tf.debugging.assert_equal( get_shape_list(global_attn_output), - [batch_size * self._num_heads, max_num_global_attn_indices, self._key_dim], - message=f"global_attn_output tensor has the wrong size. Size should be {(batch_size * self._num_heads, max_num_global_attn_indices, self._key_dim)}, but is {get_shape_list(global_attn_output)}.", + [batch_size * self._num_heads, max_num_global_attn_indices, + self._key_dim], + message=f"global_attn_output tensor has the wrong size. Size should be " + f"{(batch_size * self._num_heads, max_num_global_attn_indices, self._key_dim)}, " + f"but is {get_shape_list(global_attn_output)}.", ) global_attn_output = tf.reshape( @@ -977,7 +1065,8 @@ class LongformerAttention(tf.keras.layers.MultiHeadAttention): ) global_attn_probs = tf.reshape( - global_attn_probs, (batch_size, self._num_heads, max_num_global_attn_indices, seq_len) + global_attn_probs, + (batch_size, self._num_heads, max_num_global_attn_indices, seq_len) ) attn_output = self._output_dense(attn_output) diff --git a/official/projects/longformer/longformer_attention_test.py b/official/projects/longformer/longformer_attention_test.py index 656d8eb6d..1e869ccb6 100644 --- a/official/projects/longformer/longformer_attention_test.py +++ b/official/projects/longformer/longformer_attention_test.py @@ -12,25 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for the attention layer.""" +"""Tests for official.nlp.projects.longformer.longformer_attention.""" import numpy as np import tensorflow as tf from tensorflow.python.distribute import combinations -from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from tensorflow.python.keras import \ + keras_parameterized # pylint: disable=g-direct-tensorflow-import from official.projects.longformer import longformer_attention from official.modeling.tf_utils import get_shape_list def _create_mock_attention_data( - num_heads, - key_dim, - value_dim, - q_seq_length, - kv_seq_length, - batch_size, - include_mask=False): + num_heads, + key_dim, + value_dim, + q_seq_length, + kv_seq_length, + batch_size, + include_mask=False): """Creates mock testing data. 
Args: @@ -48,15 +49,15 @@ def _create_mock_attention_data( value_shape = (batch_size, kv_seq_length, value_dim) data = dict( - query=tf.random.normal(shape=query_shape), - value=tf.random.normal(shape=value_shape), - key=tf.random.normal(shape=value_shape)) + query=tf.random.normal(shape=query_shape), + value=tf.random.normal(shape=value_shape), + key=tf.random.normal(shape=value_shape)) total_seq_length = kv_seq_length if include_mask: mask_shape = (batch_size, num_heads, q_seq_length, total_seq_length) - mask_data = np.random.randint(2, size=mask_shape).astype("float32") + mask_data = np.random.randint(2, size=mask_shape).astype('float32') mask_data = dict(attention_mask=mask_data) data.update(mask_data) @@ -65,6 +66,12 @@ def _create_mock_attention_data( @keras_parameterized.run_all_keras_modes class LongformerAttentionTest(keras_parameterized.TestCase): + + def setUp(self): + super(LongformerAttentionTest, self).setUp() + np.random.seed(0) + tf.random.set_seed(0) + def _get_hidden_states(self): return tf.convert_to_tensor( [ @@ -116,25 +123,33 @@ class LongformerAttentionTest(keras_parameterized.TestCase): def test_diagonalize(self): hidden_states = self._get_hidden_states() - hidden_states = tf.reshape(hidden_states, (1, 8, 4)) # set seq length = 8, hidden dim = 4 - chunked_hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) + hidden_states = tf.reshape(hidden_states, + (1, 8, 4)) # set seq length = 8, hidden dim = 4 + chunked_hidden_states = longformer_attention.LongformerAttention._chunk( + hidden_states, window_overlap=2) window_overlap_size = get_shape_list(chunked_hidden_states)[2] self.assertTrue(window_overlap_size == 4) - padded_hidden_states = longformer_attention.LongformerAttention._pad_and_diagonalize(chunked_hidden_states) + padded_hidden_states = longformer_attention.LongformerAttention._pad_and_diagonalize( + chunked_hidden_states) self.assertTrue( - get_shape_list(padded_hidden_states)[-1] == get_shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1 + get_shape_list(padded_hidden_states)[-1] == + get_shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1 ) # first row => [0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000] - tf.debugging.assert_near(padded_hidden_states[0, 0, 0, :4], chunked_hidden_states[0, 0, 0], rtol=1e-3) - tf.debugging.assert_near(padded_hidden_states[0, 0, 0, 4:], tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3) + tf.debugging.assert_near(padded_hidden_states[0, 0, 0, :4], + chunked_hidden_states[0, 0, 0], rtol=1e-3) + tf.debugging.assert_near(padded_hidden_states[0, 0, 0, 4:], + tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3) # last row => [0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629] - tf.debugging.assert_near(padded_hidden_states[0, 0, -1, 3:], chunked_hidden_states[0, 0, -1], rtol=1e-3) + tf.debugging.assert_near(padded_hidden_states[0, 0, -1, 3:], + chunked_hidden_states[0, 0, -1], rtol=1e-3) tf.debugging.assert_near( - padded_hidden_states[0, 0, -1, :3], tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3 + padded_hidden_states[0, 0, -1, :3], + tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3 ) def test_pad_and_transpose_last_two_dims(self): @@ -142,16 +157,21 @@ class LongformerAttentionTest(keras_parameterized.TestCase): self.assertTrue(get_shape_list(hidden_states), [1, 8, 4]) # pad along seq length dim - paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]], dtype=tf.dtypes.int32) + paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]], + 
dtype=tf.dtypes.int32) - hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) - padded_hidden_states = longformer_attention.LongformerAttention._pad_and_transpose_last_two_dims(hidden_states, paddings) + hidden_states = longformer_attention.LongformerAttention._chunk( + hidden_states, window_overlap=2) + padded_hidden_states = longformer_attention.LongformerAttention._pad_and_transpose_last_two_dims( + hidden_states, paddings) self.assertTrue(get_shape_list(padded_hidden_states) == [1, 1, 8, 5]) expected_added_dim = tf.zeros((5,), dtype=tf.dtypes.float32) - tf.debugging.assert_near(expected_added_dim, padded_hidden_states[0, 0, -1, :], rtol=1e-6) + tf.debugging.assert_near(expected_added_dim, + padded_hidden_states[0, 0, -1, :], rtol=1e-6) tf.debugging.assert_near( - hidden_states[0, 0, -1, :], tf.reshape(padded_hidden_states, (1, -1))[0, 24:32], rtol=1e-6 + hidden_states[0, 0, -1, :], + tf.reshape(padded_hidden_states, (1, -1))[0, 24:32], rtol=1e-6 ) def test_mask_invalid_locations(self): @@ -159,39 +179,55 @@ class LongformerAttentionTest(keras_parameterized.TestCase): batch_size = 1 seq_length = 8 hidden_size = 4 - hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size)) - hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) - - hid_states_1 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states, 1) - hid_states_2 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states, 2) - hid_states_3 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states[:, :, :, :3], 2) - hid_states_4 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states[:, :, 2:, :], 2) - - self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_1), tf.dtypes.int32)) == 8) - self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_2), tf.dtypes.int32)) == 24) - self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_3), tf.dtypes.int32)) == 24) - self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_4), tf.dtypes.int32)) == 12) + hidden_states = tf.reshape(hidden_states, + (batch_size, seq_length, hidden_size)) + hidden_states = longformer_attention.LongformerAttention._chunk( + hidden_states, window_overlap=2) + + hid_states_1 = longformer_attention.LongformerAttention._mask_invalid_locations( + hidden_states, 1) + hid_states_2 = longformer_attention.LongformerAttention._mask_invalid_locations( + hidden_states, 2) + hid_states_3 = longformer_attention.LongformerAttention._mask_invalid_locations( + hidden_states[:, :, :, :3], 2) + hid_states_4 = longformer_attention.LongformerAttention._mask_invalid_locations( + hidden_states[:, :, 2:, :], 2) + + self.assertTrue(tf.math.reduce_sum( + tf.cast(tf.math.is_inf(hid_states_1), tf.dtypes.int32)) == 8) + self.assertTrue(tf.math.reduce_sum( + tf.cast(tf.math.is_inf(hid_states_2), tf.dtypes.int32)) == 24) + self.assertTrue(tf.math.reduce_sum( + tf.cast(tf.math.is_inf(hid_states_3), tf.dtypes.int32)) == 24) + self.assertTrue(tf.math.reduce_sum( + tf.cast(tf.math.is_inf(hid_states_4), tf.dtypes.int32)) == 12) def test_chunk(self): hidden_states = self._get_hidden_states() batch_size = 1 seq_length = 8 hidden_size = 4 - hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size)) + hidden_states = tf.reshape(hidden_states, + (batch_size, seq_length, hidden_size)) - chunked_hidden_states = 
longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2) + chunked_hidden_states = longformer_attention.LongformerAttention._chunk( + hidden_states, window_overlap=2) # expected slices across chunk and seq length dim - expected_slice_along_seq_length = tf.convert_to_tensor([0.4983, -0.7584, -1.6944], dtype=tf.dtypes.float32) - expected_slice_along_chunk = tf.convert_to_tensor([0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.dtypes.float32) + expected_slice_along_seq_length = tf.convert_to_tensor( + [0.4983, -0.7584, -1.6944], dtype=tf.dtypes.float32) + expected_slice_along_chunk = tf.convert_to_tensor( + [0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.dtypes.float32) self.assertTrue(get_shape_list(chunked_hidden_states) == [1, 3, 4, 4]) - tf.debugging.assert_near(chunked_hidden_states[0, :, 0, 0], expected_slice_along_seq_length, rtol=1e-3) - tf.debugging.assert_near(chunked_hidden_states[0, 0, :, 0], expected_slice_along_chunk, rtol=1e-3) + tf.debugging.assert_near(chunked_hidden_states[0, :, 0, 0], + expected_slice_along_seq_length, rtol=1e-3) + tf.debugging.assert_near(chunked_hidden_states[0, 0, :, 0], + expected_slice_along_chunk, rtol=1e-3) def test_layer_local_attn(self): hidden_states = self._get_hidden_states() - batch_size, seq_length, hidden_size = hidden_states.shape + batch_size, seq_length, _ = hidden_states.shape layer = longformer_attention.LongformerAttention( num_heads=2, key_dim=4, @@ -203,14 +239,15 @@ class LongformerAttentionTest(keras_parameterized.TestCase): attention_mask = tf.zeros((batch_size, seq_length), dtype=tf.dtypes.float32) is_index_global_attn = tf.math.greater(attention_mask, 1) - is_global_attn = tf.math.reduce_any(is_index_global_attn) - attention_mask = tf.where(tf.range(4)[None, :, None, None] > 1, -10000.0, attention_mask[:, :, None, None]) + attention_mask = tf.where(tf.range(4)[None, :, None, None] > 1, -10000.0, + attention_mask[:, :, None, None]) is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0) output_hidden_states = layer( hidden_states=hidden_states, attention_mask=attention_mask, - is_index_masked=is_index_masked, is_index_global_attn=is_index_global_attn, is_global_attn=is_global_attn, + is_index_masked=is_index_masked, + is_index_global_attn=is_index_global_attn, )[0] self.assertTrue(output_hidden_states.shape, (1, 4, 8)) @@ -226,32 +263,33 @@ class LongformerAttentionTest(keras_parameterized.TestCase): ) hidden_states = self._get_hidden_states() - hidden_states = tf.concat([self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0) + hidden_states = tf.concat( + [self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0) batch_size, seq_length, hidden_size = hidden_states.shape # create attn mask attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32) attention_mask_2 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32) - attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] == 0, 10000.0, attention_mask_1) - attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 2, -10000.0, attention_mask_1) - attention_mask_2 = tf.where(tf.range(4)[None, :, None, None] == 0, 10000.0, attention_mask_2) + attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] == 0, 10000.0, + attention_mask_1) + attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 2, -10000.0, + attention_mask_1) + attention_mask_2 = tf.where(tf.range(4)[None, :, None, None] == 0, 10000.0, + attention_mask_2) attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0) 
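A small standalone illustration (assumption: toy seq_len of 4, not part of the test) of how a signed mask like the one built above translates into the boolean index tensors that the attention layer consumes:

```python
import tensorflow as tf

# Mirrors the mask construction in the test above:
# +10000 marks tokens that receive global attention, -10000 marks masked tokens.
seq_len = 4
mask = tf.zeros((1, 1, 1, seq_len))
mask = tf.where(tf.range(seq_len)[None, :, None, None] == 0, 10000.0, mask)
mask = tf.where(tf.range(seq_len)[None, :, None, None] > 2, -10000.0, mask)

# The layer reads the per-token sign back out of the broadcasted mask.
is_index_masked = tf.math.less(mask[:, :, 0, 0], 0)          # [[False, False, False, True]]
is_index_global_attn = tf.math.greater(mask[:, :, 0, 0], 0)  # [[True, False, False, False]]
```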
is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0) is_index_global_attn = tf.math.greater(attention_mask[:, :, 0, 0], 0) - is_global_attn = tf.math.reduce_any(is_index_global_attn) output_hidden_states = layer( - hidden_states=hidden_states, attention_mask=-tf.math.abs(attention_mask), - is_index_masked=is_index_masked, is_index_global_attn=is_index_global_attn, is_global_attn=is_global_attn, - )[0] + hidden_states=hidden_states, attention_mask=-tf.math.abs(attention_mask), + is_index_masked=is_index_masked, + is_index_global_attn=is_index_global_attn, + )[0] self.assertTrue(output_hidden_states.shape, (2, 4, 8)) -if __name__ == "__main__": - np.random.seed(0) - tf.random.set_seed(0) +if __name__ == '__main__': tf.test.main() - diff --git a/official/projects/longformer/longformer_encoder.py b/official/projects/longformer/longformer_encoder.py index 4764a0761..a2cb672ee 100644 --- a/official/projects/longformer/longformer_encoder.py +++ b/official/projects/longformer/longformer_encoder.py @@ -23,29 +23,16 @@ from absl import logging import tensorflow as tf from official.nlp.modeling import layers -from official.projects.longformer.longformer_encoder_block import LongformerEncoderBlock +from official.projects.longformer.longformer_encoder_block import \ + LongformerEncoderBlock from official.modeling.tf_utils import get_shape_list - - _Initializer = Union[str, tf.keras.initializers.Initializer] _approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True) -# Transferred from huggingface.longformer.TFLongformerMainLayer & TFLongformerEncoder class LongformerEncoder(tf.keras.layers.Layer): - """Bi-directional Transformer-based encoder network. - - This network implements a bi-directional Transformer-based encoder as - described in "BERT: Pre-training of Deep Bidirectional Transformers for - Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the - embedding lookups and transformer layers, but not the masked language model - or classification task networks. - - The default values for this object are taken from the BERT-Base implementation - in "BERT: Pre-training of Deep Bidirectional Transformers for Language - Understanding". - + """LongformerEncoder Args: vocab_size: The size of the token vocabulary. attention_window: list of ints representing the window size for each layer. 
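As a quick usage sketch (mirroring the unit test added in this patch; the exact sizes are illustrative), the encoder takes a per-layer `attention_window` list plus a `global_attention_size`, and is called on the standard BERT-style input dict:

```python
import numpy as np
from official.projects.longformer.longformer_encoder import LongformerEncoder

encoder = LongformerEncoder(
    vocab_size=1024,
    attention_window=[32],      # one window size per layer (here num_layers=1)
    global_attention_size=1,    # the first token of every example attends globally
    hidden_size=256,
    num_layers=1,
    num_attention_heads=4,
    max_sequence_length=512)

batch_size, seq_len = 2, 128
inputs = {
    'input_word_ids': np.random.randint(1024, size=(batch_size, seq_len), dtype=np.int32),
    'input_mask': np.ones((batch_size, seq_len), dtype=np.int32),
    'input_type_ids': np.zeros((batch_size, seq_len), dtype=np.int32),
}
outputs = encoder(inputs)
# outputs['sequence_output'] -> (2, 128, 256), outputs['pooled_output'] -> (2, 256)
```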
@@ -85,27 +72,27 @@ class LongformerEncoder(tf.keras.layers.Layer): """ def __init__( - self, - vocab_size: int, - attention_window: Union[List[int], int] = 512, - global_attention_size: int = 0, - pad_token_id: int = 1, - hidden_size: int = 768, - num_layers: int = 12, - num_attention_heads: int = 12, - max_sequence_length: int = 512, - type_vocab_size: int = 16, - inner_dim: int = 3072, - inner_activation: Callable[..., Any] = _approx_gelu, - output_dropout: float = 0.1, - attention_dropout: float = 0.1, - initializer: _Initializer = tf.keras.initializers.TruncatedNormal( - stddev=0.02), - output_range: Optional[int] = None, - embedding_width: Optional[int] = None, - embedding_layer: Optional[tf.keras.layers.Layer] = None, - norm_first: bool = False, - **kwargs): + self, + vocab_size: int, + attention_window: Union[List[int], int] = 512, + global_attention_size: int = 0, + pad_token_id: int = 1, + hidden_size: int = 768, + num_layers: int = 12, + num_attention_heads: int = 12, + max_sequence_length: int = 512, + type_vocab_size: int = 16, + inner_dim: int = 3072, + inner_activation: Callable[..., Any] = _approx_gelu, + output_dropout: float = 0.1, + attention_dropout: float = 0.1, + initializer: _Initializer = tf.keras.initializers.TruncatedNormal( + stddev=0.02), + output_range: Optional[int] = None, + embedding_width: Optional[int] = None, + embedding_layer: Optional[tf.keras.layers.Layer] = None, + norm_first: bool = False, + **kwargs): super().__init__(**kwargs) # Longformer args self._attention_window = attention_window @@ -120,93 +107,91 @@ class LongformerEncoder(tf.keras.layers.Layer): if embedding_layer is None: self._embedding_layer = layers.OnDeviceEmbedding( - vocab_size=vocab_size, - embedding_width=embedding_width, - initializer=initializer, - name='word_embeddings') + vocab_size=vocab_size, + embedding_width=embedding_width, + initializer=initializer, + name='word_embeddings') else: self._embedding_layer = embedding_layer self._position_embedding_layer = layers.PositionEmbedding( - initializer=initializer, - max_length=max_sequence_length, - name='position_embedding') + initializer=initializer, + max_length=max_sequence_length, + name='position_embedding') self._type_embedding_layer = layers.OnDeviceEmbedding( - vocab_size=type_vocab_size, - embedding_width=embedding_width, - initializer=initializer, - use_one_hot=True, - name='type_embeddings') + vocab_size=type_vocab_size, + embedding_width=embedding_width, + initializer=initializer, + use_one_hot=True, + name='type_embeddings') self._embedding_norm_layer = tf.keras.layers.LayerNormalization( - name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32) + name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32) self._embedding_dropout = tf.keras.layers.Dropout( - rate=output_dropout, name='embedding_dropout') + rate=output_dropout, name='embedding_dropout') # We project the 'embedding' output to 'hidden_size' if it is not already # 'hidden_size'. 
self._embedding_projection = None if embedding_width != hidden_size: self._embedding_projection = tf.keras.layers.experimental.EinsumDense( - '...x,xy->...y', - output_shape=hidden_size, - bias_axes='y', - kernel_initializer=initializer, - name='embedding_projection') + '...x,xy->...y', + output_shape=hidden_size, + bias_axes='y', + kernel_initializer=initializer, + name='embedding_projection') self._transformer_layers = [] self._attention_mask_layer = layers.SelfAttentionMask( - name='self_attention_mask') + name='self_attention_mask') for i in range(num_layers): layer = LongformerEncoderBlock( - global_attention_size=global_attention_size, - num_attention_heads=num_attention_heads, - inner_dim=inner_dim, - inner_activation=inner_activation, - # Longformer, instead of passing a list of attention_window, pass a value to sub-block - attention_window=attention_window if isinstance(attention_window, int) else attention_window[i], - layer_id=i, - output_dropout=output_dropout, - attention_dropout=attention_dropout, - norm_first=norm_first, - output_range=output_range if i == num_layers - 1 else None, - kernel_initializer=initializer, - name='transformer/layer_%d' % i) + global_attention_size=global_attention_size, + num_attention_heads=num_attention_heads, + inner_dim=inner_dim, + inner_activation=inner_activation, + attention_window=attention_window[i], + layer_id=i, + output_dropout=output_dropout, + attention_dropout=attention_dropout, + norm_first=norm_first, + output_range=output_range if i == num_layers - 1 else None, + kernel_initializer=initializer, + name=f'transformer/layer_{i}') self._transformer_layers.append(layer) self._pooler_layer = tf.keras.layers.Dense( - units=hidden_size, - activation='tanh', - kernel_initializer=initializer, - name='pooler_transform') + units=hidden_size, + activation='tanh', + kernel_initializer=initializer, + name='pooler_transform') self._config = { - 'vocab_size': vocab_size, - 'hidden_size': hidden_size, - 'num_layers': num_layers, - 'num_attention_heads': num_attention_heads, - 'max_sequence_length': max_sequence_length, - 'type_vocab_size': type_vocab_size, - 'inner_dim': inner_dim, - 'inner_activation': tf.keras.activations.serialize(activation), - 'output_dropout': output_dropout, - 'attention_dropout': attention_dropout, - 'initializer': tf.keras.initializers.serialize(initializer), - 'output_range': output_range, - 'embedding_width': embedding_width, - 'embedding_layer': embedding_layer, - 'norm_first': norm_first, - # Longformer - 'attention_window': attention_window, - 'global_attention_size': global_attention_size, - 'pad_token_id': pad_token_id, + 'vocab_size': vocab_size, + 'hidden_size': hidden_size, + 'num_layers': num_layers, + 'num_attention_heads': num_attention_heads, + 'max_sequence_length': max_sequence_length, + 'type_vocab_size': type_vocab_size, + 'inner_dim': inner_dim, + 'inner_activation': tf.keras.activations.serialize(activation), + 'output_dropout': output_dropout, + 'attention_dropout': attention_dropout, + 'initializer': tf.keras.initializers.serialize(initializer), + 'output_range': output_range, + 'embedding_width': embedding_width, + 'embedding_layer': embedding_layer, + 'norm_first': norm_first, + 'attention_window': attention_window, + 'global_attention_size': global_attention_size, + 'pad_token_id': pad_token_id, } self.inputs = dict( - input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32), - input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32), - input_type_ids=tf.keras.Input(shape=(None,), 
dtype=tf.int32)) + input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32), + input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32), + input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32)) def call(self, inputs): word_embeddings = None @@ -214,22 +199,23 @@ class LongformerEncoder(tf.keras.layers.Layer): word_ids = inputs.get('input_word_ids') # input_ids mask = inputs.get('input_mask') # attention_mask type_ids = inputs.get('input_type_ids') # token_type_ids - word_embeddings = inputs.get('input_word_embeddings', None) # input_embeds + word_embeddings = inputs.get('input_word_embeddings', + None) # input_embeds else: - raise ValueError('Unexpected inputs type to %s.' % self.__class__) + raise ValueError(f'Unexpected inputs type to {self.__class__}.') ( - padding_len, - word_ids, - mask, - type_ids, - word_embeddings, + padding_len, + word_ids, + mask, + type_ids, + word_embeddings, ) = self._pad_to_window_size( - word_ids=word_ids, - mask=mask, - type_ids=type_ids, - word_embeddings=word_embeddings, - pad_token_id=self._pad_token_id + word_ids=word_ids, + mask=mask, + type_ids=type_ids, + word_embeddings=word_embeddings, + pad_token_id=self._pad_token_id ) if word_embeddings is None: @@ -247,46 +233,47 @@ class LongformerEncoder(tf.keras.layers.Layer): batch_size, seq_len = get_shape_list(mask) # create masks with fixed len global_attention_size - mask = tf.transpose(tf.concat(values=[tf.ones((self._global_attention_size, batch_size), tf.int32) * 2, - tf.transpose(mask)[self._global_attention_size:]], axis=0)) + mask = tf.transpose(tf.concat( + values=[tf.ones((self._global_attention_size, batch_size), tf.int32) * 2, + tf.transpose(mask)[self._global_attention_size:]], axis=0)) is_index_masked = tf.math.less(mask, 1) is_index_global_attn = tf.transpose(tf.concat(values=[ - tf.ones((self._global_attention_size, batch_size), tf.bool), tf.zeros((seq_len - self._global_attention_size, - batch_size), tf.bool) + tf.ones((self._global_attention_size, batch_size), tf.bool), + tf.zeros((seq_len - self._global_attention_size, + batch_size), tf.bool) ], axis=0)) - is_global_attn = self._global_attention_size > 0 # Longformer attention_mask = mask extended_attention_mask = tf.reshape( - attention_mask, (tf.shape(mask)[0], tf.shape(mask)[1], 1, 1) + attention_mask, (tf.shape(mask)[0], tf.shape(mask)[1], 1, 1) ) - attention_mask = tf.cast(tf.math.abs(1 - extended_attention_mask), tf.dtypes.float32) * -10000.0 + attention_mask = tf.cast(tf.math.abs(1 - extended_attention_mask), + tf.dtypes.float32) * -10000.0 encoder_outputs = [] x = embeddings # TFLongformerEncoder - for i, layer in enumerate(self._transformer_layers): + for layer in self._transformer_layers: x = layer([ - x, - attention_mask, - is_index_masked, - is_index_global_attn, - is_global_attn]) + x, + attention_mask, + is_index_masked, + is_index_global_attn]) encoder_outputs.append(x) last_encoder_output = encoder_outputs[-1] if padding_len > 0: - last_encoder_output = last_encoder_output[:, :-padding_len] + last_encoder_output = last_encoder_output[:, :-padding_len] first_token_tensor = last_encoder_output[:, 0, :] pooled_output = self._pooler_layer(first_token_tensor) return dict( - sequence_output=last_encoder_output, - pooled_output=pooled_output, - encoder_outputs=encoder_outputs) + sequence_output=last_encoder_output, + pooled_output=pooled_output, + encoder_outputs=encoder_outputs) def get_embedding_table(self): return self._embedding_layer.embeddings @@ -311,36 +298,36 @@ class 
LongformerEncoder(tf.keras.layers.Layer): def from_config(cls, config, custom_objects=None): if 'embedding_layer' in config and config['embedding_layer'] is not None: warn_string = ( - 'You are reloading a model that was saved with a ' - 'potentially-shared embedding layer object. If you contine to ' - 'train this model, the embedding layer will no longer be shared. ' - 'To work around this, load the model outside of the Keras API.') + 'You are reloading a model that was saved with a ' + 'potentially-shared embedding layer object. If you contine to ' + 'train this model, the embedding layer will no longer be shared. ' + 'To work around this, load the model outside of the Keras API.') print('WARNING: ' + warn_string) logging.warn(warn_string) return cls(**config) def _pad_to_window_size( - self, - word_ids, - mask, - type_ids, - word_embeddings, - pad_token_id, + self, + word_ids, + mask, + type_ids, + word_embeddings, + pad_token_id, ): - """A helper function to pad tokens and mask to work with implementation of Longformer selfattention.""" # padding - attention_window = ( - self._attention_window if isinstance(self._attention_window, int) else max(self._attention_window) - ) + attention_window = max(self._attention_window) - assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}" + assert attention_window % 2 == 0, \ + f'`attention_window` should be an even value. Given {attention_window}' - input_shape = get_shape_list(word_ids) if word_ids is not None else get_shape_list(word_embeddings) + input_shape = get_shape_list( + word_ids) if word_ids is not None else get_shape_list(word_embeddings) batch_size, seq_len = input_shape[:2] - + if seq_len is not None: - padding_len = (attention_window - seq_len % attention_window) % attention_window + padding_len = (attention_window - + seq_len % attention_window) % attention_window else: padding_len = 0 @@ -355,14 +342,17 @@ class LongformerEncoder(tf.keras.layers.Layer): word_embeddings_padding = self._embedding_layer(word_ids_padding) return tf.concat([word_embeddings, word_embeddings_padding], axis=-2) - word_embeddings = tf.cond(tf.math.greater(padding_len, 0), pad_embeddings, lambda: word_embeddings) + word_embeddings = tf.cond(tf.math.greater(padding_len, 0), pad_embeddings, + lambda: word_embeddings) - mask = tf.pad(mask, paddings, constant_values=False) # no attention on the padding tokens - token_type_ids = tf.pad(type_ids, paddings, constant_values=0) # pad with token_type_id = 0 + mask = tf.pad(mask, paddings, + constant_values=False) # no attention on the padding tokens + token_type_ids = tf.pad(type_ids, paddings, + constant_values=0) # pad with token_type_id = 0 return ( - padding_len, - word_ids, - mask, - token_type_ids, - word_embeddings,) + padding_len, + word_ids, + mask, + token_type_ids, + word_embeddings,) diff --git a/official/projects/longformer/longformer_encoder_block.py b/official/projects/longformer/longformer_encoder_block.py index 2fb78c888..149beb7cf 100644 --- a/official/projects/longformer/longformer_encoder_block.py +++ b/official/projects/longformer/longformer_encoder_block.py @@ -17,49 +17,13 @@ Longformer attention layer. 
Modified From huggingface/transformers """ import tensorflow as tf -from official.projects.longformer.longformer_attention import LongformerAttention +from official.projects.longformer.longformer_attention import \ + LongformerAttention + @tf.keras.utils.register_keras_serializable(package="Text") class LongformerEncoderBlock(tf.keras.layers.Layer): - """TransformerEncoderBlock layer. - - This layer implements the Transformer Encoder from - "Attention Is All You Need". (https://arxiv.org/abs/1706.03762), - which combines a `tf.keras.layers.MultiHeadAttention` layer with a - two-layer feedforward network. - - References: - [Attention Is All You Need](https://arxiv.org/abs/1706.03762) - [BERT: Pre-training of Deep Bidirectional Transformers for Language - Understanding](https://arxiv.org/abs/1810.04805) - """ - - def __init__(self, - global_attention_size, - num_attention_heads, - inner_dim, - inner_activation, - # Longformer - attention_window, - layer_id=0, - output_range=None, - kernel_initializer="glorot_uniform", - bias_initializer="zeros", - kernel_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - bias_constraint=None, - use_bias=True, - norm_first=False, - norm_epsilon=1e-12, - output_dropout=0.0, - attention_dropout=0.0, - inner_dropout=0.0, - attention_initializer=None, - attention_axes=None, - **kwargs): - """Initializes `TransformerEncoderBlock`. + """LongformerEncoderBlock. Args: num_attention_heads: Number of attention heads. @@ -94,6 +58,32 @@ class LongformerEncoderBlock(tf.keras.layers.Layer): attention over all axes, but batch, heads, and features. **kwargs: keyword arguments/ """ + + def __init__(self, + global_attention_size, + num_attention_heads, + inner_dim, + inner_activation, + # Longformer + attention_window, + layer_id=0, + output_range=None, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + use_bias=True, + norm_first=False, + norm_epsilon=1e-12, + output_dropout=0.0, + attention_dropout=0.0, + inner_dropout=0.0, + attention_initializer=None, + attention_axes=None, + **kwargs): super().__init__(**kwargs) self.global_attention_size = global_attention_size @@ -121,7 +111,7 @@ class LongformerEncoderBlock(tf.keras.layers.Layer): self._inner_dropout = inner_dropout if attention_initializer: self._attention_initializer = tf.keras.initializers.get( - attention_initializer) + attention_initializer) else: self._attention_initializer = self._kernel_initializer self._attention_axes = attention_axes @@ -133,58 +123,58 @@ class LongformerEncoderBlock(tf.keras.layers.Layer): input_tensor_shape = tf.TensorShape(input_shape[0]) else: raise ValueError( - "The type of input shape argument is not supported, got: %s" % - type(input_shape)) + f"The type of input shape argument is not supported, got: " + f"{type(input_shape)}") einsum_equation = "abc,cd->abd" if len(input_tensor_shape.as_list()) > 3: einsum_equation = "...bc,cd->...bd" hidden_size = input_tensor_shape[-1] if hidden_size % self._num_heads != 0: raise ValueError( - "The input size (%d) is not a multiple of the number of attention " - "heads (%d)" % (hidden_size, self._num_heads)) + f"The input size ({hidden_size}) is not a multiple of the number of attention " + f"heads ({self._num_heads})") self._attention_head_size = int(hidden_size // self._num_heads) common_kwargs = dict( - bias_initializer=self._bias_initializer, - 
kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activity_regularizer=self._activity_regularizer, - kernel_constraint=self._kernel_constraint, - bias_constraint=self._bias_constraint) + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) # TFLongformerSelfAttention + TFLongformerSelfOutput.dense self._attention_layer = LongformerAttention( - # Longformer - layer_id=self._layer_id, - global_attention_size=self.global_attention_size, - attention_window=self._attention_window, - num_heads=self._num_heads, - key_dim=self._attention_head_size, - dropout=self._attention_dropout, - use_bias=self._use_bias, - kernel_initializer=self._attention_initializer, - attention_axes=self._attention_axes, - name="self_attention", - **common_kwargs) + # Longformer + layer_id=self._layer_id, + global_attention_size=self.global_attention_size, + attention_window=self._attention_window, + num_heads=self._num_heads, + key_dim=self._attention_head_size, + dropout=self._attention_dropout, + use_bias=self._use_bias, + kernel_initializer=self._attention_initializer, + attention_axes=self._attention_axes, + name="self_attention", + **common_kwargs) # TFLongformerSelfOutput.dropout self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout) # Use float32 in layernorm for numeric stability. # It is probably safe in mixed_float16, but we haven't validated this yet. # TFLongformerSelfOutput.Layernorm self._attention_layer_norm = ( - tf.keras.layers.LayerNormalization( - name="self_attention_layer_norm", - axis=-1, - epsilon=self._norm_epsilon, - dtype=tf.float32)) + tf.keras.layers.LayerNormalization( + name="self_attention_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype=tf.float32)) # TFLongformerIntermediate # TFLongformerIntermediate.dense self._intermediate_dense = tf.keras.layers.experimental.EinsumDense( - einsum_equation, - output_shape=(None, self._inner_dim), - bias_axes="d", - kernel_initializer=self._kernel_initializer, - name="intermediate", - **common_kwargs) + einsum_equation, + output_shape=(None, self._inner_dim), + bias_axes="d", + kernel_initializer=self._kernel_initializer, + name="intermediate", + **common_kwargs) policy = tf.keras.mixed_precision.global_policy() if policy.name == "mixed_bfloat16": # bfloat16 causes BERT with the LAMB optimizer to not converge @@ -193,72 +183,72 @@ class LongformerEncoderBlock(tf.keras.layers.Layer): policy = tf.float32 # TFLongformerIntermediate.intermediate_act_fn self._intermediate_activation_layer = tf.keras.layers.Activation( - self._inner_activation, dtype=policy) + self._inner_activation, dtype=policy) # ??? self._inner_dropout_layer = tf.keras.layers.Dropout( - rate=self._inner_dropout) + rate=self._inner_dropout) # TFLongformerOutput # TFLongformerOutput.dense self._output_dense = tf.keras.layers.experimental.EinsumDense( - einsum_equation, - output_shape=(None, hidden_size), - bias_axes="d", - name="output", - kernel_initializer=self._kernel_initializer, - **common_kwargs) + einsum_equation, + output_shape=(None, hidden_size), + bias_axes="d", + name="output", + kernel_initializer=self._kernel_initializer, + **common_kwargs) # TFLongformerOutput.dropout self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout) # Use float32 in layernorm for numeric stability. 
# TFLongformerOutput.layernorm self._output_layer_norm = tf.keras.layers.LayerNormalization( - name="output_layer_norm", - axis=-1, - epsilon=self._norm_epsilon, - dtype=tf.float32) + name="output_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype=tf.float32) - super(LongformerEncoderBlock, self).build(input_shape) + super().build(input_shape) def get_config(self): config = { - "num_attention_heads": - self._num_heads, - "inner_dim": - self._inner_dim, - "inner_activation": - self._inner_activation, - "output_dropout": - self._output_dropout_rate, - "attention_dropout": - self._attention_dropout_rate, - "output_range": - self._output_range, - "kernel_initializer": - tf.keras.initializers.serialize(self._kernel_initializer), - "bias_initializer": - tf.keras.initializers.serialize(self._bias_initializer), - "kernel_regularizer": - tf.keras.regularizers.serialize(self._kernel_regularizer), - "bias_regularizer": - tf.keras.regularizers.serialize(self._bias_regularizer), - "activity_regularizer": - tf.keras.regularizers.serialize(self._activity_regularizer), - "kernel_constraint": - tf.keras.constraints.serialize(self._kernel_constraint), - "bias_constraint": - tf.keras.constraints.serialize(self._bias_constraint), - "use_bias": - self._use_bias, - "norm_first": - self._norm_first, - "norm_epsilon": - self._norm_epsilon, - "inner_dropout": - self._inner_dropout, - "attention_initializer": - tf.keras.initializers.serialize(self._attention_initializer), - "attention_axes": self._attention_axes, + "num_attention_heads": + self._num_heads, + "inner_dim": + self._inner_dim, + "inner_activation": + self._inner_activation, + "output_dropout": + self._output_dropout_rate, + "attention_dropout": + self._attention_dropout_rate, + "output_range": + self._output_range, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint), + "use_bias": + self._use_bias, + "norm_first": + self._norm_first, + "norm_epsilon": + self._norm_epsilon, + "inner_dropout": + self._inner_dropout, + "attention_initializer": + tf.keras.initializers.serialize(self._attention_initializer), + "attention_axes": self._attention_axes, } - base_config = super(LongformerEncoderBlock, self).get_config() + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def call(self, inputs): @@ -277,26 +267,23 @@ class LongformerEncoderBlock(tf.keras.layers.Layer): An output tensor with the same dimensions as input/query tensor. 
""" if isinstance(inputs, (list, tuple)): - if len(inputs) == 5: + if len(inputs) == 4: ( input_tensor, attention_mask, is_index_masked, is_index_global_attn, - is_global_attn ) = inputs key_value = None - elif len(inputs) == 6: + elif len(inputs) == 5: assert False # No key_value else: - raise ValueError("Unexpected inputs to %s with length at %d" % - (self.__class__, len(inputs))) + raise ValueError(f"Unexpected inputs to {self.__class__} with length at {len(inputs)}") else: input_tensor = inputs attention_mask = None is_index_masked = None is_index_global_attn = None - is_global_attn = None key_value = None if self._output_range: @@ -325,11 +312,10 @@ class LongformerEncoderBlock(tf.keras.layers.Layer): # attention_output = self._attention_layer( # query=target_tensor, value=key_value, attention_mask=attention_mask) attention_output = self._attention_layer( - hidden_states=target_tensor, - attention_mask=attention_mask, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn + hidden_states=target_tensor, + attention_mask=attention_mask, + is_index_masked=is_index_masked, + is_index_global_attn=is_index_global_attn, ) # TFLongformerAttention.TFLongformerSelfOutput.* - {.dense} attention_output = self._attention_dropout(attention_output) diff --git a/official/projects/longformer/longformer_encoder_test.py b/official/projects/longformer/longformer_encoder_test.py index e8a1bf78f..88a17348f 100644 --- a/official/projects/longformer/longformer_encoder_test.py +++ b/official/projects/longformer/longformer_encoder_test.py @@ -12,44 +12,55 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for official.nlp.projects.bigbird.encoder.""" +"""Tests for official.nlp.projects.longformer.longformer_encoder.""" import numpy as np import tensorflow as tf from absl.testing import parameterized -from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from tensorflow.python.keras import \ + keras_parameterized # pylint: disable=g-direct-tensorflow-import from tensorflow.python.distribute import combinations from official.projects.longformer.longformer_encoder import LongformerEncoder + @keras_parameterized.run_all_keras_modes class LongformerEncoderTest(keras_parameterized.TestCase): - + + def setUp(self): + super(LongformerEncoderTest, self).setUp() + np.random.seed(0) + tf.random.set_seed(0) + @combinations.generate(combinations.combine( - attention_window=[32, 128], global_attention_size=[0, 1, 2])) + attention_window=[32, 128], global_attention_size=[0, 1, 2])) def test_encoder(self, attention_window, global_attention_size): sequence_length = 128 batch_size = 2 vocab_size = 1024 - hidden_size=256 + hidden_size = 256 network = LongformerEncoder( - global_attention_size=global_attention_size, - vocab_size=vocab_size, - attention_window=attention_window, - hidden_size=hidden_size, - num_layers=1, - num_attention_heads=4, - max_sequence_length=512) - word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length), dtype=np.int32) - mask_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32) - type_id_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32) + global_attention_size=global_attention_size, + vocab_size=vocab_size, + attention_window=[attention_window], + hidden_size=hidden_size, + num_layers=1, + num_attention_heads=4, + max_sequence_length=512) + word_id_data = 
np.random.randint(vocab_size, + size=(batch_size, sequence_length), + dtype=np.int32) + mask_data = np.random.randint(2, size=(batch_size, sequence_length), + dtype=np.int32) + type_id_data = np.random.randint(2, size=(batch_size, sequence_length), + dtype=np.int32) inputs = { - 'input_word_ids': word_id_data, - 'input_mask': mask_data, - 'input_type_ids': type_id_data, + 'input_word_ids': word_id_data, + 'input_mask': mask_data, + 'input_type_ids': type_id_data, } outputs = network(inputs) - self.assertEqual(outputs["sequence_output"].shape, + self.assertEqual(outputs['sequence_output'].shape, (batch_size, sequence_length, hidden_size)) @combinations.generate(combinations.combine( @@ -60,26 +71,30 @@ class LongformerEncoderTest(keras_parameterized.TestCase): vocab_size = 1024 hidden_size = 256 network = LongformerEncoder( - global_attention_size=global_attention_size, - vocab_size=vocab_size, - attention_window=32, - hidden_size=hidden_size, - num_layers=1, - num_attention_heads=4, - max_sequence_length=512, - norm_first=norm_first) - word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length), dtype=np.int32) - mask_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32) - type_id_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32) + global_attention_size=global_attention_size, + vocab_size=vocab_size, + attention_window=[32], + hidden_size=hidden_size, + num_layers=1, + num_attention_heads=4, + max_sequence_length=512, + norm_first=norm_first) + word_id_data = np.random.randint(vocab_size, + size=(batch_size, sequence_length), + dtype=np.int32) + mask_data = np.random.randint(2, size=(batch_size, sequence_length), + dtype=np.int32) + type_id_data = np.random.randint(2, size=(batch_size, sequence_length), + dtype=np.int32) inputs = { - 'input_word_ids': word_id_data, - 'input_mask': mask_data, - 'input_type_ids': type_id_data, + 'input_word_ids': word_id_data, + 'input_mask': mask_data, + 'input_type_ids': type_id_data, } outputs = network(inputs) - self.assertEqual(outputs["sequence_output"].shape, + self.assertEqual(outputs['sequence_output'].shape, (batch_size, sequence_length, hidden_size)) -if __name__ == "__main__": - tf.test.main() \ No newline at end of file +if __name__ == '__main__': + tf.test.main() diff --git a/official/projects/longformer/longformer_experiments.py b/official/projects/longformer/longformer_experiments.py index 49307acaf..6ce47c119 100644 --- a/official/projects/longformer/longformer_experiments.py +++ b/official/projects/longformer/longformer_experiments.py @@ -34,84 +34,90 @@ AdamWeightDecay = optimization.AdamWeightDecayConfig PolynomialLr = optimization.PolynomialLrConfig PolynomialWarmupConfig = optimization.PolynomialWarmupConfig + @dataclasses.dataclass class LongformerOptimizationConfig(optimization.OptimizationConfig): optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig( - type="adamw", - adamw=AdamWeightDecay( - weight_decay_rate=0.01, - exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], - epsilon=1e-6)) + type='adamw', + adamw=AdamWeightDecay( + weight_decay_rate=0.01, + exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'], + epsilon=1e-6)) learning_rate: optimization.LrConfig = optimization.LrConfig( - type="polynomial", - polynomial=PolynomialLr( - initial_learning_rate=1e-4, - decay_steps=1000000, - end_learning_rate=0.0)) + type='polynomial', + polynomial=PolynomialLr( + initial_learning_rate=1e-4, + decay_steps=1000000, + 
end_learning_rate=0.0)) warmup: optimization.WarmupConfig = optimization.WarmupConfig( - type="polynomial", polynomial=PolynomialWarmupConfig(warmup_steps=10000)) + type='polynomial', polynomial=PolynomialWarmupConfig(warmup_steps=10000)) + @exp_factory.register_config_factory('longformer/pretraining') def longformer_pretraining() -> cfg.ExperimentConfig: """BERT pretraining experiment.""" config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(enable_xla=True), - task=masked_lm.MaskedLMConfig( - model=bert.PretrainerConfig( - encoder=encoders.EncoderConfig( - type="any", any=LongformerEncoderConfig()), - cls_heads=[ - bert.ClsHeadConfig( - inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence') - ] - ), - train_data=pretrain_dataloader.BertPretrainDataConfig(use_v2_feature_names=True), - validation_data=pretrain_dataloader.BertPretrainDataConfig(use_v2_feature_names=True, - is_training=False)), - trainer=cfg.TrainerConfig( - optimizer_config=LongformerOptimizationConfig(), train_steps=1000000), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) + runtime=cfg.RuntimeConfig(enable_xla=True), + task=masked_lm.MaskedLMConfig( + model=bert.PretrainerConfig( + encoder=encoders.EncoderConfig( + type="any", any=LongformerEncoderConfig()), + cls_heads=[ + bert.ClsHeadConfig( + inner_dim=768, num_classes=2, dropout_rate=0.1, + name='next_sentence') + ] + ), + train_data=pretrain_dataloader.BertPretrainDataConfig( + use_v2_feature_names=True), + validation_data=pretrain_dataloader.BertPretrainDataConfig( + use_v2_feature_names=True, + is_training=False)), + trainer=cfg.TrainerConfig( + optimizer_config=LongformerOptimizationConfig(), train_steps=1000000), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) return config + @exp_factory.register_config_factory('longformer/glue') def longformer_glue() -> cfg.ExperimentConfig: config = cfg.ExperimentConfig( - task=sentence_prediction.SentencePredictionConfig( - model=sentence_prediction.ModelConfig( - encoder=encoders.EncoderConfig( - type="any", any=LongformerEncoderConfig())), - train_data=sentence_prediction_dataloader - .SentencePredictionDataConfig(), - validation_data=sentence_prediction_dataloader - .SentencePredictionDataConfig( - is_training=False, drop_remainder=False)), - trainer=cfg.TrainerConfig( - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'adamw', - 'adamw': { - 'weight_decay_rate': - 0.01, - 'exclude_from_weight_decay': - ['LayerNorm', 'layer_norm', 'bias'], - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 3e-5, - 'end_learning_rate': 0.0, - } - }, - 'warmup': { - 'type': 'polynomial' - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) + task=sentence_prediction.SentencePredictionConfig( + model=sentence_prediction.ModelConfig( + encoder=encoders.EncoderConfig( + type="any", any=LongformerEncoderConfig())), + train_data=sentence_prediction_dataloader + .SentencePredictionDataConfig(), + validation_data=sentence_prediction_dataloader + .SentencePredictionDataConfig( + is_training=False, drop_remainder=False)), + trainer=cfg.TrainerConfig( + optimizer_config=optimization.OptimizationConfig({ + 'optimizer': { + 'type': 'adamw', + 'adamw': { + 'weight_decay_rate': + 0.01, + 'exclude_from_weight_decay': + ['LayerNorm', 'layer_norm', 'bias'], + } + }, + 
'learning_rate': { + 'type': 'polynomial', + 'polynomial': { + 'initial_learning_rate': 3e-5, + 'end_learning_rate': 0.0, + } + }, + 'warmup': { + 'type': 'polynomial' + } + })), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) return config diff --git a/official/projects/longformer/train.py b/official/projects/longformer/train.py index 91e8b516e..39fa3bb41 100644 --- a/official/projects/longformer/train.py +++ b/official/projects/longformer/train.py @@ -24,7 +24,6 @@ from official.core import task_factory from official.core import train_lib from official.core import train_utils from official.modeling import performance -from official.projects.longformer import longformer_experiments FLAGS = flags.FLAGS @@ -43,23 +42,24 @@ def main(_): # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when # dtype is float16 if params.runtime.mixed_precision_dtype: - performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype) + performance.set_mixed_precision_policy( + params.runtime.mixed_precision_dtype) distribution_strategy = distribute_utils.get_distribution_strategy( - distribution_strategy=params.runtime.distribution_strategy, - all_reduce_alg=params.runtime.all_reduce_alg, - num_gpus=params.runtime.num_gpus, - tpu_address=params.runtime.tpu, - **params.runtime.model_parallelism()) + distribution_strategy=params.runtime.distribution_strategy, + all_reduce_alg=params.runtime.all_reduce_alg, + num_gpus=params.runtime.num_gpus, + tpu_address=params.runtime.tpu, + **params.runtime.model_parallelism()) with distribution_strategy.scope(): task = task_factory.get_task(params.task, logging_dir=model_dir) train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode=FLAGS.mode, - params=params, - model_dir=model_dir) + distribution_strategy=distribution_strategy, + task=task, + mode=FLAGS.mode, + params=params, + model_dir=model_dir) train_utils.save_gin_config(FLAGS.mode, model_dir) -- GitLab From 259c43470ee9036914eb25a23ae5511b0fb9b175 Mon Sep 17 00:00:00 2001 From: Zihan Wang Date: Wed, 2 Mar 2022 09:44:20 -0800 Subject: [PATCH 13/54] lint 2 --- official/projects/longformer/longformer_encoder_block.py | 1 - 1 file changed, 1 deletion(-) diff --git a/official/projects/longformer/longformer_encoder_block.py b/official/projects/longformer/longformer_encoder_block.py index 149beb7cf..ee1bbe291 100644 --- a/official/projects/longformer/longformer_encoder_block.py +++ b/official/projects/longformer/longformer_encoder_block.py @@ -184,7 +184,6 @@ class LongformerEncoderBlock(tf.keras.layers.Layer): # TFLongformerIntermediate.intermediate_act_fn self._intermediate_activation_layer = tf.keras.layers.Activation( self._inner_activation, dtype=policy) - # ??? self._inner_dropout_layer = tf.keras.layers.Dropout( rate=self._inner_dropout) # TFLongformerOutput -- GitLab From 2c7caa84b81070571495381c488b5cff4a997969 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 16 Mar 2022 05:15:18 -0700 Subject: [PATCH 14/54] Add a doc-generator script for the full TensorFlow models package. This will likely replace the adjacent build_nlp_api_docs and build_vision_api_docs scripts. 
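A minimal programmatic invocation of the new module (mirroring the call made in its unit test; the output directory below is a placeholder, and `tensorflow_models` plus `tensorflow-docs` are assumed to be installed) might look like:

```python
# Minimal sketch, not part of the patch: driving the doc generator added
# below directly from Python. The output directory is a placeholder.
from official.utils.docs import build_all_api_docs

build_all_api_docs.gen_api_docs(
    code_url_prefix=(
        'https://github.com/tensorflow/models/blob/master/tensorflow_models'),
    site_path='/api_docs/python',
    output_dir='/tmp/api_docs',
    project_short_name='tfm',
    project_full_name='TensorFlow Official Models - Modeling Library',
    search_hints=True)
```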
PiperOrigin-RevId: 435019769 --- official/utils/docs/build_all_api_docs.py | 116 ++++++++++++++++++ .../utils/docs/build_all_api_docs_test.py | 50 ++++++++ 2 files changed, 166 insertions(+) create mode 100644 official/utils/docs/build_all_api_docs.py create mode 100644 official/utils/docs/build_all_api_docs_test.py diff --git a/official/utils/docs/build_all_api_docs.py b/official/utils/docs/build_all_api_docs.py new file mode 100644 index 000000000..1c09bfe9b --- /dev/null +++ b/official/utils/docs/build_all_api_docs.py @@ -0,0 +1,116 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Tool to generate api_docs for tensorflow_models/official library. + +Example: + +$> pip install -U git+https://github.com/tensorflow/docs +$> python build_nlp_api_docs \ + --output_dir=/tmp/api_docs +""" + +import pathlib + +from absl import app +from absl import flags +from absl import logging +from tensorflow_docs.api_generator import generate_lib +from tensorflow_docs.api_generator import public_api + +import tensorflow_models as tfm +from official.utils.docs import build_api_docs_lib + +FLAGS = flags.FLAGS + +flags.DEFINE_string('output_dir', None, 'Where to write the resulting docs to.') +flags.DEFINE_string( + 'code_url_prefix', + 'https://github.com/tensorflow/models/blob/master/tensorflow_models', + 'The url prefix for links to code.') + +flags.DEFINE_bool('search_hints', True, + 'Include metadata search hints in the generated files') + +flags.DEFINE_string('site_path', '/api_docs/python', + 'Path prefix in the _toc.yaml') + + +PROJECT_SHORT_NAME = 'tfm' +PROJECT_FULL_NAME = 'TensorFlow Official Models - Modeling Library' + + +def custom_filter(path, parent, children): + if len(path) <= 2: + # Don't filter the contents of the top level `tfm.vision` package. + return children + else: + return public_api.explicit_package_contents_filter(path, parent, children) + + +def gen_api_docs(code_url_prefix, site_path, output_dir, project_short_name, + project_full_name, search_hints): + """Generates api docs for the tensorflow docs package.""" + build_api_docs_lib.hide_module_model_and_layer_methods() + del tfm.nlp.layers.MultiHeadAttention + del tfm.nlp.layers.EinsumDense + + url_parts = code_url_prefix.strip('/').split('/') + url_parts = url_parts[:url_parts.index('tensorflow_models')] + url_parts.append('official') + + official_url_prefix = '/'.join(url_parts) + + tfm_base_dir = pathlib.Path(tfm.__file__).parent + + # The `layers` submodule (and others) are actually defined in the `official` + # package. Find the path to `official`. 
+ official_base_dir = [ + p for p in pathlib.Path(tfm.vision.layers.__file__).parents + if p.name == 'official' + ][0] + + doc_generator = generate_lib.DocGenerator( + root_title=project_full_name, + py_modules=[(project_short_name, tfm)], + base_dir=[tfm_base_dir, official_base_dir], + code_url_prefix=[ + code_url_prefix, + official_url_prefix, + ], + search_hints=search_hints, + site_path=site_path, + callbacks=[custom_filter], + ) + + doc_generator.build(output_dir) + logging.info('Output docs to: %s', output_dir) + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + gen_api_docs( + code_url_prefix=FLAGS.code_url_prefix, + site_path=FLAGS.site_path, + output_dir=FLAGS.output_dir, + project_short_name=PROJECT_SHORT_NAME, + project_full_name=PROJECT_FULL_NAME, + search_hints=FLAGS.search_hints) + + +if __name__ == '__main__': + flags.mark_flag_as_required('output_dir') + app.run(main) diff --git a/official/utils/docs/build_all_api_docs_test.py b/official/utils/docs/build_all_api_docs_test.py new file mode 100644 index 000000000..d84de1c67 --- /dev/null +++ b/official/utils/docs/build_all_api_docs_test.py @@ -0,0 +1,50 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for official.tools.build_docs.""" + +import os +import shutil + +import tensorflow as tf + +from official.utils.docs import build_all_api_docs + + +class BuildDocsTest(tf.test.TestCase): + + def setUp(self): + super(BuildDocsTest, self).setUp() + self.workdir = self.get_temp_dir() + if os.path.exists(self.workdir): + shutil.rmtree(self.workdir) + os.makedirs(self.workdir) + + def test_api_gen(self): + build_all_api_docs.gen_api_docs( + code_url_prefix="https://github.com/tensorflow/models/blob/master/tensorflow_models", + site_path="tf_modeling/api_docs/python", + output_dir=self.workdir, + project_short_name="tfm", + project_full_name="TensorFlow Modeling", + search_hints=True) + + # Check that the "defined in" section is working + with open(os.path.join(self.workdir, "tfm.md")) as f: + content = f.read() + self.assertIn("__init__.py", content) + + +if __name__ == "__main__": + tf.test.main() -- GitLab From 9aee435a564ff59650e921ffeee26708a0f356c6 Mon Sep 17 00:00:00 2001 From: Fan Yang Date: Wed, 16 Mar 2022 09:54:25 -0700 Subject: [PATCH 15/54] Internal change PiperOrigin-RevId: 435078736 --- .../configs/image_classification_test.py | 2 +- .../tasks/image_classification_test.py | 4 ++-- .../configs/image_classification_test.py | 2 +- .../qat/vision/configs/retinanet_test.py | 2 +- .../configs/semantic_segmentation_test.py | 2 +- official/projects/qat/vision/n_bit/schemes.py | 18 +---------------- .../qat/vision/quantization/schemes.py | 20 ++----------------- .../vision/tasks/image_classification_test.py | 2 +- .../qat/vision/tasks/retinanet_test.py | 2 +- 9 files changed, 11 insertions(+), 43 deletions(-) diff --git a/official/projects/pruning/configs/image_classification_test.py b/official/projects/pruning/configs/image_classification_test.py index 36e4cbbfb..844ae24e6 100644 --- a/official/projects/pruning/configs/image_classification_test.py +++ b/official/projects/pruning/configs/image_classification_test.py @@ -18,10 +18,10 @@ from absl.testing import parameterized import tensorflow as tf +from official import vision from official.core import config_definitions as cfg from official.core import exp_factory from official.projects.pruning.configs import image_classification as pruning_exp_cfg -from official.vision import beta from official.vision.configs import image_classification as exp_cfg diff --git a/official/projects/pruning/tasks/image_classification_test.py b/official/projects/pruning/tasks/image_classification_test.py index e06b0242a..dfcb82372 100644 --- a/official/projects/pruning/tasks/image_classification_test.py +++ b/official/projects/pruning/tasks/image_classification_test.py @@ -22,13 +22,13 @@ from absl.testing import parameterized import numpy as np import orbit import tensorflow as tf -import tensorflow_model_optimization as tfmot +import tensorflow_model_optimization as tfmot +from official import vision from official.core import actions from official.core import exp_factory from official.modeling import optimization from official.projects.pruning.tasks import image_classification as img_cls_task -from official.vision import beta class ImageClassificationTaskTest(tf.test.TestCase, parameterized.TestCase): diff --git a/official/projects/qat/vision/configs/image_classification_test.py b/official/projects/qat/vision/configs/image_classification_test.py index 40f4c2692..31208890d 100644 --- a/official/projects/qat/vision/configs/image_classification_test.py +++ b/official/projects/qat/vision/configs/image_classification_test.py @@ -17,11 +17,11 @@ from 
absl.testing import parameterized import tensorflow as tf +from official import vision from official.core import config_definitions as cfg from official.core import exp_factory from official.projects.qat.vision.configs import common from official.projects.qat.vision.configs import image_classification as qat_exp_cfg -from official.vision import beta from official.vision.configs import image_classification as exp_cfg diff --git a/official/projects/qat/vision/configs/retinanet_test.py b/official/projects/qat/vision/configs/retinanet_test.py index 658d88ffa..1a5ac79e8 100644 --- a/official/projects/qat/vision/configs/retinanet_test.py +++ b/official/projects/qat/vision/configs/retinanet_test.py @@ -17,11 +17,11 @@ from absl.testing import parameterized import tensorflow as tf +from official import vision from official.core import config_definitions as cfg from official.core import exp_factory from official.projects.qat.vision.configs import common from official.projects.qat.vision.configs import retinanet as qat_exp_cfg -from official.vision import beta from official.vision.configs import retinanet as exp_cfg diff --git a/official/projects/qat/vision/configs/semantic_segmentation_test.py b/official/projects/qat/vision/configs/semantic_segmentation_test.py index 3fae56aad..58c1b54af 100644 --- a/official/projects/qat/vision/configs/semantic_segmentation_test.py +++ b/official/projects/qat/vision/configs/semantic_segmentation_test.py @@ -17,11 +17,11 @@ from absl.testing import parameterized import tensorflow as tf +from official import vision from official.core import config_definitions as cfg from official.core import exp_factory from official.projects.qat.vision.configs import common from official.projects.qat.vision.configs import semantic_segmentation as qat_exp_cfg -from official.vision import beta from official.vision.configs import semantic_segmentation as exp_cfg diff --git a/official/projects/qat/vision/n_bit/schemes.py b/official/projects/qat/vision/n_bit/schemes.py index 353aa5167..31661f89e 100644 --- a/official/projects/qat/vision/n_bit/schemes.py +++ b/official/projects/qat/vision/n_bit/schemes.py @@ -199,23 +199,7 @@ class QuantizeLayoutTransform( 'Vision>Conv2DBNBlock', nn_blocks.Conv2DBNBlockNBitQuantized, num_bits_weight=self._num_bits_weight, - num_bits_activation=self._num_bits_activation), - # TODO(yeqing): Remove the `Beta` custom layers. 
- CustomLayerQuantize( - 'Beta>BottleneckBlock', - nn_blocks.BottleneckBlockNBitQuantized, - num_bits_weight=self._num_bits_weight, - num_bits_activation=self._num_bits_activation), - CustomLayerQuantize( - 'Beta>InvertedBottleneckBlock', - nn_blocks.InvertedBottleneckBlockNBitQuantized, - num_bits_weight=self._num_bits_weight, - num_bits_activation=self._num_bits_activation), - CustomLayerQuantize( - 'Beta>Conv2DBNBlock', - nn_blocks.Conv2DBNBlockNBitQuantized, - num_bits_weight=self._num_bits_weight, - num_bits_activation=self._num_bits_activation), + num_bits_activation=self._num_bits_activation) ] return _ModelTransformer(model, transforms, set(layer_quantize_map.keys()), layer_quantize_map).transform() diff --git a/official/projects/qat/vision/quantization/schemes.py b/official/projects/qat/vision/quantization/schemes.py index f52258aca..df11554f1 100644 --- a/official/projects/qat/vision/quantization/schemes.py +++ b/official/projects/qat/vision/quantization/schemes.py @@ -102,10 +102,7 @@ class CustomLayerQuantize( if bottleneck_layer['class_name'] in [ 'Vision>Conv2DBNBlock', 'Vision>InvertedBottleneckBlock', 'Vision>SegmentationHead', 'Vision>SpatialPyramidPooling', - 'Vision>ASPP', - # TODO(yeqing): Removes the Beta layers. - 'Beta>Conv2DBNBlock', 'Beta>InvertedBottleneckBlock', - 'Beta>SegmentationHead', 'Beta>SpatialPyramidPooling', 'Beta>ASPP' + 'Vision>ASPP' ]: layer_metadata = {'quantize_config': configs.NoOpQuantizeConfig()} else: @@ -170,20 +167,7 @@ class QuantizeLayoutTransform( quantized_nn_layers.SegmentationHeadQuantized), CustomLayerQuantize('Vision>SpatialPyramidPooling', quantized_nn_layers.SpatialPyramidPoolingQuantized), - CustomLayerQuantize('Vision>ASPP', quantized_nn_layers.ASPPQuantized), - # TODO(yeqing): Remove the `Beta` components. 
- CustomLayerQuantize('Beta>BottleneckBlock', - quantized_nn_blocks.BottleneckBlockQuantized), - CustomLayerQuantize( - 'Beta>InvertedBottleneckBlock', - quantized_nn_blocks.InvertedBottleneckBlockQuantized), - CustomLayerQuantize('Beta>Conv2DBNBlock', - quantized_nn_blocks.Conv2DBNBlockQuantized), - CustomLayerQuantize('Beta>SegmentationHead', - quantized_nn_layers.SegmentationHeadQuantized), - CustomLayerQuantize('Beta>SpatialPyramidPooling', - quantized_nn_layers.SpatialPyramidPoolingQuantized), - CustomLayerQuantize('Beta>ASPP', quantized_nn_layers.ASPPQuantized) + CustomLayerQuantize('Vision>ASPP', quantized_nn_layers.ASPPQuantized) ] return tfmot.quantization.keras.graph_transformations.model_transformer.ModelTransformer( model, transforms, diff --git a/official/projects/qat/vision/tasks/image_classification_test.py b/official/projects/qat/vision/tasks/image_classification_test.py index 19efd4e16..494c66390 100644 --- a/official/projects/qat/vision/tasks/image_classification_test.py +++ b/official/projects/qat/vision/tasks/image_classification_test.py @@ -19,10 +19,10 @@ from absl.testing import parameterized import orbit import tensorflow as tf +from official import vision from official.core import exp_factory from official.modeling import optimization from official.projects.qat.vision.tasks import image_classification as img_cls_task -from official.vision import beta class ImageClassificationTaskTest(tf.test.TestCase, parameterized.TestCase): diff --git a/official/projects/qat/vision/tasks/retinanet_test.py b/official/projects/qat/vision/tasks/retinanet_test.py index 454b0aadc..131517366 100644 --- a/official/projects/qat/vision/tasks/retinanet_test.py +++ b/official/projects/qat/vision/tasks/retinanet_test.py @@ -19,10 +19,10 @@ from absl.testing import parameterized import orbit import tensorflow as tf +from official import vision from official.core import exp_factory from official.modeling import optimization from official.projects.qat.vision.tasks import retinanet -from official.vision import beta from official.vision.configs import retinanet as exp_cfg -- GitLab From 36a140b8765eaa07525ac42a00cbd01a8b03b98e Mon Sep 17 00:00:00 2001 From: Hongkun Yu Date: Wed, 16 Mar 2022 11:12:00 -0700 Subject: [PATCH 16/54] Remove experimental optimizer usage on github. PiperOrigin-RevId: 435102510 --- official/modeling/optimization/optimizer_factory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/official/modeling/optimization/optimizer_factory.py b/official/modeling/optimization/optimizer_factory.py index 78cf027c8..4d03445d5 100644 --- a/official/modeling/optimization/optimizer_factory.py +++ b/official/modeling/optimization/optimizer_factory.py @@ -28,9 +28,9 @@ from official.nlp import optimization as nlp_optimization OPTIMIZERS_CLS = { 'sgd': tf.keras.optimizers.SGD, - 'sgd_experimental': tf.keras.optimizers.experimental.SGD, + # TODO(chenmoneygithub): experimental.SGD 'adam': tf.keras.optimizers.Adam, - 'adam_experimental': tf.keras.optimizers.experimental.Adam, + # TODO(chenmoneygithub): experimental.Adam 'adamw': nlp_optimization.AdamWeightDecay, 'lamb': tfa_optimizers.LAMB, 'rmsprop': tf.keras.optimizers.RMSprop, -- GitLab From 77bf83b493617df6c5cd35b8d8cf495944161d99 Mon Sep 17 00:00:00 2001 From: Liangzhe Yuan Date: Wed, 16 Mar 2022 12:41:51 -0700 Subject: [PATCH 17/54] Start a project folder in preparing for ConST-CL open-source. 
(https://arxiv.org/abs/2112.05181) PiperOrigin-RevId: 435128336 --- official/projects/const_cl/README.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 official/projects/const_cl/README.md diff --git a/official/projects/const_cl/README.md b/official/projects/const_cl/README.md new file mode 100644 index 000000000..488b455fd --- /dev/null +++ b/official/projects/const_cl/README.md @@ -0,0 +1,5 @@ +# Contextualized Spatial-Temporal Contrastive Learning with Self-Supervision + +(WIP) This repository contains the official implementation of +[Contextualized Spatio-Temporal Contrastive Learning with Self-Supervision](https://arxiv.org/abs/2112.05181) +in TF2. -- GitLab From 24ae1f51df065e38e89c3bda5e4a53448f40b426 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 17 Mar 2022 12:15:48 -0700 Subject: [PATCH 18/54] Expose flag to log model flops and parameters. PiperOrigin-RevId: 435413594 --- official/vision/serving/export_saved_model.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/official/vision/serving/export_saved_model.py b/official/vision/serving/export_saved_model.py index 2c360e083..5ea55e286 100644 --- a/official/vision/serving/export_saved_model.py +++ b/official/vision/serving/export_saved_model.py @@ -46,9 +46,8 @@ from official.vision.serving import export_saved_model_lib FLAGS = flags.FLAGS - -flags.DEFINE_string( - 'experiment', None, 'experiment type, e.g. retinanet_resnetfpn_coco') +flags.DEFINE_string('experiment', None, + 'experiment type, e.g. retinanet_resnetfpn_coco') flags.DEFINE_string('export_dir', None, 'The export directory.') flags.DEFINE_string('checkpoint_path', None, 'Checkpoint path.') flags.DEFINE_multi_string( @@ -64,8 +63,7 @@ flags.DEFINE_string( 'params_override', '', 'The JSON/YAML file or string which specifies the parameter to be overriden' ' on top of `config_file` template.') -flags.DEFINE_integer( - 'batch_size', None, 'The batch size.') +flags.DEFINE_integer('batch_size', None, 'The batch size.') flags.DEFINE_string( 'input_type', 'image_tensor', 'One of `image_tensor`, `image_bytes`, `tf_example` and `tflite`.') @@ -77,6 +75,8 @@ flags.DEFINE_string('export_checkpoint_subdir', 'checkpoint', 'The subdirectory for checkpoints.') flags.DEFINE_string('export_saved_model_subdir', 'saved_model', 'The subdirectory for saved model.') +flags.DEFINE_bool('log_model_flops_and_params', False, + 'If true, logs model flops and parameters.') def main(_): @@ -100,7 +100,8 @@ def main(_): checkpoint_path=FLAGS.checkpoint_path, export_dir=FLAGS.export_dir, export_checkpoint_subdir=FLAGS.export_checkpoint_subdir, - export_saved_model_subdir=FLAGS.export_saved_model_subdir) + export_saved_model_subdir=FLAGS.export_saved_model_subdir, + log_model_flops_and_params=FLAGS.log_model_flops_and_params) if __name__ == '__main__': -- GitLab From e0ed507504b4efa6780e244677ded563b36b0ee7 Mon Sep 17 00:00:00 2001 From: Rebecca Chen Date: Fri, 18 Mar 2022 14:14:55 -0700 Subject: [PATCH 19/54] Internal change PiperOrigin-RevId: 435732468 --- official/nlp/data/dual_encoder_dataloader.py | 2 +- official/vision/beta/modeling/backbones/revnet.py | 2 +- official/vision/beta/modeling/decoders/aspp.py | 2 +- official/vision/modeling/backbones/revnet.py | 2 +- official/vision/modeling/decoders/aspp.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/official/nlp/data/dual_encoder_dataloader.py b/official/nlp/data/dual_encoder_dataloader.py index 5852455b0..1818d07f0 100644 --- 
a/official/nlp/data/dual_encoder_dataloader.py +++ b/official/nlp/data/dual_encoder_dataloader.py @@ -124,7 +124,7 @@ class DualEncoderDataLoader(data_loader.DataLoader): raise ValueError('Expected {} to start with {}'.format(string, old)) def _switch_key_prefix(d, old, new): - return {_switch_prefix(key, old, new): value for key, value in d.items()} + return {_switch_prefix(key, old, new): value for key, value in d.items()} # pytype: disable=attribute-error # trace-all-classes model_inputs = _switch_key_prefix( self._bert_tokenize(record, self._left_text_fields), diff --git a/official/vision/beta/modeling/backbones/revnet.py b/official/vision/beta/modeling/backbones/revnet.py index 3e071bafc..3dd3ea330 100644 --- a/official/vision/beta/modeling/backbones/revnet.py +++ b/official/vision/beta/modeling/backbones/revnet.py @@ -208,7 +208,7 @@ class RevNet(tf.keras.Model): @property def output_specs(self) -> Dict[int, tf.TensorShape]: """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs + return self._output_specs # pytype: disable=bad-return-type # trace-all-classes @factory.register_backbone_builder('revnet') diff --git a/official/vision/beta/modeling/decoders/aspp.py b/official/vision/beta/modeling/decoders/aspp.py index 6c99da0c8..a10fe5d97 100644 --- a/official/vision/beta/modeling/decoders/aspp.py +++ b/official/vision/beta/modeling/decoders/aspp.py @@ -103,7 +103,7 @@ class ASPP(tf.keras.layers.Layer): if self._config_dict['pool_kernel_size']: pool_kernel_size = [ int(p_size // 2**self._config_dict['level']) - for p_size in self._config_dict['pool_kernel_size'] + for p_size in self._config_dict['pool_kernel_size'] # pytype: disable=attribute-error # trace-all-classes ] self.aspp = self._aspp_layer( diff --git a/official/vision/modeling/backbones/revnet.py b/official/vision/modeling/backbones/revnet.py index 04de0dca8..65319306d 100644 --- a/official/vision/modeling/backbones/revnet.py +++ b/official/vision/modeling/backbones/revnet.py @@ -208,7 +208,7 @@ class RevNet(tf.keras.Model): @property def output_specs(self) -> Dict[int, tf.TensorShape]: """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs + return self._output_specs # pytype: disable=bad-return-type # trace-all-classes @factory.register_backbone_builder('revnet') diff --git a/official/vision/modeling/decoders/aspp.py b/official/vision/modeling/decoders/aspp.py index a15b36bee..946a5750d 100644 --- a/official/vision/modeling/decoders/aspp.py +++ b/official/vision/modeling/decoders/aspp.py @@ -103,7 +103,7 @@ class ASPP(tf.keras.layers.Layer): if self._config_dict['pool_kernel_size']: pool_kernel_size = [ int(p_size // 2**self._config_dict['level']) - for p_size in self._config_dict['pool_kernel_size'] + for p_size in self._config_dict['pool_kernel_size'] # pytype: disable=attribute-error # trace-all-classes ] self.aspp = self._aspp_layer( -- GitLab From f65fd49099b202f17d48fa9e3b25c44813ba121f Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 21 Mar 2022 12:20:18 -0700 Subject: [PATCH 20/54] Internal change PiperOrigin-RevId: 436274142 --- official/vision/configs/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/official/vision/configs/__init__.py b/official/vision/configs/__init__.py index e840e5e12..9c123af9f 100644 --- a/official/vision/configs/__init__.py +++ b/official/vision/configs/__init__.py @@ -15,8 +15,11 @@ # Lint as: python3 """Configs package definition.""" +from official.vision.configs import common from 
official.vision.configs import image_classification from official.vision.configs import maskrcnn from official.vision.configs import retinanet from official.vision.configs import semantic_segmentation from official.vision.configs import video_classification +from official.vision.configs.google import backbones +from official.vision.configs.google import backbones_3d -- GitLab From 470986b60ba11322ed896dbd7d4d664a0f403781 Mon Sep 17 00:00:00 2001 From: Dan Kondratyuk Date: Mon, 21 Mar 2022 12:29:27 -0700 Subject: [PATCH 21/54] Add video classification with MoViNets to official readme. PiperOrigin-RevId: 436276225 --- official/README.md | 8 ++++++++ official/projects/movinet/README.md | 3 +-- official/vision/beta/MODEL_GARDEN.md | 16 +++++++++++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/official/README.md b/official/README.md index 001d48ff8..06f09d4e5 100644 --- a/official/README.md +++ b/official/README.md @@ -20,6 +20,7 @@ In the near future, we will add: * State-of-the-art language understanding models. * State-of-the-art image classification models. * State-of-the-art object detection and instance segmentation models. +* State-of-the-art video classification models. ## Table of Contents @@ -27,6 +28,7 @@ In the near future, we will add: * [Computer Vision](#computer-vision) + [Image Classification](#image-classification) + [Object Detection and Segmentation](#object-detection-and-segmentation) + + [Video Classification](#video-classification) * [Natural Language Processing](#natural-language-processing) * [Recommendation](#recommendation) - [How to get started with the official models](#how-to-get-started-with-the-official-models) @@ -55,6 +57,12 @@ In the near future, we will add: | [SpineNet](vision/beta/MODEL_GARDEN.md) | [SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization](https://arxiv.org/abs/1912.05027) | | [Cascade RCNN-RS and RetinaNet-RS](vision/beta/MODEL_GARDEN.md) | [Simple Training Strategies and Model Scaling for Object Detection](https://arxiv.org/abs/2107.00057)| +#### Video Classification + +| Model | Reference (Paper) | +|-------|-------------------| +| [Mobile Video Networks (MoViNets)](projects/movinet) | [MoViNets: Mobile Video Networks for Efficient Video Recognition](https://arxiv.org/abs/2103.11511) | + ### Natural Language Processing | Model | Reference (Paper) | diff --git a/official/projects/movinet/README.md b/official/projects/movinet/README.md index 0e72f7459..36bcfe89d 100644 --- a/official/projects/movinet/README.md +++ b/official/projects/movinet/README.md @@ -176,8 +176,7 @@ devices. See the [TF Lite Example](#tf-lite-example) to export and run your own models. We also provide [quantized TF Lite binaries via TF Hub](https://tfhub.dev/s?deployment-format=lite&q=movinet). For reference, MoViNet-A0-Stream runs with a similar latency to -[MobileNetV3-Large] -(https://tfhub.dev/google/imagenet/mobilenet_v3_large_100_224/classification/) +[MobileNetV3-Large](https://tfhub.dev/google/imagenet/mobilenet_v3_large_100_224/classification/) with +5% accuracy on Kinetics 600. | Model Name | Input Shape | Pixel 4 Latency\* | x86 Latency\* | TF Lite Binary | diff --git a/official/vision/beta/MODEL_GARDEN.md b/official/vision/beta/MODEL_GARDEN.md index ebb0cf280..d8bd43d9e 100644 --- a/official/vision/beta/MODEL_GARDEN.md +++ b/official/vision/beta/MODEL_GARDEN.md @@ -171,8 +171,10 @@ evaluated on [COCO](https://cocodataset.org/) val2017. 
[Spatiotemporal Contrastive Video Representation Learning](https://arxiv.org/abs/2008.03800). * ResNet-3D-RS (R3D-RS) in [Revisiting 3D ResNets for Video Recognition](https://arxiv.org/pdf/2109.01696.pdf). + * Mobile Video Networks (MoViNets) in + [MoViNets: Mobile Video Networks for Efficient Video Recognition](https://arxiv.org/abs/2103.11511). -* Training and evaluation details: +* Training and evaluation details (SlowFast and ResNet): * All models are trained from scratch with vision modality (RGB) for 200 epochs. * We use batch size of 1024 and cosine learning rate decay with linear warmup @@ -192,6 +194,12 @@ evaluated on [COCO](https://cocodataset.org/) val2017. | R3D-RS-152 | 32 x 2 | 79.9 | 94.3 | - | R3D-RS-200 | 32 x 2 | 80.4 | 94.4 | - | R3D-RS-200 | 48 x 2 | 81.0 | - | - +| MoViNet-A0-Base | 50 x 5 | 69.40 | 89.18 | - +| MoViNet-A1-Base | 50 x 5 | 74.57 | 92.03 | - +| MoViNet-A2-Base | 50 x 5 | 75.91 | 92.63 | - +| MoViNet-A3-Base | 120 x 2 | 79.34 | 94.52 | - +| MoViNet-A4-Base | 80 x 3 | 80.64 | 94.93 | - +| MoViNet-A5-Base | 120 x 2 | 81.39 | 95.06 | - ### Kinetics-600 Action Recognition Baselines @@ -201,3 +209,9 @@ evaluated on [COCO](https://cocodataset.org/) val2017. | R3D-50 | 32 x 2 | 79.5 | 94.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml) | | R3D-RS-200 | 32 x 2 | 83.1 | - | - | R3D-RS-200 | 48 x 2 | 83.8 | - | - +| MoViNet-A0-Base | 50 x 5 | 72.05 | 90.92 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a0_k600_8x8.yaml) | +| MoViNet-A1-Base | 50 x 5 | 76.69 | 93.40 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a1_k600_8x8.yaml) | +| MoViNet-A2-Base | 50 x 5 | 78.62 | 94.17 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a2_k600_8x8.yaml) | +| MoViNet-A3-Base | 120 x 2 | 81.79 | 95.67 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a3_k600_8x8.yaml) | +| MoViNet-A4-Base | 80 x 3 | 83.48 | 96.16 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a4_k600_8x8.yaml) | +| MoViNet-A5-Base | 120 x 2 | 84.27 | 96.39 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a5_k600_8x8.yaml) | -- GitLab From 599c26f5ecde037763cc6f9d29127489316f44c3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 21 Mar 2022 13:20:53 -0700 Subject: [PATCH 22/54] Add checkpoint arg to export_inference_graph PiperOrigin-RevId: 436288626 --- official/vision/serving/export_saved_model_lib.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/official/vision/serving/export_saved_model_lib.py b/official/vision/serving/export_saved_model_lib.py index 2559aab06..56593afa3 100644 --- a/official/vision/serving/export_saved_model_lib.py +++ b/official/vision/serving/export_saved_model_lib.py @@ -43,7 +43,8 @@ def export_inference_graph( export_checkpoint_subdir: Optional[str] = None, export_saved_model_subdir: Optional[str] = None, save_options: Optional[tf.saved_model.SaveOptions] = None, - log_model_flops_and_params: bool = False): + log_model_flops_and_params: bool = False, + checkpoint: Optional[tf.train.Checkpoint] = None): """Exports inference graph for the model specified in the exp config. 
Saved model is stored at export_dir/saved_model, checkpoint is saved @@ -67,6 +68,8 @@ def export_inference_graph( save_options: `SaveOptions` for `tf.saved_model.save`. log_model_flops_and_params: If True, writes model FLOPs to model_flops.txt and model parameters to model_params.txt. + checkpoint: An optional tf.train.Checkpoint. If provided, the export module + will use it to read the weights. """ if export_checkpoint_subdir: @@ -123,6 +126,7 @@ def export_inference_graph( export_module, function_keys=[input_type], export_savedmodel_dir=output_saved_model_directory, + checkpoint=checkpoint, checkpoint_path=checkpoint_path, timestamped=False, save_options=save_options) -- GitLab From c998c93bc92bf9672fa128058a519aa06752151e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 21 Mar 2022 18:13:33 -0700 Subject: [PATCH 23/54] Internal change PiperOrigin-RevId: 436353256 --- official/vision/modeling/factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official/vision/modeling/factory.py b/official/vision/modeling/factory.py index 927adefa9..b90a76855 100644 --- a/official/vision/modeling/factory.py +++ b/official/vision/modeling/factory.py @@ -180,7 +180,7 @@ def build_maskrcnn(input_specs: tf.keras.layers.InputSpec, background_iou_low_threshold=( roi_sampler_config.background_iou_low_threshold)) roi_sampler_cascade.append(roi_sampler_obj) - # Initialize addtional roi simplers for cascade heads. + # Initialize additional roi simplers for cascade heads. if roi_sampler_config.cascade_iou_thresholds: for iou in roi_sampler_config.cascade_iou_thresholds: roi_sampler_obj = roi_sampler.ROISampler( -- GitLab From cef3156cbb7aae37ae5b9adccc53a71f67aeaa61 Mon Sep 17 00:00:00 2001 From: Yeqing Li Date: Mon, 21 Mar 2022 23:01:21 -0700 Subject: [PATCH 24/54] Fixed the import path. 
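(Usage note for the `checkpoint` argument introduced in patch 22 above: the sketch below is illustrative only. `params` and `model` are placeholders for an experiment config and an already-built model, and passing an in-memory `tf.train.Checkpoint` is assumed to make a `checkpoint_path` unnecessary.)

```python
# Illustrative sketch only: exporting with the new `checkpoint` argument.
# `params` (an ExperimentConfig) and `model` are placeholders supplied by
# the surrounding training code.
import tensorflow as tf
from official.vision.serving import export_saved_model_lib

ckpt = tf.train.Checkpoint(model=model)  # weights already in memory
export_saved_model_lib.export_inference_graph(
    input_type='image_tensor',
    batch_size=1,
    input_image_size=[224, 224],
    params=params,
    checkpoint=ckpt,        # weights are read from this object ...
    checkpoint_path=None,   # ... so no checkpoint file is provided
    export_dir='/tmp/export_dir')
```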
PiperOrigin-RevId: 436391530 --- official/vision/configs/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/official/vision/configs/__init__.py b/official/vision/configs/__init__.py index 9c123af9f..faa4ae0b4 100644 --- a/official/vision/configs/__init__.py +++ b/official/vision/configs/__init__.py @@ -15,11 +15,11 @@ # Lint as: python3 """Configs package definition.""" +from official.vision.configs import backbones +from official.vision.configs import backbones_3d from official.vision.configs import common from official.vision.configs import image_classification from official.vision.configs import maskrcnn from official.vision.configs import retinanet from official.vision.configs import semantic_segmentation from official.vision.configs import video_classification -from official.vision.configs.google import backbones -from official.vision.configs.google import backbones_3d -- GitLab From bf4a2c77e4b2e6196d99dd86ede6b8c2ac2429fc Mon Sep 17 00:00:00 2001 From: Yilei Yang Date: Mon, 21 Mar 2022 23:16:07 -0700 Subject: [PATCH 25/54] Internal change PiperOrigin-RevId: 436393494 --- official/projects/movinet/modeling/movinet.py | 1 - official/projects/movinet/modeling/movinet_layers.py | 1 - official/projects/movinet/modeling/movinet_layers_test.py | 1 - official/projects/movinet/modeling/movinet_model_test.py | 1 - official/projects/movinet/modeling/movinet_test.py | 1 - official/projects/qat/vision/configs/__init__.py | 1 - official/projects/qat/vision/configs/common.py | 1 - official/projects/qat/vision/configs/image_classification.py | 1 - official/projects/qat/vision/configs/retinanet.py | 1 - official/projects/qat/vision/configs/semantic_segmentation.py | 1 - 10 files changed, 10 deletions(-) diff --git a/official/projects/movinet/modeling/movinet.py b/official/projects/movinet/modeling/movinet.py index e13d38361..26487d11d 100644 --- a/official/projects/movinet/modeling/movinet.py +++ b/official/projects/movinet/modeling/movinet.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Contains definitions of Mobile Video Networks. Reference: https://arxiv.org/pdf/2103.11511.pdf diff --git a/official/projects/movinet/modeling/movinet_layers.py b/official/projects/movinet/modeling/movinet_layers.py index dc713164c..61264a345 100644 --- a/official/projects/movinet/modeling/movinet_layers.py +++ b/official/projects/movinet/modeling/movinet_layers.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Contains common building blocks for MoViNets. Reference: https://arxiv.org/pdf/2103.11511.pdf diff --git a/official/projects/movinet/modeling/movinet_layers_test.py b/official/projects/movinet/modeling/movinet_layers_test.py index 44abbbdf6..b4027043c 100644 --- a/official/projects/movinet/modeling/movinet_layers_test.py +++ b/official/projects/movinet/modeling/movinet_layers_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Lint as: python3 """Tests for movinet_layers.py.""" from absl.testing import parameterized diff --git a/official/projects/movinet/modeling/movinet_model_test.py b/official/projects/movinet/modeling/movinet_model_test.py index 352d78f8b..3187e38a3 100644 --- a/official/projects/movinet/modeling/movinet_model_test.py +++ b/official/projects/movinet/modeling/movinet_model_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Tests for movinet_model.py.""" from absl.testing import parameterized diff --git a/official/projects/movinet/modeling/movinet_test.py b/official/projects/movinet/modeling/movinet_test.py index b54386c6b..0b082c00a 100644 --- a/official/projects/movinet/modeling/movinet_test.py +++ b/official/projects/movinet/modeling/movinet_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Tests for movinet.py.""" from absl.testing import parameterized diff --git a/official/projects/qat/vision/configs/__init__.py b/official/projects/qat/vision/configs/__init__.py index f57b34606..c542ea9f5 100644 --- a/official/projects/qat/vision/configs/__init__.py +++ b/official/projects/qat/vision/configs/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Configs package definition.""" from official.projects.qat.vision.configs import image_classification diff --git a/official/projects/qat/vision/configs/common.py b/official/projects/qat/vision/configs/common.py index fc2814474..96d2bccb1 100644 --- a/official/projects/qat/vision/configs/common.py +++ b/official/projects/qat/vision/configs/common.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Image classification configuration definition.""" import dataclasses diff --git a/official/projects/qat/vision/configs/image_classification.py b/official/projects/qat/vision/configs/image_classification.py index 1389f37f7..08e01cb65 100644 --- a/official/projects/qat/vision/configs/image_classification.py +++ b/official/projects/qat/vision/configs/image_classification.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Image classification configuration definition.""" import dataclasses diff --git a/official/projects/qat/vision/configs/retinanet.py b/official/projects/qat/vision/configs/retinanet.py index 5410ed9f5..e12dcbfe8 100644 --- a/official/projects/qat/vision/configs/retinanet.py +++ b/official/projects/qat/vision/configs/retinanet.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """RetinaNet configuration definition.""" import dataclasses from typing import Optional diff --git a/official/projects/qat/vision/configs/semantic_segmentation.py b/official/projects/qat/vision/configs/semantic_segmentation.py index 20e3fae2d..0bfe94b45 100644 --- a/official/projects/qat/vision/configs/semantic_segmentation.py +++ b/official/projects/qat/vision/configs/semantic_segmentation.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Lint as: python3 """RetinaNet configuration definition.""" import dataclasses from typing import Optional -- GitLab From 996e3a2a8d9a4e4015de42c0d0c96f804573e625 Mon Sep 17 00:00:00 2001 From: Yilei Yang Date: Mon, 21 Mar 2022 23:16:13 -0700 Subject: [PATCH 26/54] Internal change PiperOrigin-RevId: 436393505 --- official/legacy/image_classification/configs/base_configs.py | 1 - official/legacy/image_classification/configs/configs.py | 1 - .../projects/volumetric_models/tasks/semantic_segmentation_3d.py | 1 - .../volumetric_models/tasks/semantic_segmentation_3d_test.py | 1 - official/vision/beta/ops/preprocess_ops_3d.py | 1 - official/vision/beta/ops/preprocess_ops_3d_test.py | 1 - .../beta/projects/video_ssl/dataloaders/video_ssl_input.py | 1 - .../beta/projects/video_ssl/dataloaders/video_ssl_input_test.py | 1 - official/vision/beta/train.py | 1 - official/vision/beta/train_spatial_partitioning.py | 1 - official/vision/ops/preprocess_ops_3d.py | 1 - official/vision/ops/preprocess_ops_3d_test.py | 1 - official/vision/train.py | 1 - official/vision/train_spatial_partitioning.py | 1 - 14 files changed, 14 deletions(-) diff --git a/official/legacy/image_classification/configs/base_configs.py b/official/legacy/image_classification/configs/base_configs.py index e8f8803df..7fd230b41 100644 --- a/official/legacy/image_classification/configs/base_configs.py +++ b/official/legacy/image_classification/configs/base_configs.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Definitions for high level configuration groups..""" import dataclasses diff --git a/official/legacy/image_classification/configs/configs.py b/official/legacy/image_classification/configs/configs.py index 5722671f4..87fb5df5b 100644 --- a/official/legacy/image_classification/configs/configs.py +++ b/official/legacy/image_classification/configs/configs.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Configuration utils for image classification experiments.""" import dataclasses diff --git a/official/projects/volumetric_models/tasks/semantic_segmentation_3d.py b/official/projects/volumetric_models/tasks/semantic_segmentation_3d.py index 07e76836b..928d5d26c 100644 --- a/official/projects/volumetric_models/tasks/semantic_segmentation_3d.py +++ b/official/projects/volumetric_models/tasks/semantic_segmentation_3d.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Image segmentation task definition.""" from typing import Any, Dict, Mapping, Optional, Sequence, Union diff --git a/official/projects/volumetric_models/tasks/semantic_segmentation_3d_test.py b/official/projects/volumetric_models/tasks/semantic_segmentation_3d_test.py index ed918039e..08cf0e693 100644 --- a/official/projects/volumetric_models/tasks/semantic_segmentation_3d_test.py +++ b/official/projects/volumetric_models/tasks/semantic_segmentation_3d_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Lint as: python3 """Tests for semantic segmentation task.""" # pylint: disable=unused-import diff --git a/official/vision/beta/ops/preprocess_ops_3d.py b/official/vision/beta/ops/preprocess_ops_3d.py index 16ab2dd6e..25f680169 100644 --- a/official/vision/beta/ops/preprocess_ops_3d.py +++ b/official/vision/beta/ops/preprocess_ops_3d.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Utils for processing video dataset features.""" from typing import Optional, Tuple diff --git a/official/vision/beta/ops/preprocess_ops_3d_test.py b/official/vision/beta/ops/preprocess_ops_3d_test.py index ccad05fa9..2d2b75832 100644 --- a/official/vision/beta/ops/preprocess_ops_3d_test.py +++ b/official/vision/beta/ops/preprocess_ops_3d_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 import io import itertools diff --git a/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input.py b/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input.py index d4240a9a0..506d197b7 100644 --- a/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input.py +++ b/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Parser for video and label datasets.""" from typing import Dict, Optional, Tuple diff --git a/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input_test.py b/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input_test.py index 21e99acfa..5e6dd1ce2 100644 --- a/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input_test.py +++ b/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 import io diff --git a/official/vision/beta/train.py b/official/vision/beta/train.py index eb2d786ca..cf1501f04 100644 --- a/official/vision/beta/train.py +++ b/official/vision/beta/train.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """TensorFlow Model Garden Vision training driver.""" from absl import app diff --git a/official/vision/beta/train_spatial_partitioning.py b/official/vision/beta/train_spatial_partitioning.py index 4e4caccc7..30bf604fa 100644 --- a/official/vision/beta/train_spatial_partitioning.py +++ b/official/vision/beta/train_spatial_partitioning.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """TensorFlow Model Garden Vision training driver with spatial partitioning.""" from typing import Sequence diff --git a/official/vision/ops/preprocess_ops_3d.py b/official/vision/ops/preprocess_ops_3d.py index 16ab2dd6e..25f680169 100644 --- a/official/vision/ops/preprocess_ops_3d.py +++ b/official/vision/ops/preprocess_ops_3d.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Lint as: python3 """Utils for processing video dataset features.""" from typing import Optional, Tuple diff --git a/official/vision/ops/preprocess_ops_3d_test.py b/official/vision/ops/preprocess_ops_3d_test.py index 546c3ab14..e438f5a7d 100644 --- a/official/vision/ops/preprocess_ops_3d_test.py +++ b/official/vision/ops/preprocess_ops_3d_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 import io import itertools diff --git a/official/vision/train.py b/official/vision/train.py index 46ed592ff..cb0a3cb58 100644 --- a/official/vision/train.py +++ b/official/vision/train.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """TensorFlow Model Garden Vision training driver.""" from absl import app diff --git a/official/vision/train_spatial_partitioning.py b/official/vision/train_spatial_partitioning.py index 4cba1a585..bb0f5ec97 100644 --- a/official/vision/train_spatial_partitioning.py +++ b/official/vision/train_spatial_partitioning.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """TensorFlow Model Garden Vision training driver with spatial partitioning.""" from typing import Sequence -- GitLab From bc8b63322e38c010e2c60e4a24cd86dddeaa804a Mon Sep 17 00:00:00 2001 From: Yilei Yang Date: Mon, 21 Mar 2022 23:17:07 -0700 Subject: [PATCH 27/54] Internal change PiperOrigin-RevId: 436393593 --- official/legacy/image_classification/vgg/vgg_config.py | 1 - official/nlp/configs/wmt_transformer_experiments.py | 1 - official/projects/basnet/train.py | 1 - .../deepmac_maskrcnn/modeling/heads/instance_heads_test.py | 1 - .../projects/deepmac_maskrcnn/modeling/maskrcnn_model_test.py | 1 - official/projects/deepmac_maskrcnn/train.py | 1 - .../edgetpu/vision/configs/semantic_segmentation_config.py | 1 - .../edgetpu/vision/modeling/backbones/mobilenet_edgetpu_test.py | 1 - official/projects/edgetpu/vision/train.py | 1 - 9 files changed, 9 deletions(-) diff --git a/official/legacy/image_classification/vgg/vgg_config.py b/official/legacy/image_classification/vgg/vgg_config.py index 5f9a391f4..0bf936744 100644 --- a/official/legacy/image_classification/vgg/vgg_config.py +++ b/official/legacy/image_classification/vgg/vgg_config.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Configuration definitions for VGG losses, learning rates, and optimizers.""" import dataclasses diff --git a/official/nlp/configs/wmt_transformer_experiments.py b/official/nlp/configs/wmt_transformer_experiments.py index 1b9382d96..bdef599fa 100644 --- a/official/nlp/configs/wmt_transformer_experiments.py +++ b/official/nlp/configs/wmt_transformer_experiments.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 # pylint: disable=g-doc-return-or-yield,line-too-long """WMT translation configurations.""" diff --git a/official/projects/basnet/train.py b/official/projects/basnet/train.py index 490c35668..a95b9ba92 100644 --- a/official/projects/basnet/train.py +++ b/official/projects/basnet/train.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Lint as: python3 """TensorFlow Model Garden Vision training driver.""" from absl import app diff --git a/official/projects/deepmac_maskrcnn/modeling/heads/instance_heads_test.py b/official/projects/deepmac_maskrcnn/modeling/heads/instance_heads_test.py index 9acee545f..20cdc0fca 100644 --- a/official/projects/deepmac_maskrcnn/modeling/heads/instance_heads_test.py +++ b/official/projects/deepmac_maskrcnn/modeling/heads/instance_heads_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Tests for instance_heads.py.""" # Import libraries diff --git a/official/projects/deepmac_maskrcnn/modeling/maskrcnn_model_test.py b/official/projects/deepmac_maskrcnn/modeling/maskrcnn_model_test.py index 404474980..08e9ab537 100644 --- a/official/projects/deepmac_maskrcnn/modeling/maskrcnn_model_test.py +++ b/official/projects/deepmac_maskrcnn/modeling/maskrcnn_model_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Tests for maskrcnn_model.py.""" # Import libraries diff --git a/official/projects/deepmac_maskrcnn/train.py b/official/projects/deepmac_maskrcnn/train.py index b478ea0e3..ac866f51d 100644 --- a/official/projects/deepmac_maskrcnn/train.py +++ b/official/projects/deepmac_maskrcnn/train.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """TensorFlow Model Garden Vision training driver.""" from absl import app diff --git a/official/projects/edgetpu/vision/configs/semantic_segmentation_config.py b/official/projects/edgetpu/vision/configs/semantic_segmentation_config.py index 84f9b8ee2..10012436d 100644 --- a/official/projects/edgetpu/vision/configs/semantic_segmentation_config.py +++ b/official/projects/edgetpu/vision/configs/semantic_segmentation_config.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Semantic segmentation configuration definition. The segmentation model is built using the mobilenet edgetpu v2 backbone and diff --git a/official/projects/edgetpu/vision/modeling/backbones/mobilenet_edgetpu_test.py b/official/projects/edgetpu/vision/modeling/backbones/mobilenet_edgetpu_test.py index eaf160553..9043aeb06 100644 --- a/official/projects/edgetpu/vision/modeling/backbones/mobilenet_edgetpu_test.py +++ b/official/projects/edgetpu/vision/modeling/backbones/mobilenet_edgetpu_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Tests for MobileNet.""" # Import libraries diff --git a/official/projects/edgetpu/vision/train.py b/official/projects/edgetpu/vision/train.py index 6fd82bd49..d08da9381 100644 --- a/official/projects/edgetpu/vision/train.py +++ b/official/projects/edgetpu/vision/train.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Lint as: python3 """TensorFlow Model Garden Vision training for MobileNet-EdgeTPU.""" from absl import app -- GitLab From c1230a3e3dd1965a561f311d44a098bccc12d56f Mon Sep 17 00:00:00 2001 From: Nishidha Panpaliya Date: Tue, 22 Mar 2022 05:22:26 -0400 Subject: [PATCH 28/54] Fixed movielens dataset URL --- official/recommendation/README.md | 2 +- official/recommendation/movielens.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/official/recommendation/README.md b/official/recommendation/README.md index ea2abfadc..59c73b5a8 100644 --- a/official/recommendation/README.md +++ b/official/recommendation/README.md @@ -17,7 +17,7 @@ Some abbreviations used the code base include: - ml-20m: MovieLens 20 million dataset ## Dataset -The [MovieLens datasets](http://files.grouplens.org/datasets/movielens/) are used for model training and evaluation. Specifically, we use two datasets: **ml-1m** (short for MovieLens 1 million) and **ml-20m** (short for MovieLens 20 million). +The [MovieLens datasets](https://files.grouplens.org/datasets/movielens/) are used for model training and evaluation. Specifically, we use two datasets: **ml-1m** (short for MovieLens 1 million) and **ml-20m** (short for MovieLens 20 million). ### ml-1m ml-1m dataset contains 1,000,209 anonymous ratings of approximately 3,706 movies made by 6,040 users who joined MovieLens in 2000. All ratings are contained in the file "ratings.dat" without header row, and are in the following format: diff --git a/official/recommendation/movielens.py b/official/recommendation/movielens.py index 10b3a1a9d..fb9e59517 100644 --- a/official/recommendation/movielens.py +++ b/official/recommendation/movielens.py @@ -49,7 +49,7 @@ RATINGS_FILE = "ratings.csv" MOVIES_FILE = "movies.csv" # URL to download dataset -_DATA_URL = "http://files.grouplens.org/datasets/movielens/" +_DATA_URL = "https://files.grouplens.org/datasets/movielens/" GENRE_COLUMN = "genres" ITEM_COLUMN = "item_id" # movies -- GitLab From 5d40c9945ad48e6c3e227d901f0602444a01c9e8 Mon Sep 17 00:00:00 2001 From: Rui Qian Date: Tue, 22 Mar 2022 18:21:23 -0700 Subject: [PATCH 29/54] Internal change PiperOrigin-RevId: 436619391 --- .../vision/beta/projects/video_ssl/README.md | 62 --- .../projects/video_ssl/configs/__init__.py | 17 - .../experiments/cvrl_linear_eval_k600.yaml | 92 ---- .../experiments/cvrl_pretrain_k600_200ep.yaml | 73 ---- .../projects/video_ssl/configs/video_ssl.py | 137 ------ .../video_ssl/configs/video_ssl_test.py | 55 --- .../video_ssl/dataloaders/video_ssl_input.py | 320 -------------- .../dataloaders/video_ssl_input_test.py | 110 ----- .../beta/projects/video_ssl/losses/losses.py | 135 ------ .../video_ssl/modeling/video_ssl_model.py | 179 -------- .../video_ssl/ops/video_ssl_preprocess_ops.py | 405 ------------------ .../ops/video_ssl_preprocess_ops_test.py | 47 -- .../beta/projects/video_ssl/tasks/__init__.py | 18 - .../projects/video_ssl/tasks/linear_eval.py | 70 --- .../beta/projects/video_ssl/tasks/pretrain.py | 185 -------- .../projects/video_ssl/tasks/pretrain_test.py | 81 ---- .../vision/beta/projects/video_ssl/train.py | 77 ---- 17 files changed, 2063 deletions(-) delete mode 100644 official/vision/beta/projects/video_ssl/README.md delete mode 100644 official/vision/beta/projects/video_ssl/configs/__init__.py delete mode 100644 official/vision/beta/projects/video_ssl/configs/experiments/cvrl_linear_eval_k600.yaml delete mode 100644 official/vision/beta/projects/video_ssl/configs/experiments/cvrl_pretrain_k600_200ep.yaml delete mode 100644 
official/vision/beta/projects/video_ssl/configs/video_ssl.py delete mode 100644 official/vision/beta/projects/video_ssl/configs/video_ssl_test.py delete mode 100644 official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input.py delete mode 100644 official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input_test.py delete mode 100644 official/vision/beta/projects/video_ssl/losses/losses.py delete mode 100644 official/vision/beta/projects/video_ssl/modeling/video_ssl_model.py delete mode 100644 official/vision/beta/projects/video_ssl/ops/video_ssl_preprocess_ops.py delete mode 100644 official/vision/beta/projects/video_ssl/ops/video_ssl_preprocess_ops_test.py delete mode 100644 official/vision/beta/projects/video_ssl/tasks/__init__.py delete mode 100644 official/vision/beta/projects/video_ssl/tasks/linear_eval.py delete mode 100644 official/vision/beta/projects/video_ssl/tasks/pretrain.py delete mode 100644 official/vision/beta/projects/video_ssl/tasks/pretrain_test.py delete mode 100644 official/vision/beta/projects/video_ssl/train.py diff --git a/official/vision/beta/projects/video_ssl/README.md b/official/vision/beta/projects/video_ssl/README.md deleted file mode 100644 index 926261649..000000000 --- a/official/vision/beta/projects/video_ssl/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# Spatiotemporal Contrastive Video Representation Learning - -[![Paper](http://img.shields.io/badge/Paper-arXiv.2008.03800-B3181B?logo=arXiv)](https://arxiv.org/abs/2008.03800) - -This repository is the official TF2 implementation of [Spatiotemporal Contrastive Video Representation Learning](https://arxiv.org/abs/2008.03800). - -


- -## Description - -We present a self-supervised Contrastive Video Representation Learning (CVRL) -method to learn spatiotemporal visual representations from unlabeled videos. Our -representations are learned using a contrastive loss, where two augmented clips -from the same short video are pulled together in the embedding space, while -clips from different videos are pushed away. CVRL significantly closes the gap -between unsupervised and supervised video representation learning. - -We release the code and pre-trained models. - -More pre-trained model checkpoints and a detailed instruction about the code -will be updated. - - -## Experimental Results - -### Kinetics-600 top-1 linear classification accuracy - -


- - -## Pre-trained Model Checkpoints - -We provide model checkpoints pre-trained on unlabeled RGB videos from -Kinetics-400 and Kinetics-600. All models are trained scratch with random -initialization. - -We also provide a baseline model checkpoint of "ImageNet inflated" we used in -the paper. The model has the same architecture as 3D-ResNet-50 (R3D-50), with -model weights inflated from a 2D ResNet-50 pre-trained on ImageNet. - -| Model | Parameters | Dataset | Epochs | K400 Linear Eval. | K600 Linear Eval. | Checkpoint | -| :--------------: | :----: | :--: | :--: |:-----------: | :----------: | :----------: | -| R3D-50 (1x) | 31.7M | ImageNet | - | 53.5% | 54.7% | [ckpt (127 MB)](https://storage.googleapis.com/tf_model_garden/vision/cvrl/imagenet.tar.gz) | -| R3D-50 (1x) | 31.7M | Kinetics-400 | 200 | 63.8% | - | [ckpt (127 MB)](https://storage.googleapis.com/tf_model_garden/vision/cvrl/r3d_1x_k400_200ep.tar.gz) | -| R3D-50 (1x) | 31.7M | Kinetics-400 | 800 | 66.1% | - | [ckpt (127 MB)](https://storage.googleapis.com/tf_model_garden/vision/cvrl/r3d_1x_k400_800ep.tar.gz) | -| R3D-50 (1x) | 31.7M | Kinetics-600 | 800 | 68.5% | 70.4% | [ckpt (127 MB)](https://storage.googleapis.com/tf_model_garden/vision/cvrl/r3d_1x_k600_800ep.tar.gz) | - - -## Citation - -``` -@inproceedings{qian2021spatiotemporal, - title={Spatiotemporal contrastive video representation learning}, - author={Qian, Rui and Meng, Tianjian and Gong, Boqing and Yang, Ming-Hsuan and Wang, Huisheng and Belongie, Serge and Cui, Yin}, - booktitle={CVPR}, - year={2021} -} -``` diff --git a/official/vision/beta/projects/video_ssl/configs/__init__.py b/official/vision/beta/projects/video_ssl/configs/__init__.py deleted file mode 100644 index 868e6094d..000000000 --- a/official/vision/beta/projects/video_ssl/configs/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Configs package definition.""" - -from official.vision.beta.projects.video_ssl.configs import video_ssl diff --git a/official/vision/beta/projects/video_ssl/configs/experiments/cvrl_linear_eval_k600.yaml b/official/vision/beta/projects/video_ssl/configs/experiments/cvrl_linear_eval_k600.yaml deleted file mode 100644 index fc9c2121a..000000000 --- a/official/vision/beta/projects/video_ssl/configs/experiments/cvrl_linear_eval_k600.yaml +++ /dev/null @@ -1,92 +0,0 @@ -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - # Put the pretrained checkpoint here for linear evaluation - init_checkpoint: 'r3d_1x_k600_800ep_backbone-1' - init_checkpoint_modules: 'backbone' - model: - dropout_rate: 1.0 - norm_activation: - use_sync_bn: false - backbone: - resnet_3d: - block_specs: !!python/tuple - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - model_id: 50 - stem_conv_temporal_kernel_size: 5 - stem_conv_temporal_stride: 2 - stem_pool_temporal_stride: 1 - train_data: - name: kinetics600 - feature_shape: !!python/tuple - - 32 - - 224 - - 224 - - 3 - temporal_stride: 2 - global_batch_size: 1024 - dtype: 'bfloat16' - shuffle_buffer_size: 1024 - aug_max_area_ratio: 1.0 - aug_max_aspect_ratio: 2.0 - aug_min_area_ratio: 0.3 - aug_min_aspect_ratio: 0.5 - validation_data: - name: kinetics600 - feature_shape: !!python/tuple - - 32 - - 256 - - 256 - - 3 - temporal_stride: 2 - num_test_clips: 10 - num_test_crops: 3 - global_batch_size: 64 - dtype: 'bfloat16' - drop_remainder: false - losses: - l2_weight_decay: 0.0 -trainer: - optimizer_config: - learning_rate: - cosine: - initial_learning_rate: 32.0 - decay_steps: 35744 - optimizer: - sgd: - nesterov: false - warmup: - linear: - warmup_steps: 1787 - train_steps: 35744 - steps_per_loop: 100 - summary_interval: 100 - validation_interval: 100 diff --git a/official/vision/beta/projects/video_ssl/configs/experiments/cvrl_pretrain_k600_200ep.yaml b/official/vision/beta/projects/video_ssl/configs/experiments/cvrl_pretrain_k600_200ep.yaml deleted file mode 100644 index 316d02fbb..000000000 --- a/official/vision/beta/projects/video_ssl/configs/experiments/cvrl_pretrain_k600_200ep.yaml +++ /dev/null @@ -1,73 +0,0 @@ -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - dropout_rate: 1.0 - norm_activation: - use_sync_bn: true - hidden_norm_activation: - use_sync_bn: true - backbone: - resnet_3d: - block_specs: !!python/tuple - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - model_id: 50 - stem_conv_temporal_kernel_size: 5 - stem_conv_temporal_stride: 2 - stem_pool_temporal_stride: 1 - train_data: - name: kinetics600 - feature_shape: !!python/tuple - - 16 - - 224 - - 224 - - 3 - temporal_stride: 2 - 
global_batch_size: 1024 - dtype: 'bfloat16' - shuffle_buffer_size: 1024 - losses: - l2_weight_decay: 0.000001 -trainer: - optimizer_config: - learning_rate: - cosine: - initial_learning_rate: 0.32 - decay_steps: 71488 - optimizer: - sgd: - nesterov: false - warmup: - linear: - warmup_steps: 1787 - train_steps: 71488 - steps_per_loop: 100 - summary_interval: 100 diff --git a/official/vision/beta/projects/video_ssl/configs/video_ssl.py b/official/vision/beta/projects/video_ssl/configs/video_ssl.py deleted file mode 100644 index 5fb640ad5..000000000 --- a/official/vision/beta/projects/video_ssl/configs/video_ssl.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Video classification configuration definition.""" - - -import dataclasses - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.vision.beta.configs import common -from official.vision.beta.configs import video_classification - - -Losses = video_classification.Losses -VideoClassificationModel = video_classification.VideoClassificationModel -VideoClassificationTask = video_classification.VideoClassificationTask - - -@dataclasses.dataclass -class VideoSSLPretrainTask(VideoClassificationTask): - pass - - -@dataclasses.dataclass -class VideoSSLEvalTask(VideoClassificationTask): - pass - - -@dataclasses.dataclass -class DataConfig(video_classification.DataConfig): - """The base configuration for building datasets.""" - is_ssl: bool = False - - -@dataclasses.dataclass -class VideoSSLModel(VideoClassificationModel): - """The model config.""" - normalize_feature: bool = False - hidden_dim: int = 2048 - hidden_layer_num: int = 3 - projection_dim: int = 128 - hidden_norm_activation: common.NormActivation = common.NormActivation( - use_sync_bn=False, norm_momentum=0.997, norm_epsilon=1.0e-05) - - -@dataclasses.dataclass -class SSLLosses(Losses): - normalize_hidden: bool = True - temperature: float = 0.1 - - -@exp_factory.register_config_factory('video_ssl_pretrain_kinetics400') -def video_ssl_pretrain_kinetics400() -> cfg.ExperimentConfig: - """Pretrain SSL Video classification on Kinectics 400 with resnet.""" - exp = video_classification.video_classification_kinetics400() - exp.task = VideoSSLPretrainTask(**exp.task.as_dict()) - exp.task.train_data = DataConfig(is_ssl=True, **exp.task.train_data.as_dict()) - exp.task.train_data.feature_shape = (16, 224, 224, 3) - exp.task.train_data.temporal_stride = 2 - exp.task.model = VideoSSLModel(exp.task.model) - exp.task.model.model_type = 'video_ssl_model' - exp.task.losses = SSLLosses(exp.task.losses) - return exp - - -@exp_factory.register_config_factory('video_ssl_linear_eval_kinetics400') -def video_ssl_linear_eval_kinetics400() -> cfg.ExperimentConfig: - """Pretrain SSL Video classification on Kinectics 400 with resnet.""" - exp = video_classification.video_classification_kinetics400() - exp.task = VideoSSLEvalTask(**exp.task.as_dict()) - 
exp.task.train_data = DataConfig(is_ssl=False, - **exp.task.train_data.as_dict()) - exp.task.train_data.feature_shape = (32, 224, 224, 3) - exp.task.train_data.temporal_stride = 2 - exp.task.validation_data.feature_shape = (32, 256, 256, 3) - exp.task.validation_data.temporal_stride = 2 - exp.task.validation_data = DataConfig(is_ssl=False, - **exp.task.validation_data.as_dict()) - exp.task.validation_data.min_image_size = 256 - exp.task.validation_data.num_test_clips = 10 - exp.task.validation_data.num_test_crops = 3 - exp.task.model = VideoSSLModel(exp.task.model) - exp.task.model.model_type = 'video_ssl_model' - exp.task.model.normalize_feature = True - exp.task.model.hidden_layer_num = 0 - exp.task.model.projection_dim = 400 - return exp - - -@exp_factory.register_config_factory('video_ssl_pretrain_kinetics600') -def video_ssl_pretrain_kinetics600() -> cfg.ExperimentConfig: - """Pretrain SSL Video classification on Kinectics 400 with resnet.""" - exp = video_classification.video_classification_kinetics600() - exp.task = VideoSSLPretrainTask(**exp.task.as_dict()) - exp.task.train_data = DataConfig(is_ssl=True, **exp.task.train_data.as_dict()) - exp.task.train_data.feature_shape = (16, 224, 224, 3) - exp.task.train_data.temporal_stride = 2 - exp.task.model = VideoSSLModel(exp.task.model) - exp.task.model.model_type = 'video_ssl_model' - exp.task.losses = SSLLosses(exp.task.losses) - return exp - - -@exp_factory.register_config_factory('video_ssl_linear_eval_kinetics600') -def video_ssl_linear_eval_kinetics600() -> cfg.ExperimentConfig: - """Pretrain SSL Video classification on Kinectics 400 with resnet.""" - exp = video_classification.video_classification_kinetics600() - exp.task = VideoSSLEvalTask(**exp.task.as_dict()) - exp.task.train_data = DataConfig(is_ssl=False, - **exp.task.train_data.as_dict()) - exp.task.train_data.feature_shape = (32, 224, 224, 3) - exp.task.train_data.temporal_stride = 2 - exp.task.validation_data = DataConfig(is_ssl=False, - **exp.task.validation_data.as_dict()) - exp.task.validation_data.feature_shape = (32, 256, 256, 3) - exp.task.validation_data.temporal_stride = 2 - exp.task.validation_data.min_image_size = 256 - exp.task.validation_data.num_test_clips = 10 - exp.task.validation_data.num_test_crops = 3 - exp.task.model = VideoSSLModel(exp.task.model) - exp.task.model.model_type = 'video_ssl_model' - exp.task.model.normalize_feature = True - exp.task.model.hidden_layer_num = 0 - exp.task.model.projection_dim = 600 - return exp diff --git a/official/vision/beta/projects/video_ssl/configs/video_ssl_test.py b/official/vision/beta/projects/video_ssl/configs/video_ssl_test.py deleted file mode 100644 index a3a9ce3f0..000000000 --- a/official/vision/beta/projects/video_ssl/configs/video_ssl_test.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -# pylint: disable=unused-import -from absl.testing import parameterized -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.vision import beta -from official.vision.beta.projects.video_ssl.configs import video_ssl as exp_cfg - - -class VideoClassificationConfigTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters(('video_ssl_pretrain_kinetics400',), - ('video_ssl_pretrain_kinetics600',)) - def test_video_ssl_pretrain_configs(self, config_name): - config = exp_factory.get_exp_config(config_name) - self.assertIsInstance(config, cfg.ExperimentConfig) - self.assertIsInstance(config.task, exp_cfg.VideoSSLPretrainTask) - self.assertIsInstance(config.task.model, exp_cfg.VideoSSLModel) - self.assertIsInstance(config.task.losses, exp_cfg.SSLLosses) - self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig) - config.task.train_data.is_training = None - with self.assertRaises(KeyError): - config.validate() - - @parameterized.parameters(('video_ssl_linear_eval_kinetics400',), - ('video_ssl_linear_eval_kinetics600',)) - def test_video_ssl_linear_eval_configs(self, config_name): - config = exp_factory.get_exp_config(config_name) - self.assertIsInstance(config, cfg.ExperimentConfig) - self.assertIsInstance(config.task, exp_cfg.VideoSSLEvalTask) - self.assertIsInstance(config.task.model, exp_cfg.VideoSSLModel) - self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig) - config.task.train_data.is_training = None - with self.assertRaises(KeyError): - config.validate() - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input.py b/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input.py deleted file mode 100644 index 506d197b7..000000000 --- a/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input.py +++ /dev/null @@ -1,320 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Parser for video and label datasets.""" - -from typing import Dict, Optional, Tuple - -from absl import logging -import tensorflow as tf - -from official.vision.beta.dataloaders import video_input -from official.vision.beta.ops import preprocess_ops_3d -from official.vision.beta.projects.video_ssl.configs import video_ssl as exp_cfg -from official.vision.beta.projects.video_ssl.ops import video_ssl_preprocess_ops - -IMAGE_KEY = 'image/encoded' -LABEL_KEY = 'clip/label/index' -Decoder = video_input.Decoder - - -def _process_image(image: tf.Tensor, - is_training: bool = True, - is_ssl: bool = False, - num_frames: int = 32, - stride: int = 1, - num_test_clips: int = 1, - min_resize: int = 256, - crop_size: int = 224, - num_crops: int = 1, - zero_centering_image: bool = False, - seed: Optional[int] = None) -> tf.Tensor: - """Processes a serialized image tensor. 
- - Args: - image: Input Tensor of shape [timesteps] and type tf.string of serialized - frames. - is_training: Whether or not in training mode. If True, random sample, crop - and left right flip is used. - is_ssl: Whether or not in self-supervised pre-training mode. - num_frames: Number of frames per subclip. - stride: Temporal stride to sample frames. - num_test_clips: Number of test clips (1 by default). If more than 1, this - will sample multiple linearly spaced clips within each video at test time. - If 1, then a single clip in the middle of the video is sampled. The clips - are aggreagated in the batch dimension. - min_resize: Frames are resized so that min(height, width) is min_resize. - crop_size: Final size of the frame after cropping the resized frames. Both - height and width are the same. - num_crops: Number of crops to perform on the resized frames. - zero_centering_image: If True, frames are normalized to values in [-1, 1]. - If False, values in [0, 1]. - seed: A deterministic seed to use when sampling. - - Returns: - Processed frames. Tensor of shape - [num_frames * num_test_clips, crop_size, crop_size, 3]. - """ - # Validate parameters. - if is_training and num_test_clips != 1: - logging.warning( - '`num_test_clips` %d is ignored since `is_training` is `True`.', - num_test_clips) - - # Temporal sampler. - if is_training: - # Sampler for training. - if is_ssl: - # Sample two clips from linear decreasing distribution. - image = video_ssl_preprocess_ops.sample_ssl_sequence( - image, num_frames, True, stride) - else: - # Sample random clip. - image = preprocess_ops_3d.sample_sequence(image, num_frames, True, stride) - - else: - # Sampler for evaluation. - if num_test_clips > 1: - # Sample linspace clips. - image = preprocess_ops_3d.sample_linspace_sequence(image, num_test_clips, - num_frames, stride) - else: - # Sample middle clip. - image = preprocess_ops_3d.sample_sequence(image, num_frames, False, - stride) - - # Decode JPEG string to tf.uint8. - image = preprocess_ops_3d.decode_jpeg(image, 3) - - if is_training: - # Standard image data augmentation: random resized crop and random flip. - if is_ssl: - image_1, image_2 = tf.split(image, num_or_size_splits=2, axis=0) - image_1 = preprocess_ops_3d.random_crop_resize( - image_1, crop_size, crop_size, num_frames, 3, (0.5, 2), (0.3, 1)) - image_1 = preprocess_ops_3d.random_flip_left_right(image_1, seed) - image_2 = preprocess_ops_3d.random_crop_resize( - image_2, crop_size, crop_size, num_frames, 3, (0.5, 2), (0.3, 1)) - image_2 = preprocess_ops_3d.random_flip_left_right(image_2, seed) - - else: - image = preprocess_ops_3d.random_crop_resize( - image, crop_size, crop_size, num_frames, 3, (0.5, 2), (0.3, 1)) - image = preprocess_ops_3d.random_flip_left_right(image, seed) - else: - # Resize images (resize happens only if necessary to save compute). - image = preprocess_ops_3d.resize_smallest(image, min_resize) - # Three-crop of the frames. - image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, False, - num_crops) - - # Cast the frames in float32, normalizing according to zero_centering_image. - if is_training and is_ssl: - image_1 = preprocess_ops_3d.normalize_image(image_1, zero_centering_image) - image_2 = preprocess_ops_3d.normalize_image(image_2, zero_centering_image) - - else: - image = preprocess_ops_3d.normalize_image(image, zero_centering_image) - - # Self-supervised pre-training augmentations. - if is_training and is_ssl: - # Temporally consistent color jittering. 
- image_1 = video_ssl_preprocess_ops.random_color_jitter_3d(image_1) - image_2 = video_ssl_preprocess_ops.random_color_jitter_3d(image_2) - # Temporally consistent gaussian blurring. - image_1 = video_ssl_preprocess_ops.random_blur(image_1, crop_size, - crop_size, 1.0) - image_2 = video_ssl_preprocess_ops.random_blur(image_2, crop_size, - crop_size, 0.1) - image_2 = video_ssl_preprocess_ops.random_solarization(image_2) - image = tf.concat([image_1, image_2], axis=0) - image = tf.clip_by_value(image, 0., 1.) - - return image - - -def _postprocess_image(image: tf.Tensor, - is_training: bool = True, - is_ssl: bool = False, - num_frames: int = 32, - num_test_clips: int = 1, - num_test_crops: int = 1) -> tf.Tensor: - """Processes a batched Tensor of frames. - - The same parameters used in process should be used here. - - Args: - image: Input Tensor of shape [batch, timesteps, height, width, 3]. - is_training: Whether or not in training mode. If True, random sample, crop - and left right flip is used. - is_ssl: Whether or not in self-supervised pre-training mode. - num_frames: Number of frames per subclip. - num_test_clips: Number of test clips (1 by default). If more than 1, this - will sample multiple linearly spaced clips within each video at test time. - If 1, then a single clip in the middle of the video is sampled. The clips - are aggreagated in the batch dimension. - num_test_crops: Number of test crops (1 by default). If more than 1, there - are multiple crops for each clip at test time. If 1, there is a single - central crop. The crops are aggreagated in the batch dimension. - - Returns: - Processed frames. Tensor of shape - [batch * num_test_clips * num_test_crops, num_frames, height, width, 3]. - """ - if is_ssl and is_training: - # In this case, two clips of self-supervised pre-training are merged - # together in batch dimenstion which will be 2 * batch. - image = tf.concat(tf.split(image, num_or_size_splits=2, axis=1), axis=0) - - num_views = num_test_clips * num_test_crops - if num_views > 1 and not is_training: - # In this case, multiple views are merged together in batch dimenstion which - # will be batch * num_views. - image = tf.reshape(image, [-1, num_frames] + image.shape[2:].as_list()) - - return image - - -def _process_label(label: tf.Tensor, - one_hot_label: bool = True, - num_classes: Optional[int] = None) -> tf.Tensor: - """Processes label Tensor.""" - # Validate parameters. - if one_hot_label and not num_classes: - raise ValueError( - '`num_classes` should be given when requesting one hot label.') - - # Cast to tf.int32. - label = tf.cast(label, dtype=tf.int32) - - if one_hot_label: - # Replace label index by one hot representation. - label = tf.one_hot(label, num_classes) - if len(label.shape.as_list()) > 1: - label = tf.reduce_sum(label, axis=0) - if num_classes == 1: - # The trick for single label. - label = 1 - label - - return label - - -class Parser(video_input.Parser): - """Parses a video and label dataset.""" - - def __init__(self, - input_params: exp_cfg.DataConfig, - image_key: str = IMAGE_KEY, - label_key: str = LABEL_KEY): - super(Parser, self).__init__(input_params, image_key, label_key) - self._is_ssl = input_params.is_ssl - - def _parse_train_data( - self, decoded_tensors: Dict[str, tf.Tensor] - ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]: - """Parses data for training.""" - # Process image and label. 
- image = decoded_tensors[self._image_key] - image = _process_image( - image=image, - is_training=True, - is_ssl=self._is_ssl, - num_frames=self._num_frames, - stride=self._stride, - num_test_clips=self._num_test_clips, - min_resize=self._min_resize, - crop_size=self._crop_size) - image = tf.cast(image, dtype=self._dtype) - features = {'image': image} - - label = decoded_tensors[self._label_key] - label = _process_label(label, self._one_hot_label, self._num_classes) - - return features, label - - def _parse_eval_data( - self, decoded_tensors: Dict[str, tf.Tensor] - ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]: - """Parses data for evaluation.""" - image = decoded_tensors[self._image_key] - image = _process_image( - image=image, - is_training=False, - num_frames=self._num_frames, - stride=self._stride, - num_test_clips=self._num_test_clips, - min_resize=self._min_resize, - crop_size=self._crop_size, - num_crops=self._num_crops) - image = tf.cast(image, dtype=self._dtype) - features = {'image': image} - - label = decoded_tensors[self._label_key] - label = _process_label(label, self._one_hot_label, self._num_classes) - - if self._output_audio: - audio = decoded_tensors[self._audio_feature] - audio = tf.cast(audio, dtype=self._dtype) - audio = preprocess_ops_3d.sample_sequence( - audio, 20, random=False, stride=1) - audio = tf.ensure_shape(audio, [20, 2048]) - features['audio'] = audio - - return features, label - - def parse_fn(self, is_training): - """Returns a parse fn that reads and parses raw tensors from the decoder. - - Args: - is_training: a `bool` to indicate whether it is in training mode. - - Returns: - parse: a `callable` that takes the serialized examle and generate the - images, labels tuple where labels is a dict of Tensors that contains - labels. - """ - def parse(decoded_tensors): - """Parses the serialized example data.""" - if is_training: - return self._parse_train_data(decoded_tensors) - else: - return self._parse_eval_data(decoded_tensors) - - return parse - - -class PostBatchProcessor(object): - """Processes a video and label dataset which is batched.""" - - def __init__(self, input_params: exp_cfg.DataConfig): - self._is_training = input_params.is_training - self._is_ssl = input_params.is_ssl - self._num_frames = input_params.feature_shape[0] - self._num_test_clips = input_params.num_test_clips - self._num_test_crops = input_params.num_test_crops - - def __call__(self, features: Dict[str, tf.Tensor], - label: tf.Tensor) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]: - """Parses a single tf.Example into image and label tensors.""" - for key in ['image', 'audio']: - if key in features: - features[key] = _postprocess_image( - image=features[key], - is_training=self._is_training, - is_ssl=self._is_ssl, - num_frames=self._num_frames, - num_test_clips=self._num_test_clips, - num_test_crops=self._num_test_crops) - - return features, label diff --git a/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input_test.py b/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input_test.py deleted file mode 100644 index 5e6dd1ce2..000000000 --- a/official/vision/beta/projects/video_ssl/dataloaders/video_ssl_input_test.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import io - -# Import libraries -import numpy as np -from PIL import Image -import tensorflow as tf - -from official.vision.beta.projects.video_ssl.configs import video_ssl as exp_cfg -from official.vision.beta.projects.video_ssl.dataloaders import video_ssl_input - -AUDIO_KEY = 'features/audio' - - -def fake_seq_example(): - # Create fake data. - random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8) - random_image = Image.fromarray(random_image) - label = 42 - with io.BytesIO() as buffer: - random_image.save(buffer, format='JPEG') - raw_image_bytes = buffer.getvalue() - - seq_example = tf.train.SequenceExample() - seq_example.feature_lists.feature_list.get_or_create( - video_ssl_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [ - raw_image_bytes - ] - seq_example.feature_lists.feature_list.get_or_create( - video_ssl_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [ - raw_image_bytes - ] - seq_example.context.feature[video_ssl_input.LABEL_KEY].int64_list.value[:] = [ - label - ] - - random_audio = np.random.normal(size=(10, 256)).tolist() - for s in random_audio: - seq_example.feature_lists.feature_list.get_or_create( - AUDIO_KEY).feature.add().float_list.value[:] = s - return seq_example, label - - -class VideoAndLabelParserTest(tf.test.TestCase): - - def test_video_ssl_input_pretrain(self): - params = exp_cfg.video_ssl_pretrain_kinetics600().task.train_data - - decoder = video_ssl_input.Decoder() - parser = video_ssl_input.Parser(params).parse_fn(params.is_training) - seq_example, _ = fake_seq_example() - - input_tensor = tf.constant(seq_example.SerializeToString()) - decoded_tensors = decoder.decode(input_tensor) - output_tensor = parser(decoded_tensors) - image_features, _ = output_tensor - image = image_features['image'] - - self.assertAllEqual(image.shape, (32, 224, 224, 3)) - - def test_video_ssl_input_linear_train(self): - params = exp_cfg.video_ssl_linear_eval_kinetics600().task.train_data - - decoder = video_ssl_input.Decoder() - parser = video_ssl_input.Parser(params).parse_fn(params.is_training) - seq_example, label = fake_seq_example() - - input_tensor = tf.constant(seq_example.SerializeToString()) - decoded_tensors = decoder.decode(input_tensor) - output_tensor = parser(decoded_tensors) - image_features, label = output_tensor - image = image_features['image'] - - self.assertAllEqual(image.shape, (32, 224, 224, 3)) - self.assertAllEqual(label.shape, (600,)) - - def test_video_ssl_input_linear_eval(self): - params = exp_cfg.video_ssl_linear_eval_kinetics600().task.validation_data - print('!!!', params) - - decoder = video_ssl_input.Decoder() - parser = video_ssl_input.Parser(params).parse_fn(params.is_training) - seq_example, label = fake_seq_example() - - input_tensor = tf.constant(seq_example.SerializeToString()) - decoded_tensors = decoder.decode(input_tensor) - output_tensor = parser(decoded_tensors) - image_features, label = output_tensor - image = image_features['image'] - - self.assertAllEqual(image.shape, (960, 256, 256, 3)) - self.assertAllEqual(label.shape, (600,)) - - -if __name__ == '__main__': - tf.test.main() diff 
--git a/official/vision/beta/projects/video_ssl/losses/losses.py b/official/vision/beta/projects/video_ssl/losses/losses.py deleted file mode 100644 index 2aa2085b8..000000000 --- a/official/vision/beta/projects/video_ssl/losses/losses.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Define losses.""" - -# Import libraries -import tensorflow as tf -from tensorflow.compiler.tf2xla.python import xla - - -def contrastive_loss(hidden, - num_replicas, - normalize_hidden, - temperature, - model, - weight_decay): - """Computes contrastive loss. - - Args: - hidden: embedding of video clips after projection head. - num_replicas: number of distributed replicas. - normalize_hidden: whether or not to l2 normalize the hidden vector. - temperature: temperature in the InfoNCE contrastive loss. - model: keras model for calculating weight decay. - weight_decay: weight decay parameter. - - Returns: - A loss scalar. - The logits for contrastive prediction task. - The labels for contrastive prediction task. - """ - large_num = 1e9 - - hidden1, hidden2 = tf.split(hidden, num_or_size_splits=2, axis=0) - if normalize_hidden: - hidden1 = tf.math.l2_normalize(hidden1, -1) - hidden2 = tf.math.l2_normalize(hidden2, -1) - batch_size = tf.shape(hidden1)[0] - - if num_replicas == 1: - # This is the local version - hidden1_large = hidden1 - hidden2_large = hidden2 - labels = tf.one_hot(tf.range(batch_size), batch_size * 2) - masks = tf.one_hot(tf.range(batch_size), batch_size) - - else: - # This is the cross-tpu version. 
- hidden1_large = tpu_cross_replica_concat(hidden1, num_replicas) - hidden2_large = tpu_cross_replica_concat(hidden2, num_replicas) - enlarged_batch_size = tf.shape(hidden1_large)[0] - replica_id = tf.cast(tf.cast(xla.replica_id(), tf.uint32), tf.int32) - labels_idx = tf.range(batch_size) + replica_id * batch_size - labels = tf.one_hot(labels_idx, enlarged_batch_size * 2) - masks = tf.one_hot(labels_idx, enlarged_batch_size) - - logits_aa = tf.matmul(hidden1, hidden1_large, transpose_b=True) / temperature - logits_aa = logits_aa - tf.cast(masks, logits_aa.dtype) * large_num - logits_bb = tf.matmul(hidden2, hidden2_large, transpose_b=True) / temperature - logits_bb = logits_bb - tf.cast(masks, logits_bb.dtype) * large_num - logits_ab = tf.matmul(hidden1, hidden2_large, transpose_b=True) / temperature - logits_ba = tf.matmul(hidden2, hidden1_large, transpose_b=True) / temperature - - loss_a = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits( - labels, tf.concat([logits_ab, logits_aa], 1))) - loss_b = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits( - labels, tf.concat([logits_ba, logits_bb], 1))) - loss = loss_a + loss_b - - l2_loss = weight_decay * tf.add_n([ - tf.nn.l2_loss(v) - for v in model.trainable_variables - if 'kernel' in v.name - ]) - - total_loss = loss + tf.cast(l2_loss, loss.dtype) - - contrast_prob = tf.nn.softmax(logits_ab) - contrast_entropy = - tf.reduce_mean( - tf.reduce_sum(contrast_prob * tf.math.log(contrast_prob + 1e-8), -1)) - - contrast_acc = tf.equal(tf.argmax(labels, 1), tf.argmax(logits_ab, axis=1)) - contrast_acc = tf.reduce_mean(tf.cast(contrast_acc, tf.float32)) - - return { - 'total_loss': total_loss, - 'contrastive_loss': loss, - 'reg_loss': l2_loss, - 'contrast_acc': contrast_acc, - 'contrast_entropy': contrast_entropy, - } - - -def tpu_cross_replica_concat(tensor, num_replicas): - """Reduce a concatenation of the `tensor` across TPU cores. - - Args: - tensor: tensor to concatenate. - num_replicas: number of TPU device replicas. - - Returns: - Tensor of the same rank as `tensor` with first dimension `num_replicas` - times larger. - """ - with tf.name_scope('tpu_cross_replica_concat'): - # This creates a tensor that is like the input tensor but has an added - # replica dimension as the outermost dimension. On each replica it will - # contain the local values and zeros for all other values that need to be - # fetched from other replicas. - ext_tensor = tf.scatter_nd( - indices=[[xla.replica_id()]], - updates=[tensor], - shape=[num_replicas] + tensor.shape.as_list()) - - # As every value is only present on one replica and 0 in all others, adding - # them all together will result in the full tensor on all replicas. - replica_context = tf.distribute.get_replica_context() - ext_tensor = replica_context.all_reduce(tf.distribute.ReduceOp.SUM, - ext_tensor) - - # Flatten the replica dimension. - # The first dimension size will be: tensor.shape[0] * num_replicas - # Using [-1] trick to support also scalar input. - return tf.reshape(ext_tensor, [-1] + ext_tensor.shape.as_list()[2:]) diff --git a/official/vision/beta/projects/video_ssl/modeling/video_ssl_model.py b/official/vision/beta/projects/video_ssl/modeling/video_ssl_model.py deleted file mode 100644 index fd631c9ee..000000000 --- a/official/vision/beta/projects/video_ssl/modeling/video_ssl_model.py +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Build video classification models.""" -from typing import Mapping, Optional - -# Import libraries - -import tensorflow as tf - -from official.modeling import tf_utils -from official.vision.beta.modeling import backbones -from official.vision.beta.modeling import factory_3d as model_factory -from official.vision.beta.projects.video_ssl.configs import video_ssl as video_ssl_cfg - -layers = tf.keras.layers - - -class VideoSSLModel(tf.keras.Model): - """A video ssl model class builder.""" - - def __init__(self, - backbone, - normalize_feature, - hidden_dim, - hidden_layer_num, - hidden_norm_args, - projection_dim, - input_specs: Optional[Mapping[str, - tf.keras.layers.InputSpec]] = None, - dropout_rate: float = 0.0, - aggregate_endpoints: bool = False, - kernel_initializer='random_uniform', - kernel_regularizer=None, - bias_regularizer=None, - **kwargs): - """Video Classification initialization function. - - Args: - backbone: a 3d backbone network. - normalize_feature: whether normalize backbone feature. - hidden_dim: `int` number of hidden units in MLP. - hidden_layer_num: `int` number of hidden layers in MLP. - hidden_norm_args: `dict` for batchnorm arguments in MLP. - projection_dim: `int` number of ouput dimension for MLP. - input_specs: `tf.keras.layers.InputSpec` specs of the input tensor. - dropout_rate: `float` rate for dropout regularization. - aggregate_endpoints: `bool` aggregate all end ponits or only use the - final end point. - kernel_initializer: kernel initializer for the dense layer. - kernel_regularizer: tf.keras.regularizers.Regularizer object. Default to - None. - bias_regularizer: tf.keras.regularizers.Regularizer object. Default to - None. - **kwargs: keyword arguments to be passed. 
- """ - if not input_specs: - input_specs = { - 'image': layers.InputSpec(shape=[None, None, None, None, 3]) - } - self._self_setattr_tracking = False - self._config_dict = { - 'backbone': backbone, - 'normalize_feature': normalize_feature, - 'hidden_dim': hidden_dim, - 'hidden_layer_num': hidden_layer_num, - 'use_sync_bn': hidden_norm_args.use_sync_bn, - 'norm_momentum': hidden_norm_args.norm_momentum, - 'norm_epsilon': hidden_norm_args.norm_epsilon, - 'activation': hidden_norm_args.activation, - 'projection_dim': projection_dim, - 'input_specs': input_specs, - 'dropout_rate': dropout_rate, - 'aggregate_endpoints': aggregate_endpoints, - 'kernel_initializer': kernel_initializer, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - } - self._input_specs = input_specs - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._backbone = backbone - - inputs = { - k: tf.keras.Input(shape=v.shape[1:]) for k, v in input_specs.items() - } - endpoints = backbone(inputs['image']) - - if aggregate_endpoints: - pooled_feats = [] - for endpoint in endpoints.values(): - x_pool = tf.keras.layers.GlobalAveragePooling3D()(endpoint) - pooled_feats.append(x_pool) - x = tf.concat(pooled_feats, axis=1) - else: - x = endpoints[max(endpoints.keys())] - x = tf.keras.layers.GlobalAveragePooling3D()(x) - - # L2 Normalize feature after backbone - if normalize_feature: - x = tf.nn.l2_normalize(x, axis=-1) - - # MLP hidden layers - for _ in range(hidden_layer_num): - x = tf.keras.layers.Dense(hidden_dim)(x) - if self._config_dict['use_sync_bn']: - x = tf.keras.layers.experimental.SyncBatchNormalization( - momentum=self._config_dict['norm_momentum'], - epsilon=self._config_dict['norm_epsilon'])(x) - else: - x = tf.keras.layers.BatchNormalization( - momentum=self._config_dict['norm_momentum'], - epsilon=self._config_dict['norm_epsilon'])(x) - x = tf_utils.get_activation(self._config_dict['activation'])(x) - - # Projection head - x = tf.keras.layers.Dense(projection_dim)(x) - - super(VideoSSLModel, self).__init__( - inputs=inputs, outputs=x, **kwargs) - - @property - def checkpoint_items(self): - """Returns a dictionary of items to be additionally checkpointed.""" - return dict(backbone=self.backbone) - - @property - def backbone(self): - return self._backbone - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - -@model_factory.register_model_builder('video_ssl_model') -def build_video_ssl_pretrain_model( - input_specs: tf.keras.layers.InputSpec, - model_config: video_ssl_cfg.VideoSSLModel, - num_classes: int, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None): - """Builds the video classification model.""" - del num_classes - input_specs_dict = {'image': input_specs} - backbone = backbones.factory.build_backbone( - input_specs=input_specs, - backbone_config=model_config.backbone, - norm_activation_config=model_config.norm_activation, - l2_regularizer=l2_regularizer) - - # Norm layer type in the MLP head should same with backbone - assert model_config.norm_activation.use_sync_bn == model_config.hidden_norm_activation.use_sync_bn - - model = VideoSSLModel( - backbone=backbone, - normalize_feature=model_config.normalize_feature, - hidden_dim=model_config.hidden_dim, - hidden_layer_num=model_config.hidden_layer_num, - hidden_norm_args=model_config.hidden_norm_activation, - projection_dim=model_config.projection_dim, - 
input_specs=input_specs_dict, - dropout_rate=model_config.dropout_rate, - aggregate_endpoints=model_config.aggregate_endpoints, - kernel_regularizer=l2_regularizer) - return model diff --git a/official/vision/beta/projects/video_ssl/ops/video_ssl_preprocess_ops.py b/official/vision/beta/projects/video_ssl/ops/video_ssl_preprocess_ops.py deleted file mode 100644 index f6a2ef3aa..000000000 --- a/official/vision/beta/projects/video_ssl/ops/video_ssl_preprocess_ops.py +++ /dev/null @@ -1,405 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utils for customed ops for video ssl.""" - -import functools -from typing import Optional -import tensorflow as tf - - -def random_apply(func, p, x): - """Randomly apply function func to x with probability p.""" - return tf.cond( - tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), - tf.cast(p, tf.float32)), - lambda: func(x), - lambda: x) - - -def random_brightness(image, max_delta): - """Distort brightness of image (SimCLRv2 style).""" - factor = tf.random.uniform( - [], tf.maximum(1.0 - max_delta, 0), 1.0 + max_delta) - image = image * factor - return image - - -def random_solarization(image, p=0.2): - """Random solarize image.""" - def _transform(image): - image = image * tf.cast(tf.less(image, 0.5), dtype=image.dtype) + ( - 1.0 - image) * tf.cast(tf.greater_equal(image, 0.5), dtype=image.dtype) - return image - return random_apply(_transform, p=p, x=image) - - -def to_grayscale(image, keep_channels=True): - """Turn the input image to gray scale. - - Args: - image: The input image tensor. - keep_channels: Whether maintaining the channel number for the image. - If true, the transformed image will repeat three times in channel. - If false, the transformed image will only have one channel. - - Returns: - The distorted image tensor. - """ - image = tf.image.rgb_to_grayscale(image) - if keep_channels: - image = tf.tile(image, [1, 1, 3]) - return image - - -def color_jitter(image, strength, random_order=True): - """Distorts the color of the image (SimCLRv2 style). - - Args: - image: The input image tensor. - strength: The floating number for the strength of the color augmentation. - random_order: A bool, specifying whether to randomize the jittering order. - - Returns: - The distorted image tensor. - """ - brightness = 0.8 * strength - contrast = 0.8 * strength - saturation = 0.8 * strength - hue = 0.2 * strength - if random_order: - return color_jitter_rand( - image, brightness, contrast, saturation, hue) - else: - return color_jitter_nonrand( - image, brightness, contrast, saturation, hue) - - -def color_jitter_nonrand(image, - brightness=0, - contrast=0, - saturation=0, - hue=0): - """Distorts the color of the image (jittering order is fixed, SimCLRv2 style). - - Args: - image: The input image tensor. - brightness: A float, specifying the brightness for color jitter. - contrast: A float, specifying the contrast for color jitter. 
- saturation: A float, specifying the saturation for color jitter. - hue: A float, specifying the hue for color jitter. - - Returns: - The distorted image tensor. - """ - with tf.name_scope('distort_color'): - def apply_transform(i, x, brightness, contrast, saturation, hue): - """Apply the i-th transformation.""" - if brightness != 0 and i == 0: - x = random_brightness(x, max_delta=brightness) - elif contrast != 0 and i == 1: - x = tf.image.random_contrast( - x, lower=1-contrast, upper=1+contrast) - elif saturation != 0 and i == 2: - x = tf.image.random_saturation( - x, lower=1-saturation, upper=1+saturation) - elif hue != 0: - x = tf.image.random_hue(x, max_delta=hue) - return x - - for i in range(4): - image = apply_transform(i, image, brightness, contrast, saturation, hue) - image = tf.clip_by_value(image, 0., 1.) - return image - - -def color_jitter_rand(image, - brightness=0, - contrast=0, - saturation=0, - hue=0): - """Distorts the color of the image (jittering order is random, SimCLRv2 style). - - Args: - image: The input image tensor. - brightness: A float, specifying the brightness for color jitter. - contrast: A float, specifying the contrast for color jitter. - saturation: A float, specifying the saturation for color jitter. - hue: A float, specifying the hue for color jitter. - - Returns: - The distorted image tensor. - """ - with tf.name_scope('distort_color'): - def apply_transform(i, x): - """Apply the i-th transformation.""" - def brightness_transform(): - if brightness == 0: - return x - else: - return random_brightness(x, max_delta=brightness) - def contrast_transform(): - if contrast == 0: - return x - else: - return tf.image.random_contrast(x, lower=1-contrast, upper=1+contrast) - def saturation_transform(): - if saturation == 0: - return x - else: - return tf.image.random_saturation( - x, lower=1-saturation, upper=1+saturation) - def hue_transform(): - if hue == 0: - return x - else: - return tf.image.random_hue(x, max_delta=hue) - # pylint:disable=g-long-lambda - x = tf.cond( - tf.less(i, 2), lambda: tf.cond( - tf.less(i, 1), brightness_transform, contrast_transform), - lambda: tf.cond(tf.less(i, 3), saturation_transform, hue_transform)) - # pylint:disable=g-long-lambda - return x - - perm = tf.random.shuffle(tf.range(4)) - for i in range(4): - image = apply_transform(perm[i], image) - image = tf.clip_by_value(image, 0., 1.) - return image - - -def random_color_jitter_3d(frames): - """Applies temporally consistent color jittering to one video clip. - - Args: - frames: `Tensor` of shape [num_frames, height, width, channels]. - - Returns: - A Tensor of shape [num_frames, height, width, channels] being color jittered - with the same operation. - """ - def random_color_jitter(image, p=1.0): - def _transform(image): - color_jitter_t = functools.partial( - color_jitter, strength=1.0) - image = random_apply(color_jitter_t, p=0.8, x=image) - return random_apply(to_grayscale, p=0.2, x=image) - return random_apply(_transform, p=p, x=image) - - num_frames, width, height, channels = frames.shape.as_list() - big_image = tf.reshape(frames, [num_frames*width, height, channels]) - big_image = random_color_jitter(big_image) - return tf.reshape(big_image, [num_frames, width, height, channels]) - - -def gaussian_blur(image, kernel_size, sigma, padding='SAME'): - """Blurs the given image with separable convolution. - - Args: - image: Tensor of shape [height, width, channels] and dtype float to blur. - kernel_size: Integer Tensor for the size of the blur kernel. 
This is should - be an odd number. If it is an even number, the actual kernel size will be - size + 1. - sigma: Sigma value for gaussian operator. - padding: Padding to use for the convolution. Typically 'SAME' or 'VALID'. - - Returns: - A Tensor representing the blurred image. - """ - radius = tf.cast(kernel_size / 2, dtype=tf.int32) - kernel_size = radius * 2 + 1 - x = tf.cast(tf.range(-radius, radius + 1), dtype=tf.float32) - blur_filter = tf.exp( - -tf.pow(x, 2.0) / (2.0 * tf.pow(tf.cast(sigma, dtype=tf.float32), 2.0))) - blur_filter /= tf.reduce_sum(blur_filter) - # One vertical and one horizontal filter. - blur_v = tf.reshape(blur_filter, [kernel_size, 1, 1, 1]) - blur_h = tf.reshape(blur_filter, [1, kernel_size, 1, 1]) - num_channels = tf.shape(image)[-1] - blur_h = tf.tile(blur_h, [1, 1, num_channels, 1]) - blur_v = tf.tile(blur_v, [1, 1, num_channels, 1]) - expand_batch_dim = image.shape.ndims == 3 - if expand_batch_dim: - # Tensorflow requires batched input to convolutions, which we can fake with - # an extra dimension. - image = tf.expand_dims(image, axis=0) - blurred = tf.nn.depthwise_conv2d( - image, blur_h, strides=[1, 1, 1, 1], padding=padding) - blurred = tf.nn.depthwise_conv2d( - blurred, blur_v, strides=[1, 1, 1, 1], padding=padding) - if expand_batch_dim: - blurred = tf.squeeze(blurred, axis=0) - return blurred - - -def random_blur(image, height, width, p=1.0): - """Randomly blur an image. - - Args: - image: `Tensor` representing an image of arbitrary size. - height: Height of output image. - width: Width of output image. - p: probability of applying this transformation. - - Returns: - A preprocessed image `Tensor`. - """ - del width - def _transform(image): - sigma = tf.random.uniform([], 0.1, 2.0, dtype=tf.float32) - return gaussian_blur( - image, kernel_size=height//10, sigma=sigma, padding='SAME') - return random_apply(_transform, p=p, x=image) - - -def random_blur_3d(frames, height, width, blur_probability=0.5): - """Apply efficient batch data transformations. - - Args: - frames: `Tensor` of shape [timesteps, height, width, 3]. - height: the height of image. - width: the width of image. - blur_probability: the probaility to apply the blur operator. - - Returns: - Preprocessed feature list. - """ - def generate_selector(p, bsz): - shape = [bsz, 1, 1, 1] - selector = tf.cast( - tf.less(tf.random.uniform(shape, 0, 1, dtype=tf.float32), p), - tf.float32) - return selector - - frames_new = random_blur(frames, height, width, p=1.) - selector = generate_selector(blur_probability, 1) - frames = frames_new * selector + frames * (1 - selector) - frames = tf.clip_by_value(frames, 0., 1.) - - return frames - - -def _sample_or_pad_sequence_indices(sequence: tf.Tensor, - num_steps: int, - stride: int, - offset: tf.Tensor) -> tf.Tensor: - """Returns indices to take for sampling or padding sequences to fixed size.""" - sequence_length = tf.shape(sequence)[0] - sel_idx = tf.range(sequence_length) - - # Repeats sequence until num_steps are available in total. - max_length = num_steps * stride + offset - num_repeats = tf.math.floordiv( - max_length + sequence_length - 1, sequence_length) - sel_idx = tf.tile(sel_idx, [num_repeats]) - - steps = tf.range(offset, offset + num_steps * stride, stride) - return tf.gather(sel_idx, steps) - - -def sample_ssl_sequence(sequence: tf.Tensor, - num_steps: int, - random: bool, - stride: int = 1, - num_windows: Optional[int] = 2) -> tf.Tensor: - """Samples two segments of size num_steps randomly from a given sequence. 
- - Currently it only supports images, and specically designed for video self- - supervised learning. - - Args: - sequence: Any tensor where the first dimension is timesteps. - num_steps: Number of steps (e.g. frames) to take. - random: A boolean indicating whether to random sample the single window. If - True, the offset is randomized. Only True is supported. - stride: Distance to sample between timesteps. - num_windows: Number of sequence sampled. - - Returns: - A single Tensor with first dimension num_steps with the sampled segment. - """ - sequence_length = tf.shape(sequence)[0] - sequence_length = tf.cast(sequence_length, tf.float32) - if random: - max_offset = tf.cond( - tf.greater(sequence_length, (num_steps - 1) * stride), - lambda: sequence_length - (num_steps - 1) * stride, - lambda: sequence_length) - - max_offset = tf.cast(max_offset, dtype=tf.float32) - def cdf(k, power=1.0): - """Cumulative distribution function for x^power.""" - p = -tf.math.pow(k, power + 1) / ( - power * tf.math.pow(max_offset, power + 1)) + k * (power + 1) / ( - power * max_offset) - return p - - u = tf.random.uniform(()) - k_low = tf.constant(0, dtype=tf.float32) - k_up = max_offset - k = tf.math.floordiv(max_offset, 2.0) - - c = lambda k_low, k_up, k: tf.greater(tf.math.abs(k_up - k_low), 1.0) - # pylint:disable=g-long-lambda - b = lambda k_low, k_up, k: tf.cond( - tf.greater(cdf(k), u), - lambda: [k_low, k, tf.math.floordiv(k + k_low, 2.0)], - lambda: [k, k_up, tf.math.floordiv(k_up + k, 2.0)]) - - _, _, k = tf.while_loop(c, b, [k_low, k_up, k]) - delta = tf.cast(k, tf.int32) - max_offset = tf.cast(max_offset, tf.int32) - sequence_length = tf.cast(sequence_length, tf.int32) - - choice_1 = tf.cond( - tf.equal(max_offset, sequence_length), - lambda: tf.random.uniform((), - maxval=tf.cast(max_offset, dtype=tf.int32), - dtype=tf.int32), - lambda: tf.random.uniform((), - maxval=tf.cast(max_offset - delta, - dtype=tf.int32), - dtype=tf.int32)) - choice_2 = tf.cond( - tf.equal(max_offset, sequence_length), - lambda: tf.random.uniform((), - maxval=tf.cast(max_offset, dtype=tf.int32), - dtype=tf.int32), - lambda: choice_1 + delta) - # pylint:disable=g-long-lambda - shuffle_choice = tf.random.shuffle((choice_1, choice_2)) - offset_1 = shuffle_choice[0] - offset_2 = shuffle_choice[1] - - else: - raise NotImplementedError - - indices_1 = _sample_or_pad_sequence_indices( - sequence=sequence, - num_steps=num_steps, - stride=stride, - offset=offset_1) - - indices_2 = _sample_or_pad_sequence_indices( - sequence=sequence, - num_steps=num_steps, - stride=stride, - offset=offset_2) - - indices = tf.concat([indices_1, indices_2], axis=0) - indices.set_shape((num_windows * num_steps,)) - output = tf.gather(sequence, indices) - - return output diff --git a/official/vision/beta/projects/video_ssl/ops/video_ssl_preprocess_ops_test.py b/official/vision/beta/projects/video_ssl/ops/video_ssl_preprocess_ops_test.py deleted file mode 100644 index 4cf667d31..000000000 --- a/official/vision/beta/projects/video_ssl/ops/video_ssl_preprocess_ops_test.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import tensorflow as tf -from official.vision.beta.ops import preprocess_ops_3d -from official.vision.beta.projects.video_ssl.ops import video_ssl_preprocess_ops - - -class VideoSslPreprocessOpsTest(tf.test.TestCase): - - def setUp(self): - super().setUp() - self._raw_frames = tf.random.uniform((250, 256, 256, 3), minval=0, - maxval=255, dtype=tf.dtypes.int32) - self._sampled_frames = self._raw_frames[:16] - self._frames = preprocess_ops_3d.normalize_image( - self._sampled_frames, False, tf.float32) - - def test_sample_ssl_sequence(self): - sampled_seq = video_ssl_preprocess_ops.sample_ssl_sequence( - self._raw_frames, 16, True, 2) - self.assertAllEqual(sampled_seq.shape, (32, 256, 256, 3)) - - def test_random_color_jitter_3d(self): - jittered_clip = video_ssl_preprocess_ops.random_color_jitter_3d( - self._frames) - self.assertAllEqual(jittered_clip.shape, (16, 256, 256, 3)) - - def test_random_blur_3d(self): - blurred_clip = video_ssl_preprocess_ops.random_blur_3d( - self._frames, 256, 256) - self.assertAllEqual(blurred_clip.shape, (16, 256, 256, 3)) - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/projects/video_ssl/tasks/__init__.py b/official/vision/beta/projects/video_ssl/tasks/__init__.py deleted file mode 100644 index 89b2e6626..000000000 --- a/official/vision/beta/projects/video_ssl/tasks/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tasks package definition.""" - -from official.vision.beta.projects.video_ssl.tasks import linear_eval -from official.vision.beta.projects.video_ssl.tasks import pretrain diff --git a/official/vision/beta/projects/video_ssl/tasks/linear_eval.py b/official/vision/beta/projects/video_ssl/tasks/linear_eval.py deleted file mode 100644 index 46263f660..000000000 --- a/official/vision/beta/projects/video_ssl/tasks/linear_eval.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Video ssl linear evaluation task definition.""" -from typing import Any, Optional, List, Tuple -from absl import logging -import tensorflow as tf - -# pylint: disable=unused-import -from official.core import task_factory -from official.vision.beta.projects.video_ssl.configs import video_ssl as exp_cfg -from official.vision.beta.projects.video_ssl.modeling import video_ssl_model -from official.vision.beta.tasks import video_classification - - -@task_factory.register_task_cls(exp_cfg.VideoSSLEvalTask) -class VideoSSLEvalTask(video_classification.VideoClassificationTask): - """A task for video ssl linear evaluation.""" - - def initialize(self, model: tf.keras.Model): - """Loading pretrained checkpoint.""" - if not self.task_config.init_checkpoint: - return - - ckpt_dir_or_file = self.task_config.init_checkpoint - if tf.io.gfile.isdir(ckpt_dir_or_file): - ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) - - # Restoring checkpoint. - if self.task_config.init_checkpoint_modules == 'backbone': - ckpt = tf.train.Checkpoint(backbone=model.backbone) - ckpt.read(ckpt_dir_or_file) - else: - raise NotImplementedError - - logging.info('Finished loading pretrained checkpoint from %s', - ckpt_dir_or_file) - - def train_step(self, - inputs: Tuple[Any, Any], - model: tf.keras.Model, - optimizer: tf.keras.optimizers.Optimizer, - metrics: Optional[List[Any]] = None): - """Does forward and backward. - - Args: - inputs: a dictionary of input tensors. - model: the model, forward pass definition. - optimizer: the optimizer for this training step. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. - """ - model.backbone.trainable = False - logging.info('Setting the backbone to non-trainable.') - - return super(video_classification.VideoClassificationTask, - self).train_step(inputs, model, optimizer, metrics) diff --git a/official/vision/beta/projects/video_ssl/tasks/pretrain.py b/official/vision/beta/projects/video_ssl/tasks/pretrain.py deleted file mode 100644 index 942e8cab9..000000000 --- a/official/vision/beta/projects/video_ssl/tasks/pretrain.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Video ssl pretrain task definition.""" -from absl import logging -import tensorflow as tf - -# pylint: disable=unused-import -from official.core import input_reader -from official.core import task_factory -from official.vision.beta.modeling import factory_3d -from official.vision.beta.projects.video_ssl.configs import video_ssl as exp_cfg -from official.vision.beta.projects.video_ssl.dataloaders import video_ssl_input -from official.vision.beta.projects.video_ssl.losses import losses -from official.vision.beta.projects.video_ssl.modeling import video_ssl_model -from official.vision.beta.tasks import video_classification - - -@task_factory.register_task_cls(exp_cfg.VideoSSLPretrainTask) -class VideoSSLPretrainTask(video_classification.VideoClassificationTask): - """A task for video ssl pretraining.""" - - def build_model(self): - """Builds video ssl pretraining model.""" - common_input_shape = [ - d1 if d1 == d2 else None - for d1, d2 in zip(self.task_config.train_data.feature_shape, - self.task_config.validation_data.feature_shape) - ] - input_specs = tf.keras.layers.InputSpec(shape=[None] + common_input_shape) - logging.info('Build model input %r', common_input_shape) - - model = factory_3d.build_model( - self.task_config.model.model_type, - input_specs=input_specs, - model_config=self.task_config.model, - num_classes=self.task_config.train_data.num_classes) - return model - - def _get_decoder_fn(self, params): - decoder = video_ssl_input.Decoder() - return decoder.decode - - def build_inputs(self, params: exp_cfg.DataConfig, input_context=None): - """Builds classification input.""" - - parser = video_ssl_input.Parser(input_params=params) - postprocess_fn = video_ssl_input.PostBatchProcessor(params) - - reader = input_reader.InputReader( - params, - dataset_fn=self._get_dataset_fn(params), - decoder_fn=self._get_decoder_fn(params), - parser_fn=parser.parse_fn(params.is_training), - postprocess_fn=postprocess_fn) - - dataset = reader.read(input_context=input_context) - - return dataset - - def build_losses(self, model_outputs, num_replicas, model): - """Sparse categorical cross entropy loss. - - Args: - model_outputs: Output logits of the model. - num_replicas: distributed replica number. - model: keras model for calculating weight decay. - - Returns: - The total loss tensor. 
- """ - all_losses = {} - contrastive_metrics = {} - losses_config = self.task_config.losses - total_loss = None - contrastive_loss_dict = losses.contrastive_loss( - model_outputs, num_replicas, losses_config.normalize_hidden, - losses_config.temperature, model, - self.task_config.losses.l2_weight_decay) - total_loss = contrastive_loss_dict['total_loss'] - all_losses.update({ - 'total_loss': total_loss - }) - all_losses[self.loss] = total_loss - contrastive_metrics.update({ - 'contrast_acc': contrastive_loss_dict['contrast_acc'], - 'contrast_entropy': contrastive_loss_dict['contrast_entropy'], - 'reg_loss': contrastive_loss_dict['reg_loss'] - }) - return all_losses, contrastive_metrics - - def build_metrics(self, training=True): - """Gets streaming metrics for training/validation.""" - metrics = [ - tf.keras.metrics.Mean(name='contrast_acc'), - tf.keras.metrics.Mean(name='contrast_entropy'), - tf.keras.metrics.Mean(name='reg_loss') - ] - return metrics - - def process_metrics(self, metrics, contrastive_metrics): - """Process and update metrics.""" - contrastive_metric_values = contrastive_metrics.values() - for metric, contrastive_metric_value in zip(metrics, - contrastive_metric_values): - metric.update_state(contrastive_metric_value) - - def train_step(self, inputs, model, optimizer, metrics=None): - """Does forward and backward. - - Args: - inputs: a dictionary of input tensors. - model: the model, forward pass definition. - optimizer: the optimizer for this training step. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. - """ - features, _ = inputs - - num_replicas = tf.distribute.get_strategy().num_replicas_in_sync - with tf.GradientTape() as tape: - if self.task_config.train_data.output_audio: - outputs = model(features, training=True) - else: - outputs = model(features['image'], training=True) - # Casting output layer as float32 is necessary when mixed_precision is - # mixed_float16 or mixed_bfloat16 to ensure output is casted as float32. - outputs = tf.nest.map_structure( - lambda x: tf.cast(x, tf.float32), outputs) - - all_losses, contrastive_metrics = self.build_losses( - model_outputs=outputs, num_replicas=num_replicas, - model=model) - loss = all_losses[self.loss] - scaled_loss = loss - - # For mixed_precision policy, when LossScaleOptimizer is used, loss is - # scaled for numerical stability. - if isinstance( - optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - scaled_loss = optimizer.get_scaled_loss(scaled_loss) - - tvars = model.trainable_variables - grads = tape.gradient(scaled_loss, tvars) - # Scales back gradient before apply_gradients when LossScaleOptimizer is - # used. - if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - grads = optimizer.get_unscaled_gradients(grads) - optimizer.apply_gradients(list(zip(grads, tvars))) - - logs = all_losses - if metrics: - self.process_metrics(metrics, contrastive_metrics) - logs.update({m.name: m.result() for m in metrics}) - return logs - - def validation_step(self, inputs, model, metrics=None): - """Validatation step. - - Args: - inputs: a dictionary of input tensors. - model: the keras.Model. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. 
- """ - raise NotImplementedError - - def inference_step(self, features, model): - """Performs the forward step.""" - raise NotImplementedError diff --git a/official/vision/beta/projects/video_ssl/tasks/pretrain_test.py b/official/vision/beta/projects/video_ssl/tasks/pretrain_test.py deleted file mode 100644 index 4bbb468f6..000000000 --- a/official/vision/beta/projects/video_ssl/tasks/pretrain_test.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import functools -import os -import random - -import orbit -import tensorflow as tf - -# pylint: disable=unused-import -from official.core import exp_factory -from official.core import task_factory -from official.modeling import optimization -from official.vision import beta -from official.vision.beta.dataloaders import tfexample_utils -from official.vision.beta.projects.video_ssl.tasks import pretrain - - -class VideoClassificationTaskTest(tf.test.TestCase): - - def setUp(self): - super(VideoClassificationTaskTest, self).setUp() - data_dir = os.path.join(self.get_temp_dir(), 'data') - tf.io.gfile.makedirs(data_dir) - self._data_path = os.path.join(data_dir, 'data.tfrecord') - # pylint: disable=g-complex-comprehension - examples = [ - tfexample_utils.make_video_test_example( - image_shape=(36, 36, 3), - audio_shape=(20, 128), - label=random.randint(0, 100)) for _ in range(2) - ] - # pylint: enable=g-complex-comprehension - tfexample_utils.dump_to_tfrecord(self._data_path, tf_examples=examples) - - def test_task(self): - config = exp_factory.get_exp_config('video_ssl_pretrain_kinetics600') - config.task.train_data.global_batch_size = 2 - config.task.train_data.input_path = self._data_path - - task = pretrain.VideoSSLPretrainTask( - config.task) - model = task.build_model() - metrics = task.build_metrics() - strategy = tf.distribute.get_strategy() - - dataset = orbit.utils.make_distributed_dataset( - strategy, - functools.partial(task.build_inputs), - config.task.train_data) - - iterator = iter(dataset) - opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config) - optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate()) - logs = task.train_step(next(iterator), model, optimizer, metrics=metrics) - self.assertIn('total_loss', logs) - self.assertIn('reg_loss', logs) - self.assertIn('contrast_acc', logs) - self.assertIn('contrast_entropy', logs) - - def test_task_factory(self): - config = exp_factory.get_exp_config('video_ssl_pretrain_kinetics600') - task = task_factory.get_task(config.task) - self.assertIs(type(task), pretrain.VideoSSLPretrainTask) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/projects/video_ssl/train.py b/official/vision/beta/projects/video_ssl/train.py deleted file mode 100644 index 8db5e48a6..000000000 --- a/official/vision/beta/projects/video_ssl/train.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Training driver.""" - -from absl import app -from absl import flags -import gin - -# pylint: disable=unused-import -from official.common import registry_imports -from official.common import distribute_utils -from official.common import flags as tfm_flags -from official.core import task_factory -from official.core import train_lib -from official.core import train_utils -from official.modeling import performance -from official.vision.beta.projects.video_ssl.modeling import video_ssl_model -from official.vision.beta.projects.video_ssl.tasks import linear_eval -from official.vision.beta.projects.video_ssl.tasks import pretrain -# pylint: disable=unused-import - -FLAGS = flags.FLAGS - - -def main(_): - gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params) - params = train_utils.parse_configuration(FLAGS) - model_dir = FLAGS.model_dir - if 'train' in FLAGS.mode: - # Pure eval modes do not output yaml files. Otherwise continuous eval job - # may race against the train job for writing the same file. - train_utils.serialize_config(params, model_dir) - - if 'train_and_eval' in FLAGS.mode: - assert (params.task.train_data.feature_shape == - params.task.validation_data.feature_shape), ( - f'train {params.task.train_data.feature_shape} != validate ' - f'{params.task.validation_data.feature_shape}') - - # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16' - # can have significant impact on model speeds by utilizing float16 in case of - # GPUs, and bfloat16 in the case of TPUs. 
loss_scale takes effect only when - # dtype is float16 - if params.runtime.mixed_precision_dtype: - performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype) - distribution_strategy = distribute_utils.get_distribution_strategy( - distribution_strategy=params.runtime.distribution_strategy, - all_reduce_alg=params.runtime.all_reduce_alg, - num_gpus=params.runtime.num_gpus, - tpu_address=params.runtime.tpu) - with distribution_strategy.scope(): - task = task_factory.get_task(params.task, logging_dir=model_dir) - - train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode=FLAGS.mode, - params=params, - model_dir=model_dir) - - train_utils.save_gin_config(FLAGS.mode, model_dir) - -if __name__ == '__main__': - tfm_flags.define_flags() - app.run(main) -- GitLab From 244b66bb842dd06eaa8fddb753a53f2731a3b0a2 Mon Sep 17 00:00:00 2001 From: Jiayu Ye Date: Tue, 22 Mar 2022 21:50:26 -0700 Subject: [PATCH 30/54] Internal change PiperOrigin-RevId: 436647758 --- official/nlp/modeling/models/t5.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/official/nlp/modeling/models/t5.py b/official/nlp/modeling/models/t5.py index 236cc164d..c7bc2e1c8 100644 --- a/official/nlp/modeling/models/t5.py +++ b/official/nlp/modeling/models/t5.py @@ -1004,6 +1004,7 @@ class T5TransformerParams: num_heads: int d_ff: int vocab_size: int + target_vocab_size: Optional[int] = None dropout_rate: float = 0.0 layer_norm_epsilon: float = 1e-6 shared_embedding: bool = False @@ -1159,11 +1160,15 @@ class Decoder(Module): self.compute_dtype = compute_dtype if self.config.num_decoder_layers is None: self.config.num_decoder_layers = self.config.num_layers + if not hasattr( + self.config, + "target_vocab_size") or self.config.target_vocab_size is None: + self.config.target_vocab_size = self.config.vocab_size with self.name_scope: # Target Embedding. if shared_embedding is None: self.target_embed = Embed( - vocab_size=self.config.vocab_size, + vocab_size=self.config.target_vocab_size, features=self.config.d_model, embeddings_initializer=self.config.vocab_embeddings_initializer, dtype=self.dtype, @@ -1211,7 +1216,7 @@ class Decoder(Module): if not self.config.logits_via_embedding: self.logits_dense = Linear( in_features=self.config.d_model, - out_features=self.config.vocab_size, + out_features=self.config.target_vocab_size, use_bias=False, dtype=self.dtype, name="logits") -- GitLab From 67e6ddcdb43888e319014d279daf6bff32299f45 Mon Sep 17 00:00:00 2001 From: Vishnuvardhan Janapati <46058173+jvishnuvardhan@users.noreply.github.com> Date: Tue, 22 Mar 2022 23:11:14 -0700 Subject: [PATCH 31/54] Create CODE_OF_CONDUCT.md (#10544) Adding a new CODE_OF_CONDUCT.md to models repository --- CODE_OF_CONDUCT.md | 79 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..2ec7ff4d2 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,79 @@ +# TensorFlow-models Code of Conduct + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and our +community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, gender identity and expression, level of +experience, nationality, personal appearance, race, religion, or sexual identity +and orientation. 
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+* Using welcoming and inclusive language.
+* Being respectful of differing viewpoints and experiences.
+* Gracefully accepting constructive criticism.
+* Focusing on what is best for the community.
+* Showing empathy towards other community members.
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances.
+* Trolling, insulting/derogatory comments, and personal or political attacks.
+* Public or private harassment.
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission.
+* Conduct which could reasonably be considered inappropriate for the forum in
+  which it occurs.
+
+All TensorFlow-models forums and spaces are meant for professional interactions, and any behavior which could reasonably be considered inappropriate in a professional setting is unacceptable.
+
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
+
+
+## Scope
+
+This Code of Conduct applies to all content on tensorflow.org, TensorFlow-models GitHub organization, or any other official TensorFlow-models web presence allowing for community interactions, as well as at all official TensorFlow-models events, whether offline or online.
+
+The Code of Conduct also applies within project spaces and in public spaces whenever an individual is representing TensorFlow-models or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed or de facto representative at an online or offline event.
+
+
+## Conflict Resolution
+
+Conflicts in an open source project can take many forms, from someone having a bad day and using harsh and hurtful language in the issue queue, to more serious instances such as sexist/racist statements or threats of violence, and everything in between.
+
+If the behavior is threatening or harassing, or for other reasons requires immediate escalation, please see below.
+
+However, for the vast majority of issues, we aim to empower individuals to first resolve conflicts themselves, asking for help when needed, and only after that fails to escalate further. This approach gives people more control over the outcome of their dispute.
+
+If you are experiencing or witnessing conflict, we ask you to use the following escalation strategy to address the conflict:
+
+1. Address the perceived conflict directly with those involved, preferably in a
+   real-time medium.
+2. If this fails, get a third party (e.g. a mutual friend, and/or someone with
+   background on the issue, but not involved in the conflict) to intercede.
+3. If you are still unable to resolve the conflict, and you believe it rises to
+   harassment or another code of conduct violation, report it.
+ +## Reporting Violations + +Violations of the Code of Conduct can be reported to TensorFlow’s Project Stewards, Thea Lamkin (thealamkin@google.com) and Joana Carrasqueira (joanafilipa@google.com). The Project Steward will determine whether the Code of Conduct was violated, and will issue an appropriate sanction, possibly including a written warning or expulsion from the project, project sponsored spaces, or project forums. We ask that you make a good-faith effort to resolve your conflict via the conflict resolution policy before submitting a report. + +Violations of the Code of Conduct can occur in any setting, even those unrelated to the project. We will only consider complaints about conduct that has occurred within one year of the report. + + +## Enforcement + +If the Project Stewards receive a report alleging a violation of the Code of Conduct, the Project Stewards will notify the accused of the report, and provide them an opportunity to discuss the report before a sanction is issued. The Project Stewards will do their utmost to keep the reporter anonymous. If the act is ongoing (such as someone engaging in harassment), or involves a threat to anyone's safety (e.g. threats of violence), the Project Stewards may issue sanctions without notice. + + +## Attribution + +This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at https://contributor-covenant.org/version/1/4, and includes some aspects of the Geek Feminism Code of Conduct and the Drupal Code of Conduct. -- GitLab From f0f9bb10d710d387b46596c6f92a57aad442ce71 Mon Sep 17 00:00:00 2001 From: Vishnuvardhan Janapati <46058173+jvishnuvardhan@users.noreply.github.com> Date: Tue, 22 Mar 2022 23:11:38 -0700 Subject: [PATCH 32/54] Create SECURITY.md (#10546) Adding `SECURITY.md` to model repository as it is a way to inform the user about any security vulnerabilities and how to report in case if they notice any vulnerability. --- SECURITY.md | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..4b71ec02b --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,251 @@ +# Using TensorFlow Securely + +This document discusses how to safely deal with untrusted programs (models or +model parameters), and input data. Below, we also provide guidelines on how to +report vulnerabilities in TensorFlow. + +## TensorFlow models are programs + +TensorFlow's runtime system interprets and executes programs. What machine +learning practitioners term +[**models**](https://developers.google.com/machine-learning/glossary/#model) are +expressed as programs that TensorFlow executes. TensorFlow programs are encoded +as computation +[**graphs**](https://developers.google.com/machine-learning/glossary/#graph). +The model's parameters are often stored separately in **checkpoints**. + +At runtime, TensorFlow executes the computation graph using the parameters +provided. Note that the behavior of the computation graph may change +depending on the parameters provided. TensorFlow itself is not a sandbox. When +executing the computation graph, TensorFlow may read and write files, send and +receive data over the network, and even spawn additional processes. All these +tasks are performed with the permissions of the TensorFlow process. Allowing +for this flexibility makes for a powerful machine learning platform, +but it has implications for security. + +The computation graph may also accept **inputs**. 
Those inputs are the +data you supply to TensorFlow to train a model, or to use a model to run +inference on the data. + +**TensorFlow models are programs, and need to be treated as such from a security +perspective.** + +## Running untrusted models + +As a general rule: **Always** execute untrusted models inside a sandbox (e.g., +[nsjail](https://github.com/google/nsjail)). + +There are several ways in which a model could become untrusted. Obviously, if an +untrusted party supplies TensorFlow kernels, arbitrary code may be executed. +The same is true if the untrusted party provides Python code, such as the +Python code that generates TensorFlow graphs. + +Even if the untrusted party only supplies the serialized computation +graph (in form of a `GraphDef`, `SavedModel`, or equivalent on-disk format), the +set of computation primitives available to TensorFlow is powerful enough that +you should assume that the TensorFlow process effectively executes arbitrary +code. One common solution is to allow only a few safe Ops. While this is +possible in theory, we still recommend you sandbox the execution. + +It depends on the computation graph whether a user provided checkpoint is safe. +It is easily possible to create computation graphs in which malicious +checkpoints can trigger unsafe behavior. For example, consider a graph that +contains a `tf.cond` depending on the value of a `tf.Variable`. One branch of +the `tf.cond` is harmless, but the other is unsafe. Since the `tf.Variable` is +stored in the checkpoint, whoever provides the checkpoint now has the ability to +trigger unsafe behavior, even though the graph is not under their control. + +In other words, graphs can contain vulnerabilities of their own. To allow users +to provide checkpoints to a model you run on their behalf (e.g., in order to +compare model quality for a fixed model architecture), you must carefully audit +your model, and we recommend you run the TensorFlow process in a sandbox. + +## Accepting untrusted Inputs + +It is possible to write models that are secure in a sense that they can safely +process untrusted inputs assuming there are no bugs. There are two main reasons +to not rely on this: First, it is easy to write models which must not be exposed +to untrusted inputs, and second, there are bugs in any software system of +sufficient complexity. Letting users control inputs could allow them to trigger +bugs either in TensorFlow or in dependent libraries. + +In general, it is good practice to isolate parts of any system which is exposed +to untrusted (e.g., user-provided) inputs in a sandbox. + +A useful analogy to how any TensorFlow graph is executed is any interpreted +programming language, such as Python. While it is possible to write secure +Python code which can be exposed to user supplied inputs (by, e.g., carefully +quoting and sanitizing input strings, size-checking input blobs, etc.), it is +very easy to write Python programs which are insecure. Even secure Python code +could be rendered insecure by a bug in the Python interpreter, or in a bug in a +Python library used (e.g., +[this one](https://www.cvedetails.com/cve/CVE-2017-12852/)). + +## Running a TensorFlow server + +TensorFlow is a platform for distributed computing, and as such there is a +TensorFlow server (`tf.train.Server`). **The TensorFlow server is meant for +internal communication only. 
It is not built for use in an untrusted network.** + +For performance reasons, the default TensorFlow server does not include any +authorization protocol and sends messages unencrypted. It accepts connections +from anywhere, and executes the graphs it is sent without performing any checks. +Therefore, if you run a `tf.train.Server` in your network, anybody with +access to the network can execute what you should consider arbitrary code with +the privileges of the process running the `tf.train.Server`. + +When running distributed TensorFlow, you must isolate the network in which the +cluster lives. Cloud providers provide instructions for setting up isolated +networks, which are sometimes branded as "virtual private cloud." Refer to the +instructions for +[GCP](https://cloud.google.com/compute/docs/networks-and-firewalls) and +[AWS](https://aws.amazon.com/vpc/)) for details. + +Note that `tf.train.Server` is different from the server created by +`tensorflow/serving` (the default binary for which is called `ModelServer`). +By default, `ModelServer` also has no built-in mechanism for authentication. +Connecting it to an untrusted network allows anyone on this network to run the +graphs known to the `ModelServer`. This means that an attacker may run +graphs using untrusted inputs as described above, but they would not be able to +execute arbitrary graphs. It is possible to safely expose a `ModelServer` +directly to an untrusted network, **but only if the graphs it is configured to +use have been carefully audited to be safe**. + +Similar to best practices for other servers, we recommend running any +`ModelServer` with appropriate privileges (i.e., using a separate user with +reduced permissions). In the spirit of defense in depth, we recommend +authenticating requests to any TensorFlow server connected to an untrusted +network, as well as sandboxing the server to minimize the adverse effects of +any breach. + +## Vulnerabilities in TensorFlow + +TensorFlow is a large and complex system. It also depends on a large set of +third party libraries (e.g., `numpy`, `libjpeg-turbo`, PNG parsers, `protobuf`). +It is possible that TensorFlow or its dependent libraries contain +vulnerabilities that would allow triggering unexpected or dangerous behavior +with specially crafted inputs. + +### What is a vulnerability? + +Given TensorFlow's flexibility, it is possible to specify computation graphs +which exhibit unexpected or unwanted behavior. The fact that TensorFlow models +can perform arbitrary computations means that they may read and write files, +communicate via the network, produce deadlocks and infinite loops, or run out +of memory. It is only when these behaviors are outside the specifications of the +operations involved that such behavior is a vulnerability. + +A `FileWriter` writing a file is not unexpected behavior and therefore is not a +vulnerability in TensorFlow. A `MatMul` allowing arbitrary binary code execution +**is** a vulnerability. + +This is more subtle from a system perspective. For example, it is easy to cause +a TensorFlow process to try to allocate more memory than available by specifying +a computation graph containing an ill-considered `tf.tile` operation. TensorFlow +should exit cleanly in this case (it would raise an exception in Python, or +return an error `Status` in C++). However, if the surrounding system is not +expecting the possibility, such behavior could be used in a denial of service +attack (or worse). 
Because TensorFlow behaves correctly, this is not a
+vulnerability in TensorFlow (although it would be a vulnerability of this
+hypothetical system).
+
+As a general rule, it is incorrect behavior for TensorFlow to access memory it
+does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to
+such behaviors constitute a vulnerability.
+
+One of the most critical parts of any system is input handling. If malicious
+input can trigger side effects or incorrect behavior, this is a bug, and likely
+a vulnerability.
+
+### Reporting vulnerabilities
+
+Please email reports about any security-related issues you find to
+`security@tensorflow.org`. This mail is delivered to a small security team. Your
+email will be acknowledged within one business day, and you'll receive a more
+detailed response to your email within 7 days indicating the next steps in
+handling your report. For critical problems, you may encrypt your report (see
+below).
+
+Please use a descriptive subject line for your report email. After the initial
+reply to your report, the security team will endeavor to keep you informed of
+the progress being made towards a fix and announcement.
+
+In addition, please include the following information along with your report:
+
+* Your name and affiliation (if any).
+* A description of the technical details of the vulnerabilities. It is very
+  important to let us know how we can reproduce your findings.
+* An explanation of who can exploit this vulnerability, and what they gain when
+  doing so -- write an attack scenario. This will help us evaluate your report
+  quickly, especially if the issue is complex.
+* Whether this vulnerability is public or known to third parties. If it is, please
+  provide details.
+
+If you believe that an existing (public) issue is security-related, please send
+an email to `security@tensorflow.org`. The email should include the issue ID and
+a short description of why it should be handled according to this security
+policy.
+
+Once an issue is reported, TensorFlow uses the following disclosure process:
+
+* When a report is received, we confirm the issue and determine its severity.
+* If we know of specific third-party services or software based on TensorFlow
+  that require mitigation before publication, those projects will be notified.
+* An advisory is prepared (but not published) which details the problem and
+  steps for mitigation.
+* The vulnerability is fixed and potential workarounds are identified.
+* Wherever possible, the fix is also prepared for the branches corresponding to
+  all releases of TensorFlow at most one year old. We will attempt to commit
+  these fixes as soon as possible, and as close together as possible.
+* Patch releases are published for all fixed released versions, a
+  notification is sent to discuss@tensorflow.org, and the advisory is published.
+
+Note that we mostly do patch releases for security reasons and each version of
+TensorFlow is supported for only 1 year after the release.
+
+Past security advisories are listed below. We credit reporters for identifying
+security issues, although we keep your name confidential if you request it.
+
+#### Encryption key for `security@tensorflow.org`
+
+If your disclosure is extremely sensitive, you may choose to encrypt your
+report using the key below. Please only use this for critical security
+reports.
+ +``` +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQENBFpqdzwBCADTeAHLNEe9Vm77AxhmGP+CdjlY84O6DouOCDSq00zFYdIU/7aI +LjYwhEmDEvLnRCYeFGdIHVtW9YrVktqYE9HXVQC7nULU6U6cvkQbwHCdrjaDaylP +aJUXkNrrxibhx9YYdy465CfusAaZ0aM+T9DpcZg98SmsSml/HAiiY4mbg/yNVdPs +SEp/Ui4zdIBNNs6at2gGZrd4qWhdM0MqGJlehqdeUKRICE/mdedXwsWLM8AfEA0e +OeTVhZ+EtYCypiF4fVl/NsqJ/zhBJpCx/1FBI1Uf/lu2TE4eOS1FgmIqb2j4T+jY +e+4C8kGB405PAC0n50YpOrOs6k7fiQDjYmbNABEBAAG0LVRlbnNvckZsb3cgU2Vj +dXJpdHkgPHNlY3VyaXR5QHRlbnNvcmZsb3cub3JnPokBTgQTAQgAOBYhBEkvXzHm +gOJBnwP4Wxnef3wVoM2yBQJaanc8AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheA +AAoJEBnef3wVoM2yNlkIAICqetv33MD9W6mPAXH3eon+KJoeHQHYOuwWfYkUF6CC +o+X2dlPqBSqMG3bFuTrrcwjr9w1V8HkNuzzOJvCm1CJVKaxMzPuXhBq5+DeT67+a +T/wK1L2R1bF0gs7Pp40W3np8iAFEh8sgqtxXvLGJLGDZ1Lnfdprg3HciqaVAiTum +HBFwszszZZ1wAnKJs5KVteFN7GSSng3qBcj0E0ql2nPGEqCVh+6RG/TU5C8gEsEf +3DX768M4okmFDKTzLNBm+l08kkBFt+P43rNK8dyC4PXk7yJa93SmS/dlK6DZ16Yw +2FS1StiZSVqygTW59rM5XNwdhKVXy2mf/RtNSr84gSi5AQ0EWmp3PAEIALInfBLR +N6fAUGPFj+K3za3PeD0fWDijlC9f4Ety/icwWPkOBdYVBn0atzI21thPRbfuUxfe +zr76xNNrtRRlbDSAChA1J5T86EflowcQor8dNC6fS+oHFCGeUjfEAm16P6mGTo0p +osdG2XnnTHOOEFbEUeWOwR/zT0QRaGGknoy2pc4doWcJptqJIdTl1K8xyBieik/b +nSoClqQdZJa4XA3H9G+F4NmoZGEguC5GGb2P9NHYAJ3MLHBHywZip8g9oojIwda+ +OCLL4UPEZ89cl0EyhXM0nIAmGn3Chdjfu3ebF0SeuToGN8E1goUs3qSE77ZdzIsR +BzZSDFrgmZH+uP0AEQEAAYkBNgQYAQgAIBYhBEkvXzHmgOJBnwP4Wxnef3wVoM2y +BQJaanc8AhsMAAoJEBnef3wVoM2yX4wIALcYZbQhSEzCsTl56UHofze6C3QuFQIH +J4MIKrkTfwiHlCujv7GASGU2Vtis5YEyOoMidUVLlwnebE388MmaJYRm0fhYq6lP +A3vnOCcczy1tbo846bRdv012zdUA+wY+mOITdOoUjAhYulUR0kiA2UdLSfYzbWwy +7Obq96Jb/cPRxk8jKUu2rqC/KDrkFDtAtjdIHh6nbbQhFuaRuWntISZgpIJxd8Bt +Gwi0imUVd9m9wZGuTbDGi6YTNk0GPpX5OMF5hjtM/objzTihSw9UN+65Y/oSQM81 +v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc= +=CDME +-----END PGP PUBLIC KEY BLOCK----- +``` + +### Known Vulnerabilities + +At this time there are no known vulnerability with TensorFlow-models. For a list of known vulnerabilities and security advisories for TensorFlow, +[click here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/README.md). -- GitLab From 4698b1cd468ad687fb901361206d61e545ba82a5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 23 Mar 2022 09:42:45 -0700 Subject: [PATCH 33/54] Internal change PiperOrigin-RevId: 436761233 --- official/projects/backbone_reuse/README.md | 5 +++++ official/vision/configs/maskrcnn.py | 4 ++++ official/vision/tasks/maskrcnn.py | 5 +++++ 3 files changed, 14 insertions(+) create mode 100644 official/projects/backbone_reuse/README.md diff --git a/official/projects/backbone_reuse/README.md b/official/projects/backbone_reuse/README.md new file mode 100644 index 000000000..371de30f6 --- /dev/null +++ b/official/projects/backbone_reuse/README.md @@ -0,0 +1,5 @@ +# Proper Reuse of Image Classification Features Improves Object Detection + +Coming soon +1. CVPR 2022 paper +2. Table of results \ No newline at end of file diff --git a/official/vision/configs/maskrcnn.py b/official/vision/configs/maskrcnn.py index 137b168b7..768d87121 100644 --- a/official/vision/configs/maskrcnn.py +++ b/official/vision/configs/maskrcnn.py @@ -214,6 +214,10 @@ class MaskRCNNTask(cfg.TaskConfig): # If set, the Waymo Open Dataset evaluator would be used. use_wod_metrics: bool = False + # If set, freezes the backbone during training. + # TODO(crisnv) Add paper link when available. 
+ freeze_backbone: bool = False + COCO_INPUT_PATH_BASE = 'coco' diff --git a/official/vision/tasks/maskrcnn.py b/official/vision/tasks/maskrcnn.py index c7169c4a3..3aed67363 100644 --- a/official/vision/tasks/maskrcnn.py +++ b/official/vision/tasks/maskrcnn.py @@ -82,10 +82,15 @@ class MaskRCNNTask(base_task.Task): input_specs=input_specs, model_config=self.task_config.model, l2_regularizer=l2_regularizer) + + if self.task_config.freeze_backbone: + model.backbone.trainable = False + return model def initialize(self, model: tf.keras.Model): """Loading pretrained checkpoint.""" + if not self.task_config.init_checkpoint: return -- GitLab From 5964641878773190ca89be15d6013db7b738c525 Mon Sep 17 00:00:00 2001 From: Yeqing Li Date: Wed, 23 Mar 2022 10:03:39 -0700 Subject: [PATCH 34/54] Move some documents out of beta. PiperOrigin-RevId: 436766118 --- official/README.md | 20 +++---- official/vision/{beta => }/MODEL_GARDEN.md | 66 +++++++++++----------- official/vision/{beta => }/README.md | 0 3 files changed, 43 insertions(+), 43 deletions(-) rename official/vision/{beta => }/MODEL_GARDEN.md (75%) rename official/vision/{beta => }/README.md (100%) diff --git a/official/README.md b/official/README.md index 06f09d4e5..89256d2f9 100644 --- a/official/README.md +++ b/official/README.md @@ -41,21 +41,21 @@ In the near future, we will add: | Model | Reference (Paper) | |-------|-------------------| -| [MNIST](vision/image_classification) | A basic model to classify digits from the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) | -| [ResNet](vision/beta/MODEL_GARDEN.md) | [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) | -| [ResNet-RS](vision/beta/MODEL_GARDEN.md) | [Revisiting ResNets: Improved Training and Scaling Strategies](https://arxiv.org/abs/2103.07579) | -| [EfficientNet](vision/image_classification) | [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) | -| [Vision Transformer](vision/beta/MODEL_GARDEN.md) | [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) | +| [MNIST](legacy/image_classification) | A basic model to classify digits from the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) | +| [ResNet](vision/MODEL_GARDEN.md) | [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) | +| [ResNet-RS](vision/MODEL_GARDEN.md) | [Revisiting ResNets: Improved Training and Scaling Strategies](https://arxiv.org/abs/2103.07579) | +| [EfficientNet](legacy/image_classification) | [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) | +| [Vision Transformer](vision/MODEL_GARDEN.md) | [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) | #### Object Detection and Segmentation | Model | Reference (Paper) | |-------|-------------------| -| [RetinaNet](vision/beta/MODEL_GARDEN.md) | [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) | -| [Mask R-CNN](vision/beta/MODEL_GARDEN.md) | [Mask R-CNN](https://arxiv.org/abs/1703.06870) | -| [ShapeMask](vision/detection) | [ShapeMask: Learning to Segment Novel Objects by Refining Shape Priors](https://arxiv.org/abs/1904.03239) | -| [SpineNet](vision/beta/MODEL_GARDEN.md) | [SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization](https://arxiv.org/abs/1912.05027) | -| [Cascade RCNN-RS and RetinaNet-RS](vision/beta/MODEL_GARDEN.md) | 
[Simple Training Strategies and Model Scaling for Object Detection](https://arxiv.org/abs/2107.00057)| +| [RetinaNet](vision/MODEL_GARDEN.md) | [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) | +| [Mask R-CNN](vision/MODEL_GARDEN.md) | [Mask R-CNN](https://arxiv.org/abs/1703.06870) | +| [ShapeMask](legacy/detection) | [ShapeMask: Learning to Segment Novel Objects by Refining Shape Priors](https://arxiv.org/abs/1904.03239) | +| [SpineNet](vision/MODEL_GARDEN.md) | [SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization](https://arxiv.org/abs/1912.05027) | +| [Cascade RCNN-RS and RetinaNet-RS](vision/MODEL_GARDEN.md) | [Simple Training Strategies and Model Scaling for Object Detection](https://arxiv.org/abs/2107.00057)| #### Video Classification diff --git a/official/vision/beta/MODEL_GARDEN.md b/official/vision/MODEL_GARDEN.md similarity index 75% rename from official/vision/beta/MODEL_GARDEN.md rename to official/vision/MODEL_GARDEN.md index d8bd43d9e..058cabeaa 100644 --- a/official/vision/beta/MODEL_GARDEN.md +++ b/official/vision/MODEL_GARDEN.md @@ -24,10 +24,10 @@ segmentation. | Model | Resolution | Epochs | Top-1 | Top-5 | Download | | ------------ |:-------------:|--------:|--------:|--------:|---------:| -| ResNet-50 | 224x224 | 90 | 76.1 | 92.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) | -| ResNet-50 | 224x224 | 200 | 77.1 | 93.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) | -| ResNet-101 | 224x224 | 200 | 78.3 | 94.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml) | -| ResNet-152 | 224x224 | 200 | 78.7 | 94.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml) | +| ResNet-50 | 224x224 | 90 | 76.1 | 92.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) | +| ResNet-50 | 224x224 | 200 | 77.1 | 93.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) | +| ResNet-101 | 224x224 | 200 | 78.3 | 94.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml) | +| ResNet-152 | 224x224 | 200 | 78.7 | 94.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml) | #### ResNet-RS models trained with various settings @@ -40,20 +40,20 @@ classification models with features: depth, label smoothing and dropout. * New training methods including a 350-epoch schedule, cosine learning rate and EMA. -* Configs are in this [directory](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification). +* Configs are in this [directory](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification). 
| Model | Resolution | Params (M) | Top-1 | Top-5 | Download | | --------- | :--------: | ---------: | ----: | ----: | --------:| -| ResNet-RS-50 | 160x160 | 35.7 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-50-i160.tar.gz) | -| ResNet-RS-101 | 160x160 | 63.7 | 80.2 | 94.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i160.tar.gz) | -| ResNet-RS-101 | 192x192 | 63.7 | 81.3 | 95.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i192.tar.gz) | -| ResNet-RS-152 | 192x192 | 86.8 | 81.9 | 95.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i192.tar.gz) | -| ResNet-RS-152 | 224x224 | 86.8 | 82.5 | 96.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i224.tar.gz) | -| ResNet-RS-152 | 256x256 | 86.8 | 83.1 | 96.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i256.tar.gz) | -| ResNet-RS-200 | 256x256 | 93.4 | 83.5 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-200-i256.tar.gz) | -| ResNet-RS-270 | 256x256 | 130.1 | 83.6 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-270-i256.tar.gz) | -| ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i256.tar.gz) | -| ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i320.tar.gz) | +| ResNet-RS-50 | 160x160 | 35.7 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-50-i160.tar.gz) | +| ResNet-RS-101 | 160x160 | 63.7 | 80.2 | 94.9 | 
[config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i160.tar.gz) | +| ResNet-RS-101 | 192x192 | 63.7 | 81.3 | 95.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i192.tar.gz) | +| ResNet-RS-152 | 192x192 | 86.8 | 81.9 | 95.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i192.tar.gz) | +| ResNet-RS-152 | 224x224 | 86.8 | 82.5 | 96.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i224.tar.gz) | +| ResNet-RS-152 | 256x256 | 86.8 | 83.1 | 96.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i256.tar.gz) | +| ResNet-RS-200 | 256x256 | 93.4 | 83.5 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-200-i256.tar.gz) | +| ResNet-RS-270 | 256x256 | 130.1 | 83.6 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-270-i256.tar.gz) | +| ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i256.tar.gz) | +| ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i320.tar.gz) | #### Vision Transformer (ViT) @@ -109,16 +109,16 @@ evaluated on [COCO](https://cocodataset.org/) val2017. 
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download | | ------------ |:-------------:| -------:|--------------:|-----------:|--------:|---------:| -| SpineNet-49 | 640x640 | 500 | 85.4| 28.5 | 44.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)| -| SpineNet-96 | 1024x1024 | 500 | 265.4 | 43.0 | 48.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)| -| SpineNet-143 | 1280x1280 | 500 | 524.0 | 67.0 | 50.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)| +| SpineNet-49 | 640x640 | 500 | 85.4| 28.5 | 44.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_spinenet49_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)| +| SpineNet-96 | 1024x1024 | 500 | 265.4 | 43.0 | 48.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_spinenet96_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)| +| SpineNet-143 | 1280x1280 | 500 | 524.0 | 67.0 | 50.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_spinenet143_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)| #### Mobile-size RetinaNet (Trained from scratch): | Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download | | ----------- | :--------: | -----: | --------: | ---------: | -----: | --------:| -| MobileNetv2 | 256x256 | 600 | - | 2.27 | 23.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml) | -| Mobile SpineNet-49 | 384x384 | 600 | 1.0 | 2.32 | 28.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/spinenet49mobile.tar.gz) | +| MobileNetv2 | 256x256 | 600 | - | 2.27 | 23.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml) | +| Mobile SpineNet-49 | 384x384 | 600 | 1.0 | 2.32 | 28.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/spinenet49mobile.tar.gz) | ### Instance Segmentation Baselines @@ -126,19 +126,19 @@ evaluated on [COCO](https://cocodataset.org/) val2017. 
| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Mask AP | Download | | ------------ |:-------------:| -------:|-----------:|-----------:|-------:|--------:|---------:| -| ResNet50-FPN | 640x640 | 350 | 227.7 | 46.3 | 42.3 | 37.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/r50fpn_640_coco_scratch_tpu4x4.yaml) | -| SpineNet-49 | 640x640 | 350 | 215.7 | 40.8 | 42.6 | 37.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_mrcnn_tpu.yaml) | -| SpineNet-96 | 1024x1024 | 500 | 315.0 | 55.2 | 48.1 | 42.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_mrcnn_tpu.yaml) | -| SpineNet-143 | 1280x1280 | 500 | 498.8 | 79.2 | 49.3 | 43.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_mrcnn_tpu.yaml) | +| ResNet50-FPN | 640x640 | 350 | 227.7 | 46.3 | 42.3 | 37.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/r50fpn_640_coco_scratch_tpu4x4.yaml) | +| SpineNet-49 | 640x640 | 350 | 215.7 | 40.8 | 42.6 | 37.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet49_mrcnn_tpu.yaml) | +| SpineNet-96 | 1024x1024 | 500 | 315.0 | 55.2 | 48.1 | 42.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet96_mrcnn_tpu.yaml) | +| SpineNet-143 | 1280x1280 | 500 | 498.8 | 79.2 | 49.3 | 43.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet143_mrcnn_tpu.yaml) | #### Cascade RCNN-RS (Trained from scratch) | Backbone | Resolution | Epochs | Params (M) | Box AP | Mask AP | Download ------------ | :--------: | -----: | ---------: | -----: | ------: | -------: -| SpineNet-49 | 640x640 | 500 | 56.4 | 46.4 | 40.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_cascadercnn_tpu.yaml)| -| SpineNet-96 | 1024x1024 | 500 | 70.8 | 50.9 | 43.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_cascadercnn_tpu.yaml)| -| SpineNet-143 | 1280x1280 | 500 | 94.9 | 51.9 | 45.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_cascadercnn_tpu.yaml)| +| SpineNet-49 | 640x640 | 500 | 56.4 | 46.4 | 40.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet49_cascadercnn_tpu.yaml)| +| SpineNet-96 | 1024x1024 | 500 | 70.8 | 50.9 | 43.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet96_cascadercnn_tpu.yaml)| +| SpineNet-143 | 1280x1280 | 500 | 94.9 | 51.9 | 45.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet143_cascadercnn_tpu.yaml)| ## Semantic Segmentation @@ -186,10 +186,10 @@ evaluated on [COCO](https://cocodataset.org/) val2017. 
| Model | Input (frame x stride) | Top-1 | Top-5 | Download | | -------- |:----------------------:|--------:|--------:|---------:| -| SlowOnly | 8 x 8 | 74.1 | 91.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml) | -| SlowOnly | 16 x 4 | 75.6 | 92.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml) | -| R3D-50 | 32 x 2 | 77.0 | 93.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml) | -| R3D-RS-50 | 32 x 2 | 78.2 | 93.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml) | +| SlowOnly | 8 x 8 | 74.1 | 91.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml) | +| SlowOnly | 16 x 4 | 75.6 | 92.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml) | +| R3D-50 | 32 x 2 | 77.0 | 93.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml) | +| R3D-RS-50 | 32 x 2 | 78.2 | 93.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml) | | R3D-RS-101 | 32 x 2 | 79.5 | 94.2 | - | R3D-RS-152 | 32 x 2 | 79.9 | 94.3 | - | R3D-RS-200 | 32 x 2 | 80.4 | 94.4 | - @@ -205,8 +205,8 @@ evaluated on [COCO](https://cocodataset.org/) val2017. | Model | Input (frame x stride) | Top-1 | Top-5 | Download | | -------- |:----------------------:|--------:|--------:|---------:| -| SlowOnly | 8 x 8 | 77.3 | 93.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml) | -| R3D-50 | 32 x 2 | 79.5 | 94.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml) | +| SlowOnly | 8 x 8 | 77.3 | 93.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml) | +| R3D-50 | 32 x 2 | 79.5 | 94.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml) | | R3D-RS-200 | 32 x 2 | 83.1 | - | - | R3D-RS-200 | 48 x 2 | 83.8 | - | - | MoViNet-A0-Base | 50 x 5 | 72.05 | 90.92 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a0_k600_8x8.yaml) | diff --git a/official/vision/beta/README.md b/official/vision/README.md similarity index 100% rename from official/vision/beta/README.md rename to official/vision/README.md -- GitLab From 957f5d2d3444d1e9f4e703bd69c4a5eb92305599 Mon Sep 17 00:00:00 2001 From: Yeqing Li Date: Wed, 23 Mar 2022 12:07:04 -0700 Subject: [PATCH 35/54] Corrected the import paths. 
PiperOrigin-RevId: 436798106 --- official/projects/qat/vision/configs/retinanet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/official/projects/qat/vision/configs/retinanet.py b/official/projects/qat/vision/configs/retinanet.py index e12dcbfe8..91f2b5282 100644 --- a/official/projects/qat/vision/configs/retinanet.py +++ b/official/projects/qat/vision/configs/retinanet.py @@ -20,7 +20,7 @@ from official.core import config_definitions as cfg from official.core import exp_factory from official.projects.qat.vision.configs import common from official.vision.configs import retinanet -from official.vision.configs.google import backbones +from official.vision.configs import backbones @dataclasses.dataclass -- GitLab From e396718a0dc44ee5a238d3a55004bc521ddecbf7 Mon Sep 17 00:00:00 2001 From: Fan Yang Date: Wed, 23 Mar 2022 12:25:38 -0700 Subject: [PATCH 36/54] Internal change PiperOrigin-RevId: 436802310 --- official/common/registry_imports.py | 2 +- official/vision/beta/serving/detection_test.py | 2 +- official/vision/beta/serving/export_tflite_lib_test.py | 2 +- official/vision/beta/serving/image_classification_test.py | 2 +- official/vision/beta/serving/semantic_segmentation_test.py | 2 +- official/vision/beta/serving/video_classification_test.py | 3 +-- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/official/common/registry_imports.py b/official/common/registry_imports.py index c1aede6db..eb9af692a 100644 --- a/official/common/registry_imports.py +++ b/official/common/registry_imports.py @@ -14,7 +14,7 @@ """All necessary imports for registration.""" # pylint: disable=unused-import +from official import vision from official.nlp import tasks from official.nlp.configs import experiment_configs from official.utils.testing import mock_task -from official.vision import beta diff --git a/official/vision/beta/serving/detection_test.py b/official/vision/beta/serving/detection_test.py index ccd00b4bc..865f291c7 100644 --- a/official/vision/beta/serving/detection_test.py +++ b/official/vision/beta/serving/detection_test.py @@ -22,8 +22,8 @@ import numpy as np from PIL import Image import tensorflow as tf -from official.common import registry_imports # pylint: disable=unused-import from official.core import exp_factory +from official.vision.beta import configs # pylint: disable=unused-import from official.vision.beta.serving import detection diff --git a/official/vision/beta/serving/export_tflite_lib_test.py b/official/vision/beta/serving/export_tflite_lib_test.py index 72990b6ba..138dde6ef 100644 --- a/official/vision/beta/serving/export_tflite_lib_test.py +++ b/official/vision/beta/serving/export_tflite_lib_test.py @@ -19,8 +19,8 @@ from absl.testing import parameterized import tensorflow as tf from tensorflow.python.distribute import combinations -from official.common import registry_imports # pylint: disable=unused-import from official.core import exp_factory +from official.vision.beta import configs # pylint: disable=unused-import from official.vision.beta.dataloaders import tfexample_utils from official.vision.beta.serving import detection as detection_serving from official.vision.beta.serving import export_tflite_lib diff --git a/official/vision/beta/serving/image_classification_test.py b/official/vision/beta/serving/image_classification_test.py index 818069f98..4469ef3a8 100644 --- a/official/vision/beta/serving/image_classification_test.py +++ b/official/vision/beta/serving/image_classification_test.py @@ -22,8 +22,8 @@ import numpy as np from PIL import 
Image import tensorflow as tf -from official.common import registry_imports  # pylint: disable=unused-import from official.core import exp_factory +from official.vision.beta import configs  # pylint: disable=unused-import from official.vision.beta.serving import image_classification diff --git a/official/vision/beta/serving/semantic_segmentation_test.py b/official/vision/beta/serving/semantic_segmentation_test.py index e690958ff..24a2df048 100644 --- a/official/vision/beta/serving/semantic_segmentation_test.py +++ b/official/vision/beta/serving/semantic_segmentation_test.py @@ -22,8 +22,8 @@ import numpy as np from PIL import Image import tensorflow as tf -from official.common import registry_imports  # pylint: disable=unused-import from official.core import exp_factory +from official.vision.beta import configs  # pylint: disable=unused-import from official.vision.beta.serving import semantic_segmentation diff --git a/official/vision/beta/serving/video_classification_test.py b/official/vision/beta/serving/video_classification_test.py index d89ff0ed4..fe348d9dc 100644 --- a/official/vision/beta/serving/video_classification_test.py +++ b/official/vision/beta/serving/video_classification_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - # import io import os import random @@ -21,8 +20,8 @@ from absl.testing import parameterized import numpy as np import tensorflow as tf -from official.common import registry_imports  # pylint: disable=unused-import from official.core import exp_factory +from official.vision.beta import configs  # pylint: disable=unused-import from official.vision.beta.dataloaders import tfexample_utils from official.vision.beta.serving import video_classification -- GitLab From 99f15f32fda80615df3ada763fe8c71294d262aa Mon Sep 17 00:00:00 2001 From: Vishnuvardhan Janapati <46058173+jvishnuvardhan@users.noreply.github.com> Date: Wed, 23 Mar 2022 13:23:27 -0700 Subject: [PATCH 37/54] Update README.md (#10551) * Update README.md Updating README.md with the installation approach. Also, formatted text for consistency. * Updated tensorflow/models/README.md Added markdown code to hide details. Users need to click the triangle symbol to see the details. This makes the page look better. * Update README.md --- README.md | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a27d3e7be..ad40f14dd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,9 @@ -![Logo](https://storage.googleapis.com/tf_model_garden/tf_model_garden_logo.png) +
+ +
+ +[![Python](https://img.shields.io/pypi/pyversions/tensorflow.svg?style=plastic)](https://badge.fury.io/py/tensorflow) +[![PyPI](https://badge.fury.io/py/tensorflow.svg)](https://badge.fury.io/py/tensorflow) # Welcome to the Model Garden for TensorFlow @@ -19,7 +24,88 @@ extent possible though not all models are suitable. | [community](community) | • A curated list of the GitHub repositories with machine learning models and implementations powered by TensorFlow 2 | | [orbit](orbit) | • A flexible and lightweight library that users can easily use or fork when writing customized training loop code in TensorFlow 2.x. It seamlessly integrates with `tf.distribute` and supports running on different device types (CPU, GPU, and TPU). | -## [Announcements](https://github.com/tensorflow/models/wiki/Announcements) +## Installation + +To install the current release of tensorflow-models, please follow any one of the methods described below. + +#### Method 1: Install the TensorFlow Model Garden pip package + +
+ +**tf-models-official** is the stable Model Garden package. +pip will install all models and dependencies automatically. + +```shell +pip3 install tf-models-official +``` + +If you are using nlp packages, please also install **tensorflow-text**: + +```shell +pip3 install tensorflow-text +``` + +Please check out our [example](https://github.com/tensorflow/text/blob/master/docs/tutorials/fine_tune_bert.ipynb) +to learn how to use a PIP package. + +Note that **tf-models-official** may not include the latest changes in this +github repo. To include latest changes, you may install **tf-models-nightly**, +which is the nightly Model Garden package created daily automatically. + +```shell +pip3 install tf-models-nightly +``` + +If you are using `nlp` packages, please also install tensorflow-text-nightly + +```shell +pip3 install tensorflow-text-nightly +``` +
+ + +#### Method 2: Clone the source + +
+ +1. Clone the GitHub repository: + +```shell +git clone https://github.com/tensorflow/models.git +``` + +2. Add the top-level ***/models*** folder to the Python path. + +```shell +export PYTHONPATH=$PYTHONPATH:/path/to/models +``` + +If you are using a Colab notebook, please set the Python path with os.environ. + +```python +import os +os.environ['PYTHONPATH'] += ":/path/to/models" +``` + +3. Install other dependencies + +```shell +pip3 install --user -r official/requirements.txt +``` + +Finally, if you are using nlp packages, please also install +**tensorflow-text-nightly**: + +```shell +pip3 install tensorflow-text-nightly +``` + +
+ + +## Announcements + +Please check [this page](https://github.com/tensorflow/models/wiki/Announcements) for recent announcements. ## Contributions -- GitLab From 48c6be9d939864ee9d46f1879552d97b2ce4de07 Mon Sep 17 00:00:00 2001 From: Liangzhe Yuan Date: Wed, 23 Mar 2022 13:25:12 -0700 Subject: [PATCH 38/54] Refactor resnet_3d. PiperOrigin-RevId: 436815441 --- .../vision/modeling/backbones/resnet_3d.py | 137 ++++++++++-------- 1 file changed, 80 insertions(+), 57 deletions(-) diff --git a/official/vision/modeling/backbones/resnet_3d.py b/official/vision/modeling/backbones/resnet_3d.py index fe90a2157..6fffb901a 100644 --- a/official/vision/modeling/backbones/resnet_3d.py +++ b/official/vision/modeling/backbones/resnet_3d.py @@ -153,19 +153,76 @@ class ResNet3D(tf.keras.Model): self._kernel_regularizer = kernel_regularizer self._bias_regularizer = bias_regularizer if tf.keras.backend.image_data_format() == 'channels_last': - bn_axis = -1 + self._bn_axis = -1 else: - bn_axis = 1 + self._bn_axis = 1 # Build ResNet3D backbone. inputs = tf.keras.Input(shape=input_specs.shape[1:]) + endpoints = self._build_model(inputs) + self._output_specs = {l: endpoints[l].get_shape() for l in endpoints} + + super(ResNet3D, self).__init__(inputs=inputs, outputs=endpoints, **kwargs) + + def _build_model(self, inputs): + """Builds model architecture. + + Args: + inputs: the keras input spec. + + Returns: + endpoints: A dictionary of backbone endpoint features. + """ + # Build stem. + x = self._build_stem(inputs, stem_type=self._stem_type) + + temporal_kernel_size = 1 if self._stem_pool_temporal_stride == 1 else 3 + x = layers.MaxPool3D( + pool_size=[temporal_kernel_size, 3, 3], + strides=[self._stem_pool_temporal_stride, 2, 2], + padding='same')(x) + + # Build intermediate blocks and endpoints. + resnet_specs = RESNET_SPECS[self._model_id] + if len(self._temporal_strides) != len(resnet_specs) or len( + self._temporal_kernel_sizes) != len(resnet_specs): + raise ValueError( + 'Number of blocks in temporal specs should equal to resnet_specs.') + + endpoints = {} + for i, resnet_spec in enumerate(resnet_specs): + if resnet_spec[0] == 'bottleneck3d': + block_fn = nn_blocks_3d.BottleneckBlock3D + else: + raise ValueError('Block fn `{}` is not supported.'.format( + resnet_spec[0])) + + use_self_gating = ( + self._use_self_gating[i] if self._use_self_gating else False) + x = self._block_group( + inputs=x, + filters=resnet_spec[1], + temporal_kernel_sizes=self._temporal_kernel_sizes[i], + temporal_strides=self._temporal_strides[i], + spatial_strides=(1 if i == 0 else 2), + block_fn=block_fn, + block_repeats=resnet_spec[2], + stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate( + self._init_stochastic_depth_rate, i + 2, 5), + use_self_gating=use_self_gating, + name='block_group_l{}'.format(i + 2)) + endpoints[str(i + 2)] = x + + return endpoints + def _build_stem(self, inputs, stem_type): + """Builds stem layer.""" # Build stem. 
if stem_type == 'v0': x = layers.Conv3D( filters=64, - kernel_size=[stem_conv_temporal_kernel_size, 7, 7], - strides=[stem_conv_temporal_stride, 2, 2], + kernel_size=[self._stem_conv_temporal_kernel_size, 7, 7], + strides=[self._stem_conv_temporal_stride, 2, 2], use_bias=False, padding='same', kernel_initializer=self._kernel_initializer, @@ -173,14 +230,15 @@ class ResNet3D(tf.keras.Model): bias_regularizer=self._bias_regularizer)( inputs) x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) + axis=self._bn_axis, + momentum=self._norm_momentum, + epsilon=self._norm_epsilon)(x) + x = tf_utils.get_activation(self._activation)(x) elif stem_type == 'v1': x = layers.Conv3D( filters=32, - kernel_size=[stem_conv_temporal_kernel_size, 3, 3], - strides=[stem_conv_temporal_stride, 2, 2], + kernel_size=[self._stem_conv_temporal_kernel_size, 3, 3], + strides=[self._stem_conv_temporal_stride, 2, 2], use_bias=False, padding='same', kernel_initializer=self._kernel_initializer, @@ -188,9 +246,10 @@ class ResNet3D(tf.keras.Model): bias_regularizer=self._bias_regularizer)( inputs) x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) + axis=self._bn_axis, + momentum=self._norm_momentum, + epsilon=self._norm_epsilon)(x) + x = tf_utils.get_activation(self._activation)(x) x = layers.Conv3D( filters=32, kernel_size=[1, 3, 3], @@ -202,9 +261,10 @@ class ResNet3D(tf.keras.Model): bias_regularizer=self._bias_regularizer)( x) x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) + axis=self._bn_axis, + momentum=self._norm_momentum, + epsilon=self._norm_epsilon)(x) + x = tf_utils.get_activation(self._activation)(x) x = layers.Conv3D( filters=64, kernel_size=[1, 3, 3], @@ -216,51 +276,14 @@ class ResNet3D(tf.keras.Model): bias_regularizer=self._bias_regularizer)( x) x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) + axis=self._bn_axis, + momentum=self._norm_momentum, + epsilon=self._norm_epsilon)(x) + x = tf_utils.get_activation(self._activation)(x) else: raise ValueError(f'Stem type {stem_type} not supported.') - temporal_kernel_size = 1 if stem_pool_temporal_stride == 1 else 3 - x = layers.MaxPool3D( - pool_size=[temporal_kernel_size, 3, 3], - strides=[stem_pool_temporal_stride, 2, 2], - padding='same')( - x) - - # Build intermediate blocks and endpoints. 
- resnet_specs = RESNET_SPECS[model_id] - if len(temporal_strides) != len(resnet_specs) or len( - temporal_kernel_sizes) != len(resnet_specs): - raise ValueError( - 'Number of blocks in temporal specs should equal to resnet_specs.') - - endpoints = {} - for i, resnet_spec in enumerate(resnet_specs): - if resnet_spec[0] == 'bottleneck3d': - block_fn = nn_blocks_3d.BottleneckBlock3D - else: - raise ValueError('Block fn `{}` is not supported.'.format( - resnet_spec[0])) - - x = self._block_group( - inputs=x, - filters=resnet_spec[1], - temporal_kernel_sizes=temporal_kernel_sizes[i], - temporal_strides=temporal_strides[i], - spatial_strides=(1 if i == 0 else 2), - block_fn=block_fn, - block_repeats=resnet_spec[2], - stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate( - self._init_stochastic_depth_rate, i + 2, 5), - use_self_gating=use_self_gating[i] if use_self_gating else False, - name='block_group_l{}'.format(i + 2)) - endpoints[str(i + 2)] = x - - self._output_specs = {l: endpoints[l].get_shape() for l in endpoints} - - super(ResNet3D, self).__init__(inputs=inputs, outputs=endpoints, **kwargs) + return x def _block_group(self, inputs: tf.Tensor, -- GitLab From c166ae02a73607cd918df536b0a65816050de400 Mon Sep 17 00:00:00 2001 From: Yilei Yang Date: Wed, 23 Mar 2022 14:02:54 -0700 Subject: [PATCH 39/54] Internal change PiperOrigin-RevId: 436824079 --- official/legacy/detection/modeling/architecture/spinenet.py | 1 - .../image_classification/efficientnet/efficientnet_config.py | 1 - .../image_classification/efficientnet/efficientnet_model.py | 1 - official/legacy/image_classification/resnet/resnet_config.py | 1 - official/nlp/modeling/layers/gaussian_process.py | 1 - official/nlp/modeling/layers/gaussian_process_test.py | 1 - official/projects/assemblenet/configs/assemblenet.py | 1 - official/projects/assemblenet/configs/assemblenet_test.py | 1 - official/projects/assemblenet/train.py | 1 - official/projects/assemblenet/train_test.py | 1 - 10 files changed, 10 deletions(-) diff --git a/official/legacy/detection/modeling/architecture/spinenet.py b/official/legacy/detection/modeling/architecture/spinenet.py index 95072843f..ea86a70f2 100644 --- a/official/legacy/detection/modeling/architecture/spinenet.py +++ b/official/legacy/detection/modeling/architecture/spinenet.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 # ============================================================================== """Implementation of SpineNet model. diff --git a/official/legacy/image_classification/efficientnet/efficientnet_config.py b/official/legacy/image_classification/efficientnet/efficientnet_config.py index 0a77ea174..148851cf6 100644 --- a/official/legacy/image_classification/efficientnet/efficientnet_config.py +++ b/official/legacy/image_classification/efficientnet/efficientnet_config.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Lint as: python3 """Configuration definitions for EfficientNet losses, learning rates, and optimizers.""" from __future__ import absolute_import from __future__ import division diff --git a/official/legacy/image_classification/efficientnet/efficientnet_model.py b/official/legacy/image_classification/efficientnet/efficientnet_model.py index b92885365..a9aa243b0 100644 --- a/official/legacy/image_classification/efficientnet/efficientnet_model.py +++ b/official/legacy/image_classification/efficientnet/efficientnet_model.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Contains definitions for EfficientNet model. [1] Mingxing Tan, Quoc V. Le diff --git a/official/legacy/image_classification/resnet/resnet_config.py b/official/legacy/image_classification/resnet/resnet_config.py index 3c1d5a033..9c4062821 100644 --- a/official/legacy/image_classification/resnet/resnet_config.py +++ b/official/legacy/image_classification/resnet/resnet_config.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Configuration definitions for ResNet losses, learning rates, and optimizers.""" from __future__ import absolute_import from __future__ import division diff --git a/official/nlp/modeling/layers/gaussian_process.py b/official/nlp/modeling/layers/gaussian_process.py index 3bbedfaa5..ac47eaf2f 100644 --- a/official/nlp/modeling/layers/gaussian_process.py +++ b/official/nlp/modeling/layers/gaussian_process.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Definitions for random feature Gaussian process layer.""" import math import tensorflow as tf diff --git a/official/nlp/modeling/layers/gaussian_process_test.py b/official/nlp/modeling/layers/gaussian_process_test.py index 1b0f8af40..7a9a56fe4 100644 --- a/official/nlp/modeling/layers/gaussian_process_test.py +++ b/official/nlp/modeling/layers/gaussian_process_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Tests for Gaussian process functions.""" import os import shutil diff --git a/official/projects/assemblenet/configs/assemblenet.py b/official/projects/assemblenet/configs/assemblenet.py index b0c633caa..08301dc27 100644 --- a/official/projects/assemblenet/configs/assemblenet.py +++ b/official/projects/assemblenet/configs/assemblenet.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 """Definitions for AssembleNet/++ structures. This structure is a `list` corresponding to a graph representation of the diff --git a/official/projects/assemblenet/configs/assemblenet_test.py b/official/projects/assemblenet/configs/assemblenet_test.py index eadc94669..f11c21135 100644 --- a/official/projects/assemblenet/configs/assemblenet_test.py +++ b/official/projects/assemblenet/configs/assemblenet_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Lint as: python3 from absl.testing import parameterized import tensorflow as tf from official.core import config_definitions as cfg diff --git a/official/projects/assemblenet/train.py b/official/projects/assemblenet/train.py index 629874252..54b682ef0 100644 --- a/official/projects/assemblenet/train.py +++ b/official/projects/assemblenet/train.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 r"""Training driver. Commandline: diff --git a/official/projects/assemblenet/train_test.py b/official/projects/assemblenet/train_test.py index c8bb7906e..b3fda0679 100644 --- a/official/projects/assemblenet/train_test.py +++ b/official/projects/assemblenet/train_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Lint as: python3 import json import os import random -- GitLab From b8849274e9f966b759fb7a36f2849f15040ad991 Mon Sep 17 00:00:00 2001 From: Fan Yang Date: Wed, 23 Mar 2022 16:06:44 -0700 Subject: [PATCH 40/54] Internal change PiperOrigin-RevId: 436852559 --- .../vit/configs/image_classification.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/official/projects/vit/configs/image_classification.py b/official/projects/vit/configs/image_classification.py index 0fba4abfc..d312719a3 100644 --- a/official/projects/vit/configs/image_classification.py +++ b/official/projects/vit/configs/image_classification.py @@ -13,37 +13,30 @@ # limitations under the License. """Image classification configuration definition.""" -import os -from typing import List, Optional - import dataclasses +import os +from typing import Optional from official.core import config_definitions as cfg from official.core import exp_factory from official.core import task_factory from official.modeling import hyperparams from official.modeling import optimization +from official.projects.vit.configs import backbones from official.vision.configs import common from official.vision.configs import image_classification as img_cls_cfg -from official.projects.vit.configs import backbones from official.vision.tasks import image_classification +# pytype: disable=wrong-keyword-args + DataConfig = img_cls_cfg.DataConfig @dataclasses.dataclass -class ImageClassificationModel(hyperparams.Config): +class ImageClassificationModel(img_cls_cfg.ImageClassificationModel): """The model config.""" - num_classes: int = 0 - input_size: List[int] = dataclasses.field(default_factory=list) backbone: backbones.Backbone = backbones.Backbone( type='vit', vit=backbones.VisionTransformer()) - dropout_rate: float = 0.0 - norm_activation: common.NormActivation = common.NormActivation( - use_sync_bn=False) - # Adds a BatchNormalization layer pre-GlobalAveragePooling in classification - add_head_batch_norm: bool = False - kernel_initializer: str = 'random_uniform' @dataclasses.dataclass -- GitLab From 204dd7ec488761e57d11560beabf11ce499dbbbd Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 23 Mar 2022 16:30:38 -0700 Subject: [PATCH 41/54] Updated official/README.md to be compatible with other README.md's in the model garden PiperOrigin-RevId: 436857882 --- official/README.md | 70 ++++++---------------------------------------- 1 file changed, 8 insertions(+), 62 deletions(-) diff --git a/official/README.md b/official/README.md index 89256d2f9..09a4175cf 100644 --- a/official/README.md +++ b/official/README.md @@ -1,4 +1,6 @@ -![Logo](https://storage.googleapis.com/model_garden_artifacts/TF_Model_Garden.png) +
+ +
# TensorFlow Official Models @@ -32,6 +34,7 @@ In the near future, we will add: * [Natural Language Processing](#natural-language-processing) * [Recommendation](#recommendation) - [How to get started with the official models](#how-to-get-started-with-the-official-models) +- [Contributions](#contributions) ## Models and Implementations @@ -92,7 +95,7 @@ built from the as tagged branches or [downloadable releases](https://github.com/tensorflow/models/releases). * Model repository version numbers match the target TensorFlow release, such that -[release v2.5.0](https://github.com/tensorflow/models/releases/tag/v2.5.0) +[TensorFlow-models v2.5.0](https://github.com/tensorflow/models/releases/tag/v2.5.0) is compatible with [TensorFlow v2.5.0](https://github.com/tensorflow/tensorflow/releases/tag/v2.5.0). @@ -100,12 +103,13 @@ Please follow the below steps before running models in this repository. ### Requirements -* The latest TensorFlow Model Garden release and TensorFlow 2 +* The latest TensorFlow Model Garden release and the latest TensorFlow 2 * If you are on a version of TensorFlow earlier than 2.2, please upgrade your TensorFlow to [the latest TensorFlow 2](https://www.tensorflow.org/install/). ```shell pip3 install tf-nightly +# pip3 install tensorflow # for latest stable version ``` * Python 3.7+ @@ -115,65 +119,7 @@ don't recommend earlier versions. ### Installation -#### Method 1: Install the TensorFlow Model Garden pip package - -**tf-models-official** is the stable Model Garden package. -pip will install all models and dependencies automatically. - -```shell -pip install tf-models-official -``` - -If you are using nlp packages, please also install **tensorflow-text**: - -```shell -pip install tensorflow-text -``` - -Please check out our [example](https://github.com/tensorflow/text/blob/master/docs/tutorials/fine_tune_bert.ipynb) -to learn how to use a PIP package. - -Note that **tf-models-official** may not include the latest changes in this -github repo. To include latest changes, you may install **tf-models-nightly**, -which is the nightly Model Garden package created daily automatically. - -```shell -pip install tf-models-nightly -``` - -#### Method 2: Clone the source - -1. Clone the GitHub repository: - -```shell -git clone https://github.com/tensorflow/models.git -``` - -2. Add the top-level ***/models*** folder to the Python path. - -```shell -export PYTHONPATH=$PYTHONPATH:/path/to/models -``` - -If you are using a Colab notebook, please set the Python path with os.environ. - -```python -import os -os.environ['PYTHONPATH'] += ":/path/to/models" -``` - -3. Install other dependencies - -```shell -pip3 install --user -r official/requirements.txt -``` - -Finally, if you are using nlp packages, please also install -**tensorflow-text-nightly**: - -```shell -pip3 install tensorflow-text-nightly -``` +Please check the installation instructions [here](https://github.com/tensorflow/models#Installation) ## Contributions -- GitLab From dc91f48b70ebb6e92deca7ce5bce24c2dfa30f90 Mon Sep 17 00:00:00 2001 From: Fan Yang Date: Wed, 23 Mar 2022 17:19:40 -0700 Subject: [PATCH 42/54] Support reading compressed tfrecord files. 
PiperOrigin-RevId: 436868704 --- official/common/dataset_fn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/official/common/dataset_fn.py b/official/common/dataset_fn.py index e27f2d9a6..52138d717 100644 --- a/official/common/dataset_fn.py +++ b/official/common/dataset_fn.py @@ -28,7 +28,8 @@ # ============================================================================== """Utility library for picking an appropriate dataset function.""" -from typing import Any, Callable, Union, Type +import functools +from typing import Any, Callable, Type, Union import tensorflow as tf @@ -38,5 +39,6 @@ PossibleDatasetType = Union[Type[tf.data.Dataset], Callable[[tf.Tensor], Any]] def pick_dataset_fn(file_type: str) -> PossibleDatasetType: if file_type == 'tfrecord': return tf.data.TFRecordDataset - + if file_type == 'tfrecord_compressed': + return functools.partial(tf.data.TFRecordDataset, compression_type='GZIP') raise ValueError('Unrecognized file_type: {}'.format(file_type)) -- GitLab From 88d7510f5475133a9b90089dfa17a9bec1ff4844 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 24 Mar 2022 13:47:44 -0700 Subject: [PATCH 43/54] Updated official/README.md with better getting started section PiperOrigin-RevId: 437077650 --- official/README.md | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/official/README.md b/official/README.md index 09a4175cf..b84109e2f 100644 --- a/official/README.md +++ b/official/README.md @@ -87,17 +87,29 @@ Model | Reference (Paper) ## How to get started with the official models -* The models in the master branch are developed using TensorFlow 2, -and they target the TensorFlow [nightly binaries](https://github.com/tensorflow/tensorflow#installation) -built from the -[master branch of TensorFlow](https://github.com/tensorflow/tensorflow/tree/master). -* The stable versions targeting releases of TensorFlow are available -as tagged branches or [downloadable releases](https://github.com/tensorflow/models/releases). -* Model repository version numbers match the target TensorFlow release, -such that -[TensorFlow-models v2.5.0](https://github.com/tensorflow/models/releases/tag/v2.5.0) -is compatible with -[TensorFlow v2.5.0](https://github.com/tensorflow/tensorflow/releases/tag/v2.5.0). +* The official models in the master branch are developed using the +[master branch of TensorFlow 2](https://github.com/tensorflow/tensorflow/tree/master). +When you clone (the repository) or download (`pip` binary) the master branch of +official models, the master branch of TensorFlow is downloaded as a +dependency. This is equivalent to the following. + +```shell +pip3 install tf-models-nightly +pip3 install tensorflow-text-nightly # when model uses `nlp` packages +``` + +* In the case of stable versions targeting a specific release, the +TensorFlow-models repository version numbers match the target TensorFlow +release. For example, +[TensorFlow-models v2.5.0](https://github.com/tensorflow/models/releases/tag/v2.5.0) +is compatible with +[TensorFlow v2.5.0](https://github.com/tensorflow/tensorflow/releases/tag/v2.5.0). +This is equivalent to the following. + +```shell +pip3 install tf-models-official==2.5.0 +pip3 install tensorflow-text==2.5.0 # when model uses `nlp` packages +``` Please follow the below steps before running models in this repository. ### Requirements @@ -106,12 +118,6 @@ Please follow the below steps before running models in this repository. 
* The latest TensorFlow Model Garden release and the latest TensorFlow 2 * If you are on a version of TensorFlow earlier than 2.2, please upgrade your TensorFlow to [the latest TensorFlow 2](https://www.tensorflow.org/install/). - -```shell -pip3 install tf-nightly -# pip3 install tensorflow # for latest stable version -``` - * Python 3.7+ Our integration tests run with Python 3.7. Although Python 3.6 should work, we @@ -119,7 +125,8 @@ don't recommend earlier versions. ### Installation -Please check the installation instructions [here](https://github.com/tensorflow/models#Installation) +Please check [here](https://github.com/tensorflow/models#Installation) for the +instructions ## Contributions -- GitLab From dc8c2fb40868bc36363cf834ef3159698b7b63ca Mon Sep 17 00:00:00 2001 From: Frederick Liu Date: Thu, 24 Mar 2022 16:59:17 -0700 Subject: [PATCH 44/54] Internal change PiperOrigin-RevId: 437120060 --- official/core/base_task.py | 25 ++++++++++- official/core/config_definitions.py | 6 +++ official/core/train_utils.py | 22 +++++++++- official/modeling/multitask/multitask.py | 8 +++- official/modeling/multitask/train_lib.py | 6 +-- official/modeling/privacy/__init__.py | 14 ++++++ official/modeling/privacy/configs.py | 24 +++++++++++ official/modeling/privacy/configs_test.py | 41 ++++++++++++++++++ official/modeling/privacy/ops.py | 42 ++++++++++++++++++ official/modeling/privacy/ops_test.py | 52 +++++++++++++++++++++++ 10 files changed, 230 insertions(+), 10 deletions(-) create mode 100644 official/modeling/privacy/__init__.py create mode 100644 official/modeling/privacy/configs.py create mode 100644 official/modeling/privacy/configs_test.py create mode 100644 official/modeling/privacy/ops.py create mode 100644 official/modeling/privacy/ops_test.py diff --git a/official/core/base_task.py b/official/core/base_task.py index 1f9f101e4..56b9bc439 100644 --- a/official/core/base_task.py +++ b/official/core/base_task.py @@ -14,6 +14,7 @@ """Defines the base task abstraction.""" import abc +import functools from typing import Optional from absl import logging @@ -22,9 +23,12 @@ import tensorflow as tf from official.core import config_definitions from official.modeling import optimization from official.modeling import performance +from official.modeling.privacy import configs +from official.modeling.privacy import ops OptimizationConfig = optimization.OptimizationConfig RuntimeConfig = config_definitions.RuntimeConfig +DifferentialPrivacyConfig = configs.DifferentialPrivacyConfig class Task(tf.Module, metaclass=abc.ABCMeta): @@ -65,18 +69,35 @@ class Task(tf.Module, metaclass=abc.ABCMeta): @classmethod def create_optimizer(cls, optimizer_config: OptimizationConfig, - runtime_config: Optional[RuntimeConfig] = None): + runtime_config: Optional[RuntimeConfig] = None, + dp_config: Optional[DifferentialPrivacyConfig] = None): """Creates an TF optimizer from configurations. Args: optimizer_config: the parameters of the Optimization settings. runtime_config: the parameters of the runtime. + dp_config: the parameter of differential privacy. Returns: A tf.optimizers.Optimizer object. 
""" + gradient_transformers = None + if dp_config is not None: + logging.info("Adding differential privacy transform with config %s.", + dp_config.as_dict()) + noise_stddev = dp_config.clipping_norm * dp_config.noise_multiplier + gradient_transformers = [ + functools.partial( + ops.clip_l2_norm, l2_norm_clip=dp_config.clipping_norm), + functools.partial( + ops.add_noise, noise_stddev=noise_stddev) + ] + opt_factory = optimization.OptimizerFactory(optimizer_config) - optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate()) + optimizer = opt_factory.build_optimizer( + opt_factory.build_learning_rate(), + gradient_transformers=gradient_transformers + ) # Configuring optimizer when loss_scale is set in runtime config. This helps # avoiding overflow/underflow for float16 computations. if runtime_config: diff --git a/official/core/config_definitions.py b/official/core/config_definitions.py index 7d05be96f..f2acfeb31 100644 --- a/official/core/config_definitions.py +++ b/official/core/config_definitions.py @@ -19,6 +19,7 @@ from typing import Optional, Sequence, Union from official.modeling.hyperparams import base_config from official.modeling.optimization.configs import optimization_config +from official.modeling.privacy import configs as dp_configs OptimizationConfig = optimization_config.OptimizationConfig @@ -236,6 +237,11 @@ class TrainerConfig(base_config.Config): # we will retore the model states. recovery_max_trials: int = 0 validation_summary_subdir: str = "validation" + # Configs for differential privacy + # These configs are only effective if you use create_optimizer in + # tensorflow_models/official/core/base_task.py + differential_privacy_config: Optional[ + dp_configs.DifferentialPrivacyConfig] = None @dataclasses.dataclass diff --git a/official/core/train_utils.py b/official/core/train_utils.py index ac478e6d3..c05721ddd 100644 --- a/official/core/train_utils.py +++ b/official/core/train_utils.py @@ -15,6 +15,7 @@ """Training utils.""" import copy import dataclasses +import inspect import json import os import pprint @@ -208,6 +209,24 @@ class BestCheckpointExporter: return tf.train.latest_checkpoint(self._export_dir) +def create_optimizer(task: base_task.Task, + params: config_definitions.ExperimentConfig + ) -> tf.keras.optimizers.Optimizer: + """A create optimizer util to be backward compatability with new args.""" + if 'dp_config' in inspect.signature(task.create_optimizer).parameters: + optimizer = task.create_optimizer( + params.trainer.optimizer_config, params.runtime, + params.trainer.differential_privacy_config) + else: + if params.trainer.differential_privacy_config is not None: + raise ValueError('Differential privacy config is specified but ' + 'task.create_optimizer api does not accept it.') + optimizer = task.create_optimizer( + params.trainer.optimizer_config, + params.runtime) + return optimizer + + @gin.configurable def create_trainer(params: config_definitions.ExperimentConfig, task: base_task.Task, @@ -218,8 +237,7 @@ def create_trainer(params: config_definitions.ExperimentConfig, """Create trainer.""" logging.info('Running default trainer.') model = task.build_model() - optimizer = task.create_optimizer(params.trainer.optimizer_config, - params.runtime) + optimizer = create_optimizer(task, params) return trainer_cls( params, task, diff --git a/official/modeling/multitask/multitask.py b/official/modeling/multitask/multitask.py index 3b1630057..23e85afe8 100644 --- a/official/modeling/multitask/multitask.py +++ 
b/official/modeling/multitask/multitask.py @@ -23,9 +23,11 @@ from official.core import task_factory from official.modeling import optimization from official.modeling.multitask import base_model from official.modeling.multitask import configs +from official.modeling.privacy import configs as dp_configs OptimizationConfig = optimization.OptimizationConfig RuntimeConfig = config_definitions.RuntimeConfig +DifferentialPrivacyConfig = dp_configs.DifferentialPrivacyConfig class MultiTask(tf.Module, metaclass=abc.ABCMeta): @@ -93,9 +95,11 @@ class MultiTask(tf.Module, metaclass=abc.ABCMeta): @classmethod def create_optimizer(cls, optimizer_config: OptimizationConfig, - runtime_config: Optional[RuntimeConfig] = None): + runtime_config: Optional[RuntimeConfig] = None, + dp_config: Optional[DifferentialPrivacyConfig] = None): return base_task.Task.create_optimizer( - optimizer_config=optimizer_config, runtime_config=runtime_config) + optimizer_config=optimizer_config, runtime_config=runtime_config, + dp_config=dp_config) def joint_train_step(self, task_inputs, multi_task_model: base_model.MultiTaskBaseModel, diff --git a/official/modeling/multitask/train_lib.py b/official/modeling/multitask/train_lib.py index 4c1b3bdd0..a730bb160 100644 --- a/official/modeling/multitask/train_lib.py +++ b/official/modeling/multitask/train_lib.py @@ -66,8 +66,7 @@ def run_experiment( is_training = 'train' in mode is_eval = 'eval' in mode with distribution_strategy.scope(): - optimizer = task.create_optimizer(params.trainer.optimizer_config, - params.runtime) + optimizer = train_utils.create_optimizer(task, params) kwargs = dict(multi_task=task, multi_task_model=model, optimizer=optimizer) if params.trainer.trainer_type == 'interleaving': sampler = task_sampler.get_task_sampler(params.trainer.task_sampler, @@ -183,8 +182,7 @@ def run_experiment_with_multitask_eval( config=params, task=train_task, model=train_task.build_model(), - optimizer=train_task.create_optimizer(params.trainer.optimizer_config, - params.runtime), + optimizer=train_utils.create_optimizer(train_task, params), train=True, evaluate=False) else: diff --git a/official/modeling/privacy/__init__.py b/official/modeling/privacy/__init__.py new file mode 100644 index 000000000..310bfb28f --- /dev/null +++ b/official/modeling/privacy/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/official/modeling/privacy/configs.py b/official/modeling/privacy/configs.py new file mode 100644 index 000000000..e3f957d3b --- /dev/null +++ b/official/modeling/privacy/configs.py @@ -0,0 +1,24 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configs for differential privacy.""" + +from official.modeling.hyperparams import base_config + + +class DifferentialPrivacyConfig(base_config.Config): + # Applied to the gradients + # Setting to a large number so nothing is clipped. + clipping_norm: float = 100000000.0 # 10^9 + noise_multiplier: float = 0.0 diff --git a/official/modeling/privacy/configs_test.py b/official/modeling/privacy/configs_test.py new file mode 100644 index 000000000..4223eda9a --- /dev/null +++ b/official/modeling/privacy/configs_test.py @@ -0,0 +1,41 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for configs.""" + +import tensorflow as tf +from official.modeling.privacy import configs + + +class ConfigsTest(tf.test.TestCase): + + def test_clipping_norm_default(self): + clipping_norm = configs.DifferentialPrivacyConfig().clipping_norm + self.assertEqual(100000000.0, clipping_norm) + + def test_noise_multiplier_default(self): + noise_multiplier = configs.DifferentialPrivacyConfig().noise_multiplier + self.assertEqual(0.0, noise_multiplier) + + def test_config(self): + dp_config = configs.DifferentialPrivacyConfig({ + 'clipping_norm': 1.0, + 'noise_multiplier': 1.0 + }) + self.assertEqual(1.0, dp_config.clipping_norm) + self.assertEqual(1.0, dp_config.noise_multiplier) + + +if __name__ == '__main__': + tf.test.main() diff --git a/official/modeling/privacy/ops.py b/official/modeling/privacy/ops.py new file mode 100644 index 000000000..8b0247020 --- /dev/null +++ b/official/modeling/privacy/ops.py @@ -0,0 +1,42 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Ops for differential privacy (gradient) transforms.""" + +from typing import List, Tuple +import tensorflow as tf + + +def clip_l2_norm(grads_vars: List[Tuple[tf.Tensor, tf.Tensor]], + l2_norm_clip: float) -> List[Tuple[tf.Tensor, tf.Tensor]]: + """Clip gradients by global norm.""" + + gradients = [] + variables = [] + for (g, v) in grads_vars: + gradients.append(g) + variables.append(v) + clipped_gradients = tf.clip_by_global_norm(gradients, l2_norm_clip)[0] + return list(zip(clipped_gradients, variables)) + + +def add_noise(grads_vars: List[Tuple[tf.Tensor, tf.Tensor]], + noise_stddev: float) -> List[Tuple[tf.Tensor, tf.Tensor]]: + """Add noise to gradients.""" + ret = [] + for (g, v) in grads_vars: + noise = tf.random.normal(tf.shape(g), stddev=noise_stddev) + ret.append((g + noise, v)) + return ret + diff --git a/official/modeling/privacy/ops_test.py b/official/modeling/privacy/ops_test.py new file mode 100644 index 000000000..4f5d580c7 --- /dev/null +++ b/official/modeling/privacy/ops_test.py @@ -0,0 +1,52 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for ops.""" + +from unittest import mock + +import tensorflow as tf + +from official.modeling.privacy import ops + + +class OpsTest(tf.test.TestCase): + + def test_clip_l2_norm(self): + x = tf.constant([4.0, 3.0]) + y = tf.constant([[12.0]]) + tensors = [(x, x), (y, y)] + clipped = ops.clip_l2_norm(tensors, 1.0) + for a, b in zip(clipped, tensors): + self.assertAllClose(a[0], b[0] / 13.0) # sqrt(4^2 + 3^2 + 12 ^3) = 13 + self.assertAllClose(a[1], b[1]) + + @mock.patch.object(tf.random, + 'normal', + autospec=True) + def test_add_noise(self, mock_random): + x = tf.constant([0.0, 0.0]) + y = tf.constant([[0.0]]) + tensors = [(x, x), (y, y)] + mock_random.side_effect = [tf.constant([1.0, 1.0]), tf.constant([[1.0]])] + added = ops.add_noise(tensors, 10.0) + for a, b in zip(added, tensors): + self.assertAllClose(a[0], b[0] + 1.0) + self.assertAllClose(a[1], b[1]) + _, kwargs = mock_random.call_args + self.assertEqual(kwargs['stddev'], 10.0) + + +if __name__ == '__main__': + tf.test.main() -- GitLab From d8f05122d36823b3a653c967fdef82a9b1f62535 Mon Sep 17 00:00:00 2001 From: Abdullah Rashwan Date: Thu, 24 Mar 2022 17:13:33 -0700 Subject: [PATCH 45/54] Internal change PiperOrigin-RevId: 437123122 --- official/projects/basnet/configs/basnet.py | 2 +- official/projects/basnet/serving/basnet.py | 2 +- official/projects/basnet/serving/export_saved_model.py | 2 +- official/projects/basnet/tasks/basnet.py | 2 +- official/projects/basnet/train.py | 2 +- .../vision/beta/projects/centernet/common/registry_imports.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/official/projects/basnet/configs/basnet.py b/official/projects/basnet/configs/basnet.py index 6981e2864..3c971d3ca 100644 --- a/official/projects/basnet/configs/basnet.py +++ b/official/projects/basnet/configs/basnet.py @@ -20,7 +20,7 @@ from official.core import config_definitions as cfg from 
official.core import exp_factory from official.modeling import hyperparams from official.modeling import optimization -from official.vision.beta.configs import common +from official.vision.configs import common @dataclasses.dataclass diff --git a/official/projects/basnet/serving/basnet.py b/official/projects/basnet/serving/basnet.py index 1734ac927..c9f5cb9a1 100644 --- a/official/projects/basnet/serving/basnet.py +++ b/official/projects/basnet/serving/basnet.py @@ -17,7 +17,7 @@ import tensorflow as tf from official.projects.basnet.tasks import basnet -from official.vision.beta.serving import semantic_segmentation +from official.vision.serving import semantic_segmentation MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) diff --git a/official/projects/basnet/serving/export_saved_model.py b/official/projects/basnet/serving/export_saved_model.py index d1b7b370b..417beac57 100644 --- a/official/projects/basnet/serving/export_saved_model.py +++ b/official/projects/basnet/serving/export_saved_model.py @@ -41,7 +41,7 @@ from absl import flags from official.core import exp_factory from official.modeling import hyperparams from official.projects.basnet.serving import basnet -from official.vision.beta.serving import export_saved_model_lib +from official.vision.serving import export_saved_model_lib FLAGS = flags.FLAGS diff --git a/official/projects/basnet/tasks/basnet.py b/official/projects/basnet/tasks/basnet.py index fcb218616..a68463abf 100644 --- a/official/projects/basnet/tasks/basnet.py +++ b/official/projects/basnet/tasks/basnet.py @@ -27,7 +27,7 @@ from official.projects.basnet.evaluation import metrics as basnet_metrics from official.projects.basnet.losses import basnet_losses from official.projects.basnet.modeling import basnet_model from official.projects.basnet.modeling import refunet -from official.vision.beta.dataloaders import segmentation_input +from official.vision.dataloaders import segmentation_input def build_basnet_model( diff --git a/official/projects/basnet/train.py b/official/projects/basnet/train.py index a95b9ba92..d30321ac3 100644 --- a/official/projects/basnet/train.py +++ b/official/projects/basnet/train.py @@ -22,7 +22,7 @@ from official.projects.basnet.configs import basnet as basnet_cfg from official.projects.basnet.modeling import basnet_model from official.projects.basnet.modeling import refunet from official.projects.basnet.tasks import basnet as basenet_task -from official.vision.beta import train +from official.vision import train if __name__ == '__main__': diff --git a/official/vision/beta/projects/centernet/common/registry_imports.py b/official/vision/beta/projects/centernet/common/registry_imports.py index d70231f26..0d3b946fd 100644 --- a/official/vision/beta/projects/centernet/common/registry_imports.py +++ b/official/vision/beta/projects/centernet/common/registry_imports.py @@ -15,7 +15,7 @@ """All necessary imports for registration.""" # pylint: disable=unused-import -from official.common import registry_imports +from official.vision import registry_imports from official.vision.beta.projects.centernet.configs import centernet from official.vision.beta.projects.centernet.modeling import centernet_model from official.vision.beta.projects.centernet.modeling.backbones import hourglass -- GitLab From c1c9bb0fb019de246ea268b1f15e0c76a76138f1 Mon Sep 17 00:00:00 2001 From: Fan Yang Date: Thu, 24 Mar 2022 18:20:34 -0700 Subject: [PATCH 46/54] Remove duplicate functions and classes in QAT project PiperOrigin-RevId: 437133811 --- 
.../qat/vision/modeling/layers/nn_blocks.py | 99 ++++++---------- .../qat/vision/modeling/layers/nn_layers.py | 109 +++++++----------- .../qat/vision/quantization/helper.py | 36 ++++++ 3 files changed, 110 insertions(+), 134 deletions(-) diff --git a/official/projects/qat/vision/modeling/layers/nn_blocks.py b/official/projects/qat/vision/modeling/layers/nn_blocks.py index 37a7f1adb..e832d09c6 100644 --- a/official/projects/qat/vision/modeling/layers/nn_blocks.py +++ b/official/projects/qat/vision/modeling/layers/nn_blocks.py @@ -24,42 +24,10 @@ import tensorflow_model_optimization as tfmot from official.modeling import tf_utils from official.projects.qat.vision.modeling.layers import nn_layers as qat_nn_layers from official.projects.qat.vision.quantization import configs +from official.projects.qat.vision.quantization import helper from official.vision.modeling.layers import nn_layers -class NoOpActivation: - """No-op activation which simply returns the incoming tensor. - - This activation is required to distinguish between `keras.activations.linear` - which does the same thing. The main difference is that NoOpActivation should - not have any quantize operation applied to it. - """ - - def __call__(self, x: tf.Tensor) -> tf.Tensor: - return x - - def get_config(self) -> Dict[str, Any]: - """Get a config of this object.""" - return {} - - def __eq__(self, other: Any) -> bool: - if not other or not isinstance(other, NoOpActivation): - return False - - return True - - def __ne__(self, other: Any) -> bool: - return not self.__eq__(other) - - -def _quantize_wrapped_layer(cls, quantize_config): - def constructor(*arg, **kwargs): - return tfmot.quantization.keras.QuantizeWrapperV2( - cls(*arg, **kwargs), - quantize_config) - return constructor - - # This class is copied from modeling.layers.nn_blocks.BottleneckBlock and apply # QAT. 
@tf.keras.utils.register_keras_serializable(package='Vision') @@ -131,17 +99,16 @@ class BottleneckBlockQuantized(tf.keras.layers.Layer): self._kernel_regularizer = kernel_regularizer self._bias_regularizer = bias_regularizer if use_sync_bn: - self._norm = _quantize_wrapped_layer( + self._norm = helper.quantize_wrapped_layer( tf.keras.layers.experimental.SyncBatchNormalization, configs.NoOpQuantizeConfig()) - self._norm_with_quantize = _quantize_wrapped_layer( + self._norm_with_quantize = helper.quantize_wrapped_layer( tf.keras.layers.experimental.SyncBatchNormalization, configs.Default8BitOutputQuantizeConfig()) else: - self._norm = _quantize_wrapped_layer( - tf.keras.layers.BatchNormalization, - configs.NoOpQuantizeConfig()) - self._norm_with_quantize = _quantize_wrapped_layer( + self._norm = helper.quantize_wrapped_layer( + tf.keras.layers.BatchNormalization, configs.NoOpQuantizeConfig()) + self._norm_with_quantize = helper.quantize_wrapped_layer( tf.keras.layers.BatchNormalization, configs.Default8BitOutputQuantizeConfig()) if tf.keras.backend.image_data_format() == 'channels_last': @@ -152,10 +119,10 @@ class BottleneckBlockQuantized(tf.keras.layers.Layer): def build(self, input_shape: Optional[Union[Sequence[int], tf.Tensor]]): """Build variables and child layers to prepare for calling.""" - conv2d_quantized = _quantize_wrapped_layer( + conv2d_quantized = helper.quantize_wrapped_layer( tf.keras.layers.Conv2D, - configs.Default8BitConvQuantizeConfig( - ['kernel'], ['activation'], False)) + configs.Default8BitConvQuantizeConfig(['kernel'], ['activation'], + False)) if self._use_projection: if self._resnetd_shortcut: self._shortcut0 = tf.keras.layers.AveragePooling2D( @@ -168,7 +135,7 @@ class BottleneckBlockQuantized(tf.keras.layers.Layer): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) else: self._shortcut = conv2d_quantized( filters=self._filters * 4, @@ -178,7 +145,7 @@ class BottleneckBlockQuantized(tf.keras.layers.Layer): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._norm0 = self._norm_with_quantize( axis=self._bn_axis, @@ -194,7 +161,7 @@ class BottleneckBlockQuantized(tf.keras.layers.Layer): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._norm1 = self._norm( axis=self._bn_axis, momentum=self._norm_momentum, @@ -214,7 +181,7 @@ class BottleneckBlockQuantized(tf.keras.layers.Layer): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._norm2 = self._norm( axis=self._bn_axis, momentum=self._norm_momentum, @@ -232,7 +199,7 @@ class BottleneckBlockQuantized(tf.keras.layers.Layer): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._norm3 = self._norm_with_quantize( axis=self._bn_axis, momentum=self._norm_momentum, @@ -392,10 +359,10 @@ class Conv2DBNBlockQuantized(tf.keras.layers.Layer): norm_layer = ( 
tf.keras.layers.experimental.SyncBatchNormalization if use_sync_bn else tf.keras.layers.BatchNormalization) - self._norm_with_quantize = _quantize_wrapped_layer( + self._norm_with_quantize = helper.quantize_wrapped_layer( norm_layer, configs.Default8BitOutputQuantizeConfig()) - self._norm = _quantize_wrapped_layer(norm_layer, - configs.NoOpQuantizeConfig()) + self._norm = helper.quantize_wrapped_layer(norm_layer, + configs.NoOpQuantizeConfig()) if tf.keras.backend.image_data_format() == 'channels_last': self._bn_axis = -1 @@ -432,10 +399,10 @@ class Conv2DBNBlockQuantized(tf.keras.layers.Layer): if self._use_explicit_padding and self._kernel_size > 1: padding_size = nn_layers.get_padding_for_kernel_size(self._kernel_size) self._pad = tf.keras.layers.ZeroPadding2D(padding_size) - conv2d_quantized = _quantize_wrapped_layer( + conv2d_quantized = helper.quantize_wrapped_layer( tf.keras.layers.Conv2D, - configs.Default8BitConvQuantizeConfig( - ['kernel'], ['activation'], not self._use_normalization)) + configs.Default8BitConvQuantizeConfig(['kernel'], ['activation'], + not self._use_normalization)) self._conv0 = conv2d_quantized( filters=self._filters, kernel_size=self._kernel_size, @@ -445,7 +412,7 @@ class Conv2DBNBlockQuantized(tf.keras.layers.Layer): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) if self._use_normalization: self._norm0 = self._norm_by_activation(self._activation)( axis=self._bn_axis, @@ -579,10 +546,10 @@ class InvertedBottleneckBlockQuantized(tf.keras.layers.Layer): norm_layer = ( tf.keras.layers.experimental.SyncBatchNormalization if use_sync_bn else tf.keras.layers.BatchNormalization) - self._norm_with_quantize = _quantize_wrapped_layer( + self._norm_with_quantize = helper.quantize_wrapped_layer( norm_layer, configs.Default8BitOutputQuantizeConfig()) - self._norm = _quantize_wrapped_layer(norm_layer, - configs.NoOpQuantizeConfig()) + self._norm = helper.quantize_wrapped_layer(norm_layer, + configs.NoOpQuantizeConfig()) if tf.keras.backend.image_data_format() == 'channels_last': self._bn_axis = -1 @@ -602,14 +569,14 @@ class InvertedBottleneckBlockQuantized(tf.keras.layers.Layer): def build(self, input_shape: Optional[Union[Sequence[int], tf.Tensor]]): """Build variables and child layers to prepare for calling.""" - conv2d_quantized = _quantize_wrapped_layer( + conv2d_quantized = helper.quantize_wrapped_layer( tf.keras.layers.Conv2D, - configs.Default8BitConvQuantizeConfig( - ['kernel'], ['activation'], False)) - depthwise_conv2d_quantized = _quantize_wrapped_layer( + configs.Default8BitConvQuantizeConfig(['kernel'], ['activation'], + False)) + depthwise_conv2d_quantized = helper.quantize_wrapped_layer( tf.keras.layers.DepthwiseConv2D, - configs.Default8BitConvQuantizeConfig( - ['depthwise_kernel'], ['activation'], False)) + configs.Default8BitConvQuantizeConfig(['depthwise_kernel'], + ['activation'], False)) expand_filters = self._in_filters if self._expand_ratio > 1: # First 1x1 conv for channel expansion. 
@@ -628,7 +595,7 @@ class InvertedBottleneckBlockQuantized(tf.keras.layers.Layer): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._norm0 = self._norm_by_activation(self._activation)( axis=self._bn_axis, momentum=self._norm_momentum, @@ -649,7 +616,7 @@ class InvertedBottleneckBlockQuantized(tf.keras.layers.Layer): depthwise_initializer=self._kernel_initializer, depthwise_regularizer=self._depthsize_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._norm1 = self._norm_by_activation(self._depthwise_activation)( axis=self._bn_axis, momentum=self._norm_momentum, @@ -690,7 +657,7 @@ class InvertedBottleneckBlockQuantized(tf.keras.layers.Layer): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._norm2 = self._norm_with_quantize( axis=self._bn_axis, momentum=self._norm_momentum, diff --git a/official/projects/qat/vision/modeling/layers/nn_layers.py b/official/projects/qat/vision/modeling/layers/nn_layers.py index 330de785a..ac0079734 100644 --- a/official/projects/qat/vision/modeling/layers/nn_layers.py +++ b/official/projects/qat/vision/modeling/layers/nn_layers.py @@ -14,7 +14,7 @@ """Contains common building blocks for neural networks.""" -from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union +from typing import Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union import tensorflow as tf @@ -31,36 +31,6 @@ States = Dict[str, tf.Tensor] Activation = Union[str, Callable] -class NoOpActivation: - """No-op activation which simply returns the incoming tensor. - - This activation is required to distinguish between `keras.activations.linear` - which does the same thing. The main difference is that NoOpActivation should - not have any quantize operation applied to it. 
- """ - - def __call__(self, x: tf.Tensor) -> tf.Tensor: - return x - - def get_config(self) -> Dict[str, Any]: - """Get a config of this object.""" - return {} - - def __eq__(self, other: Any) -> bool: - return isinstance(other, NoOpActivation) - - def __ne__(self, other: Any) -> bool: - return not self.__eq__(other) - - -def _quantize_wrapped_layer(cls, quantize_config): - def constructor(*arg, **kwargs): - return tfmot.quantization.keras.QuantizeWrapperV2( - cls(*arg, **kwargs), - quantize_config) - return constructor - - @tf.keras.utils.register_keras_serializable(package='Vision') class SqueezeExcitationQuantized( helper.LayerQuantizerHelper, @@ -154,14 +124,13 @@ class SqueezeExcitationQuantized( return x def build(self, input_shape): - conv2d_quantized = _quantize_wrapped_layer( + conv2d_quantized = helper.quantize_wrapped_layer( tf.keras.layers.Conv2D, - configs.Default8BitConvQuantizeConfig( - ['kernel'], ['activation'], False)) - conv2d_quantized_output_quantized = _quantize_wrapped_layer( + configs.Default8BitConvQuantizeConfig(['kernel'], ['activation'], + False)) + conv2d_quantized_output_quantized = helper.quantize_wrapped_layer( tf.keras.layers.Conv2D, - configs.Default8BitConvQuantizeConfig( - ['kernel'], ['activation'], True)) + configs.Default8BitConvQuantizeConfig(['kernel'], ['activation'], True)) num_reduced_filters = nn_layers.make_divisible( max(1, int(self._in_filters * self._se_ratio)), divisor=self._divisible_by, @@ -176,7 +145,7 @@ class SqueezeExcitationQuantized( kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._se_expand = conv2d_quantized_output_quantized( filters=self._out_filters, @@ -187,7 +156,7 @@ class SqueezeExcitationQuantized( kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, bias_regularizer=self._bias_regularizer, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._multiply = tfmot.quantization.keras.QuantizeWrapperV2( tf.keras.layers.Multiply(), @@ -342,14 +311,14 @@ class SegmentationHeadQuantized(tf.keras.layers.Layer): backbone_shape = input_shape[0] use_depthwise_convolution = self._config_dict['use_depthwise_convolution'] random_initializer = tf.keras.initializers.RandomNormal(stddev=0.01) - conv2d_quantized = _quantize_wrapped_layer( + conv2d_quantized = helper.quantize_wrapped_layer( tf.keras.layers.Conv2D, configs.Default8BitConvQuantizeConfig(['kernel'], ['activation'], False)) - conv2d_quantized_output_quantized = _quantize_wrapped_layer( + conv2d_quantized_output_quantized = helper.quantize_wrapped_layer( tf.keras.layers.Conv2D, configs.Default8BitConvQuantizeConfig(['kernel'], ['activation'], True)) - depthwise_conv2d_quantized = _quantize_wrapped_layer( + depthwise_conv2d_quantized = helper.quantize_wrapped_layer( tf.keras.layers.DepthwiseConv2D, configs.Default8BitConvQuantizeConfig(['depthwise_kernel'], ['activation'], False)) @@ -365,11 +334,13 @@ class SegmentationHeadQuantized(tf.keras.layers.Layer): tf.keras.layers.experimental.SyncBatchNormalization if self._config_dict['use_sync_bn'] else tf.keras.layers.BatchNormalization) - norm_with_quantize = _quantize_wrapped_layer( + norm_with_quantize = helper.quantize_wrapped_layer( norm_layer, configs.Default8BitOutputQuantizeConfig()) - norm = norm_with_quantize if self._config_dict['activation'] not in [ - 'relu', 'relu6' - ] else _quantize_wrapped_layer(norm_layer, 
configs.NoOpQuantizeConfig()) + if self._config_dict['activation'] not in ['relu', 'relu6']: + norm = norm_with_quantize + else: + norm = helper.quantize_wrapped_layer(norm_layer, + configs.NoOpQuantizeConfig()) bn_kwargs = { 'axis': self._bn_axis, @@ -387,7 +358,7 @@ class SegmentationHeadQuantized(tf.keras.layers.Layer): kernel_regularizer=self._config_dict['kernel_regularizer'], name='segmentation_head_deeplabv3p_fusion_conv', filters=self._config_dict['low_level_num_filters'], - activation=NoOpActivation()) + activation=helper.NoOpActivation()) self._dlv3p_norm = norm( name='segmentation_head_deeplabv3p_fusion_norm', **bn_kwargs) @@ -406,7 +377,7 @@ class SegmentationHeadQuantized(tf.keras.layers.Layer): depthwise_initializer=random_initializer, depthwise_regularizer=self._config_dict['kernel_regularizer'], depth_multiplier=1, - activation=NoOpActivation())) + activation=helper.NoOpActivation())) norm_name = 'segmentation_head_depthwise_norm_{}'.format(i) self._norms.append(norm(name=norm_name, **bn_kwargs)) conv_name = 'segmentation_head_conv_{}'.format(i) @@ -414,7 +385,7 @@ class SegmentationHeadQuantized(tf.keras.layers.Layer): conv2d_quantized( name=conv_name, filters=self._config_dict['num_filters'], - activation=NoOpActivation(), + activation=helper.NoOpActivation(), **conv_kwargs)) norm_name = 'segmentation_head_norm_{}'.format(i) self._norms.append(norm(name=norm_name, **bn_kwargs)) @@ -428,9 +399,9 @@ class SegmentationHeadQuantized(tf.keras.layers.Layer): kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01), kernel_regularizer=self._config_dict['kernel_regularizer'], bias_regularizer=self._config_dict['bias_regularizer'], - activation=NoOpActivation()) + activation=helper.NoOpActivation()) - upsampling = _quantize_wrapped_layer( + upsampling = helper.quantize_wrapped_layer( tf.keras.layers.UpSampling2D, configs.Default8BitQuantizeConfig([], [], True)) self._upsampling_layer = upsampling( @@ -440,7 +411,7 @@ class SegmentationHeadQuantized(tf.keras.layers.Layer): self._resizing_layer = tf.keras.layers.Resizing( backbone_shape[1], backbone_shape[2], interpolation='bilinear') - concat = _quantize_wrapped_layer( + concat = helper.quantize_wrapped_layer( tf.keras.layers.Concatenate, configs.Default8BitQuantizeConfig([], [], True)) self._concat_layer = concat(axis=self._bn_axis) @@ -589,17 +560,19 @@ class SpatialPyramidPoolingQuantized(nn_layers.SpatialPyramidPooling): norm_layer = ( tf.keras.layers.experimental.SyncBatchNormalization if self._use_sync_bn else tf.keras.layers.BatchNormalization) - norm_with_quantize = _quantize_wrapped_layer( + norm_with_quantize = helper.quantize_wrapped_layer( norm_layer, configs.Default8BitOutputQuantizeConfig()) - norm = norm_with_quantize if self._activation not in [ - 'relu', 'relu6' - ] else _quantize_wrapped_layer(norm_layer, configs.NoOpQuantizeConfig()) + if self._activation not in ['relu', 'relu6']: + norm = norm_with_quantize + else: + norm = helper.quantize_wrapped_layer(norm_layer, + configs.NoOpQuantizeConfig()) - conv2d_quantized = _quantize_wrapped_layer( + conv2d_quantized = helper.quantize_wrapped_layer( tf.keras.layers.Conv2D, configs.Default8BitConvQuantizeConfig(['kernel'], ['activation'], False)) - depthwise_conv2d_quantized_output_quantized = _quantize_wrapped_layer( + depthwise_conv2d_quantized_output_quantized = helper.quantize_wrapped_layer( tf.keras.layers.DepthwiseConv2D, configs.Default8BitConvQuantizeConfig(['depthwise_kernel'], ['activation'], True)) @@ -612,7 +585,7 @@ class 
SpatialPyramidPoolingQuantized(nn_layers.SpatialPyramidPooling): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, use_bias=False, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) norm1 = norm( axis=self._bn_axis, momentum=self._batchnorm_momentum, @@ -633,7 +606,7 @@ class SpatialPyramidPoolingQuantized(nn_layers.SpatialPyramidPooling): depthwise_initializer=self._kernel_initializer, dilation_rate=dilation_rate, use_bias=False, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) ] kernel_size = (1, 1) conv_dilation = leading_layers + [ @@ -645,7 +618,7 @@ class SpatialPyramidPoolingQuantized(nn_layers.SpatialPyramidPooling): kernel_initializer=self._kernel_initializer, dilation_rate=dilation_rate, use_bias=False, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) ] norm_dilation = norm( axis=self._bn_axis, @@ -656,16 +629,16 @@ class SpatialPyramidPoolingQuantized(nn_layers.SpatialPyramidPooling): if self._pool_kernel_size is None: pooling = [ - _quantize_wrapped_layer( + helper.quantize_wrapped_layer( tf.keras.layers.GlobalAveragePooling2D, configs.Default8BitQuantizeConfig([], [], True))(), - _quantize_wrapped_layer( + helper.quantize_wrapped_layer( tf.keras.layers.Reshape, configs.Default8BitQuantizeConfig([], [], True))((1, 1, channels)) ] else: pooling = [ - _quantize_wrapped_layer( + helper.quantize_wrapped_layer( tf.keras.layers.AveragePooling2D, configs.Default8BitQuantizeConfig([], [], True))(self._pool_kernel_size) @@ -677,7 +650,7 @@ class SpatialPyramidPoolingQuantized(nn_layers.SpatialPyramidPooling): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, use_bias=False, - activation=NoOpActivation()) + activation=helper.NoOpActivation()) norm2 = norm( axis=self._bn_axis, momentum=self._batchnorm_momentum, @@ -685,7 +658,7 @@ class SpatialPyramidPoolingQuantized(nn_layers.SpatialPyramidPooling): self.aspp_layers.append(pooling + [conv2, norm2]) - resizing = _quantize_wrapped_layer( + resizing = helper.quantize_wrapped_layer( tf.keras.layers.Resizing, configs.Default8BitQuantizeConfig([], [], True)) self._resizing_layer = resizing( @@ -698,14 +671,14 @@ class SpatialPyramidPoolingQuantized(nn_layers.SpatialPyramidPooling): kernel_initializer=self._kernel_initializer, kernel_regularizer=self._kernel_regularizer, use_bias=False, - activation=NoOpActivation()), + activation=helper.NoOpActivation()), norm_with_quantize( axis=self._bn_axis, momentum=self._batchnorm_momentum, epsilon=self._batchnorm_epsilon) ] self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout) - concat = _quantize_wrapped_layer( + concat = helper.quantize_wrapped_layer( tf.keras.layers.Concatenate, configs.Default8BitQuantizeConfig([], [], True)) self._concat_layer = concat(axis=-1) diff --git a/official/projects/qat/vision/quantization/helper.py b/official/projects/qat/vision/quantization/helper.py index ee0844d5a..aa59fa419 100644 --- a/official/projects/qat/vision/quantization/helper.py +++ b/official/projects/qat/vision/quantization/helper.py @@ -13,7 +13,9 @@ # limitations under the License. 
"""Quantization helpers.""" +from typing import Any, Dict +import tensorflow as tf import tensorflow_model_optimization as tfmot @@ -47,3 +49,37 @@ class LayerQuantizerHelper(object): for name in self._quantizers: self._quantizer_vars[name] = self._quantizers[name].build( tensor_shape=None, name=name, layer=self) + + +class NoOpActivation: + """No-op activation which simply returns the incoming tensor. + + This activation is required to distinguish between `keras.activations.linear` + which does the same thing. The main difference is that NoOpActivation should + not have any quantize operation applied to it. + """ + + def __call__(self, x: tf.Tensor) -> tf.Tensor: + return x + + def get_config(self) -> Dict[str, Any]: + """Get a config of this object.""" + return {} + + def __eq__(self, other: Any) -> bool: + if not other or not isinstance(other, NoOpActivation): + return False + + return True + + def __ne__(self, other: Any) -> bool: + return not self.__eq__(other) + + +def quantize_wrapped_layer(cls, quantize_config): + + def constructor(*arg, **kwargs): + return tfmot.quantization.keras.QuantizeWrapperV2( + cls(*arg, **kwargs), quantize_config) + + return constructor -- GitLab From ba6459815a63222090d8bd39bc9e7ac1feb33515 Mon Sep 17 00:00:00 2001 From: Vishnuvardhan Janapati <46058173+jvishnuvardhan@users.noreply.github.com> Date: Thu, 24 Mar 2022 20:25:45 -0700 Subject: [PATCH 47/54] Update README.md (#10556) Formatted `BibTex` for correct citation. --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ad40f14dd..e9c928a4e 100644 --- a/README.md +++ b/README.md @@ -123,9 +123,8 @@ If you use TensorFlow Model Garden in your research, please cite this repository ``` @misc{tensorflowmodelgarden2020, - author = {Hongkun Yu and Chen Chen and Xianzhi Du and Yeqing Li and - Abdullah Rashwan and Le Hou and Pengchong Jin and Fan Yang and - Frederick Liu and Jaeyoun Kim and Jing Li}, + author = {Hongkun Yu, Chen Chen, Xianzhi Du, Yeqing Li, Abdullah Rashwan, Le Hou, Pengchong Jin, Fan Yang, + Frederick Liu, Jaeyoun Kim, and Jing Li}, title = {{TensorFlow Model Garden}}, howpublished = {\url{https://github.com/tensorflow/models}}, year = {2020} -- GitLab From 8060872c96e71c6356d7bf6de1e5e7bb4d88888c Mon Sep 17 00:00:00 2001 From: Le Hou Date: Thu, 24 Mar 2022 21:17:34 -0700 Subject: [PATCH 48/54] Minor documentation change PiperOrigin-RevId: 437158986 --- official/projects/token_dropping/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/official/projects/token_dropping/README.md b/official/projects/token_dropping/README.md index 3a7c91483..4e8eb007e 100644 --- a/official/projects/token_dropping/README.md +++ b/official/projects/token_dropping/README.md @@ -95,10 +95,10 @@ modeling library: Please cite our paper: ``` -@inproceedings{pang2022, +@article{hou2022token, title={Token Dropping for Efficient BERT Pretraining}, - author={Richard Yuanzhe Pang*, Le Hou*, Tianyi Zhou, Yuexin Wu, Xinying Song, Xiaodan Song, Denny Zhou}, - year={2022}, - organization={Association for Computational Linguistics} + author={Pang, Richard Yuanzhe and Hou, Le and Zhou, Tianyi and Wu, Yuexin and Song, Xinying and Song, Xiaodan and Zhou, Denny}, + journal={arXiv preprint arXiv:2203.13240}, + year={2022} } ``` -- GitLab From 54f1ce849137d4584f93cc57fde37a60cf5c2d3f Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 24 Mar 2022 21:41:36 -0700 Subject: [PATCH 49/54] Updated readme with available models PiperOrigin-RevId: 437162068 --- official/vision/README.md | 296 +++++++++++++++++++++++++++++++++++++- 1 file changed, 295 insertions(+), 1 deletion(-) diff --git a/official/vision/README.md b/official/vision/README.md index 065323b84..57365b3c1 100644 --- a/official/vision/README.md +++ b/official/vision/README.md @@ -1 +1,295 @@ -This directory contains the new design of TF model garden vision framework. +# TF-Vision Model Garden + +⚠️ Disclaimer: All datasets hyperlinked from this page are not owned or +distributed by Google. The dataset is made available by third parties. +Please review the terms and conditions made available by the third parties +before using the data. + +## Table of Contents + +- [Introduction](#introduction) +- [Image Classification](#image-classification) + * [ResNet models trained with vanilla settings](#resnet-models-trained-with-vanilla-settings) + * [ResNet-RS models trained with various settings](#resnet-rs-models-trained-with-various-settings) + * [Vision Transformer (ViT)](#vision-transformer-ViT) +- [Object Detection and Instance Segmentation](#object-detection-and-instance-segmentation) + * [Common Settings and Notes](#Common-Settings-and-Notes) +- [COCO Object Detection Baselines](#COCO-Object-Detection-Baselines) + * [RetinaNet (ImageNet pretrained)](#RetinaNet-ImageNet-pretrained) + * [RetinaNet (Trained from scratch)](#RetinaNet-Trained-from-scratch) + * [Mobile-size RetinaNet (Trained from scratch)](#Mobile-size-RetinaNet-Trained-from-scratch)) +- [Instance Segmentation Baselines](#Instance-Segmentation-Baselines) + * [Mask R-CNN (Trained from scratch)](#Mask-R-CNN-Trained-from-scratch) + * [Cascade RCNN-RS (Trained from scratch)](#Cascade-RCNN-RS-Trained-from-scratch) +- [Semantic Segmentation](#semantic-segmentation) + * [PASCAL-VOC](#PASCAL-VOC) + * [CITYSCAPES](#CITYSCAPES) +- [Video Classification](#video-classification) + * [Common Settings and Notes](#Common-Settings-and-Notes) + * [Kinetics-400 Action Recognition Baselines](#Kinetics-400-Action-Recognition-Baselines) + * [Kinetics-600 Action Recognition Baselines](#Kinetics-600-Action-Recognition-Baselines) + +## Introduction + +TF-Vision modeling library for computer vision provides a collection of +baselines and checkpoints for image classification, object detection, and +segmentation. + +## Image Classification + +### ResNet models trained with vanilla settings + +
+ +* Models are trained from scratch with batch size 4096 and 1.6 initial learning + rate. +* Linear warmup is applied for the first 5 epochs. +* Models trained with l2 weight regularization and ReLU activation. + +| Model | Resolution | Epochs | Top-1 | Top-5 | Download | +| ------------ |:-------------:|--------:|--------:|--------:|---------:| +| ResNet-50 | 224x224 | 90 | 76.1 | 92.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) | +| ResNet-50 | 224x224 | 200 | 77.1 | 93.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml) | +| ResNet-101 | 224x224 | 200 | 78.3 | 94.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml) | +| ResNet-152 | 224x224 | 200 | 78.7 | 94.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml) | + +
+ +### ResNet-RS models trained with various settings + +
+ +We support state-of-the-art [ResNet-RS](https://arxiv.org/abs/2103.07579) image +classification models with features: + +* ResNet-RS architectural changes and Swish activation. (Note that ResNet-RS + adopts ReLU activation in the paper.) +* Regularization methods including Random Augment, 4e-5 weight decay, stochastic +depth, label smoothing and dropout. +* New training methods including a 350-epoch schedule, cosine learning rate and + EMA. +* Configs are in this [directory](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification). + +| Model | Resolution | Params (M) | Top-1 | Top-5 | Download | +| --------- | :--------: | ---------: | ----: | ----: | --------:| +| ResNet-RS-50 | 160x160 | 35.7 | 79.1 | 94.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-50-i160.tar.gz) | +| ResNet-RS-101 | 160x160 | 63.7 | 80.2 | 94.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i160.tar.gz) | +| ResNet-RS-101 | 192x192 | 63.7 | 81.3 | 95.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-101-i192.tar.gz) | +| ResNet-RS-152 | 192x192 | 86.8 | 81.9 | 95.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i192.tar.gz) | +| ResNet-RS-152 | 224x224 | 86.8 | 82.5 | 96.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i224.tar.gz) | +| ResNet-RS-152 | 256x256 | 86.8 | 83.1 | 96.3 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-152-i256.tar.gz) | +| ResNet-RS-200 | 256x256 | 93.4 | 83.5 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-200-i256.tar.gz) | +| ResNet-RS-270 | 256x256 | 130.1 | 83.6 | 96.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-270-i256.tar.gz) | +| ResNet-RS-350 | 256x256 | 164.3 | 83.7 | 96.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i256.tar.gz) | +| ResNet-RS-350 | 320x320 | 164.3 | 84.2 | 96.9 | 
[config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/image_classification/imagenet_resnetrs420_i256.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/resnet-rs/resnet-rs-350-i320.tar.gz) | + +
+ +### Vision Transformer (ViT) + +
+ +We support [ViT](https://arxiv.org/abs/2010.11929) and [DEIT](https://arxiv.org/abs/2012.12877) implementations in a TF +Vision +[project](https://github.com/tensorflow/models/tree/master/official/projects/vit). ViT models trained under the DEIT settings: + +model | resolution | Top-1 | Top-5 | +--------- | :--------: | ----: | ----: | +ViT-s16 | 224x224 | 79.4 | 94.7 | +ViT-b16 | 224x224 | 81.8 | 95.8 | +ViT-l16 | 224x224 | 82.2 | 95.8 | + +
+ +## Object Detection and Instance Segmentation + +### Common Settings and Notes + +
+ +* We provide models adopting [ResNet-FPN](https://arxiv.org/abs/1612.03144) + and [SpineNet](https://arxiv.org/abs/1912.05027) backbones based on + detection frameworks: + * [RetinaNet](https://arxiv.org/abs/1708.02002) and + [RetinaNet-RS](https://arxiv.org/abs/2107.00057) + * [Mask R-CNN](https://arxiv.org/abs/1703.06870) + * [Cascade RCNN](https://arxiv.org/abs/1712.00726) and + [Cascade RCNN-RS](https://arxiv.org/abs/2107.00057) +* Models are all trained on [COCO](https://cocodataset.org/) train2017 and + evaluated on [COCO](https://cocodataset.org/) val2017. +* Training details: + * Models finetuned from [ImageNet](https://www.image-net.org/) pretrained + checkpoints adopt the 12 or 36 epochs schedule. Models trained from + scratch adopt the 350 epochs schedule. + * The default training data augmentation implements horizontal flipping + and scale jittering with a random scale between [0.5, 2.0]. + * Unless noted, all models are trained with l2 weight regularization and + ReLU activation. + * We use batch size 256 and stepwise learning rate that decays at the last + 30 and 10 epoch. + * We use square image as input by resizing the long side of an image to + the target size then padding the short side with zeros. + +
+ +## COCO Object Detection Baselines + +### RetinaNet (ImageNet pretrained) + +
+ +| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download | +| ------------ |:-------------:| -------:|--------------:|-----------:|-------:|---------:| +| R50-FPN | 640x640 | 12 | 97.0 | 34.0 | 34.3 | config| +| R50-FPN | 640x640 | 72 | 97.0 | 34.0 | 36.8 | config \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/retinanet-resnet50fpn.tar.gz) | + +
+ +### RetinaNet (Trained from scratch) + +
+ +training features including: +* Stochastic depth with drop rate 0.2. +* Swish activation. + +| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download | +| ------------ |:-------------:| -------:|--------------:|-----------:|--------:|---------:| +| SpineNet-49 | 640x640 | 500 | 85.4| 28.5 | 44.2 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_spinenet49_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)| +| SpineNet-96 | 1024x1024 | 500 | 265.4 | 43.0 | 48.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_spinenet96_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)| +| SpineNet-143 | 1280x1280 | 500 | 524.0 | 67.0 | 50.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_spinenet143_tpu.yaml) \| [TB.dev](https://tensorboard.dev/experiment/n2UN83TkTdyKZn3slCWulg/#scalars&_smoothingWeight=0)| + +
+ +### Mobile-size RetinaNet (Trained from scratch): + +
+ +| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Download | +| ----------- | :--------: | -----: | --------: | ---------: | -----: | --------:| +| MobileNetv2 | 256x256 | 600 | - | 2.27 | 23.5 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml) | +| Mobile SpineNet-49 | 384x384 | 600 | 1.0 | 2.32 | 28.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml) \| [ckpt](https://storage.cloud.google.com/tf_model_garden/vision/retinanet/spinenet49mobile.tar.gz) | + +
+ +## Instance Segmentation Baselines + +### Mask R-CNN (Trained from scratch) + +
+ +| Backbone | Resolution | Epochs | FLOPs (B) | Params (M) | Box AP | Mask AP | Download | +| ------------ |:-------------:| -------:|-----------:|-----------:|-------:|--------:|---------:| +| ResNet50-FPN | 640x640 | 350 | 227.7 | 46.3 | 42.3 | 37.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/r50fpn_640_coco_scratch_tpu4x4.yaml) | +| SpineNet-49 | 640x640 | 350 | 215.7 | 40.8 | 42.6 | 37.9 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet49_mrcnn_tpu.yaml) | +| SpineNet-96 | 1024x1024 | 500 | 315.0 | 55.2 | 48.1 | 42.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet96_mrcnn_tpu.yaml) | +| SpineNet-143 | 1280x1280 | 500 | 498.8 | 79.2 | 49.3 | 43.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet143_mrcnn_tpu.yaml) | + +
+ +### Cascade RCNN-RS (Trained from scratch) + +
+ +| Backbone | Resolution | Epochs | Params (M) | Box AP | Mask AP | Download +------------ | :--------: | -----: | ---------: | -----: | ------: | -------: +| SpineNet-49 | 640x640 | 500 | 56.4 | 46.4 | 40.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet49_cascadercnn_tpu.yaml)| +| SpineNet-96 | 1024x1024 | 500 | 70.8 | 50.9 | 43.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet96_cascadercnn_tpu.yaml)| +| SpineNet-143 | 1280x1280 | 500 | 94.9 | 51.9 | 45.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/maskrcnn/coco_spinenet143_cascadercnn_tpu.yaml)| + +
+ +## Semantic Segmentation + +* We support [DeepLabV3](https://arxiv.org/pdf/1706.05587.pdf) and + [DeepLabV3+](https://arxiv.org/pdf/1802.02611.pdf) architectures, with + Dilated ResNet backbones. +* Backbones are pre-trained on ImageNet. + +### PASCAL-VOC + +
+ +| Model | Backbone | Resolution | Steps | mIoU | Download | +| ---------- | :----------------: | :--------: | ----: | ---: | --------:| +| DeepLabV3 | Dilated Resnet-101 | 512x512 | 30k | 78.7 | | +| DeepLabV3+ | Dilated Resnet-101 | 512x512 | 30k | 79.2 | | + +
+ +### CITYSCAPES + +
+ +| Model | Backbone | Resolution | Steps | mIoU | Download | +| ---------- | :----------------: | :--------: | ----: | ----: | --------:| +| DeepLabV3+ | Dilated Resnet-101 | 1024x2048 | 90k | 78.79 | | + +
+ +## Video Classification + +### Common Settings and Notes + +
+ +* We provide models for video classification with backbones: + * SlowOnly in + [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982). + * ResNet-3D (R3D) in + [Spatiotemporal Contrastive Video Representation Learning](https://arxiv.org/abs/2008.03800). + * ResNet-3D-RS (R3D-RS) in + [Revisiting 3D ResNets for Video Recognition](https://arxiv.org/pdf/2109.01696.pdf). + * Mobile Video Networks (MoViNets) in + [MoViNets: Mobile Video Networks for Efficient Video Recognition](https://arxiv.org/abs/2103.11511). + +* Training and evaluation details (SlowFast and ResNet): + * All models are trained from scratch with vision modality (RGB) for 200 + epochs. + * We use batch size of 1024 and cosine learning rate decay with linear warmup + in first 5 epochs. + * We follow [SlowFast](https://arxiv.org/abs/1812.03982) to perform 30-view + evaluation. + +
+ +### Kinetics-400 Action Recognition Baselines + +
+ +| Model | Input (frame x stride) | Top-1 | Top-5 | Download | +| -------- |:----------------------:|--------:|--------:|---------:| +| SlowOnly | 8 x 8 | 74.1 | 91.4 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml) | +| SlowOnly | 16 x 4 | 75.6 | 92.1 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml) | +| R3D-50 | 32 x 2 | 77.0 | 93.0 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml) | +| R3D-RS-50 | 32 x 2 | 78.2 | 93.7 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml) | +| R3D-RS-101 | 32 x 2 | 79.5 | 94.2 | - +| R3D-RS-152 | 32 x 2 | 79.9 | 94.3 | - +| R3D-RS-200 | 32 x 2 | 80.4 | 94.4 | - +| R3D-RS-200 | 48 x 2 | 81.0 | - | - +| MoViNet-A0-Base | 50 x 5 | 69.40 | 89.18 | - +| MoViNet-A1-Base | 50 x 5 | 74.57 | 92.03 | - +| MoViNet-A2-Base | 50 x 5 | 75.91 | 92.63 | - +| MoViNet-A3-Base | 120 x 2 | 79.34 | 94.52 | - +| MoViNet-A4-Base | 80 x 3 | 80.64 | 94.93 | - +| MoViNet-A5-Base | 120 x 2 | 81.39 | 95.06 | - + +
+ +### Kinetics-600 Action Recognition Baselines + +
+ +| Model | Input (frame x stride) | Top-1 | Top-5 | Download | +| -------- |:----------------------:|--------:|--------:|---------:| +| SlowOnly | 8 x 8 | 77.3 | 93.6 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml) | +| R3D-50 | 32 x 2 | 79.5 | 94.8 | [config](https://github.com/tensorflow/models/blob/master/official/vision/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml) | +| R3D-RS-200 | 32 x 2 | 83.1 | - | - +| R3D-RS-200 | 48 x 2 | 83.8 | - | - +| MoViNet-A0-Base | 50 x 5 | 72.05 | 90.92 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a0_k600_8x8.yaml) | +| MoViNet-A1-Base | 50 x 5 | 76.69 | 93.40 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a1_k600_8x8.yaml) | +| MoViNet-A2-Base | 50 x 5 | 78.62 | 94.17 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a2_k600_8x8.yaml) | +| MoViNet-A3-Base | 120 x 2 | 81.79 | 95.67 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a3_k600_8x8.yaml) | +| MoViNet-A4-Base | 80 x 3 | 83.48 | 96.16 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a4_k600_8x8.yaml) | +| MoViNet-A5-Base | 120 x 2 | 84.27 | 96.39 | [config](https://github.com/tensorflow/models/blob/master/official/projects/movinet/configs/yaml/movinet_a5_k600_8x8.yaml) | +
-- GitLab From c25e8665836834579125f93e0576b764114d624d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 24 Mar 2022 21:58:40 -0700 Subject: [PATCH 50/54] Deleting these files as they moved to legacy folder PiperOrigin-RevId: 437163784 --- official/vision/detection/README.md | 6 ------ official/vision/detection/__init__.py | 16 ---------------- official/vision/image_classification/README.md | 3 --- .../vision/image_classification/__init__.py | 17 ----------------- 4 files changed, 42 deletions(-) delete mode 100644 official/vision/detection/README.md delete mode 100644 official/vision/detection/__init__.py delete mode 100644 official/vision/image_classification/README.md delete mode 100644 official/vision/image_classification/__init__.py diff --git a/official/vision/detection/README.md b/official/vision/detection/README.md deleted file mode 100644 index 61e793397..000000000 --- a/official/vision/detection/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Object Detection Models on TensorFlow 2 - -This repository is deprecated and replaced by the solid -implementations inside vision/beta/. All the content has been moved to -[official/legacy/detection](https://github.com/tensorflow/models/tree/master/official/legacy/detection). - diff --git a/official/vision/detection/__init__.py b/official/vision/detection/__init__.py deleted file mode 100644 index 8362beb56..000000000 --- a/official/vision/detection/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Deprecating the vision/detection folder.""" -raise ImportError('This module has been moved to official/legacy/detection') diff --git a/official/vision/image_classification/README.md b/official/vision/image_classification/README.md deleted file mode 100644 index c34b48a4f..000000000 --- a/official/vision/image_classification/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This repository is deprecated and replaced by the solid -implementations inside vision/beta/. All the content has been moved to -[official/legacy/image_classification](https://github.com/tensorflow/models/tree/master/official/legacy/image_classification). diff --git a/official/vision/image_classification/__init__.py b/official/vision/image_classification/__init__.py deleted file mode 100644 index b35cc0cfa..000000000 --- a/official/vision/image_classification/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Deprecating the vision/detection folder.""" -raise ImportError( - 'This module has been moved to official/legacy/image_classification') -- GitLab From d5ce303a3b675886827344e2b85d4a94c7e61242 Mon Sep 17 00:00:00 2001 From: Yeqing Li Date: Fri, 25 Mar 2022 09:13:06 -0700 Subject: [PATCH 51/54] Update the beta dependencies. PiperOrigin-RevId: 437262580 --- official/projects/basnet/modeling/basnet_model.py | 2 +- .../edgetpu/vision/tasks/semantic_segmentation_test.py | 2 +- official/projects/teams/teams_experiments_test.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/official/projects/basnet/modeling/basnet_model.py b/official/projects/basnet/modeling/basnet_model.py index 09970af6a..cef6d456d 100644 --- a/official/projects/basnet/modeling/basnet_model.py +++ b/official/projects/basnet/modeling/basnet_model.py @@ -20,7 +20,7 @@ import tensorflow as tf from official.modeling import tf_utils from official.projects.basnet.modeling import nn_blocks -from official.vision.beta.modeling.backbones import factory +from official.vision.modeling.backbones import factory # Specifications for BASNet encoder. # Each element in the block configuration is in the following format: diff --git a/official/projects/edgetpu/vision/tasks/semantic_segmentation_test.py b/official/projects/edgetpu/vision/tasks/semantic_segmentation_test.py index 378133251..d12eb8dcd 100644 --- a/official/projects/edgetpu/vision/tasks/semantic_segmentation_test.py +++ b/official/projects/edgetpu/vision/tasks/semantic_segmentation_test.py @@ -19,12 +19,12 @@ from absl.testing import parameterized import orbit import tensorflow as tf +from official import vision from official.core import exp_factory from official.modeling import optimization from official.projects.edgetpu.vision.configs import semantic_segmentation_config as seg_cfg from official.projects.edgetpu.vision.configs import semantic_segmentation_searched_config as autoseg_cfg from official.projects.edgetpu.vision.tasks import semantic_segmentation as img_seg_task -from official.vision import beta # Dummy ADE20K TF dataset. 
diff --git a/official/projects/teams/teams_experiments_test.py b/official/projects/teams/teams_experiments_test.py index c18299aca..641b65ddc 100644 --- a/official/projects/teams/teams_experiments_test.py +++ b/official/projects/teams/teams_experiments_test.py @@ -17,9 +17,7 @@ from absl.testing import parameterized import tensorflow as tf -# pylint: disable=unused-import -from official.common import registry_imports -# pylint: enable=unused-import +from official.common import registry_imports # pylint: disable=unused-import from official.core import config_definitions as cfg from official.core import exp_factory -- GitLab From 45bd66a63a76af5ee3acfc9feaf5fd8b1d3dbb5a Mon Sep 17 00:00:00 2001 From: Abdullah Rashwan Date: Fri, 25 Mar 2022 09:15:37 -0700 Subject: [PATCH 52/54] Internal change PiperOrigin-RevId: 437263145 --- .../vision/beta/projects/yolo/common/registry_imports.py | 2 +- official/vision/beta/projects/yolo/configs/backbones.py | 2 +- .../beta/projects/yolo/configs/darknet_classification.py | 4 ++-- official/vision/beta/projects/yolo/configs/decoders.py | 2 +- official/vision/beta/projects/yolo/configs/yolo.py | 2 +- .../projects/yolo/dataloaders/classification_input.py | 4 ++-- .../beta/projects/yolo/dataloaders/tf_example_decoder.py | 2 +- .../vision/beta/projects/yolo/dataloaders/yolo_input.py | 8 ++++---- .../beta/projects/yolo/modeling/backbones/darknet.py | 3 ++- .../beta/projects/yolo/modeling/decoders/yolo_decoder.py | 4 ++-- official/vision/beta/projects/yolo/modeling/factory.py | 4 ++-- .../projects/yolo/modeling/layers/detection_generator.py | 2 +- .../beta/projects/yolo/modeling/layers/nn_blocks.py | 2 +- official/vision/beta/projects/yolo/ops/mosaic.py | 5 +++-- .../vision/beta/projects/yolo/ops/preprocessing_ops.py | 2 +- .../beta/projects/yolo/ops/preprocessing_ops_test.py | 2 +- .../beta/projects/yolo/tasks/image_classification.py | 8 ++++---- official/vision/beta/projects/yolo/tasks/yolo.py | 8 ++++---- 18 files changed, 34 insertions(+), 32 deletions(-) diff --git a/official/vision/beta/projects/yolo/common/registry_imports.py b/official/vision/beta/projects/yolo/common/registry_imports.py index 3d9d85d7b..28218cdfa 100644 --- a/official/vision/beta/projects/yolo/common/registry_imports.py +++ b/official/vision/beta/projects/yolo/common/registry_imports.py @@ -16,7 +16,7 @@ # pylint: disable=unused-import # pylint: disable=g-bad-import-order -from official.common import registry_imports +from official.vision import registry_imports # import configs from official.vision.beta.projects.yolo.configs import darknet_classification diff --git a/official/vision/beta/projects/yolo/configs/backbones.py b/official/vision/beta/projects/yolo/configs/backbones.py index 1bc3af3fd..f397809a1 100644 --- a/official/vision/beta/projects/yolo/configs/backbones.py +++ b/official/vision/beta/projects/yolo/configs/backbones.py @@ -15,7 +15,7 @@ """Backbones configurations.""" import dataclasses from official.modeling import hyperparams -from official.vision.beta.configs import backbones +from official.vision.configs import backbones @dataclasses.dataclass diff --git a/official/vision/beta/projects/yolo/configs/darknet_classification.py b/official/vision/beta/projects/yolo/configs/darknet_classification.py index ee927ac0e..f1bd29a09 100644 --- a/official/vision/beta/projects/yolo/configs/darknet_classification.py +++ b/official/vision/beta/projects/yolo/configs/darknet_classification.py @@ -20,9 +20,9 @@ from typing import List, Optional from official.core import config_definitions 
as cfg from official.core import exp_factory from official.modeling import hyperparams -from official.vision.beta.configs import common -from official.vision.beta.configs import image_classification as imc from official.vision.beta.projects.yolo.configs import backbones +from official.vision.configs import common +from official.vision.configs import image_classification as imc @dataclasses.dataclass diff --git a/official/vision/beta/projects/yolo/configs/decoders.py b/official/vision/beta/projects/yolo/configs/decoders.py index 0ae93daec..2a796a1e2 100755 --- a/official/vision/beta/projects/yolo/configs/decoders.py +++ b/official/vision/beta/projects/yolo/configs/decoders.py @@ -16,7 +16,7 @@ import dataclasses from typing import Optional from official.modeling import hyperparams -from official.vision.beta.configs import decoders +from official.vision.configs import decoders @dataclasses.dataclass diff --git a/official/vision/beta/projects/yolo/configs/yolo.py b/official/vision/beta/projects/yolo/configs/yolo.py index 343fe3347..59dcbb054 100755 --- a/official/vision/beta/projects/yolo/configs/yolo.py +++ b/official/vision/beta/projects/yolo/configs/yolo.py @@ -22,10 +22,10 @@ import numpy as np from official.core import config_definitions as cfg from official.core import exp_factory from official.modeling import hyperparams -from official.vision.beta.configs import common from official.vision.beta.projects.yolo import optimization from official.vision.beta.projects.yolo.configs import backbones from official.vision.beta.projects.yolo.configs import decoders +from official.vision.configs import common # pytype: disable=annotation-type-mismatch diff --git a/official/vision/beta/projects/yolo/dataloaders/classification_input.py b/official/vision/beta/projects/yolo/dataloaders/classification_input.py index 4c9c663b2..e1737dba3 100755 --- a/official/vision/beta/projects/yolo/dataloaders/classification_input.py +++ b/official/vision/beta/projects/yolo/dataloaders/classification_input.py @@ -14,8 +14,8 @@ """Classification decoder and parser.""" import tensorflow as tf -from official.vision.beta.dataloaders import classification_input -from official.vision.beta.ops import preprocess_ops +from official.vision.dataloaders import classification_input +from official.vision.ops import preprocess_ops class Parser(classification_input.Parser): diff --git a/official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py b/official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py index 1b4a3fa0a..8578b663f 100644 --- a/official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py +++ b/official/vision/beta/projects/yolo/dataloaders/tf_example_decoder.py @@ -19,7 +19,7 @@ protos for object detection. 
""" import tensorflow as tf -from official.vision.beta.dataloaders import tf_example_decoder +from official.vision.dataloaders import tf_example_decoder def _coco91_to_80(classif, box, areas, iscrowds): diff --git a/official/vision/beta/projects/yolo/dataloaders/yolo_input.py b/official/vision/beta/projects/yolo/dataloaders/yolo_input.py index d8981b1d1..a6591aa00 100755 --- a/official/vision/beta/projects/yolo/dataloaders/yolo_input.py +++ b/official/vision/beta/projects/yolo/dataloaders/yolo_input.py @@ -15,12 +15,12 @@ """Detection Data parser and processing for YOLO.""" import tensorflow as tf -from official.vision.beta.dataloaders import parser -from official.vision.beta.dataloaders import utils -from official.vision.beta.ops import box_ops as bbox_ops -from official.vision.beta.ops import preprocess_ops from official.vision.beta.projects.yolo.ops import anchor from official.vision.beta.projects.yolo.ops import preprocessing_ops +from official.vision.dataloaders import parser +from official.vision.dataloaders import utils +from official.vision.ops import box_ops as bbox_ops +from official.vision.ops import preprocess_ops class Parser(parser.Parser): diff --git a/official/vision/beta/projects/yolo/modeling/backbones/darknet.py b/official/vision/beta/projects/yolo/modeling/backbones/darknet.py index 623f15ed1..4f1937987 100644 --- a/official/vision/beta/projects/yolo/modeling/backbones/darknet.py +++ b/official/vision/beta/projects/yolo/modeling/backbones/darknet.py @@ -36,11 +36,12 @@ Darknets are used mainly for object detection in: """ import collections + import tensorflow as tf from official.modeling import hyperparams -from official.vision.beta.modeling.backbones import factory from official.vision.beta.projects.yolo.modeling.layers import nn_blocks +from official.vision.modeling.backbones import factory class BlockConfig: diff --git a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py index aa3d64105..3aa0dfa44 100644 --- a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py +++ b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py @@ -13,13 +13,13 @@ # limitations under the License. """Feature Pyramid Network and Path Aggregation variants used in YOLO.""" -from typing import Mapping, Union, Optional +from typing import Mapping, Optional, Union import tensorflow as tf from official.modeling import hyperparams -from official.vision.beta.modeling.decoders import factory from official.vision.beta.projects.yolo.modeling.layers import nn_blocks +from official.vision.modeling.decoders import factory # model configurations # the structure is as follows. model version, {v3, v4, v#, ... 
etc} diff --git a/official/vision/beta/projects/yolo/modeling/factory.py b/official/vision/beta/projects/yolo/modeling/factory.py index 68066a37c..1fbbecb49 100644 --- a/official/vision/beta/projects/yolo/modeling/factory.py +++ b/official/vision/beta/projects/yolo/modeling/factory.py @@ -15,13 +15,13 @@ """Contains common factory functions yolo neural networks.""" from absl import logging -from official.vision.beta.modeling.backbones import factory as backbone_factory -from official.vision.beta.modeling.decoders import factory as decoder_factory from official.vision.beta.projects.yolo.configs import yolo from official.vision.beta.projects.yolo.modeling import yolo_model from official.vision.beta.projects.yolo.modeling.heads import yolo_head from official.vision.beta.projects.yolo.modeling.layers import detection_generator +from official.vision.modeling.backbones import factory as backbone_factory +from official.vision.modeling.decoders import factory as decoder_factory def build_yolo_detection_generator(model_config: yolo.Yolo, anchor_boxes): diff --git a/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py b/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py index 1adc1e3c9..e3df866aa 100644 --- a/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py +++ b/official/vision/beta/projects/yolo/modeling/layers/detection_generator.py @@ -15,10 +15,10 @@ """Contains common building blocks for yolo layer (detection layer).""" import tensorflow as tf -from official.vision.beta.modeling.layers import detection_generator from official.vision.beta.projects.yolo.losses import yolo_loss from official.vision.beta.projects.yolo.ops import box_ops from official.vision.beta.projects.yolo.ops import loss_utils +from official.vision.modeling.layers import detection_generator class YoloLayer(tf.keras.Model): diff --git a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py index f4ab88492..5598d1580 100644 --- a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py +++ b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py @@ -18,7 +18,7 @@ from typing import Callable, List, Tuple import tensorflow as tf from official.modeling import tf_utils -from official.vision.beta.ops import spatial_transform_ops +from official.vision.ops import spatial_transform_ops class Identity(tf.keras.layers.Layer): diff --git a/official/vision/beta/projects/yolo/ops/mosaic.py b/official/vision/beta/projects/yolo/ops/mosaic.py index c49423104..e3982de2d 100755 --- a/official/vision/beta/projects/yolo/ops/mosaic.py +++ b/official/vision/beta/projects/yolo/ops/mosaic.py @@ -14,12 +14,13 @@ """Mosaic op.""" import random + import tensorflow as tf import tensorflow_addons as tfa -from official.vision.beta.ops import box_ops -from official.vision.beta.ops import preprocess_ops from official.vision.beta.projects.yolo.ops import preprocessing_ops +from official.vision.ops import box_ops +from official.vision.ops import preprocess_ops class Mosaic: diff --git a/official/vision/beta/projects/yolo/ops/preprocessing_ops.py b/official/vision/beta/projects/yolo/ops/preprocessing_ops.py index bc97cc460..93c8b1569 100755 --- a/official/vision/beta/projects/yolo/ops/preprocessing_ops.py +++ b/official/vision/beta/projects/yolo/ops/preprocessing_ops.py @@ -19,7 +19,7 @@ import numpy as np import tensorflow as tf import tensorflow_addons as tfa -from official.vision.beta.ops import 
box_ops as bbox_ops +from official.vision.ops import box_ops as bbox_ops PAD_VALUE = 114 GLOBAL_SEED_SET = False diff --git a/official/vision/beta/projects/yolo/ops/preprocessing_ops_test.py b/official/vision/beta/projects/yolo/ops/preprocessing_ops_test.py index 22cfc1c8a..c2f477684 100755 --- a/official/vision/beta/projects/yolo/ops/preprocessing_ops_test.py +++ b/official/vision/beta/projects/yolo/ops/preprocessing_ops_test.py @@ -17,8 +17,8 @@ from absl.testing import parameterized import numpy as np import tensorflow as tf -from official.vision.beta.ops import box_ops as bbox_ops from official.vision.beta.projects.yolo.ops import preprocessing_ops +from official.vision.ops import box_ops as bbox_ops class InputUtilsTest(parameterized.TestCase, tf.test.TestCase): diff --git a/official/vision/beta/projects/yolo/tasks/image_classification.py b/official/vision/beta/projects/yolo/tasks/image_classification.py index c0fdd0ae1..b69bee618 100644 --- a/official/vision/beta/projects/yolo/tasks/image_classification.py +++ b/official/vision/beta/projects/yolo/tasks/image_classification.py @@ -15,12 +15,12 @@ """Image classification task definition.""" from official.common import dataset_fn from official.core import task_factory -from official.vision.beta.dataloaders import classification_input as classification_input_base -from official.vision.beta.dataloaders import input_reader_factory -from official.vision.beta.dataloaders import tfds_factory from official.vision.beta.projects.yolo.configs import darknet_classification as exp_cfg from official.vision.beta.projects.yolo.dataloaders import classification_input -from official.vision.beta.tasks import image_classification +from official.vision.dataloaders import classification_input as classification_input_base +from official.vision.dataloaders import input_reader_factory +from official.vision.dataloaders import tfds_factory +from official.vision.tasks import image_classification @task_factory.register_task_cls(exp_cfg.ImageClassificationTask) diff --git a/official/vision/beta/projects/yolo/tasks/yolo.py b/official/vision/beta/projects/yolo/tasks/yolo.py index 6d9e9f484..4441012f6 100755 --- a/official/vision/beta/projects/yolo/tasks/yolo.py +++ b/official/vision/beta/projects/yolo/tasks/yolo.py @@ -26,10 +26,6 @@ from official.core import config_definitions from official.core import input_reader from official.core import task_factory from official.modeling import performance -from official.vision.beta.dataloaders import tfds_factory -from official.vision.beta.dataloaders import tf_example_label_map_decoder -from official.vision.beta.evaluation import coco_evaluator -from official.vision.beta.ops import box_ops from official.vision.beta.projects.yolo import optimization from official.vision.beta.projects.yolo.configs import yolo as exp_cfg from official.vision.beta.projects.yolo.dataloaders import tf_example_decoder @@ -39,6 +35,10 @@ from official.vision.beta.projects.yolo.ops import kmeans_anchors from official.vision.beta.projects.yolo.ops import mosaic from official.vision.beta.projects.yolo.ops import preprocessing_ops from official.vision.beta.projects.yolo.tasks import task_utils +from official.vision.dataloaders import tfds_factory +from official.vision.dataloaders.google import tf_example_label_map_decoder +from official.vision.evaluation import coco_evaluator +from official.vision.ops import box_ops OptimizationConfig = optimization.OptimizationConfig RuntimeConfig = config_definitions.RuntimeConfig -- GitLab From 
42f76467ed481843f5772a54ca342db8565e78c2 Mon Sep 17 00:00:00 2001 From: Yeqing Li Date: Fri, 25 Mar 2022 10:15:40 -0700 Subject: [PATCH 53/54] Removes unneeded content of the beta folder. PiperOrigin-RevId: 437276665 --- official/vision/beta/__init__.py | 19 - official/vision/beta/configs/__init__.py | 21 - official/vision/beta/configs/backbones.py | 131 - official/vision/beta/configs/backbones_3d.py | 106 - official/vision/beta/configs/common.py | 136 - official/vision/beta/configs/decoders.py | 71 - .../imagenet_mobilenetv2_gpu.yaml | 49 - .../imagenet_mobilenetv2_tpu.yaml | 48 - .../imagenet_mobilenetv3large_tpu.yaml | 53 - .../imagenet_mobilenetv3small_tpu.yaml | 63 - .../imagenet_resnet101_deeplab_tpu.yaml | 57 - .../imagenet_resnet101_tpu.yaml | 50 - .../imagenet_resnet152_tpu.yaml | 50 - .../imagenet_resnet50_deeplab_tpu.yaml | 50 - .../imagenet_resnet50_gpu.yaml | 48 - .../imagenet_resnet50_tfds_tpu.yaml | 56 - .../imagenet_resnet50_tpu.yaml | 47 - .../imagenet_resnetrs101_i160.yaml | 64 - .../imagenet_resnetrs101_i192.yaml | 64 - .../imagenet_resnetrs152_i192.yaml | 64 - .../imagenet_resnetrs152_i224.yaml | 64 - .../imagenet_resnetrs152_i256.yaml | 64 - .../imagenet_resnetrs200_i256.yaml | 64 - .../imagenet_resnetrs270_i256.yaml | 64 - .../imagenet_resnetrs350_i256.yaml | 64 - .../imagenet_resnetrs350_i320.yaml | 64 - .../imagenet_resnetrs420_i320.yaml | 63 - .../imagenet_resnetrs50_i160.yaml | 64 - .../coco_spinenet143_cascadercnn_tpu.yaml | 58 - .../maskrcnn/coco_spinenet143_mrcnn_tpu.yaml | 47 - .../coco_spinenet49_cascadercnn_tpu.yaml | 58 - .../maskrcnn/coco_spinenet49_mrcnn_tpu.yaml | 47 - .../coco_spinenet96_cascadercnn_tpu.yaml | 58 - .../maskrcnn/coco_spinenet96_mrcnn_tpu.yaml | 47 - .../r50fpn_640_coco_scratch_tpu4x4.yaml | 36 - .../retinanet/coco_mobiledetcpu_tpu.yaml | 62 - .../retinanet/coco_mobilenetv2_tpu.yaml | 62 - .../retinanet/coco_spinenet143_tpu.yaml | 58 - .../retinanet/coco_spinenet190_tpu.yaml | 57 - .../retinanet/coco_spinenet49_mobile_tpu.yaml | 60 - .../retinanet/coco_spinenet49_tpu.yaml | 58 - .../coco_spinenet49s_mobile_tpu.yaml | 60 - .../coco_spinenet49xs_mobile_tpu.yaml | 60 - .../retinanet/coco_spinenet96_tpu.yaml | 58 - .../retinanet/resnet50fpn_coco_tfds_tpu.yaml | 34 - .../resnet50fpn_coco_tpu4x4_benchmark.yaml | 7 - ...bv3plus_resnet101_cityscapes_tfds_tpu.yaml | 78 - .../k400_3d-resnet50_tpu.yaml | 88 - .../k400_resnet3drs_50_tpu.yaml | 99 - .../k400_slowonly16x4_tpu.yaml | 88 - .../k400_slowonly8x8_tpu.yaml | 88 - .../k600_3d-resnet50_tpu.yaml | 88 - .../k600_3d-resnet50g_tpu.yaml | 112 - .../k600_slowonly8x8_tpu.yaml | 88 - .../beta/configs/image_classification.py | 397 --- .../beta/configs/image_classification_test.py | 48 - official/vision/beta/configs/maskrcnn.py | 522 ---- official/vision/beta/configs/maskrcnn_test.py | 47 - official/vision/beta/configs/retinanet.py | 420 --- .../vision/beta/configs/retinanet_test.py | 46 - .../beta/configs/semantic_segmentation.py | 712 ----- .../configs/semantic_segmentation_test.py | 45 - .../beta/configs/video_classification.py | 370 --- .../beta/configs/video_classification_test.py | 44 - official/vision/beta/data/__init__.py | 14 - .../vision/beta/data/create_coco_tf_record.py | 554 ---- .../vision/beta/data/process_coco_few_shot.sh | 70 - .../data/process_coco_few_shot_json_files.py | 144 - .../vision/beta/data/process_coco_panoptic.sh | 40 - official/vision/beta/data/tfrecord_lib.py | 181 -- .../vision/beta/data/tfrecord_lib_test.py | 93 - official/vision/beta/dataloaders/__init__.py | 14 - 
.../beta/dataloaders/classification_input.py | 273 -- official/vision/beta/dataloaders/decoder.py | 35 - .../vision/beta/dataloaders/input_reader.py | 178 -- .../beta/dataloaders/input_reader_factory.py | 43 - .../vision/beta/dataloaders/maskrcnn_input.py | 345 --- official/vision/beta/dataloaders/parser.py | 81 - .../beta/dataloaders/retinanet_input.py | 328 --- .../beta/dataloaders/segmentation_input.py | 218 -- .../beta/dataloaders/tf_example_decoder.py | 176 -- .../dataloaders/tf_example_decoder_test.py | 267 -- .../tf_example_label_map_decoder.py | 67 - .../tf_example_label_map_decoder_test.py | 188 -- .../tfds_classification_decoders.py | 38 - .../dataloaders/tfds_detection_decoders.py | 60 - .../vision/beta/dataloaders/tfds_factory.py | 71 - .../beta/dataloaders/tfds_factory_test.py | 114 - .../dataloaders/tfds_segmentation_decoders.py | 86 - .../beta/dataloaders/tfexample_utils.py | 268 -- official/vision/beta/dataloaders/utils.py | 69 - .../vision/beta/dataloaders/utils_test.py | 71 - .../vision/beta/dataloaders/video_input.py | 392 --- .../beta/dataloaders/video_input_test.py | 194 -- official/vision/beta/evaluation/__init__.py | 14 - .../vision/beta/evaluation/coco_evaluator.py | 336 --- official/vision/beta/evaluation/coco_utils.py | 400 --- .../vision/beta/evaluation/coco_utils_test.py | 49 - official/vision/beta/evaluation/iou.py | 129 - official/vision/beta/evaluation/iou_test.py | 115 - .../beta/evaluation/panoptic_quality.py | 294 --- .../evaluation/panoptic_quality_evaluator.py | 184 -- .../panoptic_quality_evaluator_test.py | 96 - .../beta/evaluation/panoptic_quality_test.py | 305 --- .../beta/evaluation/segmentation_metrics.py | 227 -- .../evaluation/segmentation_metrics_test.py | 77 - .../evaluation/wod_detection_evaluator.py | 161 -- official/vision/beta/losses/__init__.py | 14 - official/vision/beta/losses/focal_loss.py | 84 - official/vision/beta/losses/loss_utils.py | 42 - .../vision/beta/losses/maskrcnn_losses.py | 312 --- .../vision/beta/losses/retinanet_losses.py | 206 -- .../vision/beta/losses/segmentation_losses.py | 134 - official/vision/beta/modeling/__init__.py | 20 - .../beta/modeling/backbones/__init__.py | 25 - .../beta/modeling/backbones/efficientnet.py | 318 --- .../modeling/backbones/efficientnet_test.py | 103 - .../vision/beta/modeling/backbones/factory.py | 112 - .../beta/modeling/backbones/factory_test.py | 227 -- .../beta/modeling/backbones/mobiledet.py | 579 ---- .../beta/modeling/backbones/mobiledet_test.py | 114 - .../beta/modeling/backbones/mobilenet.py | 936 ------- .../beta/modeling/backbones/mobilenet_test.py | 298 --- .../vision/beta/modeling/backbones/resnet.py | 432 --- .../beta/modeling/backbones/resnet_3d.py | 454 ---- .../beta/modeling/backbones/resnet_3d_test.py | 102 - .../beta/modeling/backbones/resnet_deeplab.py | 366 --- .../modeling/backbones/resnet_deeplab_test.py | 143 - .../beta/modeling/backbones/resnet_test.py | 155 -- .../vision/beta/modeling/backbones/revnet.py | 232 -- .../beta/modeling/backbones/revnet_test.py | 91 - .../beta/modeling/backbones/spinenet.py | 572 ---- .../modeling/backbones/spinenet_mobile.py | 538 ---- .../backbones/spinenet_mobile_test.py | 111 - .../beta/modeling/backbones/spinenet_test.py | 127 - .../beta/modeling/classification_model.py | 122 - .../modeling/classification_model_test.py | 183 -- .../vision/beta/modeling/decoders/__init__.py | 19 - .../vision/beta/modeling/decoders/aspp.py | 203 -- .../beta/modeling/decoders/aspp_test.py | 93 - .../vision/beta/modeling/decoders/factory.py | 135 - 
.../beta/modeling/decoders/factory_test.py | 159 -- official/vision/beta/modeling/decoders/fpn.py | 246 -- .../vision/beta/modeling/decoders/fpn_test.py | 116 - .../vision/beta/modeling/decoders/nasfpn.py | 368 --- .../beta/modeling/decoders/nasfpn_test.py | 58 - official/vision/beta/modeling/factory.py | 385 --- official/vision/beta/modeling/factory_3d.py | 103 - official/vision/beta/modeling/factory_test.py | 131 - .../vision/beta/modeling/heads/__init__.py | 21 - .../modeling/heads/dense_prediction_heads.py | 517 ---- .../heads/dense_prediction_heads_test.py | 147 -- .../beta/modeling/heads/instance_heads.py | 444 ---- .../modeling/heads/instance_heads_test.py | 134 - .../beta/modeling/heads/segmentation_heads.py | 441 ---- .../modeling/heads/segmentation_heads_test.py | 106 - .../vision/beta/modeling/layers/__init__.py | 43 - .../beta/modeling/layers/box_sampler.py | 93 - .../vision/beta/modeling/layers/deeplab.py | 211 -- .../beta/modeling/layers/deeplab_test.py | 53 - .../modeling/layers/detection_generator.py | 852 ------ .../layers/detection_generator_test.py | 249 -- .../beta/modeling/layers/mask_sampler.py | 166 -- .../vision/beta/modeling/layers/nn_blocks.py | 1512 ----------- .../beta/modeling/layers/nn_blocks_3d.py | 286 -- .../beta/modeling/layers/nn_blocks_3d_test.py | 58 - .../beta/modeling/layers/nn_blocks_test.py | 340 --- .../vision/beta/modeling/layers/nn_layers.py | 1277 --------- .../beta/modeling/layers/nn_layers_test.py | 418 --- .../beta/modeling/layers/roi_aligner.py | 72 - .../beta/modeling/layers/roi_aligner_test.py | 42 - .../beta/modeling/layers/roi_generator.py | 313 --- .../beta/modeling/layers/roi_sampler.py | 175 -- .../vision/beta/modeling/maskrcnn_model.py | 429 --- .../beta/modeling/maskrcnn_model_test.py | 397 --- .../vision/beta/modeling/retinanet_model.py | 216 -- .../beta/modeling/retinanet_model_test.py | 313 --- .../beta/modeling/segmentation_model.py | 94 - .../beta/modeling/segmentation_model_test.py | 85 - .../modeling/video_classification_model.py | 128 - .../video_classification_model_test.py | 91 - official/vision/beta/ops/__init__.py | 14 - official/vision/beta/ops/anchor.py | 373 --- official/vision/beta/ops/anchor_generator.py | 182 -- .../vision/beta/ops/anchor_generator_test.py | 137 - official/vision/beta/ops/anchor_test.py | 186 -- official/vision/beta/ops/augment.py | 2320 ----------------- official/vision/beta/ops/augment_test.py | 440 ---- official/vision/beta/ops/box_matcher.py | 191 -- official/vision/beta/ops/box_matcher_test.py | 78 - official/vision/beta/ops/box_ops.py | 763 ------ official/vision/beta/ops/iou_similarity.py | 167 -- .../vision/beta/ops/iou_similarity_test.py | 76 - official/vision/beta/ops/mask_ops.py | 190 -- official/vision/beta/ops/mask_ops_test.py | 55 - official/vision/beta/ops/nms.py | 202 -- official/vision/beta/ops/preprocess_ops.py | 919 ------- official/vision/beta/ops/preprocess_ops_3d.py | 354 --- .../vision/beta/ops/preprocess_ops_3d_test.py | 158 -- .../vision/beta/ops/preprocess_ops_test.py | 246 -- official/vision/beta/ops/sampling_ops.py | 383 --- .../vision/beta/ops/spatial_transform_ops.py | 544 ---- official/vision/beta/ops/target_gather.py | 103 - .../vision/beta/ops/target_gather_test.py | 77 - official/vision/beta/serving/__init__.py | 14 - official/vision/beta/serving/detection.py | 205 -- .../vision/beta/serving/detection_test.py | 144 - official/vision/beta/serving/export_base.py | 191 -- .../vision/beta/serving/export_base_v2.py | 75 - .../beta/serving/export_base_v2_test.py | 89 - 
.../beta/serving/export_module_factory.py | 89 - .../serving/export_module_factory_test.py | 117 - .../vision/beta/serving/export_saved_model.py | 106 - .../beta/serving/export_saved_model_lib.py | 163 -- .../serving/export_saved_model_lib_test.py | 69 - .../beta/serving/export_saved_model_lib_v2.py | 93 - official/vision/beta/serving/export_tfhub.py | 104 - official/vision/beta/serving/export_tflite.py | 108 - .../vision/beta/serving/export_tflite_lib.py | 122 - .../beta/serving/export_tflite_lib_test.py | 152 -- official/vision/beta/serving/export_utils.py | 121 - .../beta/serving/image_classification.py | 83 - .../beta/serving/image_classification_test.py | 120 - .../beta/serving/semantic_segmentation.py | 89 - .../serving/semantic_segmentation_test.py | 114 - .../beta/serving/video_classification.py | 190 -- .../beta/serving/video_classification_test.py | 112 - official/vision/beta/tasks/__init__.py | 21 - .../vision/beta/tasks/image_classification.py | 312 --- official/vision/beta/tasks/maskrcnn.py | 455 ---- official/vision/beta/tasks/retinanet.py | 358 --- .../beta/tasks/semantic_segmentation.py | 337 --- .../vision/beta/tasks/video_classification.py | 353 --- official/vision/beta/train.py | 69 - .../vision/beta/train_spatial_partitioning.py | 151 -- 235 files changed, 44899 deletions(-) delete mode 100644 official/vision/beta/__init__.py delete mode 100644 official/vision/beta/configs/__init__.py delete mode 100644 official/vision/beta/configs/backbones.py delete mode 100644 official/vision/beta/configs/backbones_3d.py delete mode 100644 official/vision/beta/configs/common.py delete mode 100644 official/vision/beta/configs/decoders.py delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_gpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv3large_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv3small_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_deeplab_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_deeplab_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tfds_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml delete mode 100644 
official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i320.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i320.yaml delete mode 100644 official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml delete mode 100644 official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_cascadercnn_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_mrcnn_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_cascadercnn_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_mrcnn_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_cascadercnn_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_mrcnn_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/maskrcnn/r50fpn_640_coco_scratch_tpu4x4.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/coco_mobiledetcpu_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/coco_spinenet190_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/coco_spinenet49s_mobile_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/coco_spinenet49xs_mobile_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/resnet50fpn_coco_tfds_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/retinanet/resnet50fpn_coco_tpu4x4_benchmark.yaml delete mode 100644 official/vision/beta/configs/experiments/semantic_segmentation/deeplabv3plus_resnet101_cityscapes_tfds_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50g_tpu.yaml delete mode 100644 official/vision/beta/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml delete mode 100644 official/vision/beta/configs/image_classification.py delete mode 100644 official/vision/beta/configs/image_classification_test.py delete mode 100644 
official/vision/beta/configs/maskrcnn.py delete mode 100644 official/vision/beta/configs/maskrcnn_test.py delete mode 100644 official/vision/beta/configs/retinanet.py delete mode 100644 official/vision/beta/configs/retinanet_test.py delete mode 100644 official/vision/beta/configs/semantic_segmentation.py delete mode 100644 official/vision/beta/configs/semantic_segmentation_test.py delete mode 100644 official/vision/beta/configs/video_classification.py delete mode 100644 official/vision/beta/configs/video_classification_test.py delete mode 100644 official/vision/beta/data/__init__.py delete mode 100644 official/vision/beta/data/create_coco_tf_record.py delete mode 100644 official/vision/beta/data/process_coco_few_shot.sh delete mode 100644 official/vision/beta/data/process_coco_few_shot_json_files.py delete mode 100644 official/vision/beta/data/process_coco_panoptic.sh delete mode 100644 official/vision/beta/data/tfrecord_lib.py delete mode 100644 official/vision/beta/data/tfrecord_lib_test.py delete mode 100644 official/vision/beta/dataloaders/__init__.py delete mode 100644 official/vision/beta/dataloaders/classification_input.py delete mode 100644 official/vision/beta/dataloaders/decoder.py delete mode 100644 official/vision/beta/dataloaders/input_reader.py delete mode 100644 official/vision/beta/dataloaders/input_reader_factory.py delete mode 100644 official/vision/beta/dataloaders/maskrcnn_input.py delete mode 100644 official/vision/beta/dataloaders/parser.py delete mode 100644 official/vision/beta/dataloaders/retinanet_input.py delete mode 100644 official/vision/beta/dataloaders/segmentation_input.py delete mode 100644 official/vision/beta/dataloaders/tf_example_decoder.py delete mode 100644 official/vision/beta/dataloaders/tf_example_decoder_test.py delete mode 100644 official/vision/beta/dataloaders/tf_example_label_map_decoder.py delete mode 100644 official/vision/beta/dataloaders/tf_example_label_map_decoder_test.py delete mode 100644 official/vision/beta/dataloaders/tfds_classification_decoders.py delete mode 100644 official/vision/beta/dataloaders/tfds_detection_decoders.py delete mode 100644 official/vision/beta/dataloaders/tfds_factory.py delete mode 100644 official/vision/beta/dataloaders/tfds_factory_test.py delete mode 100644 official/vision/beta/dataloaders/tfds_segmentation_decoders.py delete mode 100644 official/vision/beta/dataloaders/tfexample_utils.py delete mode 100644 official/vision/beta/dataloaders/utils.py delete mode 100644 official/vision/beta/dataloaders/utils_test.py delete mode 100644 official/vision/beta/dataloaders/video_input.py delete mode 100644 official/vision/beta/dataloaders/video_input_test.py delete mode 100644 official/vision/beta/evaluation/__init__.py delete mode 100644 official/vision/beta/evaluation/coco_evaluator.py delete mode 100644 official/vision/beta/evaluation/coco_utils.py delete mode 100644 official/vision/beta/evaluation/coco_utils_test.py delete mode 100644 official/vision/beta/evaluation/iou.py delete mode 100644 official/vision/beta/evaluation/iou_test.py delete mode 100644 official/vision/beta/evaluation/panoptic_quality.py delete mode 100644 official/vision/beta/evaluation/panoptic_quality_evaluator.py delete mode 100644 official/vision/beta/evaluation/panoptic_quality_evaluator_test.py delete mode 100644 official/vision/beta/evaluation/panoptic_quality_test.py delete mode 100644 official/vision/beta/evaluation/segmentation_metrics.py delete mode 100644 official/vision/beta/evaluation/segmentation_metrics_test.py delete mode 
100644 official/vision/beta/evaluation/wod_detection_evaluator.py delete mode 100644 official/vision/beta/losses/__init__.py delete mode 100644 official/vision/beta/losses/focal_loss.py delete mode 100644 official/vision/beta/losses/loss_utils.py delete mode 100644 official/vision/beta/losses/maskrcnn_losses.py delete mode 100644 official/vision/beta/losses/retinanet_losses.py delete mode 100644 official/vision/beta/losses/segmentation_losses.py delete mode 100644 official/vision/beta/modeling/__init__.py delete mode 100644 official/vision/beta/modeling/backbones/__init__.py delete mode 100644 official/vision/beta/modeling/backbones/efficientnet.py delete mode 100644 official/vision/beta/modeling/backbones/efficientnet_test.py delete mode 100644 official/vision/beta/modeling/backbones/factory.py delete mode 100644 official/vision/beta/modeling/backbones/factory_test.py delete mode 100644 official/vision/beta/modeling/backbones/mobiledet.py delete mode 100644 official/vision/beta/modeling/backbones/mobiledet_test.py delete mode 100644 official/vision/beta/modeling/backbones/mobilenet.py delete mode 100644 official/vision/beta/modeling/backbones/mobilenet_test.py delete mode 100644 official/vision/beta/modeling/backbones/resnet.py delete mode 100644 official/vision/beta/modeling/backbones/resnet_3d.py delete mode 100644 official/vision/beta/modeling/backbones/resnet_3d_test.py delete mode 100644 official/vision/beta/modeling/backbones/resnet_deeplab.py delete mode 100644 official/vision/beta/modeling/backbones/resnet_deeplab_test.py delete mode 100644 official/vision/beta/modeling/backbones/resnet_test.py delete mode 100644 official/vision/beta/modeling/backbones/revnet.py delete mode 100644 official/vision/beta/modeling/backbones/revnet_test.py delete mode 100644 official/vision/beta/modeling/backbones/spinenet.py delete mode 100644 official/vision/beta/modeling/backbones/spinenet_mobile.py delete mode 100644 official/vision/beta/modeling/backbones/spinenet_mobile_test.py delete mode 100644 official/vision/beta/modeling/backbones/spinenet_test.py delete mode 100644 official/vision/beta/modeling/classification_model.py delete mode 100644 official/vision/beta/modeling/classification_model_test.py delete mode 100644 official/vision/beta/modeling/decoders/__init__.py delete mode 100644 official/vision/beta/modeling/decoders/aspp.py delete mode 100644 official/vision/beta/modeling/decoders/aspp_test.py delete mode 100644 official/vision/beta/modeling/decoders/factory.py delete mode 100644 official/vision/beta/modeling/decoders/factory_test.py delete mode 100644 official/vision/beta/modeling/decoders/fpn.py delete mode 100644 official/vision/beta/modeling/decoders/fpn_test.py delete mode 100644 official/vision/beta/modeling/decoders/nasfpn.py delete mode 100644 official/vision/beta/modeling/decoders/nasfpn_test.py delete mode 100644 official/vision/beta/modeling/factory.py delete mode 100644 official/vision/beta/modeling/factory_3d.py delete mode 100644 official/vision/beta/modeling/factory_test.py delete mode 100644 official/vision/beta/modeling/heads/__init__.py delete mode 100644 official/vision/beta/modeling/heads/dense_prediction_heads.py delete mode 100644 official/vision/beta/modeling/heads/dense_prediction_heads_test.py delete mode 100644 official/vision/beta/modeling/heads/instance_heads.py delete mode 100644 official/vision/beta/modeling/heads/instance_heads_test.py delete mode 100644 official/vision/beta/modeling/heads/segmentation_heads.py delete mode 100644 
official/vision/beta/modeling/heads/segmentation_heads_test.py delete mode 100644 official/vision/beta/modeling/layers/__init__.py delete mode 100644 official/vision/beta/modeling/layers/box_sampler.py delete mode 100644 official/vision/beta/modeling/layers/deeplab.py delete mode 100644 official/vision/beta/modeling/layers/deeplab_test.py delete mode 100644 official/vision/beta/modeling/layers/detection_generator.py delete mode 100644 official/vision/beta/modeling/layers/detection_generator_test.py delete mode 100644 official/vision/beta/modeling/layers/mask_sampler.py delete mode 100644 official/vision/beta/modeling/layers/nn_blocks.py delete mode 100644 official/vision/beta/modeling/layers/nn_blocks_3d.py delete mode 100644 official/vision/beta/modeling/layers/nn_blocks_3d_test.py delete mode 100644 official/vision/beta/modeling/layers/nn_blocks_test.py delete mode 100644 official/vision/beta/modeling/layers/nn_layers.py delete mode 100644 official/vision/beta/modeling/layers/nn_layers_test.py delete mode 100644 official/vision/beta/modeling/layers/roi_aligner.py delete mode 100644 official/vision/beta/modeling/layers/roi_aligner_test.py delete mode 100644 official/vision/beta/modeling/layers/roi_generator.py delete mode 100644 official/vision/beta/modeling/layers/roi_sampler.py delete mode 100644 official/vision/beta/modeling/maskrcnn_model.py delete mode 100644 official/vision/beta/modeling/maskrcnn_model_test.py delete mode 100644 official/vision/beta/modeling/retinanet_model.py delete mode 100644 official/vision/beta/modeling/retinanet_model_test.py delete mode 100644 official/vision/beta/modeling/segmentation_model.py delete mode 100644 official/vision/beta/modeling/segmentation_model_test.py delete mode 100644 official/vision/beta/modeling/video_classification_model.py delete mode 100644 official/vision/beta/modeling/video_classification_model_test.py delete mode 100644 official/vision/beta/ops/__init__.py delete mode 100644 official/vision/beta/ops/anchor.py delete mode 100644 official/vision/beta/ops/anchor_generator.py delete mode 100644 official/vision/beta/ops/anchor_generator_test.py delete mode 100644 official/vision/beta/ops/anchor_test.py delete mode 100644 official/vision/beta/ops/augment.py delete mode 100644 official/vision/beta/ops/augment_test.py delete mode 100644 official/vision/beta/ops/box_matcher.py delete mode 100644 official/vision/beta/ops/box_matcher_test.py delete mode 100644 official/vision/beta/ops/box_ops.py delete mode 100644 official/vision/beta/ops/iou_similarity.py delete mode 100644 official/vision/beta/ops/iou_similarity_test.py delete mode 100644 official/vision/beta/ops/mask_ops.py delete mode 100644 official/vision/beta/ops/mask_ops_test.py delete mode 100644 official/vision/beta/ops/nms.py delete mode 100644 official/vision/beta/ops/preprocess_ops.py delete mode 100644 official/vision/beta/ops/preprocess_ops_3d.py delete mode 100644 official/vision/beta/ops/preprocess_ops_3d_test.py delete mode 100644 official/vision/beta/ops/preprocess_ops_test.py delete mode 100644 official/vision/beta/ops/sampling_ops.py delete mode 100644 official/vision/beta/ops/spatial_transform_ops.py delete mode 100644 official/vision/beta/ops/target_gather.py delete mode 100644 official/vision/beta/ops/target_gather_test.py delete mode 100644 official/vision/beta/serving/__init__.py delete mode 100644 official/vision/beta/serving/detection.py delete mode 100644 official/vision/beta/serving/detection_test.py delete mode 100644 official/vision/beta/serving/export_base.py 
delete mode 100644 official/vision/beta/serving/export_base_v2.py delete mode 100644 official/vision/beta/serving/export_base_v2_test.py delete mode 100644 official/vision/beta/serving/export_module_factory.py delete mode 100644 official/vision/beta/serving/export_module_factory_test.py delete mode 100644 official/vision/beta/serving/export_saved_model.py delete mode 100644 official/vision/beta/serving/export_saved_model_lib.py delete mode 100644 official/vision/beta/serving/export_saved_model_lib_test.py delete mode 100644 official/vision/beta/serving/export_saved_model_lib_v2.py delete mode 100644 official/vision/beta/serving/export_tfhub.py delete mode 100644 official/vision/beta/serving/export_tflite.py delete mode 100644 official/vision/beta/serving/export_tflite_lib.py delete mode 100644 official/vision/beta/serving/export_tflite_lib_test.py delete mode 100644 official/vision/beta/serving/export_utils.py delete mode 100644 official/vision/beta/serving/image_classification.py delete mode 100644 official/vision/beta/serving/image_classification_test.py delete mode 100644 official/vision/beta/serving/semantic_segmentation.py delete mode 100644 official/vision/beta/serving/semantic_segmentation_test.py delete mode 100644 official/vision/beta/serving/video_classification.py delete mode 100644 official/vision/beta/serving/video_classification_test.py delete mode 100644 official/vision/beta/tasks/__init__.py delete mode 100644 official/vision/beta/tasks/image_classification.py delete mode 100644 official/vision/beta/tasks/maskrcnn.py delete mode 100644 official/vision/beta/tasks/retinanet.py delete mode 100644 official/vision/beta/tasks/semantic_segmentation.py delete mode 100644 official/vision/beta/tasks/video_classification.py delete mode 100644 official/vision/beta/train.py delete mode 100644 official/vision/beta/train_spatial_partitioning.py diff --git a/official/vision/beta/__init__.py b/official/vision/beta/__init__.py deleted file mode 100644 index 16339134f..000000000 --- a/official/vision/beta/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Vision package definition.""" -# Lint as: python3 -# pylint: disable=unused-import -from official.vision.beta import configs -from official.vision.beta import tasks diff --git a/official/vision/beta/configs/__init__.py b/official/vision/beta/configs/__init__.py deleted file mode 100644 index c196abeea..000000000 --- a/official/vision/beta/configs/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Configs package definition.""" - -from official.vision.beta.configs import image_classification -from official.vision.beta.configs import maskrcnn -from official.vision.beta.configs import retinanet -from official.vision.beta.configs import semantic_segmentation -from official.vision.beta.configs import video_classification diff --git a/official/vision/beta/configs/backbones.py b/official/vision/beta/configs/backbones.py deleted file mode 100644 index 337844c4b..000000000 --- a/official/vision/beta/configs/backbones.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Backbones configurations.""" -import dataclasses -from typing import Optional, List - -# Import libraries - -from official.modeling import hyperparams - - -@dataclasses.dataclass -class ResNet(hyperparams.Config): - """ResNet config.""" - model_id: int = 50 - depth_multiplier: float = 1.0 - stem_type: str = 'v0' - se_ratio: float = 0.0 - stochastic_depth_drop_rate: float = 0.0 - scale_stem: bool = True - resnetd_shortcut: bool = False - replace_stem_max_pool: bool = False - bn_trainable: bool = True - - -@dataclasses.dataclass -class DilatedResNet(hyperparams.Config): - """DilatedResNet config.""" - model_id: int = 50 - output_stride: int = 16 - multigrid: Optional[List[int]] = None - stem_type: str = 'v0' - last_stage_repeats: int = 1 - se_ratio: float = 0.0 - stochastic_depth_drop_rate: float = 0.0 - - -@dataclasses.dataclass -class EfficientNet(hyperparams.Config): - """EfficientNet config.""" - model_id: str = 'b0' - se_ratio: float = 0.0 - stochastic_depth_drop_rate: float = 0.0 - - -@dataclasses.dataclass -class MobileNet(hyperparams.Config): - """Mobilenet config.""" - model_id: str = 'MobileNetV2' - filter_size_scale: float = 1.0 - stochastic_depth_drop_rate: float = 0.0 - output_stride: Optional[int] = None - output_intermediate_endpoints: bool = False - - -@dataclasses.dataclass -class SpineNet(hyperparams.Config): - """SpineNet config.""" - model_id: str = '49' - stochastic_depth_drop_rate: float = 0.0 - min_level: int = 3 - max_level: int = 7 - - -@dataclasses.dataclass -class SpineNetMobile(hyperparams.Config): - """SpineNet config.""" - model_id: str = '49' - stochastic_depth_drop_rate: float = 0.0 - se_ratio: float = 0.2 - expand_ratio: int = 6 - min_level: int = 3 - max_level: int = 7 - # If use_keras_upsampling_2d is True, model uses UpSampling2D keras layer - # instead of optimized custom TF op. It makes model be more keras style. 
We - # set this flag to True when we apply QAT from model optimization toolkit - # that requires the model should use keras layers. - use_keras_upsampling_2d: bool = False - - -@dataclasses.dataclass -class RevNet(hyperparams.Config): - """RevNet config.""" - # Specifies the depth of RevNet. - model_id: int = 56 - - -@dataclasses.dataclass -class MobileDet(hyperparams.Config): - """Mobiledet config.""" - model_id: str = 'MobileDetCPU' - filter_size_scale: float = 1.0 - - -@dataclasses.dataclass -class Backbone(hyperparams.OneOfConfig): - """Configuration for backbones. - - Attributes: - type: 'str', type of backbone be used, one of the fields below. - resnet: resnet backbone config. - dilated_resnet: dilated resnet backbone for semantic segmentation config. - revnet: revnet backbone config. - efficientnet: efficientnet backbone config. - spinenet: spinenet backbone config. - spinenet_mobile: mobile spinenet backbone config. - mobilenet: mobilenet backbone config. - mobiledet: mobiledet backbone config. - """ - type: Optional[str] = None - resnet: ResNet = ResNet() - dilated_resnet: DilatedResNet = DilatedResNet() - revnet: RevNet = RevNet() - efficientnet: EfficientNet = EfficientNet() - spinenet: SpineNet = SpineNet() - spinenet_mobile: SpineNetMobile = SpineNetMobile() - mobilenet: MobileNet = MobileNet() - mobiledet: MobileDet = MobileDet() - diff --git a/official/vision/beta/configs/backbones_3d.py b/official/vision/beta/configs/backbones_3d.py deleted file mode 100644 index 436a3b1be..000000000 --- a/official/vision/beta/configs/backbones_3d.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""3D Backbones configurations.""" -from typing import Optional, Tuple - -# Import libraries -import dataclasses - -from official.modeling import hyperparams - - -@dataclasses.dataclass -class ResNet3DBlock(hyperparams.Config): - """Configuration of a ResNet 3D block.""" - temporal_strides: int = 1 - temporal_kernel_sizes: Tuple[int, ...] = () - use_self_gating: bool = False - - -@dataclasses.dataclass -class ResNet3D(hyperparams.Config): - """ResNet config.""" - model_id: int = 50 - stem_type: str = 'v0' - stem_conv_temporal_kernel_size: int = 5 - stem_conv_temporal_stride: int = 2 - stem_pool_temporal_stride: int = 2 - block_specs: Tuple[ResNet3DBlock, ...] 
= () - stochastic_depth_drop_rate: float = 0.0 - se_ratio: float = 0.0 - - -@dataclasses.dataclass -class ResNet3D50(ResNet3D): - """Block specifications of the Resnet50 (3D) model.""" - model_id: int = 50 - block_specs: Tuple[ - ResNet3DBlock, ResNet3DBlock, ResNet3DBlock, ResNet3DBlock] = ( - ResNet3DBlock(temporal_strides=1, - temporal_kernel_sizes=(3, 3, 3), - use_self_gating=True), - ResNet3DBlock(temporal_strides=1, - temporal_kernel_sizes=(3, 1, 3, 1), - use_self_gating=True), - ResNet3DBlock(temporal_strides=1, - temporal_kernel_sizes=(3, 1, 3, 1, 3, 1), - use_self_gating=True), - ResNet3DBlock(temporal_strides=1, - temporal_kernel_sizes=(1, 3, 1), - use_self_gating=True)) - - -@dataclasses.dataclass -class ResNet3DRS(ResNet3D): - """Block specifications of the ResNet-RS (3D) model.""" - model_id: int = 50 - stem_type: str = 'v1' - stem_conv_temporal_kernel_size: int = 5 - stem_conv_temporal_stride: int = 2 - stem_pool_temporal_stride: int = 2 - stochastic_depth_drop_rate: float = 0.1 - se_ratio: float = 0.2 - block_specs: Tuple[ - ResNet3DBlock, ResNet3DBlock, ResNet3DBlock, ResNet3DBlock] = ( - ResNet3DBlock(temporal_strides=1, - temporal_kernel_sizes=(1,), - use_self_gating=True), - ResNet3DBlock(temporal_strides=1, - temporal_kernel_sizes=(1,), - use_self_gating=True), - ResNet3DBlock(temporal_strides=1, - temporal_kernel_sizes=(3,), - use_self_gating=True), - ResNet3DBlock(temporal_strides=1, - temporal_kernel_sizes=(3,), - use_self_gating=True)) - - -_RESNET3D50_DEFAULT_CFG = ResNet3D50() -_RESNET3DRS_DEFAULT_CFG = ResNet3DRS() - - -@dataclasses.dataclass -class Backbone3D(hyperparams.OneOfConfig): - """Configuration for backbones. - - Attributes: - type: 'str', type of backbone be used, one of the fields below. - resnet_3d: resnet3d backbone config. - resnet_3d_rs: resnet3d-rs backbone config. - """ - type: Optional[str] = None - resnet_3d: ResNet3D = _RESNET3D50_DEFAULT_CFG - resnet_3d_rs: ResNet3D = _RESNET3DRS_DEFAULT_CFG diff --git a/official/vision/beta/configs/common.py b/official/vision/beta/configs/common.py deleted file mode 100644 index 2e3897f7f..000000000 --- a/official/vision/beta/configs/common.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Common configurations.""" - -import dataclasses -from typing import List, Optional - -# Import libraries - -from official.core import config_definitions as cfg -from official.modeling import hyperparams - - -@dataclasses.dataclass -class TfExampleDecoder(hyperparams.Config): - """A simple TF Example decoder config.""" - regenerate_source_id: bool = False - mask_binarize_threshold: Optional[float] = None - - -@dataclasses.dataclass -class TfExampleDecoderLabelMap(hyperparams.Config): - """TF Example decoder with label map config.""" - regenerate_source_id: bool = False - mask_binarize_threshold: Optional[float] = None - label_map: str = '' - - -@dataclasses.dataclass -class DataDecoder(hyperparams.OneOfConfig): - """Data decoder config. - - Attributes: - type: 'str', type of data decoder be used, one of the fields below. - simple_decoder: simple TF Example decoder config. - label_map_decoder: TF Example decoder with label map config. - """ - type: Optional[str] = 'simple_decoder' - simple_decoder: TfExampleDecoder = TfExampleDecoder() - label_map_decoder: TfExampleDecoderLabelMap = TfExampleDecoderLabelMap() - - -@dataclasses.dataclass -class RandAugment(hyperparams.Config): - """Configuration for RandAugment.""" - num_layers: int = 2 - magnitude: float = 10 - cutout_const: float = 40 - translate_const: float = 10 - magnitude_std: float = 0.0 - prob_to_apply: Optional[float] = None - exclude_ops: List[str] = dataclasses.field(default_factory=list) - - -@dataclasses.dataclass -class AutoAugment(hyperparams.Config): - """Configuration for AutoAugment.""" - augmentation_name: str = 'v0' - cutout_const: float = 100 - translate_const: float = 250 - - -@dataclasses.dataclass -class RandomErasing(hyperparams.Config): - """Configuration for RandomErasing.""" - probability: float = 0.25 - min_area: float = 0.02 - max_area: float = 1 / 3 - min_aspect: float = 0.3 - max_aspect = None - min_count = 1 - max_count = 1 - trials = 10 - - -@dataclasses.dataclass -class MixupAndCutmix(hyperparams.Config): - """Configuration for MixupAndCutmix.""" - mixup_alpha: float = .8 - cutmix_alpha: float = 1. - prob: float = 1.0 - switch_prob: float = 0.5 - label_smoothing: float = 0.1 - - -@dataclasses.dataclass -class Augmentation(hyperparams.OneOfConfig): - """Configuration for input data augmentation. - - Attributes: - type: 'str', type of augmentation be used, one of the fields below. - randaug: RandAugment config. - autoaug: AutoAugment config. - """ - type: Optional[str] = None - randaug: RandAugment = RandAugment() - autoaug: AutoAugment = AutoAugment() - - -@dataclasses.dataclass -class NormActivation(hyperparams.Config): - activation: str = 'relu' - use_sync_bn: bool = True - norm_momentum: float = 0.99 - norm_epsilon: float = 0.001 - - -@dataclasses.dataclass -class PseudoLabelDataConfig(cfg.DataConfig): - """Psuedo Label input config for training.""" - input_path: str = '' - data_ratio: float = 1.0 # Per-batch ratio of pseudo-labeled to labeled data. - is_training: bool = True - dtype: str = 'float32' - shuffle_buffer_size: int = 10000 - cycle_length: int = 10 - aug_rand_hflip: bool = True - aug_type: Optional[ - Augmentation] = None # Choose from AutoAugment and RandAugment. - file_type: str = 'tfrecord' - - # Keep for backward compatibility. - aug_policy: Optional[str] = None # None, 'autoaug', or 'randaug'. 
- randaug_magnitude: Optional[int] = 10 diff --git a/official/vision/beta/configs/decoders.py b/official/vision/beta/configs/decoders.py deleted file mode 100644 index 4081d429e..000000000 --- a/official/vision/beta/configs/decoders.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoders configurations.""" -import dataclasses -from typing import List, Optional - -# Import libraries - -from official.modeling import hyperparams - - -@dataclasses.dataclass -class Identity(hyperparams.Config): - """Identity config.""" - pass - - -@dataclasses.dataclass -class FPN(hyperparams.Config): - """FPN config.""" - num_filters: int = 256 - fusion_type: str = 'sum' - use_separable_conv: bool = False - - -@dataclasses.dataclass -class NASFPN(hyperparams.Config): - """NASFPN config.""" - num_filters: int = 256 - num_repeats: int = 5 - use_separable_conv: bool = False - - -@dataclasses.dataclass -class ASPP(hyperparams.Config): - """ASPP config.""" - level: int = 4 - dilation_rates: List[int] = dataclasses.field(default_factory=list) - dropout_rate: float = 0.0 - num_filters: int = 256 - use_depthwise_convolution: bool = False - pool_kernel_size: Optional[List[int]] = None # Use global average pooling. - spp_layer_version: str = 'v1' - output_tensor: bool = False - - -@dataclasses.dataclass -class Decoder(hyperparams.OneOfConfig): - """Configuration for decoders. - - Attributes: - type: 'str', type of decoder be used, one of the fields below. - fpn: fpn config. - """ - type: Optional[str] = None - fpn: FPN = FPN() - nasfpn: NASFPN = NASFPN() - identity: Identity = Identity() - aspp: ASPP = ASPP() diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_gpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_gpu.yaml deleted file mode 100644 index ff1a0719e..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_gpu.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# MobileNetV2_1.0 ImageNet classification. 71.0% top-1 and 90.0% top-5 accuracy. 
-runtime: - distribution_strategy: 'mirrored' - mixed_precision_dtype: 'float16' - loss_scale: 'dynamic' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'mobilenet' - mobilenet: - model_id: 'MobileNetV2' - filter_size_scale: 1.0 - dropout_rate: 0.2 - losses: - l2_weight_decay: 0.00001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 1024 # 128 * 8 - dtype: 'float16' - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 1024 # 128 * 8 - dtype: 'float16' - drop_remainder: false -trainer: - train_steps: 625500 # 500 epochs - validation_steps: 49 - validation_interval: 1251 - steps_per_loop: 1251 # NUM_EXAMPLES (1281167) // global_batch_size - summary_interval: 1251 - checkpoint_interval: 1251 - optimizer_config: - learning_rate: - type: 'exponential' - exponential: - initial_learning_rate: 0.064 # 0.008 * batch_size / 128 - decay_steps: 3127 # 2.5 * steps_per_epoch - decay_rate: 0.96 - staircase: true - warmup: - type: 'linear' - linear: - warmup_steps: 6255 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_tpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_tpu.yaml deleted file mode 100644 index b5df9d6e7..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_tpu.yaml +++ /dev/null @@ -1,48 +0,0 @@ -# MobileNetV2_1.0 ImageNet classification. 72.72% top-1 and 91.05% top-5 accuracy. -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'mobilenet' - mobilenet: - model_id: 'MobileNetV2' - filter_size_scale: 1.0 - dropout_rate: 0.2 - losses: - l2_weight_decay: 0.00001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 156000 # 500 epochs - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 # NUM_EXAMPLES (1281167) // global_batch_size - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - learning_rate: - type: 'exponential' - exponential: - initial_learning_rate: 0.256 # 0.008 * batch_size / 128 - decay_steps: 780 # 2.5 * steps_per_epoch - decay_rate: 0.96 - staircase: true - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv3large_tpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv3large_tpu.yaml deleted file mode 100644 index a8fb0c95d..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv3large_tpu.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# MobileNetV3-large_1.0 ImageNet classification: 74.96% top-1. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'mobilenet' - mobilenet: - model_id: 'MobileNetV3Large' - filter_size_scale: 1.0 - dropout_rate: 0.2 - losses: - l2_weight_decay: 0.00001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - # Enables Inception-style pre-processing. - decode_jpeg_only: false - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false - # Enables Inception-style pre-processing. - decode_jpeg_only: false -trainer: - train_steps: 156000 # 500 epochs - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 # NUM_EXAMPLES (1281167) // global_batch_size - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - learning_rate: - type: 'cosine' - cosine: - alpha: 0.0 - decay_steps: 156000 - initial_learning_rate: 0.5 - name: CosineDecay - offset: 0 - warmup: - type: 'linear' - linear: - warmup_steps: 5000 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv3small_tpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv3small_tpu.yaml deleted file mode 100644 index 574667952..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv3small_tpu.yaml +++ /dev/null @@ -1,63 +0,0 @@ -# MobileNetV3Small ImageNet classification. 67.5% top-1 and 87.6% top-5 accuracy. -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'mobilenet' - mobilenet: - model_id: 'MobileNetV3Small' - filter_size_scale: 1.0 - norm_activation: - activation: 'relu' - norm_momentum: 0.997 - norm_epsilon: 0.001 - use_sync_bn: false - dropout_rate: 0.2 - losses: - l2_weight_decay: 0.00001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 312000 # 1000 epochs - validation_steps: 12 - validation_interval: 312 - steps_per_loop: 312 # NUM_EXAMPLES (1281167) // global_batch_size - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - optimizer: - type: 'rmsprop' - rmsprop: - rho: 0.9 - momentum: 0.9 - epsilon: 0.002 - learning_rate: - type: 'exponential' - exponential: - initial_learning_rate: 0.01 - decay_steps: 936 # 3 * steps_per_epoch - decay_rate: 0.99 - staircase: true - ema: - average_decay: 0.9999 - trainable_weights_only: false - warmup: - type: 'linear' - linear: - warmup_steps: 1560 - warmup_learning_rate: 0.001 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_deeplab_tpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_deeplab_tpu.yaml deleted file mode 100644 index 5d7d29596..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_deeplab_tpu.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Top-1 accuracy 81.6% on ImageNet -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - 
model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'dilated_resnet' - dilated_resnet: - model_id: 101 - output_stride: 16 - stem_type: 'v1' - se_ratio: 0.25 - stochastic_depth_drop_rate: 0.2 - multigrid: [1, 2, 4] - last_stage_repeats: 1 - norm_activation: - activation: 'swish' - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_policy: 'randaug' - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml deleted file mode 100644 index 2600f58fa..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet101_tpu.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# ResNet-101 ImageNet classification. 79.1% top-1 and 94.5% top-5 accuracy. -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'resnet' - resnet: - model_id: 101 - norm_activation: - activation: 'swish' - losses: - l2_weight_decay: 0.0001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 62400 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 62400 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml deleted file mode 100644 index 1c81953e2..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet152_tpu.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# ResNet-152 ImageNet classification. 79.4% top-1 and 94.7% top-5 accuracy. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'resnet' - resnet: - model_id: 152 - norm_activation: - activation: 'swish' - losses: - l2_weight_decay: 0.0001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 62400 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 62400 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_deeplab_tpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_deeplab_tpu.yaml deleted file mode 100644 index 11bdafbc3..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_deeplab_tpu.yaml +++ /dev/null @@ -1,50 +0,0 @@ -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'dilated_resnet' - dilated_resnet: - model_id: 50 - output_stride: 16 - norm_activation: - activation: 'swish' - losses: - l2_weight_decay: 0.0001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 62400 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 62400 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml deleted file mode 100644 index dd6a4dc16..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml +++ /dev/null @@ -1,48 +0,0 @@ -runtime: - distribution_strategy: 'mirrored' - mixed_precision_dtype: 'float16' - loss_scale: 'dynamic' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'resnet' - resnet: - model_id: 50 - losses: - l2_weight_decay: 0.0001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 2048 - dtype: 'float16' - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 2048 - dtype: 'float16' - drop_remainder: false -trainer: - train_steps: 56160 - validation_steps: 25 - validation_interval: 625 - steps_per_loop: 625 - summary_interval: 625 - checkpoint_interval: 625 - optimizer_config: - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - 
learning_rate: - type: 'stepwise' - stepwise: - boundaries: [18750, 37500, 50000] - values: [0.8, 0.08, 0.008, 0.0008] - warmup: - type: 'linear' - linear: - warmup_steps: 3125 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tfds_tpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tfds_tpu.yaml deleted file mode 100644 index 1506b48f9..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tfds_tpu.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# ResNet-50 ImageNet classification. 78.1% top-1 and 93.9% top-5 accuracy. -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'resnet' - resnet: - model_id: 50 - norm_activation: - activation: 'swish' - losses: - l2_weight_decay: 0.0001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: '' - tfds_name: 'imagenet2012' - tfds_split: 'train' - sharding: true - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - validation_data: - input_path: '' - tfds_name: 'imagenet2012' - tfds_split: 'validation' - sharding: true - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 62400 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 62400 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml deleted file mode 100644 index 358cafb6d..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnet50_tpu.yaml +++ /dev/null @@ -1,47 +0,0 @@ -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'resnet' - resnet: - model_id: 50 - losses: - l2_weight_decay: 0.0001 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 28080 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'stepwise' - stepwise: - boundaries: [9360, 18720, 24960] - values: [1.6, 0.16, 0.016, 0.0016] - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml deleted file mode 100644 index 7c9e7b80a..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i160.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-101 ImageNet classification. 80.2% top-1 accuracy. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [160, 160, 3] - backbone: - type: 'resnet' - resnet: - model_id: 101 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.0 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.25 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml deleted file mode 100644 index 576c48625..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs101_i192.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-101 ImageNet classification. 81.3% top-1 accuracy. -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [192, 192, 3] - backbone: - type: 'resnet' - resnet: - model_id: 101 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.0 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.25 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml deleted file mode 100644 index b1c8edc46..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i192.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-152 ImageNet classification. 81.9% top-1 accuracy. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [192, 192, 3] - backbone: - type: 'resnet' - resnet: - model_id: 152 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.0 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.25 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml deleted file mode 100644 index 2ec14bae5..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i224.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-152 ImageNet classification. 82.5% top-1 accuracy. -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [224, 224, 3] - backbone: - type: 'resnet' - resnet: - model_id: 152 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.0 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.25 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml deleted file mode 100644 index 91b53d621..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs152_i256.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-152 ImageNet classification. 83.1% top-1 accuracy. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [256, 256, 3] - backbone: - type: 'resnet' - resnet: - model_id: 152 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.0 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.25 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml deleted file mode 100644 index 9d76c0101..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs200_i256.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-200 ImageNet classification. 83.5% top-1 accuracy. -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [256, 256, 3] - backbone: - type: 'resnet' - resnet: - model_id: 200 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.1 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.25 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml deleted file mode 100644 index b7c6a644e..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs270_i256.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-270 ImageNet classification. 83.6% top-1 accuracy. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [256, 256, 3] - backbone: - type: 'resnet' - resnet: - model_id: 270 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.1 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.25 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml deleted file mode 100644 index 3b2d3fe26..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i256.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-350 ImageNet classification. 83.7% top-1 accuracy. -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [256, 256, 3] - backbone: - type: 'resnet' - resnet: - model_id: 350 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.1 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.25 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i320.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i320.yaml deleted file mode 100644 index 36cdba7bb..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs350_i320.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-350 ImageNet classification. 84.2% top-1 accuracy. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [320, 320, 3] - backbone: - type: 'resnet' - resnet: - model_id: 350 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.1 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.4 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i320.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i320.yaml deleted file mode 100644 index 9b02b7e00..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs420_i320.yaml +++ /dev/null @@ -1,63 +0,0 @@ -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [320, 320, 3] - backbone: - type: 'resnet' - resnet: - model_id: 420 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.1 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.4 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 15 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml b/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml deleted file mode 100644 index a57f41f39..000000000 --- a/official/vision/beta/configs/experiments/image_classification/imagenet_resnetrs50_i160.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# ResNet-RS-50 ImageNet classification. 79.1% top-1 accuracy. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - num_classes: 1001 - input_size: [160, 160, 3] - backbone: - type: 'resnet' - resnet: - model_id: 50 - replace_stem_max_pool: true - resnetd_shortcut: true - se_ratio: 0.25 - stem_type: 'v1' - stochastic_depth_drop_rate: 0.0 - norm_activation: - activation: 'swish' - norm_momentum: 0.0 - use_sync_bn: false - dropout_rate: 0.25 - losses: - l2_weight_decay: 0.00004 - one_hot: true - label_smoothing: 0.1 - train_data: - input_path: 'imagenet-2012-tfrecord/train*' - is_training: true - global_batch_size: 4096 - dtype: 'bfloat16' - aug_type: - type: 'randaug' - randaug: - magnitude: 10 - validation_data: - input_path: 'imagenet-2012-tfrecord/valid*' - is_training: false - global_batch_size: 4096 - dtype: 'bfloat16' - drop_remainder: false -trainer: - train_steps: 109200 - validation_steps: 13 - validation_interval: 312 - steps_per_loop: 312 - summary_interval: 312 - checkpoint_interval: 312 - optimizer_config: - ema: - average_decay: 0.9999 - optimizer: - type: 'sgd' - sgd: - momentum: 0.9 - learning_rate: - type: 'cosine' - cosine: - initial_learning_rate: 1.6 - decay_steps: 109200 - warmup: - type: 'linear' - linear: - warmup_steps: 1560 diff --git a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_cascadercnn_tpu.yaml b/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_cascadercnn_tpu.yaml deleted file mode 100644 index 1f8b245da..000000000 --- a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_cascadercnn_tpu.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# --experiment_type=cascadercnn_spinenet_coco -# Expect to reach: box mAP: 51.9%, mask mAP: 45.0% on COCO -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - init_checkpoint: null - train_data: - global_batch_size: 256 - parser: - aug_rand_hflip: true - aug_scale_min: 0.1 - aug_scale_max: 2.5 - losses: - l2_weight_decay: 0.00004 - model: - anchor: - anchor_size: 4.0 - num_scales: 3 - min_level: 3 - max_level: 7 - input_size: [1280, 1280, 3] - backbone: - spinenet: - stochastic_depth_drop_rate: 0.2 - model_id: '143' - type: 'spinenet' - decoder: - type: 'identity' - detection_head: - cascade_class_ensemble: true - class_agnostic_bbox_pred: true - rpn_head: - num_convs: 2 - num_filters: 256 - roi_sampler: - cascade_iou_thresholds: [0.7] - foreground_iou_threshold: 0.6 - norm_activation: - norm_epsilon: 0.001 - norm_momentum: 0.99 - use_sync_bn: true - activation: 'swish' - detection_generator: - pre_nms_top_k: 1000 -trainer: - train_steps: 231000 - optimizer_config: - learning_rate: - type: 'stepwise' - stepwise: - boundaries: [219450, 226380] - values: [0.32, 0.032, 0.0032] - warmup: - type: 'linear' - linear: - warmup_steps: 2000 - warmup_learning_rate: 0.0067 diff --git a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_mrcnn_tpu.yaml b/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_mrcnn_tpu.yaml deleted file mode 100644 index 4d5ec8ae4..000000000 --- a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet143_mrcnn_tpu.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Expect to reach: box mAP: 49.3%, mask mAP: 43.4% on COCO -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - init_checkpoint: null - train_data: - global_batch_size: 256 - parser: - aug_rand_hflip: true - aug_scale_min: 0.1 - aug_scale_max: 2.0 - losses: - l2_weight_decay: 0.00004 - model: - anchor: - anchor_size: 4.0 - num_scales: 3 - 
min_level: 3 - max_level: 7 - input_size: [1280, 1280, 3] - backbone: - spinenet: - stochastic_depth_drop_rate: 0.2 - model_id: '143' - type: 'spinenet' - decoder: - type: 'identity' - norm_activation: - norm_epsilon: 0.001 - norm_momentum: 0.99 - use_sync_bn: true - detection_generator: - pre_nms_top_k: 1000 -trainer: - train_steps: 231000 - optimizer_config: - learning_rate: - type: 'stepwise' - stepwise: - boundaries: [219450, 226380] - values: [0.32, 0.032, 0.0032] - warmup: - type: 'linear' - linear: - warmup_steps: 2000 - warmup_learning_rate: 0.0067 diff --git a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_cascadercnn_tpu.yaml b/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_cascadercnn_tpu.yaml deleted file mode 100644 index a28dd4bb0..000000000 --- a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_cascadercnn_tpu.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# --experiment_type=cascadercnn_spinenet_coco -# Expect to reach: box mAP: 46.4%, mask mAP: 40.0% on COCO -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - init_checkpoint: null - train_data: - global_batch_size: 256 - parser: - aug_rand_hflip: true - aug_scale_min: 0.1 - aug_scale_max: 2.0 - losses: - l2_weight_decay: 0.00004 - model: - anchor: - anchor_size: 3.0 - num_scales: 3 - min_level: 3 - max_level: 7 - input_size: [640, 640, 3] - backbone: - spinenet: - stochastic_depth_drop_rate: 0.2 - model_id: '49' - type: 'spinenet' - decoder: - type: 'identity' - detection_head: - cascade_class_ensemble: true - class_agnostic_bbox_pred: true - rpn_head: - num_convs: 2 - num_filters: 256 - roi_sampler: - cascade_iou_thresholds: [0.7] - foreground_iou_threshold: 0.6 - norm_activation: - norm_epsilon: 0.001 - norm_momentum: 0.99 - use_sync_bn: true - activation: 'swish' - detection_generator: - pre_nms_top_k: 1000 -trainer: - train_steps: 231000 - optimizer_config: - learning_rate: - type: 'stepwise' - stepwise: - boundaries: [219450, 226380] - values: [0.32, 0.032, 0.0032] - warmup: - type: 'linear' - linear: - warmup_steps: 2000 - warmup_learning_rate: 0.0067 diff --git a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_mrcnn_tpu.yaml b/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_mrcnn_tpu.yaml deleted file mode 100644 index 4ac1ae428..000000000 --- a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet49_mrcnn_tpu.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Expect to reach: box mAP: 43.2%, mask mAP: 38.3% on COCO -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - init_checkpoint: null - train_data: - global_batch_size: 256 - parser: - aug_rand_hflip: true - aug_scale_min: 0.1 - aug_scale_max: 2.0 - losses: - l2_weight_decay: 0.00004 - model: - anchor: - anchor_size: 3.0 - num_scales: 3 - min_level: 3 - max_level: 7 - input_size: [640, 640, 3] - backbone: - spinenet: - stochastic_depth_drop_rate: 0.2 - model_id: '49' - type: 'spinenet' - decoder: - type: 'identity' - norm_activation: - norm_epsilon: 0.001 - norm_momentum: 0.99 - use_sync_bn: true - detection_generator: - pre_nms_top_k: 1000 -trainer: - train_steps: 231000 - optimizer_config: - learning_rate: - type: 'stepwise' - stepwise: - boundaries: [219450, 226380] - values: [0.32, 0.032, 0.0032] - warmup: - type: 'linear' - linear: - warmup_steps: 2000 - warmup_learning_rate: 0.0067 diff --git a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_cascadercnn_tpu.yaml 
b/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_cascadercnn_tpu.yaml deleted file mode 100644 index b8dd2fb9d..000000000 --- a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_cascadercnn_tpu.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# --experiment_type=cascadercnn_spinenet_coco -# Expect to reach: box mAP: 51.9%, mask mAP: 45.0% on COCO -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - init_checkpoint: null - train_data: - global_batch_size: 256 - parser: - aug_rand_hflip: true - aug_scale_min: 0.1 - aug_scale_max: 2.5 - losses: - l2_weight_decay: 0.00004 - model: - anchor: - anchor_size: 4.0 - num_scales: 3 - min_level: 3 - max_level: 7 - input_size: [1024, 1024, 3] - backbone: - spinenet: - stochastic_depth_drop_rate: 0.2 - model_id: '96' - type: 'spinenet' - decoder: - type: 'identity' - detection_head: - cascade_class_ensemble: true - class_agnostic_bbox_pred: true - rpn_head: - num_convs: 2 - num_filters: 256 - roi_sampler: - cascade_iou_thresholds: [0.7] - foreground_iou_threshold: 0.6 - norm_activation: - norm_epsilon: 0.001 - norm_momentum: 0.99 - use_sync_bn: true - activation: 'swish' - detection_generator: - pre_nms_top_k: 1000 -trainer: - train_steps: 231000 - optimizer_config: - learning_rate: - type: 'stepwise' - stepwise: - boundaries: [219450, 226380] - values: [0.32, 0.032, 0.0032] - warmup: - type: 'linear' - linear: - warmup_steps: 2000 - warmup_learning_rate: 0.0067 diff --git a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_mrcnn_tpu.yaml b/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_mrcnn_tpu.yaml deleted file mode 100644 index 9609b7eee..000000000 --- a/official/vision/beta/configs/experiments/maskrcnn/coco_spinenet96_mrcnn_tpu.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Expect to reach: box mAP: 48.1%, mask mAP: 42.4% on COCO -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - init_checkpoint: null - train_data: - global_batch_size: 256 - parser: - aug_rand_hflip: true - aug_scale_min: 0.1 - aug_scale_max: 2.0 - losses: - l2_weight_decay: 0.00004 - model: - anchor: - anchor_size: 3.0 - num_scales: 3 - min_level: 3 - max_level: 7 - input_size: [1024, 1024, 3] - backbone: - spinenet: - stochastic_depth_drop_rate: 0.2 - model_id: '96' - type: 'spinenet' - decoder: - type: 'identity' - norm_activation: - norm_epsilon: 0.001 - norm_momentum: 0.99 - use_sync_bn: true - detection_generator: - pre_nms_top_k: 1000 -trainer: - train_steps: 231000 - optimizer_config: - learning_rate: - type: 'stepwise' - stepwise: - boundaries: [219450, 226380] - values: [0.32, 0.032, 0.0032] - warmup: - type: 'linear' - linear: - warmup_steps: 2000 - warmup_learning_rate: 0.0067 diff --git a/official/vision/beta/configs/experiments/maskrcnn/r50fpn_640_coco_scratch_tpu4x4.yaml b/official/vision/beta/configs/experiments/maskrcnn/r50fpn_640_coco_scratch_tpu4x4.yaml deleted file mode 100644 index 218f04510..000000000 --- a/official/vision/beta/configs/experiments/maskrcnn/r50fpn_640_coco_scratch_tpu4x4.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# Expect to reach: box mAP: 42.3%, mask mAP: 37.6% on COCO -task: - init_checkpoint: null - train_data: - global_batch_size: 256 - parser: - aug_rand_hflip: true - aug_scale_min: 0.5 - aug_scale_max: 2.0 - losses: - l2_weight_decay: 0.00008 - model: - anchor: - anchor_size: 3.0 - min_level: 3 - max_level: 7 - input_size: [640, 640, 3] - norm_activation: - norm_epsilon: 0.001 - norm_momentum: 0.99 - use_sync_bn: true - 
    detection_generator:
-      pre_nms_top_k: 1000
-trainer:
-  train_steps: 162050
-  optimizer_config:
-    learning_rate:
-      type: 'stepwise'
-      stepwise:
-        boundaries: [148160, 157420]
-        values: [0.32, 0.032, 0.0032]
-    warmup:
-      type: 'linear'
-      linear:
-        warmup_steps: 2000
-        warmup_learning_rate: 0.0067
diff --git a/official/vision/beta/configs/experiments/retinanet/coco_mobiledetcpu_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/coco_mobiledetcpu_tpu.yaml
deleted file mode 100644
index 80608cc1f..000000000
--- a/official/vision/beta/configs/experiments/retinanet/coco_mobiledetcpu_tpu.yaml
+++ /dev/null
@@ -1,62 +0,0 @@
-# --experiment_type=retinanet_mobile_coco
-# COCO AP 27.0%
-runtime:
-  distribution_strategy: 'tpu'
-  mixed_precision_dtype: 'bfloat16'
-task:
-  losses:
-    l2_weight_decay: 3.0e-05
-  model:
-    anchor:
-      anchor_size: 3
-      aspect_ratios: [0.5, 1.0, 2.0]
-      num_scales: 3
-    backbone:
-      mobilenet:
-        model_id: 'MobileDetCPU'
-        filter_size_scale: 1.0
-      type: 'mobiledet'
-    decoder:
-      type: 'fpn'
-      fpn:
-        num_filters: 128
-        use_separable_conv: true
-    head:
-      num_convs: 4
-      num_filters: 128
-      use_separable_conv: true
-    input_size: [320, 320, 3]
-    max_level: 6
-    min_level: 3
-    norm_activation:
-      activation: 'relu6'
-      norm_epsilon: 0.001
-      norm_momentum: 0.99
-      use_sync_bn: true
-  train_data:
-    dtype: 'bfloat16'
-    global_batch_size: 256
-    is_training: true
-    parser:
-      aug_rand_hflip: true
-      aug_scale_max: 2.0
-      aug_scale_min: 0.5
-  validation_data:
-    dtype: 'bfloat16'
-    global_batch_size: 8
-    is_training: false
-trainer:
-  optimizer_config:
-    learning_rate:
-      stepwise:
-        boundaries: [263340, 272580]
-        values: [0.32, 0.032, 0.0032]
-      type: 'stepwise'
-    warmup:
-      linear:
-        warmup_learning_rate: 0.0067
-        warmup_steps: 2000
-  steps_per_loop: 462
-  train_steps: 277200
-  validation_interval: 462
-  validation_steps: 625
diff --git a/official/vision/beta/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml
deleted file mode 100644
index 9e27bfe8c..000000000
--- a/official/vision/beta/configs/experiments/retinanet/coco_mobilenetv2_tpu.yaml
+++ /dev/null
@@ -1,62 +0,0 @@
-# --experiment_type=retinanet_mobile_coco
-# COCO AP 23.5%
-runtime:
-  distribution_strategy: 'tpu'
-  mixed_precision_dtype: 'bfloat16'
-task:
-  losses:
-    l2_weight_decay: 3.0e-05
-  model:
-    anchor:
-      anchor_size: 3
-      aspect_ratios: [0.5, 1.0, 2.0]
-      num_scales: 3
-    backbone:
-      mobilenet:
-        model_id: 'MobileNetV2'
-        filter_size_scale: 1.0
-      type: 'mobilenet'
-    decoder:
-      type: 'fpn'
-      fpn:
-        num_filters: 128
-        use_separable_conv: true
-    head:
-      num_convs: 4
-      num_filters: 128
-      use_separable_conv: true
-    input_size: [256, 256, 3]
-    max_level: 7
-    min_level: 3
-    norm_activation:
-      activation: 'relu6'
-      norm_epsilon: 0.001
-      norm_momentum: 0.99
-      use_sync_bn: true
-  train_data:
-    dtype: 'bfloat16'
-    global_batch_size: 256
-    is_training: true
-    parser:
-      aug_rand_hflip: true
-      aug_scale_max: 2.0
-      aug_scale_min: 0.5
-  validation_data:
-    dtype: 'bfloat16'
-    global_batch_size: 8
-    is_training: false
-trainer:
-  optimizer_config:
-    learning_rate:
-      stepwise:
-        boundaries: [263340, 272580]
-        values: [0.32, 0.032, 0.0032]
-      type: 'stepwise'
-    warmup:
-      linear:
-        warmup_learning_rate: 0.0067
-        warmup_steps: 2000
-  steps_per_loop: 462
-  train_steps: 277200
-  validation_interval: 462
-  validation_steps: 625
diff --git a/official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml
deleted file mode 100644
index 438fe031a..000000000
--- a/official/vision/beta/configs/experiments/retinanet/coco_spinenet143_tpu.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-# SpineNet-143 COCO detection with protocol C config. Expecting 50.0% AP.
-runtime:
-  distribution_strategy: 'tpu'
-  mixed_precision_dtype: 'bfloat16'
-task:
-  losses:
-    l2_weight_decay: 4.0e-05
-  model:
-    anchor:
-      anchor_size: 4
-      aspect_ratios: [0.5, 1.0, 2.0]
-      num_scales: 3
-    backbone:
-      spinenet:
-        stochastic_depth_drop_rate: 0.2
-        model_id: '143'
-      type: 'spinenet'
-    decoder:
-      type: 'identity'
-    head:
-      num_convs: 4
-      num_filters: 256
-    input_size: [1280, 1280, 3]
-    max_level: 7
-    min_level: 3
-    norm_activation:
-      activation: 'swish'
-      norm_epsilon: 0.001
-      norm_momentum: 0.99
-      use_sync_bn: true
-  train_data:
-    dtype: 'bfloat16'
-    global_batch_size: 256
-    is_training: true
-    parser:
-      aug_rand_hflip: true
-      aug_scale_max: 2.0
-      aug_scale_min: 0.1
-  validation_data:
-    dtype: 'bfloat16'
-    global_batch_size: 8
-    is_training: false
-trainer:
-  checkpoint_interval: 462
-  optimizer_config:
-    learning_rate:
-      stepwise:
-        boundaries: [219450, 226380]
-        values: [0.32, 0.032, 0.0032]
-      type: 'stepwise'
-    warmup:
-      linear:
-        warmup_learning_rate: 0.0067
-        warmup_steps: 2000
-  steps_per_loop: 462
-  train_steps: 231000
-  validation_interval: 462
-  validation_steps: 625
diff --git a/official/vision/beta/configs/experiments/retinanet/coco_spinenet190_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/coco_spinenet190_tpu.yaml
deleted file mode 100644
index bc0ea1f94..000000000
--- a/official/vision/beta/configs/experiments/retinanet/coco_spinenet190_tpu.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-runtime:
-  distribution_strategy: 'tpu'
-  mixed_precision_dtype: 'bfloat16'
-task:
-  losses:
-    l2_weight_decay: 4.0e-05
-  model:
-    anchor:
-      anchor_size: 4
-      aspect_ratios: [0.5, 1.0, 2.0]
-      num_scales: 3
-    backbone:
-      spinenet:
-        stochastic_depth_drop_rate: 0.2
-        model_id: '190'
-      type: 'spinenet'
-    decoder:
-      type: 'identity'
-    head:
-      num_convs: 7
-      num_filters: 512
-    input_size: [1280, 1280, 3]
-    max_level: 7
-    min_level: 3
-    norm_activation:
-      activation: 'swish'
-      norm_epsilon: 0.001
-      norm_momentum: 0.99
-      use_sync_bn: true
-  train_data:
-    dtype: 'bfloat16'
-    global_batch_size: 256
-    is_training: true
-    parser:
-      aug_rand_hflip: true
-      aug_scale_max: 2.0
-      aug_scale_min: 0.1
-  validation_data:
-    dtype: 'bfloat16'
-    global_batch_size: 8
-    is_training: false
-trainer:
-  checkpoint_interval: 462
-  optimizer_config:
-    learning_rate:
-      stepwise:
-        boundaries: [219450, 226380]
-        values: [0.32, 0.032, 0.0032]
-      type: 'stepwise'
-    warmup:
-      linear:
-        warmup_learning_rate: 0.0067
-        warmup_steps: 2000
-  steps_per_loop: 462
-  train_steps: 231000
-  validation_interval: 462
-  validation_steps: 625
diff --git a/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml
deleted file mode 100644
index e1e14b321..000000000
--- a/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_mobile_tpu.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-# --experiment_type=retinanet_mobile_coco
-runtime:
-  distribution_strategy: 'tpu'
-  mixed_precision_dtype: 'bfloat16'
-task:
-  losses:
-    l2_weight_decay: 3.0e-05
-  model:
-    anchor:
-      anchor_size: 3
-      aspect_ratios: [0.5, 1.0, 2.0]
-      num_scales: 3
-    backbone:
-      spinenet_mobile:
-        stochastic_depth_drop_rate: 0.2
-        model_id: '49'
-        se_ratio: 0.2
-      type: 'spinenet_mobile'
-    decoder:
-      type: 'identity'
-    head:
-      num_convs: 4
-      num_filters: 48
-      use_separable_conv: true
-    input_size: [384, 384, 3]
-    max_level: 7
-    min_level: 3
-    norm_activation:
-      activation: 'swish'
-      norm_epsilon: 0.001
-      norm_momentum: 0.99
-      use_sync_bn: true
-  train_data:
-    dtype: 'bfloat16'
-    global_batch_size: 256
-    is_training: true
-    parser:
-      aug_rand_hflip: true
-      aug_scale_max: 2.0
-      aug_scale_min: 0.5
-  validation_data:
-    dtype: 'bfloat16'
-    global_batch_size: 8
-    is_training: false
-trainer:
-  checkpoint_interval: 462
-  optimizer_config:
-    learning_rate:
-      stepwise:
-        boundaries: [263340, 272580]
-        values: [0.32, 0.032, 0.0032]
-      type: 'stepwise'
-    warmup:
-      linear:
-        warmup_learning_rate: 0.0067
-        warmup_steps: 2000
-  steps_per_loop: 462
-  train_steps: 277200
-  validation_interval: 462
-  validation_steps: 625
diff --git a/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml
deleted file mode 100644
index 725e1fc9b..000000000
--- a/official/vision/beta/configs/experiments/retinanet/coco_spinenet49_tpu.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-# SpineNet-49 COCO detection with protocol C config. Expecting 44.2% AP.
-runtime:
-  distribution_strategy: 'tpu'
-  mixed_precision_dtype: 'bfloat16'
-task:
-  losses:
-    l2_weight_decay: 4.0e-05
-  model:
-    anchor:
-      anchor_size: 3
-      aspect_ratios: [0.5, 1.0, 2.0]
-      num_scales: 3
-    backbone:
-      spinenet:
-        stochastic_depth_drop_rate: 0.2
-        model_id: '49'
-      type: 'spinenet'
-    decoder:
-      type: 'identity'
-    head:
-      num_convs: 4
-      num_filters: 256
-    input_size: [640, 640, 3]
-    max_level: 7
-    min_level: 3
-    norm_activation:
-      activation: 'swish'
-      norm_epsilon: 0.001
-      norm_momentum: 0.99
-      use_sync_bn: true
-  train_data:
-    dtype: 'bfloat16'
-    global_batch_size: 256
-    is_training: true
-    parser:
-      aug_rand_hflip: true
-      aug_scale_max: 2.0
-      aug_scale_min: 0.1
-  validation_data:
-    dtype: 'bfloat16'
-    global_batch_size: 8
-    is_training: false
-trainer:
-  checkpoint_interval: 462
-  optimizer_config:
-    learning_rate:
-      stepwise:
-        boundaries: [219450, 226380]
-        values: [0.32, 0.032, 0.0032]
-      type: 'stepwise'
-    warmup:
-      linear:
-        warmup_learning_rate: 0.0067
-        warmup_steps: 2000
-  steps_per_loop: 462
-  train_steps: 231000
-  validation_interval: 462
-  validation_steps: 625
diff --git a/official/vision/beta/configs/experiments/retinanet/coco_spinenet49s_mobile_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/coco_spinenet49s_mobile_tpu.yaml
deleted file mode 100644
index 9f854ccf4..000000000
--- a/official/vision/beta/configs/experiments/retinanet/coco_spinenet49s_mobile_tpu.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-# --experiment_type=retinanet_mobile_coco
-runtime:
-  distribution_strategy: 'tpu'
-  mixed_precision_dtype: 'bfloat16'
-task:
-  losses:
-    l2_weight_decay: 3.0e-05
-  model:
-    anchor:
-      anchor_size: 3
-      aspect_ratios: [0.5, 1.0, 2.0]
-      num_scales: 3
-    backbone:
-      spinenet_mobile:
-        stochastic_depth_drop_rate: 0.2
-        model_id: '49S'
-        se_ratio: 0.2
-      type: 'spinenet_mobile'
-    decoder:
-      type: 'identity'
-    head:
-      num_convs: 4
-      num_filters: 40
-      use_separable_conv: true
-    input_size: [384, 384, 3]
-    max_level: 7
-    min_level: 3
-    norm_activation:
-      activation: 'swish'
-      norm_epsilon: 0.001
-      norm_momentum: 0.99
-      use_sync_bn: true
-  train_data:
-    dtype: 'bfloat16'
-    global_batch_size: 256
-    is_training: true
-    parser:
-      aug_rand_hflip: true
-      aug_scale_max: 2.0
-      aug_scale_min: 0.5
-
validation_data: - dtype: 'bfloat16' - global_batch_size: 8 - is_training: false -trainer: - checkpoint_interval: 462 - optimizer_config: - learning_rate: - stepwise: - boundaries: [263340, 272580] - values: [0.32, 0.032, 0.0032] - type: 'stepwise' - warmup: - linear: - warmup_learning_rate: 0.0067 - warmup_steps: 2000 - steps_per_loop: 462 - train_steps: 277200 - validation_interval: 462 - validation_steps: 625 diff --git a/official/vision/beta/configs/experiments/retinanet/coco_spinenet49xs_mobile_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/coco_spinenet49xs_mobile_tpu.yaml deleted file mode 100644 index 926bd6209..000000000 --- a/official/vision/beta/configs/experiments/retinanet/coco_spinenet49xs_mobile_tpu.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# --experiment_type=retinanet_mobile_coco -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - losses: - l2_weight_decay: 3.0e-05 - model: - anchor: - anchor_size: 3 - aspect_ratios: [0.5, 1.0, 2.0] - num_scales: 3 - backbone: - spinenet_mobile: - stochastic_depth_drop_rate: 0.2 - model_id: '49XS' - se_ratio: 0.2 - type: 'spinenet_mobile' - decoder: - type: 'identity' - head: - num_convs: 4 - num_filters: 24 - use_separable_conv: true - input_size: [256, 256, 3] - max_level: 7 - min_level: 3 - norm_activation: - activation: 'swish' - norm_epsilon: 0.001 - norm_momentum: 0.99 - use_sync_bn: true - train_data: - dtype: 'bfloat16' - global_batch_size: 256 - is_training: true - parser: - aug_rand_hflip: true - aug_scale_max: 2.0 - aug_scale_min: 0.5 - validation_data: - dtype: 'bfloat16' - global_batch_size: 8 - is_training: false -trainer: - checkpoint_interval: 462 - optimizer_config: - learning_rate: - stepwise: - boundaries: [263340, 272580] - values: [0.32, 0.032, 0.0032] - type: 'stepwise' - warmup: - linear: - warmup_learning_rate: 0.0067 - warmup_steps: 2000 - steps_per_loop: 462 - train_steps: 277200 - validation_interval: 462 - validation_steps: 625 diff --git a/official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml deleted file mode 100644 index c75d66775..000000000 --- a/official/vision/beta/configs/experiments/retinanet/coco_spinenet96_tpu.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# SpineNet-96 COCO detection with protocol C config. Expecting 48.5% AP. 
-runtime:
-  distribution_strategy: 'tpu'
-  mixed_precision_dtype: 'bfloat16'
-task:
-  losses:
-    l2_weight_decay: 4.0e-05
-  model:
-    anchor:
-      anchor_size: 3
-      aspect_ratios: [0.5, 1.0, 2.0]
-      num_scales: 3
-    backbone:
-      spinenet:
-        stochastic_depth_drop_rate: 0.2
-        model_id: '96'
-      type: 'spinenet'
-    decoder:
-      type: 'identity'
-    head:
-      num_convs: 4
-      num_filters: 256
-    input_size: [1024, 1024, 3]
-    max_level: 7
-    min_level: 3
-    norm_activation:
-      activation: 'swish'
-      norm_epsilon: 0.001
-      norm_momentum: 0.99
-      use_sync_bn: true
-  train_data:
-    dtype: 'bfloat16'
-    global_batch_size: 256
-    is_training: true
-    parser:
-      aug_rand_hflip: true
-      aug_scale_max: 2.0
-      aug_scale_min: 0.1
-  validation_data:
-    dtype: 'bfloat16'
-    global_batch_size: 8
-    is_training: false
-trainer:
-  checkpoint_interval: 462
-  optimizer_config:
-    learning_rate:
-      stepwise:
-        boundaries: [219450, 226380]
-        values: [0.32, 0.032, 0.0032]
-      type: 'stepwise'
-    warmup:
-      linear:
-        warmup_learning_rate: 0.0067
-        warmup_steps: 2000
-  steps_per_loop: 462
-  train_steps: 231000
-  validation_interval: 462
-  validation_steps: 625
diff --git a/official/vision/beta/configs/experiments/retinanet/resnet50fpn_coco_tfds_tpu.yaml b/official/vision/beta/configs/experiments/retinanet/resnet50fpn_coco_tfds_tpu.yaml
deleted file mode 100644
index 0f9a30a34..000000000
--- a/official/vision/beta/configs/experiments/retinanet/resnet50fpn_coco_tfds_tpu.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-runtime:
-  distribution_strategy: 'tpu'
-  mixed_precision_dtype: 'bfloat16'
-task:
-  annotation_file: ''  # Can't use annotation file when tfds is used.
-  losses:
-    l2_weight_decay: 0.0001
-  model:
-    num_classes: 91
-    max_level: 7
-    min_level: 3
-    input_size: [640, 640, 3]
-    norm_activation:
-      activation: relu
-      norm_epsilon: 0.001
-      norm_momentum: 0.99
-      use_sync_bn: true
-  train_data:
-    tfds_name: 'coco/2017'
-    tfds_split: 'train'
-    drop_remainder: true
-    dtype: bfloat16
-    global_batch_size: 256
-    input_path: ''
-    is_training: true
-    shuffle_buffer_size: 1000
-  validation_data:
-    tfds_name: 'coco/2017'
-    tfds_split: 'validation'
-    drop_remainder: true
-    dtype: bfloat16
-    global_batch_size: 8
-    input_path: ''
-    is_training: false
diff --git a/official/vision/beta/configs/experiments/retinanet/resnet50fpn_coco_tpu4x4_benchmark.yaml b/official/vision/beta/configs/experiments/retinanet/resnet50fpn_coco_tpu4x4_benchmark.yaml
deleted file mode 100644
index 46b1f3cad..000000000
--- a/official/vision/beta/configs/experiments/retinanet/resnet50fpn_coco_tpu4x4_benchmark.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-# Benchmark runs on the same instance; change eval batch size to fit on 4x4 TPU.
-task:
-  validation_data:
-    global_batch_size: 32
-trainer:
-  validation_interval: 1560
-  validation_steps: 156
diff --git a/official/vision/beta/configs/experiments/semantic_segmentation/deeplabv3plus_resnet101_cityscapes_tfds_tpu.yaml b/official/vision/beta/configs/experiments/semantic_segmentation/deeplabv3plus_resnet101_cityscapes_tfds_tpu.yaml
deleted file mode 100644
index 4ffc7689d..000000000
--- a/official/vision/beta/configs/experiments/semantic_segmentation/deeplabv3plus_resnet101_cityscapes_tfds_tpu.yaml
+++ /dev/null
@@ -1,78 +0,0 @@
-# Use your own cityscapes preprocessed dataset. 79% meanIoU.
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'float32' -task: - model: - num_classes: 19 - input_size: [null, null, 3] - backbone: - type: 'dilated_resnet' - dilated_resnet: - model_id: 101 - output_stride: 16 - stem_type: 'v1' - se_ratio: 0.25 - stochastic_depth_drop_rate: 0.2 - multigrid: [1, 2, 4] - last_stage_repeats: 1 - decoder: - aspp: - pool_kernel_size: [512, 1024] - head: - feature_fusion: 'deeplabv3plus' - low_level: 2 - low_level_num_filters: 48 - norm_activation: - activation: 'swish' - norm_epsilon: 0.001 - norm_momentum: 0.99 - use_sync_bn: true - losses: - top_k_percent_pixels: 1.0 # only backpropagate loss for the topk 100% pixels. - train_data: - output_size: [1024, 2048] - crop_size: [512, 1024] - input_path: '' - tfds_name: 'cityscapes/semantic_segmentation' - tfds_split: 'train' - is_training: true - global_batch_size: 16 - dtype: 'float32' - aug_rand_hflip: true - aug_scale_max: 2.0 - aug_scale_min: 0.5 - validation_data: - output_size: [1024, 2048] - input_path: '' - tfds_name: 'cityscapes/semantic_segmentation' - tfds_split: 'validation' - is_training: false - global_batch_size: 16 - dtype: 'float32' - drop_remainder: false - resize_eval_groundtruth: true -trainer: - optimizer_config: - learning_rate: - polynomial: - decay_steps: 90000 - initial_learning_rate: 0.01 - power: 0.9 - type: polynomial - optimizer: - sgd: - momentum: 0.9 - type: sgd - warmup: - linear: - name: linear - warmup_learning_rate: 0 - warmup_steps: 925 - type: linear - steps_per_loop: 185 - summary_interval: 185 - train_steps: 90000 - validation_interval: 185 - validation_steps: 31 - checkpoint_interval: 185 diff --git a/official/vision/beta/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml b/official/vision/beta/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml deleted file mode 100644 index d9158c2bd..000000000 --- a/official/vision/beta/configs/experiments/video_classification/k400_3d-resnet50_tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# 3D ResNet-50 video classification on Kinetics-400. -# -# --experiment_type=video_classification_kinetics400 -# Expected accuracy: 77.0% top-1, 93.0% top-5. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - dropout_rate: 0.5 - norm_activation: - use_sync_bn: false - backbone: - resnet_3d: - block_specs: !!python/tuple - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - model_id: 50 - stem_conv_temporal_kernel_size: 5 - stem_conv_temporal_stride: 2 - stem_pool_temporal_stride: 1 - train_data: - name: kinetics400 - feature_shape: !!python/tuple - - 32 - - 224 - - 224 - - 3 - temporal_stride: 2 - global_batch_size: 1024 - dtype: 'bfloat16' - shuffle_buffer_size: 1024 - aug_max_area_ratio: 1.0 - aug_max_aspect_ratio: 2.0 - aug_min_area_ratio: 0.08 - aug_min_aspect_ratio: 0.5 - validation_data: - name: kinetics400 - feature_shape: !!python/tuple - - 32 - - 256 - - 256 - - 3 - temporal_stride: 2 - num_test_clips: 10 - num_test_crops: 3 - global_batch_size: 64 - dtype: 'bfloat16' - drop_remainder: false -trainer: - optimizer_config: - learning_rate: - cosine: - initial_learning_rate: 0.8 - decay_steps: 42104 - warmup: - linear: - warmup_steps: 1053 - train_steps: 42104 - steps_per_loop: 500 - summary_interval: 500 - validation_interval: 500 diff --git a/official/vision/beta/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml b/official/vision/beta/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml deleted file mode 100644 index 83875d127..000000000 --- a/official/vision/beta/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# 3D ResNet-RS-50 video classification on Kinetics-400. -# -# --experiment_type=video_classification_kinetics400 -# Expected accuracy: 78.2% top-1 accuracy. 
-runtime: - mixed_precision_dtype: bfloat16 -task: - losses: - l2_weight_decay: 0.00004 - label_smoothing: 0.1 - one_hot: true - model: - aggregate_endpoints: false - backbone: - resnet_3d_rs: - model_id: 50 - stem_type: 'v1' - stem_conv_temporal_kernel_size: 5 - stem_conv_temporal_stride: 2 - stem_pool_temporal_stride: 1 - stochastic_depth_drop_rate: 0.1 - se_ratio: 0.25 - type: resnet_3d_rs - dropout_rate: 0.5 - model_type: video_classification - norm_activation: - activation: relu - norm_epsilon: 1.0e-05 - norm_momentum: 0.0 - use_sync_bn: false - train_data: - data_format: channels_last - drop_remainder: true - dtype: bfloat16 - feature_shape: !!python/tuple - - 32 - - 224 - - 224 - - 3 - file_type: sstable - global_batch_size: 1024 - is_training: true - min_image_size: 256 - name: kinetics400 - num_channels: 3 - num_classes: 400 - num_examples: 215570 - num_test_clips: 1 - num_test_crops: 1 - one_hot: true - temporal_stride: 2 - aug_max_area_ratio: 1.0 - aug_max_aspect_ratio: 2.0 - aug_min_area_ratio: 0.08 - aug_min_aspect_ratio: 0.5 - validation_data: - data_format: channels_last - drop_remainder: false - dtype: bfloat16 - feature_shape: !!python/tuple - - 32 - - 256 - - 256 - - 3 - file_type: sstable - global_batch_size: 64 - is_training: false - min_image_size: 256 - name: kinetics400 - num_channels: 3 - num_classes: 400 - num_examples: 17706 - num_test_clips: 10 - num_test_crops: 3 - one_hot: true - temporal_stride: 2 -trainer: - checkpoint_interval: 210 - max_to_keep: 3 - optimizer_config: - ema: - average_decay: 0.9999 - trainable_weights_only: false - learning_rate: - cosine: - decay_steps: 73682 - initial_learning_rate: 0.8 - name: CosineDecay - type: cosine - warmup: - linear: - name: linear - warmup_learning_rate: 0 - warmup_steps: 1050 - type: linear - train_steps: 73682 - steps_per_loop: 500 - summary_interval: 500 - validation_interval: 500 diff --git a/official/vision/beta/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml b/official/vision/beta/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml deleted file mode 100644 index 8e6793a37..000000000 --- a/official/vision/beta/configs/experiments/video_classification/k400_slowonly16x4_tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# SlowOnly 16x4 video classification on Kinetics-400. -# -# --experiment_type=video_classification_kinetics400 -# Expected accuracy: 75.6% top-1, 92.1% top-5. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - dropout_rate: 0.5 - norm_activation: - use_sync_bn: false - backbone: - resnet_3d: - block_specs: !!python/tuple - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - model_id: 50 - stem_conv_temporal_kernel_size: 1 - stem_conv_temporal_stride: 1 - stem_pool_temporal_stride: 1 - train_data: - name: kinetics400 - feature_shape: !!python/tuple - - 16 - - 224 - - 224 - - 3 - temporal_stride: 4 - global_batch_size: 1024 - dtype: 'bfloat16' - shuffle_buffer_size: 1024 - aug_max_area_ratio: 1.0 - aug_max_aspect_ratio: 2.0 - aug_min_area_ratio: 0.08 - aug_min_aspect_ratio: 0.5 - validation_data: - name: kinetics400 - feature_shape: !!python/tuple - - 16 - - 256 - - 256 - - 3 - temporal_stride: 4 - num_test_clips: 10 - num_test_crops: 3 - global_batch_size: 64 - dtype: 'bfloat16' - drop_remainder: false -trainer: - optimizer_config: - learning_rate: - cosine: - initial_learning_rate: 0.8 - decay_steps: 42104 - warmup: - linear: - warmup_steps: 1053 - train_steps: 42104 - steps_per_loop: 500 - summary_interval: 500 - validation_interval: 500 diff --git a/official/vision/beta/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml b/official/vision/beta/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml deleted file mode 100644 index c0bcd881e..000000000 --- a/official/vision/beta/configs/experiments/video_classification/k400_slowonly8x8_tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# SlowOnly 8x8 video classification on Kinetics-400. -# -# --experiment_type=video_classification_kinetics400 -# Expected accuracy: 74.1% top-1, 91.4% top-5. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - dropout_rate: 0.5 - norm_activation: - use_sync_bn: false - backbone: - resnet_3d: - block_specs: !!python/tuple - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - model_id: 50 - stem_conv_temporal_kernel_size: 1 - stem_conv_temporal_stride: 1 - stem_pool_temporal_stride: 1 - train_data: - name: kinetics400 - feature_shape: !!python/tuple - - 8 - - 224 - - 224 - - 3 - temporal_stride: 8 - global_batch_size: 1024 - dtype: 'bfloat16' - shuffle_buffer_size: 1024 - aug_max_area_ratio: 1.0 - aug_max_aspect_ratio: 2.0 - aug_min_area_ratio: 0.08 - aug_min_aspect_ratio: 0.5 - validation_data: - name: kinetics400 - feature_shape: !!python/tuple - - 8 - - 256 - - 256 - - 3 - temporal_stride: 8 - num_test_clips: 10 - num_test_crops: 3 - global_batch_size: 64 - dtype: 'bfloat16' - drop_remainder: false -trainer: - optimizer_config: - learning_rate: - cosine: - initial_learning_rate: 0.8 - decay_steps: 42104 - warmup: - linear: - warmup_steps: 1053 - train_steps: 42104 - steps_per_loop: 500 - summary_interval: 500 - validation_interval: 500 diff --git a/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml b/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml deleted file mode 100644 index ceb38608d..000000000 --- a/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50_tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# 3D ResNet-50 video classification on Kinetics-600. -# -# --experiment_type=video_classification_kinetics600 -# Expected accuracy: 79.5% top-1, 94.8% top-5. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - dropout_rate: 0.5 - norm_activation: - use_sync_bn: false - backbone: - resnet_3d: - block_specs: !!python/tuple - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - model_id: 50 - stem_conv_temporal_kernel_size: 5 - stem_conv_temporal_stride: 2 - stem_pool_temporal_stride: 1 - train_data: - name: kinetics600 - feature_shape: !!python/tuple - - 32 - - 224 - - 224 - - 3 - temporal_stride: 2 - global_batch_size: 1024 - dtype: 'bfloat16' - shuffle_buffer_size: 1024 - aug_max_area_ratio: 1.0 - aug_max_aspect_ratio: 2.0 - aug_min_area_ratio: 0.08 - aug_min_aspect_ratio: 0.5 - validation_data: - name: kinetics600 - feature_shape: !!python/tuple - - 32 - - 256 - - 256 - - 3 - temporal_stride: 2 - num_test_clips: 10 - num_test_crops: 3 - global_batch_size: 64 - dtype: 'bfloat16' - drop_remainder: false -trainer: - optimizer_config: - learning_rate: - cosine: - initial_learning_rate: 0.8 - decay_steps: 71488 - warmup: - linear: - warmup_steps: 1787 - train_steps: 71488 - steps_per_loop: 500 - summary_interval: 500 - validation_interval: 500 diff --git a/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50g_tpu.yaml b/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50g_tpu.yaml deleted file mode 100644 index 3ae54c415..000000000 --- a/official/vision/beta/configs/experiments/video_classification/k600_3d-resnet50g_tpu.yaml +++ /dev/null @@ -1,112 +0,0 @@ -# 3D ResNet-50g video classification on Kinetics-600. -# -# --experiment_type=video_classification_kinetics600 -# Expected accuracy: 78.7% accuracy, 93.6% top-5. 
-# Train on TPU: v3-128, eval on TPU: v3-32 -runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - init_checkpoint: null - init_checkpoint_modules: all - losses: - l2_weight_decay: 0.0001 - label_smoothing: 0.0 - model: - aggregate_endpoints: false - backbone: - resnet_3d: - block_specs: !!python/tuple - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: true - - temporal_kernel_sizes: !!python/tuple - - 3 - - 1 - - 3 - - 1 - temporal_strides: 1 - use_self_gating: true - - temporal_kernel_sizes: !!python/tuple - - 3 - - 1 - - 3 - - 1 - - 3 - - 1 - temporal_strides: 1 - use_self_gating: true - - temporal_kernel_sizes: !!python/tuple - - 1 - - 3 - - 1 - temporal_strides: 1 - use_self_gating: true - model_id: 50 - stem_conv_temporal_kernel_size: 5 - stem_conv_temporal_stride: 2 - stem_pool_temporal_stride: 2 - stem_type: v0 - stochastic_depth_drop_rate: 0.0 - type: resnet_3d - dropout_rate: 0.2 - model_type: video_classification - norm_activation: - activation: relu - norm_epsilon: 1.0e-05 - norm_momentum: 0.9 - use_sync_bn: false - train_data: - aug_max_area_ratio: 1.0 - aug_max_aspect_ratio: 2.0 - aug_min_area_ratio: 0.49 - aug_min_aspect_ratio: 0.5 - drop_remainder: true - dtype: 'bfloat16' - feature_shape: !!python/tuple - - 64 - - 224 - - 224 - - 3 - global_batch_size: 1024 - min_image_size: 256 - name: kinetics600 - num_classes: 600 - split: train - validation_data: - dtype: 'bfloat16' - feature_shape: !!python/tuple - - 250 - - 224 - - 224 - - 3 - global_batch_size: 64 - min_image_size: 256 - name: kinetics600 - num_classes: 600 - num_examples: 27780 - num_test_clips: 1 - num_test_crops: 1 - one_hot: true -trainer: - optimizer_config: - learning_rate: - cosine: - alpha: 0.0 - decay_steps: 71400 - initial_learning_rate: 1.6 - name: CosineDecay - type: cosine - warmup: - linear: - name: linear - warmup_learning_rate: 0 - warmup_steps: 1785 - type: linear - train_steps: 71400 - steps_per_loop: 500 - summary_interval: 500 - validation_interval: 500 diff --git a/official/vision/beta/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml b/official/vision/beta/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml deleted file mode 100644 index 43f656ce3..000000000 --- a/official/vision/beta/configs/experiments/video_classification/k600_slowonly8x8_tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# SlowOnly 8x8 video classification on Kinetics-600. -# -# --experiment_type=video_classification_kinetics600 -# Expected accuracy: 77.3% top-1, 93.6% top-5. 
-runtime: - distribution_strategy: 'tpu' - mixed_precision_dtype: 'bfloat16' -task: - model: - dropout_rate: 0.5 - norm_activation: - use_sync_bn: false - backbone: - resnet_3d: - block_specs: !!python/tuple - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 1 - - 1 - - 1 - - 1 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - - temporal_kernel_sizes: !!python/tuple - - 3 - - 3 - - 3 - temporal_strides: 1 - use_self_gating: false - model_id: 50 - stem_conv_temporal_kernel_size: 1 - stem_conv_temporal_stride: 1 - stem_pool_temporal_stride: 1 - train_data: - name: kinetics600 - feature_shape: !!python/tuple - - 8 - - 224 - - 224 - - 3 - temporal_stride: 8 - global_batch_size: 1024 - dtype: 'bfloat16' - shuffle_buffer_size: 1024 - aug_max_area_ratio: 1.0 - aug_max_aspect_ratio: 2.0 - aug_min_area_ratio: 0.08 - aug_min_aspect_ratio: 0.5 - validation_data: - name: kinetics600 - feature_shape: !!python/tuple - - 8 - - 256 - - 256 - - 3 - temporal_stride: 8 - num_test_clips: 10 - num_test_crops: 3 - global_batch_size: 64 - dtype: 'bfloat16' - drop_remainder: false -trainer: - optimizer_config: - learning_rate: - cosine: - initial_learning_rate: 0.8 - decay_steps: 71488 - warmup: - linear: - warmup_steps: 1787 - train_steps: 71488 - steps_per_loop: 500 - summary_interval: 500 - validation_interval: 500 diff --git a/official/vision/beta/configs/image_classification.py b/official/vision/beta/configs/image_classification.py deleted file mode 100644 index 27a1dc46c..000000000 --- a/official/vision/beta/configs/image_classification.py +++ /dev/null @@ -1,397 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Image classification configuration definition.""" -import dataclasses -import os -from typing import List, Optional - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.modeling import hyperparams -from official.modeling import optimization -from official.vision.beta.configs import common -from official.vision.beta.configs import backbones - - -@dataclasses.dataclass -class DataConfig(cfg.DataConfig): - """Input config for training.""" - input_path: str = '' - global_batch_size: int = 0 - is_training: bool = True - dtype: str = 'float32' - shuffle_buffer_size: int = 10000 - cycle_length: int = 10 - is_multilabel: bool = False - aug_rand_hflip: bool = True - aug_type: Optional[ - common.Augmentation] = None # Choose from AutoAugment and RandAugment. - color_jitter: float = 0. 
- random_erasing: Optional[common.RandomErasing] = None - file_type: str = 'tfrecord' - image_field_key: str = 'image/encoded' - label_field_key: str = 'image/class/label' - decode_jpeg_only: bool = True - mixup_and_cutmix: Optional[common.MixupAndCutmix] = None - decoder: Optional[common.DataDecoder] = common.DataDecoder() - - # Keep for backward compatibility. - aug_policy: Optional[str] = None # None, 'autoaug', or 'randaug'. - randaug_magnitude: Optional[int] = 10 - - -@dataclasses.dataclass -class ImageClassificationModel(hyperparams.Config): - """The model config.""" - num_classes: int = 0 - input_size: List[int] = dataclasses.field(default_factory=list) - backbone: backbones.Backbone = backbones.Backbone( - type='resnet', resnet=backbones.ResNet()) - dropout_rate: float = 0.0 - norm_activation: common.NormActivation = common.NormActivation( - use_sync_bn=False) - # Adds a BatchNormalization layer pre-GlobalAveragePooling in classification - add_head_batch_norm: bool = False - kernel_initializer: str = 'random_uniform' - - -@dataclasses.dataclass -class Losses(hyperparams.Config): - loss_weight: float = 1.0 - one_hot: bool = True - label_smoothing: float = 0.0 - l2_weight_decay: float = 0.0 - soft_labels: bool = False - - -@dataclasses.dataclass -class Evaluation(hyperparams.Config): - top_k: int = 5 - - -@dataclasses.dataclass -class ImageClassificationTask(cfg.TaskConfig): - """The task config.""" - model: ImageClassificationModel = ImageClassificationModel() - train_data: DataConfig = DataConfig(is_training=True) - validation_data: DataConfig = DataConfig(is_training=False) - losses: Losses = Losses() - evaluation: Evaluation = Evaluation() - init_checkpoint: Optional[str] = None - init_checkpoint_modules: str = 'all' # all or backbone - model_output_keys: Optional[List[int]] = dataclasses.field( - default_factory=list) - - -@exp_factory.register_config_factory('image_classification') -def image_classification() -> cfg.ExperimentConfig: - """Image classification general.""" - return cfg.ExperimentConfig( - task=ImageClassificationTask(), - trainer=cfg.TrainerConfig(), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - -IMAGENET_TRAIN_EXAMPLES = 1281167 -IMAGENET_VAL_EXAMPLES = 50000 -IMAGENET_INPUT_PATH_BASE = 'imagenet-2012-tfrecord' - - -@exp_factory.register_config_factory('resnet_imagenet') -def image_classification_imagenet() -> cfg.ExperimentConfig: - """Image classification on imagenet with resnet.""" - train_batch_size = 4096 - eval_batch_size = 4096 - steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(enable_xla=True), - task=ImageClassificationTask( - model=ImageClassificationModel( - num_classes=1001, - input_size=[224, 224, 3], - backbone=backbones.Backbone( - type='resnet', resnet=backbones.ResNet(model_id=50)), - norm_activation=common.NormActivation( - norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)), - losses=Losses(l2_weight_decay=1e-4), - train_data=DataConfig( - input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size), - validation_data=DataConfig( - input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'valid*'), - is_training=False, - global_batch_size=eval_batch_size)), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=90 * steps_per_epoch, - 
validation_steps=IMAGENET_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [ - 30 * steps_per_epoch, 60 * steps_per_epoch, - 80 * steps_per_epoch - ], - 'values': [ - 0.1 * train_batch_size / 256, - 0.01 * train_batch_size / 256, - 0.001 * train_batch_size / 256, - 0.0001 * train_batch_size / 256, - ] - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config - - -@exp_factory.register_config_factory('resnet_rs_imagenet') -def image_classification_imagenet_resnetrs() -> cfg.ExperimentConfig: - """Image classification on imagenet with resnet-rs.""" - train_batch_size = 4096 - eval_batch_size = 4096 - steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size - config = cfg.ExperimentConfig( - task=ImageClassificationTask( - model=ImageClassificationModel( - num_classes=1001, - input_size=[160, 160, 3], - backbone=backbones.Backbone( - type='resnet', - resnet=backbones.ResNet( - model_id=50, - stem_type='v1', - resnetd_shortcut=True, - replace_stem_max_pool=True, - se_ratio=0.25, - stochastic_depth_drop_rate=0.0)), - dropout_rate=0.25, - norm_activation=common.NormActivation( - norm_momentum=0.0, - norm_epsilon=1e-5, - use_sync_bn=False, - activation='swish')), - losses=Losses(l2_weight_decay=4e-5, label_smoothing=0.1), - train_data=DataConfig( - input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size, - aug_type=common.Augmentation( - type='randaug', randaug=common.RandAugment(magnitude=10))), - validation_data=DataConfig( - input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'valid*'), - is_training=False, - global_batch_size=eval_batch_size)), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=350 * steps_per_epoch, - validation_steps=IMAGENET_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'ema': { - 'average_decay': 0.9999, - 'trainable_weights_only': False, - }, - 'learning_rate': { - 'type': 'cosine', - 'cosine': { - 'initial_learning_rate': 1.6, - 'decay_steps': 350 * steps_per_epoch - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - return config - - -@exp_factory.register_config_factory('revnet_imagenet') -def image_classification_imagenet_revnet() -> cfg.ExperimentConfig: - """Returns a revnet config for image classification on imagenet.""" - train_batch_size = 4096 - eval_batch_size = 4096 - steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size - - config = cfg.ExperimentConfig( - task=ImageClassificationTask( - model=ImageClassificationModel( - num_classes=1001, - input_size=[224, 224, 3], - backbone=backbones.Backbone( - type='revnet', revnet=backbones.RevNet(model_id=56)), - norm_activation=common.NormActivation( - norm_momentum=0.9, 
norm_epsilon=1e-5, use_sync_bn=False), - add_head_batch_norm=True), - losses=Losses(l2_weight_decay=1e-4), - train_data=DataConfig( - input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size), - validation_data=DataConfig( - input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'valid*'), - is_training=False, - global_batch_size=eval_batch_size)), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=90 * steps_per_epoch, - validation_steps=IMAGENET_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [ - 30 * steps_per_epoch, 60 * steps_per_epoch, - 80 * steps_per_epoch - ], - 'values': [0.8, 0.08, 0.008, 0.0008] - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config - - -@exp_factory.register_config_factory('mobilenet_imagenet') -def image_classification_imagenet_mobilenet() -> cfg.ExperimentConfig: - """Image classification on imagenet with mobilenet.""" - train_batch_size = 4096 - eval_batch_size = 4096 - steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size - config = cfg.ExperimentConfig( - task=ImageClassificationTask( - model=ImageClassificationModel( - num_classes=1001, - dropout_rate=0.2, - input_size=[224, 224, 3], - backbone=backbones.Backbone( - type='mobilenet', - mobilenet=backbones.MobileNet( - model_id='MobileNetV2', filter_size_scale=1.0)), - norm_activation=common.NormActivation( - norm_momentum=0.997, norm_epsilon=1e-3, use_sync_bn=False)), - losses=Losses(l2_weight_decay=1e-5, label_smoothing=0.1), - train_data=DataConfig( - input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size), - validation_data=DataConfig( - input_path=os.path.join(IMAGENET_INPUT_PATH_BASE, 'valid*'), - is_training=False, - global_batch_size=eval_batch_size)), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=500 * steps_per_epoch, - validation_steps=IMAGENET_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'rmsprop', - 'rmsprop': { - 'rho': 0.9, - 'momentum': 0.9, - 'epsilon': 0.002, - } - }, - 'learning_rate': { - 'type': 'exponential', - 'exponential': { - 'initial_learning_rate': - 0.008 * (train_batch_size // 128), - 'decay_steps': - int(2.5 * steps_per_epoch), - 'decay_rate': - 0.98, - 'staircase': - True - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - }, - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config diff --git a/official/vision/beta/configs/image_classification_test.py b/official/vision/beta/configs/image_classification_test.py deleted file mode 100644 index d3c13b5c0..000000000 --- a/official/vision/beta/configs/image_classification_test.py +++ /dev/null @@ -1,48 +0,0 @@ -# 
Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for image_classification.""" -# pylint: disable=unused-import -from absl.testing import parameterized -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.vision import beta -from official.vision.beta.configs import image_classification as exp_cfg - - -class ImageClassificationConfigTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters( - ('resnet_imagenet',), - ('resnet_rs_imagenet',), - ('revnet_imagenet',), - ('mobilenet_imagenet'), - ) - def test_image_classification_configs(self, config_name): - config = exp_factory.get_exp_config(config_name) - self.assertIsInstance(config, cfg.ExperimentConfig) - self.assertIsInstance(config.task, exp_cfg.ImageClassificationTask) - self.assertIsInstance(config.task.model, - exp_cfg.ImageClassificationModel) - self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig) - config.validate() - config.task.train_data.is_training = None - with self.assertRaises(KeyError): - config.validate() - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/configs/maskrcnn.py b/official/vision/beta/configs/maskrcnn.py deleted file mode 100644 index 952339ed0..000000000 --- a/official/vision/beta/configs/maskrcnn.py +++ /dev/null @@ -1,522 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""R-CNN(-RS) configuration definition.""" - -import dataclasses -import os -from typing import List, Optional, Union - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.modeling import hyperparams -from official.modeling import optimization -from official.vision.beta.configs import common -from official.vision.beta.configs import decoders -from official.vision.beta.configs import backbones - - -# pylint: disable=missing-class-docstring -@dataclasses.dataclass -class Parser(hyperparams.Config): - num_channels: int = 3 - match_threshold: float = 0.5 - unmatched_threshold: float = 0.5 - aug_rand_hflip: bool = False - aug_scale_min: float = 1.0 - aug_scale_max: float = 1.0 - skip_crowd_during_training: bool = True - max_num_instances: int = 100 - rpn_match_threshold: float = 0.7 - rpn_unmatched_threshold: float = 0.3 - rpn_batch_size_per_im: int = 256 - rpn_fg_fraction: float = 0.5 - mask_crop_size: int = 112 - - -@dataclasses.dataclass -class DataConfig(cfg.DataConfig): - """Input config for training.""" - input_path: str = '' - global_batch_size: int = 0 - is_training: bool = False - dtype: str = 'bfloat16' - decoder: common.DataDecoder = common.DataDecoder() - parser: Parser = Parser() - shuffle_buffer_size: int = 10000 - file_type: str = 'tfrecord' - drop_remainder: bool = True - # Number of examples in the data set, it's used to create the annotation file. - num_examples: int = -1 - - -@dataclasses.dataclass -class Anchor(hyperparams.Config): - num_scales: int = 1 - aspect_ratios: List[float] = dataclasses.field( - default_factory=lambda: [0.5, 1.0, 2.0]) - anchor_size: float = 8.0 - - -@dataclasses.dataclass -class RPNHead(hyperparams.Config): - num_convs: int = 1 - num_filters: int = 256 - use_separable_conv: bool = False - - -@dataclasses.dataclass -class DetectionHead(hyperparams.Config): - num_convs: int = 4 - num_filters: int = 256 - use_separable_conv: bool = False - num_fcs: int = 1 - fc_dims: int = 1024 - class_agnostic_bbox_pred: bool = False # Has to be True for Cascade RCNN. - # If additional IoUs are passed in 'cascade_iou_thresholds' - # then ensemble the class probabilities from all heads. - cascade_class_ensemble: bool = False - - -@dataclasses.dataclass -class ROIGenerator(hyperparams.Config): - pre_nms_top_k: int = 2000 - pre_nms_score_threshold: float = 0.0 - pre_nms_min_size_threshold: float = 0.0 - nms_iou_threshold: float = 0.7 - num_proposals: int = 1000 - test_pre_nms_top_k: int = 1000 - test_pre_nms_score_threshold: float = 0.0 - test_pre_nms_min_size_threshold: float = 0.0 - test_nms_iou_threshold: float = 0.7 - test_num_proposals: int = 1000 - use_batched_nms: bool = False - - -@dataclasses.dataclass -class ROISampler(hyperparams.Config): - mix_gt_boxes: bool = True - num_sampled_rois: int = 512 - foreground_fraction: float = 0.25 - foreground_iou_threshold: float = 0.5 - background_iou_high_threshold: float = 0.5 - background_iou_low_threshold: float = 0.0 - # IoU thresholds for additional FRCNN heads in Cascade mode. - # `foreground_iou_threshold` is the first threshold. 
- cascade_iou_thresholds: Optional[List[float]] = None - - -@dataclasses.dataclass -class ROIAligner(hyperparams.Config): - crop_size: int = 7 - sample_offset: float = 0.5 - - -@dataclasses.dataclass -class DetectionGenerator(hyperparams.Config): - apply_nms: bool = True - pre_nms_top_k: int = 5000 - pre_nms_score_threshold: float = 0.05 - nms_iou_threshold: float = 0.5 - max_num_detections: int = 100 - nms_version: str = 'v2' # `v2`, `v1`, `batched` - use_cpu_nms: bool = False - soft_nms_sigma: Optional[float] = None # Only works when nms_version='v1'. - - -@dataclasses.dataclass -class MaskHead(hyperparams.Config): - upsample_factor: int = 2 - num_convs: int = 4 - num_filters: int = 256 - use_separable_conv: bool = False - class_agnostic: bool = False - - -@dataclasses.dataclass -class MaskSampler(hyperparams.Config): - num_sampled_masks: int = 128 - - -@dataclasses.dataclass -class MaskROIAligner(hyperparams.Config): - crop_size: int = 14 - sample_offset: float = 0.5 - - -@dataclasses.dataclass -class MaskRCNN(hyperparams.Config): - num_classes: int = 0 - input_size: List[int] = dataclasses.field(default_factory=list) - min_level: int = 2 - max_level: int = 6 - anchor: Anchor = Anchor() - include_mask: bool = True - backbone: backbones.Backbone = backbones.Backbone( - type='resnet', resnet=backbones.ResNet()) - decoder: decoders.Decoder = decoders.Decoder( - type='fpn', fpn=decoders.FPN()) - rpn_head: RPNHead = RPNHead() - detection_head: DetectionHead = DetectionHead() - roi_generator: ROIGenerator = ROIGenerator() - roi_sampler: ROISampler = ROISampler() - roi_aligner: ROIAligner = ROIAligner() - detection_generator: DetectionGenerator = DetectionGenerator() - mask_head: Optional[MaskHead] = MaskHead() - mask_sampler: Optional[MaskSampler] = MaskSampler() - mask_roi_aligner: Optional[MaskROIAligner] = MaskROIAligner() - norm_activation: common.NormActivation = common.NormActivation( - norm_momentum=0.997, - norm_epsilon=0.0001, - use_sync_bn=True) - - -@dataclasses.dataclass -class Losses(hyperparams.Config): - loss_weight: float = 1.0 - rpn_huber_loss_delta: float = 1. / 9. - frcnn_huber_loss_delta: float = 1. - l2_weight_decay: float = 0.0 - rpn_score_weight: float = 1.0 - rpn_box_weight: float = 1.0 - frcnn_class_weight: float = 1.0 - frcnn_box_weight: float = 1.0 - mask_weight: float = 1.0 - - -@dataclasses.dataclass -class MaskRCNNTask(cfg.TaskConfig): - model: MaskRCNN = MaskRCNN() - train_data: DataConfig = DataConfig(is_training=True) - validation_data: DataConfig = DataConfig(is_training=False, - drop_remainder=False) - losses: Losses = Losses() - init_checkpoint: Optional[str] = None - init_checkpoint_modules: Union[ - str, List[str]] = 'all' # all, backbone, and/or decoder - annotation_file: Optional[str] = None - per_category_metrics: bool = False - # If set, we only use masks for the specified class IDs. - allowed_mask_class_ids: Optional[List[int]] = None - # If set, the COCO metrics will be computed. - use_coco_metrics: bool = True - # If set, the Waymo Open Dataset evaluator would be used. 
- use_wod_metrics: bool = False - - -COCO_INPUT_PATH_BASE = 'coco' - - -@exp_factory.register_config_factory('fasterrcnn_resnetfpn_coco') -def fasterrcnn_resnetfpn_coco() -> cfg.ExperimentConfig: - """COCO object detection with Faster R-CNN.""" - steps_per_epoch = 500 - coco_val_samples = 5000 - train_batch_size = 64 - eval_batch_size = 8 - - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=MaskRCNNTask( - init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080', - init_checkpoint_modules='backbone', - annotation_file=os.path.join(COCO_INPUT_PATH_BASE, - 'instances_val2017.json'), - model=MaskRCNN( - num_classes=91, - input_size=[1024, 1024, 3], - include_mask=False, - mask_head=None, - mask_sampler=None, - mask_roi_aligner=None), - losses=Losses(l2_weight_decay=0.00004), - train_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size, - parser=Parser( - aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)), - validation_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), - is_training=False, - global_batch_size=eval_batch_size, - drop_remainder=False)), - trainer=cfg.TrainerConfig( - train_steps=22500, - validation_steps=coco_val_samples // eval_batch_size, - validation_interval=steps_per_epoch, - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [15000, 20000], - 'values': [0.12, 0.012, 0.0012], - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 500, - 'warmup_learning_rate': 0.0067 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - return config - - -@exp_factory.register_config_factory('maskrcnn_resnetfpn_coco') -def maskrcnn_resnetfpn_coco() -> cfg.ExperimentConfig: - """COCO object detection with Mask R-CNN.""" - steps_per_epoch = 500 - coco_val_samples = 5000 - train_batch_size = 64 - eval_batch_size = 8 - - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig( - mixed_precision_dtype='bfloat16', enable_xla=True), - task=MaskRCNNTask( - init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080', - init_checkpoint_modules='backbone', - annotation_file=os.path.join(COCO_INPUT_PATH_BASE, - 'instances_val2017.json'), - model=MaskRCNN( - num_classes=91, input_size=[1024, 1024, 3], include_mask=True), - losses=Losses(l2_weight_decay=0.00004), - train_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size, - parser=Parser( - aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.25)), - validation_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), - is_training=False, - global_batch_size=eval_batch_size, - drop_remainder=False)), - trainer=cfg.TrainerConfig( - train_steps=22500, - validation_steps=coco_val_samples // eval_batch_size, - validation_interval=steps_per_epoch, - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 
'type': 'stepwise', - 'stepwise': { - 'boundaries': [15000, 20000], - 'values': [0.12, 0.012, 0.0012], - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 500, - 'warmup_learning_rate': 0.0067 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - return config - - -@exp_factory.register_config_factory('maskrcnn_spinenet_coco') -def maskrcnn_spinenet_coco() -> cfg.ExperimentConfig: - """COCO object detection with Mask R-CNN with SpineNet backbone.""" - steps_per_epoch = 463 - coco_val_samples = 5000 - train_batch_size = 256 - eval_batch_size = 8 - - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=MaskRCNNTask( - annotation_file=os.path.join(COCO_INPUT_PATH_BASE, - 'instances_val2017.json'), - model=MaskRCNN( - backbone=backbones.Backbone( - type='spinenet', - spinenet=backbones.SpineNet( - model_id='49', - min_level=3, - max_level=7, - )), - decoder=decoders.Decoder( - type='identity', identity=decoders.Identity()), - anchor=Anchor(anchor_size=3), - norm_activation=common.NormActivation(use_sync_bn=True), - num_classes=91, - input_size=[640, 640, 3], - min_level=3, - max_level=7, - include_mask=True), - losses=Losses(l2_weight_decay=0.00004), - train_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size, - parser=Parser( - aug_rand_hflip=True, aug_scale_min=0.5, aug_scale_max=2.0)), - validation_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), - is_training=False, - global_batch_size=eval_batch_size, - drop_remainder=False)), - trainer=cfg.TrainerConfig( - train_steps=steps_per_epoch * 350, - validation_steps=coco_val_samples // eval_batch_size, - validation_interval=steps_per_epoch, - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [ - steps_per_epoch * 320, steps_per_epoch * 340 - ], - 'values': [0.32, 0.032, 0.0032], - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 2000, - 'warmup_learning_rate': 0.0067 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - 'task.model.min_level == task.model.backbone.spinenet.min_level', - 'task.model.max_level == task.model.backbone.spinenet.max_level', - ]) - return config - - -@exp_factory.register_config_factory('cascadercnn_spinenet_coco') -def cascadercnn_spinenet_coco() -> cfg.ExperimentConfig: - """COCO object detection with Cascade RCNN-RS with SpineNet backbone.""" - steps_per_epoch = 463 - coco_val_samples = 5000 - train_batch_size = 256 - eval_batch_size = 8 - - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=MaskRCNNTask( - annotation_file=os.path.join(COCO_INPUT_PATH_BASE, - 'instances_val2017.json'), - model=MaskRCNN( - backbone=backbones.Backbone( - type='spinenet', - spinenet=backbones.SpineNet( - model_id='49', - min_level=3, - max_level=7, - )), - decoder=decoders.Decoder( - type='identity', identity=decoders.Identity()), - roi_sampler=ROISampler(cascade_iou_thresholds=[0.6, 0.7]), - detection_head=DetectionHead( - class_agnostic_bbox_pred=True, cascade_class_ensemble=True), - 
anchor=Anchor(anchor_size=3), - norm_activation=common.NormActivation( - use_sync_bn=True, activation='swish'), - num_classes=91, - input_size=[640, 640, 3], - min_level=3, - max_level=7, - include_mask=True), - losses=Losses(l2_weight_decay=0.00004), - train_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size, - parser=Parser( - aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.5)), - validation_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), - is_training=False, - global_batch_size=eval_batch_size, - drop_remainder=False)), - trainer=cfg.TrainerConfig( - train_steps=steps_per_epoch * 500, - validation_steps=coco_val_samples // eval_batch_size, - validation_interval=steps_per_epoch, - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [ - steps_per_epoch * 475, steps_per_epoch * 490 - ], - 'values': [0.32, 0.032, 0.0032], - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 2000, - 'warmup_learning_rate': 0.0067 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - 'task.model.min_level == task.model.backbone.spinenet.min_level', - 'task.model.max_level == task.model.backbone.spinenet.max_level', - ]) - return config diff --git a/official/vision/beta/configs/maskrcnn_test.py b/official/vision/beta/configs/maskrcnn_test.py deleted file mode 100644 index 2b57a05c9..000000000 --- a/official/vision/beta/configs/maskrcnn_test.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for maskrcnn.""" -# pylint: disable=unused-import -from absl.testing import parameterized -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.vision import beta -from official.vision.beta.configs import maskrcnn as exp_cfg - - -class MaskRCNNConfigTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters( - ('fasterrcnn_resnetfpn_coco',), - ('maskrcnn_resnetfpn_coco',), - ('maskrcnn_spinenet_coco',), - ('cascadercnn_spinenet_coco',), - ) - def test_maskrcnn_configs(self, config_name): - config = exp_factory.get_exp_config(config_name) - self.assertIsInstance(config, cfg.ExperimentConfig) - self.assertIsInstance(config.task, exp_cfg.MaskRCNNTask) - self.assertIsInstance(config.task.model, exp_cfg.MaskRCNN) - self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig) - config.validate() - config.task.train_data.is_training = None - with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'): - config.validate() - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/configs/retinanet.py b/official/vision/beta/configs/retinanet.py deleted file mode 100644 index 4bc16be7a..000000000 --- a/official/vision/beta/configs/retinanet.py +++ /dev/null @@ -1,420 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""RetinaNet configuration definition.""" - -import dataclasses -import os -from typing import List, Optional, Union - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.modeling import hyperparams -from official.modeling import optimization -from official.vision.beta.configs import common -from official.vision.beta.configs import decoders -from official.vision.beta.configs import backbones - - -# pylint: disable=missing-class-docstring -# Keep for backward compatibility. -@dataclasses.dataclass -class TfExampleDecoder(common.TfExampleDecoder): - """A simple TF Example decoder config.""" - - -# Keep for backward compatibility. -@dataclasses.dataclass -class TfExampleDecoderLabelMap(common.TfExampleDecoderLabelMap): - """TF Example decoder with label map config.""" - - -# Keep for backward compatibility. -@dataclasses.dataclass -class DataDecoder(common.DataDecoder): - """Data decoder config.""" - - -@dataclasses.dataclass -class Parser(hyperparams.Config): - num_channels: int = 3 - match_threshold: float = 0.5 - unmatched_threshold: float = 0.5 - aug_rand_hflip: bool = False - aug_scale_min: float = 1.0 - aug_scale_max: float = 1.0 - skip_crowd_during_training: bool = True - max_num_instances: int = 100 - # Can choose AutoAugment and RandAugment. - aug_type: Optional[common.Augmentation] = None - - # Keep for backward compatibility. Not used. 
- aug_policy: Optional[str] = None - - -@dataclasses.dataclass -class DataConfig(cfg.DataConfig): - """Input config for training.""" - input_path: str = '' - global_batch_size: int = 0 - is_training: bool = False - dtype: str = 'bfloat16' - decoder: common.DataDecoder = common.DataDecoder() - parser: Parser = Parser() - shuffle_buffer_size: int = 10000 - file_type: str = 'tfrecord' - - -@dataclasses.dataclass -class Anchor(hyperparams.Config): - num_scales: int = 3 - aspect_ratios: List[float] = dataclasses.field( - default_factory=lambda: [0.5, 1.0, 2.0]) - anchor_size: float = 4.0 - - -@dataclasses.dataclass -class Losses(hyperparams.Config): - loss_weight: float = 1.0 - focal_loss_alpha: float = 0.25 - focal_loss_gamma: float = 1.5 - huber_loss_delta: float = 0.1 - box_loss_weight: int = 50 - l2_weight_decay: float = 0.0 - - -@dataclasses.dataclass -class AttributeHead(hyperparams.Config): - name: str = '' - type: str = 'regression' - size: int = 1 - - -@dataclasses.dataclass -class RetinaNetHead(hyperparams.Config): - num_convs: int = 4 - num_filters: int = 256 - use_separable_conv: bool = False - attribute_heads: List[AttributeHead] = dataclasses.field(default_factory=list) - - -@dataclasses.dataclass -class DetectionGenerator(hyperparams.Config): - apply_nms: bool = True - pre_nms_top_k: int = 5000 - pre_nms_score_threshold: float = 0.05 - nms_iou_threshold: float = 0.5 - max_num_detections: int = 100 - nms_version: str = 'v2' # `v2`, `v1`, `batched`. - use_cpu_nms: bool = False - soft_nms_sigma: Optional[float] = None # Only works when nms_version='v1'. - - -@dataclasses.dataclass -class RetinaNet(hyperparams.Config): - num_classes: int = 0 - input_size: List[int] = dataclasses.field(default_factory=list) - min_level: int = 3 - max_level: int = 7 - anchor: Anchor = Anchor() - backbone: backbones.Backbone = backbones.Backbone( - type='resnet', resnet=backbones.ResNet()) - decoder: decoders.Decoder = decoders.Decoder( - type='fpn', fpn=decoders.FPN()) - head: RetinaNetHead = RetinaNetHead() - detection_generator: DetectionGenerator = DetectionGenerator() - norm_activation: common.NormActivation = common.NormActivation() - - -@dataclasses.dataclass -class ExportConfig(hyperparams.Config): - output_normalized_coordinates: bool = False - cast_num_detections_to_float: bool = False - cast_detection_classes_to_float: bool = False - - -@dataclasses.dataclass -class RetinaNetTask(cfg.TaskConfig): - model: RetinaNet = RetinaNet() - train_data: DataConfig = DataConfig(is_training=True) - validation_data: DataConfig = DataConfig(is_training=False) - losses: Losses = Losses() - init_checkpoint: Optional[str] = None - init_checkpoint_modules: Union[ - str, List[str]] = 'all' # all, backbone, and/or decoder - annotation_file: Optional[str] = None - per_category_metrics: bool = False - export_config: ExportConfig = ExportConfig() - - -@exp_factory.register_config_factory('retinanet') -def retinanet() -> cfg.ExperimentConfig: - """RetinaNet general config.""" - return cfg.ExperimentConfig( - task=RetinaNetTask(), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - -COCO_INPUT_PATH_BASE = 'coco' -COCO_TRAIN_EXAMPLES = 118287 -COCO_VAL_EXAMPLES = 5000 - - -@exp_factory.register_config_factory('retinanet_resnetfpn_coco') -def retinanet_resnetfpn_coco() -> cfg.ExperimentConfig: - """COCO object detection with RetinaNet.""" - train_batch_size = 256 - eval_batch_size = 8 - steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size - - config = 
cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=RetinaNetTask( - init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/resnet50_imagenet/ckpt-28080', - init_checkpoint_modules='backbone', - annotation_file=os.path.join(COCO_INPUT_PATH_BASE, - 'instances_val2017.json'), - model=RetinaNet( - num_classes=91, - input_size=[640, 640, 3], - norm_activation=common.NormActivation(use_sync_bn=False), - min_level=3, - max_level=7), - losses=Losses(l2_weight_decay=1e-4), - train_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size, - parser=Parser( - aug_rand_hflip=True, aug_scale_min=0.8, aug_scale_max=1.2)), - validation_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), - is_training=False, - global_batch_size=eval_batch_size)), - trainer=cfg.TrainerConfig( - train_steps=72 * steps_per_epoch, - validation_steps=COCO_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [ - 57 * steps_per_epoch, 67 * steps_per_epoch - ], - 'values': [ - 0.32 * train_batch_size / 256.0, - 0.032 * train_batch_size / 256.0, - 0.0032 * train_batch_size / 256.0 - ], - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 500, - 'warmup_learning_rate': 0.0067 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config - - -@exp_factory.register_config_factory('retinanet_spinenet_coco') -def retinanet_spinenet_coco() -> cfg.ExperimentConfig: - """COCO object detection with RetinaNet using SpineNet backbone.""" - train_batch_size = 256 - eval_batch_size = 8 - steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size - input_size = 640 - - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='float32'), - task=RetinaNetTask( - annotation_file=os.path.join(COCO_INPUT_PATH_BASE, - 'instances_val2017.json'), - model=RetinaNet( - backbone=backbones.Backbone( - type='spinenet', - spinenet=backbones.SpineNet( - model_id='49', - stochastic_depth_drop_rate=0.2, - min_level=3, - max_level=7)), - decoder=decoders.Decoder( - type='identity', identity=decoders.Identity()), - anchor=Anchor(anchor_size=3), - norm_activation=common.NormActivation( - use_sync_bn=True, activation='swish'), - num_classes=91, - input_size=[input_size, input_size, 3], - min_level=3, - max_level=7), - losses=Losses(l2_weight_decay=4e-5), - train_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size, - parser=Parser( - aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.0)), - validation_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), - is_training=False, - global_batch_size=eval_batch_size)), - trainer=cfg.TrainerConfig( - train_steps=500 * steps_per_epoch, - validation_steps=COCO_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 
'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [ - 475 * steps_per_epoch, 490 * steps_per_epoch - ], - 'values': [ - 0.32 * train_batch_size / 256.0, - 0.032 * train_batch_size / 256.0, - 0.0032 * train_batch_size / 256.0 - ], - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 2000, - 'warmup_learning_rate': 0.0067 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - 'task.model.min_level == task.model.backbone.spinenet.min_level', - 'task.model.max_level == task.model.backbone.spinenet.max_level', - ]) - - return config - - -@exp_factory.register_config_factory('retinanet_mobile_coco') -def retinanet_spinenet_mobile_coco() -> cfg.ExperimentConfig: - """COCO object detection with mobile RetinaNet.""" - train_batch_size = 256 - eval_batch_size = 8 - steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size - input_size = 384 - - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='float32'), - task=RetinaNetTask( - annotation_file=os.path.join(COCO_INPUT_PATH_BASE, - 'instances_val2017.json'), - model=RetinaNet( - backbone=backbones.Backbone( - type='spinenet_mobile', - spinenet_mobile=backbones.SpineNetMobile( - model_id='49', - stochastic_depth_drop_rate=0.2, - min_level=3, - max_level=7, - use_keras_upsampling_2d=False)), - decoder=decoders.Decoder( - type='identity', identity=decoders.Identity()), - head=RetinaNetHead(num_filters=48, use_separable_conv=True), - anchor=Anchor(anchor_size=3), - norm_activation=common.NormActivation( - use_sync_bn=True, activation='swish'), - num_classes=91, - input_size=[input_size, input_size, 3], - min_level=3, - max_level=7), - losses=Losses(l2_weight_decay=3e-5), - train_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'train*'), - is_training=True, - global_batch_size=train_batch_size, - parser=Parser( - aug_rand_hflip=True, aug_scale_min=0.1, aug_scale_max=2.0)), - validation_data=DataConfig( - input_path=os.path.join(COCO_INPUT_PATH_BASE, 'val*'), - is_training=False, - global_batch_size=eval_batch_size)), - trainer=cfg.TrainerConfig( - train_steps=600 * steps_per_epoch, - validation_steps=COCO_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [ - 575 * steps_per_epoch, 590 * steps_per_epoch - ], - 'values': [ - 0.32 * train_batch_size / 256.0, - 0.032 * train_batch_size / 256.0, - 0.0032 * train_batch_size / 256.0 - ], - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 2000, - 'warmup_learning_rate': 0.0067 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - ]) - - return config diff --git a/official/vision/beta/configs/retinanet_test.py b/official/vision/beta/configs/retinanet_test.py deleted file mode 100644 index 4586bf75f..000000000 --- a/official/vision/beta/configs/retinanet_test.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for retinanet.""" -# pylint: disable=unused-import -from absl.testing import parameterized -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.vision import beta -from official.vision.beta.configs import retinanet as exp_cfg - - -class RetinaNetConfigTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters( - ('retinanet_resnetfpn_coco',), - ('retinanet_spinenet_coco',), - ('retinanet_mobile_coco',), - ) - def test_retinanet_configs(self, config_name): - config = exp_factory.get_exp_config(config_name) - self.assertIsInstance(config, cfg.ExperimentConfig) - self.assertIsInstance(config.task, exp_cfg.RetinaNetTask) - self.assertIsInstance(config.task.model, exp_cfg.RetinaNet) - self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig) - config.validate() - config.task.train_data.is_training = None - with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'): - config.validate() - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/configs/semantic_segmentation.py b/official/vision/beta/configs/semantic_segmentation.py deleted file mode 100644 index 8e9db1eeb..000000000 --- a/official/vision/beta/configs/semantic_segmentation.py +++ /dev/null @@ -1,712 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Semantic segmentation configuration definition.""" -import dataclasses -import os -from typing import List, Optional, Union - -import numpy as np -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.modeling import hyperparams -from official.modeling import optimization -from official.vision.beta.configs import common -from official.vision.beta.configs import decoders -from official.vision.beta.configs import backbones - - -@dataclasses.dataclass -class DataConfig(cfg.DataConfig): - """Input config for training.""" - output_size: List[int] = dataclasses.field(default_factory=list) - # If crop_size is specified, image will be resized first to - # output_size, then crop of size crop_size will be cropped. - crop_size: List[int] = dataclasses.field(default_factory=list) - input_path: str = '' - global_batch_size: int = 0 - is_training: bool = True - dtype: str = 'float32' - shuffle_buffer_size: int = 1000 - cycle_length: int = 10 - # If resize_eval_groundtruth is set to False, original image sizes are used - # for eval. 
In that case, groundtruth_padded_size has to be specified too to - # allow for batching the variable input sizes of images. - resize_eval_groundtruth: bool = True - groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list) - aug_scale_min: float = 1.0 - aug_scale_max: float = 1.0 - aug_rand_hflip: bool = True - preserve_aspect_ratio: bool = True - aug_policy: Optional[str] = None - drop_remainder: bool = True - file_type: str = 'tfrecord' - decoder: Optional[common.DataDecoder] = common.DataDecoder() - - -@dataclasses.dataclass -class SegmentationHead(hyperparams.Config): - """Segmentation head config.""" - level: int = 3 - num_convs: int = 2 - num_filters: int = 256 - use_depthwise_convolution: bool = False - prediction_kernel_size: int = 1 - upsample_factor: int = 1 - feature_fusion: Optional[ - str] = None # None, deeplabv3plus, panoptic_fpn_fusion or pyramid_fusion - # deeplabv3plus feature fusion params - low_level: Union[int, str] = 2 - low_level_num_filters: int = 48 - # panoptic_fpn_fusion params - decoder_min_level: Optional[Union[int, str]] = None - decoder_max_level: Optional[Union[int, str]] = None - - -@dataclasses.dataclass -class MaskScoringHead(hyperparams.Config): - """Mask Scoring head config.""" - num_convs: int = 4 - num_filters: int = 128 - fc_input_size: List[int] = dataclasses.field(default_factory=list) - num_fcs: int = 2 - fc_dims: int = 1024 - - -@dataclasses.dataclass -class SemanticSegmentationModel(hyperparams.Config): - """Semantic segmentation model config.""" - num_classes: int = 0 - input_size: List[int] = dataclasses.field(default_factory=list) - min_level: int = 3 - max_level: int = 6 - head: SegmentationHead = SegmentationHead() - backbone: backbones.Backbone = backbones.Backbone( - type='resnet', resnet=backbones.ResNet()) - decoder: decoders.Decoder = decoders.Decoder(type='identity') - mask_scoring_head: Optional[MaskScoringHead] = None - norm_activation: common.NormActivation = common.NormActivation() - - -@dataclasses.dataclass -class Losses(hyperparams.Config): - loss_weight: float = 1.0 - label_smoothing: float = 0.0 - ignore_label: int = 255 - class_weights: List[float] = dataclasses.field(default_factory=list) - l2_weight_decay: float = 0.0 - use_groundtruth_dimension: bool = True - top_k_percent_pixels: float = 1.0 - - -@dataclasses.dataclass -class Evaluation(hyperparams.Config): - report_per_class_iou: bool = True - report_train_mean_iou: bool = True # Turning this off can speed up training. 
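
For reference, a minimal sketch of how the eval-time `DataConfig` fields defined above are typically combined (assuming the `official` package from the TF Model Garden is importable, i.e. this module as it existed before the deletion; the input path is a placeholder). As the field comment notes, when `resize_eval_groundtruth` is False, `groundtruth_padded_size` must also be set so that variable-sized groundtruth can be batched; this mirrors the PASCAL validation configs later in this file.

```python
# Sketch only, not part of the patch. Assumes the pre-deletion module
# official/vision/beta/configs/semantic_segmentation.py is importable.
from official.vision.beta.configs import semantic_segmentation as seg_cfg

eval_data = seg_cfg.DataConfig(
    input_path='/path/to/val*',          # placeholder glob for eval TFRecords
    output_size=[512, 512],
    is_training=False,
    global_batch_size=8,
    resize_eval_groundtruth=False,       # evaluate against original-size masks
    groundtruth_padded_size=[512, 512],  # required for batching when not resizing
    drop_remainder=False)
```
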
- - -@dataclasses.dataclass -class SemanticSegmentationTask(cfg.TaskConfig): - """The model config.""" - model: SemanticSegmentationModel = SemanticSegmentationModel() - train_data: DataConfig = DataConfig(is_training=True) - validation_data: DataConfig = DataConfig(is_training=False) - losses: Losses = Losses() - evaluation: Evaluation = Evaluation() - train_input_partition_dims: List[int] = dataclasses.field( - default_factory=list) - eval_input_partition_dims: List[int] = dataclasses.field( - default_factory=list) - init_checkpoint: Optional[str] = None - init_checkpoint_modules: Union[ - str, List[str]] = 'all' # all, backbone, and/or decoder - - -@exp_factory.register_config_factory('semantic_segmentation') -def semantic_segmentation() -> cfg.ExperimentConfig: - """Semantic segmentation general.""" - return cfg.ExperimentConfig( - task=SemanticSegmentationTask(), - trainer=cfg.TrainerConfig(), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - -# PASCAL VOC 2012 Dataset -PASCAL_TRAIN_EXAMPLES = 10582 -PASCAL_VAL_EXAMPLES = 1449 -PASCAL_INPUT_PATH_BASE = 'gs://**/pascal_voc_seg' - - -@exp_factory.register_config_factory('seg_deeplabv3_pascal') -def seg_deeplabv3_pascal() -> cfg.ExperimentConfig: - """Image segmentation on pascal voc with resnet deeplabv3.""" - train_batch_size = 16 - eval_batch_size = 8 - steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size - output_stride = 16 - aspp_dilation_rates = [12, 24, 36] # [6, 12, 18] if output_stride = 16 - multigrid = [1, 2, 4] - stem_type = 'v1' - level = int(np.math.log2(output_stride)) - config = cfg.ExperimentConfig( - task=SemanticSegmentationTask( - model=SemanticSegmentationModel( - num_classes=21, - input_size=[None, None, 3], - backbone=backbones.Backbone( - type='dilated_resnet', dilated_resnet=backbones.DilatedResNet( - model_id=101, output_stride=output_stride, - multigrid=multigrid, stem_type=stem_type)), - decoder=decoders.Decoder( - type='aspp', aspp=decoders.ASPP( - level=level, dilation_rates=aspp_dilation_rates)), - head=SegmentationHead(level=level, num_convs=0), - norm_activation=common.NormActivation( - activation='swish', - norm_momentum=0.9997, - norm_epsilon=1e-3, - use_sync_bn=True)), - losses=Losses(l2_weight_decay=1e-4), - train_data=DataConfig( - input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'), - # TODO(arashwan): test changing size to 513 to match deeplab. 
- output_size=[512, 512], - is_training=True, - global_batch_size=train_batch_size, - aug_scale_min=0.5, - aug_scale_max=2.0), - validation_data=DataConfig( - input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'), - output_size=[512, 512], - is_training=False, - global_batch_size=eval_batch_size, - resize_eval_groundtruth=False, - groundtruth_padded_size=[512, 512], - drop_remainder=False), - # resnet101 - init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400', - init_checkpoint_modules='backbone'), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=45 * steps_per_epoch, - validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 0.007, - 'decay_steps': 45 * steps_per_epoch, - 'end_learning_rate': 0.0, - 'power': 0.9 - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config - - -@exp_factory.register_config_factory('seg_deeplabv3plus_pascal') -def seg_deeplabv3plus_pascal() -> cfg.ExperimentConfig: - """Image segmentation on pascal voc with resnet deeplabv3+.""" - train_batch_size = 16 - eval_batch_size = 8 - steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size - output_stride = 16 - aspp_dilation_rates = [6, 12, 18] - multigrid = [1, 2, 4] - stem_type = 'v1' - level = int(np.math.log2(output_stride)) - config = cfg.ExperimentConfig( - task=SemanticSegmentationTask( - model=SemanticSegmentationModel( - num_classes=21, - input_size=[None, None, 3], - backbone=backbones.Backbone( - type='dilated_resnet', dilated_resnet=backbones.DilatedResNet( - model_id=101, output_stride=output_stride, - stem_type=stem_type, multigrid=multigrid)), - decoder=decoders.Decoder( - type='aspp', - aspp=decoders.ASPP( - level=level, dilation_rates=aspp_dilation_rates)), - head=SegmentationHead( - level=level, - num_convs=2, - feature_fusion='deeplabv3plus', - low_level=2, - low_level_num_filters=48), - norm_activation=common.NormActivation( - activation='swish', - norm_momentum=0.9997, - norm_epsilon=1e-3, - use_sync_bn=True)), - losses=Losses(l2_weight_decay=1e-4), - train_data=DataConfig( - input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'), - output_size=[512, 512], - is_training=True, - global_batch_size=train_batch_size, - aug_scale_min=0.5, - aug_scale_max=2.0), - validation_data=DataConfig( - input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'), - output_size=[512, 512], - is_training=False, - global_batch_size=eval_batch_size, - resize_eval_groundtruth=False, - groundtruth_padded_size=[512, 512], - drop_remainder=False), - # resnet101 - init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400', - init_checkpoint_modules='backbone'), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=45 * steps_per_epoch, - validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - 
optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 0.007, - 'decay_steps': 45 * steps_per_epoch, - 'end_learning_rate': 0.0, - 'power': 0.9 - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config - - -@exp_factory.register_config_factory('seg_resnetfpn_pascal') -def seg_resnetfpn_pascal() -> cfg.ExperimentConfig: - """Image segmentation on pascal voc with resnet-fpn.""" - train_batch_size = 256 - eval_batch_size = 32 - steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size - config = cfg.ExperimentConfig( - task=SemanticSegmentationTask( - model=SemanticSegmentationModel( - num_classes=21, - input_size=[512, 512, 3], - min_level=3, - max_level=7, - backbone=backbones.Backbone( - type='resnet', resnet=backbones.ResNet(model_id=50)), - decoder=decoders.Decoder(type='fpn', fpn=decoders.FPN()), - head=SegmentationHead(level=3, num_convs=3), - norm_activation=common.NormActivation( - activation='swish', - use_sync_bn=True)), - losses=Losses(l2_weight_decay=1e-4), - train_data=DataConfig( - input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'), - is_training=True, - global_batch_size=train_batch_size, - aug_scale_min=0.2, - aug_scale_max=1.5), - validation_data=DataConfig( - input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'), - is_training=False, - global_batch_size=eval_batch_size, - resize_eval_groundtruth=False, - groundtruth_padded_size=[512, 512], - drop_remainder=False), - ), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=450 * steps_per_epoch, - validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 0.007, - 'decay_steps': 450 * steps_per_epoch, - 'end_learning_rate': 0.0, - 'power': 0.9 - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config - - -@exp_factory.register_config_factory('mnv2_deeplabv3_pascal') -def mnv2_deeplabv3_pascal() -> cfg.ExperimentConfig: - """Image segmentation on pascal with mobilenetv2 deeplabv3.""" - train_batch_size = 16 - eval_batch_size = 16 - steps_per_epoch = PASCAL_TRAIN_EXAMPLES // train_batch_size - output_stride = 16 - aspp_dilation_rates = [] - level = int(np.math.log2(output_stride)) - pool_kernel_size = [] - - config = cfg.ExperimentConfig( - task=SemanticSegmentationTask( - model=SemanticSegmentationModel( - num_classes=21, - input_size=[None, None, 3], - backbone=backbones.Backbone( - type='mobilenet', - mobilenet=backbones.MobileNet( - model_id='MobileNetV2', output_stride=output_stride)), - decoder=decoders.Decoder( - type='aspp', - aspp=decoders.ASPP( - level=level, - dilation_rates=aspp_dilation_rates, - pool_kernel_size=pool_kernel_size)), - head=SegmentationHead(level=level, num_convs=0), - 
norm_activation=common.NormActivation( - activation='relu', - norm_momentum=0.99, - norm_epsilon=1e-3, - use_sync_bn=True)), - losses=Losses(l2_weight_decay=4e-5), - train_data=DataConfig( - input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'train_aug*'), - output_size=[512, 512], - is_training=True, - global_batch_size=train_batch_size, - aug_scale_min=0.5, - aug_scale_max=2.0), - validation_data=DataConfig( - input_path=os.path.join(PASCAL_INPUT_PATH_BASE, 'val*'), - output_size=[512, 512], - is_training=False, - global_batch_size=eval_batch_size, - resize_eval_groundtruth=False, - groundtruth_padded_size=[512, 512], - drop_remainder=False), - # mobilenetv2 - init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63', - init_checkpoint_modules=['backbone', 'decoder']), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=30000, - validation_steps=PASCAL_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - best_checkpoint_eval_metric='mean_iou', - best_checkpoint_export_subdir='best_ckpt', - best_checkpoint_metric_comp='higher', - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 0.007 * train_batch_size / 16, - 'decay_steps': 30000, - 'end_learning_rate': 0.0, - 'power': 0.9 - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config - - -# Cityscapes Dataset (Download and process the dataset yourself) -CITYSCAPES_TRAIN_EXAMPLES = 2975 -CITYSCAPES_VAL_EXAMPLES = 500 -CITYSCAPES_INPUT_PATH_BASE = 'cityscapes' - - -@exp_factory.register_config_factory('seg_deeplabv3plus_cityscapes') -def seg_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig: - """Image segmentation on cityscapes with resnet deeplabv3+.""" - train_batch_size = 16 - eval_batch_size = 16 - steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size - output_stride = 16 - aspp_dilation_rates = [6, 12, 18] - multigrid = [1, 2, 4] - stem_type = 'v1' - level = int(np.math.log2(output_stride)) - config = cfg.ExperimentConfig( - task=SemanticSegmentationTask( - model=SemanticSegmentationModel( - # Cityscapes uses only 19 semantic classes for train/evaluation. - # The void (background) class is ignored in train and evaluation. 
- num_classes=19, - input_size=[None, None, 3], - backbone=backbones.Backbone( - type='dilated_resnet', dilated_resnet=backbones.DilatedResNet( - model_id=101, output_stride=output_stride, - stem_type=stem_type, multigrid=multigrid)), - decoder=decoders.Decoder( - type='aspp', - aspp=decoders.ASPP( - level=level, dilation_rates=aspp_dilation_rates, - pool_kernel_size=[512, 1024])), - head=SegmentationHead( - level=level, - num_convs=2, - feature_fusion='deeplabv3plus', - low_level=2, - low_level_num_filters=48), - norm_activation=common.NormActivation( - activation='swish', - norm_momentum=0.99, - norm_epsilon=1e-3, - use_sync_bn=True)), - losses=Losses(l2_weight_decay=1e-4), - train_data=DataConfig( - input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, - 'train_fine**'), - crop_size=[512, 1024], - output_size=[1024, 2048], - is_training=True, - global_batch_size=train_batch_size, - aug_scale_min=0.5, - aug_scale_max=2.0), - validation_data=DataConfig( - input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'), - output_size=[1024, 2048], - is_training=False, - global_batch_size=eval_batch_size, - resize_eval_groundtruth=True, - drop_remainder=False), - # resnet101 - init_checkpoint='gs://cloud-tpu-checkpoints/vision-2.0/deeplab/deeplab_resnet101_imagenet/ckpt-62400', - init_checkpoint_modules='backbone'), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=500 * steps_per_epoch, - validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 0.01, - 'decay_steps': 500 * steps_per_epoch, - 'end_learning_rate': 0.0, - 'power': 0.9 - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config - - -@exp_factory.register_config_factory('mnv2_deeplabv3_cityscapes') -def mnv2_deeplabv3_cityscapes() -> cfg.ExperimentConfig: - """Image segmentation on cityscapes with mobilenetv2 deeplabv3.""" - train_batch_size = 16 - eval_batch_size = 16 - steps_per_epoch = CITYSCAPES_TRAIN_EXAMPLES // train_batch_size - output_stride = 16 - aspp_dilation_rates = [] - pool_kernel_size = [512, 1024] - - level = int(np.math.log2(output_stride)) - config = cfg.ExperimentConfig( - task=SemanticSegmentationTask( - model=SemanticSegmentationModel( - # Cityscapes uses only 19 semantic classes for train/evaluation. - # The void (background) class is ignored in train and evaluation. 
- num_classes=19, - input_size=[None, None, 3], - backbone=backbones.Backbone( - type='mobilenet', - mobilenet=backbones.MobileNet( - model_id='MobileNetV2', output_stride=output_stride)), - decoder=decoders.Decoder( - type='aspp', - aspp=decoders.ASPP( - level=level, - dilation_rates=aspp_dilation_rates, - pool_kernel_size=pool_kernel_size)), - head=SegmentationHead(level=level, num_convs=0), - norm_activation=common.NormActivation( - activation='relu', - norm_momentum=0.99, - norm_epsilon=1e-3, - use_sync_bn=True)), - losses=Losses(l2_weight_decay=4e-5), - train_data=DataConfig( - input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, - 'train_fine**'), - crop_size=[512, 1024], - output_size=[1024, 2048], - is_training=True, - global_batch_size=train_batch_size, - aug_scale_min=0.5, - aug_scale_max=2.0), - validation_data=DataConfig( - input_path=os.path.join(CITYSCAPES_INPUT_PATH_BASE, 'val_fine*'), - output_size=[1024, 2048], - is_training=False, - global_batch_size=eval_batch_size, - resize_eval_groundtruth=True, - drop_remainder=False), - # Coco pre-trained mobilenetv2 checkpoint - init_checkpoint='gs://tf_model_garden/cloud/vision-2.0/deeplab/deeplabv3_mobilenetv2_coco/best_ckpt-63', - init_checkpoint_modules='backbone'), - trainer=cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=100000, - validation_steps=CITYSCAPES_VAL_EXAMPLES // eval_batch_size, - validation_interval=steps_per_epoch, - best_checkpoint_eval_metric='mean_iou', - best_checkpoint_export_subdir='best_ckpt', - best_checkpoint_metric_comp='higher', - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 0.01, - 'decay_steps': 100000, - 'end_learning_rate': 0.0, - 'power': 0.9 - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 5 * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - - return config - - -@exp_factory.register_config_factory('mnv2_deeplabv3plus_cityscapes') -def mnv2_deeplabv3plus_cityscapes() -> cfg.ExperimentConfig: - """Image segmentation on cityscapes with mobilenetv2 deeplabv3plus.""" - config = mnv2_deeplabv3_cityscapes() - config.task.model.head = SegmentationHead( - level=4, - num_convs=2, - feature_fusion='deeplabv3plus', - use_depthwise_convolution=True, - low_level='2/depthwise', - low_level_num_filters=48) - config.task.model.backbone.mobilenet.output_intermediate_endpoints = True - return config diff --git a/official/vision/beta/configs/semantic_segmentation_test.py b/official/vision/beta/configs/semantic_segmentation_test.py deleted file mode 100644 index f1eb2586c..000000000 --- a/official/vision/beta/configs/semantic_segmentation_test.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for semantic_segmentation.""" - -# pylint: disable=unused-import -from absl.testing import parameterized -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.vision import beta -from official.vision.beta.configs import semantic_segmentation as exp_cfg - - -class ImageSegmentationConfigTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters(('seg_deeplabv3_pascal',), - ('seg_deeplabv3plus_pascal',)) - def test_semantic_segmentation_configs(self, config_name): - config = exp_factory.get_exp_config(config_name) - self.assertIsInstance(config, cfg.ExperimentConfig) - self.assertIsInstance(config.task, exp_cfg.SemanticSegmentationTask) - self.assertIsInstance(config.task.model, - exp_cfg.SemanticSegmentationModel) - self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig) - config.validate() - config.task.train_data.is_training = None - with self.assertRaises(KeyError): - config.validate() - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/configs/video_classification.py b/official/vision/beta/configs/video_classification.py deleted file mode 100644 index eed288bdc..000000000 --- a/official/vision/beta/configs/video_classification.py +++ /dev/null @@ -1,370 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Video classification configuration definition.""" -import dataclasses -from typing import Optional, Tuple -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.modeling import hyperparams -from official.modeling import optimization -from official.vision.beta.configs import backbones_3d -from official.vision.beta.configs import common - - -@dataclasses.dataclass -class DataConfig(cfg.DataConfig): - """The base configuration for building datasets.""" - name: Optional[str] = None - file_type: Optional[str] = 'tfrecord' - compressed_input: bool = False - split: str = 'train' - variant_name: Optional[str] = None - feature_shape: Tuple[int, ...] = (64, 224, 224, 3) - temporal_stride: int = 1 - random_stride_range: int = 0 - num_test_clips: int = 1 - num_test_crops: int = 1 - num_classes: int = -1 - num_examples: int = -1 - global_batch_size: int = 128 - data_format: str = 'channels_last' - dtype: str = 'float32' - one_hot: bool = True - shuffle_buffer_size: int = 64 - cache: bool = False - input_path: str = '' - is_training: bool = True - cycle_length: int = 10 - drop_remainder: bool = True - min_image_size: int = 256 - is_multilabel: bool = False - output_audio: bool = False - audio_feature: str = '' - audio_feature_shape: Tuple[int, ...] 
= (-1,) - aug_min_aspect_ratio: float = 0.5 - aug_max_aspect_ratio: float = 2.0 - aug_min_area_ratio: float = 0.49 - aug_max_area_ratio: float = 1.0 - aug_type: Optional[str] = None # 'autoaug', 'randaug', or None - image_field_key: str = 'image/encoded' - label_field_key: str = 'clip/label/index' - - -def kinetics400(is_training): - """Generated Kinectics 400 dataset configs.""" - return DataConfig( - name='kinetics400', - num_classes=400, - is_training=is_training, - split='train' if is_training else 'valid', - drop_remainder=is_training, - num_examples=215570 if is_training else 17706, - feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3)) - - -def kinetics600(is_training): - """Generated Kinectics 600 dataset configs.""" - return DataConfig( - name='kinetics600', - num_classes=600, - is_training=is_training, - split='train' if is_training else 'valid', - drop_remainder=is_training, - num_examples=366016 if is_training else 27780, - feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3)) - - -def kinetics700(is_training): - """Generated Kinectics 600 dataset configs.""" - return DataConfig( - name='kinetics700', - num_classes=700, - is_training=is_training, - split='train' if is_training else 'valid', - drop_remainder=is_training, - num_examples=522883 if is_training else 33441, - feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3)) - - -def kinetics700_2020(is_training): - """Generated Kinectics 600 dataset configs.""" - return DataConfig( - name='kinetics700', - num_classes=700, - is_training=is_training, - split='train' if is_training else 'valid', - drop_remainder=is_training, - num_examples=535982 if is_training else 33640, - feature_shape=(64, 224, 224, 3) if is_training else (250, 224, 224, 3)) - - -@dataclasses.dataclass -class VideoClassificationModel(hyperparams.Config): - """The model config.""" - model_type: str = 'video_classification' - backbone: backbones_3d.Backbone3D = backbones_3d.Backbone3D( - type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()) - norm_activation: common.NormActivation = common.NormActivation( - use_sync_bn=False) - dropout_rate: float = 0.2 - aggregate_endpoints: bool = False - require_endpoints: Optional[Tuple[str, ...]] = None - - -@dataclasses.dataclass -class Losses(hyperparams.Config): - one_hot: bool = True - label_smoothing: float = 0.0 - l2_weight_decay: float = 0.0 - - -@dataclasses.dataclass -class Metrics(hyperparams.Config): - use_per_class_recall: bool = False - - -@dataclasses.dataclass -class VideoClassificationTask(cfg.TaskConfig): - """The task config.""" - model: VideoClassificationModel = VideoClassificationModel() - train_data: DataConfig = DataConfig(is_training=True, drop_remainder=True) - validation_data: DataConfig = DataConfig( - is_training=False, drop_remainder=False) - losses: Losses = Losses() - metrics: Metrics = Metrics() - init_checkpoint: Optional[str] = None - init_checkpoint_modules: str = 'all' # all or backbone - # Spatial Partitioning fields. 
- train_input_partition_dims: Optional[Tuple[int, ...]] = None - eval_input_partition_dims: Optional[Tuple[int, ...]] = None - - -def add_trainer(experiment: cfg.ExperimentConfig, - train_batch_size: int, - eval_batch_size: int, - learning_rate: float = 1.6, - train_epochs: int = 44, - warmup_epochs: int = 5): - """Add and config a trainer to the experiment config.""" - if experiment.task.train_data.num_examples <= 0: - raise ValueError('Wrong train dataset size {!r}'.format( - experiment.task.train_data)) - if experiment.task.validation_data.num_examples <= 0: - raise ValueError('Wrong validation dataset size {!r}'.format( - experiment.task.validation_data)) - experiment.task.train_data.global_batch_size = train_batch_size - experiment.task.validation_data.global_batch_size = eval_batch_size - steps_per_epoch = experiment.task.train_data.num_examples // train_batch_size - experiment.trainer = cfg.TrainerConfig( - steps_per_loop=steps_per_epoch, - summary_interval=steps_per_epoch, - checkpoint_interval=steps_per_epoch, - train_steps=train_epochs * steps_per_epoch, - validation_steps=experiment.task.validation_data.num_examples // - eval_batch_size, - validation_interval=steps_per_epoch, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9, - 'nesterov': True, - } - }, - 'learning_rate': { - 'type': 'cosine', - 'cosine': { - 'initial_learning_rate': learning_rate, - 'decay_steps': train_epochs * steps_per_epoch, - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': warmup_epochs * steps_per_epoch, - 'warmup_learning_rate': 0 - } - } - })) - return experiment - - -@exp_factory.register_config_factory('video_classification') -def video_classification() -> cfg.ExperimentConfig: - """Video classification general.""" - return cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=VideoClassificationTask(), - trainer=cfg.TrainerConfig(), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - 'task.train_data.num_classes == task.validation_data.num_classes', - ]) - - -@exp_factory.register_config_factory('video_classification_ucf101') -def video_classification_ucf101() -> cfg.ExperimentConfig: - """Video classification on UCF-101 with resnet.""" - train_dataset = DataConfig( - name='ucf101', - num_classes=101, - is_training=True, - split='train', - drop_remainder=True, - num_examples=9537, - temporal_stride=2, - feature_shape=(32, 224, 224, 3)) - train_dataset.tfds_name = 'ucf101' - train_dataset.tfds_split = 'train' - validation_dataset = DataConfig( - name='ucf101', - num_classes=101, - is_training=True, - split='test', - drop_remainder=False, - num_examples=3783, - temporal_stride=2, - feature_shape=(32, 224, 224, 3)) - validation_dataset.tfds_name = 'ucf101' - validation_dataset.tfds_split = 'test' - task = VideoClassificationTask( - model=VideoClassificationModel( - backbone=backbones_3d.Backbone3D( - type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()), - norm_activation=common.NormActivation( - norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)), - losses=Losses(l2_weight_decay=1e-4), - train_data=train_dataset, - validation_data=validation_dataset) - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=task, - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - 'task.train_data.num_classes == 
task.validation_data.num_classes', - ]) - add_trainer( - config, - train_batch_size=64, - eval_batch_size=16, - learning_rate=0.8, - train_epochs=100) - return config - - -@exp_factory.register_config_factory('video_classification_kinetics400') -def video_classification_kinetics400() -> cfg.ExperimentConfig: - """Video classification on Kinectics 400 with resnet.""" - train_dataset = kinetics400(is_training=True) - validation_dataset = kinetics400(is_training=False) - task = VideoClassificationTask( - model=VideoClassificationModel( - backbone=backbones_3d.Backbone3D( - type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()), - norm_activation=common.NormActivation( - norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)), - losses=Losses(l2_weight_decay=1e-4), - train_data=train_dataset, - validation_data=validation_dataset) - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=task, - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - 'task.train_data.num_classes == task.validation_data.num_classes', - ]) - add_trainer(config, train_batch_size=1024, eval_batch_size=64) - return config - - -@exp_factory.register_config_factory('video_classification_kinetics600') -def video_classification_kinetics600() -> cfg.ExperimentConfig: - """Video classification on Kinectics 600 with resnet.""" - train_dataset = kinetics600(is_training=True) - validation_dataset = kinetics600(is_training=False) - task = VideoClassificationTask( - model=VideoClassificationModel( - backbone=backbones_3d.Backbone3D( - type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()), - norm_activation=common.NormActivation( - norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)), - losses=Losses(l2_weight_decay=1e-4), - train_data=train_dataset, - validation_data=validation_dataset) - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=task, - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - 'task.train_data.num_classes == task.validation_data.num_classes', - ]) - add_trainer(config, train_batch_size=1024, eval_batch_size=64) - return config - - -@exp_factory.register_config_factory('video_classification_kinetics700') -def video_classification_kinetics700() -> cfg.ExperimentConfig: - """Video classification on Kinectics 700 with resnet.""" - train_dataset = kinetics700(is_training=True) - validation_dataset = kinetics700(is_training=False) - task = VideoClassificationTask( - model=VideoClassificationModel( - backbone=backbones_3d.Backbone3D( - type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()), - norm_activation=common.NormActivation( - norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)), - losses=Losses(l2_weight_decay=1e-4), - train_data=train_dataset, - validation_data=validation_dataset) - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=task, - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - 'task.train_data.num_classes == task.validation_data.num_classes', - ]) - add_trainer(config, train_batch_size=1024, eval_batch_size=64) - return config - - -@exp_factory.register_config_factory('video_classification_kinetics700_2020') -def video_classification_kinetics700_2020() -> cfg.ExperimentConfig: - """Video classification on Kinectics 700 2020 with resnet.""" - train_dataset = 
kinetics700_2020(is_training=True) - validation_dataset = kinetics700_2020(is_training=False) - task = VideoClassificationTask( - model=VideoClassificationModel( - backbone=backbones_3d.Backbone3D( - type='resnet_3d', resnet_3d=backbones_3d.ResNet3D50()), - norm_activation=common.NormActivation( - norm_momentum=0.9, norm_epsilon=1e-5, use_sync_bn=False)), - losses=Losses(l2_weight_decay=1e-4), - train_data=train_dataset, - validation_data=validation_dataset) - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype='bfloat16'), - task=task, - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - 'task.train_data.num_classes == task.validation_data.num_classes', - ]) - add_trainer(config, train_batch_size=1024, eval_batch_size=64) - return config diff --git a/official/vision/beta/configs/video_classification_test.py b/official/vision/beta/configs/video_classification_test.py deleted file mode 100644 index 5608673cc..000000000 --- a/official/vision/beta/configs/video_classification_test.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for video_classification.""" - -# pylint: disable=unused-import -from absl.testing import parameterized -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.vision import beta -from official.vision.beta.configs import video_classification as exp_cfg - - -class VideoClassificationConfigTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters(('video_classification',), - ('video_classification_kinetics600',)) - def test_video_classification_configs(self, config_name): - config = exp_factory.get_exp_config(config_name) - self.assertIsInstance(config, cfg.ExperimentConfig) - self.assertIsInstance(config.task, exp_cfg.VideoClassificationTask) - self.assertIsInstance(config.task.model, exp_cfg.VideoClassificationModel) - self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig) - config.validate() - config.task.train_data.is_training = None - with self.assertRaises(KeyError): - config.validate() - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/data/__init__.py b/official/vision/beta/data/__init__.py deleted file mode 100644 index 310bfb28f..000000000 --- a/official/vision/beta/data/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - diff --git a/official/vision/beta/data/create_coco_tf_record.py b/official/vision/beta/data/create_coco_tf_record.py deleted file mode 100644 index 531500db2..000000000 --- a/official/vision/beta/data/create_coco_tf_record.py +++ /dev/null @@ -1,554 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Convert raw COCO dataset to TFRecord format. - -This scripts follows the label map decoder format and supports detection -boxes, instance masks and captions. - -Example usage: - python create_coco_tf_record.py --logtostderr \ - --image_dir="${TRAIN_IMAGE_DIR}" \ - --image_info_file="${TRAIN_IMAGE_INFO_FILE}" \ - --object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \ - --caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \ - --output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \ - --num_shards=100 -""" - -import collections -import json -import logging -import os - -from absl import app # pylint:disable=unused-import -from absl import flags -import numpy as np - -from pycocotools import mask -import tensorflow as tf - -import multiprocessing as mp -from official.vision.beta.data import tfrecord_lib - - -flags.DEFINE_boolean( - 'include_masks', False, 'Whether to include instance segmentations masks ' - '(PNG encoded) in the result. default: False.') -flags.DEFINE_multi_string('image_dir', '', 'Directory containing images.') -flags.DEFINE_string( - 'image_info_file', '', 'File containing image information. ' - 'Tf Examples in the output files correspond to the image ' - 'info entries in this file. If this file is not provided ' - 'object_annotations_file is used if present. Otherwise, ' - 'caption_annotations_file is used to get image info.') -flags.DEFINE_string( - 'object_annotations_file', '', 'File containing object ' - 'annotations - boxes and instance masks.') -flags.DEFINE_string('caption_annotations_file', '', 'File containing image ' - 'captions.') -flags.DEFINE_string('panoptic_annotations_file', '', 'File containing panoptic ' - 'annotations.') -flags.DEFINE_string('panoptic_masks_dir', '', - 'Directory containing panoptic masks annotations.') -flags.DEFINE_boolean( - 'include_panoptic_masks', False, 'Whether to include category and ' - 'instance masks in the result. 
These are required to run the PQ evaluator ' - 'default: False.') -flags.DEFINE_string('output_file_prefix', '/tmp/train', 'Path to output file') -flags.DEFINE_integer('num_shards', 32, 'Number of shards for output file.') - -FLAGS = flags.FLAGS - -logger = tf.get_logger() -logger.setLevel(logging.INFO) - -_VOID_LABEL = 0 -_VOID_INSTANCE_ID = 0 -_THING_CLASS_ID = 1 -_STUFF_CLASSES_OFFSET = 90 - - -def coco_segmentation_to_mask_png(segmentation, height, width, is_crowd): - """Encode a COCO mask segmentation as PNG string.""" - run_len_encoding = mask.frPyObjects(segmentation, height, width) - binary_mask = mask.decode(run_len_encoding) - if not is_crowd: - binary_mask = np.amax(binary_mask, axis=2) - - return tfrecord_lib.encode_mask_as_png(binary_mask) - - -def generate_coco_panoptics_masks(segments_info, mask_path, - include_panoptic_masks, - is_category_thing): - """Creates masks for panoptic segmentation task. - - Args: - segments_info: a list of dicts, where each dict has keys: [u'id', - u'category_id', u'area', u'bbox', u'iscrowd'], detailing information for - each segment in the panoptic mask. - mask_path: path to the panoptic mask. - include_panoptic_masks: bool, when set to True, category and instance - masks are included in the outputs. Set this to True, when using - the Panoptic Quality evaluator. - is_category_thing: a dict with category ids as keys and, 0/1 as values to - represent "stuff" and "things" classes respectively. - - Returns: - A dict with with keys: [u'semantic_segmentation_mask', u'category_mask', - u'instance_mask']. The dict contains 'category_mask' and 'instance_mask' - only if `include_panoptic_eval_masks` is set to True. - """ - rgb_mask = tfrecord_lib.read_image(mask_path) - r, g, b = np.split(rgb_mask, 3, axis=-1) - - # decode rgb encoded panoptic mask to get segments ids - # refer https://cocodataset.org/#format-data - segments_encoded_mask = (r + g * 256 + b * (256**2)).squeeze() - - semantic_segmentation_mask = np.ones_like( - segments_encoded_mask, dtype=np.uint8) * _VOID_LABEL - if include_panoptic_masks: - category_mask = np.ones_like( - segments_encoded_mask, dtype=np.uint8) * _VOID_LABEL - instance_mask = np.ones_like( - segments_encoded_mask, dtype=np.uint8) * _VOID_INSTANCE_ID - - for idx, segment in enumerate(segments_info): - segment_id = segment['id'] - category_id = segment['category_id'] - - if is_category_thing[category_id]: - encoded_category_id = _THING_CLASS_ID - instance_id = idx + 1 - else: - encoded_category_id = category_id - _STUFF_CLASSES_OFFSET - instance_id = _VOID_INSTANCE_ID - - segment_mask = (segments_encoded_mask == segment_id) - semantic_segmentation_mask[segment_mask] = encoded_category_id - - if include_panoptic_masks: - category_mask[segment_mask] = category_id - instance_mask[segment_mask] = instance_id - - outputs = { - 'semantic_segmentation_mask': tfrecord_lib.encode_mask_as_png( - semantic_segmentation_mask) - } - - if include_panoptic_masks: - outputs.update({ - 'category_mask': tfrecord_lib.encode_mask_as_png(category_mask), - 'instance_mask': tfrecord_lib.encode_mask_as_png(instance_mask) - }) - return outputs - - -def coco_annotations_to_lists(bbox_annotations, id_to_name_map, - image_height, image_width, include_masks): - """Converts COCO annotations to feature lists.""" - - data = dict((k, list()) for k in - ['xmin', 'xmax', 'ymin', 'ymax', 'is_crowd', - 'category_id', 'category_names', 'area']) - if include_masks: - data['encoded_mask_png'] = [] - - num_annotations_skipped = 0 - - for object_annotations in 
bbox_annotations: - (x, y, width, height) = tuple(object_annotations['bbox']) - - if width <= 0 or height <= 0: - num_annotations_skipped += 1 - continue - if x + width > image_width or y + height > image_height: - num_annotations_skipped += 1 - continue - data['xmin'].append(float(x) / image_width) - data['xmax'].append(float(x + width) / image_width) - data['ymin'].append(float(y) / image_height) - data['ymax'].append(float(y + height) / image_height) - data['is_crowd'].append(object_annotations['iscrowd']) - category_id = int(object_annotations['category_id']) - data['category_id'].append(category_id) - data['category_names'].append(id_to_name_map[category_id].encode('utf8')) - data['area'].append(object_annotations['area']) - - if include_masks: - data['encoded_mask_png'].append( - coco_segmentation_to_mask_png(object_annotations['segmentation'], - image_height, image_width, - object_annotations['iscrowd']) - ) - - return data, num_annotations_skipped - - -def bbox_annotations_to_feature_dict( - bbox_annotations, image_height, image_width, id_to_name_map, include_masks): - """Convert COCO annotations to an encoded feature dict.""" - - data, num_skipped = coco_annotations_to_lists( - bbox_annotations, id_to_name_map, image_height, image_width, - include_masks) - feature_dict = { - 'image/object/bbox/xmin': - tfrecord_lib.convert_to_feature(data['xmin']), - 'image/object/bbox/xmax': - tfrecord_lib.convert_to_feature(data['xmax']), - 'image/object/bbox/ymin': - tfrecord_lib.convert_to_feature(data['ymin']), - 'image/object/bbox/ymax': - tfrecord_lib.convert_to_feature(data['ymax']), - 'image/object/class/text': - tfrecord_lib.convert_to_feature(data['category_names']), - 'image/object/class/label': - tfrecord_lib.convert_to_feature(data['category_id']), - 'image/object/is_crowd': - tfrecord_lib.convert_to_feature(data['is_crowd']), - 'image/object/area': - tfrecord_lib.convert_to_feature(data['area']), - } - if include_masks: - feature_dict['image/object/mask'] = ( - tfrecord_lib.convert_to_feature(data['encoded_mask_png'])) - - return feature_dict, num_skipped - - -def encode_caption_annotations(caption_annotations): - captions = [] - for caption_annotation in caption_annotations: - captions.append(caption_annotation['caption'].encode('utf8')) - - return captions - - -def create_tf_example(image, - image_dirs, - panoptic_masks_dir=None, - bbox_annotations=None, - id_to_name_map=None, - caption_annotations=None, - panoptic_annotation=None, - is_category_thing=None, - include_panoptic_masks=False, - include_masks=False): - """Converts image and annotations to a tf.Example proto. - - Args: - image: dict with keys: [u'license', u'file_name', u'coco_url', u'height', - u'width', u'date_captured', u'flickr_url', u'id'] - image_dirs: list of directories containing the image files. - panoptic_masks_dir: `str` of the panoptic masks directory. - bbox_annotations: - list of dicts with keys: [u'segmentation', u'area', u'iscrowd', - u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box - coordinates in the official COCO dataset are given as [x, y, width, - height] tuples using absolute coordinates where x, y represent the - top-left (0-indexed) corner. This function converts to the format - expected by the Tensorflow Object Detection API (which is which is - [ymin, xmin, ymax, xmax] with coordinates normalized relative to image - size). - id_to_name_map: a dict mapping category IDs to string names. - caption_annotations: - list of dict with keys: [u'id', u'image_id', u'str']. 
- panoptic_annotation: dict with keys: [u'image_id', u'file_name', - u'segments_info']. Where the value for segments_info is a list of dicts, - with each dict containing information for a single segment in the mask. - is_category_thing: `bool`, whether it is a category thing. - include_panoptic_masks: `bool`, whether to include panoptic masks. - include_masks: Whether to include instance segmentations masks - (PNG encoded) in the result. default: False. - - Returns: - example: The converted tf.Example - num_annotations_skipped: Number of (invalid) annotations that were ignored. - - Raises: - ValueError: if the image pointed to by data['filename'] is not a valid JPEG, - does not exist, or is not unique across image directories. - """ - image_height = image['height'] - image_width = image['width'] - filename = image['file_name'] - image_id = image['id'] - - if len(image_dirs) > 1: - full_paths = [os.path.join(image_dir, filename) for image_dir in image_dirs] - full_existing_paths = [p for p in full_paths if tf.io.gfile.exists(p)] - if not full_existing_paths: - raise ValueError( - '{} does not exist across image directories.'.format(filename)) - if len(full_existing_paths) > 1: - raise ValueError( - '{} is not unique across image directories'.format(filename)) - full_path, = full_existing_paths - # If there is only one image directory, it's not worth checking for existence, - # since trying to open the file will raise an informative error message if it - # does not exist. - else: - image_dir, = image_dirs - full_path = os.path.join(image_dir, filename) - - with tf.io.gfile.GFile(full_path, 'rb') as fid: - encoded_jpg = fid.read() - - feature_dict = tfrecord_lib.image_info_to_feature_dict( - image_height, image_width, filename, image_id, encoded_jpg, 'jpg') - - num_annotations_skipped = 0 - if bbox_annotations: - box_feature_dict, num_skipped = bbox_annotations_to_feature_dict( - bbox_annotations, image_height, image_width, id_to_name_map, - include_masks) - num_annotations_skipped += num_skipped - feature_dict.update(box_feature_dict) - - if caption_annotations: - encoded_captions = encode_caption_annotations(caption_annotations) - feature_dict.update( - {'image/caption': tfrecord_lib.convert_to_feature(encoded_captions)}) - - if panoptic_annotation: - segments_info = panoptic_annotation['segments_info'] - panoptic_mask_filename = os.path.join( - panoptic_masks_dir, - panoptic_annotation['file_name']) - encoded_panoptic_masks = generate_coco_panoptics_masks( - segments_info, panoptic_mask_filename, include_panoptic_masks, - is_category_thing) - feature_dict.update( - {'image/segmentation/class/encoded': tfrecord_lib.convert_to_feature( - encoded_panoptic_masks['semantic_segmentation_mask'])}) - - if include_panoptic_masks: - feature_dict.update({ - 'image/panoptic/category_mask': tfrecord_lib.convert_to_feature( - encoded_panoptic_masks['category_mask']), - 'image/panoptic/instance_mask': tfrecord_lib.convert_to_feature( - encoded_panoptic_masks['instance_mask']) - }) - - example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) - return example, num_annotations_skipped - - -def _load_object_annotations(object_annotations_file): - """Loads object annotation JSON file.""" - with tf.io.gfile.GFile(object_annotations_file, 'r') as fid: - obj_annotations = json.load(fid) - - images = obj_annotations['images'] - id_to_name_map = dict((element['id'], element['name']) for element in - obj_annotations['categories']) - - img_to_obj_annotation = collections.defaultdict(list) - 
logging.info('Building bounding box index.') - for annotation in obj_annotations['annotations']: - image_id = annotation['image_id'] - img_to_obj_annotation[image_id].append(annotation) - - missing_annotation_count = 0 - for image in images: - image_id = image['id'] - if image_id not in img_to_obj_annotation: - missing_annotation_count += 1 - - logging.info('%d images are missing bboxes.', missing_annotation_count) - - return img_to_obj_annotation, id_to_name_map - - -def _load_caption_annotations(caption_annotations_file): - """Loads caption annotation JSON file.""" - with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid: - caption_annotations = json.load(fid) - - img_to_caption_annotation = collections.defaultdict(list) - logging.info('Building caption index.') - for annotation in caption_annotations['annotations']: - image_id = annotation['image_id'] - img_to_caption_annotation[image_id].append(annotation) - - missing_annotation_count = 0 - images = caption_annotations['images'] - for image in images: - image_id = image['id'] - if image_id not in img_to_caption_annotation: - missing_annotation_count += 1 - - logging.info('%d images are missing captions.', missing_annotation_count) - - return img_to_caption_annotation - - -def _load_panoptic_annotations(panoptic_annotations_file): - """Loads panoptic annotation from file.""" - with tf.io.gfile.GFile(panoptic_annotations_file, 'r') as fid: - panoptic_annotations = json.load(fid) - - img_to_panoptic_annotation = dict() - logging.info('Building panoptic index.') - for annotation in panoptic_annotations['annotations']: - image_id = annotation['image_id'] - img_to_panoptic_annotation[image_id] = annotation - - is_category_thing = dict() - for category_info in panoptic_annotations['categories']: - is_category_thing[category_info['id']] = category_info['isthing'] == 1 - - missing_annotation_count = 0 - images = panoptic_annotations['images'] - for image in images: - image_id = image['id'] - if image_id not in img_to_panoptic_annotation: - missing_annotation_count += 1 - logging.info( - '%d images are missing panoptic annotations.', missing_annotation_count) - - return img_to_panoptic_annotation, is_category_thing - - -def _load_images_info(images_info_file): - with tf.io.gfile.GFile(images_info_file, 'r') as fid: - info_dict = json.load(fid) - return info_dict['images'] - - -def generate_annotations(images, image_dirs, - panoptic_masks_dir=None, - img_to_obj_annotation=None, - img_to_caption_annotation=None, - img_to_panoptic_annotation=None, - is_category_thing=None, - id_to_name_map=None, - include_panoptic_masks=False, - include_masks=False): - """Generator for COCO annotations.""" - for image in images: - object_annotation = (img_to_obj_annotation.get(image['id'], None) if - img_to_obj_annotation else None) - - caption_annotaion = (img_to_caption_annotation.get(image['id'], None) if - img_to_caption_annotation else None) - - panoptic_annotation = (img_to_panoptic_annotation.get(image['id'], None) if - img_to_panoptic_annotation else None) - yield (image, image_dirs, panoptic_masks_dir, object_annotation, - id_to_name_map, caption_annotaion, panoptic_annotation, - is_category_thing, include_panoptic_masks, include_masks) - - -def _create_tf_record_from_coco_annotations(images_info_file, - image_dirs, - output_path, - num_shards, - object_annotations_file=None, - caption_annotations_file=None, - panoptic_masks_dir=None, - panoptic_annotations_file=None, - include_panoptic_masks=False, - include_masks=False): - """Loads COCO annotation 
json files and converts to tf.Record format. - - Args: - images_info_file: JSON file containing image info. The number of tf.Examples - in the output tf Record files is exactly equal to the number of image info - entries in this file. This can be any of train/val/test annotation json - files Eg. 'image_info_test-dev2017.json', - 'instance_annotations_train2017.json', - 'caption_annotations_train2017.json', etc. - image_dirs: List of directories containing the image files. - output_path: Path to output tf.Record file. - num_shards: Number of output files to create. - object_annotations_file: JSON file containing bounding box annotations. - caption_annotations_file: JSON file containing caption annotations. - panoptic_masks_dir: Directory containing panoptic masks. - panoptic_annotations_file: JSON file containing panoptic annotations. - include_panoptic_masks: Whether to include 'category_mask' - and 'instance_mask', which is required by the panoptic quality evaluator. - include_masks: Whether to include instance segmentations masks - (PNG encoded) in the result. default: False. - """ - - logging.info('writing to output path: %s', output_path) - - images = _load_images_info(images_info_file) - - img_to_obj_annotation = None - img_to_caption_annotation = None - id_to_name_map = None - img_to_panoptic_annotation = None - is_category_thing = None - if object_annotations_file: - img_to_obj_annotation, id_to_name_map = ( - _load_object_annotations(object_annotations_file)) - if caption_annotations_file: - img_to_caption_annotation = ( - _load_caption_annotations(caption_annotations_file)) - if panoptic_annotations_file: - img_to_panoptic_annotation, is_category_thing = ( - _load_panoptic_annotations(panoptic_annotations_file)) - - coco_annotations_iter = generate_annotations( - images=images, - image_dirs=image_dirs, - panoptic_masks_dir=panoptic_masks_dir, - img_to_obj_annotation=img_to_obj_annotation, - img_to_caption_annotation=img_to_caption_annotation, - img_to_panoptic_annotation=img_to_panoptic_annotation, - is_category_thing=is_category_thing, - id_to_name_map=id_to_name_map, - include_panoptic_masks=include_panoptic_masks, - include_masks=include_masks) - - num_skipped = tfrecord_lib.write_tf_record_dataset( - output_path, coco_annotations_iter, create_tf_example, num_shards) - - logging.info('Finished writing, skipped %d annotations.', num_skipped) - - -def main(_): - assert FLAGS.image_dir, '`image_dir` missing.' 
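Once the converter has run, the output shards follow the `'<output_prefix>-%05d-of-%05d.tfrecord'` naming used by `tfrecord_lib.write_tf_record_dataset` (shown further down). A sketch of reading them back, assuming a made-up `/tmp/coco/train` prefix:

```python
import tensorflow as tf

# Glob the shards and interleave reads across them.
filenames = tf.data.Dataset.list_files('/tmp/coco/train-*.tfrecord', shuffle=True)
dataset = filenames.interleave(
    tf.data.TFRecordDataset,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
    deterministic=False)

# Inspect one raw record to see which feature keys were written.
for raw_record in dataset.take(1):
  example = tf.train.Example.FromString(raw_record.numpy())
  print(sorted(example.features.feature.keys()))
  # e.g. ['image/encoded', 'image/height', ..., 'image/object/bbox/xmin', ...]
```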
- assert (FLAGS.image_info_file or FLAGS.object_annotations_file or - FLAGS.caption_annotations_file), ('All annotation files are ' - 'missing.') - if FLAGS.image_info_file: - images_info_file = FLAGS.image_info_file - elif FLAGS.object_annotations_file: - images_info_file = FLAGS.object_annotations_file - else: - images_info_file = FLAGS.caption_annotations_file - - directory = os.path.dirname(FLAGS.output_file_prefix) - if not tf.io.gfile.isdir(directory): - tf.io.gfile.makedirs(directory) - - _create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir, - FLAGS.output_file_prefix, - FLAGS.num_shards, - FLAGS.object_annotations_file, - FLAGS.caption_annotations_file, - FLAGS.panoptic_masks_dir, - FLAGS.panoptic_annotations_file, - FLAGS.include_panoptic_masks, - FLAGS.include_masks) - - -if __name__ == '__main__': - app.run(main) diff --git a/official/vision/beta/data/process_coco_few_shot.sh b/official/vision/beta/data/process_coco_few_shot.sh deleted file mode 100644 index 2fec6cb3f..000000000 --- a/official/vision/beta/data/process_coco_few_shot.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# -# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`. - -tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX) -base_image_dir="/tmp/coco_images" -output_dir="/tmp/coco_few_shot" -while getopts ":i:o:" o; do - case "${o}" in - o) output_dir=${OPTARG} ;; - i) base_image_dir=${OPTARG} ;; - *) echo "Usage: ${0} [-i ] [-o ]" 1>&2; exit 1 ;; - esac -done - -cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit" -wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \ - -P "${tmp_dir}" -A "trainvalno5k.json,5k.json,*1shot*.json,*3shot*.json,*5shot*.json,*10shot*.json,*30shot*.json" \ - "http://${cocosplit_url}/" -mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}" -rm -rf "${tmp_dir}/${cocosplit_url}/" - -python process_coco_few_shot_json_files.py \ - --logtostderr --workdir="${tmp_dir}" - -for seed in {0..9}; do - for shots in 1 3 5 10 30; do - python create_coco_tf_record.py \ - --logtostderr \ - --image_dir="${base_image_dir}/train2014" \ - --image_dir="${base_image_dir}/val2014" \ - --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \ - --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \ - --caption_annotations_file="" \ - --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \ - --num_shards=4 - done -done - -python create_coco_tf_record.py \ - --logtostderr \ - --image_dir="${base_image_dir}/train2014" \ - --image_dir="${base_image_dir}/val2014" \ - --image_info_file="${tmp_dir}/datasplit/5k.json" \ - --object_annotations_file="${tmp_dir}/datasplit/5k.json" \ - --caption_annotations_file="" \ - --output_file_prefix="${output_dir}/5k" \ - --num_shards=10 - -python create_coco_tf_record.py \ - --logtostderr \ - --image_dir="${base_image_dir}/train2014" \ - --image_dir="${base_image_dir}/val2014" \ - --image_info_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \ - --object_annotations_file="${tmp_dir}/datasplit/trainvalno5k_base.json" \ - --caption_annotations_file="" \ - --output_file_prefix="${output_dir}/trainvalno5k_base" \ - --num_shards=200 - -python create_coco_tf_record.py \ - --logtostderr \ - --image_dir="${base_image_dir}/train2014" \ - --image_dir="${base_image_dir}/val2014" \ - --image_info_file="${tmp_dir}/datasplit/5k_base.json" \ - --object_annotations_file="${tmp_dir}/datasplit/5k_base.json" \ - --caption_annotations_file="" \ - --output_file_prefix="${output_dir}/5k_base" \ - --num_shards=10 - -rm -rf 
"${tmp_dir}" diff --git a/official/vision/beta/data/process_coco_few_shot_json_files.py b/official/vision/beta/data/process_coco_few_shot_json_files.py deleted file mode 100644 index 7a918c511..000000000 --- a/official/vision/beta/data/process_coco_few_shot_json_files.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Processes the JSON files for COCO few-shot. - -We assume that `workdir` mirrors the contents of -http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON -files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s -"Frustratingly Simple Few-Shot Object Detection" paper uses. -""" - -import collections -import itertools -import json -import logging -import os - -from absl import app -from absl import flags - -import tensorflow as tf - -logger = tf.get_logger() -logger.setLevel(logging.INFO) - -flags.DEFINE_string('workdir', None, 'Working directory.') - -FLAGS = flags.FLAGS -CATEGORIES = ['airplane', 'apple', 'backpack', 'banana', 'baseball bat', - 'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird', - 'boat', 'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake', - 'car', 'carrot', 'cat', 'cell phone', 'chair', 'clock', 'couch', - 'cow', 'cup', 'dining table', 'dog', 'donut', 'elephant', - 'fire hydrant', 'fork', 'frisbee', 'giraffe', 'hair drier', - 'handbag', 'horse', 'hot dog', 'keyboard', 'kite', 'knife', - 'laptop', 'microwave', 'motorcycle', 'mouse', 'orange', 'oven', - 'parking meter', 'person', 'pizza', 'potted plant', - 'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep', - 'sink', 'skateboard', 'skis', 'snowboard', 'spoon', 'sports ball', - 'stop sign', 'suitcase', 'surfboard', 'teddy bear', - 'tennis racket', 'tie', 'toaster', 'toilet', 'toothbrush', - 'traffic light', 'train', 'truck', 'tv', 'umbrella', 'vase', - 'wine glass', 'zebra'] -SEEDS = list(range(10)) -SHOTS = [1, 3, 5, 10, 30] - -FILE_SUFFIXES = collections.defaultdict(list) -for _seed, _shots in itertools.product(SEEDS, SHOTS): - for _category in CATEGORIES: - FILE_SUFFIXES[(_seed, _shots)].append( - '{}full_box_{}shot_{}_trainval.json'.format( - # http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so: - # - # datasplit/ - # trainvalno5k.json - # 5k.json - # full_box_{1,2,3,5,10,30}shot_{category}_trainval.json - # seed{1-9}/ - # full_box_{1,2,3,5,10,30}shot_{category}_trainval.json - # - # This means that the JSON files for seed0 are located in the root - # directory rather than in a `seed?/` subdirectory, hence the - # conditional expression below. 
- '' if _seed == 0 else 'seed{}/'.format(_seed), - _shots, - _category)) - -# Base class IDs, as defined in -# https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/evaluation/coco_evaluation.py#L60-L65 -BASE_CLASS_IDS = [8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51, - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 65, 70, 73, 74, 75, - 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] - - -def main(unused_argv): - workdir = FLAGS.workdir - - # Filter novel class annotations from the training and validation sets. - for name in ('trainvalno5k', '5k'): - file_path = os.path.join(workdir, 'datasplit', '{}.json'.format(name)) - with tf.io.gfile.GFile(file_path, 'r') as f: - json_dict = json.load(f) - - json_dict['annotations'] = [a for a in json_dict['annotations'] - if a['category_id'] in BASE_CLASS_IDS] - output_path = os.path.join( - workdir, 'datasplit', '{}_base.json'.format(name)) - with tf.io.gfile.GFile(output_path, 'w') as f: - json.dump(json_dict, f) - - for seed, shots in itertools.product(SEEDS, SHOTS): - # Retrieve all examples for a given seed and shots setting. - file_paths = [os.path.join(workdir, suffix) - for suffix in FILE_SUFFIXES[(seed, shots)]] - json_dicts = [] - for file_path in file_paths: - with tf.io.gfile.GFile(file_path, 'r') as f: - json_dicts.append(json.load(f)) - - # Make sure that all JSON files for a given seed and shots setting have the - # same metadata. We count on this to fuse them later on. - metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'], - 'categories': d['categories']} for d in json_dicts] - if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]): - raise RuntimeError( - 'JSON files for {} shots (seed {}) '.format(shots, seed) + - 'have different info, licences, or categories fields') - - # Retrieve images across all JSON files. - images = sum((d['images'] for d in json_dicts), []) - # Remove duplicate image entries. 
- images = list({image['id']: image for image in images}.values()) - - output_dict = { - 'info': json_dicts[0]['info'], - 'licenses': json_dicts[0]['licenses'], - 'categories': json_dicts[0]['categories'], - 'images': images, - 'annotations': sum((d['annotations'] for d in json_dicts), []) - } - - output_path = os.path.join(workdir, - '{}shot_seed{}.json'.format(shots, seed)) - with tf.io.gfile.GFile(output_path, 'w') as f: - json.dump(output_dict, f) - logger.info('Processed %d shots (seed %d) and saved to %s', - shots, seed, output_path) - - -if __name__ == '__main__': - flags.mark_flag_as_required('workdir') - app.run(main) diff --git a/official/vision/beta/data/process_coco_panoptic.sh b/official/vision/beta/data/process_coco_panoptic.sh deleted file mode 100644 index fd7003974..000000000 --- a/official/vision/beta/data/process_coco_panoptic.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -sudo apt update -sudo apt install unzip aria2 -y - -DATA_DIR=$1 -aria2c -j 8 -Z \ - http://images.cocodataset.org/annotations/annotations_trainval2017.zip \ - http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip \ - http://images.cocodataset.org/zips/train2017.zip \ - http://images.cocodataset.org/zips/val2017.zip \ - --dir=$DATA_DIR; - -unzip $DATA_DIR/"*".zip -d $DATA_DIR; -mkdir $DATA_DIR/zips && mv $DATA_DIR/*.zip $DATA_DIR/zips; -unzip $DATA_DIR/annotations/panoptic_train2017.zip -d $DATA_DIR -unzip $DATA_DIR/annotations/panoptic_val2017.zip -d $DATA_DIR - -python3 official/vision/beta/data/create_coco_tf_record.py \ - --logtostderr \ - --image_dir="$DATA_DIR/val2017" \ - --object_annotations_file="$DATA_DIR/annotations/instances_val2017.json" \ - --output_file_prefix="$DATA_DIR/tfrecords/val" \ - --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_val2017.json" \ - --panoptic_masks_dir="$DATA_DIR/panoptic_val2017" \ - --num_shards=8 \ - --include_masks \ - --include_panoptic_masks - - -python3 official/vision/beta/data/create_coco_tf_record.py \ - --logtostderr \ - --image_dir="$DATA_DIR/train2017" \ - --object_annotations_file="$DATA_DIR/annotations/instances_train2017.json" \ - --output_file_prefix="$DATA_DIR/tfrecords/train" \ - --panoptic_annotations_file="$DATA_DIR/annotations/panoptic_train2017.json" \ - --panoptic_masks_dir="$DATA_DIR/panoptic_train2017" \ - --num_shards=32 \ - --include_masks \ - --include_panoptic_masks diff --git a/official/vision/beta/data/tfrecord_lib.py b/official/vision/beta/data/tfrecord_lib.py deleted file mode 100644 index c91b2edad..000000000 --- a/official/vision/beta/data/tfrecord_lib.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
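A toy illustration of the merge performed above for one `(seed, shots)` setting: images that appear in several per-category JSON files are de-duplicated by id, while their annotation lists are simply concatenated. The tiny dicts below are made-up stand-ins for the real files.

```python
json_dicts = [
    {'images': [{'id': 1}, {'id': 2}], 'annotations': [{'id': 10, 'image_id': 1}]},
    {'images': [{'id': 2}, {'id': 3}], 'annotations': [{'id': 11, 'image_id': 2}]},
]

# Concatenate image lists, then keep one entry per image id.
images = sum((d['images'] for d in json_dicts), [])
images = list({image['id']: image for image in images}.values())

# Annotations are kept from every file.
annotations = sum((d['annotations'] for d in json_dicts), [])

print([im['id'] for im in images])  # [1, 2, 3] -- duplicate image 2 removed
print(len(annotations))             # 2
```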
- -"""Helper functions for creating TFRecord datasets.""" - -import hashlib -import io -import itertools - -from absl import logging -import numpy as np -from PIL import Image -import tensorflow as tf - -import multiprocessing as mp - - -def convert_to_feature(value, value_type=None): - """Converts the given python object to a tf.train.Feature. - - Args: - value: int, float, bytes or a list of them. - value_type: optional, if specified, forces the feature to be of the given - type. Otherwise, type is inferred automatically. Can be one of - ['bytes', 'int64', 'float', 'bytes_list', 'int64_list', 'float_list'] - - Returns: - feature: A tf.train.Feature object. - """ - - if value_type is None: - - element = value[0] if isinstance(value, list) else value - - if isinstance(element, bytes): - value_type = 'bytes' - - elif isinstance(element, (int, np.integer)): - value_type = 'int64' - - elif isinstance(element, (float, np.floating)): - value_type = 'float' - - else: - raise ValueError('Cannot convert type {} to feature'. - format(type(element))) - - if isinstance(value, list): - value_type = value_type + '_list' - - if value_type == 'int64': - return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) - - elif value_type == 'int64_list': - value = np.asarray(value).astype(np.int64).reshape(-1) - return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) - - elif value_type == 'float': - return tf.train.Feature(float_list=tf.train.FloatList(value=[value])) - - elif value_type == 'float_list': - value = np.asarray(value).astype(np.float32).reshape(-1) - return tf.train.Feature(float_list=tf.train.FloatList(value=value)) - - elif value_type == 'bytes': - return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) - - elif value_type == 'bytes_list': - return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) - - else: - raise ValueError('Unknown value_type parameter - {}'.format(value_type)) - - -def image_info_to_feature_dict(height, width, filename, image_id, - encoded_str, encoded_format): - """Convert image information to a dict of features.""" - - key = hashlib.sha256(encoded_str).hexdigest() - - return { - 'image/height': convert_to_feature(height), - 'image/width': convert_to_feature(width), - 'image/filename': convert_to_feature(filename.encode('utf8')), - 'image/source_id': convert_to_feature(str(image_id).encode('utf8')), - 'image/key/sha256': convert_to_feature(key.encode('utf8')), - 'image/encoded': convert_to_feature(encoded_str), - 'image/format': convert_to_feature(encoded_format.encode('utf8')), - } - - -def read_image(image_path): - pil_image = Image.open(image_path) - return np.asarray(pil_image) - - -def encode_mask_as_png(mask): - pil_image = Image.fromarray(mask) - output_io = io.BytesIO() - pil_image.save(output_io, format='PNG') - return output_io.getvalue() - - -def write_tf_record_dataset(output_path, annotation_iterator, - process_func, num_shards, - use_multiprocessing=True, unpack_arguments=True): - """Iterates over annotations, processes them and writes into TFRecords. - - Args: - output_path: The prefix path to create TF record files. - annotation_iterator: An iterator of tuples containing details about the - dataset. - process_func: A function which takes the elements from the tuples of - annotation_iterator as arguments and returns a tuple of (tf.train.Example, - int). The integer indicates the number of annotations that were skipped. - num_shards: int, the number of shards to write for the dataset. 
- use_multiprocessing: - Whether or not to use multiple processes to write TF Records. - unpack_arguments: - Whether to unpack the tuples from annotation_iterator as individual - arguments to the process func or to pass the returned value as it is. - - Returns: - num_skipped: The total number of skipped annotations. - """ - - writers = [ - tf.io.TFRecordWriter( - output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards)) - for i in range(num_shards) - ] - - total_num_annotations_skipped = 0 - - if use_multiprocessing: - pool = mp.Pool() - if unpack_arguments: - tf_example_iterator = pool.starmap(process_func, annotation_iterator) - else: - tf_example_iterator = pool.imap(process_func, annotation_iterator) - else: - if unpack_arguments: - tf_example_iterator = itertools.starmap(process_func, annotation_iterator) - else: - tf_example_iterator = map(process_func, annotation_iterator) - - for idx, (tf_example, num_annotations_skipped) in enumerate( - tf_example_iterator): - if idx % 100 == 0: - logging.info('On image %d', idx) - - total_num_annotations_skipped += num_annotations_skipped - writers[idx % num_shards].write(tf_example.SerializeToString()) - - if use_multiprocessing: - pool.close() - pool.join() - - for writer in writers: - writer.close() - - logging.info('Finished writing, skipped %d annotations.', - total_num_annotations_skipped) - return total_num_annotations_skipped - - -def check_and_make_dir(directory): - """Creates the directory if it doesn't exist.""" - if not tf.io.gfile.isdir(directory): - tf.io.gfile.makedirs(directory) diff --git a/official/vision/beta/data/tfrecord_lib_test.py b/official/vision/beta/data/tfrecord_lib_test.py deleted file mode 100644 index f6be438cf..000000000 --- a/official/vision/beta/data/tfrecord_lib_test.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
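As a sanity check on the PNG mask path above (`encode_mask_as_png` on the writer side, `tf.io.decode_png` on the reader side), a round-trip sketch; the mask contents are arbitrary.

```python
import io

import numpy as np
from PIL import Image
import tensorflow as tf

# A small uint8 instance mask with a rectangular foreground region.
mask = np.zeros((4, 6), dtype=np.uint8)
mask[1:3, 2:5] = 1

# Encode to PNG bytes the same way encode_mask_as_png does.
buffer = io.BytesIO()
Image.fromarray(mask).save(buffer, format='PNG')
png_bytes = buffer.getvalue()

# Decode on the training side and confirm the round trip is lossless.
decoded = tf.io.decode_png(png_bytes, channels=1)
print(decoded.shape)                                        # (4, 6, 1)
print(bool(np.array_equal(decoded.numpy()[..., 0], mask)))  # True
```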
- -"""Tests for tfrecord_lib.""" - -import os - -from absl import flags -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.data import tfrecord_lib - - -FLAGS = flags.FLAGS - - -def process_sample(x): - d = {'x': x} - return tf.train.Example(features=tf.train.Features(feature=d)), 0 - - -def parse_function(example_proto): - - feature_description = { - 'x': tf.io.FixedLenFeature([], tf.int64, default_value=-1) - } - return tf.io.parse_single_example(example_proto, feature_description) - - -class TfrecordLibTest(parameterized.TestCase): - - def test_write_tf_record_dataset(self): - data = [(tfrecord_lib.convert_to_feature(i),) for i in range(17)] - - path = os.path.join(FLAGS.test_tmpdir, 'train') - - tfrecord_lib.write_tf_record_dataset( - path, data, process_sample, 3, use_multiprocessing=False) - tfrecord_files = tf.io.gfile.glob(path + '*') - - self.assertLen(tfrecord_files, 3) - - dataset = tf.data.TFRecordDataset(tfrecord_files) - dataset = dataset.map(parse_function) - - read_values = set(d['x'] for d in dataset.as_numpy_iterator()) - self.assertSetEqual(read_values, set(range(17))) - - def test_convert_to_feature_float(self): - - proto = tfrecord_lib.convert_to_feature(0.0) - self.assertEqual(proto.float_list.value[0], 0.0) - - def test_convert_to_feature_int(self): - - proto = tfrecord_lib.convert_to_feature(0) - self.assertEqual(proto.int64_list.value[0], 0) - - def test_convert_to_feature_bytes(self): - - proto = tfrecord_lib.convert_to_feature(b'123') - self.assertEqual(proto.bytes_list.value[0], b'123') - - def test_convert_to_feature_float_list(self): - - proto = tfrecord_lib.convert_to_feature([0.0, 1.0]) - self.assertSequenceAlmostEqual(proto.float_list.value, [0.0, 1.0]) - - def test_convert_to_feature_int_list(self): - - proto = tfrecord_lib.convert_to_feature([0, 1]) - self.assertSequenceAlmostEqual(proto.int64_list.value, [0, 1]) - - def test_convert_to_feature_bytes_list(self): - - proto = tfrecord_lib.convert_to_feature([b'123', b'456']) - self.assertSequenceAlmostEqual(proto.bytes_list.value, [b'123', b'456']) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/dataloaders/__init__.py b/official/vision/beta/dataloaders/__init__.py deleted file mode 100644 index 310bfb28f..000000000 --- a/official/vision/beta/dataloaders/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - diff --git a/official/vision/beta/dataloaders/classification_input.py b/official/vision/beta/dataloaders/classification_input.py deleted file mode 100644 index 7bfae3ef5..000000000 --- a/official/vision/beta/dataloaders/classification_input.py +++ /dev/null @@ -1,273 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Classification decoder and parser.""" -from typing import Any, Dict, List, Optional -# Import libraries -import tensorflow as tf - -from official.vision.beta.configs import common -from official.vision.beta.dataloaders import decoder -from official.vision.beta.dataloaders import parser -from official.vision.beta.ops import augment -from official.vision.beta.ops import preprocess_ops - -MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) -STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) - -DEFAULT_IMAGE_FIELD_KEY = 'image/encoded' -DEFAULT_LABEL_FIELD_KEY = 'image/class/label' - - -class Decoder(decoder.Decoder): - """A tf.Example decoder for classification task.""" - - def __init__(self, - image_field_key: str = DEFAULT_IMAGE_FIELD_KEY, - label_field_key: str = DEFAULT_LABEL_FIELD_KEY, - is_multilabel: bool = False, - keys_to_features: Optional[Dict[str, Any]] = None): - if not keys_to_features: - keys_to_features = { - image_field_key: - tf.io.FixedLenFeature((), tf.string, default_value=''), - } - if is_multilabel: - keys_to_features.update( - {label_field_key: tf.io.VarLenFeature(dtype=tf.int64)}) - else: - keys_to_features.update({ - label_field_key: - tf.io.FixedLenFeature((), tf.int64, default_value=-1) - }) - self._keys_to_features = keys_to_features - - def decode(self, serialized_example): - return tf.io.parse_single_example( - serialized_example, self._keys_to_features) - - -class Parser(parser.Parser): - """Parser to parse an image and its annotations into a dictionary of tensors.""" - - def __init__(self, - output_size: List[int], - num_classes: float, - image_field_key: str = DEFAULT_IMAGE_FIELD_KEY, - label_field_key: str = DEFAULT_LABEL_FIELD_KEY, - decode_jpeg_only: bool = True, - aug_rand_hflip: bool = True, - aug_type: Optional[common.Augmentation] = None, - color_jitter: float = 0., - random_erasing: Optional[common.RandomErasing] = None, - is_multilabel: bool = False, - dtype: str = 'float32'): - """Initializes parameters for parsing annotations in the dataset. - - Args: - output_size: `Tensor` or `list` for [height, width] of output image. The - output_size should be divided by the largest feature stride 2^max_level. - num_classes: `float`, number of classes. - image_field_key: `str`, the key name to encoded image in tf.Example. - label_field_key: `str`, the key name to label in tf.Example. - decode_jpeg_only: `bool`, if True, only JPEG format is decoded, this is - faster than decoding other types. Default is True. - aug_rand_hflip: `bool`, if True, augment training with random - horizontal flip. - aug_type: An optional Augmentation object to choose from AutoAugment and - RandAugment. - color_jitter: Magnitude of color jitter. If > 0, the value is used to - generate random scale factor for brightness, contrast and saturation. - See `preprocess_ops.color_jitter` for more details. - random_erasing: if not None, augment input image by random erasing. See - `augment.RandomErasing` for more details. - is_multilabel: A `bool`, whether or not each example has multiple labels. - dtype: `str`, cast output image in dtype. 
It can be 'float32', 'float16', - or 'bfloat16'. - """ - self._output_size = output_size - self._aug_rand_hflip = aug_rand_hflip - self._num_classes = num_classes - self._image_field_key = image_field_key - if dtype == 'float32': - self._dtype = tf.float32 - elif dtype == 'float16': - self._dtype = tf.float16 - elif dtype == 'bfloat16': - self._dtype = tf.bfloat16 - else: - raise ValueError('dtype {!r} is not supported!'.format(dtype)) - if aug_type: - if aug_type.type == 'autoaug': - self._augmenter = augment.AutoAugment( - augmentation_name=aug_type.autoaug.augmentation_name, - cutout_const=aug_type.autoaug.cutout_const, - translate_const=aug_type.autoaug.translate_const) - elif aug_type.type == 'randaug': - self._augmenter = augment.RandAugment( - num_layers=aug_type.randaug.num_layers, - magnitude=aug_type.randaug.magnitude, - cutout_const=aug_type.randaug.cutout_const, - translate_const=aug_type.randaug.translate_const, - prob_to_apply=aug_type.randaug.prob_to_apply, - exclude_ops=aug_type.randaug.exclude_ops) - else: - raise ValueError('Augmentation policy {} not supported.'.format( - aug_type.type)) - else: - self._augmenter = None - self._label_field_key = label_field_key - self._color_jitter = color_jitter - if random_erasing: - self._random_erasing = augment.RandomErasing( - probability=random_erasing.probability, - min_area=random_erasing.min_area, - max_area=random_erasing.max_area, - min_aspect=random_erasing.min_aspect, - max_aspect=random_erasing.max_aspect, - min_count=random_erasing.min_count, - max_count=random_erasing.max_count, - trials=random_erasing.trials) - else: - self._random_erasing = None - self._is_multilabel = is_multilabel - self._decode_jpeg_only = decode_jpeg_only - - def _parse_train_data(self, decoded_tensors): - """Parses data for training.""" - image = self._parse_train_image(decoded_tensors) - label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32) - if self._is_multilabel: - if isinstance(label, tf.sparse.SparseTensor): - label = tf.sparse.to_dense(label) - label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0) - return image, label - - def _parse_eval_data(self, decoded_tensors): - """Parses data for evaluation.""" - image = self._parse_eval_image(decoded_tensors) - label = tf.cast(decoded_tensors[self._label_field_key], dtype=tf.int32) - if self._is_multilabel: - if isinstance(label, tf.sparse.SparseTensor): - label = tf.sparse.to_dense(label) - label = tf.reduce_sum(tf.one_hot(label, self._num_classes), axis=0) - return image, label - - def _parse_train_image(self, decoded_tensors): - """Parses image data for training.""" - image_bytes = decoded_tensors[self._image_field_key] - - if self._decode_jpeg_only: - image_shape = tf.image.extract_jpeg_shape(image_bytes) - - # Crops image. - cropped_image = preprocess_ops.random_crop_image_v2( - image_bytes, image_shape) - image = tf.cond( - tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)), - lambda: preprocess_ops.center_crop_image_v2(image_bytes, image_shape), - lambda: cropped_image) - else: - # Decodes image. - image = tf.io.decode_image(image_bytes, channels=3) - image.set_shape([None, None, 3]) - - # Crops image. - cropped_image = preprocess_ops.random_crop_image(image) - - image = tf.cond( - tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))), - lambda: preprocess_ops.center_crop_image(image), - lambda: cropped_image) - - if self._aug_rand_hflip: - image = tf.image.random_flip_left_right(image) - - # Color jitter. 
- if self._color_jitter > 0: - image = preprocess_ops.color_jitter(image, self._color_jitter, - self._color_jitter, - self._color_jitter) - - # Resizes image. - image = tf.image.resize( - image, self._output_size, method=tf.image.ResizeMethod.BILINEAR) - image.set_shape([self._output_size[0], self._output_size[1], 3]) - - # Apply autoaug or randaug. - if self._augmenter is not None: - image = self._augmenter.distort(image) - - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image(image, - offset=MEAN_RGB, - scale=STDDEV_RGB) - - # Random erasing after the image has been normalized - if self._random_erasing is not None: - image = self._random_erasing.distort(image) - - # Convert image to self._dtype. - image = tf.image.convert_image_dtype(image, self._dtype) - - return image - - def _parse_eval_image(self, decoded_tensors): - """Parses image data for evaluation.""" - image_bytes = decoded_tensors[self._image_field_key] - - if self._decode_jpeg_only: - image_shape = tf.image.extract_jpeg_shape(image_bytes) - - # Center crops. - image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape) - else: - # Decodes image. - image = tf.io.decode_image(image_bytes, channels=3) - image.set_shape([None, None, 3]) - - # Center crops. - image = preprocess_ops.center_crop_image(image) - - image = tf.image.resize( - image, self._output_size, method=tf.image.ResizeMethod.BILINEAR) - image.set_shape([self._output_size[0], self._output_size[1], 3]) - - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image(image, - offset=MEAN_RGB, - scale=STDDEV_RGB) - - # Convert image to self._dtype. - image = tf.image.convert_image_dtype(image, self._dtype) - - return image - - @classmethod - def inference_fn(cls, - image: tf.Tensor, - input_image_size: List[int], - num_channels: int = 3) -> tf.Tensor: - """Builds image model inputs for serving.""" - - image = tf.cast(image, dtype=tf.float32) - image = preprocess_ops.center_crop_image(image) - image = tf.image.resize( - image, input_image_size, method=tf.image.ResizeMethod.BILINEAR) - - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image( - image, offset=MEAN_RGB, scale=STDDEV_RGB) - image.set_shape(input_image_size + [num_channels]) - return image diff --git a/official/vision/beta/dataloaders/decoder.py b/official/vision/beta/dataloaders/decoder.py deleted file mode 100644 index 821083f0f..000000000 --- a/official/vision/beta/dataloaders/decoder.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""The generic decoder interface.""" - -import abc - - -class Decoder(object): - """Decodes the raw data into tensors.""" - - __metaclass__ = abc.ABCMeta - - @abc.abstractmethod - def decode(self, serialized_example): - """Decodes the serialized example into tensors. - - Args: - serialized_example: a serialized string tensor that encodes the data. 
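A sketch of how a classification `Decoder` and `Parser` like the ones above are typically composed into a `tf.data` pipeline, assuming the module is importable at its original path; the shard name, output size, and batch size are illustrative, not taken from any config in this change.

```python
import tensorflow as tf
from official.vision.beta.dataloaders import classification_input

decoder = classification_input.Decoder()
parser = classification_input.Parser(output_size=[224, 224], num_classes=1000)

dataset = (
    tf.data.TFRecordDataset('imagenet-train-00000-of-01024.tfrecord')
    # Decode serialized tf.Examples into raw tensors.
    .map(decoder.decode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Parse raw tensors into (image, label) training pairs.
    .map(parser.parse_fn(is_training=True),
         num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(32, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))
```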
- - Returns: - decoded_tensors: a dict of Tensors. - """ - pass diff --git a/official/vision/beta/dataloaders/input_reader.py b/official/vision/beta/dataloaders/input_reader.py deleted file mode 100644 index fba7dc277..000000000 --- a/official/vision/beta/dataloaders/input_reader.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Dataset reader for vision model garden.""" - -from typing import Any, Callable, Optional, Tuple - -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import input_reader - - -def calculate_batch_sizes(total_batch_size: int, - pseudo_label_ratio: float) -> Tuple[int, int]: - """Calculates labeled and pseudo-labeled dataset batch sizes. - - Returns (labeled_batch_size, pseudo_labeled_batch_size) given a - total batch size and pseudo-label data ratio. - - Args: - total_batch_size: The total batch size for all data. - pseudo_label_ratio: A non-negative float ratio of pseudo-labeled - to labeled data in a batch. - - Returns: - (labeled_batch_size, pseudo_labeled_batch_size) as ints. - - Raises: - ValueError: If total_batch_size is negative. - ValueError: If pseudo_label_ratio is negative. - """ - if total_batch_size < 0: - raise ValueError('Invalid total_batch_size: {}'.format(total_batch_size)) - if pseudo_label_ratio < 0.0: - raise ValueError( - 'Invalid pseudo_label_ratio: {}'.format(pseudo_label_ratio)) - - ratio_factor = pseudo_label_ratio / (1.0 + pseudo_label_ratio) - pseudo_labeled_batch_size = int(round(total_batch_size * ratio_factor)) - labeled_batch_size = total_batch_size - pseudo_labeled_batch_size - return labeled_batch_size, pseudo_labeled_batch_size - - -class CombinationDatasetInputReader(input_reader.InputReader): - """Combination dataset input reader.""" - - def __init__(self, - params: cfg.DataConfig, - dataset_fn=tf.data.TFRecordDataset, - pseudo_label_dataset_fn=tf.data.TFRecordDataset, - decoder_fn: Optional[Callable[..., Any]] = None, - sample_fn: Optional[Callable[..., Any]] = None, - parser_fn: Optional[Callable[..., Any]] = None, - transform_and_batch_fn: Optional[Callable[ - [tf.data.Dataset, Optional[tf.distribute.InputContext]], - tf.data.Dataset]] = None, - postprocess_fn: Optional[Callable[..., Any]] = None): - """Initializes an CombinationDatasetInputReader instance. - - This class mixes a labeled and pseudo-labeled dataset. The params - must contain "pseudo_label_data.input_path" to specify the - pseudo-label dataset files and "pseudo_label_data.data_ratio" - to specify a per-batch mixing ratio of pseudo-label examples to - labeled dataset examples. - - Args: - params: A config_definitions.DataConfig object. - dataset_fn: A `tf.data.Dataset` that consumes the input files. For - example, it can be `tf.data.TFRecordDataset`. - pseudo_label_dataset_fn: A `tf.data.Dataset` that consumes the input - files. For example, it can be `tf.data.TFRecordDataset`. 
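A worked example of the split computed by `calculate_batch_sizes` above: with a global batch size of 32 and a pseudo-label ratio of 3.0, three quarters of every batch comes from the pseudo-labeled dataset.

```python
from official.vision.beta.dataloaders import input_reader

labeled, pseudo = input_reader.calculate_batch_sizes(
    total_batch_size=32, pseudo_label_ratio=3.0)
# ratio_factor = 3.0 / (1.0 + 3.0) = 0.75, so 24 pseudo-labeled + 8 labeled.
print(labeled, pseudo)  # 8 24
```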
- decoder_fn: An optional `callable` that takes the serialized data string - and decodes them into the raw tensor dictionary. - sample_fn: An optional `callable` that takes a `tf.data.Dataset` object as - input and outputs the transformed dataset. It performs sampling on the - decoded raw tensors dict before the parser_fn. - parser_fn: An optional `callable` that takes the decoded raw tensors dict - and parse them into a dictionary of tensors that can be consumed by the - model. It will be executed after decoder_fn. - transform_and_batch_fn: An optional `callable` that takes a - `tf.data.Dataset` object and an optional `tf.distribute.InputContext` as - input, and returns a `tf.data.Dataset` object. It will be executed after - `parser_fn` to transform and batch the dataset; if None, after - `parser_fn` is executed, the dataset will be batched into per-replica - batch size. - postprocess_fn: A optional `callable` that processes batched tensors. It - will be executed after batching. - - Raises: - ValueError: If drop_remainder is False. - """ - super().__init__(params=params, - dataset_fn=dataset_fn, - decoder_fn=decoder_fn, - sample_fn=sample_fn, - parser_fn=parser_fn, - transform_and_batch_fn=transform_and_batch_fn, - postprocess_fn=postprocess_fn) - - self._pseudo_label_file_pattern = params.pseudo_label_data.input_path - self._pseudo_label_dataset_fn = pseudo_label_dataset_fn - self._pseudo_label_data_ratio = params.pseudo_label_data.data_ratio - self._pseudo_label_matched_files = input_reader.match_files( - self._pseudo_label_file_pattern) - if not self._drop_remainder: - raise ValueError( - 'Must use drop_remainder=True with CombinationDatasetInputReader') - - def read( - self, - input_context: Optional[tf.distribute.InputContext] = None - ) -> tf.data.Dataset: - """Generates a tf.data.Dataset object.""" - - labeled_batch_size, pl_batch_size = calculate_batch_sizes( - self._global_batch_size, self._pseudo_label_data_ratio) - - if not labeled_batch_size and pl_batch_size: - raise ValueError( - 'Invalid batch_size: {} and pseudo_label_data_ratio: {}, ' - 'resulting in a 0 batch size for one of the datasets.'.format( - self._global_batch_size, self._pseudo_label_data_ratio)) - - def _read_decode_and_parse_dataset(matched_files, dataset_fn, batch_size, - input_context, tfds_builder): - dataset = self._read_data_source(matched_files, dataset_fn, input_context, - tfds_builder) - return self._decode_and_parse_dataset(dataset, batch_size, input_context) - - labeled_dataset = _read_decode_and_parse_dataset( - matched_files=self._matched_files, - dataset_fn=self._dataset_fn, - batch_size=labeled_batch_size, - input_context=input_context, - tfds_builder=self._tfds_builder) - - pseudo_labeled_dataset = _read_decode_and_parse_dataset( - matched_files=self._pseudo_label_matched_files, - dataset_fn=self._pseudo_label_dataset_fn, - batch_size=pl_batch_size, - input_context=input_context, - tfds_builder=False) - - def concat_fn(d1, d2): - return tf.nest.map_structure( - lambda x1, x2: tf.concat([x1, x2], axis=0), d1, d2) - - dataset_concat = tf.data.Dataset.zip( - (labeled_dataset, pseudo_labeled_dataset)) - dataset_concat = dataset_concat.map( - concat_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) - - def maybe_map_fn(dataset, fn): - return dataset if fn is None else dataset.map( - fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) - - dataset_concat = maybe_map_fn(dataset_concat, self._postprocess_fn) - dataset_concat = self._maybe_apply_data_service(dataset_concat, - input_context) - - if 
self._deterministic is not None: - options = tf.data.Options() - options.experimental_deterministic = self._deterministic - dataset_concat = dataset_concat.with_options(options) - - return dataset_concat.prefetch(tf.data.experimental.AUTOTUNE) diff --git a/official/vision/beta/dataloaders/input_reader_factory.py b/official/vision/beta/dataloaders/input_reader_factory.py deleted file mode 100644 index 27181db90..000000000 --- a/official/vision/beta/dataloaders/input_reader_factory.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Factory for getting TF-Vision input readers.""" - -from official.common import dataset_fn as dataset_fn_util -from official.core import config_definitions as cfg -from official.core import input_reader as core_input_reader - -from official.vision.beta.dataloaders import input_reader as vision_input_reader - - -def input_reader_generator(params: cfg.DataConfig, - **kwargs) -> core_input_reader.InputReader: - """Instantiates an input reader class according to the params. - - Args: - params: A config_definitions.DataConfig object. - **kwargs: Additional arguments passed to input reader initialization. - - Returns: - An InputReader object. - - """ - if params.is_training and params.get('pseudo_label_data', False): - return vision_input_reader.CombinationDatasetInputReader( - params, - pseudo_label_dataset_fn=dataset_fn_util.pick_dataset_fn( - params.pseudo_label_data.file_type), - **kwargs) - else: - return core_input_reader.InputReader(params, **kwargs) diff --git a/official/vision/beta/dataloaders/maskrcnn_input.py b/official/vision/beta/dataloaders/maskrcnn_input.py deleted file mode 100644 index 62e4ed357..000000000 --- a/official/vision/beta/dataloaders/maskrcnn_input.py +++ /dev/null @@ -1,345 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
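The per-batch mixing performed by `CombinationDatasetInputReader.read` boils down to zipping two separately batched datasets and concatenating along the batch axis. A toy version, with integer datasets standing in for parsed examples:

```python
import tensorflow as tf

# 8 labeled and 24 pseudo-labeled elements per output batch (as in the
# calculate_batch_sizes example above).
labeled = tf.data.Dataset.range(0, 100).batch(8, drop_remainder=True)
pseudo = tf.data.Dataset.range(1000, 1100).batch(24, drop_remainder=True)

mixed = tf.data.Dataset.zip((labeled, pseudo)).map(
    lambda a, b: tf.concat([a, b], axis=0),
    num_parallel_calls=tf.data.experimental.AUTOTUNE)

for batch in mixed.take(1):
  print(batch.shape)  # (32,) -- every batch mixes both sources at a fixed ratio
```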
- -"""Data parser and processing for Mask R-CNN.""" - -# Import libraries - -import tensorflow as tf - -from official.vision.beta.dataloaders import parser -from official.vision.beta.dataloaders import utils -from official.vision.beta.ops import anchor -from official.vision.beta.ops import box_ops -from official.vision.beta.ops import preprocess_ops - - -class Parser(parser.Parser): - """Parser to parse an image and its annotations into a dictionary of tensors.""" - - def __init__(self, - output_size, - min_level, - max_level, - num_scales, - aspect_ratios, - anchor_size, - rpn_match_threshold=0.7, - rpn_unmatched_threshold=0.3, - rpn_batch_size_per_im=256, - rpn_fg_fraction=0.5, - aug_rand_hflip=False, - aug_scale_min=1.0, - aug_scale_max=1.0, - skip_crowd_during_training=True, - max_num_instances=100, - include_mask=False, - mask_crop_size=112, - dtype='float32'): - """Initializes parameters for parsing annotations in the dataset. - - Args: - output_size: `Tensor` or `list` for [height, width] of output image. The - output_size should be divided by the largest feature stride 2^max_level. - min_level: `int` number of minimum level of the output feature pyramid. - max_level: `int` number of maximum level of the output feature pyramid. - num_scales: `int` number representing intermediate scales added - on each level. For instances, num_scales=2 adds one additional - intermediate anchor scales [2^0, 2^0.5] on each level. - aspect_ratios: `list` of float numbers representing the aspect raito - anchors added on each level. The number indicates the ratio of width to - height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors - on each scale level. - anchor_size: `float` number representing the scale of size of the base - anchor to the feature stride 2^level. - rpn_match_threshold: - rpn_unmatched_threshold: - rpn_batch_size_per_im: - rpn_fg_fraction: - aug_rand_hflip: `bool`, if True, augment training with random - horizontal flip. - aug_scale_min: `float`, the minimum scale applied to `output_size` for - data augmentation during training. - aug_scale_max: `float`, the maximum scale applied to `output_size` for - data augmentation during training. - skip_crowd_during_training: `bool`, if True, skip annotations labeled with - `is_crowd` equals to 1. - max_num_instances: `int` number of maximum number of instances in an - image. The groundtruth data will be padded to `max_num_instances`. - include_mask: a bool to indicate whether parse mask groundtruth. - mask_crop_size: the size which groundtruth mask is cropped to. - dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}. - """ - - self._max_num_instances = max_num_instances - self._skip_crowd_during_training = skip_crowd_during_training - - # Anchor. - self._output_size = output_size - self._min_level = min_level - self._max_level = max_level - self._num_scales = num_scales - self._aspect_ratios = aspect_ratios - self._anchor_size = anchor_size - - # Target assigning. - self._rpn_match_threshold = rpn_match_threshold - self._rpn_unmatched_threshold = rpn_unmatched_threshold - self._rpn_batch_size_per_im = rpn_batch_size_per_im - self._rpn_fg_fraction = rpn_fg_fraction - - # Data augmentation. - self._aug_rand_hflip = aug_rand_hflip - self._aug_scale_min = aug_scale_min - self._aug_scale_max = aug_scale_max - - # Mask. - self._include_mask = include_mask - self._mask_crop_size = mask_crop_size - - # Image output dtype. - self._dtype = dtype - - def _parse_train_data(self, data): - """Parses data for training. 
- - Args: - data: the decoded tensor dictionary from TfExampleDecoder. - - Returns: - image: image tensor that is preproessed to have normalized value and - dimension [output_size[0], output_size[1], 3] - labels: a dictionary of tensors used for training. The following describes - {key: value} pairs in the dictionary. - image_info: a 2D `Tensor` that encodes the information of the image and - the applied preprocessing. It is in the format of - [[original_height, original_width], [scaled_height, scaled_width], - anchor_boxes: ordered dictionary with keys - [min_level, min_level+1, ..., max_level]. The values are tensor with - shape [height_l, width_l, 4] representing anchor boxes at each level. - rpn_score_targets: ordered dictionary with keys - [min_level, min_level+1, ..., max_level]. The values are tensor with - shape [height_l, width_l, anchors_per_location]. The height_l and - width_l represent the dimension of class logits at l-th level. - rpn_box_targets: ordered dictionary with keys - [min_level, min_level+1, ..., max_level]. The values are tensor with - shape [height_l, width_l, anchors_per_location * 4]. The height_l and - width_l represent the dimension of bounding box regression output at - l-th level. - gt_boxes: Groundtruth bounding box annotations. The box is represented - in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled - image that is fed to the network. The tennsor is padded with -1 to - the fixed dimension [self._max_num_instances, 4]. - gt_classes: Groundtruth classes annotations. The tennsor is padded - with -1 to the fixed dimension [self._max_num_instances]. - gt_masks: groundtrugh masks cropped by the bounding box and - resized to a fixed size determined by mask_crop_size. - """ - classes = data['groundtruth_classes'] - boxes = data['groundtruth_boxes'] - if self._include_mask: - masks = data['groundtruth_instance_masks'] - - is_crowds = data['groundtruth_is_crowd'] - # Skips annotations with `is_crowd` = True. - if self._skip_crowd_during_training: - num_groundtruths = tf.shape(classes)[0] - with tf.control_dependencies([num_groundtruths, is_crowds]): - indices = tf.cond( - tf.greater(tf.size(is_crowds), 0), - lambda: tf.where(tf.logical_not(is_crowds))[:, 0], - lambda: tf.cast(tf.range(num_groundtruths), tf.int64)) - classes = tf.gather(classes, indices) - boxes = tf.gather(boxes, indices) - if self._include_mask: - masks = tf.gather(masks, indices) - - # Gets original image and its size. - image = data['image'] - image_shape = tf.shape(image)[0:2] - - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image(image) - - # Flips image randomly during training. - if self._aug_rand_hflip: - if self._include_mask: - image, boxes, masks = preprocess_ops.random_horizontal_flip( - image, boxes, masks) - else: - image, boxes, _ = preprocess_ops.random_horizontal_flip( - image, boxes) - - # Converts boxes from normalized coordinates to pixel coordinates. - # Now the coordinates of boxes are w.r.t. the original image. - boxes = box_ops.denormalize_boxes(boxes, image_shape) - - # Resizes and crops image. - image, image_info = preprocess_ops.resize_and_crop_image( - image, - self._output_size, - padded_size=preprocess_ops.compute_padded_size( - self._output_size, 2 ** self._max_level), - aug_scale_min=self._aug_scale_min, - aug_scale_max=self._aug_scale_max) - image_height, image_width, _ = image.get_shape().as_list() - - # Resizes and crops boxes. - # Now the coordinates of boxes are w.r.t the scaled image. 
- image_scale = image_info[2, :] - offset = image_info[3, :] - boxes = preprocess_ops.resize_and_crop_boxes( - boxes, image_scale, image_info[1, :], offset) - - # Filters out ground truth boxes that are all zeros. - indices = box_ops.get_non_empty_box_indices(boxes) - boxes = tf.gather(boxes, indices) - classes = tf.gather(classes, indices) - if self._include_mask: - masks = tf.gather(masks, indices) - # Transfer boxes to the original image space and do normalization. - cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2]) - cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2]) - cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape) - num_masks = tf.shape(masks)[0] - masks = tf.image.crop_and_resize( - tf.expand_dims(masks, axis=-1), - cropped_boxes, - box_indices=tf.range(num_masks, dtype=tf.int32), - crop_size=[self._mask_crop_size, self._mask_crop_size], - method='bilinear') - masks = tf.squeeze(masks, axis=-1) - - # Assigns anchor targets. - # Note that after the target assignment, box targets are absolute pixel - # offsets w.r.t. the scaled image. - input_anchor = anchor.build_anchor_generator( - min_level=self._min_level, - max_level=self._max_level, - num_scales=self._num_scales, - aspect_ratios=self._aspect_ratios, - anchor_size=self._anchor_size) - anchor_boxes = input_anchor(image_size=(image_height, image_width)) - anchor_labeler = anchor.RpnAnchorLabeler( - self._rpn_match_threshold, - self._rpn_unmatched_threshold, - self._rpn_batch_size_per_im, - self._rpn_fg_fraction) - rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors( - anchor_boxes, boxes, - tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32)) - - # Casts input image to self._dtype - image = tf.cast(image, dtype=self._dtype) - - # Packs labels for model_fn outputs. - labels = { - 'anchor_boxes': - anchor_boxes, - 'image_info': - image_info, - 'rpn_score_targets': - rpn_score_targets, - 'rpn_box_targets': - rpn_box_targets, - 'gt_boxes': - preprocess_ops.clip_or_pad_to_fixed_size(boxes, - self._max_num_instances, - -1), - 'gt_classes': - preprocess_ops.clip_or_pad_to_fixed_size(classes, - self._max_num_instances, - -1), - } - if self._include_mask: - labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size( - masks, self._max_num_instances, -1) - - return image, labels - - def _parse_eval_data(self, data): - """Parses data for evaluation. - - Args: - data: the decoded tensor dictionary from TfExampleDecoder. - - Returns: - A dictionary of {'images': image, 'labels': labels} where - image: image tensor that is preproessed to have normalized value and - dimension [output_size[0], output_size[1], 3] - labels: a dictionary of tensors used for training. The following - describes {key: value} pairs in the dictionary. - source_ids: Source image id. Default value -1 if the source id is - empty in the groundtruth annotation. - image_info: a 2D `Tensor` that encodes the information of the image - and the applied preprocessing. It is in the format of - [[original_height, original_width], [scaled_height, scaled_width], - anchor_boxes: ordered dictionary with keys - [min_level, min_level+1, ..., max_level]. The values are tensor with - shape [height_l, width_l, 4] representing anchor boxes at each - level. - """ - # Gets original image and its size. - image = data['image'] - image_shape = tf.shape(image)[0:2] - - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image(image) - - # Resizes and crops image. 
- image, image_info = preprocess_ops.resize_and_crop_image( - image, - self._output_size, - padded_size=preprocess_ops.compute_padded_size( - self._output_size, 2 ** self._max_level), - aug_scale_min=1.0, - aug_scale_max=1.0) - image_height, image_width, _ = image.get_shape().as_list() - - # Casts input image to self._dtype - image = tf.cast(image, dtype=self._dtype) - - # Converts boxes from normalized coordinates to pixel coordinates. - boxes = box_ops.denormalize_boxes(data['groundtruth_boxes'], image_shape) - - # Compute Anchor boxes. - input_anchor = anchor.build_anchor_generator( - min_level=self._min_level, - max_level=self._max_level, - num_scales=self._num_scales, - aspect_ratios=self._aspect_ratios, - anchor_size=self._anchor_size) - anchor_boxes = input_anchor(image_size=(image_height, image_width)) - - labels = { - 'image_info': image_info, - 'anchor_boxes': anchor_boxes, - } - - groundtruths = { - 'source_id': data['source_id'], - 'height': data['height'], - 'width': data['width'], - 'num_detections': tf.shape(data['groundtruth_classes'])[0], - 'boxes': boxes, - 'classes': data['groundtruth_classes'], - 'areas': data['groundtruth_area'], - 'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32), - } - groundtruths['source_id'] = utils.process_source_id( - groundtruths['source_id']) - groundtruths = utils.pad_groundtruths_to_fixed_size( - groundtruths, self._max_num_instances) - labels['groundtruths'] = groundtruths - return image, labels diff --git a/official/vision/beta/dataloaders/parser.py b/official/vision/beta/dataloaders/parser.py deleted file mode 100644 index 2a415cb01..000000000 --- a/official/vision/beta/dataloaders/parser.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""The generic parser interface.""" - -import abc - - -class Parser(object): - """Parses data and produces tensors to be consumed by models.""" - - __metaclass__ = abc.ABCMeta - - @abc.abstractmethod - def _parse_train_data(self, decoded_tensors): - """Generates images and labels that are usable for model training. - - Args: - decoded_tensors: a dict of Tensors produced by the decoder. - - Returns: - images: the image tensor. - labels: a dict of Tensors that contains labels. - """ - pass - - @abc.abstractmethod - def _parse_eval_data(self, decoded_tensors): - """Generates images and labels that are usable for model evaluation. - - Args: - decoded_tensors: a dict of Tensors produced by the decoder. - - Returns: - images: the image tensor. - labels: a dict of Tensors that contains labels. - """ - pass - - def parse_fn(self, is_training): - """Returns a parse fn that reads and parses raw tensors from the decoder. - - Args: - is_training: a `bool` to indicate whether it is in training mode. - - Returns: - parse: a `callable` that takes the serialized example and generate the - images, labels tuple where labels is a dict of Tensors that contains - labels. 
- """ - def parse(decoded_tensors): - """Parses the serialized example data.""" - if is_training: - return self._parse_train_data(decoded_tensors) - else: - return self._parse_eval_data(decoded_tensors) - - return parse - - @classmethod - def inference_fn(cls, inputs): - """Parses inputs for predictions. - - Args: - inputs: A Tensor, or dictionary of Tensors. - - Returns: - processed_inputs: An input tensor to the model. - """ - pass diff --git a/official/vision/beta/dataloaders/retinanet_input.py b/official/vision/beta/dataloaders/retinanet_input.py deleted file mode 100644 index 91c0cd3f6..000000000 --- a/official/vision/beta/dataloaders/retinanet_input.py +++ /dev/null @@ -1,328 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Data parser and processing for RetinaNet. - -Parse image and ground truths in a dataset to training targets and package them -into (image, labels) tuple for RetinaNet. -""" - -# Import libraries -from absl import logging -import tensorflow as tf - -from official.vision.beta.dataloaders import parser -from official.vision.beta.dataloaders import utils -from official.vision.beta.ops import anchor -from official.vision.beta.ops import augment -from official.vision.beta.ops import box_ops -from official.vision.beta.ops import preprocess_ops - - -class Parser(parser.Parser): - """Parser to parse an image and its annotations into a dictionary of tensors.""" - - def __init__(self, - output_size, - min_level, - max_level, - num_scales, - aspect_ratios, - anchor_size, - match_threshold=0.5, - unmatched_threshold=0.5, - aug_type=None, - aug_rand_hflip=False, - aug_scale_min=1.0, - aug_scale_max=1.0, - use_autoaugment=False, - autoaugment_policy_name='v0', - skip_crowd_during_training=True, - max_num_instances=100, - dtype='bfloat16', - mode=None): - """Initializes parameters for parsing annotations in the dataset. - - Args: - output_size: `Tensor` or `list` for [height, width] of output image. The - output_size should be divided by the largest feature stride 2^max_level. - min_level: `int` number of minimum level of the output feature pyramid. - max_level: `int` number of maximum level of the output feature pyramid. - num_scales: `int` number representing intermediate scales added on each - level. For instances, num_scales=2 adds one additional intermediate - anchor scales [2^0, 2^0.5] on each level. - aspect_ratios: `list` of float numbers representing the aspect raito - anchors added on each level. The number indicates the ratio of width to - height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors - on each scale level. - anchor_size: `float` number representing the scale of size of the base - anchor to the feature stride 2^level. - match_threshold: `float` number between 0 and 1 representing the - lower-bound threshold to assign positive labels for anchors. An anchor - with a score over the threshold is labeled positive. 
- unmatched_threshold: `float` number between 0 and 1 representing the - upper-bound threshold to assign negative labels for anchors. An anchor - with a score below the threshold is labeled negative. - aug_type: An optional Augmentation object to choose from AutoAugment and - RandAugment. - aug_rand_hflip: `bool`, if True, augment training with random horizontal - flip. - aug_scale_min: `float`, the minimum scale applied to `output_size` for - data augmentation during training. - aug_scale_max: `float`, the maximum scale applied to `output_size` for - data augmentation during training. - use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy - during training. - autoaugment_policy_name: `string` that specifies the name of the - AutoAugment policy that will be used during training. - skip_crowd_during_training: `bool`, if True, skip annotations labeled with - `is_crowd` equals to 1. - max_num_instances: `int` number of maximum number of instances in an - image. The groundtruth data will be padded to `max_num_instances`. - dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}. - mode: a ModeKeys. Specifies if this is training, evaluation, prediction or - prediction with groundtruths in the outputs. - """ - self._mode = mode - self._max_num_instances = max_num_instances - self._skip_crowd_during_training = skip_crowd_during_training - - # Anchor. - self._output_size = output_size - self._min_level = min_level - self._max_level = max_level - self._num_scales = num_scales - self._aspect_ratios = aspect_ratios - self._anchor_size = anchor_size - self._match_threshold = match_threshold - self._unmatched_threshold = unmatched_threshold - - # Data augmentation. - self._aug_rand_hflip = aug_rand_hflip - self._aug_scale_min = aug_scale_min - self._aug_scale_max = aug_scale_max - - # Data augmentation with AutoAugment or RandAugment. - self._augmenter = None - if aug_type is not None: - if aug_type.type == 'autoaug': - logging.info('Using AutoAugment.') - self._augmenter = augment.AutoAugment( - augmentation_name=aug_type.autoaug.augmentation_name, - cutout_const=aug_type.autoaug.cutout_const, - translate_const=aug_type.autoaug.translate_const) - elif aug_type.type == 'randaug': - logging.info('Using RandAugment.') - self._augmenter = augment.RandAugment.build_for_detection( - num_layers=aug_type.randaug.num_layers, - magnitude=aug_type.randaug.magnitude, - cutout_const=aug_type.randaug.cutout_const, - translate_const=aug_type.randaug.translate_const, - prob_to_apply=aug_type.randaug.prob_to_apply, - exclude_ops=aug_type.randaug.exclude_ops) - else: - raise ValueError(f'Augmentation policy {aug_type.type} not supported.') - - # Deprecated. Data Augmentation with AutoAugment. - self._use_autoaugment = use_autoaugment - self._autoaugment_policy_name = autoaugment_policy_name - - # Data type. - self._dtype = dtype - - def _parse_train_data(self, data): - """Parses data for training and evaluation.""" - classes = data['groundtruth_classes'] - boxes = data['groundtruth_boxes'] - # If not empty, `attributes` is a dict of (name, ground_truth) pairs. - # `ground_gruth` of attributes is assumed in shape [N, attribute_size]. - # TODO(xianzhi): support parsing attributes weights. - attributes = data.get('groundtruth_attributes', {}) - is_crowds = data['groundtruth_is_crowd'] - - # Skips annotations with `is_crowd` = True. 
- if self._skip_crowd_during_training: - num_groundtrtuhs = tf.shape(input=classes)[0] - with tf.control_dependencies([num_groundtrtuhs, is_crowds]): - indices = tf.cond( - pred=tf.greater(tf.size(input=is_crowds), 0), - true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0], - false_fn=lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64)) - classes = tf.gather(classes, indices) - boxes = tf.gather(boxes, indices) - for k, v in attributes.items(): - attributes[k] = tf.gather(v, indices) - - # Gets original image. - image = data['image'] - - # Apply autoaug or randaug. - if self._augmenter is not None: - image, boxes = self._augmenter.distort_with_boxes(image, boxes) - image_shape = tf.shape(input=image)[0:2] - - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image(image) - - # Flips image randomly during training. - if self._aug_rand_hflip: - image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes) - - # Converts boxes from normalized coordinates to pixel coordinates. - boxes = box_ops.denormalize_boxes(boxes, image_shape) - - # Resizes and crops image. - image, image_info = preprocess_ops.resize_and_crop_image( - image, - self._output_size, - padded_size=preprocess_ops.compute_padded_size(self._output_size, - 2**self._max_level), - aug_scale_min=self._aug_scale_min, - aug_scale_max=self._aug_scale_max) - image_height, image_width, _ = image.get_shape().as_list() - - # Resizes and crops boxes. - image_scale = image_info[2, :] - offset = image_info[3, :] - boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale, - image_info[1, :], offset) - # Filters out ground truth boxes that are all zeros. - indices = box_ops.get_non_empty_box_indices(boxes) - boxes = tf.gather(boxes, indices) - classes = tf.gather(classes, indices) - for k, v in attributes.items(): - attributes[k] = tf.gather(v, indices) - - # Assigns anchors. - input_anchor = anchor.build_anchor_generator( - min_level=self._min_level, - max_level=self._max_level, - num_scales=self._num_scales, - aspect_ratios=self._aspect_ratios, - anchor_size=self._anchor_size) - anchor_boxes = input_anchor(image_size=(image_height, image_width)) - anchor_labeler = anchor.AnchorLabeler(self._match_threshold, - self._unmatched_threshold) - (cls_targets, box_targets, att_targets, cls_weights, - box_weights) = anchor_labeler.label_anchors( - anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes) - - # Casts input image to desired data type. - image = tf.cast(image, dtype=self._dtype) - - # Packs labels for model_fn outputs. - labels = { - 'cls_targets': cls_targets, - 'box_targets': box_targets, - 'anchor_boxes': anchor_boxes, - 'cls_weights': cls_weights, - 'box_weights': box_weights, - 'image_info': image_info, - } - if att_targets: - labels['attribute_targets'] = att_targets - return image, labels - - def _parse_eval_data(self, data): - """Parses data for training and evaluation.""" - groundtruths = {} - classes = data['groundtruth_classes'] - boxes = data['groundtruth_boxes'] - # If not empty, `attributes` is a dict of (name, ground_truth) pairs. - # `ground_gruth` of attributes is assumed in shape [N, attribute_size]. - # TODO(xianzhi): support parsing attributes weights. - attributes = data.get('groundtruth_attributes', {}) - - # Gets original image and its size. - image = data['image'] - image_shape = tf.shape(input=image)[0:2] - - # Normalizes image with mean and std pixel values. 
- image = preprocess_ops.normalize_image(image) - - # Converts boxes from normalized coordinates to pixel coordinates. - boxes = box_ops.denormalize_boxes(boxes, image_shape) - - # Resizes and crops image. - image, image_info = preprocess_ops.resize_and_crop_image( - image, - self._output_size, - padded_size=preprocess_ops.compute_padded_size(self._output_size, - 2**self._max_level), - aug_scale_min=1.0, - aug_scale_max=1.0) - image_height, image_width, _ = image.get_shape().as_list() - - # Resizes and crops boxes. - image_scale = image_info[2, :] - offset = image_info[3, :] - boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_scale, - image_info[1, :], offset) - # Filters out ground truth boxes that are all zeros. - indices = box_ops.get_non_empty_box_indices(boxes) - boxes = tf.gather(boxes, indices) - classes = tf.gather(classes, indices) - for k, v in attributes.items(): - attributes[k] = tf.gather(v, indices) - - # Assigns anchors. - input_anchor = anchor.build_anchor_generator( - min_level=self._min_level, - max_level=self._max_level, - num_scales=self._num_scales, - aspect_ratios=self._aspect_ratios, - anchor_size=self._anchor_size) - anchor_boxes = input_anchor(image_size=(image_height, image_width)) - anchor_labeler = anchor.AnchorLabeler(self._match_threshold, - self._unmatched_threshold) - (cls_targets, box_targets, att_targets, cls_weights, - box_weights) = anchor_labeler.label_anchors( - anchor_boxes, boxes, tf.expand_dims(classes, axis=1), attributes) - - # Casts input image to desired data type. - image = tf.cast(image, dtype=self._dtype) - - # Sets up groundtruth data for evaluation. - groundtruths = { - 'source_id': data['source_id'], - 'height': data['height'], - 'width': data['width'], - 'num_detections': tf.shape(data['groundtruth_classes']), - 'image_info': image_info, - 'boxes': box_ops.denormalize_boxes( - data['groundtruth_boxes'], image_shape), - 'classes': data['groundtruth_classes'], - 'areas': data['groundtruth_area'], - 'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32), - } - if 'groundtruth_attributes' in data: - groundtruths['attributes'] = data['groundtruth_attributes'] - groundtruths['source_id'] = utils.process_source_id( - groundtruths['source_id']) - groundtruths = utils.pad_groundtruths_to_fixed_size( - groundtruths, self._max_num_instances) - - # Packs labels for model_fn outputs. - labels = { - 'cls_targets': cls_targets, - 'box_targets': box_targets, - 'anchor_boxes': anchor_boxes, - 'cls_weights': cls_weights, - 'box_weights': box_weights, - 'image_info': image_info, - 'groundtruths': groundtruths, - } - if att_targets: - labels['attribute_targets'] = att_targets - return image, labels diff --git a/official/vision/beta/dataloaders/segmentation_input.py b/official/vision/beta/dataloaders/segmentation_input.py deleted file mode 100644 index 101b37ece..000000000 --- a/official/vision/beta/dataloaders/segmentation_input.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Data parser and processing for segmentation datasets.""" - -import tensorflow as tf -from official.vision.beta.dataloaders import decoder -from official.vision.beta.dataloaders import parser -from official.vision.beta.ops import preprocess_ops - - -class Decoder(decoder.Decoder): - """A tf.Example decoder for segmentation task.""" - - def __init__(self): - self._keys_to_features = { - 'image/encoded': tf.io.FixedLenFeature((), tf.string, default_value=''), - 'image/height': tf.io.FixedLenFeature((), tf.int64, default_value=0), - 'image/width': tf.io.FixedLenFeature((), tf.int64, default_value=0), - 'image/segmentation/class/encoded': - tf.io.FixedLenFeature((), tf.string, default_value='') - } - - def decode(self, serialized_example): - return tf.io.parse_single_example( - serialized_example, self._keys_to_features) - - -class Parser(parser.Parser): - """Parser to parse an image and its annotations into a dictionary of tensors. - """ - - def __init__(self, - output_size, - crop_size=None, - resize_eval_groundtruth=True, - groundtruth_padded_size=None, - ignore_label=255, - aug_rand_hflip=False, - preserve_aspect_ratio=True, - aug_scale_min=1.0, - aug_scale_max=1.0, - dtype='float32'): - """Initializes parameters for parsing annotations in the dataset. - - Args: - output_size: `Tensor` or `list` for [height, width] of output image. The - output_size should be divided by the largest feature stride 2^max_level. - crop_size: `Tensor` or `list` for [height, width] of the crop. If - specified a training crop of size crop_size is returned. This is useful - for cropping original images during training while evaluating on - original image sizes. - resize_eval_groundtruth: `bool`, if True, eval groundtruth masks are - resized to output_size. - groundtruth_padded_size: `Tensor` or `list` for [height, width]. When - resize_eval_groundtruth is set to False, the groundtruth masks are - padded to this size. - ignore_label: `int` the pixel with ignore label will not used for training - and evaluation. - aug_rand_hflip: `bool`, if True, augment training with random - horizontal flip. - preserve_aspect_ratio: `bool`, if True, the aspect ratio is preserved, - otherwise, the image is resized to output_size. - aug_scale_min: `float`, the minimum scale applied to `output_size` for - data augmentation during training. - aug_scale_max: `float`, the maximum scale applied to `output_size` for - data augmentation during training. - dtype: `str`, data type. One of {`bfloat16`, `float32`, `float16`}. - """ - self._output_size = output_size - self._crop_size = crop_size - self._resize_eval_groundtruth = resize_eval_groundtruth - if (not resize_eval_groundtruth) and (groundtruth_padded_size is None): - raise ValueError('groundtruth_padded_size ([height, width]) needs to be' - 'specified when resize_eval_groundtruth is False.') - self._groundtruth_padded_size = groundtruth_padded_size - self._ignore_label = ignore_label - self._preserve_aspect_ratio = preserve_aspect_ratio - - # Data augmentation. - self._aug_rand_hflip = aug_rand_hflip - self._aug_scale_min = aug_scale_min - self._aug_scale_max = aug_scale_max - - # dtype. 
- self._dtype = dtype - - def _prepare_image_and_label(self, data): - """Prepare normalized image and label.""" - image = tf.io.decode_image(data['image/encoded'], channels=3) - label = tf.io.decode_image(data['image/segmentation/class/encoded'], - channels=1) - height = data['image/height'] - width = data['image/width'] - image = tf.reshape(image, (height, width, 3)) - - label = tf.reshape(label, (1, height, width)) - label = tf.cast(label, tf.float32) - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image(image) - - if not self._preserve_aspect_ratio: - label = tf.reshape(label, [data['image/height'], data['image/width'], 1]) - image = tf.image.resize(image, self._output_size, method='bilinear') - label = tf.image.resize(label, self._output_size, method='nearest') - label = tf.reshape(label[:, :, -1], [1] + self._output_size) - - return image, label - - def _parse_train_data(self, data): - """Parses data for training and evaluation.""" - image, label = self._prepare_image_and_label(data) - - if self._crop_size: - - label = tf.reshape(label, [data['image/height'], data['image/width'], 1]) - # If output_size is specified, resize image, and label to desired - # output_size. - if self._output_size: - image = tf.image.resize(image, self._output_size, method='bilinear') - label = tf.image.resize(label, self._output_size, method='nearest') - - image_mask = tf.concat([image, label], axis=2) - image_mask_crop = tf.image.random_crop(image_mask, - self._crop_size + [4]) - image = image_mask_crop[:, :, :-1] - label = tf.reshape(image_mask_crop[:, :, -1], [1] + self._crop_size) - - # Flips image randomly during training. - if self._aug_rand_hflip: - image, _, label = preprocess_ops.random_horizontal_flip( - image, masks=label) - - train_image_size = self._crop_size if self._crop_size else self._output_size - # Resizes and crops image. - image, image_info = preprocess_ops.resize_and_crop_image( - image, - train_image_size, - train_image_size, - aug_scale_min=self._aug_scale_min, - aug_scale_max=self._aug_scale_max) - - # Resizes and crops boxes. - image_scale = image_info[2, :] - offset = image_info[3, :] - - # Pad label and make sure the padded region assigned to the ignore label. - # The label is first offset by +1 and then padded with 0. - label += 1 - label = tf.expand_dims(label, axis=3) - label = preprocess_ops.resize_and_crop_masks( - label, image_scale, train_image_size, offset) - label -= 1 - label = tf.where(tf.equal(label, -1), - self._ignore_label * tf.ones_like(label), label) - label = tf.squeeze(label, axis=0) - valid_mask = tf.not_equal(label, self._ignore_label) - labels = { - 'masks': label, - 'valid_masks': valid_mask, - 'image_info': image_info, - } - - # Cast image as self._dtype - image = tf.cast(image, dtype=self._dtype) - - return image, labels - - def _parse_eval_data(self, data): - """Parses data for training and evaluation.""" - image, label = self._prepare_image_and_label(data) - # The label is first offset by +1 and then padded with 0. - label += 1 - label = tf.expand_dims(label, axis=3) - - # Resizes and crops image. - image, image_info = preprocess_ops.resize_and_crop_image( - image, self._output_size, self._output_size) - - if self._resize_eval_groundtruth: - # Resizes eval masks to match input image sizes. In that case, mean IoU - # is computed on output_size not the original size of the images. 
- image_scale = image_info[2, :] - offset = image_info[3, :] - label = preprocess_ops.resize_and_crop_masks(label, image_scale, - self._output_size, offset) - else: - label = tf.image.pad_to_bounding_box( - label, 0, 0, self._groundtruth_padded_size[0], - self._groundtruth_padded_size[1]) - - label -= 1 - label = tf.where(tf.equal(label, -1), - self._ignore_label * tf.ones_like(label), label) - label = tf.squeeze(label, axis=0) - - valid_mask = tf.not_equal(label, self._ignore_label) - labels = { - 'masks': label, - 'valid_masks': valid_mask, - 'image_info': image_info - } - - # Cast image as self._dtype - image = tf.cast(image, dtype=self._dtype) - - return image, labels diff --git a/official/vision/beta/dataloaders/tf_example_decoder.py b/official/vision/beta/dataloaders/tf_example_decoder.py deleted file mode 100644 index 4888ab634..000000000 --- a/official/vision/beta/dataloaders/tf_example_decoder.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tensorflow Example proto decoder for object detection. - -A decoder to decode string tensors containing serialized tensorflow.Example -protos for object detection. -""" -import tensorflow as tf - -from official.vision.beta.dataloaders import decoder - - -def _generate_source_id(image_bytes): - # Hashing using 22 bits since float32 has only 23 mantissa bits. 
- return tf.strings.as_string( - tf.strings.to_hash_bucket_fast(image_bytes, 2 ** 22 - 1)) - - -class TfExampleDecoder(decoder.Decoder): - """Tensorflow Example proto decoder.""" - - def __init__(self, - include_mask=False, - regenerate_source_id=False, - mask_binarize_threshold=None): - self._include_mask = include_mask - self._regenerate_source_id = regenerate_source_id - self._keys_to_features = { - 'image/encoded': tf.io.FixedLenFeature((), tf.string), - 'image/height': tf.io.FixedLenFeature((), tf.int64), - 'image/width': tf.io.FixedLenFeature((), tf.int64), - 'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32), - 'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32), - 'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32), - 'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32), - 'image/object/class/label': tf.io.VarLenFeature(tf.int64), - 'image/object/area': tf.io.VarLenFeature(tf.float32), - 'image/object/is_crowd': tf.io.VarLenFeature(tf.int64), - } - self._mask_binarize_threshold = mask_binarize_threshold - if include_mask: - self._keys_to_features.update({ - 'image/object/mask': tf.io.VarLenFeature(tf.string), - }) - if not regenerate_source_id: - self._keys_to_features.update({ - 'image/source_id': tf.io.FixedLenFeature((), tf.string), - }) - - def _decode_image(self, parsed_tensors): - """Decodes the image and set its static shape.""" - image = tf.io.decode_image(parsed_tensors['image/encoded'], channels=3) - image.set_shape([None, None, 3]) - return image - - def _decode_boxes(self, parsed_tensors): - """Concat box coordinates in the format of [ymin, xmin, ymax, xmax].""" - xmin = parsed_tensors['image/object/bbox/xmin'] - xmax = parsed_tensors['image/object/bbox/xmax'] - ymin = parsed_tensors['image/object/bbox/ymin'] - ymax = parsed_tensors['image/object/bbox/ymax'] - return tf.stack([ymin, xmin, ymax, xmax], axis=-1) - - def _decode_classes(self, parsed_tensors): - return parsed_tensors['image/object/class/label'] - - def _decode_areas(self, parsed_tensors): - xmin = parsed_tensors['image/object/bbox/xmin'] - xmax = parsed_tensors['image/object/bbox/xmax'] - ymin = parsed_tensors['image/object/bbox/ymin'] - ymax = parsed_tensors['image/object/bbox/ymax'] - height = tf.cast(parsed_tensors['image/height'], dtype=tf.float32) - width = tf.cast(parsed_tensors['image/width'], dtype=tf.float32) - return tf.cond( - tf.greater(tf.shape(parsed_tensors['image/object/area'])[0], 0), - lambda: parsed_tensors['image/object/area'], - lambda: (xmax - xmin) * (ymax - ymin) * height * width) - - def _decode_masks(self, parsed_tensors): - """Decode a set of PNG masks to the tf.float32 tensors.""" - - def _decode_png_mask(png_bytes): - mask = tf.squeeze( - tf.io.decode_png(png_bytes, channels=1, dtype=tf.uint8), axis=-1) - mask = tf.cast(mask, dtype=tf.float32) - mask.set_shape([None, None]) - return mask - - height = parsed_tensors['image/height'] - width = parsed_tensors['image/width'] - masks = parsed_tensors['image/object/mask'] - return tf.cond( - pred=tf.greater(tf.size(input=masks), 0), - true_fn=lambda: tf.map_fn(_decode_png_mask, masks, dtype=tf.float32), - false_fn=lambda: tf.zeros([0, height, width], dtype=tf.float32)) - - def decode(self, serialized_example): - """Decode the serialized example. - - Args: - serialized_example: a single serialized tf.Example string. - - Returns: - decoded_tensors: a dictionary of tensors with the following fields: - - source_id: a string scalar tensor. - - image: a uint8 tensor of shape [None, None, 3]. 
- - height: an integer scalar tensor. - - width: an integer scalar tensor. - - groundtruth_classes: a int64 tensor of shape [None]. - - groundtruth_is_crowd: a bool tensor of shape [None]. - - groundtruth_area: a float32 tensor of shape [None]. - - groundtruth_boxes: a float32 tensor of shape [None, 4]. - - groundtruth_instance_masks: a float32 tensor of shape - [None, None, None]. - - groundtruth_instance_masks_png: a string tensor of shape [None]. - """ - parsed_tensors = tf.io.parse_single_example( - serialized=serialized_example, features=self._keys_to_features) - for k in parsed_tensors: - if isinstance(parsed_tensors[k], tf.SparseTensor): - if parsed_tensors[k].dtype == tf.string: - parsed_tensors[k] = tf.sparse.to_dense( - parsed_tensors[k], default_value='') - else: - parsed_tensors[k] = tf.sparse.to_dense( - parsed_tensors[k], default_value=0) - - if self._regenerate_source_id: - source_id = _generate_source_id(parsed_tensors['image/encoded']) - else: - source_id = tf.cond( - tf.greater(tf.strings.length(parsed_tensors['image/source_id']), 0), - lambda: parsed_tensors['image/source_id'], - lambda: _generate_source_id(parsed_tensors['image/encoded'])) - image = self._decode_image(parsed_tensors) - boxes = self._decode_boxes(parsed_tensors) - classes = self._decode_classes(parsed_tensors) - areas = self._decode_areas(parsed_tensors) - is_crowds = tf.cond( - tf.greater(tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0), - lambda: tf.cast(parsed_tensors['image/object/is_crowd'], dtype=tf.bool), - lambda: tf.zeros_like(classes, dtype=tf.bool)) - if self._include_mask: - masks = self._decode_masks(parsed_tensors) - - if self._mask_binarize_threshold is not None: - masks = tf.cast(masks > self._mask_binarize_threshold, tf.float32) - - decoded_tensors = { - 'source_id': source_id, - 'image': image, - 'height': parsed_tensors['image/height'], - 'width': parsed_tensors['image/width'], - 'groundtruth_classes': classes, - 'groundtruth_is_crowd': is_crowds, - 'groundtruth_area': areas, - 'groundtruth_boxes': boxes, - } - if self._include_mask: - decoded_tensors.update({ - 'groundtruth_instance_masks': masks, - 'groundtruth_instance_masks_png': parsed_tensors['image/object/mask'], - }) - return decoded_tensors diff --git a/official/vision/beta/dataloaders/tf_example_decoder_test.py b/official/vision/beta/dataloaders/tf_example_decoder_test.py deleted file mode 100644 index 187285f1b..000000000 --- a/official/vision/beta/dataloaders/tf_example_decoder_test.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
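(Editor's note, not part of the patch: before the tests that follow, a minimal usage sketch of the `TfExampleDecoder` defined above. Only the required fixed-length fields are populated; the variable-length box/class/area fields are left empty, and all values here are illustrative.)

```python
import numpy as np
import tensorflow as tf

from official.vision.beta.dataloaders import tf_example_decoder

# A tiny hand-built tf.Example with only the required fixed-length features.
image_bytes = tf.io.encode_png(np.zeros((4, 4, 3), dtype=np.uint8)).numpy()
example = tf.train.Example(features=tf.train.Features(feature={
    'image/encoded': tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[image_bytes])),
    'image/height': tf.train.Feature(int64_list=tf.train.Int64List(value=[4])),
    'image/width': tf.train.Feature(int64_list=tf.train.Int64List(value=[4])),
    'image/source_id': tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[b'123'])),
}))

decoder = tf_example_decoder.TfExampleDecoder(include_mask=False)
decoded = decoder.decode(tf.convert_to_tensor(example.SerializeToString()))
# decoded['image'] is a [4, 4, 3] uint8 tensor; 'groundtruth_boxes' and
# 'groundtruth_classes' decode to empty tensors since no objects were set.
```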
- -"""Tests for tf_example_decoder.py.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.vision.beta.dataloaders import tf_example_decoder -from official.vision.beta.dataloaders import tfexample_utils - - -class TfExampleDecoderTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters( - (100, 100, 0, True), - (100, 100, 1, True), - (100, 100, 2, True), - (100, 100, 0, False), - (100, 100, 1, False), - (100, 100, 2, False), - ) - def test_result_shape(self, - image_height, - image_width, - num_instances, - regenerate_source_id): - decoder = tf_example_decoder.TfExampleDecoder( - include_mask=True, regenerate_source_id=regenerate_source_id) - - serialized_example = tfexample_utils.create_detection_test_example( - image_height=image_height, - image_width=image_width, - image_channel=3, - num_instances=num_instances).SerializeToString() - decoded_tensors = decoder.decode( - tf.convert_to_tensor(value=serialized_example)) - - results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors) - - self.assertAllEqual( - (image_height, image_width, 3), results['image'].shape) - if not regenerate_source_id: - self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id']) - self.assertEqual(image_height, results['height']) - self.assertEqual(image_width, results['width']) - self.assertAllEqual( - (num_instances,), results['groundtruth_classes'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_is_crowd'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_area'].shape) - self.assertAllEqual( - (num_instances, 4), results['groundtruth_boxes'].shape) - self.assertAllEqual( - (num_instances, image_height, image_width), - results['groundtruth_instance_masks'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_instance_masks_png'].shape) - - def test_result_content(self): - decoder = tf_example_decoder.TfExampleDecoder(include_mask=True) - - image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], - [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]], - [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]], - [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]] - image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG') - image_height = 4 - image_width = 4 - num_instances = 2 - xmins = [0, 0.25] - xmaxs = [0.5, 1.0] - ymins = [0, 0] - ymaxs = [0.5, 1.0] - labels = [3, 1] - areas = [ - 0.25 * image_height * image_width, 0.75 * image_height * image_width - ] - is_crowds = [1, 0] - mask_content = [[[255, 255, 0, 0], - [255, 255, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 0]], - [[0, 255, 255, 255], - [0, 255, 255, 255], - [0, 255, 255, 255], - [0, 255, 255, 255]]] - masks = [ - tfexample_utils.encode_image(np.uint8(m), fmt='PNG') - for m in list(mask_content) - ] - serialized_example = tf.train.Example( - features=tf.train.Features( - feature={ - 'image/encoded': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[image]))), - 'image/source_id': (tf.train.Feature( - bytes_list=tf.train.BytesList( - value=[tfexample_utils.DUMP_SOURCE_ID]))), - 'image/height': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_height]))), - 'image/width': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_width]))), - 'image/object/bbox/xmin': (tf.train.Feature( - float_list=tf.train.FloatList(value=xmins))), - 'image/object/bbox/xmax': (tf.train.Feature( - float_list=tf.train.FloatList(value=xmaxs))), - 
'image/object/bbox/ymin': (tf.train.Feature( - float_list=tf.train.FloatList(value=ymins))), - 'image/object/bbox/ymax': (tf.train.Feature( - float_list=tf.train.FloatList(value=ymaxs))), - 'image/object/class/label': (tf.train.Feature( - int64_list=tf.train.Int64List(value=labels))), - 'image/object/is_crowd': (tf.train.Feature( - int64_list=tf.train.Int64List(value=is_crowds))), - 'image/object/area': (tf.train.Feature( - float_list=tf.train.FloatList(value=areas))), - 'image/object/mask': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=masks))), - })).SerializeToString() - decoded_tensors = decoder.decode( - tf.convert_to_tensor(value=serialized_example)) - - results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors) - - self.assertAllEqual( - (image_height, image_width, 3), results['image'].shape) - self.assertAllEqual(image_content, results['image']) - self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id']) - self.assertEqual(image_height, results['height']) - self.assertEqual(image_width, results['width']) - self.assertAllEqual( - (num_instances,), results['groundtruth_classes'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_is_crowd'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_area'].shape) - self.assertAllEqual( - (num_instances, 4), results['groundtruth_boxes'].shape) - self.assertAllEqual( - (num_instances, image_height, image_width), - results['groundtruth_instance_masks'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_instance_masks_png'].shape) - self.assertAllEqual( - [3, 1], results['groundtruth_classes']) - self.assertAllEqual( - [True, False], results['groundtruth_is_crowd']) - self.assertNDArrayNear( - [0.25 * image_height * image_width, 0.75 * image_height * image_width], - results['groundtruth_area'], 1e-4) - self.assertNDArrayNear( - [[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]], - results['groundtruth_boxes'], 1e-4) - self.assertNDArrayNear( - mask_content, results['groundtruth_instance_masks'], 1e-4) - self.assertAllEqual( - masks, results['groundtruth_instance_masks_png']) - - def test_handling_missing_fields(self): - decoder = tf_example_decoder.TfExampleDecoder(include_mask=True) - - image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], - [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]], - [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]], - [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]] - image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG') - image_height = 4 - image_width = 4 - num_instances = 2 - xmins = [0, 0.25] - xmaxs = [0.5, 1.0] - ymins = [0, 0] - ymaxs = [0.5, 1.0] - labels = [3, 1] - mask_content = [[[255, 255, 0, 0], - [255, 255, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 0]], - [[0, 255, 255, 255], - [0, 255, 255, 255], - [0, 255, 255, 255], - [0, 255, 255, 255]]] - masks = [ - tfexample_utils.encode_image(np.uint8(m), fmt='PNG') - for m in list(mask_content) - ] - serialized_example = tf.train.Example( - features=tf.train.Features( - feature={ - 'image/encoded': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[image]))), - 'image/source_id': (tf.train.Feature( - bytes_list=tf.train.BytesList( - value=[tfexample_utils.DUMP_SOURCE_ID]))), - 'image/height': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_height]))), - 'image/width': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_width]))), - 'image/object/bbox/xmin': (tf.train.Feature( - 
float_list=tf.train.FloatList(value=xmins))), - 'image/object/bbox/xmax': (tf.train.Feature( - float_list=tf.train.FloatList(value=xmaxs))), - 'image/object/bbox/ymin': (tf.train.Feature( - float_list=tf.train.FloatList(value=ymins))), - 'image/object/bbox/ymax': (tf.train.Feature( - float_list=tf.train.FloatList(value=ymaxs))), - 'image/object/class/label': (tf.train.Feature( - int64_list=tf.train.Int64List(value=labels))), - 'image/object/mask': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=masks))), - })).SerializeToString() - decoded_tensors = decoder.decode( - tf.convert_to_tensor(serialized_example)) - results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors) - - self.assertAllEqual( - (image_height, image_width, 3), results['image'].shape) - self.assertAllEqual(image_content, results['image']) - self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id']) - self.assertEqual(image_height, results['height']) - self.assertEqual(image_width, results['width']) - self.assertAllEqual( - (num_instances,), results['groundtruth_classes'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_is_crowd'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_area'].shape) - self.assertAllEqual( - (num_instances, 4), results['groundtruth_boxes'].shape) - self.assertAllEqual( - (num_instances, image_height, image_width), - results['groundtruth_instance_masks'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_instance_masks_png'].shape) - self.assertAllEqual( - [3, 1], results['groundtruth_classes']) - self.assertAllEqual( - [False, False], results['groundtruth_is_crowd']) - self.assertNDArrayNear( - [0.25 * image_height * image_width, 0.75 * image_height * image_width], - results['groundtruth_area'], 1e-4) - self.assertNDArrayNear( - [[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]], - results['groundtruth_boxes'], 1e-4) - self.assertNDArrayNear( - mask_content, results['groundtruth_instance_masks'], 1e-4) - self.assertAllEqual( - masks, results['groundtruth_instance_masks_png']) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/dataloaders/tf_example_label_map_decoder.py b/official/vision/beta/dataloaders/tf_example_label_map_decoder.py deleted file mode 100644 index c6da29ded..000000000 --- a/official/vision/beta/dataloaders/tf_example_label_map_decoder.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tensorflow Example proto decoder for object detection. - -A decoder to decode string tensors containing serialized tensorflow.Example -protos for object detection. 
-""" -import csv -# Import libraries -import tensorflow as tf - -from official.vision.beta.dataloaders import tf_example_decoder - - -class TfExampleDecoderLabelMap(tf_example_decoder.TfExampleDecoder): - """Tensorflow Example proto decoder.""" - - def __init__(self, label_map, include_mask=False, regenerate_source_id=False, - mask_binarize_threshold=None): - super(TfExampleDecoderLabelMap, self).__init__( - include_mask=include_mask, regenerate_source_id=regenerate_source_id, - mask_binarize_threshold=mask_binarize_threshold) - self._keys_to_features.update({ - 'image/object/class/text': tf.io.VarLenFeature(tf.string), - }) - name_to_id = self._process_label_map(label_map) - self._name_to_id_table = tf.lookup.StaticHashTable( - tf.lookup.KeyValueTensorInitializer( - keys=tf.constant(list(name_to_id.keys()), dtype=tf.string), - values=tf.constant(list(name_to_id.values()), dtype=tf.int64)), - default_value=-1) - - def _process_label_map(self, label_map): - if label_map.endswith('.csv'): - name_to_id = self._process_csv(label_map) - else: - raise ValueError('The label map file is in incorrect format.') - return name_to_id - - def _process_csv(self, label_map): - name_to_id = {} - with tf.io.gfile.GFile(label_map, 'r') as f: - reader = csv.reader(f, delimiter=',') - for row in reader: - if len(row) != 2: - raise ValueError('Each row of the csv label map file must be in ' - '`id,name` format. length = {}'.format(len(row))) - id_index = int(row[0]) - name = row[1] - name_to_id[name] = id_index - return name_to_id - - def _decode_classes(self, parsed_tensors): - return self._name_to_id_table.lookup( - parsed_tensors['image/object/class/text']) diff --git a/official/vision/beta/dataloaders/tf_example_label_map_decoder_test.py b/official/vision/beta/dataloaders/tf_example_label_map_decoder_test.py deleted file mode 100644 index 54d7810f6..000000000 --- a/official/vision/beta/dataloaders/tf_example_label_map_decoder_test.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
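(Editor's note, not part of the patch: a short sketch of the CSV label map that `TfExampleDecoderLabelMap` above expects, each row in `id,name` form; the path is illustrative, and the test below builds the same content in a temp directory.)

```python
import tensorflow as tf

from official.vision.beta.dataloaders import tf_example_label_map_decoder

label_map_path = '/tmp/label_map.csv'  # illustrative path
with tf.io.gfile.GFile(label_map_path, 'w') as f:
  f.write('0,class_0\n1,class_1\n2,class_2')

decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
    label_map_path, include_mask=False)
# 'image/object/class/text' entries such as b'class_2' are now mapped to
# integer ids (2 here) through the static hash table built in __init__.
```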
- -"""Tests for tf_example_label_map_decoder.py.""" - -import os -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.vision.beta.dataloaders import tf_example_label_map_decoder -from official.vision.beta.dataloaders import tfexample_utils - - -LABEL_MAP_CSV_CONTENT = '0,class_0\n1,class_1\n2,class_2' - - -class TfExampleDecoderLabelMapTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters( - (100, 100, 0), - (100, 100, 1), - (100, 100, 2), - (100, 100, 0), - (100, 100, 1), - (100, 100, 2), - ) - def test_result_shape(self, image_height, image_width, num_instances): - label_map_dir = self.get_temp_dir() - label_map_name = 'label_map.csv' - label_map_path = os.path.join(label_map_dir, label_map_name) - with open(label_map_path, 'w') as f: - f.write(LABEL_MAP_CSV_CONTENT) - - decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap( - label_map_path, include_mask=True) - - serialized_example = tfexample_utils.create_detection_test_example( - image_height=image_height, - image_width=image_width, - image_channel=3, - num_instances=num_instances).SerializeToString() - decoded_tensors = decoder.decode( - tf.convert_to_tensor(value=serialized_example)) - - results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors) - - self.assertAllEqual( - (image_height, image_width, 3), results['image'].shape) - self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id']) - self.assertEqual(image_height, results['height']) - self.assertEqual(image_width, results['width']) - self.assertAllEqual( - (num_instances,), results['groundtruth_classes'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_is_crowd'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_area'].shape) - self.assertAllEqual( - (num_instances, 4), results['groundtruth_boxes'].shape) - self.assertAllEqual( - (num_instances, image_height, image_width), - results['groundtruth_instance_masks'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_instance_masks_png'].shape) - - def test_result_content(self): - label_map_dir = self.get_temp_dir() - label_map_name = 'label_map.csv' - label_map_path = os.path.join(label_map_dir, label_map_name) - with open(label_map_path, 'w') as f: - f.write(LABEL_MAP_CSV_CONTENT) - - decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap( - label_map_path, include_mask=True) - - image_content = [[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], - [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]], - [[0, 0, 0], [255, 255, 255], [255, 255, 255], [0, 0, 0]], - [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]] - image = tfexample_utils.encode_image(np.uint8(image_content), fmt='PNG') - image_height = 4 - image_width = 4 - num_instances = 2 - xmins = [0, 0.25] - xmaxs = [0.5, 1.0] - ymins = [0, 0] - ymaxs = [0.5, 1.0] - labels = [b'class_2', b'class_0'] - areas = [ - 0.25 * image_height * image_width, 0.75 * image_height * image_width - ] - is_crowds = [1, 0] - mask_content = [[[255, 255, 0, 0], - [255, 255, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 0]], - [[0, 255, 255, 255], - [0, 255, 255, 255], - [0, 255, 255, 255], - [0, 255, 255, 255]]] - masks = [ - tfexample_utils.encode_image(np.uint8(m), fmt='PNG') - for m in list(mask_content) - ] - serialized_example = tf.train.Example( - features=tf.train.Features( - feature={ - 'image/encoded': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[image]))), - 'image/source_id': 
(tf.train.Feature( - bytes_list=tf.train.BytesList( - value=[tfexample_utils.DUMP_SOURCE_ID]))), - 'image/height': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_height]))), - 'image/width': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_width]))), - 'image/object/bbox/xmin': (tf.train.Feature( - float_list=tf.train.FloatList(value=xmins))), - 'image/object/bbox/xmax': (tf.train.Feature( - float_list=tf.train.FloatList(value=xmaxs))), - 'image/object/bbox/ymin': (tf.train.Feature( - float_list=tf.train.FloatList(value=ymins))), - 'image/object/bbox/ymax': (tf.train.Feature( - float_list=tf.train.FloatList(value=ymaxs))), - 'image/object/class/text': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=labels))), - 'image/object/is_crowd': (tf.train.Feature( - int64_list=tf.train.Int64List(value=is_crowds))), - 'image/object/area': (tf.train.Feature( - float_list=tf.train.FloatList(value=areas))), - 'image/object/mask': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=masks))), - })).SerializeToString() - decoded_tensors = decoder.decode( - tf.convert_to_tensor(value=serialized_example)) - - results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors) - - self.assertAllEqual( - (image_height, image_width, 3), results['image'].shape) - self.assertAllEqual(image_content, results['image']) - self.assertEqual(tfexample_utils.DUMP_SOURCE_ID, results['source_id']) - self.assertEqual(image_height, results['height']) - self.assertEqual(image_width, results['width']) - self.assertAllEqual( - (num_instances,), results['groundtruth_classes'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_is_crowd'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_area'].shape) - self.assertAllEqual( - (num_instances, 4), results['groundtruth_boxes'].shape) - self.assertAllEqual( - (num_instances, image_height, image_width), - results['groundtruth_instance_masks'].shape) - self.assertAllEqual( - (num_instances,), results['groundtruth_instance_masks_png'].shape) - self.assertAllEqual( - [2, 0], results['groundtruth_classes']) - self.assertAllEqual( - [True, False], results['groundtruth_is_crowd']) - self.assertNDArrayNear( - [0.25 * image_height * image_width, 0.75 * image_height * image_width], - results['groundtruth_area'], 1e-4) - self.assertNDArrayNear( - [[0, 0, 0.5, 0.5], [0, 0.25, 1.0, 1.0]], - results['groundtruth_boxes'], 1e-4) - self.assertNDArrayNear( - mask_content, results['groundtruth_instance_masks'], 1e-4) - self.assertAllEqual( - masks, results['groundtruth_instance_masks_png']) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/dataloaders/tfds_classification_decoders.py b/official/vision/beta/dataloaders/tfds_classification_decoders.py deleted file mode 100644 index cfb315590..000000000 --- a/official/vision/beta/dataloaders/tfds_classification_decoders.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""TFDS Classification decoders.""" - -import tensorflow as tf -from official.vision.beta.dataloaders import decoder - - -class ClassificationDecorder(decoder.Decoder): - """A tf.Example decoder for tfds classification datasets.""" - - def decode(self, serialized_example): - sample_dict = { - 'image/encoded': - tf.io.encode_jpeg(serialized_example['image'], quality=100), - 'image/class/label': - serialized_example['label'], - } - return sample_dict - - -TFDS_ID_TO_DECODER_MAP = { - 'cifar10': ClassificationDecorder, - 'cifar100': ClassificationDecorder, - 'imagenet2012': ClassificationDecorder, -} diff --git a/official/vision/beta/dataloaders/tfds_detection_decoders.py b/official/vision/beta/dataloaders/tfds_detection_decoders.py deleted file mode 100644 index 0ecf88ba9..000000000 --- a/official/vision/beta/dataloaders/tfds_detection_decoders.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""TFDS detection decoders.""" - -import tensorflow as tf -from official.vision.beta.dataloaders import decoder - - -class MSCOCODecoder(decoder.Decoder): - """A tf.Example decoder for tfds coco datasets.""" - - def decode(self, serialized_example): - """Decode the serialized example. - - Args: - serialized_example: a dictonary example produced by tfds. - - Returns: - decoded_tensors: a dictionary of tensors with the following fields: - - source_id: a string scalar tensor. - - image: a uint8 tensor of shape [None, None, 3]. - - height: an integer scalar tensor. - - width: an integer scalar tensor. - - groundtruth_classes: a int64 tensor of shape [None]. - - groundtruth_is_crowd: a bool tensor of shape [None]. - - groundtruth_area: a float32 tensor of shape [None]. - - groundtruth_boxes: a float32 tensor of shape [None, 4]. - """ - - decoded_tensors = { - 'source_id': tf.strings.as_string(serialized_example['image/id']), - 'image': serialized_example['image'], - 'height': tf.cast(tf.shape(serialized_example['image'])[0], tf.int64), - 'width': tf.cast(tf.shape(serialized_example['image'])[1], tf.int64), - 'groundtruth_classes': serialized_example['objects']['label'], - 'groundtruth_is_crowd': serialized_example['objects']['is_crowd'], - 'groundtruth_area': tf.cast( - serialized_example['objects']['area'], tf.float32), - 'groundtruth_boxes': serialized_example['objects']['bbox'], - } - return decoded_tensors - - -TFDS_ID_TO_DECODER_MAP = { - 'coco/2017': MSCOCODecoder, - 'coco/2014': MSCOCODecoder, - 'coco': MSCOCODecoder -} diff --git a/official/vision/beta/dataloaders/tfds_factory.py b/official/vision/beta/dataloaders/tfds_factory.py deleted file mode 100644 index bedaff491..000000000 --- a/official/vision/beta/dataloaders/tfds_factory.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""TFDS factory functions.""" -from official.vision.beta.dataloaders import decoder as base_decoder -from official.vision.beta.dataloaders import tfds_detection_decoders -from official.vision.beta.dataloaders import tfds_segmentation_decoders -from official.vision.beta.dataloaders import tfds_classification_decoders - - -def get_classification_decoder(tfds_name: str) -> base_decoder.Decoder: - """Gets classification decoder. - - Args: - tfds_name: `str`, name of the tfds classification decoder. - Returns: - `base_decoder.Decoder` instance. - Raises: - ValueError if the tfds_name doesn't exist in the available decoders. - """ - if tfds_name in tfds_classification_decoders.TFDS_ID_TO_DECODER_MAP: - decoder = tfds_classification_decoders.TFDS_ID_TO_DECODER_MAP[tfds_name]() - else: - raise ValueError( - f'TFDS Classification {tfds_name} is not supported') - return decoder - - -def get_detection_decoder(tfds_name: str) -> base_decoder.Decoder: - """Gets detection decoder. - - Args: - tfds_name: `str`, name of the tfds detection decoder. - Returns: - `base_decoder.Decoder` instance. - Raises: - ValueError if the tfds_name doesn't exist in the available decoders. - """ - if tfds_name in tfds_detection_decoders.TFDS_ID_TO_DECODER_MAP: - decoder = tfds_detection_decoders.TFDS_ID_TO_DECODER_MAP[tfds_name]() - else: - raise ValueError(f'TFDS Detection {tfds_name} is not supported') - return decoder - - -def get_segmentation_decoder(tfds_name: str) -> base_decoder.Decoder: - """Gets segmentation decoder. - - Args: - tfds_name: `str`, name of the tfds segmentation decoder. - Returns: - `base_decoder.Decoder` instance. - Raises: - ValueError if the tfds_name doesn't exist in the available decoders. - """ - if tfds_name in tfds_segmentation_decoders.TFDS_ID_TO_DECODER_MAP: - decoder = tfds_segmentation_decoders.TFDS_ID_TO_DECODER_MAP[tfds_name]() - else: - raise ValueError(f'TFDS Segmentation {tfds_name} is not supported') - return decoder diff --git a/official/vision/beta/dataloaders/tfds_factory_test.py b/official/vision/beta/dataloaders/tfds_factory_test.py deleted file mode 100644 index 7629b9423..000000000 --- a/official/vision/beta/dataloaders/tfds_factory_test.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
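(Editor's note, not part of the patch: a minimal sketch of how the factory functions above are typically called; the supported dataset names come from the `TFDS_ID_TO_DECODER_MAP` tables in the decoder modules, and unsupported names raise `ValueError`, as the tests below exercise.)

```python
from official.vision.beta.dataloaders import tfds_factory

# Each call returns a Decoder instance keyed by the TFDS dataset name.
classification_decoder = tfds_factory.get_classification_decoder('cifar10')
detection_decoder = tfds_factory.get_detection_decoder('coco/2017')
segmentation_decoder = tfds_factory.get_segmentation_decoder('cityscapes')
```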
- -"""Tests for tfds factory functions.""" - -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.dataloaders import decoder as base_decoder -from official.vision.beta.dataloaders import tfds_factory - - -class TFDSFactoryTest(tf.test.TestCase, parameterized.TestCase): - - def _create_test_example(self): - serialized_example = { - 'image': tf.ones(shape=(100, 100, 3), dtype=tf.uint8), - 'label': 1, - 'image/id': 0, - 'objects': { - 'label': 1, - 'is_crowd': 0, - 'area': 0.5, - 'bbox': [0.1, 0.2, 0.3, 0.4] - }, - 'segmentation_label': tf.ones((100, 100, 1), dtype=tf.uint8), - 'image_left': tf.ones(shape=(100, 100, 3), dtype=tf.uint8) - } - return serialized_example - - @parameterized.parameters( - ('imagenet2012'), - ('cifar10'), - ('cifar100'), - ) - def test_classification_decoder(self, tfds_name): - decoder = tfds_factory.get_classification_decoder(tfds_name) - self.assertIsInstance(decoder, base_decoder.Decoder) - decoded_tensor = decoder.decode(self._create_test_example()) - self.assertLen(decoded_tensor, 2) - self.assertIn('image/encoded', decoded_tensor) - self.assertIn('image/class/label', decoded_tensor) - - @parameterized.parameters( - ('flowers'), - ('coco'), - ) - def test_doesnt_exit_classification_decoder(self, tfds_name): - with self.assertRaises(ValueError): - _ = tfds_factory.get_classification_decoder(tfds_name) - - @parameterized.parameters( - ('coco'), - ('coco/2014'), - ('coco/2017'), - ) - def test_detection_decoder(self, tfds_name): - decoder = tfds_factory.get_detection_decoder(tfds_name) - self.assertIsInstance(decoder, base_decoder.Decoder) - decoded_tensor = decoder.decode(self._create_test_example()) - self.assertLen(decoded_tensor, 8) - self.assertIn('image', decoded_tensor) - self.assertIn('source_id', decoded_tensor) - self.assertIn('height', decoded_tensor) - self.assertIn('width', decoded_tensor) - self.assertIn('groundtruth_classes', decoded_tensor) - self.assertIn('groundtruth_is_crowd', decoded_tensor) - self.assertIn('groundtruth_area', decoded_tensor) - self.assertIn('groundtruth_boxes', decoded_tensor) - - @parameterized.parameters( - ('pascal'), - ('cityscapes'), - ) - def test_doesnt_exit_detection_decoder(self, tfds_name): - with self.assertRaises(ValueError): - _ = tfds_factory.get_detection_decoder(tfds_name) - - @parameterized.parameters( - ('cityscapes'), - ('cityscapes/semantic_segmentation'), - ('cityscapes/semantic_segmentation_extra'), - ) - def test_segmentation_decoder(self, tfds_name): - decoder = tfds_factory.get_segmentation_decoder(tfds_name) - self.assertIsInstance(decoder, base_decoder.Decoder) - decoded_tensor = decoder.decode(self._create_test_example()) - self.assertLen(decoded_tensor, 4) - self.assertIn('image/encoded', decoded_tensor) - self.assertIn('image/segmentation/class/encoded', decoded_tensor) - self.assertIn('image/height', decoded_tensor) - self.assertIn('image/width', decoded_tensor) - - @parameterized.parameters( - ('coco'), - ('imagenet'), - ) - def test_doesnt_exit_segmentation_decoder(self, tfds_name): - with self.assertRaises(ValueError): - _ = tfds_factory.get_segmentation_decoder(tfds_name) - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/dataloaders/tfds_segmentation_decoders.py b/official/vision/beta/dataloaders/tfds_segmentation_decoders.py deleted file mode 100644 index 8d29b836a..000000000 --- a/official/vision/beta/dataloaders/tfds_segmentation_decoders.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""TFDS Semantic Segmentation decoders.""" - -import tensorflow as tf -from official.vision.beta.dataloaders import decoder - - -class CityScapesDecorder(decoder.Decoder): - """A tf.Example decoder for tfds cityscapes datasets.""" - - def __init__(self): - # Original labels to trainable labels map, 255 is the ignore class. - self._label_map = { - -1: 255, - 0: 255, - 1: 255, - 2: 255, - 3: 255, - 4: 255, - 5: 255, - 6: 255, - 7: 0, - 8: 1, - 9: 255, - 10: 255, - 11: 2, - 12: 3, - 13: 4, - 14: 255, - 15: 255, - 16: 255, - 17: 5, - 18: 255, - 19: 6, - 20: 7, - 21: 8, - 22: 9, - 23: 10, - 24: 11, - 25: 12, - 26: 13, - 27: 14, - 28: 15, - 29: 255, - 30: 255, - 31: 16, - 32: 17, - 33: 18, - } - - def decode(self, serialized_example): - # Convert labels according to the self._label_map - label = serialized_example['segmentation_label'] - for original_label in self._label_map: - label = tf.where(label == original_label, - self._label_map[original_label] * tf.ones_like(label), - label) - sample_dict = { - 'image/encoded': - tf.io.encode_jpeg(serialized_example['image_left'], quality=100), - 'image/height': serialized_example['image_left'].shape[0], - 'image/width': serialized_example['image_left'].shape[1], - 'image/segmentation/class/encoded': - tf.io.encode_png(label), - } - return sample_dict - - -TFDS_ID_TO_DECODER_MAP = { - 'cityscapes': CityScapesDecorder, - 'cityscapes/semantic_segmentation': CityScapesDecorder, - 'cityscapes/semantic_segmentation_extra': CityScapesDecorder, -} diff --git a/official/vision/beta/dataloaders/tfexample_utils.py b/official/vision/beta/dataloaders/tfexample_utils.py deleted file mode 100644 index cee17f170..000000000 --- a/official/vision/beta/dataloaders/tfexample_utils.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utility functions to create tf.Example and tf.SequnceExample for test. - -Example:video classification end-to-end test -i.e. from reading input file to train and eval. - -```python -class FooTrainTest(tf.test.TestCase): - - def setUp(self): - super(TrainTest, self).setUp() - - # Write the fake tf.train.SequenceExample to file for test. 
- data_dir = os.path.join(self.get_temp_dir(), 'data') - tf.io.gfile.makedirs(data_dir) - self._data_path = os.path.join(data_dir, 'data.tfrecord') - examples = [ - tfexample_utils.make_video_test_example( - image_shape=(36, 36, 3), - audio_shape=(20, 128), - label=random.randint(0, 100)) for _ in range(2) - ] - tfexample_utils.dump_to_tfrecord(self._data_path, tf_examples=examples) - - def test_foo(self): - dataset = tf.data.TFRecordDataset(self._data_path) - ... - -``` - -""" -import io -from typing import Sequence, Union - -import numpy as np -from PIL import Image -import tensorflow as tf - -IMAGE_KEY = 'image/encoded' -CLASSIFICATION_LABEL_KEY = 'image/class/label' -LABEL_KEY = 'clip/label/index' -AUDIO_KEY = 'features/audio' -DUMP_SOURCE_ID = b'123' - - -def encode_image(image_array: np.array, fmt: str) -> bytes: - image = Image.fromarray(image_array) - with io.BytesIO() as output: - image.save(output, format=fmt) - return output.getvalue() - - -def make_image_bytes(shape: Sequence[int], fmt: str = 'JPEG') -> bytes: - """Generates image and return bytes in specified format.""" - random_image = np.random.randint(0, 256, size=shape, dtype=np.uint8) - return encode_image(random_image, fmt=fmt) - - -def put_int64_to_context(seq_example: tf.train.SequenceExample, - label: int = 0, - key: str = LABEL_KEY): - """Puts int64 to SequenceExample context with key.""" - seq_example.context.feature[key].int64_list.value[:] = [label] - - -def put_bytes_list_to_feature(seq_example: tf.train.SequenceExample, - raw_image_bytes: bytes, - key: str = IMAGE_KEY, - repeat_num: int = 2): - """Puts bytes list to SequenceExample context with key.""" - for _ in range(repeat_num): - seq_example.feature_lists.feature_list.get_or_create( - key).feature.add().bytes_list.value[:] = [raw_image_bytes] - - -def put_float_list_to_feature(seq_example: tf.train.SequenceExample, - value: Sequence[Sequence[float]], key: str): - """Puts float list to SequenceExample context with key.""" - for s in value: - seq_example.feature_lists.feature_list.get_or_create( - key).feature.add().float_list.value[:] = s - - -def make_video_test_example(image_shape: Sequence[int] = (263, 320, 3), - audio_shape: Sequence[int] = (10, 256), - label: int = 42): - """Generates data for testing video models (inc. 
RGB, audio, & label).""" - raw_image_bytes = make_image_bytes(shape=image_shape) - random_audio = np.random.normal(size=audio_shape).tolist() - - seq_example = tf.train.SequenceExample() - put_int64_to_context(seq_example, label=label, key=LABEL_KEY) - put_bytes_list_to_feature( - seq_example, raw_image_bytes, key=IMAGE_KEY, repeat_num=4) - - put_float_list_to_feature(seq_example, value=random_audio, key=AUDIO_KEY) - return seq_example - - -def dump_to_tfrecord(record_file: str, - tf_examples: Sequence[Union[tf.train.Example, - tf.train.SequenceExample]]): - """Writes serialized Example to TFRecord file with path.""" - with tf.io.TFRecordWriter(record_file) as writer: - for tf_example in tf_examples: - writer.write(tf_example.SerializeToString()) - - -def _encode_image(image_array: np.ndarray, fmt: str) -> bytes: - """Util function to encode an image.""" - image = Image.fromarray(image_array) - with io.BytesIO() as output: - image.save(output, format=fmt) - return output.getvalue() - - -def create_classification_example( - image_height: int, - image_width: int, - image_format: str = 'JPEG', - is_multilabel: bool = False) -> tf.train.Example: - """Creates image and labels for image classification input pipeline.""" - image = _encode_image( - np.uint8(np.random.rand(image_height, image_width, 3) * 255), - fmt=image_format) - labels = [0, 1] if is_multilabel else [0] - serialized_example = tf.train.Example( - features=tf.train.Features( - feature={ - IMAGE_KEY: (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[image]))), - CLASSIFICATION_LABEL_KEY: (tf.train.Feature( - int64_list=tf.train.Int64List(value=labels))), - })).SerializeToString() - return serialized_example - - -def create_3d_image_test_example(image_height: int, image_width: int, - image_volume: int, - image_channel: int) -> tf.train.Example: - """Creates 3D image and label.""" - images = np.random.rand(image_height, image_width, image_volume, - image_channel) - images = images.astype(np.float32) - - labels = np.random.randint( - low=2, size=(image_height, image_width, image_volume, image_channel)) - labels = labels.astype(np.float32) - - feature = { - IMAGE_KEY: (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[images.tobytes()]))), - CLASSIFICATION_LABEL_KEY: (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[labels.tobytes()]))) - } - return tf.train.Example(features=tf.train.Features(feature=feature)) - - -def create_detection_test_example(image_height: int, image_width: int, - image_channel: int, - num_instances: int) -> tf.train.Example: - """Creates and returns a test example containing box and mask annotations. - - Args: - image_height: The height of test image. - image_width: The width of test image. - image_channel: The channel of test image. - num_instances: The number of object instances per image. - - Returns: - A tf.train.Example for testing. 
- """ - image = make_image_bytes([image_height, image_width, image_channel]) - if num_instances == 0: - xmins = [] - xmaxs = [] - ymins = [] - ymaxs = [] - labels = [] - areas = [] - is_crowds = [] - masks = [] - labels_text = [] - else: - xmins = list(np.random.rand(num_instances)) - xmaxs = list(np.random.rand(num_instances)) - ymins = list(np.random.rand(num_instances)) - ymaxs = list(np.random.rand(num_instances)) - labels_text = [b'class_1'] * num_instances - labels = list(np.random.randint(100, size=num_instances)) - areas = [(xmax - xmin) * (ymax - ymin) * image_height * image_width - for xmin, xmax, ymin, ymax in zip(xmins, xmaxs, ymins, ymaxs)] - is_crowds = [0] * num_instances - masks = [] - for _ in range(num_instances): - mask = make_image_bytes([image_height, image_width], fmt='PNG') - masks.append(mask) - return tf.train.Example( - features=tf.train.Features( - feature={ - 'image/encoded': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[image]))), - 'image/source_id': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[DUMP_SOURCE_ID]))), - 'image/height': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_height]))), - 'image/width': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_width]))), - 'image/object/bbox/xmin': (tf.train.Feature( - float_list=tf.train.FloatList(value=xmins))), - 'image/object/bbox/xmax': (tf.train.Feature( - float_list=tf.train.FloatList(value=xmaxs))), - 'image/object/bbox/ymin': (tf.train.Feature( - float_list=tf.train.FloatList(value=ymins))), - 'image/object/bbox/ymax': (tf.train.Feature( - float_list=tf.train.FloatList(value=ymaxs))), - 'image/object/class/label': (tf.train.Feature( - int64_list=tf.train.Int64List(value=labels))), - 'image/object/class/text': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=labels_text))), - 'image/object/is_crowd': (tf.train.Feature( - int64_list=tf.train.Int64List(value=is_crowds))), - 'image/object/area': (tf.train.Feature( - float_list=tf.train.FloatList(value=areas))), - 'image/object/mask': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=masks))), - })) - - -def create_segmentation_test_example(image_height: int, image_width: int, - image_channel: int) -> tf.train.Example: - """Creates and returns a test example containing mask annotations. - - Args: - image_height: The height of test image. - image_width: The width of test image. - image_channel: The channel of test image. - - Returns: - A tf.train.Example for testing. - """ - image = make_image_bytes([image_height, image_width, image_channel]) - mask = make_image_bytes([image_height, image_width], fmt='PNG') - return tf.train.Example( - features=tf.train.Features( - feature={ - 'image/encoded': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[image]))), - 'image/segmentation/class/encoded': (tf.train.Feature( - bytes_list=tf.train.BytesList(value=[mask]))), - 'image/height': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_height]))), - 'image/width': (tf.train.Feature( - int64_list=tf.train.Int64List(value=[image_width]))) - })) diff --git a/official/vision/beta/dataloaders/utils.py b/official/vision/beta/dataloaders/utils.py deleted file mode 100644 index a812c9645..000000000 --- a/official/vision/beta/dataloaders/utils.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Data loader utils.""" -from typing import Dict - -# Import libraries -import tensorflow as tf - -from official.vision.beta.ops import preprocess_ops - - -def process_source_id(source_id: tf.Tensor) -> tf.Tensor: - """Processes source_id to the right format. - - Args: - source_id: A `tf.Tensor` that contains the source ID. It can be empty. - - Returns: - A formatted source ID. - """ - if source_id.dtype == tf.string: - source_id = tf.strings.to_number(source_id, tf.int64) - with tf.control_dependencies([source_id]): - source_id = tf.cond( - pred=tf.equal(tf.size(input=source_id), 0), - true_fn=lambda: tf.cast(tf.constant(-1), tf.int64), - false_fn=lambda: tf.identity(source_id)) - return source_id - - -def pad_groundtruths_to_fixed_size(groundtruths: Dict[str, tf.Tensor], - size: int) -> Dict[str, tf.Tensor]: - """Pads the first dimension of groundtruths labels to the fixed size. - - Args: - groundtruths: A dictionary of {`str`: `tf.Tensor`} that contains groundtruth - annotations of `boxes`, `is_crowds`, `areas` and `classes`. - size: An `int` that specifies the expected size of the first dimension of - padded tensors. - - Returns: - A dictionary of the same keys as input and padded tensors as values. - - """ - groundtruths['boxes'] = preprocess_ops.clip_or_pad_to_fixed_size( - groundtruths['boxes'], size, -1) - groundtruths['is_crowds'] = preprocess_ops.clip_or_pad_to_fixed_size( - groundtruths['is_crowds'], size, 0) - groundtruths['areas'] = preprocess_ops.clip_or_pad_to_fixed_size( - groundtruths['areas'], size, -1) - groundtruths['classes'] = preprocess_ops.clip_or_pad_to_fixed_size( - groundtruths['classes'], size, -1) - if 'attributes' in groundtruths: - for k, v in groundtruths['attributes'].items(): - groundtruths['attributes'][k] = preprocess_ops.clip_or_pad_to_fixed_size( - v, size, -1) - return groundtruths diff --git a/official/vision/beta/dataloaders/utils_test.py b/official/vision/beta/dataloaders/utils_test.py deleted file mode 100644 index b3359f381..000000000 --- a/official/vision/beta/dataloaders/utils_test.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for dataloader utils functions.""" - -# Import libraries - -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.dataloaders import utils - - -class UtilsTest(tf.test.TestCase, parameterized.TestCase): - - def test_process_empty_source_id(self): - source_id = tf.constant([], dtype=tf.int64) - source_id = tf.strings.as_string(source_id) - self.assertEqual(-1, utils.process_source_id(source_id=source_id)) - - @parameterized.parameters( - ([128, 256], [128, 256]), - ([128, 32, 16], [128, 32, 16]), - ) - def test_process_source_id(self, source_id, expected_result): - source_id = tf.constant(source_id, dtype=tf.int64) - source_id = tf.strings.as_string(source_id) - self.assertSequenceAlmostEqual(expected_result, - utils.process_source_id(source_id=source_id)) - - @parameterized.parameters( - ([[10, 20, 30, 40]], [[100]], [[0]], 10, None), - ([[0.1, 0.2, 0.5, 0.6]], [[0.5]], [[1]], 2, [[1.0, 2.0]]), - ) - def test_pad_groundtruths_to_fixed_size(self, boxes, area, classes, size, - attributes): - groundtruths = {} - groundtruths['boxes'] = tf.constant(boxes) - groundtruths['is_crowds'] = tf.constant([[0]]) - groundtruths['areas'] = tf.constant(area) - groundtruths['classes'] = tf.constant(classes) - if attributes: - groundtruths['attributes'] = {'depth': tf.constant(attributes)} - - actual_result = utils.pad_groundtruths_to_fixed_size( - groundtruths=groundtruths, size=size) - - # Check that the first dimension is padded to the expected size. - for key in actual_result: - if key == 'attributes': - for _, v in actual_result[key].items(): - pad_shape = v.shape[0] - self.assertEqual(size, pad_shape) - else: - pad_shape = actual_result[key].shape[0] - self.assertEqual(size, pad_shape) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/dataloaders/video_input.py b/official/vision/beta/dataloaders/video_input.py deleted file mode 100644 index 4128039cb..000000000 --- a/official/vision/beta/dataloaders/video_input.py +++ /dev/null @@ -1,392 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Parser for video and label datasets.""" - -from typing import Dict, Optional, Tuple, Union - -from absl import logging -import tensorflow as tf - -from official.vision.beta.configs import video_classification as exp_cfg -from official.vision.beta.dataloaders import decoder -from official.vision.beta.dataloaders import parser -from official.vision.beta.ops import augment -from official.vision.beta.ops import preprocess_ops_3d - -IMAGE_KEY = 'image/encoded' -LABEL_KEY = 'clip/label/index' - - -def process_image(image: tf.Tensor, - is_training: bool = True, - num_frames: int = 32, - stride: int = 1, - random_stride_range: int = 0, - num_test_clips: int = 1, - min_resize: int = 256, - crop_size: int = 224, - num_crops: int = 1, - zero_centering_image: bool = False, - min_aspect_ratio: float = 0.5, - max_aspect_ratio: float = 2, - min_area_ratio: float = 0.49, - max_area_ratio: float = 1.0, - augmenter: Optional[augment.ImageAugment] = None, - seed: Optional[int] = None) -> tf.Tensor: - """Processes a serialized image tensor. - - Args: - image: Input Tensor of shape [timesteps] and type tf.string of serialized - frames. - is_training: Whether or not in training mode. If True, random sample, crop - and left right flip is used. - num_frames: Number of frames per subclip. - stride: Temporal stride to sample frames. - random_stride_range: An int indicating the min and max bounds to uniformly - sample different strides from the video. E.g., a value of 1 with stride=2 - will uniformly sample a stride in {1, 2, 3} for each video in a batch. - Only used enabled training for the purposes of frame-rate augmentation. - Defaults to 0, which disables random sampling. - num_test_clips: Number of test clips (1 by default). If more than 1, this - will sample multiple linearly spaced clips within each video at test time. - If 1, then a single clip in the middle of the video is sampled. The clips - are aggreagated in the batch dimension. - min_resize: Frames are resized so that min(height, width) is min_resize. - crop_size: Final size of the frame after cropping the resized frames. Both - height and width are the same. - num_crops: Number of crops to perform on the resized frames. - zero_centering_image: If True, frames are normalized to values in [-1, 1]. - If False, values in [0, 1]. - min_aspect_ratio: The minimum aspect range for cropping. - max_aspect_ratio: The maximum aspect range for cropping. - min_area_ratio: The minimum area range for cropping. - max_area_ratio: The maximum area range for cropping. - augmenter: Image augmenter to distort each image. - seed: A deterministic seed to use when sampling. - - Returns: - Processed frames. Tensor of shape - [num_frames * num_test_clips, crop_size, crop_size, 3]. - """ - # Validate parameters. - if is_training and num_test_clips != 1: - logging.warning( - '`num_test_clips` %d is ignored since `is_training` is `True`.', - num_test_clips) - - if random_stride_range < 0: - raise ValueError('Random stride range should be >= 0, got {}'.format( - random_stride_range)) - - # Temporal sampler. - if is_training: - if random_stride_range > 0: - # Uniformly sample different frame-rates - stride = tf.random.uniform( - [], - tf.maximum(stride - random_stride_range, 1), - stride + random_stride_range, - dtype=tf.int32) - - # Sample random clip. - image = preprocess_ops_3d.sample_sequence(image, num_frames, True, stride, - seed) - elif num_test_clips > 1: - # Sample linspace clips. 
- image = preprocess_ops_3d.sample_linspace_sequence(image, num_test_clips, - num_frames, stride) - else: - # Sample middle clip. - image = preprocess_ops_3d.sample_sequence(image, num_frames, False, stride) - - # Decode JPEG string to tf.uint8. - if image.dtype == tf.string: - image = preprocess_ops_3d.decode_jpeg(image, 3) - - if is_training: - # Standard image data augmentation: random resized crop and random flip. - image = preprocess_ops_3d.random_crop_resize( - image, crop_size, crop_size, num_frames, 3, - (min_aspect_ratio, max_aspect_ratio), - (min_area_ratio, max_area_ratio)) - image = preprocess_ops_3d.random_flip_left_right(image, seed) - - if augmenter is not None: - image = augmenter.distort(image) - else: - # Resize images (resize happens only if necessary to save compute). - image = preprocess_ops_3d.resize_smallest(image, min_resize) - # Crop of the frames. - image = preprocess_ops_3d.crop_image(image, crop_size, crop_size, False, - num_crops) - - # Cast the frames in float32, normalizing according to zero_centering_image. - return preprocess_ops_3d.normalize_image(image, zero_centering_image) - - -def postprocess_image(image: tf.Tensor, - is_training: bool = True, - num_frames: int = 32, - num_test_clips: int = 1, - num_test_crops: int = 1) -> tf.Tensor: - """Processes a batched Tensor of frames. - - The same parameters used in process should be used here. - - Args: - image: Input Tensor of shape [batch, timesteps, height, width, 3]. - is_training: Whether or not in training mode. If True, random sample, crop - and left right flip is used. - num_frames: Number of frames per subclip. - num_test_clips: Number of test clips (1 by default). If more than 1, this - will sample multiple linearly spaced clips within each video at test time. - If 1, then a single clip in the middle of the video is sampled. The clips - are aggreagated in the batch dimension. - num_test_crops: Number of test crops (1 by default). If more than 1, there - are multiple crops for each clip at test time. If 1, there is a single - central crop. The crops are aggreagated in the batch dimension. - - Returns: - Processed frames. Tensor of shape - [batch * num_test_clips * num_test_crops, num_frames, height, width, 3]. - """ - num_views = num_test_clips * num_test_crops - if num_views > 1 and not is_training: - # In this case, multiple views are merged together in batch dimenstion which - # will be batch * num_views. - image = tf.reshape(image, [-1, num_frames] + image.shape[2:].as_list()) - - return image - - -def process_label(label: tf.Tensor, - one_hot_label: bool = True, - num_classes: Optional[int] = None) -> tf.Tensor: - """Processes label Tensor.""" - # Validate parameters. - if one_hot_label and not num_classes: - raise ValueError( - '`num_classes` should be given when requesting one hot label.') - - # Cast to tf.int32. - label = tf.cast(label, dtype=tf.int32) - - if one_hot_label: - # Replace label index by one hot representation. - label = tf.one_hot(label, num_classes) - if len(label.shape.as_list()) > 1: - label = tf.reduce_sum(label, axis=0) - if num_classes == 1: - # The trick for single label. - label = 1 - label - - return label - - -class Decoder(decoder.Decoder): - """A tf.Example decoder for classification task.""" - - def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY): - self._context_description = { - # One integer stored in context. - label_key: tf.io.VarLenFeature(tf.int64), - } - self._sequence_description = { - # Each image is a string encoding JPEG. 
- image_key: tf.io.FixedLenSequenceFeature((), tf.string), - } - - def add_feature(self, feature_name: str, - feature_type: Union[tf.io.VarLenFeature, - tf.io.FixedLenFeature, - tf.io.FixedLenSequenceFeature]): - self._sequence_description[feature_name] = feature_type - - def add_context(self, feature_name: str, - feature_type: Union[tf.io.VarLenFeature, - tf.io.FixedLenFeature, - tf.io.FixedLenSequenceFeature]): - self._context_description[feature_name] = feature_type - - def decode(self, serialized_example): - """Parses a single tf.Example into image and label tensors.""" - result = {} - context, sequences = tf.io.parse_single_sequence_example( - serialized_example, self._context_description, - self._sequence_description) - result.update(context) - result.update(sequences) - for key, value in result.items(): - if isinstance(value, tf.SparseTensor): - result[key] = tf.sparse.to_dense(value) - return result - - -class VideoTfdsDecoder(decoder.Decoder): - """A tf.SequenceExample decoder for tfds video classification datasets.""" - - def __init__(self, image_key: str = IMAGE_KEY, label_key: str = LABEL_KEY): - self._image_key = image_key - self._label_key = label_key - - def decode(self, features): - """Decode the TFDS FeatureDict. - - Args: - features: features from TFDS video dataset. - See https://www.tensorflow.org/datasets/catalog/ucf101 for example. - Returns: - Dict of tensors. - """ - sample_dict = { - self._image_key: features['video'], - self._label_key: features['label'], - } - return sample_dict - - -class Parser(parser.Parser): - """Parses a video and label dataset.""" - - def __init__(self, - input_params: exp_cfg.DataConfig, - image_key: str = IMAGE_KEY, - label_key: str = LABEL_KEY): - self._num_frames = input_params.feature_shape[0] - self._stride = input_params.temporal_stride - self._random_stride_range = input_params.random_stride_range - self._num_test_clips = input_params.num_test_clips - self._min_resize = input_params.min_image_size - self._crop_size = input_params.feature_shape[1] - self._num_crops = input_params.num_test_crops - self._one_hot_label = input_params.one_hot - self._num_classes = input_params.num_classes - self._image_key = image_key - self._label_key = label_key - self._dtype = tf.dtypes.as_dtype(input_params.dtype) - self._output_audio = input_params.output_audio - self._min_aspect_ratio = input_params.aug_min_aspect_ratio - self._max_aspect_ratio = input_params.aug_max_aspect_ratio - self._min_area_ratio = input_params.aug_min_area_ratio - self._max_area_ratio = input_params.aug_max_area_ratio - if self._output_audio: - self._audio_feature = input_params.audio_feature - self._audio_shape = input_params.audio_feature_shape - - self._augmenter = None - if input_params.aug_type is not None: - aug_type = input_params.aug_type - if aug_type == 'autoaug': - logging.info('Using AutoAugment.') - self._augmenter = augment.AutoAugment() - elif aug_type == 'randaug': - logging.info('Using RandAugment.') - self._augmenter = augment.RandAugment() - else: - raise ValueError('Augmentation policy {} is not supported.'.format( - aug_type)) - - def _parse_train_data( - self, decoded_tensors: Dict[str, tf.Tensor] - ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]: - """Parses data for training.""" - # Process image and label. 
- image = decoded_tensors[self._image_key] - image = process_image( - image=image, - is_training=True, - num_frames=self._num_frames, - stride=self._stride, - random_stride_range=self._random_stride_range, - num_test_clips=self._num_test_clips, - min_resize=self._min_resize, - crop_size=self._crop_size, - min_aspect_ratio=self._min_aspect_ratio, - max_aspect_ratio=self._max_aspect_ratio, - min_area_ratio=self._min_area_ratio, - max_area_ratio=self._max_area_ratio, - augmenter=self._augmenter) - image = tf.cast(image, dtype=self._dtype) - - features = {'image': image} - - label = decoded_tensors[self._label_key] - label = process_label(label, self._one_hot_label, self._num_classes) - - if self._output_audio: - audio = decoded_tensors[self._audio_feature] - audio = tf.cast(audio, dtype=self._dtype) - # TODO(yeqing): synchronize audio/video sampling. Especially randomness. - audio = preprocess_ops_3d.sample_sequence( - audio, self._audio_shape[0], random=False, stride=1) - audio = tf.ensure_shape(audio, self._audio_shape) - features['audio'] = audio - - return features, label - - def _parse_eval_data( - self, decoded_tensors: Dict[str, tf.Tensor] - ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]: - """Parses data for evaluation.""" - image = decoded_tensors[self._image_key] - image = process_image( - image=image, - is_training=False, - num_frames=self._num_frames, - stride=self._stride, - num_test_clips=self._num_test_clips, - min_resize=self._min_resize, - crop_size=self._crop_size, - num_crops=self._num_crops) - image = tf.cast(image, dtype=self._dtype) - features = {'image': image} - - label = decoded_tensors[self._label_key] - label = process_label(label, self._one_hot_label, self._num_classes) - - if self._output_audio: - audio = decoded_tensors[self._audio_feature] - audio = tf.cast(audio, dtype=self._dtype) - audio = preprocess_ops_3d.sample_sequence( - audio, self._audio_shape[0], random=False, stride=1) - audio = tf.ensure_shape(audio, self._audio_shape) - features['audio'] = audio - - return features, label - - -class PostBatchProcessor(object): - """Processes a video and label dataset which is batched.""" - - def __init__(self, input_params: exp_cfg.DataConfig): - self._is_training = input_params.is_training - - self._num_frames = input_params.feature_shape[0] - self._num_test_clips = input_params.num_test_clips - self._num_test_crops = input_params.num_test_crops - - def __call__(self, features: Dict[str, tf.Tensor], - label: tf.Tensor) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]: - """Parses a single tf.Example into image and label tensors.""" - for key in ['image']: - if key in features: - features[key] = postprocess_image( - image=features[key], - is_training=self._is_training, - num_frames=self._num_frames, - num_test_clips=self._num_test_clips, - num_test_crops=self._num_test_crops) - - return features, label diff --git a/official/vision/beta/dataloaders/video_input_test.py b/official/vision/beta/dataloaders/video_input_test.py deleted file mode 100644 index f94605ab8..000000000 --- a/official/vision/beta/dataloaders/video_input_test.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import io - -# Import libraries -import numpy as np -from PIL import Image -import tensorflow as tf -import tensorflow_datasets as tfds - -from official.vision.beta.configs import video_classification as exp_cfg -from official.vision.beta.dataloaders import video_input - - -AUDIO_KEY = 'features/audio' - - -def fake_seq_example(): - # Create fake data. - random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8) - random_image = Image.fromarray(random_image) - label = 42 - with io.BytesIO() as buffer: - random_image.save(buffer, format='JPEG') - raw_image_bytes = buffer.getvalue() - - seq_example = tf.train.SequenceExample() - seq_example.feature_lists.feature_list.get_or_create( - video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [ - raw_image_bytes - ] - seq_example.feature_lists.feature_list.get_or_create( - video_input.IMAGE_KEY).feature.add().bytes_list.value[:] = [ - raw_image_bytes - ] - seq_example.context.feature[video_input.LABEL_KEY].int64_list.value[:] = [ - label - ] - - random_audio = np.random.normal(size=(10, 256)).tolist() - for s in random_audio: - seq_example.feature_lists.feature_list.get_or_create( - AUDIO_KEY).feature.add().float_list.value[:] = s - return seq_example, label - - -class DecoderTest(tf.test.TestCase): - """A tf.SequenceExample decoder for the video classification task.""" - - def test_decoder(self): - decoder = video_input.Decoder() - - seq_example, label = fake_seq_example() - serialized_example = seq_example.SerializeToString() - - decoded_tensors = decoder.decode(tf.convert_to_tensor(serialized_example)) - results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors) - self.assertCountEqual([video_input.IMAGE_KEY, video_input.LABEL_KEY], - results.keys()) - self.assertEqual(label, results[video_input.LABEL_KEY]) - - def test_decode_audio(self): - decoder = video_input.Decoder() - decoder.add_feature(AUDIO_KEY, tf.io.VarLenFeature(dtype=tf.float32)) - - seq_example, label = fake_seq_example() - serialized_example = seq_example.SerializeToString() - - decoded_tensors = decoder.decode(tf.convert_to_tensor(serialized_example)) - results = tf.nest.map_structure(lambda x: x.numpy(), decoded_tensors) - self.assertCountEqual( - [video_input.IMAGE_KEY, video_input.LABEL_KEY, AUDIO_KEY], - results.keys()) - self.assertEqual(label, results[video_input.LABEL_KEY]) - self.assertEqual(results[AUDIO_KEY].shape, (10, 256)) - - def test_tfds_decode(self): - with tfds.testing.mock_data(num_examples=1): - dataset = tfds.load('ucf101', split='train').take(1) - data = next(iter(dataset)) - - decoder = video_input.VideoTfdsDecoder() - decoded_tensors = decoder.decode(data) - self.assertContainsSubset([video_input.LABEL_KEY, video_input.IMAGE_KEY], - decoded_tensors.keys()) - - -class VideoAndLabelParserTest(tf.test.TestCase): - - def test_video_input(self): - params = exp_cfg.kinetics600(is_training=True) - params.feature_shape = (2, 224, 224, 3) - params.min_image_size = 224 - - decoder = video_input.Decoder() - parser = video_input.Parser(params).parse_fn(params.is_training) - - seq_example, label = fake_seq_example() - - 
input_tensor = tf.constant(seq_example.SerializeToString()) - decoded_tensors = decoder.decode(input_tensor) - output_tensor = parser(decoded_tensors) - image_features, label = output_tensor - image = image_features['image'] - - self.assertAllEqual(image.shape, (2, 224, 224, 3)) - self.assertAllEqual(label.shape, (600,)) - - def test_video_audio_input(self): - params = exp_cfg.kinetics600(is_training=True) - params.feature_shape = (2, 224, 224, 3) - params.min_image_size = 224 - params.output_audio = True - params.audio_feature = AUDIO_KEY - params.audio_feature_shape = (15, 256) - - decoder = video_input.Decoder() - decoder.add_feature(params.audio_feature, - tf.io.VarLenFeature(dtype=tf.float32)) - parser = video_input.Parser(params).parse_fn(params.is_training) - - seq_example, label = fake_seq_example() - - input_tensor = tf.constant(seq_example.SerializeToString()) - decoded_tensors = decoder.decode(input_tensor) - output_tensor = parser(decoded_tensors) - features, label = output_tensor - image = features['image'] - audio = features['audio'] - - self.assertAllEqual(image.shape, (2, 224, 224, 3)) - self.assertAllEqual(label.shape, (600,)) - self.assertEqual(audio.shape, (15, 256)) - - def test_video_input_random_stride(self): - params = exp_cfg.kinetics600(is_training=True) - params.feature_shape = (2, 224, 224, 3) - params.min_image_size = 224 - - params.temporal_stride = 2 - params.random_stride_range = 1 - - decoder = video_input.Decoder() - parser = video_input.Parser(params).parse_fn(params.is_training) - - seq_example, label = fake_seq_example() - - input_tensor = tf.constant(seq_example.SerializeToString()) - decoded_tensors = decoder.decode(input_tensor) - output_tensor = parser(decoded_tensors) - image_features, label = output_tensor - image = image_features['image'] - - self.assertAllEqual(image.shape, (2, 224, 224, 3)) - self.assertAllEqual(label.shape, (600,)) - - def test_video_input_augmentation_returns_shape(self): - params = exp_cfg.kinetics600(is_training=True) - params.feature_shape = (2, 224, 224, 3) - params.min_image_size = 224 - - params.temporal_stride = 2 - params.aug_type = 'autoaug' - - decoder = video_input.Decoder() - parser = video_input.Parser(params).parse_fn(params.is_training) - - seq_example, label = fake_seq_example() - - input_tensor = tf.constant(seq_example.SerializeToString()) - decoded_tensors = decoder.decode(input_tensor) - output_tensor = parser(decoded_tensors) - image_features, label = output_tensor - image = image_features['image'] - - self.assertAllEqual(image.shape, (2, 224, 224, 3)) - self.assertAllEqual(label.shape, (600,)) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/evaluation/__init__.py b/official/vision/beta/evaluation/__init__.py deleted file mode 100644 index 310bfb28f..000000000 --- a/official/vision/beta/evaluation/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
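For the video pipeline removed above, the label handling is the piece most easily shown in isolation. The sketch below (illustrative, based on the deleted `video_input.process_label`) converts a class index into the one-hot vector that the parser emits alongside the processed frames.

```python
import tensorflow as tf

from official.vision.beta.dataloaders import video_input

# A clip labeled with class 42 out of 600 classes (as in the Kinetics-600
# tests above) becomes a one-hot vector of shape (600,).
label = video_input.process_label(
    tf.constant([42]), one_hot_label=True, num_classes=600)
print(label.shape)       # (600,)
print(tf.argmax(label))  # tf.Tensor(42, ...)
```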
- diff --git a/official/vision/beta/evaluation/coco_evaluator.py b/official/vision/beta/evaluation/coco_evaluator.py deleted file mode 100644 index de5691dad..000000000 --- a/official/vision/beta/evaluation/coco_evaluator.py +++ /dev/null @@ -1,336 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""The COCO-style evaluator. - -The following snippet demonstrates the use of interfaces: - - evaluator = COCOEvaluator(...) - for _ in range(num_evals): - for _ in range(num_batches_per_eval): - predictions, groundtruth = predictor.predict(...) # pop a batch. - evaluator.update_state(groundtruths, predictions) - evaluator.result() # finish one full eval and reset states. - -See also: https://github.com/cocodataset/cocoapi/ -""" - -import atexit -import tempfile -# Import libraries -from absl import logging -import numpy as np -from pycocotools import cocoeval -import six -import tensorflow as tf - -from official.vision.beta.evaluation import coco_utils - - -class COCOEvaluator(object): - """COCO evaluation metric class.""" - - def __init__(self, - annotation_file, - include_mask, - need_rescale_bboxes=True, - per_category_metrics=False): - """Constructs COCO evaluation class. - - The class provides the interface to COCO metrics_fn. The - _update_op() takes detections from each image and push them to - self.detections. The _evaluate() loads a JSON file in COCO annotation format - as the groundtruths and runs COCO evaluation. - - Args: - annotation_file: a JSON file that stores annotations of the eval dataset. - If `annotation_file` is None, groundtruth annotations will be loaded - from the dataloader. - include_mask: a boolean to indicate whether or not to include the mask - eval. - need_rescale_bboxes: If true bboxes in `predictions` will be rescaled back - to absolute values (`image_info` is needed in this case). - per_category_metrics: Whether to return per category metrics. 
- """ - if annotation_file: - if annotation_file.startswith('gs://'): - _, local_val_json = tempfile.mkstemp(suffix='.json') - tf.io.gfile.remove(local_val_json) - - tf.io.gfile.copy(annotation_file, local_val_json) - atexit.register(tf.io.gfile.remove, local_val_json) - else: - local_val_json = annotation_file - self._coco_gt = coco_utils.COCOWrapper( - eval_type=('mask' if include_mask else 'box'), - annotation_file=local_val_json) - self._annotation_file = annotation_file - self._include_mask = include_mask - self._per_category_metrics = per_category_metrics - self._metric_names = [ - 'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1', 'ARmax10', - 'ARmax100', 'ARs', 'ARm', 'ARl' - ] - self._required_prediction_fields = [ - 'source_id', 'num_detections', 'detection_classes', 'detection_scores', - 'detection_boxes' - ] - self._need_rescale_bboxes = need_rescale_bboxes - if self._need_rescale_bboxes: - self._required_prediction_fields.append('image_info') - self._required_groundtruth_fields = [ - 'source_id', 'height', 'width', 'classes', 'boxes' - ] - if self._include_mask: - mask_metric_names = ['mask_' + x for x in self._metric_names] - self._metric_names.extend(mask_metric_names) - self._required_prediction_fields.extend(['detection_masks']) - self._required_groundtruth_fields.extend(['masks']) - - self.reset_states() - - @property - def name(self): - return 'coco_metric' - - def reset_states(self): - """Resets internal states for a fresh run.""" - self._predictions = {} - if not self._annotation_file: - self._groundtruths = {} - - def result(self): - """Evaluates detection results, and reset_states.""" - metric_dict = self.evaluate() - # Cleans up the internal variables in order for a fresh eval next time. - self.reset_states() - return metric_dict - - def evaluate(self): - """Evaluates with detections from all images with COCO API. - - Returns: - coco_metric: float numpy array with shape [24] representing the - coco-style evaluation metrics (box and mask). - """ - if not self._annotation_file: - logging.info('There is no annotation_file in COCOEvaluator.') - gt_dataset = coco_utils.convert_groundtruths_to_coco_dataset( - self._groundtruths) - coco_gt = coco_utils.COCOWrapper( - eval_type=('mask' if self._include_mask else 'box'), - gt_dataset=gt_dataset) - else: - logging.info('Using annotation file: %s', self._annotation_file) - coco_gt = self._coco_gt - coco_predictions = coco_utils.convert_predictions_to_coco_annotations( - self._predictions) - coco_dt = coco_gt.loadRes(predictions=coco_predictions) - image_ids = [ann['image_id'] for ann in coco_predictions] - - coco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='bbox') - coco_eval.params.imgIds = image_ids - coco_eval.evaluate() - coco_eval.accumulate() - coco_eval.summarize() - coco_metrics = coco_eval.stats - - if self._include_mask: - mcoco_eval = cocoeval.COCOeval(coco_gt, coco_dt, iouType='segm') - mcoco_eval.params.imgIds = image_ids - mcoco_eval.evaluate() - mcoco_eval.accumulate() - mcoco_eval.summarize() - mask_coco_metrics = mcoco_eval.stats - - if self._include_mask: - metrics = np.hstack((coco_metrics, mask_coco_metrics)) - else: - metrics = coco_metrics - - metrics_dict = {} - for i, name in enumerate(self._metric_names): - metrics_dict[name] = metrics[i].astype(np.float32) - - # Adds metrics per category. 
- if self._per_category_metrics: - metrics_dict.update(self._retrieve_per_category_metrics(coco_eval)) - - if self._include_mask: - metrics_dict.update(self._retrieve_per_category_metrics( - mcoco_eval, prefix='mask')) - - return metrics_dict - - def _retrieve_per_category_metrics(self, coco_eval, prefix=''): - """Retrieves and per-category metrics and retuns them in a dict. - - Args: - coco_eval: a cocoeval.COCOeval object containing evaluation data. - prefix: str, A string used to prefix metric names. - - Returns: - metrics_dict: A dictionary with per category metrics. - """ - - metrics_dict = {} - if prefix: - prefix = prefix + ' ' - - if hasattr(coco_eval, 'category_stats'): - for category_index, category_id in enumerate(coco_eval.params.catIds): - if self._annotation_file: - coco_category = self._coco_gt.cats[category_id] - # if 'name' is available use it, otherwise use `id` - category_display_name = coco_category.get('name', category_id) - else: - category_display_name = category_id - - metrics_dict[prefix + 'Precision mAP ByCategory/{}'.format( - category_display_name - )] = coco_eval.category_stats[0][category_index].astype(np.float32) - metrics_dict[prefix + 'Precision mAP ByCategory@50IoU/{}'.format( - category_display_name - )] = coco_eval.category_stats[1][category_index].astype(np.float32) - metrics_dict[prefix + 'Precision mAP ByCategory@75IoU/{}'.format( - category_display_name - )] = coco_eval.category_stats[2][category_index].astype(np.float32) - metrics_dict[prefix + 'Precision mAP ByCategory (small) /{}'.format( - category_display_name - )] = coco_eval.category_stats[3][category_index].astype(np.float32) - metrics_dict[prefix + 'Precision mAP ByCategory (medium) /{}'.format( - category_display_name - )] = coco_eval.category_stats[4][category_index].astype(np.float32) - metrics_dict[prefix + 'Precision mAP ByCategory (large) /{}'.format( - category_display_name - )] = coco_eval.category_stats[5][category_index].astype(np.float32) - metrics_dict[prefix + 'Recall AR@1 ByCategory/{}'.format( - category_display_name - )] = coco_eval.category_stats[6][category_index].astype(np.float32) - metrics_dict[prefix + 'Recall AR@10 ByCategory/{}'.format( - category_display_name - )] = coco_eval.category_stats[7][category_index].astype(np.float32) - metrics_dict[prefix + 'Recall AR@100 ByCategory/{}'.format( - category_display_name - )] = coco_eval.category_stats[8][category_index].astype(np.float32) - metrics_dict[prefix + 'Recall AR (small) ByCategory/{}'.format( - category_display_name - )] = coco_eval.category_stats[9][category_index].astype(np.float32) - metrics_dict[prefix + 'Recall AR (medium) ByCategory/{}'.format( - category_display_name - )] = coco_eval.category_stats[10][category_index].astype(np.float32) - metrics_dict[prefix + 'Recall AR (large) ByCategory/{}'.format( - category_display_name - )] = coco_eval.category_stats[11][category_index].astype(np.float32) - - return metrics_dict - - def _process_predictions(self, predictions): - image_scale = np.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2)) - predictions['detection_boxes'] = ( - predictions['detection_boxes'].astype(np.float32)) - predictions['detection_boxes'] /= image_scale - if 'detection_outer_boxes' in predictions: - predictions['detection_outer_boxes'] = ( - predictions['detection_outer_boxes'].astype(np.float32)) - predictions['detection_outer_boxes'] /= image_scale - - def _convert_to_numpy(self, groundtruths, predictions): - """Converts tesnors to numpy arrays.""" - if groundtruths: - labels = 
tf.nest.map_structure(lambda x: x.numpy(), groundtruths) - numpy_groundtruths = {} - for key, val in labels.items(): - if isinstance(val, tuple): - val = np.concatenate(val) - numpy_groundtruths[key] = val - else: - numpy_groundtruths = groundtruths - - if predictions: - outputs = tf.nest.map_structure(lambda x: x.numpy(), predictions) - numpy_predictions = {} - for key, val in outputs.items(): - if isinstance(val, tuple): - val = np.concatenate(val) - numpy_predictions[key] = val - else: - numpy_predictions = predictions - - return numpy_groundtruths, numpy_predictions - - def update_state(self, groundtruths, predictions): - """Update and aggregate detection results and groundtruth data. - - Args: - groundtruths: a dictionary of Tensors including the fields below. - See also different parsers under `../dataloader` for more details. - Required fields: - - source_id: a numpy array of int or string of shape [batch_size]. - - height: a numpy array of int of shape [batch_size]. - - width: a numpy array of int of shape [batch_size]. - - num_detections: a numpy array of int of shape [batch_size]. - - boxes: a numpy array of float of shape [batch_size, K, 4]. - - classes: a numpy array of int of shape [batch_size, K]. - Optional fields: - - is_crowds: a numpy array of int of shape [batch_size, K]. If the - field is absent, it is assumed that this instance is not crowd. - - areas: a numy array of float of shape [batch_size, K]. If the - field is absent, the area is calculated using either boxes or - masks depending on which one is available. - - masks: a numpy array of float of shape - [batch_size, K, mask_height, mask_width], - predictions: a dictionary of tensors including the fields below. - See different parsers under `../dataloader` for more details. - Required fields: - - source_id: a numpy array of int or string of shape [batch_size]. - - image_info [if `need_rescale_bboxes` is True]: a numpy array of - float of shape [batch_size, 4, 2]. - - num_detections: a numpy array of - int of shape [batch_size]. - - detection_boxes: a numpy array of float of shape [batch_size, K, 4]. - - detection_classes: a numpy array of int of shape [batch_size, K]. - - detection_scores: a numpy array of float of shape [batch_size, K]. - Optional fields: - - detection_masks: a numpy array of float of shape - [batch_size, K, mask_height, mask_width]. - Raises: - ValueError: if the required prediction or groundtruth fields are not - present in the incoming `predictions` or `groundtruths`. 
- """ - groundtruths, predictions = self._convert_to_numpy(groundtruths, - predictions) - for k in self._required_prediction_fields: - if k not in predictions: - raise ValueError( - 'Missing the required key `{}` in predictions!'.format(k)) - if self._need_rescale_bboxes: - self._process_predictions(predictions) - for k, v in six.iteritems(predictions): - if k not in self._predictions: - self._predictions[k] = [v] - else: - self._predictions[k].append(v) - - if not self._annotation_file: - assert groundtruths - for k in self._required_groundtruth_fields: - if k not in groundtruths: - raise ValueError( - 'Missing the required key `{}` in groundtruths!'.format(k)) - for k, v in six.iteritems(groundtruths): - if k not in self._groundtruths: - self._groundtruths[k] = [v] - else: - self._groundtruths[k].append(v) diff --git a/official/vision/beta/evaluation/coco_utils.py b/official/vision/beta/evaluation/coco_utils.py deleted file mode 100644 index e968fa943..000000000 --- a/official/vision/beta/evaluation/coco_utils.py +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Util functions related to pycocotools and COCO eval.""" - -import copy -import json - -# Import libraries - -from absl import logging -import numpy as np -from PIL import Image -from pycocotools import coco -from pycocotools import mask as mask_api -import six -import tensorflow as tf - -from official.common import dataset_fn -from official.vision.beta.dataloaders import tf_example_decoder -from official.vision.beta.ops import box_ops -from official.vision.beta.ops import mask_ops - - -class COCOWrapper(coco.COCO): - """COCO wrapper class. - - This class wraps COCO API object, which provides the following additional - functionalities: - 1. Support string type image id. - 2. Support loading the groundtruth dataset using the external annotation - dictionary. - 3. Support loading the prediction results using the external annotation - dictionary. - """ - - def __init__(self, eval_type='box', annotation_file=None, gt_dataset=None): - """Instantiates a COCO-style API object. - - Args: - eval_type: either 'box' or 'mask'. - annotation_file: a JSON file that stores annotations of the eval dataset. - This is required if `gt_dataset` is not provided. - gt_dataset: the groundtruth eval datatset in COCO API format. - """ - if ((annotation_file and gt_dataset) or - ((not annotation_file) and (not gt_dataset))): - raise ValueError('One and only one of `annotation_file` and `gt_dataset` ' - 'needs to be specified.') - - if eval_type not in ['box', 'mask']: - raise ValueError('The `eval_type` can only be either `box` or `mask`.') - - coco.COCO.__init__(self, annotation_file=annotation_file) - self._eval_type = eval_type - if gt_dataset: - self.dataset = gt_dataset - self.createIndex() - - def loadRes(self, predictions): - """Loads result file and return a result api object. 
- - Args: - predictions: a list of dictionaries, each representing an annotation in COCO - format. The required fields are `image_id`, `category_id`, `score`, - `bbox`, `segmentation`. - - Returns: - res: result COCO api object. - - Raises: - ValueError: if the set of image ids from predictions is not a subset of - the set of image ids of the groundtruth dataset. - """ - res = coco.COCO() - res.dataset['images'] = copy.deepcopy(self.dataset['images']) - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - - image_ids = [ann['image_id'] for ann in predictions] - if set(image_ids) != (set(image_ids) & set(self.getImgIds())): - raise ValueError('Results do not correspond to the current dataset!') - for ann in predictions: - x1, x2, y1, y2 = [ann['bbox'][0], ann['bbox'][0] + ann['bbox'][2], - ann['bbox'][1], ann['bbox'][1] + ann['bbox'][3]] - if self._eval_type == 'box': - ann['area'] = ann['bbox'][2] * ann['bbox'][3] - ann['segmentation'] = [ - [x1, y1, x1, y2, x2, y2, x2, y1]] - elif self._eval_type == 'mask': - ann['area'] = mask_api.area(ann['segmentation']) - - res.dataset['annotations'] = copy.deepcopy(predictions) - res.createIndex() - return res - - -def convert_predictions_to_coco_annotations(predictions): - """Converts a batch of predictions to annotations in COCO format. - - Args: - predictions: a dictionary of lists of numpy arrays including the following - fields. K below denotes the maximum number of instances per image. - Required fields: - - source_id: a list of numpy arrays of int or string of shape - [batch_size]. - - num_detections: a list of numpy arrays of int of shape [batch_size]. - - detection_boxes: a list of numpy arrays of float of shape - [batch_size, K, 4], where coordinates are in the original image - space (not the scaled image space). - - detection_classes: a list of numpy arrays of int of shape - [batch_size, K]. - - detection_scores: a list of numpy arrays of float of shape - [batch_size, K]. - Optional fields: - - detection_masks: a list of numpy arrays of float of shape - [batch_size, K, mask_height, mask_width]. - - Returns: - coco_predictions: predictions in COCO annotation format.
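As a rough sketch, each element of the returned list ends up as a plain dict like the one below (values invented; `bbox` is in [x, y, width, height] order after the yxyx-to-xywh conversion in the function body, and `segmentation` is only present when `detection_masks` is provided):

```python
coco_prediction = {
    'image_id': 42,
    'category_id': 3,
    'bbox': [18.0, 12.0, 192.0, 93.0],  # [x, y, width, height]
    'score': 0.9,
    'id': 1,                            # assigned sequentially over the whole list
}
```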
- """ - coco_predictions = [] - num_batches = len(predictions['source_id']) - max_num_detections = predictions['detection_classes'][0].shape[1] - use_outer_box = 'detection_outer_boxes' in predictions - for i in range(num_batches): - predictions['detection_boxes'][i] = box_ops.yxyx_to_xywh( - predictions['detection_boxes'][i]) - if use_outer_box: - predictions['detection_outer_boxes'][i] = box_ops.yxyx_to_xywh( - predictions['detection_outer_boxes'][i]) - mask_boxes = predictions['detection_outer_boxes'] - else: - mask_boxes = predictions['detection_boxes'] - - batch_size = predictions['source_id'][i].shape[0] - for j in range(batch_size): - if 'detection_masks' in predictions: - image_masks = mask_ops.paste_instance_masks( - predictions['detection_masks'][i][j], - mask_boxes[i][j], - int(predictions['image_info'][i][j, 0, 0]), - int(predictions['image_info'][i][j, 0, 1])) - binary_masks = (image_masks > 0.0).astype(np.uint8) - encoded_masks = [ - mask_api.encode(np.asfortranarray(binary_mask)) - for binary_mask in list(binary_masks)] - for k in range(max_num_detections): - ann = {} - ann['image_id'] = predictions['source_id'][i][j] - ann['category_id'] = predictions['detection_classes'][i][j, k] - ann['bbox'] = predictions['detection_boxes'][i][j, k] - ann['score'] = predictions['detection_scores'][i][j, k] - if 'detection_masks' in predictions: - ann['segmentation'] = encoded_masks[k] - coco_predictions.append(ann) - - for i, ann in enumerate(coco_predictions): - ann['id'] = i + 1 - - return coco_predictions - - -def convert_groundtruths_to_coco_dataset(groundtruths, label_map=None): - """Converts groundtruths to the dataset in COCO format. - - Args: - groundtruths: a dictionary of numpy arrays including the fields below. - Note that each element in the list represent the number for a single - example without batch dimension. K below denotes the actual number of - instances for each image. - Required fields: - - source_id: a list of numpy arrays of int or string of shape - [batch_size]. - - height: a list of numpy arrays of int of shape [batch_size]. - - width: a list of numpy arrays of int of shape [batch_size]. - - num_detections: a list of numpy arrays of int of shape [batch_size]. - - boxes: a list of numpy arrays of float of shape [batch_size, K, 4], - where coordinates are in the original image space (not the - normalized coordinates). - - classes: a list of numpy arrays of int of shape [batch_size, K]. - Optional fields: - - is_crowds: a list of numpy arrays of int of shape [batch_size, K]. If - th field is absent, it is assumed that this instance is not crowd. - - areas: a list of numy arrays of float of shape [batch_size, K]. If the - field is absent, the area is calculated using either boxes or - masks depending on which one is available. - - masks: a list of numpy arrays of string of shape [batch_size, K], - label_map: (optional) a dictionary that defines items from the category id - to the category name. If `None`, collect the category mappping from the - `groundtruths`. - - Returns: - coco_groundtruths: the groundtruth dataset in COCO format. 
- """ - source_ids = np.concatenate(groundtruths['source_id'], axis=0) - heights = np.concatenate(groundtruths['height'], axis=0) - widths = np.concatenate(groundtruths['width'], axis=0) - gt_images = [{'id': int(i), 'height': int(h), 'width': int(w)} for i, h, w - in zip(source_ids, heights, widths)] - - gt_annotations = [] - num_batches = len(groundtruths['source_id']) - for i in range(num_batches): - logging.info( - 'convert_groundtruths_to_coco_dataset: Processing annotation %d', i) - max_num_instances = groundtruths['classes'][i].shape[1] - batch_size = groundtruths['source_id'][i].shape[0] - for j in range(batch_size): - num_instances = groundtruths['num_detections'][i][j] - if num_instances > max_num_instances: - logging.warning( - 'num_groundtruths is larger than max_num_instances, %d v.s. %d', - num_instances, max_num_instances) - num_instances = max_num_instances - for k in range(int(num_instances)): - ann = {} - ann['image_id'] = int(groundtruths['source_id'][i][j]) - if 'is_crowds' in groundtruths: - ann['iscrowd'] = int(groundtruths['is_crowds'][i][j, k]) - else: - ann['iscrowd'] = 0 - ann['category_id'] = int(groundtruths['classes'][i][j, k]) - boxes = groundtruths['boxes'][i] - ann['bbox'] = [ - float(boxes[j, k, 1]), - float(boxes[j, k, 0]), - float(boxes[j, k, 3] - boxes[j, k, 1]), - float(boxes[j, k, 2] - boxes[j, k, 0])] - if 'areas' in groundtruths: - ann['area'] = float(groundtruths['areas'][i][j, k]) - else: - ann['area'] = float( - (boxes[j, k, 3] - boxes[j, k, 1]) * - (boxes[j, k, 2] - boxes[j, k, 0])) - if 'masks' in groundtruths: - if isinstance(groundtruths['masks'][i][j, k], tf.Tensor): - mask = Image.open( - six.BytesIO(groundtruths['masks'][i][j, k].numpy())) - width, height = mask.size - np_mask = ( - np.array(mask.getdata()).reshape(height, - width).astype(np.uint8)) - else: - mask = Image.open( - six.BytesIO(groundtruths['masks'][i][j, k])) - width, height = mask.size - np_mask = ( - np.array(mask.getdata()).reshape(height, - width).astype(np.uint8)) - np_mask[np_mask > 0] = 255 - encoded_mask = mask_api.encode(np.asfortranarray(np_mask)) - ann['segmentation'] = encoded_mask - # Ensure the content of `counts` is JSON serializable string. - if 'counts' in ann['segmentation']: - ann['segmentation']['counts'] = six.ensure_str( - ann['segmentation']['counts']) - if 'areas' not in groundtruths: - ann['area'] = mask_api.area(encoded_mask) - gt_annotations.append(ann) - - for i, ann in enumerate(gt_annotations): - ann['id'] = i + 1 - - if label_map: - gt_categories = [{'id': i, 'name': label_map[i]} for i in label_map] - else: - category_ids = [gt['category_id'] for gt in gt_annotations] - gt_categories = [{'id': i} for i in set(category_ids)] - - gt_dataset = { - 'images': gt_images, - 'categories': gt_categories, - 'annotations': copy.deepcopy(gt_annotations), - } - return gt_dataset - - -class COCOGroundtruthGenerator: - """Generates the groundtruth annotations from a single example.""" - - def __init__(self, file_pattern, file_type, num_examples, include_mask, - regenerate_source_id=False): - self._file_pattern = file_pattern - self._num_examples = num_examples - self._include_mask = include_mask - self._dataset_fn = dataset_fn.pick_dataset_fn(file_type) - self._regenerate_source_id = regenerate_source_id - - def _parse_single_example(self, example): - """Parses a single serialized tf.Example proto. - - Args: - example: a serialized tf.Example proto string. 
- - Returns: - A dictionary of groundtruth with the following fields: - source_id: a scalar tensor of int64 representing the image source_id. - height: a scalar tensor of int64 representing the image height. - width: a scalar tensor of int64 representing the image width. - boxes: a float tensor of shape [K, 4], representing the groundtruth - boxes in absolute coordinates with respect to the original image size. - classes: a int64 tensor of shape [K], representing the class labels of - each instances. - is_crowds: a bool tensor of shape [K], indicating whether the instance - is crowd. - areas: a float tensor of shape [K], indicating the area of each - instance. - masks: a string tensor of shape [K], containing the bytes of the png - mask of each instance. - """ - decoder = tf_example_decoder.TfExampleDecoder( - include_mask=self._include_mask, - regenerate_source_id=self._regenerate_source_id) - decoded_tensors = decoder.decode(example) - - image = decoded_tensors['image'] - image_size = tf.shape(image)[0:2] - boxes = box_ops.denormalize_boxes( - decoded_tensors['groundtruth_boxes'], image_size) - - source_id = decoded_tensors['source_id'] - if source_id.dtype is tf.string: - source_id = tf.strings.to_number(source_id, out_type=tf.int64) - - groundtruths = { - 'source_id': source_id, - 'height': decoded_tensors['height'], - 'width': decoded_tensors['width'], - 'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0], - 'boxes': boxes, - 'classes': decoded_tensors['groundtruth_classes'], - 'is_crowds': decoded_tensors['groundtruth_is_crowd'], - 'areas': decoded_tensors['groundtruth_area'], - } - if self._include_mask: - groundtruths.update({ - 'masks': decoded_tensors['groundtruth_instance_masks_png'], - }) - return groundtruths - - def _build_pipeline(self): - """Builds data pipeline to generate groundtruth annotations.""" - dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False) - dataset = dataset.interleave( - map_func=lambda filename: self._dataset_fn(filename).prefetch(1), - cycle_length=None, - num_parallel_calls=tf.data.experimental.AUTOTUNE) - - dataset = dataset.take(self._num_examples) - dataset = dataset.map(self._parse_single_example, - num_parallel_calls=tf.data.experimental.AUTOTUNE) - dataset = dataset.batch(1, drop_remainder=False) - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - return dataset - - def __call__(self): - return self._build_pipeline() - - -def scan_and_generator_annotation_file(file_pattern: str, - file_type: str, - num_samples: int, - include_mask: bool, - annotation_file: str, - regenerate_source_id: bool = False): - """Scans and generate the COCO-style annotation JSON file given a dataset.""" - groundtruth_generator = COCOGroundtruthGenerator( - file_pattern, file_type, num_samples, include_mask, regenerate_source_id) - generate_annotation_file(groundtruth_generator, annotation_file) - - -def generate_annotation_file(groundtruth_generator, - annotation_file): - """Generates COCO-style annotation JSON file given a groundtruth generator.""" - groundtruths = {} - logging.info('Loading groundtruth annotations from dataset to memory...') - for i, groundtruth in enumerate(groundtruth_generator()): - logging.info('generate_annotation_file: Processing annotation %d', i) - for k, v in six.iteritems(groundtruth): - if k not in groundtruths: - groundtruths[k] = [v] - else: - groundtruths[k].append(v) - gt_dataset = convert_groundtruths_to_coco_dataset(groundtruths) - - logging.info('Saving groundtruth annotations to the JSON 
file...') - with tf.io.gfile.GFile(annotation_file, 'w') as f: - f.write(json.dumps(gt_dataset)) - logging.info('Done saving the JSON file...') diff --git a/official/vision/beta/evaluation/coco_utils_test.py b/official/vision/beta/evaluation/coco_utils_test.py deleted file mode 100644 index 9179c7dda..000000000 --- a/official/vision/beta/evaluation/coco_utils_test.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for coco_utils.""" - -import os - -import tensorflow as tf - -from official.vision.beta.dataloaders import tfexample_utils -from official.vision.beta.evaluation import coco_utils - - -class CocoUtilsTest(tf.test.TestCase): - - def test_scan_and_generator_annotation_file(self): - num_samples = 10 - example = tfexample_utils.create_detection_test_example( - image_height=512, image_width=512, image_channel=3, num_instances=10) - tf_examples = [example] * num_samples - data_file = os.path.join(self.create_tempdir(), 'test.tfrecord') - tfexample_utils.dump_to_tfrecord( - record_file=data_file, tf_examples=tf_examples) - annotation_file = os.path.join(self.create_tempdir(), 'annotation.json') - - coco_utils.scan_and_generator_annotation_file( - file_pattern=data_file, - file_type='tfrecord', - num_samples=num_samples, - include_mask=True, - annotation_file=annotation_file) - self.assertTrue( - tf.io.gfile.exists(annotation_file), - msg='Annotation file {annotation_file} does not exists.') - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/evaluation/iou.py b/official/vision/beta/evaluation/iou.py deleted file mode 100644 index 1dabd4af3..000000000 --- a/official/vision/beta/evaluation/iou.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""IOU Metrics used for semantic segmentation models.""" - -import numpy as np -import tensorflow as tf - - -class PerClassIoU(tf.keras.metrics.Metric): - """Computes the per-class Intersection-Over-Union metric. - - Mean Intersection-Over-Union is a common evaluation metric for semantic image - segmentation, which first computes the IOU for each semantic class. - IOU is defined as follows: - IOU = true_positive / (true_positive + false_positive + false_negative). - The predictions are accumulated in a confusion matrix, weighted by - `sample_weight` and the metric is then calculated from it. 
- - If `sample_weight` is `None`, weights default to 1. - Use `sample_weight` of 0 to mask values. - - Example: - - >>> # cm = [[1, 1], - >>> # [1, 1]] - >>> # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] - >>> # iou = true_positives / (sum_row + sum_col - true_positives) - >>> # result = [1 / (2 + 2 - 1), 1 / (2 + 2 - 1)] = [0.33, 0.33] - >>> m = PerClassIoU(num_classes=2) - >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1]) - >>> m.result().numpy() - [0.33333334, 0.33333334] - - """ - - def __init__(self, num_classes, name=None, dtype=None): - """Initializes `PerClassIoU`. - - Args: - num_classes: The possible number of labels the prediction task can have. - This value must be provided, since a confusion matrix of dimension = - [num_classes, num_classes] will be allocated. - name: (Optional) string name of the metric instance. - dtype: (Optional) data type of the metric result. - - """ - - super(PerClassIoU, self).__init__(name=name, dtype=dtype) - self.num_classes = num_classes - - # Variable to accumulate the predictions in the confusion matrix. - self.total_cm = self.add_weight( - 'total_confusion_matrix', - shape=(num_classes, num_classes), - initializer=tf.compat.v1.zeros_initializer) - - def update_state(self, y_true, y_pred, sample_weight=None): - """Accumulates the confusion matrix statistics. - - Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be a - `Tensor` whose rank is either 0, or the same rank as `y_true`, and must - be broadcastable to `y_true`. - - Returns: - IOU per class. - """ - - y_true = tf.cast(y_true, self._dtype) - y_pred = tf.cast(y_pred, self._dtype) - - # Flatten the input if its rank > 1. - if y_pred.shape.ndims > 1: - y_pred = tf.reshape(y_pred, [-1]) - - if y_true.shape.ndims > 1: - y_true = tf.reshape(y_true, [-1]) - - if sample_weight is not None: - sample_weight = tf.cast(sample_weight, self._dtype) - if sample_weight.shape.ndims > 1: - sample_weight = tf.reshape(sample_weight, [-1]) - - # Accumulate the prediction to current confusion matrix. - current_cm = tf.math.confusion_matrix( - y_true, - y_pred, - self.num_classes, - weights=sample_weight, - dtype=self._dtype) - return self.total_cm.assign_add(current_cm) - - def result(self): - """Compute the mean intersection-over-union via the confusion matrix.""" - sum_over_row = tf.cast( - tf.reduce_sum(self.total_cm, axis=0), dtype=self._dtype) - sum_over_col = tf.cast( - tf.reduce_sum(self.total_cm, axis=1), dtype=self._dtype) - true_positives = tf.cast( - tf.linalg.tensor_diag_part(self.total_cm), dtype=self._dtype) - - # sum_over_row + sum_over_col = - # 2 * true_positives + false_positives + false_negatives. - denominator = sum_over_row + sum_over_col - true_positives - - return tf.math.divide_no_nan(true_positives, denominator) - - def reset_states(self): - tf.keras.backend.set_value( - self.total_cm, np.zeros((self.num_classes, self.num_classes))) - - def get_config(self): - config = {'num_classes': self.num_classes} - base_config = super(PerClassIoU, self).get_config() - return dict(list(base_config.items()) + list(config.items())) diff --git a/official/vision/beta/evaluation/iou_test.py b/official/vision/beta/evaluation/iou_test.py deleted file mode 100644 index e9cd2b39e..000000000 --- a/official/vision/beta/evaluation/iou_test.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for iou metric.""" - -import tensorflow as tf - -from official.vision.beta.evaluation import iou - - -class MeanIoUTest(tf.test.TestCase): - - def test_config(self): - m_obj = iou.PerClassIoU(num_classes=2, name='per_class_iou') - self.assertEqual(m_obj.name, 'per_class_iou') - self.assertEqual(m_obj.num_classes, 2) - - m_obj2 = iou.PerClassIoU.from_config(m_obj.get_config()) - self.assertEqual(m_obj2.name, 'per_class_iou') - self.assertEqual(m_obj2.num_classes, 2) - - def test_unweighted(self): - y_pred = [0, 1, 0, 1] - y_true = [0, 0, 1, 1] - - m_obj = iou.PerClassIoU(num_classes=2) - - result = m_obj(y_true, y_pred) - - # cm = [[1, 1], - # [1, 1]] - # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = [1 / (2 + 2 - 1), 1 / (2 + 2 - 1)] - self.assertAllClose(expected_result, result, atol=1e-3) - - def test_weighted(self): - y_pred = tf.constant([0, 1, 0, 1], dtype=tf.float32) - y_true = tf.constant([0, 0, 1, 1]) - sample_weight = tf.constant([0.2, 0.3, 0.4, 0.1]) - - m_obj = iou.PerClassIoU(num_classes=2) - - result = m_obj(y_true, y_pred, sample_weight=sample_weight) - - # cm = [[0.2, 0.3], - # [0.4, 0.1]] - # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = [0.2 / (0.6 + 0.5 - 0.2), 0.1 / (0.4 + 0.5 - 0.1)] - self.assertAllClose(expected_result, result, atol=1e-3) - - def test_multi_dim_input(self): - y_pred = tf.constant([[0, 1], [0, 1]], dtype=tf.float32) - y_true = tf.constant([[0, 0], [1, 1]]) - sample_weight = tf.constant([[0.2, 0.3], [0.4, 0.1]]) - - m_obj = iou.PerClassIoU(num_classes=2) - - result = m_obj(y_true, y_pred, sample_weight=sample_weight) - - # cm = [[0.2, 0.3], - # [0.4, 0.1]] - # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = [0.2 / (0.6 + 0.5 - 0.2), 0.1 / (0.4 + 0.5 - 0.1)] - self.assertAllClose(expected_result, result, atol=1e-3) - - def test_zero_valid_entries(self): - m_obj = iou.PerClassIoU(num_classes=2) - self.assertAllClose(m_obj.result(), [0, 0], atol=1e-3) - - def test_zero_and_non_zero_entries(self): - y_pred = tf.constant([1], dtype=tf.float32) - y_true = tf.constant([1]) - - m_obj = iou.PerClassIoU(num_classes=2) - result = m_obj(y_true, y_pred) - - # cm = [[0, 0], - # [0, 1]] - # sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = [0, 1 / (1 + 1 - 1)] - self.assertAllClose(expected_result, result, atol=1e-3) - - def test_update_state_annd_result(self): - y_pred = [0, 1, 0, 1] - y_true = [0, 0, 1, 1] - - m_obj = iou.PerClassIoU(num_classes=2) - - m_obj.update_state(y_true, y_pred) - result = m_obj.result() - - # cm = [[1, 1], - # [1, 1]] - # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 
1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - expected_result = [1 / (2 + 2 - 1), 1 / (2 + 2 - 1)] - self.assertAllClose(expected_result, result, atol=1e-3) - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/evaluation/panoptic_quality.py b/official/vision/beta/evaluation/panoptic_quality.py deleted file mode 100644 index 0546fb959..000000000 --- a/official/vision/beta/evaluation/panoptic_quality.py +++ /dev/null @@ -1,294 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Implementation of the Panoptic Quality metric. - -Panoptic Quality is an instance-based metric for evaluating the task of -image parsing, aka panoptic segmentation. - -Please see the paper for details: -"Panoptic Segmentation", Alexander Kirillov, Kaiming He, Ross Girshick, -Carsten Rother and Piotr Dollar. arXiv:1801.00868, 2018. - -Note that this metric class is branched from -https://github.com/tensorflow/models/blob/master/research/deeplab/evaluation/panoptic_quality.py -""" - -import collections -import numpy as np - -_EPSILON = 1e-10 - - -def realdiv_maybe_zero(x, y): - """Element-wise x / y where y may contain zeros, for those returns 0 too.""" - return np.where( - np.less(np.abs(y), _EPSILON), np.zeros_like(x), np.divide(x, y)) - - -def _ids_to_counts(id_array): - """Given a numpy array, a mapping from each unique entry to its count.""" - ids, counts = np.unique(id_array, return_counts=True) - return dict(zip(ids, counts)) - - -class PanopticQuality: - """Metric class for Panoptic Quality. - - "Panoptic Segmentation" by Alexander Kirillov, Kaiming He, Ross Girshick, - Carsten Rother, Piotr Dollar. - https://arxiv.org/abs/1801.00868 - """ - - def __init__(self, num_categories, ignored_label, max_instances_per_category, - offset): - """Initialization for PanopticQualityMetric. - - Args: - num_categories: The number of segmentation categories (or "classes" in the - dataset. - ignored_label: A category id that is ignored in evaluation, e.g. the void - label as defined in COCO panoptic segmentation dataset. - max_instances_per_category: The maximum number of instances for each - category. Used in ensuring unique instance labels. - offset: The maximum number of unique labels. This is used, by multiplying - the ground-truth labels, to generate unique ids for individual regions - of overlap between groundtruth and predicted segments. 
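For example, with the small values used in the unit tests (`max_instances_per_category=10`, `offset=100`), the id arithmetic works out as in this sketch:

```python
max_instances_per_category = 10
offset = 100

# Category 2, instance 5 -> combined segment id 2 * 10 + 5 = 25.
gt_segment_id = 2 * max_instances_per_category + 5    # 25
pred_segment_id = 1 * max_instances_per_category + 3  # 13

# Every (groundtruth segment, predicted segment) overlap region gets a unique id.
intersection_id = gt_segment_id * offset + pred_segment_id  # 25 * 100 + 13 = 2513
```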
- """ - self.num_categories = num_categories - self.ignored_label = ignored_label - self.max_instances_per_category = max_instances_per_category - self.offset = offset - self.reset() - - def _naively_combine_labels(self, category_mask, instance_mask): - """Naively creates a combined label array from categories and instances.""" - return (category_mask.astype(np.uint32) * self.max_instances_per_category + - instance_mask.astype(np.uint32)) - - def compare_and_accumulate(self, groundtruths, predictions): - """Compares predicted segmentation with groundtruth, accumulates its metric. - - It is not assumed that instance ids are unique across different categories. - See for example combine_semantic_and_instance_predictions.py in official - PanopticAPI evaluation code for issues to consider when fusing category - and instance labels. - - Instances ids of the ignored category have the meaning that id 0 is "void" - and remaining ones are crowd instances. - - Args: - groundtruths: A dictionary contains groundtruth labels. It should contain - the following fields. - - category_mask: A 2D numpy uint16 array of groundtruth per-pixel - category labels. - - instance_mask: A 2D numpy uint16 array of groundtruth instance labels. - predictions: A dictionary contains the model outputs. It should contain - the following fields. - - category_array: A 2D numpy uint16 array of predicted per-pixel - category labels. - - instance_array: A 2D numpy uint16 array of predicted instance labels. - """ - groundtruth_category_mask = groundtruths['category_mask'] - groundtruth_instance_mask = groundtruths['instance_mask'] - predicted_category_mask = predictions['category_mask'] - predicted_instance_mask = predictions['instance_mask'] - - # First, combine the category and instance labels so that every unique - # value for (category, instance) is assigned a unique integer label. - pred_segment_id = self._naively_combine_labels(predicted_category_mask, - predicted_instance_mask) - gt_segment_id = self._naively_combine_labels(groundtruth_category_mask, - groundtruth_instance_mask) - - # Pre-calculate areas for all groundtruth and predicted segments. - gt_segment_areas = _ids_to_counts(gt_segment_id) - pred_segment_areas = _ids_to_counts(pred_segment_id) - - # We assume there is only one void segment and it has instance id = 0. - void_segment_id = self.ignored_label * self.max_instances_per_category - - # There may be other ignored groundtruth segments with instance id > 0, find - # those ids using the unique segment ids extracted with the area computation - # above. - ignored_segment_ids = { - gt_segment_id for gt_segment_id in gt_segment_areas - if (gt_segment_id // - self.max_instances_per_category) == self.ignored_label - } - - # Next, combine the groundtruth and predicted labels. Dividing up the pixels - # based on which groundtruth segment and which predicted segment they belong - # to, this will assign a different 32-bit integer label to each choice - # of (groundtruth segment, predicted segment), encoded as - # gt_segment_id * offset + pred_segment_id. - intersection_id_array = ( - gt_segment_id.astype(np.uint64) * self.offset + - pred_segment_id.astype(np.uint64)) - - # For every combination of (groundtruth segment, predicted segment) with a - # non-empty intersection, this counts the number of pixels in that - # intersection. - intersection_areas = _ids_to_counts(intersection_id_array) - - # Helper function that computes the area of the overlap between a predicted - # segment and the ground-truth void/ignored segment. 
- def prediction_void_overlap(pred_segment_id): - void_intersection_id = void_segment_id * self.offset + pred_segment_id - return intersection_areas.get(void_intersection_id, 0) - - # Compute overall ignored overlap. - def prediction_ignored_overlap(pred_segment_id): - total_ignored_overlap = 0 - for ignored_segment_id in ignored_segment_ids: - intersection_id = ignored_segment_id * self.offset + pred_segment_id - total_ignored_overlap += intersection_areas.get(intersection_id, 0) - return total_ignored_overlap - - # Sets that are populated with which segments groundtruth/predicted segments - # have been matched with overlapping predicted/groundtruth segments - # respectively. - gt_matched = set() - pred_matched = set() - - # Calculate IoU per pair of intersecting segments of the same category. - for intersection_id, intersection_area in intersection_areas.items(): - gt_segment_id = int(intersection_id // self.offset) - pred_segment_id = int(intersection_id % self.offset) - - gt_category = int(gt_segment_id // self.max_instances_per_category) - pred_category = int(pred_segment_id // self.max_instances_per_category) - if gt_category != pred_category: - continue - - # Union between the groundtruth and predicted segments being compared does - # not include the portion of the predicted segment that consists of - # groundtruth "void" pixels. - union = ( - gt_segment_areas[gt_segment_id] + - pred_segment_areas[pred_segment_id] - intersection_area - - prediction_void_overlap(pred_segment_id)) - iou = intersection_area / union - if iou > 0.5: - self.tp_per_class[gt_category] += 1 - self.iou_per_class[gt_category] += iou - gt_matched.add(gt_segment_id) - pred_matched.add(pred_segment_id) - - # Count false negatives for each category. - for gt_segment_id in gt_segment_areas: - if gt_segment_id in gt_matched: - continue - category = gt_segment_id // self.max_instances_per_category - # Failing to detect a void segment is not a false negative. - if category == self.ignored_label: - continue - self.fn_per_class[category] += 1 - - # Count false positives for each category. - for pred_segment_id in pred_segment_areas: - if pred_segment_id in pred_matched: - continue - # A false positive is not penalized if is mostly ignored in the - # groundtruth. - if (prediction_ignored_overlap(pred_segment_id) / - pred_segment_areas[pred_segment_id]) > 0.5: - continue - category = pred_segment_id // self.max_instances_per_category - self.fp_per_class[category] += 1 - - def _valid_categories(self): - """Categories with a "valid" value for the metric, have > 0 instances. - - We will ignore the `ignore_label` class and other classes which have - `tp + fn + fp = 0`. - - Returns: - Boolean array of shape `[num_categories]`. - """ - valid_categories = np.not_equal( - self.tp_per_class + self.fn_per_class + self.fp_per_class, 0) - if self.ignored_label >= 0 and self.ignored_label < self.num_categories: - valid_categories[self.ignored_label] = False - return valid_categories - - def result_per_category(self): - """For supported metrics, return individual per-category metric values. - - Returns: - A dictionary contains all per-class metrics, each metrics is a numpy array - of shape `[self.num_categories]`, where index `i` is the metrics value - over only that category. 
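As a quick numeric check of the formulas used below (illustrative values for a single category):

```python
iou_sum, tp, fn, fp = 1.7, 2.0, 1.0, 1.0

sq = iou_sum / tp                     # 0.85    segmentation quality
rq = tp / (tp + 0.5 * fn + 0.5 * fp)  # 0.6667  recognition quality
pq = sq * rq                          # 0.5667  panoptic quality
```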
- """ - sq_per_class = realdiv_maybe_zero(self.iou_per_class, self.tp_per_class) - rq_per_class = realdiv_maybe_zero( - self.tp_per_class, - self.tp_per_class + 0.5 * self.fn_per_class + 0.5 * self.fp_per_class) - return { - 'sq_per_class': sq_per_class, - 'rq_per_class': rq_per_class, - 'pq_per_class': np.multiply(sq_per_class, rq_per_class) - } - - def result(self, is_thing=None): - """Computes and returns the detailed metric results over all comparisons. - - Args: - is_thing: A boolean array of length `num_categories`. The entry - `is_thing[category_id]` is True iff that category is a "thing" category - instead of "stuff." - - Returns: - A dictionary with a breakdown of metrics and/or metric factors by things, - stuff, and all categories. - """ - results = self.result_per_category() - valid_categories = self._valid_categories() - # If known, break down which categories are valid _and_ things/stuff. - category_sets = collections.OrderedDict() - category_sets['All'] = valid_categories - if is_thing is not None: - category_sets['Things'] = np.logical_and(valid_categories, is_thing) - category_sets['Stuff'] = np.logical_and(valid_categories, - np.logical_not(is_thing)) - - for category_set_name, in_category_set in category_sets.items(): - if np.any(in_category_set): - results.update({ - f'{category_set_name}_pq': - np.mean(results['pq_per_class'][in_category_set]), - f'{category_set_name}_sq': - np.mean(results['sq_per_class'][in_category_set]), - f'{category_set_name}_rq': - np.mean(results['rq_per_class'][in_category_set]), - # The number of categories in this subset. - f'{category_set_name}_num_categories': - np.sum(in_category_set.astype(np.int32)), - }) - else: - results[category_set_name] = { - f'{category_set_name}_pq': 0., - f'{category_set_name}_sq': 0., - f'{category_set_name}_rq': 0., - f'{category_set_name}_num_categories': 0 - } - - return results - - def reset(self): - """Resets the accumulation to the metric class's state at initialization.""" - self.iou_per_class = np.zeros(self.num_categories, dtype=np.float64) - self.tp_per_class = np.zeros(self.num_categories, dtype=np.float64) - self.fn_per_class = np.zeros(self.num_categories, dtype=np.float64) - self.fp_per_class = np.zeros(self.num_categories, dtype=np.float64) diff --git a/official/vision/beta/evaluation/panoptic_quality_evaluator.py b/official/vision/beta/evaluation/panoptic_quality_evaluator.py deleted file mode 100644 index c9775680c..000000000 --- a/official/vision/beta/evaluation/panoptic_quality_evaluator.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""The panoptic quality evaluator. - -The following snippet demonstrates the use of interfaces: - - evaluator = PanopticQualityEvaluator(...) - for _ in range(num_evals): - for _ in range(num_batches_per_eval): - predictions, groundtruth = predictor.predict(...) # pop a batch. 
- evaluator.update_state(groundtruths, predictions) - evaluator.result() # finish one full eval and reset states. - -See also: https://github.com/cocodataset/cocoapi/ -""" - -import numpy as np -import tensorflow as tf - -from official.vision.beta.evaluation import panoptic_quality - - -def _crop_padding(mask, image_info): - """Crops padded masks to match original image shape. - - Args: - mask: a padded mask tensor. - image_info: a tensor that holds information about original and preprocessed - images. - Returns: - cropped and padded masks: tf.Tensor - """ - image_shape = tf.cast(image_info[0, :], tf.int32) - mask = tf.image.crop_to_bounding_box( - tf.expand_dims(mask, axis=-1), 0, 0, - image_shape[0], image_shape[1]) - return tf.expand_dims(mask[:, :, 0], axis=0) - - -class PanopticQualityEvaluator: - """Panoptic Quality metric class.""" - - def __init__(self, num_categories, ignored_label, max_instances_per_category, - offset, is_thing=None, rescale_predictions=False): - """Constructs Panoptic Quality evaluation class. - - The class provides the interface to Panoptic Quality metrics_fn. - - Args: - num_categories: The number of segmentation categories (or "classes" in the - dataset. - ignored_label: A category id that is ignored in evaluation, e.g. the void - label as defined in COCO panoptic segmentation dataset. - max_instances_per_category: The maximum number of instances for each - category. Used in ensuring unique instance labels. - offset: The maximum number of unique labels. This is used, by multiplying - the ground-truth labels, to generate unique ids for individual regions - of overlap between groundtruth and predicted segments. - is_thing: A boolean array of length `num_categories`. The entry - `is_thing[category_id]` is True iff that category is a "thing" category - instead of "stuff." Default to `None`, and it means categories are not - classified into these two categories. - rescale_predictions: `bool`, whether to scale back prediction to original - image sizes. If True, groundtruths['image_info'] is used to rescale - predictions. 
- """ - self._pq_metric_module = panoptic_quality.PanopticQuality( - num_categories, ignored_label, max_instances_per_category, offset) - self._is_thing = is_thing - self._rescale_predictions = rescale_predictions - self._required_prediction_fields = ['category_mask', 'instance_mask'] - self._required_groundtruth_fields = ['category_mask', 'instance_mask'] - self.reset_states() - - @property - def name(self): - return 'panoptic_quality' - - def reset_states(self): - """Resets internal states for a fresh run.""" - self._pq_metric_module.reset() - - def result(self): - """Evaluates detection results, and reset_states.""" - results = self._pq_metric_module.result(self._is_thing) - self.reset_states() - return results - - def _convert_to_numpy(self, groundtruths, predictions): - """Converts tesnors to numpy arrays.""" - if groundtruths: - labels = tf.nest.map_structure(lambda x: x.numpy(), groundtruths) - numpy_groundtruths = {} - for key, val in labels.items(): - if isinstance(val, tuple): - val = np.concatenate(val) - numpy_groundtruths[key] = val - else: - numpy_groundtruths = groundtruths - - if predictions: - outputs = tf.nest.map_structure(lambda x: x.numpy(), predictions) - numpy_predictions = {} - for key, val in outputs.items(): - if isinstance(val, tuple): - val = np.concatenate(val) - numpy_predictions[key] = val - else: - numpy_predictions = predictions - - return numpy_groundtruths, numpy_predictions - - def update_state(self, groundtruths, predictions): - """Update and aggregate detection results and groundtruth data. - - Args: - groundtruths: a dictionary of Tensors including the fields below. See also - different parsers under `../dataloader` for more details. - Required fields: - - category_mask: a numpy array of uint16 of shape [batch_size, H, W]. - - instance_mask: a numpy array of uint16 of shape [batch_size, H, W]. - - image_info: [batch, 4, 2], a tensor that holds information about - original and preprocessed images. Each entry is in the format of - [[original_height, original_width], [input_height, input_width], - [y_scale, x_scale], [y_offset, x_offset]], where [desired_height, - desired_width] is the actual scaled image size, and [y_scale, x_scale] - is the scaling factor, which is the ratio of scaled dimension / - original dimension. - predictions: a dictionary of tensors including the fields below. See - different parsers under `../dataloader` for more details. - Required fields: - - category_mask: a numpy array of uint16 of shape [batch_size, H, W]. - - instance_mask: a numpy array of uint16 of shape [batch_size, H, W]. - - Raises: - ValueError: if the required prediction or groundtruth fields are not - present in the incoming `predictions` or `groundtruths`. 
- """ - groundtruths, predictions = self._convert_to_numpy(groundtruths, - predictions) - for k in self._required_prediction_fields: - if k not in predictions: - raise ValueError( - 'Missing the required key `{}` in predictions!'.format(k)) - - for k in self._required_groundtruth_fields: - if k not in groundtruths: - raise ValueError( - 'Missing the required key `{}` in groundtruths!'.format(k)) - - if self._rescale_predictions: - for idx in range(len(groundtruths['category_mask'])): - image_info = groundtruths['image_info'][idx] - groundtruths_ = { - 'category_mask': - _crop_padding(groundtruths['category_mask'][idx], image_info), - 'instance_mask': - _crop_padding(groundtruths['instance_mask'][idx], image_info), - } - predictions_ = { - 'category_mask': - _crop_padding(predictions['category_mask'][idx], image_info), - 'instance_mask': - _crop_padding(predictions['instance_mask'][idx], image_info), - } - groundtruths_, predictions_ = self._convert_to_numpy( - groundtruths_, predictions_) - - self._pq_metric_module.compare_and_accumulate( - groundtruths_, predictions_) - else: - self._pq_metric_module.compare_and_accumulate(groundtruths, predictions) diff --git a/official/vision/beta/evaluation/panoptic_quality_evaluator_test.py b/official/vision/beta/evaluation/panoptic_quality_evaluator_test.py deleted file mode 100644 index e162058ae..000000000 --- a/official/vision/beta/evaluation/panoptic_quality_evaluator_test.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for panoptic_quality_evaluator.""" - -import numpy as np -import tensorflow as tf - -from official.vision.beta.evaluation import panoptic_quality_evaluator - - -class PanopticQualityEvaluatorTest(tf.test.TestCase): - - def test_multiple_batches(self): - category_mask = np.zeros([6, 6], np.uint16) - groundtruth_instance_mask = np.array([ - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 2, 2, 2, 1], - [1, 2, 2, 2, 2, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - ], - dtype=np.uint16) - - good_det_instance_mask = np.array([ - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 2, 2, 2, 2, 1], - [1, 2, 2, 2, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - ], - dtype=np.uint16) - - groundtruths = { - 'category_mask': - tf.convert_to_tensor([category_mask]), - 'instance_mask': - tf.convert_to_tensor([groundtruth_instance_mask]), - 'image_info': - tf.convert_to_tensor([[[6, 6], [6, 6], [1.0, 1.0], [0, 0]]], - dtype=tf.float32) - } - predictions = { - 'category_mask': tf.convert_to_tensor([category_mask]), - 'instance_mask': tf.convert_to_tensor([good_det_instance_mask]) - } - - pq_evaluator = panoptic_quality_evaluator.PanopticQualityEvaluator( - num_categories=1, - ignored_label=2, - max_instances_per_category=16, - offset=16, - rescale_predictions=True) - for _ in range(2): - pq_evaluator.update_state(groundtruths, predictions) - - bad_det_instance_mask = np.array([ - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 2, 2, 1], - [1, 1, 1, 2, 2, 1], - [1, 1, 1, 2, 2, 1], - [1, 1, 1, 1, 1, 1], - ], - dtype=np.uint16) - predictions['instance_mask'] = tf.convert_to_tensor([bad_det_instance_mask]) - for _ in range(2): - pq_evaluator.update_state(groundtruths, predictions) - - results = pq_evaluator.result() - np.testing.assert_array_equal(results['pq_per_class'], - [((28 / 30 + 6 / 8) + (27 / 32)) / 2 / 2]) - np.testing.assert_array_equal(results['rq_per_class'], [3 / 4]) - np.testing.assert_array_equal(results['sq_per_class'], - [((28 / 30 + 6 / 8) + (27 / 32)) / 3]) - self.assertAlmostEqual(results['All_pq'], 0.63177083) - self.assertAlmostEqual(results['All_rq'], 0.75) - self.assertAlmostEqual(results['All_sq'], 0.84236111) - self.assertEqual(results['All_num_categories'], 1) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/evaluation/panoptic_quality_test.py b/official/vision/beta/evaluation/panoptic_quality_test.py deleted file mode 100644 index 60cf27c22..000000000 --- a/official/vision/beta/evaluation/panoptic_quality_test.py +++ /dev/null @@ -1,305 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for Panoptic Quality metric. 
- -Note that this metric test class is branched from -https://github.com/tensorflow/models/blob/master/research/deeplab/evaluation/panoptic_quality_test.py -""" - - -from absl.testing import absltest -import numpy as np - -from official.vision.beta.evaluation import panoptic_quality - - -class PanopticQualityTest(absltest.TestCase): - - def test_perfect_match(self): - category_mask = np.zeros([6, 6], np.uint16) - instance_mask = np.array([ - [1, 1, 1, 1, 1, 1], - [1, 2, 2, 2, 2, 1], - [1, 2, 2, 2, 2, 1], - [1, 2, 2, 2, 2, 1], - [1, 2, 2, 1, 1, 1], - [1, 2, 1, 1, 1, 1], - ], - dtype=np.uint16) - - groundtruths = { - 'category_mask': category_mask, - 'instance_mask': instance_mask - } - predictions = { - 'category_mask': category_mask, - 'instance_mask': instance_mask - } - pq_metric = panoptic_quality.PanopticQuality( - num_categories=1, - ignored_label=2, - max_instances_per_category=16, - offset=16) - pq_metric.compare_and_accumulate(groundtruths, predictions) - - np.testing.assert_array_equal(pq_metric.iou_per_class, [2.0]) - np.testing.assert_array_equal(pq_metric.tp_per_class, [2]) - np.testing.assert_array_equal(pq_metric.fn_per_class, [0]) - np.testing.assert_array_equal(pq_metric.fp_per_class, [0]) - results = pq_metric.result() - np.testing.assert_array_equal(results['pq_per_class'], [1.0]) - np.testing.assert_array_equal(results['rq_per_class'], [1.0]) - np.testing.assert_array_equal(results['sq_per_class'], [1.0]) - self.assertAlmostEqual(results['All_pq'], 1.0) - self.assertAlmostEqual(results['All_rq'], 1.0) - self.assertAlmostEqual(results['All_sq'], 1.0) - self.assertEqual(results['All_num_categories'], 1) - - def test_totally_wrong(self): - category_mask = np.array([ - [0, 0, 0, 0, 0, 0], - [0, 1, 0, 0, 1, 0], - [0, 1, 1, 1, 1, 0], - [0, 1, 1, 1, 1, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - ], - dtype=np.uint16) - instance_mask = np.zeros([6, 6], np.uint16) - - groundtruths = { - 'category_mask': category_mask, - 'instance_mask': instance_mask - } - predictions = { - 'category_mask': 1 - category_mask, - 'instance_mask': instance_mask - } - - pq_metric = panoptic_quality.PanopticQuality( - num_categories=2, - ignored_label=2, - max_instances_per_category=1, - offset=16) - pq_metric.compare_and_accumulate(groundtruths, predictions) - np.testing.assert_array_equal(pq_metric.iou_per_class, [0.0, 0.0]) - np.testing.assert_array_equal(pq_metric.tp_per_class, [0, 0]) - np.testing.assert_array_equal(pq_metric.fn_per_class, [1, 1]) - np.testing.assert_array_equal(pq_metric.fp_per_class, [1, 1]) - results = pq_metric.result() - np.testing.assert_array_equal(results['pq_per_class'], [0.0, 0.0]) - np.testing.assert_array_equal(results['rq_per_class'], [0.0, 0.0]) - np.testing.assert_array_equal(results['sq_per_class'], [0.0, 0.0]) - self.assertAlmostEqual(results['All_pq'], 0.0) - self.assertAlmostEqual(results['All_rq'], 0.0) - self.assertAlmostEqual(results['All_sq'], 0.0) - self.assertEqual(results['All_num_categories'], 2) - - def test_matches_by_iou(self): - groundtruth_instance_mask = np.array( - [ - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 2, 2, 2, 1], - [1, 2, 2, 2, 2, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - ], - dtype=np.uint16) - - good_det_instance_mask = np.array( - [ - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 2, 2, 2, 2, 1], - [1, 2, 2, 2, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - ], - dtype=np.uint16) - - groundtruths = { - 'category_mask': np.zeros_like(groundtruth_instance_mask), - 'instance_mask': groundtruth_instance_mask 
- } - predictions = { - 'category_mask': np.zeros_like(good_det_instance_mask), - 'instance_mask': good_det_instance_mask - } - pq_metric = panoptic_quality.PanopticQuality( - num_categories=1, - ignored_label=2, - max_instances_per_category=16, - offset=16) - pq_metric.compare_and_accumulate(groundtruths, predictions) - - # iou(1, 1) = 28/30 - # iou(2, 2) = 6 / 8 - np.testing.assert_array_almost_equal(pq_metric.iou_per_class, - [28 / 30 + 6 / 8]) - np.testing.assert_array_equal(pq_metric.tp_per_class, [2]) - np.testing.assert_array_equal(pq_metric.fn_per_class, [0]) - np.testing.assert_array_equal(pq_metric.fp_per_class, [0]) - results = pq_metric.result() - np.testing.assert_array_equal(results['pq_per_class'], - [(28 / 30 + 6 / 8) / 2]) - np.testing.assert_array_equal(results['rq_per_class'], [1.0]) - np.testing.assert_array_equal(results['sq_per_class'], - [(28 / 30 + 6 / 8) / 2]) - self.assertAlmostEqual(results['All_pq'], (28 / 30 + 6 / 8) / 2) - self.assertAlmostEqual(results['All_rq'], 1.0) - self.assertAlmostEqual(results['All_sq'], (28 / 30 + 6 / 8) / 2) - self.assertEqual(results['All_num_categories'], 1) - - bad_det_instance_mask = np.array( - [ - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 2, 2, 1], - [1, 1, 1, 2, 2, 1], - [1, 1, 1, 2, 2, 1], - [1, 1, 1, 1, 1, 1], - ], - dtype=np.uint16) - predictions['instance_mask'] = bad_det_instance_mask - - pq_metric.reset() - pq_metric.compare_and_accumulate(groundtruths, predictions) - - # iou(1, 1) = 27/32 - np.testing.assert_array_almost_equal(pq_metric.iou_per_class, [27 / 32]) - np.testing.assert_array_equal(pq_metric.tp_per_class, [1]) - np.testing.assert_array_equal(pq_metric.fn_per_class, [1]) - np.testing.assert_array_equal(pq_metric.fp_per_class, [1]) - results = pq_metric.result() - np.testing.assert_array_equal(results['pq_per_class'], [27 / 32 / 2]) - np.testing.assert_array_equal(results['rq_per_class'], [0.5]) - np.testing.assert_array_equal(results['sq_per_class'], [27 / 32]) - self.assertAlmostEqual(results['All_pq'], 27 / 32 / 2) - self.assertAlmostEqual(results['All_rq'], 0.5) - self.assertAlmostEqual(results['All_sq'], 27 / 32) - self.assertEqual(results['All_num_categories'], 1) - - def test_wrong_instances(self): - category_mask = np.array([ - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 2, 2, 1, 2, 2], - [1, 2, 2, 1, 2, 2], - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - ], - dtype=np.uint16) - groundtruth_instance_mask = np.zeros([6, 6], dtype=np.uint16) - predicted_instance_mask = np.array([ - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 1, 1], - [0, 0, 0, 0, 1, 1], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - ], - dtype=np.uint16) - - groundtruths = { - 'category_mask': category_mask, - 'instance_mask': groundtruth_instance_mask - } - predictions = { - 'category_mask': category_mask, - 'instance_mask': predicted_instance_mask - } - - pq_metric = panoptic_quality.PanopticQuality( - num_categories=3, - ignored_label=0, - max_instances_per_category=10, - offset=100) - pq_metric.compare_and_accumulate(groundtruths, predictions) - - np.testing.assert_array_equal(pq_metric.iou_per_class, [0.0, 1.0, 0.0]) - np.testing.assert_array_equal(pq_metric.tp_per_class, [0, 1, 0]) - np.testing.assert_array_equal(pq_metric.fn_per_class, [0, 0, 1]) - np.testing.assert_array_equal(pq_metric.fp_per_class, [0, 0, 2]) - results = pq_metric.result() - np.testing.assert_array_equal(results['pq_per_class'], [0.0, 1.0, 0.0]) - np.testing.assert_array_equal(results['rq_per_class'], [0.0, 1.0, 0.0]) - 
np.testing.assert_array_equal(results['sq_per_class'], [0.0, 1.0, 0.0]) - self.assertAlmostEqual(results['All_pq'], 0.5) - self.assertAlmostEqual(results['All_rq'], 0.5) - self.assertAlmostEqual(results['All_sq'], 0.5) - self.assertEqual(results['All_num_categories'], 2) - - def test_instance_order_is_arbitrary(self): - category_mask = np.array([ - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 2, 2, 1, 2, 2], - [1, 2, 2, 1, 2, 2], - [1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - ], - dtype=np.uint16) - groundtruth_instance_mask = np.array([ - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 1, 1, 0, 0, 0], - [0, 1, 1, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - ], - dtype=np.uint16) - predicted_instance_mask = np.array([ - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 1, 1], - [0, 0, 0, 0, 1, 1], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - ], - dtype=np.uint16) - - groundtruths = { - 'category_mask': category_mask, - 'instance_mask': groundtruth_instance_mask - } - predictions = { - 'category_mask': category_mask, - 'instance_mask': predicted_instance_mask - } - - pq_metric = panoptic_quality.PanopticQuality( - num_categories=3, - ignored_label=0, - max_instances_per_category=10, - offset=100) - pq_metric.compare_and_accumulate(groundtruths, predictions) - - np.testing.assert_array_equal(pq_metric.iou_per_class, [0.0, 1.0, 2.0]) - np.testing.assert_array_equal(pq_metric.tp_per_class, [0, 1, 2]) - np.testing.assert_array_equal(pq_metric.fn_per_class, [0, 0, 0]) - np.testing.assert_array_equal(pq_metric.fp_per_class, [0, 0, 0]) - results = pq_metric.result() - np.testing.assert_array_equal(results['pq_per_class'], [0.0, 1.0, 1.0]) - np.testing.assert_array_equal(results['rq_per_class'], [0.0, 1.0, 1.0]) - np.testing.assert_array_equal(results['sq_per_class'], [0.0, 1.0, 1.0]) - self.assertAlmostEqual(results['All_pq'], 1.0) - self.assertAlmostEqual(results['All_rq'], 1.0) - self.assertAlmostEqual(results['All_sq'], 1.0) - self.assertEqual(results['All_num_categories'], 2) - - -if __name__ == '__main__': - absltest.main() diff --git a/official/vision/beta/evaluation/segmentation_metrics.py b/official/vision/beta/evaluation/segmentation_metrics.py deleted file mode 100644 index 6d6bd8d64..000000000 --- a/official/vision/beta/evaluation/segmentation_metrics.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Metrics for segmentation.""" -import tensorflow as tf - -from official.vision.beta.evaluation import iou - - -class MeanIoU(tf.keras.metrics.MeanIoU): - """Mean IoU metric for semantic segmentation. - - This class utilizes tf.keras.metrics.MeanIoU to perform batched mean iou when - both input images and groundtruth masks are resized to the same size - (rescale_predictions=False). It also computes mean iou on groundtruth original - sizes, in which case, each prediction is rescaled back to the original image - size. 
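A minimal usage sketch (tensor shapes follow `update_state` below; the inputs are placeholders, not real data):

```python
import tensorflow as tf

metric = MeanIoU(num_classes=2, rescale_predictions=False)

y_true = {
    'masks': tf.zeros([1, 64, 64, 1], tf.int32),      # groundtruth labels
    'valid_masks': tf.ones([1, 64, 64, 1], tf.bool),  # every pixel counted
    'image_info': tf.constant(
        [[[64., 64.], [64., 64.], [1., 1.], [0., 0.]]]),
}
y_pred = tf.random.uniform([1, 64, 64, 2])            # per-class logits

metric.update_state(y_true, y_pred)
mean_iou = metric.result()
```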
- """ - - def __init__( - self, num_classes, rescale_predictions=False, name=None, dtype=None): - """Constructs Segmentation evaluator class. - - Args: - num_classes: `int`, number of classes. - rescale_predictions: `bool`, whether to scale back prediction to original - image sizes. If True, y_true['image_info'] is used to rescale - predictions. - name: `str`, name of the metric instance.. - dtype: data type of the metric result. - """ - self._rescale_predictions = rescale_predictions - super().__init__(num_classes=num_classes, name=name, dtype=dtype) - - def update_state(self, y_true, y_pred): - """Updates metric state. - - Args: - y_true: `dict`, dictionary with the following name, and key values. - - masks: [batch, width, height, 1], groundtruth masks. - - valid_masks: [batch, width, height, 1], valid elements in the mask. - - image_info: [batch, 4, 2], a tensor that holds information about - original and preprocessed images. Each entry is in the format of - [[original_height, original_width], [input_height, input_width], - [y_scale, x_scale], [y_offset, x_offset]], where [desired_height, - desired_width] is the actual scaled image size, and [y_scale, x_scale] - is the scaling factor, which is the ratio of scaled dimension / - original dimension. - y_pred: Tensor [batch, width_p, height_p, num_classes], predicated masks. - """ - predictions = y_pred - masks = y_true['masks'] - valid_masks = y_true['valid_masks'] - images_info = y_true['image_info'] - - if isinstance(predictions, tuple) or isinstance(predictions, list): - predictions = tf.concat(predictions, axis=0) - masks = tf.concat(masks, axis=0) - valid_masks = tf.concat(valid_masks, axis=0) - images_info = tf.concat(images_info, axis=0) - - # Ignore mask elements is set to zero for argmax op. - masks = tf.where(valid_masks, masks, tf.zeros_like(masks)) - - if self._rescale_predictions: - # This part can only run on cpu/gpu due to dynamic image resizing. 
- for i in range(tf.shape(predictions)[0]): - mask = masks[i] - valid_mask = valid_masks[i] - predicted_mask = predictions[i] - image_info = images_info[i] - - rescale_size = tf.cast( - tf.math.ceil(image_info[1, :] / image_info[2, :]), tf.int32) - image_shape = tf.cast(image_info[0, :], tf.int32) - offsets = tf.cast(image_info[3, :], tf.int32) - - predicted_mask = tf.image.resize( - predicted_mask, - rescale_size, - method=tf.image.ResizeMethod.BILINEAR) - - predicted_mask = tf.image.crop_to_bounding_box(predicted_mask, - offsets[0], offsets[1], - image_shape[0], - image_shape[1]) - mask = tf.image.crop_to_bounding_box(mask, 0, 0, image_shape[0], - image_shape[1]) - valid_mask = tf.image.crop_to_bounding_box(valid_mask, 0, 0, - image_shape[0], - image_shape[1]) - - predicted_mask = tf.argmax(predicted_mask, axis=2) - flatten_predictions = tf.reshape(predicted_mask, shape=[1, -1]) - flatten_masks = tf.reshape(mask, shape=[1, -1]) - flatten_valid_masks = tf.reshape(valid_mask, shape=[1, -1]) - super(MeanIoU, self).update_state( - flatten_masks, flatten_predictions, - tf.cast(flatten_valid_masks, tf.float32)) - - else: - predictions = tf.image.resize( - predictions, - tf.shape(masks)[1:3], - method=tf.image.ResizeMethod.BILINEAR) - predictions = tf.argmax(predictions, axis=3) - flatten_predictions = tf.reshape(predictions, shape=[-1]) - flatten_masks = tf.reshape(masks, shape=[-1]) - flatten_valid_masks = tf.reshape(valid_masks, shape=[-1]) - - super().update_state(flatten_masks, flatten_predictions, - tf.cast(flatten_valid_masks, tf.float32)) - - -class PerClassIoU(iou.PerClassIoU): - """Per Class IoU metric for semantic segmentation. - - This class utilizes iou.PerClassIoU to perform batched per class - iou when both input images and groundtruth masks are resized to the same size - (rescale_predictions=False). It also computes per class iou on groundtruth - original sizes, in which case, each prediction is rescaled back to the - original image size. - """ - - def __init__( - self, num_classes, rescale_predictions=False, name=None, dtype=None): - """Constructs Segmentation evaluator class. - - Args: - num_classes: `int`, number of classes. - rescale_predictions: `bool`, whether to scale back prediction to original - image sizes. If True, y_true['image_info'] is used to rescale - predictions. - name: `str`, name of the metric instance.. - dtype: data type of the metric result. - """ - self._rescale_predictions = rescale_predictions - super().__init__(num_classes=num_classes, name=name, dtype=dtype) - - def update_state(self, y_true, y_pred): - """Updates metric state. - - Args: - y_true: `dict`, dictionary with the following name, and key values. - - masks: [batch, width, height, 1], groundtruth masks. - - valid_masks: [batch, width, height, 1], valid elements in the mask. - - image_info: [batch, 4, 2], a tensor that holds information about - original and preprocessed images. Each entry is in the format of - [[original_height, original_width], [input_height, input_width], - [y_scale, x_scale], [y_offset, x_offset]], where [desired_height, - desired_width] is the actual scaled image size, and [y_scale, x_scale] - is the scaling factor, which is the ratio of scaled dimension / - original dimension. - y_pred: Tensor [batch, width_p, height_p, num_classes], predicated masks. 
- """ - predictions = y_pred - masks = y_true['masks'] - valid_masks = y_true['valid_masks'] - images_info = y_true['image_info'] - - if isinstance(predictions, tuple) or isinstance(predictions, list): - predictions = tf.concat(predictions, axis=0) - masks = tf.concat(masks, axis=0) - valid_masks = tf.concat(valid_masks, axis=0) - images_info = tf.concat(images_info, axis=0) - - # Ignore mask elements is set to zero for argmax op. - masks = tf.where(valid_masks, masks, tf.zeros_like(masks)) - - if self._rescale_predictions: - # This part can only run on cpu/gpu due to dynamic image resizing. - for i in range(tf.shape(predictions)[0]): - mask = masks[i] - valid_mask = valid_masks[i] - predicted_mask = predictions[i] - image_info = images_info[i] - - rescale_size = tf.cast( - tf.math.ceil(image_info[1, :] / image_info[2, :]), tf.int32) - image_shape = tf.cast(image_info[0, :], tf.int32) - offsets = tf.cast(image_info[3, :], tf.int32) - - predicted_mask = tf.image.resize( - predicted_mask, - rescale_size, - method=tf.image.ResizeMethod.BILINEAR) - - predicted_mask = tf.image.crop_to_bounding_box(predicted_mask, - offsets[0], offsets[1], - image_shape[0], - image_shape[1]) - mask = tf.image.crop_to_bounding_box(mask, 0, 0, image_shape[0], - image_shape[1]) - valid_mask = tf.image.crop_to_bounding_box(valid_mask, 0, 0, - image_shape[0], - image_shape[1]) - - predicted_mask = tf.argmax(predicted_mask, axis=2) - flatten_predictions = tf.reshape(predicted_mask, shape=[1, -1]) - flatten_masks = tf.reshape(mask, shape=[1, -1]) - flatten_valid_masks = tf.reshape(valid_mask, shape=[1, -1]) - super().update_state(flatten_masks, flatten_predictions, - tf.cast(flatten_valid_masks, tf.float32)) - - else: - predictions = tf.image.resize( - predictions, - tf.shape(masks)[1:3], - method=tf.image.ResizeMethod.BILINEAR) - predictions = tf.argmax(predictions, axis=3) - flatten_predictions = tf.reshape(predictions, shape=[-1]) - flatten_masks = tf.reshape(masks, shape=[-1]) - flatten_valid_masks = tf.reshape(valid_masks, shape=[-1]) - - super().update_state(flatten_masks, flatten_predictions, - tf.cast(flatten_valid_masks, tf.float32)) diff --git a/official/vision/beta/evaluation/segmentation_metrics_test.py b/official/vision/beta/evaluation/segmentation_metrics_test.py deleted file mode 100644 index 27b0c3b8e..000000000 --- a/official/vision/beta/evaluation/segmentation_metrics_test.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for segmentation_metrics.""" - -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.vision.beta.evaluation import segmentation_metrics - - -class SegmentationMetricsTest(parameterized.TestCase, tf.test.TestCase): - - def _create_test_data(self): - y_pred_cls0 = np.expand_dims( - np.array([[1, 1, 0], [1, 1, 0], [0, 0, 0]], dtype=np.uint16), - axis=(0, -1)) - y_pred_cls1 = np.expand_dims( - np.array([[0, 0, 0], [0, 0, 1], [0, 0, 1]], dtype=np.uint16), - axis=(0, -1)) - y_pred = np.concatenate((y_pred_cls0, y_pred_cls1), axis=-1) - - y_true = { - 'masks': - np.expand_dims( - np.array([[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1], - [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 1]], - dtype=np.uint16), - axis=(0, -1)), - 'valid_masks': - np.ones([1, 6, 6, 1], dtype=np.uint16), - 'image_info': - np.array([[[6, 6], [3, 3], [0.5, 0.5], [0, 0]]], dtype=np.float32) - } - return y_pred, y_true - - @parameterized.parameters(True, False) - def test_mean_iou_metric(self, rescale_predictions): - tf.config.experimental_run_functions_eagerly(True) - mean_iou_metric = segmentation_metrics.MeanIoU( - num_classes=2, rescale_predictions=rescale_predictions) - y_pred, y_true = self._create_test_data() - # Disable autograph for correct coverage statistics. - update_fn = tf.autograph.experimental.do_not_convert( - mean_iou_metric.update_state) - update_fn(y_true=y_true, y_pred=y_pred) - miou = mean_iou_metric.result() - self.assertAlmostEqual(miou.numpy(), 0.762, places=3) - - @parameterized.parameters(True, False) - def test_per_class_mean_iou_metric(self, rescale_predictions): - per_class_iou_metric = segmentation_metrics.PerClassIoU( - num_classes=2, rescale_predictions=rescale_predictions) - y_pred, y_true = self._create_test_data() - # Disable autograph for correct coverage statistics. - update_fn = tf.autograph.experimental.do_not_convert( - per_class_iou_metric.update_state) - update_fn(y_true=y_true, y_pred=y_pred) - per_class_miou = per_class_iou_metric.result() - self.assertAllClose(per_class_miou.numpy(), [0.857, 0.667], atol=1e-3) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/evaluation/wod_detection_evaluator.py b/official/vision/beta/evaluation/wod_detection_evaluator.py deleted file mode 100644 index d33aa52ca..000000000 --- a/official/vision/beta/evaluation/wod_detection_evaluator.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""2D detection evaluator for the Waymo Open Dataset.""" -import pprint -from absl import logging - -import tensorflow as tf -from official.vision.beta.ops import box_ops -from waymo_open_dataset import label_pb2 -from waymo_open_dataset.metrics.python import wod_detection_evaluator -from waymo_open_dataset.protos import breakdown_pb2 -from waymo_open_dataset.protos import metrics_pb2 - - -def get_2d_detection_default_config(): - """Returns the config proto for WOD 2D detection Evaluation.""" - config = metrics_pb2.Config() - - config.breakdown_generator_ids.append(breakdown_pb2.Breakdown.OBJECT_TYPE) - difficulty = config.difficulties.add() - difficulty.levels.append(label_pb2.Label.LEVEL_1) - difficulty.levels.append(label_pb2.Label.LEVEL_2) - config.breakdown_generator_ids.append(breakdown_pb2.Breakdown.ALL_BUT_SIGN) - difficulty = config.difficulties.add() - difficulty.levels.append(label_pb2.Label.LEVEL_1) - difficulty.levels.append(label_pb2.Label.LEVEL_2) - config.matcher_type = metrics_pb2.MatcherProto.TYPE_HUNGARIAN - config.iou_thresholds.append(0.0) - config.iou_thresholds.append(0.7) - config.iou_thresholds.append(0.5) - config.iou_thresholds.append(0.5) - config.iou_thresholds.append(0.5) - config.box_type = label_pb2.Label.Box.TYPE_2D - - for i in range(100): - config.score_cutoffs.append(i * 0.01) - config.score_cutoffs.append(1.0) - - return config - - -class WOD2dDetectionEvaluator(wod_detection_evaluator.WODDetectionEvaluator): - """WOD 2D detection evaluation metric class.""" - - def __init__(self, config=None): - if config is None: - config = get_2d_detection_default_config() - super().__init__(config=config) - - def _remove_padding(self, tensor_dict, num_valid): - """Remove the paddings of the prediction/groundtruth data.""" - result_tensor_dict = {} - gather_indices = tf.range(num_valid) - for k, v in tensor_dict.items(): - if 'frame_id' in k: - result_tensor_dict[k] = tf.tile([v], [num_valid]) - else: - result_tensor_dict[k] = tf.gather(v, gather_indices) - return result_tensor_dict - - def update_state(self, groundtruths, predictions): - """Update the metrics state with prediction and groundtruth data. - - Args: - groundtruths: a dictionary of Tensors including the fields below. - Required fields: - - source_id: a numpy array of int or string of shape [batch_size]. - - num_detections: a numpy array of int of shape [batch_size]. - - boxes: a numpy array of float of shape [batch_size, K, 4]. - - classes: a numpy array of int of shape [batch_size, K]. - - difficulties: a numpy array of int of shape [batch_size, K]. - - predictions: a dictionary of tensors including the fields below. - Required fields: - - source_id: a numpy array of int or string of shape [batch_size]. - - image_info: a numpy array of float of shape [batch_size, 4, 2]. - - num_detections: a numpy array of int of shape [batch_size]. - - detection_boxes: a numpy array of float of shape [batch_size, K, 4]. - - detection_classes: a numpy array of int of shape [batch_size, K]. - - detection_scores: a numpy array of float of shape [batch_size, K]. - """ - # Preprocess potentially aggregated tensors. - for k, v in groundtruths.items(): - if isinstance(v, tuple): - groundtruths[k] = tf.concat(v, axis=0) - for k, v in predictions.items(): - if isinstance(v, tuple): - predictions[k] = tf.concat(v, axis=0) - - # Change cyclists' type id from 3 to 4, where 3 is reserved for sign. 
- groundtruth_type = tf.cast(groundtruths['classes'], tf.uint8) - groundtruth_type = tf.where( - tf.equal(groundtruth_type, 3), - tf.ones_like(groundtruth_type) * 4, groundtruth_type) - prediction_type = tf.cast(predictions['detection_classes'], tf.uint8) - prediction_type = tf.where( - tf.equal(prediction_type, 3), - tf.ones_like(prediction_type) * 4, prediction_type) - - # Rescale the detection boxes back to original scale. - image_scale = tf.tile(predictions['image_info'][:, 2:3, :], (1, 1, 2)) - prediction_bbox = predictions['detection_boxes'] / image_scale - - batch_size = tf.shape(groundtruths['source_id'])[0] - - for i in tf.range(batch_size): - frame_groundtruths = { - 'ground_truth_frame_id': - groundtruths['source_id'][i], - 'ground_truth_bbox': - box_ops.yxyx_to_cycxhw( - tf.cast(groundtruths['boxes'][i], tf.float32)), - 'ground_truth_type': - groundtruth_type[i], - 'ground_truth_difficulty': - tf.cast(groundtruths['difficulties'][i], tf.uint8), - } - frame_groundtruths = self._remove_padding( - frame_groundtruths, groundtruths['num_detections'][i]) - frame_predictions = { - 'prediction_frame_id': - groundtruths['source_id'][i], - 'prediction_bbox': - box_ops.yxyx_to_cycxhw( - tf.cast(prediction_bbox[i], tf.float32)), - 'prediction_type': - prediction_type[i], - 'prediction_score': - tf.cast(predictions['detection_scores'][i], tf.float32), - 'prediction_overlap_nlz': - tf.zeros_like(predictions['detection_scores'][i], dtype=tf.bool) - } - frame_predictions = self._remove_padding(frame_predictions, - predictions['num_detections'][i]) - super().update_state(frame_groundtruths, frame_predictions) - - def evaluate(self): - """Compute the final metrics.""" - ap, _, _, _, _ = super().evaluate() - metric_dict = {} - for i, name in enumerate(self._breakdown_names): - # Skip sign metrics in 2d detection task. - if 'SIGN' in name: - continue - metric_dict['WOD metrics/{}/AP'.format(name)] = ap[i] - pp = pprint.PrettyPrinter() - logging.info('WOD Detection Metrics: \n %s', pp.pformat(metric_dict)) - - return metric_dict diff --git a/official/vision/beta/losses/__init__.py b/official/vision/beta/losses/__init__.py deleted file mode 100644 index 310bfb28f..000000000 --- a/official/vision/beta/losses/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - diff --git a/official/vision/beta/losses/focal_loss.py b/official/vision/beta/losses/focal_loss.py deleted file mode 100644 index 4a4ce70b3..000000000 --- a/official/vision/beta/losses/focal_loss.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Losses used for detection models.""" - -import tensorflow as tf - - -class FocalLoss(tf.keras.losses.Loss): - """Implements a Focal loss for classification problems. - - Reference: - [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002). - """ - - def __init__(self, - alpha, - gamma, - reduction=tf.keras.losses.Reduction.AUTO, - name=None): - """Initializes `FocalLoss`. - - Args: - alpha: The `alpha` weight factor for binary class imbalance. - gamma: The `gamma` focusing parameter to re-weight loss. - reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the op. Defaults to 'retinanet_class_loss'. - """ - self._alpha = alpha - self._gamma = gamma - super(FocalLoss, self).__init__(reduction=reduction, name=name) - - def call(self, y_true, y_pred): - """Invokes the `FocalLoss`. - - Args: - y_true: A tensor of size [batch, num_anchors, num_classes] - y_pred: A tensor of size [batch, num_anchors, num_classes] - - Returns: - Summed loss float `Tensor`. - """ - with tf.name_scope('focal_loss'): - y_true = tf.cast(y_true, dtype=tf.float32) - y_pred = tf.cast(y_pred, dtype=tf.float32) - positive_label_mask = tf.equal(y_true, 1.0) - cross_entropy = ( - tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=y_pred)) - probs = tf.sigmoid(y_pred) - probs_gt = tf.where(positive_label_mask, probs, 1.0 - probs) - # With small gamma, the implementation could produce NaN during back prop. - modulator = tf.pow(1.0 - probs_gt, self._gamma) - loss = modulator * cross_entropy - weighted_loss = tf.where(positive_label_mask, self._alpha * loss, - (1.0 - self._alpha) * loss) - - return weighted_loss - - def get_config(self): - config = { - 'alpha': self._alpha, - 'gamma': self._gamma, - } - base_config = super(FocalLoss, self).get_config() - return dict(list(base_config.items()) + list(config.items())) diff --git a/official/vision/beta/losses/loss_utils.py b/official/vision/beta/losses/loss_utils.py deleted file mode 100644 index 1c59d0c89..000000000 --- a/official/vision/beta/losses/loss_utils.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Loss utilities for detection models."""
-
-import tensorflow as tf
-
-
-def multi_level_flatten(multi_level_inputs, last_dim=None):
-  """Flattens a multi-level input.
-
-  Args:
-    multi_level_inputs: Ordered Dict with level to [batch, d1, ..., dm].
-    last_dim: Whether the output should be [batch_size, None], or [batch_size,
-      None, last_dim]. Defaults to `None`.
-
-  Returns:
-    Concatenated output [batch_size, None], or [batch_size, None, dm].
-  """
-  flattened_inputs = []
-  batch_size = None
-  for level in multi_level_inputs.keys():
-    single_input = multi_level_inputs[level]
-    if batch_size is None:
-      batch_size = single_input.shape[0] or tf.shape(single_input)[0]
-    if last_dim is not None:
-      flattened_input = tf.reshape(single_input, [batch_size, -1, last_dim])
-    else:
-      flattened_input = tf.reshape(single_input, [batch_size, -1])
-    flattened_inputs.append(flattened_input)
-  return tf.concat(flattened_inputs, axis=1)
diff --git a/official/vision/beta/losses/maskrcnn_losses.py b/official/vision/beta/losses/maskrcnn_losses.py
deleted file mode 100644
index 99e0ac95b..000000000
--- a/official/vision/beta/losses/maskrcnn_losses.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Losses for Mask R-CNN models."""
-
-# Import libraries
-import tensorflow as tf
-
-
-class RpnScoreLoss(object):
-  """Region Proposal Network score loss function."""
-
-  def __init__(self, rpn_batch_size_per_im):
-    self._rpn_batch_size_per_im = rpn_batch_size_per_im
-    self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy(
-        reduction=tf.keras.losses.Reduction.SUM, from_logits=True)
-
-  def __call__(self, score_outputs, labels):
-    """Computes total RPN detection loss.
-
-    Computes total RPN detection loss including box and score from all levels.
-
-    Args:
-      score_outputs: an OrderedDict with keys representing levels and values
-        representing scores in [batch_size, height, width, num_anchors].
-      labels: the dictionary returned from the dataloader that includes
-        groundtruth targets.
-
-    Returns:
-      rpn_score_loss: a scalar tensor representing total score loss.
-    """
-    with tf.name_scope('rpn_loss'):
-      levels = sorted(score_outputs.keys())
-
-      score_losses = []
-      for level in levels:
-        score_losses.append(
-            self._rpn_score_loss(
-                score_outputs[level],
-                labels[level],
-                normalizer=tf.cast(
-                    tf.shape(score_outputs[level])[0] *
-                    self._rpn_batch_size_per_im,
-                    dtype=tf.float32)))
-
-      # Sums per level losses to total loss.
-      return tf.math.add_n(score_losses)
-
-  def _rpn_score_loss(self, score_outputs, score_targets, normalizer=1.0):
-    """Computes score loss."""
-    # score_targets has three values:
-    # (1) score_targets[i]=1, the anchor is a positive sample.
-    # (2) score_targets[i]=0, negative.
-    # (3) score_targets[i]=-1, the anchor is don't care (ignore).
- with tf.name_scope('rpn_score_loss'): - mask = tf.math.logical_or(tf.math.equal(score_targets, 1), - tf.math.equal(score_targets, 0)) - - score_targets = tf.math.maximum(score_targets, - tf.zeros_like(score_targets)) - - score_targets = tf.expand_dims(score_targets, axis=-1) - score_outputs = tf.expand_dims(score_outputs, axis=-1) - score_loss = self._binary_crossentropy( - score_targets, score_outputs, sample_weight=mask) - - score_loss /= normalizer - return score_loss - - -class RpnBoxLoss(object): - """Region Proposal Network box regression loss function.""" - - def __init__(self, huber_loss_delta: float): - # The delta is typically around the mean value of regression target. - # for instances, the regression targets of 512x512 input with 6 anchors on - # P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2]. - self._huber_loss = tf.keras.losses.Huber( - delta=huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM) - - def __call__(self, box_outputs, labels): - """Computes total RPN detection loss. - - Computes total RPN detection loss including box and score from all levels. - - Args: - box_outputs: an OrderDict with keys representing levels and values - representing box regression targets in - [batch_size, height, width, num_anchors * 4]. - labels: the dictionary that returned from dataloader that includes - groundturth targets. - - Returns: - rpn_box_loss: a scalar tensor representing total box regression loss. - """ - with tf.name_scope('rpn_loss'): - levels = sorted(box_outputs.keys()) - - box_losses = [] - for level in levels: - box_losses.append(self._rpn_box_loss(box_outputs[level], labels[level])) - - # Sum per level losses to total loss. - return tf.add_n(box_losses) - - def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0): - """Computes box regression loss.""" - with tf.name_scope('rpn_box_loss'): - mask = tf.cast(tf.not_equal(box_targets, 0.0), dtype=tf.float32) - box_targets = tf.expand_dims(box_targets, axis=-1) - box_outputs = tf.expand_dims(box_outputs, axis=-1) - box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask) - # The loss is normalized by the sum of non-zero weights and additional - # normalizer provided by the function caller. Using + 0.01 here to avoid - # division by zero. - box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01) - return box_loss - - -class FastrcnnClassLoss(object): - """Fast R-CNN classification loss function.""" - - def __init__(self): - self._categorical_crossentropy = tf.keras.losses.CategoricalCrossentropy( - reduction=tf.keras.losses.Reduction.SUM, from_logits=True) - - def __call__(self, class_outputs, class_targets): - """Computes the class loss (Fast-RCNN branch) of Mask-RCNN. - - This function implements the classification loss of the Fast-RCNN. - - The classification loss is softmax on all RoIs. - Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long - - Args: - class_outputs: a float tensor representing the class prediction for each box - with a shape of [batch_size, num_boxes, num_classes]. - class_targets: a float tensor representing the class label for each box - with a shape of [batch_size, num_boxes]. - - Returns: - a scalar tensor representing total class loss. 
-    """
-    with tf.name_scope('fast_rcnn_loss'):
-      batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
-      class_targets = tf.cast(class_targets, dtype=tf.int32)
-      class_targets_one_hot = tf.one_hot(class_targets, num_classes)
-      return self._fast_rcnn_class_loss(class_outputs, class_targets_one_hot,
-                                        normalizer=batch_size * num_boxes)
-
-  def _fast_rcnn_class_loss(self, class_outputs, class_targets_one_hot,
-                            normalizer=1.0):
-    """Computes classification loss."""
-    with tf.name_scope('fast_rcnn_class_loss'):
-      class_loss = self._categorical_crossentropy(class_targets_one_hot,
-                                                  class_outputs)
-
-      class_loss /= normalizer
-      return class_loss
-
-
-class FastrcnnBoxLoss(object):
-  """Fast R-CNN box regression loss function."""
-
-  def __init__(self,
-               huber_loss_delta: float,
-               class_agnostic_bbox_pred: bool = False):
-    """Initializes Faster R-CNN box loss.
-
-    Args:
-      huber_loss_delta: the delta is typically set around the mean value of the
-        regression targets. For instance, the regression targets of a 512x512
-        input with 6 anchors on the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
-      class_agnostic_bbox_pred: if True, class agnostic bounding box prediction
-        is performed.
-    """
-    self._huber_loss = tf.keras.losses.Huber(
-        delta=huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM)
-    self._class_agnostic_bbox_pred = class_agnostic_bbox_pred
-
-  def __call__(self, box_outputs, class_targets, box_targets):
-    """Computes the box loss (Fast-RCNN branch) of Mask-RCNN.
-
-    This function implements the box regression loss of the Fast-RCNN. As
-    `box_outputs` produces `num_classes` boxes for each RoI, the reference model
-    expands `box_targets` to match the shape of `box_outputs` and selects only
-    the target with which the RoI has the maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py) # pylint: disable=line-too-long
-    Instead, this function selects the `box_outputs` by the `class_targets` so
-    that it doesn't expand `box_targets`.
-
-    The box loss is a smooth L1 loss computed only on positive RoI samples.
-    Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
-
-    Args:
-      box_outputs: a float tensor representing the box prediction for each box
-        with a shape of [batch_size, num_boxes, num_classes * 4].
-      class_targets: a float tensor representing the class label for each box
-        with a shape of [batch_size, num_boxes].
-      box_targets: a float tensor representing the box label for each box
-        with a shape of [batch_size, num_boxes, 4].
-
-    Returns:
-      box_loss: a scalar tensor representing total box regression loss.
- """ - with tf.name_scope('fast_rcnn_loss'): - class_targets = tf.cast(class_targets, dtype=tf.int32) - if not self._class_agnostic_bbox_pred: - box_outputs = self._assign_class_targets(box_outputs, class_targets) - - return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets) - - def _assign_class_targets(self, box_outputs, class_targets): - """Selects the box from `box_outputs` based on `class_targets`, with which the box has the maximum overlap.""" - (batch_size, num_rois, - num_class_specific_boxes) = box_outputs.get_shape().as_list() - num_classes = num_class_specific_boxes // 4 - box_outputs = tf.reshape(box_outputs, - [batch_size, num_rois, num_classes, 4]) - - box_indices = tf.reshape( - class_targets + tf.tile( - tf.expand_dims(tf.range(batch_size) * num_rois * num_classes, 1), - [1, num_rois]) + tf.tile( - tf.expand_dims(tf.range(num_rois) * num_classes, 0), - [batch_size, 1]), [-1]) - - box_outputs = tf.matmul( - tf.one_hot( - box_indices, - batch_size * num_rois * num_classes, - dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4])) - box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4]) - - return box_outputs - - def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets, - normalizer=1.0): - """Computes box regression loss.""" - with tf.name_scope('fast_rcnn_box_loss'): - mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2), - [1, 1, 4]) - mask = tf.cast(mask, dtype=tf.float32) - box_targets = tf.expand_dims(box_targets, axis=-1) - box_outputs = tf.expand_dims(box_outputs, axis=-1) - box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask) - # The loss is normalized by the number of ones in mask, - # additianal normalizer provided by the user and using 0.01 here to avoid - # division by 0. - box_loss /= normalizer * (tf.reduce_sum(mask) + 0.01) - return box_loss - - -class MaskrcnnLoss(object): - """Mask R-CNN instance segmentation mask loss function.""" - - def __init__(self): - self._binary_crossentropy = tf.keras.losses.BinaryCrossentropy( - reduction=tf.keras.losses.Reduction.SUM, from_logits=True) - - def __call__(self, mask_outputs, mask_targets, select_class_targets): - """Computes the mask loss of Mask-RCNN. - - This function implements the mask loss of Mask-RCNN. As the `mask_outputs` - produces `num_classes` masks for each RoI, the reference model expands - `mask_targets` to match the shape of `mask_outputs` and selects only the - target that the RoI has a maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py) # pylint: disable=line-too-long - Instead, this implementation selects the `mask_outputs` by the `class_targets` - so that it doesn't expand `mask_targets`. Note that the selection logic is - done in the post-processing of mask_rcnn_fn in mask_rcnn_architecture.py. - - Args: - mask_outputs: a float tensor representing the prediction for each mask, - with a shape of - [batch_size, num_masks, mask_height, mask_width]. - mask_targets: a float tensor representing the binary mask of ground truth - labels for each mask with a shape of - [batch_size, num_masks, mask_height, mask_width]. - select_class_targets: a tensor with a shape of [batch_size, num_masks], - representing the foreground mask targets. - - Returns: - mask_loss: a float tensor representing total mask loss. 
- """ - with tf.name_scope('mask_rcnn_loss'): - (batch_size, num_masks, mask_height, - mask_width) = mask_outputs.get_shape().as_list() - - weights = tf.tile( - tf.reshape(tf.greater(select_class_targets, 0), - [batch_size, num_masks, 1, 1]), - [1, 1, mask_height, mask_width]) - weights = tf.cast(weights, dtype=tf.float32) - - mask_targets = tf.expand_dims(mask_targets, axis=-1) - mask_outputs = tf.expand_dims(mask_outputs, axis=-1) - mask_loss = self._binary_crossentropy(mask_targets, mask_outputs, - sample_weight=weights) - - # The loss is normalized by the number of 1's in weights and - # + 0.01 is used to avoid division by zero. - return mask_loss / (tf.reduce_sum(weights) + 0.01) diff --git a/official/vision/beta/losses/retinanet_losses.py b/official/vision/beta/losses/retinanet_losses.py deleted file mode 100644 index 91aaecf08..000000000 --- a/official/vision/beta/losses/retinanet_losses.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Losses used for detection models.""" - -# Import libraries -import tensorflow as tf - - -def focal_loss(logits, targets, alpha, gamma): - """Compute the focal loss between `logits` and the golden `target` values. - - Focal loss = -(1-pt)^gamma * log(pt) - where pt is the probability of being classified to the true class. - - Args: - logits: A float32 tensor of size - [batch, d_1, ..., d_k, n_classes]. - targets: A float32 tensor of size - [batch, d_1, ..., d_k, n_classes]. - alpha: A float32 scalar multiplying alpha to the loss from positive examples - and (1-alpha) to the loss from negative examples. - gamma: A float32 scalar modulating loss from hard and easy examples. - - Returns: - loss: A float32 Tensor of size - [batch, d_1, ..., d_k, n_classes] representing - normalized loss on the prediction map. - """ - with tf.name_scope('focal_loss'): - positive_label_mask = tf.equal(targets, 1.0) - cross_entropy = ( - tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)) - probs = tf.sigmoid(logits) - probs_gt = tf.where(positive_label_mask, probs, 1.0 - probs) - # With small gamma, the implementation could produce NaN during back prop. - modulator = tf.pow(1.0 - probs_gt, gamma) - loss = modulator * cross_entropy - weighted_loss = tf.where(positive_label_mask, alpha * loss, - (1.0 - alpha) * loss) - - return weighted_loss - - -class FocalLoss(tf.keras.losses.Loss): - """Implements a Focal loss for classification problems. - - Reference: - [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002). - """ - - def __init__(self, - alpha, - gamma, - num_classes, - reduction=tf.keras.losses.Reduction.AUTO, - name=None): - """Initializes `FocalLoss`. - - Args: - alpha: The `alpha` weight factor for binary class imbalance. - gamma: The `gamma` focusing parameter to re-weight loss. - num_classes: Number of foreground classes. - reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. 
`AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the op. Defaults to 'retinanet_class_loss'. - """ - self._num_classes = num_classes - self._alpha = alpha - self._gamma = gamma - super(FocalLoss, self).__init__(reduction=reduction, name=name) - - def call(self, y_true, y_pred): - """Invokes the `FocalLoss`. - - Args: - y_true: Ordered Dict with level to [batch, height, width, num_anchors]. - for example, - {3: tf.Tensor(shape=[32, 512, 512, 9], dtype=tf.float32), - 4: tf.Tensor([shape=32, 256, 256, 9, dtype=tf.float32])} - y_pred: Ordered Dict with level to [batch, height, width, num_anchors * - num_classes]. for example, - {3: tf.Tensor(shape=[32, 512, 512, 9], dtype=tf.int64), - 4: tf.Tensor(shape=[32, 256, 256, 9 * 21], dtype=tf.int64)} - - Returns: - Summed loss float `Tensor`. - """ - flattened_cls_outputs = [] - flattened_labels = [] - batch_size = None - for level in y_pred.keys(): - cls_output = y_pred[level] - label = y_true[level] - if batch_size is None: - batch_size = cls_output.shape[0] or tf.shape(cls_output)[0] - flattened_cls_outputs.append( - tf.reshape(cls_output, [batch_size, -1, self._num_classes])) - flattened_labels.append(tf.reshape(label, [batch_size, -1])) - cls_outputs = tf.concat(flattened_cls_outputs, axis=1) - labels = tf.concat(flattened_labels, axis=1) - - cls_targets_one_hot = tf.one_hot(labels, self._num_classes) - return focal_loss( - tf.cast(cls_outputs, dtype=tf.float32), - tf.cast(cls_targets_one_hot, dtype=tf.float32), self._alpha, - self._gamma) - - def get_config(self): - config = { - 'alpha': self._alpha, - 'gamma': self._gamma, - 'num_classes': self._num_classes, - } - base_config = super(FocalLoss, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - -class RetinanetBoxLoss(tf.keras.losses.Loss): - """RetinaNet box Huber loss.""" - - def __init__(self, - delta, - reduction=tf.keras.losses.Reduction.AUTO, - name=None): - """Initializes `RetinanetBoxLoss`. - - Args: - delta: A float, the point where the Huber loss function changes from a - quadratic to linear. - reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) for - more details. - name: Optional name for the op. Defaults to 'retinanet_class_loss'. - """ - self._huber_loss = tf.keras.losses.Huber( - delta=delta, reduction=tf.keras.losses.Reduction.NONE) - self._delta = delta - super(RetinanetBoxLoss, self).__init__(reduction=reduction, name=name) - - def call(self, y_true, y_pred): - """Computes box detection loss. - - Computes total detection loss including box and class loss from all levels. 
- - Args: - y_true: Ordered Dict with level to [batch, height, width, - num_anchors * 4] for example, - {3: tf.Tensor(shape=[32, 512, 512, 9 * 4], dtype=tf.float32), - 4: tf.Tensor([shape=32, 256, 256, 9 * 4, dtype=tf.float32])} - y_pred: Ordered Dict with level to [batch, height, width, - num_anchors * 4]. for example, - {3: tf.Tensor(shape=[32, 512, 512, 9 * 4], dtype=tf.int64), - 4: tf.Tensor(shape=[32, 256, 256, 9 * 4], dtype=tf.int64)} - - Returns: - an integer tensor representing total box regression loss. - """ - # Sums all positives in a batch for normalization and avoids zero - # num_positives_sum, which would lead to inf loss during training - - flattened_box_outputs = [] - flattened_labels = [] - batch_size = None - for level in y_pred.keys(): - box_output = y_pred[level] - label = y_true[level] - if batch_size is None: - batch_size = box_output.shape[0] or tf.shape(box_output)[0] - flattened_box_outputs.append(tf.reshape(box_output, [batch_size, -1, 4])) - flattened_labels.append(tf.reshape(label, [batch_size, -1, 4])) - box_outputs = tf.concat(flattened_box_outputs, axis=1) - labels = tf.concat(flattened_labels, axis=1) - loss = self._huber_loss(labels, box_outputs) - return loss - - def get_config(self): - config = { - 'delta': self._delta, - } - base_config = super(RetinanetBoxLoss, self).get_config() - return dict(list(base_config.items()) + list(config.items())) diff --git a/official/vision/beta/losses/segmentation_losses.py b/official/vision/beta/losses/segmentation_losses.py deleted file mode 100644 index e336cbb27..000000000 --- a/official/vision/beta/losses/segmentation_losses.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Losses used for segmentation models.""" - -# Import libraries -import tensorflow as tf - -from official.modeling import tf_utils - -EPSILON = 1e-5 - - -class SegmentationLoss: - """Semantic segmentation loss.""" - - def __init__(self, label_smoothing, class_weights, ignore_label, - use_groundtruth_dimension, top_k_percent_pixels=1.0): - self._top_k_percent_pixels = top_k_percent_pixels - self._class_weights = class_weights - self._ignore_label = ignore_label - self._use_groundtruth_dimension = use_groundtruth_dimension - self._label_smoothing = label_smoothing - - def __call__(self, logits, labels): - _, height, width, num_classes = logits.get_shape().as_list() - - if self._use_groundtruth_dimension: - # TODO(arashwan): Test using align corners to match deeplab alignment. - logits = tf.image.resize( - logits, tf.shape(labels)[1:3], - method=tf.image.ResizeMethod.BILINEAR) - else: - labels = tf.image.resize( - labels, (height, width), - method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) - - valid_mask = tf.not_equal(labels, self._ignore_label) - normalizer = tf.reduce_sum(tf.cast(valid_mask, tf.float32)) + EPSILON - # Assign pixel with ignore label to class 0 (background). The loss on the - # pixel will later be masked out. 
- labels = tf.where(valid_mask, labels, tf.zeros_like(labels)) - - labels = tf.squeeze(tf.cast(labels, tf.int32), axis=3) - valid_mask = tf.squeeze(tf.cast(valid_mask, tf.float32), axis=3) - onehot_labels = tf.one_hot(labels, num_classes) - onehot_labels = onehot_labels * ( - 1 - self._label_smoothing) + self._label_smoothing / num_classes - cross_entropy_loss = tf.nn.softmax_cross_entropy_with_logits( - labels=onehot_labels, logits=logits) - - if not self._class_weights: - class_weights = [1] * num_classes - else: - class_weights = self._class_weights - - if num_classes != len(class_weights): - raise ValueError( - 'Length of class_weights should be {}'.format(num_classes)) - - weight_mask = tf.einsum('...y,y->...', - tf.one_hot(labels, num_classes, dtype=tf.float32), - tf.constant(class_weights, tf.float32)) - valid_mask *= weight_mask - cross_entropy_loss *= tf.cast(valid_mask, tf.float32) - - if self._top_k_percent_pixels >= 1.0: - loss = tf.reduce_sum(cross_entropy_loss) / normalizer - else: - cross_entropy_loss = tf.reshape(cross_entropy_loss, shape=[-1]) - top_k_pixels = tf.cast( - self._top_k_percent_pixels * - tf.cast(tf.size(cross_entropy_loss), tf.float32), tf.int32) - top_k_losses, _ = tf.math.top_k( - cross_entropy_loss, k=top_k_pixels, sorted=True) - normalizer = tf.reduce_sum( - tf.cast(tf.not_equal(top_k_losses, 0.0), tf.float32)) + EPSILON - loss = tf.reduce_sum(top_k_losses) / normalizer - - return loss - - -def get_actual_mask_scores(logits, labels, ignore_label): - """Gets actual mask scores.""" - _, height, width, num_classes = logits.get_shape().as_list() - batch_size = tf.shape(logits)[0] - logits = tf.stop_gradient(logits) - labels = tf.image.resize( - labels, (height, width), - method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) - predicted_labels = tf.argmax(logits, -1, output_type=tf.int32) - flat_predictions = tf.reshape(predicted_labels, [batch_size, -1]) - flat_labels = tf.cast(tf.reshape(labels, [batch_size, -1]), tf.int32) - - one_hot_predictions = tf.one_hot( - flat_predictions, num_classes, on_value=True, off_value=False) - one_hot_labels = tf.one_hot( - flat_labels, num_classes, on_value=True, off_value=False) - keep_mask = tf.not_equal(flat_labels, ignore_label) - keep_mask = tf.expand_dims(keep_mask, 2) - - overlap = tf.logical_and(one_hot_predictions, one_hot_labels) - overlap = tf.logical_and(overlap, keep_mask) - overlap = tf.reduce_sum(tf.cast(overlap, tf.float32), axis=1) - union = tf.logical_or(one_hot_predictions, one_hot_labels) - union = tf.logical_and(union, keep_mask) - union = tf.reduce_sum(tf.cast(union, tf.float32), axis=1) - actual_scores = tf.divide(overlap, tf.maximum(union, EPSILON)) - return actual_scores - - -class MaskScoringLoss: - """Mask Scoring loss.""" - - def __init__(self, ignore_label): - self._ignore_label = ignore_label - self._mse_loss = tf.keras.losses.MeanSquaredError( - reduction=tf.keras.losses.Reduction.NONE) - - def __call__(self, predicted_scores, logits, labels): - actual_scores = get_actual_mask_scores(logits, labels, self._ignore_label) - loss = tf_utils.safe_mean(self._mse_loss(actual_scores, predicted_scores)) - return loss diff --git a/official/vision/beta/modeling/__init__.py b/official/vision/beta/modeling/__init__.py deleted file mode 100644 index 92329b11e..000000000 --- a/official/vision/beta/modeling/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Modeling package definition.""" - -from official.vision.beta.modeling import backbones -from official.vision.beta.modeling import decoders -from official.vision.beta.modeling import heads -from official.vision.beta.modeling import layers diff --git a/official/vision/beta/modeling/backbones/__init__.py b/official/vision/beta/modeling/backbones/__init__.py deleted file mode 100644 index 7439667f5..000000000 --- a/official/vision/beta/modeling/backbones/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Backbones package definition.""" - -from official.vision.beta.modeling.backbones.efficientnet import EfficientNet -from official.vision.beta.modeling.backbones.mobiledet import MobileDet -from official.vision.beta.modeling.backbones.mobilenet import MobileNet -from official.vision.beta.modeling.backbones.resnet import ResNet -from official.vision.beta.modeling.backbones.resnet_3d import ResNet3D -from official.vision.beta.modeling.backbones.resnet_deeplab import DilatedResNet -from official.vision.beta.modeling.backbones.revnet import RevNet -from official.vision.beta.modeling.backbones.spinenet import SpineNet -from official.vision.beta.modeling.backbones.spinenet_mobile import SpineNetMobile diff --git a/official/vision/beta/modeling/backbones/efficientnet.py b/official/vision/beta/modeling/backbones/efficientnet.py deleted file mode 100644 index 2d3e4b12b..000000000 --- a/official/vision/beta/modeling/backbones/efficientnet.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Contains definitions of EfficientNet Networks.""" - -import math -from typing import Any, List, Tuple - -# Import libraries - -import tensorflow as tf - -from official.modeling import hyperparams -from official.modeling import tf_utils -from official.vision.beta.modeling.backbones import factory -from official.vision.beta.modeling.layers import nn_blocks -from official.vision.beta.modeling.layers import nn_layers - -layers = tf.keras.layers - -# The fixed EfficientNet-B0 architecture discovered by NAS. -# Each element represents a specification of a building block: -# (block_fn, block_repeats, kernel_size, strides, expand_ratio, in_filters, -# out_filters, is_output) -EN_B0_BLOCK_SPECS = [ - ('mbconv', 1, 3, 1, 1, 32, 16, False), - ('mbconv', 2, 3, 2, 6, 16, 24, True), - ('mbconv', 2, 5, 2, 6, 24, 40, True), - ('mbconv', 3, 3, 2, 6, 40, 80, False), - ('mbconv', 3, 5, 1, 6, 80, 112, True), - ('mbconv', 4, 5, 2, 6, 112, 192, False), - ('mbconv', 1, 3, 1, 6, 192, 320, True), -] - -SCALING_MAP = { - 'b0': dict(width_scale=1.0, depth_scale=1.0), - 'b1': dict(width_scale=1.0, depth_scale=1.1), - 'b2': dict(width_scale=1.1, depth_scale=1.2), - 'b3': dict(width_scale=1.2, depth_scale=1.4), - 'b4': dict(width_scale=1.4, depth_scale=1.8), - 'b5': dict(width_scale=1.6, depth_scale=2.2), - 'b6': dict(width_scale=1.8, depth_scale=2.6), - 'b7': dict(width_scale=2.0, depth_scale=3.1), -} - - -class BlockSpec(): - """A container class that specifies the block configuration for MnasNet.""" - - def __init__(self, block_fn: str, block_repeats: int, kernel_size: int, - strides: int, expand_ratio: float, in_filters: int, - out_filters: int, is_output: bool, width_scale: float, - depth_scale: float): - self.block_fn = block_fn - self.block_repeats = round_repeats(block_repeats, depth_scale) - self.kernel_size = kernel_size - self.strides = strides - self.expand_ratio = expand_ratio - self.in_filters = nn_layers.round_filters(in_filters, width_scale) - self.out_filters = nn_layers.round_filters(out_filters, width_scale) - self.is_output = is_output - - -def round_repeats(repeats: int, multiplier: float, skip: bool = False) -> int: - """Returns rounded number of filters based on depth multiplier.""" - if skip or not multiplier: - return repeats - return int(math.ceil(multiplier * repeats)) - - -def block_spec_decoder(specs: List[Tuple[Any, ...]], width_scale: float, - depth_scale: float) -> List[BlockSpec]: - """Decodes and returns specs for a block.""" - decoded_specs = [] - for s in specs: - s = s + ( - width_scale, - depth_scale, - ) - decoded_specs.append(BlockSpec(*s)) - return decoded_specs - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class EfficientNet(tf.keras.Model): - """Creates an EfficientNet family model. - - This implements the EfficientNet model from: - Mingxing Tan, Quoc V. Le. - EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. 
- (https://arxiv.org/pdf/1905.11946) - """ - - def __init__(self, - model_id: str, - input_specs: tf.keras.layers.InputSpec = layers.InputSpec( - shape=[None, None, None, 3]), - se_ratio: float = 0.0, - stochastic_depth_drop_rate: float = 0.0, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: tf.keras.regularizers.Regularizer = None, - bias_regularizer: tf.keras.regularizers.Regularizer = None, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, # pytype: disable=annotation-type-mismatch # typed-keras - **kwargs): - """Initializes an EfficientNet model. - - Args: - model_id: A `str` of model ID of EfficientNet. - input_specs: A `tf.keras.layers.InputSpec` of the input tensor. - se_ratio: A `float` of squeeze and excitation ratio for inverted - bottleneck blocks. - stochastic_depth_drop_rate: A `float` of drop rate for drop connect layer. - kernel_initializer: A `str` for kernel initializer of convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - Default to None. - activation: A `str` of name of the activation function. - use_sync_bn: If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - **kwargs: Additional keyword arguments to be passed. - """ - self._model_id = model_id - self._input_specs = input_specs - self._se_ratio = se_ratio - self._stochastic_depth_drop_rate = stochastic_depth_drop_rate - self._use_sync_bn = use_sync_bn - self._activation = activation - self._kernel_initializer = kernel_initializer - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - if use_sync_bn: - self._norm = layers.experimental.SyncBatchNormalization - else: - self._norm = layers.BatchNormalization - - if tf.keras.backend.image_data_format() == 'channels_last': - bn_axis = -1 - else: - bn_axis = 1 - - # Build EfficientNet. - inputs = tf.keras.Input(shape=input_specs.shape[1:]) - width_scale = SCALING_MAP[model_id]['width_scale'] - depth_scale = SCALING_MAP[model_id]['depth_scale'] - - # Build stem. - x = layers.Conv2D( - filters=nn_layers.round_filters(32, width_scale), - kernel_size=3, - strides=2, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) - - # Build intermediate blocks. - endpoints = {} - endpoint_level = 2 - decoded_specs = block_spec_decoder(EN_B0_BLOCK_SPECS, width_scale, - depth_scale) - - for i, specs in enumerate(decoded_specs): - x = self._block_group( - inputs=x, specs=specs, name='block_group_{}'.format(i)) - if specs.is_output: - endpoints[str(endpoint_level)] = x - endpoint_level += 1 - - # Build output specs for downstream tasks. - self._output_specs = {l: endpoints[l].get_shape() for l in endpoints} - - # Build the final conv for classification. 
- x = layers.Conv2D( - filters=nn_layers.round_filters(1280, width_scale), - kernel_size=1, - strides=1, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - endpoints[str(endpoint_level)] = tf_utils.get_activation(activation)(x) - - super(EfficientNet, self).__init__( - inputs=inputs, outputs=endpoints, **kwargs) - - def _block_group(self, - inputs: tf.Tensor, - specs: BlockSpec, - name: str = 'block_group'): - """Creates one group of blocks for the EfficientNet model. - - Args: - inputs: A `tf.Tensor` of size `[batch, channels, height, width]`. - specs: The specifications for one inverted bottleneck block group. - name: A `str` name for the block. - - Returns: - The output `tf.Tensor` of the block layer. - """ - if specs.block_fn == 'mbconv': - block_fn = nn_blocks.InvertedBottleneckBlock - else: - raise ValueError('Block func {} not supported.'.format(specs.block_fn)) - - x = block_fn( - in_filters=specs.in_filters, - out_filters=specs.out_filters, - expand_ratio=specs.expand_ratio, - strides=specs.strides, - kernel_size=specs.kernel_size, - se_ratio=self._se_ratio, - stochastic_depth_drop_rate=self._stochastic_depth_drop_rate, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - inputs) - - for _ in range(1, specs.block_repeats): - x = block_fn( - in_filters=specs.out_filters, # Set 'in_filters' to 'out_filters'. - out_filters=specs.out_filters, - expand_ratio=specs.expand_ratio, - strides=1, # Fix strides to 1. 
- kernel_size=specs.kernel_size, - se_ratio=self._se_ratio, - stochastic_depth_drop_rate=self._stochastic_depth_drop_rate, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - x) - - return tf.identity(x, name=name) - - def get_config(self): - config_dict = { - 'model_id': self._model_id, - 'se_ratio': self._se_ratio, - 'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon - } - return config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self): - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@factory.register_backbone_builder('efficientnet') -def build_efficientnet( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model: # pytype: disable=annotation-type-mismatch # typed-keras - """Builds EfficientNet backbone from a config.""" - backbone_type = backbone_config.type - backbone_cfg = backbone_config.get() - assert backbone_type == 'efficientnet', (f'Inconsistent backbone type ' - f'{backbone_type}') - - return EfficientNet( - model_id=backbone_cfg.model_id, - input_specs=input_specs, - stochastic_depth_drop_rate=backbone_cfg.stochastic_depth_drop_rate, - se_ratio=backbone_cfg.se_ratio, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) diff --git a/official/vision/beta/modeling/backbones/efficientnet_test.py b/official/vision/beta/modeling/backbones/efficientnet_test.py deleted file mode 100644 index a2f4c7aac..000000000 --- a/official/vision/beta/modeling/backbones/efficientnet_test.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for EfficientNet.""" - -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import efficientnet - - -class EfficientNetTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters(32, 224) - def test_network_creation(self, input_size): - """Test creation of EfficientNet family models.""" - tf.keras.backend.set_image_data_format('channels_last') - - network = efficientnet.EfficientNet(model_id='b0') - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - - self.assertAllEqual([1, input_size / 2**2, input_size / 2**2, 24], - endpoints['2'].shape.as_list()) - self.assertAllEqual([1, input_size / 2**3, input_size / 2**3, 40], - endpoints['3'].shape.as_list()) - self.assertAllEqual([1, input_size / 2**4, input_size / 2**4, 112], - endpoints['4'].shape.as_list()) - self.assertAllEqual([1, input_size / 2**5, input_size / 2**5, 320], - endpoints['5'].shape.as_list()) - - @parameterized.parameters('b0', 'b3', 'b6') - def test_network_scaling(self, model_id): - """Test compound scaling.""" - efficientnet_params = { - 'b0': 4049564, - 'b3': 10783528, - 'b6': 40960136, - } - tf.keras.backend.set_image_data_format('channels_last') - - input_size = 32 - network = efficientnet.EfficientNet(model_id=model_id, se_ratio=0.25) - self.assertEqual(network.count_params(), efficientnet_params[model_id]) - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - _ = network(inputs) - - @parameterized.parameters(1, 3) - def test_input_specs(self, input_dim): - """Test different input feature dimensions.""" - tf.keras.backend.set_image_data_format('channels_last') - - input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim]) - network = efficientnet.EfficientNet(model_id='b0', input_specs=input_specs) - - inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1) - _ = network(inputs) - - def test_serialize_deserialize(self): - # Create a network object that sets all of its config options. - kwargs = dict( - model_id='b0', - se_ratio=0.25, - stochastic_depth_drop_rate=None, - use_sync_bn=False, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - activation='relu', - norm_momentum=0.99, - norm_epsilon=0.001, - ) - network = efficientnet.EfficientNet(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = efficientnet.EfficientNet.from_config(network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(network.get_config(), new_network.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/backbones/factory.py b/official/vision/beta/modeling/backbones/factory.py deleted file mode 100644 index 8421250ae..000000000 --- a/official/vision/beta/modeling/backbones/factory.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Backbone registers and factory method.
-
-One can register a new backbone model with the following two steps:
-
-1. Import the factory and register the builder in the backbone file.
-2. Import the backbone class and add an import in __init__.py.
-
-```
-# my_backbone.py
-
-from modeling.backbones import factory
-
-class MyBackbone():
-  ...
-
-@factory.register_backbone_builder('my_backbone')
-def build_my_backbone():
-  return MyBackbone()
-
-# backbones/__init__.py adds import
-from modeling.backbones.my_backbone import MyBackbone
-```
-
-If the MyBackbone class should only be used by a specific binary, do not
-import the backbone module in backbones/__init__.py; instead, import it in
-the place that uses it.
-
-
-"""
-from typing import Sequence, Union
-
-# Import libraries
-
-import tensorflow as tf
-
-from official.core import registry
-from official.modeling import hyperparams
-
-
-_REGISTERED_BACKBONE_CLS = {}
-
-
-def register_backbone_builder(key: str):
-  """Decorates a builder of backbone class.
-
-  The builder should be a Callable (a class or a function).
-  This decorator supports registration of backbone builder as follows:
-
-  ```
-  class MyBackbone(tf.keras.Model):
-    pass
-
-  @register_backbone_builder('mybackbone')
-  def builder(input_specs, config, l2_reg):
-    return MyBackbone(...)
-
-  # Builds a MyBackbone object.
-  my_backbone = build_backbone(input_specs, config, l2_reg)
-  ```
-
-  Args:
-    key: A `str` of key to look up the builder.
-
-  Returns:
-    A callable for use as a class decorator that registers the decorated
-    builder under `key` for later lookup.
-  """
-  return registry.register(_REGISTERED_BACKBONE_CLS, key)
-
-
-def build_backbone(input_specs: Union[tf.keras.layers.InputSpec,
-                                      Sequence[tf.keras.layers.InputSpec]],
-                   backbone_config: hyperparams.Config,
-                   norm_activation_config: hyperparams.Config,
-                   l2_regularizer: tf.keras.regularizers.Regularizer = None,
-                   **kwargs) -> tf.keras.Model:  # pytype: disable=annotation-type-mismatch  # typed-keras
-  """Builds backbone from a config.
-
-  Args:
-    input_specs: A (sequence of) `tf.keras.layers.InputSpec` of input.
-    backbone_config: A `OneOfConfig` of backbone config.
-    norm_activation_config: A config for normalization/activation layer.
-    l2_regularizer: A `tf.keras.regularizers.Regularizer` object. Default to
-      None.
-    **kwargs: Additional keyword args to be passed to backbone builder.
-
-  Returns:
-    A `tf.keras.Model` instance of the backbone.
-  """
-  backbone_builder = registry.lookup(_REGISTERED_BACKBONE_CLS,
-                                     backbone_config.type)
-
-  return backbone_builder(
-      input_specs=input_specs,
-      backbone_config=backbone_config,
-      norm_activation_config=norm_activation_config,
-      l2_regularizer=l2_regularizer,
-      **kwargs)
diff --git a/official/vision/beta/modeling/backbones/factory_test.py b/official/vision/beta/modeling/backbones/factory_test.py
deleted file mode 100644
index 03bf1b550..000000000
--- a/official/vision/beta/modeling/backbones/factory_test.py
+++ /dev/null
@@ -1,227 +0,0 @@
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for factory functions.""" -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from official.vision.beta.configs import backbones as backbones_cfg -from official.vision.beta.configs import backbones_3d as backbones_3d_cfg -from official.vision.beta.configs import common as common_cfg -from official.vision.beta.modeling import backbones -from official.vision.beta.modeling.backbones import factory - - -class FactoryTest(tf.test.TestCase, parameterized.TestCase): - - @combinations.generate( - combinations.combine(model_id=[18, 34, 50, 101, 152],)) - def test_resnet_creation(self, model_id): - """Test creation of ResNet models.""" - - network = backbones.ResNet( - model_id=model_id, se_ratio=0.0, norm_momentum=0.99, norm_epsilon=1e-5) - - backbone_config = backbones_cfg.Backbone( - type='resnet', - resnet=backbones_cfg.ResNet(model_id=model_id, se_ratio=0.0)) - norm_activation_config = common_cfg.NormActivation( - norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) - - factory_network = factory.build_backbone( - input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), - backbone_config=backbone_config, - norm_activation_config=norm_activation_config) - - network_config = network.get_config() - factory_network_config = factory_network.get_config() - - self.assertEqual(network_config, factory_network_config) - - @combinations.generate( - combinations.combine( - model_id=['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'], - se_ratio=[0.0, 0.25], - )) - def test_efficientnet_creation(self, model_id, se_ratio): - """Test creation of EfficientNet models.""" - - network = backbones.EfficientNet( - model_id=model_id, - se_ratio=se_ratio, - norm_momentum=0.99, - norm_epsilon=1e-5) - - backbone_config = backbones_cfg.Backbone( - type='efficientnet', - efficientnet=backbones_cfg.EfficientNet( - model_id=model_id, se_ratio=se_ratio)) - norm_activation_config = common_cfg.NormActivation( - norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) - - factory_network = factory.build_backbone( - input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), - backbone_config=backbone_config, - norm_activation_config=norm_activation_config) - - network_config = network.get_config() - factory_network_config = factory_network.get_config() - - self.assertEqual(network_config, factory_network_config) - - @combinations.generate( - combinations.combine( - model_id=['MobileNetV1', 'MobileNetV2', - 'MobileNetV3Large', 'MobileNetV3Small', - 'MobileNetV3EdgeTPU'], - filter_size_scale=[1.0, 0.75], - )) - def test_mobilenet_creation(self, model_id, filter_size_scale): - """Test creation of Mobilenet models.""" - - network = backbones.MobileNet( - model_id=model_id, - filter_size_scale=filter_size_scale, - norm_momentum=0.99, - norm_epsilon=1e-5) - - backbone_config = backbones_cfg.Backbone( - type='mobilenet', - mobilenet=backbones_cfg.MobileNet( - 
model_id=model_id, filter_size_scale=filter_size_scale)) - norm_activation_config = common_cfg.NormActivation( - norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) - - factory_network = factory.build_backbone( - input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), - backbone_config=backbone_config, - norm_activation_config=norm_activation_config) - - network_config = network.get_config() - factory_network_config = factory_network.get_config() - - self.assertEqual(network_config, factory_network_config) - - @combinations.generate(combinations.combine(model_id=['49'],)) - def test_spinenet_creation(self, model_id): - """Test creation of SpineNet models.""" - input_size = 128 - min_level = 3 - max_level = 7 - - input_specs = tf.keras.layers.InputSpec( - shape=[None, input_size, input_size, 3]) - network = backbones.SpineNet( - input_specs=input_specs, - min_level=min_level, - max_level=max_level, - norm_momentum=0.99, - norm_epsilon=1e-5) - - backbone_config = backbones_cfg.Backbone( - type='spinenet', - spinenet=backbones_cfg.SpineNet(model_id=model_id)) - norm_activation_config = common_cfg.NormActivation( - norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) - - factory_network = factory.build_backbone( - input_specs=tf.keras.layers.InputSpec( - shape=[None, input_size, input_size, 3]), - backbone_config=backbone_config, - norm_activation_config=norm_activation_config) - - network_config = network.get_config() - factory_network_config = factory_network.get_config() - - self.assertEqual(network_config, factory_network_config) - - @combinations.generate( - combinations.combine(model_id=[38, 56, 104],)) - def test_revnet_creation(self, model_id): - """Test creation of RevNet models.""" - network = backbones.RevNet( - model_id=model_id, norm_momentum=0.99, norm_epsilon=1e-5) - - backbone_config = backbones_cfg.Backbone( - type='revnet', - revnet=backbones_cfg.RevNet(model_id=model_id)) - norm_activation_config = common_cfg.NormActivation( - norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) - - factory_network = factory.build_backbone( - input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), - backbone_config=backbone_config, - norm_activation_config=norm_activation_config) - - network_config = network.get_config() - factory_network_config = factory_network.get_config() - - self.assertEqual(network_config, factory_network_config) - - @combinations.generate(combinations.combine(model_type=['resnet_3d'],)) - def test_resnet_3d_creation(self, model_type): - """Test creation of ResNet 3D models.""" - backbone_cfg = backbones_3d_cfg.Backbone3D(type=model_type).get() - temporal_strides = [] - temporal_kernel_sizes = [] - for block_spec in backbone_cfg.block_specs: - temporal_strides.append(block_spec.temporal_strides) - temporal_kernel_sizes.append(block_spec.temporal_kernel_sizes) - - _ = backbones.ResNet3D( - model_id=backbone_cfg.model_id, - temporal_strides=temporal_strides, - temporal_kernel_sizes=temporal_kernel_sizes, - norm_momentum=0.99, - norm_epsilon=1e-5) - - @combinations.generate( - combinations.combine( - model_id=[ - 'MobileDetCPU', - 'MobileDetDSP', - 'MobileDetEdgeTPU', - 'MobileDetGPU'], - filter_size_scale=[1.0, 0.75], - )) - def test_mobiledet_creation(self, model_id, filter_size_scale): - """Test creation of Mobiledet models.""" - - network = backbones.MobileDet( - model_id=model_id, - filter_size_scale=filter_size_scale, - norm_momentum=0.99, - norm_epsilon=1e-5) - - backbone_config = backbones_cfg.Backbone( - type='mobiledet', - 
mobiledet=backbones_cfg.MobileDet( - model_id=model_id, filter_size_scale=filter_size_scale)) - norm_activation_config = common_cfg.NormActivation( - norm_momentum=0.99, norm_epsilon=1e-5, use_sync_bn=False) - - factory_network = factory.build_backbone( - input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), - backbone_config=backbone_config, - norm_activation_config=norm_activation_config) - - network_config = network.get_config() - factory_network_config = factory_network.get_config() - - self.assertEqual(network_config, factory_network_config) - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/backbones/mobiledet.py b/official/vision/beta/modeling/backbones/mobiledet.py deleted file mode 100644 index 8bbd6691f..000000000 --- a/official/vision/beta/modeling/backbones/mobiledet.py +++ /dev/null @@ -1,579 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Definitions of MobileDet Networks.""" - -import dataclasses -from typing import Any, Dict, Optional, Tuple, List - -import tensorflow as tf - -from official.modeling import hyperparams -from official.vision.beta.modeling.backbones import factory -from official.vision.beta.modeling.backbones import mobilenet -from official.vision.beta.modeling.layers import nn_blocks -from official.vision.beta.modeling.layers import nn_layers - - -layers = tf.keras.layers - - -# pylint: disable=pointless-string-statement - -""" -Architecture: https://arxiv.org/abs/1704.04861. - -"MobileDets: Searching for Object Detection Architectures for -Mobile Accelerators" Yunyang Xiong, Hanxiao Liu, Suyog Gupta, Berkin Akin, -Gabriel Bender, Yongzhe Wang, Pieter-Jan Kindermans, Mingxing Tan, Vikas Singh, -Bo Chen - -Note that `round_down_protection` flag should be set to false for scaling -of the network. 
-""" - -MD_CPU_BLOCK_SPECS = { - 'spec_name': 'MobileDetCPU', - # [expand_ratio] is set to 1 and [use_residual] is set to false - # for inverted_bottleneck_no_expansion - # [se_ratio] is set to 0.25 for all inverted_bottleneck layers - # [activation] is set to 'hard_swish' for all applicable layers - 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', - 'activation', 'se_ratio', 'expand_ratio', - 'use_residual', 'is_output'], - 'block_specs': [ - ('convbn', 3, 2, 16, 'hard_swish', None, None, None, False), - # inverted_bottleneck_no_expansion - ('invertedbottleneck', 3, 1, 8, 'hard_swish', 0.25, 1., False, True), - ('invertedbottleneck', 3, 2, 16, 'hard_swish', 0.25, 4., False, True), - ('invertedbottleneck', 3, 2, 32, 'hard_swish', 0.25, 8., False, False), - ('invertedbottleneck', 3, 1, 32, 'hard_swish', 0.25, 4., True, False), - ('invertedbottleneck', 3, 1, 32, 'hard_swish', 0.25, 4., True, False), - ('invertedbottleneck', 3, 1, 32, 'hard_swish', 0.25, 4., True, True), - ('invertedbottleneck', 5, 2, 72, 'hard_swish', 0.25, 8., False, False), - ('invertedbottleneck', 3, 1, 72, 'hard_swish', 0.25, 8., True, False), - ('invertedbottleneck', 5, 1, 72, 'hard_swish', 0.25, 4., True, False), - ('invertedbottleneck', 3, 1, 72, 'hard_swish', 0.25, 4., True, False), - ('invertedbottleneck', 3, 1, 72, 'hard_swish', 0.25, 8., False, False), - ('invertedbottleneck', 3, 1, 72, 'hard_swish', 0.25, 8., True, False), - ('invertedbottleneck', 3, 1, 72, 'hard_swish', 0.25, 8., True, False), - ('invertedbottleneck', 3, 1, 72, 'hard_swish', 0.25, 8., True, True), - ('invertedbottleneck', 5, 2, 104, 'hard_swish', 0.25, 8., False, False), - ('invertedbottleneck', 5, 1, 104, 'hard_swish', 0.25, 4., True, False), - ('invertedbottleneck', 5, 1, 104, 'hard_swish', 0.25, 4., True, False), - ('invertedbottleneck', 3, 1, 104, 'hard_swish', 0.25, 4., True, False), - ('invertedbottleneck', 3, 1, 144, 'hard_swish', 0.25, 8., False, True), - ] -} - -MD_DSP_BLOCK_SPECS = { - 'spec_name': 'MobileDetDSP', - # [expand_ratio] is set to 1 and [use_residual] is set to false - # for inverted_bottleneck_no_expansion - # [use_depthwise] is set to False for fused_conv - # [se_ratio] is set to None for all inverted_bottleneck layers - # [activation] is set to 'relu6' for all applicable layers - 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', - 'activation', 'se_ratio', 'expand_ratio', - 'input_compression_ratio', 'output_compression_ratio', - 'use_depthwise', 'use_residual', 'is_output'], - 'block_specs': [ - ('convbn', 3, 2, 32, 'relu6', - None, None, None, None, None, None, False), - # inverted_bottleneck_no_expansion - ('invertedbottleneck', 3, 1, 24, 'relu6', - None, 1., None, None, True, False, True), - ('invertedbottleneck', 3, 2, 32, 'relu6', - None, 4., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 32, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 32, 'relu6', - None, 4., None, None, True, True, False), - ('tucker', 3, 1, 32, 'relu6', - None, None, 0.25, 0.75, None, True, True), - ('invertedbottleneck', 3, 2, 64, 'relu6', - None, 8., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 64, 'relu6', - None, 4., None, None, True, True, False), - ('invertedbottleneck', 3, 1, 64, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 64, 'relu6', - None, 4., None, None, False, True, True), # fused_conv - ('invertedbottleneck', 3, 2, 120, 
'relu6', - None, 8., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 120, 'relu6', - None, 4., None, None, True, True, False), - ('invertedbottleneck', 3, 1, 120, 'relu6', - None, 8, None, None, True, True, False), - ('invertedbottleneck', 3, 1, 120, 'relu6', - None, 8., None, None, True, True, False), - ('invertedbottleneck', 3, 1, 144, 'relu6', - None, 8., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 144, 'relu6', - None, 8., None, None, True, True, False), - ('invertedbottleneck', 3, 1, 144, 'relu6', - None, 8, None, None, True, True, False), - ('invertedbottleneck', 3, 1, 144, 'relu6', - None, 8., None, None, True, True, True), - ('invertedbottleneck', 3, 2, 160, 'relu6', - None, 4, None, None, True, False, False), - ('invertedbottleneck', 3, 1, 160, 'relu6', - None, 4, None, None, True, True, False), - ('invertedbottleneck', 3, 1, 160, 'relu6', - None, 4., None, None, False, False, False), # fused_conv - ('tucker', 3, 1, 160, 'relu6', - None, None, 0.75, 0.75, None, True, False), - ('invertedbottleneck', 3, 1, 240, 'relu6', - None, 8, None, None, True, False, True), - ] -} - -MD_EdgeTPU_BLOCK_SPECS = { - 'spec_name': 'MobileDetEdgeTPU', - # [use_depthwise] is set to False for fused_conv - # [se_ratio] is set to None for all inverted_bottleneck layers - # [activation] is set to 'relu6' for all applicable layers - 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', - 'activation', 'se_ratio', 'expand_ratio', - 'input_compression_ratio', 'output_compression_ratio', - 'use_depthwise', 'use_residual', 'is_output'], - 'block_specs': [ - ('convbn', 3, 2, 32, 'relu6', - None, None, None, None, None, None, False), - ('tucker', 3, 1, 16, 'relu6', - None, None, 0.25, 0.75, None, False, True), - ('invertedbottleneck', 3, 2, 16, 'relu6', - None, 8., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 16, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 16, 'relu6', - None, 8., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 16, 'relu6', - None, 4., None, None, False, True, True), # fused_conv - ('invertedbottleneck', 5, 2, 40, 'relu6', - None, 8., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 40, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 40, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 40, 'relu6', - None, 4., None, None, False, True, True), # fused_conv - ('invertedbottleneck', 3, 2, 72, 'relu6', - None, 8, None, None, True, False, False), - ('invertedbottleneck', 3, 1, 72, 'relu6', - None, 8, None, None, True, True, False), - ('invertedbottleneck', 3, 1, 72, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 72, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 5, 1, 96, 'relu6', - None, 8, None, None, True, False, False), - ('invertedbottleneck', 5, 1, 96, 'relu6', - None, 8, None, None, True, True, False), - ('invertedbottleneck', 3, 1, 96, 'relu6', - None, 8, None, None, True, True, False), - ('invertedbottleneck', 3, 1, 96, 'relu6', - None, 8, None, None, True, True, True), - ('invertedbottleneck', 5, 2, 120, 'relu6', - None, 8, None, None, True, False, False), - ('invertedbottleneck', 3, 1, 120, 'relu6', - None, 8, None, None, True, True, False), - ('invertedbottleneck', 5, 1, 120, 
'relu6', - None, 4, None, None, True, True, False), - ('invertedbottleneck', 3, 1, 120, 'relu6', - None, 8, None, None, True, True, False), - ('invertedbottleneck', 5, 1, 384, 'relu6', - None, 8, None, None, True, False, True), - ] -} - -MD_GPU_BLOCK_SPECS = { - 'spec_name': 'MobileDetGPU', - # [use_depthwise] is set to False for fused_conv - # [se_ratio] is set to None for all inverted_bottleneck layers - # [activation] is set to 'relu6' for all applicable layers - 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', - 'activation', 'se_ratio', 'expand_ratio', - 'input_compression_ratio', 'output_compression_ratio', - 'use_depthwise', 'use_residual', 'is_output'], - 'block_specs': [ - # block 0 - ('convbn', 3, 2, 32, 'relu6', - None, None, None, None, None, None, False), - # block 1 - ('tucker', 3, 1, 16, 'relu6', - None, None, 0.25, 0.25, None, False, True), - # block 2 - ('invertedbottleneck', 3, 2, 32, 'relu6', - None, 8., None, None, False, False, False), # fused_conv - ('tucker', 3, 1, 32, 'relu6', - None, None, 0.25, 0.25, None, True, False), - ('tucker', 3, 1, 32, 'relu6', - None, None, 0.25, 0.25, None, True, False), - ('tucker', 3, 1, 32, 'relu6', - None, None, 0.25, 0.25, None, True, True), - # block 3 - ('invertedbottleneck', 3, 2, 64, 'relu6', - None, 8., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 64, 'relu6', - None, 8., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 64, 'relu6', - None, 8., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 64, 'relu6', - None, 4., None, None, False, True, True), # fused_conv - # block 4 - ('invertedbottleneck', 3, 2, 128, 'relu6', - None, 8., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - # block 5 - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 8., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 8., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 8., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 8., None, None, False, True, True), # fused_conv - # block 6 - ('invertedbottleneck', 3, 2, 128, 'relu6', - None, 4., None, None, False, False, False), # fused_conv - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - ('invertedbottleneck', 3, 1, 128, 'relu6', - None, 4., None, None, False, True, False), # fused_conv - # block 7 - ('invertedbottleneck', 3, 1, 384, 'relu6', - None, 8, None, None, True, False, True), - ] -} - -SUPPORTED_SPECS_MAP = { - 'MobileDetCPU': MD_CPU_BLOCK_SPECS, - 'MobileDetDSP': MD_DSP_BLOCK_SPECS, - 'MobileDetEdgeTPU': MD_EdgeTPU_BLOCK_SPECS, - 'MobileDetGPU': MD_GPU_BLOCK_SPECS, -} - - -@dataclasses.dataclass -class BlockSpec(hyperparams.Config): - """A container class that specifies the block configuration for MobileDet.""" - - block_fn: str = 'convbn' - kernel_size: int = 3 - strides: int = 1 - filters: int = 32 - use_bias: bool = False - use_normalization: bool = True - activation: str = 'relu6' - 
is_output: bool = True - # Used for block type InvertedResConv and TuckerConvBlock. - use_residual: bool = True - # Used for block type InvertedResConv only. - use_depthwise: bool = True - expand_ratio: Optional[float] = 8. - se_ratio: Optional[float] = None - # Used for block type TuckerConvBlock only. - input_compression_ratio: Optional[float] = None - output_compression_ratio: Optional[float] = None - - -def block_spec_decoder( - specs: Dict[Any, Any], - filter_size_scale: float, - divisible_by: int = 8) -> List[BlockSpec]: - """Decodes specs for a block. - - Args: - specs: A `dict` specification of block specs of a mobiledet version. - filter_size_scale: A `float` multiplier for the filter size for all - convolution ops. The value must be greater than zero. Typical usage will - be to set this value in (0, 1) to reduce the number of parameters or - computation cost of the model. - divisible_by: An `int` that ensures all inner dimensions are divisible by - this number. - - Returns: - A list of `BlockSpec` that defines structure of the base network. - """ - - spec_name = specs['spec_name'] - block_spec_schema = specs['block_spec_schema'] - block_specs = specs['block_specs'] - - if not block_specs: - raise ValueError( - 'The block spec cannot be empty for {} !'.format(spec_name)) - - if len(block_specs[0]) != len(block_spec_schema): - raise ValueError('The block spec values {} do not match with ' - 'the schema {}'.format(block_specs[0], block_spec_schema)) - - decoded_specs = [] - - for s in block_specs: - kw_s = dict(zip(block_spec_schema, s)) - decoded_specs.append(BlockSpec(**kw_s)) - - for ds in decoded_specs: - if ds.filters: - ds.filters = nn_layers.round_filters(filters=ds.filters, - multiplier=filter_size_scale, - divisor=divisible_by, - round_down_protect=False, - min_depth=8) - - return decoded_specs - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class MobileDet(tf.keras.Model): - """Creates a MobileDet family model.""" - - def __init__( - self, - model_id: str = 'MobileDetCPU', - filter_size_scale: float = 1.0, - input_specs: tf.keras.layers.InputSpec = layers.InputSpec( - shape=[None, None, None, 3]), - # The followings are for hyper-parameter tuning. - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - # The followings should be kept the same most of the times. - min_depth: int = 8, - divisible_by: int = 8, - regularize_depthwise: bool = False, - use_sync_bn: bool = False, - **kwargs): - """Initializes a MobileDet model. - - Args: - model_id: A `str` of MobileDet version. The supported values are - `MobileDetCPU`, `MobileDetDSP`, `MobileDetEdgeTPU`, `MobileDetGPU`. - filter_size_scale: A `float` of multiplier for the filters (number of - channels) for all convolution ops. The value must be greater than zero. - Typical usage will be to set this value in (0, 1) to reduce the number - of parameters or computation cost of the model. - input_specs: A `tf.keras.layers.InputSpec` of specs of the input tensor. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_initializer: A `str` for kernel initializer of convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. 
-      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
-        Default to None.
-      min_depth: An `int` of minimum depth (number of channels) for all
-        convolution ops. Enforced when filter_size_scale < 1, and not an active
-        constraint when filter_size_scale >= 1.
-      divisible_by: An `int` that ensures all inner dimensions are divisible by
-        this number.
-      regularize_depthwise: If True, apply regularization on depthwise.
-      use_sync_bn: If True, use synchronized batch normalization.
-      **kwargs: Additional keyword arguments to be passed.
-    """
-    if model_id not in SUPPORTED_SPECS_MAP:
-      raise ValueError('The MobileDet version {} '
-                       'is not supported'.format(model_id))
-
-    if filter_size_scale <= 0:
-      raise ValueError('filter_size_scale is not greater than zero.')
-
-    self._model_id = model_id
-    self._input_specs = input_specs
-    self._filter_size_scale = filter_size_scale
-    self._min_depth = min_depth
-    self._divisible_by = divisible_by
-    self._regularize_depthwise = regularize_depthwise
-    self._kernel_initializer = kernel_initializer
-    self._kernel_regularizer = kernel_regularizer
-    self._bias_regularizer = bias_regularizer
-    self._use_sync_bn = use_sync_bn
-    self._norm_momentum = norm_momentum
-    self._norm_epsilon = norm_epsilon
-
-    inputs = tf.keras.Input(shape=input_specs.shape[1:])
-
-    block_specs = SUPPORTED_SPECS_MAP.get(model_id)
-    self._decoded_specs = block_spec_decoder(
-        specs=block_specs,
-        filter_size_scale=self._filter_size_scale,
-        divisible_by=self._get_divisible_by())
-
-    x, endpoints, next_endpoint_level = self._mobiledet_base(inputs=inputs)
-
-    self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
-
-    super(MobileDet, self).__init__(
-        inputs=inputs, outputs=endpoints, **kwargs)
-
-  def _get_divisible_by(self):
-    return self._divisible_by
-
-  def _mobiledet_base(self,
-                      inputs: tf.Tensor
-                      ) -> Tuple[tf.Tensor, Dict[str, tf.Tensor], int]:
-    """Builds the base MobileDet architecture.
-
-    Args:
-      inputs: A `tf.Tensor` of shape `[batch_size, height, width, channels]`.
-
-    Returns:
-      A tuple of output Tensor and dictionary that collects endpoints.
- """ - - input_shape = inputs.get_shape().as_list() - if len(input_shape) != 4: - raise ValueError('Expected rank 4 input, was: %d' % len(input_shape)) - - net = inputs - endpoints = {} - endpoint_level = 1 - for i, block_def in enumerate(self._decoded_specs): - block_name = 'block_group_{}_{}'.format(block_def.block_fn, i) - - if block_def.block_fn == 'convbn': - - net = mobilenet.Conv2DBNBlock( - filters=block_def.filters, - kernel_size=block_def.kernel_size, - strides=block_def.strides, - activation=block_def.activation, - use_bias=block_def.use_bias, - use_normalization=block_def.use_normalization, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon - )(net) - - elif block_def.block_fn == 'invertedbottleneck': - - in_filters = net.shape.as_list()[-1] - net = nn_blocks.InvertedBottleneckBlock( - in_filters=in_filters, - out_filters=block_def.filters, - kernel_size=block_def.kernel_size, - strides=block_def.strides, - expand_ratio=block_def.expand_ratio, - se_ratio=block_def.se_ratio, - se_inner_activation=block_def.activation, - se_gating_activation='sigmoid', - se_round_down_protect=False, - expand_se_in_filters=True, - activation=block_def.activation, - use_depthwise=block_def.use_depthwise, - use_residual=block_def.use_residual, - regularize_depthwise=self._regularize_depthwise, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon, - divisible_by=self._get_divisible_by() - )(net) - - elif block_def.block_fn == 'tucker': - - in_filters = net.shape.as_list()[-1] - net = nn_blocks.TuckerConvBlock( - in_filters=in_filters, - out_filters=block_def.filters, - kernel_size=block_def.kernel_size, - strides=block_def.strides, - input_compression_ratio=block_def.input_compression_ratio, - output_compression_ratio=block_def.output_compression_ratio, - activation=block_def.activation, - use_residual=block_def.use_residual, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon, - divisible_by=self._get_divisible_by() - )(net) - - else: - raise ValueError('Unknown block type {} for layer {}'.format( - block_def.block_fn, i)) - - net = tf.keras.layers.Activation('linear', name=block_name)(net) - - if block_def.is_output: - endpoints[str(endpoint_level)] = net - endpoint_level += 1 - - return net, endpoints, endpoint_level - - def get_config(self): - config_dict = { - 'model_id': self._model_id, - 'filter_size_scale': self._filter_size_scale, - 'min_depth': self._min_depth, - 'divisible_by': self._divisible_by, - 'regularize_depthwise': self._regularize_depthwise, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - } - return config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self): - """A dict of {level: TensorShape} pairs for the model output.""" - 
return self._output_specs - - -@factory.register_backbone_builder('mobiledet') -def build_mobiledet( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None -) -> tf.keras.Model: - """Builds MobileDet backbone from a config.""" - backbone_type = backbone_config.type - backbone_cfg = backbone_config.get() - assert backbone_type == 'mobiledet', (f'Inconsistent backbone type ' - f'{backbone_type}') - - return MobileDet( - model_id=backbone_cfg.model_id, - filter_size_scale=backbone_cfg.filter_size_scale, - input_specs=input_specs, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) diff --git a/official/vision/beta/modeling/backbones/mobiledet_test.py b/official/vision/beta/modeling/backbones/mobiledet_test.py deleted file mode 100644 index 9624b3219..000000000 --- a/official/vision/beta/modeling/backbones/mobiledet_test.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for Mobiledet.""" - -import itertools - -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import mobiledet - - -class MobileDetTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - 'MobileDetCPU', - 'MobileDetDSP', - 'MobileDetEdgeTPU', - 'MobileDetGPU', - ) - def test_serialize_deserialize(self, model_id): - # Create a network object that sets all of its config options. - kwargs = dict( - model_id=model_id, - filter_size_scale=1.0, - use_sync_bn=False, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - norm_momentum=0.99, - norm_epsilon=0.001, - min_depth=8, - divisible_by=8, - regularize_depthwise=False, - ) - network = mobiledet.MobileDet(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = mobiledet.MobileDet.from_config(network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. 
- self.assertAllEqual(network.get_config(), new_network.get_config()) - - @parameterized.parameters( - itertools.product( - [1, 3], - [ - 'MobileDetCPU', - 'MobileDetDSP', - 'MobileDetEdgeTPU', - 'MobileDetGPU', - ], - )) - def test_input_specs(self, input_dim, model_id): - """Test different input feature dimensions.""" - tf.keras.backend.set_image_data_format('channels_last') - - input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim]) - network = mobiledet.MobileDet(model_id=model_id, input_specs=input_specs) - - inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1) - _ = network(inputs) - - @parameterized.parameters( - itertools.product( - [ - 'MobileDetCPU', - 'MobileDetDSP', - 'MobileDetEdgeTPU', - 'MobileDetGPU', - ], - [32, 224], - )) - def test_mobiledet_creation(self, model_id, input_size): - """Test creation of MobileDet family models.""" - tf.keras.backend.set_image_data_format('channels_last') - - mobiledet_layers = { - # The number of filters of layers having outputs been collected - # for filter_size_scale = 1.0 - 'MobileDetCPU': [8, 16, 32, 72, 144], - 'MobileDetDSP': [24, 32, 64, 144, 240], - 'MobileDetEdgeTPU': [16, 16, 40, 96, 384], - 'MobileDetGPU': [16, 32, 64, 128, 384], - } - - network = mobiledet.MobileDet(model_id=model_id, - filter_size_scale=1.0) - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - - for idx, num_filter in enumerate(mobiledet_layers[model_id]): - self.assertAllEqual( - [1, input_size / 2 ** (idx+1), input_size / 2 ** (idx+1), num_filter], - endpoints[str(idx+1)].shape.as_list()) diff --git a/official/vision/beta/modeling/backbones/mobilenet.py b/official/vision/beta/modeling/backbones/mobilenet.py deleted file mode 100644 index 8249720aa..000000000 --- a/official/vision/beta/modeling/backbones/mobilenet.py +++ /dev/null @@ -1,936 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-"""Contains definitions of MobileNet Networks."""
-
-import dataclasses
-from typing import Optional, Dict, Any, Tuple
-
-# Import libraries
-import tensorflow as tf
-from official.modeling import hyperparams
-from official.modeling import tf_utils
-from official.vision.beta.modeling.backbones import factory
-from official.vision.beta.modeling.layers import nn_blocks
-from official.vision.beta.modeling.layers import nn_layers
-
-layers = tf.keras.layers
-
-
-# pylint: disable=pointless-string-statement
-
-
-@tf.keras.utils.register_keras_serializable(package='Beta')
-class Conv2DBNBlock(tf.keras.layers.Layer):
-  """A convolution block with batch normalization."""
-
-  def __init__(
-      self,
-      filters: int,
-      kernel_size: int = 3,
-      strides: int = 1,
-      use_bias: bool = False,
-      use_explicit_padding: bool = False,
-      activation: str = 'relu6',
-      kernel_initializer: str = 'VarianceScaling',
-      kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
-      bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None,
-      use_normalization: bool = True,
-      use_sync_bn: bool = False,
-      norm_momentum: float = 0.99,
-      norm_epsilon: float = 0.001,
-      **kwargs):
-    """A convolution block with batch normalization.
-
-    Args:
-      filters: An `int` number of filters for the convolution.
-      kernel_size: An `int` specifying the height and width of the 2D
-        convolution window.
-      strides: An `int` of block stride. If greater than 1, this block will
-        ultimately downsample the input.
-      use_bias: If True, use bias in the convolution layer.
-      use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
-        inputs so that the output dimensions are the same as if 'SAME' padding
-        were used.
-      activation: A `str` name of the activation function.
-      kernel_initializer: A `str` for kernel initializer of convolutional
-        layers.
-      kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for
-        Conv2D. Default to None.
-      bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D.
-        Default to None.
-      use_normalization: If True, use batch normalization.
-      use_sync_bn: If True, use synchronized batch normalization.
-      norm_momentum: A `float` of normalization momentum for the moving average.
-      norm_epsilon: A `float` added to variance to avoid dividing by zero.
-      **kwargs: Additional keyword arguments to be passed.
- """ - super(Conv2DBNBlock, self).__init__(**kwargs) - self._filters = filters - self._kernel_size = kernel_size - self._strides = strides - self._activation = activation - self._use_bias = use_bias - self._use_explicit_padding = use_explicit_padding - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._use_normalization = use_normalization - self._use_sync_bn = use_sync_bn - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - - if use_explicit_padding and kernel_size > 1: - self._padding = 'valid' - else: - self._padding = 'same' - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - - def get_config(self): - config = { - 'filters': self._filters, - 'strides': self._strides, - 'kernel_size': self._kernel_size, - 'use_bias': self._use_bias, - 'use_explicit_padding': self._use_explicit_padding, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'use_normalization': self._use_normalization, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon - } - base_config = super(Conv2DBNBlock, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def build(self, input_shape): - if self._use_explicit_padding and self._kernel_size > 1: - padding_size = nn_layers.get_padding_for_kernel_size(self._kernel_size) - self._pad = tf.keras.layers.ZeroPadding2D(padding_size) - self._conv0 = tf.keras.layers.Conv2D( - filters=self._filters, - kernel_size=self._kernel_size, - strides=self._strides, - padding=self._padding, - use_bias=self._use_bias, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - if self._use_normalization: - self._norm0 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - self._activation_layer = tf_utils.get_activation( - self._activation, use_keras_layer=True) - - super(Conv2DBNBlock, self).build(input_shape) - - def call(self, inputs, training=None): - if self._use_explicit_padding and self._kernel_size > 1: - inputs = self._pad(inputs) - x = self._conv0(inputs) - if self._use_normalization: - x = self._norm0(x) - return self._activation_layer(x) - -""" -Architecture: https://arxiv.org/abs/1704.04861. - -"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision -Applications" Andrew G. 
Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, -Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam -""" -MNV1_BLOCK_SPECS = { - 'spec_name': 'MobileNetV1', - 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', - 'filters', 'is_output'], - 'block_specs': [ - ('convbn', 3, 2, 32, False), - ('depsepconv', 3, 1, 64, False), - ('depsepconv', 3, 2, 128, False), - ('depsepconv', 3, 1, 128, True), - ('depsepconv', 3, 2, 256, False), - ('depsepconv', 3, 1, 256, True), - ('depsepconv', 3, 2, 512, False), - ('depsepconv', 3, 1, 512, False), - ('depsepconv', 3, 1, 512, False), - ('depsepconv', 3, 1, 512, False), - ('depsepconv', 3, 1, 512, False), - ('depsepconv', 3, 1, 512, True), - ('depsepconv', 3, 2, 1024, False), - ('depsepconv', 3, 1, 1024, True), - ] -} - -""" -Architecture: https://arxiv.org/abs/1801.04381 - -"MobileNetV2: Inverted Residuals and Linear Bottlenecks" -Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen -""" -MNV2_BLOCK_SPECS = { - 'spec_name': 'MobileNetV2', - 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', - 'expand_ratio', 'is_output'], - 'block_specs': [ - ('convbn', 3, 2, 32, None, False), - ('invertedbottleneck', 3, 1, 16, 1., False), - ('invertedbottleneck', 3, 2, 24, 6., False), - ('invertedbottleneck', 3, 1, 24, 6., True), - ('invertedbottleneck', 3, 2, 32, 6., False), - ('invertedbottleneck', 3, 1, 32, 6., False), - ('invertedbottleneck', 3, 1, 32, 6., True), - ('invertedbottleneck', 3, 2, 64, 6., False), - ('invertedbottleneck', 3, 1, 64, 6., False), - ('invertedbottleneck', 3, 1, 64, 6., False), - ('invertedbottleneck', 3, 1, 64, 6., False), - ('invertedbottleneck', 3, 1, 96, 6., False), - ('invertedbottleneck', 3, 1, 96, 6., False), - ('invertedbottleneck', 3, 1, 96, 6., True), - ('invertedbottleneck', 3, 2, 160, 6., False), - ('invertedbottleneck', 3, 1, 160, 6., False), - ('invertedbottleneck', 3, 1, 160, 6., False), - ('invertedbottleneck', 3, 1, 320, 6., True), - ('convbn', 1, 1, 1280, None, False), - ] -} - -""" -Architecture: https://arxiv.org/abs/1905.02244 - -"Searching for MobileNetV3" -Andrew Howard, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, -Weijun Wang, Yukun Zhu, Ruoming Pang, Vijay Vasudevan, Quoc V. 
Le, Hartwig Adam -""" -MNV3Large_BLOCK_SPECS = { - 'spec_name': 'MobileNetV3Large', - 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', - 'activation', 'se_ratio', 'expand_ratio', - 'use_normalization', 'use_bias', 'is_output'], - 'block_specs': [ - ('convbn', 3, 2, 16, - 'hard_swish', None, None, True, False, False), - ('invertedbottleneck', 3, 1, 16, - 'relu', None, 1., None, False, False), - ('invertedbottleneck', 3, 2, 24, - 'relu', None, 4., None, False, False), - ('invertedbottleneck', 3, 1, 24, - 'relu', None, 3., None, False, True), - ('invertedbottleneck', 5, 2, 40, - 'relu', 0.25, 3., None, False, False), - ('invertedbottleneck', 5, 1, 40, - 'relu', 0.25, 3., None, False, False), - ('invertedbottleneck', 5, 1, 40, - 'relu', 0.25, 3., None, False, True), - ('invertedbottleneck', 3, 2, 80, - 'hard_swish', None, 6., None, False, False), - ('invertedbottleneck', 3, 1, 80, - 'hard_swish', None, 2.5, None, False, False), - ('invertedbottleneck', 3, 1, 80, - 'hard_swish', None, 2.3, None, False, False), - ('invertedbottleneck', 3, 1, 80, - 'hard_swish', None, 2.3, None, False, False), - ('invertedbottleneck', 3, 1, 112, - 'hard_swish', 0.25, 6., None, False, False), - ('invertedbottleneck', 3, 1, 112, - 'hard_swish', 0.25, 6., None, False, True), - ('invertedbottleneck', 5, 2, 160, - 'hard_swish', 0.25, 6., None, False, False), - ('invertedbottleneck', 5, 1, 160, - 'hard_swish', 0.25, 6., None, False, False), - ('invertedbottleneck', 5, 1, 160, - 'hard_swish', 0.25, 6., None, False, True), - ('convbn', 1, 1, 960, - 'hard_swish', None, None, True, False, False), - ('gpooling', None, None, None, - None, None, None, None, None, False), - ('convbn', 1, 1, 1280, - 'hard_swish', None, None, False, True, False), - ] -} - -MNV3Small_BLOCK_SPECS = { - 'spec_name': 'MobileNetV3Small', - 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', - 'activation', 'se_ratio', 'expand_ratio', - 'use_normalization', 'use_bias', 'is_output'], - 'block_specs': [ - ('convbn', 3, 2, 16, - 'hard_swish', None, None, True, False, False), - ('invertedbottleneck', 3, 2, 16, - 'relu', 0.25, 1, None, False, True), - ('invertedbottleneck', 3, 2, 24, - 'relu', None, 72. / 16, None, False, False), - ('invertedbottleneck', 3, 1, 24, - 'relu', None, 88. 
/ 24, None, False, True), - ('invertedbottleneck', 5, 2, 40, - 'hard_swish', 0.25, 4., None, False, False), - ('invertedbottleneck', 5, 1, 40, - 'hard_swish', 0.25, 6., None, False, False), - ('invertedbottleneck', 5, 1, 40, - 'hard_swish', 0.25, 6., None, False, False), - ('invertedbottleneck', 5, 1, 48, - 'hard_swish', 0.25, 3., None, False, False), - ('invertedbottleneck', 5, 1, 48, - 'hard_swish', 0.25, 3., None, False, True), - ('invertedbottleneck', 5, 2, 96, - 'hard_swish', 0.25, 6., None, False, False), - ('invertedbottleneck', 5, 1, 96, - 'hard_swish', 0.25, 6., None, False, False), - ('invertedbottleneck', 5, 1, 96, - 'hard_swish', 0.25, 6., None, False, True), - ('convbn', 1, 1, 576, - 'hard_swish', None, None, True, False, False), - ('gpooling', None, None, None, - None, None, None, None, None, False), - ('convbn', 1, 1, 1024, - 'hard_swish', None, None, False, True, False), - ] -} - -""" -The EdgeTPU version is taken from -github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v3.py -""" -MNV3EdgeTPU_BLOCK_SPECS = { - 'spec_name': 'MobileNetV3EdgeTPU', - 'block_spec_schema': ['block_fn', 'kernel_size', 'strides', 'filters', - 'activation', 'se_ratio', 'expand_ratio', - 'use_residual', 'use_depthwise', 'is_output'], - 'block_specs': [ - ('convbn', 3, 2, 32, 'relu', None, None, None, None, False), - ('invertedbottleneck', 3, 1, 16, 'relu', None, 1., True, False, False), - ('invertedbottleneck', 3, 2, 32, 'relu', None, 8., True, False, False), - ('invertedbottleneck', 3, 1, 32, 'relu', None, 4., True, False, False), - ('invertedbottleneck', 3, 1, 32, 'relu', None, 4., True, False, False), - ('invertedbottleneck', 3, 1, 32, 'relu', None, 4., True, False, True), - ('invertedbottleneck', 3, 2, 48, 'relu', None, 8., True, False, False), - ('invertedbottleneck', 3, 1, 48, 'relu', None, 4., True, False, False), - ('invertedbottleneck', 3, 1, 48, 'relu', None, 4., True, False, False), - ('invertedbottleneck', 3, 1, 48, 'relu', None, 4., True, False, True), - ('invertedbottleneck', 3, 2, 96, 'relu', None, 8., True, True, False), - ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True, False), - ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True, False), - ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True, False), - ('invertedbottleneck', 3, 1, 96, 'relu', None, 8., False, True, False), - ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True, False), - ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True, False), - ('invertedbottleneck', 3, 1, 96, 'relu', None, 4., True, True, True), - ('invertedbottleneck', 5, 2, 160, 'relu', None, 8., True, True, False), - ('invertedbottleneck', 5, 1, 160, 'relu', None, 4., True, True, False), - ('invertedbottleneck', 5, 1, 160, 'relu', None, 4., True, True, False), - ('invertedbottleneck', 5, 1, 160, 'relu', None, 4., True, True, False), - ('invertedbottleneck', 3, 1, 192, 'relu', None, 8., True, True, True), - ('convbn', 1, 1, 1280, 'relu', None, None, None, None, False), - ] -} - -""" -Architecture: https://arxiv.org/pdf/2008.08178.pdf - -"Discovering Multi-Hardware Mobile Models via Architecture Search" -Grace Chu, Okan Arikan, Gabriel Bender, Weijun Wang, -Achille Brighton, Pieter-Jan Kindermans, Hanxiao Liu, -Berkin Akin, Suyog Gupta, and Andrew Howard -""" -MNMultiMAX_BLOCK_SPECS = { - 'spec_name': 'MobileNetMultiMAX', - 'block_spec_schema': [ - 'block_fn', 'kernel_size', 'strides', 'filters', 'activation', - 'expand_ratio', 'use_normalization', 'use_bias', 'is_output' - ], 
- 'block_specs': [ - ('convbn', 3, 2, 32, 'relu', None, True, False, False), - ('invertedbottleneck', 3, 2, 32, 'relu', 3., None, False, True), - ('invertedbottleneck', 5, 2, 64, 'relu', 6., None, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 2., None, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 2., None, False, True), - ('invertedbottleneck', 5, 2, 128, 'relu', 6., None, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 4., None, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., None, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., None, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 6., None, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., None, False, True), - ('invertedbottleneck', 3, 2, 160, 'relu', 6., None, False, False), - ('invertedbottleneck', 5, 1, 160, 'relu', 4., None, False, False), - ('invertedbottleneck', 3, 1, 160, 'relu', 5., None, False, False), - ('invertedbottleneck', 5, 1, 160, 'relu', 4., None, False, True), - ('convbn', 1, 1, 960, 'relu', None, True, False, False), - ('gpooling', None, None, None, None, None, None, None, False), - # Remove bias and add batch norm for the last layer to support QAT - # and achieve slightly better accuracy. - ('convbn', 1, 1, 1280, 'relu', None, True, False, False), - ] -} - -MNMultiAVG_BLOCK_SPECS = { - 'spec_name': 'MobileNetMultiAVG', - 'block_spec_schema': [ - 'block_fn', 'kernel_size', 'strides', 'filters', 'activation', - 'expand_ratio', 'use_normalization', 'use_bias', 'is_output' - ], - 'block_specs': [ - ('convbn', 3, 2, 32, 'relu', None, True, False, False), - ('invertedbottleneck', 3, 2, 32, 'relu', 3., None, False, False), - ('invertedbottleneck', 3, 1, 32, 'relu', 2., None, False, True), - ('invertedbottleneck', 5, 2, 64, 'relu', 5., None, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 3., None, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 2., None, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 3., None, False, True), - ('invertedbottleneck', 5, 2, 128, 'relu', 6., None, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., None, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., None, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., None, False, False), - ('invertedbottleneck', 3, 1, 160, 'relu', 6., None, False, False), - ('invertedbottleneck', 3, 1, 160, 'relu', 4., None, False, True), - ('invertedbottleneck', 3, 2, 192, 'relu', 6., None, False, False), - ('invertedbottleneck', 5, 1, 192, 'relu', 4., None, False, False), - ('invertedbottleneck', 5, 1, 192, 'relu', 4., None, False, False), - ('invertedbottleneck', 5, 1, 192, 'relu', 4., None, False, True), - ('convbn', 1, 1, 960, 'relu', None, True, False, False), - ('gpooling', None, None, None, None, None, None, None, False), - # Remove bias and add batch norm for the last layer to support QAT - # and achieve slightly better accuracy. - ('convbn', 1, 1, 1280, 'relu', None, True, False, False), - ] -} - -# Similar to MobileNetMultiAVG and used for segmentation task. -# Reduced the filters by a factor of 2 in the last block. 
-MNMultiAVG_SEG_BLOCK_SPECS = { - 'spec_name': - 'MobileNetMultiAVGSeg', - 'block_spec_schema': [ - 'block_fn', 'kernel_size', 'strides', 'filters', 'activation', - 'expand_ratio', 'use_normalization', 'use_bias', 'is_output' - ], - 'block_specs': [ - ('convbn', 3, 2, 32, 'relu', None, True, False, False), - ('invertedbottleneck', 3, 2, 32, 'relu', 3., True, False, False), - ('invertedbottleneck', 3, 1, 32, 'relu', 2., True, False, True), - ('invertedbottleneck', 5, 2, 64, 'relu', 5., True, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 3., True, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 2., True, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 3., True, False, True), - ('invertedbottleneck', 5, 2, 128, 'relu', 6., True, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., True, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., True, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., True, False, False), - ('invertedbottleneck', 3, 1, 160, 'relu', 6., True, False, False), - ('invertedbottleneck', 3, 1, 160, 'relu', 4., True, False, True), - ('invertedbottleneck', 3, 2, 192, 'relu', 6., True, False, False), - ('invertedbottleneck', 5, 1, 96, 'relu', 2., True, False, False), - ('invertedbottleneck', 5, 1, 96, 'relu', 4., True, False, False), - ('invertedbottleneck', 5, 1, 96, 'relu', 4., True, False, True), - ('convbn', 1, 1, 448, 'relu', None, True, False, True), - ('gpooling', None, None, None, None, None, None, None, False), - # Remove bias and add batch norm for the last layer to support QAT - # and achieve slightly better accuracy. - ('convbn', 1, 1, 1280, 'relu', None, True, False, False), - ] -} - -# Similar to MobileNetMultiMax and used for segmentation task. -# Reduced the filters by a factor of 2 in the last block. -MNMultiMAX_SEG_BLOCK_SPECS = { - 'spec_name': - 'MobileNetMultiMAXSeg', - 'block_spec_schema': [ - 'block_fn', 'kernel_size', 'strides', 'filters', 'activation', - 'expand_ratio', 'use_normalization', 'use_bias', 'is_output' - ], - 'block_specs': [ - ('convbn', 3, 2, 32, 'relu', None, True, False, False), - ('invertedbottleneck', 3, 2, 32, 'relu', 3., True, False, True), - ('invertedbottleneck', 5, 2, 64, 'relu', 6., True, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 2., True, False, False), - ('invertedbottleneck', 3, 1, 64, 'relu', 2., True, False, True), - ('invertedbottleneck', 5, 2, 128, 'relu', 6., True, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 4., True, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., True, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., True, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 6., True, False, False), - ('invertedbottleneck', 3, 1, 128, 'relu', 3., True, False, True), - ('invertedbottleneck', 3, 2, 160, 'relu', 6., True, False, False), - ('invertedbottleneck', 5, 1, 96, 'relu', 2., True, False, False), - ('invertedbottleneck', 3, 1, 96, 'relu', 4., True, False, False), - ('invertedbottleneck', 5, 1, 96, 'relu', 320.0 / 96, True, False, True), - ('convbn', 1, 1, 448, 'relu', None, True, False, True), - ('gpooling', None, None, None, None, None, None, None, False), - # Remove bias and add batch norm for the last layer to support QAT - # and achieve slightly better accuracy. 
- ('convbn', 1, 1, 1280, 'relu', None, True, False, False), - ] -} - -# A smaller MNV3Small, with reduced filters for the last few layers -MNV3SmallReducedFilters = { - 'spec_name': - 'MobilenetV3SmallReducedFilters', - 'block_spec_schema': [ - 'block_fn', 'kernel_size', 'strides', 'filters', 'activation', - 'se_ratio', 'expand_ratio', 'use_normalization', 'use_bias', 'is_output' - ], - 'block_specs': [ - ('convbn', 3, 2, 16, 'hard_swish', None, None, True, False, False), - ('invertedbottleneck', 3, 2, 16, 'relu', 0.25, 1, None, False, True), - ('invertedbottleneck', 3, 2, 24, 'relu', None, 72. / 16, None, False, - False), - ('invertedbottleneck', 3, 1, 24, 'relu', None, 88. / 24, None, False, - True), - ('invertedbottleneck', 5, 2, 40, 'hard_swish', 0.25, 4, None, False, - False), - ('invertedbottleneck', 5, 1, 40, 'hard_swish', 0.25, 6, None, False, - False), - ('invertedbottleneck', 5, 1, 40, 'hard_swish', 0.25, 6, None, False, - False), - ('invertedbottleneck', 5, 1, 48, 'hard_swish', 0.25, 3, None, False, - False), - ('invertedbottleneck', 5, 1, 48, 'hard_swish', 0.25, 3, None, False, - True), - # Layers below are different from MobileNetV3Small and have - # half as many filters - ('invertedbottleneck', 5, 2, 48, 'hard_swish', 0.25, 3, None, False, - False), - ('invertedbottleneck', 5, 1, 48, 'hard_swish', 0.25, 6, None, False, - False), - ('invertedbottleneck', 5, 1, 48, 'hard_swish', 0.25, 6, None, False, - True), - ('convbn', 1, 1, 288, 'hard_swish', None, None, True, False, False), - ('gpooling', None, None, None, None, None, None, None, None, False), - ('convbn', 1, 1, 1024, 'hard_swish', None, None, False, True, False), - ] -} - -SUPPORTED_SPECS_MAP = { - 'MobileNetV1': MNV1_BLOCK_SPECS, - 'MobileNetV2': MNV2_BLOCK_SPECS, - 'MobileNetV3Large': MNV3Large_BLOCK_SPECS, - 'MobileNetV3Small': MNV3Small_BLOCK_SPECS, - 'MobileNetV3EdgeTPU': MNV3EdgeTPU_BLOCK_SPECS, - 'MobileNetMultiMAX': MNMultiMAX_BLOCK_SPECS, - 'MobileNetMultiAVG': MNMultiAVG_BLOCK_SPECS, - 'MobileNetMultiAVGSeg': MNMultiAVG_SEG_BLOCK_SPECS, - 'MobileNetMultiMAXSeg': MNMultiMAX_SEG_BLOCK_SPECS, - 'MobileNetV3SmallReducedFilters': MNV3SmallReducedFilters, -} - - -@dataclasses.dataclass -class BlockSpec(hyperparams.Config): - """A container class that specifies the block configuration for MobileNet.""" - - block_fn: str = 'convbn' - kernel_size: int = 3 - strides: int = 1 - filters: int = 32 - use_bias: bool = False - use_normalization: bool = True - activation: str = 'relu6' - # Used for block type InvertedResConv. - expand_ratio: Optional[float] = 6. - # Used for block type InvertedResConv with SE. - se_ratio: Optional[float] = None - use_depthwise: bool = True - use_residual: bool = True - is_output: bool = True - - -def block_spec_decoder( - specs: Dict[Any, Any], - filter_size_scale: float, - # Set to 1 for mobilenetv1. - divisible_by: int = 8, - finegrain_classification_mode: bool = True): - """Decodes specs for a block. - - Args: - specs: A `dict` specification of block specs of a mobilenet version. - filter_size_scale: A `float` multiplier for the filter size for all - convolution ops. The value must be greater than zero. Typical usage will - be to set this value in (0, 1) to reduce the number of parameters or - computation cost of the model. - divisible_by: An `int` that ensures all inner dimensions are divisible by - this number. - finegrain_classification_mode: If True, the model will keep the last layer - large even for small multipliers, following - https://arxiv.org/abs/1801.04381. 
- - Returns: - A list of `BlockSpec` that defines structure of the base network. - """ - - spec_name = specs['spec_name'] - block_spec_schema = specs['block_spec_schema'] - block_specs = specs['block_specs'] - - if not block_specs: - raise ValueError( - 'The block spec cannot be empty for {} !'.format(spec_name)) - - if len(block_specs[0]) != len(block_spec_schema): - raise ValueError('The block spec values {} do not match with ' - 'the schema {}'.format(block_specs[0], block_spec_schema)) - - decoded_specs = [] - - for s in block_specs: - kw_s = dict(zip(block_spec_schema, s)) - decoded_specs.append(BlockSpec(**kw_s)) - - # This adjustment applies to V2 and V3 - if (spec_name != 'MobileNetV1' - and finegrain_classification_mode - and filter_size_scale < 1.0): - decoded_specs[-1].filters /= filter_size_scale # pytype: disable=annotation-type-mismatch - - for ds in decoded_specs: - if ds.filters: - ds.filters = nn_layers.round_filters(filters=ds.filters, - multiplier=filter_size_scale, - divisor=divisible_by, - min_depth=8) - - return decoded_specs - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class MobileNet(tf.keras.Model): - """Creates a MobileNet family model.""" - - def __init__( - self, - model_id: str = 'MobileNetV2', - filter_size_scale: float = 1.0, - input_specs: tf.keras.layers.InputSpec = layers.InputSpec( - shape=[None, None, None, 3]), - # The following are for hyper-parameter tuning. - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - # The following should be kept the same most of the time. - output_stride: Optional[int] = None, - min_depth: int = 8, - # divisible is not used in MobileNetV1. - divisible_by: int = 8, - stochastic_depth_drop_rate: float = 0.0, - regularize_depthwise: bool = False, - use_sync_bn: bool = False, - # finegrain is not used in MobileNetV1. - finegrain_classification_mode: bool = True, - output_intermediate_endpoints: bool = False, - **kwargs): - """Initializes a MobileNet model. - - Args: - model_id: A `str` of MobileNet version. The supported values are - `MobileNetV1`, `MobileNetV2`, `MobileNetV3Large`, `MobileNetV3Small`, - `MobileNetV3EdgeTPU`, `MobileNetMultiMAX` and `MobileNetMultiAVG`. - filter_size_scale: A `float` of multiplier for the filters (number of - channels) for all convolution ops. The value must be greater than zero. - Typical usage will be to set this value in (0, 1) to reduce the number - of parameters or computation cost of the model. - input_specs: A `tf.keras.layers.InputSpec` of specs of the input tensor. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_initializer: A `str` for kernel initializer of convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - Default to None. - output_stride: An `int` that specifies the requested ratio of input to - output spatial resolution. If not None, then we invoke atrous - convolution if necessary to prevent the network from reducing the - spatial resolution of activation maps. Allowed values are 8 (accurate - fully convolutional mode), 16 (fast fully convolutional mode), 32 - (classification mode).
- min_depth: An `int` of minimum depth (number of channels) for all - convolution ops. Enforced when filter_size_scale < 1, and not an active - constraint when filter_size_scale >= 1. - divisible_by: An `int` that ensures all inner dimensions are divisible by - this number. - stochastic_depth_drop_rate: A `float` of drop rate for drop connect layer. - regularize_depthwise: If True, apply regularization on depthwise. - use_sync_bn: If True, use synchronized batch normalization. - finegrain_classification_mode: If True, the model will keep the last layer - large even for small multipliers, following - https://arxiv.org/abs/1801.04381. - output_intermediate_endpoints: A `bool` of whether or not output the - intermediate endpoints. - **kwargs: Additional keyword arguments to be passed. - """ - if model_id not in SUPPORTED_SPECS_MAP: - raise ValueError('The MobileNet version {} ' - 'is not supported'.format(model_id)) - - if filter_size_scale <= 0: - raise ValueError('filter_size_scale is not greater than zero.') - - if output_stride is not None: - if model_id == 'MobileNetV1': - if output_stride not in [8, 16, 32]: - raise ValueError('Only allowed output_stride values are 8, 16, 32.') - else: - if output_stride == 0 or (output_stride > 1 and output_stride % 2): - raise ValueError('Output stride must be None, 1 or a multiple of 2.') - - self._model_id = model_id - self._input_specs = input_specs - self._filter_size_scale = filter_size_scale - self._min_depth = min_depth - self._output_stride = output_stride - self._divisible_by = divisible_by - self._stochastic_depth_drop_rate = stochastic_depth_drop_rate - self._regularize_depthwise = regularize_depthwise - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._use_sync_bn = use_sync_bn - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._finegrain_classification_mode = finegrain_classification_mode - self._output_intermediate_endpoints = output_intermediate_endpoints - - inputs = tf.keras.Input(shape=input_specs.shape[1:]) - - block_specs = SUPPORTED_SPECS_MAP.get(model_id) - self._decoded_specs = block_spec_decoder( - specs=block_specs, - filter_size_scale=self._filter_size_scale, - divisible_by=self._get_divisible_by(), - finegrain_classification_mode=self._finegrain_classification_mode) - - x, endpoints, next_endpoint_level = self._mobilenet_base(inputs=inputs) - - self._output_specs = {l: endpoints[l].get_shape() for l in endpoints} - # Don't include the final layer in `self._output_specs` to support decoders. - endpoints[str(next_endpoint_level)] = x - - super(MobileNet, self).__init__( - inputs=inputs, outputs=endpoints, **kwargs) - - def _get_divisible_by(self): - if self._model_id == 'MobileNetV1': - return 1 - else: - return self._divisible_by - - def _mobilenet_base(self, - inputs: tf.Tensor - ) -> Tuple[tf.Tensor, Dict[str, tf.Tensor], int]: - """Builds the base MobileNet architecture. - - Args: - inputs: A `tf.Tensor` of shape `[batch_size, height, width, channels]`. - - Returns: - A tuple of output Tensor and dictionary that collects endpoints. - """ - - input_shape = inputs.get_shape().as_list() - if len(input_shape) != 4: - raise ValueError('Expected rank 4 input, was: %d' % len(input_shape)) - - # The current_stride variable keeps track of the output stride of the - # activations, i.e., the running product of convolution strides up to the - # current network layer.
This allows us to invoke atrous convolution - # whenever applying the next convolution would result in the activations - # having output stride larger than the target output_stride. - current_stride = 1 - - # The atrous convolution rate parameter. - rate = 1 - - net = inputs - endpoints = {} - endpoint_level = 2 - for i, block_def in enumerate(self._decoded_specs): - block_name = 'block_group_{}_{}'.format(block_def.block_fn, i) - # A small catch for gpooling block with None strides - if not block_def.strides: - block_def.strides = 1 - if (self._output_stride is not None and - current_stride == self._output_stride): - # If we have reached the target output_stride, then we need to employ - # atrous convolution with stride=1 and multiply the atrous rate by the - # current unit's stride for use in subsequent layers. - layer_stride = 1 - layer_rate = rate - rate *= block_def.strides - else: - layer_stride = block_def.strides - layer_rate = 1 - current_stride *= block_def.strides - - intermediate_endpoints = {} - if block_def.block_fn == 'convbn': - - net = Conv2DBNBlock( - filters=block_def.filters, - kernel_size=block_def.kernel_size, - strides=block_def.strides, - activation=block_def.activation, - use_bias=block_def.use_bias, - use_normalization=block_def.use_normalization, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon - )(net) - - elif block_def.block_fn == 'depsepconv': - net = nn_blocks.DepthwiseSeparableConvBlock( - filters=block_def.filters, - kernel_size=block_def.kernel_size, - strides=layer_stride, - activation=block_def.activation, - dilation_rate=layer_rate, - regularize_depthwise=self._regularize_depthwise, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon, - )(net) - - elif block_def.block_fn == 'invertedbottleneck': - use_rate = rate - if layer_rate > 1 and block_def.kernel_size != 1: - # We will apply atrous rate in the following cases: - # 1) When kernel_size is not in params, the operation then uses - # default kernel size 3x3. - # 2) When kernel_size is in params, and if the kernel_size is not - # equal to (1, 1) (there is no need to apply atrous convolution to - # any 1x1 convolution). 
- use_rate = layer_rate - in_filters = net.shape.as_list()[-1] - block = nn_blocks.InvertedBottleneckBlock( - in_filters=in_filters, - out_filters=block_def.filters, - kernel_size=block_def.kernel_size, - strides=layer_stride, - expand_ratio=block_def.expand_ratio, - se_ratio=block_def.se_ratio, - expand_se_in_filters=True, - se_gating_activation='hard_sigmoid', - activation=block_def.activation, - use_depthwise=block_def.use_depthwise, - use_residual=block_def.use_residual, - dilation_rate=use_rate, - regularize_depthwise=self._regularize_depthwise, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon, - stochastic_depth_drop_rate=self._stochastic_depth_drop_rate, - divisible_by=self._get_divisible_by(), - output_intermediate_endpoints=self._output_intermediate_endpoints, - ) - if self._output_intermediate_endpoints: - net, intermediate_endpoints = block(net) - else: - net = block(net) - - elif block_def.block_fn == 'gpooling': - net = layers.GlobalAveragePooling2D()(net) - net = layers.Reshape((1, 1, net.shape[1]))(net) - - else: - raise ValueError('Unknown block type {} for layer {}'.format( - block_def.block_fn, i)) - - net = tf.keras.layers.Activation('linear', name=block_name)(net) - - if block_def.is_output: - endpoints[str(endpoint_level)] = net - for key, tensor in intermediate_endpoints.items(): - endpoints[str(endpoint_level) + '/' + key] = tensor - if current_stride != self._output_stride: - endpoint_level += 1 - - if str(endpoint_level) in endpoints: - endpoint_level += 1 - return net, endpoints, endpoint_level - - def get_config(self): - config_dict = { - 'model_id': self._model_id, - 'filter_size_scale': self._filter_size_scale, - 'min_depth': self._min_depth, - 'output_stride': self._output_stride, - 'divisible_by': self._divisible_by, - 'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate, - 'regularize_depthwise': self._regularize_depthwise, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'finegrain_classification_mode': self._finegrain_classification_mode, - } - return config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self): - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@factory.register_backbone_builder('mobilenet') -def build_mobilenet( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None -) -> tf.keras.Model: - """Builds MobileNet backbone from a config.""" - backbone_type = backbone_config.type - backbone_cfg = backbone_config.get() - assert backbone_type == 'mobilenet', (f'Inconsistent backbone type ' - f'{backbone_type}') - - return MobileNet( - model_id=backbone_cfg.model_id, - filter_size_scale=backbone_cfg.filter_size_scale, - input_specs=input_specs, - stochastic_depth_drop_rate=backbone_cfg.stochastic_depth_drop_rate, - output_stride=backbone_cfg.output_stride, - output_intermediate_endpoints=backbone_cfg.output_intermediate_endpoints, - 
use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) diff --git a/official/vision/beta/modeling/backbones/mobilenet_test.py b/official/vision/beta/modeling/backbones/mobilenet_test.py deleted file mode 100644 index dabaa4c82..000000000 --- a/official/vision/beta/modeling/backbones/mobilenet_test.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for MobileNet.""" - -import itertools -import math - -# Import libraries - -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import mobilenet - - -class MobileNetTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - 'MobileNetV1', - 'MobileNetV2', - 'MobileNetV3Large', - 'MobileNetV3Small', - 'MobileNetV3EdgeTPU', - 'MobileNetMultiAVG', - 'MobileNetMultiMAX', - 'MobileNetMultiAVGSeg', - 'MobileNetMultiMAXSeg', - 'MobileNetV3SmallReducedFilters', - ) - def test_serialize_deserialize(self, model_id): - # Create a network object that sets all of its config options. - kwargs = dict( - model_id=model_id, - filter_size_scale=1.0, - stochastic_depth_drop_rate=None, - use_sync_bn=False, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - norm_momentum=0.99, - norm_epsilon=0.001, - output_stride=None, - min_depth=8, - divisible_by=8, - regularize_depthwise=False, - finegrain_classification_mode=True - ) - network = mobilenet.MobileNet(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = mobilenet.MobileNet.from_config(network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. 
- self.assertAllEqual(network.get_config(), new_network.get_config()) - - @parameterized.parameters( - itertools.product( - [1, 3], - [ - 'MobileNetV1', - 'MobileNetV2', - 'MobileNetV3Large', - 'MobileNetV3Small', - 'MobileNetV3EdgeTPU', - 'MobileNetMultiAVG', - 'MobileNetMultiMAX', - 'MobileNetMultiAVGSeg', - 'MobileNetMultiMAXSeg', - 'MobileNetV3SmallReducedFilters', - ], - )) - def test_input_specs(self, input_dim, model_id): - """Test different input feature dimensions.""" - tf.keras.backend.set_image_data_format('channels_last') - - input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim]) - network = mobilenet.MobileNet(model_id=model_id, input_specs=input_specs) - - inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1) - _ = network(inputs) - - @parameterized.parameters( - itertools.product( - [ - 'MobileNetV1', - 'MobileNetV2', - 'MobileNetV3Large', - 'MobileNetV3Small', - 'MobileNetV3EdgeTPU', - 'MobileNetMultiAVG', - 'MobileNetMultiMAX', - 'MobileNetMultiAVGSeg', - 'MobileNetV3SmallReducedFilters', - ], - [32, 224], - )) - def test_mobilenet_creation(self, model_id, - input_size): - """Test creation of MobileNet family models.""" - tf.keras.backend.set_image_data_format('channels_last') - - mobilenet_layers = { - # The number of filters of layers having outputs been collected - # for filter_size_scale = 1.0 - 'MobileNetV1': [128, 256, 512, 1024], - 'MobileNetV2': [24, 32, 96, 320], - 'MobileNetV3Small': [16, 24, 48, 96], - 'MobileNetV3Large': [24, 40, 112, 160], - 'MobileNetV3EdgeTPU': [32, 48, 96, 192], - 'MobileNetMultiMAX': [32, 64, 128, 160], - 'MobileNetMultiAVG': [32, 64, 160, 192], - 'MobileNetMultiAVGSeg': [32, 64, 160, 96], - 'MobileNetMultiMAXSeg': [32, 64, 128, 96], - 'MobileNetV3SmallReducedFilters': [16, 24, 48, 48], - } - - network = mobilenet.MobileNet(model_id=model_id, - filter_size_scale=1.0) - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - - for idx, num_filter in enumerate(mobilenet_layers[model_id]): - self.assertAllEqual( - [1, input_size / 2 ** (idx+2), input_size / 2 ** (idx+2), num_filter], - endpoints[str(idx+2)].shape.as_list()) - - @parameterized.parameters( - itertools.product( - [ - 'MobileNetV1', - 'MobileNetV2', - 'MobileNetV3Large', - 'MobileNetV3Small', - 'MobileNetV3EdgeTPU', - 'MobileNetMultiAVG', - 'MobileNetMultiMAX', - 'MobileNetMultiAVGSeg', - 'MobileNetMultiMAXSeg', - 'MobileNetV3SmallReducedFilters', - ], - [32, 224], - )) - def test_mobilenet_intermediate_layers(self, model_id, input_size): - tf.keras.backend.set_image_data_format('channels_last') - # Tests the mobilenet intermediate depthwise layers. - mobilenet_depthwise_layers = { - # The number of filters of depthwise layers having outputs been - # collected for filter_size_scale = 1.0. Only tests the mobilenet - # model with inverted bottleneck block using depthwise which excludes - # MobileNetV1. 
- 'MobileNetV1': [], - 'MobileNetV2': [144, 192, 576, 960], - 'MobileNetV3Small': [16, 88, 144, 576], - 'MobileNetV3Large': [72, 120, 672, 960], - 'MobileNetV3EdgeTPU': [None, None, 384, 1280], - 'MobileNetMultiMAX': [96, 128, 384, 640], - 'MobileNetMultiAVG': [64, 192, 640, 768], - 'MobileNetMultiAVGSeg': [64, 192, 640, 384], - 'MobileNetMultiMAXSeg': [96, 128, 384, 320], - 'MobileNetV3SmallReducedFilters': [16, 88, 144, 288], - } - network = mobilenet.MobileNet(model_id=model_id, - filter_size_scale=1.0, - output_intermediate_endpoints=True) - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - - for idx, num_filter in enumerate(mobilenet_depthwise_layers[model_id]): - # Not using depthwise conv in this layer. - if num_filter is None: - continue - - self.assertAllEqual( - [1, input_size / 2**(idx + 2), input_size / 2**(idx + 2), num_filter], - endpoints[str(idx + 2) + '/depthwise'].shape.as_list()) - - @parameterized.parameters( - itertools.product( - [ - 'MobileNetV1', - 'MobileNetV2', - 'MobileNetV3Large', - 'MobileNetV3Small', - 'MobileNetV3EdgeTPU', - 'MobileNetMultiAVG', - 'MobileNetMultiMAX', - 'MobileNetMultiMAX', - 'MobileNetMultiAVGSeg', - 'MobileNetMultiMAXSeg', - 'MobileNetV3SmallReducedFilters', - ], - [1.0, 0.75], - )) - def test_mobilenet_scaling(self, model_id, - filter_size_scale): - """Test for creation of a MobileNet classifier.""" - mobilenet_params = { - ('MobileNetV1', 1.0): 3228864, - ('MobileNetV1', 0.75): 1832976, - ('MobileNetV2', 1.0): 2257984, - ('MobileNetV2', 0.75): 1382064, - ('MobileNetV3Large', 1.0): 4226432, - ('MobileNetV3Large', 0.75): 2731616, - ('MobileNetV3Small', 1.0): 1529968, - ('MobileNetV3Small', 0.75): 1026552, - ('MobileNetV3EdgeTPU', 1.0): 2849312, - ('MobileNetV3EdgeTPU', 0.75): 1737288, - ('MobileNetMultiAVG', 1.0): 3704416, - ('MobileNetMultiAVG', 0.75): 2349704, - ('MobileNetMultiMAX', 1.0): 3174560, - ('MobileNetMultiMAX', 0.75): 2045816, - ('MobileNetMultiAVGSeg', 1.0): 2239840, - ('MobileNetMultiAVGSeg', 0.75): 1395272, - ('MobileNetMultiMAXSeg', 1.0): 1929088, - ('MobileNetMultiMAXSeg', 0.75): 1216544, - ('MobileNetV3SmallReducedFilters', 1.0): 694880, - ('MobileNetV3SmallReducedFilters', 0.75): 505960, - } - - input_size = 224 - network = mobilenet.MobileNet(model_id=model_id, - filter_size_scale=filter_size_scale) - self.assertEqual(network.count_params(), - mobilenet_params[(model_id, filter_size_scale)]) - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - _ = network(inputs) - - @parameterized.parameters( - itertools.product( - [ - 'MobileNetV1', - 'MobileNetV2', - 'MobileNetV3Large', - 'MobileNetV3Small', - 'MobileNetV3EdgeTPU', - 'MobileNetMultiAVG', - 'MobileNetMultiMAX', - 'MobileNetMultiAVGSeg', - 'MobileNetMultiMAXSeg', - 'MobileNetV3SmallReducedFilters', - ], - [8, 16, 32], - )) - def test_mobilenet_output_stride(self, model_id, output_stride): - """Test for creation of a MobileNet with different output strides.""" - tf.keras.backend.set_image_data_format('channels_last') - - mobilenet_layers = { - # The number of filters of the layers outputs been collected - # for filter_size_scale = 1.0. 
- 'MobileNetV1': 1024, - 'MobileNetV2': 320, - 'MobileNetV3Small': 96, - 'MobileNetV3Large': 160, - 'MobileNetV3EdgeTPU': 192, - 'MobileNetMultiMAX': 160, - 'MobileNetMultiAVG': 192, - 'MobileNetMultiAVGSeg': 448, - 'MobileNetMultiMAXSeg': 448, - 'MobileNetV3SmallReducedFilters': 48, - } - - network = mobilenet.MobileNet( - model_id=model_id, filter_size_scale=1.0, output_stride=output_stride) - level = int(math.log2(output_stride)) - input_size = 224 - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - num_filter = mobilenet_layers[model_id] - self.assertAllEqual( - [1, input_size / output_stride, input_size / output_stride, num_filter], - endpoints[str(level)].shape.as_list()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/backbones/resnet.py b/official/vision/beta/modeling/backbones/resnet.py deleted file mode 100644 index a451216d5..000000000 --- a/official/vision/beta/modeling/backbones/resnet.py +++ /dev/null @@ -1,432 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of ResNet and ResNet-RS models.""" - -from typing import Callable, Optional - -# Import libraries -import tensorflow as tf - -from official.modeling import hyperparams -from official.modeling import tf_utils -from official.vision.beta.modeling.backbones import factory -from official.vision.beta.modeling.layers import nn_blocks -from official.vision.beta.modeling.layers import nn_layers - -layers = tf.keras.layers - -# Specifications for different ResNet variants. -# Each entry specifies block configurations of the particular ResNet variant. 
-# Each element in the block configuration is in the following format: -# (block_fn, num_filters, block_repeats) -RESNET_SPECS = { - 10: [ - ('residual', 64, 1), - ('residual', 128, 1), - ('residual', 256, 1), - ('residual', 512, 1), - ], - 18: [ - ('residual', 64, 2), - ('residual', 128, 2), - ('residual', 256, 2), - ('residual', 512, 2), - ], - 34: [ - ('residual', 64, 3), - ('residual', 128, 4), - ('residual', 256, 6), - ('residual', 512, 3), - ], - 50: [ - ('bottleneck', 64, 3), - ('bottleneck', 128, 4), - ('bottleneck', 256, 6), - ('bottleneck', 512, 3), - ], - 101: [ - ('bottleneck', 64, 3), - ('bottleneck', 128, 4), - ('bottleneck', 256, 23), - ('bottleneck', 512, 3), - ], - 152: [ - ('bottleneck', 64, 3), - ('bottleneck', 128, 8), - ('bottleneck', 256, 36), - ('bottleneck', 512, 3), - ], - 200: [ - ('bottleneck', 64, 3), - ('bottleneck', 128, 24), - ('bottleneck', 256, 36), - ('bottleneck', 512, 3), - ], - 270: [ - ('bottleneck', 64, 4), - ('bottleneck', 128, 29), - ('bottleneck', 256, 53), - ('bottleneck', 512, 4), - ], - 350: [ - ('bottleneck', 64, 4), - ('bottleneck', 128, 36), - ('bottleneck', 256, 72), - ('bottleneck', 512, 4), - ], - 420: [ - ('bottleneck', 64, 4), - ('bottleneck', 128, 44), - ('bottleneck', 256, 87), - ('bottleneck', 512, 4), - ], -} - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class ResNet(tf.keras.Model): - """Creates ResNet and ResNet-RS family models. - - This implements the Deep Residual Network from: - Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. - Deep Residual Learning for Image Recognition. - (https://arxiv.org/pdf/1512.03385) and - Irwan Bello, William Fedus, Xianzhi Du, Ekin D. Cubuk, Aravind Srinivas, - Tsung-Yi Lin, Jonathon Shlens, Barret Zoph. - Revisiting ResNets: Improved Training and Scaling Strategies. - (https://arxiv.org/abs/2103.07579). - """ - - def __init__( - self, - model_id: int, - input_specs: tf.keras.layers.InputSpec = layers.InputSpec( - shape=[None, None, None, 3]), - depth_multiplier: float = 1.0, - stem_type: str = 'v0', - resnetd_shortcut: bool = False, - replace_stem_max_pool: bool = False, - se_ratio: Optional[float] = None, - init_stochastic_depth_rate: float = 0.0, - scale_stem: bool = True, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bn_trainable: bool = True, - **kwargs): - """Initializes a ResNet model. - - Args: - model_id: An `int` of the depth of ResNet backbone model. - input_specs: A `tf.keras.layers.InputSpec` of the input tensor. - depth_multiplier: A `float` of the depth multiplier to uniformly scale up - all layers in channel size. This argument is also referred to as - `width_multiplier` in (https://arxiv.org/abs/2103.07579). - stem_type: A `str` of stem type of ResNet. Default to `v0`. If set to - `v1`, use ResNet-D type stem (https://arxiv.org/abs/1812.01187). - resnetd_shortcut: A `bool` of whether to use ResNet-D shortcut in - downsampling blocks. - replace_stem_max_pool: A `bool` of whether to replace the max pool in stem - with a stride-2 conv. - se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer. - init_stochastic_depth_rate: A `float` of initial stochastic depth rate. - scale_stem: A `bool` of whether to scale stem layers. - activation: A `str` name of the activation function.
- use_sync_bn: If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A small `float` added to variance to avoid dividing by zero. - kernel_initializer: A str for kernel initializer of convolutional layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - Default to None. - bn_trainable: A `bool` that indicates whether batch norm layers should be - trainable. Default to True. - **kwargs: Additional keyword arguments to be passed. - """ - self._model_id = model_id - self._input_specs = input_specs - self._depth_multiplier = depth_multiplier - self._stem_type = stem_type - self._resnetd_shortcut = resnetd_shortcut - self._replace_stem_max_pool = replace_stem_max_pool - self._se_ratio = se_ratio - self._init_stochastic_depth_rate = init_stochastic_depth_rate - self._scale_stem = scale_stem - self._use_sync_bn = use_sync_bn - self._activation = activation - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - if use_sync_bn: - self._norm = layers.experimental.SyncBatchNormalization - else: - self._norm = layers.BatchNormalization - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._bn_trainable = bn_trainable - - if tf.keras.backend.image_data_format() == 'channels_last': - bn_axis = -1 - else: - bn_axis = 1 - - # Build ResNet. - inputs = tf.keras.Input(shape=input_specs.shape[1:]) - - stem_depth_multiplier = self._depth_multiplier if scale_stem else 1.0 - if stem_type == 'v0': - x = layers.Conv2D( - filters=int(64 * stem_depth_multiplier), - kernel_size=7, - strides=2, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=bn_axis, - momentum=norm_momentum, - epsilon=norm_epsilon, - trainable=bn_trainable)( - x) - x = tf_utils.get_activation(activation, use_keras_layer=True)(x) - elif stem_type == 'v1': - x = layers.Conv2D( - filters=int(32 * stem_depth_multiplier), - kernel_size=3, - strides=2, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=bn_axis, - momentum=norm_momentum, - epsilon=norm_epsilon, - trainable=bn_trainable)( - x) - x = tf_utils.get_activation(activation, use_keras_layer=True)(x) - x = layers.Conv2D( - filters=int(32 * stem_depth_multiplier), - kernel_size=3, - strides=1, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=bn_axis, - momentum=norm_momentum, - epsilon=norm_epsilon, - trainable=bn_trainable)( - x) - x = tf_utils.get_activation(activation, use_keras_layer=True)(x) - x = layers.Conv2D( - filters=int(64 * stem_depth_multiplier), - kernel_size=3, - strides=1, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=bn_axis, - momentum=norm_momentum, - epsilon=norm_epsilon, - trainable=bn_trainable)( - x) - x = 
tf_utils.get_activation(activation, use_keras_layer=True)(x) - else: - raise ValueError('Stem type {} not supported.'.format(stem_type)) - - if replace_stem_max_pool: - x = layers.Conv2D( - filters=int(64 * self._depth_multiplier), - kernel_size=3, - strides=2, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=bn_axis, - momentum=norm_momentum, - epsilon=norm_epsilon, - trainable=bn_trainable)( - x) - x = tf_utils.get_activation(activation, use_keras_layer=True)(x) - else: - x = layers.MaxPool2D(pool_size=3, strides=2, padding='same')(x) - - endpoints = {} - for i, spec in enumerate(RESNET_SPECS[model_id]): - if spec[0] == 'residual': - block_fn = nn_blocks.ResidualBlock - elif spec[0] == 'bottleneck': - block_fn = nn_blocks.BottleneckBlock - else: - raise ValueError('Block fn `{}` is not supported.'.format(spec[0])) - x = self._block_group( - inputs=x, - filters=int(spec[1] * self._depth_multiplier), - strides=(1 if i == 0 else 2), - block_fn=block_fn, - block_repeats=spec[2], - stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate( - self._init_stochastic_depth_rate, i + 2, 5), - name='block_group_l{}'.format(i + 2)) - endpoints[str(i + 2)] = x - - self._output_specs = {l: endpoints[l].get_shape() for l in endpoints} - - super(ResNet, self).__init__(inputs=inputs, outputs=endpoints, **kwargs) - - def _block_group(self, - inputs: tf.Tensor, - filters: int, - strides: int, - block_fn: Callable[..., tf.keras.layers.Layer], - block_repeats: int = 1, - stochastic_depth_drop_rate: float = 0.0, - name: str = 'block_group'): - """Creates one group of blocks for the ResNet model. - - Args: - inputs: A `tf.Tensor` of size `[batch, channels, height, width]`. - filters: An `int` number of filters for the first convolution of the - layer. - strides: An `int` stride to use for the first convolution of the layer. - If greater than 1, this layer will downsample the input. - block_fn: The type of block group. Either `nn_blocks.ResidualBlock` or - `nn_blocks.BottleneckBlock`. - block_repeats: An `int` number of blocks contained in the layer. - stochastic_depth_drop_rate: A `float` of drop rate of the current block - group. - name: A `str` name for the block. - - Returns: - The output `tf.Tensor` of the block layer. 
- """ - x = block_fn( - filters=filters, - strides=strides, - use_projection=True, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - se_ratio=self._se_ratio, - resnetd_shortcut=self._resnetd_shortcut, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon, - bn_trainable=self._bn_trainable)( - inputs) - - for _ in range(1, block_repeats): - x = block_fn( - filters=filters, - strides=1, - use_projection=False, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - se_ratio=self._se_ratio, - resnetd_shortcut=self._resnetd_shortcut, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon, - bn_trainable=self._bn_trainable)( - x) - - return tf.keras.layers.Activation('linear', name=name)(x) - - def get_config(self): - config_dict = { - 'model_id': self._model_id, - 'depth_multiplier': self._depth_multiplier, - 'stem_type': self._stem_type, - 'resnetd_shortcut': self._resnetd_shortcut, - 'replace_stem_max_pool': self._replace_stem_max_pool, - 'activation': self._activation, - 'se_ratio': self._se_ratio, - 'init_stochastic_depth_rate': self._init_stochastic_depth_rate, - 'scale_stem': self._scale_stem, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'bn_trainable': self._bn_trainable - } - return config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self): - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@factory.register_backbone_builder('resnet') -def build_resnet( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model: # pytype: disable=annotation-type-mismatch # typed-keras - """Builds ResNet backbone from a config.""" - backbone_type = backbone_config.type - backbone_cfg = backbone_config.get() - assert backbone_type == 'resnet', (f'Inconsistent backbone type ' - f'{backbone_type}') - - return ResNet( - model_id=backbone_cfg.model_id, - input_specs=input_specs, - depth_multiplier=backbone_cfg.depth_multiplier, - stem_type=backbone_cfg.stem_type, - resnetd_shortcut=backbone_cfg.resnetd_shortcut, - replace_stem_max_pool=backbone_cfg.replace_stem_max_pool, - se_ratio=backbone_cfg.se_ratio, - init_stochastic_depth_rate=backbone_cfg.stochastic_depth_drop_rate, - scale_stem=backbone_cfg.scale_stem, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer, - bn_trainable=backbone_cfg.bn_trainable) diff --git a/official/vision/beta/modeling/backbones/resnet_3d.py b/official/vision/beta/modeling/backbones/resnet_3d.py deleted file mode 100644 index ac949b58d..000000000 --- 
a/official/vision/beta/modeling/backbones/resnet_3d.py +++ /dev/null @@ -1,454 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of 3D Residual Networks.""" -from typing import Callable, List, Tuple, Optional - -# Import libraries -import tensorflow as tf - -from official.modeling import hyperparams -from official.modeling import tf_utils -from official.vision.beta.modeling.backbones import factory -from official.vision.beta.modeling.layers import nn_blocks_3d -from official.vision.beta.modeling.layers import nn_layers - -layers = tf.keras.layers - -RESNET_SPECS = { - 50: [ - ('bottleneck3d', 64, 3), - ('bottleneck3d', 128, 4), - ('bottleneck3d', 256, 6), - ('bottleneck3d', 512, 3), - ], - 101: [ - ('bottleneck3d', 64, 3), - ('bottleneck3d', 128, 4), - ('bottleneck3d', 256, 23), - ('bottleneck3d', 512, 3), - ], - 152: [ - ('bottleneck3d', 64, 3), - ('bottleneck3d', 128, 8), - ('bottleneck3d', 256, 36), - ('bottleneck3d', 512, 3), - ], - 200: [ - ('bottleneck3d', 64, 3), - ('bottleneck3d', 128, 24), - ('bottleneck3d', 256, 36), - ('bottleneck3d', 512, 3), - ], - 270: [ - ('bottleneck3d', 64, 4), - ('bottleneck3d', 128, 29), - ('bottleneck3d', 256, 53), - ('bottleneck3d', 512, 4), - ], - 300: [ - ('bottleneck3d', 64, 4), - ('bottleneck3d', 128, 36), - ('bottleneck3d', 256, 54), - ('bottleneck3d', 512, 4), - ], - 350: [ - ('bottleneck3d', 64, 4), - ('bottleneck3d', 128, 36), - ('bottleneck3d', 256, 72), - ('bottleneck3d', 512, 4), - ], -} - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class ResNet3D(tf.keras.Model): - """Creates a 3D ResNet family model.""" - - def __init__( - self, - model_id: int, - temporal_strides: List[int], - temporal_kernel_sizes: List[Tuple[int]], - use_self_gating: Optional[List[int]] = None, - input_specs: tf.keras.layers.InputSpec = layers.InputSpec( - shape=[None, None, None, None, 3]), - stem_type: str = 'v0', - stem_conv_temporal_kernel_size: int = 5, - stem_conv_temporal_stride: int = 2, - stem_pool_temporal_stride: int = 2, - init_stochastic_depth_rate: float = 0.0, - activation: str = 'relu', - se_ratio: Optional[float] = None, - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - """Initializes a 3D ResNet model. - - Args: - model_id: An `int` of depth of ResNet backbone model. - temporal_strides: A list of integers that specifies the temporal strides - for all 3d blocks. - temporal_kernel_sizes: A list of tuples that specifies the temporal kernel - sizes for all 3d blocks in different block groups. - use_self_gating: A list of booleans to specify applying self-gating module - or not in each block group. If None, self-gating is not applied. - input_specs: A `tf.keras.layers.InputSpec` of the input tensor. 
- stem_type: A `str` of stem type of ResNet. Default to `v0`. If set to - `v1`, use ResNet-D type stem (https://arxiv.org/abs/1812.01187). - stem_conv_temporal_kernel_size: An `int` of temporal kernel size for the - first conv layer. - stem_conv_temporal_stride: An `int` of temporal stride for the first conv - layer. - stem_pool_temporal_stride: An `int` of temporal stride for the first pool - layer. - init_stochastic_depth_rate: A `float` of initial stochastic depth rate. - activation: A `str` of name of the activation function. - se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer. - use_sync_bn: If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_initializer: A str for kernel initializer of convolutional layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - Default to None. - **kwargs: Additional keyword arguments to be passed. - """ - self._model_id = model_id - self._temporal_strides = temporal_strides - self._temporal_kernel_sizes = temporal_kernel_sizes - self._input_specs = input_specs - self._stem_type = stem_type - self._stem_conv_temporal_kernel_size = stem_conv_temporal_kernel_size - self._stem_conv_temporal_stride = stem_conv_temporal_stride - self._stem_pool_temporal_stride = stem_pool_temporal_stride - self._use_self_gating = use_self_gating - self._se_ratio = se_ratio - self._init_stochastic_depth_rate = init_stochastic_depth_rate - self._use_sync_bn = use_sync_bn - self._activation = activation - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - if use_sync_bn: - self._norm = layers.experimental.SyncBatchNormalization - else: - self._norm = layers.BatchNormalization - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - if tf.keras.backend.image_data_format() == 'channels_last': - bn_axis = -1 - else: - bn_axis = 1 - - # Build ResNet3D backbone. - inputs = tf.keras.Input(shape=input_specs.shape[1:]) - - # Build stem. 
- if stem_type == 'v0': - x = layers.Conv3D( - filters=64, - kernel_size=[stem_conv_temporal_kernel_size, 7, 7], - strides=[stem_conv_temporal_stride, 2, 2], - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) - elif stem_type == 'v1': - x = layers.Conv3D( - filters=32, - kernel_size=[stem_conv_temporal_kernel_size, 3, 3], - strides=[stem_conv_temporal_stride, 2, 2], - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) - x = layers.Conv3D( - filters=32, - kernel_size=[1, 3, 3], - strides=[1, 1, 1], - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) - x = layers.Conv3D( - filters=64, - kernel_size=[1, 3, 3], - strides=[1, 1, 1], - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) - else: - raise ValueError(f'Stem type {stem_type} not supported.') - - temporal_kernel_size = 1 if stem_pool_temporal_stride == 1 else 3 - x = layers.MaxPool3D( - pool_size=[temporal_kernel_size, 3, 3], - strides=[stem_pool_temporal_stride, 2, 2], - padding='same')( - x) - - # Build intermediate blocks and endpoints. 
- resnet_specs = RESNET_SPECS[model_id] - if len(temporal_strides) != len(resnet_specs) or len( - temporal_kernel_sizes) != len(resnet_specs): - raise ValueError( - 'Number of blocks in temporal specs should equal to resnet_specs.') - - endpoints = {} - for i, resnet_spec in enumerate(resnet_specs): - if resnet_spec[0] == 'bottleneck3d': - block_fn = nn_blocks_3d.BottleneckBlock3D - else: - raise ValueError('Block fn `{}` is not supported.'.format( - resnet_spec[0])) - - x = self._block_group( - inputs=x, - filters=resnet_spec[1], - temporal_kernel_sizes=temporal_kernel_sizes[i], - temporal_strides=temporal_strides[i], - spatial_strides=(1 if i == 0 else 2), - block_fn=block_fn, - block_repeats=resnet_spec[2], - stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate( - self._init_stochastic_depth_rate, i + 2, 5), - use_self_gating=use_self_gating[i] if use_self_gating else False, - name='block_group_l{}'.format(i + 2)) - endpoints[str(i + 2)] = x - - self._output_specs = {l: endpoints[l].get_shape() for l in endpoints} - - super(ResNet3D, self).__init__(inputs=inputs, outputs=endpoints, **kwargs) - - def _block_group(self, - inputs: tf.Tensor, - filters: int, - temporal_kernel_sizes: Tuple[int], - temporal_strides: int, - spatial_strides: int, - block_fn: Callable[ - ..., - tf.keras.layers.Layer] = nn_blocks_3d.BottleneckBlock3D, - block_repeats: int = 1, - stochastic_depth_drop_rate: float = 0.0, - use_self_gating: bool = False, - name: str = 'block_group'): - """Creates one group of blocks for the ResNet3D model. - - Args: - inputs: A `tf.Tensor` of size `[batch, channels, height, width]`. - filters: An `int` of number of filters for the first convolution of the - layer. - temporal_kernel_sizes: A tuple that specifies the temporal kernel sizes - for each block in the current group. - temporal_strides: An `int` of temporal strides for the first convolution - in this group. - spatial_strides: An `int` stride to use for the first convolution of the - layer. If greater than 1, this layer will downsample the input. - block_fn: Either `nn_blocks.ResidualBlock` or `nn_blocks.BottleneckBlock`. - block_repeats: An `int` of number of blocks contained in the layer. - stochastic_depth_drop_rate: A `float` of drop rate of the current block - group. - use_self_gating: A `bool` that specifies whether to apply self-gating - module or not. - name: A `str` name for the block. - - Returns: - The output `tf.Tensor` of the block layer. - """ - if len(temporal_kernel_sizes) != block_repeats: - raise ValueError( - 'Number of elements in `temporal_kernel_sizes` must equal to `block_repeats`.' - ) - - # Only apply self-gating module in the last block. 
- use_self_gating_list = [False] * (block_repeats - 1) + [use_self_gating] - - x = block_fn( - filters=filters, - temporal_kernel_size=temporal_kernel_sizes[0], - temporal_strides=temporal_strides, - spatial_strides=spatial_strides, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - use_self_gating=use_self_gating_list[0], - se_ratio=self._se_ratio, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - inputs) - - for i in range(1, block_repeats): - x = block_fn( - filters=filters, - temporal_kernel_size=temporal_kernel_sizes[i], - temporal_strides=1, - spatial_strides=1, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - use_self_gating=use_self_gating_list[i], - se_ratio=self._se_ratio, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - x) - - return tf.identity(x, name=name) - - def get_config(self): - config_dict = { - 'model_id': self._model_id, - 'temporal_strides': self._temporal_strides, - 'temporal_kernel_sizes': self._temporal_kernel_sizes, - 'stem_type': self._stem_type, - 'stem_conv_temporal_kernel_size': self._stem_conv_temporal_kernel_size, - 'stem_conv_temporal_stride': self._stem_conv_temporal_stride, - 'stem_pool_temporal_stride': self._stem_pool_temporal_stride, - 'use_self_gating': self._use_self_gating, - 'se_ratio': self._se_ratio, - 'init_stochastic_depth_rate': self._init_stochastic_depth_rate, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - } - return config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self): - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@factory.register_backbone_builder('resnet_3d') -def build_resnet3d( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None -) -> tf.keras.Model: - """Builds ResNet 3d backbone from a config.""" - backbone_cfg = backbone_config.get() - - # Flatten configs before passing to the backbone. 
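  # Illustrative example of the flattened form (values borrowed from the unit
  # test below, not mandated here): a 4-group config yields something like
  #   temporal_strides      == [1, 1, 1, 1]
  #   temporal_kernel_sizes == [(3, 3, 3), (3, 1, 3, 1),
  #                             (3, 1, 3, 1, 3, 1), (1, 3, 1)]
  #   use_self_gating       == [True, False, True, False]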
- temporal_strides = [] - temporal_kernel_sizes = [] - use_self_gating = [] - for block_spec in backbone_cfg.block_specs: - temporal_strides.append(block_spec.temporal_strides) - temporal_kernel_sizes.append(block_spec.temporal_kernel_sizes) - use_self_gating.append(block_spec.use_self_gating) - - return ResNet3D( - model_id=backbone_cfg.model_id, - temporal_strides=temporal_strides, - temporal_kernel_sizes=temporal_kernel_sizes, - use_self_gating=use_self_gating, - input_specs=input_specs, - stem_type=backbone_cfg.stem_type, - stem_conv_temporal_kernel_size=backbone_cfg - .stem_conv_temporal_kernel_size, - stem_conv_temporal_stride=backbone_cfg.stem_conv_temporal_stride, - stem_pool_temporal_stride=backbone_cfg.stem_pool_temporal_stride, - init_stochastic_depth_rate=backbone_cfg.stochastic_depth_drop_rate, - se_ratio=backbone_cfg.se_ratio, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) - - -@factory.register_backbone_builder('resnet_3d_rs') -def build_resnet3d_rs( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None -) -> tf.keras.Model: - """Builds ResNet-3D-RS backbone from a config.""" - backbone_cfg = backbone_config.get() - - # Flatten configs before passing to the backbone. - temporal_strides = [] - temporal_kernel_sizes = [] - use_self_gating = [] - for i, block_spec in enumerate(backbone_cfg.block_specs): - temporal_strides.append(block_spec.temporal_strides) - use_self_gating.append(block_spec.use_self_gating) - block_repeats_i = RESNET_SPECS[backbone_cfg.model_id][i][-1] - temporal_kernel_sizes.append(list(block_spec.temporal_kernel_sizes) * - block_repeats_i) - return ResNet3D( - model_id=backbone_cfg.model_id, - temporal_strides=temporal_strides, - temporal_kernel_sizes=temporal_kernel_sizes, - use_self_gating=use_self_gating, - input_specs=input_specs, - stem_type=backbone_cfg.stem_type, - stem_conv_temporal_kernel_size=backbone_cfg - .stem_conv_temporal_kernel_size, - stem_conv_temporal_stride=backbone_cfg.stem_conv_temporal_stride, - stem_pool_temporal_stride=backbone_cfg.stem_pool_temporal_stride, - init_stochastic_depth_rate=backbone_cfg.stochastic_depth_drop_rate, - se_ratio=backbone_cfg.se_ratio, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) diff --git a/official/vision/beta/modeling/backbones/resnet_3d_test.py b/official/vision/beta/modeling/backbones/resnet_3d_test.py deleted file mode 100644 index 4e65a9fc2..000000000 --- a/official/vision/beta/modeling/backbones/resnet_3d_test.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for resnet.""" - -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import resnet_3d - - -class ResNet3DTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (128, 50, 4, 'v0', False, 0.0), - (128, 50, 4, 'v1', False, 0.2), - (256, 50, 4, 'v1', True, 0.2), - ) - def test_network_creation(self, input_size, model_id, endpoint_filter_scale, - stem_type, se_ratio, init_stochastic_depth_rate): - """Test creation of ResNet3D family models.""" - tf.keras.backend.set_image_data_format('channels_last') - temporal_strides = [1, 1, 1, 1] - temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1), - (1, 3, 1)] - use_self_gating = [True, False, True, False] - - network = resnet_3d.ResNet3D( - model_id=model_id, - temporal_strides=temporal_strides, - temporal_kernel_sizes=temporal_kernel_sizes, - use_self_gating=use_self_gating, - stem_type=stem_type, - se_ratio=se_ratio, - init_stochastic_depth_rate=init_stochastic_depth_rate) - inputs = tf.keras.Input(shape=(8, input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - - self.assertAllEqual([ - 1, 2, input_size / 2**2, input_size / 2**2, 64 * endpoint_filter_scale - ], endpoints['2'].shape.as_list()) - self.assertAllEqual([ - 1, 2, input_size / 2**3, input_size / 2**3, 128 * endpoint_filter_scale - ], endpoints['3'].shape.as_list()) - self.assertAllEqual([ - 1, 2, input_size / 2**4, input_size / 2**4, 256 * endpoint_filter_scale - ], endpoints['4'].shape.as_list()) - self.assertAllEqual([ - 1, 2, input_size / 2**5, input_size / 2**5, 512 * endpoint_filter_scale - ], endpoints['5'].shape.as_list()) - - def test_serialize_deserialize(self): - # Create a network object that sets all of its config options. - kwargs = dict( - model_id=50, - temporal_strides=[1, 1, 1, 1], - temporal_kernel_sizes=[(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1), - (1, 3, 1)], - stem_type='v0', - stem_conv_temporal_kernel_size=5, - stem_conv_temporal_stride=2, - stem_pool_temporal_stride=2, - se_ratio=0.0, - use_self_gating=None, - init_stochastic_depth_rate=0.0, - use_sync_bn=False, - activation='relu', - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - ) - network = resnet_3d.ResNet3D(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = resnet_3d.ResNet3D.from_config(network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(network.get_config(), new_network.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/backbones/resnet_deeplab.py b/official/vision/beta/modeling/backbones/resnet_deeplab.py deleted file mode 100644 index 5f7836448..000000000 --- a/official/vision/beta/modeling/backbones/resnet_deeplab.py +++ /dev/null @@ -1,366 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of Residual Networks with Deeplab modifications.""" - -from typing import Callable, Optional, Tuple, List - -import numpy as np -import tensorflow as tf -from official.modeling import hyperparams -from official.modeling import tf_utils -from official.vision.beta.modeling.backbones import factory -from official.vision.beta.modeling.layers import nn_blocks -from official.vision.beta.modeling.layers import nn_layers - -layers = tf.keras.layers - -# Specifications for different ResNet variants. -# Each entry specifies block configurations of the particular ResNet variant. -# Each element in the block configuration is in the following format: -# (block_fn, num_filters, block_repeats) -RESNET_SPECS = { - 50: [ - ('bottleneck', 64, 3), - ('bottleneck', 128, 4), - ('bottleneck', 256, 6), - ('bottleneck', 512, 3), - ], - 101: [ - ('bottleneck', 64, 3), - ('bottleneck', 128, 4), - ('bottleneck', 256, 23), - ('bottleneck', 512, 3), - ], -} - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class DilatedResNet(tf.keras.Model): - """Creates a ResNet model with Deeplabv3 modifications. - - This backbone is suitable for semantic segmentation. This implements - Liang-Chieh Chen, George Papandreou, Florian Schroff, Hartwig Adam. - Rethinking Atrous Convolution for Semantic Image Segmentation. - (https://arxiv.org/pdf/1706.05587) - """ - - def __init__( - self, - model_id: int, - output_stride: int, - input_specs: tf.keras.layers.InputSpec = layers.InputSpec( - shape=[None, None, None, 3]), - stem_type: str = 'v0', - se_ratio: Optional[float] = None, - init_stochastic_depth_rate: float = 0.0, - multigrid: Optional[Tuple[int]] = None, - last_stage_repeats: int = 1, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - """Initializes a ResNet model with DeepLab modification. - - Args: - model_id: An `int` specifies depth of ResNet backbone model. - output_stride: An `int` of output stride, ratio of input to output - resolution. - input_specs: A `tf.keras.layers.InputSpec` of the input tensor. - stem_type: A `str` of stem type. Can be `v0` or `v1`. `v1` replaces 7x7 - conv by 3 3x3 convs. - se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer. - init_stochastic_depth_rate: A `float` of initial stochastic depth rate. - multigrid: A tuple of the same length as the number of blocks in the last - resnet stage. - last_stage_repeats: An `int` that specifies how many times last stage is - repeated. - activation: A `str` name of the activation function. - use_sync_bn: If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_initializer: A str for kernel initializer of convolutional layers. 
- kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - Default to None. - **kwargs: Additional keyword arguments to be passed. - """ - self._model_id = model_id - self._output_stride = output_stride - self._input_specs = input_specs - self._use_sync_bn = use_sync_bn - self._activation = activation - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - if use_sync_bn: - self._norm = layers.experimental.SyncBatchNormalization - else: - self._norm = layers.BatchNormalization - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._stem_type = stem_type - self._se_ratio = se_ratio - self._init_stochastic_depth_rate = init_stochastic_depth_rate - - if tf.keras.backend.image_data_format() == 'channels_last': - bn_axis = -1 - else: - bn_axis = 1 - - # Build ResNet. - inputs = tf.keras.Input(shape=input_specs.shape[1:]) - - if stem_type == 'v0': - x = layers.Conv2D( - filters=64, - kernel_size=7, - strides=2, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) - elif stem_type == 'v1': - x = layers.Conv2D( - filters=64, - kernel_size=3, - strides=2, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) - x = layers.Conv2D( - filters=64, - kernel_size=3, - strides=1, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) - x = layers.Conv2D( - filters=128, - kernel_size=3, - strides=1, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - x) - x = tf_utils.get_activation(activation)(x) - else: - raise ValueError('Stem type {} not supported.'.format(stem_type)) - - x = layers.MaxPool2D(pool_size=3, strides=2, padding='same')(x) - - normal_resnet_stage = int(np.math.log2(self._output_stride)) - 2 - - endpoints = {} - for i in range(normal_resnet_stage + 1): - spec = RESNET_SPECS[model_id][i] - if spec[0] == 'bottleneck': - block_fn = nn_blocks.BottleneckBlock - else: - raise ValueError('Block fn `{}` is not supported.'.format(spec[0])) - x = self._block_group( - inputs=x, - filters=spec[1], - strides=(1 if i == 0 else 2), - dilation_rate=1, - block_fn=block_fn, - block_repeats=spec[2], - stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate( - self._init_stochastic_depth_rate, i + 2, 4 + last_stage_repeats), - name='block_group_l{}'.format(i + 2)) - endpoints[str(i + 2)] = x - - dilation_rate = 2 - for i in range(normal_resnet_stage + 1, 3 + last_stage_repeats): - spec = 
RESNET_SPECS[model_id][i] if i < 3 else RESNET_SPECS[model_id][-1]
-      if spec[0] == 'bottleneck':
-        block_fn = nn_blocks.BottleneckBlock
-      else:
-        raise ValueError('Block fn `{}` is not supported.'.format(spec[0]))
-      x = self._block_group(
-          inputs=x,
-          filters=spec[1],
-          strides=1,
-          dilation_rate=dilation_rate,
-          block_fn=block_fn,
-          block_repeats=spec[2],
-          stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate(
-              self._init_stochastic_depth_rate, i + 2, 4 + last_stage_repeats),
-          multigrid=multigrid if i >= 3 else None,
-          name='block_group_l{}'.format(i + 2))
-      dilation_rate *= 2
-
-    endpoints[str(normal_resnet_stage + 2)] = x
-
-    self._output_specs = {l: endpoints[l].get_shape() for l in endpoints}
-
-    super(DilatedResNet, self).__init__(
-        inputs=inputs, outputs=endpoints, **kwargs)
-
-  def _block_group(self,
-                   inputs: tf.Tensor,
-                   filters: int,
-                   strides: int,
-                   dilation_rate: int,
-                   block_fn: Callable[..., tf.keras.layers.Layer],
-                   block_repeats: int = 1,
-                   stochastic_depth_drop_rate: float = 0.0,
-                   multigrid: Optional[List[int]] = None,
-                   name: str = 'block_group'):
-    """Creates one group of blocks for the ResNet model.
-
-    Deeplab applies strides at the last block.
-
-    Args:
-      inputs: A `tf.Tensor` of size `[batch, channels, height, width]`.
-      filters: An `int` of number of filters for the first convolution of the
-        layer.
-      strides: An `int` of stride to use for the first convolution of the
-        layer. If greater than 1, this layer will downsample the input.
-      dilation_rate: An `int` of the dilation rate for convolutions in this
-        group.
-      block_fn: Either `nn_blocks.ResidualBlock` or `nn_blocks.BottleneckBlock`.
-      block_repeats: An `int` of number of blocks contained in the layer.
-      stochastic_depth_drop_rate: A `float` of drop rate of the current block
-        group.
-      multigrid: A list of `int` or None. If specified, the dilation rate for
-        each block is scaled up by its corresponding factor in the multigrid.
-      name: A `str` name for the block.
-
-    Returns:
-      The output `tf.Tensor` of the block layer.
-    """
-    if multigrid is not None and len(multigrid) != block_repeats:
-      raise ValueError('The length of `multigrid` must match `block_repeats`.')
-
-    if multigrid is None:
-      multigrid = [1] * block_repeats
-
-    # TODO(arashwan): move striding to the end of the block.
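    # Worked example (multigrid values assumed for illustration): with
    # dilation_rate=2 and multigrid=[1, 2, 4], the three blocks built below use
    # effective dilation rates 2, 4 and 8 (dilation_rate * multigrid[i]).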
- x = block_fn( - filters=filters, - strides=strides, - dilation_rate=dilation_rate * multigrid[0], - use_projection=True, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - se_ratio=self._se_ratio, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - inputs) - for i in range(1, block_repeats): - x = block_fn( - filters=filters, - strides=1, - dilation_rate=dilation_rate * multigrid[i], - use_projection=False, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - se_ratio=self._se_ratio, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - x) - - return tf.identity(x, name=name) - - def get_config(self): - config_dict = { - 'model_id': self._model_id, - 'output_stride': self._output_stride, - 'stem_type': self._stem_type, - 'se_ratio': self._se_ratio, - 'init_stochastic_depth_rate': self._init_stochastic_depth_rate, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - } - return config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self): - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@factory.register_backbone_builder('dilated_resnet') -def build_dilated_resnet( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model: # pytype: disable=annotation-type-mismatch # typed-keras - """Builds ResNet backbone from a config.""" - backbone_type = backbone_config.type - backbone_cfg = backbone_config.get() - assert backbone_type == 'dilated_resnet', (f'Inconsistent backbone type ' - f'{backbone_type}') - - return DilatedResNet( - model_id=backbone_cfg.model_id, - output_stride=backbone_cfg.output_stride, - input_specs=input_specs, - stem_type=backbone_cfg.stem_type, - se_ratio=backbone_cfg.se_ratio, - init_stochastic_depth_rate=backbone_cfg.stochastic_depth_drop_rate, - multigrid=backbone_cfg.multigrid, - last_stage_repeats=backbone_cfg.last_stage_repeats, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) diff --git a/official/vision/beta/modeling/backbones/resnet_deeplab_test.py b/official/vision/beta/modeling/backbones/resnet_deeplab_test.py deleted file mode 100644 index 5621be1c4..000000000 --- a/official/vision/beta/modeling/backbones/resnet_deeplab_test.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for resnet_deeplab models.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.vision.beta.modeling.backbones import resnet_deeplab - - -class ResNetTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (128, 50, 4, 8), - (128, 101, 4, 8), - (128, 50, 4, 16), - (128, 101, 4, 16), - ) - def test_network_creation(self, input_size, model_id, - endpoint_filter_scale, output_stride): - """Test creation of ResNet models.""" - tf.keras.backend.set_image_data_format('channels_last') - - network = resnet_deeplab.DilatedResNet(model_id=model_id, - output_stride=output_stride) - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - print(endpoints) - self.assertAllEqual([ - 1, input_size / output_stride, input_size / output_stride, - 512 * endpoint_filter_scale - ], endpoints[str(int(np.math.log2(output_stride)))].shape.as_list()) - - @parameterized.parameters( - ('v0', None, 0.0), - ('v1', None, 0.0), - ('v1', 0.25, 0.0), - ('v1', 0.25, 0.2), - ) - def test_network_features(self, stem_type, se_ratio, - init_stochastic_depth_rate): - """Test additional features of ResNet models.""" - input_size = 128 - model_id = 50 - endpoint_filter_scale = 4 - output_stride = 8 - - tf.keras.backend.set_image_data_format('channels_last') - - network = resnet_deeplab.DilatedResNet( - model_id=model_id, - output_stride=output_stride, - stem_type=stem_type, - se_ratio=se_ratio, - init_stochastic_depth_rate=init_stochastic_depth_rate) - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - print(endpoints) - self.assertAllEqual([ - 1, input_size / output_stride, input_size / output_stride, - 512 * endpoint_filter_scale - ], endpoints[str(int(np.math.log2(output_stride)))].shape.as_list()) - - @combinations.generate( - combinations.combine( - strategy=[ - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - use_sync_bn=[False, True], - )) - def test_sync_bn_multiple_devices(self, strategy, use_sync_bn): - """Test for sync bn on TPU and GPU devices.""" - inputs = np.random.rand(64, 128, 128, 3) - - tf.keras.backend.set_image_data_format('channels_last') - - with strategy.scope(): - network = resnet_deeplab.DilatedResNet( - model_id=50, output_stride=8, use_sync_bn=use_sync_bn) - _ = network(inputs) - - @parameterized.parameters(1, 3, 4) - def test_input_specs(self, input_dim): - """Test different input feature dimensions.""" - tf.keras.backend.set_image_data_format('channels_last') - - input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim]) - network = resnet_deeplab.DilatedResNet( - model_id=50, output_stride=8, input_specs=input_specs) - - inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1) - _ = network(inputs) - - def test_serialize_deserialize(self): - # Create a network object that sets all of its 
config options. - kwargs = dict( - model_id=50, - output_stride=8, - stem_type='v0', - se_ratio=0.25, - init_stochastic_depth_rate=0.2, - use_sync_bn=False, - activation='relu', - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - ) - network = resnet_deeplab.DilatedResNet(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = resnet_deeplab.DilatedResNet.from_config(network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(network.get_config(), new_network.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/backbones/resnet_test.py b/official/vision/beta/modeling/backbones/resnet_test.py deleted file mode 100644 index 7e19f4980..000000000 --- a/official/vision/beta/modeling/backbones/resnet_test.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for resnet.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.vision.beta.modeling.backbones import resnet - - -class ResNetTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (128, 10, 1), - (128, 18, 1), - (128, 34, 1), - (128, 50, 4), - (128, 101, 4), - (128, 152, 4), - ) - def test_network_creation(self, input_size, model_id, - endpoint_filter_scale): - """Test creation of ResNet family models.""" - resnet_params = { - 10: 4915904, - 18: 11190464, - 34: 21306048, - 50: 23561152, - 101: 42605504, - 152: 58295232, - } - tf.keras.backend.set_image_data_format('channels_last') - - network = resnet.ResNet(model_id=model_id) - self.assertEqual(network.count_params(), resnet_params[model_id]) - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - - self.assertAllEqual( - [1, input_size / 2**2, input_size / 2**2, 64 * endpoint_filter_scale], - endpoints['2'].shape.as_list()) - self.assertAllEqual( - [1, input_size / 2**3, input_size / 2**3, 128 * endpoint_filter_scale], - endpoints['3'].shape.as_list()) - self.assertAllEqual( - [1, input_size / 2**4, input_size / 2**4, 256 * endpoint_filter_scale], - endpoints['4'].shape.as_list()) - self.assertAllEqual( - [1, input_size / 2**5, input_size / 2**5, 512 * endpoint_filter_scale], - endpoints['5'].shape.as_list()) - - @combinations.generate( - combinations.combine( - strategy=[ - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - use_sync_bn=[False, True], - )) - def 
test_sync_bn_multiple_devices(self, strategy, use_sync_bn): - """Test for sync bn on TPU and GPU devices.""" - inputs = np.random.rand(64, 128, 128, 3) - - tf.keras.backend.set_image_data_format('channels_last') - - with strategy.scope(): - network = resnet.ResNet(model_id=50, use_sync_bn=use_sync_bn) - _ = network(inputs) - - @parameterized.parameters( - (128, 34, 1, 'v0', None, 0.0, 1.0, False, False), - (128, 34, 1, 'v1', 0.25, 0.2, 1.25, True, True), - (128, 50, 4, 'v0', None, 0.0, 1.5, False, False), - (128, 50, 4, 'v1', 0.25, 0.2, 2.0, True, True), - ) - def test_resnet_rs(self, input_size, model_id, endpoint_filter_scale, - stem_type, se_ratio, init_stochastic_depth_rate, - depth_multiplier, resnetd_shortcut, replace_stem_max_pool): - """Test creation of ResNet family models.""" - tf.keras.backend.set_image_data_format('channels_last') - network = resnet.ResNet( - model_id=model_id, - depth_multiplier=depth_multiplier, - stem_type=stem_type, - resnetd_shortcut=resnetd_shortcut, - replace_stem_max_pool=replace_stem_max_pool, - se_ratio=se_ratio, - init_stochastic_depth_rate=init_stochastic_depth_rate) - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - _ = network(inputs) - - @parameterized.parameters(1, 3, 4) - def test_input_specs(self, input_dim): - """Test different input feature dimensions.""" - tf.keras.backend.set_image_data_format('channels_last') - - input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim]) - network = resnet.ResNet(model_id=50, input_specs=input_specs) - - inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1) - _ = network(inputs) - - def test_serialize_deserialize(self): - # Create a network object that sets all of its config options. - kwargs = dict( - model_id=50, - depth_multiplier=1.0, - stem_type='v0', - se_ratio=None, - resnetd_shortcut=False, - replace_stem_max_pool=False, - init_stochastic_depth_rate=0.0, - scale_stem=True, - use_sync_bn=False, - activation='relu', - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - bn_trainable=True) - network = resnet.ResNet(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = resnet.ResNet.from_config(network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(network.get_config(), new_network.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/backbones/revnet.py b/official/vision/beta/modeling/backbones/revnet.py deleted file mode 100644 index 2a5e6c19a..000000000 --- a/official/vision/beta/modeling/backbones/revnet.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of RevNet.""" - -from typing import Any, Callable, Dict, Optional -# Import libraries -import tensorflow as tf -from official.modeling import hyperparams -from official.modeling import tf_utils -from official.vision.beta.modeling.backbones import factory -from official.vision.beta.modeling.layers import nn_blocks - - -# Specifications for different RevNet variants. -# Each entry specifies block configurations of the particular RevNet variant. -# Each element in the block configuration is in the following format: -# (block_fn, num_filters, block_repeats) -REVNET_SPECS = { - 38: [ - ('residual', 32, 3), - ('residual', 64, 3), - ('residual', 112, 3), - ], - 56: [ - ('bottleneck', 128, 2), - ('bottleneck', 256, 2), - ('bottleneck', 512, 3), - ('bottleneck', 832, 2), - ], - 104: [ - ('bottleneck', 128, 2), - ('bottleneck', 256, 2), - ('bottleneck', 512, 11), - ('bottleneck', 832, 2), - ], -} - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class RevNet(tf.keras.Model): - """Creates a Reversible ResNet (RevNet) family model. - - This implements: - Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse. - The Reversible Residual Network: Backpropagation Without Storing - Activations. - (https://arxiv.org/pdf/1707.04585.pdf) - """ - - def __init__( - self, - model_id: int, - input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec( - shape=[None, None, None, 3]), - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - """Initializes a RevNet model. - - Args: - model_id: An `int` of depth/id of ResNet backbone model. - input_specs: A `tf.keras.layers.InputSpec` of the input tensor. - activation: A `str` name of the activation function. - use_sync_bn: If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_initializer: A str for kernel initializer of convolutional layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - **kwargs: Additional keyword arguments to be passed. - """ - self._model_id = model_id - self._input_specs = input_specs - self._use_sync_bn = use_sync_bn - self._activation = activation - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - - axis = -1 if tf.keras.backend.image_data_format() == 'channels_last' else 1 - - # Build RevNet. 
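    # Worked example of the even-filter constraint checked below (sketch only):
    # for RevNet-56 the first spec is ('bottleneck', 128, 2), so each
    # reversible block splits its input into two 64-channel halves and builds
    # the f and g branches with filters // 2 == 64.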
- inputs = tf.keras.Input(shape=input_specs.shape[1:]) - - x = tf.keras.layers.Conv2D( - filters=REVNET_SPECS[model_id][0][1], - kernel_size=7, strides=2, use_bias=False, padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer)(inputs) - x = self._norm( - axis=axis, momentum=norm_momentum, epsilon=norm_epsilon)(x) - x = tf_utils.get_activation(activation)(x) - x = tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same')(x) - - endpoints = {} - for i, spec in enumerate(REVNET_SPECS[model_id]): - if spec[0] == 'residual': - inner_block_fn = nn_blocks.ResidualInner - elif spec[0] == 'bottleneck': - inner_block_fn = nn_blocks.BottleneckResidualInner - else: - raise ValueError('Block fn `{}` is not supported.'.format(spec[0])) - - if spec[1] % 2 != 0: - raise ValueError('Number of output filters must be even to ensure ' - 'splitting in channel dimension for reversible blocks') - - x = self._block_group( - inputs=x, - filters=spec[1], - strides=(1 if i == 0 else 2), - inner_block_fn=inner_block_fn, - block_repeats=spec[2], - batch_norm_first=(i != 0), # Only skip on first block - name='revblock_group_{}'.format(i + 2)) - endpoints[str(i + 2)] = x - - self._output_specs = {l: endpoints[l].get_shape() for l in endpoints} - - super(RevNet, self).__init__(inputs=inputs, outputs=endpoints, **kwargs) - - def _block_group(self, - inputs: tf.Tensor, - filters: int, - strides: int, - inner_block_fn: Callable[..., tf.keras.layers.Layer], - block_repeats: int, - batch_norm_first: bool, - name: str = 'revblock_group') -> tf.Tensor: - """Creates one reversible block for RevNet model. - - Args: - inputs: A `tf.Tensor` of size `[batch, channels, height, width]`. - filters: An `int` number of filters for the first convolution of the - layer. - strides: An `int` stride to use for the first convolution of the layer. If - greater than 1, this block group will downsample the input. - inner_block_fn: Either `nn_blocks.ResidualInner` or - `nn_blocks.BottleneckResidualInner`. - block_repeats: An `int` number of blocks contained in this block group. - batch_norm_first: A `bool` that specifies whether to apply - BatchNormalization and activation layer before feeding into convolution - layers. - name: A `str` name for the block. - - Returns: - The output `tf.Tensor` of the block layer. 
- """ - x = inputs - for i in range(block_repeats): - is_first_block = i == 0 - # Only first residual layer in block gets downsampled - curr_strides = strides if is_first_block else 1 - f = inner_block_fn( - filters=filters // 2, - strides=curr_strides, - batch_norm_first=batch_norm_first and is_first_block, - kernel_regularizer=self._kernel_regularizer) - g = inner_block_fn( - filters=filters // 2, - strides=1, - batch_norm_first=batch_norm_first and is_first_block, - kernel_regularizer=self._kernel_regularizer) - x = nn_blocks.ReversibleLayer(f, g)(x) - - return tf.identity(x, name=name) - - def get_config(self) -> Dict[str, Any]: - config_dict = { - 'model_id': self._model_id, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - } - return config_dict - - @classmethod - def from_config(cls, - config: Dict[str, Any], - custom_objects: Optional[Any] = None) -> tf.keras.Model: - return cls(**config) - - @property - def output_specs(self) -> Dict[int, tf.TensorShape]: - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs # pytype: disable=bad-return-type # trace-all-classes - - -@factory.register_backbone_builder('revnet') -def build_revnet( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model: # pytype: disable=annotation-type-mismatch # typed-keras - """Builds RevNet backbone from a config.""" - backbone_type = backbone_config.type - backbone_cfg = backbone_config.get() - assert backbone_type == 'revnet', (f'Inconsistent backbone type ' - f'{backbone_type}') - - return RevNet( - model_id=backbone_cfg.model_id, - input_specs=input_specs, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) diff --git a/official/vision/beta/modeling/backbones/revnet_test.py b/official/vision/beta/modeling/backbones/revnet_test.py deleted file mode 100644 index 9cb0f7eba..000000000 --- a/official/vision/beta/modeling/backbones/revnet_test.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for RevNet.""" - -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import revnet - - -class RevNetTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (128, 56, 4), - (128, 104, 4), - ) - def test_network_creation(self, input_size, model_id, - endpoint_filter_scale): - """Test creation of RevNet family models.""" - tf.keras.backend.set_image_data_format('channels_last') - - network = revnet.RevNet(model_id=model_id) - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = network(inputs) - network.summary() - - self.assertAllEqual( - [1, input_size / 2**2, input_size / 2**2, 128 * endpoint_filter_scale], - endpoints['2'].shape.as_list()) - self.assertAllEqual( - [1, input_size / 2**3, input_size / 2**3, 256 * endpoint_filter_scale], - endpoints['3'].shape.as_list()) - self.assertAllEqual( - [1, input_size / 2**4, input_size / 2**4, 512 * endpoint_filter_scale], - endpoints['4'].shape.as_list()) - self.assertAllEqual( - [1, input_size / 2**5, input_size / 2**5, 832 * endpoint_filter_scale], - endpoints['5'].shape.as_list()) - - @parameterized.parameters(1, 3, 4) - def test_input_specs(self, input_dim): - """Test different input feature dimensions.""" - tf.keras.backend.set_image_data_format('channels_last') - - input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim]) - network = revnet.RevNet(model_id=56, input_specs=input_specs) - - inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1) - _ = network(inputs) - - def test_serialize_deserialize(self): - # Create a network object that sets all of its config options. - kwargs = dict( - model_id=56, - activation='relu', - use_sync_bn=False, - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - ) - network = revnet.RevNet(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = revnet.RevNet.from_config(network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(network.get_config(), new_network.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/backbones/spinenet.py b/official/vision/beta/modeling/backbones/spinenet.py deleted file mode 100644 index c381edde7..000000000 --- a/official/vision/beta/modeling/backbones/spinenet.py +++ /dev/null @@ -1,572 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Contains definitions of SpineNet Networks.""" - -import math -from typing import Any, List, Optional, Tuple - -# Import libraries - -from absl import logging -import tensorflow as tf - -from official.modeling import hyperparams -from official.modeling import tf_utils -from official.vision.beta.modeling.backbones import factory -from official.vision.beta.modeling.layers import nn_blocks -from official.vision.beta.modeling.layers import nn_layers -from official.vision.beta.ops import spatial_transform_ops - -layers = tf.keras.layers - -FILTER_SIZE_MAP = { - 1: 32, - 2: 64, - 3: 128, - 4: 256, - 5: 256, - 6: 256, - 7: 256, -} - -# The fixed SpineNet architecture discovered by NAS. -# Each element represents a specification of a building block: -# (block_level, block_fn, (input_offset0, input_offset1), is_output). -SPINENET_BLOCK_SPECS = [ - (2, 'bottleneck', (0, 1), False), - (4, 'residual', (0, 1), False), - (3, 'bottleneck', (2, 3), False), - (4, 'bottleneck', (2, 4), False), - (6, 'residual', (3, 5), False), - (4, 'bottleneck', (3, 5), False), - (5, 'residual', (6, 7), False), - (7, 'residual', (6, 8), False), - (5, 'bottleneck', (8, 9), False), - (5, 'bottleneck', (8, 10), False), - (4, 'bottleneck', (5, 10), True), - (3, 'bottleneck', (4, 10), True), - (5, 'bottleneck', (7, 12), True), - (7, 'bottleneck', (5, 14), True), - (6, 'bottleneck', (12, 14), True), - (2, 'bottleneck', (2, 13), True), -] - -SCALING_MAP = { - '49S': { - 'endpoints_num_filters': 128, - 'filter_size_scale': 0.65, - 'resample_alpha': 0.5, - 'block_repeats': 1, - }, - '49': { - 'endpoints_num_filters': 256, - 'filter_size_scale': 1.0, - 'resample_alpha': 0.5, - 'block_repeats': 1, - }, - '96': { - 'endpoints_num_filters': 256, - 'filter_size_scale': 1.0, - 'resample_alpha': 0.5, - 'block_repeats': 2, - }, - '143': { - 'endpoints_num_filters': 256, - 'filter_size_scale': 1.0, - 'resample_alpha': 1.0, - 'block_repeats': 3, - }, - # SpineNet-143 with 1.3x filter_size_scale. - '143L': { - 'endpoints_num_filters': 256, - 'filter_size_scale': 1.3, - 'resample_alpha': 1.0, - 'block_repeats': 3, - }, - '190': { - 'endpoints_num_filters': 512, - 'filter_size_scale': 1.3, - 'resample_alpha': 1.0, - 'block_repeats': 4, - }, -} - - -class BlockSpec(object): - """A container class that specifies the block configuration for SpineNet.""" - - def __init__(self, level: int, block_fn: str, input_offsets: Tuple[int, int], - is_output: bool): - self.level = level - self.block_fn = block_fn - self.input_offsets = input_offsets - self.is_output = is_output - - -def build_block_specs( - block_specs: Optional[List[Tuple[Any, ...]]] = None) -> List[BlockSpec]: - """Builds the list of BlockSpec objects for SpineNet.""" - if not block_specs: - block_specs = SPINENET_BLOCK_SPECS - logging.info('Building SpineNet block specs: %s', block_specs) - return [BlockSpec(*b) for b in block_specs] - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class SpineNet(tf.keras.Model): - """Creates a SpineNet family model. - - This implements: - Xianzhi Du, Tsung-Yi Lin, Pengchong Jin, Golnaz Ghiasi, Mingxing Tan, - Yin Cui, Quoc V. Le, Xiaodan Song. - SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization. 
- (https://arxiv.org/abs/1912.05027) - """ - - def __init__( - self, - input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec( - shape=[None, None, None, 3]), - min_level: int = 3, - max_level: int = 7, - block_specs: List[BlockSpec] = build_block_specs(), - endpoints_num_filters: int = 256, - resample_alpha: float = 0.5, - block_repeats: int = 1, - filter_size_scale: float = 1.0, - init_stochastic_depth_rate: float = 0.0, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - **kwargs): - """Initializes a SpineNet model. - - Args: - input_specs: A `tf.keras.layers.InputSpec` of the input tensor. - min_level: An `int` of min level for output mutiscale features. - max_level: An `int` of max level for output mutiscale features. - block_specs: A list of block specifications for the SpineNet model - discovered by NAS. - endpoints_num_filters: An `int` of feature dimension for the output - endpoints. - resample_alpha: A `float` of resampling factor in cross-scale connections. - block_repeats: An `int` of number of blocks contained in the layer. - filter_size_scale: A `float` of multiplier for the filters (number of - channels) for all convolution ops. The value must be greater than zero. - Typical usage will be to set this value in (0, 1) to reduce the number - of parameters or computation cost of the model. - init_stochastic_depth_rate: A `float` of initial stochastic depth rate. - kernel_initializer: A str for kernel initializer of convolutional layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - Default to None. - activation: A `str` name of the activation function. - use_sync_bn: If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A small `float` added to variance to avoid dividing by zero. - **kwargs: Additional keyword arguments to be passed. - """ - self._input_specs = input_specs - self._min_level = min_level - self._max_level = max_level - self._block_specs = block_specs - self._endpoints_num_filters = endpoints_num_filters - self._resample_alpha = resample_alpha - self._block_repeats = block_repeats - self._filter_size_scale = filter_size_scale - self._init_stochastic_depth_rate = init_stochastic_depth_rate - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._activation = activation - self._use_sync_bn = use_sync_bn - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - if activation == 'relu': - self._activation_fn = tf.nn.relu - elif activation == 'swish': - self._activation_fn = tf.nn.swish - else: - raise ValueError('Activation {} not implemented.'.format(activation)) - self._init_block_fn = 'bottleneck' - self._num_init_blocks = 2 - - if use_sync_bn: - self._norm = layers.experimental.SyncBatchNormalization - else: - self._norm = layers.BatchNormalization - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - - # Build SpineNet. 
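    # Worked example of the width fallback below (sketch only): with the
    # default SPINENET_BLOCK_SPECS the highest block level is 7, so an input
    # spec with unknown width falls back to input_width = 2**7 = 128.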
- inputs = tf.keras.Input(shape=input_specs.shape[1:]) - - net = self._build_stem(inputs=inputs) - input_width = input_specs.shape[2] - if input_width is None: - max_stride = max(map(lambda b: b.level, block_specs)) - input_width = 2 ** max_stride - net = self._build_scale_permuted_network(net=net, input_width=input_width) - endpoints = self._build_endpoints(net=net) - - self._output_specs = {l: endpoints[l].get_shape() for l in endpoints} - super(SpineNet, self).__init__(inputs=inputs, outputs=endpoints) - - def _block_group(self, - inputs: tf.Tensor, - filters: int, - strides: int, - block_fn_cand: str, - block_repeats: int = 1, - stochastic_depth_drop_rate: Optional[float] = None, - name: str = 'block_group'): - """Creates one group of blocks for the SpineNet model.""" - block_fn_candidates = { - 'bottleneck': nn_blocks.BottleneckBlock, - 'residual': nn_blocks.ResidualBlock, - } - block_fn = block_fn_candidates[block_fn_cand] - _, _, _, num_filters = inputs.get_shape().as_list() - - if block_fn_cand == 'bottleneck': - use_projection = not (num_filters == (filters * 4) and strides == 1) - else: - use_projection = not (num_filters == filters and strides == 1) - - x = block_fn( - filters=filters, - strides=strides, - use_projection=use_projection, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - inputs) - for _ in range(1, block_repeats): - x = block_fn( - filters=filters, - strides=1, - use_projection=False, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - x) - return tf.identity(x, name=name) - - def _build_stem(self, inputs): - """Builds SpineNet stem.""" - x = layers.Conv2D( - filters=64, - kernel_size=7, - strides=2, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon)( - x) - x = tf_utils.get_activation(self._activation_fn)(x) - x = layers.MaxPool2D(pool_size=3, strides=2, padding='same')(x) - - net = [] - # Build the initial level 2 blocks. - for i in range(self._num_init_blocks): - x = self._block_group( - inputs=x, - filters=int(FILTER_SIZE_MAP[2] * self._filter_size_scale), - strides=1, - block_fn_cand=self._init_block_fn, - block_repeats=self._block_repeats, - name='stem_block_{}'.format(i + 1)) - net.append(x) - return net - - def _build_scale_permuted_network(self, - net, - input_width, - weighted_fusion=False): - """Builds scale-permuted network.""" - net_sizes = [int(math.ceil(input_width / 2**2))] * len(net) - net_block_fns = [self._init_block_fn] * len(net) - num_outgoing_connections = [0] * len(net) - - endpoints = {} - for i, block_spec in enumerate(self._block_specs): - # Find out specs for the target block. 
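      # For example (illustration only): input_width=128 and block_spec.level=4
      # give target_width = ceil(128 / 2**4) = 8, and with filter_size_scale=1.0
      # target_num_filters = FILTER_SIZE_MAP[4] * 1.0 = 256.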
- target_width = int(math.ceil(input_width / 2**block_spec.level)) - target_num_filters = int(FILTER_SIZE_MAP[block_spec.level] * - self._filter_size_scale) - target_block_fn = block_spec.block_fn - - # Resample then merge input0 and input1. - parents = [] - input0 = block_spec.input_offsets[0] - input1 = block_spec.input_offsets[1] - - x0 = self._resample_with_alpha( - inputs=net[input0], - input_width=net_sizes[input0], - input_block_fn=net_block_fns[input0], - target_width=target_width, - target_num_filters=target_num_filters, - target_block_fn=target_block_fn, - alpha=self._resample_alpha) - parents.append(x0) - num_outgoing_connections[input0] += 1 - - x1 = self._resample_with_alpha( - inputs=net[input1], - input_width=net_sizes[input1], - input_block_fn=net_block_fns[input1], - target_width=target_width, - target_num_filters=target_num_filters, - target_block_fn=target_block_fn, - alpha=self._resample_alpha) - parents.append(x1) - num_outgoing_connections[input1] += 1 - - # Merge 0 outdegree blocks to the output block. - if block_spec.is_output: - for j, (j_feat, - j_connections) in enumerate(zip(net, num_outgoing_connections)): - if j_connections == 0 and (j_feat.shape[2] == target_width and - j_feat.shape[3] == x0.shape[3]): - parents.append(j_feat) - num_outgoing_connections[j] += 1 - - # pylint: disable=g-direct-tensorflow-import - if weighted_fusion: - dtype = parents[0].dtype - parent_weights = [ - tf.nn.relu(tf.cast(tf.Variable(1.0, name='block{}_fusion{}'.format( - i, j)), dtype=dtype)) for j in range(len(parents))] - weights_sum = tf.add_n(parent_weights) - parents = [ - parents[i] * parent_weights[i] / (weights_sum + 0.0001) - for i in range(len(parents)) - ] - - # Fuse all parent nodes then build a new block. - x = tf_utils.get_activation(self._activation_fn)(tf.add_n(parents)) - x = self._block_group( - inputs=x, - filters=target_num_filters, - strides=1, - block_fn_cand=target_block_fn, - block_repeats=self._block_repeats, - stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate( - self._init_stochastic_depth_rate, i + 1, len(self._block_specs)), - name='scale_permuted_block_{}'.format(i + 1)) - - net.append(x) - net_sizes.append(target_width) - net_block_fns.append(target_block_fn) - num_outgoing_connections.append(0) - - # Save output feats. 
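      # For example (with the default block specs and min_level=3): the final
      # level-2 output block is still saved to `endpoints` but triggers the
      # out-of-range warning below and is not consumed by later processing.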
- if block_spec.is_output: - if block_spec.level in endpoints: - raise ValueError('Duplicate feats found for output level {}.'.format( - block_spec.level)) - if (block_spec.level < self._min_level or - block_spec.level > self._max_level): - logging.warning( - 'SpineNet output level %s out of range [min_level, max_level] = ' - '[%s, %s] will not be used for further processing.', - block_spec.level, self._min_level, self._max_level) - endpoints[str(block_spec.level)] = x - - return endpoints - - def _build_endpoints(self, net): - """Matches filter size for endpoints before sharing conv layers.""" - endpoints = {} - for level in range(self._min_level, self._max_level + 1): - x = layers.Conv2D( - filters=self._endpoints_num_filters, - kernel_size=1, - strides=1, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - net[str(level)]) - x = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon)( - x) - x = tf_utils.get_activation(self._activation_fn)(x) - endpoints[str(level)] = x - return endpoints - - def _resample_with_alpha(self, - inputs, - input_width, - input_block_fn, - target_width, - target_num_filters, - target_block_fn, - alpha=0.5): - """Matches resolution and feature dimension.""" - _, _, _, input_num_filters = inputs.get_shape().as_list() - if input_block_fn == 'bottleneck': - input_num_filters /= 4 - new_num_filters = int(input_num_filters * alpha) - - x = layers.Conv2D( - filters=new_num_filters, - kernel_size=1, - strides=1, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon)( - x) - x = tf_utils.get_activation(self._activation_fn)(x) - - # Spatial resampling. - if input_width > target_width: - x = layers.Conv2D( - filters=new_num_filters, - kernel_size=3, - strides=2, - padding='SAME', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon)( - x) - x = tf_utils.get_activation(self._activation_fn)(x) - input_width /= 2 - while input_width > target_width: - x = layers.MaxPool2D(pool_size=3, strides=2, padding='SAME')(x) - input_width /= 2 - elif input_width < target_width: - scale = target_width // input_width - x = spatial_transform_ops.nearest_upsampling(x, scale=scale) - - # Last 1x1 conv to match filter size. 
- if target_block_fn == 'bottleneck': - target_num_filters *= 4 - x = layers.Conv2D( - filters=target_num_filters, - kernel_size=1, - strides=1, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon)( - x) - return x - - def get_config(self): - config_dict = { - 'min_level': self._min_level, - 'max_level': self._max_level, - 'endpoints_num_filters': self._endpoints_num_filters, - 'resample_alpha': self._resample_alpha, - 'block_repeats': self._block_repeats, - 'filter_size_scale': self._filter_size_scale, - 'init_stochastic_depth_rate': self._init_stochastic_depth_rate, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon - } - return config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self): - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@factory.register_backbone_builder('spinenet') -def build_spinenet( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model: - """Builds SpineNet backbone from a config.""" - backbone_type = backbone_config.type - backbone_cfg = backbone_config.get() - assert backbone_type == 'spinenet', (f'Inconsistent backbone type ' - f'{backbone_type}') - - model_id = backbone_cfg.model_id - if model_id not in SCALING_MAP: - raise ValueError( - 'SpineNet-{} is not a valid architecture.'.format(model_id)) - scaling_params = SCALING_MAP[model_id] - - return SpineNet( - input_specs=input_specs, - min_level=backbone_cfg.min_level, - max_level=backbone_cfg.max_level, - endpoints_num_filters=scaling_params['endpoints_num_filters'], - resample_alpha=scaling_params['resample_alpha'], - block_repeats=scaling_params['block_repeats'], - filter_size_scale=scaling_params['filter_size_scale'], - init_stochastic_depth_rate=backbone_cfg.stochastic_depth_drop_rate, - kernel_regularizer=l2_regularizer, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon) diff --git a/official/vision/beta/modeling/backbones/spinenet_mobile.py b/official/vision/beta/modeling/backbones/spinenet_mobile.py deleted file mode 100644 index 2ef601899..000000000 --- a/official/vision/beta/modeling/backbones/spinenet_mobile.py +++ /dev/null @@ -1,538 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Contains definitions of Mobile SpineNet Networks.""" -import math -from typing import Any, List, Optional, Tuple - -# Import libraries - -from absl import logging -import tensorflow as tf - -from official.modeling import hyperparams -from official.modeling import tf_utils -from official.vision.beta.modeling.backbones import factory -from official.vision.beta.modeling.layers import nn_blocks -from official.vision.beta.modeling.layers import nn_layers -from official.vision.beta.ops import spatial_transform_ops - -layers = tf.keras.layers - -FILTER_SIZE_MAP = { - 0: 8, - 1: 16, - 2: 24, - 3: 40, - 4: 80, - 5: 112, - 6: 112, - 7: 112, -} - -# The fixed SpineNet architecture discovered by NAS. -# Each element represents a specification of a building block: -# (block_level, block_fn, (input_offset0, input_offset1), is_output). -SPINENET_BLOCK_SPECS = [ - (2, 'mbconv', (0, 1), False), - (2, 'mbconv', (1, 2), False), - (4, 'mbconv', (1, 2), False), - (3, 'mbconv', (3, 4), False), - (4, 'mbconv', (3, 5), False), - (6, 'mbconv', (4, 6), False), - (4, 'mbconv', (4, 6), False), - (5, 'mbconv', (7, 8), False), - (7, 'mbconv', (7, 9), False), - (5, 'mbconv', (9, 10), False), - (5, 'mbconv', (9, 11), False), - (4, 'mbconv', (6, 11), True), - (3, 'mbconv', (5, 11), True), - (5, 'mbconv', (8, 13), True), - (7, 'mbconv', (6, 15), True), - (6, 'mbconv', (13, 15), True), -] - -SCALING_MAP = { - '49': { - 'endpoints_num_filters': 48, - 'filter_size_scale': 1.0, - 'block_repeats': 1, - }, - '49S': { - 'endpoints_num_filters': 40, - 'filter_size_scale': 0.65, - 'block_repeats': 1, - }, - '49XS': { - 'endpoints_num_filters': 24, - 'filter_size_scale': 0.6, - 'block_repeats': 1, - }, -} - - -class BlockSpec(object): - """A container class that specifies the block configuration for SpineNet.""" - - def __init__(self, level: int, block_fn: str, input_offsets: Tuple[int, int], - is_output: bool): - self.level = level - self.block_fn = block_fn - self.input_offsets = input_offsets - self.is_output = is_output - - -def build_block_specs( - block_specs: Optional[List[Tuple[Any, ...]]] = None) -> List[BlockSpec]: - """Builds the list of BlockSpec objects for SpineNet.""" - if not block_specs: - block_specs = SPINENET_BLOCK_SPECS - logging.info('Building SpineNet block specs: %s', block_specs) - return [BlockSpec(*b) for b in block_specs] - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class SpineNetMobile(tf.keras.Model): - """Creates a Mobile SpineNet family model. - - This implements: - [1] Xianzhi Du, Tsung-Yi Lin, Pengchong Jin, Golnaz Ghiasi, Mingxing Tan, - Yin Cui, Quoc V. Le, Xiaodan Song. - SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization. 
- (https://arxiv.org/abs/1912.05027). - [2] Xianzhi Du, Tsung-Yi Lin, Pengchong Jin, Yin Cui, Mingxing Tan, - Quoc Le, Xiaodan Song. - Efficient Scale-Permuted Backbone with Learned Resource Distribution. - (https://arxiv.org/abs/2010.11426). - """ - - def __init__( - self, - input_specs: tf.keras.layers.InputSpec = tf.keras.layers.InputSpec( - shape=[None, None, None, 3]), - min_level: int = 3, - max_level: int = 7, - block_specs: List[BlockSpec] = build_block_specs(), - endpoints_num_filters: int = 256, - se_ratio: float = 0.2, - block_repeats: int = 1, - filter_size_scale: float = 1.0, - expand_ratio: int = 6, - init_stochastic_depth_rate=0.0, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - use_keras_upsampling_2d: bool = False, - **kwargs): - """Initializes a Mobile SpineNet model. - - Args: - input_specs: A `tf.keras.layers.InputSpec` of the input tensor. - min_level: An `int` of min level for output mutiscale features. - max_level: An `int` of max level for output mutiscale features. - block_specs: The block specifications for the SpineNet model discovered by - NAS. - endpoints_num_filters: An `int` of feature dimension for the output - endpoints. - se_ratio: A `float` of Squeeze-and-Excitation ratio. - block_repeats: An `int` of number of blocks contained in the layer. - filter_size_scale: A `float` of multiplier for the filters (number of - channels) for all convolution ops. The value must be greater than zero. - Typical usage will be to set this value in (0, 1) to reduce the number - of parameters or computation cost of the model. - expand_ratio: An `integer` of expansion ratios for inverted bottleneck - blocks. - init_stochastic_depth_rate: A `float` of initial stochastic depth rate. - kernel_initializer: A str for kernel initializer of convolutional layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - Default to None. - activation: A `str` name of the activation function. - use_sync_bn: If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A small `float` added to variance to avoid dividing by zero. - use_keras_upsampling_2d: If True, use keras UpSampling2D layer. - **kwargs: Additional keyword arguments to be passed. 
- """ - self._input_specs = input_specs - self._min_level = min_level - self._max_level = max_level - self._block_specs = block_specs - self._endpoints_num_filters = endpoints_num_filters - self._se_ratio = se_ratio - self._block_repeats = block_repeats - self._filter_size_scale = filter_size_scale - self._expand_ratio = expand_ratio - self._init_stochastic_depth_rate = init_stochastic_depth_rate - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._activation = activation - self._use_sync_bn = use_sync_bn - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._use_keras_upsampling_2d = use_keras_upsampling_2d - self._num_init_blocks = 2 - - if use_sync_bn: - self._norm = layers.experimental.SyncBatchNormalization - else: - self._norm = layers.BatchNormalization - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - - # Build SpineNet. - inputs = tf.keras.Input(shape=input_specs.shape[1:]) - - net = self._build_stem(inputs=inputs) - input_width = input_specs.shape[2] - if input_width is None: - max_stride = max(map(lambda b: b.level, block_specs)) - input_width = 2 ** max_stride - net = self._build_scale_permuted_network(net=net, input_width=input_width) - endpoints = self._build_endpoints(net=net) - - self._output_specs = {l: endpoints[l].get_shape() for l in endpoints} - super().__init__(inputs=inputs, outputs=endpoints) - - def _block_group(self, - inputs: tf.Tensor, - in_filters: int, - out_filters: int, - strides: int, - expand_ratio: int = 6, - block_repeats: int = 1, - se_ratio: float = 0.2, - stochastic_depth_drop_rate: Optional[float] = None, - name: str = 'block_group'): - """Creates one group of blocks for the SpineNet model.""" - x = nn_blocks.InvertedBottleneckBlock( - in_filters=in_filters, - out_filters=out_filters, - strides=strides, - se_ratio=se_ratio, - expand_ratio=expand_ratio, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - inputs) - for _ in range(1, block_repeats): - x = nn_blocks.InvertedBottleneckBlock( - in_filters=in_filters, - out_filters=out_filters, - strides=1, - se_ratio=se_ratio, - expand_ratio=expand_ratio, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._activation, - use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_momentum, - norm_epsilon=self._norm_epsilon)( - inputs) - return tf.keras.layers.Activation('linear', name=name)(x) - - def _build_stem(self, inputs): - """Builds SpineNet stem.""" - x = layers.Conv2D( - filters=int(FILTER_SIZE_MAP[0] * self._filter_size_scale), - kernel_size=3, - strides=2, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - inputs) - x = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon)( - x) - x = tf_utils.get_activation(self._activation, use_keras_layer=True)(x) - - net = [] - stem_strides = [1, 2] - # Build the initial level 2 
blocks. - for i in range(self._num_init_blocks): - x = self._block_group( - inputs=x, - in_filters=int(FILTER_SIZE_MAP[i] * self._filter_size_scale), - out_filters=int(FILTER_SIZE_MAP[i + 1] * self._filter_size_scale), - expand_ratio=self._expand_ratio, - strides=stem_strides[i], - se_ratio=self._se_ratio, - block_repeats=self._block_repeats, - name='stem_block_{}'.format(i + 1)) - net.append(x) - return net - - def _build_scale_permuted_network(self, - net, - input_width, - weighted_fusion=False): - """Builds scale-permuted network.""" - net_sizes = [ - int(math.ceil(input_width / 2)), - int(math.ceil(input_width / 2**2)) - ] - num_outgoing_connections = [0] * len(net) - - endpoints = {} - for i, block_spec in enumerate(self._block_specs): - # Update block level if it is larger than max_level to avoid building - # blocks smaller than requested. - block_spec.level = min(block_spec.level, self._max_level) - # Find out specs for the target block. - target_width = int(math.ceil(input_width / 2**block_spec.level)) - target_num_filters = int(FILTER_SIZE_MAP[block_spec.level] * - self._filter_size_scale) - - # Resample then merge input0 and input1. - parents = [] - input0 = block_spec.input_offsets[0] - input1 = block_spec.input_offsets[1] - - x0 = self._resample_with_sepconv( - inputs=net[input0], - input_width=net_sizes[input0], - target_width=target_width, - target_num_filters=target_num_filters) - parents.append(x0) - num_outgoing_connections[input0] += 1 - - x1 = self._resample_with_sepconv( - inputs=net[input1], - input_width=net_sizes[input1], - target_width=target_width, - target_num_filters=target_num_filters) - parents.append(x1) - num_outgoing_connections[input1] += 1 - - # Merge 0 outdegree blocks to the output block. - if block_spec.is_output: - for j, (j_feat, - j_connections) in enumerate(zip(net, num_outgoing_connections)): - if j_connections == 0 and (j_feat.shape[2] == target_width and - j_feat.shape[3] == x0.shape[3]): - parents.append(j_feat) - num_outgoing_connections[j] += 1 - - # pylint: disable=g-direct-tensorflow-import - if weighted_fusion: - dtype = parents[0].dtype - parent_weights = [ - tf.nn.relu(tf.cast(tf.Variable(1.0, name='block{}_fusion{}'.format( - i, j)), dtype=dtype)) for j in range(len(parents))] - weights_sum = layers.Add()(parent_weights) - parents = [ - parents[i] * parent_weights[i] / (weights_sum + 0.0001) - for i in range(len(parents)) - ] - - # Fuse all parent nodes then build a new block. - x = tf_utils.get_activation( - self._activation, use_keras_layer=True)(layers.Add()(parents)) - x = self._block_group( - inputs=x, - in_filters=target_num_filters, - out_filters=target_num_filters, - strides=1, - se_ratio=self._se_ratio, - expand_ratio=self._expand_ratio, - block_repeats=self._block_repeats, - stochastic_depth_drop_rate=nn_layers.get_stochastic_depth_rate( - self._init_stochastic_depth_rate, i + 1, len(self._block_specs)), - name='scale_permuted_block_{}'.format(i + 1)) - - net.append(x) - net_sizes.append(target_width) - num_outgoing_connections.append(0) - - # Save output feats. 
- if block_spec.is_output: - if block_spec.level in endpoints: - raise ValueError('Duplicate feats found for output level {}.'.format( - block_spec.level)) - if (block_spec.level < self._min_level or - block_spec.level > self._max_level): - logging.warning( - 'SpineNet output level out of range [min_level, max_levle] = [%s, %s] will not be used for further processing.', - self._min_level, self._max_level) - endpoints[str(block_spec.level)] = x - - return endpoints - - def _build_endpoints(self, net): - """Matches filter size for endpoints before sharing conv layers.""" - endpoints = {} - for level in range(self._min_level, self._max_level + 1): - x = layers.Conv2D( - filters=self._endpoints_num_filters, - kernel_size=1, - strides=1, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - net[str(level)]) - x = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon)( - x) - x = tf_utils.get_activation(self._activation, use_keras_layer=True)(x) - endpoints[str(level)] = x - return endpoints - - def _resample_with_sepconv(self, inputs, input_width, target_width, - target_num_filters): - """Matches resolution and feature dimension.""" - x = inputs - # Spatial resampling. - if input_width > target_width: - while input_width > target_width: - x = layers.DepthwiseConv2D( - kernel_size=3, - strides=2, - padding='SAME', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon)( - x) - x = tf_utils.get_activation( - self._activation, use_keras_layer=True)(x) - input_width /= 2 - elif input_width < target_width: - scale = target_width // input_width - x = spatial_transform_ops.nearest_upsampling( - x, scale=scale, use_keras_layer=self._use_keras_upsampling_2d) - - # Last 1x1 conv to match filter size. 
- x = layers.Conv2D( - filters=target_num_filters, - kernel_size=1, - strides=1, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - x = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon)( - x) - return x - - def get_config(self): - config_dict = { - 'min_level': self._min_level, - 'max_level': self._max_level, - 'endpoints_num_filters': self._endpoints_num_filters, - 'se_ratio': self._se_ratio, - 'expand_ratio': self._expand_ratio, - 'block_repeats': self._block_repeats, - 'filter_size_scale': self._filter_size_scale, - 'init_stochastic_depth_rate': self._init_stochastic_depth_rate, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'use_keras_upsampling_2d': self._use_keras_upsampling_2d, - } - return config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self): - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@factory.register_backbone_builder('spinenet_mobile') -def build_spinenet_mobile( - input_specs: tf.keras.layers.InputSpec, - backbone_config: hyperparams.Config, - norm_activation_config: hyperparams.Config, - l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model: - """Builds Mobile SpineNet backbone from a config.""" - backbone_type = backbone_config.type - backbone_cfg = backbone_config.get() - assert backbone_type == 'spinenet_mobile', (f'Inconsistent backbone type ' - f'{backbone_type}') - - model_id = backbone_cfg.model_id - if model_id not in SCALING_MAP: - raise ValueError( - 'Mobile SpineNet-{} is not a valid architecture.'.format(model_id)) - scaling_params = SCALING_MAP[model_id] - - return SpineNetMobile( - input_specs=input_specs, - min_level=backbone_cfg.min_level, - max_level=backbone_cfg.max_level, - endpoints_num_filters=scaling_params['endpoints_num_filters'], - block_repeats=scaling_params['block_repeats'], - filter_size_scale=scaling_params['filter_size_scale'], - se_ratio=backbone_cfg.se_ratio, - expand_ratio=backbone_cfg.expand_ratio, - init_stochastic_depth_rate=backbone_cfg.stochastic_depth_drop_rate, - kernel_regularizer=l2_regularizer, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - use_keras_upsampling_2d=backbone_cfg.use_keras_upsampling_2d) diff --git a/official/vision/beta/modeling/backbones/spinenet_mobile_test.py b/official/vision/beta/modeling/backbones/spinenet_mobile_test.py deleted file mode 100644 index bc2b147e4..000000000 --- a/official/vision/beta/modeling/backbones/spinenet_mobile_test.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for SpineNet.""" -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import spinenet_mobile - - -class SpineNetMobileTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (128, 0.6, 1, 0.0, 24), - (128, 0.65, 1, 0.2, 40), - (256, 1.0, 1, 0.2, 48), - ) - def test_network_creation(self, input_size, filter_size_scale, block_repeats, - se_ratio, endpoints_num_filters): - """Test creation of SpineNet models.""" - min_level = 3 - max_level = 7 - - tf.keras.backend.set_image_data_format('channels_last') - - input_specs = tf.keras.layers.InputSpec( - shape=[None, input_size, input_size, 3]) - model = spinenet_mobile.SpineNetMobile( - input_specs=input_specs, - min_level=min_level, - max_level=max_level, - endpoints_num_filters=endpoints_num_filters, - resample_alpha=se_ratio, - block_repeats=block_repeats, - filter_size_scale=filter_size_scale, - init_stochastic_depth_rate=0.2, - ) - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = model(inputs) - - for l in range(min_level, max_level + 1): - self.assertIn(str(l), endpoints.keys()) - self.assertAllEqual( - [1, input_size / 2**l, input_size / 2**l, endpoints_num_filters], - endpoints[str(l)].shape.as_list()) - - def test_serialize_deserialize(self): - # Create a network object that sets all of its config options. - kwargs = dict( - min_level=3, - max_level=7, - endpoints_num_filters=256, - se_ratio=0.2, - expand_ratio=6, - block_repeats=1, - filter_size_scale=1.0, - init_stochastic_depth_rate=0.2, - use_sync_bn=False, - activation='relu', - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - use_keras_upsampling_2d=False, - ) - network = spinenet_mobile.SpineNetMobile(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = spinenet_mobile.SpineNetMobile.from_config( - network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. 
- self.assertAllEqual(network.get_config(), new_network.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/backbones/spinenet_test.py b/official/vision/beta/modeling/backbones/spinenet_test.py deleted file mode 100644 index d1c3f32e5..000000000 --- a/official/vision/beta/modeling/backbones/spinenet_test.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for SpineNet.""" -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import spinenet - - -class SpineNetTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (128, 0.65, 1, 0.5, 128, 4, 6), - (256, 1.0, 1, 0.5, 256, 3, 6), - (384, 1.0, 2, 0.5, 256, 4, 7), - (512, 1.0, 3, 1.0, 256, 3, 7), - (640, 1.3, 4, 1.0, 384, 3, 7), - ) - def test_network_creation(self, input_size, filter_size_scale, block_repeats, - resample_alpha, endpoints_num_filters, min_level, - max_level): - """Test creation of SpineNet models.""" - - tf.keras.backend.set_image_data_format('channels_last') - - input_specs = tf.keras.layers.InputSpec( - shape=[None, input_size, input_size, 3]) - model = spinenet.SpineNet( - input_specs=input_specs, - min_level=min_level, - max_level=max_level, - endpoints_num_filters=endpoints_num_filters, - resample_alpha=resample_alpha, - block_repeats=block_repeats, - filter_size_scale=filter_size_scale, - init_stochastic_depth_rate=0.2, - ) - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - endpoints = model(inputs) - - for l in range(min_level, max_level + 1): - self.assertIn(str(l), endpoints.keys()) - self.assertAllEqual( - [1, input_size / 2**l, input_size / 2**l, endpoints_num_filters], - endpoints[str(l)].shape.as_list()) - - @parameterized.parameters( - ((128, 128), (128, 128)), - ((128, 128), (256, 256)), - ((640, 640), (896, 1664)), - ) - def test_load_from_different_input_specs(self, input_size_1, input_size_2): - """Test loading checkpoints with different input size.""" - - def build_spinenet(input_size): - tf.keras.backend.set_image_data_format('channels_last') - input_specs = tf.keras.layers.InputSpec( - shape=[None, input_size[0], input_size[1], 3]) - model = spinenet.SpineNet( - input_specs=input_specs, - min_level=3, - max_level=7, - endpoints_num_filters=384, - resample_alpha=1.0, - block_repeats=2, - filter_size_scale=0.5) - return model - - model_1 = build_spinenet(input_size_1) - model_2 = build_spinenet(input_size_2) - - ckpt_1 = tf.train.Checkpoint(backbone=model_1) - ckpt_2 = tf.train.Checkpoint(backbone=model_2) - - ckpt_path = self.get_temp_dir() + '/ckpt' - ckpt_1.write(ckpt_path) - ckpt_2.restore(ckpt_path).expect_partial() - - def test_serialize_deserialize(self): - # Create a network object that sets all of its config options. 
- kwargs = dict( - min_level=3, - max_level=7, - endpoints_num_filters=256, - resample_alpha=0.5, - block_repeats=1, - filter_size_scale=1.0, - init_stochastic_depth_rate=0.2, - use_sync_bn=False, - activation='relu', - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - ) - network = spinenet.SpineNet(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = spinenet.SpineNet.from_config(network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(network.get_config(), new_network.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/classification_model.py b/official/vision/beta/modeling/classification_model.py deleted file mode 100644 index 5a97342ea..000000000 --- a/official/vision/beta/modeling/classification_model.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Build classification models.""" - -from typing import Any, Mapping, Optional -# Import libraries -import tensorflow as tf - -layers = tf.keras.layers - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class ClassificationModel(tf.keras.Model): - """A classification class builder.""" - - def __init__( - self, - backbone: tf.keras.Model, - num_classes: int, - input_specs: tf.keras.layers.InputSpec = layers.InputSpec( - shape=[None, None, None, 3]), - dropout_rate: float = 0.0, - kernel_initializer: str = 'random_uniform', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - add_head_batch_norm: bool = False, - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - skip_logits_layer: bool = False, - **kwargs): - """Classification initialization function. - - Args: - backbone: a backbone network. - num_classes: `int` number of classes in classification task. - input_specs: `tf.keras.layers.InputSpec` specs of the input tensor. - dropout_rate: `float` rate for dropout regularization. - kernel_initializer: kernel initializer for the dense layer. - kernel_regularizer: tf.keras.regularizers.Regularizer object. Default to - None. - bias_regularizer: tf.keras.regularizers.Regularizer object. Default to - None. - add_head_batch_norm: `bool` whether to add a batch normalization layer - before pool. - use_sync_bn: `bool` if True, use synchronized batch normalization. - norm_momentum: `float` normalization momentum for the moving average. - norm_epsilon: `float` small float added to variance to avoid dividing by - zero. - skip_logits_layer: `bool`, whether to skip the prediction layer. 
- **kwargs: keyword arguments to be passed. - """ - if use_sync_bn: - norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - norm = tf.keras.layers.BatchNormalization - axis = -1 if tf.keras.backend.image_data_format() == 'channels_last' else 1 - - inputs = tf.keras.Input(shape=input_specs.shape[1:], name=input_specs.name) - endpoints = backbone(inputs) - x = endpoints[max(endpoints.keys())] - - if add_head_batch_norm: - x = norm(axis=axis, momentum=norm_momentum, epsilon=norm_epsilon)(x) - x = tf.keras.layers.GlobalAveragePooling2D()(x) - if not skip_logits_layer: - x = tf.keras.layers.Dropout(dropout_rate)(x) - x = tf.keras.layers.Dense( - num_classes, - kernel_initializer=kernel_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer)( - x) - - super(ClassificationModel, self).__init__( - inputs=inputs, outputs=x, **kwargs) - self._config_dict = { - 'backbone': backbone, - 'num_classes': num_classes, - 'input_specs': input_specs, - 'dropout_rate': dropout_rate, - 'kernel_initializer': kernel_initializer, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - 'add_head_batch_norm': add_head_batch_norm, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - } - self._input_specs = input_specs - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._backbone = backbone - self._norm = norm - - @property - def checkpoint_items(self) -> Mapping[str, tf.keras.Model]: - """Returns a dictionary of items to be additionally checkpointed.""" - return dict(backbone=self.backbone) - - @property - def backbone(self) -> tf.keras.Model: - return self._backbone - - def get_config(self) -> Mapping[str, Any]: - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) diff --git a/official/vision/beta/modeling/classification_model_test.py b/official/vision/beta/modeling/classification_model_test.py deleted file mode 100644 index dd9fd9269..000000000 --- a/official/vision/beta/modeling/classification_model_test.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
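For reference, a minimal usage sketch of the `ClassificationModel` removed above, mirroring the deleted `classification_model_test.py` that follows; it assumes a source tree in which `official/vision/beta` is still importable and uses the ResNet-50 backbone the deleted tests use.

```python
# Usage sketch mirroring the deleted classification_model_test.py;
# assumes official/vision/beta is still present in the source tree.
import numpy as np
import tensorflow as tf

from official.vision.beta.modeling import backbones
from official.vision.beta.modeling import classification_model

tf.keras.backend.set_image_data_format('channels_last')

# Any registered backbone works; the deleted tests use ResNet-50.
backbone = backbones.ResNet(model_id=50)
model = classification_model.ClassificationModel(
    backbone=backbone,
    num_classes=1000,
    dropout_rate=0.2)

images = np.random.rand(2, 224, 224, 3)
logits = model(images)  # shape: [2, 1000]
```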
- -"""Tests for classification network.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.vision.beta.modeling import backbones -from official.vision.beta.modeling import classification_model - - -class ClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (128, 50, 'relu'), - (128, 50, 'relu'), - (128, 50, 'swish'), - ) - def test_resnet_network_creation( - self, input_size, resnet_model_id, activation): - """Test for creation of a ResNet-50 classifier.""" - inputs = np.random.rand(2, input_size, input_size, 3) - - tf.keras.backend.set_image_data_format('channels_last') - - backbone = backbones.ResNet( - model_id=resnet_model_id, activation=activation) - self.assertEqual(backbone.count_params(), 23561152) - - num_classes = 1000 - model = classification_model.ClassificationModel( - backbone=backbone, - num_classes=num_classes, - dropout_rate=0.2, - ) - self.assertEqual(model.count_params(), 25610152) - - logits = model(inputs) - self.assertAllEqual([2, num_classes], logits.numpy().shape) - - def test_revnet_network_creation(self): - """Test for creation of a RevNet-56 classifier.""" - revnet_model_id = 56 - inputs = np.random.rand(2, 224, 224, 3) - - tf.keras.backend.set_image_data_format('channels_last') - - backbone = backbones.RevNet(model_id=revnet_model_id) - self.assertEqual(backbone.count_params(), 19473792) - - num_classes = 1000 - model = classification_model.ClassificationModel( - backbone=backbone, - num_classes=num_classes, - dropout_rate=0.2, - add_head_batch_norm=True, - ) - self.assertEqual(model.count_params(), 22816104) - - logits = model(inputs) - self.assertAllEqual([2, num_classes], logits.numpy().shape) - - @combinations.generate( - combinations.combine( - mobilenet_model_id=[ - 'MobileNetV1', - 'MobileNetV2', - 'MobileNetV3Large', - 'MobileNetV3Small', - 'MobileNetV3EdgeTPU', - 'MobileNetMultiAVG', - 'MobileNetMultiMAX', - ], - filter_size_scale=[1.0, 0.75], - )) - def test_mobilenet_network_creation(self, mobilenet_model_id, - filter_size_scale): - """Test for creation of a MobileNet classifier.""" - inputs = np.random.rand(2, 224, 224, 3) - - tf.keras.backend.set_image_data_format('channels_last') - - backbone = backbones.MobileNet( - model_id=mobilenet_model_id, filter_size_scale=filter_size_scale) - - num_classes = 1001 - model = classification_model.ClassificationModel( - backbone=backbone, - num_classes=num_classes, - dropout_rate=0.2, - ) - - logits = model(inputs) - self.assertAllEqual([2, num_classes], logits.numpy().shape) - - @combinations.generate( - combinations.combine( - strategy=[ - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - use_sync_bn=[False, True], - )) - def test_sync_bn_multiple_devices(self, strategy, use_sync_bn): - """Test for sync bn on TPU and GPU devices.""" - inputs = np.random.rand(64, 128, 128, 3) - - tf.keras.backend.set_image_data_format('channels_last') - - with strategy.scope(): - backbone = backbones.ResNet(model_id=50, use_sync_bn=use_sync_bn) - - model = classification_model.ClassificationModel( - backbone=backbone, - num_classes=1000, - dropout_rate=0.2, - ) - _ = model(inputs) - - @combinations.generate( - combinations.combine( - strategy=[ - strategy_combinations.one_device_strategy_gpu, - ], - data_format=['channels_last', 
'channels_first'], - input_dim=[1, 3, 4])) - def test_data_format_gpu(self, strategy, data_format, input_dim): - """Test for different data formats on GPU devices.""" - if data_format == 'channels_last': - inputs = np.random.rand(2, 128, 128, input_dim) - else: - inputs = np.random.rand(2, input_dim, 128, 128) - input_specs = tf.keras.layers.InputSpec(shape=inputs.shape) - - tf.keras.backend.set_image_data_format(data_format) - - with strategy.scope(): - backbone = backbones.ResNet(model_id=50, input_specs=input_specs) - - model = classification_model.ClassificationModel( - backbone=backbone, - num_classes=1000, - input_specs=input_specs, - ) - _ = model(inputs) - - def test_serialize_deserialize(self): - """Validate the classification net can be serialized and deserialized.""" - - tf.keras.backend.set_image_data_format('channels_last') - backbone = backbones.ResNet(model_id=50) - - model = classification_model.ClassificationModel( - backbone=backbone, num_classes=1000) - - config = model.get_config() - new_model = classification_model.ClassificationModel.from_config(config) - - # Validate that the config can be forced to JSON. - _ = new_model.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(model.get_config(), new_model.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/decoders/__init__.py b/official/vision/beta/modeling/decoders/__init__.py deleted file mode 100644 index 14dd660f0..000000000 --- a/official/vision/beta/modeling/decoders/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoders package definition.""" - -from official.vision.beta.modeling.decoders.aspp import ASPP -from official.vision.beta.modeling.decoders.fpn import FPN -from official.vision.beta.modeling.decoders.nasfpn import NASFPN diff --git a/official/vision/beta/modeling/decoders/aspp.py b/official/vision/beta/modeling/decoders/aspp.py deleted file mode 100644 index a10fe5d97..000000000 --- a/official/vision/beta/modeling/decoders/aspp.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
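The `ASPP` decoder deleted below consumes a backbone's endpoint dictionary and returns a `{level: tensor}` dictionary. A minimal sketch of that flow, mirroring the deleted `aspp_test.py` further down; module paths are those visible in this diff and assume `official/vision/beta` is still importable.

```python
# Usage sketch mirroring the deleted aspp_test.py further below;
# assumes official/vision/beta is still importable.
import tensorflow as tf

from official.vision.beta.modeling.backbones import resnet
from official.vision.beta.modeling.decoders import aspp

inputs = tf.keras.Input(shape=(256, 256, 3), batch_size=1)
backbone = resnet.ResNet(model_id=50)
decoder = aspp.ASPP(level=3, dilation_rates=[6, 12, 18], num_filters=256)

endpoints = backbone(inputs)  # {'2': ..., '3': ..., '4': ..., '5': ...}
feats = decoder(endpoints)    # {'3': tensor of shape [1, 32, 32, 256]}
```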
- -"""Contains definitions of Atrous Spatial Pyramid Pooling (ASPP) decoder.""" -from typing import Any, List, Mapping, Optional, Union - -# Import libraries - -import tensorflow as tf - -from official.modeling import hyperparams -from official.vision.beta.modeling.decoders import factory -from official.vision.beta.modeling.layers import deeplab -from official.vision.beta.modeling.layers import nn_layers - -TensorMapUnion = Union[tf.Tensor, Mapping[str, tf.Tensor]] - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class ASPP(tf.keras.layers.Layer): - """Creates an Atrous Spatial Pyramid Pooling (ASPP) layer.""" - - def __init__( - self, - level: int, - dilation_rates: List[int], - num_filters: int = 256, - pool_kernel_size: Optional[int] = None, - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - activation: str = 'relu', - dropout_rate: float = 0.0, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - interpolation: str = 'bilinear', - use_depthwise_convolution: bool = False, - spp_layer_version: str = 'v1', - output_tensor: bool = False, - **kwargs): - """Initializes an Atrous Spatial Pyramid Pooling (ASPP) layer. - - Args: - level: An `int` level to apply ASPP. - dilation_rates: A `list` of dilation rates. - num_filters: An `int` number of output filters in ASPP. - pool_kernel_size: A `list` of [height, width] of pooling kernel size or - None. Pooling size is with respect to original image size, it will be - scaled down by 2**level. If None, global average pooling is used. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - activation: A `str` activation to be used in ASPP. - dropout_rate: A `float` rate for dropout regularization. - kernel_initializer: A `str` name of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - interpolation: A `str` of interpolation method. It should be one of - `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, `lanczos5`, - `gaussian`, or `mitchellcubic`. - use_depthwise_convolution: If True depthwise separable convolutions will - be added to the Atrous spatial pyramid pooling. - spp_layer_version: A `str` of spatial pyramid pooling layer version. - output_tensor: Whether to output a single tensor or a dictionary of tensor. - Default is false. - **kwargs: Additional keyword arguments to be passed. 
- """ - super().__init__(**kwargs) - self._config_dict = { - 'level': level, - 'dilation_rates': dilation_rates, - 'num_filters': num_filters, - 'pool_kernel_size': pool_kernel_size, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - 'activation': activation, - 'dropout_rate': dropout_rate, - 'kernel_initializer': kernel_initializer, - 'kernel_regularizer': kernel_regularizer, - 'interpolation': interpolation, - 'use_depthwise_convolution': use_depthwise_convolution, - 'spp_layer_version': spp_layer_version, - 'output_tensor': output_tensor - } - self._aspp_layer = deeplab.SpatialPyramidPooling if self._config_dict[ - 'spp_layer_version'] == 'v1' else nn_layers.SpatialPyramidPooling - - def build(self, input_shape): - pool_kernel_size = None - if self._config_dict['pool_kernel_size']: - pool_kernel_size = [ - int(p_size // 2**self._config_dict['level']) - for p_size in self._config_dict['pool_kernel_size'] # pytype: disable=attribute-error # trace-all-classes - ] - - self.aspp = self._aspp_layer( - output_channels=self._config_dict['num_filters'], - dilation_rates=self._config_dict['dilation_rates'], - pool_kernel_size=pool_kernel_size, - use_sync_bn=self._config_dict['use_sync_bn'], - batchnorm_momentum=self._config_dict['norm_momentum'], - batchnorm_epsilon=self._config_dict['norm_epsilon'], - activation=self._config_dict['activation'], - dropout=self._config_dict['dropout_rate'], - kernel_initializer=self._config_dict['kernel_initializer'], - kernel_regularizer=self._config_dict['kernel_regularizer'], - interpolation=self._config_dict['interpolation'], - use_depthwise_convolution=self._config_dict['use_depthwise_convolution'] - ) - - def call(self, inputs: TensorMapUnion) -> TensorMapUnion: - """Calls the Atrous Spatial Pyramid Pooling (ASPP) layer on an input. - - The output of ASPP will be a dict of {`level`, `tf.Tensor`} even if only one - level is present, if output_tensor is false. Hence, this will be compatible - with the rest of the segmentation model interfaces. - If output_tensor is true, a single tensot is output. - - Args: - inputs: A `tf.Tensor` of shape [batch, height_l, width_l, filter_size] or - a `dict` of `tf.Tensor` where - - key: A `str` of the level of the multilevel feature maps. - - values: A `tf.Tensor` of shape [batch, height_l, width_l, - filter_size]. - - Returns: - A `tf.Tensor` of shape [batch, height_l, width_l, filter_size] or a `dict` - of `tf.Tensor` where - - key: A `str` of the level of the multilevel feature maps. - - values: A `tf.Tensor` of output of ASPP module. - """ - outputs = {} - level = str(self._config_dict['level']) - backbone_output = inputs[level] if isinstance(inputs, dict) else inputs - outputs = self.aspp(backbone_output) - return outputs if self._config_dict['output_tensor'] else {level: outputs} - - def get_config(self) -> Mapping[str, Any]: - base_config = super().get_config() - return dict(list(base_config.items()) + list(self._config_dict.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - -@factory.register_decoder_builder('aspp') -def build_aspp_decoder( - input_specs: Mapping[str, tf.TensorShape], - model_config: hyperparams.Config, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None -) -> tf.keras.Model: - """Builds ASPP decoder from a config. - - Args: - input_specs: A `dict` of input specifications. A dictionary consists of - {level: TensorShape} from a backbone. 
Note this is for consistent - interface, and is not used by ASPP decoder. - model_config: A OneOfConfig. Model config. - l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to - None. - - Returns: - A `tf.keras.Model` instance of the ASPP decoder. - - Raises: - ValueError: If the model_config.decoder.type is not `aspp`. - """ - del input_specs # input_specs is not used by ASPP decoder. - decoder_type = model_config.decoder.type - decoder_cfg = model_config.decoder.get() - if decoder_type != 'aspp': - raise ValueError(f'Inconsistent decoder type {decoder_type}. ' - 'Need to be `aspp`.') - - norm_activation_config = model_config.norm_activation - return ASPP( - level=decoder_cfg.level, - dilation_rates=decoder_cfg.dilation_rates, - num_filters=decoder_cfg.num_filters, - use_depthwise_convolution=decoder_cfg.use_depthwise_convolution, - pool_kernel_size=decoder_cfg.pool_kernel_size, - dropout_rate=decoder_cfg.dropout_rate, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - activation=norm_activation_config.activation, - kernel_regularizer=l2_regularizer, - spp_layer_version=decoder_cfg.spp_layer_version, - output_tensor=decoder_cfg.output_tensor) diff --git a/official/vision/beta/modeling/decoders/aspp_test.py b/official/vision/beta/modeling/decoders/aspp_test.py deleted file mode 100644 index a6b2a2478..000000000 --- a/official/vision/beta/modeling/decoders/aspp_test.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for aspp.""" - -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import resnet -from official.vision.beta.modeling.decoders import aspp - - -class ASPPTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (3, [6, 12, 18, 24], 128, 'v1'), - (3, [6, 12, 18], 128, 'v1'), - (3, [6, 12], 256, 'v1'), - (4, [6, 12, 18, 24], 128, 'v2'), - (4, [6, 12, 18], 128, 'v2'), - (4, [6, 12], 256, 'v2'), - ) - def test_network_creation(self, level, dilation_rates, num_filters, - spp_layer_version): - """Test creation of ASPP.""" - - input_size = 256 - tf.keras.backend.set_image_data_format('channels_last') - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - - backbone = resnet.ResNet(model_id=50) - network = aspp.ASPP( - level=level, - dilation_rates=dilation_rates, - num_filters=num_filters, - spp_layer_version=spp_layer_version) - - endpoints = backbone(inputs) - feats = network(endpoints) - - self.assertIn(str(level), feats) - self.assertAllEqual( - [1, input_size // 2**level, input_size // 2**level, num_filters], - feats[str(level)].shape.as_list()) - - def test_serialize_deserialize(self): - # Create a network object that sets all of its config options. 
- kwargs = dict( - level=3, - dilation_rates=[6, 12], - num_filters=256, - pool_kernel_size=None, - use_sync_bn=False, - norm_momentum=0.99, - norm_epsilon=0.001, - activation='relu', - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - interpolation='bilinear', - dropout_rate=0.2, - use_depthwise_convolution='false', - spp_layer_version='v1', - output_tensor=False, - dtype='float32', - name='aspp', - trainable=True) - network = aspp.ASPP(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = aspp.ASPP.from_config(network.get_config()) - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(network.get_config(), new_network.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/decoders/factory.py b/official/vision/beta/modeling/decoders/factory.py deleted file mode 100644 index d1f732b9b..000000000 --- a/official/vision/beta/modeling/decoders/factory.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Decoder registers and factory method. - -One can register a new decoder model by the following two steps: - -1 Import the factory and register the build in the decoder file. -2 Import the decoder class and add a build in __init__.py. - -``` -# my_decoder.py - -from modeling.decoders import factory - -class MyDecoder(): - ... - -@factory.register_decoder_builder('my_decoder') -def build_my_decoder(): - return MyDecoder() - -# decoders/__init__.py adds import -from modeling.decoders.my_decoder import MyDecoder -``` - -If one wants the MyDecoder class to be used only by those binary -then don't imported the decoder module in decoders/__init__.py, but import it -in place that uses it. -""" -from typing import Any, Callable, Mapping, Optional, Union - -# Import libraries - -import tensorflow as tf - -from official.core import registry -from official.modeling import hyperparams - -_REGISTERED_DECODER_CLS = {} - - -def register_decoder_builder(key: str) -> Callable[..., Any]: - """Decorates a builder of decoder class. - - The builder should be a Callable (a class or a function). - This decorator supports registration of decoder builder as follows: - - ``` - class MyDecoder(tf.keras.Model): - pass - - @register_decoder_builder('mydecoder') - def builder(input_specs, config, l2_reg): - return MyDecoder(...) - - # Builds a MyDecoder object. - my_decoder = build_decoder_3d(input_specs, config, l2_reg) - ``` - - Args: - key: A `str` of key to look up the builder. - - Returns: - A callable for using as class decorator that registers the decorated class - for creation from an instance of task_config_cls. 
- """ - return registry.register(_REGISTERED_DECODER_CLS, key) - - -@register_decoder_builder('identity') -def build_identity( - input_specs: Optional[Mapping[str, tf.TensorShape]] = None, - model_config: Optional[hyperparams.Config] = None, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None) -> None: - """Builds identity decoder from a config. - - All the input arguments are not used by identity decoder but kept here to - ensure the interface is consistent. - - Args: - input_specs: A `dict` of input specifications. A dictionary consists of - {level: TensorShape} from a backbone. - model_config: A `OneOfConfig` of model config. - l2_regularizer: A `tf.keras.regularizers.Regularizer` object. Default to - None. - - Returns: - An instance of the identity decoder. - """ - del input_specs, model_config, l2_regularizer # Unused by identity decoder. - - -def build_decoder( - input_specs: Mapping[str, tf.TensorShape], - model_config: hyperparams.Config, - l2_regularizer: tf.keras.regularizers.Regularizer = None, - **kwargs) -> Union[None, tf.keras.Model, tf.keras.layers.Layer]: # pytype: disable=annotation-type-mismatch # typed-keras - """Builds decoder from a config. - - A decoder can be a keras.Model, a keras.layers.Layer, or None. If it is not - None, the decoder will take features from the backbone as input and generate - decoded feature maps. If it is None, such as an identity decoder, the decoder - is skipped and features from the backbone are regarded as model output. - - Args: - input_specs: A `dict` of input specifications. A dictionary consists of - {level: TensorShape} from a backbone. - model_config: A `OneOfConfig` of model config. - l2_regularizer: A `tf.keras.regularizers.Regularizer` object. Default to - None. - **kwargs: Additional keyword args to be passed to decoder builder. - - Returns: - An instance of the decoder. - """ - decoder_builder = registry.lookup(_REGISTERED_DECODER_CLS, - model_config.decoder.type) - - return decoder_builder( - input_specs=input_specs, - model_config=model_config, - l2_regularizer=l2_regularizer, - **kwargs) diff --git a/official/vision/beta/modeling/decoders/factory_test.py b/official/vision/beta/modeling/decoders/factory_test.py deleted file mode 100644 index 844a5b020..000000000 --- a/official/vision/beta/modeling/decoders/factory_test.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for decoder factory functions.""" - -from absl.testing import parameterized -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from official.vision.beta import configs -from official.vision.beta.configs import decoders as decoders_cfg -from official.vision.beta.modeling import decoders -from official.vision.beta.modeling.decoders import factory - - -class FactoryTest(tf.test.TestCase, parameterized.TestCase): - - @combinations.generate( - combinations.combine( - num_filters=[128, 256], use_separable_conv=[True, False])) - def test_fpn_decoder_creation(self, num_filters, use_separable_conv): - """Test creation of FPN decoder.""" - min_level = 3 - max_level = 7 - input_specs = {} - for level in range(min_level, max_level): - input_specs[str(level)] = tf.TensorShape( - [1, 128 // (2**level), 128 // (2**level), 3]) - - network = decoders.FPN( - input_specs=input_specs, - num_filters=num_filters, - use_separable_conv=use_separable_conv, - use_sync_bn=True) - - model_config = configs.retinanet.RetinaNet() - model_config.min_level = min_level - model_config.max_level = max_level - model_config.num_classes = 10 - model_config.input_size = [None, None, 3] - model_config.decoder = decoders_cfg.Decoder( - type='fpn', - fpn=decoders_cfg.FPN( - num_filters=num_filters, use_separable_conv=use_separable_conv)) - - factory_network = factory.build_decoder( - input_specs=input_specs, model_config=model_config) - - network_config = network.get_config() - factory_network_config = factory_network.get_config() - - self.assertEqual(network_config, factory_network_config) - - @combinations.generate( - combinations.combine( - num_filters=[128, 256], - num_repeats=[3, 5], - use_separable_conv=[True, False])) - def test_nasfpn_decoder_creation(self, num_filters, num_repeats, - use_separable_conv): - """Test creation of NASFPN decoder.""" - min_level = 3 - max_level = 7 - input_specs = {} - for level in range(min_level, max_level): - input_specs[str(level)] = tf.TensorShape( - [1, 128 // (2**level), 128 // (2**level), 3]) - - network = decoders.NASFPN( - input_specs=input_specs, - num_filters=num_filters, - num_repeats=num_repeats, - use_separable_conv=use_separable_conv, - use_sync_bn=True) - - model_config = configs.retinanet.RetinaNet() - model_config.min_level = min_level - model_config.max_level = max_level - model_config.num_classes = 10 - model_config.input_size = [None, None, 3] - model_config.decoder = decoders_cfg.Decoder( - type='nasfpn', - nasfpn=decoders_cfg.NASFPN( - num_filters=num_filters, - num_repeats=num_repeats, - use_separable_conv=use_separable_conv)) - - factory_network = factory.build_decoder( - input_specs=input_specs, model_config=model_config) - - network_config = network.get_config() - factory_network_config = factory_network.get_config() - - self.assertEqual(network_config, factory_network_config) - - @combinations.generate( - combinations.combine( - level=[3, 4], - dilation_rates=[[6, 12, 18], [6, 12]], - num_filters=[128, 256])) - def test_aspp_decoder_creation(self, level, dilation_rates, num_filters): - """Test creation of ASPP decoder.""" - input_specs = {'1': tf.TensorShape([1, 128, 128, 3])} - - network = decoders.ASPP( - level=level, - dilation_rates=dilation_rates, - num_filters=num_filters, - use_sync_bn=True) - - model_config = configs.semantic_segmentation.SemanticSegmentationModel() - model_config.num_classes = 10 - model_config.input_size = [None, None, 3] - model_config.decoder = decoders_cfg.Decoder( - type='aspp', - 
aspp=decoders_cfg.ASPP( - level=level, dilation_rates=dilation_rates, - num_filters=num_filters)) - - factory_network = factory.build_decoder( - input_specs=input_specs, model_config=model_config) - - network_config = network.get_config() - factory_network_config = factory_network.get_config() - # Due to calling `super().get_config()` in aspp layer, everything but the - # the name of two layer instances are the same, so we force equal name so it - # will not give false alarm. - factory_network_config['name'] = network_config['name'] - - self.assertEqual(network_config, factory_network_config) - - def test_identity_decoder_creation(self): - """Test creation of identity decoder.""" - model_config = configs.retinanet.RetinaNet() - model_config.num_classes = 2 - model_config.input_size = [None, None, 3] - - model_config.decoder = decoders_cfg.Decoder( - type='identity', identity=decoders_cfg.Identity()) - - factory_network = factory.build_decoder( - input_specs=None, model_config=model_config) - - self.assertIsNone(factory_network) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/decoders/fpn.py b/official/vision/beta/modeling/decoders/fpn.py deleted file mode 100644 index f48739f71..000000000 --- a/official/vision/beta/modeling/decoders/fpn.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains the definitions of Feature Pyramid Networks (FPN).""" -from typing import Any, Mapping, Optional - -# Import libraries -from absl import logging -import tensorflow as tf - -from official.modeling import hyperparams -from official.modeling import tf_utils -from official.vision.beta.modeling.decoders import factory -from official.vision.beta.ops import spatial_transform_ops - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class FPN(tf.keras.Model): - """Creates a Feature Pyramid Network (FPN). - - This implemets the paper: - Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan, and - Serge Belongie. - Feature Pyramid Networks for Object Detection. - (https://arxiv.org/pdf/1612.03144) - """ - - def __init__( - self, - input_specs: Mapping[str, tf.TensorShape], - min_level: int = 3, - max_level: int = 7, - num_filters: int = 256, - fusion_type: str = 'sum', - use_separable_conv: bool = False, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - """Initializes a Feature Pyramid Network (FPN). - - Args: - input_specs: A `dict` of input specifications. A dictionary consists of - {level: TensorShape} from a backbone. - min_level: An `int` of minimum level in FPN output feature maps. - max_level: An `int` of maximum level in FPN output feature maps. 
- num_filters: An `int` number of filters in FPN layers. - fusion_type: A `str` of `sum` or `concat`. Whether performing sum or - concat for feature fusion. - use_separable_conv: A `bool`. If True use separable convolution for - convolution in FPN layers. - activation: A `str` name of the activation function. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_initializer: A `str` name of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - **kwargs: Additional keyword arguments to be passed. - """ - self._config_dict = { - 'input_specs': input_specs, - 'min_level': min_level, - 'max_level': max_level, - 'num_filters': num_filters, - 'fusion_type': fusion_type, - 'use_separable_conv': use_separable_conv, - 'activation': activation, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - 'kernel_initializer': kernel_initializer, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - } - if use_separable_conv: - conv2d = tf.keras.layers.SeparableConv2D - else: - conv2d = tf.keras.layers.Conv2D - if use_sync_bn: - norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - norm = tf.keras.layers.BatchNormalization - activation_fn = tf.keras.layers.Activation( - tf_utils.get_activation(activation)) - - # Build input feature pyramid. - if tf.keras.backend.image_data_format() == 'channels_last': - bn_axis = -1 - else: - bn_axis = 1 - - # Get input feature pyramid from backbone. - logging.info('FPN input_specs: %s', input_specs) - inputs = self._build_input_pyramid(input_specs, min_level) - backbone_max_level = min(int(max(inputs.keys())), max_level) - - # Build lateral connections. - feats_lateral = {} - for level in range(min_level, backbone_max_level + 1): - feats_lateral[str(level)] = conv2d( - filters=num_filters, - kernel_size=1, - padding='same', - kernel_initializer=kernel_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer)( - inputs[str(level)]) - - # Build top-down path. - feats = {str(backbone_max_level): feats_lateral[str(backbone_max_level)]} - for level in range(backbone_max_level - 1, min_level - 1, -1): - feat_a = spatial_transform_ops.nearest_upsampling( - feats[str(level + 1)], 2) - feat_b = feats_lateral[str(level)] - - if fusion_type == 'sum': - feats[str(level)] = feat_a + feat_b - elif fusion_type == 'concat': - feats[str(level)] = tf.concat([feat_a, feat_b], axis=-1) - else: - raise ValueError('Fusion type {} not supported.'.format(fusion_type)) - - # TODO(xianzhi): consider to remove bias in conv2d. - # Build post-hoc 3x3 convolution kernel. - for level in range(min_level, backbone_max_level + 1): - feats[str(level)] = conv2d( - filters=num_filters, - strides=1, - kernel_size=3, - padding='same', - kernel_initializer=kernel_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer)( - feats[str(level)]) - - # TODO(xianzhi): consider to remove bias in conv2d. - # Build coarser FPN levels introduced for RetinaNet. 
- for level in range(backbone_max_level + 1, max_level + 1): - feats_in = feats[str(level - 1)] - if level > backbone_max_level + 1: - feats_in = activation_fn(feats_in) - feats[str(level)] = conv2d( - filters=num_filters, - strides=2, - kernel_size=3, - padding='same', - kernel_initializer=kernel_initializer, - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer)( - feats_in) - - # Apply batch norm layers. - for level in range(min_level, max_level + 1): - feats[str(level)] = norm( - axis=bn_axis, momentum=norm_momentum, epsilon=norm_epsilon)( - feats[str(level)]) - - self._output_specs = { - str(level): feats[str(level)].get_shape() - for level in range(min_level, max_level + 1) - } - - super(FPN, self).__init__(inputs=inputs, outputs=feats, **kwargs) - - def _build_input_pyramid(self, input_specs: Mapping[str, tf.TensorShape], - min_level: int): - assert isinstance(input_specs, dict) - if min(input_specs.keys()) > str(min_level): - raise ValueError( - 'Backbone min level should be less or equal to FPN min level') - - inputs = {} - for level, spec in input_specs.items(): - inputs[level] = tf.keras.Input(shape=spec[1:]) - return inputs - - def get_config(self) -> Mapping[str, Any]: - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self) -> Mapping[str, tf.TensorShape]: - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@factory.register_decoder_builder('fpn') -def build_fpn_decoder( - input_specs: Mapping[str, tf.TensorShape], - model_config: hyperparams.Config, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None -) -> tf.keras.Model: - """Builds FPN decoder from a config. - - Args: - input_specs: A `dict` of input specifications. A dictionary consists of - {level: TensorShape} from a backbone. - model_config: A OneOfConfig. Model config. - l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to - None. - - Returns: - A `tf.keras.Model` instance of the FPN decoder. - - Raises: - ValueError: If the model_config.decoder.type is not `fpn`. - """ - decoder_type = model_config.decoder.type - decoder_cfg = model_config.decoder.get() - if decoder_type != 'fpn': - raise ValueError(f'Inconsistent decoder type {decoder_type}. ' - 'Need to be `fpn`.') - norm_activation_config = model_config.norm_activation - return FPN( - input_specs=input_specs, - min_level=model_config.min_level, - max_level=model_config.max_level, - num_filters=decoder_cfg.num_filters, - fusion_type=decoder_cfg.fusion_type, - use_separable_conv=decoder_cfg.use_separable_conv, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) diff --git a/official/vision/beta/modeling/decoders/fpn_test.py b/official/vision/beta/modeling/decoders/fpn_test.py deleted file mode 100644 index 6d1903cc8..000000000 --- a/official/vision/beta/modeling/decoders/fpn_test.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for FPN.""" - -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import mobilenet -from official.vision.beta.modeling.backbones import resnet -from official.vision.beta.modeling.decoders import fpn - - -class FPNTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (256, 3, 7, False, 'sum'), - (256, 3, 7, True, 'concat'), - ) - def test_network_creation(self, input_size, min_level, max_level, - use_separable_conv, fusion_type): - """Test creation of FPN.""" - tf.keras.backend.set_image_data_format('channels_last') - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - - backbone = resnet.ResNet(model_id=50) - network = fpn.FPN( - input_specs=backbone.output_specs, - min_level=min_level, - max_level=max_level, - fusion_type=fusion_type, - use_separable_conv=use_separable_conv) - - endpoints = backbone(inputs) - feats = network(endpoints) - - for level in range(min_level, max_level + 1): - self.assertIn(str(level), feats) - self.assertAllEqual( - [1, input_size // 2**level, input_size // 2**level, 256], - feats[str(level)].shape.as_list()) - - @parameterized.parameters( - (256, 3, 7, False), - (256, 3, 7, True), - ) - def test_network_creation_with_mobilenet(self, input_size, min_level, - max_level, use_separable_conv): - """Test creation of FPN with mobilenet backbone.""" - tf.keras.backend.set_image_data_format('channels_last') - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - - backbone = mobilenet.MobileNet(model_id='MobileNetV2') - network = fpn.FPN( - input_specs=backbone.output_specs, - min_level=min_level, - max_level=max_level, - use_separable_conv=use_separable_conv) - - endpoints = backbone(inputs) - feats = network(endpoints) - - for level in range(min_level, max_level + 1): - self.assertIn(str(level), feats) - self.assertAllEqual( - [1, input_size // 2**level, input_size // 2**level, 256], - feats[str(level)].shape.as_list()) - - def test_serialize_deserialize(self): - # Create a network object that sets all of its config options. - kwargs = dict( - input_specs=resnet.ResNet(model_id=50).output_specs, - min_level=3, - max_level=7, - num_filters=256, - fusion_type='sum', - use_separable_conv=False, - use_sync_bn=False, - activation='relu', - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - ) - network = fpn.FPN(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(network.get_config(), expected_config) - - # Create another network object from the first object's config. - new_network = fpn.FPN.from_config(network.get_config()) - - # Validate that the config can be forced to JSON. - _ = new_network.to_json() - - # If the serialization was successful, the new config should match the old. 
- self.assertAllEqual(network.get_config(), new_network.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/decoders/nasfpn.py b/official/vision/beta/modeling/decoders/nasfpn.py deleted file mode 100644 index 2f338eccb..000000000 --- a/official/vision/beta/modeling/decoders/nasfpn.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of NAS-FPN.""" - -from typing import Any, List, Mapping, Optional, Tuple - -# Import libraries - -from absl import logging -import tensorflow as tf - -from official.modeling import hyperparams -from official.modeling import tf_utils -from official.vision.beta.modeling.decoders import factory -from official.vision.beta.ops import spatial_transform_ops - - -# The fixed NAS-FPN architecture discovered by NAS. -# Each element represents a specification of a building block: -# (block_level, combine_fn, (input_offset0, input_offset1), is_output). -NASFPN_BLOCK_SPECS = [ - (4, 'attention', (1, 3), False), - (4, 'sum', (1, 5), False), - (3, 'sum', (0, 6), True), - (4, 'sum', (6, 7), True), - (5, 'attention', (7, 8), True), - (7, 'attention', (6, 9), True), - (6, 'attention', (9, 10), True), -] - - -class BlockSpec(): - """A container class that specifies the block configuration for NAS-FPN.""" - - def __init__(self, level: int, combine_fn: str, - input_offsets: Tuple[int, int], is_output: bool): - self.level = level - self.combine_fn = combine_fn - self.input_offsets = input_offsets - self.is_output = is_output - - -def build_block_specs( - block_specs: Optional[List[Tuple[Any, ...]]] = None) -> List[BlockSpec]: - """Builds the list of BlockSpec objects for NAS-FPN.""" - if not block_specs: - block_specs = NASFPN_BLOCK_SPECS - logging.info('Building NAS-FPN block specs: %s', block_specs) - return [BlockSpec(*b) for b in block_specs] - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class NASFPN(tf.keras.Model): - """Creates a NAS-FPN model. - - This implements the paper: - Golnaz Ghiasi, Tsung-Yi Lin, Ruoming Pang, Quoc V. Le. - NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection. - (https://arxiv.org/abs/1904.07392) - """ - - def __init__( - self, - input_specs: Mapping[str, tf.TensorShape], - min_level: int = 3, - max_level: int = 7, - block_specs: List[BlockSpec] = build_block_specs(), - num_filters: int = 256, - num_repeats: int = 5, - use_separable_conv: bool = False, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_initializer: str = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - """Initializes a NAS-FPN model. - - Args: - input_specs: A `dict` of input specifications. A dictionary consists of - {level: TensorShape} from a backbone. 
- min_level: An `int` of minimum level in FPN output feature maps. - max_level: An `int` of maximum level in FPN output feature maps. - block_specs: a list of BlockSpec objects that specifies the NAS-FPN - network topology. By default, the previously discovered architecture is - used. - num_filters: An `int` number of filters in FPN layers. - num_repeats: number of repeats for feature pyramid network. - use_separable_conv: A `bool`. If True use separable convolution for - convolution in FPN layers. - activation: A `str` name of the activation function. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_initializer: A `str` name of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - **kwargs: Additional keyword arguments to be passed. - """ - self._config_dict = { - 'input_specs': input_specs, - 'min_level': min_level, - 'max_level': max_level, - 'num_filters': num_filters, - 'num_repeats': num_repeats, - 'use_separable_conv': use_separable_conv, - 'activation': activation, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - 'kernel_initializer': kernel_initializer, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - } - self._min_level = min_level - self._max_level = max_level - self._block_specs = block_specs - self._num_repeats = num_repeats - self._conv_op = (tf.keras.layers.SeparableConv2D - if self._config_dict['use_separable_conv'] - else tf.keras.layers.Conv2D) - if self._config_dict['use_separable_conv']: - self._conv_kwargs = { - 'depthwise_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'pointwise_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'bias_initializer': tf.zeros_initializer(), - 'depthwise_regularizer': self._config_dict['kernel_regularizer'], - 'pointwise_regularizer': self._config_dict['kernel_regularizer'], - 'bias_regularizer': self._config_dict['bias_regularizer'], - } - else: - self._conv_kwargs = { - 'kernel_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'bias_initializer': tf.zeros_initializer(), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - 'bias_regularizer': self._config_dict['bias_regularizer'], - } - self._norm_op = (tf.keras.layers.experimental.SyncBatchNormalization - if self._config_dict['use_sync_bn'] - else tf.keras.layers.BatchNormalization) - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._norm_kwargs = { - 'axis': self._bn_axis, - 'momentum': self._config_dict['norm_momentum'], - 'epsilon': self._config_dict['norm_epsilon'], - } - self._activation = tf_utils.get_activation(activation) - - # Gets input feature pyramid from backbone. - inputs = self._build_input_pyramid(input_specs, min_level) - - # Projects the input features. 
- feats = [] - for level in range(self._min_level, self._max_level + 1): - if str(level) in inputs.keys(): - feats.append(self._resample_feature_map( - inputs[str(level)], level, level, self._config_dict['num_filters'])) - else: - feats.append(self._resample_feature_map( - feats[-1], level - 1, level, self._config_dict['num_filters'])) - - # Repeatly builds the NAS-FPN modules. - for _ in range(self._num_repeats): - output_feats = self._build_feature_pyramid(feats) - feats = [output_feats[level] - for level in range(self._min_level, self._max_level + 1)] - - self._output_specs = { - str(level): output_feats[level].get_shape() - for level in range(min_level, max_level + 1) - } - output_feats = {str(level): output_feats[level] - for level in output_feats.keys()} - super(NASFPN, self).__init__(inputs=inputs, outputs=output_feats, **kwargs) - - def _build_input_pyramid(self, input_specs: Mapping[str, tf.TensorShape], - min_level: int): - assert isinstance(input_specs, dict) - if min(input_specs.keys()) > str(min_level): - raise ValueError( - 'Backbone min level should be less or equal to FPN min level') - - inputs = {} - for level, spec in input_specs.items(): - inputs[level] = tf.keras.Input(shape=spec[1:]) - return inputs - - def _resample_feature_map(self, - inputs, - input_level, - target_level, - target_num_filters=256): - x = inputs - _, _, _, input_num_filters = x.get_shape().as_list() - if input_num_filters != target_num_filters: - x = self._conv_op( - filters=target_num_filters, - kernel_size=1, - padding='same', - **self._conv_kwargs)(x) - x = self._norm_op(**self._norm_kwargs)(x) - - if input_level < target_level: - stride = int(2 ** (target_level - input_level)) - return tf.keras.layers.MaxPool2D( - pool_size=stride, strides=stride, padding='same')(x) - if input_level > target_level: - scale = int(2 ** (input_level - target_level)) - return spatial_transform_ops.nearest_upsampling(x, scale=scale) - - # Force output x to be the same dtype as mixed precision policy. This avoids - # dtype mismatch when one input (by default float32 dtype) does not meet all - # the above conditions and is output unchanged, while other inputs are - # processed to have different dtype, e.g., using bfloat16 on TPU. - compute_dtype = tf.keras.layers.Layer().dtype_policy.compute_dtype - if (compute_dtype is not None) and (x.dtype != compute_dtype): - return tf.cast(x, dtype=compute_dtype) - else: - return x - - def _global_attention(self, feat0, feat1): - m = tf.math.reduce_max(feat0, axis=[1, 2], keepdims=True) - m = tf.math.sigmoid(m) - return feat0 + feat1 * m - - def _build_feature_pyramid(self, feats): - num_output_connections = [0] * len(feats) - num_output_levels = self._max_level - self._min_level + 1 - feat_levels = list(range(self._min_level, self._max_level + 1)) - - for i, block_spec in enumerate(self._block_specs): - new_level = block_spec.level - - # Checks the range of input_offsets. - for input_offset in block_spec.input_offsets: - if input_offset >= len(feats): - raise ValueError( - 'input_offset ({}) is larger than num feats({})'.format( - input_offset, len(feats))) - input0 = block_spec.input_offsets[0] - input1 = block_spec.input_offsets[1] - - # Update graph with inputs. 
- node0 = feats[input0] - node0_level = feat_levels[input0] - num_output_connections[input0] += 1 - node0 = self._resample_feature_map(node0, node0_level, new_level) - node1 = feats[input1] - node1_level = feat_levels[input1] - num_output_connections[input1] += 1 - node1 = self._resample_feature_map(node1, node1_level, new_level) - - # Combine node0 and node1 to create new feat. - if block_spec.combine_fn == 'sum': - new_node = node0 + node1 - elif block_spec.combine_fn == 'attention': - if node0_level >= node1_level: - new_node = self._global_attention(node0, node1) - else: - new_node = self._global_attention(node1, node0) - else: - raise ValueError('unknown combine_fn `{}`.' - .format(block_spec.combine_fn)) - - # Add intermediate nodes that do not have any connections to output. - if block_spec.is_output: - for j, (feat, feat_level, num_output) in enumerate( - zip(feats, feat_levels, num_output_connections)): - if num_output == 0 and feat_level == new_level: - num_output_connections[j] += 1 - - feat_ = self._resample_feature_map(feat, feat_level, new_level) - new_node += feat_ - - new_node = self._activation(new_node) - new_node = self._conv_op( - filters=self._config_dict['num_filters'], - kernel_size=(3, 3), - padding='same', - **self._conv_kwargs)(new_node) - new_node = self._norm_op(**self._norm_kwargs)(new_node) - - feats.append(new_node) - feat_levels.append(new_level) - num_output_connections.append(0) - - output_feats = {} - for i in range(len(feats) - num_output_levels, len(feats)): - level = feat_levels[i] - output_feats[level] = feats[i] - logging.info('Output feature pyramid: %s', output_feats) - return output_feats - - def get_config(self) -> Mapping[str, Any]: - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self) -> Mapping[str, tf.TensorShape]: - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@factory.register_decoder_builder('nasfpn') -def build_nasfpn_decoder( - input_specs: Mapping[str, tf.TensorShape], - model_config: hyperparams.Config, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None -) -> tf.keras.Model: - """Builds NASFPN decoder from a config. - - Args: - input_specs: A `dict` of input specifications. A dictionary consists of - {level: TensorShape} from a backbone. - model_config: A OneOfConfig. Model config. - l2_regularizer: A `tf.keras.regularizers.Regularizer` instance. Default to - None. - - Returns: - A `tf.keras.Model` instance of the NASFPN decoder. - - Raises: - ValueError: If the model_config.decoder.type is not `nasfpn`. - """ - decoder_type = model_config.decoder.type - decoder_cfg = model_config.decoder.get() - if decoder_type != 'nasfpn': - raise ValueError(f'Inconsistent decoder type {decoder_type}. 
' - 'Need to be `nasfpn`.') - - norm_activation_config = model_config.norm_activation - return NASFPN( - input_specs=input_specs, - min_level=model_config.min_level, - max_level=model_config.max_level, - num_filters=decoder_cfg.num_filters, - num_repeats=decoder_cfg.num_repeats, - use_separable_conv=decoder_cfg.use_separable_conv, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) diff --git a/official/vision/beta/modeling/decoders/nasfpn_test.py b/official/vision/beta/modeling/decoders/nasfpn_test.py deleted file mode 100644 index 99c8b7612..000000000 --- a/official/vision/beta/modeling/decoders/nasfpn_test.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for NAS-FPN.""" - -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.backbones import resnet -from official.vision.beta.modeling.decoders import nasfpn - - -class NASFPNTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (256, 3, 7, False), - (256, 3, 7, True), - ) - def test_network_creation(self, input_size, min_level, max_level, - use_separable_conv): - """Test creation of NAS-FPN.""" - tf.keras.backend.set_image_data_format('channels_last') - - inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) - - num_filters = 256 - backbone = resnet.ResNet(model_id=50) - network = nasfpn.NASFPN( - input_specs=backbone.output_specs, - min_level=min_level, - max_level=max_level, - num_filters=num_filters, - use_separable_conv=use_separable_conv) - - endpoints = backbone(inputs) - feats = network(endpoints) - - for level in range(min_level, max_level + 1): - self.assertIn(str(level), feats) - self.assertAllEqual( - [1, input_size // 2**level, input_size // 2**level, num_filters], - feats[str(level)].shape.as_list()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/factory.py b/official/vision/beta/modeling/factory.py deleted file mode 100644 index 637e05865..000000000 --- a/official/vision/beta/modeling/factory.py +++ /dev/null @@ -1,385 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
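The block-spec tuples consumed by `build_block_specs` in `nasfpn.py` above follow the documented `(block_level, combine_fn, (input_offset0, input_offset1), is_output)` format. A minimal sketch of supplying a custom topology, assuming the `official.vision.beta` package is importable; the spec values are illustrative only, not a recommended architecture:

```python
from official.vision.beta.modeling.decoders import nasfpn

# Each tuple follows the documented format:
# (block_level, combine_fn, (input_offset0, input_offset1), is_output).
custom_block_specs = nasfpn.build_block_specs([
    (4, 'attention', (1, 3), False),
    (3, 'sum', (0, 4), True),
    (5, 'attention', (2, 5), True),
])

# The resulting BlockSpec objects can be passed to the NASFPN constructor via
# its `block_specs` argument.
```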
- -"""Factory methods to build models.""" - -from typing import Optional - -import tensorflow as tf - -from official.vision.beta.configs import image_classification as classification_cfg -from official.vision.beta.configs import maskrcnn as maskrcnn_cfg -from official.vision.beta.configs import retinanet as retinanet_cfg -from official.vision.beta.configs import semantic_segmentation as segmentation_cfg -from official.vision.beta.modeling import backbones -from official.vision.beta.modeling import classification_model -from official.vision.beta.modeling import decoders -from official.vision.beta.modeling import maskrcnn_model -from official.vision.beta.modeling import retinanet_model -from official.vision.beta.modeling import segmentation_model -from official.vision.beta.modeling.heads import dense_prediction_heads -from official.vision.beta.modeling.heads import instance_heads -from official.vision.beta.modeling.heads import segmentation_heads -from official.vision.beta.modeling.layers import detection_generator -from official.vision.beta.modeling.layers import mask_sampler -from official.vision.beta.modeling.layers import roi_aligner -from official.vision.beta.modeling.layers import roi_generator -from official.vision.beta.modeling.layers import roi_sampler - - -def build_classification_model( - input_specs: tf.keras.layers.InputSpec, - model_config: classification_cfg.ImageClassificationModel, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - skip_logits_layer: bool = False, - backbone: Optional[tf.keras.Model] = None) -> tf.keras.Model: - """Builds the classification model.""" - norm_activation_config = model_config.norm_activation - if not backbone: - backbone = backbones.factory.build_backbone( - input_specs=input_specs, - backbone_config=model_config.backbone, - norm_activation_config=norm_activation_config, - l2_regularizer=l2_regularizer) - - model = classification_model.ClassificationModel( - backbone=backbone, - num_classes=model_config.num_classes, - input_specs=input_specs, - dropout_rate=model_config.dropout_rate, - kernel_initializer=model_config.kernel_initializer, - kernel_regularizer=l2_regularizer, - add_head_batch_norm=model_config.add_head_batch_norm, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - skip_logits_layer=skip_logits_layer) - return model - - -def build_maskrcnn(input_specs: tf.keras.layers.InputSpec, - model_config: maskrcnn_cfg.MaskRCNN, - l2_regularizer: Optional[ - tf.keras.regularizers.Regularizer] = None, - backbone: Optional[tf.keras.Model] = None, - decoder: Optional[tf.keras.Model] = None) -> tf.keras.Model: - """Builds Mask R-CNN model.""" - norm_activation_config = model_config.norm_activation - if not backbone: - backbone = backbones.factory.build_backbone( - input_specs=input_specs, - backbone_config=model_config.backbone, - norm_activation_config=norm_activation_config, - l2_regularizer=l2_regularizer) - backbone_features = backbone(tf.keras.Input(input_specs.shape[1:])) - - if not decoder: - decoder = decoders.factory.build_decoder( - input_specs=backbone.output_specs, - model_config=model_config, - l2_regularizer=l2_regularizer) - - rpn_head_config = model_config.rpn_head - roi_generator_config = model_config.roi_generator - roi_sampler_config = model_config.roi_sampler - roi_aligner_config = model_config.roi_aligner - detection_head_config = model_config.detection_head - generator_config = 
model_config.detection_generator - num_anchors_per_location = ( - len(model_config.anchor.aspect_ratios) * model_config.anchor.num_scales) - - rpn_head = dense_prediction_heads.RPNHead( - min_level=model_config.min_level, - max_level=model_config.max_level, - num_anchors_per_location=num_anchors_per_location, - num_convs=rpn_head_config.num_convs, - num_filters=rpn_head_config.num_filters, - use_separable_conv=rpn_head_config.use_separable_conv, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) - - detection_head = instance_heads.DetectionHead( - num_classes=model_config.num_classes, - num_convs=detection_head_config.num_convs, - num_filters=detection_head_config.num_filters, - use_separable_conv=detection_head_config.use_separable_conv, - num_fcs=detection_head_config.num_fcs, - fc_dims=detection_head_config.fc_dims, - class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer, - name='detection_head') - - if decoder: - decoder_features = decoder(backbone_features) - rpn_head(decoder_features) - - if roi_sampler_config.cascade_iou_thresholds: - detection_head_cascade = [detection_head] - for cascade_num in range(len(roi_sampler_config.cascade_iou_thresholds)): - detection_head = instance_heads.DetectionHead( - num_classes=model_config.num_classes, - num_convs=detection_head_config.num_convs, - num_filters=detection_head_config.num_filters, - use_separable_conv=detection_head_config.use_separable_conv, - num_fcs=detection_head_config.num_fcs, - fc_dims=detection_head_config.fc_dims, - class_agnostic_bbox_pred=detection_head_config - .class_agnostic_bbox_pred, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer, - name='detection_head_{}'.format(cascade_num + 1)) - - detection_head_cascade.append(detection_head) - detection_head = detection_head_cascade - - roi_generator_obj = roi_generator.MultilevelROIGenerator( - pre_nms_top_k=roi_generator_config.pre_nms_top_k, - pre_nms_score_threshold=roi_generator_config.pre_nms_score_threshold, - pre_nms_min_size_threshold=( - roi_generator_config.pre_nms_min_size_threshold), - nms_iou_threshold=roi_generator_config.nms_iou_threshold, - num_proposals=roi_generator_config.num_proposals, - test_pre_nms_top_k=roi_generator_config.test_pre_nms_top_k, - test_pre_nms_score_threshold=( - roi_generator_config.test_pre_nms_score_threshold), - test_pre_nms_min_size_threshold=( - roi_generator_config.test_pre_nms_min_size_threshold), - test_nms_iou_threshold=roi_generator_config.test_nms_iou_threshold, - test_num_proposals=roi_generator_config.test_num_proposals, - use_batched_nms=roi_generator_config.use_batched_nms) - - roi_sampler_cascade = [] - roi_sampler_obj = roi_sampler.ROISampler( - mix_gt_boxes=roi_sampler_config.mix_gt_boxes, - num_sampled_rois=roi_sampler_config.num_sampled_rois, - foreground_fraction=roi_sampler_config.foreground_fraction, - foreground_iou_threshold=roi_sampler_config.foreground_iou_threshold, - 
background_iou_high_threshold=( - roi_sampler_config.background_iou_high_threshold), - background_iou_low_threshold=( - roi_sampler_config.background_iou_low_threshold)) - roi_sampler_cascade.append(roi_sampler_obj) - # Initialize addtional roi simplers for cascade heads. - if roi_sampler_config.cascade_iou_thresholds: - for iou in roi_sampler_config.cascade_iou_thresholds: - roi_sampler_obj = roi_sampler.ROISampler( - mix_gt_boxes=False, - num_sampled_rois=roi_sampler_config.num_sampled_rois, - foreground_iou_threshold=iou, - background_iou_high_threshold=iou, - background_iou_low_threshold=0.0, - skip_subsampling=True) - roi_sampler_cascade.append(roi_sampler_obj) - - roi_aligner_obj = roi_aligner.MultilevelROIAligner( - crop_size=roi_aligner_config.crop_size, - sample_offset=roi_aligner_config.sample_offset) - - detection_generator_obj = detection_generator.DetectionGenerator( - apply_nms=generator_config.apply_nms, - pre_nms_top_k=generator_config.pre_nms_top_k, - pre_nms_score_threshold=generator_config.pre_nms_score_threshold, - nms_iou_threshold=generator_config.nms_iou_threshold, - max_num_detections=generator_config.max_num_detections, - nms_version=generator_config.nms_version, - use_cpu_nms=generator_config.use_cpu_nms, - soft_nms_sigma=generator_config.soft_nms_sigma) - - if model_config.include_mask: - mask_head = instance_heads.MaskHead( - num_classes=model_config.num_classes, - upsample_factor=model_config.mask_head.upsample_factor, - num_convs=model_config.mask_head.num_convs, - num_filters=model_config.mask_head.num_filters, - use_separable_conv=model_config.mask_head.use_separable_conv, - activation=model_config.norm_activation.activation, - norm_momentum=model_config.norm_activation.norm_momentum, - norm_epsilon=model_config.norm_activation.norm_epsilon, - kernel_regularizer=l2_regularizer, - class_agnostic=model_config.mask_head.class_agnostic) - - mask_sampler_obj = mask_sampler.MaskSampler( - mask_target_size=( - model_config.mask_roi_aligner.crop_size * - model_config.mask_head.upsample_factor), - num_sampled_masks=model_config.mask_sampler.num_sampled_masks) - - mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner( - crop_size=model_config.mask_roi_aligner.crop_size, - sample_offset=model_config.mask_roi_aligner.sample_offset) - else: - mask_head = None - mask_sampler_obj = None - mask_roi_aligner_obj = None - - model = maskrcnn_model.MaskRCNNModel( - backbone=backbone, - decoder=decoder, - rpn_head=rpn_head, - detection_head=detection_head, - roi_generator=roi_generator_obj, - roi_sampler=roi_sampler_cascade, - roi_aligner=roi_aligner_obj, - detection_generator=detection_generator_obj, - mask_head=mask_head, - mask_sampler=mask_sampler_obj, - mask_roi_aligner=mask_roi_aligner_obj, - class_agnostic_bbox_pred=detection_head_config.class_agnostic_bbox_pred, - cascade_class_ensemble=detection_head_config.cascade_class_ensemble, - min_level=model_config.min_level, - max_level=model_config.max_level, - num_scales=model_config.anchor.num_scales, - aspect_ratios=model_config.anchor.aspect_ratios, - anchor_size=model_config.anchor.anchor_size) - return model - - -def build_retinanet( - input_specs: tf.keras.layers.InputSpec, - model_config: retinanet_cfg.RetinaNet, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - backbone: Optional[tf.keras.Model] = None, - decoder: Optional[tf.keras.regularizers.Regularizer] = None -) -> tf.keras.Model: - """Builds RetinaNet model.""" - norm_activation_config = model_config.norm_activation - if not backbone: - 
backbone = backbones.factory.build_backbone( - input_specs=input_specs, - backbone_config=model_config.backbone, - norm_activation_config=norm_activation_config, - l2_regularizer=l2_regularizer) - backbone_features = backbone(tf.keras.Input(input_specs.shape[1:])) - - if not decoder: - decoder = decoders.factory.build_decoder( - input_specs=backbone.output_specs, - model_config=model_config, - l2_regularizer=l2_regularizer) - - head_config = model_config.head - generator_config = model_config.detection_generator - num_anchors_per_location = ( - len(model_config.anchor.aspect_ratios) * model_config.anchor.num_scales) - - head = dense_prediction_heads.RetinaNetHead( - min_level=model_config.min_level, - max_level=model_config.max_level, - num_classes=model_config.num_classes, - num_anchors_per_location=num_anchors_per_location, - num_convs=head_config.num_convs, - num_filters=head_config.num_filters, - attribute_heads=[ - cfg.as_dict() for cfg in (head_config.attribute_heads or []) - ], - use_separable_conv=head_config.use_separable_conv, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) - - # Builds decoder and head so that their trainable weights are initialized - if decoder: - decoder_features = decoder(backbone_features) - _ = head(decoder_features) - - detection_generator_obj = detection_generator.MultilevelDetectionGenerator( - apply_nms=generator_config.apply_nms, - pre_nms_top_k=generator_config.pre_nms_top_k, - pre_nms_score_threshold=generator_config.pre_nms_score_threshold, - nms_iou_threshold=generator_config.nms_iou_threshold, - max_num_detections=generator_config.max_num_detections, - nms_version=generator_config.nms_version, - use_cpu_nms=generator_config.use_cpu_nms, - soft_nms_sigma=generator_config.soft_nms_sigma) - - model = retinanet_model.RetinaNetModel( - backbone, - decoder, - head, - detection_generator_obj, - min_level=model_config.min_level, - max_level=model_config.max_level, - num_scales=model_config.anchor.num_scales, - aspect_ratios=model_config.anchor.aspect_ratios, - anchor_size=model_config.anchor.anchor_size) - return model - - -def build_segmentation_model( - input_specs: tf.keras.layers.InputSpec, - model_config: segmentation_cfg.SemanticSegmentationModel, - l2_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - backbone: Optional[tf.keras.regularizers.Regularizer] = None, - decoder: Optional[tf.keras.regularizers.Regularizer] = None -) -> tf.keras.Model: - """Builds Segmentation model.""" - norm_activation_config = model_config.norm_activation - if not backbone: - backbone = backbones.factory.build_backbone( - input_specs=input_specs, - backbone_config=model_config.backbone, - norm_activation_config=norm_activation_config, - l2_regularizer=l2_regularizer) - - if not decoder: - decoder = decoders.factory.build_decoder( - input_specs=backbone.output_specs, - model_config=model_config, - l2_regularizer=l2_regularizer) - - head_config = model_config.head - - head = segmentation_heads.SegmentationHead( - num_classes=model_config.num_classes, - level=head_config.level, - num_convs=head_config.num_convs, - prediction_kernel_size=head_config.prediction_kernel_size, - num_filters=head_config.num_filters, - use_depthwise_convolution=head_config.use_depthwise_convolution, - upsample_factor=head_config.upsample_factor, - feature_fusion=head_config.feature_fusion, - 
low_level=head_config.low_level, - low_level_num_filters=head_config.low_level_num_filters, - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) - - mask_scoring_head = None - if model_config.mask_scoring_head: - mask_scoring_head = segmentation_heads.MaskScoring( - num_classes=model_config.num_classes, - **model_config.mask_scoring_head.as_dict(), - activation=norm_activation_config.activation, - use_sync_bn=norm_activation_config.use_sync_bn, - norm_momentum=norm_activation_config.norm_momentum, - norm_epsilon=norm_activation_config.norm_epsilon, - kernel_regularizer=l2_regularizer) - - model = segmentation_model.SegmentationModel( - backbone, decoder, head, mask_scoring_head=mask_scoring_head) - return model diff --git a/official/vision/beta/modeling/factory_3d.py b/official/vision/beta/modeling/factory_3d.py deleted file mode 100644 index b68bdd6c0..000000000 --- a/official/vision/beta/modeling/factory_3d.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Factory methods to build models.""" - -# Import libraries -import tensorflow as tf - -from official.core import registry -from official.vision.beta.configs import video_classification as video_classification_cfg -from official.vision.beta.modeling import video_classification_model -from official.vision.beta.modeling import backbones - -_REGISTERED_MODEL_CLS = {} - - -def register_model_builder(key: str): - """Decorates a builder of model class. - - The builder should be a Callable (a class or a function). - This decorator supports registration of backbone builder as follows: - - ``` - class MyModel(tf.keras.Model): - pass - - @register_backbone_builder('mybackbone') - def builder(input_specs, config, l2_reg): - return MyModel(...) - - # Builds a MyModel object. - my_backbone = build_backbone_3d(input_specs, config, l2_reg) - ``` - - Args: - key: the key to look up the builder. - - Returns: - A callable for use as class decorator that registers the decorated class - for creation from an instance of model class. - """ - return registry.register(_REGISTERED_MODEL_CLS, key) - - -def build_model( - model_type: str, - input_specs: tf.keras.layers.InputSpec, - model_config: video_classification_cfg.hyperparams.Config, - num_classes: int, - l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model: - """Builds backbone from a config. - - Args: - model_type: string name of model type. It should be consistent with - ModelConfig.model_type. - input_specs: tf.keras.layers.InputSpec. - model_config: a OneOfConfig. Model config. - num_classes: number of classes. - l2_regularizer: tf.keras.regularizers.Regularizer instance. Default to None. - - Returns: - tf.keras.Model instance of the backbone. 
- """ - model_builder = registry.lookup(_REGISTERED_MODEL_CLS, model_type) - - return model_builder(input_specs, model_config, num_classes, l2_regularizer) - - -@register_model_builder('video_classification') -def build_video_classification_model( - input_specs: tf.keras.layers.InputSpec, - model_config: video_classification_cfg.VideoClassificationModel, - num_classes: int, - l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model: - """Builds the video classification model.""" - input_specs_dict = {'image': input_specs} - norm_activation_config = model_config.norm_activation - backbone = backbones.factory.build_backbone( - input_specs=input_specs, - backbone_config=model_config.backbone, - norm_activation_config=norm_activation_config, - l2_regularizer=l2_regularizer) - - model = video_classification_model.VideoClassificationModel( - backbone=backbone, - num_classes=num_classes, - input_specs=input_specs_dict, - dropout_rate=model_config.dropout_rate, - aggregate_endpoints=model_config.aggregate_endpoints, - kernel_regularizer=l2_regularizer, - require_endpoints=model_config.require_endpoints) - return model diff --git a/official/vision/beta/modeling/factory_test.py b/official/vision/beta/modeling/factory_test.py deleted file mode 100644 index 1d075b705..000000000 --- a/official/vision/beta/modeling/factory_test.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for factory.py.""" - -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.configs import backbones -from official.vision.beta.configs import backbones_3d -from official.vision.beta.configs import image_classification as classification_cfg -from official.vision.beta.configs import maskrcnn as maskrcnn_cfg -from official.vision.beta.configs import retinanet as retinanet_cfg -from official.vision.beta.configs import video_classification as video_classification_cfg -from official.vision.beta.modeling import factory -from official.vision.beta.modeling import factory_3d - - -class ClassificationModelBuilderTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - ('resnet', (224, 224), 5e-5), - ('resnet', (224, 224), None), - ('resnet', (None, None), 5e-5), - ('resnet', (None, None), None), - ) - def test_builder(self, backbone_type, input_size, weight_decay): - num_classes = 2 - input_specs = tf.keras.layers.InputSpec( - shape=[None, input_size[0], input_size[1], 3]) - model_config = classification_cfg.ImageClassificationModel( - num_classes=num_classes, - backbone=backbones.Backbone(type=backbone_type)) - l2_regularizer = ( - tf.keras.regularizers.l2(weight_decay) if weight_decay else None) - _ = factory.build_classification_model( - input_specs=input_specs, - model_config=model_config, - l2_regularizer=l2_regularizer) - - -class MaskRCNNBuilderTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - ('resnet', (640, 640)), - ('resnet', (None, None)), - ) - def test_builder(self, backbone_type, input_size): - num_classes = 2 - input_specs = tf.keras.layers.InputSpec( - shape=[None, input_size[0], input_size[1], 3]) - model_config = maskrcnn_cfg.MaskRCNN( - num_classes=num_classes, - backbone=backbones.Backbone(type=backbone_type)) - l2_regularizer = tf.keras.regularizers.l2(5e-5) - _ = factory.build_maskrcnn( - input_specs=input_specs, - model_config=model_config, - l2_regularizer=l2_regularizer) - - -class RetinaNetBuilderTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - ('resnet', (640, 640), False), - ('resnet', (None, None), True), - ) - def test_builder(self, backbone_type, input_size, has_att_heads): - num_classes = 2 - input_specs = tf.keras.layers.InputSpec( - shape=[None, input_size[0], input_size[1], 3]) - if has_att_heads: - attribute_heads_config = [ - retinanet_cfg.AttributeHead(name='att1'), - retinanet_cfg.AttributeHead( - name='att2', type='classification', size=2), - ] - else: - attribute_heads_config = None - model_config = retinanet_cfg.RetinaNet( - num_classes=num_classes, - backbone=backbones.Backbone(type=backbone_type), - head=retinanet_cfg.RetinaNetHead( - attribute_heads=attribute_heads_config)) - l2_regularizer = tf.keras.regularizers.l2(5e-5) - _ = factory.build_retinanet( - input_specs=input_specs, - model_config=model_config, - l2_regularizer=l2_regularizer) - if has_att_heads: - self.assertEqual(model_config.head.attribute_heads[0].as_dict(), - dict(name='att1', type='regression', size=1)) - self.assertEqual(model_config.head.attribute_heads[1].as_dict(), - dict(name='att2', type='classification', size=2)) - - -class VideoClassificationModelBuilderTest(parameterized.TestCase, - tf.test.TestCase): - - @parameterized.parameters( - ('resnet_3d', (8, 224, 224), 5e-5), - ('resnet_3d', (None, None, None), 5e-5), - ) - def test_builder(self, backbone_type, input_size, weight_decay): - input_specs = 
tf.keras.layers.InputSpec( - shape=[None, input_size[0], input_size[1], input_size[2], 3]) - model_config = video_classification_cfg.VideoClassificationModel( - backbone=backbones_3d.Backbone3D(type=backbone_type)) - l2_regularizer = ( - tf.keras.regularizers.l2(weight_decay) if weight_decay else None) - _ = factory_3d.build_video_classification_model( - input_specs=input_specs, - model_config=model_config, - num_classes=2, - l2_regularizer=l2_regularizer) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/heads/__init__.py b/official/vision/beta/modeling/heads/__init__.py deleted file mode 100644 index 8f0dc2325..000000000 --- a/official/vision/beta/modeling/heads/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Heads package definition.""" - -from official.vision.beta.modeling.heads.dense_prediction_heads import RetinaNetHead -from official.vision.beta.modeling.heads.dense_prediction_heads import RPNHead -from official.vision.beta.modeling.heads.instance_heads import DetectionHead -from official.vision.beta.modeling.heads.instance_heads import MaskHead -from official.vision.beta.modeling.heads.segmentation_heads import SegmentationHead diff --git a/official/vision/beta/modeling/heads/dense_prediction_heads.py b/official/vision/beta/modeling/heads/dense_prediction_heads.py deleted file mode 100644 index e3acbb591..000000000 --- a/official/vision/beta/modeling/heads/dense_prediction_heads.py +++ /dev/null @@ -1,517 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Contains definitions of dense prediction heads.""" - -from typing import Any, Dict, List, Mapping, Optional, Union - -# Import libraries - -import numpy as np -import tensorflow as tf - -from official.modeling import tf_utils - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class RetinaNetHead(tf.keras.layers.Layer): - """Creates a RetinaNet head.""" - - def __init__( - self, - min_level: int, - max_level: int, - num_classes: int, - num_anchors_per_location: int, - num_convs: int = 4, - num_filters: int = 256, - attribute_heads: Optional[List[Dict[str, Any]]] = None, - use_separable_conv: bool = False, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - num_params_per_anchor: int = 4, - **kwargs): - """Initializes a RetinaNet head. - - Args: - min_level: An `int` number of minimum feature level. - max_level: An `int` number of maximum feature level. - num_classes: An `int` number of classes to predict. - num_anchors_per_location: An `int` number of number of anchors per pixel - location. - num_convs: An `int` number that represents the number of the intermediate - conv layers before the prediction. - num_filters: An `int` number that represents the number of filters of the - intermediate conv layers. - attribute_heads: If not None, a list that contains a dict for each - additional attribute head. Each dict consists of 3 key-value pairs: - `name`, `type` ('regression' or 'classification'), and `size` (number - of predicted values for each instance). - use_separable_conv: A `bool` that indicates whether the separable - convolution layers is used. - activation: A `str` that indicates which activation is used, e.g. 'relu', - 'swish', etc. - use_sync_bn: A `bool` that indicates whether to use synchronized batch - normalization across different replicas. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - num_params_per_anchor: Number of parameters required to specify an anchor - box. For example, `num_params_per_anchor` would be 4 for axis-aligned - anchor boxes specified by their y-centers, x-centers, heights, and - widths. - **kwargs: Additional keyword arguments to be passed. 
- """ - super(RetinaNetHead, self).__init__(**kwargs) - self._config_dict = { - 'min_level': min_level, - 'max_level': max_level, - 'num_classes': num_classes, - 'num_anchors_per_location': num_anchors_per_location, - 'num_convs': num_convs, - 'num_filters': num_filters, - 'attribute_heads': attribute_heads, - 'use_separable_conv': use_separable_conv, - 'activation': activation, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - 'num_params_per_anchor': num_params_per_anchor, - } - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation = tf_utils.get_activation(activation) - - def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]): - """Creates the variables of the head.""" - conv_op = (tf.keras.layers.SeparableConv2D - if self._config_dict['use_separable_conv'] - else tf.keras.layers.Conv2D) - conv_kwargs = { - 'filters': self._config_dict['num_filters'], - 'kernel_size': 3, - 'padding': 'same', - 'bias_initializer': tf.zeros_initializer(), - 'bias_regularizer': self._config_dict['bias_regularizer'], - } - if not self._config_dict['use_separable_conv']: - conv_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.RandomNormal( - stddev=0.01), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - }) - bn_op = (tf.keras.layers.experimental.SyncBatchNormalization - if self._config_dict['use_sync_bn'] - else tf.keras.layers.BatchNormalization) - bn_kwargs = { - 'axis': self._bn_axis, - 'momentum': self._config_dict['norm_momentum'], - 'epsilon': self._config_dict['norm_epsilon'], - } - - # Class net. - self._cls_convs = [] - self._cls_norms = [] - for level in range( - self._config_dict['min_level'], self._config_dict['max_level'] + 1): - this_level_cls_norms = [] - for i in range(self._config_dict['num_convs']): - if level == self._config_dict['min_level']: - cls_conv_name = 'classnet-conv_{}'.format(i) - self._cls_convs.append(conv_op(name=cls_conv_name, **conv_kwargs)) - cls_norm_name = 'classnet-conv-norm_{}_{}'.format(level, i) - this_level_cls_norms.append(bn_op(name=cls_norm_name, **bn_kwargs)) - self._cls_norms.append(this_level_cls_norms) - - classifier_kwargs = { - 'filters': ( - self._config_dict['num_classes'] * - self._config_dict['num_anchors_per_location']), - 'kernel_size': 3, - 'padding': 'same', - 'bias_initializer': tf.constant_initializer(-np.log((1 - 0.01) / 0.01)), - 'bias_regularizer': self._config_dict['bias_regularizer'], - } - if not self._config_dict['use_separable_conv']: - classifier_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.RandomNormal(stddev=1e-5), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - }) - self._classifier = conv_op(name='scores', **classifier_kwargs) - - # Box net. 
- self._box_convs = [] - self._box_norms = [] - for level in range( - self._config_dict['min_level'], self._config_dict['max_level'] + 1): - this_level_box_norms = [] - for i in range(self._config_dict['num_convs']): - if level == self._config_dict['min_level']: - box_conv_name = 'boxnet-conv_{}'.format(i) - self._box_convs.append(conv_op(name=box_conv_name, **conv_kwargs)) - box_norm_name = 'boxnet-conv-norm_{}_{}'.format(level, i) - this_level_box_norms.append(bn_op(name=box_norm_name, **bn_kwargs)) - self._box_norms.append(this_level_box_norms) - - box_regressor_kwargs = { - 'filters': (self._config_dict['num_params_per_anchor'] * - self._config_dict['num_anchors_per_location']), - 'kernel_size': 3, - 'padding': 'same', - 'bias_initializer': tf.zeros_initializer(), - 'bias_regularizer': self._config_dict['bias_regularizer'], - } - if not self._config_dict['use_separable_conv']: - box_regressor_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.RandomNormal( - stddev=1e-5), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - }) - self._box_regressor = conv_op(name='boxes', **box_regressor_kwargs) - - # Attribute learning nets. - if self._config_dict['attribute_heads']: - self._att_predictors = {} - self._att_convs = {} - self._att_norms = {} - - for att_config in self._config_dict['attribute_heads']: - att_name = att_config['name'] - att_type = att_config['type'] - att_size = att_config['size'] - att_convs_i = [] - att_norms_i = [] - - # Build conv and norm layers. - for level in range(self._config_dict['min_level'], - self._config_dict['max_level'] + 1): - this_level_att_norms = [] - for i in range(self._config_dict['num_convs']): - if level == self._config_dict['min_level']: - att_conv_name = '{}-conv_{}'.format(att_name, i) - att_convs_i.append(conv_op(name=att_conv_name, **conv_kwargs)) - att_norm_name = '{}-conv-norm_{}_{}'.format(att_name, level, i) - this_level_att_norms.append(bn_op(name=att_norm_name, **bn_kwargs)) - att_norms_i.append(this_level_att_norms) - self._att_convs[att_name] = att_convs_i - self._att_norms[att_name] = att_norms_i - - # Build the final prediction layer. - att_predictor_kwargs = { - 'filters': - (att_size * self._config_dict['num_anchors_per_location']), - 'kernel_size': 3, - 'padding': 'same', - 'bias_initializer': tf.zeros_initializer(), - 'bias_regularizer': self._config_dict['bias_regularizer'], - } - if att_type == 'regression': - att_predictor_kwargs.update( - {'bias_initializer': tf.zeros_initializer()}) - elif att_type == 'classification': - att_predictor_kwargs.update({ - 'bias_initializer': - tf.constant_initializer(-np.log((1 - 0.01) / 0.01)) - }) - else: - raise ValueError( - 'Attribute head type {} not supported.'.format(att_type)) - - if not self._config_dict['use_separable_conv']: - att_predictor_kwargs.update({ - 'kernel_initializer': - tf.keras.initializers.RandomNormal(stddev=1e-5), - 'kernel_regularizer': - self._config_dict['kernel_regularizer'], - }) - - self._att_predictors[att_name] = conv_op( - name='{}_attributes'.format(att_name), **att_predictor_kwargs) - - super(RetinaNetHead, self).build(input_shape) - - def call(self, features: Mapping[str, tf.Tensor]): - """Forward pass of the RetinaNet head. - - Args: - features: A `dict` of `tf.Tensor` where - - key: A `str` of the level of the multilevel features. - - values: A `tf.Tensor`, the feature map tensors, whose shape is - [batch, height_l, width_l, channels]. - - Returns: - scores: A `dict` of `tf.Tensor` which includes scores of the predictions. 
- - key: A `str` of the level of the multilevel predictions. - - values: A `tf.Tensor` of the box scores predicted from a particular - feature level, whose shape is - [batch, height_l, width_l, num_classes * num_anchors_per_location]. - boxes: A `dict` of `tf.Tensor` which includes coordinates of the - predictions. - - key: A `str` of the level of the multilevel predictions. - - values: A `tf.Tensor` of the box scores predicted from a particular - feature level, whose shape is - [batch, height_l, width_l, - num_params_per_anchor * num_anchors_per_location]. - attributes: a dict of (attribute_name, attribute_prediction). Each - `attribute_prediction` is a dict of: - - key: `str`, the level of the multilevel predictions. - - values: `Tensor`, the box scores predicted from a particular feature - level, whose shape is - [batch, height_l, width_l, - attribute_size * num_anchors_per_location]. - Can be an empty dictionary if no attribute learning is required. - """ - scores = {} - boxes = {} - if self._config_dict['attribute_heads']: - attributes = { - att_config['name']: {} - for att_config in self._config_dict['attribute_heads'] - } - else: - attributes = {} - - for i, level in enumerate( - range(self._config_dict['min_level'], - self._config_dict['max_level'] + 1)): - this_level_features = features[str(level)] - - # class net. - x = this_level_features - for conv, norm in zip(self._cls_convs, self._cls_norms[i]): - x = conv(x) - x = norm(x) - x = self._activation(x) - scores[str(level)] = self._classifier(x) - - # box net. - x = this_level_features - for conv, norm in zip(self._box_convs, self._box_norms[i]): - x = conv(x) - x = norm(x) - x = self._activation(x) - boxes[str(level)] = self._box_regressor(x) - - # attribute nets. - if self._config_dict['attribute_heads']: - for att_config in self._config_dict['attribute_heads']: - att_name = att_config['name'] - x = this_level_features - for conv, norm in zip(self._att_convs[att_name], - self._att_norms[att_name][i]): - x = conv(x) - x = norm(x) - x = self._activation(x) - attributes[att_name][str(level)] = self._att_predictors[att_name](x) - - return scores, boxes, attributes - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class RPNHead(tf.keras.layers.Layer): - """Creates a Region Proposal Network (RPN) head.""" - - def __init__( - self, - min_level: int, - max_level: int, - num_anchors_per_location: int, - num_convs: int = 1, - num_filters: int = 256, - use_separable_conv: bool = False, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - """Initializes a Region Proposal Network head. - - Args: - min_level: An `int` number of minimum feature level. - max_level: An `int` number of maximum feature level. - num_anchors_per_location: An `int` number of number of anchors per pixel - location. - num_convs: An `int` number that represents the number of the intermediate - convolution layers before the prediction. - num_filters: An `int` number that represents the number of filters of the - intermediate convolution layers. - use_separable_conv: A `bool` that indicates whether the separable - convolution layers is used. - activation: A `str` that indicates which activation is used, e.g. 
'relu', - 'swish', etc. - use_sync_bn: A `bool` that indicates whether to use synchronized batch - normalization across different replicas. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - **kwargs: Additional keyword arguments to be passed. - """ - super(RPNHead, self).__init__(**kwargs) - self._config_dict = { - 'min_level': min_level, - 'max_level': max_level, - 'num_anchors_per_location': num_anchors_per_location, - 'num_convs': num_convs, - 'num_filters': num_filters, - 'use_separable_conv': use_separable_conv, - 'activation': activation, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - } - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation = tf_utils.get_activation(activation) - - def build(self, input_shape): - """Creates the variables of the head.""" - conv_op = (tf.keras.layers.SeparableConv2D - if self._config_dict['use_separable_conv'] - else tf.keras.layers.Conv2D) - conv_kwargs = { - 'filters': self._config_dict['num_filters'], - 'kernel_size': 3, - 'padding': 'same', - 'bias_initializer': tf.zeros_initializer(), - 'bias_regularizer': self._config_dict['bias_regularizer'], - } - if not self._config_dict['use_separable_conv']: - conv_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.RandomNormal( - stddev=0.01), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - }) - bn_op = (tf.keras.layers.experimental.SyncBatchNormalization - if self._config_dict['use_sync_bn'] - else tf.keras.layers.BatchNormalization) - bn_kwargs = { - 'axis': self._bn_axis, - 'momentum': self._config_dict['norm_momentum'], - 'epsilon': self._config_dict['norm_epsilon'], - } - - self._convs = [] - self._norms = [] - for level in range( - self._config_dict['min_level'], self._config_dict['max_level'] + 1): - this_level_norms = [] - for i in range(self._config_dict['num_convs']): - if level == self._config_dict['min_level']: - conv_name = 'rpn-conv_{}'.format(i) - self._convs.append(conv_op(name=conv_name, **conv_kwargs)) - norm_name = 'rpn-conv-norm_{}_{}'.format(level, i) - this_level_norms.append(bn_op(name=norm_name, **bn_kwargs)) - self._norms.append(this_level_norms) - - classifier_kwargs = { - 'filters': self._config_dict['num_anchors_per_location'], - 'kernel_size': 1, - 'padding': 'valid', - 'bias_initializer': tf.zeros_initializer(), - 'bias_regularizer': self._config_dict['bias_regularizer'], - } - if not self._config_dict['use_separable_conv']: - classifier_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.RandomNormal( - stddev=1e-5), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - }) - self._classifier = conv_op(name='rpn-scores', **classifier_kwargs) - - box_regressor_kwargs = { - 'filters': 4 * self._config_dict['num_anchors_per_location'], - 'kernel_size': 1, - 'padding': 'valid', - 'bias_initializer': tf.zeros_initializer(), - 'bias_regularizer': self._config_dict['bias_regularizer'], - } - if not self._config_dict['use_separable_conv']: - box_regressor_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.RandomNormal( - stddev=1e-5), - 
'kernel_regularizer': self._config_dict['kernel_regularizer'], - }) - self._box_regressor = conv_op(name='rpn-boxes', **box_regressor_kwargs) - - super(RPNHead, self).build(input_shape) - - def call(self, features: Mapping[str, tf.Tensor]): - """Forward pass of the RPN head. - - Args: - features: A `dict` of `tf.Tensor` where - - key: A `str` of the level of the multilevel features. - - values: A `tf.Tensor`, the feature map tensors, whose shape is [batch, - height_l, width_l, channels]. - - Returns: - scores: A `dict` of `tf.Tensor` which includes scores of the predictions. - - key: A `str` of the level of the multilevel predictions. - - values: A `tf.Tensor` of the box scores predicted from a particular - feature level, whose shape is - [batch, height_l, width_l, num_classes * num_anchors_per_location]. - boxes: A `dict` of `tf.Tensor` which includes coordinates of the - predictions. - - key: A `str` of the level of the multilevel predictions. - - values: A `tf.Tensor` of the box scores predicted from a particular - feature level, whose shape is - [batch, height_l, width_l, 4 * num_anchors_per_location]. - """ - scores = {} - boxes = {} - for i, level in enumerate( - range(self._config_dict['min_level'], - self._config_dict['max_level'] + 1)): - x = features[str(level)] - for conv, norm in zip(self._convs, self._norms[i]): - x = conv(x) - x = norm(x) - x = self._activation(x) - scores[str(level)] = self._classifier(x) - boxes[str(level)] = self._box_regressor(x) - return scores, boxes - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) diff --git a/official/vision/beta/modeling/heads/dense_prediction_heads_test.py b/official/vision/beta/modeling/heads/dense_prediction_heads_test.py deleted file mode 100644 index 3a15300cf..000000000 --- a/official/vision/beta/modeling/heads/dense_prediction_heads_test.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for dense_prediction_heads.py.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.vision.beta.modeling.heads import dense_prediction_heads - - -class RetinaNetHeadTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (False, False, False), - (False, True, False), - (True, False, True), - (True, True, True), - ) - def test_forward(self, use_separable_conv, use_sync_bn, has_att_heads): - if has_att_heads: - attribute_heads = [dict(name='depth', type='regression', size=1)] - else: - attribute_heads = None - - retinanet_head = dense_prediction_heads.RetinaNetHead( - min_level=3, - max_level=4, - num_classes=3, - num_anchors_per_location=3, - num_convs=2, - num_filters=256, - attribute_heads=attribute_heads, - use_separable_conv=use_separable_conv, - activation='relu', - use_sync_bn=use_sync_bn, - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_regularizer=None, - bias_regularizer=None, - ) - features = { - '3': np.random.rand(2, 128, 128, 16), - '4': np.random.rand(2, 64, 64, 16), - } - scores, boxes, attributes = retinanet_head(features) - self.assertAllEqual(scores['3'].numpy().shape, [2, 128, 128, 9]) - self.assertAllEqual(scores['4'].numpy().shape, [2, 64, 64, 9]) - self.assertAllEqual(boxes['3'].numpy().shape, [2, 128, 128, 12]) - self.assertAllEqual(boxes['4'].numpy().shape, [2, 64, 64, 12]) - if has_att_heads: - for att in attributes.values(): - self.assertAllEqual(att['3'].numpy().shape, [2, 128, 128, 3]) - self.assertAllEqual(att['4'].numpy().shape, [2, 64, 64, 3]) - - def test_serialize_deserialize(self): - retinanet_head = dense_prediction_heads.RetinaNetHead( - min_level=3, - max_level=7, - num_classes=3, - num_anchors_per_location=9, - num_convs=2, - num_filters=16, - attribute_heads=None, - use_separable_conv=False, - activation='relu', - use_sync_bn=False, - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_regularizer=None, - bias_regularizer=None, - ) - config = retinanet_head.get_config() - new_retinanet_head = ( - dense_prediction_heads.RetinaNetHead.from_config(config)) - self.assertAllEqual( - retinanet_head.get_config(), new_retinanet_head.get_config()) - - -class RpnHeadTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (False, False), - (False, True), - (True, False), - (True, True), - ) - def test_forward(self, use_separable_conv, use_sync_bn): - rpn_head = dense_prediction_heads.RPNHead( - min_level=3, - max_level=4, - num_anchors_per_location=3, - num_convs=2, - num_filters=256, - use_separable_conv=use_separable_conv, - activation='relu', - use_sync_bn=use_sync_bn, - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_regularizer=None, - bias_regularizer=None, - ) - features = { - '3': np.random.rand(2, 128, 128, 16), - '4': np.random.rand(2, 64, 64, 16), - } - scores, boxes = rpn_head(features) - self.assertAllEqual(scores['3'].numpy().shape, [2, 128, 128, 3]) - self.assertAllEqual(scores['4'].numpy().shape, [2, 64, 64, 3]) - self.assertAllEqual(boxes['3'].numpy().shape, [2, 128, 128, 12]) - self.assertAllEqual(boxes['4'].numpy().shape, [2, 64, 64, 12]) - - def test_serialize_deserialize(self): - rpn_head = dense_prediction_heads.RPNHead( - min_level=3, - max_level=7, - num_anchors_per_location=9, - num_convs=2, - num_filters=16, - use_separable_conv=False, - activation='relu', - use_sync_bn=False, - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_regularizer=None, - bias_regularizer=None, - ) - config = 
rpn_head.get_config() - new_rpn_head = dense_prediction_heads.RPNHead.from_config(config) - self.assertAllEqual(rpn_head.get_config(), new_rpn_head.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/heads/instance_heads.py b/official/vision/beta/modeling/heads/instance_heads.py deleted file mode 100644 index b0e3ac74a..000000000 --- a/official/vision/beta/modeling/heads/instance_heads.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of instance prediction heads.""" - -from typing import List, Union, Optional -# Import libraries -import tensorflow as tf - -from official.modeling import tf_utils - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class DetectionHead(tf.keras.layers.Layer): - """Creates a detection head.""" - - def __init__( - self, - num_classes: int, - num_convs: int = 0, - num_filters: int = 256, - use_separable_conv: bool = False, - num_fcs: int = 2, - fc_dims: int = 1024, - class_agnostic_bbox_pred: bool = False, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - """Initializes a detection head. - - Args: - num_classes: An `int` for the number of classes. - num_convs: An `int` number that represents the number of the intermediate - convolution layers before the FC layers. - num_filters: An `int` number that represents the number of filters of the - intermediate convolution layers. - use_separable_conv: A `bool` that indicates whether the separable - convolution layers is used. - num_fcs: An `int` number that represents the number of FC layers before - the predictions. - fc_dims: An `int` number that represents the number of dimension of the FC - layers. - class_agnostic_bbox_pred: `bool`, indicating whether bboxes should be - predicted for every class or not. - activation: A `str` that indicates which activation is used, e.g. 'relu', - 'swish', etc. - use_sync_bn: A `bool` that indicates whether to use synchronized batch - normalization across different replicas. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - **kwargs: Additional keyword arguments to be passed. 
- """ - super(DetectionHead, self).__init__(**kwargs) - self._config_dict = { - 'num_classes': num_classes, - 'num_convs': num_convs, - 'num_filters': num_filters, - 'use_separable_conv': use_separable_conv, - 'num_fcs': num_fcs, - 'fc_dims': fc_dims, - 'class_agnostic_bbox_pred': class_agnostic_bbox_pred, - 'activation': activation, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - } - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation = tf_utils.get_activation(activation) - - def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]): - """Creates the variables of the head.""" - conv_op = (tf.keras.layers.SeparableConv2D - if self._config_dict['use_separable_conv'] - else tf.keras.layers.Conv2D) - conv_kwargs = { - 'filters': self._config_dict['num_filters'], - 'kernel_size': 3, - 'padding': 'same', - } - if self._config_dict['use_separable_conv']: - conv_kwargs.update({ - 'depthwise_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'pointwise_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'bias_initializer': tf.zeros_initializer(), - 'depthwise_regularizer': self._config_dict['kernel_regularizer'], - 'pointwise_regularizer': self._config_dict['kernel_regularizer'], - 'bias_regularizer': self._config_dict['bias_regularizer'], - }) - else: - conv_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'bias_initializer': tf.zeros_initializer(), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - 'bias_regularizer': self._config_dict['bias_regularizer'], - }) - bn_op = (tf.keras.layers.experimental.SyncBatchNormalization - if self._config_dict['use_sync_bn'] - else tf.keras.layers.BatchNormalization) - bn_kwargs = { - 'axis': self._bn_axis, - 'momentum': self._config_dict['norm_momentum'], - 'epsilon': self._config_dict['norm_epsilon'], - } - - self._convs = [] - self._conv_norms = [] - for i in range(self._config_dict['num_convs']): - conv_name = 'detection-conv_{}'.format(i) - self._convs.append(conv_op(name=conv_name, **conv_kwargs)) - bn_name = 'detection-conv-bn_{}'.format(i) - self._conv_norms.append(bn_op(name=bn_name, **bn_kwargs)) - - self._fcs = [] - self._fc_norms = [] - for i in range(self._config_dict['num_fcs']): - fc_name = 'detection-fc_{}'.format(i) - self._fcs.append( - tf.keras.layers.Dense( - units=self._config_dict['fc_dims'], - kernel_initializer=tf.keras.initializers.VarianceScaling( - scale=1 / 3.0, mode='fan_out', distribution='uniform'), - kernel_regularizer=self._config_dict['kernel_regularizer'], - bias_regularizer=self._config_dict['bias_regularizer'], - name=fc_name)) - bn_name = 'detection-fc-bn_{}'.format(i) - self._fc_norms.append(bn_op(name=bn_name, **bn_kwargs)) - - self._classifier = tf.keras.layers.Dense( - units=self._config_dict['num_classes'], - kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01), - bias_initializer=tf.zeros_initializer(), - kernel_regularizer=self._config_dict['kernel_regularizer'], - bias_regularizer=self._config_dict['bias_regularizer'], - name='detection-scores') - - num_box_outputs = (4 if self._config_dict['class_agnostic_bbox_pred'] else - 
self._config_dict['num_classes'] * 4) - self._box_regressor = tf.keras.layers.Dense( - units=num_box_outputs, - kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001), - bias_initializer=tf.zeros_initializer(), - kernel_regularizer=self._config_dict['kernel_regularizer'], - bias_regularizer=self._config_dict['bias_regularizer'], - name='detection-boxes') - - super(DetectionHead, self).build(input_shape) - - def call(self, inputs: tf.Tensor, training: bool = None): - """Forward pass of box and class branches for the Mask-RCNN model. - - Args: - inputs: A `tf.Tensor` of the shape [batch_size, num_instances, roi_height, - roi_width, roi_channels], representing the ROI features. - training: a `bool` indicating whether it is in `training` mode. - - Returns: - class_outputs: A `tf.Tensor` of the shape - [batch_size, num_rois, num_classes], representing the class predictions. - box_outputs: A `tf.Tensor` of the shape - [batch_size, num_rois, num_classes * 4], representing the box - predictions. - """ - roi_features = inputs - _, num_rois, height, width, filters = roi_features.get_shape().as_list() - - x = tf.reshape(roi_features, [-1, height, width, filters]) - for conv, bn in zip(self._convs, self._conv_norms): - x = conv(x) - x = bn(x) - x = self._activation(x) - - _, _, _, filters = x.get_shape().as_list() - x = tf.reshape(x, [-1, num_rois, height * width * filters]) - - for fc, bn in zip(self._fcs, self._fc_norms): - x = fc(x) - x = bn(x) - x = self._activation(x) - - classes = self._classifier(x) - boxes = self._box_regressor(x) - return classes, boxes - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class MaskHead(tf.keras.layers.Layer): - """Creates a mask head.""" - - def __init__( - self, - num_classes: int, - upsample_factor: int = 2, - num_convs: int = 4, - num_filters: int = 256, - use_separable_conv: bool = False, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - class_agnostic: bool = False, - **kwargs): - """Initializes a mask head. - - Args: - num_classes: An `int` of the number of classes. - upsample_factor: An `int` that indicates the upsample factor to generate - the final predicted masks. It should be >= 1. - num_convs: An `int` number that represents the number of the intermediate - convolution layers before the mask prediction layers. - num_filters: An `int` number that represents the number of filters of the - intermediate convolution layers. - use_separable_conv: A `bool` that indicates whether the separable - convolution layers is used. - activation: A `str` that indicates which activation is used, e.g. 'relu', - 'swish', etc. - use_sync_bn: A `bool` that indicates whether to use synchronized batch - normalization across different replicas. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - class_agnostic: A `bool`. If set, we use a single channel mask head that - is shared between all classes. 
- **kwargs: Additional keyword arguments to be passed. - """ - super(MaskHead, self).__init__(**kwargs) - self._config_dict = { - 'num_classes': num_classes, - 'upsample_factor': upsample_factor, - 'num_convs': num_convs, - 'num_filters': num_filters, - 'use_separable_conv': use_separable_conv, - 'activation': activation, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - 'class_agnostic': class_agnostic - } - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation = tf_utils.get_activation(activation) - - def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]): - """Creates the variables of the head.""" - conv_op = (tf.keras.layers.SeparableConv2D - if self._config_dict['use_separable_conv'] - else tf.keras.layers.Conv2D) - conv_kwargs = { - 'filters': self._config_dict['num_filters'], - 'kernel_size': 3, - 'padding': 'same', - } - if self._config_dict['use_separable_conv']: - conv_kwargs.update({ - 'depthwise_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'pointwise_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'bias_initializer': tf.zeros_initializer(), - 'depthwise_regularizer': self._config_dict['kernel_regularizer'], - 'pointwise_regularizer': self._config_dict['kernel_regularizer'], - 'bias_regularizer': self._config_dict['bias_regularizer'], - }) - else: - conv_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'bias_initializer': tf.zeros_initializer(), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - 'bias_regularizer': self._config_dict['bias_regularizer'], - }) - bn_op = (tf.keras.layers.experimental.SyncBatchNormalization - if self._config_dict['use_sync_bn'] - else tf.keras.layers.BatchNormalization) - bn_kwargs = { - 'axis': self._bn_axis, - 'momentum': self._config_dict['norm_momentum'], - 'epsilon': self._config_dict['norm_epsilon'], - } - - self._convs = [] - self._conv_norms = [] - for i in range(self._config_dict['num_convs']): - conv_name = 'mask-conv_{}'.format(i) - self._convs.append(conv_op(name=conv_name, **conv_kwargs)) - bn_name = 'mask-conv-bn_{}'.format(i) - self._conv_norms.append(bn_op(name=bn_name, **bn_kwargs)) - - self._deconv = tf.keras.layers.Conv2DTranspose( - filters=self._config_dict['num_filters'], - kernel_size=self._config_dict['upsample_factor'], - strides=self._config_dict['upsample_factor'], - padding='valid', - kernel_initializer=tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - bias_initializer=tf.zeros_initializer(), - kernel_regularizer=self._config_dict['kernel_regularizer'], - bias_regularizer=self._config_dict['bias_regularizer'], - name='mask-upsampling') - self._deconv_bn = bn_op(name='mask-deconv-bn', **bn_kwargs) - - if self._config_dict['class_agnostic']: - num_filters = 1 - else: - num_filters = self._config_dict['num_classes'] - - conv_kwargs = { - 'filters': num_filters, - 'kernel_size': 1, - 'padding': 'valid', - } - if self._config_dict['use_separable_conv']: - conv_kwargs.update({ - 'depthwise_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), 
- 'pointwise_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'bias_initializer': tf.zeros_initializer(), - 'depthwise_regularizer': self._config_dict['kernel_regularizer'], - 'pointwise_regularizer': self._config_dict['kernel_regularizer'], - 'bias_regularizer': self._config_dict['bias_regularizer'], - }) - else: - conv_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'bias_initializer': tf.zeros_initializer(), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - 'bias_regularizer': self._config_dict['bias_regularizer'], - }) - self._mask_regressor = conv_op(name='mask-logits', **conv_kwargs) - - super(MaskHead, self).build(input_shape) - - def call(self, inputs: List[tf.Tensor], training: bool = None): - """Forward pass of mask branch for the Mask-RCNN model. - - Args: - inputs: A `list` of two tensors where - inputs[0]: A `tf.Tensor` of shape [batch_size, num_instances, - roi_height, roi_width, roi_channels], representing the ROI features. - inputs[1]: A `tf.Tensor` of shape [batch_size, num_instances], - representing the classes of the ROIs. - training: A `bool` indicating whether it is in `training` mode. - - Returns: - mask_outputs: A `tf.Tensor` of shape - [batch_size, num_instances, roi_height * upsample_factor, - roi_width * upsample_factor], representing the mask predictions. - """ - roi_features, roi_classes = inputs - batch_size, num_rois, height, width, filters = ( - roi_features.get_shape().as_list()) - if batch_size is None: - batch_size = tf.shape(roi_features)[0] - - x = tf.reshape(roi_features, [-1, height, width, filters]) - for conv, bn in zip(self._convs, self._conv_norms): - x = conv(x) - x = bn(x) - x = self._activation(x) - - x = self._deconv(x) - x = self._deconv_bn(x) - x = self._activation(x) - - logits = self._mask_regressor(x) - - mask_height = height * self._config_dict['upsample_factor'] - mask_width = width * self._config_dict['upsample_factor'] - - if self._config_dict['class_agnostic']: - logits = tf.reshape(logits, [-1, num_rois, mask_height, mask_width, 1]) - else: - logits = tf.reshape( - logits, - [-1, num_rois, mask_height, mask_width, - self._config_dict['num_classes']]) - - batch_indices = tf.tile( - tf.expand_dims(tf.range(batch_size), axis=1), [1, num_rois]) - mask_indices = tf.tile( - tf.expand_dims(tf.range(num_rois), axis=0), [batch_size, 1]) - - if self._config_dict['class_agnostic']: - class_gather_indices = tf.zeros_like(roi_classes, dtype=tf.int32) - else: - class_gather_indices = tf.cast(roi_classes, dtype=tf.int32) - - gather_indices = tf.stack( - [batch_indices, mask_indices, class_gather_indices], - axis=2) - mask_outputs = tf.gather_nd( - tf.transpose(logits, [0, 1, 4, 2, 3]), gather_indices) - return mask_outputs - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) diff --git a/official/vision/beta/modeling/heads/instance_heads_test.py b/official/vision/beta/modeling/heads/instance_heads_test.py deleted file mode 100644 index d485e05a4..000000000 --- a/official/vision/beta/modeling/heads/instance_heads_test.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for instance_heads.py.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.vision.beta.modeling.heads import instance_heads - - -class DetectionHeadTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (0, 0, False, False), - (0, 1, False, False), - (1, 0, False, False), - (1, 1, False, False), - ) - def test_forward(self, num_convs, num_fcs, use_separable_conv, use_sync_bn): - detection_head = instance_heads.DetectionHead( - num_classes=3, - num_convs=num_convs, - num_filters=16, - use_separable_conv=use_separable_conv, - num_fcs=num_fcs, - fc_dims=4, - activation='relu', - use_sync_bn=use_sync_bn, - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_regularizer=None, - bias_regularizer=None, - ) - roi_features = np.random.rand(2, 10, 128, 128, 16) - scores, boxes = detection_head(roi_features) - self.assertAllEqual(scores.numpy().shape, [2, 10, 3]) - self.assertAllEqual(boxes.numpy().shape, [2, 10, 12]) - - def test_serialize_deserialize(self): - detection_head = instance_heads.DetectionHead( - num_classes=91, - num_convs=0, - num_filters=256, - use_separable_conv=False, - num_fcs=2, - fc_dims=1024, - activation='relu', - use_sync_bn=False, - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_regularizer=None, - bias_regularizer=None, - ) - config = detection_head.get_config() - new_detection_head = instance_heads.DetectionHead.from_config(config) - self.assertAllEqual( - detection_head.get_config(), new_detection_head.get_config()) - - -class MaskHeadTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (1, 1, False), - (1, 2, False), - (2, 1, False), - (2, 2, False), - ) - def test_forward(self, upsample_factor, num_convs, use_sync_bn): - mask_head = instance_heads.MaskHead( - num_classes=3, - upsample_factor=upsample_factor, - num_convs=num_convs, - num_filters=16, - use_separable_conv=False, - activation='relu', - use_sync_bn=use_sync_bn, - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_regularizer=None, - bias_regularizer=None, - ) - roi_features = np.random.rand(2, 10, 14, 14, 16) - roi_classes = np.zeros((2, 10)) - masks = mask_head([roi_features, roi_classes]) - self.assertAllEqual( - masks.numpy().shape, - [2, 10, 14 * upsample_factor, 14 * upsample_factor]) - - def test_serialize_deserialize(self): - mask_head = instance_heads.MaskHead( - num_classes=3, - upsample_factor=2, - num_convs=1, - num_filters=256, - use_separable_conv=False, - activation='relu', - use_sync_bn=False, - norm_momentum=0.99, - norm_epsilon=0.001, - kernel_regularizer=None, - bias_regularizer=None, - ) - config = mask_head.get_config() - new_mask_head = instance_heads.MaskHead.from_config(config) - self.assertAllEqual( - mask_head.get_config(), new_mask_head.get_config()) - - def test_forward_class_agnostic(self): - mask_head = instance_heads.MaskHead( - num_classes=3, - class_agnostic=True - ) - roi_features = np.random.rand(2, 10, 14, 14, 16) - roi_classes = np.zeros((2, 10)) - masks = mask_head([roi_features, roi_classes]) - 
self.assertAllEqual(masks.numpy().shape, [2, 10, 28, 28]) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/heads/segmentation_heads.py b/official/vision/beta/modeling/heads/segmentation_heads.py deleted file mode 100644 index 5d6a4d954..000000000 --- a/official/vision/beta/modeling/heads/segmentation_heads.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of segmentation heads.""" -from typing import List, Union, Optional, Mapping, Tuple, Any -import tensorflow as tf - -from official.modeling import tf_utils -from official.vision.beta.modeling.layers import nn_layers -from official.vision.beta.ops import spatial_transform_ops - - -class MaskScoring(tf.keras.Model): - """Creates a mask scoring layer. - - This implements mask scoring layer from the paper: - - Zhaojin Huang, Lichao Huang, Yongchao Gong, Chang Huang, Xinggang Wang. - Mask Scoring R-CNN. - (https://arxiv.org/pdf/1903.00241.pdf) - """ - - def __init__( - self, - num_classes: int, - fc_input_size: List[int], - num_convs: int = 3, - num_filters: int = 256, - fc_dims: int = 1024, - num_fcs: int = 2, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - - """Initializes mask scoring layer. - - Args: - num_classes: An `int` for number of classes. - fc_input_size: A List of `int` for the input size of the - fully connected layers. - num_convs: An`int` for number of conv layers. - num_filters: An `int` for the number of filters for conv layers. - fc_dims: An `int` number of filters for each fully connected layers. - num_fcs: An `int` for number of fully connected layers. - activation: A `str` name of the activation function. - use_sync_bn: A bool, whether or not to use sync batch normalization. - norm_momentum: A float for the momentum in BatchNorm. Defaults to 0.99. - norm_epsilon: A float for the epsilon value in BatchNorm. Defaults to - 0.001. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - **kwargs: Additional keyword arguments to be passed. 
- """ - super(MaskScoring, self).__init__(**kwargs) - - self._config_dict = { - 'num_classes': num_classes, - 'num_convs': num_convs, - 'num_filters': num_filters, - 'fc_input_size': fc_input_size, - 'fc_dims': fc_dims, - 'num_fcs': num_fcs, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - 'activation': activation, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - } - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation = tf_utils.get_activation(activation) - - def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]): - """Creates the variables of the mask scoring head.""" - conv_op = tf.keras.layers.Conv2D - conv_kwargs = { - 'filters': self._config_dict['num_filters'], - 'kernel_size': 3, - 'padding': 'same', - } - conv_kwargs.update({ - 'kernel_initializer': tf.keras.initializers.VarianceScaling( - scale=2, mode='fan_out', distribution='untruncated_normal'), - 'bias_initializer': tf.zeros_initializer(), - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - 'bias_regularizer': self._config_dict['bias_regularizer'], - }) - bn_op = (tf.keras.layers.experimental.SyncBatchNormalization - if self._config_dict['use_sync_bn'] - else tf.keras.layers.BatchNormalization) - bn_kwargs = { - 'axis': self._bn_axis, - 'momentum': self._config_dict['norm_momentum'], - 'epsilon': self._config_dict['norm_epsilon'], - } - - self._convs = [] - self._conv_norms = [] - for i in range(self._config_dict['num_convs']): - conv_name = 'mask-scoring_{}'.format(i) - self._convs.append(conv_op(name=conv_name, **conv_kwargs)) - bn_name = 'mask-scoring-bn_{}'.format(i) - self._conv_norms.append(bn_op(name=bn_name, **bn_kwargs)) - - self._fcs = [] - self._fc_norms = [] - for i in range(self._config_dict['num_fcs']): - fc_name = 'mask-scoring-fc_{}'.format(i) - self._fcs.append( - tf.keras.layers.Dense( - units=self._config_dict['fc_dims'], - kernel_initializer=tf.keras.initializers.VarianceScaling( - scale=1 / 3.0, mode='fan_out', distribution='uniform'), - kernel_regularizer=self._config_dict['kernel_regularizer'], - bias_regularizer=self._config_dict['bias_regularizer'], - name=fc_name)) - bn_name = 'mask-scoring-fc-bn_{}'.format(i) - self._fc_norms.append(bn_op(name=bn_name, **bn_kwargs)) - - self._classifier = tf.keras.layers.Dense( - units=self._config_dict['num_classes'], - kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01), - bias_initializer=tf.zeros_initializer(), - kernel_regularizer=self._config_dict['kernel_regularizer'], - bias_regularizer=self._config_dict['bias_regularizer'], - name='iou-scores') - - super(MaskScoring, self).build(input_shape) - - def call(self, inputs: tf.Tensor, training: bool = None): - """Forward pass mask scoring head. - - Args: - inputs: A `tf.Tensor` of the shape [batch_size, width, size, num_classes], - representing the segmentation logits. - training: a `bool` indicating whether it is in `training` mode. - - Returns: - mask_scores: A `tf.Tensor` of predicted mask scores - [batch_size, num_classes]. - """ - x = tf.stop_gradient(inputs) - for conv, bn in zip(self._convs, self._conv_norms): - x = conv(x) - x = bn(x) - x = self._activation(x) - - # Casts feat to float32 so the resize op can be run on TPU. 
- x = tf.cast(x, tf.float32) - x = tf.image.resize(x, size=self._config_dict['fc_input_size'], - method=tf.image.ResizeMethod.BILINEAR) - # Casts it back to be compatible with the rest opetations. - x = tf.cast(x, inputs.dtype) - - _, h, w, filters = x.get_shape().as_list() - x = tf.reshape(x, [-1, h * w * filters]) - - for fc, bn in zip(self._fcs, self._fc_norms): - x = fc(x) - x = bn(x) - x = self._activation(x) - - ious = self._classifier(x) - return ious - - def get_config(self) -> Mapping[str, Any]: - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class SegmentationHead(tf.keras.layers.Layer): - """Creates a segmentation head.""" - - def __init__( - self, - num_classes: int, - level: Union[int, str], - num_convs: int = 2, - num_filters: int = 256, - use_depthwise_convolution: bool = False, - prediction_kernel_size: int = 1, - upsample_factor: int = 1, - feature_fusion: Optional[str] = None, - decoder_min_level: Optional[int] = None, - decoder_max_level: Optional[int] = None, - low_level: int = 2, - low_level_num_filters: int = 48, - num_decoder_filters: int = 256, - activation: str = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - """Initializes a segmentation head. - - Args: - num_classes: An `int` number of mask classification categories. The number - of classes does not include background class. - level: An `int` or `str`, level to use to build segmentation head. - num_convs: An `int` number of stacked convolution before the last - prediction layer. - num_filters: An `int` number to specify the number of filters used. - Default is 256. - use_depthwise_convolution: A bool to specify if use depthwise separable - convolutions. - prediction_kernel_size: An `int` number to specify the kernel size of the - prediction layer. - upsample_factor: An `int` number to specify the upsampling factor to - generate finer mask. Default 1 means no upsampling is applied. - feature_fusion: One of `deeplabv3plus`, `pyramid_fusion`, - `panoptic_fpn_fusion`, or None. If `deeplabv3plus`, features from - decoder_features[level] will be fused with low level feature maps from - backbone. If `pyramid_fusion`, multiscale features will be resized and - fused at the target level. - decoder_min_level: An `int` of minimum level from decoder to use in - feature fusion. It is only used when feature_fusion is set to - `panoptic_fpn_fusion`. - decoder_max_level: An `int` of maximum level from decoder to use in - feature fusion. It is only used when feature_fusion is set to - `panoptic_fpn_fusion`. - low_level: An `int` of backbone level to be used for feature fusion. It is - used when feature_fusion is set to `deeplabv3plus`. - low_level_num_filters: An `int` of reduced number of filters for the low - level features before fusing it with higher level features. It is only - used when feature_fusion is set to `deeplabv3plus`. - num_decoder_filters: An `int` of number of filters in the decoder outputs. - It is only used when feature_fusion is set to `panoptic_fpn_fusion`. - activation: A `str` that indicates which activation is used, e.g. 'relu', - 'swish', etc. - use_sync_bn: A `bool` that indicates whether to use synchronized batch - normalization across different replicas. 
- norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - **kwargs: Additional keyword arguments to be passed. - """ - super(SegmentationHead, self).__init__(**kwargs) - - self._config_dict = { - 'num_classes': num_classes, - 'level': level, - 'num_convs': num_convs, - 'num_filters': num_filters, - 'use_depthwise_convolution': use_depthwise_convolution, - 'prediction_kernel_size': prediction_kernel_size, - 'upsample_factor': upsample_factor, - 'feature_fusion': feature_fusion, - 'decoder_min_level': decoder_min_level, - 'decoder_max_level': decoder_max_level, - 'low_level': low_level, - 'low_level_num_filters': low_level_num_filters, - 'num_decoder_filters': num_decoder_filters, - 'activation': activation, - 'use_sync_bn': use_sync_bn, - 'norm_momentum': norm_momentum, - 'norm_epsilon': norm_epsilon, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer - } - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation = tf_utils.get_activation(activation) - - def build(self, input_shape: Union[tf.TensorShape, List[tf.TensorShape]]): - """Creates the variables of the segmentation head.""" - use_depthwise_convolution = self._config_dict['use_depthwise_convolution'] - random_initializer = tf.keras.initializers.RandomNormal(stddev=0.01) - conv_op = tf.keras.layers.Conv2D - conv_kwargs = { - 'kernel_size': 3 if not use_depthwise_convolution else 1, - 'padding': 'same', - 'use_bias': False, - 'kernel_initializer': random_initializer, - 'kernel_regularizer': self._config_dict['kernel_regularizer'], - } - bn_op = (tf.keras.layers.experimental.SyncBatchNormalization - if self._config_dict['use_sync_bn'] - else tf.keras.layers.BatchNormalization) - bn_kwargs = { - 'axis': self._bn_axis, - 'momentum': self._config_dict['norm_momentum'], - 'epsilon': self._config_dict['norm_epsilon'], - } - - if self._config_dict['feature_fusion'] == 'deeplabv3plus': - # Deeplabv3+ feature fusion layers. - self._dlv3p_conv = conv_op( - kernel_size=1, - padding='same', - use_bias=False, - kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01), - kernel_regularizer=self._config_dict['kernel_regularizer'], - name='segmentation_head_deeplabv3p_fusion_conv', - filters=self._config_dict['low_level_num_filters']) - - self._dlv3p_norm = bn_op( - name='segmentation_head_deeplabv3p_fusion_norm', **bn_kwargs) - - elif self._config_dict['feature_fusion'] == 'panoptic_fpn_fusion': - self._panoptic_fpn_fusion = nn_layers.PanopticFPNFusion( - min_level=self._config_dict['decoder_min_level'], - max_level=self._config_dict['decoder_max_level'], - target_level=self._config_dict['level'], - num_filters=self._config_dict['num_filters'], - num_fpn_filters=self._config_dict['num_decoder_filters'], - activation=self._config_dict['activation'], - kernel_regularizer=self._config_dict['kernel_regularizer'], - bias_regularizer=self._config_dict['bias_regularizer']) - - # Segmentation head layers. 
- self._convs = [] - self._norms = [] - for i in range(self._config_dict['num_convs']): - if use_depthwise_convolution: - self._convs.append( - tf.keras.layers.DepthwiseConv2D( - name='segmentation_head_depthwise_conv_{}'.format(i), - kernel_size=3, - padding='same', - use_bias=False, - depthwise_initializer=random_initializer, - depthwise_regularizer=self._config_dict['kernel_regularizer'], - depth_multiplier=1)) - norm_name = 'segmentation_head_depthwise_norm_{}'.format(i) - self._norms.append(bn_op(name=norm_name, **bn_kwargs)) - conv_name = 'segmentation_head_conv_{}'.format(i) - self._convs.append( - conv_op( - name=conv_name, - filters=self._config_dict['num_filters'], - **conv_kwargs)) - norm_name = 'segmentation_head_norm_{}'.format(i) - self._norms.append(bn_op(name=norm_name, **bn_kwargs)) - - self._classifier = conv_op( - name='segmentation_output', - filters=self._config_dict['num_classes'], - kernel_size=self._config_dict['prediction_kernel_size'], - padding='same', - bias_initializer=tf.zeros_initializer(), - kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01), - kernel_regularizer=self._config_dict['kernel_regularizer'], - bias_regularizer=self._config_dict['bias_regularizer']) - - super().build(input_shape) - - def call(self, inputs: Tuple[Union[tf.Tensor, Mapping[str, tf.Tensor]], - Union[tf.Tensor, Mapping[str, tf.Tensor]]]): - """Forward pass of the segmentation head. - - It supports both a tuple of 2 tensors or 2 dictionaries. The first is - backbone endpoints, and the second is decoder endpoints. When inputs are - tensors, they are from a single level of feature maps. When inputs are - dictionaries, they contain multiple levels of feature maps, where the key - is the index of feature map. - - Args: - inputs: A tuple of 2 feature map tensors of shape - [batch, height_l, width_l, channels] or 2 dictionaries of tensors: - - key: A `str` of the level of the multilevel features. - - values: A `tf.Tensor` of the feature map tensors, whose shape is - [batch, height_l, width_l, channels]. - The first is backbone endpoints, and the second is decoder endpoints. - Returns: - segmentation prediction mask: A `tf.Tensor` of the segmentation mask - scores predicted from input features. 
- """ - - backbone_output = inputs[0] - decoder_output = inputs[1] - if self._config_dict['feature_fusion'] == 'deeplabv3plus': - # deeplabv3+ feature fusion - x = decoder_output[str(self._config_dict['level'])] if isinstance( - decoder_output, dict) else decoder_output - y = backbone_output[str(self._config_dict['low_level'])] if isinstance( - backbone_output, dict) else backbone_output - y = self._dlv3p_norm(self._dlv3p_conv(y)) - y = self._activation(y) - - x = tf.image.resize( - x, tf.shape(y)[1:3], method=tf.image.ResizeMethod.BILINEAR) - x = tf.cast(x, dtype=y.dtype) - x = tf.concat([x, y], axis=self._bn_axis) - elif self._config_dict['feature_fusion'] == 'pyramid_fusion': - if not isinstance(decoder_output, dict): - raise ValueError('Only support dictionary decoder_output.') - x = nn_layers.pyramid_feature_fusion(decoder_output, - self._config_dict['level']) - elif self._config_dict['feature_fusion'] == 'panoptic_fpn_fusion': - x = self._panoptic_fpn_fusion(decoder_output) - else: - x = decoder_output[str(self._config_dict['level'])] if isinstance( - decoder_output, dict) else decoder_output - - for conv, norm in zip(self._convs, self._norms): - x = conv(x) - x = norm(x) - x = self._activation(x) - if self._config_dict['upsample_factor'] > 1: - x = spatial_transform_ops.nearest_upsampling( - x, scale=self._config_dict['upsample_factor']) - - return self._classifier(x) - - def get_config(self): - base_config = super().get_config() - return dict(list(base_config.items()) + list(self._config_dict.items())) - - @classmethod - def from_config(cls, config): - return cls(**config) diff --git a/official/vision/beta/modeling/heads/segmentation_heads_test.py b/official/vision/beta/modeling/heads/segmentation_heads_test.py deleted file mode 100644 index 8d8bccba9..000000000 --- a/official/vision/beta/modeling/heads/segmentation_heads_test.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
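For context on the head removed above, here is a minimal sketch of how `SegmentationHead` is driven, mirroring the `call` contract in its docstring (a tuple of backbone and decoder endpoints, as tensors or level-keyed dicts). This snippet is illustrative only and not part of the patch; the module path is the pre-deletion one and the shapes are arbitrary. It uses the `deeplabv3plus` fusion path, which the test below does not cover.

```python
# Illustrative sketch only: exercise the (pre-deletion) SegmentationHead with
# dict inputs keyed by pyramid level, using `deeplabv3plus` feature fusion.
import numpy as np

from official.vision.beta.modeling.heads import segmentation_heads

backbone_features = {'2': np.random.rand(2, 256, 256, 16).astype(np.float32)}
decoder_features = {'3': np.random.rand(2, 128, 128, 64).astype(np.float32)}

head = segmentation_heads.SegmentationHead(
    num_classes=10,
    level=3,
    feature_fusion='deeplabv3plus',
    low_level=2,
    low_level_num_filters=48)

# With deeplabv3plus fusion, the decoder feature at `level` is resized up to
# the low-level backbone feature and concatenated with it, so the logits come
# out at that resolution: [2, 256, 256, 10] here (before `upsample_factor`).
logits = head((backbone_features, decoder_features))
```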
- -"""Tests for segmentation_heads.py.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.vision.beta.modeling.heads import segmentation_heads - - -class SegmentationHeadTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (2, 'pyramid_fusion', None, None), - (3, 'pyramid_fusion', None, None), - (2, 'panoptic_fpn_fusion', 2, 5), - (2, 'panoptic_fpn_fusion', 2, 6), - (3, 'panoptic_fpn_fusion', 3, 5), - (3, 'panoptic_fpn_fusion', 3, 6)) - def test_forward(self, level, feature_fusion, - decoder_min_level, decoder_max_level): - backbone_features = { - '3': np.random.rand(2, 128, 128, 16), - '4': np.random.rand(2, 64, 64, 16), - '5': np.random.rand(2, 32, 32, 16), - } - decoder_features = { - '3': np.random.rand(2, 128, 128, 64), - '4': np.random.rand(2, 64, 64, 64), - '5': np.random.rand(2, 32, 32, 64), - '6': np.random.rand(2, 16, 16, 64), - } - - if feature_fusion == 'panoptic_fpn_fusion': - backbone_features['2'] = np.random.rand(2, 256, 256, 16) - decoder_features['2'] = np.random.rand(2, 256, 256, 64) - - head = segmentation_heads.SegmentationHead( - num_classes=10, - level=level, - feature_fusion=feature_fusion, - decoder_min_level=decoder_min_level, - decoder_max_level=decoder_max_level, - num_decoder_filters=64) - - logits = head((backbone_features, decoder_features)) - - if level in decoder_features: - self.assertAllEqual(logits.numpy().shape, [ - 2, decoder_features[str(level)].shape[1], - decoder_features[str(level)].shape[2], 10 - ]) - - def test_serialize_deserialize(self): - head = segmentation_heads.SegmentationHead(num_classes=10, level=3) - config = head.get_config() - new_head = segmentation_heads.SegmentationHead.from_config(config) - self.assertAllEqual(head.get_config(), new_head.get_config()) - - -class MaskScoringHeadTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (1, 1, 64, [4, 4]), - (2, 1, 64, [4, 4]), - (3, 1, 64, [4, 4]), - (1, 2, 32, [8, 8]), - (2, 2, 32, [8, 8]), - (3, 2, 32, [8, 8]),) - def test_forward(self, num_convs, num_fcs, - num_filters, fc_input_size): - features = np.random.rand(2, 64, 64, 16) - - head = segmentation_heads.MaskScoring( - num_classes=2, - num_convs=num_convs, - num_filters=num_filters, - fc_dims=128, - fc_input_size=fc_input_size) - - scores = head(features) - self.assertAllEqual(scores.numpy().shape, [2, 2]) - - def test_serialize_deserialize(self): - head = segmentation_heads.MaskScoring( - num_classes=2, fc_input_size=[4, 4], fc_dims=128) - config = head.get_config() - new_head = segmentation_heads.MaskScoring.from_config(config) - self.assertAllEqual(head.get_config(), new_head.get_config()) - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/layers/__init__.py b/official/vision/beta/modeling/layers/__init__.py deleted file mode 100644 index df5b4c062..000000000 --- a/official/vision/beta/modeling/layers/__init__.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Layers package definition.""" - -from official.vision.beta.modeling.layers.box_sampler import BoxSampler -from official.vision.beta.modeling.layers.detection_generator import DetectionGenerator -from official.vision.beta.modeling.layers.detection_generator import MultilevelDetectionGenerator -from official.vision.beta.modeling.layers.mask_sampler import MaskSampler -from official.vision.beta.modeling.layers.nn_blocks import BottleneckBlock -from official.vision.beta.modeling.layers.nn_blocks import BottleneckResidualInner -from official.vision.beta.modeling.layers.nn_blocks import DepthwiseSeparableConvBlock -from official.vision.beta.modeling.layers.nn_blocks import InvertedBottleneckBlock -from official.vision.beta.modeling.layers.nn_blocks import ResidualBlock -from official.vision.beta.modeling.layers.nn_blocks import ResidualInner -from official.vision.beta.modeling.layers.nn_blocks import ReversibleLayer -from official.vision.beta.modeling.layers.nn_blocks_3d import BottleneckBlock3D -from official.vision.beta.modeling.layers.nn_blocks_3d import SelfGating -from official.vision.beta.modeling.layers.nn_layers import CausalConvMixin -from official.vision.beta.modeling.layers.nn_layers import Conv2D -from official.vision.beta.modeling.layers.nn_layers import Conv3D -from official.vision.beta.modeling.layers.nn_layers import DepthwiseConv2D -from official.vision.beta.modeling.layers.nn_layers import GlobalAveragePool3D -from official.vision.beta.modeling.layers.nn_layers import PositionalEncoding -from official.vision.beta.modeling.layers.nn_layers import Scale -from official.vision.beta.modeling.layers.nn_layers import SpatialAveragePool3D -from official.vision.beta.modeling.layers.nn_layers import SqueezeExcitation -from official.vision.beta.modeling.layers.nn_layers import StochasticDepth -from official.vision.beta.modeling.layers.nn_layers import TemporalSoftmaxPool -from official.vision.beta.modeling.layers.roi_aligner import MultilevelROIAligner -from official.vision.beta.modeling.layers.roi_generator import MultilevelROIGenerator -from official.vision.beta.modeling.layers.roi_sampler import ROISampler diff --git a/official/vision/beta/modeling/layers/box_sampler.py b/official/vision/beta/modeling/layers/box_sampler.py deleted file mode 100644 index f96898ad4..000000000 --- a/official/vision/beta/modeling/layers/box_sampler.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Contains definitions of box sampler.""" - -# Import libraries -import tensorflow as tf - -from official.vision.beta.ops import sampling_ops - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class BoxSampler(tf.keras.layers.Layer): - """Creates a BoxSampler to sample positive and negative boxes.""" - - def __init__(self, - num_samples: int = 512, - foreground_fraction: float = 0.25, - **kwargs): - """Initializes a box sampler. - - Args: - num_samples: An `int` of the number of sampled boxes per image. - foreground_fraction: A `float` in [0, 1], what percentage of boxes should - be sampled from the positive examples. - **kwargs: Additional keyword arguments passed to Layer. - """ - self._config_dict = { - 'num_samples': num_samples, - 'foreground_fraction': foreground_fraction, - } - super(BoxSampler, self).__init__(**kwargs) - - def call(self, positive_matches: tf.Tensor, negative_matches: tf.Tensor, - ignored_matches: tf.Tensor): - """Samples and selects positive and negative instances. - - Args: - positive_matches: A `bool` tensor of shape of [batch, N] where N is the - number of instances. For each element, `True` means the instance - corresponds to a positive example. - negative_matches: A `bool` tensor of shape of [batch, N] where N is the - number of instances. For each element, `True` means the instance - corresponds to a negative example. - ignored_matches: A `bool` tensor of shape of [batch, N] where N is the - number of instances. For each element, `True` means the instance should - be ignored. - - Returns: - A `tf.tensor` of shape of [batch_size, K], storing the indices of the - sampled examples, where K is `num_samples`. - """ - sample_candidates = tf.logical_and( - tf.logical_or(positive_matches, negative_matches), - tf.logical_not(ignored_matches)) - - sampler = sampling_ops.BalancedPositiveNegativeSampler( - positive_fraction=self._config_dict['foreground_fraction'], - is_static=True) - - batch_size = sample_candidates.shape[0] - sampled_indicators = [] - for i in range(batch_size): - sampled_indicator = sampler.subsample( - sample_candidates[i], - self._config_dict['num_samples'], - positive_matches[i]) - sampled_indicators.append(sampled_indicator) - sampled_indicators = tf.stack(sampled_indicators) - _, selected_indices = tf.nn.top_k( - tf.cast(sampled_indicators, dtype=tf.int32), - k=self._config_dict['num_samples'], - sorted=True) - - return selected_indices - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) diff --git a/official/vision/beta/modeling/layers/deeplab.py b/official/vision/beta/modeling/layers/deeplab.py deleted file mode 100644 index d48724f02..000000000 --- a/official/vision/beta/modeling/layers/deeplab.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
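As a usage note for the sampler deleted above, the following sketch (illustrative only, not part of the patch) shows the `call` contract: boolean match tensors of shape `[batch, N]` in, `[batch, num_samples]` indices out. Which indices are returned depends on the underlying balanced positive/negative sampler, so only the shapes are asserted here.

```python
# Illustrative sketch only: subsample 4 of 8 candidate boxes for one image,
# keeping roughly `foreground_fraction` of the samples positive.
import tensorflow as tf

from official.vision.beta.modeling.layers import box_sampler

positive_matches = tf.constant(
    [[True, False, True, False, False, True, False, False]])
negative_matches = tf.constant(
    [[False, True, False, True, True, False, True, False]])
ignored_matches = tf.constant(
    [[False, False, False, False, False, False, False, True]])

sampler = box_sampler.BoxSampler(num_samples=4, foreground_fraction=0.5)
# Shape [1, 4]: indices into the 8 candidates; the ignored box (index 7) is
# never selected as a candidate.
selected_indices = sampler(positive_matches, negative_matches, ignored_matches)
```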
-
-"""Layers for DeepLabV3."""
-
-import tensorflow as tf
-
-
-class SpatialPyramidPooling(tf.keras.layers.Layer):
-  """Implements the Atrous Spatial Pyramid Pooling.
-
-  References:
-    [Rethinking Atrous Convolution for Semantic Image Segmentation](
-      https://arxiv.org/pdf/1706.05587.pdf)
-    [Encoder-Decoder with Atrous Separable Convolution for Semantic Image
-      Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
-  """
-
-  def __init__(
-      self,
-      output_channels,
-      dilation_rates,
-      pool_kernel_size=None,
-      use_sync_bn=False,
-      batchnorm_momentum=0.99,
-      batchnorm_epsilon=0.001,
-      activation='relu',
-      dropout=0.5,
-      kernel_initializer='glorot_uniform',
-      kernel_regularizer=None,
-      interpolation='bilinear',
-      use_depthwise_convolution=False,
-      **kwargs):
-    """Initializes `SpatialPyramidPooling`.
-
-    Args:
-      output_channels: Number of channels produced by SpatialPyramidPooling.
-      dilation_rates: A list of integers for parallel dilated conv.
-      pool_kernel_size: A list of integers or None. If None, global average
-        pooling is applied, otherwise an average pooling of pool_kernel_size
-        is applied.
-      use_sync_bn: A bool, whether or not to use sync batch normalization.
-      batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to
-        0.99.
-      batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults to
-        0.001.
-      activation: A `str` for type of activation to be used. Defaults to 'relu'.
-      dropout: A float for the dropout rate before output. Defaults to 0.5.
-      kernel_initializer: Kernel initializer for conv layers. Defaults to
-        `glorot_uniform`.
-      kernel_regularizer: Kernel regularizer for conv layers. Defaults to None.
-      interpolation: The interpolation method for upsampling. Defaults to
-        `bilinear`.
-      use_depthwise_convolution: Allows spatial pooling to be separable
-        depthwise convolutions. [Encoder-Decoder with Atrous Separable
-        Convolution for Semantic Image Segmentation](
-        https://arxiv.org/pdf/1802.02611.pdf)
-      **kwargs: Other keyword arguments for the layer.
- """ - super(SpatialPyramidPooling, self).__init__(**kwargs) - - self.output_channels = output_channels - self.dilation_rates = dilation_rates - self.use_sync_bn = use_sync_bn - self.batchnorm_momentum = batchnorm_momentum - self.batchnorm_epsilon = batchnorm_epsilon - self.activation = activation - self.dropout = dropout - self.kernel_initializer = tf.keras.initializers.get(kernel_initializer) - self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) - self.interpolation = interpolation - self.input_spec = tf.keras.layers.InputSpec(ndim=4) - self.pool_kernel_size = pool_kernel_size - self.use_depthwise_convolution = use_depthwise_convolution - - def build(self, input_shape): - height = input_shape[1] - width = input_shape[2] - channels = input_shape[3] - - self.aspp_layers = [] - - if self.use_sync_bn: - bn_op = tf.keras.layers.experimental.SyncBatchNormalization - else: - bn_op = tf.keras.layers.BatchNormalization - - if tf.keras.backend.image_data_format() == 'channels_last': - bn_axis = -1 - else: - bn_axis = 1 - - conv_sequential = tf.keras.Sequential([ - tf.keras.layers.Conv2D( - filters=self.output_channels, kernel_size=(1, 1), - kernel_initializer=self.kernel_initializer, - kernel_regularizer=self.kernel_regularizer, - use_bias=False), - bn_op( - axis=bn_axis, - momentum=self.batchnorm_momentum, - epsilon=self.batchnorm_epsilon), - tf.keras.layers.Activation(self.activation) - ]) - self.aspp_layers.append(conv_sequential) - - for dilation_rate in self.dilation_rates: - leading_layers = [] - kernel_size = (3, 3) - if self.use_depthwise_convolution: - leading_layers += [ - tf.keras.layers.DepthwiseConv2D( - depth_multiplier=1, kernel_size=kernel_size, - padding='same', depthwise_regularizer=self.kernel_regularizer, - depthwise_initializer=self.kernel_initializer, - dilation_rate=dilation_rate, use_bias=False) - ] - kernel_size = (1, 1) - conv_sequential = tf.keras.Sequential(leading_layers + [ - tf.keras.layers.Conv2D( - filters=self.output_channels, kernel_size=kernel_size, - padding='same', kernel_regularizer=self.kernel_regularizer, - kernel_initializer=self.kernel_initializer, - dilation_rate=dilation_rate, use_bias=False), - bn_op(axis=bn_axis, momentum=self.batchnorm_momentum, - epsilon=self.batchnorm_epsilon), - tf.keras.layers.Activation(self.activation)]) - self.aspp_layers.append(conv_sequential) - - if self.pool_kernel_size is None: - pool_sequential = tf.keras.Sequential([ - tf.keras.layers.GlobalAveragePooling2D(), - tf.keras.layers.Reshape((1, 1, channels))]) - else: - pool_sequential = tf.keras.Sequential([ - tf.keras.layers.AveragePooling2D(self.pool_kernel_size)]) - - pool_sequential.add( - tf.keras.Sequential([ - tf.keras.layers.Conv2D( - filters=self.output_channels, - kernel_size=(1, 1), - kernel_initializer=self.kernel_initializer, - kernel_regularizer=self.kernel_regularizer, - use_bias=False), - bn_op( - axis=bn_axis, - momentum=self.batchnorm_momentum, - epsilon=self.batchnorm_epsilon), - tf.keras.layers.Activation(self.activation), - tf.keras.layers.experimental.preprocessing.Resizing( - height, - width, - interpolation=self.interpolation, - dtype=tf.float32) - ])) - - self.aspp_layers.append(pool_sequential) - - self.projection = tf.keras.Sequential([ - tf.keras.layers.Conv2D( - filters=self.output_channels, kernel_size=(1, 1), - kernel_initializer=self.kernel_initializer, - kernel_regularizer=self.kernel_regularizer, - use_bias=False), - bn_op( - axis=bn_axis, - momentum=self.batchnorm_momentum, - epsilon=self.batchnorm_epsilon), - 
tf.keras.layers.Activation(self.activation),
-        tf.keras.layers.Dropout(rate=self.dropout)])
-
-  def call(self, inputs, training=None):
-    if training is None:
-      training = tf.keras.backend.learning_phase()
-    result = []
-    for layer in self.aspp_layers:
-      result.append(tf.cast(layer(inputs, training=training), inputs.dtype))
-    result = tf.concat(result, axis=-1)
-    result = self.projection(result, training=training)
-    return result
-
-  def get_config(self):
-    config = {
-        'output_channels': self.output_channels,
-        'dilation_rates': self.dilation_rates,
-        'pool_kernel_size': self.pool_kernel_size,
-        'use_sync_bn': self.use_sync_bn,
-        'batchnorm_momentum': self.batchnorm_momentum,
-        'batchnorm_epsilon': self.batchnorm_epsilon,
-        'activation': self.activation,
-        'dropout': self.dropout,
-        'kernel_initializer': tf.keras.initializers.serialize(
-            self.kernel_initializer),
-        'kernel_regularizer': tf.keras.regularizers.serialize(
-            self.kernel_regularizer),
-        'interpolation': self.interpolation,
-    }
-    base_config = super(SpatialPyramidPooling, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
diff --git a/official/vision/beta/modeling/layers/deeplab_test.py b/official/vision/beta/modeling/layers/deeplab_test.py
deleted file mode 100644
index dcb7b1b0c..000000000
--- a/official/vision/beta/modeling/layers/deeplab_test.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for ASPP."""
-
-import tensorflow as tf
-
-from tensorflow.python.keras import keras_parameterized
-from official.vision.beta.modeling.layers import deeplab
-
-
-@keras_parameterized.run_all_keras_modes
-class DeeplabTest(keras_parameterized.TestCase):
-
-  @keras_parameterized.parameterized.parameters(
-      (None,),
-      ([32, 32],),
-  )
-  def test_aspp(self, pool_kernel_size):
-    inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32)
-    layer = deeplab.SpatialPyramidPooling(output_channels=256,
-                                          dilation_rates=[6, 12, 18],
-                                          pool_kernel_size=pool_kernel_size)
-    output = layer(inputs)
-    self.assertAllEqual([None, 64, 64, 256], output.shape)
-
-  def test_aspp_invalid_shape(self):
-    inputs = tf.keras.Input(shape=(64, 64), dtype=tf.float32)
-    layer = deeplab.SpatialPyramidPooling(output_channels=256,
-                                          dilation_rates=[6, 12, 18])
-    with self.assertRaises(ValueError):
-      _ = layer(inputs)
-
-  def test_config_with_custom_name(self):
-    layer = deeplab.SpatialPyramidPooling(256, [5], name='aspp')
-    config = layer.get_config()
-    layer_1 = deeplab.SpatialPyramidPooling.from_config(config)
-    self.assertEqual(layer_1.name, layer.name)
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/official/vision/beta/modeling/layers/detection_generator.py b/official/vision/beta/modeling/layers/detection_generator.py
deleted file mode 100644
index 2ccea3dfd..000000000
--- a/official/vision/beta/modeling/layers/detection_generator.py
+++ /dev/null
@@ -1,852 +0,0 @@
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of generators to generate the final detections.""" -import contextlib -from typing import List, Optional, Mapping -# Import libraries -import tensorflow as tf - -from official.vision.beta.ops import box_ops -from official.vision.beta.ops import nms -from official.vision.beta.ops import preprocess_ops - - -def _generate_detections_v1(boxes: tf.Tensor, - scores: tf.Tensor, - attributes: Optional[Mapping[str, - tf.Tensor]] = None, - pre_nms_top_k: int = 5000, - pre_nms_score_threshold: float = 0.05, - nms_iou_threshold: float = 0.5, - max_num_detections: int = 100, - soft_nms_sigma: Optional[float] = None): - """Generates the final detections given the model outputs. - - The implementation unrolls the batch dimension and process images one by one. - It required the batch dimension to be statically known and it is TPU - compatible. - - Args: - boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or - `[batch_size, N, 1, 4]` for box predictions on all feature levels. The - N is the number of total anchors on all levels. - scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which - stacks class probability on all feature levels. The N is the number of - total anchors on all levels. The num_classes is the number of classes - predicted by the model. Note that the class_outputs here is the raw score. - attributes: None or a dict of (attribute_name, attributes) pairs. Each - attributes is a `tf.Tensor` with shape - `[batch_size, N, num_classes, attribute_size]` or - `[batch_size, N, 1, attribute_size]` for attribute predictions on all - feature levels. The N is the number of total anchors on all levels. Can - be None if no attribute learning is required. - pre_nms_top_k: An `int` number of top candidate detections per class before - NMS. - pre_nms_score_threshold: A `float` representing the threshold for deciding - when to remove boxes based on score. - nms_iou_threshold: A `float` representing the threshold for deciding whether - boxes overlap too much with respect to IOU. - max_num_detections: A scalar representing maximum number of boxes retained - over all classes. - soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS. - When soft_nms_sigma=0.0 (which is default), we fall back to standard NMS. - - Returns: - nms_boxes: A `float` type `tf.Tensor` of shape - `[batch_size, max_num_detections, 4]` representing top detected boxes in - `[y1, x1, y2, x2]`. - nms_scores: A `float` type `tf.Tensor` of shape - `[batch_size, max_num_detections]` representing sorted confidence scores - for detected boxes. The values are between `[0, 1]`. - nms_classes: An `int` type `tf.Tensor` of shape - `[batch_size, max_num_detections]` representing classes for detected - boxes. - valid_detections: An `int` type `tf.Tensor` of shape `[batch_size]` only the - top `valid_detections` boxes are valid detections. - nms_attributes: None or a dict of (attribute_name, attributes). 
Each - attribute is a `float` type `tf.Tensor` of shape - `[batch_size, max_num_detections, attribute_size]` representing attribute - predictions for detected boxes. Can be an empty dict if no attribute - learning is required. - """ - with tf.name_scope('generate_detections'): - batch_size = scores.get_shape().as_list()[0] - nmsed_boxes = [] - nmsed_classes = [] - nmsed_scores = [] - valid_detections = [] - if attributes: - nmsed_attributes = {att_name: [] for att_name in attributes.keys()} - else: - nmsed_attributes = {} - - for i in range(batch_size): - (nmsed_boxes_i, nmsed_scores_i, nmsed_classes_i, valid_detections_i, - nmsed_att_i) = _generate_detections_per_image( - boxes[i], - scores[i], - attributes={ - att_name: att[i] for att_name, att in attributes.items() - } if attributes else {}, - pre_nms_top_k=pre_nms_top_k, - pre_nms_score_threshold=pre_nms_score_threshold, - nms_iou_threshold=nms_iou_threshold, - max_num_detections=max_num_detections, - soft_nms_sigma=soft_nms_sigma) - nmsed_boxes.append(nmsed_boxes_i) - nmsed_scores.append(nmsed_scores_i) - nmsed_classes.append(nmsed_classes_i) - valid_detections.append(valid_detections_i) - if attributes: - for att_name in attributes.keys(): - nmsed_attributes[att_name].append(nmsed_att_i[att_name]) - - nmsed_boxes = tf.stack(nmsed_boxes, axis=0) - nmsed_scores = tf.stack(nmsed_scores, axis=0) - nmsed_classes = tf.stack(nmsed_classes, axis=0) - valid_detections = tf.stack(valid_detections, axis=0) - if attributes: - for att_name in attributes.keys(): - nmsed_attributes[att_name] = tf.stack(nmsed_attributes[att_name], axis=0) - - return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes - - -def _generate_detections_per_image( - boxes: tf.Tensor, - scores: tf.Tensor, - attributes: Optional[Mapping[str, tf.Tensor]] = None, - pre_nms_top_k: int = 5000, - pre_nms_score_threshold: float = 0.05, - nms_iou_threshold: float = 0.5, - max_num_detections: int = 100, - soft_nms_sigma: Optional[float] = None): - """Generates the final detections per image given the model outputs. - - Args: - boxes: A `tf.Tensor` with shape `[N, num_classes, 4]` or `[N, 1, 4]`, which - box predictions on all feature levels. The N is the number of total - anchors on all levels. - scores: A `tf.Tensor` with shape `[N, num_classes]`, which stacks class - probability on all feature levels. The N is the number of total anchors on - all levels. The num_classes is the number of classes predicted by the - model. Note that the class_outputs here is the raw score. - attributes: If not None, a dict of `tf.Tensor`. Each value is in shape - `[N, num_classes, attribute_size]` or `[N, 1, attribute_size]` of - attribute predictions on all feature levels. The N is the number of total - anchors on all levels. - pre_nms_top_k: An `int` number of top candidate detections per class before - NMS. - pre_nms_score_threshold: A `float` representing the threshold for deciding - when to remove boxes based on score. - nms_iou_threshold: A `float` representing the threshold for deciding whether - boxes overlap too much with respect to IOU. - max_num_detections: A `scalar` representing maximum number of boxes retained - over all classes. - soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS. - When soft_nms_sigma=0.0, we fall back to standard NMS. - If set to None, `tf.image.non_max_suppression_padded` is called instead. 
- - Returns: - nms_boxes: A `float` tf.Tensor of shape `[max_num_detections, 4]` - representing top detected boxes in `[y1, x1, y2, x2]`. - nms_scores: A `float` tf.Tensor of shape `[max_num_detections]` representing - sorted confidence scores for detected boxes. The values are between [0, - 1]. - nms_classes: An `int` tf.Tensor of shape `[max_num_detections]` representing - classes for detected boxes. - valid_detections: An `int` tf.Tensor of shape [1] only the top - `valid_detections` boxes are valid detections. - nms_attributes: None or a dict. Each value is a `float` tf.Tensor of shape - `[max_num_detections, attribute_size]` representing attribute predictions - for detected boxes. Can be an empty dict if `attributes` is None. - """ - nmsed_boxes = [] - nmsed_scores = [] - nmsed_classes = [] - num_classes_for_box = boxes.get_shape().as_list()[1] - num_classes = scores.get_shape().as_list()[1] - if attributes: - nmsed_attributes = {att_name: [] for att_name in attributes.keys()} - else: - nmsed_attributes = {} - - for i in range(num_classes): - boxes_i = boxes[:, min(num_classes_for_box - 1, i)] - scores_i = scores[:, i] - # Obtains pre_nms_top_k before running NMS. - scores_i, indices = tf.nn.top_k( - scores_i, k=tf.minimum(tf.shape(scores_i)[-1], pre_nms_top_k)) - boxes_i = tf.gather(boxes_i, indices) - - if soft_nms_sigma is not None: - (nmsed_indices_i, - nmsed_scores_i) = tf.image.non_max_suppression_with_scores( - tf.cast(boxes_i, tf.float32), - tf.cast(scores_i, tf.float32), - max_num_detections, - iou_threshold=nms_iou_threshold, - score_threshold=pre_nms_score_threshold, - soft_nms_sigma=soft_nms_sigma, - name='nms_detections_' + str(i)) - nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i) - nmsed_boxes_i = preprocess_ops.clip_or_pad_to_fixed_size( - nmsed_boxes_i, max_num_detections, 0.0) - nmsed_scores_i = preprocess_ops.clip_or_pad_to_fixed_size( - nmsed_scores_i, max_num_detections, -1.0) - else: - (nmsed_indices_i, - nmsed_num_valid_i) = tf.image.non_max_suppression_padded( - tf.cast(boxes_i, tf.float32), - tf.cast(scores_i, tf.float32), - max_num_detections, - iou_threshold=nms_iou_threshold, - score_threshold=pre_nms_score_threshold, - pad_to_max_output_size=True, - name='nms_detections_' + str(i)) - nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i) - nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i) - # Sets scores of invalid boxes to -1. - nmsed_scores_i = tf.where( - tf.less(tf.range(max_num_detections), [nmsed_num_valid_i]), - nmsed_scores_i, -tf.ones_like(nmsed_scores_i)) - - nmsed_classes_i = tf.fill([max_num_detections], i) - nmsed_boxes.append(nmsed_boxes_i) - nmsed_scores.append(nmsed_scores_i) - nmsed_classes.append(nmsed_classes_i) - if attributes: - for att_name, att in attributes.items(): - num_classes_for_attr = att.get_shape().as_list()[1] - att_i = att[:, min(num_classes_for_attr - 1, i)] - att_i = tf.gather(att_i, indices) - nmsed_att_i = tf.gather(att_i, nmsed_indices_i) - nmsed_att_i = preprocess_ops.clip_or_pad_to_fixed_size( - nmsed_att_i, max_num_detections, 0.0) - nmsed_attributes[att_name].append(nmsed_att_i) - - # Concats results from all classes and sort them. 
- nmsed_boxes = tf.concat(nmsed_boxes, axis=0) - nmsed_scores = tf.concat(nmsed_scores, axis=0) - nmsed_classes = tf.concat(nmsed_classes, axis=0) - nmsed_scores, indices = tf.nn.top_k( - nmsed_scores, k=max_num_detections, sorted=True) - nmsed_boxes = tf.gather(nmsed_boxes, indices) - nmsed_classes = tf.gather(nmsed_classes, indices) - valid_detections = tf.reduce_sum( - tf.cast(tf.greater(nmsed_scores, -1), tf.int32)) - if attributes: - for att_name in attributes.keys(): - nmsed_attributes[att_name] = tf.concat(nmsed_attributes[att_name], axis=0) - nmsed_attributes[att_name] = tf.gather(nmsed_attributes[att_name], - indices) - - return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, nmsed_attributes - - -def _select_top_k_scores(scores_in: tf.Tensor, pre_nms_num_detections: int): - """Selects top_k scores and indices for each class. - - Args: - scores_in: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which - stacks class logit outputs on all feature levels. The N is the number of - total anchors on all levels. The num_classes is the number of classes - predicted by the model. - pre_nms_num_detections: Number of candidates before NMS. - - Returns: - scores and indices: A `tf.Tensor` with shape - `[batch_size, pre_nms_num_detections, num_classes]`. - """ - batch_size, num_anchors, num_class = scores_in.get_shape().as_list() - if batch_size is None: - batch_size = tf.shape(scores_in)[0] - scores_trans = tf.transpose(scores_in, perm=[0, 2, 1]) - scores_trans = tf.reshape(scores_trans, [-1, num_anchors]) - - top_k_scores, top_k_indices = tf.nn.top_k( - scores_trans, k=pre_nms_num_detections, sorted=True) - - top_k_scores = tf.reshape(top_k_scores, - [batch_size, num_class, pre_nms_num_detections]) - top_k_indices = tf.reshape(top_k_indices, - [batch_size, num_class, pre_nms_num_detections]) - - return tf.transpose(top_k_scores, - [0, 2, 1]), tf.transpose(top_k_indices, [0, 2, 1]) - - -def _generate_detections_v2(boxes: tf.Tensor, - scores: tf.Tensor, - pre_nms_top_k: int = 5000, - pre_nms_score_threshold: float = 0.05, - nms_iou_threshold: float = 0.5, - max_num_detections: int = 100): - """Generates the final detections given the model outputs. - - This implementation unrolls classes dimension while using the tf.while_loop - to implement the batched NMS, so that it can be parallelized at the batch - dimension. It should give better performance comparing to v1 implementation. - It is TPU compatible. - - Args: - boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or - `[batch_size, N, 1, 4]`, which box predictions on all feature levels. The - N is the number of total anchors on all levels. - scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which - stacks class probability on all feature levels. The N is the number of - total anchors on all levels. The num_classes is the number of classes - predicted by the model. Note that the class_outputs here is the raw score. - pre_nms_top_k: An `int` number of top candidate detections per class before - NMS. - pre_nms_score_threshold: A `float` representing the threshold for deciding - when to remove boxes based on score. - nms_iou_threshold: A `float` representing the threshold for deciding whether - boxes overlap too much with respect to IOU. - max_num_detections: A `scalar` representing maximum number of boxes retained - over all classes. - - Returns: - nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4] - representing top detected boxes in [y1, x1, y2, x2]. 
- nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections] - representing sorted confidence scores for detected boxes. The values are - between [0, 1]. - nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections] - representing classes for detected boxes. - valid_detections: An `int` tf.Tensor of shape [batch_size] only the top - `valid_detections` boxes are valid detections. - """ - with tf.name_scope('generate_detections'): - nmsed_boxes = [] - nmsed_classes = [] - nmsed_scores = [] - valid_detections = [] - batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list() - if batch_size is None: - batch_size = tf.shape(boxes)[0] - _, total_anchors, num_classes = scores.get_shape().as_list() - # Selects top pre_nms_num scores and indices before NMS. - scores, indices = _select_top_k_scores( - scores, min(total_anchors, pre_nms_top_k)) - for i in range(num_classes): - boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :] - scores_i = scores[:, :, i] - # Obtains pre_nms_top_k before running NMS. - boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1) - - # Filter out scores. - boxes_i, scores_i = box_ops.filter_boxes_by_scores( - boxes_i, scores_i, min_score_threshold=pre_nms_score_threshold) - - (nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded( - tf.cast(scores_i, tf.float32), - tf.cast(boxes_i, tf.float32), - max_num_detections, - iou_threshold=nms_iou_threshold) - nmsed_classes_i = tf.fill([batch_size, max_num_detections], i) - nmsed_boxes.append(nmsed_boxes_i) - nmsed_scores.append(nmsed_scores_i) - nmsed_classes.append(nmsed_classes_i) - nmsed_boxes = tf.concat(nmsed_boxes, axis=1) - nmsed_scores = tf.concat(nmsed_scores, axis=1) - nmsed_classes = tf.concat(nmsed_classes, axis=1) - nmsed_scores, indices = tf.nn.top_k( - nmsed_scores, k=max_num_detections, sorted=True) - nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1) - nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1) - valid_detections = tf.reduce_sum( - input_tensor=tf.cast(tf.greater(nmsed_scores, 0.0), tf.int32), axis=1) - return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections - - -def _generate_detections_batched(boxes: tf.Tensor, scores: tf.Tensor, - pre_nms_score_threshold: float, - nms_iou_threshold: float, - max_num_detections: int): - """Generates detected boxes with scores and classes for one-stage detector. - - The function takes output of multi-level ConvNets and anchor boxes and - generates detected boxes. Note that this used batched nms, which is not - supported on TPU currently. - - Args: - boxes: A `tf.Tensor` with shape `[batch_size, N, num_classes, 4]` or - `[batch_size, N, 1, 4]`, which box predictions on all feature levels. The - N is the number of total anchors on all levels. - scores: A `tf.Tensor` with shape `[batch_size, N, num_classes]`, which - stacks class probability on all feature levels. The N is the number of - total anchors on all levels. The num_classes is the number of classes - predicted by the model. Note that the class_outputs here is the raw score. - pre_nms_score_threshold: A `float` representing the threshold for deciding - when to remove boxes based on score. - nms_iou_threshold: A `float` representing the threshold for deciding whether - boxes overlap too much with respect to IOU. - max_num_detections: A `scalar` representing maximum number of boxes retained - over all classes. 
- - Returns: - nms_boxes: A `float` tf.Tensor of shape [batch_size, max_num_detections, 4] - representing top detected boxes in [y1, x1, y2, x2]. - nms_scores: A `float` tf.Tensor of shape [batch_size, max_num_detections] - representing sorted confidence scores for detected boxes. The values are - between [0, 1]. - nms_classes: An `int` tf.Tensor of shape [batch_size, max_num_detections] - representing classes for detected boxes. - valid_detections: An `int` tf.Tensor of shape [batch_size] only the top - `valid_detections` boxes are valid detections. - """ - with tf.name_scope('generate_detections'): - nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = ( - tf.image.combined_non_max_suppression( - boxes, - scores, - max_output_size_per_class=max_num_detections, - max_total_size=max_num_detections, - iou_threshold=nms_iou_threshold, - score_threshold=pre_nms_score_threshold, - pad_per_class=False, - clip_boxes=False)) - nmsed_classes = tf.cast(nmsed_classes, tf.int32) - return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class DetectionGenerator(tf.keras.layers.Layer): - """Generates the final detected boxes with scores and classes.""" - - def __init__(self, - apply_nms: bool = True, - pre_nms_top_k: int = 5000, - pre_nms_score_threshold: float = 0.05, - nms_iou_threshold: float = 0.5, - max_num_detections: int = 100, - nms_version: str = 'v2', - use_cpu_nms: bool = False, - soft_nms_sigma: Optional[float] = None, - **kwargs): - """Initializes a detection generator. - - Args: - apply_nms: A `bool` of whether or not apply non maximum suppression. - If False, the decoded boxes and their scores are returned. - pre_nms_top_k: An `int` of the number of top scores proposals to be kept - before applying NMS. - pre_nms_score_threshold: A `float` of the score threshold to apply before - applying NMS. Proposals whose scores are below this threshold are - thrown away. - nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold. - max_num_detections: An `int` of the final number of total detections to - generate. - nms_version: A string of `batched`, `v1` or `v2` specifies NMS version. - use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU. - soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS. - When soft_nms_sigma=0.0, we fall back to standard NMS. - **kwargs: Additional keyword arguments passed to Layer. - """ - self._config_dict = { - 'apply_nms': apply_nms, - 'pre_nms_top_k': pre_nms_top_k, - 'pre_nms_score_threshold': pre_nms_score_threshold, - 'nms_iou_threshold': nms_iou_threshold, - 'max_num_detections': max_num_detections, - 'nms_version': nms_version, - 'use_cpu_nms': use_cpu_nms, - 'soft_nms_sigma': soft_nms_sigma, - } - super(DetectionGenerator, self).__init__(**kwargs) - - def __call__(self, - raw_boxes: tf.Tensor, - raw_scores: tf.Tensor, - anchor_boxes: tf.Tensor, - image_shape: tf.Tensor, - regression_weights: Optional[List[float]] = None, - bbox_per_class: bool = True): - """Generates final detections. - - Args: - raw_boxes: A `tf.Tensor` of shape of `[batch_size, K, num_classes * 4]` - representing the class-specific box coordinates relative to anchors. - raw_scores: A `tf.Tensor` of shape of `[batch_size, K, num_classes]` - representing the class logits before applying score activiation. - anchor_boxes: A `tf.Tensor` of shape of `[batch_size, K, 4]` representing - the corresponding anchor boxes w.r.t `box_outputs`. 
-      image_shape: A `tf.Tensor` of shape of `[batch_size, 2]` storing the image
-        height and width w.r.t. the scaled image, i.e. the same image space as
-        `box_outputs` and `anchor_boxes`.
-      regression_weights: A list of four float numbers to scale coordinates.
-      bbox_per_class: A `bool`. If True, perform per-class box regression.
-
-    Returns:
-      If `apply_nms` = True, the return is a dictionary with keys:
-        `detection_boxes`: A `float` tf.Tensor of shape
-          [batch, max_num_detections, 4] representing top detected boxes in
-          [y1, x1, y2, x2].
-        `detection_scores`: A `float` `tf.Tensor` of shape
-          [batch, max_num_detections] representing sorted confidence scores for
-          detected boxes. The values are between [0, 1].
-        `detection_classes`: An `int` tf.Tensor of shape
-          [batch, max_num_detections] representing classes for detected boxes.
-        `num_detections`: An `int` tf.Tensor of shape [batch]; only the first
-          `num_detections` boxes are valid detections.
-      If `apply_nms` = False, the return is a dictionary with keys:
-        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
-          representing all the decoded boxes.
-        `decoded_box_scores`: A `float` tf.Tensor of shape
-          [batch, num_raw_boxes] representing scores of all the decoded boxes.
-    """
-    box_scores = tf.nn.softmax(raw_scores, axis=-1)
-
-    # Removes the background class.
-    box_scores_shape = tf.shape(box_scores)
-    box_scores_shape_list = box_scores.get_shape().as_list()
-    batch_size = box_scores_shape[0]
-    num_locations = box_scores_shape_list[1]
-    num_classes = box_scores_shape_list[-1]
-
-    box_scores = tf.slice(box_scores, [0, 0, 1], [-1, -1, -1])
-
-    if bbox_per_class:
-      num_detections = num_locations * (num_classes - 1)
-      raw_boxes = tf.reshape(raw_boxes,
-                             [batch_size, num_locations, num_classes, 4])
-      raw_boxes = tf.slice(raw_boxes, [0, 0, 1, 0], [-1, -1, -1, -1])
-      anchor_boxes = tf.tile(
-          tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
-      raw_boxes = tf.reshape(raw_boxes, [batch_size, num_detections, 4])
-      anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
-
-    # Box decoding.
-    decoded_boxes = box_ops.decode_boxes(
-        raw_boxes, anchor_boxes, weights=regression_weights)
-
-    # Box clipping.
-    decoded_boxes = box_ops.clip_boxes(
-        decoded_boxes, tf.expand_dims(image_shape, axis=1))
-
-    if bbox_per_class:
-      decoded_boxes = tf.reshape(
-          decoded_boxes, [batch_size, num_locations, num_classes - 1, 4])
-    else:
-      decoded_boxes = tf.expand_dims(decoded_boxes, axis=2)
-
-    if not self._config_dict['apply_nms']:
-      return {
-          'decoded_boxes': decoded_boxes,
-          'decoded_box_scores': box_scores,
-      }
-
-    # Optionally force the NMS to be run on CPU.
- if self._config_dict['use_cpu_nms']: - nms_context = tf.device('cpu:0') - else: - nms_context = contextlib.nullcontext() - - with nms_context: - if self._config_dict['nms_version'] == 'batched': - (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = ( - _generate_detections_batched( - decoded_boxes, box_scores, - self._config_dict['pre_nms_score_threshold'], - self._config_dict['nms_iou_threshold'], - self._config_dict['max_num_detections'])) - elif self._config_dict['nms_version'] == 'v1': - (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, _) = ( - _generate_detections_v1( - decoded_boxes, - box_scores, - pre_nms_top_k=self._config_dict['pre_nms_top_k'], - pre_nms_score_threshold=self - ._config_dict['pre_nms_score_threshold'], - nms_iou_threshold=self._config_dict['nms_iou_threshold'], - max_num_detections=self._config_dict['max_num_detections'], - soft_nms_sigma=self._config_dict['soft_nms_sigma'])) - elif self._config_dict['nms_version'] == 'v2': - (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = ( - _generate_detections_v2( - decoded_boxes, - box_scores, - pre_nms_top_k=self._config_dict['pre_nms_top_k'], - pre_nms_score_threshold=self - ._config_dict['pre_nms_score_threshold'], - nms_iou_threshold=self._config_dict['nms_iou_threshold'], - max_num_detections=self._config_dict['max_num_detections'])) - else: - raise ValueError('NMS version {} not supported.'.format( - self._config_dict['nms_version'])) - - # Adds 1 to offset the background class which has index 0. - nmsed_classes += 1 - - return { - 'num_detections': valid_detections, - 'detection_boxes': nmsed_boxes, - 'detection_classes': nmsed_classes, - 'detection_scores': nmsed_scores, - } - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class MultilevelDetectionGenerator(tf.keras.layers.Layer): - """Generates detected boxes with scores and classes for one-stage detector.""" - - def __init__(self, - apply_nms: bool = True, - pre_nms_top_k: int = 5000, - pre_nms_score_threshold: float = 0.05, - nms_iou_threshold: float = 0.5, - max_num_detections: int = 100, - nms_version: str = 'v1', - use_cpu_nms: bool = False, - soft_nms_sigma: Optional[float] = None, - **kwargs): - """Initializes a multi-level detection generator. - - Args: - apply_nms: A `bool` of whether or not apply non maximum suppression. If - False, the decoded boxes and their scores are returned. - pre_nms_top_k: An `int` of the number of top scores proposals to be kept - before applying NMS. - pre_nms_score_threshold: A `float` of the score threshold to apply before - applying NMS. Proposals whose scores are below this threshold are thrown - away. - nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold. - max_num_detections: An `int` of the final number of total detections to - generate. - nms_version: A string of `batched`, `v1` or `v2` specifies NMS version - use_cpu_nms: A `bool` of whether or not enforce NMS to run on CPU. - soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS. - When soft_nms_sigma=0.0, we fall back to standard NMS. - **kwargs: Additional keyword arguments passed to Layer. 
- """ - self._config_dict = { - 'apply_nms': apply_nms, - 'pre_nms_top_k': pre_nms_top_k, - 'pre_nms_score_threshold': pre_nms_score_threshold, - 'nms_iou_threshold': nms_iou_threshold, - 'max_num_detections': max_num_detections, - 'nms_version': nms_version, - 'use_cpu_nms': use_cpu_nms, - 'soft_nms_sigma': soft_nms_sigma, - } - super(MultilevelDetectionGenerator, self).__init__(**kwargs) - - def _decode_multilevel_outputs( - self, - raw_boxes: Mapping[str, tf.Tensor], - raw_scores: Mapping[str, tf.Tensor], - anchor_boxes: tf.Tensor, - image_shape: tf.Tensor, - raw_attributes: Optional[Mapping[str, tf.Tensor]] = None): - """Collects dict of multilevel boxes, scores, attributes into lists.""" - boxes = [] - scores = [] - if raw_attributes: - attributes = {att_name: [] for att_name in raw_attributes.keys()} - else: - attributes = {} - - levels = list(raw_boxes.keys()) - min_level = int(min(levels)) - max_level = int(max(levels)) - for i in range(min_level, max_level + 1): - raw_boxes_i = raw_boxes[str(i)] - raw_scores_i = raw_scores[str(i)] - batch_size = tf.shape(raw_boxes_i)[0] - (_, feature_h_i, feature_w_i, - num_anchors_per_locations_times_4) = raw_boxes_i.get_shape().as_list() - num_locations = feature_h_i * feature_w_i - num_anchors_per_locations = num_anchors_per_locations_times_4 // 4 - num_classes = raw_scores_i.get_shape().as_list( - )[-1] // num_anchors_per_locations - - # Applies score transformation and remove the implicit background class. - scores_i = tf.sigmoid( - tf.reshape(raw_scores_i, [ - batch_size, num_locations * num_anchors_per_locations, num_classes - ])) - scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1]) - - # Box decoding. - # The anchor boxes are shared for all data in a batch. - # One stage detector only supports class agnostic box regression. - anchor_boxes_i = tf.reshape( - anchor_boxes[str(i)], - [batch_size, num_locations * num_anchors_per_locations, 4]) - raw_boxes_i = tf.reshape( - raw_boxes_i, - [batch_size, num_locations * num_anchors_per_locations, 4]) - boxes_i = box_ops.decode_boxes(raw_boxes_i, anchor_boxes_i) - - # Box clipping. - boxes_i = box_ops.clip_boxes( - boxes_i, tf.expand_dims(image_shape, axis=1)) - - boxes.append(boxes_i) - scores.append(scores_i) - - if raw_attributes: - for att_name, raw_att in raw_attributes.items(): - attribute_size = raw_att[str( - i)].get_shape().as_list()[-1] // num_anchors_per_locations - att_i = tf.reshape(raw_att[str(i)], [ - batch_size, num_locations * num_anchors_per_locations, - attribute_size - ]) - attributes[att_name].append(att_i) - - boxes = tf.concat(boxes, axis=1) - boxes = tf.expand_dims(boxes, axis=2) - scores = tf.concat(scores, axis=1) - - if raw_attributes: - for att_name in raw_attributes.keys(): - attributes[att_name] = tf.concat(attributes[att_name], axis=1) - attributes[att_name] = tf.expand_dims(attributes[att_name], axis=2) - - return boxes, scores, attributes - - def __call__(self, - raw_boxes: Mapping[str, tf.Tensor], - raw_scores: Mapping[str, tf.Tensor], - anchor_boxes: tf.Tensor, - image_shape: tf.Tensor, - raw_attributes: Optional[Mapping[str, tf.Tensor]] = None): - """Generates final detections. - - Args: - raw_boxes: A `dict` with keys representing FPN levels and values - representing box tenors of shape `[batch, feature_h, feature_w, - num_anchors * 4]`. - raw_scores: A `dict` with keys representing FPN levels and values - representing logit tensors of shape `[batch, feature_h, feature_w, - num_anchors]`. 
-      anchor_boxes: A `dict` with keys representing FPN levels and values
-        representing the corresponding anchor boxes w.r.t `box_outputs`, each of
-        shape [batch_size, K_l, 4].
-      image_shape: A `tf.Tensor` of shape of [batch_size, 2] storing the image
-        height and width w.r.t. the scaled image, i.e. the same image space as
-        `box_outputs` and `anchor_boxes`.
-      raw_attributes: If not None, a `dict` of (attribute_name,
-        attribute_prediction) pairs. `attribute_prediction` is a dict that
-        contains keys representing FPN levels and values representing tensors of
-        shape `[batch, feature_h, feature_w, num_anchors * attribute_size]`.
-
-    Returns:
-      If `apply_nms` = True, the return is a dictionary with keys:
-        `detection_boxes`: A `float` tf.Tensor of shape
-          [batch, max_num_detections, 4] representing top detected boxes in
-          [y1, x1, y2, x2].
-        `detection_scores`: A `float` tf.Tensor of shape
-          [batch, max_num_detections] representing sorted confidence scores for
-          detected boxes. The values are between [0, 1].
-        `detection_classes`: An `int` tf.Tensor of shape
-          [batch, max_num_detections] representing classes for detected boxes.
-        `num_detections`: An `int` tf.Tensor of shape [batch]; only the first
-          `num_detections` boxes are valid detections.
-        `detection_attributes`: A dict. Each value of the dict is a `float`
-          tf.Tensor of shape [batch, max_num_detections, attribute_size]
-          representing attribute predictions for detected boxes.
-      If `apply_nms` = False, the return is a dictionary with keys:
-        `decoded_boxes`: A `float` tf.Tensor of shape [batch, num_raw_boxes, 4]
-          representing all the decoded boxes.
-        `decoded_box_scores`: A `float` tf.Tensor of shape
-          [batch, num_raw_boxes] representing scores of all the decoded boxes.
-        `decoded_box_attributes`: A dict. Each value in the dict is a
-          `float` tf.Tensor of shape [batch, num_raw_boxes, attribute_size]
-          representing attribute predictions of all the decoded boxes.
-    """
-    boxes, scores, attributes = self._decode_multilevel_outputs(
-        raw_boxes, raw_scores, anchor_boxes, image_shape, raw_attributes)
-
-    if not self._config_dict['apply_nms']:
-      return {
-          'decoded_boxes': boxes,
-          'decoded_box_scores': scores,
-          'decoded_box_attributes': attributes,
-      }
-
-    # Optionally force the NMS to run on CPU.
-    if self._config_dict['use_cpu_nms']:
-      nms_context = tf.device('cpu:0')
-    else:
-      nms_context = contextlib.nullcontext()
-
-    with nms_context:
-      if raw_attributes and (self._config_dict['nms_version'] != 'v1'):
-        raise ValueError(
-            'Attribute learning is only supported for NMSv1 but NMS {} is used.'
-            .format(self._config_dict['nms_version']))
-      if self._config_dict['nms_version'] == 'batched':
-        (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = (
-            _generate_detections_batched(
-                boxes, scores, self._config_dict['pre_nms_score_threshold'],
-                self._config_dict['nms_iou_threshold'],
-                self._config_dict['max_num_detections']))
-        # Set `nmsed_attributes` to None for batched NMS.
- nmsed_attributes = {} - elif self._config_dict['nms_version'] == 'v1': - (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections, - nmsed_attributes) = ( - _generate_detections_v1( - boxes, - scores, - attributes=attributes if raw_attributes else None, - pre_nms_top_k=self._config_dict['pre_nms_top_k'], - pre_nms_score_threshold=self - ._config_dict['pre_nms_score_threshold'], - nms_iou_threshold=self._config_dict['nms_iou_threshold'], - max_num_detections=self._config_dict['max_num_detections'], - soft_nms_sigma=self._config_dict['soft_nms_sigma'])) - elif self._config_dict['nms_version'] == 'v2': - (nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections) = ( - _generate_detections_v2( - boxes, - scores, - pre_nms_top_k=self._config_dict['pre_nms_top_k'], - pre_nms_score_threshold=self - ._config_dict['pre_nms_score_threshold'], - nms_iou_threshold=self._config_dict['nms_iou_threshold'], - max_num_detections=self._config_dict['max_num_detections'])) - # Set `nmsed_attributes` to None for v2. - nmsed_attributes = {} - else: - raise ValueError('NMS version {} not supported.'.format( - self._config_dict['nms_version'])) - - # Adds 1 to offset the background class which has index 0. - nmsed_classes += 1 - - return { - 'num_detections': valid_detections, - 'detection_boxes': nmsed_boxes, - 'detection_classes': nmsed_classes, - 'detection_scores': nmsed_scores, - 'detection_attributes': nmsed_attributes, - } - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) diff --git a/official/vision/beta/modeling/layers/detection_generator_test.py b/official/vision/beta/modeling/layers/detection_generator_test.py deleted file mode 100644 index 1b72f969a..000000000 --- a/official/vision/beta/modeling/layers/detection_generator_test.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
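For context on the `nms_version` dispatch above: the `'batched'` branch hands the decoded boxes and scores to `_generate_detections_batched`, which is typically backed by `tf.image.combined_non_max_suppression`. Below is a minimal, self-contained sketch of that style of call, not the deleted Model Garden implementation; the toy shapes and thresholds are assumed purely for illustration, and the final `+ 1` mirrors the background-class offset applied after NMS.

```python
import tensorflow as tf

batch, num_boxes, num_classes = 1, 6, 3  # toy sizes, for illustration only

# Class-agnostic decoded boxes in [y1, x1, y2, x2]: [batch, num_boxes, 1, 4].
mins = tf.random.uniform([batch, num_boxes, 1, 2], maxval=80.0)
sizes = tf.random.uniform([batch, num_boxes, 1, 2], minval=5.0, maxval=20.0)
boxes = tf.concat([mins, mins + sizes], axis=-1)

# Per-class scores with the implicit background column already removed:
# [batch, num_boxes, num_classes].
scores = tf.random.uniform([batch, num_boxes, num_classes])

nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
    tf.image.combined_non_max_suppression(
        boxes,
        scores,
        max_output_size_per_class=4,
        max_total_size=4,
        iou_threshold=0.5,
        score_threshold=0.05,
        clip_boxes=False))

# Shift classes by 1 so index 0 stays reserved for the background class.
nmsed_classes = tf.cast(nmsed_classes, tf.int32) + 1

print(nmsed_boxes.shape)         # (1, 4, 4)
print(valid_detections.numpy())  # number of valid detections per image
```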
- -"""Tests for detection_generator.py.""" -# Import libraries - -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.vision.beta.modeling.layers import detection_generator -from official.vision.beta.ops import anchor - - -class SelectTopKScoresTest(tf.test.TestCase): - - def testSelectTopKScores(self): - pre_nms_num_boxes = 2 - scores_data = [[[0.2, 0.2], [0.1, 0.9], [0.5, 0.1], [0.3, 0.5]]] - scores_in = tf.constant(scores_data, dtype=tf.float32) - top_k_scores, top_k_indices = detection_generator._select_top_k_scores( - scores_in, pre_nms_num_detections=pre_nms_num_boxes) - expected_top_k_scores = np.array([[[0.5, 0.9], [0.3, 0.5]]], - dtype=np.float32) - - expected_top_k_indices = [[[2, 1], [3, 3]]] - - self.assertAllEqual(top_k_scores.numpy(), expected_top_k_scores) - self.assertAllEqual(top_k_indices.numpy(), expected_top_k_indices) - - -class DetectionGeneratorTest( - parameterized.TestCase, tf.test.TestCase): - - @parameterized.product( - nms_version=['batched', 'v1', 'v2'], - use_cpu_nms=[True, False], - soft_nms_sigma=[None, 0.1]) - def testDetectionsOutputShape(self, nms_version, use_cpu_nms, soft_nms_sigma): - max_num_detections = 10 - num_classes = 4 - pre_nms_top_k = 5000 - pre_nms_score_threshold = 0.01 - batch_size = 1 - kwargs = { - 'apply_nms': True, - 'pre_nms_top_k': pre_nms_top_k, - 'pre_nms_score_threshold': pre_nms_score_threshold, - 'nms_iou_threshold': 0.5, - 'max_num_detections': max_num_detections, - 'nms_version': nms_version, - 'use_cpu_nms': use_cpu_nms, - 'soft_nms_sigma': soft_nms_sigma, - } - generator = detection_generator.DetectionGenerator(**kwargs) - - cls_outputs_all = ( - np.random.rand(84, num_classes) - 0.5) * 3 # random 84x3 outputs. - box_outputs_all = np.random.rand(84, 4 * num_classes) # random 84 boxes. - anchor_boxes_all = np.random.rand(84, 4) # random 84 boxes. 
- class_outputs = tf.reshape( - tf.convert_to_tensor(cls_outputs_all, dtype=tf.float32), - [1, 84, num_classes]) - box_outputs = tf.reshape( - tf.convert_to_tensor(box_outputs_all, dtype=tf.float32), - [1, 84, 4 * num_classes]) - anchor_boxes = tf.reshape( - tf.convert_to_tensor(anchor_boxes_all, dtype=tf.float32), - [1, 84, 4]) - image_info = tf.constant( - [[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]], - dtype=tf.float32) - results = generator( - box_outputs, class_outputs, anchor_boxes, image_info[:, 1, :]) - boxes = results['detection_boxes'] - classes = results['detection_classes'] - scores = results['detection_scores'] - valid_detections = results['num_detections'] - - self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4)) - self.assertEqual(scores.numpy().shape, (batch_size, max_num_detections,)) - self.assertEqual(classes.numpy().shape, (batch_size, max_num_detections,)) - self.assertEqual(valid_detections.numpy().shape, (batch_size,)) - - def test_serialize_deserialize(self): - kwargs = { - 'apply_nms': True, - 'pre_nms_top_k': 1000, - 'pre_nms_score_threshold': 0.1, - 'nms_iou_threshold': 0.5, - 'max_num_detections': 10, - 'nms_version': 'v2', - 'use_cpu_nms': False, - 'soft_nms_sigma': None, - } - generator = detection_generator.DetectionGenerator(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(generator.get_config(), expected_config) - - new_generator = ( - detection_generator.DetectionGenerator.from_config( - generator.get_config())) - - self.assertAllEqual(generator.get_config(), new_generator.get_config()) - - -class MultilevelDetectionGeneratorTest( - parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - ('batched', False, True, None), - ('batched', False, False, None), - ('v2', False, True, None), - ('v2', False, False, None), - ('v1', True, True, 0.0), - ('v1', True, False, 0.1), - ('v1', True, False, None), - ) - def testDetectionsOutputShape(self, nms_version, has_att_heads, use_cpu_nms, - soft_nms_sigma): - min_level = 4 - max_level = 6 - num_scales = 2 - max_num_detections = 10 - aspect_ratios = [1.0, 2.0] - anchor_scale = 2.0 - output_size = [64, 64] - num_classes = 4 - pre_nms_top_k = 5000 - pre_nms_score_threshold = 0.01 - batch_size = 1 - kwargs = { - 'apply_nms': True, - 'pre_nms_top_k': pre_nms_top_k, - 'pre_nms_score_threshold': pre_nms_score_threshold, - 'nms_iou_threshold': 0.5, - 'max_num_detections': max_num_detections, - 'nms_version': nms_version, - 'use_cpu_nms': use_cpu_nms, - 'soft_nms_sigma': soft_nms_sigma, - } - - input_anchor = anchor.build_anchor_generator(min_level, max_level, - num_scales, aspect_ratios, - anchor_scale) - anchor_boxes = input_anchor(output_size) - cls_outputs_all = ( - np.random.rand(84, num_classes) - 0.5) * 3 # random 84x3 outputs. - box_outputs_all = np.random.rand(84, 4) # random 84 boxes. 
- class_outputs = { - '4': - tf.reshape( - tf.convert_to_tensor(cls_outputs_all[0:64], dtype=tf.float32), - [1, 8, 8, num_classes]), - '5': - tf.reshape( - tf.convert_to_tensor(cls_outputs_all[64:80], dtype=tf.float32), - [1, 4, 4, num_classes]), - '6': - tf.reshape( - tf.convert_to_tensor(cls_outputs_all[80:84], dtype=tf.float32), - [1, 2, 2, num_classes]), - } - box_outputs = { - '4': tf.reshape(tf.convert_to_tensor( - box_outputs_all[0:64], dtype=tf.float32), [1, 8, 8, 4]), - '5': tf.reshape(tf.convert_to_tensor( - box_outputs_all[64:80], dtype=tf.float32), [1, 4, 4, 4]), - '6': tf.reshape(tf.convert_to_tensor( - box_outputs_all[80:84], dtype=tf.float32), [1, 2, 2, 4]), - } - if has_att_heads: - att_outputs_all = np.random.rand(84, 1) # random attributes. - att_outputs = { - 'depth': { - '4': - tf.reshape( - tf.convert_to_tensor( - att_outputs_all[0:64], dtype=tf.float32), - [1, 8, 8, 1]), - '5': - tf.reshape( - tf.convert_to_tensor( - att_outputs_all[64:80], dtype=tf.float32), - [1, 4, 4, 1]), - '6': - tf.reshape( - tf.convert_to_tensor( - att_outputs_all[80:84], dtype=tf.float32), - [1, 2, 2, 1]), - } - } - else: - att_outputs = None - image_info = tf.constant([[[1000, 1000], [100, 100], [0.1, 0.1], [0, 0]]], - dtype=tf.float32) - generator = detection_generator.MultilevelDetectionGenerator(**kwargs) - results = generator(box_outputs, class_outputs, anchor_boxes, - image_info[:, 1, :], att_outputs) - boxes = results['detection_boxes'] - classes = results['detection_classes'] - scores = results['detection_scores'] - valid_detections = results['num_detections'] - - self.assertEqual(boxes.numpy().shape, (batch_size, max_num_detections, 4)) - self.assertEqual(scores.numpy().shape, (batch_size, max_num_detections,)) - self.assertEqual(classes.numpy().shape, (batch_size, max_num_detections,)) - self.assertEqual(valid_detections.numpy().shape, (batch_size,)) - if has_att_heads: - for att in results['detection_attributes'].values(): - self.assertEqual(att.numpy().shape, (batch_size, max_num_detections, 1)) - - def test_serialize_deserialize(self): - kwargs = { - 'apply_nms': True, - 'pre_nms_top_k': 1000, - 'pre_nms_score_threshold': 0.1, - 'nms_iou_threshold': 0.5, - 'max_num_detections': 10, - 'nms_version': 'v2', - 'use_cpu_nms': False, - 'soft_nms_sigma': None, - } - generator = detection_generator.MultilevelDetectionGenerator(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(generator.get_config(), expected_config) - - new_generator = ( - detection_generator.MultilevelDetectionGenerator.from_config( - generator.get_config())) - - self.assertAllEqual(generator.get_config(), new_generator.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/layers/mask_sampler.py b/official/vision/beta/modeling/layers/mask_sampler.py deleted file mode 100644 index 68b185b85..000000000 --- a/official/vision/beta/modeling/layers/mask_sampler.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
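The `test_serialize_deserialize` cases above rely on a simple convention shared by these generator layers: the constructor records its kwargs in one dict, `get_config` returns that dict, and `from_config` rebuilds the layer from it. A minimal, self-contained sketch of the same pattern using a hypothetical `ToyGenerator` layer (not part of the codebase):

```python
import tensorflow as tf


class ToyGenerator(tf.keras.layers.Layer):
  """Hypothetical layer illustrating the config-dict round-trip convention."""

  def __init__(self, apply_nms: bool = True, max_num_detections: int = 10,
               **kwargs):
    self._config_dict = {
        'apply_nms': apply_nms,
        'max_num_detections': max_num_detections,
    }
    super().__init__(**kwargs)

  def get_config(self):
    # Return the stored constructor kwargs so serialization mirrors construction.
    return self._config_dict

  @classmethod
  def from_config(cls, config):
    return cls(**config)


layer = ToyGenerator(apply_nms=True, max_num_detections=5)
clone = ToyGenerator.from_config(layer.get_config())
assert layer.get_config() == clone.get_config()
```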
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of mask sampler.""" - -# Import libraries -import tensorflow as tf - -from official.vision.beta.ops import spatial_transform_ops - - -def _sample_and_crop_foreground_masks(candidate_rois: tf.Tensor, - candidate_gt_boxes: tf.Tensor, - candidate_gt_classes: tf.Tensor, - candidate_gt_indices: tf.Tensor, - gt_masks: tf.Tensor, - num_sampled_masks: int = 128, - mask_target_size: int = 28): - """Samples and creates cropped foreground masks for training. - - Args: - candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is the - number of candidate RoIs to be considered for mask sampling. It includes - both positive and negative RoIs. The `num_mask_samples_per_image` positive - RoIs will be sampled to create mask training targets. - candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing - the corresponding groundtruth boxes to the `candidate_rois`. - candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing the - corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor - corresponds to the background class, i.e. negative RoIs. - candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the - corresponding groundtruth instance indices to the `candidate_gt_boxes`, - i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i] and - gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= N, is - the superset of candidate_gt_boxes. - gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height, - mask_width] containing all the groundtruth masks which sample masks are - drawn from. - num_sampled_masks: An `int` that specifies the number of masks to sample. - mask_target_size: An `int` that specifies the final cropped mask size after - sampling. The output masks are resized w.r.t the sampled RoIs. - - Returns: - foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the - RoI that corresponds to the sampled foreground masks, where - K = num_mask_samples_per_image. - foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the - classes corresponding to the sampled foreground masks. - cropoped_foreground_masks: A `tf.Tensor` of shape of - [batch_size, K, mask_target_size, mask_target_size] storing the cropped - foreground masks used for training. 
- """ - _, fg_instance_indices = tf.nn.top_k( - tf.cast(tf.greater(candidate_gt_classes, 0), dtype=tf.int32), - k=num_sampled_masks) - - fg_instance_indices_shape = tf.shape(fg_instance_indices) - batch_indices = ( - tf.expand_dims(tf.range(fg_instance_indices_shape[0]), axis=-1) * - tf.ones([1, fg_instance_indices_shape[-1]], dtype=tf.int32)) - - gather_nd_instance_indices = tf.stack( - [batch_indices, fg_instance_indices], axis=-1) - foreground_rois = tf.gather_nd( - candidate_rois, gather_nd_instance_indices) - foreground_boxes = tf.gather_nd( - candidate_gt_boxes, gather_nd_instance_indices) - foreground_classes = tf.gather_nd( - candidate_gt_classes, gather_nd_instance_indices) - foreground_gt_indices = tf.gather_nd( - candidate_gt_indices, gather_nd_instance_indices) - foreground_gt_indices = tf.where( - tf.equal(foreground_gt_indices, -1), - tf.zeros_like(foreground_gt_indices), - foreground_gt_indices) - - foreground_gt_indices_shape = tf.shape(foreground_gt_indices) - batch_indices = ( - tf.expand_dims(tf.range(foreground_gt_indices_shape[0]), axis=-1) * - tf.ones([1, foreground_gt_indices_shape[-1]], dtype=tf.int32)) - gather_nd_gt_indices = tf.stack( - [batch_indices, foreground_gt_indices], axis=-1) - foreground_masks = tf.gather_nd(gt_masks, gather_nd_gt_indices) - - cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box( - foreground_masks, foreground_boxes, foreground_rois, mask_target_size, - sample_offset=0.5) - - return foreground_rois, foreground_classes, cropped_foreground_masks - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class MaskSampler(tf.keras.layers.Layer): - """Samples and creates mask training targets.""" - - def __init__(self, mask_target_size: int, num_sampled_masks: int, **kwargs): - self._config_dict = { - 'mask_target_size': mask_target_size, - 'num_sampled_masks': num_sampled_masks, - } - super(MaskSampler, self).__init__(**kwargs) - - def call(self, candidate_rois: tf.Tensor, candidate_gt_boxes: tf.Tensor, - candidate_gt_classes: tf.Tensor, candidate_gt_indices: tf.Tensor, - gt_masks: tf.Tensor): - """Samples and creates mask targets for training. - - Args: - candidate_rois: A `tf.Tensor` of shape of [batch_size, N, 4], where N is - the number of candidate RoIs to be considered for mask sampling. It - includes both positive and negative RoIs. The - `num_mask_samples_per_image` positive RoIs will be sampled to create - mask training targets. - candidate_gt_boxes: A `tf.Tensor` of shape of [batch_size, N, 4], storing - the corresponding groundtruth boxes to the `candidate_rois`. - candidate_gt_classes: A `tf.Tensor` of shape of [batch_size, N], storing - the corresponding groundtruth classes to the `candidate_rois`. 0 in the - tensor corresponds to the background class, i.e. negative RoIs. - candidate_gt_indices: A `tf.Tensor` of shape [batch_size, N], storing the - corresponding groundtruth instance indices to the `candidate_gt_boxes`, - i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i], - where gt_boxes which is of shape [batch_size, MAX_INSTANCES, 4], M >= - N, is the superset of candidate_gt_boxes. - gt_masks: A `tf.Tensor` of [batch_size, MAX_INSTANCES, mask_height, - mask_width] containing all the groundtruth masks which sample masks are - drawn from. after sampling. The output masks are resized w.r.t the - sampled RoIs. 
- - Returns: - foreground_rois: A `tf.Tensor` of shape of [batch_size, K, 4] storing the - RoI that corresponds to the sampled foreground masks, where - K = num_mask_samples_per_image. - foreground_classes: A `tf.Tensor` of shape of [batch_size, K] storing the - classes corresponding to the sampled foreground masks. - cropoped_foreground_masks: A `tf.Tensor` of shape of - [batch_size, K, mask_target_size, mask_target_size] storing the - cropped foreground masks used for training. - """ - foreground_rois, foreground_classes, cropped_foreground_masks = ( - _sample_and_crop_foreground_masks( - candidate_rois, - candidate_gt_boxes, - candidate_gt_classes, - candidate_gt_indices, - gt_masks, - self._config_dict['num_sampled_masks'], - self._config_dict['mask_target_size'])) - return foreground_rois, foreground_classes, cropped_foreground_masks - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) diff --git a/official/vision/beta/modeling/layers/nn_blocks.py b/official/vision/beta/modeling/layers/nn_blocks.py deleted file mode 100644 index f5cd79e2d..000000000 --- a/official/vision/beta/modeling/layers/nn_blocks.py +++ /dev/null @@ -1,1512 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains common building blocks for neural networks.""" - -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Text - -# Import libraries -from absl import logging -import tensorflow as tf - -from official.modeling import tf_utils -from official.vision.beta.modeling.layers import nn_layers - - -def _pad_strides(strides: int, axis: int) -> Tuple[int, int, int, int]: - """Converts int to len 4 strides (`tf.nn.avg_pool` uses length 4).""" - if axis == 1: - return (1, 1, strides, strides) - else: - return (1, strides, strides, 1) - - -def _maybe_downsample(x: tf.Tensor, out_filter: int, strides: int, - axis: int) -> tf.Tensor: - """Downsamples feature map and 0-pads tensor if in_filter != out_filter.""" - data_format = 'NCHW' if axis == 1 else 'NHWC' - strides = _pad_strides(strides, axis=axis) - - x = tf.nn.avg_pool(x, strides, strides, 'VALID', data_format=data_format) - - in_filter = x.shape[axis] - if in_filter < out_filter: - # Pad on channel dimension with 0s: half on top half on bottom. - pad_size = [(out_filter - in_filter) // 2, (out_filter - in_filter) // 2] - if axis == 1: - x = tf.pad(x, [[0, 0], pad_size, [0, 0], [0, 0]]) - else: - x = tf.pad(x, [[0, 0], [0, 0], [0, 0], pad_size]) - - return x + 0. 
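The `_maybe_downsample` helper that closes the block above pairs a strided average pool with symmetric zero-padding on the channel axis, so a shortcut tensor can be matched to a wider, lower-resolution output without adding parameters. A standalone restatement for NHWC inputs (the `maybe_downsample` name and toy shapes here are illustrative only, not the deleted code):

```python
import tensorflow as tf


def maybe_downsample(x: tf.Tensor, out_filter: int, strides: int) -> tf.Tensor:
  """Average-pools spatially by `strides`, then zero-pads channels to `out_filter`."""
  window = [1, strides, strides, 1]
  x = tf.nn.avg_pool(x, ksize=window, strides=window, padding='VALID',
                     data_format='NHWC')
  in_filter = x.shape[-1]
  if in_filter < out_filter:
    # Pad the channel dimension with zeros: half on top, half on bottom.
    pad = (out_filter - in_filter) // 2
    x = tf.pad(x, [[0, 0], [0, 0], [0, 0], [pad, pad]])
  return x


x = tf.ones([2, 8, 8, 16])
y = maybe_downsample(x, out_filter=32, strides=2)
print(y.shape)  # (2, 4, 4, 32): spatial size halved, channels zero-padded 16 -> 32
```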
- - -@tf.keras.utils.register_keras_serializable(package='Beta') -class ResidualBlock(tf.keras.layers.Layer): - """A residual block.""" - - def __init__(self, - filters, - strides, - use_projection=False, - se_ratio=None, - resnetd_shortcut=False, - stochastic_depth_drop_rate=None, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - activation='relu', - use_explicit_padding: bool = False, - use_sync_bn=False, - norm_momentum=0.99, - norm_epsilon=0.001, - bn_trainable=True, - **kwargs): - """Initializes a residual block with BN after convolutions. - - Args: - filters: An `int` number of filters for the first two convolutions. Note - that the third and final convolution will use 4 times as many filters. - strides: An `int` block stride. If greater than 1, this block will - ultimately downsample the input. - use_projection: A `bool` for whether this block should use a projection - shortcut (versus the default identity shortcut). This is usually `True` - for the first block of a block group, which may change the number of - filters and the resolution. - se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer. - resnetd_shortcut: A `bool` if True, apply the resnetd style modification - to the shortcut connection. Not implemented in residual blocks. - stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for - the stochastic depth layer. - kernel_initializer: A `str` of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d. - Default to None. - activation: A `str` name of the activation function. - use_explicit_padding: Use 'VALID' padding for convolutions, but prepad - inputs so that the output dimensions are the same as if 'SAME' padding - were used. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - bn_trainable: A `bool` that indicates whether batch norm layers should be - trainable. Default to True. - **kwargs: Additional keyword arguments to be passed. 
- """ - super(ResidualBlock, self).__init__(**kwargs) - - self._filters = filters - self._strides = strides - self._use_projection = use_projection - self._se_ratio = se_ratio - self._resnetd_shortcut = resnetd_shortcut - self._use_explicit_padding = use_explicit_padding - self._use_sync_bn = use_sync_bn - self._activation = activation - self._stochastic_depth_drop_rate = stochastic_depth_drop_rate - self._kernel_initializer = kernel_initializer - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation_fn = tf_utils.get_activation(activation) - self._bn_trainable = bn_trainable - - def build(self, input_shape): - if self._use_projection: - self._shortcut = tf.keras.layers.Conv2D( - filters=self._filters, - kernel_size=1, - strides=self._strides, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm0 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon, - trainable=self._bn_trainable) - - conv1_padding = 'same' - # explicit padding here is added for centernet - if self._use_explicit_padding: - self._pad = tf.keras.layers.ZeroPadding2D(padding=(1, 1)) - conv1_padding = 'valid' - - self._conv1 = tf.keras.layers.Conv2D( - filters=self._filters, - kernel_size=3, - strides=self._strides, - padding=conv1_padding, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm1 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon, - trainable=self._bn_trainable) - - self._conv2 = tf.keras.layers.Conv2D( - filters=self._filters, - kernel_size=3, - strides=1, - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm2 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon, - trainable=self._bn_trainable) - - if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1: - self._squeeze_excitation = nn_layers.SqueezeExcitation( - in_filters=self._filters, - out_filters=self._filters, - se_ratio=self._se_ratio, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - else: - self._squeeze_excitation = None - - if self._stochastic_depth_drop_rate: - self._stochastic_depth = nn_layers.StochasticDepth( - self._stochastic_depth_drop_rate) - else: - self._stochastic_depth = None - - super(ResidualBlock, self).build(input_shape) - - def get_config(self): - config = { - 'filters': self._filters, - 'strides': self._strides, - 'use_projection': self._use_projection, - 'se_ratio': self._se_ratio, - 'resnetd_shortcut': self._resnetd_shortcut, - 'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 
'use_explicit_padding': self._use_explicit_padding, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'bn_trainable': self._bn_trainable - } - base_config = super(ResidualBlock, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs, training=None): - shortcut = inputs - if self._use_projection: - shortcut = self._shortcut(shortcut) - shortcut = self._norm0(shortcut) - - if self._use_explicit_padding: - inputs = self._pad(inputs) - x = self._conv1(inputs) - x = self._norm1(x) - x = self._activation_fn(x) - - x = self._conv2(x) - x = self._norm2(x) - - if self._squeeze_excitation: - x = self._squeeze_excitation(x) - - if self._stochastic_depth: - x = self._stochastic_depth(x, training=training) - - return self._activation_fn(x + shortcut) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class BottleneckBlock(tf.keras.layers.Layer): - """A standard bottleneck block.""" - - def __init__(self, - filters, - strides, - dilation_rate=1, - use_projection=False, - se_ratio=None, - resnetd_shortcut=False, - stochastic_depth_drop_rate=None, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - activation='relu', - use_sync_bn=False, - norm_momentum=0.99, - norm_epsilon=0.001, - bn_trainable=True, - **kwargs): - """Initializes a standard bottleneck block with BN after convolutions. - - Args: - filters: An `int` number of filters for the first two convolutions. Note - that the third and final convolution will use 4 times as many filters. - strides: An `int` block stride. If greater than 1, this block will - ultimately downsample the input. - dilation_rate: An `int` dilation_rate of convolutions. Default to 1. - use_projection: A `bool` for whether this block should use a projection - shortcut (versus the default identity shortcut). This is usually `True` - for the first block of a block group, which may change the number of - filters and the resolution. - se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer. - resnetd_shortcut: A `bool`. If True, apply the resnetd style modification - to the shortcut connection. - stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for - the stochastic depth layer. - kernel_initializer: A `str` of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d. - Default to None. - activation: A `str` name of the activation function. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - bn_trainable: A `bool` that indicates whether batch norm layers should be - trainable. Default to True. - **kwargs: Additional keyword arguments to be passed. 
- """ - super(BottleneckBlock, self).__init__(**kwargs) - - self._filters = filters - self._strides = strides - self._dilation_rate = dilation_rate - self._use_projection = use_projection - self._se_ratio = se_ratio - self._resnetd_shortcut = resnetd_shortcut - self._use_sync_bn = use_sync_bn - self._activation = activation - self._stochastic_depth_drop_rate = stochastic_depth_drop_rate - self._kernel_initializer = kernel_initializer - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._bn_trainable = bn_trainable - - def build(self, input_shape): - if self._use_projection: - if self._resnetd_shortcut: - self._shortcut0 = tf.keras.layers.AveragePooling2D( - pool_size=2, strides=self._strides, padding='same') - self._shortcut1 = tf.keras.layers.Conv2D( - filters=self._filters * 4, - kernel_size=1, - strides=1, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - else: - self._shortcut = tf.keras.layers.Conv2D( - filters=self._filters * 4, - kernel_size=1, - strides=self._strides, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - - self._norm0 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon, - trainable=self._bn_trainable) - - self._conv1 = tf.keras.layers.Conv2D( - filters=self._filters, - kernel_size=1, - strides=1, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm1 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon, - trainable=self._bn_trainable) - self._activation1 = tf_utils.get_activation( - self._activation, use_keras_layer=True) - - self._conv2 = tf.keras.layers.Conv2D( - filters=self._filters, - kernel_size=3, - strides=self._strides, - dilation_rate=self._dilation_rate, - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm2 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon, - trainable=self._bn_trainable) - self._activation2 = tf_utils.get_activation( - self._activation, use_keras_layer=True) - - self._conv3 = tf.keras.layers.Conv2D( - filters=self._filters * 4, - kernel_size=1, - strides=1, - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm3 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon, - trainable=self._bn_trainable) - self._activation3 = tf_utils.get_activation( - self._activation, use_keras_layer=True) - - if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1: - self._squeeze_excitation = nn_layers.SqueezeExcitation( - in_filters=self._filters * 4, - out_filters=self._filters * 4, - se_ratio=self._se_ratio, - 
kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - else: - self._squeeze_excitation = None - - if self._stochastic_depth_drop_rate: - self._stochastic_depth = nn_layers.StochasticDepth( - self._stochastic_depth_drop_rate) - else: - self._stochastic_depth = None - self._add = tf.keras.layers.Add() - - super(BottleneckBlock, self).build(input_shape) - - def get_config(self): - config = { - 'filters': self._filters, - 'strides': self._strides, - 'dilation_rate': self._dilation_rate, - 'use_projection': self._use_projection, - 'se_ratio': self._se_ratio, - 'resnetd_shortcut': self._resnetd_shortcut, - 'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'bn_trainable': self._bn_trainable - } - base_config = super(BottleneckBlock, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs, training=None): - shortcut = inputs - if self._use_projection: - if self._resnetd_shortcut: - shortcut = self._shortcut0(shortcut) - shortcut = self._shortcut1(shortcut) - else: - shortcut = self._shortcut(shortcut) - shortcut = self._norm0(shortcut) - - x = self._conv1(inputs) - x = self._norm1(x) - x = self._activation1(x) - - x = self._conv2(x) - x = self._norm2(x) - x = self._activation2(x) - - x = self._conv3(x) - x = self._norm3(x) - - if self._squeeze_excitation: - x = self._squeeze_excitation(x) - - if self._stochastic_depth: - x = self._stochastic_depth(x, training=training) - - x = self._add([x, shortcut]) - return self._activation3(x) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class InvertedBottleneckBlock(tf.keras.layers.Layer): - """An inverted bottleneck block.""" - - def __init__(self, - in_filters, - out_filters, - expand_ratio, - strides, - kernel_size=3, - se_ratio=None, - stochastic_depth_drop_rate=None, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - activation='relu', - se_inner_activation='relu', - se_gating_activation='sigmoid', - se_round_down_protect=True, - expand_se_in_filters=False, - depthwise_activation=None, - use_sync_bn=False, - dilation_rate=1, - divisible_by=1, - regularize_depthwise=False, - use_depthwise=True, - use_residual=True, - norm_momentum=0.99, - norm_epsilon=0.001, - output_intermediate_endpoints=False, - **kwargs): - """Initializes an inverted bottleneck block with BN after convolutions. - - Args: - in_filters: An `int` number of filters of the input tensor. - out_filters: An `int` number of filters of the output tensor. - expand_ratio: An `int` of expand_ratio for an inverted bottleneck block. - strides: An `int` block stride. If greater than 1, this block will - ultimately downsample the input. - kernel_size: An `int` kernel_size of the depthwise conv layer. - se_ratio: A `float` or None. If not None, se ratio for the squeeze and - excitation layer. - stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for - the stochastic depth layer. - kernel_initializer: A `str` of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. 
- bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d. - Default to None. - activation: A `str` name of the activation function. - se_inner_activation: A `str` name of squeeze-excitation inner activation. - se_gating_activation: A `str` name of squeeze-excitation gating - activation. - se_round_down_protect: A `bool` of whether round down more than 10% - will be allowed in SE layer. - expand_se_in_filters: A `bool` of whether or not to expand in_filter in - squeeze and excitation layer. - depthwise_activation: A `str` name of the activation function for - depthwise only. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - dilation_rate: An `int` that specifies the dilation rate to use for. - divisible_by: An `int` that ensures all inner dimensions are divisible by - this number. - dilated convolution: An `int` to specify the same value for all spatial - dimensions. - regularize_depthwise: A `bool` of whether or not apply regularization on - depthwise. - use_depthwise: A `bool` of whether to uses fused convolutions instead of - depthwise. - use_residual: A `bool` of whether to include residual connection between - input and output. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - output_intermediate_endpoints: A `bool` of whether or not output the - intermediate endpoints. - **kwargs: Additional keyword arguments to be passed. - """ - super(InvertedBottleneckBlock, self).__init__(**kwargs) - - self._in_filters = in_filters - self._out_filters = out_filters - self._expand_ratio = expand_ratio - self._strides = strides - self._kernel_size = kernel_size - self._se_ratio = se_ratio - self._divisible_by = divisible_by - self._stochastic_depth_drop_rate = stochastic_depth_drop_rate - self._dilation_rate = dilation_rate - self._use_sync_bn = use_sync_bn - self._regularize_depthwise = regularize_depthwise - self._use_depthwise = use_depthwise - self._use_residual = use_residual - self._activation = activation - self._se_inner_activation = se_inner_activation - self._se_gating_activation = se_gating_activation - self._depthwise_activation = depthwise_activation - self._se_round_down_protect = se_round_down_protect - self._kernel_initializer = kernel_initializer - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._expand_se_in_filters = expand_se_in_filters - self._output_intermediate_endpoints = output_intermediate_endpoints - - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - if not depthwise_activation: - self._depthwise_activation = activation - if regularize_depthwise: - self._depthsize_regularizer = kernel_regularizer - else: - self._depthsize_regularizer = None - - def build(self, input_shape): - expand_filters = self._in_filters - if self._expand_ratio > 1: - # First 1x1 conv for channel expansion. 
- expand_filters = nn_layers.make_divisible( - self._in_filters * self._expand_ratio, self._divisible_by) - - expand_kernel = 1 if self._use_depthwise else self._kernel_size - expand_stride = 1 if self._use_depthwise else self._strides - - self._conv0 = tf.keras.layers.Conv2D( - filters=expand_filters, - kernel_size=expand_kernel, - strides=expand_stride, - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm0 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - self._activation_layer = tf_utils.get_activation( - self._activation, use_keras_layer=True) - - if self._use_depthwise: - # Depthwise conv. - self._conv1 = tf.keras.layers.DepthwiseConv2D( - kernel_size=(self._kernel_size, self._kernel_size), - strides=self._strides, - padding='same', - depth_multiplier=1, - dilation_rate=self._dilation_rate, - use_bias=False, - depthwise_initializer=self._kernel_initializer, - depthwise_regularizer=self._depthsize_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm1 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - self._depthwise_activation_layer = tf_utils.get_activation( - self._depthwise_activation, use_keras_layer=True) - - # Squeeze and excitation. - if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1: - logging.info('Use Squeeze and excitation.') - in_filters = self._in_filters - if self._expand_se_in_filters: - in_filters = expand_filters - self._squeeze_excitation = nn_layers.SqueezeExcitation( - in_filters=in_filters, - out_filters=expand_filters, - se_ratio=self._se_ratio, - divisible_by=self._divisible_by, - round_down_protect=self._se_round_down_protect, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer, - activation=self._se_inner_activation, - gating_activation=self._se_gating_activation) - else: - self._squeeze_excitation = None - - # Last 1x1 conv. 
- self._conv2 = tf.keras.layers.Conv2D( - filters=self._out_filters, - kernel_size=1, - strides=1, - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm2 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - if self._stochastic_depth_drop_rate: - self._stochastic_depth = nn_layers.StochasticDepth( - self._stochastic_depth_drop_rate) - else: - self._stochastic_depth = None - self._add = tf.keras.layers.Add() - - super(InvertedBottleneckBlock, self).build(input_shape) - - def get_config(self): - config = { - 'in_filters': self._in_filters, - 'out_filters': self._out_filters, - 'expand_ratio': self._expand_ratio, - 'strides': self._strides, - 'kernel_size': self._kernel_size, - 'se_ratio': self._se_ratio, - 'divisible_by': self._divisible_by, - 'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 'se_inner_activation': self._se_inner_activation, - 'se_gating_activation': self._se_gating_activation, - 'se_round_down_protect': self._se_round_down_protect, - 'expand_se_in_filters': self._expand_se_in_filters, - 'depthwise_activation': self._depthwise_activation, - 'dilation_rate': self._dilation_rate, - 'use_sync_bn': self._use_sync_bn, - 'regularize_depthwise': self._regularize_depthwise, - 'use_depthwise': self._use_depthwise, - 'use_residual': self._use_residual, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'output_intermediate_endpoints': self._output_intermediate_endpoints - } - base_config = super(InvertedBottleneckBlock, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs, training=None): - endpoints = {} - shortcut = inputs - if self._expand_ratio > 1: - x = self._conv0(inputs) - x = self._norm0(x) - x = self._activation_layer(x) - else: - x = inputs - - if self._use_depthwise: - x = self._conv1(x) - x = self._norm1(x) - x = self._depthwise_activation_layer(x) - if self._output_intermediate_endpoints: - endpoints['depthwise'] = x - - if self._squeeze_excitation: - x = self._squeeze_excitation(x) - - x = self._conv2(x) - x = self._norm2(x) - - if (self._use_residual and self._in_filters == self._out_filters and - self._strides == 1): - if self._stochastic_depth: - x = self._stochastic_depth(x, training=training) - x = self._add([x, shortcut]) - - if self._output_intermediate_endpoints: - return x, endpoints - return x - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class ResidualInner(tf.keras.layers.Layer): - """Creates a single inner block of a residual. - - This corresponds to `F`/`G` functions in the RevNet paper: - Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse. - The Reversible Residual Network: Backpropagation Without Storing Activations. 
- (https://arxiv.org/pdf/1707.04585.pdf) - """ - - def __init__( - self, - filters: int, - strides: int, - kernel_initializer: Union[str, Callable[ - ..., tf.keras.initializers.Initializer]] = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - activation: Union[str, Callable[..., tf.Tensor]] = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - batch_norm_first: bool = True, - **kwargs): - """Initializes a ResidualInner. - - Args: - filters: An `int` of output filter size. - strides: An `int` of stride size for convolution for the residual block. - kernel_initializer: A `str` or `tf.keras.initializers.Initializer` - instance for convolutional layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D. - activation: A `str` or `callable` instance of the activation function. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - batch_norm_first: A `bool` of whether to apply activation and batch norm - before conv. - **kwargs: Additional keyword arguments to be passed. - """ - super(ResidualInner, self).__init__(**kwargs) - - self.strides = strides - self.filters = filters - self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) - self._kernel_regularizer = kernel_regularizer - self._activation = tf.keras.activations.get(activation) - self._use_sync_bn = use_sync_bn - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._batch_norm_first = batch_norm_first - - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation_fn = tf_utils.get_activation(activation) - - def build(self, input_shape: tf.TensorShape): - if self._batch_norm_first: - self._batch_norm_0 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - self._conv2d_1 = tf.keras.layers.Conv2D( - filters=self.filters, - kernel_size=3, - strides=self.strides, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer) - - self._batch_norm_1 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - self._conv2d_2 = tf.keras.layers.Conv2D( - filters=self.filters, - kernel_size=3, - strides=1, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer) - - super(ResidualInner, self).build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - 'filters': self.filters, - 'strides': self.strides, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'batch_norm_first': self._batch_norm_first, - } - base_config = super(ResidualInner, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, - inputs: tf.Tensor, - training: Optional[bool] = None) -> tf.Tensor: - x = inputs - if self._batch_norm_first: - x = self._batch_norm_0(x, 
training=training) - x = self._activation_fn(x) - x = self._conv2d_1(x) - - x = self._batch_norm_1(x, training=training) - x = self._activation_fn(x) - x = self._conv2d_2(x) - return x - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class BottleneckResidualInner(tf.keras.layers.Layer): - """Creates a single inner block of a bottleneck. - - This corresponds to `F`/`G` functions in the RevNet paper: - Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse. - The Reversible Residual Network: Backpropagation Without Storing Activations. - (https://arxiv.org/pdf/1707.04585.pdf) - """ - - def __init__( - self, - filters: int, - strides: int, - kernel_initializer: Union[str, Callable[ - ..., tf.keras.initializers.Initializer]] = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - activation: Union[str, Callable[..., tf.Tensor]] = 'relu', - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - batch_norm_first: bool = True, - **kwargs): - """Initializes a BottleneckResidualInner. - - Args: - filters: An `int` number of filters for first 2 convolutions. Last Last, - and thus the number of output channels from the bottlneck block is - `4*filters` - strides: An `int` of stride size for convolution for the residual block. - kernel_initializer: A `str` or `tf.keras.initializers.Initializer` - instance for convolutional layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` for Conv2D. - activation: A `str` or `callable` instance of the activation function. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - batch_norm_first: A `bool` of whether to apply activation and batch norm - before conv. - **kwargs: Additional keyword arguments to be passed. 
- """ - super(BottleneckResidualInner, self).__init__(**kwargs) - - self.strides = strides - self.filters = filters - self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) - self._kernel_regularizer = kernel_regularizer - self._activation = tf.keras.activations.get(activation) - self._use_sync_bn = use_sync_bn - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._batch_norm_first = batch_norm_first - - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation_fn = tf_utils.get_activation(activation) - - def build(self, input_shape: tf.TensorShape): - if self._batch_norm_first: - self._batch_norm_0 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - self._conv2d_1 = tf.keras.layers.Conv2D( - filters=self.filters, - kernel_size=1, - strides=self.strides, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer) - self._batch_norm_1 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - self._conv2d_2 = tf.keras.layers.Conv2D( - filters=self.filters, - kernel_size=3, - strides=1, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer) - self._batch_norm_2 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - self._conv2d_3 = tf.keras.layers.Conv2D( - filters=self.filters * 4, - kernel_size=1, - strides=1, - use_bias=False, - padding='same', - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer) - - super(BottleneckResidualInner, self).build(input_shape) - - def get_config(self) -> Dict[str, Any]: - config = { - 'filters': self.filters, - 'strides': self.strides, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon, - 'batch_norm_first': self._batch_norm_first, - } - base_config = super(BottleneckResidualInner, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, - inputs: tf.Tensor, - training: Optional[bool] = None) -> tf.Tensor: - x = inputs - if self._batch_norm_first: - x = self._batch_norm_0(x, training=training) - x = self._activation_fn(x) - x = self._conv2d_1(x) - - x = self._batch_norm_1(x, training=training) - x = self._activation_fn(x) - x = self._conv2d_2(x) - - x = self._batch_norm_2(x, training=training) - x = self._activation_fn(x) - x = self._conv2d_3(x) - - return x - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class ReversibleLayer(tf.keras.layers.Layer): - """Creates a reversible layer. - - Computes y1 = x1 + f(x2), y2 = x2 + g(y1), where f and g can be arbitrary - layers that are stateless, which in this case are `ResidualInner` layers. - """ - - def __init__(self, - f: tf.keras.layers.Layer, - g: tf.keras.layers.Layer, - manual_grads: bool = True, - **kwargs): - """Initializes a ReversibleLayer. - - Args: - f: A `tf.keras.layers.Layer` instance of `f` inner block referred to in - paper. 
Each reversible layer consists of two inner functions. For - example, in RevNet the reversible residual consists of two f/g inner - (bottleneck) residual functions. Where the input to the reversible layer - is x, the input gets partitioned in the channel dimension and the - forward pass follows (eq8): x = [x1; x2], z1 = x1 + f(x2), y2 = x2 + - g(z1), y1 = stop_gradient(z1). - g: A `tf.keras.layers.Layer` instance of `g` inner block referred to in - paper. Detailed explanation same as above as `f` arg. - manual_grads: A `bool` [Testing Only] of whether to manually take - gradients as in Algorithm 1 or defer to autograd. - **kwargs: Additional keyword arguments to be passed. - """ - super(ReversibleLayer, self).__init__(**kwargs) - - self._f = f - self._g = g - self._manual_grads = manual_grads - - if tf.keras.backend.image_data_format() == 'channels_last': - self._axis = -1 - else: - self._axis = 1 - - def get_config(self) -> Dict[str, Any]: - config = { - 'f': self._f, - 'g': self._g, - 'manual_grads': self._manual_grads, - } - base_config = super(ReversibleLayer, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def _ckpt_non_trainable_vars(self): - self._f_non_trainable_vars = [ - v.read_value() for v in self._f.non_trainable_variables - ] - self._g_non_trainable_vars = [ - v.read_value() for v in self._g.non_trainable_variables - ] - - def _load_ckpt_non_trainable_vars(self): - for v, v_chkpt in zip(self._f.non_trainable_variables, - self._f_non_trainable_vars): - v.assign(v_chkpt) - for v, v_chkpt in zip(self._g.non_trainable_variables, - self._g_non_trainable_vars): - v.assign(v_chkpt) - - def call(self, - inputs: tf.Tensor, - training: Optional[bool] = None) -> tf.Tensor: - - @tf.custom_gradient - def reversible( - x: tf.Tensor - ) -> Tuple[tf.Tensor, Callable[[Any], Tuple[List[tf.Tensor], - List[tf.Tensor]]]]: - """Implements Algorithm 1 in the RevNet paper. - - Aidan N. Gomez, Mengye Ren, Raquel Urtasun, Roger B. Grosse. - The Reversible Residual Network: Backpropagation Without Storing - Activations. - (https://arxiv.org/pdf/1707.04585.pdf) - - Args: - x: An input `tf.Tensor. - - Returns: - y: The output [y1; y2] in Algorithm 1. - grad_fn: A callable function that computes the gradients. - """ - with tf.GradientTape() as fwdtape: - fwdtape.watch(x) - x1, x2 = tf.split(x, num_or_size_splits=2, axis=self._axis) - f_x2 = self._f(x2, training=training) - x1_down = _maybe_downsample(x1, f_x2.shape[self._axis], self._f.strides, - self._axis) - z1 = f_x2 + x1_down - g_z1 = self._g(z1, training=training) - x2_down = _maybe_downsample(x2, g_z1.shape[self._axis], self._f.strides, - self._axis) - y2 = x2_down + g_z1 - - # Equation 8: https://arxiv.org/pdf/1707.04585.pdf - # Decouple y1 and z1 so that their derivatives are different. - y1 = tf.identity(z1) - y = tf.concat([y1, y2], axis=self._axis) - - irreversible = ((self._f.strides != 1 or self._g.strides != 1) or - (y.shape[self._axis] != inputs.shape[self._axis])) - - # Checkpointing moving mean/variance for batch normalization layers - # as they shouldn't be updated during the custom gradient pass of f/g. 
- self._ckpt_non_trainable_vars() - - def grad_fn( - dy: tf.Tensor, - variables: Optional[List[tf.Variable]] = None, - ) -> Tuple[List[tf.Tensor], List[tf.Tensor]]: - """Given dy calculate (dy/dx)|_{x_{input}} using f/g.""" - if irreversible or not self._manual_grads: - grads_combined = fwdtape.gradient( - y, [x] + variables, output_gradients=dy) - dx = grads_combined[0] - grad_vars = grads_combined[1:] - else: - y1_nograd = tf.stop_gradient(y1) - y2_nograd = tf.stop_gradient(y2) - dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self._axis) - - # Index mapping from self.f/g.trainable_variables to grad_fn - # input `variables` kwarg so that we can reorder dwf + dwg - # variable gradient list to match `variables` order. - f_var_refs = [v.ref() for v in self._f.trainable_variables] - g_var_refs = [v.ref() for v in self._g.trainable_variables] - fg_var_refs = f_var_refs + g_var_refs - self_to_var_index = [fg_var_refs.index(v.ref()) for v in variables] - - # Algorithm 1 in paper (line # documented in-line) - z1 = y1_nograd # line 2 - with tf.GradientTape() as gtape: - gtape.watch(z1) - g_z1 = self._g(z1, training=training) - x2 = y2_nograd - g_z1 # line 3 - - with tf.GradientTape() as ftape: - ftape.watch(x2) - f_x2 = self._f(x2, training=training) - x1 = z1 - f_x2 # pylint: disable=unused-variable # line 4 - - # Compute gradients - g_grads_combined = gtape.gradient( - g_z1, [z1] + self._g.trainable_variables, output_gradients=dy2) - dz1 = dy1 + g_grads_combined[0] # line 5 - dwg = g_grads_combined[1:] # line 9 - - f_grads_combined = ftape.gradient( - f_x2, [x2] + self._f.trainable_variables, output_gradients=dz1) - dx2 = dy2 + f_grads_combined[0] # line 6 - dwf = f_grads_combined[1:] # line 8 - dx1 = dz1 # line 7 - - # Pack the input and variable gradients. - dx = tf.concat([dx1, dx2], axis=self._axis) - grad_vars = dwf + dwg - # Reorder gradients (trainable_variables to variables kwarg order) - grad_vars = [grad_vars[i] for i in self_to_var_index] - - # Restore batch normalization moving mean/variance for correctness. - self._load_ckpt_non_trainable_vars() - - return dx, grad_vars # grad_fn end - - return y, grad_fn # reversible end - - activations = reversible(inputs) - return activations - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class DepthwiseSeparableConvBlock(tf.keras.layers.Layer): - """Creates an depthwise separable convolution block with batch normalization.""" - - def __init__( - self, - filters: int, - kernel_size: int = 3, - strides: int = 1, - regularize_depthwise=False, - activation: Text = 'relu6', - kernel_initializer: Text = 'VarianceScaling', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - dilation_rate: int = 1, - use_sync_bn: bool = False, - norm_momentum: float = 0.99, - norm_epsilon: float = 0.001, - **kwargs): - """Initializes a convolution block with batch normalization. - - Args: - filters: An `int` number of filters for the first two convolutions. Note - that the third and final convolution will use 4 times as many filters. - kernel_size: An `int` that specifies the height and width of the 2D - convolution window. - strides: An `int` of block stride. If greater than 1, this block will - ultimately downsample the input. - regularize_depthwise: A `bool`. If Ture, apply regularization on - depthwise. - activation: A `str` name of the activation function. - kernel_initializer: A `str` of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. 
Default to None. - dilation_rate: An `int` or tuple/list of 2 `int`, specifying the dilation - rate to use for dilated convolution. Can be a single integer to specify - the same value for all spatial dimensions. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - **kwargs: Additional keyword arguments to be passed. - """ - super(DepthwiseSeparableConvBlock, self).__init__(**kwargs) - self._filters = filters - self._kernel_size = kernel_size - self._strides = strides - self._activation = activation - self._regularize_depthwise = regularize_depthwise - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._dilation_rate = dilation_rate - self._use_sync_bn = use_sync_bn - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation_fn = tf_utils.get_activation(activation) - if regularize_depthwise: - self._depthsize_regularizer = kernel_regularizer - else: - self._depthsize_regularizer = None - - def get_config(self): - config = { - 'filters': self._filters, - 'strides': self._strides, - 'regularize_depthwise': self._regularize_depthwise, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon - } - base_config = super(DepthwiseSeparableConvBlock, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def build(self, input_shape): - - self._dwconv0 = tf.keras.layers.DepthwiseConv2D( - kernel_size=self._kernel_size, - strides=self._strides, - padding='same', - depth_multiplier=1, - dilation_rate=self._dilation_rate, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._depthsize_regularizer, - use_bias=False) - self._norm0 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - self._conv1 = tf.keras.layers.Conv2D( - filters=self._filters, - kernel_size=1, - strides=1, - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer) - self._norm1 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - super(DepthwiseSeparableConvBlock, self).build(input_shape) - - def call(self, inputs, training=None): - x = self._dwconv0(inputs) - x = self._norm0(x) - x = self._activation_fn(x) - - x = self._conv1(x) - x = self._norm1(x) - return self._activation_fn(x) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class TuckerConvBlock(tf.keras.layers.Layer): - """An Tucker block (generalized bottleneck).""" - - def __init__(self, - in_filters, - out_filters, - input_compression_ratio, - output_compression_ratio, - strides, - kernel_size=3, - stochastic_depth_drop_rate=None, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - activation='relu', - use_sync_bn=False, - divisible_by=1, - use_residual=True, - norm_momentum=0.99, - norm_epsilon=0.001, - **kwargs): - 
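`DepthwiseSeparableConvBlock` above is a depthwise 3x3 convolution followed by a pointwise 1x1 convolution, each with batch normalization and a relu6 activation. A rough functional-API sketch with illustrative sizes (the stride of 2 and filter count are assumptions, not values taken from this file):

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(32, 32, 16))

# Depthwise 3x3 conv, then pointwise 1x1 conv, each followed by BN + relu6,
# mirroring the structure of the block above.
x = tf.keras.layers.DepthwiseConv2D(kernel_size=3, strides=2, padding="same",
                                    use_bias=False)(inputs)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.ReLU(max_value=6.0)(x)

x = tf.keras.layers.Conv2D(filters=64, kernel_size=1, strides=1,
                           padding="same", use_bias=False)(x)
x = tf.keras.layers.BatchNormalization()(x)
outputs = tf.keras.layers.ReLU(max_value=6.0)(x)

model = tf.keras.Model(inputs, outputs)
print(model.output_shape)  # (None, 16, 16, 64)
```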
"""Initializes an inverted bottleneck block with BN after convolutions. - - Args: - in_filters: An `int` number of filters of the input tensor. - out_filters: An `int` number of filters of the output tensor. - input_compression_ratio: An `float` of compression ratio for - input filters. - output_compression_ratio: An `float` of compression ratio for - output filters. - strides: An `int` block stride. If greater than 1, this block will - ultimately downsample the input. - kernel_size: An `int` kernel_size of the depthwise conv layer. - stochastic_depth_drop_rate: A `float` or None. if not None, drop rate for - the stochastic depth layer. - kernel_initializer: A `str` of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d. - Default to None. - activation: A `str` name of the activation function. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - divisible_by: An `int` that ensures all inner dimensions are divisible by - this number. - use_residual: A `bool` of whether to include residual connection between - input and output. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - **kwargs: Additional keyword arguments to be passed. - """ - super(TuckerConvBlock, self).__init__(**kwargs) - - self._in_filters = in_filters - self._out_filters = out_filters - self._input_compression_ratio = input_compression_ratio - self._output_compression_ratio = output_compression_ratio - self._strides = strides - self._kernel_size = kernel_size - self._divisible_by = divisible_by - self._stochastic_depth_drop_rate = stochastic_depth_drop_rate - self._use_sync_bn = use_sync_bn - self._use_residual = use_residual - self._activation = activation - self._kernel_initializer = kernel_initializer - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - - def build(self, input_shape): - input_compressed_filters = nn_layers.make_divisible( - value=self._in_filters * self._input_compression_ratio, - divisor=self._divisible_by, - round_down_protect=False) - - self._conv0 = tf.keras.layers.Conv2D( - filters=input_compressed_filters, - kernel_size=1, - strides=1, - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm0 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - self._activation_layer0 = tf_utils.get_activation( - self._activation, use_keras_layer=True) - - output_compressed_filters = nn_layers.make_divisible( - value=self._out_filters * self._output_compression_ratio, - divisor=self._divisible_by, - round_down_protect=False) - - self._conv1 = tf.keras.layers.Conv2D( - filters=output_compressed_filters, - kernel_size=self._kernel_size, - strides=self._strides, - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - 
bias_regularizer=self._bias_regularizer) - self._norm1 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - self._activation_layer1 = tf_utils.get_activation( - self._activation, use_keras_layer=True) - - # Last 1x1 conv. - self._conv2 = tf.keras.layers.Conv2D( - filters=self._out_filters, - kernel_size=1, - strides=1, - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm2 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - if self._stochastic_depth_drop_rate: - self._stochastic_depth = nn_layers.StochasticDepth( - self._stochastic_depth_drop_rate) - else: - self._stochastic_depth = None - self._add = tf.keras.layers.Add() - - super(TuckerConvBlock, self).build(input_shape) - - def get_config(self): - config = { - 'in_filters': self._in_filters, - 'out_filters': self._out_filters, - 'input_compression_ratio': self._input_compression_ratio, - 'output_compression_ratio': self._output_compression_ratio, - 'strides': self._strides, - 'kernel_size': self._kernel_size, - 'divisible_by': self._divisible_by, - 'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'use_residual': self._use_residual, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon - } - base_config = super(TuckerConvBlock, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs, training=None): - shortcut = inputs - - x = self._conv0(inputs) - x = self._norm0(x) - x = self._activation_layer0(x) - - x = self._conv1(x) - x = self._norm1(x) - x = self._activation_layer1(x) - - x = self._conv2(x) - x = self._norm2(x) - - if (self._use_residual and - self._in_filters == self._out_filters and - self._strides == 1): - if self._stochastic_depth: - x = self._stochastic_depth(x, training=training) - x = self._add([x, shortcut]) - - return x diff --git a/official/vision/beta/modeling/layers/nn_blocks_3d.py b/official/vision/beta/modeling/layers/nn_blocks_3d.py deleted file mode 100644 index 29a2cd274..000000000 --- a/official/vision/beta/modeling/layers/nn_blocks_3d.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains common building blocks for 3D networks.""" -# Import libraries -import tensorflow as tf - -from official.modeling import tf_utils -from official.vision.beta.modeling.layers import nn_layers - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class SelfGating(tf.keras.layers.Layer): - """Feature gating as used in S3D-G. 
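`TuckerConvBlock` above compresses the input channels with a 1x1 convolution, runs the spatial convolution at reduced width, and projects back to `out_filters` with a final 1x1 convolution, adding the residual only when the stride is 1 and the channel counts match. A rough sketch of the same structure; the helper name `tucker_block_sketch` and the 0.25 ratios are illustrative only, and it omits the `make_divisible` rounding and stochastic depth:

```python
import tensorflow as tf

def tucker_block_sketch(x, out_filters, input_ratio=0.25, output_ratio=0.25,
                        kernel_size=3, strides=1):
  """Rough sketch of a Tucker (generalized bottleneck) block."""
  in_filters = x.shape[-1]
  # 1x1 conv compressing the input channels.
  h = tf.keras.layers.Conv2D(int(in_filters * input_ratio), 1, padding="same",
                             use_bias=False)(x)
  h = tf.keras.layers.BatchNormalization()(h)
  h = tf.keras.layers.ReLU()(h)
  # Spatial conv at a compressed fraction of the output channels.
  h = tf.keras.layers.Conv2D(int(out_filters * output_ratio), kernel_size,
                             strides=strides, padding="same", use_bias=False)(h)
  h = tf.keras.layers.BatchNormalization()(h)
  h = tf.keras.layers.ReLU()(h)
  # Final 1x1 projection back to out_filters (no activation).
  h = tf.keras.layers.Conv2D(out_filters, 1, padding="same", use_bias=False)(h)
  h = tf.keras.layers.BatchNormalization()(h)
  # Residual connection only when the shapes line up.
  if strides == 1 and in_filters == out_filters:
    h = tf.keras.layers.Add()([h, x])
  return h

inputs = tf.keras.Input(shape=(32, 32, 24))
outputs = tucker_block_sketch(inputs, out_filters=24)
print(tf.keras.Model(inputs, outputs).output_shape)  # (None, 32, 32, 24)
```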
- - This implements the S3D-G network from: - Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, Kevin Murphy. - Rethinking Spatiotemporal Feature Learning: Speed-Accuracy Trade-offs in Video - Classification. - (https://arxiv.org/pdf/1712.04851.pdf) - """ - - def __init__(self, filters, **kwargs): - """Initializes a self-gating layer. - - Args: - filters: An `int` number of filters for the convolutional layer. - **kwargs: Additional keyword arguments to be passed. - """ - super(SelfGating, self).__init__(**kwargs) - self._filters = filters - - def build(self, input_shape): - self._spatial_temporal_average = tf.keras.layers.GlobalAveragePooling3D() - - # No BN and activation after conv. - self._transformer_w = tf.keras.layers.Conv3D( - filters=self._filters, - kernel_size=[1, 1, 1], - use_bias=True, - kernel_initializer=tf.keras.initializers.TruncatedNormal( - mean=0.0, stddev=0.01)) - - super(SelfGating, self).build(input_shape) - - def call(self, inputs): - x = self._spatial_temporal_average(inputs) - - x = tf.expand_dims(x, 1) - x = tf.expand_dims(x, 2) - x = tf.expand_dims(x, 3) - - x = self._transformer_w(x) - x = tf.nn.sigmoid(x) - - return tf.math.multiply(x, inputs) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class BottleneckBlock3D(tf.keras.layers.Layer): - """Creates a 3D bottleneck block.""" - - def __init__(self, - filters, - temporal_kernel_size, - temporal_strides, - spatial_strides, - stochastic_depth_drop_rate=0.0, - se_ratio=None, - use_self_gating=False, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - activation='relu', - use_sync_bn=False, - norm_momentum=0.99, - norm_epsilon=0.001, - **kwargs): - """Initializes a 3D bottleneck block with BN after convolutions. - - Args: - filters: An `int` number of filters for the first two convolutions. Note - that the third and final convolution will use 4 times as many filters. - temporal_kernel_size: An `int` of kernel size for the temporal - convolutional layer. - temporal_strides: An `int` of ftemporal stride for the temporal - convolutional layer. - spatial_strides: An `int` of spatial stride for the spatial convolutional - layer. - stochastic_depth_drop_rate: A `float` or None. If not None, drop rate for - the stochastic depth layer. - se_ratio: A `float` or None. Ratio of the Squeeze-and-Excitation layer. - use_self_gating: A `bool` of whether to apply self-gating module or not. - kernel_initializer: A `str` of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d. - Default to None. - activation: A `str` name of the activation function. - use_sync_bn: A `bool`. If True, use synchronized batch normalization. - norm_momentum: A `float` of normalization momentum for the moving average. - norm_epsilon: A `float` added to variance to avoid dividing by zero. - **kwargs: Additional keyword arguments to be passed. 
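`SelfGating` above is the S3D-G feature gate: a global spatiotemporal average feeds a 1x1x1 convolution whose sigmoid output rescales every channel. A compact sketch on a random 5-D clip tensor:

```python
import tensorflow as tf

x = tf.random.normal([2, 8, 14, 14, 64])  # [batch, time, height, width, channels]

# Global spatiotemporal average -> 1x1x1 conv -> sigmoid gate -> rescale features.
pooled = tf.reduce_mean(x, axis=[1, 2, 3], keepdims=True)         # [2, 1, 1, 1, 64]
gate = tf.keras.layers.Conv3D(filters=64, kernel_size=1)(pooled)  # [2, 1, 1, 1, 64]
gate = tf.nn.sigmoid(gate)
gated = x * gate                                                   # broadcast over t, h, w
print(gated.shape)  # (2, 8, 14, 14, 64)
```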
- """ - super(BottleneckBlock3D, self).__init__(**kwargs) - - self._filters = filters - self._temporal_kernel_size = temporal_kernel_size - self._spatial_strides = spatial_strides - self._temporal_strides = temporal_strides - self._stochastic_depth_drop_rate = stochastic_depth_drop_rate - self._use_self_gating = use_self_gating - self._se_ratio = se_ratio - self._use_sync_bn = use_sync_bn - self._activation = activation - self._kernel_initializer = kernel_initializer - self._norm_momentum = norm_momentum - self._norm_epsilon = norm_epsilon - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - - if use_sync_bn: - self._norm = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._norm = tf.keras.layers.BatchNormalization - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - self._activation_fn = tf_utils.get_activation(activation) - - def build(self, input_shape): - self._shortcut_maxpool = tf.keras.layers.MaxPool3D( - pool_size=[1, 1, 1], - strides=[ - self._temporal_strides, self._spatial_strides, self._spatial_strides - ]) - - self._shortcut_conv = tf.keras.layers.Conv3D( - filters=4 * self._filters, - kernel_size=1, - strides=[ - self._temporal_strides, self._spatial_strides, self._spatial_strides - ], - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm0 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - self._temporal_conv = tf.keras.layers.Conv3D( - filters=self._filters, - kernel_size=[self._temporal_kernel_size, 1, 1], - strides=[self._temporal_strides, 1, 1], - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm1 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - self._spatial_conv = tf.keras.layers.Conv3D( - filters=self._filters, - kernel_size=[1, 3, 3], - strides=[1, self._spatial_strides, self._spatial_strides], - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm2 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - self._expand_conv = tf.keras.layers.Conv3D( - filters=4 * self._filters, - kernel_size=[1, 1, 1], - strides=[1, 1, 1], - padding='same', - use_bias=False, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - self._norm3 = self._norm( - axis=self._bn_axis, - momentum=self._norm_momentum, - epsilon=self._norm_epsilon) - - if self._se_ratio and self._se_ratio > 0 and self._se_ratio <= 1: - self._squeeze_excitation = nn_layers.SqueezeExcitation( - in_filters=self._filters * 4, - out_filters=self._filters * 4, - se_ratio=self._se_ratio, - use_3d_input=True, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - else: - self._squeeze_excitation = None - - if self._stochastic_depth_drop_rate: - self._stochastic_depth = nn_layers.StochasticDepth( - self._stochastic_depth_drop_rate) - else: - self._stochastic_depth = None - - if self._use_self_gating: - self._self_gating = 
SelfGating(filters=4 * self._filters) - else: - self._self_gating = None - - super(BottleneckBlock3D, self).build(input_shape) - - def get_config(self): - config = { - 'filters': self._filters, - 'temporal_kernel_size': self._temporal_kernel_size, - 'temporal_strides': self._temporal_strides, - 'spatial_strides': self._spatial_strides, - 'use_self_gating': self._use_self_gating, - 'se_ratio': self._se_ratio, - 'stochastic_depth_drop_rate': self._stochastic_depth_drop_rate, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 'use_sync_bn': self._use_sync_bn, - 'norm_momentum': self._norm_momentum, - 'norm_epsilon': self._norm_epsilon - } - base_config = super(BottleneckBlock3D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs, training=None): - in_filters = inputs.shape.as_list()[-1] - if in_filters == 4 * self._filters: - if self._temporal_strides == 1 and self._spatial_strides == 1: - shortcut = inputs - else: - shortcut = self._shortcut_maxpool(inputs) - else: - shortcut = self._shortcut_conv(inputs) - shortcut = self._norm0(shortcut) - - x = self._temporal_conv(inputs) - x = self._norm1(x) - x = self._activation_fn(x) - - x = self._spatial_conv(x) - x = self._norm2(x) - x = self._activation_fn(x) - - x = self._expand_conv(x) - x = self._norm3(x) - - # Apply self-gating, SE, stochastic depth. - if self._self_gating: - x = self._self_gating(x) - if self._squeeze_excitation: - x = self._squeeze_excitation(x) - if self._stochastic_depth: - x = self._stochastic_depth(x, training=training) - - # Apply activation before additional modules. - x = self._activation_fn(x + shortcut) - - return x diff --git a/official/vision/beta/modeling/layers/nn_blocks_3d_test.py b/official/vision/beta/modeling/layers/nn_blocks_3d_test.py deleted file mode 100644 index 4093ca010..000000000 --- a/official/vision/beta/modeling/layers/nn_blocks_3d_test.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for resnet.""" - -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.layers import nn_blocks_3d - - -class NNBlocksTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (nn_blocks_3d.BottleneckBlock3D, 1, 1, 2, True, 0.2, 0.1), - (nn_blocks_3d.BottleneckBlock3D, 3, 2, 1, False, 0.0, 0.0), - ) - def test_bottleneck_block_creation(self, block_fn, temporal_kernel_size, - temporal_strides, spatial_strides, - use_self_gating, se_ratio, - stochastic_depth): - temporal_size = 16 - spatial_size = 128 - filters = 256 - inputs = tf.keras.Input( - shape=(temporal_size, spatial_size, spatial_size, filters * 4), - batch_size=1) - block = block_fn( - filters=filters, - temporal_kernel_size=temporal_kernel_size, - temporal_strides=temporal_strides, - spatial_strides=spatial_strides, - use_self_gating=use_self_gating, - se_ratio=se_ratio, - stochastic_depth_drop_rate=stochastic_depth) - - features = block(inputs) - - self.assertAllEqual([ - 1, temporal_size // temporal_strides, spatial_size // spatial_strides, - spatial_size // spatial_strides, filters * 4 - ], features.shape.as_list()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/layers/nn_blocks_test.py b/official/vision/beta/modeling/layers/nn_blocks_test.py deleted file mode 100644 index 5e92b8327..000000000 --- a/official/vision/beta/modeling/layers/nn_blocks_test.py +++ /dev/null @@ -1,340 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for nn_blocks.""" - -from typing import Any, Iterable, Tuple -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.vision.beta.modeling.layers import nn_blocks - - -def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]: - """Returns the combinations of end-to-end tests to run.""" - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ],) - - -class NNBlocksTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (nn_blocks.ResidualBlock, 1, False, 0.0, None), - (nn_blocks.ResidualBlock, 2, True, 0.2, 0.25), - ) - def test_residual_block_creation(self, block_fn, strides, use_projection, - stochastic_depth_drop_rate, se_ratio): - input_size = 128 - filter_size = 256 - inputs = tf.keras.Input( - shape=(input_size, input_size, filter_size), batch_size=1) - block = block_fn( - filter_size, - strides, - use_projection=use_projection, - se_ratio=se_ratio, - stochastic_depth_drop_rate=stochastic_depth_drop_rate, - ) - - features = block(inputs) - - self.assertAllEqual( - [1, input_size // strides, input_size // strides, filter_size], - features.shape.as_list()) - - @parameterized.parameters( - (nn_blocks.BottleneckBlock, 1, False, 0.0, None), - (nn_blocks.BottleneckBlock, 2, True, 0.2, 0.25), - ) - def test_bottleneck_block_creation(self, block_fn, strides, use_projection, - stochastic_depth_drop_rate, se_ratio): - input_size = 128 - filter_size = 256 - inputs = tf.keras.Input( - shape=(input_size, input_size, filter_size * 4), batch_size=1) - block = block_fn( - filter_size, - strides, - use_projection=use_projection, - se_ratio=se_ratio, - stochastic_depth_drop_rate=stochastic_depth_drop_rate) - - features = block(inputs) - - self.assertAllEqual( - [1, input_size // strides, input_size // strides, filter_size * 4], - features.shape.as_list()) - - @parameterized.parameters( - (nn_blocks.InvertedBottleneckBlock, 1, 1, None, None), - (nn_blocks.InvertedBottleneckBlock, 6, 1, None, None), - (nn_blocks.InvertedBottleneckBlock, 1, 2, None, None), - (nn_blocks.InvertedBottleneckBlock, 1, 1, 0.2, None), - (nn_blocks.InvertedBottleneckBlock, 1, 1, None, 0.2), - ) - def test_invertedbottleneck_block_creation(self, block_fn, expand_ratio, - strides, se_ratio, - stochastic_depth_drop_rate): - input_size = 128 - in_filters = 24 - out_filters = 40 - inputs = tf.keras.Input( - shape=(input_size, input_size, in_filters), batch_size=1) - block = block_fn( - in_filters=in_filters, - out_filters=out_filters, - expand_ratio=expand_ratio, - strides=strides, - se_ratio=se_ratio, - stochastic_depth_drop_rate=stochastic_depth_drop_rate) - - features = block(inputs) - - self.assertAllEqual( - [1, input_size // strides, input_size // strides, out_filters], - features.shape.as_list()) - - @parameterized.parameters( - (nn_blocks.TuckerConvBlock, 1, 0.25, 0.25), - (nn_blocks.TuckerConvBlock, 2, 0.25, 0.25), - ) - def test_tucker_conv_block( - self, block_fn, strides, - input_compression_ratio, output_compression_ratio): - input_size = 128 - in_filters = 24 - out_filters = 24 - inputs = tf.keras.Input( - shape=(input_size, input_size, in_filters), batch_size=1) - block = block_fn( - in_filters=in_filters, - out_filters=out_filters, - input_compression_ratio=input_compression_ratio, 
- output_compression_ratio=output_compression_ratio, - strides=strides) - - features = block(inputs) - - self.assertAllEqual( - [1, input_size // strides, input_size // strides, out_filters], - features.shape.as_list()) - - -class ResidualInnerTest(parameterized.TestCase, tf.test.TestCase): - - @combinations.generate(distribution_strategy_combinations()) - def test_shape(self, distribution): - bsz, h, w, c = 8, 32, 32, 32 - filters = 64 - strides = 2 - - input_tensor = tf.random.uniform(shape=[bsz, h, w, c]) - with distribution.scope(): - test_layer = nn_blocks.ResidualInner(filters, strides) - - output = test_layer(input_tensor) - expected_output_shape = [bsz, h // strides, w // strides, filters] - self.assertEqual(expected_output_shape, output.shape.as_list()) - - -class BottleneckResidualInnerTest(parameterized.TestCase, tf.test.TestCase): - - @combinations.generate(distribution_strategy_combinations()) - def test_shape(self, distribution): - bsz, h, w, c = 8, 32, 32, 32 - filters = 64 - strides = 2 - - input_tensor = tf.random.uniform(shape=[bsz, h, w, c]) - with distribution.scope(): - test_layer = nn_blocks.BottleneckResidualInner(filters, strides) - - output = test_layer(input_tensor) - expected_output_shape = [bsz, h // strides, w // strides, filters * 4] - self.assertEqual(expected_output_shape, output.shape.as_list()) - - -class DepthwiseSeparableConvBlockTest(parameterized.TestCase, tf.test.TestCase): - - @combinations.generate(distribution_strategy_combinations()) - def test_shape(self, distribution): - batch_size, height, width, num_channels = 8, 32, 32, 32 - num_filters = 64 - strides = 2 - - input_tensor = tf.random.normal( - shape=[batch_size, height, width, num_channels]) - with distribution.scope(): - block = nn_blocks.DepthwiseSeparableConvBlock( - num_filters, strides=strides) - config_dict = block.get_config() - recreate_block = nn_blocks.DepthwiseSeparableConvBlock(**config_dict) - - output_tensor = block(input_tensor) - expected_output_shape = [ - batch_size, height // strides, width // strides, num_filters - ] - self.assertEqual(output_tensor.shape.as_list(), expected_output_shape) - - output_tensor = recreate_block(input_tensor) - self.assertEqual(output_tensor.shape.as_list(), expected_output_shape) - - -class ReversibleLayerTest(parameterized.TestCase, tf.test.TestCase): - - @combinations.generate(distribution_strategy_combinations()) - def test_downsampling_non_reversible_step(self, distribution): - bsz, h, w, c = 8, 32, 32, 32 - filters = 64 - strides = 2 - - input_tensor = tf.random.uniform(shape=[bsz, h, w, c]) - with distribution.scope(): - f = nn_blocks.ResidualInner( - filters=filters // 2, strides=strides, batch_norm_first=True) - g = nn_blocks.ResidualInner( - filters=filters // 2, strides=1, batch_norm_first=True) - test_layer = nn_blocks.ReversibleLayer(f, g) - test_layer.build(input_tensor.shape) - optimizer = tf.keras.optimizers.SGD(learning_rate=0.01) - - @tf.function - def step_fn(): - with tf.GradientTape() as tape: - output = test_layer(input_tensor, training=True) - grads = tape.gradient(output, test_layer.trainable_variables) - # Test applying gradients with optimizer works - optimizer.apply_gradients(zip(grads, test_layer.trainable_variables)) - - return output - - replica_output = distribution.run(step_fn) - outputs = distribution.experimental_local_results(replica_output) - - # Assert forward pass shape - expected_output_shape = [bsz, h // strides, w // strides, filters] - for output in outputs: - self.assertEqual(expected_output_shape, 
output.shape.as_list()) - - @combinations.generate(distribution_strategy_combinations()) - def test_reversible_step(self, distribution): - # Reversible layers satisfy: (a) strides = 1 (b) in_filter = out_filter - bsz, h, w, c = 8, 32, 32, 32 - filters = c - strides = 1 - - input_tensor = tf.random.uniform(shape=[bsz, h, w, c]) - with distribution.scope(): - f = nn_blocks.ResidualInner( - filters=filters // 2, strides=strides, batch_norm_first=False) - g = nn_blocks.ResidualInner( - filters=filters // 2, strides=1, batch_norm_first=False) - test_layer = nn_blocks.ReversibleLayer(f, g) - test_layer(input_tensor, training=False) # init weights - optimizer = tf.keras.optimizers.SGD(learning_rate=0.01) - - @tf.function - def step_fn(): - with tf.GradientTape() as tape: - output = test_layer(input_tensor, training=True) - grads = tape.gradient(output, test_layer.trainable_variables) - # Test applying gradients with optimizer works - optimizer.apply_gradients(zip(grads, test_layer.trainable_variables)) - - return output - - @tf.function - def fwd(): - test_layer(input_tensor) - - distribution.run(fwd) # Initialize variables - prev_variables = tf.identity_n(test_layer.trainable_variables) - replica_output = distribution.run(step_fn) - outputs = distribution.experimental_local_results(replica_output) - - # Assert variables values have changed values - for v0, v1 in zip(prev_variables, test_layer.trainable_variables): - self.assertNotAllEqual(v0, v1) - - # Assert forward pass shape - expected_output_shape = [bsz, h // strides, w // strides, filters] - for output in outputs: - self.assertEqual(expected_output_shape, output.shape.as_list()) - - @combinations.generate(distribution_strategy_combinations()) - def test_manual_gradients_correctness(self, distribution): - bsz, h, w, c = 8, 32, 32, 32 - filters = c - strides = 1 - - input_tensor = tf.random.uniform(shape=[bsz, h, w, c * 4]) # bottleneck - with distribution.scope(): - f_manual = nn_blocks.BottleneckResidualInner( - filters=filters // 2, strides=strides, batch_norm_first=False) - g_manual = nn_blocks.BottleneckResidualInner( - filters=filters // 2, strides=1, batch_norm_first=False) - manual_grad_layer = nn_blocks.ReversibleLayer(f_manual, g_manual) - manual_grad_layer(input_tensor, training=False) # init weights - - f_auto = nn_blocks.BottleneckResidualInner( - filters=filters // 2, strides=strides, batch_norm_first=False) - g_auto = nn_blocks.BottleneckResidualInner( - filters=filters // 2, strides=1, batch_norm_first=False) - auto_grad_layer = nn_blocks.ReversibleLayer( - f_auto, g_auto, manual_grads=False) - auto_grad_layer(input_tensor) # init weights - # Clone all weights (tf.keras.layers.Layer has no .clone()) - auto_grad_layer._f.set_weights(manual_grad_layer._f.get_weights()) - auto_grad_layer._g.set_weights(manual_grad_layer._g.get_weights()) - - @tf.function - def manual_fn(): - with tf.GradientTape() as tape: - output = manual_grad_layer(input_tensor, training=True) - grads = tape.gradient(output, manual_grad_layer.trainable_variables) - return grads - - @tf.function - def auto_fn(): - with tf.GradientTape() as tape: - output = auto_grad_layer(input_tensor, training=True) - grads = tape.gradient(output, auto_grad_layer.trainable_variables) - return grads - - manual_grads = distribution.run(manual_fn) - auto_grads = distribution.run(auto_fn) - - # Assert gradients calculated manually are close to that from autograd - for manual_grad, auto_grad in zip(manual_grads, auto_grads): - self.assertAllClose( - 
distribution.experimental_local_results(manual_grad), - distribution.experimental_local_results(auto_grad), - atol=5e-3, - rtol=5e-3) - - # Verify that BN moving mean and variance is correct. - for manual_var, auto_var in zip(manual_grad_layer.non_trainable_variables, - auto_grad_layer.non_trainable_variables): - self.assertAllClose(manual_var, auto_var) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/layers/nn_layers.py b/official/vision/beta/modeling/layers/nn_layers.py deleted file mode 100644 index b1252373e..000000000 --- a/official/vision/beta/modeling/layers/nn_layers.py +++ /dev/null @@ -1,1277 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains common building blocks for neural networks.""" -from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union - -from absl import logging -import tensorflow as tf -import tensorflow_addons as tfa - -from official.modeling import tf_utils -from official.vision.beta.ops import spatial_transform_ops - - -# Type annotations. -States = Dict[str, tf.Tensor] -Activation = Union[str, Callable] - - -def make_divisible(value: float, - divisor: int, - min_value: Optional[float] = None, - round_down_protect: bool = True, - ) -> int: - """This is to ensure that all layers have channels that are divisible by 8. - - Args: - value: A `float` of original value. - divisor: An `int` of the divisor that need to be checked upon. - min_value: A `float` of minimum value threshold. - round_down_protect: A `bool` indicating whether round down more than 10% - will be allowed. - - Returns: - The adjusted value in `int` that is divisible against divisor. - """ - if min_value is None: - min_value = divisor - new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. 
- if round_down_protect and new_value < 0.9 * value: - new_value += divisor - return int(new_value) - - -def round_filters(filters: int, - multiplier: float, - divisor: int = 8, - min_depth: Optional[int] = None, - round_down_protect: bool = True, - skip: bool = False) -> int: - """Rounds number of filters based on width multiplier.""" - orig_f = filters - if skip or not multiplier: - return filters - - new_filters = make_divisible(value=filters * multiplier, - divisor=divisor, - min_value=min_depth, - round_down_protect=round_down_protect) - - logging.info('round_filter input=%s output=%s', orig_f, new_filters) - return int(new_filters) - - -def get_padding_for_kernel_size(kernel_size): - """Compute padding size given kernel size.""" - if kernel_size == 7: - return (3, 3) - elif kernel_size == 3: - return (1, 1) - else: - raise ValueError('Padding for kernel size {} not known.'.format( - kernel_size)) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class SqueezeExcitation(tf.keras.layers.Layer): - """Creates a squeeze and excitation layer.""" - - def __init__(self, - in_filters, - out_filters, - se_ratio, - divisible_by=1, - use_3d_input=False, - kernel_initializer='VarianceScaling', - kernel_regularizer=None, - bias_regularizer=None, - activation='relu', - gating_activation='sigmoid', - round_down_protect=True, - **kwargs): - """Initializes a squeeze and excitation layer. - - Args: - in_filters: An `int` number of filters of the input tensor. - out_filters: An `int` number of filters of the output tensor. - se_ratio: A `float` or None. If not None, se ratio for the squeeze and - excitation layer. - divisible_by: An `int` that ensures all inner dimensions are divisible by - this number. - use_3d_input: A `bool` of whether input is 2D or 3D image. - kernel_initializer: A `str` of kernel_initializer for convolutional - layers. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default to None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2d. - Default to None. - activation: A `str` name of the activation function. - gating_activation: A `str` name of the activation function for final - gating function. - round_down_protect: A `bool` of whether round down more than 10% will be - allowed. - **kwargs: Additional keyword arguments to be passed. 
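`make_divisible` above snaps a channel count to a multiple of `divisor` (never below `min_value`) and, with `round_down_protect`, adds one extra `divisor` step whenever plain rounding would lose more than 10% of the requested width. A paraphrased copy with a few worked values; the name `make_divisible_sketch` is only for illustration:

```python
# Paraphrase of the rounding rule above, for illustration only.
def make_divisible_sketch(value, divisor, min_value=None, round_down_protect=True):
  if min_value is None:
    min_value = divisor
  new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
  if round_down_protect and new_value < 0.9 * value:
    new_value += divisor
  return int(new_value)

print(make_divisible_sketch(37, 8))   # 40: rounds to the nearest multiple of 8
print(make_divisible_sketch(20, 16))  # 32: plain rounding to 16 would lose >10%, so bump up
print(make_divisible_sketch(100, 8))  # 104
```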
- """ - super(SqueezeExcitation, self).__init__(**kwargs) - - self._in_filters = in_filters - self._out_filters = out_filters - self._se_ratio = se_ratio - self._divisible_by = divisible_by - self._round_down_protect = round_down_protect - self._use_3d_input = use_3d_input - self._activation = activation - self._gating_activation = gating_activation - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - if tf.keras.backend.image_data_format() == 'channels_last': - if not use_3d_input: - self._spatial_axis = [1, 2] - else: - self._spatial_axis = [1, 2, 3] - else: - if not use_3d_input: - self._spatial_axis = [2, 3] - else: - self._spatial_axis = [2, 3, 4] - self._activation_fn = tf_utils.get_activation(activation) - self._gating_activation_fn = tf_utils.get_activation(gating_activation) - - def build(self, input_shape): - num_reduced_filters = make_divisible( - max(1, int(self._in_filters * self._se_ratio)), - divisor=self._divisible_by, - round_down_protect=self._round_down_protect) - - self._se_reduce = tf.keras.layers.Conv2D( - filters=num_reduced_filters, - kernel_size=1, - strides=1, - padding='same', - use_bias=True, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - - self._se_expand = tf.keras.layers.Conv2D( - filters=self._out_filters, - kernel_size=1, - strides=1, - padding='same', - use_bias=True, - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer) - - super(SqueezeExcitation, self).build(input_shape) - - def get_config(self): - config = { - 'in_filters': self._in_filters, - 'out_filters': self._out_filters, - 'se_ratio': self._se_ratio, - 'divisible_by': self._divisible_by, - 'use_3d_input': self._use_3d_input, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'bias_regularizer': self._bias_regularizer, - 'activation': self._activation, - 'gating_activation': self._gating_activation, - 'round_down_protect': self._round_down_protect, - } - base_config = super(SqueezeExcitation, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs): - x = tf.reduce_mean(inputs, self._spatial_axis, keepdims=True) - x = self._activation_fn(self._se_reduce(x)) - x = self._gating_activation_fn(self._se_expand(x)) - return x * inputs - - -def get_stochastic_depth_rate(init_rate, i, n): - """Get drop connect rate for the ith block. - - Args: - init_rate: A `float` of initial drop rate. - i: An `int` of order of the current block. - n: An `int` total number of blocks. - - Returns: - Drop rate of the ith block. - """ - if init_rate is not None: - if init_rate < 0 or init_rate > 1: - raise ValueError('Initial drop rate must be within 0 and 1.') - rate = init_rate * float(i) / n - else: - rate = None - return rate - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class StochasticDepth(tf.keras.layers.Layer): - """Creates a stochastic depth layer.""" - - def __init__(self, stochastic_depth_drop_rate, **kwargs): - """Initializes a stochastic depth layer. - - Args: - stochastic_depth_drop_rate: A `float` of drop rate. - **kwargs: Additional keyword arguments to be passed. - - Returns: - A output `tf.Tensor` of which should have the same shape as input. 
- """ - super(StochasticDepth, self).__init__(**kwargs) - self._drop_rate = stochastic_depth_drop_rate - - def get_config(self): - config = {'drop_rate': self._drop_rate} - base_config = super(StochasticDepth, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs, training=None): - if training is None: - training = tf.keras.backend.learning_phase() - if not training or self._drop_rate is None or self._drop_rate == 0: - return inputs - - keep_prob = 1.0 - self._drop_rate - batch_size = tf.shape(inputs)[0] - random_tensor = keep_prob - random_tensor += tf.random.uniform( - [batch_size] + [1] * (inputs.shape.rank - 1), dtype=inputs.dtype) - binary_tensor = tf.floor(random_tensor) - output = tf.math.divide(inputs, keep_prob) * binary_tensor - return output - - -@tf.keras.utils.register_keras_serializable(package='Beta') -def pyramid_feature_fusion(inputs, target_level): - """Fuses all feature maps in the feature pyramid at the target level. - - Args: - inputs: A dictionary containing the feature pyramid. The size of the input - tensor needs to be fixed. - target_level: An `int` of the target feature level for feature fusion. - - Returns: - A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width, - feature_channel]. - """ - # Convert keys to int. - pyramid_feats = {int(k): v for k, v in inputs.items()} - min_level = min(pyramid_feats.keys()) - max_level = max(pyramid_feats.keys()) - resampled_feats = [] - - for l in range(min_level, max_level + 1): - if l == target_level: - resampled_feats.append(pyramid_feats[l]) - else: - feat = pyramid_feats[l] - target_size = list(feat.shape[1:3]) - target_size[0] *= 2**(l - target_level) - target_size[1] *= 2**(l - target_level) - # Casts feat to float32 so the resize op can be run on TPU. - feat = tf.cast(feat, tf.float32) - feat = tf.image.resize( - feat, size=target_size, method=tf.image.ResizeMethod.BILINEAR) - # Casts it back to be compatible with the rest opetations. - feat = tf.cast(feat, pyramid_feats[l].dtype) - resampled_feats.append(feat) - - return tf.math.add_n(resampled_feats) - - -class PanopticFPNFusion(tf.keras.Model): - """Creates a Panoptic FPN feature Fusion layer. - - This implements feature fusion for semantic segmentation head from the paper: - Alexander Kirillov, Ross Girshick, Kaiming He and Piotr Dollar. - Panoptic Feature Pyramid Networks. - (https://arxiv.org/pdf/1901.02446.pdf) - """ - - def __init__( - self, - min_level: int = 2, - max_level: int = 5, - target_level: int = 2, - num_filters: int = 128, - num_fpn_filters: int = 256, - activation: str = 'relu', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - - """Initializes panoptic FPN feature fusion layer. - - Args: - min_level: An `int` of minimum level to use in feature fusion. - max_level: An `int` of maximum level to use in feature fusion. - target_level: An `int` of the target feature level for feature fusion. - num_filters: An `int` number of filters in conv2d layers. - num_fpn_filters: An `int` number of filters in the FPN outputs - activation: A `str` name of the activation function. - kernel_regularizer: A `tf.keras.regularizers.Regularizer` object for - Conv2D. Default is None. - bias_regularizer: A `tf.keras.regularizers.Regularizer` object for Conv2D. - **kwargs: Additional keyword arguments to be passed. 
- Returns: - A `float` `tf.Tensor` of shape [batch_size, feature_height, feature_width, - feature_channel]. - """ - if target_level > max_level: - raise ValueError('target_level should be less than max_level') - - self._config_dict = { - 'min_level': min_level, - 'max_level': max_level, - 'target_level': target_level, - 'num_filters': num_filters, - 'num_fpn_filters': num_fpn_filters, - 'activation': activation, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - } - norm = tfa.layers.GroupNormalization - conv2d = tf.keras.layers.Conv2D - activation_fn = tf_utils.get_activation(activation) - if tf.keras.backend.image_data_format() == 'channels_last': - norm_axis = -1 - else: - norm_axis = 1 - inputs = self._build_inputs(num_fpn_filters, min_level, max_level) - - upscaled_features = [] - for level in range(min_level, max_level + 1): - num_conv_layers = max(1, level - target_level) - x = inputs[str(level)] - for i in range(num_conv_layers): - x = conv2d( - filters=num_filters, - kernel_size=3, - padding='same', - kernel_initializer=tf.keras.initializers.VarianceScaling(), - kernel_regularizer=kernel_regularizer, - bias_regularizer=bias_regularizer)(x) - x = norm(groups=32, axis=norm_axis)(x) - x = activation_fn(x) - if level != target_level: - x = spatial_transform_ops.nearest_upsampling(x, scale=2) - upscaled_features.append(x) - - fused_features = tf.math.add_n(upscaled_features) - self._output_specs = {str(target_level): fused_features.get_shape()} - - super(PanopticFPNFusion, self).__init__( - inputs=inputs, outputs=fused_features, **kwargs) - - def _build_inputs(self, num_filters: int, - min_level: int, max_level: int): - inputs = {} - for level in range(min_level, max_level + 1): - inputs[str(level)] = tf.keras.Input(shape=[None, None, num_filters]) - return inputs - - def get_config(self) -> Mapping[str, Any]: - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) - - @property - def output_specs(self) -> Mapping[str, tf.TensorShape]: - """A dict of {level: TensorShape} pairs for the model output.""" - return self._output_specs - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class Scale(tf.keras.layers.Layer): - """Scales the input by a trainable scalar weight. - - This is useful for applying ReZero to layers, which improves convergence - speed. This implements the paper: - ReZero is All You Need: Fast Convergence at Large Depth. - (https://arxiv.org/pdf/2003.04887.pdf). - """ - - def __init__( - self, - initializer: tf.keras.initializers.Initializer = 'ones', - regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - **kwargs): - """Initializes a scale layer. - - Args: - initializer: A `str` of initializer for the scalar weight. - regularizer: A `tf.keras.regularizers.Regularizer` for the scalar weight. - **kwargs: Additional keyword arguments to be passed to this layer. - - Returns: - An `tf.Tensor` of which should have the same shape as input. 
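`PanopticFPNFusion` above brings every pyramid level to the `target_level` resolution with `level - target_level` conv + 2x nearest-upsampling stages and then sums the levels. A rough functional sketch without the group normalization and initializers; the helper name and sizes are illustrative:

```python
import tensorflow as tf

def panoptic_fusion_sketch(features, target_level=2, num_filters=128):
  """Upscales every pyramid level to `target_level` resolution and sums them."""
  upscaled = []
  for level_str, feat in features.items():
    level = int(level_str)
    x = feat
    for _ in range(max(1, level - target_level)):
      x = tf.keras.layers.Conv2D(num_filters, 3, padding="same")(x)
      x = tf.keras.layers.Activation("relu")(x)
      if level != target_level:
        x = tf.keras.layers.UpSampling2D(2, interpolation="nearest")(x)
    upscaled.append(x)
  return tf.keras.layers.Add()(upscaled)

feats = {str(l): tf.keras.Input([64 // 2**(l - 2), 64 // 2**(l - 2), 256])
         for l in range(2, 6)}
fused = panoptic_fusion_sketch(feats)
print(tf.keras.Model(feats, fused).output_shape)  # (None, 64, 64, 128)
```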
- """ - super(Scale, self).__init__(**kwargs) - - self._initializer = initializer - self._regularizer = regularizer - - self._scale = self.add_weight( - name='scale', - shape=[], - dtype=self.dtype, - initializer=self._initializer, - regularizer=self._regularizer, - trainable=True) - - def get_config(self): - """Returns a dictionary containing the config used for initialization.""" - config = { - 'initializer': self._initializer, - 'regularizer': self._regularizer, - } - base_config = super(Scale, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs): - """Calls the layer with the given inputs.""" - scale = tf.cast(self._scale, inputs.dtype) - return scale * inputs - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class TemporalSoftmaxPool(tf.keras.layers.Layer): - """Creates a network layer corresponding to temporal softmax pooling. - - This is useful for multi-class logits (used in e.g., Charades). Modified from - AssembleNet Charades evaluation from: - - Michael S. Ryoo, AJ Piergiovanni, Mingxing Tan, Anelia Angelova. - AssembleNet: Searching for Multi-Stream Neural Connectivity in Video - Architectures. - (https://arxiv.org/pdf/1905.13209.pdf). - """ - - def call(self, inputs): - """Calls the layer with the given inputs.""" - assert inputs.shape.rank in (3, 4, 5) - frames = tf.shape(inputs)[1] - pre_logits = inputs / tf.sqrt(tf.cast(frames, inputs.dtype)) - activations = tf.nn.softmax(pre_logits, axis=1) - outputs = inputs * activations - return outputs - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class PositionalEncoding(tf.keras.layers.Layer): - """Creates a network layer that adds a sinusoidal positional encoding. - - Positional encoding is incremented across frames, and is added to the input. - The positional encoding is first weighted at 0 so that the network can choose - to ignore it. This implements: - - Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, - Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin. - Attention Is All You Need. - (https://arxiv.org/pdf/1706.03762.pdf). - """ - - def __init__(self, - initializer: tf.keras.initializers.Initializer = 'zeros', - cache_encoding: bool = False, - state_prefix: Optional[str] = None, - **kwargs): - """Initializes positional encoding. - - Args: - initializer: A `str` of initializer for weighting the positional encoding. - cache_encoding: A `bool`. If True, cache the positional encoding tensor - after calling build. Otherwise, rebuild the tensor for every call. - Setting this to False can be useful when we want to input a variable - number of frames, so the positional encoding tensor can change shape. - state_prefix: a prefix string to identify states. - **kwargs: Additional keyword arguments to be passed to this layer. - - Returns: - A `tf.Tensor` of which should have the same shape as input. 
- """ - super(PositionalEncoding, self).__init__(**kwargs) - self._initializer = initializer - self._cache_encoding = cache_encoding - self._pos_encoding = None - self._rezero = Scale(initializer=initializer, name='rezero') - state_prefix = state_prefix if state_prefix is not None else '' - self._state_prefix = state_prefix - self._frame_count_name = f'{state_prefix}_pos_enc_frame_count' - - def get_config(self): - """Returns a dictionary containing the config used for initialization.""" - config = { - 'initializer': self._initializer, - 'cache_encoding': self._cache_encoding, - 'state_prefix': self._state_prefix, - } - base_config = super(PositionalEncoding, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def _positional_encoding(self, - num_positions: Union[int, tf.Tensor], - hidden_size: Union[int, tf.Tensor], - start_position: Union[int, tf.Tensor] = 0, - dtype: str = 'float32') -> tf.Tensor: - """Creates a sequence of sinusoidal positional encoding vectors. - - Args: - num_positions: the total number of positions (frames). - hidden_size: the number of channels used for the hidden vectors. - start_position: the start position. - dtype: the dtype of the output tensor. - - Returns: - The positional encoding tensor with shape [num_positions, hidden_size]. - """ - if isinstance(start_position, tf.Tensor) and start_position.shape.rank == 1: - start_position = start_position[0] - - # Calling `tf.range` with `dtype=tf.bfloat16` results in an error, - # so we cast afterward. - positions = tf.range(start_position, start_position + num_positions) - positions = tf.cast(positions, dtype)[:, tf.newaxis] - idx = tf.range(hidden_size)[tf.newaxis, :] - - power = tf.cast(2 * (idx // 2), dtype) - power /= tf.cast(hidden_size, dtype) - angles = 1. / tf.math.pow(10_000., power) - radians = positions * angles - - sin = tf.math.sin(radians[:, 0::2]) - cos = tf.math.cos(radians[:, 1::2]) - pos_encoding = tf.concat([sin, cos], axis=-1) - - return pos_encoding - - def _get_pos_encoding(self, - input_shape: tf.Tensor, - frame_count: int = 0) -> tf.Tensor: - """Calculates the positional encoding from the input shape. - - Args: - input_shape: the shape of the input. - frame_count: a count of frames that indicates the index of the first - frame. - - Returns: - The positional encoding tensor with shape [num_positions, hidden_size]. - - """ - frames = input_shape[1] - channels = input_shape[-1] - pos_encoding = self._positional_encoding( - frames, channels, start_position=frame_count, dtype=self.dtype) - pos_encoding = tf.reshape(pos_encoding, [1, frames, 1, 1, channels]) - return pos_encoding - - def build(self, input_shape): - """Builds the layer with the given input shape. - - Args: - input_shape: The input shape. - - Raises: - ValueError: If using 'channels_first' data format. - """ - if tf.keras.backend.image_data_format() == 'channels_first': - raise ValueError('"channels_first" mode is unsupported.') - - if self._cache_encoding: - self._pos_encoding = self._get_pos_encoding(input_shape) - - super(PositionalEncoding, self).build(input_shape) - - def call( - self, - inputs: tf.Tensor, - states: Optional[States] = None, - output_states: bool = True, - ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]: - """Calls the layer with the given inputs. - - Args: - inputs: An input `tf.Tensor`. - states: A `dict` of states such that, if any of the keys match for this - layer, will overwrite the contents of the buffer(s). Expected keys - include `state_prefix + '_pos_enc_frame_count'`. 
- output_states: A `bool`. If True, returns the output tensor and output - states. Returns just the output tensor otherwise. - - Returns: - An output `tf.Tensor` (and optionally the states if `output_states=True`). - - Raises: - ValueError: If using 'channels_first' data format. - """ - states = dict(states) if states is not None else {} - - # Keep a count of frames encountered across input iterations in - # num_frames to be able to accurately update the positional encoding. - num_frames = tf.shape(inputs)[1] - frame_count = tf.cast(states.get(self._frame_count_name, [0]), tf.int32) - states[self._frame_count_name] = frame_count + num_frames - - if self._cache_encoding: - pos_encoding = self._pos_encoding - else: - pos_encoding = self._get_pos_encoding( - tf.shape(inputs), frame_count=frame_count) - pos_encoding = tf.cast(pos_encoding, inputs.dtype) - pos_encoding = self._rezero(pos_encoding) - outputs = inputs + pos_encoding - - return (outputs, states) if output_states else outputs - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class GlobalAveragePool3D(tf.keras.layers.Layer): - """Creates a global average pooling layer with causal mode. - - Implements causal mode, which runs a cumulative sum (with `tf.cumsum`) across - frames in the time dimension, allowing the use of a stream buffer. Sums any - valid input state with the current input to allow state to accumulate over - several iterations. - """ - - def __init__(self, - keepdims: bool = False, - causal: bool = False, - state_prefix: Optional[str] = None, - **kwargs): - """Initializes a global average pool layer. - - Args: - keepdims: A `bool`. If True, keep the averaged dimensions. - causal: A `bool` of whether to run in causal mode with a cumulative sum - across frames. - state_prefix: a prefix string to identify states. - **kwargs: Additional keyword arguments to be passed to this layer. - - Returns: - An output `tf.Tensor`. - """ - super(GlobalAveragePool3D, self).__init__(**kwargs) - - self._keepdims = keepdims - self._causal = causal - state_prefix = state_prefix if state_prefix is not None else '' - self._state_prefix = state_prefix - - self._state_name = f'{state_prefix}_pool_buffer' - self._frame_count_name = f'{state_prefix}_pool_frame_count' - - def get_config(self): - """Returns a dictionary containing the config used for initialization.""" - config = { - 'keepdims': self._keepdims, - 'causal': self._causal, - 'state_prefix': self._state_prefix, - } - base_config = super(GlobalAveragePool3D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, - inputs: tf.Tensor, - states: Optional[States] = None, - output_states: bool = True - ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]: - """Calls the layer with the given inputs. - - Args: - inputs: An input `tf.Tensor`. - states: A `dict` of states such that, if any of the keys match for this - layer, will overwrite the contents of the buffer(s). - Expected keys include `state_prefix + '__pool_buffer'` and - `state_prefix + '__pool_frame_count'`. - output_states: A `bool`. If True, returns the output tensor and output - states. Returns just the output tensor otherwise. - - Returns: - An output `tf.Tensor` (and optionally the states if `output_states=True`). - If `causal=True`, the output tensor will have shape - `[batch_size, num_frames, 1, 1, channels]` if `keepdims=True`. We keep - the frame dimension in this case to simulate a cumulative global average - as if we are inputting one frame at a time. 
If `causal=False`, the output - is equivalent to `tf.keras.layers.GlobalAveragePooling3D` with shape - `[batch_size, 1, 1, 1, channels]` if `keepdims=True` (plus the optional - buffer stored in `states`). - - Raises: - ValueError: If using 'channels_first' data format. - """ - states = dict(states) if states is not None else {} - - if tf.keras.backend.image_data_format() == 'channels_first': - raise ValueError('"channels_first" mode is unsupported.') - - # Shape: [batch_size, 1, 1, 1, channels] - buffer = states.get(self._state_name, None) - if buffer is None: - buffer = tf.zeros_like(inputs[:, :1, :1, :1], dtype=inputs.dtype) - states[self._state_name] = buffer - - # Keep a count of frames encountered across input iterations in - # num_frames to be able to accurately take a cumulative average across - # all frames when running in streaming mode - num_frames = tf.shape(inputs)[1] - frame_count = states.get(self._frame_count_name, tf.constant([0])) - frame_count = tf.cast(frame_count, tf.int32) - states[self._frame_count_name] = frame_count + num_frames - - if self._causal: - # Take a mean of spatial dimensions to make computation more efficient. - x = tf.reduce_mean(inputs, axis=[2, 3], keepdims=True) - x = tf.cumsum(x, axis=1) - x = x + buffer - - # The last frame will be the value of the next state - # Shape: [batch_size, 1, 1, 1, channels] - states[self._state_name] = x[:, -1:] - - # In causal mode, the divisor increments by 1 for every frame to - # calculate cumulative averages instead of one global average - mean_divisors = tf.range(num_frames) + frame_count + 1 - mean_divisors = tf.reshape(mean_divisors, [1, num_frames, 1, 1, 1]) - mean_divisors = tf.cast(mean_divisors, x.dtype) - - # Shape: [batch_size, num_frames, 1, 1, channels] - x = x / mean_divisors - else: - # In non-causal mode, we (optionally) sum across frames to take a - # cumulative average across input iterations rather than individual - # frames. If no buffer state is passed, this essentially becomes - # regular global average pooling. - # Shape: [batch_size, 1, 1, 1, channels] - x = tf.reduce_sum(inputs, axis=(1, 2, 3), keepdims=True) - x = x / tf.cast(tf.shape(inputs)[2] * tf.shape(inputs)[3], x.dtype) - x = x + buffer - - # Shape: [batch_size, 1, 1, 1, channels] - states[self._state_name] = x - - x = x / tf.cast(frame_count + num_frames, x.dtype) - - if not self._keepdims: - x = tf.squeeze(x, axis=(1, 2, 3)) - - return (x, states) if output_states else x - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class SpatialAveragePool3D(tf.keras.layers.Layer): - """Creates a global average pooling layer pooling across spatial dimentions.""" - - def __init__(self, keepdims: bool = False, **kwargs): - """Initializes a global average pool layer. - - Args: - keepdims: A `bool`. If True, keep the averaged dimensions. - **kwargs: Additional keyword arguments to be passed to this layer. - - Returns: - An output `tf.Tensor`. 
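# Editor's note (not part of the original patch): a minimal sketch of the cumulative
# average the causal branch above computes. Frame t is averaged over frames 0..t, which
# is what lets the layer stream a clip in chunks and still match the offline result
# (the deleted causal-pooling test below expects exactly these values: 1.0, 1.5, 2.0, 2.5).
import tensorflow as tf

frames = tf.reshape(tf.range(4, dtype=tf.float32) + 1., [1, 4, 1, 1, 1])  # 1, 2, 3, 4
divisors = tf.reshape(tf.range(1, 5, dtype=tf.float32), [1, 4, 1, 1, 1])  # 1, 2, 3, 4
cumulative_mean = tf.cumsum(frames, axis=1) / divisors                    # 1.0, 1.5, 2.0, 2.5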
- """ - super(SpatialAveragePool3D, self).__init__(**kwargs) - self._keepdims = keepdims - - def get_config(self): - """Returns a dictionary containing the config used for initialization.""" - config = { - 'keepdims': self._keepdims, - } - base_config = super(SpatialAveragePool3D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def build(self, input_shape): - """Builds the layer with the given input shape.""" - if tf.keras.backend.image_data_format() == 'channels_first': - raise ValueError('"channels_first" mode is unsupported.') - - super(SpatialAveragePool3D, self).build(input_shape) - - def call(self, inputs): - """Calls the layer with the given inputs.""" - if inputs.shape.rank != 5: - raise ValueError( - 'Input should have rank {}, got {}'.format(5, inputs.shape.rank)) - - return tf.reduce_mean(inputs, axis=(2, 3), keepdims=self._keepdims) - - -class CausalConvMixin: - """Mixin class to implement CausalConv for `tf.keras.layers.Conv` layers.""" - - @property - def use_buffered_input(self) -> bool: - return self._use_buffered_input - - @use_buffered_input.setter - def use_buffered_input(self, variable: bool): - self._use_buffered_input = variable - - def _compute_buffered_causal_padding(self, - inputs: tf.Tensor, - use_buffered_input: bool = False, - time_axis: int = 1, - ) -> List[List[int]]: - """Calculates padding for 'causal' option for conv layers. - - Args: - inputs: An optional input `tf.Tensor` to be padded. - use_buffered_input: A `bool`. If True, use 'valid' padding along the time - dimension. This should be set when applying the stream buffer. - time_axis: An `int` of the axis of the time dimension. - - Returns: - A list of paddings for `tf.pad`. - """ - input_shape = tf.shape(inputs)[1:-1] - - if tf.keras.backend.image_data_format() == 'channels_first': - raise ValueError('"channels_first" mode is unsupported.') - - kernel_size_effective = [ - (self.kernel_size[i] + - (self.kernel_size[i] - 1) * (self.dilation_rate[i] - 1)) - for i in range(self.rank) - ] - pad_total = [kernel_size_effective[0] - 1] - for i in range(1, self.rank): - overlap = (input_shape[i] - 1) % self.strides[i] + 1 - pad_total.append(tf.maximum(kernel_size_effective[i] - overlap, 0)) - pad_beg = [pad_total[i] // 2 for i in range(self.rank)] - pad_end = [pad_total[i] - pad_beg[i] for i in range(self.rank)] - padding = [[pad_beg[i], pad_end[i]] for i in range(self.rank)] - padding = [[0, 0]] + padding + [[0, 0]] - - if use_buffered_input: - padding[time_axis] = [0, 0] - else: - padding[time_axis] = [padding[time_axis][0] + padding[time_axis][1], 0] - return padding - - def _causal_validate_init(self): - """Validates the Conv layer initial configuration.""" - # Overriding this method is meant to circumvent unnecessary errors when - # using causal padding. - if (self.filters is not None - and self.filters % self.groups != 0): - raise ValueError( - 'The number of filters must be evenly divisible by the number of ' - 'groups. Received: groups={}, filters={}'.format( - self.groups, self.filters)) - - if not all(self.kernel_size): - raise ValueError('The argument `kernel_size` cannot contain 0(s). ' - 'Received: %s' % (self.kernel_size,)) - - def _buffered_spatial_output_shape(self, spatial_output_shape: List[int]): - """Computes the spatial output shape from the input shape.""" - # When buffer padding, use 'valid' padding across time. The output shape - # across time should be the input shape minus any padding, assuming - # the stride across time is 1. 
- if self._use_buffered_input and spatial_output_shape[0] is not None: - padding = self._compute_buffered_causal_padding( - tf.zeros([1] + spatial_output_shape + [1]), use_buffered_input=False) - spatial_output_shape[0] -= sum(padding[1]) - return spatial_output_shape - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class Conv2D(tf.keras.layers.Conv2D, CausalConvMixin): - """Conv2D layer supporting CausalConv. - - Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`), - which applies causal padding to the temporal dimension, and same padding in - the spatial dimensions. - """ - - def __init__(self, *args, use_buffered_input=False, **kwargs): - """Initializes conv2d. - - Args: - *args: Arguments to be passed. - use_buffered_input: A `bool`. If True, the input is expected to be padded - beforehand. In effect, calling this layer will use 'valid' padding on - the temporal dimension to simulate 'causal' padding. - **kwargs: Additional keyword arguments to be passed. - - Returns: - An output `tf.Tensor` of the Conv2D operation. - """ - super(Conv2D, self).__init__(*args, **kwargs) - self._use_buffered_input = use_buffered_input - - def get_config(self): - """Returns a dictionary containing the config used for initialization.""" - config = { - 'use_buffered_input': self._use_buffered_input, - } - base_config = super(Conv2D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def _compute_causal_padding(self, inputs): - """Computes causal padding dimensions for the given inputs.""" - return self._compute_buffered_causal_padding( - inputs, use_buffered_input=self._use_buffered_input) - - def _validate_init(self): - """Validates the Conv layer initial configuration.""" - self._causal_validate_init() - - def _spatial_output_shape(self, spatial_input_shape: List[int]): - """Computes the spatial output shape from the input shape.""" - shape = super(Conv2D, self)._spatial_output_shape(spatial_input_shape) - return self._buffered_spatial_output_shape(shape) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class DepthwiseConv2D(tf.keras.layers.DepthwiseConv2D, CausalConvMixin): - """DepthwiseConv2D layer supporting CausalConv. - - Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`), - which applies causal padding to the temporal dimension, and same padding in - the spatial dimensions. - """ - - def __init__(self, *args, use_buffered_input=False, **kwargs): - """Initializes depthwise conv2d. - - Args: - *args: Arguments to be passed. - use_buffered_input: A `bool`. If True, the input is expected to be padded - beforehand. In effect, calling this layer will use 'valid' padding on - the temporal dimension to simulate 'causal' padding. - **kwargs: Additional keyword arguments to be passed. - - Returns: - An output `tf.Tensor` of the DepthwiseConv2D operation. - """ - super(DepthwiseConv2D, self).__init__(*args, **kwargs) - self._use_buffered_input = use_buffered_input - - # Causal padding is unsupported by default for DepthwiseConv2D, - # so we resort to valid padding internally. However, we handle - # causal padding as a special case with `self._is_causal`, which is - # defined by the super class. 
- if self.padding == 'causal': - self.padding = 'valid' - - def get_config(self): - """Returns a dictionary containing the config used for initialization.""" - config = { - 'use_buffered_input': self._use_buffered_input, - } - base_config = super(DepthwiseConv2D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs): - """Calls the layer with the given inputs.""" - if self._is_causal: - inputs = tf.pad(inputs, self._compute_causal_padding(inputs)) - return super(DepthwiseConv2D, self).call(inputs) - - def _compute_causal_padding(self, inputs): - """Computes causal padding dimensions for the given inputs.""" - return self._compute_buffered_causal_padding( - inputs, use_buffered_input=self._use_buffered_input) - - def _validate_init(self): - """Validates the Conv layer initial configuration.""" - self._causal_validate_init() - - def _spatial_output_shape(self, spatial_input_shape: List[int]): - """Computes the spatial output shape from the input shape.""" - shape = super(DepthwiseConv2D, self)._spatial_output_shape( - spatial_input_shape) - return self._buffered_spatial_output_shape(shape) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class Conv3D(tf.keras.layers.Conv3D, CausalConvMixin): - """Conv3D layer supporting CausalConv. - - Supports `padding='causal'` option (like in `tf.keras.layers.Conv1D`), - which applies causal padding to the temporal dimension, and same padding in - the spatial dimensions. - """ - - def __init__(self, *args, use_buffered_input=False, **kwargs): - """Initializes conv3d. - - Args: - *args: Arguments to be passed. - use_buffered_input: A `bool`. If True, the input is expected to be padded - beforehand. In effect, calling this layer will use 'valid' padding on - the temporal dimension to simulate 'causal' padding. - **kwargs: Additional keyword arguments to be passed. - - Returns: - An output `tf.Tensor` of the Conv3D operation. - """ - super(Conv3D, self).__init__(*args, **kwargs) - self._use_buffered_input = use_buffered_input - - def get_config(self): - """Returns a dictionary containing the config used for initialization.""" - config = { - 'use_buffered_input': self._use_buffered_input, - } - base_config = super(Conv3D, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def call(self, inputs): - """Call the layer with the given inputs.""" - # Note: tf.nn.conv3d with depthwise kernels on CPU is currently only - # supported when compiling with TF graph (XLA) using tf.function, so it - # is compiled by default here (b/186463870). - conv_fn = tf.function(super(Conv3D, self).call, jit_compile=True) - return conv_fn(inputs) - - def _compute_causal_padding(self, inputs): - """Computes causal padding dimensions for the given inputs.""" - return self._compute_buffered_causal_padding( - inputs, use_buffered_input=self._use_buffered_input) - - def _validate_init(self): - """Validates the Conv layer initial configuration.""" - self._causal_validate_init() - - def _spatial_output_shape(self, spatial_input_shape: List[int]): - """Computes the spatial output shape from the input shape.""" - shape = super(Conv3D, self)._spatial_output_shape(spatial_input_shape) - return self._buffered_spatial_output_shape(shape) - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class SpatialPyramidPooling(tf.keras.layers.Layer): - """Implements the Atrous Spatial Pyramid Pooling. 
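# Editor's note (not part of the original patch): a usage sketch of the
# `use_buffered_input` contract shared by the causal Conv2D / DepthwiseConv2D / Conv3D
# layers above (this module's variants, not the stock Keras layers), mirroring the
# deleted unit test: with use_buffered_input=True the layer runs 'valid' padding along
# time and the caller prepends the past frames (e.g. from a stream buffer) itself.
import tensorflow as tf

conv2d = Conv2D(filters=3, kernel_size=(3, 3), strides=(1, 2), padding='causal',
                use_buffered_input=True, kernel_initializer='ones', use_bias=False)
inputs = tf.ones([1, 4, 2, 3])                               # [batch, time, width, channels]
buffered = tf.pad(inputs, [[0, 0], [2, 0], [0, 0], [0, 0]])  # caller supplies 2 past frames
streamed = conv2d(buffered)

conv2d.use_buffered_input = False                            # let the layer pad instead
assert streamed.shape == conv2d(inputs).shape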
- - References: - [Rethinking Atrous Convolution for Semantic Image Segmentation]( - https://arxiv.org/pdf/1706.05587.pdf) - [Encoder-Decoder with Atrous Separable Convolution for Semantic Image - Segmentation](https://arxiv.org/pdf/1802.02611.pdf) - """ - - def __init__( - self, - output_channels: int, - dilation_rates: List[int], - pool_kernel_size: Optional[List[int]] = None, - use_sync_bn: bool = False, - batchnorm_momentum: float = 0.99, - batchnorm_epsilon: float = 0.001, - activation: str = 'relu', - dropout: float = 0.5, - kernel_initializer: str = 'GlorotUniform', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - interpolation: str = 'bilinear', - use_depthwise_convolution: bool = False, - **kwargs): - """Initializes `SpatialPyramidPooling`. - - Args: - output_channels: Number of channels produced by SpatialPyramidPooling. - dilation_rates: A list of integers for parallel dilated conv. - pool_kernel_size: A list of integers or None. If None, global average - pooling is applied, otherwise an average pooling of pool_kernel_size is - applied. - use_sync_bn: A bool, whether or not to use sync batch normalization. - batchnorm_momentum: A float for the momentum in BatchNorm. Defaults to - 0.99. - batchnorm_epsilon: A float for the epsilon value in BatchNorm. Defaults to - 0.001. - activation: A `str` for type of activation to be used. Defaults to 'relu'. - dropout: A float for the dropout rate before output. Defaults to 0.5. - kernel_initializer: Kernel initializer for conv layers. Defaults to - `glorot_uniform`. - kernel_regularizer: Kernel regularizer for conv layers. Defaults to None. - interpolation: The interpolation method for upsampling. Defaults to - `bilinear`. - use_depthwise_convolution: Allows spatial pooling to be separable - depthwise convolusions. [Encoder-Decoder with Atrous Separable - Convolution for Semantic Image Segmentation]( - https://arxiv.org/pdf/1802.02611.pdf) - **kwargs: Other keyword arguments for the layer. 
- """ - super().__init__(**kwargs) - - self._output_channels = output_channels - self._dilation_rates = dilation_rates - self._use_sync_bn = use_sync_bn - self._batchnorm_momentum = batchnorm_momentum - self._batchnorm_epsilon = batchnorm_epsilon - self._activation = activation - self._dropout = dropout - self._kernel_initializer = kernel_initializer - self._kernel_regularizer = kernel_regularizer - self._interpolation = interpolation - self._pool_kernel_size = pool_kernel_size - self._use_depthwise_convolution = use_depthwise_convolution - self._activation_fn = tf_utils.get_activation(activation) - if self._use_sync_bn: - self._bn_op = tf.keras.layers.experimental.SyncBatchNormalization - else: - self._bn_op = tf.keras.layers.BatchNormalization - - if tf.keras.backend.image_data_format() == 'channels_last': - self._bn_axis = -1 - else: - self._bn_axis = 1 - - def build(self, input_shape): - height = input_shape[1] - width = input_shape[2] - channels = input_shape[3] - - self.aspp_layers = [] - - conv1 = tf.keras.layers.Conv2D( - filters=self._output_channels, - kernel_size=(1, 1), - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - use_bias=False) - norm1 = self._bn_op( - axis=self._bn_axis, - momentum=self._batchnorm_momentum, - epsilon=self._batchnorm_epsilon) - - self.aspp_layers.append([conv1, norm1]) - - for dilation_rate in self._dilation_rates: - leading_layers = [] - kernel_size = (3, 3) - if self._use_depthwise_convolution: - leading_layers += [ - tf.keras.layers.DepthwiseConv2D( - depth_multiplier=1, - kernel_size=kernel_size, - padding='same', - depthwise_regularizer=self._kernel_regularizer, - depthwise_initializer=self._kernel_initializer, - dilation_rate=dilation_rate, - use_bias=False) - ] - kernel_size = (1, 1) - conv_dilation = leading_layers + [ - tf.keras.layers.Conv2D( - filters=self._output_channels, - kernel_size=kernel_size, - padding='same', - kernel_regularizer=self._kernel_regularizer, - kernel_initializer=self._kernel_initializer, - dilation_rate=dilation_rate, - use_bias=False) - ] - norm_dilation = self._bn_op( - axis=self._bn_axis, - momentum=self._batchnorm_momentum, - epsilon=self._batchnorm_epsilon) - - self.aspp_layers.append(conv_dilation + [norm_dilation]) - - if self._pool_kernel_size is None: - pooling = [ - tf.keras.layers.GlobalAveragePooling2D(), - tf.keras.layers.Reshape((1, 1, channels)) - ] - else: - pooling = [tf.keras.layers.AveragePooling2D(self._pool_kernel_size)] - - conv2 = tf.keras.layers.Conv2D( - filters=self._output_channels, - kernel_size=(1, 1), - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - use_bias=False) - norm2 = self._bn_op( - axis=self._bn_axis, - momentum=self._batchnorm_momentum, - epsilon=self._batchnorm_epsilon) - - self.aspp_layers.append(pooling + [conv2, norm2]) - - self._resizing_layer = tf.keras.layers.Resizing( - height, width, interpolation=self._interpolation, dtype=tf.float32) - - self._projection = [ - tf.keras.layers.Conv2D( - filters=self._output_channels, - kernel_size=(1, 1), - kernel_initializer=self._kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - use_bias=False), - self._bn_op( - axis=self._bn_axis, - momentum=self._batchnorm_momentum, - epsilon=self._batchnorm_epsilon) - ] - self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout) - self._concat_layer = tf.keras.layers.Concatenate(axis=-1) - - def call(self, - inputs: tf.Tensor, - training: Optional[bool] = None) -> tf.Tensor: - if 
training is None: - training = tf.keras.backend.learning_phase() - result = [] - for i, layers in enumerate(self.aspp_layers): - x = inputs - for layer in layers: - # Apply layers sequentially. - x = layer(x, training=training) - x = self._activation_fn(x) - - # Apply resize layer to the end of the last set of layers. - if i == len(self.aspp_layers) - 1: - x = self._resizing_layer(x) - - result.append(tf.cast(x, inputs.dtype)) - x = self._concat_layer(result) - for layer in self._projection: - x = layer(x, training=training) - x = self._activation_fn(x) - return self._dropout_layer(x) - - def get_config(self): - config = { - 'output_channels': self._output_channels, - 'dilation_rates': self._dilation_rates, - 'pool_kernel_size': self._pool_kernel_size, - 'use_sync_bn': self._use_sync_bn, - 'batchnorm_momentum': self._batchnorm_momentum, - 'batchnorm_epsilon': self._batchnorm_epsilon, - 'activation': self._activation, - 'dropout': self._dropout, - 'kernel_initializer': self._kernel_initializer, - 'kernel_regularizer': self._kernel_regularizer, - 'interpolation': self._interpolation, - } - base_config = super().get_config() - return dict(list(base_config.items()) + list(config.items())) diff --git a/official/vision/beta/modeling/layers/nn_layers_test.py b/official/vision/beta/modeling/layers/nn_layers_test.py deleted file mode 100644 index b2131b2a3..000000000 --- a/official/vision/beta/modeling/layers/nn_layers_test.py +++ /dev/null @@ -1,418 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for nn_layers.""" - -# Import libraries -from absl.testing import parameterized -import tensorflow as tf - -from official.vision.beta.modeling.layers import nn_layers - - -class NNLayersTest(parameterized.TestCase, tf.test.TestCase): - - def test_scale(self): - scale = nn_layers.Scale(initializer=tf.keras.initializers.constant(10.)) - output = scale(3.) - self.assertAllEqual(output, 30.) - - def test_temporal_softmax_pool(self): - inputs = tf.range(4, dtype=tf.float32) + 1. 
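# Editor's note (not part of the original patch): a usage sketch of the
# SpatialPyramidPooling (ASPP) layer defined above, mirroring the deleted test_aspp
# below: parallel 1x1 and dilated 3x3 branches plus image-level pooling are projected
# back to `output_channels`, preserving the spatial size.
import tensorflow as tf

inputs = tf.keras.Input(shape=(64, 64, 128))
aspp = SpatialPyramidPooling(output_channels=256, dilation_rates=[6, 12, 18])
outputs = aspp(inputs)   # shape [None, 64, 64, 256]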
- inputs = tf.reshape(inputs, [1, 4, 1, 1, 1]) - layer = nn_layers.TemporalSoftmaxPool() - output = layer(inputs) - self.assertAllClose( - output, - [[[[[0.10153633]]], - [[[0.33481020]]], - [[[0.82801306]]], - [[[1.82021690]]]]]) - - def test_positional_encoding(self): - pos_encoding = nn_layers.PositionalEncoding( - initializer='ones', cache_encoding=False) - pos_encoding_cached = nn_layers.PositionalEncoding( - initializer='ones', cache_encoding=True) - - inputs = tf.ones([1, 4, 1, 1, 3]) - outputs, _ = pos_encoding(inputs) - outputs_cached, _ = pos_encoding_cached(inputs) - - expected = tf.constant( - [[[[[1.0000000, 1.0000000, 2.0000000]]], - [[[1.8414710, 1.0021545, 1.5403023]]], - [[[1.9092975, 1.0043088, 0.5838531]]], - [[[1.1411200, 1.0064633, 0.0100075]]]]]) - - self.assertEqual(outputs.shape, expected.shape) - self.assertAllClose(outputs, expected) - - self.assertEqual(outputs.shape, outputs_cached.shape) - self.assertAllClose(outputs, outputs_cached) - - inputs = tf.ones([1, 5, 1, 1, 3]) - _ = pos_encoding(inputs) - - def test_positional_encoding_bfloat16(self): - pos_encoding = nn_layers.PositionalEncoding(initializer='ones') - - inputs = tf.ones([1, 4, 1, 1, 3], dtype=tf.bfloat16) - outputs, _ = pos_encoding(inputs) - - expected = tf.constant( - [[[[[1.0000000, 1.0000000, 2.0000000]]], - [[[1.8414710, 1.0021545, 1.5403023]]], - [[[1.9092975, 1.0043088, 0.5838531]]], - [[[1.1411200, 1.0064633, 0.0100075]]]]]) - - self.assertEqual(outputs.shape, expected.shape) - self.assertAllClose(outputs, expected) - - def test_global_average_pool_basic(self): - pool = nn_layers.GlobalAveragePool3D(keepdims=True) - - inputs = tf.ones([1, 2, 3, 4, 1]) - outputs = pool(inputs, output_states=False) - - expected = tf.ones([1, 1, 1, 1, 1]) - - self.assertEqual(outputs.shape, expected.shape) - self.assertAllEqual(outputs, expected) - - def test_positional_encoding_stream(self): - pos_encoding = nn_layers.PositionalEncoding( - initializer='ones', cache_encoding=False) - - inputs = tf.range(4, dtype=tf.float32) + 1. - inputs = tf.reshape(inputs, [1, 4, 1, 1, 1]) - inputs = tf.tile(inputs, [1, 1, 1, 1, 3]) - expected, _ = pos_encoding(inputs) - - for num_splits in [1, 2, 4]: - frames = tf.split(inputs, num_splits, axis=1) - states = {} - predicted = [] - for frame in frames: - output, states = pos_encoding(frame, states=states) - predicted.append(output) - predicted = tf.concat(predicted, axis=1) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - self.assertAllClose(predicted, [[[[[1.0000000, 1.0000000, 2.0000000]]], - [[[2.8414710, 2.0021544, 2.5403023]]], - [[[3.9092975, 3.0043090, 2.5838532]]], - [[[4.1411200, 4.0064630, 3.0100074]]]]]) - - def test_global_average_pool_keras(self): - pool = nn_layers.GlobalAveragePool3D(keepdims=False) - keras_pool = tf.keras.layers.GlobalAveragePooling3D() - - inputs = 10 * tf.random.normal([1, 2, 3, 4, 1]) - - outputs = pool(inputs, output_states=False) - keras_output = keras_pool(inputs) - - self.assertAllEqual(outputs.shape, keras_output.shape) - self.assertAllClose(outputs, keras_output) - - def test_stream_global_average_pool(self): - gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=False) - - inputs = tf.range(4, dtype=tf.float32) + 1. 
- inputs = tf.reshape(inputs, [1, 4, 1, 1, 1]) - inputs = tf.tile(inputs, [1, 1, 2, 2, 3]) - expected, _ = gap(inputs) - - for num_splits in [1, 2, 4]: - frames = tf.split(inputs, num_splits, axis=1) - states = {} - predicted = None - for frame in frames: - predicted, states = gap(frame, states=states) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - self.assertAllClose( - predicted, - [[[[[2.5, 2.5, 2.5]]]]]) - - def test_causal_stream_global_average_pool(self): - gap = nn_layers.GlobalAveragePool3D(keepdims=True, causal=True) - - inputs = tf.range(4, dtype=tf.float32) + 1. - inputs = tf.reshape(inputs, [1, 4, 1, 1, 1]) - inputs = tf.tile(inputs, [1, 1, 2, 2, 3]) - expected, _ = gap(inputs) - - for num_splits in [1, 2, 4]: - frames = tf.split(inputs, num_splits, axis=1) - states = {} - predicted = [] - for frame in frames: - x, states = gap(frame, states=states) - predicted.append(x) - predicted = tf.concat(predicted, axis=1) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - self.assertAllClose( - predicted, - [[[[[1.0, 1.0, 1.0]]], - [[[1.5, 1.5, 1.5]]], - [[[2.0, 2.0, 2.0]]], - [[[2.5, 2.5, 2.5]]]]]) - - def test_spatial_average_pool(self): - pool = nn_layers.SpatialAveragePool3D(keepdims=True) - - inputs = tf.range(64, dtype=tf.float32) + 1. - inputs = tf.reshape(inputs, [1, 4, 4, 4, 1]) - - output = pool(inputs) - - self.assertEqual(output.shape, [1, 4, 1, 1, 1]) - self.assertAllClose( - output, - [[[[[8.50]]], - [[[24.5]]], - [[[40.5]]], - [[[56.5]]]]]) - - def test_conv2d_causal(self): - conv2d = nn_layers.Conv2D( - filters=3, - kernel_size=(3, 3), - strides=(1, 2), - padding='causal', - use_buffered_input=True, - kernel_initializer='ones', - use_bias=False, - ) - - inputs = tf.ones([1, 4, 2, 3]) - - paddings = [[0, 0], [2, 0], [0, 0], [0, 0]] - padded_inputs = tf.pad(inputs, paddings) - predicted = conv2d(padded_inputs) - - expected = tf.constant( - [[[[6.0, 6.0, 6.0]], - [[12., 12., 12.]], - [[18., 18., 18.]], - [[18., 18., 18.]]]]) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - conv2d.use_buffered_input = False - predicted = conv2d(inputs) - - self.assertFalse(conv2d.use_buffered_input) - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - def test_depthwise_conv2d_causal(self): - conv2d = nn_layers.DepthwiseConv2D( - kernel_size=(3, 3), - strides=(1, 1), - padding='causal', - use_buffered_input=True, - depthwise_initializer='ones', - use_bias=False, - ) - - inputs = tf.ones([1, 2, 2, 3]) - - paddings = [[0, 0], [2, 0], [0, 0], [0, 0]] - padded_inputs = tf.pad(inputs, paddings) - predicted = conv2d(padded_inputs) - - expected = tf.constant( - [[[[2., 2., 2.], - [2., 2., 2.]], - [[4., 4., 4.], - [4., 4., 4.]]]]) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - conv2d.use_buffered_input = False - predicted = conv2d(inputs) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - def test_conv3d_causal(self): - conv3d = nn_layers.Conv3D( - filters=3, - kernel_size=(3, 3, 3), - strides=(1, 2, 2), - padding='causal', - use_buffered_input=True, - kernel_initializer='ones', - use_bias=False, - ) - - inputs = tf.ones([1, 2, 4, 4, 3]) - - paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]] - padded_inputs = tf.pad(inputs, paddings) - predicted = conv3d(padded_inputs) - - expected = tf.constant( - 
[[[[[27., 27., 27.], - [18., 18., 18.]], - [[18., 18., 18.], - [12., 12., 12.]]], - [[[54., 54., 54.], - [36., 36., 36.]], - [[36., 36., 36.], - [24., 24., 24.]]]]]) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - conv3d.use_buffered_input = False - predicted = conv3d(inputs) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - def test_depthwise_conv3d_causal(self): - conv3d = nn_layers.Conv3D( - filters=3, - kernel_size=(3, 3, 3), - strides=(1, 2, 2), - padding='causal', - use_buffered_input=True, - kernel_initializer='ones', - use_bias=False, - groups=3, - ) - - inputs = tf.ones([1, 2, 4, 4, 3]) - - paddings = [[0, 0], [2, 0], [0, 0], [0, 0], [0, 0]] - padded_inputs = tf.pad(inputs, paddings) - predicted = conv3d(padded_inputs) - - expected = tf.constant( - [[[[[9.0, 9.0, 9.0], - [6.0, 6.0, 6.0]], - [[6.0, 6.0, 6.0], - [4.0, 4.0, 4.0]]], - [[[18.0, 18.0, 18.0], - [12., 12., 12.]], - [[12., 12., 12.], - [8., 8., 8.]]]]]) - - output_shape = conv3d._spatial_output_shape([4, 4, 4]) - self.assertAllClose(output_shape, [2, 2, 2]) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - conv3d.use_buffered_input = False - predicted = conv3d(inputs) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - def test_conv3d_causal_padding_2d(self): - """Test to ensure causal padding works like standard padding.""" - conv3d = nn_layers.Conv3D( - filters=1, - kernel_size=(1, 3, 3), - strides=(1, 2, 2), - padding='causal', - use_buffered_input=False, - kernel_initializer='ones', - use_bias=False, - ) - - keras_conv3d = tf.keras.layers.Conv3D( - filters=1, - kernel_size=(1, 3, 3), - strides=(1, 2, 2), - padding='same', - kernel_initializer='ones', - use_bias=False, - ) - - inputs = tf.ones([1, 1, 4, 4, 1]) - - predicted = conv3d(inputs) - expected = keras_conv3d(inputs) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - self.assertAllClose(predicted, - [[[[[9.], - [6.]], - [[6.], - [4.]]]]]) - - def test_conv3d_causal_padding_1d(self): - """Test to ensure causal padding works like standard padding.""" - conv3d = nn_layers.Conv3D( - filters=1, - kernel_size=(3, 1, 1), - strides=(2, 1, 1), - padding='causal', - use_buffered_input=False, - kernel_initializer='ones', - use_bias=False, - ) - - keras_conv1d = tf.keras.layers.Conv1D( - filters=1, - kernel_size=3, - strides=2, - padding='causal', - kernel_initializer='ones', - use_bias=False, - ) - - inputs = tf.ones([1, 4, 1, 1, 1]) - - predicted = conv3d(inputs) - expected = keras_conv1d(tf.squeeze(inputs, axis=[2, 3])) - expected = tf.reshape(expected, [1, 2, 1, 1, 1]) - - self.assertEqual(predicted.shape, expected.shape) - self.assertAllClose(predicted, expected) - - self.assertAllClose(predicted, - [[[[[1.]]], - [[[3.]]]]]) - - @parameterized.parameters( - (None, []), - (None, [6, 12, 18]), - ([32, 32], [6, 12, 18]), - ) - def test_aspp(self, pool_kernel_size, dilation_rates): - inputs = tf.keras.Input(shape=(64, 64, 128), dtype=tf.float32) - layer = nn_layers.SpatialPyramidPooling( - output_channels=256, - dilation_rates=dilation_rates, - pool_kernel_size=pool_kernel_size) - output = layer(inputs) - self.assertAllEqual([None, 64, 64, 256], output.shape) - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/layers/roi_aligner.py b/official/vision/beta/modeling/layers/roi_aligner.py 
deleted file mode 100644 index 351b8cc11..000000000 --- a/official/vision/beta/modeling/layers/roi_aligner.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of ROI aligner.""" - -from typing import Mapping -import tensorflow as tf - -from official.vision.beta.ops import spatial_transform_ops - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class MultilevelROIAligner(tf.keras.layers.Layer): - """Performs ROIAlign for the second stage processing.""" - - def __init__(self, crop_size: int = 7, sample_offset: float = 0.5, **kwargs): - """Initializes a ROI aligner. - - Args: - crop_size: An `int` of the output size of the cropped features. - sample_offset: A `float` in [0, 1] of the subpixel sample offset. - **kwargs: Additional keyword arguments passed to Layer. - """ - self._config_dict = { - 'crop_size': crop_size, - 'sample_offset': sample_offset, - } - super(MultilevelROIAligner, self).__init__(**kwargs) - - def call(self, - features: Mapping[str, tf.Tensor], - boxes: tf.Tensor, - training: bool = None): - """Generates ROIs. - - Args: - features: A dictionary with key as pyramid level and value as features. - The features are in shape of - [batch_size, height_l, width_l, num_filters]. - boxes: A 3-D `tf.Tensor` of shape [batch_size, num_boxes, 4]. Each row - represents a box with [y1, x1, y2, x2] in un-normalized coordinates. - from grid point. - training: A `bool` of whether it is in training mode. - - Returns: - A 5-D `tf.Tensor` representing feature crop of shape - [batch_size, num_boxes, crop_size, crop_size, num_filters]. - """ - roi_features = spatial_transform_ops.multilevel_crop_and_resize( - features, - boxes, - output_size=self._config_dict['crop_size'], - sample_offset=self._config_dict['sample_offset']) - return roi_features - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) diff --git a/official/vision/beta/modeling/layers/roi_aligner_test.py b/official/vision/beta/modeling/layers/roi_aligner_test.py deleted file mode 100644 index 35f499a11..000000000 --- a/official/vision/beta/modeling/layers/roi_aligner_test.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for roi_aligner.py.""" - -# Import libraries -import tensorflow as tf - -from official.vision.beta.modeling.layers import roi_aligner - - -class MultilevelROIAlignerTest(tf.test.TestCase): - - def test_serialize_deserialize(self): - kwargs = dict( - crop_size=7, - sample_offset=0.5, - ) - aligner = roi_aligner.MultilevelROIAligner(**kwargs) - - expected_config = dict(kwargs) - self.assertEqual(aligner.get_config(), expected_config) - - new_aligner = roi_aligner.MultilevelROIAligner.from_config( - aligner.get_config()) - - self.assertAllEqual(aligner.get_config(), new_aligner.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/layers/roi_generator.py b/official/vision/beta/modeling/layers/roi_generator.py deleted file mode 100644 index af79cccb6..000000000 --- a/official/vision/beta/modeling/layers/roi_generator.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of ROI generator.""" -from typing import Optional, Mapping -# Import libraries -import tensorflow as tf - -from official.vision.beta.ops import box_ops -from official.vision.beta.ops import nms - - -def _multilevel_propose_rois(raw_boxes: Mapping[str, tf.Tensor], - raw_scores: Mapping[str, tf.Tensor], - anchor_boxes: Mapping[str, tf.Tensor], - image_shape: tf.Tensor, - pre_nms_top_k: int = 2000, - pre_nms_score_threshold: float = 0.0, - pre_nms_min_size_threshold: float = 0.0, - nms_iou_threshold: float = 0.7, - num_proposals: int = 1000, - use_batched_nms: bool = False, - decode_boxes: bool = True, - clip_boxes: bool = True, - apply_sigmoid_to_score: bool = True): - """Proposes RoIs given a group of candidates from different FPN levels. - - The following describes the steps: - 1. For each individual level: - a. Apply sigmoid transform if specified. - b. Decode boxes if specified. - c. Clip boxes if specified. - d. Filter small boxes and those fall outside image if specified. - e. Apply pre-NMS filtering including pre-NMS top k and score thresholding. - f. Apply NMS. - 2. Aggregate post-NMS boxes from each level. - 3. Apply an overall top k to generate the final selected RoIs. - - Args: - raw_boxes: A `dict` with keys representing FPN levels and values - representing box tenors of shape - [batch_size, feature_h, feature_w, num_anchors * 4]. - raw_scores: A `dict` with keys representing FPN levels and values - representing logit tensors of shape - [batch_size, feature_h, feature_w, num_anchors]. - anchor_boxes: A `dict` with keys representing FPN levels and values - representing anchor box tensors of shape - [batch_size, feature_h * feature_w * num_anchors, 4]. - image_shape: A `tf.Tensor` of shape [batch_size, 2] where the last dimension - are [height, width] of the scaled image. - pre_nms_top_k: An `int` of top scoring RPN proposals *per level* to keep - before applying NMS. Default: 2000. 
- pre_nms_score_threshold: A `float` between 0 and 1 representing the minimal - box score to keep before applying NMS. This is often used as a - pre-filtering step for better performance. Default: 0, no filtering is - applied. - pre_nms_min_size_threshold: A `float` representing the minimal box size in - each side (w.r.t. the scaled image) to keep before applying NMS. This is - often used as a pre-filtering step for better performance. Default: 0, no - filtering is applied. - nms_iou_threshold: A `float` between 0 and 1 representing the IoU threshold - used for NMS. If 0.0, no NMS is applied. Default: 0.7. - num_proposals: An `int` of top scoring RPN proposals *in total* to keep - after applying NMS. Default: 1000. - use_batched_nms: A `bool` indicating whether NMS is applied in batch using - `tf.image.combined_non_max_suppression`. Currently only available in - CPU/GPU. Default is False. - decode_boxes: A `bool` indicating whether `raw_boxes` needs to be decoded - using `anchor_boxes`. If False, use `raw_boxes` directly and ignore - `anchor_boxes`. Default is True. - clip_boxes: A `bool` indicating whether boxes are first clipped to the - scaled image size before appliying NMS. If False, no clipping is applied - and `image_shape` is ignored. Default is True. - apply_sigmoid_to_score: A `bool` indicating whether apply sigmoid to - `raw_scores` before applying NMS. Default is True. - - Returns: - selected_rois: A `tf.Tensor` of shape [batch_size, num_proposals, 4], - representing the box coordinates of the selected proposals w.r.t. the - scaled image. - selected_roi_scores: A `tf.Tensor` of shape [batch_size, num_proposals, 1], - representing the scores of the selected proposals. - """ - with tf.name_scope('multilevel_propose_rois'): - rois = [] - roi_scores = [] - image_shape = tf.expand_dims(image_shape, axis=1) - for level in sorted(raw_scores.keys()): - with tf.name_scope('level_%s' % level): - _, feature_h, feature_w, num_anchors_per_location = ( - raw_scores[level].get_shape().as_list()) - - num_boxes = feature_h * feature_w * num_anchors_per_location - this_level_scores = tf.reshape(raw_scores[level], [-1, num_boxes]) - this_level_boxes = tf.reshape(raw_boxes[level], [-1, num_boxes, 4]) - this_level_anchors = tf.cast( - tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]), - dtype=this_level_scores.dtype) - - if apply_sigmoid_to_score: - this_level_scores = tf.sigmoid(this_level_scores) - - if decode_boxes: - this_level_boxes = box_ops.decode_boxes( - this_level_boxes, this_level_anchors) - if clip_boxes: - this_level_boxes = box_ops.clip_boxes( - this_level_boxes, image_shape) - - if pre_nms_min_size_threshold > 0.0: - this_level_boxes, this_level_scores = box_ops.filter_boxes( - this_level_boxes, - this_level_scores, - image_shape, - pre_nms_min_size_threshold) - - this_level_pre_nms_top_k = min(num_boxes, pre_nms_top_k) - this_level_post_nms_top_k = min(num_boxes, num_proposals) - if nms_iou_threshold > 0.0: - if use_batched_nms: - this_level_rois, this_level_roi_scores, _, _ = ( - tf.image.combined_non_max_suppression( - tf.expand_dims(this_level_boxes, axis=2), - tf.expand_dims(this_level_scores, axis=-1), - max_output_size_per_class=this_level_pre_nms_top_k, - max_total_size=this_level_post_nms_top_k, - iou_threshold=nms_iou_threshold, - score_threshold=pre_nms_score_threshold, - pad_per_class=False, - clip_boxes=False)) - else: - if pre_nms_score_threshold > 0.0: - this_level_boxes, this_level_scores = ( - box_ops.filter_boxes_by_scores( - this_level_boxes, - this_level_scores, 
- pre_nms_score_threshold)) - this_level_boxes, this_level_scores = box_ops.top_k_boxes( - this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k) - this_level_roi_scores, this_level_rois = ( - nms.sorted_non_max_suppression_padded( - this_level_scores, - this_level_boxes, - max_output_size=this_level_post_nms_top_k, - iou_threshold=nms_iou_threshold)) - else: - this_level_rois, this_level_roi_scores = box_ops.top_k_boxes( - this_level_boxes, - this_level_scores, - k=this_level_post_nms_top_k) - - rois.append(this_level_rois) - roi_scores.append(this_level_roi_scores) - - all_rois = tf.concat(rois, axis=1) - all_roi_scores = tf.concat(roi_scores, axis=1) - - with tf.name_scope('top_k_rois'): - _, num_valid_rois = all_roi_scores.get_shape().as_list() - overall_top_k = min(num_valid_rois, num_proposals) - - selected_rois, selected_roi_scores = box_ops.top_k_boxes( - all_rois, all_roi_scores, k=overall_top_k) - - return selected_rois, selected_roi_scores - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class MultilevelROIGenerator(tf.keras.layers.Layer): - """Proposes RoIs for the second stage processing.""" - - def __init__(self, - pre_nms_top_k: int = 2000, - pre_nms_score_threshold: float = 0.0, - pre_nms_min_size_threshold: float = 0.0, - nms_iou_threshold: float = 0.7, - num_proposals: int = 1000, - test_pre_nms_top_k: int = 1000, - test_pre_nms_score_threshold: float = 0.0, - test_pre_nms_min_size_threshold: float = 0.0, - test_nms_iou_threshold: float = 0.7, - test_num_proposals: int = 1000, - use_batched_nms: bool = False, - **kwargs): - """Initializes a ROI generator. - - The ROI generator transforms the raw predictions from RPN to ROIs. - - Args: - pre_nms_top_k: An `int` of the number of top scores proposals to be kept - before applying NMS. - pre_nms_score_threshold: A `float` of the score threshold to apply before - applying NMS. Proposals whose scores are below this threshold are - thrown away. - pre_nms_min_size_threshold: A `float` of the threshold of each side of the - box (w.r.t. the scaled image). Proposals whose sides are below this - threshold are thrown away. - nms_iou_threshold: A `float` in [0, 1], the NMS IoU threshold. - num_proposals: An `int` of the final number of proposals to generate. - test_pre_nms_top_k: An `int` of the number of top scores proposals to be - kept before applying NMS in testing. - test_pre_nms_score_threshold: A `float` of the score threshold to apply - before applying NMS in testing. Proposals whose scores are below this - threshold are thrown away. - test_pre_nms_min_size_threshold: A `float` of the threshold of each side - of the box (w.r.t. the scaled image) in testing. Proposals whose sides - are below this threshold are thrown away. - test_nms_iou_threshold: A `float` in [0, 1] of the NMS IoU threshold in - testing. - test_num_proposals: An `int` of the final number of proposals to generate - in testing. - use_batched_nms: A `bool` of whether or not use - `tf.image.combined_non_max_suppression`. - **kwargs: Additional keyword arguments passed to Layer. 
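# Editor's note (not part of the original patch): a simplified, single-image,
# single-level sketch of the per-level filtering described in the docstring above
# (sigmoid, score top-k, NMS, keep the proposals), using plain TF ops. The actual
# implementation above additionally handles batching, box decoding, clipping and
# aggregation across FPN levels.
import tensorflow as tf

def propose_rois_single_level(boxes, scores, pre_nms_top_k=2000,
                              nms_iou_threshold=0.7, num_proposals=1000):
  """boxes: [N, 4] in [ymin, xmin, ymax, xmax]; scores: [N] logits."""
  scores = tf.sigmoid(scores)
  k = tf.minimum(pre_nms_top_k, tf.shape(scores)[0])
  scores, indices = tf.math.top_k(scores, k=k)
  boxes = tf.gather(boxes, indices)
  keep = tf.image.non_max_suppression(
      boxes, scores, max_output_size=num_proposals, iou_threshold=nms_iou_threshold)
  return tf.gather(boxes, keep), tf.gather(scores, keep)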
- """ - self._config_dict = { - 'pre_nms_top_k': pre_nms_top_k, - 'pre_nms_score_threshold': pre_nms_score_threshold, - 'pre_nms_min_size_threshold': pre_nms_min_size_threshold, - 'nms_iou_threshold': nms_iou_threshold, - 'num_proposals': num_proposals, - 'test_pre_nms_top_k': test_pre_nms_top_k, - 'test_pre_nms_score_threshold': test_pre_nms_score_threshold, - 'test_pre_nms_min_size_threshold': test_pre_nms_min_size_threshold, - 'test_nms_iou_threshold': test_nms_iou_threshold, - 'test_num_proposals': test_num_proposals, - 'use_batched_nms': use_batched_nms, - } - super(MultilevelROIGenerator, self).__init__(**kwargs) - - def call(self, - raw_boxes: Mapping[str, tf.Tensor], - raw_scores: Mapping[str, tf.Tensor], - anchor_boxes: Mapping[str, tf.Tensor], - image_shape: tf.Tensor, - training: Optional[bool] = None): - """Proposes RoIs given a group of candidates from different FPN levels. - - The following describes the steps: - 1. For each individual level: - a. Apply sigmoid transform if specified. - b. Decode boxes if specified. - c. Clip boxes if specified. - d. Filter small boxes and those fall outside image if specified. - e. Apply pre-NMS filtering including pre-NMS top k and score - thresholding. - f. Apply NMS. - 2. Aggregate post-NMS boxes from each level. - 3. Apply an overall top k to generate the final selected RoIs. - - Args: - raw_boxes: A `dict` with keys representing FPN levels and values - representing box tenors of shape - [batch, feature_h, feature_w, num_anchors * 4]. - raw_scores: A `dict` with keys representing FPN levels and values - representing logit tensors of shape - [batch, feature_h, feature_w, num_anchors]. - anchor_boxes: A `dict` with keys representing FPN levels and values - representing anchor box tensors of shape - [batch, feature_h * feature_w * num_anchors, 4]. - image_shape: A `tf.Tensor` of shape [batch, 2] where the last dimension - are [height, width] of the scaled image. - training: A `bool` that indicates whether it is in training mode. - - Returns: - roi_boxes: A `tf.Tensor` of shape [batch, num_proposals, 4], the proposed - ROIs in the scaled image coordinate. - roi_scores: A `tf.Tensor` of shape [batch, num_proposals], scores of the - proposed ROIs. 
- """ - roi_boxes, roi_scores = _multilevel_propose_rois( - raw_boxes, - raw_scores, - anchor_boxes, - image_shape, - pre_nms_top_k=( - self._config_dict['pre_nms_top_k'] if training - else self._config_dict['test_pre_nms_top_k']), - pre_nms_score_threshold=( - self._config_dict['pre_nms_score_threshold'] if training - else self._config_dict['test_pre_nms_score_threshold']), - pre_nms_min_size_threshold=( - self._config_dict['pre_nms_min_size_threshold'] if training - else self._config_dict['test_pre_nms_min_size_threshold']), - nms_iou_threshold=( - self._config_dict['nms_iou_threshold'] if training - else self._config_dict['test_nms_iou_threshold']), - num_proposals=( - self._config_dict['num_proposals'] if training - else self._config_dict['test_num_proposals']), - use_batched_nms=self._config_dict['use_batched_nms'], - decode_boxes=True, - clip_boxes=True, - apply_sigmoid_to_score=True) - return roi_boxes, roi_scores - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) diff --git a/official/vision/beta/modeling/layers/roi_sampler.py b/official/vision/beta/modeling/layers/roi_sampler.py deleted file mode 100644 index f3a4fd05e..000000000 --- a/official/vision/beta/modeling/layers/roi_sampler.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Contains definitions of ROI sampler.""" -# Import libraries - -import tensorflow as tf - -from official.vision.beta.modeling.layers import box_sampler -from official.vision.beta.ops import box_matcher -from official.vision.beta.ops import iou_similarity -from official.vision.beta.ops import target_gather - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class ROISampler(tf.keras.layers.Layer): - """Samples ROIs and assigns targets to the sampled ROIs.""" - - def __init__(self, - mix_gt_boxes: bool = True, - num_sampled_rois: int = 512, - foreground_fraction: float = 0.25, - foreground_iou_threshold: float = 0.5, - background_iou_high_threshold: float = 0.5, - background_iou_low_threshold: float = 0, - skip_subsampling: bool = False, - **kwargs): - """Initializes a ROI sampler. - - Args: - mix_gt_boxes: A `bool` of whether to mix the groundtruth boxes with - proposed ROIs. - num_sampled_rois: An `int` of the number of sampled ROIs per image. - foreground_fraction: A `float` in [0, 1], what percentage of proposed ROIs - should be sampled from the foreground boxes. - foreground_iou_threshold: A `float` that represents the IoU threshold for - a box to be considered as positive (if >= `foreground_iou_threshold`). - background_iou_high_threshold: A `float` that represents the IoU threshold - for a box to be considered as negative (if overlap in - [`background_iou_low_threshold`, `background_iou_high_threshold`]). 
- background_iou_low_threshold: A `float` that represents the IoU threshold - for a box to be considered as negative (if overlap in - [`background_iou_low_threshold`, `background_iou_high_threshold`]) - skip_subsampling: a bool that determines if we want to skip the sampling - procedure than balances the fg/bg classes. Used for upper frcnn layers - in cascade RCNN. - **kwargs: Additional keyword arguments passed to Layer. - """ - self._config_dict = { - 'mix_gt_boxes': mix_gt_boxes, - 'num_sampled_rois': num_sampled_rois, - 'foreground_fraction': foreground_fraction, - 'foreground_iou_threshold': foreground_iou_threshold, - 'background_iou_high_threshold': background_iou_high_threshold, - 'background_iou_low_threshold': background_iou_low_threshold, - 'skip_subsampling': skip_subsampling, - } - - self._sim_calc = iou_similarity.IouSimilarity() - self._box_matcher = box_matcher.BoxMatcher( - thresholds=[ - background_iou_low_threshold, background_iou_high_threshold, - foreground_iou_threshold - ], - indicators=[-3, -1, -2, 1]) - self._target_gather = target_gather.TargetGather() - - self._sampler = box_sampler.BoxSampler( - num_sampled_rois, foreground_fraction) - super(ROISampler, self).__init__(**kwargs) - - def call(self, boxes: tf.Tensor, gt_boxes: tf.Tensor, gt_classes: tf.Tensor): - """Assigns the proposals with groundtruth classes and performs subsmpling. - - Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the - following algorithm to generate the final `num_samples_per_image` RoIs. - 1. Calculates the IoU between each proposal box and each gt_boxes. - 2. Assigns each proposed box with a groundtruth class and box by choosing - the largest IoU overlap. - 3. Samples `num_samples_per_image` boxes from all proposed boxes, and - returns box_targets, class_targets, and RoIs. - - Args: - boxes: A `tf.Tensor` of shape of [batch_size, N, 4]. N is the number of - proposals before groundtruth assignment. The last dimension is the - box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] - format. - gt_boxes: A `tf.Tensor` of shape of [batch_size, MAX_NUM_INSTANCES, 4]. - The coordinates of gt_boxes are in the pixel coordinates of the scaled - image. This tensor might have padding of values -1 indicating the - invalid box coordinates. - gt_classes: A `tf.Tensor` with a shape of [batch_size, MAX_NUM_INSTANCES]. - This tensor might have paddings with values of -1 indicating the invalid - classes. - - Returns: - sampled_rois: A `tf.Tensor` of shape of [batch_size, K, 4], representing - the coordinates of the sampled RoIs, where K is the number of the - sampled RoIs, i.e. K = num_samples_per_image. - sampled_gt_boxes: A `tf.Tensor` of shape of [batch_size, K, 4], storing - the box coordinates of the matched groundtruth boxes of the samples - RoIs. - sampled_gt_classes: A `tf.Tensor` of shape of [batch_size, K], storing the - classes of the matched groundtruth boxes of the sampled RoIs. - sampled_gt_indices: A `tf.Tensor` of shape of [batch_size, K], storing the - indices of the sampled groudntruth boxes in the original `gt_boxes` - tensor, i.e., - gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i]. 
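# Editor's note (not part of the original patch): a simplified sketch of the matching
# step described in the docstring above: pairwise IoU between proposals and groundtruth
# boxes, then each proposal is assigned the groundtruth with the largest overlap.
# Foreground/background thresholding and the balanced subsampling are omitted here.
import tensorflow as tf

def pairwise_iou(boxes, gt_boxes):
  """boxes: [N, 4], gt_boxes: [M, 4], both [ymin, xmin, ymax, xmax]; returns [N, M]."""
  b, g = boxes[:, None, :], gt_boxes[None, :, :]
  inter_h = tf.maximum(0., tf.minimum(b[..., 2], g[..., 2]) - tf.maximum(b[..., 0], g[..., 0]))
  inter_w = tf.maximum(0., tf.minimum(b[..., 3], g[..., 3]) - tf.maximum(b[..., 1], g[..., 1]))
  inter = inter_h * inter_w
  area_b = (b[..., 2] - b[..., 0]) * (b[..., 3] - b[..., 1])
  area_g = (g[..., 2] - g[..., 0]) * (g[..., 3] - g[..., 1])
  return inter / (area_b + area_g - inter)

proposals = tf.constant([[0., 0., 10., 10.], [5., 5., 15., 15.]])
gt_boxes = tf.constant([[0., 0., 8., 8.]])
matched_gt = tf.argmax(pairwise_iou(proposals, gt_boxes), axis=-1)  # -> [0, 0]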
- """ - gt_boxes = tf.cast(gt_boxes, dtype=boxes.dtype) - if self._config_dict['mix_gt_boxes']: - boxes = tf.concat([boxes, gt_boxes], axis=1) - - boxes_invalid_mask = tf.less( - tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0) - gt_invalid_mask = tf.less( - tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0) - similarity_matrix = self._sim_calc(boxes, gt_boxes, boxes_invalid_mask, - gt_invalid_mask) - matched_gt_indices, match_indicators = self._box_matcher(similarity_matrix) - positive_matches = tf.greater_equal(match_indicators, 0) - negative_matches = tf.equal(match_indicators, -1) - ignored_matches = tf.equal(match_indicators, -2) - invalid_matches = tf.equal(match_indicators, -3) - - background_mask = tf.expand_dims( - tf.logical_or(negative_matches, invalid_matches), -1) - gt_classes = tf.expand_dims(gt_classes, axis=-1) - matched_gt_classes = self._target_gather(gt_classes, matched_gt_indices, - background_mask) - matched_gt_classes = tf.where(background_mask, - tf.zeros_like(matched_gt_classes), - matched_gt_classes) - matched_gt_boxes = self._target_gather(gt_boxes, matched_gt_indices, - tf.tile(background_mask, [1, 1, 4])) - matched_gt_boxes = tf.where(background_mask, - tf.zeros_like(matched_gt_boxes), - matched_gt_boxes) - matched_gt_indices = tf.where( - tf.squeeze(background_mask, -1), -tf.ones_like(matched_gt_indices), - matched_gt_indices) - - if self._config_dict['skip_subsampling']: - return (boxes, matched_gt_boxes, tf.squeeze(matched_gt_classes, - axis=-1), matched_gt_indices) - - sampled_indices = self._sampler( - positive_matches, negative_matches, ignored_matches) - - sampled_rois = self._target_gather(boxes, sampled_indices) - sampled_gt_boxes = self._target_gather(matched_gt_boxes, sampled_indices) - sampled_gt_classes = tf.squeeze(self._target_gather( - matched_gt_classes, sampled_indices), axis=-1) - sampled_gt_indices = tf.squeeze(self._target_gather( - tf.expand_dims(matched_gt_indices, -1), sampled_indices), axis=-1) - return (sampled_rois, sampled_gt_boxes, sampled_gt_classes, - sampled_gt_indices) - - def get_config(self): - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) diff --git a/official/vision/beta/modeling/maskrcnn_model.py b/official/vision/beta/modeling/maskrcnn_model.py deleted file mode 100644 index ca76cfb7f..000000000 --- a/official/vision/beta/modeling/maskrcnn_model.py +++ /dev/null @@ -1,429 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""R-CNN(-RS) models.""" - -from typing import Any, List, Mapping, Optional, Tuple, Union - -import tensorflow as tf - -from official.vision.beta.ops import anchor -from official.vision.beta.ops import box_ops - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class MaskRCNNModel(tf.keras.Model): - """The Mask R-CNN(-RS) and Cascade RCNN-RS models.""" - - def __init__(self, - backbone: tf.keras.Model, - decoder: tf.keras.Model, - rpn_head: tf.keras.layers.Layer, - detection_head: Union[tf.keras.layers.Layer, - List[tf.keras.layers.Layer]], - roi_generator: tf.keras.layers.Layer, - roi_sampler: Union[tf.keras.layers.Layer, - List[tf.keras.layers.Layer]], - roi_aligner: tf.keras.layers.Layer, - detection_generator: tf.keras.layers.Layer, - mask_head: Optional[tf.keras.layers.Layer] = None, - mask_sampler: Optional[tf.keras.layers.Layer] = None, - mask_roi_aligner: Optional[tf.keras.layers.Layer] = None, - class_agnostic_bbox_pred: bool = False, - cascade_class_ensemble: bool = False, - min_level: Optional[int] = None, - max_level: Optional[int] = None, - num_scales: Optional[int] = None, - aspect_ratios: Optional[List[float]] = None, - anchor_size: Optional[float] = None, - **kwargs): - """Initializes the R-CNN(-RS) model. - - Args: - backbone: `tf.keras.Model`, the backbone network. - decoder: `tf.keras.Model`, the decoder network. - rpn_head: the RPN head. - detection_head: the detection head or a list of heads. - roi_generator: the ROI generator. - roi_sampler: a single ROI sampler or a list of ROI samplers for cascade - detection heads. - roi_aligner: the ROI aligner. - detection_generator: the detection generator. - mask_head: the mask head. - mask_sampler: the mask sampler. - mask_roi_aligner: the ROI alginer for mask prediction. - class_agnostic_bbox_pred: if True, perform class agnostic bounding box - prediction. Needs to be `True` for Cascade RCNN models. - cascade_class_ensemble: if True, ensemble classification scores over all - detection heads. - min_level: Minimum level in output feature maps. - max_level: Maximum level in output feature maps. - num_scales: A number representing intermediate scales added on each level. - For instances, num_scales=2 adds one additional intermediate anchor - scales [2^0, 2^0.5] on each level. - aspect_ratios: A list representing the aspect raito anchors added on each - level. The number indicates the ratio of width to height. For instances, - aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level. - anchor_size: A number representing the scale of size of the base anchor to - the feature stride 2^level. - **kwargs: keyword arguments to be passed. 
- """ - super(MaskRCNNModel, self).__init__(**kwargs) - self._config_dict = { - 'backbone': backbone, - 'decoder': decoder, - 'rpn_head': rpn_head, - 'detection_head': detection_head, - 'roi_generator': roi_generator, - 'roi_sampler': roi_sampler, - 'roi_aligner': roi_aligner, - 'detection_generator': detection_generator, - 'mask_head': mask_head, - 'mask_sampler': mask_sampler, - 'mask_roi_aligner': mask_roi_aligner, - 'class_agnostic_bbox_pred': class_agnostic_bbox_pred, - 'cascade_class_ensemble': cascade_class_ensemble, - 'min_level': min_level, - 'max_level': max_level, - 'num_scales': num_scales, - 'aspect_ratios': aspect_ratios, - 'anchor_size': anchor_size, - } - self.backbone = backbone - self.decoder = decoder - self.rpn_head = rpn_head - if not isinstance(detection_head, (list, tuple)): - self.detection_head = [detection_head] - else: - self.detection_head = detection_head - self.roi_generator = roi_generator - if not isinstance(roi_sampler, (list, tuple)): - self.roi_sampler = [roi_sampler] - else: - self.roi_sampler = roi_sampler - if len(self.roi_sampler) > 1 and not class_agnostic_bbox_pred: - raise ValueError( - '`class_agnostic_bbox_pred` needs to be True if multiple detection heads are specified.' - ) - self.roi_aligner = roi_aligner - self.detection_generator = detection_generator - self._include_mask = mask_head is not None - self.mask_head = mask_head - if self._include_mask and mask_sampler is None: - raise ValueError('`mask_sampler` is not provided in Mask R-CNN.') - self.mask_sampler = mask_sampler - if self._include_mask and mask_roi_aligner is None: - raise ValueError('`mask_roi_aligner` is not provided in Mask R-CNN.') - self.mask_roi_aligner = mask_roi_aligner - # Weights for the regression losses for each FRCNN layer. - # TODO(xianzhi): Make the weights configurable. 
- self._cascade_layer_to_weights = [ - [10.0, 10.0, 5.0, 5.0], - [20.0, 20.0, 10.0, 10.0], - [30.0, 30.0, 15.0, 15.0], - ] - - def call(self, - images: tf.Tensor, - image_shape: tf.Tensor, - anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None, - gt_boxes: Optional[tf.Tensor] = None, - gt_classes: Optional[tf.Tensor] = None, - gt_masks: Optional[tf.Tensor] = None, - training: Optional[bool] = None) -> Mapping[str, tf.Tensor]: - - model_outputs, intermediate_outputs = self._call_box_outputs( - images=images, image_shape=image_shape, anchor_boxes=anchor_boxes, - gt_boxes=gt_boxes, gt_classes=gt_classes, training=training) - if not self._include_mask: - return model_outputs - - model_mask_outputs = self._call_mask_outputs( - model_box_outputs=model_outputs, - features=model_outputs['decoder_features'], - current_rois=intermediate_outputs['current_rois'], - matched_gt_indices=intermediate_outputs['matched_gt_indices'], - matched_gt_boxes=intermediate_outputs['matched_gt_boxes'], - matched_gt_classes=intermediate_outputs['matched_gt_classes'], - gt_masks=gt_masks, - training=training) - model_outputs.update(model_mask_outputs) - return model_outputs - - def _get_backbone_and_decoder_features(self, images): - - backbone_features = self.backbone(images) - if self.decoder: - features = self.decoder(backbone_features) - else: - features = backbone_features - return backbone_features, features - - def _call_box_outputs( - self, images: tf.Tensor, - image_shape: tf.Tensor, - anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None, - gt_boxes: Optional[tf.Tensor] = None, - gt_classes: Optional[tf.Tensor] = None, - training: Optional[bool] = None) -> Tuple[ - Mapping[str, tf.Tensor], Mapping[str, tf.Tensor]]: - """Implementation of the Faster-RCNN logic for boxes.""" - model_outputs = {} - - # Feature extraction. - (backbone_features, - decoder_features) = self._get_backbone_and_decoder_features(images) - - # Region proposal network. - rpn_scores, rpn_boxes = self.rpn_head(decoder_features) - - model_outputs.update({ - 'backbone_features': backbone_features, - 'decoder_features': decoder_features, - 'rpn_boxes': rpn_boxes, - 'rpn_scores': rpn_scores - }) - - # Generate anchor boxes for this batch if not provided. - if anchor_boxes is None: - _, image_height, image_width, _ = images.get_shape().as_list() - anchor_boxes = anchor.Anchor( - min_level=self._config_dict['min_level'], - max_level=self._config_dict['max_level'], - num_scales=self._config_dict['num_scales'], - aspect_ratios=self._config_dict['aspect_ratios'], - anchor_size=self._config_dict['anchor_size'], - image_size=(image_height, image_width)).multilevel_boxes - for l in anchor_boxes: - anchor_boxes[l] = tf.tile( - tf.expand_dims(anchor_boxes[l], axis=0), - [tf.shape(images)[0], 1, 1, 1]) - - # Generate RoIs. - current_rois, _ = self.roi_generator(rpn_boxes, rpn_scores, anchor_boxes, - image_shape, training) - - next_rois = current_rois - all_class_outputs = [] - for cascade_num in range(len(self.roi_sampler)): - # In cascade RCNN we want the higher layers to have different regression - # weights as the predicted deltas become smaller and smaller. 
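The per-stage weight lists above ([10, 10, 5, 5], then [20, 20, 10, 10], ...) scale the [dy, dx, dh, dw] regression targets, so later cascade stages, which see increasingly tight boxes, still train on targets of a comparable magnitude. A rough sketch of how such weights enter the standard Faster R-CNN box encoding (standalone NumPy, not the `box_ops.encode_boxes` API itself):

```python
import numpy as np

def encode_box(gt, roi, weights):
  """Standard Faster R-CNN encoding of one box against one RoI, both in
  [ymin, xmin, ymax, xmax]; `weights` scales the [dy, dx, dh, dw] targets."""
  hr, wr = roi[2] - roi[0], roi[3] - roi[1]
  cyr, cxr = roi[0] + 0.5 * hr, roi[1] + 0.5 * wr
  hg, wg = gt[2] - gt[0], gt[3] - gt[1]
  cyg, cxg = gt[0] + 0.5 * hg, gt[1] + 0.5 * wg
  dy = weights[0] * (cyg - cyr) / hr
  dx = weights[1] * (cxg - cxr) / wr
  dh = weights[2] * np.log(hg / hr)
  dw = weights[3] * np.log(wg / wr)
  return np.array([dy, dx, dh, dw])

# The same geometric offset produces proportionally larger training targets
# at the second cascade stage because its weights are doubled.
gt, roi = np.array([10., 10., 50., 50.]), np.array([12., 8., 52., 46.])
print(encode_box(gt, roi, weights=[10., 10., 5., 5.]))    # first stage
print(encode_box(gt, roi, weights=[20., 20., 10., 10.]))  # second stage
```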
- regression_weights = self._cascade_layer_to_weights[cascade_num] - current_rois = next_rois - - (class_outputs, box_outputs, model_outputs, matched_gt_boxes, - matched_gt_classes, matched_gt_indices, - current_rois) = self._run_frcnn_head( - features=decoder_features, - rois=current_rois, - gt_boxes=gt_boxes, - gt_classes=gt_classes, - training=training, - model_outputs=model_outputs, - cascade_num=cascade_num, - regression_weights=regression_weights) - all_class_outputs.append(class_outputs) - - # Generate ROIs for the next cascade head if there is any. - if cascade_num < len(self.roi_sampler) - 1: - next_rois = box_ops.decode_boxes( - tf.cast(box_outputs, tf.float32), - current_rois, - weights=regression_weights) - next_rois = box_ops.clip_boxes(next_rois, - tf.expand_dims(image_shape, axis=1)) - - if not training: - if self._config_dict['cascade_class_ensemble']: - class_outputs = tf.add_n(all_class_outputs) / len(all_class_outputs) - - detections = self.detection_generator( - box_outputs, - class_outputs, - current_rois, - image_shape, - regression_weights, - bbox_per_class=(not self._config_dict['class_agnostic_bbox_pred'])) - model_outputs.update({ - 'cls_outputs': class_outputs, - 'box_outputs': box_outputs, - }) - if self.detection_generator.get_config()['apply_nms']: - model_outputs.update({ - 'detection_boxes': detections['detection_boxes'], - 'detection_scores': detections['detection_scores'], - 'detection_classes': detections['detection_classes'], - 'num_detections': detections['num_detections'] - }) - else: - model_outputs.update({ - 'decoded_boxes': detections['decoded_boxes'], - 'decoded_box_scores': detections['decoded_box_scores'] - }) - - intermediate_outputs = { - 'matched_gt_boxes': matched_gt_boxes, - 'matched_gt_indices': matched_gt_indices, - 'matched_gt_classes': matched_gt_classes, - 'current_rois': current_rois, - } - return (model_outputs, intermediate_outputs) - - def _call_mask_outputs( - self, - model_box_outputs: Mapping[str, tf.Tensor], - features: tf.Tensor, - current_rois: tf.Tensor, - matched_gt_indices: tf.Tensor, - matched_gt_boxes: tf.Tensor, - matched_gt_classes: tf.Tensor, - gt_masks: tf.Tensor, - training: Optional[bool] = None) -> Mapping[str, tf.Tensor]: - """Implementation of Mask-RCNN mask prediction logic.""" - - model_outputs = dict(model_box_outputs) - if training: - current_rois, roi_classes, roi_masks = self.mask_sampler( - current_rois, matched_gt_boxes, matched_gt_classes, - matched_gt_indices, gt_masks) - roi_masks = tf.stop_gradient(roi_masks) - - model_outputs.update({ - 'mask_class_targets': roi_classes, - 'mask_targets': roi_masks, - }) - else: - current_rois = model_outputs['detection_boxes'] - roi_classes = model_outputs['detection_classes'] - - mask_logits, mask_probs = self._features_to_mask_outputs( - features, current_rois, roi_classes) - - if training: - model_outputs.update({ - 'mask_outputs': mask_logits, - }) - else: - model_outputs.update({ - 'detection_masks': mask_probs, - }) - return model_outputs - - def _run_frcnn_head(self, features, rois, gt_boxes, gt_classes, training, - model_outputs, cascade_num, regression_weights): - """Runs the frcnn head that does both class and box prediction. - - Args: - features: `list` of features from the feature extractor. - rois: `list` of current rois that will be used to predict bbox refinement - and classes from. - gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. - This tensor might have paddings with a negative value. 
- gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box - classes. It is padded with -1s to indicate the invalid classes. - training: `bool`, if model is training or being evaluated. - model_outputs: `dict`, used for storing outputs used for eval and losses. - cascade_num: `int`, the current frcnn layer in the cascade. - regression_weights: `list`, weights used for l1 loss in bounding box - regression. - - Returns: - class_outputs: Class predictions for rois. - box_outputs: Box predictions for rois. These are formatted for the - regression loss and need to be converted before being used as rois - in the next stage. - model_outputs: Updated dict with predictions used for losses and eval. - matched_gt_boxes: If `is_training` is true, then these give the gt box - location of its positive match. - matched_gt_classes: If `is_training` is true, then these give the gt class - of the predicted box. - matched_gt_boxes: If `is_training` is true, then these give the box - location of its positive match. - matched_gt_indices: If `is_training` is true, then gives the index of - the positive box match. Used for mask prediction. - rois: The sampled rois used for this layer. - """ - # Only used during training. - matched_gt_boxes, matched_gt_classes, matched_gt_indices = (None, None, - None) - if training and gt_boxes is not None: - rois = tf.stop_gradient(rois) - - current_roi_sampler = self.roi_sampler[cascade_num] - rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = ( - current_roi_sampler(rois, gt_boxes, gt_classes)) - # Create bounding box training targets. - box_targets = box_ops.encode_boxes( - matched_gt_boxes, rois, weights=regression_weights) - # If the target is background, the box target is set to all 0s. - box_targets = tf.where( - tf.tile( - tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1), - [1, 1, 4]), tf.zeros_like(box_targets), box_targets) - model_outputs.update({ - 'class_targets_{}'.format(cascade_num) - if cascade_num else 'class_targets': - matched_gt_classes, - 'box_targets_{}'.format(cascade_num) - if cascade_num else 'box_targets': - box_targets, - }) - - # Get roi features. - roi_features = self.roi_aligner(features, rois) - - # Run frcnn head to get class and bbox predictions. - current_detection_head = self.detection_head[cascade_num] - class_outputs, box_outputs = current_detection_head(roi_features) - - model_outputs.update({ - 'class_outputs_{}'.format(cascade_num) - if cascade_num else 'class_outputs': - class_outputs, - 'box_outputs_{}'.format(cascade_num) if cascade_num else 'box_outputs': - box_outputs, - }) - return (class_outputs, box_outputs, model_outputs, matched_gt_boxes, - matched_gt_classes, matched_gt_indices, rois) - - def _features_to_mask_outputs(self, features, rois, roi_classes): - # Mask RoI align. - mask_roi_features = self.mask_roi_aligner(features, rois) - - # Mask head. 
- raw_masks = self.mask_head([mask_roi_features, roi_classes]) - - return raw_masks, tf.nn.sigmoid(raw_masks) - - @property - def checkpoint_items( - self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]: - """Returns a dictionary of items to be additionally checkpointed.""" - items = dict( - backbone=self.backbone, - rpn_head=self.rpn_head, - detection_head=self.detection_head) - if self.decoder is not None: - items.update(decoder=self.decoder) - if self._include_mask: - items.update(mask_head=self.mask_head) - - return items - - def get_config(self) -> Mapping[str, Any]: - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) diff --git a/official/vision/beta/modeling/maskrcnn_model_test.py b/official/vision/beta/modeling/maskrcnn_model_test.py deleted file mode 100644 index 1011e6317..000000000 --- a/official/vision/beta/modeling/maskrcnn_model_test.py +++ /dev/null @@ -1,397 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for maskrcnn_model.py.""" - -import os -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.vision.beta.modeling import maskrcnn_model -from official.vision.beta.modeling.backbones import resnet -from official.vision.beta.modeling.decoders import fpn -from official.vision.beta.modeling.heads import dense_prediction_heads -from official.vision.beta.modeling.heads import instance_heads -from official.vision.beta.modeling.layers import detection_generator -from official.vision.beta.modeling.layers import mask_sampler -from official.vision.beta.modeling.layers import roi_aligner -from official.vision.beta.modeling.layers import roi_generator -from official.vision.beta.modeling.layers import roi_sampler -from official.vision.beta.ops import anchor - - -class MaskRCNNModelTest(parameterized.TestCase, tf.test.TestCase): - - @combinations.generate( - combinations.combine( - include_mask=[True, False], - use_separable_conv=[True, False], - build_anchor_boxes=[True, False], - is_training=[True, False])) - def test_build_model(self, include_mask, use_separable_conv, - build_anchor_boxes, is_training): - num_classes = 3 - min_level = 3 - max_level = 7 - num_scales = 3 - aspect_ratios = [1.0] - anchor_size = 3 - resnet_model_id = 50 - num_anchors_per_location = num_scales * len(aspect_ratios) - image_size = 384 - images = np.random.rand(2, image_size, image_size, 3) - image_shape = np.array([[image_size, image_size], [image_size, image_size]]) - - if build_anchor_boxes: - anchor_boxes = anchor.Anchor( - min_level=min_level, - max_level=max_level, - num_scales=num_scales, - aspect_ratios=aspect_ratios, - anchor_size=3, - image_size=(image_size, image_size)).multilevel_boxes - for l in anchor_boxes: - anchor_boxes[l] = tf.tile( - tf.expand_dims(anchor_boxes[l], 
axis=0), [2, 1, 1, 1]) - else: - anchor_boxes = None - - backbone = resnet.ResNet(model_id=resnet_model_id) - decoder = fpn.FPN( - input_specs=backbone.output_specs, - min_level=min_level, - max_level=max_level, - use_separable_conv=use_separable_conv) - rpn_head = dense_prediction_heads.RPNHead( - min_level=min_level, - max_level=max_level, - num_anchors_per_location=num_anchors_per_location, - num_convs=1) - detection_head = instance_heads.DetectionHead(num_classes=num_classes) - roi_generator_obj = roi_generator.MultilevelROIGenerator() - roi_sampler_obj = roi_sampler.ROISampler() - roi_aligner_obj = roi_aligner.MultilevelROIAligner() - detection_generator_obj = detection_generator.DetectionGenerator() - if include_mask: - mask_head = instance_heads.MaskHead( - num_classes=num_classes, upsample_factor=2) - mask_sampler_obj = mask_sampler.MaskSampler( - mask_target_size=28, num_sampled_masks=1) - mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14) - else: - mask_head = None - mask_sampler_obj = None - mask_roi_aligner_obj = None - model = maskrcnn_model.MaskRCNNModel( - backbone, - decoder, - rpn_head, - detection_head, - roi_generator_obj, - roi_sampler_obj, - roi_aligner_obj, - detection_generator_obj, - mask_head, - mask_sampler_obj, - mask_roi_aligner_obj, - min_level=min_level, - max_level=max_level, - num_scales=num_scales, - aspect_ratios=aspect_ratios, - anchor_size=anchor_size) - - gt_boxes = np.array( - [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]], - [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]], - dtype=np.float32) - gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32) - if include_mask: - gt_masks = np.ones((2, 3, 100, 100)) - else: - gt_masks = None - - # Results will be checked in test_forward. 
- _ = model( - images, - image_shape, - anchor_boxes, - gt_boxes, - gt_classes, - gt_masks, - training=is_training) - - @combinations.generate( - combinations.combine( - strategy=[ - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - include_mask=[True, False], - build_anchor_boxes=[True, False], - use_cascade_heads=[True, False], - training=[True, False], - )) - def test_forward(self, strategy, include_mask, build_anchor_boxes, training, - use_cascade_heads): - num_classes = 3 - min_level = 3 - max_level = 4 - num_scales = 3 - aspect_ratios = [1.0] - anchor_size = 3 - if use_cascade_heads: - cascade_iou_thresholds = [0.6] - class_agnostic_bbox_pred = True - cascade_class_ensemble = True - else: - cascade_iou_thresholds = None - class_agnostic_bbox_pred = False - cascade_class_ensemble = False - - image_size = (256, 256) - images = np.random.rand(2, image_size[0], image_size[1], 3) - image_shape = np.array([[224, 100], [100, 224]]) - with strategy.scope(): - if build_anchor_boxes: - anchor_boxes = anchor.Anchor( - min_level=min_level, - max_level=max_level, - num_scales=num_scales, - aspect_ratios=aspect_ratios, - anchor_size=anchor_size, - image_size=image_size).multilevel_boxes - else: - anchor_boxes = None - num_anchors_per_location = len(aspect_ratios) * num_scales - - input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3]) - backbone = resnet.ResNet(model_id=50, input_specs=input_specs) - decoder = fpn.FPN( - min_level=min_level, - max_level=max_level, - input_specs=backbone.output_specs) - rpn_head = dense_prediction_heads.RPNHead( - min_level=min_level, - max_level=max_level, - num_anchors_per_location=num_anchors_per_location) - detection_head = instance_heads.DetectionHead( - num_classes=num_classes, - class_agnostic_bbox_pred=class_agnostic_bbox_pred) - roi_generator_obj = roi_generator.MultilevelROIGenerator() - - roi_sampler_cascade = [] - roi_sampler_obj = roi_sampler.ROISampler() - roi_sampler_cascade.append(roi_sampler_obj) - if cascade_iou_thresholds: - for iou in cascade_iou_thresholds: - roi_sampler_obj = roi_sampler.ROISampler( - mix_gt_boxes=False, - foreground_iou_threshold=iou, - background_iou_high_threshold=iou, - background_iou_low_threshold=0.0, - skip_subsampling=True) - roi_sampler_cascade.append(roi_sampler_obj) - roi_aligner_obj = roi_aligner.MultilevelROIAligner() - detection_generator_obj = detection_generator.DetectionGenerator() - if include_mask: - mask_head = instance_heads.MaskHead( - num_classes=num_classes, upsample_factor=2) - mask_sampler_obj = mask_sampler.MaskSampler( - mask_target_size=28, num_sampled_masks=1) - mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14) - else: - mask_head = None - mask_sampler_obj = None - mask_roi_aligner_obj = None - model = maskrcnn_model.MaskRCNNModel( - backbone, - decoder, - rpn_head, - detection_head, - roi_generator_obj, - roi_sampler_obj, - roi_aligner_obj, - detection_generator_obj, - mask_head, - mask_sampler_obj, - mask_roi_aligner_obj, - class_agnostic_bbox_pred=class_agnostic_bbox_pred, - cascade_class_ensemble=cascade_class_ensemble, - min_level=min_level, - max_level=max_level, - num_scales=num_scales, - aspect_ratios=aspect_ratios, - anchor_size=anchor_size) - - gt_boxes = np.array( - [[[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]], - [[100, 100, 150, 150], [-1, -1, -1, -1], [-1, -1, -1, -1]]], - dtype=np.float32) - gt_classes = np.array([[2, 1, -1], [1, -1, -1]], dtype=np.int32) - if include_mask: - gt_masks = 
np.ones((2, 3, 100, 100)) - else: - gt_masks = None - - results = model( - images, - image_shape, - anchor_boxes, - gt_boxes, - gt_classes, - gt_masks, - training=training) - - self.assertIn('rpn_boxes', results) - self.assertIn('rpn_scores', results) - if training: - self.assertIn('class_targets', results) - self.assertIn('box_targets', results) - self.assertIn('class_outputs', results) - self.assertIn('box_outputs', results) - if include_mask: - self.assertIn('mask_outputs', results) - else: - self.assertIn('detection_boxes', results) - self.assertIn('detection_scores', results) - self.assertIn('detection_classes', results) - self.assertIn('num_detections', results) - if include_mask: - self.assertIn('detection_masks', results) - - @parameterized.parameters( - (False,), - (True,), - ) - def test_serialize_deserialize(self, include_mask): - input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3]) - backbone = resnet.ResNet(model_id=50, input_specs=input_specs) - decoder = fpn.FPN( - min_level=3, max_level=7, input_specs=backbone.output_specs) - rpn_head = dense_prediction_heads.RPNHead( - min_level=3, max_level=7, num_anchors_per_location=3) - detection_head = instance_heads.DetectionHead(num_classes=2) - roi_generator_obj = roi_generator.MultilevelROIGenerator() - roi_sampler_obj = roi_sampler.ROISampler() - roi_aligner_obj = roi_aligner.MultilevelROIAligner() - detection_generator_obj = detection_generator.DetectionGenerator() - if include_mask: - mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2) - mask_sampler_obj = mask_sampler.MaskSampler( - mask_target_size=28, num_sampled_masks=1) - mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14) - else: - mask_head = None - mask_sampler_obj = None - mask_roi_aligner_obj = None - model = maskrcnn_model.MaskRCNNModel( - backbone, - decoder, - rpn_head, - detection_head, - roi_generator_obj, - roi_sampler_obj, - roi_aligner_obj, - detection_generator_obj, - mask_head, - mask_sampler_obj, - mask_roi_aligner_obj, - min_level=3, - max_level=7, - num_scales=3, - aspect_ratios=[1.0], - anchor_size=3) - - config = model.get_config() - new_model = maskrcnn_model.MaskRCNNModel.from_config(config) - - # Validate that the config can be forced to JSON. - _ = new_model.to_json() - - # If the serialization was successful, the new config should match the old. 
- self.assertAllEqual(model.get_config(), new_model.get_config()) - - @parameterized.parameters( - (False,), - (True,), - ) - def test_checkpoint(self, include_mask): - input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, 3]) - backbone = resnet.ResNet(model_id=50, input_specs=input_specs) - decoder = fpn.FPN( - min_level=3, max_level=7, input_specs=backbone.output_specs) - rpn_head = dense_prediction_heads.RPNHead( - min_level=3, max_level=7, num_anchors_per_location=3) - detection_head = instance_heads.DetectionHead(num_classes=2) - roi_generator_obj = roi_generator.MultilevelROIGenerator() - roi_sampler_obj = roi_sampler.ROISampler() - roi_aligner_obj = roi_aligner.MultilevelROIAligner() - detection_generator_obj = detection_generator.DetectionGenerator() - if include_mask: - mask_head = instance_heads.MaskHead(num_classes=2, upsample_factor=2) - mask_sampler_obj = mask_sampler.MaskSampler( - mask_target_size=28, num_sampled_masks=1) - mask_roi_aligner_obj = roi_aligner.MultilevelROIAligner(crop_size=14) - else: - mask_head = None - mask_sampler_obj = None - mask_roi_aligner_obj = None - model = maskrcnn_model.MaskRCNNModel( - backbone, - decoder, - rpn_head, - detection_head, - roi_generator_obj, - roi_sampler_obj, - roi_aligner_obj, - detection_generator_obj, - mask_head, - mask_sampler_obj, - mask_roi_aligner_obj, - min_level=3, - max_level=7, - num_scales=3, - aspect_ratios=[1.0], - anchor_size=3) - expect_checkpoint_items = dict( - backbone=backbone, - decoder=decoder, - rpn_head=rpn_head, - detection_head=[detection_head]) - if include_mask: - expect_checkpoint_items['mask_head'] = mask_head - self.assertAllEqual(expect_checkpoint_items, model.checkpoint_items) - - # Test save and load checkpoints. - ckpt = tf.train.Checkpoint(model=model, **model.checkpoint_items) - save_dir = self.create_tempdir().full_path - ckpt.save(os.path.join(save_dir, 'ckpt')) - - partial_ckpt = tf.train.Checkpoint(backbone=backbone) - partial_ckpt.read(tf.train.latest_checkpoint( - save_dir)).expect_partial().assert_existing_objects_matched() - - if include_mask: - partial_ckpt_mask = tf.train.Checkpoint( - backbone=backbone, mask_head=mask_head) - partial_ckpt_mask.restore(tf.train.latest_checkpoint( - save_dir)).expect_partial().assert_existing_objects_matched() - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/retinanet_model.py b/official/vision/beta/modeling/retinanet_model.py deleted file mode 100644 index 49de9c65b..000000000 --- a/official/vision/beta/modeling/retinanet_model.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
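The `checkpoint_items` pattern exercised by `test_checkpoint` above lets downstream code restore individual pieces, for example just a pretrained backbone, from a checkpoint of the full model. A small illustration of that idea with plain Keras layers standing in for the real backbone and head objects (hypothetical names, not the model-garden classes):

```python
import os
import tempfile
import tensorflow as tf

# Stand-ins for the real backbone / head objects.
backbone = tf.keras.Sequential([tf.keras.layers.Dense(8, input_shape=(4,))])
head = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(8,))])

# Save a checkpoint that names each piece, mirroring `checkpoint_items`.
ckpt = tf.train.Checkpoint(backbone=backbone, head=head)
save_dir = tempfile.mkdtemp()
ckpt.save(os.path.join(save_dir, 'ckpt'))

# Later, restore only the backbone; unmatched objects are tolerated.
partial = tf.train.Checkpoint(backbone=backbone)
status = partial.read(tf.train.latest_checkpoint(save_dir))
status.expect_partial().assert_existing_objects_matched()
```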
- -"""RetinaNet.""" -from typing import Any, Mapping, List, Optional, Union - -# Import libraries -import tensorflow as tf - -from official.vision.beta.ops import anchor - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class RetinaNetModel(tf.keras.Model): - """The RetinaNet model class.""" - - def __init__(self, - backbone: tf.keras.Model, - decoder: tf.keras.Model, - head: tf.keras.layers.Layer, - detection_generator: tf.keras.layers.Layer, - min_level: Optional[int] = None, - max_level: Optional[int] = None, - num_scales: Optional[int] = None, - aspect_ratios: Optional[List[float]] = None, - anchor_size: Optional[float] = None, - **kwargs): - """Classification initialization function. - - Args: - backbone: `tf.keras.Model` a backbone network. - decoder: `tf.keras.Model` a decoder network. - head: `RetinaNetHead`, the RetinaNet head. - detection_generator: the detection generator. - min_level: Minimum level in output feature maps. - max_level: Maximum level in output feature maps. - num_scales: A number representing intermediate scales added - on each level. For instances, num_scales=2 adds one additional - intermediate anchor scales [2^0, 2^0.5] on each level. - aspect_ratios: A list representing the aspect raito - anchors added on each level. The number indicates the ratio of width to - height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors - on each scale level. - anchor_size: A number representing the scale of size of the base - anchor to the feature stride 2^level. - **kwargs: keyword arguments to be passed. - """ - super(RetinaNetModel, self).__init__(**kwargs) - self._config_dict = { - 'backbone': backbone, - 'decoder': decoder, - 'head': head, - 'detection_generator': detection_generator, - 'min_level': min_level, - 'max_level': max_level, - 'num_scales': num_scales, - 'aspect_ratios': aspect_ratios, - 'anchor_size': anchor_size, - } - self._backbone = backbone - self._decoder = decoder - self._head = head - self._detection_generator = detection_generator - - def call(self, - images: tf.Tensor, - image_shape: Optional[tf.Tensor] = None, - anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None, - output_intermediate_features: bool = False, - training: bool = None) -> Mapping[str, tf.Tensor]: - """Forward pass of the RetinaNet model. - - Args: - images: `Tensor`, the input batched images, whose shape is - [batch, height, width, 3]. - image_shape: `Tensor`, the actual shape of the input images, whose shape - is [batch, 2] where the last dimension is [height, width]. Note that - this is the actual image shape excluding paddings. For example, images - in the batch may be resized into different shapes before padding to the - fixed size. - anchor_boxes: a dict of tensors which includes multilevel anchors. - - key: `str`, the level of the multilevel predictions. - - values: `Tensor`, the anchor coordinates of a particular feature - level, whose shape is [height_l, width_l, num_anchors_per_location]. - output_intermediate_features: `bool` indicating whether to return the - intermediate feature maps generated by backbone and decoder. - training: `bool`, indicating whether it is in training mode. - - Returns: - scores: a dict of tensors which includes scores of the predictions. - - key: `str`, the level of the multilevel predictions. - - values: `Tensor`, the box scores predicted from a particular feature - level, whose shape is - [batch, height_l, width_l, num_classes * num_anchors_per_location]. 
- boxes: a dict of tensors which includes coordinates of the predictions. - - key: `str`, the level of the multilevel predictions. - - values: `Tensor`, the box coordinates predicted from a particular - feature level, whose shape is - [batch, height_l, width_l, 4 * num_anchors_per_location]. - attributes: a dict of (attribute_name, attribute_predictions). Each - attribute prediction is a dict that includes: - - key: `str`, the level of the multilevel predictions. - - values: `Tensor`, the attribute predictions from a particular - feature level, whose shape is - [batch, height_l, width_l, att_size * num_anchors_per_location]. - """ - outputs = {} - # Feature extraction. - features = self.backbone(images) - if output_intermediate_features: - outputs.update( - {'backbone_{}'.format(k): v for k, v in features.items()}) - if self.decoder: - features = self.decoder(features) - if output_intermediate_features: - outputs.update( - {'decoder_{}'.format(k): v for k, v in features.items()}) - - # Dense prediction. `raw_attributes` can be empty. - raw_scores, raw_boxes, raw_attributes = self.head(features) - - if training: - outputs.update({ - 'cls_outputs': raw_scores, - 'box_outputs': raw_boxes, - }) - if raw_attributes: - outputs.update({'attribute_outputs': raw_attributes}) - return outputs - else: - # Generate anchor boxes for this batch if not provided. - if anchor_boxes is None: - _, image_height, image_width, _ = images.get_shape().as_list() - anchor_boxes = anchor.Anchor( - min_level=self._config_dict['min_level'], - max_level=self._config_dict['max_level'], - num_scales=self._config_dict['num_scales'], - aspect_ratios=self._config_dict['aspect_ratios'], - anchor_size=self._config_dict['anchor_size'], - image_size=(image_height, image_width)).multilevel_boxes - for l in anchor_boxes: - anchor_boxes[l] = tf.tile( - tf.expand_dims(anchor_boxes[l], axis=0), - [tf.shape(images)[0], 1, 1, 1]) - - # Post-processing. 
- final_results = self.detection_generator(raw_boxes, raw_scores, - anchor_boxes, image_shape, - raw_attributes) - outputs.update({ - 'cls_outputs': raw_scores, - 'box_outputs': raw_boxes, - }) - if self.detection_generator.get_config()['apply_nms']: - outputs.update({ - 'detection_boxes': final_results['detection_boxes'], - 'detection_scores': final_results['detection_scores'], - 'detection_classes': final_results['detection_classes'], - 'num_detections': final_results['num_detections'] - }) - else: - outputs.update({ - 'decoded_boxes': final_results['decoded_boxes'], - 'decoded_box_scores': final_results['decoded_box_scores'] - }) - - if raw_attributes: - outputs.update({ - 'attribute_outputs': raw_attributes, - 'detection_attributes': final_results['detection_attributes'], - }) - return outputs - - @property - def checkpoint_items( - self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]: - """Returns a dictionary of items to be additionally checkpointed.""" - items = dict(backbone=self.backbone, head=self.head) - if self.decoder is not None: - items.update(decoder=self.decoder) - - return items - - @property - def backbone(self) -> tf.keras.Model: - return self._backbone - - @property - def decoder(self) -> tf.keras.Model: - return self._decoder - - @property - def head(self) -> tf.keras.layers.Layer: - return self._head - - @property - def detection_generator(self) -> tf.keras.layers.Layer: - return self._detection_generator - - def get_config(self) -> Mapping[str, Any]: - return self._config_dict - - @classmethod - def from_config(cls, config): - return cls(**config) diff --git a/official/vision/beta/modeling/retinanet_model_test.py b/official/vision/beta/modeling/retinanet_model_test.py deleted file mode 100644 index 680fa4d58..000000000 --- a/official/vision/beta/modeling/retinanet_model_test.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
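Both the model and the tests fall back to generating multilevel anchors and tiling them across the batch whenever `anchor_boxes` is not passed in. A toy version of that tiling step, assuming per-level anchor tensors shaped [height_l, width_l, 4 * num_anchors_per_location] as returned by `Anchor(...).multilevel_boxes`:

```python
import tensorflow as tf

num_scales, aspect_ratios = 3, [1.0, 2.0, 0.5]
num_anchors_per_location = num_scales * len(aspect_ratios)  # 9 anchors per cell

# Pretend these came from Anchor(...).multilevel_boxes for levels 3 and 4.
anchor_boxes = {
    '3': tf.zeros([32, 32, 4 * num_anchors_per_location]),
    '4': tf.zeros([16, 16, 4 * num_anchors_per_location]),
}

batch_size = 2
for level in anchor_boxes:
  anchor_boxes[level] = tf.tile(
      tf.expand_dims(anchor_boxes[level], axis=0),  # add a batch dimension
      [batch_size, 1, 1, 1])                        # repeat once per image

print(anchor_boxes['3'].shape)  # (2, 32, 32, 36)
```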
- -"""Tests for RetinaNet models.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.vision.beta.modeling import retinanet_model -from official.vision.beta.modeling.backbones import resnet -from official.vision.beta.modeling.decoders import fpn -from official.vision.beta.modeling.heads import dense_prediction_heads -from official.vision.beta.modeling.layers import detection_generator -from official.vision.beta.ops import anchor - - -class RetinaNetTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - { - 'use_separable_conv': True, - 'build_anchor_boxes': True, - 'is_training': False, - 'has_att_heads': False - }, - { - 'use_separable_conv': False, - 'build_anchor_boxes': True, - 'is_training': False, - 'has_att_heads': False - }, - { - 'use_separable_conv': False, - 'build_anchor_boxes': False, - 'is_training': False, - 'has_att_heads': False - }, - { - 'use_separable_conv': False, - 'build_anchor_boxes': False, - 'is_training': True, - 'has_att_heads': False - }, - { - 'use_separable_conv': False, - 'build_anchor_boxes': True, - 'is_training': True, - 'has_att_heads': True - }, - { - 'use_separable_conv': False, - 'build_anchor_boxes': True, - 'is_training': False, - 'has_att_heads': True - }, - ) - def test_build_model(self, use_separable_conv, build_anchor_boxes, - is_training, has_att_heads): - num_classes = 3 - min_level = 3 - max_level = 7 - num_scales = 3 - aspect_ratios = [1.0] - anchor_size = 3 - fpn_num_filters = 256 - head_num_convs = 4 - head_num_filters = 256 - num_anchors_per_location = num_scales * len(aspect_ratios) - image_size = 384 - images = np.random.rand(2, image_size, image_size, 3) - image_shape = np.array([[image_size, image_size], [image_size, image_size]]) - - if build_anchor_boxes: - anchor_boxes = anchor.Anchor( - min_level=min_level, - max_level=max_level, - num_scales=num_scales, - aspect_ratios=aspect_ratios, - anchor_size=anchor_size, - image_size=(image_size, image_size)).multilevel_boxes - for l in anchor_boxes: - anchor_boxes[l] = tf.tile( - tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1]) - else: - anchor_boxes = None - - if has_att_heads: - attribute_heads = [dict(name='depth', type='regression', size=1)] - else: - attribute_heads = None - - backbone = resnet.ResNet(model_id=50) - decoder = fpn.FPN( - input_specs=backbone.output_specs, - min_level=min_level, - max_level=max_level, - num_filters=fpn_num_filters, - use_separable_conv=use_separable_conv) - head = dense_prediction_heads.RetinaNetHead( - min_level=min_level, - max_level=max_level, - num_classes=num_classes, - attribute_heads=attribute_heads, - num_anchors_per_location=num_anchors_per_location, - use_separable_conv=use_separable_conv, - num_convs=head_num_convs, - num_filters=head_num_filters) - generator = detection_generator.MultilevelDetectionGenerator( - max_num_detections=10) - model = retinanet_model.RetinaNetModel( - backbone=backbone, - decoder=decoder, - head=head, - detection_generator=generator, - min_level=min_level, - max_level=max_level, - num_scales=num_scales, - aspect_ratios=aspect_ratios, - anchor_size=anchor_size) - - _ = model(images, image_shape, anchor_boxes, training=is_training) - - @combinations.generate( - combinations.combine( - strategy=[ - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - 
image_size=[ - (128, 128), - ], - training=[True, False], - has_att_heads=[True, False], - output_intermediate_features=[True, False], - soft_nms_sigma=[None, 0.0, 0.1], - )) - def test_forward(self, strategy, image_size, training, has_att_heads, - output_intermediate_features, soft_nms_sigma): - """Test for creation of a R50-FPN RetinaNet.""" - tf.keras.backend.set_image_data_format('channels_last') - num_classes = 3 - min_level = 3 - max_level = 7 - num_scales = 3 - aspect_ratios = [1.0] - num_anchors_per_location = num_scales * len(aspect_ratios) - - images = np.random.rand(2, image_size[0], image_size[1], 3) - image_shape = np.array( - [[image_size[0], image_size[1]], [image_size[0], image_size[1]]]) - - with strategy.scope(): - anchor_gen = anchor.build_anchor_generator( - min_level=min_level, - max_level=max_level, - num_scales=num_scales, - aspect_ratios=aspect_ratios, - anchor_size=3) - anchor_boxes = anchor_gen(image_size) - for l in anchor_boxes: - anchor_boxes[l] = tf.tile( - tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1]) - - backbone = resnet.ResNet(model_id=50) - decoder = fpn.FPN( - input_specs=backbone.output_specs, - min_level=min_level, - max_level=max_level) - - if has_att_heads: - attribute_heads = [dict(name='depth', type='regression', size=1)] - else: - attribute_heads = None - head = dense_prediction_heads.RetinaNetHead( - min_level=min_level, - max_level=max_level, - num_classes=num_classes, - attribute_heads=attribute_heads, - num_anchors_per_location=num_anchors_per_location) - generator = detection_generator.MultilevelDetectionGenerator( - max_num_detections=10, - nms_version='v1', - use_cpu_nms=soft_nms_sigma is not None, - soft_nms_sigma=soft_nms_sigma) - model = retinanet_model.RetinaNetModel( - backbone=backbone, - decoder=decoder, - head=head, - detection_generator=generator) - - model_outputs = model( - images, - image_shape, - anchor_boxes, - output_intermediate_features=output_intermediate_features, - training=training) - - if training: - cls_outputs = model_outputs['cls_outputs'] - box_outputs = model_outputs['box_outputs'] - for level in range(min_level, max_level + 1): - self.assertIn(str(level), cls_outputs) - self.assertIn(str(level), box_outputs) - self.assertAllEqual([ - 2, - image_size[0] // 2**level, - image_size[1] // 2**level, - num_classes * num_anchors_per_location - ], cls_outputs[str(level)].numpy().shape) - self.assertAllEqual([ - 2, - image_size[0] // 2**level, - image_size[1] // 2**level, - 4 * num_anchors_per_location - ], box_outputs[str(level)].numpy().shape) - if has_att_heads: - att_outputs = model_outputs['attribute_outputs'] - for att in att_outputs.values(): - self.assertAllEqual([ - 2, image_size[0] // 2**level, image_size[1] // 2**level, - 1 * num_anchors_per_location - ], att[str(level)].numpy().shape) - else: - self.assertIn('detection_boxes', model_outputs) - self.assertIn('detection_scores', model_outputs) - self.assertIn('detection_classes', model_outputs) - self.assertIn('num_detections', model_outputs) - self.assertAllEqual( - [2, 10, 4], model_outputs['detection_boxes'].numpy().shape) - self.assertAllEqual( - [2, 10], model_outputs['detection_scores'].numpy().shape) - self.assertAllEqual( - [2, 10], model_outputs['detection_classes'].numpy().shape) - self.assertAllEqual( - [2,], model_outputs['num_detections'].numpy().shape) - if has_att_heads: - self.assertIn('detection_attributes', model_outputs) - self.assertAllEqual( - [2, 10, 1], - model_outputs['detection_attributes']['depth'].numpy().shape) - if 
output_intermediate_features: - for l in range(2, 6): - self.assertIn('backbone_{}'.format(l), model_outputs) - self.assertAllEqual([ - 2, image_size[0] // 2**l, image_size[1] // 2**l, - backbone.output_specs[str(l)].as_list()[-1] - ], model_outputs['backbone_{}'.format(l)].numpy().shape) - for l in range(min_level, max_level + 1): - self.assertIn('decoder_{}'.format(l), model_outputs) - self.assertAllEqual([ - 2, image_size[0] // 2**l, image_size[1] // 2**l, - decoder.output_specs[str(l)].as_list()[-1] - ], model_outputs['decoder_{}'.format(l)].numpy().shape) - - def test_serialize_deserialize(self): - """Validate the network can be serialized and deserialized.""" - num_classes = 3 - min_level = 3 - max_level = 7 - num_scales = 3 - aspect_ratios = [1.0] - num_anchors_per_location = num_scales * len(aspect_ratios) - - backbone = resnet.ResNet(model_id=50) - decoder = fpn.FPN( - input_specs=backbone.output_specs, - min_level=min_level, - max_level=max_level) - head = dense_prediction_heads.RetinaNetHead( - min_level=min_level, - max_level=max_level, - num_classes=num_classes, - num_anchors_per_location=num_anchors_per_location) - generator = detection_generator.MultilevelDetectionGenerator( - max_num_detections=10) - model = retinanet_model.RetinaNetModel( - backbone=backbone, - decoder=decoder, - head=head, - detection_generator=generator, - min_level=min_level, - max_level=max_level, - num_scales=num_scales, - aspect_ratios=aspect_ratios, - anchor_size=3) - - config = model.get_config() - new_model = retinanet_model.RetinaNetModel.from_config(config) - - # Validate that the config can be forced to JSON. - _ = new_model.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(model.get_config(), new_model.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/segmentation_model.py b/official/vision/beta/modeling/segmentation_model.py deleted file mode 100644 index 0f213df5b..000000000 --- a/official/vision/beta/modeling/segmentation_model.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Build segmentation models.""" -from typing import Any, Mapping, Union, Optional, Dict - -# Import libraries -import tensorflow as tf - -layers = tf.keras.layers - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class SegmentationModel(tf.keras.Model): - """A Segmentation class model. - - Input images are passed through backbone first. Decoder network is then - applied, and finally, segmentation head is applied on the output of the - decoder network. Layers such as ASPP should be part of decoder. Any feature - fusion is done as part of the segmentation head (i.e. deeplabv3+ feature - fusion is not part of the decoder, instead it is part of the segmentation - head). This way, different feature fusion techniques can be combined with - different backbones, and decoders. 
- """ - - def __init__(self, backbone: tf.keras.Model, decoder: tf.keras.Model, - head: tf.keras.layers.Layer, - mask_scoring_head: Optional[tf.keras.layers.Layer] = None, - **kwargs): - """Segmentation initialization function. - - Args: - backbone: a backbone network. - decoder: a decoder network. E.g. FPN. - head: segmentation head. - mask_scoring_head: mask scoring head. - **kwargs: keyword arguments to be passed. - """ - super(SegmentationModel, self).__init__(**kwargs) - self._config_dict = { - 'backbone': backbone, - 'decoder': decoder, - 'head': head, - 'mask_scoring_head': mask_scoring_head, - } - self.backbone = backbone - self.decoder = decoder - self.head = head - self.mask_scoring_head = mask_scoring_head - - def call(self, inputs: tf.Tensor, training: bool = None - ) -> Dict[str, tf.Tensor]: - backbone_features = self.backbone(inputs) - - if self.decoder: - decoder_features = self.decoder(backbone_features) - else: - decoder_features = backbone_features - - logits = self.head((backbone_features, decoder_features)) - outputs = {'logits': logits} - if self.mask_scoring_head: - mask_scores = self.mask_scoring_head(logits) - outputs.update({'mask_scores': mask_scores}) - return outputs - - @property - def checkpoint_items( - self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]: - """Returns a dictionary of items to be additionally checkpointed.""" - items = dict(backbone=self.backbone, head=self.head) - if self.decoder is not None: - items.update(decoder=self.decoder) - if self.mask_scoring_head is not None: - items.update(mask_scoring_head=self.mask_scoring_head) - return items - - def get_config(self) -> Mapping[str, Any]: - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) diff --git a/official/vision/beta/modeling/segmentation_model_test.py b/official/vision/beta/modeling/segmentation_model_test.py deleted file mode 100644 index 4f7bdfed6..000000000 --- a/official/vision/beta/modeling/segmentation_model_test.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for segmentation network.""" - -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.vision.beta.modeling import backbones -from official.vision.beta.modeling import segmentation_model -from official.vision.beta.modeling.decoders import fpn -from official.vision.beta.modeling.heads import segmentation_heads - - -class SegmentationNetworkTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (128, 2), - (128, 3), - (128, 4), - (256, 2), - (256, 3), - (256, 4), - ) - def test_segmentation_network_creation( - self, input_size, level): - """Test for creation of a segmentation network.""" - num_classes = 10 - inputs = np.random.rand(2, input_size, input_size, 3) - tf.keras.backend.set_image_data_format('channels_last') - backbone = backbones.ResNet(model_id=50) - - decoder = fpn.FPN( - input_specs=backbone.output_specs, min_level=2, max_level=7) - head = segmentation_heads.SegmentationHead(num_classes, level=level) - - model = segmentation_model.SegmentationModel( - backbone=backbone, - decoder=decoder, - head=head, - mask_scoring_head=None, - ) - - outputs = model(inputs) - self.assertAllEqual( - [2, input_size // (2**level), input_size // (2**level), num_classes], - outputs['logits'].numpy().shape) - - def test_serialize_deserialize(self): - """Validate the network can be serialized and deserialized.""" - num_classes = 3 - backbone = backbones.ResNet(model_id=50) - decoder = fpn.FPN( - input_specs=backbone.output_specs, min_level=3, max_level=7) - head = segmentation_heads.SegmentationHead(num_classes, level=3) - model = segmentation_model.SegmentationModel( - backbone=backbone, - decoder=decoder, - head=head - ) - - config = model.get_config() - new_model = segmentation_model.SegmentationModel.from_config(config) - - # Validate that the config can be forced to JSON. - _ = new_model.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(model.get_config(), new_model.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/modeling/video_classification_model.py b/official/vision/beta/modeling/video_classification_model.py deleted file mode 100644 index 40c72f748..000000000 --- a/official/vision/beta/modeling/video_classification_model.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Build video classification models.""" -from typing import Any, Mapping, Optional, Union, List, Text - -import tensorflow as tf - -layers = tf.keras.layers - - -@tf.keras.utils.register_keras_serializable(package='Beta') -class VideoClassificationModel(tf.keras.Model): - """A video classification class builder.""" - - def __init__( - self, - backbone: tf.keras.Model, - num_classes: int, - input_specs: Optional[Mapping[str, tf.keras.layers.InputSpec]] = None, - dropout_rate: float = 0.0, - aggregate_endpoints: bool = False, - kernel_initializer: str = 'random_uniform', - kernel_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Optional[tf.keras.regularizers.Regularizer] = None, - require_endpoints: Optional[List[Text]] = None, - **kwargs): - """Video Classification initialization function. - - Args: - backbone: a 3d backbone network. - num_classes: `int` number of classes in classification task. - input_specs: `tf.keras.layers.InputSpec` specs of the input tensor. - dropout_rate: `float` rate for dropout regularization. - aggregate_endpoints: `bool` aggregate all end ponits or only use the - final end point. - kernel_initializer: kernel initializer for the dense layer. - kernel_regularizer: tf.keras.regularizers.Regularizer object. Default to - None. - bias_regularizer: tf.keras.regularizers.Regularizer object. Default to - None. - require_endpoints: the required endpoints for prediction. If None or - empty, then only uses the final endpoint. - **kwargs: keyword arguments to be passed. - """ - if not input_specs: - input_specs = { - 'image': layers.InputSpec(shape=[None, None, None, None, 3]) - } - self._self_setattr_tracking = False - self._config_dict = { - 'backbone': backbone, - 'num_classes': num_classes, - 'input_specs': input_specs, - 'dropout_rate': dropout_rate, - 'aggregate_endpoints': aggregate_endpoints, - 'kernel_initializer': kernel_initializer, - 'kernel_regularizer': kernel_regularizer, - 'bias_regularizer': bias_regularizer, - 'require_endpoints': require_endpoints, - } - self._input_specs = input_specs - self._kernel_regularizer = kernel_regularizer - self._bias_regularizer = bias_regularizer - self._backbone = backbone - - inputs = { - k: tf.keras.Input(shape=v.shape[1:]) for k, v in input_specs.items() - } - endpoints = backbone(inputs['image']) - - if aggregate_endpoints: - pooled_feats = [] - for endpoint in endpoints.values(): - x_pool = tf.keras.layers.GlobalAveragePooling3D()(endpoint) - pooled_feats.append(x_pool) - x = tf.concat(pooled_feats, axis=1) - else: - if not require_endpoints: - # Uses the last endpoint for prediction. - x = endpoints[max(endpoints.keys())] - x = tf.keras.layers.GlobalAveragePooling3D()(x) - else: - # Concats all the required endpoints for prediction. 
- outputs = [] - for name in require_endpoints: - x = endpoints[name] - x = tf.keras.layers.GlobalAveragePooling3D()(x) - outputs.append(x) - x = tf.concat(outputs, axis=1) - - x = tf.keras.layers.Dropout(dropout_rate)(x) - x = tf.keras.layers.Dense( - num_classes, kernel_initializer=kernel_initializer, - kernel_regularizer=self._kernel_regularizer, - bias_regularizer=self._bias_regularizer)( - x) - - super(VideoClassificationModel, self).__init__( - inputs=inputs, outputs=x, **kwargs) - - @property - def checkpoint_items( - self) -> Mapping[str, Union[tf.keras.Model, tf.keras.layers.Layer]]: - """Returns a dictionary of items to be additionally checkpointed.""" - return dict(backbone=self.backbone) - - @property - def backbone(self) -> tf.keras.Model: - return self._backbone - - def get_config(self) -> Mapping[str, Any]: - return self._config_dict - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) diff --git a/official/vision/beta/modeling/video_classification_model_test.py b/official/vision/beta/modeling/video_classification_model_test.py deleted file mode 100644 index d1f36f9b2..000000000 --- a/official/vision/beta/modeling/video_classification_model_test.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
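The endpoint handling above reduces to: pool each selected 5-D feature map to a vector, concatenate, then apply dropout and a dense classifier. A compact sketch with random endpoints in place of a real 3-D backbone:

```python
import tensorflow as tf

# Fake backbone endpoints: [batch, time, height, width, channels].
endpoints = {
    '4': tf.random.normal([2, 4, 14, 14, 256]),
    '5': tf.random.normal([2, 2, 7, 7, 512]),
}

# aggregate_endpoints=True: pool every endpoint and concatenate.
pooled = [tf.keras.layers.GlobalAveragePooling3D()(x) for x in endpoints.values()]
features = tf.concat(pooled, axis=1)           # [2, 256 + 512]

# Classifier head, mirroring the dropout + dense layers above.
features = tf.keras.layers.Dropout(0.2)(features)
logits = tf.keras.layers.Dense(1000)(features)
print(logits.shape)                            # (2, 1000)
```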
- -"""Tests for video classification network.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.vision.beta.modeling import backbones -from official.vision.beta.modeling import video_classification_model - - -class VideoClassificationNetworkTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - (50, 8, 112, 'relu', False), - (50, 8, 112, 'swish', True), - ) - def test_resnet3d_network_creation(self, model_id, temporal_size, - spatial_size, activation, - aggregate_endpoints): - """Test for creation of a ResNet3D-50 classifier.""" - input_specs = tf.keras.layers.InputSpec( - shape=[None, temporal_size, spatial_size, spatial_size, 3]) - temporal_strides = [1, 1, 1, 1] - temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1), - (1, 3, 1)] - - tf.keras.backend.set_image_data_format('channels_last') - - backbone = backbones.ResNet3D( - model_id=model_id, - temporal_strides=temporal_strides, - temporal_kernel_sizes=temporal_kernel_sizes, - input_specs=input_specs, - activation=activation) - - num_classes = 1000 - model = video_classification_model.VideoClassificationModel( - backbone=backbone, - num_classes=num_classes, - input_specs={'image': input_specs}, - dropout_rate=0.2, - aggregate_endpoints=aggregate_endpoints, - ) - - inputs = np.random.rand(2, temporal_size, spatial_size, spatial_size, 3) - logits = model(inputs) - self.assertAllEqual([2, num_classes], logits.numpy().shape) - - def test_serialize_deserialize(self): - """Validate the classification network can be serialized and deserialized.""" - model_id = 50 - temporal_strides = [1, 1, 1, 1] - temporal_kernel_sizes = [(3, 3, 3), (3, 1, 3, 1), (3, 1, 3, 1, 3, 1), - (1, 3, 1)] - - backbone = backbones.ResNet3D( - model_id=model_id, - temporal_strides=temporal_strides, - temporal_kernel_sizes=temporal_kernel_sizes) - - model = video_classification_model.VideoClassificationModel( - backbone=backbone, num_classes=1000) - - config = model.get_config() - new_model = video_classification_model.VideoClassificationModel.from_config( - config) - - # Validate that the config can be forced to JSON. - _ = new_model.to_json() - - # If the serialization was successful, the new config should match the old. - self.assertAllEqual(model.get_config(), new_model.get_config()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/ops/__init__.py b/official/vision/beta/ops/__init__.py deleted file mode 100644 index 310bfb28f..000000000 --- a/official/vision/beta/ops/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - diff --git a/official/vision/beta/ops/anchor.py b/official/vision/beta/ops/anchor.py deleted file mode 100644 index 9351864bd..000000000 --- a/official/vision/beta/ops/anchor.py +++ /dev/null @@ -1,373 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Anchor box and labeler definition.""" - -import collections - -# Import libraries - -import tensorflow as tf - -from official.vision.beta.ops import anchor_generator -from official.vision.beta.ops import box_matcher -from official.vision.beta.ops import iou_similarity -from official.vision.beta.ops import target_gather -from official.vision.utils.object_detection import balanced_positive_negative_sampler -from official.vision.utils.object_detection import box_list -from official.vision.utils.object_detection import faster_rcnn_box_coder - - -class Anchor(object): - """Anchor class for anchor-based object detectors.""" - - def __init__(self, - min_level, - max_level, - num_scales, - aspect_ratios, - anchor_size, - image_size): - """Constructs multiscale anchors. - - Args: - min_level: integer number of minimum level of the output feature pyramid. - max_level: integer number of maximum level of the output feature pyramid. - num_scales: integer number representing intermediate scales added - on each level. For instances, num_scales=2 adds one additional - intermediate anchor scales [2^0, 2^0.5] on each level. - aspect_ratios: list of float numbers representing the aspect raito anchors - added on each level. The number indicates the ratio of width to height. - For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each - scale level. - anchor_size: float number representing the scale of size of the base - anchor to the feature stride 2^level. - image_size: a list of integer numbers or Tensors representing - [height, width] of the input image size.The image_size should be divided - by the largest feature stride 2^max_level. - """ - self.min_level = min_level - self.max_level = max_level - self.num_scales = num_scales - self.aspect_ratios = aspect_ratios - self.anchor_size = anchor_size - self.image_size = image_size - self.boxes = self._generate_boxes() - - def _generate_boxes(self): - """Generates multiscale anchor boxes. - - Returns: - a Tensor of shape [N, 4], representing anchor boxes of all levels - concatenated together. - """ - boxes_all = [] - for level in range(self.min_level, self.max_level + 1): - boxes_l = [] - for scale in range(self.num_scales): - for aspect_ratio in self.aspect_ratios: - stride = 2 ** level - intermidate_scale = 2 ** (scale / float(self.num_scales)) - base_anchor_size = self.anchor_size * stride * intermidate_scale - aspect_x = aspect_ratio ** 0.5 - aspect_y = aspect_ratio ** -0.5 - half_anchor_size_x = base_anchor_size * aspect_x / 2.0 - half_anchor_size_y = base_anchor_size * aspect_y / 2.0 - x = tf.range(stride / 2, self.image_size[1], stride) - y = tf.range(stride / 2, self.image_size[0], stride) - xv, yv = tf.meshgrid(x, y) - xv = tf.cast(tf.reshape(xv, [-1]), dtype=tf.float32) - yv = tf.cast(tf.reshape(yv, [-1]), dtype=tf.float32) - # Tensor shape Nx4. 
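As an aside, the Nx4 corner computation above is easy to check in isolation. The sketch below mirrors a single `(level, scale, aspect_ratio)` iteration of the loop with toy values (level 5, one scale, square anchors, anchor size 2.0, a 64x64 image); the numbers are illustrative and match the single-scale case exercised by the anchor tests later in this patch.

```python
import tensorflow as tf

level, num_scales, scale = 5, 1, 0
aspect_ratio, anchor_size = 1.0, 2.0
image_size = (64, 64)

stride = 2 ** level                                       # 32 pixels between anchor centers
base = anchor_size * stride * 2 ** (scale / num_scales)   # 64.0
half_x = base * aspect_ratio ** 0.5 / 2.0                 # 32.0
half_y = base * aspect_ratio ** -0.5 / 2.0                # 32.0

x = tf.range(stride / 2, image_size[1], stride, dtype=tf.float32)  # center columns
y = tf.range(stride / 2, image_size[0], stride, dtype=tf.float32)  # center rows
xv, yv = tf.meshgrid(x, y)
xv, yv = tf.reshape(xv, [-1]), tf.reshape(yv, [-1])

boxes = tf.stack([yv - half_y, xv - half_x, yv + half_y, xv + half_x], axis=1)
print(boxes.numpy())
# [[-16. -16.  48.  48.] [-16.  16.  48.  80.] [ 16. -16.  80.  48.] [ 16.  16.  80.  80.]]
```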
- boxes = tf.stack([yv - half_anchor_size_y, xv - half_anchor_size_x, - yv + half_anchor_size_y, xv + half_anchor_size_x], - axis=1) - boxes_l.append(boxes) - # Concat anchors on the same level to tensor shape NxAx4. - boxes_l = tf.stack(boxes_l, axis=1) - boxes_l = tf.reshape(boxes_l, [-1, 4]) - boxes_all.append(boxes_l) - return tf.concat(boxes_all, axis=0) - - def unpack_labels(self, labels): - """Unpacks an array of labels into multiscales labels.""" - unpacked_labels = collections.OrderedDict() - count = 0 - for level in range(self.min_level, self.max_level + 1): - feat_size_y = tf.cast(self.image_size[0] / 2 ** level, tf.int32) - feat_size_x = tf.cast(self.image_size[1] / 2 ** level, tf.int32) - steps = feat_size_y * feat_size_x * self.anchors_per_location - unpacked_labels[str(level)] = tf.reshape( - labels[count:count + steps], [feat_size_y, feat_size_x, -1]) - count += steps - return unpacked_labels - - @property - def anchors_per_location(self): - return self.num_scales * len(self.aspect_ratios) - - @property - def multilevel_boxes(self): - return self.unpack_labels(self.boxes) - - -class AnchorLabeler(object): - """Labeler for dense object detector.""" - - def __init__(self, - match_threshold=0.5, - unmatched_threshold=0.5): - """Constructs anchor labeler to assign labels to anchors. - - Args: - match_threshold: a float number between 0 and 1 representing the - lower-bound threshold to assign positive labels for anchors. An anchor - with a score over the threshold is labeled positive. - unmatched_threshold: a float number between 0 and 1 representing the - upper-bound threshold to assign negative labels for anchors. An anchor - with a score below the threshold is labeled negative. - """ - self.similarity_calc = iou_similarity.IouSimilarity() - self.target_gather = target_gather.TargetGather() - self.matcher = box_matcher.BoxMatcher( - thresholds=[unmatched_threshold, match_threshold], - indicators=[-1, -2, 1], - force_match_for_each_col=True) - self.box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder() - - def label_anchors(self, - anchor_boxes, - gt_boxes, - gt_labels, - gt_attributes=None): - """Labels anchors with ground truth inputs. - - Args: - anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes. - For each row, it stores [y0, x0, y1, x1] for four corners of a box. - gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes. - For each row, it stores [y0, x0, y1, x1] for four corners of a box. - gt_labels: A integer tensor with shape [N, 1] representing groundtruth - classes. - gt_attributes: If not None, a dict of (name, gt_attribute) pairs. - `gt_attribute` is a float tensor with shape [N, attribute_size] - representing groundtruth attributes. - Returns: - cls_targets_dict: ordered dictionary with keys - [min_level, min_level+1, ..., max_level]. The values are tensor with - shape [height_l, width_l, num_anchors_per_location]. The height_l and - width_l represent the dimension of class logits at l-th level. - box_targets_dict: ordered dictionary with keys - [min_level, min_level+1, ..., max_level]. The values are tensor with - shape [height_l, width_l, num_anchors_per_location * 4]. The height_l - and width_l represent the dimension of bounding box regression output at - l-th level. - attribute_targets_dict: a dict with (name, attribute_targets) pairs. Each - `attribute_targets` represents an ordered dictionary with keys - [min_level, min_level+1, ..., max_level]. 
The values are tensor with - shape [height_l, width_l, num_anchors_per_location * attribute_size]. - The height_l and width_l represent the dimension of attribute prediction - output at l-th level. - cls_weights: A flattened Tensor with shape [batch_size, num_anchors], that - serves as masking / sample weight for classification loss. Its value - is 1.0 for positive and negative matched anchors, and 0.0 for ignored - anchors. - box_weights: A flattened Tensor with shape [batch_size, num_anchors], that - serves as masking / sample weight for regression loss. Its value is - 1.0 for positive matched anchors, and 0.0 for negative and ignored - anchors. - """ - flattened_anchor_boxes = [] - for anchors in anchor_boxes.values(): - flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4])) - flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0) - similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes) - match_indices, match_indicators = self.matcher(similarity_matrix) - - mask = tf.less_equal(match_indicators, 0) - cls_mask = tf.expand_dims(mask, -1) - cls_targets = self.target_gather(gt_labels, match_indices, cls_mask, -1) - box_mask = tf.tile(cls_mask, [1, 4]) - box_targets = self.target_gather(gt_boxes, match_indices, box_mask) - att_targets = {} - if gt_attributes: - for k, v in gt_attributes.items(): - att_size = v.get_shape().as_list()[-1] - att_mask = tf.tile(cls_mask, [1, att_size]) - att_targets[k] = self.target_gather(v, match_indices, att_mask, 0.0) - - weights = tf.squeeze(tf.ones_like(gt_labels, dtype=tf.float32), -1) - box_weights = self.target_gather(weights, match_indices, mask) - ignore_mask = tf.equal(match_indicators, -2) - cls_weights = self.target_gather(weights, match_indices, ignore_mask) - box_targets_list = box_list.BoxList(box_targets) - anchor_box_list = box_list.BoxList(flattened_anchor_boxes) - box_targets = self.box_coder.encode(box_targets_list, anchor_box_list) - - # Unpacks labels into multi-level representations. - cls_targets_dict = unpack_targets(cls_targets, anchor_boxes) - box_targets_dict = unpack_targets(box_targets, anchor_boxes) - attribute_targets_dict = {} - for k, v in att_targets.items(): - attribute_targets_dict[k] = unpack_targets(v, anchor_boxes) - - return cls_targets_dict, box_targets_dict, attribute_targets_dict, cls_weights, box_weights - - -class RpnAnchorLabeler(AnchorLabeler): - """Labeler for Region Proposal Network.""" - - def __init__(self, - match_threshold=0.7, - unmatched_threshold=0.3, - rpn_batch_size_per_im=256, - rpn_fg_fraction=0.5): - AnchorLabeler.__init__(self, match_threshold=match_threshold, - unmatched_threshold=unmatched_threshold) - self._rpn_batch_size_per_im = rpn_batch_size_per_im - self._rpn_fg_fraction = rpn_fg_fraction - - def _get_rpn_samples(self, match_results): - """Computes anchor labels. - - This function performs subsampling for foreground (fg) and background (bg) - anchors. - Args: - match_results: A integer tensor with shape [N] representing the - matching results of anchors. (1) match_results[i]>=0, - meaning that column i is matched with row match_results[i]. - (2) match_results[i]=-1, meaning that column i is not matched. - (3) match_results[i]=-2, meaning that column i is ignored. - Returns: - score_targets: a integer tensor with the a shape of [N]. - (1) score_targets[i]=1, the anchor is a positive sample. - (2) score_targets[i]=0, negative. (3) score_targets[i]=-1, the anchor is - don't care (ignore). 
- """ - sampler = ( - balanced_positive_negative_sampler.BalancedPositiveNegativeSampler( - positive_fraction=self._rpn_fg_fraction, is_static=False)) - # indicator includes both positive and negative labels. - # labels includes only positives labels. - # positives = indicator & labels. - # negatives = indicator & !labels. - # ignore = !indicator. - indicator = tf.greater(match_results, -2) - labels = tf.greater(match_results, -1) - - samples = sampler.subsample( - indicator, self._rpn_batch_size_per_im, labels) - positive_labels = tf.where( - tf.logical_and(samples, labels), - tf.constant(2, dtype=tf.int32, shape=match_results.shape), - tf.constant(0, dtype=tf.int32, shape=match_results.shape)) - negative_labels = tf.where( - tf.logical_and(samples, tf.logical_not(labels)), - tf.constant(1, dtype=tf.int32, shape=match_results.shape), - tf.constant(0, dtype=tf.int32, shape=match_results.shape)) - ignore_labels = tf.fill(match_results.shape, -1) - - return (ignore_labels + positive_labels + negative_labels, - positive_labels, negative_labels) - - def label_anchors(self, anchor_boxes, gt_boxes, gt_labels): - """Labels anchors with ground truth inputs. - - Args: - anchor_boxes: A float tensor with shape [N, 4] representing anchor boxes. - For each row, it stores [y0, x0, y1, x1] for four corners of a box. - gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes. - For each row, it stores [y0, x0, y1, x1] for four corners of a box. - gt_labels: A integer tensor with shape [N, 1] representing groundtruth - classes. - Returns: - score_targets_dict: ordered dictionary with keys - [min_level, min_level+1, ..., max_level]. The values are tensor with - shape [height_l, width_l, num_anchors]. The height_l and width_l - represent the dimension of class logits at l-th level. - box_targets_dict: ordered dictionary with keys - [min_level, min_level+1, ..., max_level]. The values are tensor with - shape [height_l, width_l, num_anchors * 4]. The height_l and - width_l represent the dimension of bounding box regression output at - l-th level. - """ - flattened_anchor_boxes = [] - for anchors in anchor_boxes.values(): - flattened_anchor_boxes.append(tf.reshape(anchors, [-1, 4])) - flattened_anchor_boxes = tf.concat(flattened_anchor_boxes, axis=0) - similarity_matrix = self.similarity_calc(flattened_anchor_boxes, gt_boxes) - match_indices, match_indicators = self.matcher(similarity_matrix) - box_mask = tf.tile(tf.expand_dims(tf.less_equal(match_indicators, 0), -1), - [1, 4]) - box_targets = self.target_gather(gt_boxes, match_indices, box_mask) - box_targets_list = box_list.BoxList(box_targets) - anchor_box_list = box_list.BoxList(flattened_anchor_boxes) - box_targets = self.box_coder.encode(box_targets_list, anchor_box_list) - - # Zero out the unmatched and ignored regression targets. - num_matches = match_indices.shape.as_list()[0] or tf.shape(match_indices)[0] - unmatched_ignored_box_targets = tf.zeros([num_matches, 4], dtype=tf.float32) - matched_anchors_mask = tf.greater_equal(match_indicators, 0) - # To broadcast matched_anchors_mask to the same shape as - # matched_reg_targets. - matched_anchors_mask = tf.tile( - tf.expand_dims(matched_anchors_mask, 1), - [1, tf.shape(box_targets)[1]]) - box_targets = tf.where(matched_anchors_mask, box_targets, - unmatched_ignored_box_targets) - - # score_targets contains the subsampled positive and negative anchors. - score_targets, _, _ = self._get_rpn_samples(match_indicators) - - # Unpacks labels. 
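The arithmetic that folds the sampled positives and negatives into score targets is simple to verify by hand. The sketch below reproduces it in NumPy with a hand-written `match_results` vector and skips the balanced subsampling, so every candidate is kept; the values are illustrative only.

```python
import numpy as np

# Match results per anchor: >=0 matched to a ground-truth row, -1 unmatched, -2 ignored.
match_results = np.array([3, 0, -1, -1, -2])

labels = match_results > -1        # positives
samples = match_results > -2       # positives + negatives (pretend nothing is subsampled)

positive = np.where(samples & labels, 2, 0)
negative = np.where(samples & ~labels, 1, 0)
ignore = np.full_like(match_results, -1)

score_targets = ignore + positive + negative
print(score_targets)  # [ 1  1  0  0 -1]  -> 1 positive, 0 negative, -1 don't-care
```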
- score_targets_dict = unpack_targets(score_targets, anchor_boxes) - box_targets_dict = unpack_targets(box_targets, anchor_boxes) - - return score_targets_dict, box_targets_dict - - -def build_anchor_generator(min_level, max_level, num_scales, aspect_ratios, - anchor_size): - """Build anchor generator from levels.""" - anchor_sizes = collections.OrderedDict() - strides = collections.OrderedDict() - scales = [] - for scale in range(num_scales): - scales.append(2**(scale / float(num_scales))) - for level in range(min_level, max_level + 1): - stride = 2**level - strides[str(level)] = stride - anchor_sizes[str(level)] = anchor_size * stride - anchor_gen = anchor_generator.AnchorGenerator( - anchor_sizes=anchor_sizes, - scales=scales, - aspect_ratios=aspect_ratios, - strides=strides) - return anchor_gen - - -def unpack_targets(targets, anchor_boxes_dict): - """Unpacks an array of labels into multiscales labels.""" - unpacked_targets = collections.OrderedDict() - count = 0 - for level, anchor_boxes in anchor_boxes_dict.items(): - feat_size_shape = anchor_boxes.shape.as_list() - feat_size_y = feat_size_shape[0] - feat_size_x = feat_size_shape[1] - anchors_per_location = int(feat_size_shape[2] / 4) - steps = feat_size_y * feat_size_x * anchors_per_location - unpacked_targets[level] = tf.reshape(targets[count:count + steps], - [feat_size_y, feat_size_x, -1]) - count += steps - return unpacked_targets diff --git a/official/vision/beta/ops/anchor_generator.py b/official/vision/beta/ops/anchor_generator.py deleted file mode 100644 index b2ced0c62..000000000 --- a/official/vision/beta/ops/anchor_generator.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multi scale anchor generator definition.""" - -import tensorflow as tf - - -# (TODO/tanzheny): consider having customized anchor offset. -class _SingleAnchorGenerator: - """Utility to generate anchors for a single feature map. - - Example: - ```python - anchor_gen = _SingleAnchorGenerator(32, [.5, 1., 2.], stride=16) - anchors = anchor_gen([512, 512, 3]) - ``` - """ - - def __init__(self, - anchor_size, - scales, - aspect_ratios, - stride, - clip_boxes=False): - """Constructs single scale anchor. - - Args: - anchor_size: A single int represents the base anchor size. The anchor - height will be `anchor_size / sqrt(aspect_ratio)`, anchor width will be - `anchor_size * sqrt(aspect_ratio)`. - scales: A list/tuple, or a list/tuple of a list/tuple of positive - floats representing the actual anchor size to the base `anchor_size`. - aspect_ratios: a list/tuple of positive floats representing the ratio of - anchor width to anchor height. - stride: A single int represents the anchor stride size between center of - each anchor. - clip_boxes: Boolean to represent whether the anchor coordinates should be - clipped to the image size. Defaults to `True`. 
- Input shape: the size of the image, `[H, W, C]` - Output shape: the size of anchors, `[(H / stride) * (W / stride), 4]` - """ - self.anchor_size = anchor_size - self.scales = scales - self.aspect_ratios = aspect_ratios - self.stride = stride - self.clip_boxes = clip_boxes - - def __call__(self, image_size): - image_height = tf.cast(image_size[0], tf.float32) - image_width = tf.cast(image_size[1], tf.float32) - - k = len(self.scales) * len(self.aspect_ratios) - aspect_ratios_sqrt = tf.cast(tf.sqrt(self.aspect_ratios), dtype=tf.float32) - anchor_size = tf.cast(self.anchor_size, tf.float32) - - # [K] - anchor_heights = [] - anchor_widths = [] - for scale in self.scales: - anchor_size_t = anchor_size * scale - anchor_height = anchor_size_t / aspect_ratios_sqrt - anchor_width = anchor_size_t * aspect_ratios_sqrt - anchor_heights.append(anchor_height) - anchor_widths.append(anchor_width) - anchor_heights = tf.concat(anchor_heights, axis=0) - anchor_widths = tf.concat(anchor_widths, axis=0) - half_anchor_heights = tf.reshape(0.5 * anchor_heights, [1, 1, k]) - half_anchor_widths = tf.reshape(0.5 * anchor_widths, [1, 1, k]) - - stride = tf.cast(self.stride, tf.float32) - # [W] - cx = tf.range(0.5 * stride, image_width, stride) - # [H] - cy = tf.range(0.5 * stride, image_height, stride) - # [H, W] - cx_grid, cy_grid = tf.meshgrid(cx, cy) - # [H, W, 1] - cx_grid = tf.expand_dims(cx_grid, axis=-1) - cy_grid = tf.expand_dims(cy_grid, axis=-1) - - # [H, W, K, 1] - y_min = tf.expand_dims(cy_grid - half_anchor_heights, axis=-1) - y_max = tf.expand_dims(cy_grid + half_anchor_heights, axis=-1) - x_min = tf.expand_dims(cx_grid - half_anchor_widths, axis=-1) - x_max = tf.expand_dims(cx_grid + half_anchor_widths, axis=-1) - - if self.clip_boxes: - y_min = tf.maximum(tf.minimum(y_min, image_height), 0.) - y_max = tf.maximum(tf.minimum(y_max, image_height), 0.) - x_min = tf.maximum(tf.minimum(x_min, image_width), 0.) - x_max = tf.maximum(tf.minimum(x_max, image_width), 0.) - - # [H, W, K, 4] - result = tf.concat([y_min, x_min, y_max, x_max], axis=-1) - shape = result.shape.as_list() - # [H, W, K * 4] - return tf.reshape(result, [shape[0], shape[1], shape[2] * shape[3]]) - - -class AnchorGenerator(): - """Utility to generate anchors for a multiple feature maps. - - Example: - ```python - anchor_gen = AnchorGenerator([32, 64], [.5, 1., 2.], - strides=[16, 32]) - anchors = anchor_gen([512, 512, 3]) - ``` - - """ - - def __init__(self, - anchor_sizes, - scales, - aspect_ratios, - strides, - clip_boxes=False): - """Constructs multiscale anchors. - - Args: - anchor_sizes: A list of int represents the anchor size for each scale. The - anchor height will be `anchor_size / sqrt(aspect_ratio)`, anchor width - will be `anchor_size * sqrt(aspect_ratio)` for each scale. - scales: A list/tuple, or a list/tuple of a list/tuple of positive - floats representing the actual anchor size to the base `anchor_size`. - aspect_ratios: A list/tuple, or a list/tuple of a list/tuple of positive - floats representing the ratio of anchor width to anchor height. - strides: A list/tuple of ints represent the anchor stride size between - center of anchors at each scale. - clip_boxes: Boolean to represents whether the anchor coordinates should be - clipped to the image size. Defaults to `False`. - Input shape: the size of the image, `[H, W, C]` - Output shape: the size of anchors concat on each level, `[(H / - strides) * (W / strides), K * 4]` - """ - # aspect_ratio is a single list that is the same across all levels. 
- aspect_ratios = maybe_map_structure_for_anchor(aspect_ratios, anchor_sizes) - scales = maybe_map_structure_for_anchor(scales, anchor_sizes) - if isinstance(anchor_sizes, dict): - self.anchor_generators = {} - for k in anchor_sizes.keys(): - self.anchor_generators[k] = _SingleAnchorGenerator( - anchor_sizes[k], scales[k], aspect_ratios[k], strides[k], - clip_boxes) - elif isinstance(anchor_sizes, (list, tuple)): - self.anchor_generators = [] - for anchor_size, scale_list, ar_list, stride in zip( - anchor_sizes, scales, aspect_ratios, strides): - self.anchor_generators.append( - _SingleAnchorGenerator(anchor_size, scale_list, ar_list, stride, - clip_boxes)) - - def __call__(self, image_size): - anchor_generators = tf.nest.flatten(self.anchor_generators) - results = [anchor_gen(image_size) for anchor_gen in anchor_generators] - return tf.nest.pack_sequence_as(self.anchor_generators, results) - - -def maybe_map_structure_for_anchor(params, anchor_sizes): - """broadcast the params to match anchor_sizes.""" - if all(isinstance(param, (int, float)) for param in params): - if isinstance(anchor_sizes, (tuple, list)): - return [params] * len(anchor_sizes) - elif isinstance(anchor_sizes, dict): - return tf.nest.map_structure(lambda _: params, anchor_sizes) - else: - raise ValueError("the structure of `anchor_sizes` must be a tuple, " - "list, or dict, given {}".format(anchor_sizes)) - else: - return params diff --git a/official/vision/beta/ops/anchor_generator_test.py b/official/vision/beta/ops/anchor_generator_test.py deleted file mode 100644 index c7d89e586..000000000 --- a/official/vision/beta/ops/anchor_generator_test.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for anchor_generator.py.""" - -from absl.testing import parameterized -import tensorflow as tf -from official.vision.beta.ops import anchor_generator - - -class AnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - # Single scale anchor. - (5, [1.0], [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]], - [[16., -16., 80., 48.], [16., 16., 80., 80.]]]), - # # Multi aspect ratio anchor. - (6, [1.0, 4.0, 0.25], - [[[-32., -32., 96., 96., 0., -96., 64., 160., -96., 0., 160., 64.]]]), - ) - def testAnchorGeneration(self, level, aspect_ratios, expected_boxes): - image_size = [64, 64] - anchor_size = 2**(level + 1) - stride = 2**level - anchor_gen = anchor_generator._SingleAnchorGenerator( - anchor_size=anchor_size, - scales=[1.], - aspect_ratios=aspect_ratios, - stride=stride, - clip_boxes=False) - anchors = anchor_gen(image_size).numpy() - self.assertAllClose(expected_boxes, anchors) - - @parameterized.parameters( - # Single scale anchor. - (5, [1.0], [[[0., 0., 48., 48.], [0., 16., 48., 64.]], - [[16., 0., 64., 48.], [16., 16., 64., 64.]]]), - # # Multi aspect ratio anchor. 
- (6, [1.0, 4.0, 0.25 - ], [[[0., 0., 64., 64., 0., 0., 64., 64., 0., 0., 64., 64.]]]), - ) - def testAnchorGenerationClipped(self, level, aspect_ratios, expected_boxes): - image_size = [64, 64] - anchor_size = 2**(level + 1) - stride = 2**level - anchor_gen = anchor_generator._SingleAnchorGenerator( - anchor_size=anchor_size, - scales=[1.], - aspect_ratios=aspect_ratios, - stride=stride, - clip_boxes=True) - anchors = anchor_gen(image_size).numpy() - self.assertAllClose(expected_boxes, anchors) - - -class MultiScaleAnchorGeneratorTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - # Multi scale anchor. - (5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80], - [16, -16, 80, 48], [16, 16, 80, 80], - [-32, -32, 96, 96]]),) - def testAnchorGeneration(self, min_level, max_level, aspect_ratios, - expected_boxes): - image_size = [64, 64] - levels = range(min_level, max_level + 1) - anchor_sizes = [2**(level + 1) for level in levels] - strides = [2**level for level in levels] - anchor_gen = anchor_generator.AnchorGenerator( - anchor_sizes=anchor_sizes, - scales=[1.], - aspect_ratios=aspect_ratios, - strides=strides) - anchors = anchor_gen(image_size) - anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors] - anchors = tf.concat(anchors, axis=0).numpy() - self.assertAllClose(expected_boxes, anchors) - - @parameterized.parameters( - # Multi scale anchor. - (5, 6, [[1.0], [1.0]], [[-16, -16, 48, 48], [-16, 16, 48, 80], - [16, -16, 80, 48], [16, 16, 80, 80], - [-32, -32, 96, 96]]),) - def testAnchorGenerationClipped(self, min_level, max_level, aspect_ratios, - expected_boxes): - image_size = [64, 64] - levels = range(min_level, max_level + 1) - anchor_sizes = [2**(level + 1) for level in levels] - strides = [2**level for level in levels] - anchor_gen = anchor_generator.AnchorGenerator( - anchor_sizes=anchor_sizes, - scales=[1.], - aspect_ratios=aspect_ratios, - strides=strides, - clip_boxes=False) - anchors = anchor_gen(image_size) - anchors = [tf.reshape(anchor, [-1, 4]) for anchor in anchors] - anchors = tf.concat(anchors, axis=0).numpy() - self.assertAllClose(expected_boxes, anchors) - - @parameterized.parameters( - # Multi scale anchor. - (5, 6, [1.0], { - '5': [[[-16., -16., 48., 48.], [-16., 16., 48., 80.]], - [[16., -16., 80., 48.], [16., 16., 80., 80.]]], - '6': [[[-32, -32, 96, 96]]] - }),) - def testAnchorGenerationDict(self, min_level, max_level, aspect_ratios, - expected_boxes): - image_size = [64, 64] - levels = range(min_level, max_level + 1) - anchor_sizes = dict((str(level), 2**(level + 1)) for level in levels) - strides = dict((str(level), 2**level) for level in levels) - anchor_gen = anchor_generator.AnchorGenerator( - anchor_sizes=anchor_sizes, - scales=[1.], - aspect_ratios=aspect_ratios, - strides=strides, - clip_boxes=False) - anchors = anchor_gen(image_size) - for k in expected_boxes.keys(): - self.assertAllClose(expected_boxes[k], anchors[k].numpy()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/ops/anchor_test.py b/official/vision/beta/ops/anchor_test.py deleted file mode 100644 index 9d55cbcff..000000000 --- a/official/vision/beta/ops/anchor_test.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for anchor.py.""" - -# Import libraries -from absl.testing import parameterized -import numpy as np -import tensorflow as tf -from official.vision.beta.ops import anchor - - -class AnchorTest(parameterized.TestCase, tf.test.TestCase): - - # The set of parameters are tailored for the MLPerf configuration, where - # the number of anchors is 495132, rpn_batch_size_per_im=256, and - # rpn_fg_fraction=0.5. - @parameterized.parameters( - (512, 25, 25, 25, 25, (512, 512)), - (512, 25, 25, 25, 25, (512, 640)), - (512, 25, 25, 25, 25, (640, 512)), - (495132, 100, 100, 100, 100, (512, 512)), - (495132, 200, 100, 128, 100, (512, 512)), - (495132, 100, 120, 100, 120, (512, 512)), - (495132, 100, 200, 100, 156, (512, 512)), - (495132, 200, 200, 128, 128, (512, 512)), - ) - def testAnchorRpnSample(self, num_anchors, num_positives, - num_negatives, expected_positives, - expected_negatives, image_size): - match_results_np = np.empty([num_anchors]) - match_results_np.fill(-2) - match_results_np[:num_positives] = 0 - match_results_np[num_positives:num_positives + num_negatives] = -1 - match_results = tf.convert_to_tensor(value=match_results_np, dtype=tf.int32) - anchor_labeler = anchor.RpnAnchorLabeler( - match_threshold=0.7, - unmatched_threshold=0.3, - rpn_batch_size_per_im=256, - rpn_fg_fraction=0.5) - rpn_sample_op = anchor_labeler._get_rpn_samples(match_results) - labels = [v.numpy() for v in rpn_sample_op] - self.assertLen(labels[0], num_anchors) - positives = np.sum(np.array(labels[0]) == 1) - negatives = np.sum(np.array(labels[0]) == 0) - self.assertEqual(positives, expected_positives) - self.assertEqual(negatives, expected_negatives) - - @parameterized.parameters( - # Single scale anchor. - (5, 5, 1, [1.0], 2.0, - [[-16, -16, 48, 48], [-16, 16, 48, 80], - [16, -16, 80, 48], [16, 16, 80, 80]]), - # Multi scale anchor. - (5, 6, 1, [1.0], 2.0, - [[-16, -16, 48, 48], [-16, 16, 48, 80], - [16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]), - # # Multi aspect ratio anchor. - (6, 6, 1, [1.0, 4.0, 0.25], 2.0, - [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]), - - ) - def testAnchorGeneration(self, min_level, max_level, num_scales, - aspect_ratios, anchor_size, expected_boxes): - image_size = [64, 64] - anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios, - anchor_size, image_size) - boxes = anchors.boxes.numpy() - self.assertEqual(expected_boxes, boxes.tolist()) - - @parameterized.parameters( - # Single scale anchor. - (5, 5, 1, [1.0], 2.0, - [[-16, -16, 48, 48], [-16, 16, 48, 80], - [16, -16, 80, 48], [16, 16, 80, 80]]), - # Multi scale anchor. - (5, 6, 1, [1.0], 2.0, - [[-16, -16, 48, 48], [-16, 16, 48, 80], - [16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]), - # # Multi aspect ratio anchor. 
- (6, 6, 1, [1.0, 4.0, 0.25], 2.0, - [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]), - - ) - def testAnchorGenerationWithImageSizeAsTensor(self, - min_level, - max_level, - num_scales, - aspect_ratios, - anchor_size, - expected_boxes): - image_size = tf.constant([64, 64], tf.int32) - anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios, - anchor_size, image_size) - boxes = anchors.boxes.numpy() - self.assertEqual(expected_boxes, boxes.tolist()) - - @parameterized.parameters( - (3, 6, 2, [1.0], 2.0, False), - (3, 6, 2, [1.0], 2.0, True), - ) - def testLabelAnchors(self, min_level, max_level, num_scales, aspect_ratios, - anchor_size, has_attribute): - input_size = [512, 512] - ground_truth_class_id = 2 - attribute_name = 'depth' - ground_truth_depth = 3.0 - - # The matched anchors are the anchors used as ground truth and the anchors - # at the next octave scale on the same location. - expected_anchor_locations = [[0, 0, 0], [0, 0, 1]] - anchor_gen = anchor.build_anchor_generator(min_level, max_level, num_scales, - aspect_ratios, anchor_size) - anchor_boxes = anchor_gen(input_size) - anchor_labeler = anchor.AnchorLabeler() - - # Uses the first anchors as ground truth. The ground truth should map to - # two anchors with two intermediate scales at the same location. - gt_boxes = anchor_boxes['3'][0:1, 0, 0:4] - gt_classes = tf.constant([[ground_truth_class_id]], dtype=tf.float32) - gt_attributes = { - attribute_name: tf.constant([[ground_truth_depth]], dtype=tf.float32) - } if has_attribute else {} - - (cls_targets, box_targets, att_targets, _, - box_weights) = anchor_labeler.label_anchors(anchor_boxes, gt_boxes, - gt_classes, gt_attributes) - - for k, v in cls_targets.items(): - cls_targets[k] = v.numpy() - for k, v in box_targets.items(): - box_targets[k] = v.numpy() - box_weights = box_weights.numpy() - - anchor_locations = np.vstack( - np.where(cls_targets[str(min_level)] > -1)).transpose() - self.assertAllClose(expected_anchor_locations, anchor_locations) - # Two anchor boxes on min_level got matched to the gt_boxes. - self.assertAllClose(tf.reduce_sum(box_weights), 2) - - if has_attribute: - self.assertIn(attribute_name, att_targets) - for k, v in att_targets[attribute_name].items(): - att_targets[attribute_name][k] = v.numpy() - anchor_locations = np.vstack( - np.where( - att_targets[attribute_name][str(min_level)] > 0.0)).transpose() - self.assertAllClose(expected_anchor_locations, anchor_locations) - else: - self.assertEmpty(att_targets) - - @parameterized.parameters( - (3, 7, [.5, 1., 2.], 2, 8, (256, 256)), - (3, 8, [1.], 3, 32, (512, 512)), - (3, 3, [1.], 2, 4, (32, 32)), - ) - def testEquivalentResult(self, min_level, max_level, aspect_ratios, - num_scales, anchor_size, image_size): - anchor_gen = anchor.build_anchor_generator( - min_level=min_level, - max_level=max_level, - num_scales=num_scales, - aspect_ratios=aspect_ratios, - anchor_size=anchor_size) - anchors = anchor_gen(image_size) - expected_anchor_gen = anchor.Anchor(min_level, max_level, num_scales, - aspect_ratios, anchor_size, image_size) - - expected_anchors = expected_anchor_gen.multilevel_boxes - for k in expected_anchors.keys(): - self.assertAllClose(expected_anchors[k], anchors[k]) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/ops/augment.py b/official/vision/beta/ops/augment.py deleted file mode 100644 index 69894ebcf..000000000 --- a/official/vision/beta/ops/augment.py +++ /dev/null @@ -1,2320 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Augmentation policies for enhanced image/video preprocessing. - -AutoAugment Reference: - - AutoAugment Reference: https://arxiv.org/abs/1805.09501 - - AutoAugment for Object Detection Reference: https://arxiv.org/abs/1906.11172 -RandAugment Reference: https://arxiv.org/abs/1909.13719 -RandomErasing Reference: https://arxiv.org/abs/1708.04896 -MixupAndCutmix: - - Mixup: https://arxiv.org/abs/1710.09412 - - Cutmix: https://arxiv.org/abs/1905.04899 - -RandomErasing, Mixup and Cutmix are inspired by -https://github.com/rwightman/pytorch-image-models - -""" -import inspect -import math -from typing import Any, List, Iterable, Optional, Text, Tuple - -from keras.layers.preprocessing import image_preprocessing as image_ops -import numpy as np -import tensorflow as tf - - -# This signifies the max integer that the controller RNN could predict for the -# augmentation scheme. -_MAX_LEVEL = 10. - - -def to_4d(image: tf.Tensor) -> tf.Tensor: - """Converts an input Tensor to 4 dimensions. - - 4D image => [N, H, W, C] or [N, C, H, W] - 3D image => [1, H, W, C] or [1, C, H, W] - 2D image => [1, H, W, 1] - - Args: - image: The 2/3/4D input tensor. - - Returns: - A 4D image tensor. - - Raises: - `TypeError` if `image` is not a 2/3/4D tensor. - - """ - shape = tf.shape(image) - original_rank = tf.rank(image) - left_pad = tf.cast(tf.less_equal(original_rank, 3), dtype=tf.int32) - right_pad = tf.cast(tf.equal(original_rank, 2), dtype=tf.int32) - new_shape = tf.concat( - [ - tf.ones(shape=left_pad, dtype=tf.int32), - shape, - tf.ones(shape=right_pad, dtype=tf.int32), - ], - axis=0, - ) - return tf.reshape(image, new_shape) - - -def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor: - """Converts a 4D image back to `ndims` rank.""" - shape = tf.shape(image) - begin = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32) - end = 4 - tf.cast(tf.equal(ndims, 2), dtype=tf.int32) - new_shape = shape[begin:end] - return tf.reshape(image, new_shape) - - -def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor: - """Converts translations to a projective transform. - - The translation matrix looks like this: - [[1 0 -dx] - [0 1 -dy] - [0 0 1]] - - Args: - translations: The 2-element list representing [dx, dy], or a matrix of - 2-element lists representing [dx dy] to translate for each image. The - shape must be static. - - Returns: - The transformation matrix of shape (num_images, 8). - - Raises: - `TypeError` if - - the shape of `translations` is not known or - - the shape of `translations` is not rank 1 or 2. 
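A quick round trip through `to_4d`/`from_4d` above makes the rank bookkeeping concrete. This assumes the two helpers are in scope and uses an arbitrary 3-D image shape; it is only a sanity check, not part of the change.

```python
import tensorflow as tf

image = tf.zeros([32, 32, 3], tf.uint8)
ndims = tf.rank(image)

image_4d = to_4d(image)              # rank padded on the left -> [1, 32, 32, 3]
restored = from_4d(image_4d, ndims)  # original rank restored  -> [32, 32, 3]
print(image_4d.shape, restored.shape)  # (1, 32, 32, 3) (32, 32, 3)
```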
- - """ - translations = tf.convert_to_tensor(translations, dtype=tf.float32) - if translations.get_shape().ndims is None: - raise TypeError('translations rank must be statically known') - elif len(translations.get_shape()) == 1: - translations = translations[None] - elif len(translations.get_shape()) != 2: - raise TypeError('translations should have rank 1 or 2.') - num_translations = tf.shape(translations)[0] - - return tf.concat( - values=[ - tf.ones((num_translations, 1), tf.dtypes.float32), - tf.zeros((num_translations, 1), tf.dtypes.float32), - -translations[:, 0, None], - tf.zeros((num_translations, 1), tf.dtypes.float32), - tf.ones((num_translations, 1), tf.dtypes.float32), - -translations[:, 1, None], - tf.zeros((num_translations, 2), tf.dtypes.float32), - ], - axis=1, - ) - - -def _convert_angles_to_transform(angles: tf.Tensor, image_width: tf.Tensor, - image_height: tf.Tensor) -> tf.Tensor: - """Converts an angle or angles to a projective transform. - - Args: - angles: A scalar to rotate all images, or a vector to rotate a batch of - images. This must be a scalar. - image_width: The width of the image(s) to be transformed. - image_height: The height of the image(s) to be transformed. - - Returns: - A tensor of shape (num_images, 8). - - Raises: - `TypeError` if `angles` is not rank 0 or 1. - - """ - angles = tf.convert_to_tensor(angles, dtype=tf.float32) - if len(angles.get_shape()) == 0: # pylint:disable=g-explicit-length-test - angles = angles[None] - elif len(angles.get_shape()) != 1: - raise TypeError('Angles should have a rank 0 or 1.') - x_offset = ((image_width - 1) - - (tf.math.cos(angles) * (image_width - 1) - tf.math.sin(angles) * - (image_height - 1))) / 2.0 - y_offset = ((image_height - 1) - - (tf.math.sin(angles) * (image_width - 1) + tf.math.cos(angles) * - (image_height - 1))) / 2.0 - num_angles = tf.shape(angles)[0] - return tf.concat( - values=[ - tf.math.cos(angles)[:, None], - -tf.math.sin(angles)[:, None], - x_offset[:, None], - tf.math.sin(angles)[:, None], - tf.math.cos(angles)[:, None], - y_offset[:, None], - tf.zeros((num_angles, 2), tf.dtypes.float32), - ], - axis=1, - ) - - -def transform(image: tf.Tensor, transforms) -> tf.Tensor: - """Prepares input data for `image_ops.transform`.""" - original_ndims = tf.rank(image) - transforms = tf.convert_to_tensor(transforms, dtype=tf.float32) - if transforms.shape.rank == 1: - transforms = transforms[None] - image = to_4d(image) - image = image_ops.transform( - images=image, transforms=transforms, interpolation='nearest') - return from_4d(image, original_ndims) - - -def translate(image: tf.Tensor, translations) -> tf.Tensor: - """Translates image(s) by provided vectors. - - Args: - image: An image Tensor of type uint8. - translations: A vector or matrix representing [dx dy]. - - Returns: - The translated version of the image. - - """ - transforms = _convert_translation_to_transform(translations) - return transform(image, transforms=transforms) - - -def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor: - """Rotates the image by degrees either clockwise or counterclockwise. - - Args: - image: An image Tensor of type uint8. - degrees: Float, a scalar angle in degrees to rotate all images by. If - degrees is positive the image will be rotated clockwise otherwise it will - be rotated counterclockwise. - - Returns: - The rotated version of image. - - """ - # Convert from degrees to radians. 
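The flattened 8-element transform built above follows the usual output-to-input sampling convention, which is why the translation entries are negated. A small NumPy check with an assumed shift of `[dx, dy] = [5, 3]` (values illustrative):

```python
import numpy as np

dx, dy = 5.0, 3.0
t = np.array([1., 0., -dx, 0., 1., -dy, 0., 0.])

def apply_transform(t, x, y):
    # Output pixel (x, y) samples input pixel (x', y'); the negative offsets
    # therefore shift the visible content in the positive direction.
    k = t[6] * x + t[7] * y + 1.0
    return (t[0] * x + t[1] * y + t[2]) / k, (t[3] * x + t[4] * y + t[5]) / k

print(apply_transform(t, 10.0, 10.0))  # (5.0, 7.0): this output pixel reads from up-left
```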
- degrees_to_radians = math.pi / 180.0 - radians = tf.cast(degrees * degrees_to_radians, tf.float32) - - original_ndims = tf.rank(image) - image = to_4d(image) - - image_height = tf.cast(tf.shape(image)[1], tf.float32) - image_width = tf.cast(tf.shape(image)[2], tf.float32) - transforms = _convert_angles_to_transform( - angles=radians, image_width=image_width, image_height=image_height) - # In practice, we should randomize the rotation degrees by flipping - # it negatively half the time, but that's done on 'degrees' outside - # of the function. - image = transform(image, transforms=transforms) - return from_4d(image, original_ndims) - - -def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor: - """Blend image1 and image2 using 'factor'. - - Factor can be above 0.0. A value of 0.0 means only image1 is used. - A value of 1.0 means only image2 is used. A value between 0.0 and - 1.0 means we linearly interpolate the pixel values between the two - images. A value greater than 1.0 "extrapolates" the difference - between the two pixel values, and we clip the results to values - between 0 and 255. - - Args: - image1: An image Tensor of type uint8. - image2: An image Tensor of type uint8. - factor: A floating point value above 0.0. - - Returns: - A blended image Tensor of type uint8. - """ - if factor == 0.0: - return tf.convert_to_tensor(image1) - if factor == 1.0: - return tf.convert_to_tensor(image2) - - image1 = tf.cast(image1, tf.float32) - image2 = tf.cast(image2, tf.float32) - - difference = image2 - image1 - scaled = factor * difference - - # Do addition in float. - temp = tf.cast(image1, tf.float32) + scaled - - # Interpolate - if factor > 0.0 and factor < 1.0: - # Interpolation means we always stay within 0 and 255. - return tf.cast(temp, tf.uint8) - - # Extrapolate: - # - # We need to clip and then cast. - return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8) - - -def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor: - """Apply cutout (https://arxiv.org/abs/1708.04552) to image. - - This operation applies a (2*pad_size x 2*pad_size) mask of zeros to - a random location within `image`. The pixel values filled in will be of the - value `replace`. The location where the mask will be applied is randomly - chosen uniformly over the whole image. - - Args: - image: An image Tensor of type uint8. - pad_size: Specifies how big the zero mask that will be generated is that is - applied to the image. The mask will be of size (2*pad_size x 2*pad_size). - replace: What pixel value to fill in the image in the area that has the - cutout mask applied to it. - - Returns: - An image Tensor that is of type uint8. - """ - if image.shape.rank not in [3, 4]: - raise ValueError('Bad image rank: {}'.format(image.shape.rank)) - - if image.shape.rank == 4: - return cutout_video(image, replace=replace) - - image_height = tf.shape(image)[0] - image_width = tf.shape(image)[1] - - # Sample the center location in the image where the zero mask will be applied. 
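`blend()` above is plain linear interpolation in float, with a clip only when extrapolating. A toy check, assuming the helper is in scope (constant images and factors are illustrative):

```python
import tensorflow as tf

image1 = tf.fill([2, 2, 3], tf.constant(100, tf.uint8))
image2 = tf.fill([2, 2, 3], tf.constant(200, tf.uint8))

half = blend(image1, image2, 0.5)           # 100 + 0.5 * (200 - 100) = 150
extrapolated = blend(image1, image2, 1.5)   # 250, clipped to [0, 255]
print(half[0, 0, 0].numpy(), extrapolated[0, 0, 0].numpy())  # 150 250
```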
- cutout_center_height = tf.random.uniform( - shape=[], minval=0, maxval=image_height, dtype=tf.int32) - - cutout_center_width = tf.random.uniform( - shape=[], minval=0, maxval=image_width, dtype=tf.int32) - - image = _fill_rectangle(image, cutout_center_width, cutout_center_height, - pad_size, pad_size, replace) - - return image - - -def _fill_rectangle(image, - center_width, - center_height, - half_width, - half_height, - replace=None): - """Fill blank area.""" - image_height = tf.shape(image)[0] - image_width = tf.shape(image)[1] - - lower_pad = tf.maximum(0, center_height - half_height) - upper_pad = tf.maximum(0, image_height - center_height - half_height) - left_pad = tf.maximum(0, center_width - half_width) - right_pad = tf.maximum(0, image_width - center_width - half_width) - - cutout_shape = [ - image_height - (lower_pad + upper_pad), - image_width - (left_pad + right_pad) - ] - padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] - mask = tf.pad( - tf.zeros(cutout_shape, dtype=image.dtype), - padding_dims, - constant_values=1) - mask = tf.expand_dims(mask, -1) - mask = tf.tile(mask, [1, 1, 3]) - - if replace is None: - fill = tf.random.normal(tf.shape(image), dtype=image.dtype) - elif isinstance(replace, tf.Tensor): - fill = replace - else: - fill = tf.ones_like(image, dtype=image.dtype) * replace - image = tf.where(tf.equal(mask, 0), fill, image) - - return image - - -def cutout_video(image: tf.Tensor, replace: int = 0) -> tf.Tensor: - """Apply cutout (https://arxiv.org/abs/1708.04552) to a video. - - This operation applies a random size 3D mask of zeros to a random location - within `image`. The mask is padded The pixel values filled in will be of the - value `replace`. The location where the mask will be applied is randomly - chosen uniformly over the whole image. The size of the mask is randomly - sampled uniformly from [0.25*height, 0.5*height], [0.25*width, 0.5*width], - and [1, 0.25*depth], which represent the height, width, and number of frames - of the input video tensor respectively. - - Args: - image: A video Tensor of type uint8. - replace: What pixel value to fill in the image in the area that has the - cutout mask applied to it. - - Returns: - An video Tensor that is of type uint8. - """ - image_depth = tf.shape(image)[0] - image_height = tf.shape(image)[1] - image_width = tf.shape(image)[2] - - # Sample the center location in the image where the zero mask will be applied. 
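A toy run of `_fill_rectangle` above shows the padding-based mask: on a 6x6 image, a 2x2 patch centred at row 3, column 3 is overwritten with the replace value. Sizes and pixel values are illustrative, and the helper is assumed to be in scope.

```python
import tensorflow as tf

image = tf.ones([6, 6, 3], tf.uint8) * 7
patched = _fill_rectangle(image, center_width=3, center_height=3,
                          half_width=1, half_height=1, replace=0)
print(patched[:, :, 0].numpy())
# Rows and columns 2..3 (0-indexed) are zeroed; every other pixel keeps the value 7.
```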
- cutout_center_height = tf.random.uniform( - shape=[], minval=0, maxval=image_height, dtype=tf.int32) - - cutout_center_width = tf.random.uniform( - shape=[], minval=0, maxval=image_width, dtype=tf.int32) - - cutout_center_depth = tf.random.uniform( - shape=[], minval=0, maxval=image_depth, dtype=tf.int32) - - pad_size_height = tf.random.uniform( - shape=[], - minval=tf.maximum(1, tf.cast(image_height / 4, tf.int32)), - maxval=tf.maximum(2, tf.cast(image_height / 2, tf.int32)), - dtype=tf.int32) - pad_size_width = tf.random.uniform( - shape=[], - minval=tf.maximum(1, tf.cast(image_width / 4, tf.int32)), - maxval=tf.maximum(2, tf.cast(image_width / 2, tf.int32)), - dtype=tf.int32) - pad_size_depth = tf.random.uniform( - shape=[], - minval=1, - maxval=tf.maximum(2, tf.cast(image_depth / 4, tf.int32)), - dtype=tf.int32) - - lower_pad = tf.maximum(0, cutout_center_height - pad_size_height) - upper_pad = tf.maximum( - 0, image_height - cutout_center_height - pad_size_height) - left_pad = tf.maximum(0, cutout_center_width - pad_size_width) - right_pad = tf.maximum(0, image_width - cutout_center_width - pad_size_width) - back_pad = tf.maximum(0, cutout_center_depth - pad_size_depth) - forward_pad = tf.maximum( - 0, image_depth - cutout_center_depth - pad_size_depth) - - cutout_shape = [ - image_depth - (back_pad + forward_pad), - image_height - (lower_pad + upper_pad), - image_width - (left_pad + right_pad), - ] - padding_dims = [[back_pad, forward_pad], - [lower_pad, upper_pad], - [left_pad, right_pad]] - mask = tf.pad( - tf.zeros(cutout_shape, dtype=image.dtype), - padding_dims, - constant_values=1) - mask = tf.expand_dims(mask, -1) - mask = tf.tile(mask, [1, 1, 1, 3]) - image = tf.where( - tf.equal(mask, 0), - tf.ones_like(image, dtype=image.dtype) * replace, image) - return image - - -def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor: - """Solarize the input image(s).""" - # For each pixel in the image, select the pixel - # if the value is less than the threshold. - # Otherwise, subtract 255 from the pixel. - return tf.where(image < threshold, image, 255 - image) - - -def solarize_add(image: tf.Tensor, - addition: int = 0, - threshold: int = 128) -> tf.Tensor: - """Additive solarize the input image(s).""" - # For each pixel in the image less than threshold - # we add 'addition' amount to it and then clip the - # pixel value to be between 0 and 255. The value - # of 'addition' is between -128 and 128. - added_image = tf.cast(image, tf.int64) + addition - added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8) - return tf.where(image < threshold, added_image, image) - - -def color(image: tf.Tensor, factor: float) -> tf.Tensor: - """Equivalent of PIL Color.""" - degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image)) - return blend(degenerate, image, factor) - - -def contrast(image: tf.Tensor, factor: float) -> tf.Tensor: - """Equivalent of PIL Contrast.""" - degenerate = tf.image.rgb_to_grayscale(image) - # Cast before calling tf.histogram. - degenerate = tf.cast(degenerate, tf.int32) - - # Compute the grayscale histogram, then compute the mean pixel value, - # and create a constant image size of that value. Use that as the - # blending degenerate target of the original image. 
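The thresholding in `solarize`/`solarize_add` above, traced on a handful of pixel values (values are illustrative; both helpers are assumed to be in scope):

```python
import tensorflow as tf

pixels = tf.constant([0, 100, 127, 128, 200, 255], tf.uint8)

print(solarize(pixels, threshold=128).numpy())
# Pixels at or above the threshold are inverted: [  0 100 127 127  55   0]

print(solarize_add(pixels, addition=50, threshold=128).numpy())
# Below-threshold pixels gain 50 (clipped to 255), the rest are untouched:
# [ 50 150 177 128 200 255]
```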
- hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256) - mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0 - degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean - degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) - degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8)) - return blend(degenerate, image, factor) - - -def brightness(image: tf.Tensor, factor: float) -> tf.Tensor: - """Equivalent of PIL Brightness.""" - degenerate = tf.zeros_like(image) - return blend(degenerate, image, factor) - - -def posterize(image: tf.Tensor, bits: int) -> tf.Tensor: - """Equivalent of PIL Posterize.""" - shift = 8 - bits - return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift) - - -def wrapped_rotate(image: tf.Tensor, degrees: float, replace: int) -> tf.Tensor: - """Applies rotation with wrap/unwrap.""" - image = rotate(wrap(image), degrees=degrees) - return unwrap(image, replace) - - -def translate_x(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor: - """Equivalent of PIL Translate in X dimension.""" - image = translate(wrap(image), [-pixels, 0]) - return unwrap(image, replace) - - -def translate_y(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor: - """Equivalent of PIL Translate in Y dimension.""" - image = translate(wrap(image), [0, -pixels]) - return unwrap(image, replace) - - -def shear_x(image: tf.Tensor, level: float, replace: int) -> tf.Tensor: - """Equivalent of PIL Shearing in X dimension.""" - # Shear parallel to x axis is a projective transform - # with a matrix form of: - # [1 level - # 0 1]. - image = transform( - image=wrap(image), transforms=[1., level, 0., 0., 1., 0., 0., 0.]) - return unwrap(image, replace) - - -def shear_y(image: tf.Tensor, level: float, replace: int) -> tf.Tensor: - """Equivalent of PIL Shearing in Y dimension.""" - # Shear parallel to y axis is a projective transform - # with a matrix form of: - # [1 0 - # level 1]. - image = transform( - image=wrap(image), transforms=[1., 0., 0., level, 1., 0., 0., 0.]) - return unwrap(image, replace) - - -def autocontrast(image: tf.Tensor) -> tf.Tensor: - """Implements Autocontrast function from PIL using TF ops. - - Args: - image: A 3D uint8 tensor. - - Returns: - The image after it has had autocontrast applied to it and will be of type - uint8. - """ - - def scale_channel(image: tf.Tensor) -> tf.Tensor: - """Scale the 2D image using the autocontrast rule.""" - # A possibly cheaper version can be done using cumsum/unique_with_counts - # over the histogram values, rather than iterating over the entire image. - # to compute mins and maxes. - lo = tf.cast(tf.reduce_min(image), tf.float32) - hi = tf.cast(tf.reduce_max(image), tf.float32) - - # Scale the image, making the lowest value 0 and the highest value 255. - def scale_values(im): - scale = 255.0 / (hi - lo) - offset = -lo * scale - im = tf.cast(im, tf.float32) * scale + offset - im = tf.clip_by_value(im, 0.0, 255.0) - return tf.cast(im, tf.uint8) - - result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image) - return result - - # Assumes RGB for now. Scales each channel independently - # and then stacks the result. 
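`posterize()` above is a plain bit mask: shifting right and back left zeroes the low `8 - bits` bits of each pixel. A toy check with illustrative values:

```python
import tensorflow as tf

pixels = tf.constant([0, 37, 100, 200, 255], tf.uint8)
print(posterize(pixels, bits=4).numpy())  # [  0  32  96 192 240]
```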
- s1 = scale_channel(image[..., 0]) - s2 = scale_channel(image[..., 1]) - s3 = scale_channel(image[..., 2]) - image = tf.stack([s1, s2, s3], -1) - - return image - - -def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor: - """Implements Sharpness function from PIL using TF ops.""" - orig_image = image - image = tf.cast(image, tf.float32) - # Make image 4D for conv operation. - image = tf.expand_dims(image, 0) - # SMOOTH PIL Kernel. - if orig_image.shape.rank == 3: - kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]], - dtype=tf.float32, - shape=[3, 3, 1, 1]) / 13. - # Tile across channel dimension. - kernel = tf.tile(kernel, [1, 1, 3, 1]) - strides = [1, 1, 1, 1] - degenerate = tf.nn.depthwise_conv2d( - image, kernel, strides, padding='VALID', dilations=[1, 1]) - elif orig_image.shape.rank == 4: - kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]], - dtype=tf.float32, - shape=[1, 3, 3, 1, 1]) / 13. - strides = [1, 1, 1, 1, 1] - # Run the kernel across each channel - channels = tf.split(image, 3, axis=-1) - degenerates = [ - tf.nn.conv3d(channel, kernel, strides, padding='VALID', - dilations=[1, 1, 1, 1, 1]) - for channel in channels - ] - degenerate = tf.concat(degenerates, -1) - else: - raise ValueError('Bad image rank: {}'.format(image.shape.rank)) - degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) - degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0]) - - # For the borders of the resulting image, fill in the values of the - # original image. - mask = tf.ones_like(degenerate) - paddings = [[0, 0]] * (orig_image.shape.rank - 3) - padded_mask = tf.pad(mask, paddings + [[1, 1], [1, 1], [0, 0]]) - padded_degenerate = tf.pad(degenerate, paddings + [[1, 1], [1, 1], [0, 0]]) - result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image) - - # Blend the final result. - return blend(result, orig_image, factor) - - -def equalize(image: tf.Tensor) -> tf.Tensor: - """Implements Equalize function from PIL using TF ops.""" - - def scale_channel(im, c): - """Scale the data in the channel to implement equalize.""" - im = tf.cast(im[..., c], tf.int32) - # Compute the histogram of the image channel. - histo = tf.histogram_fixed_width(im, [0, 255], nbins=256) - - # For the purposes of computing the step, filter out the nonzeros. - nonzero = tf.where(tf.not_equal(histo, 0)) - nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1]) - step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255 - - def build_lut(histo, step): - # Compute the cumulative sum, shifting by step // 2 - # and then normalization by step. - lut = (tf.cumsum(histo) + (step // 2)) // step - # Shift lut, prepending with 0. - lut = tf.concat([[0], lut[:-1]], 0) - # Clip the counts to be in range. This is done - # in the C code for image.point. - return tf.clip_by_value(lut, 0, 255) - - # If step is zero, return the original image. Otherwise, build - # lut from the full histogram and step and then index from it. - result = tf.cond( - tf.equal(step, 0), lambda: im, - lambda: tf.gather(build_lut(histo, step), im)) - - return tf.cast(result, tf.uint8) - - # Assumes RGB for now. Scales each channel independently - # and then stacks the result. 
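The look-up table that `build_lut` above produces can be traced on a toy histogram. The sketch below repeats the same cumsum arithmetic standalone for a channel that uses only the gray levels 10 and 20 (256 pixels each); the histogram is illustrative.

```python
import tensorflow as tf

# 256-bin histogram with mass only at gray levels 10 and 20.
histo = tf.scatter_nd([[10], [20]], [256, 256], [256])

nonzero = tf.reshape(tf.gather(histo, tf.where(histo != 0)), [-1])
step = (tf.reduce_sum(nonzero) - nonzero[-1]) // 255   # (512 - 256) // 255 = 1

lut = (tf.cumsum(histo) + step // 2) // step
lut = tf.clip_by_value(tf.concat([[0], lut[:-1]], 0), 0, 255)
print(lut[10].numpy(), lut[20].numpy())  # 0 255: the two used levels are spread apart
```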
- s1 = scale_channel(image, 0) - s2 = scale_channel(image, 1) - s3 = scale_channel(image, 2) - image = tf.stack([s1, s2, s3], -1) - return image - - -def invert(image: tf.Tensor) -> tf.Tensor: - """Inverts the image pixels.""" - image = tf.convert_to_tensor(image) - return 255 - image - - -def wrap(image: tf.Tensor) -> tf.Tensor: - """Returns 'image' with an extra channel set to all 1s.""" - shape = tf.shape(image) - extended_channel = tf.expand_dims(tf.ones(shape[:-1], image.dtype), -1) - extended = tf.concat([image, extended_channel], axis=-1) - return extended - - -def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor: - """Unwraps an image produced by wrap. - - Where there is a 0 in the last channel for every spatial position, - the rest of the three channels in that spatial dimension are grayed - (set to 128). Operations like translate and shear on a wrapped - Tensor will leave 0s in empty locations. Some transformations look - at the intensity of values to do preprocessing, and we want these - empty pixels to assume the 'average' value, rather than pure black. - - - Args: - image: A 3D Image Tensor with 4 channels. - replace: A one or three value 1D tensor to fill empty pixels. - - Returns: - image: A 3D image Tensor with 3 channels. - """ - image_shape = tf.shape(image) - # Flatten the spatial dimensions. - flattened_image = tf.reshape(image, [-1, image_shape[-1]]) - - # Find all pixels where the last channel is zero. - alpha_channel = tf.expand_dims(flattened_image[..., 3], axis=-1) - - replace = tf.concat([replace, tf.ones([1], image.dtype)], 0) - - # Where they are zero, fill them in with 'replace'. - flattened_image = tf.where( - tf.equal(alpha_channel, 0), - tf.ones_like(flattened_image, dtype=image.dtype) * replace, - flattened_image) - - image = tf.reshape(flattened_image, image_shape) - image = tf.slice( - image, - [0] * image.shape.rank, - tf.concat([image_shape[:-1], [3]], -1)) - return image - - -def _scale_bbox_only_op_probability(prob): - """Reduce the probability of the bbox-only operation. - - Probability is reduced so that we do not distort the content of too many - bounding boxes that are close to each other. The value of 3.0 was a chosen - hyper parameter when designing the autoaugment algorithm that we found - empirically to work well. - - Args: - prob: Float that is the probability of applying the bbox-only operation. - - Returns: - Reduced probability. - """ - return prob / 3.0 - - -def _apply_bbox_augmentation(image, bbox, augmentation_func, *args): - """Applies augmentation_func to the subsection of image indicated by bbox. - - Args: - image: 3D uint8 Tensor. - bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) - of type float that represents the normalized coordinates between 0 and 1. - augmentation_func: Augmentation function that will be applied to the - subsection of image. - *args: Additional parameters that will be passed into augmentation_func - when it is called. - - Returns: - A modified version of image, where the bbox location in the image will - have `ugmentation_func applied to it. 
- """ - image_height = tf.cast(tf.shape(image)[0], tf.float32) - image_width = tf.cast(tf.shape(image)[1], tf.float32) - min_y = tf.cast(image_height * bbox[0], tf.int32) - min_x = tf.cast(image_width * bbox[1], tf.int32) - max_y = tf.cast(image_height * bbox[2], tf.int32) - max_x = tf.cast(image_width * bbox[3], tf.int32) - image_height = tf.cast(image_height, tf.int32) - image_width = tf.cast(image_width, tf.int32) - - # Clip to be sure the max values do not fall out of range. - max_y = tf.minimum(max_y, image_height - 1) - max_x = tf.minimum(max_x, image_width - 1) - - # Get the sub-tensor that is the image within the bounding box region. - bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :] - - # Apply the augmentation function to the bbox portion of the image. - augmented_bbox_content = augmentation_func(bbox_content, *args) - - # Pad the augmented_bbox_content and the mask to match the shape of original - # image. - augmented_bbox_content = tf.pad(augmented_bbox_content, - [[min_y, (image_height - 1) - max_y], - [min_x, (image_width - 1) - max_x], - [0, 0]]) - - # Create a mask that will be used to zero out a part of the original image. - mask_tensor = tf.zeros_like(bbox_content) - - mask_tensor = tf.pad(mask_tensor, - [[min_y, (image_height - 1) - max_y], - [min_x, (image_width - 1) - max_x], - [0, 0]], - constant_values=1) - # Replace the old bbox content with the new augmented content. - image = image * mask_tensor + augmented_bbox_content - return image - - -def _concat_bbox(bbox, bboxes): - """Helper function that concates bbox to bboxes along the first dimension.""" - - # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means - # we discard bboxes and start the bboxes Tensor with the current bbox. - bboxes_sum_check = tf.reduce_sum(bboxes) - bbox = tf.expand_dims(bbox, 0) - # This check will be true when it is an _INVALID_BOX - bboxes = tf.cond(tf.equal(bboxes_sum_check, -4.0), - lambda: bbox, - lambda: tf.concat([bboxes, bbox], 0)) - return bboxes - - -def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob, - augmentation_func, func_changes_bbox, - *args): - """Applies _apply_bbox_augmentation with probability prob. - - Args: - image: 3D uint8 Tensor. - bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) - of type float that represents the normalized coordinates between 0 and 1. - new_bboxes: 2D Tensor that is a list of the bboxes in the image after they - have been altered by aug_func. These will only be changed when - func_changes_bbox is set to true. Each bbox has 4 elements - (min_y, min_x, max_y, max_x) of type float that are the normalized - bbox coordinates between 0 and 1. - prob: Float that is the probability of applying _apply_bbox_augmentation. - augmentation_func: Augmentation function that will be applied to the - subsection of image. - func_changes_bbox: Boolean. Does augmentation_func return bbox in addition - to image. - *args: Additional parameters that will be passed into augmentation_func - when it is called. - - Returns: - A tuple. Fist element is a modified version of image, where the bbox - location in the image will have augmentation_func applied to it if it is - chosen to be called with probability `prob`. The second element is a - Tensor of Tensors of length 4 that will contain the altered bbox after - applying augmentation_func. 
- """ - should_apply_op = tf.cast( - tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool) - if func_changes_bbox: - augmented_image, bbox = tf.cond( - should_apply_op, - lambda: augmentation_func(image, bbox, *args), - lambda: (image, bbox)) - else: - augmented_image = tf.cond( - should_apply_op, - lambda: _apply_bbox_augmentation(image, bbox, augmentation_func, *args), - lambda: image) - new_bboxes = _concat_bbox(bbox, new_bboxes) - return augmented_image, new_bboxes - - -def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func, - func_changes_bbox, *args): - """Checks to be sure num bboxes > 0 before calling inner function.""" - num_bboxes = tf.shape(bboxes)[0] - image, bboxes = tf.cond( - tf.equal(num_bboxes, 0), - lambda: (image, bboxes), - # pylint:disable=g-long-lambda - lambda: _apply_multi_bbox_augmentation( - image, bboxes, prob, aug_func, func_changes_bbox, *args)) - # pylint:enable=g-long-lambda - return image, bboxes - - -# Represents an invalid bounding box that is used for checking for padding -# lists of bounding box coordinates for a few augmentation operations -_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]] - - -def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func, - func_changes_bbox, *args): - """Applies aug_func to the image for each bbox in bboxes. - - Args: - image: 3D uint8 Tensor. - bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox - has 4 elements (min_y, min_x, max_y, max_x) of type float. - prob: Float that is the probability of applying aug_func to a specific - bounding box within the image. - aug_func: Augmentation function that will be applied to the - subsections of image indicated by the bbox values in bboxes. - func_changes_bbox: Boolean. Does augmentation_func return bbox in addition - to image. - *args: Additional parameters that will be passed into augmentation_func - when it is called. - - Returns: - A modified version of image, where each bbox location in the image will - have augmentation_func applied to it if it is chosen to be called with - probability prob independently across all bboxes. Also the final - bboxes are returned that will be unchanged if func_changes_bbox is set to - false and if true, the new altered ones will be returned. - - Raises: - ValueError if applied to video. - """ - if image.shape.rank == 4: - raise ValueError('Image rank 4 is not supported') - - # Will keep track of the new altered bboxes after aug_func is repeatedly - # applied. The -1 values are a dummy value and this first Tensor will be - # removed upon appending the first real bbox. - new_bboxes = tf.constant(_INVALID_BOX) - - # If the bboxes are empty, then just give it _INVALID_BOX. The result - # will be thrown away. - bboxes = tf.cond(tf.equal(tf.size(bboxes), 0), - lambda: tf.constant(_INVALID_BOX), - lambda: bboxes) - - bboxes = tf.ensure_shape(bboxes, (None, 4)) - - # pylint:disable=g-long-lambda - wrapped_aug_func = ( - lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper( - _image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args)) - # pylint:enable=g-long-lambda - - # Setup the while_loop. - num_bboxes = tf.shape(bboxes)[0] # We loop until we go over all bboxes. - idx = tf.constant(0) # Counter for the while loop. 
- - # Conditional function when to end the loop once we go over all bboxes - # images_and_bboxes contain (_image, _new_bboxes) - cond = lambda _idx, _images_and_bboxes: tf.less(_idx, num_bboxes) - - # Shuffle the bboxes so that the augmentation order is not deterministic if - # we are not changing the bboxes with aug_func. - if not func_changes_bbox: - loop_bboxes = tf.random.shuffle(bboxes) - else: - loop_bboxes = bboxes - - # Main function of while_loop where we repeatedly apply augmentation on the - # bboxes in the image. - # pylint:disable=g-long-lambda - body = lambda _idx, _images_and_bboxes: [ - _idx + 1, wrapped_aug_func(_images_and_bboxes[0], - loop_bboxes[_idx], - _images_and_bboxes[1])] - # pylint:enable=g-long-lambda - - _, (image, new_bboxes) = tf.while_loop( - cond, body, [idx, (image, new_bboxes)], - shape_invariants=[idx.get_shape(), - (image.get_shape(), tf.TensorShape([None, 4]))]) - - # Either return the altered bboxes or the original ones depending on if - # we altered them in anyway. - if func_changes_bbox: - final_bboxes = new_bboxes - else: - final_bboxes = bboxes - return image, final_bboxes - - -def _clip_bbox(min_y, min_x, max_y, max_x): - """Clip bounding box coordinates between 0 and 1. - - Args: - min_y: Normalized bbox coordinate of type float between 0 and 1. - min_x: Normalized bbox coordinate of type float between 0 and 1. - max_y: Normalized bbox coordinate of type float between 0 and 1. - max_x: Normalized bbox coordinate of type float between 0 and 1. - - Returns: - Clipped coordinate values between 0 and 1. - """ - min_y = tf.clip_by_value(min_y, 0.0, 1.0) - min_x = tf.clip_by_value(min_x, 0.0, 1.0) - max_y = tf.clip_by_value(max_y, 0.0, 1.0) - max_x = tf.clip_by_value(max_x, 0.0, 1.0) - return min_y, min_x, max_y, max_x - - -def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05): - """Adjusts bbox coordinates to make sure the area is > 0. - - Args: - min_y: Normalized bbox coordinate of type float between 0 and 1. - min_x: Normalized bbox coordinate of type float between 0 and 1. - max_y: Normalized bbox coordinate of type float between 0 and 1. - max_x: Normalized bbox coordinate of type float between 0 and 1. - delta: Float, this is used to create a gap of size 2 * delta between - bbox min/max coordinates that are the same on the boundary. - This prevents the bbox from having an area of zero. - - Returns: - Tuple of new bbox coordinates between 0 and 1 that will now have a - guaranteed area > 0. - """ - height = max_y - min_y - width = max_x - min_x - def _adjust_bbox_boundaries(min_coord, max_coord): - # Make sure max is never 0 and min is never 1. - max_coord = tf.maximum(max_coord, 0.0 + delta) - min_coord = tf.minimum(min_coord, 1.0 - delta) - return min_coord, max_coord - min_y, max_y = tf.cond(tf.equal(height, 0.0), - lambda: _adjust_bbox_boundaries(min_y, max_y), - lambda: (min_y, max_y)) - min_x, max_x = tf.cond(tf.equal(width, 0.0), - lambda: _adjust_bbox_boundaries(min_x, max_x), - lambda: (min_x, max_x)) - return min_y, min_x, max_y, max_x - - -def _rotate_bbox(bbox, image_height, image_width, degrees): - """Rotates the bbox coordinated by degrees. - - Args: - bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) - of type float that represents the normalized coordinates between 0 and 1. - image_height: Int, height of the image. - image_width: Int, height of the image. - degrees: Float, a scalar angle in degrees to rotate all images by. 
If - degrees is positive the image will be rotated clockwise otherwise it will - be rotated counterclockwise. - - Returns: - A tensor of the same shape as bbox, but now with the rotated coordinates. - """ - image_height, image_width = ( - tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32)) - - # Convert from degrees to radians. - degrees_to_radians = math.pi / 180.0 - radians = degrees * degrees_to_radians - - # Translate the bbox to the center of the image and turn the normalized 0-1 - # coordinates to absolute pixel locations. - # Y coordinates are made negative as the y axis of images goes down with - # increasing pixel values, so we negate to make sure x axis and y axis points - # are in the traditionally positive direction. - min_y = -tf.cast(image_height * (bbox[0] - 0.5), tf.int32) - min_x = tf.cast(image_width * (bbox[1] - 0.5), tf.int32) - max_y = -tf.cast(image_height * (bbox[2] - 0.5), tf.int32) - max_x = tf.cast(image_width * (bbox[3] - 0.5), tf.int32) - coordinates = tf.stack( - [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]) - coordinates = tf.cast(coordinates, tf.float32) - # Rotate the coordinates according to the rotation matrix clockwise if - # radians is positive, else negative - rotation_matrix = tf.stack( - [[tf.cos(radians), tf.sin(radians)], - [-tf.sin(radians), tf.cos(radians)]]) - new_coords = tf.cast( - tf.matmul(rotation_matrix, tf.transpose(coordinates)), tf.int32) - # Find min/max values and convert them back to normalized 0-1 floats. - min_y = -( - tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) / image_height - 0.5) - min_x = tf.cast(tf.reduce_min(new_coords[1, :]), - tf.float32) / image_width + 0.5 - max_y = -( - tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) / image_height - 0.5) - max_x = tf.cast(tf.reduce_max(new_coords[1, :]), - tf.float32) / image_width + 0.5 - - # Clip the bboxes to be sure the fall between [0, 1]. - min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) - min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) - return tf.stack([min_y, min_x, max_y, max_x]) - - -def rotate_with_bboxes(image, bboxes, degrees, replace): - """Equivalent of PIL Rotate that rotates the image and bbox. - - Args: - image: 3D uint8 Tensor. - bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox - has 4 elements (min_y, min_x, max_y, max_x) of type float. - degrees: Float, a scalar angle in degrees to rotate all images by. If - degrees is positive the image will be rotated clockwise otherwise it will - be rotated counterclockwise. - replace: A one or three value 1D tensor to fill empty pixels. - - Returns: - A tuple containing a 3D uint8 Tensor that will be the result of rotating - image by degrees. The second element of the tuple is bboxes, where now - the coordinates will be shifted to reflect the rotated image. - - Raises: - ValueError: If applied to video. - """ - if image.shape.rank == 4: - raise ValueError('Image rank 4 is not supported') - - # Rotate the image. - image = wrapped_rotate(image, degrees, replace) - - # Convert bbox coordinates to pixel values. 
- image_height = tf.shape(image)[0] - image_width = tf.shape(image)[1] - # pylint:disable=g-long-lambda - wrapped_rotate_bbox = lambda bbox: _rotate_bbox( - bbox, image_height, image_width, degrees) - # pylint:enable=g-long-lambda - bboxes = tf.map_fn(wrapped_rotate_bbox, bboxes) - return image, bboxes - - -def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal): - """Shifts the bbox according to how the image was sheared. - - Args: - bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) - of type float that represents the normalized coordinates between 0 and 1. - image_height: Int, height of the image. - image_width: Int, height of the image. - level: Float. How much to shear the image. - shear_horizontal: If true then shear in X dimension else shear in - the Y dimension. - - Returns: - A tensor of the same shape as bbox, but now with the shifted coordinates. - """ - image_height, image_width = ( - tf.cast(image_height, tf.float32), tf.cast(image_width, tf.float32)) - - # Change bbox coordinates to be pixels. - min_y = tf.cast(image_height * bbox[0], tf.int32) - min_x = tf.cast(image_width * bbox[1], tf.int32) - max_y = tf.cast(image_height * bbox[2], tf.int32) - max_x = tf.cast(image_width * bbox[3], tf.int32) - coordinates = tf.stack( - [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]]) - coordinates = tf.cast(coordinates, tf.float32) - - # Shear the coordinates according to the translation matrix. - if shear_horizontal: - translation_matrix = tf.stack( - [[1, 0], [-level, 1]]) - else: - translation_matrix = tf.stack( - [[1, -level], [0, 1]]) - translation_matrix = tf.cast(translation_matrix, tf.float32) - new_coords = tf.cast( - tf.matmul(translation_matrix, tf.transpose(coordinates)), tf.int32) - - # Find min/max values and convert them back to floats. - min_y = tf.cast(tf.reduce_min(new_coords[0, :]), tf.float32) / image_height - min_x = tf.cast(tf.reduce_min(new_coords[1, :]), tf.float32) / image_width - max_y = tf.cast(tf.reduce_max(new_coords[0, :]), tf.float32) / image_height - max_x = tf.cast(tf.reduce_max(new_coords[1, :]), tf.float32) / image_width - - # Clip the bboxes to be sure the fall between [0, 1]. - min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) - min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) - return tf.stack([min_y, min_x, max_y, max_x]) - - -def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal): - """Applies Shear Transformation to the image and shifts the bboxes. - - Args: - image: 3D uint8 Tensor. - bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox - has 4 elements (min_y, min_x, max_y, max_x) of type float with values - between [0, 1]. - level: Float. How much to shear the image. This value will be between - -0.3 to 0.3. - replace: A one or three value 1D tensor to fill empty pixels. - shear_horizontal: Boolean. If true then shear in X dimension else shear in - the Y dimension. - - Returns: - A tuple containing a 3D uint8 Tensor that will be the result of shearing - image by level. The second element of the tuple is bboxes, where now - the coordinates will be shifted to reflect the sheared image. - - Raises: - ValueError: If applied to video. - """ - if image.shape.rank == 4: - raise ValueError('Image rank 4 is not supported') - - if shear_horizontal: - image = shear_x(image, level, replace) - else: - image = shear_y(image, level, replace) - - # Convert bbox coordinates to pixel values. 
- image_height = tf.shape(image)[0] - image_width = tf.shape(image)[1] - # pylint:disable=g-long-lambda - wrapped_shear_bbox = lambda bbox: _shear_bbox( - bbox, image_height, image_width, level, shear_horizontal) - # pylint:enable=g-long-lambda - bboxes = tf.map_fn(wrapped_shear_bbox, bboxes) - return image, bboxes - - -def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal): - """Shifts the bbox coordinates by pixels. - - Args: - bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) - of type float that represents the normalized coordinates between 0 and 1. - image_height: Int, height of the image. - image_width: Int, width of the image. - pixels: An int. How many pixels to shift the bbox. - shift_horizontal: Boolean. If true then shift in X dimension else shift in - Y dimension. - - Returns: - A tensor of the same shape as bbox, but now with the shifted coordinates. - """ - pixels = tf.cast(pixels, tf.int32) - # Convert bbox to integer pixel locations. - min_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[0], tf.int32) - min_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[1], tf.int32) - max_y = tf.cast(tf.cast(image_height, tf.float32) * bbox[2], tf.int32) - max_x = tf.cast(tf.cast(image_width, tf.float32) * bbox[3], tf.int32) - - if shift_horizontal: - min_x = tf.maximum(0, min_x - pixels) - max_x = tf.minimum(image_width, max_x - pixels) - else: - min_y = tf.maximum(0, min_y - pixels) - max_y = tf.minimum(image_height, max_y - pixels) - - # Convert bbox back to floats. - min_y = tf.cast(min_y, tf.float32) / tf.cast(image_height, tf.float32) - min_x = tf.cast(min_x, tf.float32) / tf.cast(image_width, tf.float32) - max_y = tf.cast(max_y, tf.float32) / tf.cast(image_height, tf.float32) - max_x = tf.cast(max_x, tf.float32) / tf.cast(image_width, tf.float32) - - # Clip the bboxes to be sure the fall between [0, 1]. - min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x) - min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x) - return tf.stack([min_y, min_x, max_y, max_x]) - - -def translate_bbox(image, bboxes, pixels, replace, shift_horizontal): - """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox. - - Args: - image: 3D uint8 Tensor. - bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox - has 4 elements (min_y, min_x, max_y, max_x) of type float with values - between [0, 1]. - pixels: An int. How many pixels to shift the image and bboxes - replace: A one or three value 1D tensor to fill empty pixels. - shift_horizontal: Boolean. If true then shift in X dimension else shift in - Y dimension. - - Returns: - A tuple containing a 3D uint8 Tensor that will be the result of translating - image by pixels. The second element of the tuple is bboxes, where now - the coordinates will be shifted to reflect the shifted image. - - Raises: - ValueError if applied to video. - """ - if image.shape.rank == 4: - raise ValueError('Image rank 4 is not supported') - - if shift_horizontal: - image = translate_x(image, pixels, replace) - else: - image = translate_y(image, pixels, replace) - - # Convert bbox coordinates to pixel values. 
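Editorial usage sketch for the box-aware geometric ops defined above (`rotate_with_bboxes`, `shear_with_bboxes`, and `translate_bbox`). The import path and the `[128] * 3` fill value follow the deleted tests and assume the pre-change tree; the image and box values are illustrative.

```python
import tensorflow as tf
from official.vision.beta.ops import augment  # path as used in the deleted tests

image = tf.zeros((224, 224, 3), dtype=tf.uint8)
# Normalized (min_y, min_x, max_y, max_x) boxes, shifted together with the image.
bboxes = tf.constant([[0.1, 0.2, 0.5, 0.6]], dtype=tf.float32)
image, bboxes = augment.shear_with_bboxes(
    image, bboxes, level=0.2, replace=[128] * 3, shear_horizontal=True)
```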
- image_height = tf.shape(image)[0] - image_width = tf.shape(image)[1] - # pylint:disable=g-long-lambda - wrapped_shift_bbox = lambda bbox: _shift_bbox( - bbox, image_height, image_width, pixels, shift_horizontal) - # pylint:enable=g-long-lambda - bboxes = tf.map_fn(wrapped_shift_bbox, bboxes) - return image, bboxes - - -def translate_y_only_bboxes( - image: tf.Tensor, bboxes: tf.Tensor, prob: float, pixels: int, replace): - """Apply translate_y to each bbox in the image with probability prob.""" - if bboxes.shape.rank == 4: - raise ValueError('translate_y_only_bboxes does not support rank 4 boxes') - - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace) - - -def _randomly_negate_tensor(tensor): - """With 50% prob turn the tensor negative.""" - should_flip = tf.cast(tf.floor(tf.random.uniform([]) + 0.5), tf.bool) - final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor) - return final_tensor - - -def _rotate_level_to_arg(level: float): - level = (level / _MAX_LEVEL) * 30. - level = _randomly_negate_tensor(level) - return (level,) - - -def _shrink_level_to_arg(level: float): - """Converts level to ratio by which we shrink the image content.""" - if level == 0: - return (1.0,) # if level is zero, do not shrink the image - # Maximum shrinking ratio is 2.9. - level = 2. / (_MAX_LEVEL / level) + 0.9 - return (level,) - - -def _enhance_level_to_arg(level: float): - return ((level / _MAX_LEVEL) * 1.8 + 0.1,) - - -def _shear_level_to_arg(level: float): - level = (level / _MAX_LEVEL) * 0.3 - # Flip level to negative with 50% chance. - level = _randomly_negate_tensor(level) - return (level,) - - -def _translate_level_to_arg(level: float, translate_const: float): - level = (level / _MAX_LEVEL) * float(translate_const) - # Flip level to negative with 50% chance. - level = _randomly_negate_tensor(level) - return (level,) - - -def _mult_to_arg(level: float, multiplier: float = 1.): - return (int((level / _MAX_LEVEL) * multiplier),) - - -def _apply_func_with_prob(func: Any, image: tf.Tensor, - bboxes: Optional[tf.Tensor], args: Any, prob: float): - """Apply `func` to image w/ `args` as input with probability `prob`.""" - assert isinstance(args, tuple) - assert inspect.getfullargspec(func)[0][1] == 'bboxes' - - # Apply the function with probability `prob`. - should_apply_op = tf.cast( - tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool) - augmented_image, augmented_bboxes = tf.cond( - should_apply_op, - lambda: func(image, bboxes, *args), - lambda: (image, bboxes)) - return augmented_image, augmented_bboxes - - -def select_and_apply_random_policy(policies: Any, - image: tf.Tensor, - bboxes: Optional[tf.Tensor] = None): - """Select a random policy from `policies` and apply it to `image`.""" - policy_to_select = tf.random.uniform([], maxval=len(policies), dtype=tf.int32) - # Note that using tf.case instead of tf.conds would result in significantly - # larger graphs and would even break export for some larger policies. 
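For concreteness, the level-to-argument converters above map the shared magnitude onto op-specific ranges. A worked example, assuming `_MAX_LEVEL` is 10 (consistent with the magnitudes used in the policies later in the file); the chosen magnitude is illustrative.

```python
# Magnitude 5, before the random sign flip applied by _randomly_negate_tensor.
level, max_level = 5.0, 10.0
rotate_degrees = (level / max_level) * 30.0       # 15.0 degrees
shear_level = (level / max_level) * 0.3           # 0.15
enhance_factor = (level / max_level) * 1.8 + 0.1  # 1.0, i.e. no change after blend
posterize_bits = int((level / max_level) * 4)     # keep the top 2 bits
```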
- for (i, policy) in enumerate(policies): - image, bboxes = tf.cond( - tf.equal(i, policy_to_select), - lambda selected_policy=policy: selected_policy(image, bboxes), - lambda: (image, bboxes)) - return image, bboxes - - -NAME_TO_FUNC = { - 'AutoContrast': autocontrast, - 'Equalize': equalize, - 'Invert': invert, - 'Rotate': wrapped_rotate, - 'Posterize': posterize, - 'Solarize': solarize, - 'SolarizeAdd': solarize_add, - 'Color': color, - 'Contrast': contrast, - 'Brightness': brightness, - 'Sharpness': sharpness, - 'ShearX': shear_x, - 'ShearY': shear_y, - 'TranslateX': translate_x, - 'TranslateY': translate_y, - 'Cutout': cutout, - 'Rotate_BBox': rotate_with_bboxes, - # pylint:disable=g-long-lambda - 'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( - image, bboxes, level, replace, shear_horizontal=True), - 'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( - image, bboxes, level, replace, shear_horizontal=False), - 'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox( - image, bboxes, pixels, replace, shift_horizontal=True), - 'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox( - image, bboxes, pixels, replace, shift_horizontal=False), - # pylint:enable=g-long-lambda - 'TranslateY_Only_BBoxes': translate_y_only_bboxes, -} - -# Functions that require a `bboxes` parameter. -REQUIRE_BOXES_FUNCS = frozenset({ - 'Rotate_BBox', - 'ShearX_BBox', - 'ShearY_BBox', - 'TranslateX_BBox', - 'TranslateY_BBox', - 'TranslateY_Only_BBoxes', -}) - -# Functions that have a 'prob' parameter -PROB_FUNCS = frozenset({ - 'TranslateY_Only_BBoxes', -}) - -# Functions that have a 'replace' parameter -REPLACE_FUNCS = frozenset({ - 'Rotate', - 'TranslateX', - 'ShearX', - 'ShearY', - 'TranslateY', - 'Cutout', - 'Rotate_BBox', - 'ShearX_BBox', - 'ShearY_BBox', - 'TranslateX_BBox', - 'TranslateY_BBox', - 'TranslateY_Only_BBoxes', -}) - - -def level_to_arg(cutout_const: float, translate_const: float): - """Creates a dict mapping image operation names to their arguments.""" - - no_arg = lambda level: () - posterize_arg = lambda level: _mult_to_arg(level, 4) - solarize_arg = lambda level: _mult_to_arg(level, 256) - solarize_add_arg = lambda level: _mult_to_arg(level, 110) - cutout_arg = lambda level: _mult_to_arg(level, cutout_const) - translate_arg = lambda level: _translate_level_to_arg(level, translate_const) - translate_bbox_arg = lambda level: _translate_level_to_arg(level, 120) - - args = { - 'AutoContrast': no_arg, - 'Equalize': no_arg, - 'Invert': no_arg, - 'Rotate': _rotate_level_to_arg, - 'Posterize': posterize_arg, - 'Solarize': solarize_arg, - 'SolarizeAdd': solarize_add_arg, - 'Color': _enhance_level_to_arg, - 'Contrast': _enhance_level_to_arg, - 'Brightness': _enhance_level_to_arg, - 'Sharpness': _enhance_level_to_arg, - 'ShearX': _shear_level_to_arg, - 'ShearY': _shear_level_to_arg, - 'Cutout': cutout_arg, - 'TranslateX': translate_arg, - 'TranslateY': translate_arg, - 'Rotate_BBox': _rotate_level_to_arg, - 'ShearX_BBox': _shear_level_to_arg, - 'ShearY_BBox': _shear_level_to_arg, - # pylint:disable=g-long-lambda - 'TranslateX_BBox': lambda level: _translate_level_to_arg( - level, translate_const), - 'TranslateY_BBox': lambda level: _translate_level_to_arg( - level, translate_const), - # pylint:enable=g-long-lambda - 'TranslateY_Only_BBoxes': translate_bbox_arg, - } - return args - - -def bbox_wrapper(func): - """Adds a bboxes function argument to func and returns unchanged bboxes.""" - def wrapper(images, bboxes, *args, 
**kwargs): - return (func(images, *args, **kwargs), bboxes) - return wrapper - - -def _parse_policy_info(name: Text, - prob: float, - level: float, - replace_value: List[int], - cutout_const: float, - translate_const: float, - level_std: float = 0.) -> Tuple[Any, float, Any]: - """Return the function that corresponds to `name` and update `level` param.""" - func = NAME_TO_FUNC[name] - - if level_std > 0: - level += tf.random.normal([], dtype=tf.float32) - level = tf.clip_by_value(level, 0., _MAX_LEVEL) - - args = level_to_arg(cutout_const, translate_const)[name](level) - - if name in PROB_FUNCS: - # Add in the prob arg if it is required for the function that is called. - args = tuple([prob] + list(args)) - - if name in REPLACE_FUNCS: - # Add in replace arg if it is required for the function that is called. - args = tuple(list(args) + [replace_value]) - - # Add bboxes as the second positional argument for the function if it does - # not already exist. - if 'bboxes' not in inspect.getfullargspec(func)[0]: - func = bbox_wrapper(func) - - return func, prob, args - - -class ImageAugment(object): - """Image augmentation class for applying image distortions.""" - - def distort( - self, - image: tf.Tensor - ) -> tf.Tensor: - """Given an image tensor, returns a distorted image with the same shape. - - Args: - image: `Tensor` of shape [height, width, 3] or - [num_frames, height, width, 3] representing an image or image sequence. - - Returns: - The augmented version of `image`. - """ - raise NotImplementedError() - - def distort_with_boxes( - self, - image: tf.Tensor, - bboxes: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor]: - """Distorts the image and bounding boxes. - - Args: - image: `Tensor` of shape [height, width, 3] or - [num_frames, height, width, 3] representing an image or image sequence. - bboxes: `Tensor` of shape [num_boxes, 4] or [num_frames, num_boxes, 4] - representing bounding boxes for an image or image sequence. - - Returns: - The augmented version of `image` and `bboxes`. - """ - raise NotImplementedError - - -class AutoAugment(ImageAugment): - """Applies the AutoAugment policy to images. - - AutoAugment is from the paper: https://arxiv.org/abs/1805.09501. - """ - - def __init__(self, - augmentation_name: Text = 'v0', - policies: Optional[Iterable[Iterable[Tuple[Text, float, - float]]]] = None, - cutout_const: float = 100, - translate_const: float = 250): - """Applies the AutoAugment policy to images. - - Args: - augmentation_name: The name of the AutoAugment policy to use. The - available options are `v0`, `test`, `reduced_cifar10`, `svhn` and - `reduced_imagenet`. `v0` is the policy used for all - of the results in the paper and was found to achieve the best results on - the COCO dataset. `v1`, `v2` and `v3` are additional good policies found - on the COCO dataset that have slight variation in what operations were - used during the search procedure along with how many operations are - applied in parallel to a single image (2 vs 3). Make sure to set - `policies` to `None` (the default) if you want to set options using - `augmentation_name`. - policies: list of lists of tuples in the form `(func, prob, level)`, - `func` is a string name of the augmentation function, `prob` is the - probability of applying the `func` operation, `level` (or magnitude) is - the input argument for `func`. For example: - ``` - [[('Equalize', 0.9, 3), ('Color', 0.7, 8)], - [('Invert', 0.6, 5), ('Rotate', 0.2, 9), ('ShearX', 0.1, 2)], ...] - ``` - The outer-most list must be 3-d. 
The number of operations in a - sub-policy can vary from one sub-policy to another. - If you provide `policies` as input, any option set with - `augmentation_name` will get overriden as they are mutually exclusive. - cutout_const: multiplier for applying cutout. - translate_const: multiplier for applying translation. - - Raises: - ValueError if `augmentation_name` is unsupported. - """ - super(AutoAugment, self).__init__() - - self.augmentation_name = augmentation_name - self.cutout_const = float(cutout_const) - self.translate_const = float(translate_const) - self.available_policies = { - 'detection_v0': self.detection_policy_v0(), - 'v0': self.policy_v0(), - 'test': self.policy_test(), - 'simple': self.policy_simple(), - 'reduced_cifar10': self.policy_reduced_cifar10(), - 'svhn': self.policy_svhn(), - 'reduced_imagenet': self.policy_reduced_imagenet(), - } - - if not policies: - if augmentation_name not in self.available_policies: - raise ValueError( - 'Invalid augmentation_name: {}'.format(augmentation_name)) - - self.policies = self.available_policies[augmentation_name] - - else: - self._check_policy_shape(policies) - self.policies = policies - - def _check_policy_shape(self, policies): - """Checks dimension and shape of the custom policy. - - Args: - policies: List of list of tuples in the form `(func, prob, level)`. Must - have shape of `(:, :, 3)`. - - Raises: - ValueError if the shape of `policies` is unexpected. - """ - in_shape = np.array(policies).shape - if len(in_shape) != 3 or in_shape[-1:] != (3,): - raise ValueError('Wrong shape detected for custom policy. Expected ' - '(:, :, 3) but got {}.'.format(in_shape)) - - def _make_tf_policies(self): - """Prepares the TF functions for augmentations based on the policies.""" - replace_value = [128] * 3 - - # func is the string name of the augmentation function, prob is the - # probability of applying the operation and level is the parameter - # associated with the tf op. - - # tf_policies are functions that take in an image and return an augmented - # image. - tf_policies = [] - for policy in self.policies: - tf_policy = [] - assert_ranges = [] - # Link string name to the correct python function and make sure the - # correct argument is passed into that function. - for policy_info in policy: - _, prob, level = policy_info - assert_ranges.append(tf.Assert(tf.less_equal(prob, 1.), [prob])) - assert_ranges.append( - tf.Assert(tf.less_equal(level, int(_MAX_LEVEL)), [level])) - - policy_info = list(policy_info) + [ - replace_value, self.cutout_const, self.translate_const - ] - tf_policy.append(_parse_policy_info(*policy_info)) - # Now build the tf policy that will apply the augmentation procedue - # on image. 
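Editorial aside: a minimal sketch of the custom-policy format that `_check_policy_shape` above accepts, i.e. shape `(:, :, 3)` with op names drawn from `NAME_TO_FUNC`, probabilities at most 1 and levels at most 10. The import path follows the deleted tests and assumes the pre-change tree.

```python
import tensorflow as tf
from official.vision.beta.ops import augment

custom_policy = [
    [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
    [('Color', 0.4, 9), ('Rotate', 0.6, 8)],
]
augmenter = augment.AutoAugment(policies=custom_policy)
aug_image = augmenter.distort(tf.zeros((224, 224, 3), dtype=tf.uint8))
```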
- def make_final_policy(tf_policy_): - - def final_policy(image_, bboxes_): - for func, prob, args in tf_policy_: - image_, bboxes_ = _apply_func_with_prob(func, image_, bboxes_, args, - prob) - return image_, bboxes_ - - return final_policy - - with tf.control_dependencies(assert_ranges): - tf_policies.append(make_final_policy(tf_policy)) - - return tf_policies - - def distort(self, image: tf.Tensor) -> tf.Tensor: - """See base class.""" - input_image_type = image.dtype - if input_image_type != tf.uint8: - image = tf.clip_by_value(image, 0.0, 255.0) - image = tf.cast(image, dtype=tf.uint8) - - tf_policies = self._make_tf_policies() - image, _ = select_and_apply_random_policy(tf_policies, image, bboxes=None) - image = tf.cast(image, dtype=input_image_type) - return image - - def distort_with_boxes(self, image: tf.Tensor, - bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: - """See base class.""" - input_image_type = image.dtype - if input_image_type != tf.uint8: - image = tf.clip_by_value(image, 0.0, 255.0) - image = tf.cast(image, dtype=tf.uint8) - - tf_policies = self._make_tf_policies() - image, bboxes = select_and_apply_random_policy(tf_policies, image, bboxes) - return image, bboxes - - @staticmethod - def detection_policy_v0(): - """Autoaugment policy that was used in AutoAugment Paper for Detection. - - https://arxiv.org/pdf/1906.11172 - - Each tuple is an augmentation operation of the form - (operation, probability, magnitude). Each element in policy is a - sub-policy that will be applied sequentially on the image. - - Returns: - the policy. - """ - policy = [ - [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], - [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], - [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], - [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], - [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], - ] - return policy - - @staticmethod - def policy_v0(): - """Autoaugment policy that was used in AutoAugment Paper. - - Each tuple is an augmentation operation of the form - (operation, probability, magnitude). Each element in policy is a - sub-policy that will be applied sequentially on the image. - - Returns: - the policy. - """ - - policy = [ - [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], - [('Color', 0.4, 9), ('Equalize', 0.6, 3)], - [('Color', 0.4, 1), ('Rotate', 0.6, 8)], - [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], - [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], - [('Color', 0.2, 0), ('Equalize', 0.8, 8)], - [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], - [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], - [('Color', 0.6, 1), ('Equalize', 1.0, 2)], - [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], - [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], - [('Color', 0.4, 7), ('Equalize', 0.6, 0)], - [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], - [('Solarize', 0.6, 8), ('Color', 0.6, 9)], - [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], - [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)], - [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], - [('ShearY', 0.8, 0), ('Color', 0.6, 4)], - [('Color', 1.0, 0), ('Rotate', 0.6, 2)], - [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], - [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], - [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], - [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], - [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], - [('Color', 0.8, 6), ('Rotate', 0.4, 5)], - ] - return policy - - @staticmethod - def policy_reduced_cifar10(): - """Autoaugment policy for reduced CIFAR-10 dataset. 
- - Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501. - - Each tuple is an augmentation operation of the form - (operation, probability, magnitude). Each element in policy is a - sub-policy that will be applied sequentially on the image. - - Returns: - the policy. - """ - policy = [ - [('Invert', 0.1, 7), ('Contrast', 0.2, 6)], - [('Rotate', 0.7, 2), ('TranslateX', 0.3, 9)], - [('Sharpness', 0.8, 1), ('Sharpness', 0.9, 3)], - [('ShearY', 0.5, 8), ('TranslateY', 0.7, 9)], - [('AutoContrast', 0.5, 8), ('Equalize', 0.9, 2)], - [('ShearY', 0.2, 7), ('Posterize', 0.3, 7)], - [('Color', 0.4, 3), ('Brightness', 0.6, 7)], - [('Sharpness', 0.3, 9), ('Brightness', 0.7, 9)], - [('Equalize', 0.6, 5), ('Equalize', 0.5, 1)], - [('Contrast', 0.6, 7), ('Sharpness', 0.6, 5)], - [('Color', 0.7, 7), ('TranslateX', 0.5, 8)], - [('Equalize', 0.3, 7), ('AutoContrast', 0.4, 8)], - [('TranslateY', 0.4, 3), ('Sharpness', 0.2, 6)], - [('Brightness', 0.9, 6), ('Color', 0.2, 8)], - [('Solarize', 0.5, 2), ('Invert', 0.0, 3)], - [('Equalize', 0.2, 0), ('AutoContrast', 0.6, 0)], - [('Equalize', 0.2, 8), ('Equalize', 0.6, 4)], - [('Color', 0.9, 9), ('Equalize', 0.6, 6)], - [('AutoContrast', 0.8, 4), ('Solarize', 0.2, 8)], - [('Brightness', 0.1, 3), ('Color', 0.7, 0)], - [('Solarize', 0.4, 5), ('AutoContrast', 0.9, 3)], - [('TranslateY', 0.9, 9), ('TranslateY', 0.7, 9)], - [('AutoContrast', 0.9, 2), ('Solarize', 0.8, 3)], - [('Equalize', 0.8, 8), ('Invert', 0.1, 3)], - [('TranslateY', 0.7, 9), ('AutoContrast', 0.9, 1)], - ] - return policy - - @staticmethod - def policy_svhn(): - """Autoaugment policy for SVHN dataset. - - Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501. - - Each tuple is an augmentation operation of the form - (operation, probability, magnitude). Each element in policy is a - sub-policy that will be applied sequentially on the image. - - Returns: - the policy. - """ - policy = [ - [('ShearX', 0.9, 4), ('Invert', 0.2, 3)], - [('ShearY', 0.9, 8), ('Invert', 0.7, 5)], - [('Equalize', 0.6, 5), ('Solarize', 0.6, 6)], - [('Invert', 0.9, 3), ('Equalize', 0.6, 3)], - [('Equalize', 0.6, 1), ('Rotate', 0.9, 3)], - [('ShearX', 0.9, 4), ('AutoContrast', 0.8, 3)], - [('ShearY', 0.9, 8), ('Invert', 0.4, 5)], - [('ShearY', 0.9, 5), ('Solarize', 0.2, 6)], - [('Invert', 0.9, 6), ('AutoContrast', 0.8, 1)], - [('Equalize', 0.6, 3), ('Rotate', 0.9, 3)], - [('ShearX', 0.9, 4), ('Solarize', 0.3, 3)], - [('ShearY', 0.8, 8), ('Invert', 0.7, 4)], - [('Equalize', 0.9, 5), ('TranslateY', 0.6, 6)], - [('Invert', 0.9, 4), ('Equalize', 0.6, 7)], - [('Contrast', 0.3, 3), ('Rotate', 0.8, 4)], - [('Invert', 0.8, 5), ('TranslateY', 0.0, 2)], - [('ShearY', 0.7, 6), ('Solarize', 0.4, 8)], - [('Invert', 0.6, 4), ('Rotate', 0.8, 4)], - [('ShearY', 0.3, 7), ('TranslateX', 0.9, 3)], - [('ShearX', 0.1, 6), ('Invert', 0.6, 5)], - [('Solarize', 0.7, 2), ('TranslateY', 0.6, 7)], - [('ShearY', 0.8, 4), ('Invert', 0.8, 8)], - [('ShearX', 0.7, 9), ('TranslateY', 0.8, 3)], - [('ShearY', 0.8, 5), ('AutoContrast', 0.7, 3)], - [('ShearX', 0.7, 2), ('Invert', 0.1, 5)], - ] - return policy - - @staticmethod - def policy_reduced_imagenet(): - """Autoaugment policy for reduced ImageNet dataset. - - Result is from the AutoAugment paper: https://arxiv.org/abs/1805.09501. - - Each tuple is an augmentation operation of the form - (operation, probability, magnitude). Each element in policy is a - sub-policy that will be applied sequentially on the image. - - Returns: - the policy. 
- """ - policy = [ - [('Posterize', 0.4, 8), ('Rotate', 0.6, 9)], - [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], - [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], - [('Posterize', 0.6, 7), ('Posterize', 0.6, 6)], - [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], - [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], - [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], - [('Posterize', 0.8, 5), ('Equalize', 1.0, 2)], - [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], - [('Equalize', 0.6, 8), ('Posterize', 0.4, 6)], - [('Rotate', 0.8, 8), ('Color', 0.4, 0)], - [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], - [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], - [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], - [('Color', 0.6, 4), ('Contrast', 1.0, 8)], - [('Rotate', 0.8, 8), ('Color', 1.0, 2)], - [('Color', 0.8, 8), ('Solarize', 0.8, 7)], - [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], - [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], - [('Color', 0.4, 0), ('Equalize', 0.6, 3)], - [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], - [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], - [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], - [('Color', 0.6, 4), ('Contrast', 1.0, 8)], - [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)] - ] - return policy - - @staticmethod - def policy_simple(): - """Same as `policy_v0`, except with custom ops removed.""" - - policy = [ - [('Color', 0.4, 9), ('Equalize', 0.6, 3)], - [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], - [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], - [('Color', 0.2, 0), ('Equalize', 0.8, 8)], - [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], - [('Color', 0.6, 1), ('Equalize', 1.0, 2)], - [('Color', 0.4, 7), ('Equalize', 0.6, 0)], - [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], - [('Solarize', 0.6, 8), ('Color', 0.6, 9)], - [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], - [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], - [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], - [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], - ] - return policy - - @staticmethod - def policy_test(): - """Autoaugment test policy for debugging.""" - policy = [ - [('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)], - ] - return policy - - -def _maybe_identity(x: Optional[tf.Tensor]) -> Optional[tf.Tensor]: - return tf.identity(x) if x is not None else None - - -class RandAugment(ImageAugment): - """Applies the RandAugment policy to images. - - RandAugment is from the paper https://arxiv.org/abs/1909.13719, - """ - - def __init__(self, - num_layers: int = 2, - magnitude: float = 10., - cutout_const: float = 40., - translate_const: float = 100., - magnitude_std: float = 0.0, - prob_to_apply: Optional[float] = None, - exclude_ops: Optional[List[str]] = None): - """Applies the RandAugment policy to images. - - Args: - num_layers: Integer, the number of augmentation transformations to apply - sequentially to an image. Represented as (N) in the paper. Usually best - values will be in the range [1, 3]. - magnitude: Integer, shared magnitude across all augmentation operations. - Represented as (M) in the paper. Usually best values are in the range - [5, 10]. - cutout_const: multiplier for applying cutout. - translate_const: multiplier for applying translation. - magnitude_std: randomness of the severity as proposed by the authors of - the timm library. - prob_to_apply: The probability to apply the selected augmentation at each - layer. - exclude_ops: exclude selected operations. 
- """ - super(RandAugment, self).__init__() - - self.num_layers = num_layers - self.magnitude = float(magnitude) - self.cutout_const = float(cutout_const) - self.translate_const = float(translate_const) - self.prob_to_apply = ( - float(prob_to_apply) if prob_to_apply is not None else None) - self.available_ops = [ - 'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize', 'Solarize', - 'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX', 'ShearY', - 'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd' - ] - self.magnitude_std = magnitude_std - if exclude_ops: - self.available_ops = [ - op for op in self.available_ops if op not in exclude_ops - ] - - @classmethod - def build_for_detection(cls, - num_layers: int = 2, - magnitude: float = 10., - cutout_const: float = 40., - translate_const: float = 100., - magnitude_std: float = 0.0, - prob_to_apply: Optional[float] = None, - exclude_ops: Optional[List[str]] = None): - """Builds a RandAugment that modifies bboxes for geometric transforms.""" - augmenter = cls( - num_layers=num_layers, - magnitude=magnitude, - cutout_const=cutout_const, - translate_const=translate_const, - magnitude_std=magnitude_std, - prob_to_apply=prob_to_apply, - exclude_ops=exclude_ops) - box_aware_ops_by_base_name = { - 'Rotate': 'Rotate_BBox', - 'ShearX': 'ShearX_BBox', - 'ShearY': 'ShearY_BBox', - 'TranslateX': 'TranslateX_BBox', - 'TranslateY': 'TranslateY_BBox', - } - augmenter.available_ops = [ - box_aware_ops_by_base_name.get(op_name) or op_name - for op_name in augmenter.available_ops - ] - return augmenter - - def _distort_common( - self, - image: tf.Tensor, - bboxes: Optional[tf.Tensor] = None - ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]: - """Distorts the image and optionally bounding boxes.""" - input_image_type = image.dtype - - if input_image_type != tf.uint8: - image = tf.clip_by_value(image, 0.0, 255.0) - image = tf.cast(image, dtype=tf.uint8) - - replace_value = [128] * 3 - min_prob, max_prob = 0.2, 0.8 - - aug_image = image - aug_bboxes = bboxes - - for _ in range(self.num_layers): - op_to_select = tf.random.uniform([], - maxval=len(self.available_ops) + 1, - dtype=tf.int32) - - branch_fns = [] - for (i, op_name) in enumerate(self.available_ops): - prob = tf.random.uniform([], - minval=min_prob, - maxval=max_prob, - dtype=tf.float32) - func, _, args = _parse_policy_info(op_name, prob, self.magnitude, - replace_value, self.cutout_const, - self.translate_const, - self.magnitude_std) - branch_fns.append(( - i, - # pylint:disable=g-long-lambda - lambda selected_func=func, selected_args=args: selected_func( - image, bboxes, *selected_args))) - # pylint:enable=g-long-lambda - - aug_image, aug_bboxes = tf.switch_case( - branch_index=op_to_select, - branch_fns=branch_fns, - default=lambda: (tf.identity(image), _maybe_identity(bboxes))) - - if self.prob_to_apply is not None: - aug_image, aug_bboxes = tf.cond( - tf.random.uniform(shape=[], dtype=tf.float32) < self.prob_to_apply, - lambda: (tf.identity(aug_image), _maybe_identity(aug_bboxes)), - lambda: (tf.identity(image), _maybe_identity(bboxes))) - image = aug_image - bboxes = aug_bboxes - - image = tf.cast(image, dtype=input_image_type) - return image, bboxes - - def distort(self, image: tf.Tensor) -> tf.Tensor: - """See base class.""" - image, _ = self._distort_common(image) - return image - - def distort_with_boxes(self, image: tf.Tensor, - bboxes: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: - """See base class.""" - image, bboxes = self._distort_common(image, bboxes) - return image, bboxes - - -class 
RandomErasing(ImageAugment): - """Applies RandomErasing to a single image. - - Reference: https://arxiv.org/abs/1708.04896 - - Implementation is inspired by https://github.com/rwightman/pytorch-image-models - """ - - def __init__(self, - probability: float = 0.25, - min_area: float = 0.02, - max_area: float = 1 / 3, - min_aspect: float = 0.3, - max_aspect=None, - min_count=1, - max_count=1, - trials=10): - """Applies RandomErasing to a single image. - - Args: - probability (float, optional): Probability of augmenting the image. - Defaults to 0.25. - min_area (float, optional): Minimum area of the random erasing rectangle. - Defaults to 0.02. - max_area (float, optional): Maximum area of the random erasing rectangle. - Defaults to 1/3. - min_aspect (float, optional): Minimum aspect ratio of the random erasing - rectangle. Defaults to 0.3. - max_aspect (float, optional): Maximum aspect ratio of the random erasing - rectangle. Defaults to None. - min_count (int, optional): Minimum number of erased rectangles. Defaults - to 1. - max_count (int, optional): Maximum number of erased rectangles. Defaults - to 1. - trials (int, optional): Maximum number of trials to randomly sample a - rectangle that fulfills the constraints. Defaults to 10. - """ - self._probability = probability - self._min_area = float(min_area) - self._max_area = float(max_area) - self._min_log_aspect = math.log(min_aspect) - self._max_log_aspect = math.log(max_aspect or 1 / min_aspect) - self._min_count = min_count - self._max_count = max_count - self._trials = trials - - def distort(self, image: tf.Tensor) -> tf.Tensor: - """Applies RandomErasing to a single `image`. - - Args: - image (tf.Tensor): Of shape [height, width, 3] representing an image. - - Returns: - tf.Tensor: The augmented version of `image`.
- """ - uniform_random = tf.random.uniform(shape=[], minval=0., maxval=1.0) - mirror_cond = tf.less(uniform_random, self._probability) - image = tf.cond(mirror_cond, lambda: self._erase(image), lambda: image) - return image - - @tf.function - def _erase(self, image: tf.Tensor) -> tf.Tensor: - """Erase an area.""" - if self._min_count == self._max_count: - count = self._min_count - else: - count = tf.random.uniform( - shape=[], - minval=int(self._min_count), - maxval=int(self._max_count - self._min_count + 1), - dtype=tf.int32) - - image_height = tf.shape(image)[0] - image_width = tf.shape(image)[1] - area = tf.cast(image_width * image_height, tf.float32) - - for _ in range(count): - # Work around since break is not supported in tf.function - is_trial_successfull = False - for _ in range(self._trials): - if not is_trial_successfull: - erase_area = tf.random.uniform( - shape=[], - minval=area * self._min_area, - maxval=area * self._max_area) - aspect_ratio = tf.math.exp( - tf.random.uniform( - shape=[], - minval=self._min_log_aspect, - maxval=self._max_log_aspect)) - - half_height = tf.cast( - tf.math.round(tf.math.sqrt(erase_area * aspect_ratio) / 2), - dtype=tf.int32) - half_width = tf.cast( - tf.math.round(tf.math.sqrt(erase_area / aspect_ratio) / 2), - dtype=tf.int32) - - if 2 * half_height < image_height and 2 * half_width < image_width: - center_height = tf.random.uniform( - shape=[], - minval=0, - maxval=int(image_height - 2 * half_height), - dtype=tf.int32) - center_width = tf.random.uniform( - shape=[], - minval=0, - maxval=int(image_width - 2 * half_width), - dtype=tf.int32) - - image = _fill_rectangle( - image, - center_width, - center_height, - half_width, - half_height, - replace=None) - - is_trial_successfull = True - - return image - - -class MixupAndCutmix: - """Applies Mixup and/or Cutmix to a batch of images. - - - Mixup: https://arxiv.org/abs/1710.09412 - - Cutmix: https://arxiv.org/abs/1905.04899 - - Implementaion is inspired by https://github.com/rwightman/pytorch-image-models - """ - - def __init__(self, - mixup_alpha: float = .8, - cutmix_alpha: float = 1., - prob: float = 1.0, - switch_prob: float = 0.5, - label_smoothing: float = 0.1, - num_classes: int = 1001): - """Applies Mixup and/or Cutmix to a batch of images. - - Args: - mixup_alpha (float, optional): For drawing a random lambda (`lam`) from a - beta distribution (for each image). If zero Mixup is deactivated. - Defaults to .8. - cutmix_alpha (float, optional): For drawing a random lambda (`lam`) from a - beta distribution (for each image). If zero Cutmix is deactivated. - Defaults to 1.. - prob (float, optional): Of augmenting the batch. Defaults to 1.0. - switch_prob (float, optional): Probability of applying Cutmix for the - batch. Defaults to 0.5. - label_smoothing (float, optional): Constant for label smoothing. Defaults - to 0.1. - num_classes (int, optional): Number of classes. Defaults to 1001. 
- """ - self.mixup_alpha = mixup_alpha - self.cutmix_alpha = cutmix_alpha - self.mix_prob = prob - self.switch_prob = switch_prob - self.label_smoothing = label_smoothing - self.num_classes = num_classes - self.mode = 'batch' - self.mixup_enabled = True - - if self.mixup_alpha and not self.cutmix_alpha: - self.switch_prob = -1 - elif not self.mixup_alpha and self.cutmix_alpha: - self.switch_prob = 1 - - def __call__(self, images: tf.Tensor, - labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: - return self.distort(images, labels) - - def distort(self, images: tf.Tensor, - labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: - """Applies Mixup and/or Cutmix to batch of images and transforms labels. - - Args: - images (tf.Tensor): Of shape [batch_size,height, width, 3] representing a - batch of image. - labels (tf.Tensor): Of shape [batch_size, ] representing the class id for - each image of the batch. - - Returns: - Tuple[tf.Tensor, tf.Tensor]: The augmented version of `image` and - `labels`. - """ - augment_cond = tf.less( - tf.random.uniform(shape=[], minval=0., maxval=1.0), self.mix_prob) - # pylint: disable=g-long-lambda - augment_a = lambda: self._update_labels(*tf.cond( - tf.less( - tf.random.uniform(shape=[], minval=0., maxval=1.0), self.switch_prob - ), lambda: self._cutmix(images, labels), lambda: self._mixup( - images, labels))) - augment_b = lambda: (images, self._smooth_labels(labels)) - # pylint: enable=g-long-lambda - - return tf.cond(augment_cond, augment_a, augment_b) - - @staticmethod - def _sample_from_beta(alpha, beta, shape): - sample_alpha = tf.random.gamma(shape, 1., beta=alpha) - sample_beta = tf.random.gamma(shape, 1., beta=beta) - return sample_alpha / (sample_alpha + sample_beta) - - def _cutmix(self, images: tf.Tensor, - labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: - """Apply cutmix.""" - lam = MixupAndCutmix._sample_from_beta(self.cutmix_alpha, self.cutmix_alpha, - tf.shape(labels)) - - ratio = tf.math.sqrt(1 - lam) - - batch_size = tf.shape(images)[0] - image_height, image_width = tf.shape(images)[1], tf.shape(images)[2] - - cut_height = tf.cast( - ratio * tf.cast(image_height, dtype=tf.float32), dtype=tf.int32) - cut_width = tf.cast( - ratio * tf.cast(image_height, dtype=tf.float32), dtype=tf.int32) - - random_center_height = tf.random.uniform( - shape=[batch_size], minval=0, maxval=image_height, dtype=tf.int32) - random_center_width = tf.random.uniform( - shape=[batch_size], minval=0, maxval=image_width, dtype=tf.int32) - - bbox_area = cut_height * cut_width - lam = 1. - bbox_area / (image_height * image_width) - lam = tf.cast(lam, dtype=tf.float32) - - images = tf.map_fn( - lambda x: _fill_rectangle(*x), - (images, random_center_width, random_center_height, cut_width // 2, - cut_height // 2, tf.reverse(images, [0])), - dtype=( - images.dtype, tf.int32, tf.int32, tf.int32, tf.int32, images.dtype), - fn_output_signature=tf.TensorSpec(images.shape[1:], dtype=images.dtype)) - - return images, labels, lam - - def _mixup(self, images: tf.Tensor, - labels: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: - lam = MixupAndCutmix._sample_from_beta(self.mixup_alpha, self.mixup_alpha, - tf.shape(labels)) - lam = tf.reshape(lam, [-1, 1, 1, 1]) - lam_cast = tf.cast(lam, dtype=images.dtype) - images = lam_cast * images + (1. - lam_cast) * tf.reverse(images, [0]) - - return images, labels, tf.squeeze(lam) - - def _smooth_labels(self, labels: tf.Tensor) -> tf.Tensor: - off_value = self.label_smoothing / self.num_classes - on_value = 1. 
- self.label_smoothing + off_value - - smooth_labels = tf.one_hot( - labels, self.num_classes, on_value=on_value, off_value=off_value) - return smooth_labels - - def _update_labels(self, images: tf.Tensor, labels: tf.Tensor, - lam: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: - labels_1 = self._smooth_labels(labels) - labels_2 = tf.reverse(labels_1, [0]) - - lam = tf.reshape(lam, [-1, 1]) - labels = lam * labels_1 + (1. - lam) * labels_2 - - return images, labels diff --git a/official/vision/beta/ops/augment_test.py b/official/vision/beta/ops/augment_test.py deleted file mode 100644 index c9e77972c..000000000 --- a/official/vision/beta/ops/augment_test.py +++ /dev/null @@ -1,440 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for autoaugment.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import random -from absl.testing import parameterized - -import tensorflow as tf - -from official.vision.beta.ops import augment - - -def get_dtype_test_cases(): - return [ - ('uint8', tf.uint8), - ('int32', tf.int32), - ('float16', tf.float16), - ('float32', tf.float32), - ] - - -@parameterized.named_parameters(get_dtype_test_cases()) -class TransformsTest(parameterized.TestCase, tf.test.TestCase): - """Basic tests for fundamental transformations.""" - - def test_to_from_4d(self, dtype): - for shape in [(10, 10), (10, 10, 10), (10, 10, 10, 10)]: - original_ndims = len(shape) - image = tf.zeros(shape, dtype=dtype) - image_4d = augment.to_4d(image) - self.assertEqual(4, tf.rank(image_4d)) - self.assertAllEqual(image, augment.from_4d(image_4d, original_ndims)) - - def test_transform(self, dtype): - image = tf.constant([[1, 2], [3, 4]], dtype=dtype) - self.assertAllEqual( - augment.transform(image, transforms=[1] * 8), [[4, 4], [4, 4]]) - - def test_translate(self, dtype): - image = tf.constant( - [[1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], dtype=dtype) - translations = [-1, -1] - translated = augment.translate(image=image, translations=translations) - expected = [[1, 0, 1, 1], [0, 1, 0, 0], [1, 0, 1, 1], [1, 0, 1, 1]] - self.assertAllEqual(translated, expected) - - def test_translate_shapes(self, dtype): - translation = [0, 0] - for shape in [(3, 3), (5, 5), (224, 224, 3)]: - image = tf.zeros(shape, dtype=dtype) - self.assertAllEqual(image, augment.translate(image, translation)) - - def test_translate_invalid_translation(self, dtype): - image = tf.zeros((1, 1), dtype=dtype) - invalid_translation = [[[1, 1]]] - with self.assertRaisesRegex(TypeError, 'rank 1 or 2'): - _ = augment.translate(image, invalid_translation) - - def test_rotate(self, dtype): - image = tf.reshape(tf.cast(tf.range(9), dtype), (3, 3)) - rotation = 90. - transformed = augment.rotate(image=image, degrees=rotation) - expected = [[2, 5, 8], [1, 4, 7], [0, 3, 6]] - self.assertAllEqual(transformed, expected) - - def test_rotate_shapes(self, dtype): - degrees = 0. 
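For reference, the `MixupAndCutmix` op removed above was driven as a batch-level transform: sparse labels go in, mixed images and dense smoothed labels come out. A minimal usage sketch follows (the import path is the one used by the deleted tests and stops resolving once this change lands; the shapes and hyperparameters are illustrative):

```python
import tensorflow as tf

from official.vision.beta.ops import augment  # removed by this change

augmenter = augment.MixupAndCutmix(
    mixup_alpha=0.8, cutmix_alpha=1.0, label_smoothing=0.1, num_classes=1000)
images = tf.random.normal([8, 224, 224, 3])
labels = tf.range(8)  # sparse class ids, one per image
aug_images, aug_labels = augmenter.distort(images, labels)
# aug_labels has shape [8, 1000]: smoothed one-hot targets blended with the
# sampled lam; off value = 0.1 / 1000, on value = 1 - 0.1 + 0.1 / 1000.
```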
- for shape in [(3, 3), (5, 5), (224, 224, 3)]: - image = tf.zeros(shape, dtype=dtype) - self.assertAllEqual(image, augment.rotate(image, degrees)) - - -class AutoaugmentTest(tf.test.TestCase, parameterized.TestCase): - - AVAILABLE_POLICIES = [ - 'v0', - 'test', - 'simple', - 'reduced_cifar10', - 'svhn', - 'reduced_imagenet', - 'detection_v0', - ] - - def test_autoaugment(self): - """Smoke test to be sure there are no syntax errors.""" - image = tf.zeros((224, 224, 3), dtype=tf.uint8) - - for policy in self.AVAILABLE_POLICIES: - augmenter = augment.AutoAugment(augmentation_name=policy) - aug_image = augmenter.distort(image) - - self.assertEqual((224, 224, 3), aug_image.shape) - - def test_autoaugment_with_bboxes(self): - """Smoke test to be sure there are no syntax errors with bboxes.""" - image = tf.zeros((224, 224, 3), dtype=tf.uint8) - bboxes = tf.ones((2, 4), dtype=tf.float32) - - for policy in self.AVAILABLE_POLICIES: - augmenter = augment.AutoAugment(augmentation_name=policy) - aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes) - - self.assertEqual((224, 224, 3), aug_image.shape) - self.assertEqual((2, 4), aug_bboxes.shape) - - def test_randaug(self): - """Smoke test to be sure there are no syntax errors.""" - image = tf.zeros((224, 224, 3), dtype=tf.uint8) - - augmenter = augment.RandAugment() - aug_image = augmenter.distort(image) - - self.assertEqual((224, 224, 3), aug_image.shape) - - def test_randaug_with_bboxes(self): - """Smoke test to be sure there are no syntax errors with bboxes.""" - image = tf.zeros((224, 224, 3), dtype=tf.uint8) - bboxes = tf.ones((2, 4), dtype=tf.float32) - - augmenter = augment.RandAugment() - aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes) - - self.assertEqual((224, 224, 3), aug_image.shape) - self.assertEqual((2, 4), aug_bboxes.shape) - - def test_randaug_build_for_detection(self): - """Smoke test to be sure there are no syntax errors built for detection.""" - image = tf.zeros((224, 224, 3), dtype=tf.uint8) - bboxes = tf.ones((2, 4), dtype=tf.float32) - - augmenter = augment.RandAugment.build_for_detection() - self.assertCountEqual(augmenter.available_ops, [ - 'AutoContrast', 'Equalize', 'Invert', 'Posterize', 'Solarize', 'Color', - 'Contrast', 'Brightness', 'Sharpness', 'Cutout', 'SolarizeAdd', - 'Rotate_BBox', 'ShearX_BBox', 'ShearY_BBox', 'TranslateX_BBox', - 'TranslateY_BBox' - ]) - - aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes) - self.assertEqual((224, 224, 3), aug_image.shape) - self.assertEqual((2, 4), aug_bboxes.shape) - - def test_all_policy_ops(self): - """Smoke test to be sure all augmentation functions can execute.""" - - prob = 1 - magnitude = 10 - replace_value = [128] * 3 - cutout_const = 100 - translate_const = 250 - - image = tf.ones((224, 224, 3), dtype=tf.uint8) - bboxes = None - - for op_name in augment.NAME_TO_FUNC.keys() - augment.REQUIRE_BOXES_FUNCS: - func, _, args = augment._parse_policy_info(op_name, prob, magnitude, - replace_value, cutout_const, - translate_const) - image, bboxes = func(image, bboxes, *args) - - self.assertEqual((224, 224, 3), image.shape) - self.assertIsNone(bboxes) - - def test_all_policy_ops_with_bboxes(self): - """Smoke test to be sure all augmentation functions can execute.""" - - prob = 1 - magnitude = 10 - replace_value = [128] * 3 - cutout_const = 100 - translate_const = 250 - - image = tf.ones((224, 224, 3), dtype=tf.uint8) - bboxes = tf.ones((2, 4), dtype=tf.float32) - - for op_name in augment.NAME_TO_FUNC: - func, _, args = 
augment._parse_policy_info(op_name, prob, magnitude, - replace_value, cutout_const, - translate_const) - image, bboxes = func(image, bboxes, *args) - - self.assertEqual((224, 224, 3), image.shape) - self.assertEqual((2, 4), bboxes.shape) - - def test_autoaugment_video(self): - """Smoke test with video to be sure there are no syntax errors.""" - image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8) - - for policy in self.AVAILABLE_POLICIES: - augmenter = augment.AutoAugment(augmentation_name=policy) - aug_image = augmenter.distort(image) - - self.assertEqual((2, 224, 224, 3), aug_image.shape) - - def test_autoaugment_video_with_boxes(self): - """Smoke test with video to be sure there are no syntax errors.""" - image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8) - bboxes = tf.ones((2, 2, 4), dtype=tf.float32) - - for policy in self.AVAILABLE_POLICIES: - augmenter = augment.AutoAugment(augmentation_name=policy) - aug_image, aug_bboxes = augmenter.distort_with_boxes(image, bboxes) - - self.assertEqual((2, 224, 224, 3), aug_image.shape) - self.assertEqual((2, 2, 4), aug_bboxes.shape) - - def test_randaug_video(self): - """Smoke test with video to be sure there are no syntax errors.""" - image = tf.zeros((2, 224, 224, 3), dtype=tf.uint8) - - augmenter = augment.RandAugment() - aug_image = augmenter.distort(image) - - self.assertEqual((2, 224, 224, 3), aug_image.shape) - - def test_all_policy_ops_video(self): - """Smoke test to be sure all video augmentation functions can execute.""" - - prob = 1 - magnitude = 10 - replace_value = [128] * 3 - cutout_const = 100 - translate_const = 250 - - image = tf.ones((2, 224, 224, 3), dtype=tf.uint8) - bboxes = None - - for op_name in augment.NAME_TO_FUNC.keys() - augment.REQUIRE_BOXES_FUNCS: - func, _, args = augment._parse_policy_info(op_name, prob, magnitude, - replace_value, cutout_const, - translate_const) - image, bboxes = func(image, bboxes, *args) - - self.assertEqual((2, 224, 224, 3), image.shape) - self.assertIsNone(bboxes) - - def test_all_policy_ops_video_with_bboxes(self): - """Smoke test to be sure all video augmentation functions can execute.""" - - prob = 1 - magnitude = 10 - replace_value = [128] * 3 - cutout_const = 100 - translate_const = 250 - - image = tf.ones((2, 224, 224, 3), dtype=tf.uint8) - bboxes = tf.ones((2, 2, 4), dtype=tf.float32) - - for op_name in augment.NAME_TO_FUNC: - func, _, args = augment._parse_policy_info(op_name, prob, magnitude, - replace_value, cutout_const, - translate_const) - if op_name in { - 'Rotate_BBox', - 'ShearX_BBox', - 'ShearY_BBox', - 'TranslateX_BBox', - 'TranslateY_BBox', - 'TranslateY_Only_BBoxes', - }: - with self.assertRaises(ValueError): - func(image, bboxes, *args) - else: - image, bboxes = func(image, bboxes, *args) - - self.assertEqual((2, 224, 224, 3), image.shape) - self.assertEqual((2, 2, 4), bboxes.shape) - - def _generate_test_policy(self): - """Generate a test policy at random.""" - op_list = list(augment.NAME_TO_FUNC.keys()) - size = 6 - prob = [round(random.uniform(0., 1.), 1) for _ in range(size)] - mag = [round(random.uniform(0, 10)) for _ in range(size)] - policy = [] - for i in range(0, size, 2): - policy.append([(op_list[i], prob[i], mag[i]), - (op_list[i + 1], prob[i + 1], mag[i + 1])]) - return policy - - def test_custom_policy(self): - """Test autoaugment with a custom policy.""" - image = tf.zeros((224, 224, 3), dtype=tf.uint8) - augmenter = augment.AutoAugment(policies=self._generate_test_policy()) - aug_image = augmenter.distort(image) - - self.assertEqual((224, 224, 3), 
aug_image.shape) - - @parameterized.named_parameters( - {'testcase_name': '_OutOfRangeProb', - 'sub_policy': ('Equalize', 1.1, 3), 'value': '1.1'}, - {'testcase_name': '_OutOfRangeMag', - 'sub_policy': ('Equalize', 0.9, 11), 'value': '11'}, - ) - def test_invalid_custom_sub_policy(self, sub_policy, value): - """Test autoaugment with out-of-range values in the custom policy.""" - image = tf.zeros((224, 224, 3), dtype=tf.uint8) - policy = self._generate_test_policy() - policy[0][0] = sub_policy - augmenter = augment.AutoAugment(policies=policy) - - with self.assertRaisesRegex( - tf.errors.InvalidArgumentError, - r'Expected \'tf.Tensor\(False, shape=\(\), dtype=bool\)\' to be true. ' - r'Summarized data: ({})'.format(value)): - augmenter.distort(image) - - def test_invalid_custom_policy_ndim(self): - """Test autoaugment with wrong dimension in the custom policy.""" - policy = [[('Equalize', 0.8, 1), ('Shear', 0.8, 4)], - [('TranslateY', 0.6, 3), ('Rotate', 0.9, 3)]] - policy = [[policy]] - - with self.assertRaisesRegex( - ValueError, - r'Expected \(:, :, 3\) but got \(1, 1, 2, 2, 3\).'): - augment.AutoAugment(policies=policy) - - def test_invalid_custom_policy_shape(self): - """Test autoaugment with wrong shape in the custom policy.""" - policy = [[('Equalize', 0.8, 1, 1), ('Shear', 0.8, 4, 1)], - [('TranslateY', 0.6, 3, 1), ('Rotate', 0.9, 3, 1)]] - - with self.assertRaisesRegex( - ValueError, - r'Expected \(:, :, 3\) but got \(2, 2, 4\)'): - augment.AutoAugment(policies=policy) - - def test_invalid_custom_policy_key(self): - """Test autoaugment with invalid key in the custom policy.""" - image = tf.zeros((224, 224, 3), dtype=tf.uint8) - policy = [[('AAAAA', 0.8, 1), ('Shear', 0.8, 4)], - [('TranslateY', 0.6, 3), ('Rotate', 0.9, 3)]] - augmenter = augment.AutoAugment(policies=policy) - - with self.assertRaisesRegex(KeyError, '\'AAAAA\''): - augmenter.distort(image) - - -class RandomErasingTest(tf.test.TestCase, parameterized.TestCase): - - def test_random_erase_replaces_some_pixels(self): - image = tf.zeros((224, 224, 3), dtype=tf.float32) - augmenter = augment.RandomErasing(probability=1., max_count=10) - - aug_image = augmenter.distort(image) - - self.assertEqual((224, 224, 3), aug_image.shape) - self.assertNotEqual(0, tf.reduce_max(aug_image)) - - -@parameterized.named_parameters([ - ('float16_images', tf.float16), - ('bfloat16_images', tf.bfloat16), - ('float32_images', tf.float32), -]) -class MixupAndCutmixTest(parameterized.TestCase, tf.test.TestCase): - - def test_mixup_and_cutmix_smoothes_labels(self, image_dtype): - batch_size = 12 - num_classes = 1000 - label_smoothing = 0.1 - - images = tf.random.normal((batch_size, 224, 224, 3), dtype=image_dtype) - labels = tf.range(batch_size) - augmenter = augment.MixupAndCutmix( - num_classes=num_classes, label_smoothing=label_smoothing) - - aug_images, aug_labels = augmenter.distort(images, labels) - - self.assertEqual(images.shape, aug_images.shape) - self.assertEqual(images.dtype, aug_images.dtype) - self.assertEqual([batch_size, num_classes], aug_labels.shape) - self.assertAllLessEqual(aug_labels, 1. - label_smoothing + - 2. 
/ num_classes) # With tolerance - self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes - - 1e4) # With tolerance - - def test_mixup_changes_image(self, image_dtype): - batch_size = 12 - num_classes = 1000 - label_smoothing = 0.1 - - images = tf.random.normal((batch_size, 224, 224, 3), dtype=image_dtype) - labels = tf.range(batch_size) - augmenter = augment.MixupAndCutmix( - mixup_alpha=1., cutmix_alpha=0., num_classes=num_classes) - - aug_images, aug_labels = augmenter.distort(images, labels) - - self.assertEqual(images.shape, aug_images.shape) - self.assertEqual(images.dtype, aug_images.dtype) - self.assertEqual([batch_size, num_classes], aug_labels.shape) - self.assertAllLessEqual(aug_labels, 1. - label_smoothing + - 2. / num_classes) # With tolerance - self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes - - 1e4) # With tolerance - self.assertFalse(tf.math.reduce_all(images == aug_images)) - - def test_cutmix_changes_image(self, image_dtype): - batch_size = 12 - num_classes = 1000 - label_smoothing = 0.1 - - images = tf.random.normal((batch_size, 224, 224, 3), dtype=image_dtype) - labels = tf.range(batch_size) - augmenter = augment.MixupAndCutmix( - mixup_alpha=0., cutmix_alpha=1., num_classes=num_classes) - - aug_images, aug_labels = augmenter.distort(images, labels) - - self.assertEqual(images.shape, aug_images.shape) - self.assertEqual(images.dtype, aug_images.dtype) - self.assertEqual([batch_size, num_classes], aug_labels.shape) - self.assertAllLessEqual(aug_labels, 1. - label_smoothing + - 2. / num_classes) # With tolerance - self.assertAllGreaterEqual(aug_labels, label_smoothing / num_classes - - 1e4) # With tolerance - self.assertFalse(tf.math.reduce_all(images == aug_images)) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/ops/box_matcher.py b/official/vision/beta/ops/box_matcher.py deleted file mode 100644 index 9c38ad7fe..000000000 --- a/official/vision/beta/ops/box_matcher.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -"""Box matcher implementation.""" - - -import tensorflow as tf - - -class BoxMatcher: - """Matcher based on highest value. - - This class computes matches from a similarity matrix. Each column is matched - to a single row. - - To support object detection target assignment this class enables setting both - positive_threshold (upper threshold) and negative_threshold (lower thresholds) - defining three categories of similarity which define whether examples are - positive, negative, or ignored, for example: - (1) thresholds=[negative_threshold, positive_threshold], and - indicators=[negative_value, ignore_value, positive_value]: The similarity - metrics below negative_threshold will be assigned with negative_value, - the metrics between negative_threshold and positive_threshold will be - assigned ignore_value, and the metrics above positive_threshold will be - assigned positive_value. 
- (2) thresholds=[negative_threshold, positive_threshold], and - indicators=[ignore_value, negative_value, positive_value]: The similarity - metric below negative_threshold will be assigned with ignore_value, - the metrics between negative_threshold and positive_threshold will be - assigned negative_value, and the metrics above positive_threshold will be - assigned positive_value. - """ - - def __init__(self, thresholds, indicators, force_match_for_each_col=False): - """Construct BoxMatcher. - - Args: - thresholds: A list of thresholds to classify boxes into - different buckets. The list needs to be sorted, and will be prepended - with -Inf and appended with +Inf. - indicators: A list of values to assign for each bucket. len(`indicators`) - must equal to len(`thresholds`) + 1. - force_match_for_each_col: If True, ensures that each column is matched to - at least one row (which is not guaranteed otherwise if the - positive_threshold is high). Defaults to False. If True, all force - matched row will be assigned to `indicators[-1]`. - - Raises: - ValueError: If `threshold` not sorted, - or len(indicators) != len(threshold) + 1 - """ - if not all([lo <= hi for (lo, hi) in zip(thresholds[:-1], thresholds[1:])]): - raise ValueError('`threshold` must be sorted, got {}'.format(thresholds)) - self.indicators = indicators - if len(indicators) != len(thresholds) + 1: - raise ValueError('len(`indicators`) must be len(`thresholds`) + 1, got ' - 'indicators {}, thresholds {}'.format( - indicators, thresholds)) - thresholds = thresholds[:] - thresholds.insert(0, -float('inf')) - thresholds.append(float('inf')) - self.thresholds = thresholds - self._force_match_for_each_col = force_match_for_each_col - - def __call__(self, similarity_matrix): - """Tries to match each column of the similarity matrix to a row. - - Args: - similarity_matrix: A float tensor of shape [N, M] representing any - similarity metric. - - Returns: - A integer tensor of shape [N] with corresponding match indices for each - of M columns, for positive match, the match result will be the - corresponding row index, for negative match, the match will be - `negative_value`, for ignored match, the match result will be - `ignore_value`. - """ - squeeze_result = False - if len(similarity_matrix.shape) == 2: - squeeze_result = True - similarity_matrix = tf.expand_dims(similarity_matrix, axis=0) - - static_shape = similarity_matrix.shape.as_list() - num_rows = static_shape[1] or tf.shape(similarity_matrix)[1] - batch_size = static_shape[0] or tf.shape(similarity_matrix)[0] - - def _match_when_rows_are_empty(): - """Performs matching when the rows of similarity matrix are empty. - - When the rows are empty, all detections are false positives. So we return - a tensor of -1's to indicate that the columns do not match to any rows. - - Returns: - matches: int32 tensor indicating the row each column matches to. - """ - with tf.name_scope('empty_gt_boxes'): - matches = tf.zeros([batch_size, num_rows], dtype=tf.int32) - match_labels = -tf.ones([batch_size, num_rows], dtype=tf.int32) - return matches, match_labels - - def _match_when_rows_are_non_empty(): - """Performs matching when the rows of similarity matrix are non empty. - - Returns: - matches: int32 tensor indicating the row each column matches to. 
- """ - # Matches for each column - with tf.name_scope('non_empty_gt_boxes'): - matches = tf.argmax(similarity_matrix, axis=-1, output_type=tf.int32) - - # Get logical indices of ignored and unmatched columns as tf.int64 - matched_vals = tf.reduce_max(similarity_matrix, axis=-1) - matched_indicators = tf.zeros([batch_size, num_rows], tf.int32) - - match_dtype = matched_vals.dtype - for (ind, low, high) in zip(self.indicators, self.thresholds[:-1], - self.thresholds[1:]): - low_threshold = tf.cast(low, match_dtype) - high_threshold = tf.cast(high, match_dtype) - mask = tf.logical_and( - tf.greater_equal(matched_vals, low_threshold), - tf.less(matched_vals, high_threshold)) - matched_indicators = self._set_values_using_indicator( - matched_indicators, mask, ind) - - if self._force_match_for_each_col: - # [batch_size, M], for each col (groundtruth_box), find the best - # matching row (anchor). - force_match_column_ids = tf.argmax( - input=similarity_matrix, axis=1, output_type=tf.int32) - # [batch_size, M, N] - force_match_column_indicators = tf.one_hot( - force_match_column_ids, depth=num_rows) - # [batch_size, N], for each row (anchor), find the largest column - # index for groundtruth box - force_match_row_ids = tf.argmax( - input=force_match_column_indicators, axis=1, output_type=tf.int32) - # [batch_size, N] - force_match_column_mask = tf.cast( - tf.reduce_max(force_match_column_indicators, axis=1), - tf.bool) - # [batch_size, N] - final_matches = tf.where(force_match_column_mask, force_match_row_ids, - matches) - final_matched_indicators = tf.where( - force_match_column_mask, self.indicators[-1] * - tf.ones([batch_size, num_rows], dtype=tf.int32), - matched_indicators) - return final_matches, final_matched_indicators - else: - return matches, matched_indicators - - num_gt_boxes = similarity_matrix.shape.as_list()[-1] or tf.shape( - similarity_matrix)[-1] - result_match, result_matched_indicators = tf.cond( - pred=tf.greater(num_gt_boxes, 0), - true_fn=_match_when_rows_are_non_empty, - false_fn=_match_when_rows_are_empty) - - if squeeze_result: - result_match = tf.squeeze(result_match, axis=0) - result_matched_indicators = tf.squeeze(result_matched_indicators, axis=0) - - return result_match, result_matched_indicators - - def _set_values_using_indicator(self, x, indicator, val): - """Set the indicated fields of x to val. - - Args: - x: tensor. - indicator: boolean with same shape as x. - val: scalar with value to set. - - Returns: - modified tensor. - """ - indicator = tf.cast(indicator, x.dtype) - return tf.add(tf.multiply(x, 1 - indicator), val * indicator) diff --git a/official/vision/beta/ops/box_matcher_test.py b/official/vision/beta/ops/box_matcher_test.py deleted file mode 100644 index 0048b0e47..000000000 --- a/official/vision/beta/ops/box_matcher_test.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for box_matcher.py.""" - -import tensorflow as tf - -from official.vision.beta.ops import box_matcher - - -class BoxMatcherTest(tf.test.TestCase): - - def test_box_matcher_unbatched(self): - sim_matrix = tf.constant( - [[0.04, 0, 0, 0], - [0, 0, 1., 0]], - dtype=tf.float32) - - fg_threshold = 0.5 - bg_thresh_hi = 0.2 - bg_thresh_lo = 0.0 - - matcher = box_matcher.BoxMatcher( - thresholds=[bg_thresh_lo, bg_thresh_hi, fg_threshold], - indicators=[-3, -2, -1, 1]) - match_indices, match_indicators = matcher(sim_matrix) - positive_matches = tf.greater_equal(match_indicators, 0) - negative_matches = tf.equal(match_indicators, -2) - - self.assertAllEqual( - positive_matches.numpy(), [False, True]) - self.assertAllEqual( - negative_matches.numpy(), [True, False]) - self.assertAllEqual( - match_indices.numpy(), [0, 2]) - self.assertAllEqual( - match_indicators.numpy(), [-2, 1]) - - def test_box_matcher_batched(self): - sim_matrix = tf.constant( - [[[0.04, 0, 0, 0], - [0, 0, 1., 0]]], - dtype=tf.float32) - - fg_threshold = 0.5 - bg_thresh_hi = 0.2 - bg_thresh_lo = 0.0 - - matcher = box_matcher.BoxMatcher( - thresholds=[bg_thresh_lo, bg_thresh_hi, fg_threshold], - indicators=[-3, -2, -1, 1]) - match_indices, match_indicators = matcher(sim_matrix) - positive_matches = tf.greater_equal(match_indicators, 0) - negative_matches = tf.equal(match_indicators, -2) - - self.assertAllEqual( - positive_matches.numpy(), [[False, True]]) - self.assertAllEqual( - negative_matches.numpy(), [[True, False]]) - self.assertAllEqual( - match_indices.numpy(), [[0, 2]]) - self.assertAllEqual( - match_indicators.numpy(), [[-2, 1]]) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/ops/box_ops.py b/official/vision/beta/ops/box_ops.py deleted file mode 100644 index 2868881f8..000000000 --- a/official/vision/beta/ops/box_ops.py +++ /dev/null @@ -1,763 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Box related ops.""" - -# Import libraries -import numpy as np -import tensorflow as tf - - -EPSILON = 1e-8 -BBOX_XFORM_CLIP = np.log(1000. / 16.) - - -def yxyx_to_xywh(boxes): - """Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height. - - Args: - boxes: a numpy array whose last dimension is 4 representing the coordinates - of boxes in ymin, xmin, ymax, xmax order. - - Returns: - boxes: a numpy array whose shape is the same as `boxes` in new format. - - Raises: - ValueError: If the last dimension of boxes is not 4. - """ - if boxes.shape[-1] != 4: - raise ValueError( - 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) - - boxes_ymin = boxes[..., 0] - boxes_xmin = boxes[..., 1] - boxes_width = boxes[..., 3] - boxes[..., 1] - boxes_height = boxes[..., 2] - boxes[..., 0] - new_boxes = np.stack( - [boxes_xmin, boxes_ymin, boxes_width, boxes_height], axis=-1) - - return new_boxes - - -def yxyx_to_cycxhw(boxes): - """Converts box corner coordinates to center plus height and width terms. 
- - Args: - boxes: a `Tensor` with last dimension of 4, representing the coordinates of - boxes in ymin, xmin, ymax, xmax order. - - Returns: - boxes: a `Tensor` with the same shape as the inputted boxes, in the format - of cy, cx, height, width. - - Raises: - ValueError: if the last dimension of boxes is not 4. - """ - if boxes.shape[-1] != 4: - raise ValueError('Last dimension of boxes must be 4 but is {:d}'.format( - boxes.shape[-1])) - - boxes_ycenter = (boxes[..., 0] + boxes[..., 2]) / 2 - boxes_xcenter = (boxes[..., 1] + boxes[..., 3]) / 2 - boxes_height = boxes[..., 2] - boxes[..., 0] - boxes_width = boxes[..., 3] - boxes[..., 1] - - new_boxes = tf.stack( - [boxes_ycenter, boxes_xcenter, boxes_height, boxes_width], axis=-1) - return new_boxes - - -def cycxhw_to_yxyx(boxes): - """Converts box center coordinates plus height and width terms to corner. - - Args: - boxes: a numpy array whose last dimension is 4 representing the coordinates - of boxes in cy, cx, height, width order. - - Returns: - boxes: a numpy array whose shape is the same as `boxes` in new format. - - Raises: - ValueError: If the last dimension of boxes is not 4. - """ - if boxes.shape[-1] != 4: - raise ValueError( - 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) - - boxes_ymin = boxes[..., 0] - boxes[..., 2] / 2 - boxes_xmin = boxes[..., 1] - boxes[..., 3] / 2 - boxes_ymax = boxes[..., 0] + boxes[..., 2] / 2 - boxes_xmax = boxes[..., 1] + boxes[..., 3] / 2 - new_boxes = tf.stack([ - boxes_ymin, boxes_xmin, boxes_ymax, boxes_xmax], axis=-1) - return new_boxes - - -def jitter_boxes(boxes, noise_scale=0.025): - """Jitter the box coordinates by some noise distribution. - - Args: - boxes: a tensor whose last dimension is 4 representing the coordinates of - boxes in ymin, xmin, ymax, xmax order. - noise_scale: a python float which specifies the magnitude of noise. The rule - of thumb is to set this between (0, 0.1]. The default value is found to - mimic the noisy detections best empirically. - - Returns: - jittered_boxes: a tensor whose shape is the same as `boxes` representing - the jittered boxes. - - Raises: - ValueError: If the last dimension of boxes is not 4. - """ - if boxes.shape[-1] != 4: - raise ValueError( - 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) - - with tf.name_scope('jitter_boxes'): - bbox_jitters = tf.random.normal(tf.shape(boxes), stddev=noise_scale) - ymin = boxes[..., 0:1] - xmin = boxes[..., 1:2] - ymax = boxes[..., 2:3] - xmax = boxes[..., 3:4] - width = xmax - xmin - height = ymax - ymin - new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width - new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height - new_width = width * tf.math.exp(bbox_jitters[..., 2:3]) - new_height = height * tf.math.exp(bbox_jitters[..., 3:4]) - jittered_boxes = tf.concat( - [new_center_y - new_height * 0.5, new_center_x - new_width * 0.5, - new_center_y + new_height * 0.5, new_center_x + new_width * 0.5], - axis=-1) - - return jittered_boxes - - -def normalize_boxes(boxes, image_shape): - """Converts boxes to the normalized coordinates. - - Args: - boxes: a tensor whose last dimension is 4 representing the coordinates - of boxes in ymin, xmin, ymax, xmax order. - image_shape: a list of two integers, a two-element vector or a tensor such - that all but the last dimensions are `broadcastable` to `boxes`. The last - dimension is 2, which represents [height, width]. 
- - Returns: - normalized_boxes: a tensor whose shape is the same as `boxes` representing - the normalized boxes. - - Raises: - ValueError: If the last dimension of boxes is not 4. - """ - if boxes.shape[-1] != 4: - raise ValueError( - 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) - - with tf.name_scope('normalize_boxes'): - if isinstance(image_shape, list) or isinstance(image_shape, tuple): - height, width = image_shape - else: - image_shape = tf.cast(image_shape, dtype=boxes.dtype) - height = image_shape[..., 0:1] - width = image_shape[..., 1:2] - - ymin = boxes[..., 0:1] / height - xmin = boxes[..., 1:2] / width - ymax = boxes[..., 2:3] / height - xmax = boxes[..., 3:4] / width - - normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1) - return normalized_boxes - - -def denormalize_boxes(boxes, image_shape): - """Converts boxes normalized by [height, width] to pixel coordinates. - - Args: - boxes: a tensor whose last dimension is 4 representing the coordinates - of boxes in ymin, xmin, ymax, xmax order. - image_shape: a list of two integers, a two-element vector or a tensor such - that all but the last dimensions are `broadcastable` to `boxes`. The last - dimension is 2, which represents [height, width]. - - Returns: - denormalized_boxes: a tensor whose shape is the same as `boxes` representing - the denormalized boxes. - - Raises: - ValueError: If the last dimension of boxes is not 4. - """ - with tf.name_scope('denormalize_boxes'): - if isinstance(image_shape, list) or isinstance(image_shape, tuple): - height, width = image_shape - else: - image_shape = tf.cast(image_shape, dtype=boxes.dtype) - height, width = tf.split(image_shape, 2, axis=-1) - - ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1) - ymin = ymin * height - xmin = xmin * width - ymax = ymax * height - xmax = xmax * width - - denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1) - return denormalized_boxes - - -def clip_boxes(boxes, image_shape): - """Clips boxes to image boundaries. - - Args: - boxes: a tensor whose last dimension is 4 representing the coordinates - of boxes in ymin, xmin, ymax, xmax order. - image_shape: a list of two integers, a two-element vector or a tensor such - that all but the last dimensions are `broadcastable` to `boxes`. The last - dimension is 2, which represents [height, width]. - - Returns: - clipped_boxes: a tensor whose shape is the same as `boxes` representing the - clipped boxes. - - Raises: - ValueError: If the last dimension of boxes is not 4. - """ - if boxes.shape[-1] != 4: - raise ValueError( - 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) - - with tf.name_scope('clip_boxes'): - if isinstance(image_shape, list) or isinstance(image_shape, tuple): - height, width = image_shape - max_length = [height, width, height, width] - else: - image_shape = tf.cast(image_shape, dtype=boxes.dtype) - height, width = tf.unstack(image_shape, axis=-1) - max_length = tf.stack([height, width, height, width], axis=-1) - - clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0) - return clipped_boxes - - -def compute_outer_boxes(boxes, image_shape, scale=1.0): - """Compute outer box encloses an object with a margin. - - Args: - boxes: a tensor whose last dimension is 4 representing the coordinates of - boxes in ymin, xmin, ymax, xmax order. - image_shape: a list of two integers, a two-element vector or a tensor such - that all but the last dimensions are `broadcastable` to `boxes`. 
The last - dimension is 2, which represents [height, width]. - scale: a float number specifying the scale of output outer boxes to input - `boxes`. - - Returns: - outer_boxes: a tensor whose shape is the same as `boxes` representing the - outer boxes. - """ - if scale < 1.0: - raise ValueError( - 'scale is {}, but outer box scale must be greater than 1.0.'.format( - scale)) - centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0 - centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0 - box_height = (boxes[..., 2] - boxes[..., 0]) * scale - box_width = (boxes[..., 3] - boxes[..., 1]) * scale - outer_boxes = tf.stack( - [centers_y - box_height / 2.0, centers_x - box_width / 2.0, - centers_y + box_height / 2.0, centers_x + box_width / 2.0], - axis=1) - outer_boxes = clip_boxes(outer_boxes, image_shape) - return outer_boxes - - -def encode_boxes(boxes, anchors, weights=None): - """Encode boxes to targets. - - Args: - boxes: a tensor whose last dimension is 4 representing the coordinates - of boxes in ymin, xmin, ymax, xmax order. - anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`, - representing the coordinates of anchors in ymin, xmin, ymax, xmax order. - weights: None or a list of four float numbers used to scale coordinates. - - Returns: - encoded_boxes: a tensor whose shape is the same as `boxes` representing the - encoded box targets. - - Raises: - ValueError: If the last dimension of boxes is not 4. - """ - if boxes.shape[-1] != 4: - raise ValueError( - 'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1])) - - with tf.name_scope('encode_boxes'): - boxes = tf.cast(boxes, dtype=anchors.dtype) - ymin = boxes[..., 0:1] - xmin = boxes[..., 1:2] - ymax = boxes[..., 2:3] - xmax = boxes[..., 3:4] - box_h = ymax - ymin - box_w = xmax - xmin - box_yc = ymin + 0.5 * box_h - box_xc = xmin + 0.5 * box_w - - anchor_ymin = anchors[..., 0:1] - anchor_xmin = anchors[..., 1:2] - anchor_ymax = anchors[..., 2:3] - anchor_xmax = anchors[..., 3:4] - anchor_h = anchor_ymax - anchor_ymin - anchor_w = anchor_xmax - anchor_xmin - anchor_yc = anchor_ymin + 0.5 * anchor_h - anchor_xc = anchor_xmin + 0.5 * anchor_w - - encoded_dy = (box_yc - anchor_yc) / anchor_h - encoded_dx = (box_xc - anchor_xc) / anchor_w - encoded_dh = tf.math.log(box_h / anchor_h) - encoded_dw = tf.math.log(box_w / anchor_w) - if weights: - encoded_dy *= weights[0] - encoded_dx *= weights[1] - encoded_dh *= weights[2] - encoded_dw *= weights[3] - - encoded_boxes = tf.concat( - [encoded_dy, encoded_dx, encoded_dh, encoded_dw], axis=-1) - return encoded_boxes - - -def decode_boxes(encoded_boxes, anchors, weights=None): - """Decode boxes. - - Args: - encoded_boxes: a tensor whose last dimension is 4 representing the - coordinates of encoded boxes in ymin, xmin, ymax, xmax order. - anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`, - representing the coordinates of anchors in ymin, xmin, ymax, xmax order. - weights: None or a list of four float numbers used to scale coordinates. - - Returns: - encoded_boxes: a tensor whose shape is the same as `boxes` representing the - decoded box targets. - """ - if encoded_boxes.shape[-1] != 4: - raise ValueError( - 'encoded_boxes.shape[-1] is {:d}, but must be 4.' 
- .format(encoded_boxes.shape[-1])) - - with tf.name_scope('decode_boxes'): - encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype) - dy = encoded_boxes[..., 0:1] - dx = encoded_boxes[..., 1:2] - dh = encoded_boxes[..., 2:3] - dw = encoded_boxes[..., 3:4] - if weights: - dy /= weights[0] - dx /= weights[1] - dh /= weights[2] - dw /= weights[3] - dh = tf.math.minimum(dh, BBOX_XFORM_CLIP) - dw = tf.math.minimum(dw, BBOX_XFORM_CLIP) - - anchor_ymin = anchors[..., 0:1] - anchor_xmin = anchors[..., 1:2] - anchor_ymax = anchors[..., 2:3] - anchor_xmax = anchors[..., 3:4] - anchor_h = anchor_ymax - anchor_ymin - anchor_w = anchor_xmax - anchor_xmin - anchor_yc = anchor_ymin + 0.5 * anchor_h - anchor_xc = anchor_xmin + 0.5 * anchor_w - - decoded_boxes_yc = dy * anchor_h + anchor_yc - decoded_boxes_xc = dx * anchor_w + anchor_xc - decoded_boxes_h = tf.math.exp(dh) * anchor_h - decoded_boxes_w = tf.math.exp(dw) * anchor_w - - decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h - decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w - decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h - decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w - - decoded_boxes = tf.concat( - [decoded_boxes_ymin, decoded_boxes_xmin, - decoded_boxes_ymax, decoded_boxes_xmax], - axis=-1) - return decoded_boxes - - -def filter_boxes(boxes, scores, image_shape, min_size_threshold): - """Filter and remove boxes that are too small or fall outside the image. - - Args: - boxes: a tensor whose last dimension is 4 representing the coordinates of - boxes in ymin, xmin, ymax, xmax order. - scores: a tensor whose shape is the same as tf.shape(boxes)[:-1] - representing the original scores of the boxes. - image_shape: a tensor whose shape is the same as, or `broadcastable` to - `boxes` except the last dimension, which is 2, representing [height, - width] of the scaled image. - min_size_threshold: a float representing the minimal box size in each side - (w.r.t. the scaled image). Boxes whose sides are smaller than it will be - filtered out. - - Returns: - filtered_boxes: a tensor whose shape is the same as `boxes` but with - the position of the filtered boxes are filled with 0. - filtered_scores: a tensor whose shape is the same as 'scores' but with - the positinon of the filtered boxes filled with 0. 
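The `encode_boxes`/`decode_boxes` pair deleted above is the usual anchor-relative box parameterization: center offsets scaled by the anchor size plus log-scaled height and width. A hypothetical round trip under that reading (import path as used elsewhere in this tree; values are illustrative):

```python
import tensorflow as tf

from official.vision.beta.ops import box_ops  # removed by this change

anchors = tf.constant([[0., 0., 10., 10.]])       # ymin, xmin, ymax, xmax
boxes = tf.constant([[2., 2., 8., 10.]])
# dy = (5 - 5) / 10, dx = (6 - 5) / 10, dh = log(6 / 10), dw = log(8 / 10)
encoded = box_ops.encode_boxes(boxes, anchors)    # ~[[0., 0.1, -0.511, -0.223]]
decoded = box_ops.decode_boxes(encoded, anchors)  # recovers `boxes` up to fp error
```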
- """ - if boxes.shape[-1] != 4: - raise ValueError( - 'boxes.shape[1] is {:d}, but must be 4.'.format(boxes.shape[-1])) - - with tf.name_scope('filter_boxes'): - if isinstance(image_shape, list) or isinstance(image_shape, tuple): - height, width = image_shape - else: - image_shape = tf.cast(image_shape, dtype=boxes.dtype) - height = image_shape[..., 0] - width = image_shape[..., 1] - - ymin = boxes[..., 0] - xmin = boxes[..., 1] - ymax = boxes[..., 2] - xmax = boxes[..., 3] - - h = ymax - ymin - w = xmax - xmin - yc = ymin + 0.5 * h - xc = xmin + 0.5 * w - - min_size = tf.cast( - tf.math.maximum(min_size_threshold, 0.0), dtype=boxes.dtype) - - filtered_size_mask = tf.math.logical_and( - tf.math.greater(h, min_size), tf.math.greater(w, min_size)) - filtered_center_mask = tf.logical_and( - tf.math.logical_and(tf.math.greater(yc, 0.0), tf.math.less(yc, height)), - tf.math.logical_and(tf.math.greater(xc, 0.0), tf.math.less(xc, width))) - filtered_mask = tf.math.logical_and( - filtered_size_mask, filtered_center_mask) - - filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores)) - filtered_boxes = tf.cast( - tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes - return filtered_boxes, filtered_scores - - -def filter_boxes_by_scores(boxes, scores, min_score_threshold): - """Filter and remove boxes whose scores are smaller than the threshold. - - Args: - boxes: a tensor whose last dimension is 4 representing the coordinates of - boxes in ymin, xmin, ymax, xmax order. - scores: a tensor whose shape is the same as tf.shape(boxes)[:-1] - representing the original scores of the boxes. - min_score_threshold: a float representing the minimal box score threshold. - Boxes whose score are smaller than it will be filtered out. - - Returns: - filtered_boxes: a tensor whose shape is the same as `boxes` but with - the position of the filtered boxes are filled with -1. - filtered_scores: a tensor whose shape is the same as 'scores' but with - the - """ - if boxes.shape[-1] != 4: - raise ValueError('boxes.shape[1] is {:d}, but must be 4.'.format( - boxes.shape[-1])) - - with tf.name_scope('filter_boxes_by_scores'): - filtered_mask = tf.math.greater(scores, min_score_threshold) - filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores)) - filtered_boxes = tf.cast( - tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes - - return filtered_boxes, filtered_scores - - -def gather_instances(selected_indices, instances, *aux_instances): - """Gather instances by indices. - - Args: - selected_indices: a Tensor of shape [batch, K] which indicates the selected - indices in instance dimension (2nd dimension). - instances: a Tensor of shape [batch, N, ...] where the 2nd dimension is - the instance dimension to be selected from. - *aux_instances: the additional Tensors whose shapes are in [batch, N, ...] - which are the tensors to be selected from using the `selected_indices`. - - Returns: - selected_instances: the tensor of shape [batch, K, ...] which corresponds to - the selected instances of the `instances` tensor. - selected_aux_instances: the additional tensors of shape [batch, K, ...] - which corresponds to the selected instances of the `aus_instances` - tensors. 
- """ - batch_size = instances.shape[0] - if batch_size == 1: - selected_instances = tf.squeeze( - tf.gather(instances, selected_indices, axis=1), axis=1) - if aux_instances: - selected_aux_instances = [ - tf.squeeze( - tf.gather(a, selected_indices, axis=1), axis=1) - for a in aux_instances - ] - return tuple([selected_instances] + selected_aux_instances) - else: - return selected_instances - else: - indices_shape = tf.shape(selected_indices) - batch_indices = ( - tf.expand_dims(tf.range(indices_shape[0]), axis=-1) * - tf.ones([1, indices_shape[-1]], dtype=tf.int32)) - gather_nd_indices = tf.stack( - [batch_indices, selected_indices], axis=-1) - selected_instances = tf.gather_nd(instances, gather_nd_indices) - if aux_instances: - selected_aux_instances = [ - tf.gather_nd(a, gather_nd_indices) for a in aux_instances - ] - return tuple([selected_instances] + selected_aux_instances) - else: - return selected_instances - - -def top_k_boxes(boxes, scores, k): - """Sort and select top k boxes according to the scores. - - Args: - boxes: a tensor of shape [batch_size, N, 4] representing the coordinate of - the boxes. N is the number of boxes per image. - scores: a tensor of shsape [batch_size, N] representing the socre of the - boxes. - k: an integer or a tensor indicating the top k number. - - Returns: - selected_boxes: a tensor of shape [batch_size, k, 4] representing the - selected top k box coordinates. - selected_scores: a tensor of shape [batch_size, k] representing the selected - top k box scores. - """ - with tf.name_scope('top_k_boxes'): - selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True) - selected_boxes = gather_instances(top_k_indices, boxes) - return selected_boxes, selected_scores - - -def get_non_empty_box_indices(boxes): - """Get indices for non-empty boxes.""" - # Selects indices if box height or width is 0. - height = boxes[:, 2] - boxes[:, 0] - width = boxes[:, 3] - boxes[:, 1] - indices = tf.where(tf.logical_and(tf.greater(height, 0), - tf.greater(width, 0))) - return indices[:, 0] - - -def bbox_overlap(boxes, gt_boxes): - """Calculates the overlap between proposal and ground truth boxes. - - Some `boxes` or `gt_boxes` may have been padded. The returned `iou` tensor - for these boxes will be -1. - - Args: - boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of - proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The - last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form. - gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This - tensor might have paddings with a negative value. - - Returns: - iou: a tensor with as a shape of [batch_size, N, MAX_NUM_INSTANCES]. - """ - with tf.name_scope('bbox_overlap'): - bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split( - value=boxes, num_or_size_splits=4, axis=2) - gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split( - value=gt_boxes, num_or_size_splits=4, axis=2) - - # Calculates the intersection area. - i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1])) - i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1])) - i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1])) - i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1])) - i_area = ( - tf.math.maximum((i_xmax - i_xmin), 0) * - tf.math.maximum((i_ymax - i_ymin), 0)) - - # Calculates the union area. 
- bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min) - gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min) - # Adds a small epsilon to avoid divide-by-zero. - u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8 - - # Calculates IoU. - iou = i_area / u_area - - # Fills -1 for IoU entries between the padded ground truth boxes. - gt_invalid_mask = tf.less( - tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0) - padding_mask = tf.logical_or( - tf.zeros_like(bb_x_min, dtype=tf.bool), - tf.transpose(gt_invalid_mask, [0, 2, 1])) - iou = tf.where(padding_mask, -tf.ones_like(iou), iou) - - # Fills -1 for for invalid (-1) boxes. - boxes_invalid_mask = tf.less( - tf.reduce_max(boxes, axis=-1, keepdims=True), 0.0) - iou = tf.where(boxes_invalid_mask, -tf.ones_like(iou), iou) - - return iou - - -def bbox_generalized_overlap(boxes, gt_boxes): - """Calculates the GIOU between proposal and ground truth boxes. - - The generalized intersection of union is an adjustment of the traditional IOU - metric which provides continuous updates even for predictions with no overlap. - This metric is defined in https://giou.stanford.edu/GIoU.pdf. Note, some - `gt_boxes` may have been padded. The returned `giou` tensor for these boxes - will be -1. - - Args: - boxes: a `Tensor` with a shape of [batch_size, N, 4]. N is the number of - proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The - last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form. - gt_boxes: a `Tensor` with a shape of [batch_size, max_num_instances, 4]. - This tensor may have paddings with a negative value and will also be in - the [ymin, xmin, ymax, xmax] format. - - Returns: - giou: a `Tensor` with as a shape of [batch_size, N, max_num_instances]. - """ - with tf.name_scope('bbox_generalized_overlap'): - assert boxes.shape.as_list( - )[-1] == 4, 'Boxes must be defined by 4 coordinates.' - assert gt_boxes.shape.as_list( - )[-1] == 4, 'Groundtruth boxes must be defined by 4 coordinates.' - - bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split( - value=boxes, num_or_size_splits=4, axis=2) - gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split( - value=gt_boxes, num_or_size_splits=4, axis=2) - - # Calculates the hull area for each pair of boxes, with one from - # boxes and the other from gt_boxes. - # Outputs for coordinates are of shape [batch_size, N, max_num_instances] - h_xmin = tf.minimum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1])) - h_xmax = tf.maximum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1])) - h_ymin = tf.minimum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1])) - h_ymax = tf.maximum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1])) - h_area = tf.maximum((h_xmax - h_xmin), 0) * tf.maximum((h_ymax - h_ymin), 0) - # Add a small epsilon to avoid divide-by-zero. - h_area = h_area + 1e-8 - - # Calculates the intersection area. - i_xmin = tf.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1])) - i_xmax = tf.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1])) - i_ymin = tf.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1])) - i_ymax = tf.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1])) - i_area = tf.maximum((i_xmax - i_xmin), 0) * tf.maximum((i_ymax - i_ymin), 0) - - # Calculates the union area. - bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min) - gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min) - - # Adds a small epsilon to avoid divide-by-zero. - u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8 - - # Calculates IoU. - iou = i_area / u_area - # Calculates GIoU. 
- giou = iou - (h_area - u_area) / h_area - - # Fills -1 for GIoU entries between the padded ground truth boxes. - gt_invalid_mask = tf.less( - tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0) - padding_mask = tf.broadcast_to( - tf.transpose(gt_invalid_mask, [0, 2, 1]), tf.shape(giou)) - giou = tf.where(padding_mask, -tf.ones_like(giou), giou) - return giou - - -def box_matching(boxes, gt_boxes, gt_classes): - """Match boxes to groundtruth boxes. - - Given the proposal boxes and the groundtruth boxes and classes, perform the - groundtruth matching by taking the argmax of the IoU between boxes and - groundtruth boxes. - - Args: - boxes: a tensor of shape of [batch_size, N, 4] representing the box - coordiantes to be matched to groundtruth boxes. - gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing - the groundtruth box coordinates. It is padded with -1s to indicate the - invalid boxes. - gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box - classes. It is padded with -1s to indicate the invalid classes. - - Returns: - matched_gt_boxes: a tensor of shape of [batch_size, N, 4], representing - the matched groundtruth box coordinates for each input box. If the box - does not overlap with any groundtruth boxes, the matched boxes of it - will be set to all 0s. - matched_gt_classes: a tensor of shape of [batch_size, N], representing - the matched groundtruth classes for each input box. If the box does not - overlap with any groundtruth boxes, the matched box classes of it will - be set to 0, which corresponds to the background class. - matched_gt_indices: a tensor of shape of [batch_size, N], representing - the indices of the matched groundtruth boxes in the original gt_boxes - tensor. If the box does not overlap with any groundtruth boxes, the - index of the matched groundtruth will be set to -1. - matched_iou: a tensor of shape of [batch_size, N], representing the IoU - between the box and its matched groundtruth box. The matched IoU is the - maximum IoU of the box and all the groundtruth boxes. - iou: a tensor of shape of [batch_size, N, K], representing the IoU matrix - between boxes and the groundtruth boxes. The IoU between a box and the - invalid groundtruth boxes whose coordinates are [-1, -1, -1, -1] is -1. - """ - # Compute IoU between boxes and gt_boxes. 
- # iou <- [batch_size, N, K] - iou = bbox_overlap(boxes, gt_boxes) - - # max_iou <- [batch_size, N] - # 0.0 -> no match to gt, or -1.0 match to no gt - matched_iou = tf.reduce_max(iou, axis=-1) - - # background_box_mask <- bool, [batch_size, N] - background_box_mask = tf.less_equal(matched_iou, 0.0) - - argmax_iou_indices = tf.argmax(iou, axis=-1, output_type=tf.int32) - - matched_gt_boxes, matched_gt_classes = gather_instances( - argmax_iou_indices, gt_boxes, gt_classes) - matched_gt_boxes = tf.where( - tf.tile(tf.expand_dims(background_box_mask, axis=-1), [1, 1, 4]), - tf.zeros_like(matched_gt_boxes, dtype=matched_gt_boxes.dtype), - matched_gt_boxes) - matched_gt_classes = tf.where( - background_box_mask, - tf.zeros_like(matched_gt_classes), - matched_gt_classes) - - matched_gt_indices = tf.where( - background_box_mask, - -tf.ones_like(argmax_iou_indices), - argmax_iou_indices) - - return (matched_gt_boxes, matched_gt_classes, matched_gt_indices, - matched_iou, iou) diff --git a/official/vision/beta/ops/iou_similarity.py b/official/vision/beta/ops/iou_similarity.py deleted file mode 100644 index c73a95773..000000000 --- a/official/vision/beta/ops/iou_similarity.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Region Similarity Calculators.""" - -import tensorflow as tf - - -def area(box): - """Computes area of boxes. - - B: batch_size - N: number of boxes - - Args: - box: a float Tensor with [N, 4], or [B, N, 4]. - - Returns: - a float Tensor with [N], or [B, N] - """ - with tf.name_scope('Area'): - y_min, x_min, y_max, x_max = tf.split( - value=box, num_or_size_splits=4, axis=-1) - return tf.squeeze((y_max - y_min) * (x_max - x_min), axis=-1) - - -def intersection(gt_boxes, boxes): - """Compute pairwise intersection areas between boxes. - - B: batch_size - N: number of groundtruth boxes. - M: number of anchor boxes. - - Args: - gt_boxes: a float Tensor with [N, 4], or [B, N, 4] - boxes: a float Tensor with [M, 4], or [B, M, 4] - - Returns: - a float Tensor with shape [N, M] or [B, N, M] representing pairwise - intersections. 
- """ - with tf.name_scope('Intersection'): - y_min1, x_min1, y_max1, x_max1 = tf.split( - value=gt_boxes, num_or_size_splits=4, axis=-1) - y_min2, x_min2, y_max2, x_max2 = tf.split( - value=boxes, num_or_size_splits=4, axis=-1) - - boxes_rank = len(boxes.shape) - perm = [1, 0] if boxes_rank == 2 else [0, 2, 1] - # [N, M] or [B, N, M] - y_min_max = tf.minimum(y_max1, tf.transpose(y_max2, perm)) - y_max_min = tf.maximum(y_min1, tf.transpose(y_min2, perm)) - x_min_max = tf.minimum(x_max1, tf.transpose(x_max2, perm)) - x_max_min = tf.maximum(x_min1, tf.transpose(x_min2, perm)) - - intersect_heights = y_min_max - y_max_min - intersect_widths = x_min_max - x_max_min - zeros_t = tf.cast(0, intersect_heights.dtype) - intersect_heights = tf.maximum(zeros_t, intersect_heights) - intersect_widths = tf.maximum(zeros_t, intersect_widths) - return intersect_heights * intersect_widths - - -def iou(gt_boxes, boxes): - """Computes pairwise intersection-over-union between box collections. - - Args: - gt_boxes: a float Tensor with [N, 4]. - boxes: a float Tensor with [M, 4]. - - Returns: - a Tensor with shape [N, M] representing pairwise iou scores. - """ - with tf.name_scope('IOU'): - intersections = intersection(gt_boxes, boxes) - gt_boxes_areas = area(gt_boxes) - boxes_areas = area(boxes) - boxes_rank = len(boxes_areas.shape) - boxes_axis = 1 if (boxes_rank == 2) else 0 - gt_boxes_areas = tf.expand_dims(gt_boxes_areas, -1) - boxes_areas = tf.expand_dims(boxes_areas, boxes_axis) - unions = gt_boxes_areas + boxes_areas - unions = unions - intersections - return tf.where( - tf.equal(intersections, 0.0), tf.zeros_like(intersections), - tf.truediv(intersections, unions)) - - -class IouSimilarity: - """Class to compute similarity based on Intersection over Union (IOU) metric. - - """ - - def __init__(self, mask_val=-1): - self.mask_val = mask_val - - def __call__(self, boxes_1, boxes_2, boxes_1_masks=None, boxes_2_masks=None): - """Compute pairwise IOU similarity between ground truth boxes and anchors. - - B: batch_size - N: Number of groundtruth boxes. - M: Number of anchor boxes. - - Args: - boxes_1: a float Tensor with M or B * M boxes. - boxes_2: a float Tensor with N or B * N boxes, the rank must be less than - or equal to rank of `boxes_1`. - boxes_1_masks: a boolean Tensor with M or B * M boxes. Optional. - boxes_2_masks: a boolean Tensor with N or B * N boxes. Optional. - - Returns: - A Tensor with shape [M, N] or [B, M, N] representing pairwise - iou scores, anchor per row and groundtruth_box per colulmn. 
- - Input shape: - boxes_1: [N, 4], or [B, N, 4] - boxes_2: [M, 4], or [B, M, 4] - boxes_1_masks: [N, 1], or [B, N, 1] - boxes_2_masks: [M, 1], or [B, M, 1] - - Output shape: - [M, N], or [B, M, N] - """ - boxes_1 = tf.cast(boxes_1, tf.float32) - boxes_2 = tf.cast(boxes_2, tf.float32) - - boxes_1_rank = len(boxes_1.shape) - boxes_2_rank = len(boxes_2.shape) - if boxes_1_rank < 2 or boxes_1_rank > 3: - raise ValueError( - '`groudtruth_boxes` must be rank 2 or 3, got {}'.format(boxes_1_rank)) - if boxes_2_rank < 2 or boxes_2_rank > 3: - raise ValueError( - '`anchors` must be rank 2 or 3, got {}'.format(boxes_2_rank)) - if boxes_1_rank < boxes_2_rank: - raise ValueError('`groundtruth_boxes` is unbatched while `anchors` is ' - 'batched is not a valid use case, got groundtruth_box ' - 'rank {}, and anchors rank {}'.format( - boxes_1_rank, boxes_2_rank)) - - result = iou(boxes_1, boxes_2) - if boxes_1_masks is None and boxes_2_masks is None: - return result - background_mask = None - mask_val_t = tf.cast(self.mask_val, result.dtype) * tf.ones_like(result) - perm = [1, 0] if boxes_2_rank == 2 else [0, 2, 1] - if boxes_1_masks is not None and boxes_2_masks is not None: - background_mask = tf.logical_or(boxes_1_masks, - tf.transpose(boxes_2_masks, perm)) - elif boxes_1_masks is not None: - background_mask = boxes_1_masks - else: - background_mask = tf.logical_or( - tf.zeros(tf.shape(boxes_2)[:-1], dtype=tf.bool), - tf.transpose(boxes_2_masks, perm)) - return tf.where(background_mask, mask_val_t, result) diff --git a/official/vision/beta/ops/iou_similarity_test.py b/official/vision/beta/ops/iou_similarity_test.py deleted file mode 100644 index ea99f5aab..000000000 --- a/official/vision/beta/ops/iou_similarity_test.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
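# A minimal sketch of the masking path of IouSimilarity above, which the tests
# below do not exercise. Rows of `boxes_1` flagged in `boxes_1_masks` are
# treated as padding and their IoU scores are overwritten with `mask_val`.
# Assumes the module is still importable (this patch removes it); the box
# values are illustrative.
import tensorflow as tf
from official.vision.beta.ops import iou_similarity

boxes_1 = tf.constant([[0., 0., 5., 5.],
                       [0., 0., 0., 0.]])        # second row is padding
boxes_2 = tf.constant([[0., 0., 5., 5.],
                       [5., 5., 10., 10.]])
boxes_1_masks = tf.constant([[False], [True]])   # [N, 1]

sim = iou_similarity.IouSimilarity(mask_val=-1)
print(sim(boxes_1, boxes_2, boxes_1_masks=boxes_1_masks).numpy())
# [[ 1.  0.]
#  [-1. -1.]]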
- -"""Tests for iou_similarity.py.""" - -import tensorflow as tf - -from official.vision.beta.ops import iou_similarity - - -class BoxMatcherTest(tf.test.TestCase): - - def test_similarity_unbatched(self): - boxes = tf.constant( - [ - [0, 0, 1, 1], - [5, 0, 10, 5], - ], - dtype=tf.float32) - - gt_boxes = tf.constant( - [ - [0, 0, 5, 5], - [0, 5, 5, 10], - [5, 0, 10, 5], - [5, 5, 10, 10], - ], - dtype=tf.float32) - - sim_calc = iou_similarity.IouSimilarity() - sim_matrix = sim_calc(boxes, gt_boxes) - - self.assertAllClose( - sim_matrix.numpy(), - [[0.04, 0, 0, 0], - [0, 0, 1., 0]]) - - def test_similarity_batched(self): - boxes = tf.constant( - [[ - [0, 0, 1, 1], - [5, 0, 10, 5], - ]], - dtype=tf.float32) - - gt_boxes = tf.constant( - [[ - [0, 0, 5, 5], - [0, 5, 5, 10], - [5, 0, 10, 5], - [5, 5, 10, 10], - ]], - dtype=tf.float32) - - sim_calc = iou_similarity.IouSimilarity() - sim_matrix = sim_calc(boxes, gt_boxes) - - self.assertAllClose( - sim_matrix.numpy(), - [[[0.04, 0, 0, 0], - [0, 0, 1., 0]]]) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/ops/mask_ops.py b/official/vision/beta/ops/mask_ops.py deleted file mode 100644 index cbdb41caa..000000000 --- a/official/vision/beta/ops/mask_ops.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utility functions for segmentations.""" - -import math -# Import libraries -import cv2 -import numpy as np - - -def paste_instance_masks(masks, - detected_boxes, - image_height, - image_width): - """Paste instance masks to generate the image segmentation results. - - Args: - masks: a numpy array of shape [N, mask_height, mask_width] representing the - instance masks w.r.t. the `detected_boxes`. - detected_boxes: a numpy array of shape [N, 4] representing the reference - bounding boxes. - image_height: an integer representing the height of the image. - image_width: an integer representing the width of the image. - - Returns: - segms: a numpy array of shape [N, image_height, image_width] representing - the instance masks *pasted* on the image canvas. 
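# A minimal sketch of the input/output shapes documented above. Per the
# expand_boxes comment in the implementation below, `detected_boxes` are in
# [x, y, width, height] image-pixel form. Assumes the module is still
# importable (this patch removes it); the box and mask values are illustrative.
import numpy as np
from official.vision.beta.ops import mask_ops

masks = np.random.uniform(size=(2, 28, 28)).astype(np.float32)    # [N, mh, mw]
detected_boxes = np.array([[10., 20., 50., 40.],                   # [x, y, w, h]
                           [0., 0., 30., 60.]], dtype=np.float32)
segms = mask_ops.paste_instance_masks(
    masks, detected_boxes, image_height=100, image_width=120)
print(segms.shape, segms.dtype)  # (2, 100, 120) uint8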
- """ - - def expand_boxes(boxes, scale): - """Expands an array of boxes by a given scale.""" - # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long - # The `boxes` in the reference implementation is in [x1, y1, x2, y2] form, - # whereas `boxes` here is in [x1, y1, w, h] form - w_half = boxes[:, 2] * .5 - h_half = boxes[:, 3] * .5 - x_c = boxes[:, 0] + w_half - y_c = boxes[:, 1] + h_half - - w_half *= scale - h_half *= scale - - boxes_exp = np.zeros(boxes.shape) - boxes_exp[:, 0] = x_c - w_half - boxes_exp[:, 2] = x_c + w_half - boxes_exp[:, 1] = y_c - h_half - boxes_exp[:, 3] = y_c + h_half - - return boxes_exp - - # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long - # To work around an issue with cv2.resize (it seems to automatically pad - # with repeated border values), we manually zero-pad the masks by 1 pixel - # prior to resizing back to the original image resolution. This prevents - # "top hat" artifacts. We therefore need to expand the reference boxes by an - # appropriate factor. - _, mask_height, mask_width = masks.shape - scale = max((mask_width + 2.0) / mask_width, - (mask_height + 2.0) / mask_height) - - ref_boxes = expand_boxes(detected_boxes, scale) - ref_boxes = ref_boxes.astype(np.int32) - padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32) - segms = [] - for mask_ind, mask in enumerate(masks): - im_mask = np.zeros((image_height, image_width), dtype=np.uint8) - # Process mask inside bounding boxes. - padded_mask[1:-1, 1:-1] = mask[:, :] - - ref_box = ref_boxes[mask_ind, :] - w = ref_box[2] - ref_box[0] + 1 - h = ref_box[3] - ref_box[1] + 1 - w = np.maximum(w, 1) - h = np.maximum(h, 1) - - mask = cv2.resize(padded_mask, (w, h)) - mask = np.array(mask > 0.5, dtype=np.uint8) - - x_0 = min(max(ref_box[0], 0), image_width) - x_1 = min(max(ref_box[2] + 1, 0), image_width) - y_0 = min(max(ref_box[1], 0), image_height) - y_1 = min(max(ref_box[3] + 1, 0), image_height) - - im_mask[y_0:y_1, x_0:x_1] = mask[ - (y_0 - ref_box[1]):(y_1 - ref_box[1]), - (x_0 - ref_box[0]):(x_1 - ref_box[0]) - ] - segms.append(im_mask) - - segms = np.array(segms) - assert masks.shape[0] == segms.shape[0] - return segms - - -def paste_instance_masks_v2(masks, - detected_boxes, - image_height, - image_width): - """Paste instance masks to generate the image segmentation (v2). - - Args: - masks: a numpy array of shape [N, mask_height, mask_width] representing the - instance masks w.r.t. the `detected_boxes`. - detected_boxes: a numpy array of shape [N, 4] representing the reference - bounding boxes. - image_height: an integer representing the height of the image. - image_width: an integer representing the width of the image. - - Returns: - segms: a numpy array of shape [N, image_height, image_width] representing - the instance masks *pasted* on the image canvas. - """ - _, mask_height, mask_width = masks.shape - - segms = [] - for i, mask in enumerate(masks): - box = detected_boxes[i, :] - xmin = box[0] - ymin = box[1] - xmax = xmin + box[2] - ymax = ymin + box[3] - - # Sample points of the cropped mask w.r.t. the image grid. - # Note that these coordinates may fall beyond the image. - # Pixel clipping will happen after warping. 
- xmin_int = int(math.floor(xmin)) - xmax_int = int(math.ceil(xmax)) - ymin_int = int(math.floor(ymin)) - ymax_int = int(math.ceil(ymax)) - - alpha = box[2] / (1.0 * mask_width) - beta = box[3] / (1.0 * mask_height) - # pylint: disable=invalid-name - # Transformation from mask pixel indices to image coordinate. - M_mask_to_image = np.array( - [[alpha, 0, xmin], - [0, beta, ymin], - [0, 0, 1]], - dtype=np.float32) - # Transformation from image to cropped mask coordinate. - M_image_to_crop = np.array( - [[1, 0, -xmin_int], - [0, 1, -ymin_int], - [0, 0, 1]], - dtype=np.float32) - M = np.dot(M_image_to_crop, M_mask_to_image) - # Compensate the half pixel offset that OpenCV has in the - # warpPerspective implementation: the top-left pixel is sampled - # at (0,0), but we want it to be at (0.5, 0.5). - M = np.dot( - np.dot( - np.array([[1, 0, -0.5], - [0, 1, -0.5], - [0, 0, 1]], np.float32), - M), - np.array([[1, 0, 0.5], - [0, 1, 0.5], - [0, 0, 1]], np.float32)) - # pylint: enable=invalid-name - cropped_mask = cv2.warpPerspective( - mask.astype(np.float32), M, - (xmax_int - xmin_int, ymax_int - ymin_int)) - cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8) - - img_mask = np.zeros((image_height, image_width)) - x0 = max(min(xmin_int, image_width), 0) - x1 = max(min(xmax_int, image_width), 0) - y0 = max(min(ymin_int, image_height), 0) - y1 = max(min(ymax_int, image_height), 0) - img_mask[y0:y1, x0:x1] = cropped_mask[ - (y0 - ymin_int):(y1 - ymin_int), - (x0 - xmin_int):(x1 - xmin_int)] - - segms.append(img_mask) - - segms = np.array(segms) - return segms - diff --git a/official/vision/beta/ops/mask_ops_test.py b/official/vision/beta/ops/mask_ops_test.py deleted file mode 100644 index f63892bc8..000000000 --- a/official/vision/beta/ops/mask_ops_test.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
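# A minimal sketch of the affine algebra used by paste_instance_masks_v2 above,
# with illustrative box and mask sizes. M = M_image_to_crop @ M_mask_to_image
# maps mask pixel indices into the integer-aligned crop; the extra +/-0.5
# translations in the original code then compensate for cv2.warpPerspective
# sampling the top-left pixel at (0, 0) instead of (0.5, 0.5).
import numpy as np

x, y, w, h = 12.3, 7.8, 40.0, 20.0            # detected box, [x, y, w, h]
mask_h, mask_w = 28, 28
alpha, beta = w / mask_w, h / mask_h
m_mask_to_image = np.array([[alpha, 0, x], [0, beta, y], [0, 0, 1]], np.float32)
m_image_to_crop = np.array([[1, 0, -np.floor(x)], [0, 1, -np.floor(y)], [0, 0, 1]],
                           np.float32)
m = m_image_to_crop @ m_mask_to_image
# Mask pixel (0, 0) lands at the sub-pixel offset of the box inside its crop.
print(m @ np.array([0., 0., 1.]))  # ~[0.3, 0.8, 1.0]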
- - -"""Tests for mask_ops.py.""" - -# Import libraries -import numpy as np -import tensorflow as tf -from official.vision.beta.ops import mask_ops - - -class MaskUtilsTest(tf.test.TestCase): - - def testPasteInstanceMasks(self): - image_height = 10 - image_width = 10 - mask_height = 6 - mask_width = 6 - masks = np.random.randint(0, 255, (1, mask_height, mask_width)) - detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]]) - - _ = mask_ops.paste_instance_masks( - masks, detected_boxes, image_height, image_width) - - def testPasteInstanceMasksV2(self): - image_height = 10 - image_width = 10 - mask_height = 6 - mask_width = 6 - masks = np.random.randint(0, 255, (1, mask_height, mask_width)) - detected_boxes = np.array([[0.0, 2.0, mask_width, mask_height]]) - - image_masks = mask_ops.paste_instance_masks_v2( - masks, detected_boxes, image_height, image_width) - - self.assertNDArrayNear( - image_masks[:, 2:8, 0:6], - np.array(masks > 0.5, dtype=np.uint8), - 1e-5) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/ops/nms.py b/official/vision/beta/ops/nms.py deleted file mode 100644 index 7d1ab3c51..000000000 --- a/official/vision/beta/ops/nms.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tensorflow implementation of non max suppression.""" - -# Import libraries -import tensorflow as tf - -from official.vision.beta.ops import box_ops - - -NMS_TILE_SIZE = 512 - - -def _self_suppression(iou, _, iou_sum): - batch_size = tf.shape(iou)[0] - can_suppress_others = tf.cast( - tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype) - iou_suppressed = tf.reshape( - tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype), - [batch_size, -1, 1]) * iou - iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2]) - return [ - iou_suppressed, - tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new - ] - - -def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx): - batch_size = tf.shape(boxes)[0] - new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0], - [batch_size, NMS_TILE_SIZE, 4]) - iou = box_ops.bbox_overlap(new_slice, box_slice) - ret_slice = tf.expand_dims( - tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype), - 2) * box_slice - return boxes, ret_slice, iou_threshold, inner_idx + 1 - - -def _suppression_loop_body(boxes, iou_threshold, output_size, idx): - """Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE). - - Args: - boxes: a tensor with a shape of [batch_size, anchors, 4]. - iou_threshold: a float representing the threshold for deciding whether boxes - overlap too much with respect to IOU. - output_size: an int32 tensor of size [batch_size]. Representing the number - of selected boxes for each batch. - idx: an integer scalar representing induction variable. - - Returns: - boxes: updated boxes. - iou_threshold: pass down iou_threshold to the next iteration. 
- output_size: the updated output_size. - idx: the updated induction variable. - """ - num_tiles = tf.shape(boxes)[1] // NMS_TILE_SIZE - batch_size = tf.shape(boxes)[0] - - # Iterates over tiles that can possibly suppress the current tile. - box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0], - [batch_size, NMS_TILE_SIZE, 4]) - _, box_slice, _, _ = tf.while_loop( - lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx, - _cross_suppression, [boxes, box_slice, iou_threshold, - tf.constant(0)]) - - # Iterates over the current tile to compute self-suppression. - iou = box_ops.bbox_overlap(box_slice, box_slice) - mask = tf.expand_dims( - tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape( - tf.range(NMS_TILE_SIZE), [-1, 1]), 0) - iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype) - suppressed_iou, _, _ = tf.while_loop( - lambda _iou, loop_condition, _iou_sum: loop_condition, _self_suppression, - [iou, tf.constant(True), - tf.reduce_sum(iou, [1, 2])]) - suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0 - box_slice *= tf.expand_dims(1.0 - tf.cast(suppressed_box, box_slice.dtype), 2) - - # Uses box_slice to update the input boxes. - mask = tf.reshape( - tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1]) - boxes = tf.tile(tf.expand_dims( - box_slice, [1]), [1, num_tiles, 1, 1]) * mask + tf.reshape( - boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (1 - mask) - boxes = tf.reshape(boxes, [batch_size, -1, 4]) - - # Updates output_size. - output_size += tf.reduce_sum( - tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1]) - return boxes, iou_threshold, output_size, idx + 1 - - -def sorted_non_max_suppression_padded(scores, - boxes, - max_output_size, - iou_threshold): - """A wrapper that handles non-maximum suppression. - - Assumption: - * The boxes are sorted by scores unless the box is a dot (all coordinates - are zero). - * Boxes with higher scores can be used to suppress boxes with lower scores. - - The overal design of the algorithm is to handle boxes tile-by-tile: - - boxes = boxes.pad_to_multiply_of(tile_size) - num_tiles = len(boxes) // tile_size - output_boxes = [] - for i in range(num_tiles): - box_tile = boxes[i*tile_size : (i+1)*tile_size] - for j in range(i - 1): - suppressing_tile = boxes[j*tile_size : (j+1)*tile_size] - iou = bbox_overlap(box_tile, suppressing_tile) - # if the box is suppressed in iou, clear it to a dot - box_tile *= _update_boxes(iou) - # Iteratively handle the diagnal tile. - iou = _box_overlap(box_tile, box_tile) - iou_changed = True - while iou_changed: - # boxes that are not suppressed by anything else - suppressing_boxes = _get_suppressing_boxes(iou) - # boxes that are suppressed by suppressing_boxes - suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes) - # clear iou to 0 for boxes that are suppressed, as they cannot be used - # to suppress other boxes any more - new_iou = _clear_iou(iou, suppressed_boxes) - iou_changed = (new_iou != iou) - iou = new_iou - # remaining boxes that can still suppress others, are selected boxes. - output_boxes.append(_get_suppressing_boxes(iou)) - if len(output_boxes) >= max_output_size: - break - - Args: - scores: a tensor with a shape of [batch_size, anchors]. - boxes: a tensor with a shape of [batch_size, anchors, 4]. - max_output_size: a scalar integer `Tensor` representing the maximum number - of boxes to be selected by non max suppression. 
- iou_threshold: a float representing the threshold for deciding whether boxes - overlap too much with respect to IOU. - - Returns: - nms_scores: a tensor with a shape of [batch_size, anchors]. It has same - dtype as input scores. - nms_proposals: a tensor with a shape of [batch_size, anchors, 4]. It has - same dtype as input boxes. - """ - batch_size = tf.shape(boxes)[0] - num_boxes = tf.shape(boxes)[1] - pad = tf.cast( - tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE), - tf.int32) * NMS_TILE_SIZE - num_boxes - boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]]) - scores = tf.pad( - tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1) - num_boxes += pad - - def _loop_cond(unused_boxes, unused_threshold, output_size, idx): - return tf.logical_and( - tf.reduce_min(output_size) < max_output_size, - idx < num_boxes // NMS_TILE_SIZE) - - selected_boxes, _, output_size, _ = tf.while_loop( - _loop_cond, _suppression_loop_body, [ - boxes, iou_threshold, - tf.zeros([batch_size], tf.int32), - tf.constant(0) - ]) - idx = num_boxes - tf.cast( - tf.nn.top_k( - tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) * - tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0], - tf.int32) - idx = tf.minimum(idx, num_boxes - 1) - idx = tf.reshape( - idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]), [-1]) - boxes = tf.reshape( - tf.gather(tf.reshape(boxes, [-1, 4]), idx), - [batch_size, max_output_size, 4]) - boxes = boxes * tf.cast( - tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape( - output_size, [-1, 1, 1]), boxes.dtype) - scores = tf.reshape( - tf.gather(tf.reshape(scores, [-1, 1]), idx), - [batch_size, max_output_size]) - scores = scores * tf.cast( - tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape( - output_size, [-1, 1]), scores.dtype) - return scores, boxes diff --git a/official/vision/beta/ops/preprocess_ops.py b/official/vision/beta/ops/preprocess_ops.py deleted file mode 100644 index d8c40fc1f..000000000 --- a/official/vision/beta/ops/preprocess_ops.py +++ /dev/null @@ -1,919 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Preprocessing ops.""" - -import math -from typing import Optional, Tuple, Union -from six.moves import range -import tensorflow as tf - -from official.vision.beta.ops import augment -from official.vision.beta.ops import box_ops - -CENTER_CROP_FRACTION = 0.875 - - -def clip_or_pad_to_fixed_size(input_tensor, size, constant_values=0): - """Pads data to a fixed length at the first dimension. - - Args: - input_tensor: `Tensor` with any dimension. - size: `int` number for the first dimension of output Tensor. - constant_values: `int` value assigned to the paddings. - - Returns: - `Tensor` with the first dimension padded to `size`. 
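# A minimal sketch of the two behaviours documented above: tensors longer than
# `size` are clipped, shorter ones are padded with `constant_values`, so the
# leading dimension becomes static. Assumes the module is still importable
# (this patch removes it); the box values are illustrative.
import tensorflow as tf
from official.vision.beta.ops import preprocess_ops

boxes = tf.constant([[1., 1., 2., 2.],
                     [3., 3., 4., 4.],
                     [5., 5., 6., 6.]])
padded = preprocess_ops.clip_or_pad_to_fixed_size(boxes, size=5, constant_values=0)
clipped = preprocess_ops.clip_or_pad_to_fixed_size(boxes, size=2)
print(padded.shape, clipped.shape)  # (5, 4) (2, 4)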
- """ - input_shape = input_tensor.get_shape().as_list() - padding_shape = [] - - # Computes the padding length on the first dimension, clip input tensor if it - # is longer than `size`. - input_length = tf.shape(input_tensor)[0] - input_length = tf.clip_by_value(input_length, 0, size) - input_tensor = input_tensor[:input_length] - - padding_length = tf.maximum(0, size - input_length) - padding_shape.append(padding_length) - - # Copies shapes of the rest of input shape dimensions. - for i in range(1, len(input_shape)): - padding_shape.append(tf.shape(input_tensor)[i]) - - # Pads input tensor to the fixed first dimension. - paddings = tf.cast(constant_values * tf.ones(padding_shape), - input_tensor.dtype) - padded_tensor = tf.concat([input_tensor, paddings], axis=0) - output_shape = input_shape - output_shape[0] = size - padded_tensor.set_shape(output_shape) - return padded_tensor - - -def normalize_image(image, - offset=(0.485, 0.456, 0.406), - scale=(0.229, 0.224, 0.225)): - """Normalizes the image to zero mean and unit variance.""" - with tf.name_scope('normalize_image'): - image = tf.image.convert_image_dtype(image, dtype=tf.float32) - offset = tf.constant(offset) - offset = tf.expand_dims(offset, axis=0) - offset = tf.expand_dims(offset, axis=0) - image -= offset - - scale = tf.constant(scale) - scale = tf.expand_dims(scale, axis=0) - scale = tf.expand_dims(scale, axis=0) - image /= scale - return image - - -def compute_padded_size(desired_size, stride): - """Compute the padded size given the desired size and the stride. - - The padded size will be the smallest rectangle, such that each dimension is - the smallest multiple of the stride which is larger than the desired - dimension. For example, if desired_size = (100, 200) and stride = 32, - the output padded_size = (128, 224). - - Args: - desired_size: a `Tensor` or `int` list/tuple of two elements representing - [height, width] of the target output image size. - stride: an integer, the stride of the backbone network. - - Returns: - padded_size: a `Tensor` or `int` list/tuple of two elements representing - [height, width] of the padded output image size. - """ - if isinstance(desired_size, list) or isinstance(desired_size, tuple): - padded_size = [int(math.ceil(d * 1.0 / stride) * stride) - for d in desired_size] - else: - padded_size = tf.cast( - tf.math.ceil( - tf.cast(desired_size, dtype=tf.float32) / stride) * stride, - tf.int32) - return padded_size - - -def resize_and_crop_image(image, - desired_size, - padded_size, - aug_scale_min=1.0, - aug_scale_max=1.0, - seed=1, - method=tf.image.ResizeMethod.BILINEAR): - """Resizes the input image to output size (RetinaNet style). - - Resize and pad images given the desired output size of the image and - stride size. - - Here are the preprocessing steps. - 1. For a given image, keep its aspect ratio and rescale the image to make it - the largest rectangle to be bounded by the rectangle specified by the - `desired_size`. - 2. Pad the rescaled image to the padded_size. - - Args: - image: a `Tensor` of shape [height, width, 3] representing an image. - desired_size: a `Tensor` or `int` list/tuple of two elements representing - [height, width] of the desired actual output image size. - padded_size: a `Tensor` or `int` list/tuple of two elements representing - [height, width] of the padded output image size. Padding will be applied - after scaling the image to the desired_size. 
- aug_scale_min: a `float` with range between [0, 1.0] representing minimum - random scale applied to desired_size for training scale jittering. - aug_scale_max: a `float` with range between [1.0, inf] representing maximum - random scale applied to desired_size for training scale jittering. - seed: seed for random scale jittering. - method: function to resize input image to scaled image. - - Returns: - output_image: `Tensor` of shape [height, width, 3] where [height, width] - equals to `output_size`. - image_info: a 2D `Tensor` that encodes the information of the image and the - applied preprocessing. It is in the format of - [[original_height, original_width], [desired_height, desired_width], - [y_scale, x_scale], [y_offset, x_offset]], where [desired_height, - desired_width] is the actual scaled image size, and [y_scale, x_scale] is - the scaling factor, which is the ratio of - scaled dimension / original dimension. - """ - with tf.name_scope('resize_and_crop_image'): - image_size = tf.cast(tf.shape(image)[0:2], tf.float32) - - random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0) - - if random_jittering: - random_scale = tf.random.uniform( - [], aug_scale_min, aug_scale_max, seed=seed) - scaled_size = tf.round(random_scale * desired_size) - else: - scaled_size = desired_size - - scale = tf.minimum( - scaled_size[0] / image_size[0], scaled_size[1] / image_size[1]) - scaled_size = tf.round(image_size * scale) - - # Computes 2D image_scale. - image_scale = scaled_size / image_size - - # Selects non-zero random offset (x, y) if scaled image is larger than - # desired_size. - if random_jittering: - max_offset = scaled_size - desired_size - max_offset = tf.where( - tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset) - offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed) - offset = tf.cast(offset, tf.int32) - else: - offset = tf.zeros((2,), tf.int32) - - scaled_image = tf.image.resize( - image, tf.cast(scaled_size, tf.int32), method=method) - - if random_jittering: - scaled_image = scaled_image[ - offset[0]:offset[0] + desired_size[0], - offset[1]:offset[1] + desired_size[1], :] - - output_image = tf.image.pad_to_bounding_box( - scaled_image, 0, 0, padded_size[0], padded_size[1]) - - image_info = tf.stack([ - image_size, - tf.constant(desired_size, dtype=tf.float32), - image_scale, - tf.cast(offset, tf.float32)]) - return output_image, image_info - - -def resize_and_crop_image_v2(image, - short_side, - long_side, - padded_size, - aug_scale_min=1.0, - aug_scale_max=1.0, - seed=1, - method=tf.image.ResizeMethod.BILINEAR): - """Resizes the input image to output size (Faster R-CNN style). - - Resize and pad images given the specified short / long side length and the - stride size. - - Here are the preprocessing steps. - 1. For a given image, keep its aspect ratio and first try to rescale the short - side of the original image to `short_side`. - 2. If the scaled image after 1 has a long side that exceeds `long_side`, keep - the aspect ratio and rescal the long side of the image to `long_side`. - 2. Pad the rescaled image to the padded_size. - - Args: - image: a `Tensor` of shape [height, width, 3] representing an image. - short_side: a scalar `Tensor` or `int` representing the desired short side - to be rescaled to. - long_side: a scalar `Tensor` or `int` representing the desired long side to - be rescaled to. - padded_size: a `Tensor` or `int` list/tuple of two elements representing - [height, width] of the padded output image size. 
Padding will be applied - after scaling the image to the desired_size. - aug_scale_min: a `float` with range between [0, 1.0] representing minimum - random scale applied to desired_size for training scale jittering. - aug_scale_max: a `float` with range between [1.0, inf] representing maximum - random scale applied to desired_size for training scale jittering. - seed: seed for random scale jittering. - method: function to resize input image to scaled image. - - Returns: - output_image: `Tensor` of shape [height, width, 3] where [height, width] - equals to `output_size`. - image_info: a 2D `Tensor` that encodes the information of the image and the - applied preprocessing. It is in the format of - [[original_height, original_width], [desired_height, desired_width], - [y_scale, x_scale], [y_offset, x_offset]], where [desired_height, - desired_width] is the actual scaled image size, and [y_scale, x_scale] is - the scaling factor, which is the ratio of - scaled dimension / original dimension. - """ - with tf.name_scope('resize_and_crop_image_v2'): - image_size = tf.cast(tf.shape(image)[0:2], tf.float32) - - scale_using_short_side = ( - short_side / tf.math.minimum(image_size[0], image_size[1])) - scale_using_long_side = ( - long_side / tf.math.maximum(image_size[0], image_size[1])) - - scaled_size = tf.math.round(image_size * scale_using_short_side) - scaled_size = tf.where( - tf.math.greater( - tf.math.maximum(scaled_size[0], scaled_size[1]), long_side), - tf.math.round(image_size * scale_using_long_side), - scaled_size) - desired_size = scaled_size - - random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0) - - if random_jittering: - random_scale = tf.random.uniform( - [], aug_scale_min, aug_scale_max, seed=seed) - scaled_size = tf.math.round(random_scale * scaled_size) - - # Computes 2D image_scale. - image_scale = scaled_size / image_size - - # Selects non-zero random offset (x, y) if scaled image is larger than - # desired_size. - if random_jittering: - max_offset = scaled_size - desired_size - max_offset = tf.where( - tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset) - offset = max_offset * tf.random.uniform([2,], 0, 1, seed=seed) - offset = tf.cast(offset, tf.int32) - else: - offset = tf.zeros((2,), tf.int32) - - scaled_image = tf.image.resize( - image, tf.cast(scaled_size, tf.int32), method=method) - - if random_jittering: - scaled_image = scaled_image[ - offset[0]:offset[0] + desired_size[0], - offset[1]:offset[1] + desired_size[1], :] - - output_image = tf.image.pad_to_bounding_box( - scaled_image, 0, 0, padded_size[0], padded_size[1]) - - image_info = tf.stack([ - image_size, - tf.cast(desired_size, dtype=tf.float32), - image_scale, - tf.cast(offset, tf.float32)]) - return output_image, image_info - - -def resize_image( - image: tf.Tensor, - size: Union[Tuple[int, int], int], - max_size: Optional[int] = None, - method: tf.image.ResizeMethod = tf.image.ResizeMethod.BILINEAR): - """Resize image with size and max_size. - - Args: - image: the image to be resized. - size: if list to tuple, resize to it. If scalar, we keep the same - aspect ratio and resize the short side to the value. - max_size: only used when size is a scalar. When the larger side is larger - than max_size after resized with size we used max_size to keep the aspect - ratio instead. - method: the method argument passed to tf.image.resize. - - Returns: - the resized image and image_info to be used for downstream processing. 
- image_info: a 2D `Tensor` that encodes the information of the image and the - applied preprocessing. It is in the format of - [[original_height, original_width], [resized_height, resized_width], - [y_scale, x_scale], [0, 0]], where [resized_height, resized_width] - is the actual scaled image size, and [y_scale, x_scale] is the - scaling factor, which is the ratio of - scaled dimension / original dimension. - """ - - def get_size_with_aspect_ratio(image_size, size, max_size=None): - h = image_size[0] - w = image_size[1] - if max_size is not None: - min_original_size = tf.cast(tf.math.minimum(w, h), dtype=tf.float32) - max_original_size = tf.cast(tf.math.maximum(w, h), dtype=tf.float32) - if max_original_size / min_original_size * size > max_size: - size = tf.cast( - tf.math.floor(max_size * min_original_size / max_original_size), - dtype=tf.int32) - else: - size = tf.cast(size, tf.int32) - - else: - size = tf.cast(size, tf.int32) - if (w <= h and w == size) or (h <= w and h == size): - return tf.stack([h, w]) - - if w < h: - ow = size - oh = tf.cast( - (tf.cast(size, dtype=tf.float32) * tf.cast(h, dtype=tf.float32) / - tf.cast(w, dtype=tf.float32)), - dtype=tf.int32) - else: - oh = size - ow = tf.cast( - (tf.cast(size, dtype=tf.float32) * tf.cast(w, dtype=tf.float32) / - tf.cast(h, dtype=tf.float32)), - dtype=tf.int32) - - return tf.stack([oh, ow]) - - def get_size(image_size, size, max_size=None): - if isinstance(size, (list, tuple)): - return size[::-1] - else: - return get_size_with_aspect_ratio(image_size, size, max_size) - - orignal_size = tf.shape(image)[0:2] - size = get_size(orignal_size, size, max_size) - rescaled_image = tf.image.resize( - image, tf.cast(size, tf.int32), method=method) - image_scale = size / orignal_size - image_info = tf.stack([ - tf.cast(orignal_size, dtype=tf.float32), - tf.cast(size, dtype=tf.float32), - tf.cast(image_scale, tf.float32), - tf.constant([0.0, 0.0], dtype=tf.float32) - ]) - return rescaled_image, image_info - - -def center_crop_image(image): - """Center crop a square shape slice from the input image. - - It crops a square shape slice from the image. The side of the actual crop - is 224 / 256 = 0.875 of the short side of the original image. References: - [1] Very Deep Convolutional Networks for Large-Scale Image Recognition - https://arxiv.org/abs/1409.1556 - [2] Deep Residual Learning for Image Recognition - https://arxiv.org/abs/1512.03385 - - Args: - image: a Tensor of shape [height, width, 3] representing the input image. - - Returns: - cropped_image: a Tensor representing the center cropped image. - """ - with tf.name_scope('center_crop_image'): - image_size = tf.cast(tf.shape(image)[:2], dtype=tf.float32) - crop_size = ( - CENTER_CROP_FRACTION * tf.math.minimum(image_size[0], image_size[1])) - crop_offset = tf.cast((image_size - crop_size) / 2.0, dtype=tf.int32) - crop_size = tf.cast(crop_size, dtype=tf.int32) - cropped_image = image[ - crop_offset[0]:crop_offset[0] + crop_size, - crop_offset[1]:crop_offset[1] + crop_size, :] - return cropped_image - - -def center_crop_image_v2(image_bytes, image_shape): - """Center crop a square shape slice from the input image. - - It crops a square shape slice from the image. The side of the actual crop - is 224 / 256 = 0.875 of the short side of the original image. 
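# A minimal sketch of the crop geometry implied by CENTER_CROP_FRACTION in
# center_crop_image above, for an illustrative 480x640 image: the square side
# is 0.875 * 480 = 420, centred in both dimensions.
import numpy as np

image_size = np.array([480., 640.])                  # [height, width]
crop_size = 0.875 * image_size.min()                 # 420.0
crop_offset = ((image_size - crop_size) / 2.0).astype(int)
print(int(crop_size), crop_offset)                   # 420 [ 30 110]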
References: - [1] Very Deep Convolutional Networks for Large-Scale Image Recognition - https://arxiv.org/abs/1409.1556 - [2] Deep Residual Learning for Image Recognition - https://arxiv.org/abs/1512.03385 - - This is a faster version of `center_crop_image` which takes the original - image bytes and image size as the inputs, and partially decode the JPEG - bytes according to the center crop. - - Args: - image_bytes: a Tensor of type string representing the raw image bytes. - image_shape: a Tensor specifying the shape of the raw image. - - Returns: - cropped_image: a Tensor representing the center cropped image. - """ - with tf.name_scope('center_image_crop_v2'): - image_shape = tf.cast(image_shape, tf.float32) - crop_size = ( - CENTER_CROP_FRACTION * tf.math.minimum(image_shape[0], image_shape[1])) - crop_offset = tf.cast((image_shape - crop_size) / 2.0, dtype=tf.int32) - crop_size = tf.cast(crop_size, dtype=tf.int32) - crop_window = tf.stack( - [crop_offset[0], crop_offset[1], crop_size, crop_size]) - cropped_image = tf.image.decode_and_crop_jpeg( - image_bytes, crop_window, channels=3) - return cropped_image - - -def random_crop_image(image, - aspect_ratio_range=(3. / 4., 4. / 3.), - area_range=(0.08, 1.0), - max_attempts=10, - seed=1): - """Randomly crop an arbitrary shaped slice from the input image. - - Args: - image: a Tensor of shape [height, width, 3] representing the input image. - aspect_ratio_range: a list of floats. The cropped area of the image must - have an aspect ratio = width / height within this range. - area_range: a list of floats. The cropped reas of the image must contain - a fraction of the input image within this range. - max_attempts: the number of attempts at generating a cropped region of the - image of the specified constraints. After max_attempts failures, return - the entire image. - seed: the seed of the random generator. - - Returns: - cropped_image: a Tensor representing the random cropped image. Can be the - original image if max_attempts is exhausted. - """ - with tf.name_scope('random_crop_image'): - crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box( - tf.shape(image), - tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]), - seed=seed, - min_object_covered=area_range[0], - aspect_ratio_range=aspect_ratio_range, - area_range=area_range, - max_attempts=max_attempts) - cropped_image = tf.slice(image, crop_offset, crop_size) - return cropped_image - - -def random_crop_image_v2(image_bytes, - image_shape, - aspect_ratio_range=(3. / 4., 4. / 3.), - area_range=(0.08, 1.0), - max_attempts=10, - seed=1): - """Randomly crop an arbitrary shaped slice from the input image. - - This is a faster version of `random_crop_image` which takes the original - image bytes and image size as the inputs, and partially decode the JPEG - bytes according to the generated crop. - - Args: - image_bytes: a Tensor of type string representing the raw image bytes. - image_shape: a Tensor specifying the shape of the raw image. - aspect_ratio_range: a list of floats. The cropped area of the image must - have an aspect ratio = width / height within this range. - area_range: a list of floats. The cropped reas of the image must contain - a fraction of the input image within this range. - max_attempts: the number of attempts at generating a cropped region of the - image of the specified constraints. After max_attempts failures, return - the entire image. - seed: the seed of the random generator. 
- - Returns: - cropped_image: a Tensor representing the random cropped image. Can be the - original image if max_attempts is exhausted. - """ - with tf.name_scope('random_crop_image_v2'): - crop_offset, crop_size, _ = tf.image.sample_distorted_bounding_box( - image_shape, - tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]), - seed=seed, - min_object_covered=area_range[0], - aspect_ratio_range=aspect_ratio_range, - area_range=area_range, - max_attempts=max_attempts) - offset_y, offset_x, _ = tf.unstack(crop_offset) - crop_height, crop_width, _ = tf.unstack(crop_size) - crop_window = tf.stack([offset_y, offset_x, crop_height, crop_width]) - cropped_image = tf.image.decode_and_crop_jpeg( - image_bytes, crop_window, channels=3) - return cropped_image - - -def resize_and_crop_boxes(boxes, - image_scale, - output_size, - offset): - """Resizes boxes to output size with scale and offset. - - Args: - boxes: `Tensor` of shape [N, 4] representing ground truth boxes. - image_scale: 2D float `Tensor` representing scale factors that apply to - [height, width] of input image. - output_size: 2D `Tensor` or `int` representing [height, width] of target - output image size. - offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled - boxes. - - Returns: - boxes: `Tensor` of shape [N, 4] representing the scaled boxes. - """ - with tf.name_scope('resize_and_crop_boxes'): - # Adjusts box coordinates based on image_scale and offset. - boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2]) - boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2]) - # Clips the boxes. - boxes = box_ops.clip_boxes(boxes, output_size) - return boxes - - -def resize_and_crop_masks(masks, - image_scale, - output_size, - offset): - """Resizes boxes to output size with scale and offset. - - Args: - masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks. - image_scale: 2D float `Tensor` representing scale factors that apply to - [height, width] of input image. - output_size: 2D `Tensor` or `int` representing [height, width] of target - output image size. - offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled - boxes. - - Returns: - masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks. - """ - with tf.name_scope('resize_and_crop_masks'): - mask_size = tf.cast(tf.shape(masks)[1:3], tf.float32) - # Pad masks to avoid empty mask annotations. - masks = tf.concat( - [tf.zeros([1, mask_size[0], mask_size[1], 1]), masks], axis=0) - - scaled_size = tf.cast(image_scale * mask_size, tf.int32) - scaled_masks = tf.image.resize( - masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) - offset = tf.cast(offset, tf.int32) - scaled_masks = scaled_masks[ - :, - offset[0]:offset[0] + output_size[0], - offset[1]:offset[1] + output_size[1], - :] - - output_masks = tf.image.pad_to_bounding_box( - scaled_masks, 0, 0, output_size[0], output_size[1]) - # Remove padding. 
- output_masks = output_masks[1::] - return output_masks - - -def horizontal_flip_image(image): - """Flips image horizontally.""" - return tf.image.flip_left_right(image) - - -def horizontal_flip_boxes(normalized_boxes): - """Flips normalized boxes horizontally.""" - ymin, xmin, ymax, xmax = tf.split( - value=normalized_boxes, num_or_size_splits=4, axis=1) - flipped_xmin = tf.subtract(1.0, xmax) - flipped_xmax = tf.subtract(1.0, xmin) - flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1) - return flipped_boxes - - -def horizontal_flip_masks(masks): - """Flips masks horizontally.""" - return masks[:, :, ::-1] - - -def random_horizontal_flip(image, normalized_boxes=None, masks=None, seed=1): - """Randomly flips input image and bounding boxes.""" - with tf.name_scope('random_horizontal_flip'): - do_flip = tf.greater(tf.random.uniform([], seed=seed), 0.5) - - image = tf.cond( - do_flip, - lambda: horizontal_flip_image(image), - lambda: image) - - if normalized_boxes is not None: - normalized_boxes = tf.cond( - do_flip, - lambda: horizontal_flip_boxes(normalized_boxes), - lambda: normalized_boxes) - - if masks is not None: - masks = tf.cond( - do_flip, - lambda: horizontal_flip_masks(masks), - lambda: masks) - - return image, normalized_boxes, masks - - -def color_jitter(image: tf.Tensor, - brightness: Optional[float] = 0., - contrast: Optional[float] = 0., - saturation: Optional[float] = 0., - seed: Optional[int] = None) -> tf.Tensor: - """Applies color jitter to an image, similarly to torchvision`s ColorJitter. - - Args: - image (tf.Tensor): Of shape [height, width, 3] and type uint8. - brightness (float, optional): Magnitude for brightness jitter. Defaults to - 0. - contrast (float, optional): Magnitude for contrast jitter. Defaults to 0. - saturation (float, optional): Magnitude for saturation jitter. Defaults to - 0. - seed (int, optional): Random seed. Defaults to None. - - Returns: - tf.Tensor: The augmented `image` of type uint8. - """ - image = tf.cast(image, dtype=tf.uint8) - image = random_brightness(image, brightness, seed=seed) - image = random_contrast(image, contrast, seed=seed) - image = random_saturation(image, saturation, seed=seed) - return image - - -def random_brightness(image: tf.Tensor, - brightness: float = 0., - seed: Optional[int] = None) -> tf.Tensor: - """Jitters brightness of an image. - - Args: - image (tf.Tensor): Of shape [height, width, 3] and type uint8. - brightness (float, optional): Magnitude for brightness jitter. Defaults to - 0. - seed (int, optional): Random seed. Defaults to None. - - Returns: - tf.Tensor: The augmented `image` of type uint8. - """ - assert brightness >= 0, '`brightness` must be positive' - brightness = tf.random.uniform([], - max(0, 1 - brightness), - 1 + brightness, - seed=seed, - dtype=tf.float32) - return augment.brightness(image, brightness) - - -def random_contrast(image: tf.Tensor, - contrast: float = 0., - seed: Optional[int] = None) -> tf.Tensor: - """Jitters contrast of an image, similarly to torchvision`s ColorJitter. - - Args: - image (tf.Tensor): Of shape [height, width, 3] and type uint8. - contrast (float, optional): Magnitude for contrast jitter. Defaults to 0. - seed (int, optional): Random seed. Defaults to None. - - Returns: - tf.Tensor: The augmented `image` of type uint8. 
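# A minimal sketch of how the jitter magnitudes above become multiplicative
# factors: random_brightness, random_contrast and random_saturation each draw a
# factor from U(max(0, 1 - m), 1 + m), so m = 0 leaves the image unchanged and
# m = 0.2 yields factors in [0.8, 1.2).
import tensorflow as tf

m = 0.2
factor = tf.random.uniform([], max(0., 1. - m), 1. + m)
print(float(factor))  # some value in [0.8, 1.2)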
- """ - assert contrast >= 0, '`contrast` must be positive' - contrast = tf.random.uniform([], - max(0, 1 - contrast), - 1 + contrast, - seed=seed, - dtype=tf.float32) - return augment.contrast(image, contrast) - - -def random_saturation(image: tf.Tensor, - saturation: float = 0., - seed: Optional[int] = None) -> tf.Tensor: - """Jitters saturation of an image, similarly to torchvision`s ColorJitter. - - Args: - image (tf.Tensor): Of shape [height, width, 3] and type uint8. - saturation (float, optional): Magnitude for saturation jitter. Defaults to - 0. - seed (int, optional): Random seed. Defaults to None. - - Returns: - tf.Tensor: The augmented `image` of type uint8. - """ - assert saturation >= 0, '`saturation` must be positive' - saturation = tf.random.uniform([], - max(0, 1 - saturation), - 1 + saturation, - seed=seed, - dtype=tf.float32) - return _saturation(image, saturation) - - -def _saturation(image: tf.Tensor, - saturation: Optional[float] = 0.) -> tf.Tensor: - return augment.blend( - tf.repeat(tf.image.rgb_to_grayscale(image), 3, axis=-1), image, - saturation) - - -def random_crop_image_with_boxes_and_labels(img, boxes, labels, min_scale, - aspect_ratio_range, - min_overlap_params, max_retry): - """Crops a random slice from the input image. - - The function will correspondingly recompute the bounding boxes and filter out - outside boxes and their labels. - - References: - [1] End-to-End Object Detection with Transformers - https://arxiv.org/abs/2005.12872 - - The preprocessing steps: - 1. Sample a minimum IoU overlap. - 2. For each trial, sample the new image width, height, and top-left corner. - 3. Compute the IoUs of bounding boxes with the cropped image and retry if - the maximum IoU is below the sampled threshold. - 4. Find boxes whose centers are in the cropped image. - 5. Compute new bounding boxes in the cropped region and only select those - boxes' labels. - - Args: - img: a 'Tensor' of shape [height, width, 3] representing the input image. - boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding - boxes with (ymin, xmin, ymax, xmax). - labels: a 'Tensor' of shape [N,] representing the class labels of the boxes. - min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random - scale variable. - aspect_ratio_range: a list of two 'float' that specifies the lower and upper - bound of the random aspect ratio. - min_overlap_params: a list of four 'float' representing the min value, max - value, step size, and offset for the minimum overlap sample. - max_retry: an 'int' representing the number of trials for cropping. If it is - exhausted, no cropping will be performed. - - Returns: - img: a Tensor representing the random cropped image. Can be the - original image if max_retry is exhausted. - boxes: a Tensor representing the bounding boxes in the cropped image. - labels: a Tensor representing the new bounding boxes' labels. 
- """ - - shape = tf.shape(img) - original_h = shape[0] - original_w = shape[1] - - minval, maxval, step, offset = min_overlap_params - - min_overlap = tf.math.floordiv( - tf.random.uniform([], minval=minval, maxval=maxval), step) * step - offset - - min_overlap = tf.clip_by_value(min_overlap, 0.0, 1.1) - - if min_overlap > 1.0: - return img, boxes, labels - - aspect_ratio_low = aspect_ratio_range[0] - aspect_ratio_high = aspect_ratio_range[1] - - for _ in tf.range(max_retry): - scale_h = tf.random.uniform([], min_scale, 1.0) - scale_w = tf.random.uniform([], min_scale, 1.0) - new_h = tf.cast( - scale_h * tf.cast(original_h, dtype=tf.float32), dtype=tf.int32) - new_w = tf.cast( - scale_w * tf.cast(original_w, dtype=tf.float32), dtype=tf.int32) - - # Aspect ratio has to be in the prespecified range - aspect_ratio = new_h / new_w - if aspect_ratio_low > aspect_ratio or aspect_ratio > aspect_ratio_high: - continue - - left = tf.random.uniform([], 0, original_w - new_w, dtype=tf.int32) - right = left + new_w - top = tf.random.uniform([], 0, original_h - new_h, dtype=tf.int32) - bottom = top + new_h - - normalized_left = tf.cast( - left, dtype=tf.float32) / tf.cast( - original_w, dtype=tf.float32) - normalized_right = tf.cast( - right, dtype=tf.float32) / tf.cast( - original_w, dtype=tf.float32) - normalized_top = tf.cast( - top, dtype=tf.float32) / tf.cast( - original_h, dtype=tf.float32) - normalized_bottom = tf.cast( - bottom, dtype=tf.float32) / tf.cast( - original_h, dtype=tf.float32) - - cropped_box = tf.expand_dims( - tf.stack([ - normalized_top, - normalized_left, - normalized_bottom, - normalized_right, - ]), - axis=0) - iou = box_ops.bbox_overlap( - tf.expand_dims(cropped_box, axis=0), - tf.expand_dims(boxes, axis=0)) # (1, 1, n_ground_truth) - iou = tf.squeeze(iou, axis=[0, 1]) - - # If not a single bounding box has a Jaccard overlap of greater than - # the minimum, try again - if tf.reduce_max(iou) < min_overlap: - continue - - centroids = box_ops.yxyx_to_cycxhw(boxes) - mask = tf.math.logical_and( - tf.math.logical_and(centroids[:, 0] > normalized_top, - centroids[:, 0] < normalized_bottom), - tf.math.logical_and(centroids[:, 1] > normalized_left, - centroids[:, 1] < normalized_right)) - # If not a single bounding box has its center in the crop, try again. - if tf.reduce_sum(tf.cast(mask, dtype=tf.int32)) > 0: - indices = tf.squeeze(tf.where(mask), axis=1) - - filtered_boxes = tf.gather(boxes, indices) - - boxes = tf.clip_by_value( - (filtered_boxes[..., :] * tf.cast( - tf.stack([original_h, original_w, original_h, original_w]), - dtype=tf.float32) - - tf.cast(tf.stack([top, left, top, left]), dtype=tf.float32)) / - tf.cast(tf.stack([new_h, new_w, new_h, new_w]), dtype=tf.float32), - 0.0, 1.0) - - img = tf.image.crop_to_bounding_box(img, top, left, bottom - top, - right - left) - - labels = tf.gather(labels, indices) - break - - return img, boxes, labels - - -def random_crop(image, - boxes, - labels, - min_scale=0.3, - aspect_ratio_range=(0.5, 2.0), - min_overlap_params=(0.0, 1.4, 0.2, 0.1), - max_retry=50, - seed=None): - """Randomly crop the image and boxes, filtering labels. - - Args: - image: a 'Tensor' of shape [height, width, 3] representing the input image. - boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding - boxes with (ymin, xmin, ymax, xmax). - labels: a 'Tensor' of shape [N,] representing the class labels of the boxes. - min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random - scale variable. 
- aspect_ratio_range: a list of two 'float' that specifies the lower and upper - bound of the random aspect ratio. - min_overlap_params: a list of four 'float' representing the min value, max - value, step size, and offset for the minimum overlap sample. - max_retry: an 'int' representing the number of trials for cropping. If it is - exhausted, no cropping will be performed. - seed: the random number seed of int, but could be None. - - Returns: - image: a Tensor representing the random cropped image. Can be the - original image if max_retry is exhausted. - boxes: a Tensor representing the bounding boxes in the cropped image. - labels: a Tensor representing the new bounding boxes' labels. - """ - with tf.name_scope('random_crop'): - do_crop = tf.greater(tf.random.uniform([], seed=seed), 0.5) - if do_crop: - return random_crop_image_with_boxes_and_labels(image, boxes, labels, - min_scale, - aspect_ratio_range, - min_overlap_params, - max_retry) - else: - return image, boxes, labels diff --git a/official/vision/beta/ops/preprocess_ops_3d.py b/official/vision/beta/ops/preprocess_ops_3d.py deleted file mode 100644 index 25f680169..000000000 --- a/official/vision/beta/ops/preprocess_ops_3d.py +++ /dev/null @@ -1,354 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utils for processing video dataset features.""" - -from typing import Optional, Tuple -import tensorflow as tf - - -def _sample_or_pad_sequence_indices(sequence: tf.Tensor, - num_steps: int, - stride: int, - offset: tf.Tensor) -> tf.Tensor: - """Returns indices to take for sampling or padding sequences to fixed size.""" - sequence_length = tf.shape(sequence)[0] - sel_idx = tf.range(sequence_length) - - # Repeats sequence until num_steps are available in total. - max_length = num_steps * stride + offset - num_repeats = tf.math.floordiv( - max_length + sequence_length - 1, sequence_length) - sel_idx = tf.tile(sel_idx, [num_repeats]) - - steps = tf.range(offset, offset + num_steps * stride, stride) - return tf.gather(sel_idx, steps) - - -def sample_linspace_sequence(sequence: tf.Tensor, - num_windows: int, - num_steps: int, - stride: int) -> tf.Tensor: - """Samples `num_windows` segments from sequence with linearly spaced offsets. - - The samples are concatenated in a single `tf.Tensor` in order to have the same - format structure per timestep (e.g. a single frame). If `num_steps` * `stride` - is bigger than the number of timesteps, the sequence is repeated. This - function can be used in evaluation in order to extract enough segments to span - the entire sequence. - - Args: - sequence: Any tensor where the first dimension is timesteps. - num_windows: Number of windows retrieved from the sequence. - num_steps: Number of steps (e.g. frames) to take. - stride: Distance to sample between timesteps. - - Returns: - A single `tf.Tensor` with first dimension `num_windows` * `num_steps`. 
The - tensor contains the concatenated list of `num_windows` tensors which offsets - have been linearly spaced from input. - """ - sequence_length = tf.shape(sequence)[0] - max_offset = tf.maximum(0, sequence_length - num_steps * stride) - offsets = tf.linspace(0.0, tf.cast(max_offset, tf.float32), num_windows) - offsets = tf.cast(offsets, tf.int32) - - all_indices = [] - for i in range(num_windows): - all_indices.append(_sample_or_pad_sequence_indices( - sequence=sequence, - num_steps=num_steps, - stride=stride, - offset=offsets[i])) - - indices = tf.concat(all_indices, axis=0) - indices.set_shape((num_windows * num_steps,)) - return tf.gather(sequence, indices) - - -def sample_sequence(sequence: tf.Tensor, - num_steps: int, - random: bool, - stride: int, - seed: Optional[int] = None) -> tf.Tensor: - """Samples a single segment of size `num_steps` from a given sequence. - - If `random` is not `True`, this function will simply sample the central window - of the sequence. Otherwise, a random offset will be chosen in a way that the - desired `num_steps` might be extracted from the sequence. - - Args: - sequence: Any tensor where the first dimension is timesteps. - num_steps: Number of steps (e.g. frames) to take. - random: A boolean indicating whether to random sample the single window. If - `True`, the offset is randomized. If `False`, the middle frame minus half - of `num_steps` is the first frame. - stride: Distance to sample between timesteps. - seed: A deterministic seed to use when sampling. - - Returns: - A single `tf.Tensor` with first dimension `num_steps` with the sampled - segment. - """ - sequence_length = tf.shape(sequence)[0] - - if random: - sequence_length = tf.cast(sequence_length, tf.float32) - frame_stride = tf.cast(stride, tf.float32) - max_offset = tf.cond( - sequence_length > (num_steps - 1) * frame_stride, - lambda: sequence_length - (num_steps - 1) * frame_stride, - lambda: sequence_length) - offset = tf.random.uniform( - (), - maxval=tf.cast(max_offset, dtype=tf.int32), - dtype=tf.int32, - seed=seed) - else: - offset = (sequence_length - num_steps * stride) // 2 - offset = tf.maximum(0, offset) - - indices = _sample_or_pad_sequence_indices( - sequence=sequence, - num_steps=num_steps, - stride=stride, - offset=offset) - indices.set_shape((num_steps,)) - - return tf.gather(sequence, indices) - - -def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor: - """Decodes JPEG raw bytes string into a RGB uint8 Tensor. - - Args: - image_string: A `tf.Tensor` of type strings with the raw JPEG bytes where - the first dimension is timesteps. - channels: Number of channels of the JPEG image. Allowed values are 0, 1 and - 3. If 0, the number of channels will be calculated at runtime and no - static shape is set. - - Returns: - A Tensor of shape [T, H, W, C] of type uint8 with the decoded images. - """ - return tf.map_fn( - lambda x: tf.image.decode_jpeg(x, channels=channels), - image_string, back_prop=False, dtype=tf.uint8) - - -def crop_image(frames: tf.Tensor, - target_height: int, - target_width: int, - random: bool = False, - num_crops: int = 1, - seed: Optional[int] = None) -> tf.Tensor: - """Crops the image sequence of images. - - If requested size is bigger than image size, image is padded with 0. If not - random cropping, a central crop is performed if num_crops is 1. - - Args: - frames: A Tensor of dimension [timesteps, in_height, in_width, channels]. - target_height: Target cropped image height. - target_width: Target cropped image width. 
- random: A boolean indicating if crop should be randomized. - num_crops: Number of crops (support 1 for central crop and 3 for 3-crop). - seed: A deterministic seed to use when random cropping. - - Returns: - A Tensor of shape [timesteps, out_height, out_width, channels] of type uint8 - with the cropped images. - """ - if random: - # Random spatial crop. - shape = tf.shape(frames) - # If a static_shape is available (e.g. when using this method from add_image - # method), it will be used to have an output tensor with static shape. - static_shape = frames.shape.as_list() - seq_len = shape[0] if static_shape[0] is None else static_shape[0] - channels = shape[3] if static_shape[3] is None else static_shape[3] - frames = tf.image.random_crop( - frames, (seq_len, target_height, target_width, channels), seed) - else: - if num_crops == 1: - # Central crop or pad. - frames = tf.image.resize_with_crop_or_pad(frames, target_height, - target_width) - - elif num_crops == 3: - # Three-crop evaluation. - shape = tf.shape(frames) - static_shape = frames.shape.as_list() - seq_len = shape[0] if static_shape[0] is None else static_shape[0] - height = shape[1] if static_shape[1] is None else static_shape[1] - width = shape[2] if static_shape[2] is None else static_shape[2] - channels = shape[3] if static_shape[3] is None else static_shape[3] - - size = tf.convert_to_tensor( - (seq_len, target_height, target_width, channels)) - - offset_1 = tf.broadcast_to([0, 0, 0, 0], [4]) - # pylint:disable=g-long-lambda - offset_2 = tf.cond( - tf.greater_equal(height, width), - true_fn=lambda: tf.broadcast_to([ - 0, tf.cast(height, tf.float32) / 2 - target_height // 2, 0, 0 - ], [4]), - false_fn=lambda: tf.broadcast_to([ - 0, 0, tf.cast(width, tf.float32) / 2 - target_width // 2, 0 - ], [4])) - offset_3 = tf.cond( - tf.greater_equal(height, width), - true_fn=lambda: tf.broadcast_to( - [0, tf.cast(height, tf.float32) - target_height, 0, 0], [4]), - false_fn=lambda: tf.broadcast_to( - [0, 0, tf.cast(width, tf.float32) - target_width, 0], [4])) - # pylint:disable=g-long-lambda - - crops = [] - for offset in [offset_1, offset_2, offset_3]: - offset = tf.cast(tf.math.round(offset), tf.int32) - crops.append(tf.slice(frames, offset, size)) - frames = tf.concat(crops, axis=0) - - else: - raise NotImplementedError( - f"Only 1-crop and 3-crop are supported. Found {num_crops!r}.") - - return frames - - -def resize_smallest(frames: tf.Tensor, - min_resize: int) -> tf.Tensor: - """Resizes frames so that min(`height`, `width`) is equal to `min_resize`. - - This function will not do anything if the min(`height`, `width`) is already - equal to `min_resize`. This allows to save compute time. - - Args: - frames: A Tensor of dimension [timesteps, input_h, input_w, channels]. - min_resize: Minimum size of the final image dimensions. - - Returns: - A Tensor of shape [timesteps, output_h, output_w, channels] of type - frames.dtype where min(output_h, output_w) = min_resize. 
- """ - shape = tf.shape(frames) - input_h = shape[1] - input_w = shape[2] - - output_h = tf.maximum(min_resize, (input_h * min_resize) // input_w) - output_w = tf.maximum(min_resize, (input_w * min_resize) // input_h) - - def resize_fn(): - frames_resized = tf.image.resize(frames, (output_h, output_w)) - return tf.cast(frames_resized, frames.dtype) - - should_resize = tf.math.logical_or(tf.not_equal(input_w, output_w), - tf.not_equal(input_h, output_h)) - frames = tf.cond(should_resize, resize_fn, lambda: frames) - - return frames - - -def random_crop_resize(frames: tf.Tensor, - output_h: int, - output_w: int, - num_frames: int, - num_channels: int, - aspect_ratio: Tuple[float, float], - area_range: Tuple[float, float]) -> tf.Tensor: - """First crops clip with jittering and then resizes to (output_h, output_w). - - Args: - frames: A Tensor of dimension [timesteps, input_h, input_w, channels]. - output_h: Resized image height. - output_w: Resized image width. - num_frames: Number of input frames per clip. - num_channels: Number of channels of the clip. - aspect_ratio: Float tuple with the aspect range for cropping. - area_range: Float tuple with the area range for cropping. - Returns: - A Tensor of shape [timesteps, output_h, output_w, channels] of type - frames.dtype. - """ - shape = tf.shape(frames) - seq_len, _, _, channels = shape[0], shape[1], shape[2], shape[3] - bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]) - factor = output_w / output_h - aspect_ratio = (aspect_ratio[0] * factor, aspect_ratio[1] * factor) - sample_distorted_bbox = tf.image.sample_distorted_bounding_box( - shape[1:], - bounding_boxes=bbox, - min_object_covered=0.1, - aspect_ratio_range=aspect_ratio, - area_range=area_range, - max_attempts=100, - use_image_if_no_bounding_boxes=True) - bbox_begin, bbox_size, _ = sample_distorted_bbox - offset_y, offset_x, _ = tf.unstack(bbox_begin) - target_height, target_width, _ = tf.unstack(bbox_size) - size = tf.convert_to_tensor(( - seq_len, target_height, target_width, channels)) - offset = tf.convert_to_tensor(( - 0, offset_y, offset_x, 0)) - frames = tf.slice(frames, offset, size) - frames = tf.cast( - tf.image.resize(frames, (output_h, output_w)), - frames.dtype) - frames.set_shape((num_frames, output_h, output_w, num_channels)) - return frames - - -def random_flip_left_right( - frames: tf.Tensor, - seed: Optional[int] = None) -> tf.Tensor: - """Flips all the frames with a probability of 50%. - - Args: - frames: A Tensor of shape [timesteps, input_h, input_w, channels]. - seed: A seed to use for the random sampling. - - Returns: - A Tensor of shape [timesteps, output_h, output_w, channels] eventually - flipped left right. - """ - is_flipped = tf.random.uniform( - (), minval=0, maxval=2, dtype=tf.int32, seed=seed) - - frames = tf.cond(tf.equal(is_flipped, 1), - true_fn=lambda: tf.image.flip_left_right(frames), - false_fn=lambda: frames) - return frames - - -def normalize_image(frames: tf.Tensor, - zero_centering_image: bool, - dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor: - """Normalizes images. - - Args: - frames: A Tensor of numbers. - zero_centering_image: If True, results are in [-1, 1], if False, results are - in [0, 1]. - dtype: Type of output Tensor. - - Returns: - A Tensor of same shape as the input and of the given type. 
- """ - frames = tf.cast(frames, dtype) - if zero_centering_image: - return frames * (2.0 / 255.0) - 1.0 - else: - return frames / 255.0 diff --git a/official/vision/beta/ops/preprocess_ops_3d_test.py b/official/vision/beta/ops/preprocess_ops_3d_test.py deleted file mode 100644 index 2d2b75832..000000000 --- a/official/vision/beta/ops/preprocess_ops_3d_test.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import io -import itertools -import numpy as np -from PIL import Image -import tensorflow as tf - -from official.vision.beta.ops import preprocess_ops_3d - - -class ParserUtilsTest(tf.test.TestCase): - - def setUp(self): - super().setUp() - # [[0, 1, ..., 119], [1, 2, ..., 120], ..., [119, 120, ..., 218]]. - self._frames = tf.stack([tf.range(i, i + 120) for i in range(90)]) - self._frames = tf.cast(self._frames, tf.uint8) - self._frames = self._frames[tf.newaxis, :, :, tf.newaxis] - self._frames = tf.broadcast_to(self._frames, (6, 90, 120, 3)) - - # Create an equivalent numpy array for assertions. - self._np_frames = np.array([range(i, i + 120) for i in range(90)]) - self._np_frames = self._np_frames[np.newaxis, :, :, np.newaxis] - self._np_frames = np.broadcast_to(self._np_frames, (6, 90, 120, 3)) - - def test_sample_linspace_sequence(self): - sequence = tf.range(100) - sampled_seq_1 = preprocess_ops_3d.sample_linspace_sequence( - sequence, 10, 10, 1) - sampled_seq_2 = preprocess_ops_3d.sample_linspace_sequence( - sequence, 7, 10, 1) - sampled_seq_3 = preprocess_ops_3d.sample_linspace_sequence( - sequence, 7, 5, 2) - sampled_seq_4 = preprocess_ops_3d.sample_linspace_sequence( - sequence, 101, 1, 1) - - self.assertAllEqual(sampled_seq_1, range(100)) - # [0, 1, 2, 3, 4, ..., 8, 9, 15, 16, ..., 97, 98, 99] - self.assertAllEqual( - sampled_seq_2, - [15 * i + j for i, j in itertools.product(range(7), range(10))]) - # [0, 2, 4, 6, 8, 15, 17, 19, ..., 96, 98] - self.assertAllEqual( - sampled_seq_3, - [15 * i + 2 * j for i, j in itertools.product(range(7), range(5))]) - self.assertAllEqual(sampled_seq_4, [0] + list(range(100))) - - def test_sample_sequence(self): - sequence = tf.range(100) - sampled_seq_1 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 1) - sampled_seq_2 = preprocess_ops_3d.sample_sequence(sequence, 10, False, 2) - sampled_seq_3 = preprocess_ops_3d.sample_sequence(sequence, 10, True, 1) - - self.assertAllEqual(sampled_seq_1, range(45, 55)) - self.assertAllEqual(sampled_seq_2, range(40, 60, 2)) - - offset_3 = sampled_seq_3[0] - self.assertBetween(offset_3, 0, 99) - self.assertAllEqual(sampled_seq_3, range(offset_3, offset_3 + 10)) - - def test_decode_jpeg(self): - # Create a random RGB JPEG image. 
- random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8) - random_image = Image.fromarray(random_image) - with io.BytesIO() as buffer: - random_image.save(buffer, format='JPEG') - raw_image_bytes = buffer.getvalue() - - raw_image = tf.constant([raw_image_bytes, raw_image_bytes]) - decoded_image = preprocess_ops_3d.decode_jpeg(raw_image, 3) - - self.assertEqual(decoded_image.shape.as_list()[3], 3) - self.assertAllEqual(decoded_image.shape, (2, 263, 320, 3)) - - def test_crop_image(self): - cropped_image_1 = preprocess_ops_3d.crop_image(self._frames, 50, 70) - cropped_image_2 = preprocess_ops_3d.crop_image(self._frames, 200, 200) - cropped_image_3 = preprocess_ops_3d.crop_image(self._frames, 50, 70, True) - cropped_image_4 = preprocess_ops_3d.crop_image( - self._frames, 90, 90, False, 3) - - self.assertAllEqual(cropped_image_1.shape, (6, 50, 70, 3)) - self.assertAllEqual(cropped_image_1, self._np_frames[:, 20:70, 25:95, :]) - - self.assertAllEqual(cropped_image_2.shape, (6, 200, 200, 3)) - expected = np.pad( - self._np_frames, ((0, 0), (55, 55), (40, 40), (0, 0)), 'constant') - self.assertAllEqual(cropped_image_2, expected) - - self.assertAllEqual(cropped_image_3.shape, (6, 50, 70, 3)) - offset = cropped_image_3[0, 0, 0, 0] - expected = np.array([range(i, i + 70) for i in range(offset, offset + 50)]) - expected = expected[np.newaxis, :, :, np.newaxis] - expected = np.broadcast_to(expected, (6, 50, 70, 3)) - self.assertAllEqual(cropped_image_3, expected) - self.assertAllEqual(cropped_image_4.shape, (18, 90, 90, 3)) - - def test_resize_smallest(self): - resized_frames_1 = preprocess_ops_3d.resize_smallest(self._frames, 180) - resized_frames_2 = preprocess_ops_3d.resize_smallest(self._frames, 45) - resized_frames_3 = preprocess_ops_3d.resize_smallest(self._frames, 90) - resized_frames_4 = preprocess_ops_3d.resize_smallest( - tf.transpose(self._frames, (0, 2, 1, 3)), 45) - - self.assertAllEqual(resized_frames_1.shape, (6, 180, 240, 3)) - self.assertAllEqual(resized_frames_2.shape, (6, 45, 60, 3)) - self.assertAllEqual(resized_frames_3.shape, (6, 90, 120, 3)) - self.assertAllEqual(resized_frames_4.shape, (6, 60, 45, 3)) - - def test_random_crop_resize(self): - resized_frames_1 = preprocess_ops_3d.random_crop_resize( - self._frames, 256, 256, 6, 3, (0.5, 2), (0.3, 1)) - resized_frames_2 = preprocess_ops_3d.random_crop_resize( - self._frames, 224, 224, 6, 3, (0.5, 2), (0.3, 1)) - resized_frames_3 = preprocess_ops_3d.random_crop_resize( - self._frames, 256, 256, 6, 3, (0.8, 1.2), (0.3, 1)) - resized_frames_4 = preprocess_ops_3d.random_crop_resize( - self._frames, 256, 256, 6, 3, (0.5, 2), (0.1, 1)) - self.assertAllEqual(resized_frames_1.shape, (6, 256, 256, 3)) - self.assertAllEqual(resized_frames_2.shape, (6, 224, 224, 3)) - self.assertAllEqual(resized_frames_3.shape, (6, 256, 256, 3)) - self.assertAllEqual(resized_frames_4.shape, (6, 256, 256, 3)) - - def test_random_flip_left_right(self): - flipped_frames = preprocess_ops_3d.random_flip_left_right(self._frames) - - flipped = np.fliplr(self._np_frames[0, :, :, 0]) - flipped = flipped[np.newaxis, :, :, np.newaxis] - flipped = np.broadcast_to(flipped, (6, 90, 120, 3)) - self.assertTrue((flipped_frames == self._np_frames).numpy().all() or ( - flipped_frames == flipped).numpy().all()) - - def test_normalize_image(self): - normalized_images_1 = preprocess_ops_3d.normalize_image( - self._frames, False, tf.float32) - normalized_images_2 = preprocess_ops_3d.normalize_image( - self._frames, True, tf.float32) - - 
self.assertAllClose(normalized_images_1, self._np_frames / 255) - self.assertAllClose(normalized_images_2, self._np_frames * 2 / 255 - 1.0) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/ops/preprocess_ops_test.py b/official/vision/beta/ops/preprocess_ops_test.py deleted file mode 100644 index 1ec56bf79..000000000 --- a/official/vision/beta/ops/preprocess_ops_test.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for preprocess_ops.py.""" - -import io -# Import libraries -from absl.testing import parameterized -import numpy as np -from PIL import Image -import tensorflow as tf - -from official.vision.beta.ops import preprocess_ops - - -def _encode_image(image_array, fmt): - image = Image.fromarray(image_array) - with io.BytesIO() as output: - image.save(output, format=fmt) - return output.getvalue() - - -class InputUtilsTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters( - ([1], 10), - ([1, 2], 10), - ([1, 2, 3], 10), - ([11], 10), - ([12, 2], 10), - ([13, 2, 3], 10), - ) - def test_pad_to_fixed_size(self, input_shape, output_size): - # Copies input shape to padding shape. - clip_shape = input_shape[:] - clip_shape[0] = min(output_size, clip_shape[0]) - padding_shape = input_shape[:] - padding_shape[0] = max(output_size - input_shape[0], 0) - expected_outputs = np.concatenate( - [np.ones(clip_shape), np.zeros(padding_shape)], axis=0) - - data = tf.ones(input_shape) - output_data = preprocess_ops.clip_or_pad_to_fixed_size( - data, output_size, constant_values=0) - output_data = output_data.numpy() - self.assertAllClose(output_size, output_data.shape[0]) - self.assertAllClose(expected_outputs, output_data) - - @parameterized.parameters( - (100, 200, 100, 200, 32, 1.0, 1.0, 128, 224), - (100, 256, 128, 256, 32, 1.0, 1.0, 128, 256), - (200, 512, 200, 128, 32, 0.25, 0.25, 224, 128), - ) - def test_resize_and_crop_image_rectangluar_case(self, input_height, - input_width, desired_height, - desired_width, stride, - scale_y, scale_x, - output_height, output_width): - image = tf.convert_to_tensor( - np.random.rand(input_height, input_width, 3)) - - desired_size = (desired_height, desired_width) - resized_image, image_info = preprocess_ops.resize_and_crop_image( - image, - desired_size=desired_size, - padded_size=preprocess_ops.compute_padded_size(desired_size, stride)) - resized_image_shape = tf.shape(resized_image) - - self.assertAllEqual( - [output_height, output_width, 3], - resized_image_shape.numpy()) - self.assertNDArrayNear( - [[input_height, input_width], - [desired_height, desired_width], - [scale_y, scale_x], - [0.0, 0.0]], - image_info.numpy(), - 1e-5) - - @parameterized.parameters( - (100, 200, 220, 220, 32, 1.1, 1.1, 224, 224), - (512, 512, 1024, 1024, 32, 2.0, 2.0, 1024, 1024), - ) - def test_resize_and_crop_image_square_case(self, input_height, input_width, - desired_height, desired_width, - stride, scale_y, scale_x, - 
output_height, output_width): - image = tf.convert_to_tensor( - np.random.rand(input_height, input_width, 3)) - - desired_size = (desired_height, desired_width) - resized_image, image_info = preprocess_ops.resize_and_crop_image( - image, - desired_size=desired_size, - padded_size=preprocess_ops.compute_padded_size(desired_size, stride)) - resized_image_shape = tf.shape(resized_image) - - self.assertAllEqual( - [output_height, output_width, 3], - resized_image_shape.numpy()) - self.assertNDArrayNear( - [[input_height, input_width], - [desired_height, desired_width], - [scale_y, scale_x], - [0.0, 0.0]], - image_info.numpy(), - 1e-5) - - @parameterized.parameters( - (100, 200, 100, 300, 32, 1.0, 1.0, 100, 200, 128, 320), - (200, 100, 100, 300, 32, 1.0, 1.0, 200, 100, 320, 128), - (100, 200, 80, 100, 32, 0.5, 0.5, 50, 100, 96, 128), - (200, 100, 80, 100, 32, 0.5, 0.5, 100, 50, 128, 96), - ) - def test_resize_and_crop_image_v2(self, input_height, input_width, short_side, - long_side, stride, scale_y, scale_x, - desired_height, desired_width, - output_height, output_width): - image = tf.convert_to_tensor( - np.random.rand(input_height, input_width, 3)) - image_shape = tf.shape(image)[0:2] - - desired_size = tf.where( - tf.greater(image_shape[0], image_shape[1]), - tf.constant([long_side, short_side], dtype=tf.int32), - tf.constant([short_side, long_side], dtype=tf.int32)) - resized_image, image_info = preprocess_ops.resize_and_crop_image_v2( - image, - short_side=short_side, - long_side=long_side, - padded_size=preprocess_ops.compute_padded_size(desired_size, stride)) - resized_image_shape = tf.shape(resized_image) - - self.assertAllEqual( - [output_height, output_width, 3], - resized_image_shape.numpy()) - self.assertNDArrayNear( - [[input_height, input_width], - [desired_height, desired_width], - [scale_y, scale_x], - [0.0, 0.0]], - image_info.numpy(), - 1e-5) - - @parameterized.parameters( - (400, 600), (600, 400), - ) - def test_center_crop_image(self, input_height, input_width): - image = tf.convert_to_tensor( - np.random.rand(input_height, input_width, 3)) - cropped_image = preprocess_ops.center_crop_image(image) - cropped_image_shape = tf.shape(cropped_image) - self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy()) - - @parameterized.parameters( - (400, 600), (600, 400), - ) - def test_center_crop_image_v2(self, input_height, input_width): - image_bytes = tf.constant( - _encode_image( - np.uint8(np.random.rand(input_height, input_width, 3) * 255), - fmt='JPEG'), - dtype=tf.string) - cropped_image = preprocess_ops.center_crop_image_v2( - image_bytes, tf.constant([input_height, input_width, 3], tf.int32)) - cropped_image_shape = tf.shape(cropped_image) - self.assertAllEqual([350, 350, 3], cropped_image_shape.numpy()) - - @parameterized.parameters( - (400, 600), (600, 400), - ) - def test_random_crop_image(self, input_height, input_width): - image = tf.convert_to_tensor( - np.random.rand(input_height, input_width, 3)) - _ = preprocess_ops.random_crop_image(image) - - @parameterized.parameters( - (400, 600), (600, 400), - ) - def test_random_crop_image_v2(self, input_height, input_width): - image_bytes = tf.constant( - _encode_image( - np.uint8(np.random.rand(input_height, input_width, 3) * 255), - fmt='JPEG'), - dtype=tf.string) - _ = preprocess_ops.random_crop_image_v2( - image_bytes, tf.constant([input_height, input_width, 3], tf.int32)) - - @parameterized.parameters((400, 600, 0), (400, 600, 0.4), (600, 400, 1.4)) - def testColorJitter(self, input_height, input_width, 
color_jitter): - image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3)) - jittered_image = preprocess_ops.color_jitter(image, color_jitter, - color_jitter, color_jitter) - assert jittered_image.shape == image.shape - - @parameterized.parameters((400, 600, 0), (400, 600, 0.4), (600, 400, 1)) - def testSaturation(self, input_height, input_width, saturation): - image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3)) - jittered_image = preprocess_ops._saturation(image, saturation) - assert jittered_image.shape == image.shape - - @parameterized.parameters((640, 640, 20), (1280, 1280, 30)) - def test_random_crop(self, input_height, input_width, num_boxes): - image = tf.convert_to_tensor(np.random.rand(input_height, input_width, 3)) - boxes_height = np.random.randint(0, input_height, size=(num_boxes, 1)) - top = np.random.randint(0, high=(input_height - boxes_height)) - down = top + boxes_height - boxes_width = np.random.randint(0, input_width, size=(num_boxes, 1)) - left = np.random.randint(0, high=(input_width - boxes_width)) - right = left + boxes_width - boxes = tf.constant( - np.concatenate([top, left, down, right], axis=-1), tf.float32) - labels = tf.constant( - np.random.randint(low=0, high=num_boxes, size=(num_boxes,)), tf.int64) - _ = preprocess_ops.random_crop(image, boxes, labels) - - @parameterized.parameters( - ((640, 640, 3), (1000, 1000), None, (1000, 1000, 3)), - ((1280, 640, 3), 320, None, (640, 320, 3)), - ((640, 1280, 3), 320, None, (320, 640, 3)), - ((640, 640, 3), 320, 100, (100, 100, 3))) - def test_resize_image(self, input_shape, size, max_size, expected_shape): - resized_img, image_info = preprocess_ops.resize_image( - tf.zeros((input_shape)), size, max_size) - self.assertAllEqual(tf.shape(resized_img), expected_shape) - self.assertAllEqual(image_info[0], input_shape[:-1]) - self.assertAllEqual(image_info[1], expected_shape[:-1]) - self.assertAllEqual( - image_info[2], - np.array(expected_shape[:-1]) / np.array(input_shape[:-1])) - self.assertAllEqual(image_info[3], [0, 0]) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/ops/sampling_ops.py b/official/vision/beta/ops/sampling_ops.py deleted file mode 100644 index f86979e13..000000000 --- a/official/vision/beta/ops/sampling_ops.py +++ /dev/null @@ -1,383 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Class to subsample minibatches by balancing positives and negatives. - -Subsamples minibatches based on a pre-specified positive fraction in range -[0,1]. The class presumes there are many more negatives than positive examples: -if the desired batch_size cannot be achieved with the pre-specified positive -fraction, it fills the rest with negative examples. If this is not sufficient -for obtaining the desired batch_size, it returns fewer examples. - -The main function to call is Subsample(self, indicator, labels). 
For convenience -one can also call SubsampleWeights(self, weights, labels) which is defined in -the minibatch_sampler base class. - -When is_static is True, it implements a method that guarantees static shapes. -It also ensures the length of output of the subsample is always batch_size, even -when number of examples set to True in indicator is less than batch_size. - -This is originally implemented in TensorFlow Object Detection API. -""" - -# Import libraries -import tensorflow as tf - - -def combined_static_and_dynamic_shape(tensor): - """Returns a list containing static and dynamic values for the dimensions. - - Returns a list of static and dynamic values for shape dimensions. This is - useful to preserve static shapes when available in reshape operation. - - Args: - tensor: A tensor of any type. - - Returns: - A list of size tensor.shape.ndims containing integers or a scalar tensor. - """ - static_tensor_shape = tensor.shape.as_list() - dynamic_tensor_shape = tf.shape(input=tensor) - combined_shape = [] - for index, dim in enumerate(static_tensor_shape): - if dim is not None: - combined_shape.append(dim) - else: - combined_shape.append(dynamic_tensor_shape[index]) - return combined_shape - - -def indices_to_dense_vector(indices, - size, - indices_value=1., - default_value=0, - dtype=tf.float32): - """Creates dense vector with indices set to specific value and rest to zeros. - - This function exists because it is unclear if it is safe to use - tf.sparse_to_dense(indices, [size], 1, validate_indices=False) - with indices which are not ordered. - This function accepts a dynamic size (e.g. tf.shape(tensor)[0]) - - Args: - indices: 1d Tensor with integer indices which are to be set to - indices_values. - size: scalar with size (integer) of output Tensor. - indices_value: values of elements specified by indices in the output vector - default_value: values of other elements in the output vector. - dtype: data type. - - Returns: - dense 1D Tensor of shape [size] with indices set to indices_values and the - rest set to default_value. - """ - size = tf.cast(size, dtype=tf.int32) - zeros = tf.ones([size], dtype=dtype) * default_value - values = tf.ones_like(indices, dtype=dtype) * indices_value - - return tf.dynamic_stitch( - [tf.range(size), tf.cast(indices, dtype=tf.int32)], [zeros, values]) - - -def matmul_gather_on_zeroth_axis(params, indices, scope=None): - """Matrix multiplication based implementation of tf.gather on zeroth axis. - - TODO(rathodv, jonathanhuang): enable sparse matmul option. - - Args: - params: A float32 Tensor. The tensor from which to gather values. - Must be at least rank 1. - indices: A Tensor. Must be one of the following types: int32, int64. - Must be in range [0, params.shape[0]) - scope: A name for the operation (optional). - - Returns: - A Tensor. Has the same type as params. Values from params gathered - from indices given by indices, with shape indices.shape + params.shape[1:]. 
- """ - scope = scope or 'MatMulGather' - with tf.name_scope(scope): - params_shape = combined_static_and_dynamic_shape(params) - indices_shape = combined_static_and_dynamic_shape(indices) - params2d = tf.reshape(params, [params_shape[0], -1]) - indicator_matrix = tf.one_hot(indices, params_shape[0]) - gathered_result_flattened = tf.matmul(indicator_matrix, params2d) - return tf.reshape(gathered_result_flattened, - tf.stack(indices_shape + params_shape[1:])) - - -class BalancedPositiveNegativeSampler: - """Subsamples minibatches to a desired balance of positives and negatives.""" - - def __init__(self, positive_fraction=0.5, is_static=False): - """Constructs a minibatch sampler. - - Args: - positive_fraction: desired fraction of positive examples (scalar in [0,1]) - in the batch. - is_static: If True, uses an implementation with static shape guarantees. - - Raises: - ValueError: if positive_fraction < 0, or positive_fraction > 1 - """ - if positive_fraction < 0 or positive_fraction > 1: - raise ValueError('positive_fraction should be in range [0,1]. ' - 'Received: %s.' % positive_fraction) - self._positive_fraction = positive_fraction - self._is_static = is_static - - @staticmethod - def subsample_indicator(indicator, num_samples): - """Subsample indicator vector. - - Given a boolean indicator vector with M elements set to `True`, the function - assigns all but `num_samples` of these previously `True` elements to - `False`. If `num_samples` is greater than M, the original indicator vector - is returned. - - Args: - indicator: a 1-dimensional boolean tensor indicating which elements - are allowed to be sampled and which are not. - num_samples: int32 scalar tensor - - Returns: - a boolean tensor with the same shape as input (indicator) tensor - """ - indices = tf.where(indicator) - indices = tf.random.shuffle(indices) - indices = tf.reshape(indices, [-1]) - - num_samples = tf.minimum(tf.size(input=indices), num_samples) - selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1])) - - selected_indicator = indices_to_dense_vector( - selected_indices, - tf.shape(input=indicator)[0]) - - return tf.equal(selected_indicator, 1) - - def _get_num_pos_neg_samples(self, sorted_indices_tensor, sample_size): - """Counts the number of positives and negatives numbers to be sampled. - - Args: - sorted_indices_tensor: A sorted int32 tensor of shape [N] which contains - the signed indices of the examples where the sign is based on the label - value. The examples that cannot be sampled are set to 0. It samples - at most sample_size*positive_fraction positive examples and remaining - from negative examples. - sample_size: Size of subsamples. - - Returns: - A tuple containing the number of positive and negative labels in the - subsample. - """ - input_length = tf.shape(input=sorted_indices_tensor)[0] - valid_positive_index = tf.greater(sorted_indices_tensor, - tf.zeros(input_length, tf.int32)) - num_sampled_pos = tf.reduce_sum( - input_tensor=tf.cast(valid_positive_index, tf.int32)) - max_num_positive_samples = tf.constant( - int(sample_size * self._positive_fraction), tf.int32) - num_positive_samples = tf.minimum(max_num_positive_samples, num_sampled_pos) - num_negative_samples = tf.constant(sample_size, - tf.int32) - num_positive_samples - - return num_positive_samples, num_negative_samples - - def _get_values_from_start_and_end(self, input_tensor, num_start_samples, - num_end_samples, total_num_samples): - """slices num_start_samples and last num_end_samples from input_tensor. 
- - Args: - input_tensor: An int32 tensor of shape [N] to be sliced. - num_start_samples: Number of examples to be sliced from the beginning - of the input tensor. - num_end_samples: Number of examples to be sliced from the end of the - input tensor. - total_num_samples: Sum of is num_start_samples and num_end_samples. This - should be a scalar. - - Returns: - A tensor containing the first num_start_samples and last num_end_samples - from input_tensor. - - """ - input_length = tf.shape(input=input_tensor)[0] - start_positions = tf.less(tf.range(input_length), num_start_samples) - end_positions = tf.greater_equal( - tf.range(input_length), input_length - num_end_samples) - selected_positions = tf.logical_or(start_positions, end_positions) - selected_positions = tf.cast(selected_positions, tf.float32) - indexed_positions = tf.multiply(tf.cumsum(selected_positions), - selected_positions) - one_hot_selector = tf.one_hot(tf.cast(indexed_positions, tf.int32) - 1, - total_num_samples, - dtype=tf.float32) - return tf.cast(tf.tensordot(tf.cast(input_tensor, tf.float32), - one_hot_selector, axes=[0, 0]), tf.int32) - - def _static_subsample(self, indicator, batch_size, labels): - """Returns subsampled minibatch. - - Args: - indicator: boolean tensor of shape [N] whose True entries can be sampled. - N should be a complie time constant. - batch_size: desired batch size. This scalar cannot be None. - labels: boolean tensor of shape [N] denoting positive(=True) and negative - (=False) examples. N should be a complie time constant. - - Returns: - sampled_idx_indicator: boolean tensor of shape [N], True for entries which - are sampled. It ensures the length of output of the subsample is always - batch_size, even when number of examples set to True in indicator is - less than batch_size. - - Raises: - ValueError: if labels and indicator are not 1D boolean tensors. - """ - # Check if indicator and labels have a static size. - if not indicator.shape.is_fully_defined(): - raise ValueError('indicator must be static in shape when is_static is' - 'True') - if not labels.shape.is_fully_defined(): - raise ValueError('labels must be static in shape when is_static is' - 'True') - if not isinstance(batch_size, int): - raise ValueError('batch_size has to be an integer when is_static is' - 'True.') - - input_length = tf.shape(input=indicator)[0] - - # Set the number of examples set True in indicator to be at least - # batch_size. - num_true_sampled = tf.reduce_sum( - input_tensor=tf.cast(indicator, tf.float32)) - additional_false_sample = tf.less_equal( - tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)), - batch_size - num_true_sampled) - indicator = tf.logical_or(indicator, additional_false_sample) - - # Shuffle indicator and label. Need to store the permutation to restore the - # order post sampling. 
- permutation = tf.random.shuffle(tf.range(input_length)) - indicator = matmul_gather_on_zeroth_axis( - tf.cast(indicator, tf.float32), permutation) - labels = matmul_gather_on_zeroth_axis( - tf.cast(labels, tf.float32), permutation) - - # index (starting from 1) when indicator is True, 0 when False - indicator_idx = tf.where( - tf.cast(indicator, tf.bool), tf.range(1, input_length + 1), - tf.zeros(input_length, tf.int32)) - - # Replace -1 for negative, +1 for positive labels - signed_label = tf.where( - tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32), - tf.scalar_mul(-1, tf.ones(input_length, tf.int32))) - # negative of index for negative label, positive index for positive label, - # 0 when indicator is False. - signed_indicator_idx = tf.multiply(indicator_idx, signed_label) - sorted_signed_indicator_idx = tf.nn.top_k( - signed_indicator_idx, input_length, sorted=True).values - - [num_positive_samples, - num_negative_samples] = self._get_num_pos_neg_samples( - sorted_signed_indicator_idx, batch_size) - - sampled_idx = self._get_values_from_start_and_end( - sorted_signed_indicator_idx, num_positive_samples, - num_negative_samples, batch_size) - - # Shift the indices to start from 0 and remove any samples that are set as - # False. - sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32) - sampled_idx = tf.multiply( - tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32), - sampled_idx) - - sampled_idx_indicator = tf.cast( - tf.reduce_sum( - input_tensor=tf.one_hot(sampled_idx, depth=input_length), axis=0), - tf.bool) - - # project back the order based on stored permutations - reprojections = tf.one_hot(permutation, depth=input_length, - dtype=tf.float32) - return tf.cast(tf.tensordot( - tf.cast(sampled_idx_indicator, tf.float32), - reprojections, axes=[0, 0]), tf.bool) - - def subsample(self, indicator, batch_size, labels, scope=None): - """Returns subsampled minibatch. - - Args: - indicator: boolean tensor of shape [N] whose True entries can be sampled. - batch_size: desired batch size. If None, keeps all positive samples and - randomly selects negative samples so that the positive sample fraction - matches self._positive_fraction. It cannot be None is is_static is True. - labels: boolean tensor of shape [N] denoting positive(=True) and negative - (=False) examples. - scope: name scope. - - Returns: - sampled_idx_indicator: boolean tensor of shape [N], True for entries which - are sampled. - - Raises: - ValueError: if labels and indicator are not 1D boolean tensors. - """ - if len(indicator.get_shape().as_list()) != 1: - raise ValueError('indicator must be 1 dimensional, got a tensor of ' - 'shape %s' % indicator.get_shape()) - if len(labels.get_shape().as_list()) != 1: - raise ValueError('labels must be 1 dimensional, got a tensor of ' - 'shape %s' % labels.get_shape()) - if labels.dtype != tf.bool: - raise ValueError('labels should be of type bool. Received: %s' % - labels.dtype) - if indicator.dtype != tf.bool: - raise ValueError('indicator should be of type bool. 
Received: %s' % - indicator.dtype) - scope = scope or 'BalancedPositiveNegativeSampler' - with tf.name_scope(scope): - if self._is_static: - return self._static_subsample(indicator, batch_size, labels) - - else: - # Only sample from indicated samples - negative_idx = tf.logical_not(labels) - positive_idx = tf.logical_and(labels, indicator) - negative_idx = tf.logical_and(negative_idx, indicator) - - # Sample positive and negative samples separately - if batch_size is None: - max_num_pos = tf.reduce_sum( - input_tensor=tf.cast(positive_idx, dtype=tf.int32)) - else: - max_num_pos = int(self._positive_fraction * batch_size) - sampled_pos_idx = self.subsample_indicator(positive_idx, max_num_pos) - num_sampled_pos = tf.reduce_sum( - input_tensor=tf.cast(sampled_pos_idx, tf.int32)) - if batch_size is None: - negative_positive_ratio = ( - 1 - self._positive_fraction) / self._positive_fraction - max_num_neg = tf.cast( - negative_positive_ratio * - tf.cast(num_sampled_pos, dtype=tf.float32), - dtype=tf.int32) - else: - max_num_neg = batch_size - num_sampled_pos - sampled_neg_idx = self.subsample_indicator(negative_idx, max_num_neg) - - return tf.logical_or(sampled_pos_idx, sampled_neg_idx) diff --git a/official/vision/beta/ops/spatial_transform_ops.py b/official/vision/beta/ops/spatial_transform_ops.py deleted file mode 100644 index c2f6658df..000000000 --- a/official/vision/beta/ops/spatial_transform_ops.py +++ /dev/null @@ -1,544 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Spatial transform ops.""" - -import tensorflow as tf - -_EPSILON = 1e-8 - - -def _feature_bilinear_interpolation(features, kernel_y, kernel_x): - """Feature bilinear interpolation. - - The RoIAlign feature f can be computed by bilinear interpolation - of four neighboring feature points f0, f1, f2, and f3. - - f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T - [f10, f11]] - f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11 - f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11 - kernel_y = [hy, ly] - kernel_x = [hx, lx] - - Args: - features: The features are in shape of [batch_size, num_boxes, output_size * - 2, output_size * 2, num_filters]. - kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1]. - kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1]. - - Returns: - A 5-D tensor representing feature crop of shape - [batch_size, num_boxes, output_size, output_size, num_filters]. - - """ - features_shape = tf.shape(features) - batch_size, num_boxes, output_size, num_filters = ( - features_shape[0], features_shape[1], features_shape[2], - features_shape[4]) - - output_size = output_size // 2 - kernel_y = tf.reshape(kernel_y, [batch_size, num_boxes, output_size * 2, 1]) - kernel_x = tf.reshape(kernel_x, [batch_size, num_boxes, 1, output_size * 2]) - # Use implicit broadcast to generate the interpolation kernel. The - # multiplier `4` is for avg pooling. 
- interpolation_kernel = kernel_y * kernel_x * 4 - - # Interpolate the gathered features with computed interpolation kernels. - features *= tf.cast( - tf.expand_dims(interpolation_kernel, axis=-1), dtype=features.dtype) - features = tf.reshape( - features, - [batch_size * num_boxes, output_size * 2, output_size * 2, num_filters]) - features = tf.nn.avg_pool(features, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID') - features = tf.reshape( - features, [batch_size, num_boxes, output_size, output_size, num_filters]) - return features - - -def _compute_grid_positions(boxes, boundaries, output_size, sample_offset): - """Computes the grid position w.r.t. the corresponding feature map. - - Args: - boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the - information of each box w.r.t. the corresponding feature map. - boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left - corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float) - in terms of the number of pixels of the corresponding feature map size. - boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing - the boundary (in (y, x)) of the corresponding feature map for each box. - Any resampled grid points that go beyond the bounary will be clipped. - output_size: a scalar indicating the output crop size. - sample_offset: a float number in [0, 1] indicates the subpixel sample offset - from grid point. - - Returns: - kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1]. - kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1]. - box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2] - box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2] - """ - boxes_shape = tf.shape(boxes) - batch_size, num_boxes = boxes_shape[0], boxes_shape[1] - if batch_size is None: - batch_size = tf.shape(boxes)[0] - box_grid_x = [] - box_grid_y = [] - for i in range(output_size): - box_grid_x.append(boxes[:, :, 1] + - (i + sample_offset) * boxes[:, :, 3] / output_size) - box_grid_y.append(boxes[:, :, 0] + - (i + sample_offset) * boxes[:, :, 2] / output_size) - box_grid_x = tf.stack(box_grid_x, axis=2) - box_grid_y = tf.stack(box_grid_y, axis=2) - - box_grid_y0 = tf.floor(box_grid_y) - box_grid_x0 = tf.floor(box_grid_x) - box_grid_x0 = tf.maximum(tf.cast(0., dtype=box_grid_x0.dtype), box_grid_x0) - box_grid_y0 = tf.maximum(tf.cast(0., dtype=box_grid_y0.dtype), box_grid_y0) - - box_grid_x0 = tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1)) - box_grid_x1 = tf.minimum(box_grid_x0 + 1, - tf.expand_dims(boundaries[:, :, 1], -1)) - box_grid_y0 = tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1)) - box_grid_y1 = tf.minimum(box_grid_y0 + 1, - tf.expand_dims(boundaries[:, :, 0], -1)) - - box_gridx0x1 = tf.stack([box_grid_x0, box_grid_x1], axis=-1) - box_gridy0y1 = tf.stack([box_grid_y0, box_grid_y1], axis=-1) - - # The RoIAlign feature f can be computed by bilinear interpolation of four - # neighboring feature points f0, f1, f2, and f3. 
- # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T - # [f10, f11]] - # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11 - # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11 - ly = box_grid_y - box_grid_y0 - lx = box_grid_x - box_grid_x0 - hy = 1.0 - ly - hx = 1.0 - lx - kernel_y = tf.reshape( - tf.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size, 2, 1]) - kernel_x = tf.reshape( - tf.stack([hx, lx], axis=3), [batch_size, num_boxes, output_size, 2, 1]) - return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 - - -def multilevel_crop_and_resize(features, - boxes, - output_size=7, - sample_offset=0.5): - """Crop and resize on multilevel feature pyramid. - - Generate the (output_size, output_size) set of pixels for each input box - by first locating the box into the correct feature level, and then cropping - and resizing it using the correspoding feature map of that level. - - Args: - features: A dictionary with key as pyramid level and value as features. The - features are in shape of [batch_size, height_l, width_l, num_filters]. - boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents - a box with [y1, x1, y2, x2] in un-normalized coordinates. - output_size: A scalar to indicate the output crop size. - sample_offset: a float number in [0, 1] indicates the subpixel sample offset - from grid point. - - Returns: - A 5-D tensor representing feature crop of shape - [batch_size, num_boxes, output_size, output_size, num_filters]. - """ - - with tf.name_scope('multilevel_crop_and_resize'): - levels = list(features.keys()) - min_level = int(min(levels)) - max_level = int(max(levels)) - features_shape = tf.shape(features[str(min_level)]) - batch_size, max_feature_height, max_feature_width, num_filters = ( - features_shape[0], features_shape[1], features_shape[2], - features_shape[3]) - - num_boxes = tf.shape(boxes)[1] - - # Stack feature pyramid into a features_all of shape - # [batch_size, levels, height, width, num_filters]. - features_all = [] - feature_heights = [] - feature_widths = [] - for level in range(min_level, max_level + 1): - shape = features[str(level)].get_shape().as_list() - feature_heights.append(shape[1]) - feature_widths.append(shape[2]) - # Concat tensor of [batch_size, height_l * width_l, num_filters] for each - # levels. - features_all.append( - tf.reshape(features[str(level)], [batch_size, -1, num_filters])) - features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters]) - - # Calculate height_l * width_l for each level. - level_dim_sizes = [ - feature_widths[i] * feature_heights[i] - for i in range(len(feature_widths)) - ] - # level_dim_offsets is accumulated sum of level_dim_size. - level_dim_offsets = [0] - for i in range(len(feature_widths) - 1): - level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i]) - batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1] - level_dim_offsets = tf.constant(level_dim_offsets, tf.int32) - height_dim_sizes = tf.constant(feature_widths, tf.int32) - - # Assigns boxes to the right level. - box_width = boxes[:, :, 3] - boxes[:, :, 1] - box_height = boxes[:, :, 2] - boxes[:, :, 0] - areas_sqrt = tf.sqrt( - tf.cast(box_height, tf.float32) * tf.cast(box_width, tf.float32)) - - levels = tf.cast( - tf.math.floordiv( - tf.math.log(tf.math.divide_no_nan(areas_sqrt, 224.0)), - tf.math.log(2.0)) + 4.0, - dtype=tf.int32) - # Maps levels between [min_level, max_level]. 
- levels = tf.minimum(max_level, tf.maximum(levels, min_level)) - - # Projects box location and sizes to corresponding feature levels. - scale_to_level = tf.cast( - tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)), - dtype=boxes.dtype) - boxes /= tf.expand_dims(scale_to_level, axis=2) - box_width /= scale_to_level - box_height /= scale_to_level - boxes = tf.concat([boxes[:, :, 0:2], - tf.expand_dims(box_height, -1), - tf.expand_dims(box_width, -1)], axis=-1) - - # Maps levels to [0, max_level-min_level]. - levels -= min_level - level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32)) - boundary = tf.cast( - tf.concat([ - tf.expand_dims( - [[tf.cast(max_feature_height, tf.float32)]] / level_strides - 1, - axis=-1), - tf.expand_dims( - [[tf.cast(max_feature_width, tf.float32)]] / level_strides - 1, - axis=-1), - ], - axis=-1), boxes.dtype) - - # Compute grid positions. - kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = _compute_grid_positions( - boxes, boundary, output_size, sample_offset) - - x_indices = tf.cast( - tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]), - dtype=tf.int32) - y_indices = tf.cast( - tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]), - dtype=tf.int32) - - batch_size_offset = tf.tile( - tf.reshape( - tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]), - [1, num_boxes, output_size * 2, output_size * 2]) - # Get level offset for each box. Each box belongs to one level. - levels_offset = tf.tile( - tf.reshape( - tf.gather(level_dim_offsets, levels), - [batch_size, num_boxes, 1, 1]), - [1, 1, output_size * 2, output_size * 2]) - y_indices_offset = tf.tile( - tf.reshape( - y_indices * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1), - [batch_size, num_boxes, output_size * 2, 1]), - [1, 1, 1, output_size * 2]) - x_indices_offset = tf.tile( - tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]), - [1, 1, output_size * 2, 1]) - indices = tf.reshape( - batch_size_offset + levels_offset + y_indices_offset + x_indices_offset, - [-1]) - - # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar - # performance. - features_per_box = tf.reshape( - tf.gather(features_r2, indices), - [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters]) - - # Bilinear interpolation. - features_per_box = _feature_bilinear_interpolation( - features_per_box, kernel_y, kernel_x) - return features_per_box - - -def _selective_crop_and_resize(features, - boxes, - box_levels, - boundaries, - output_size=7, - sample_offset=0.5, - use_einsum_gather=False): - """Crop and resize boxes on a set of feature maps. - - Given multiple features maps indexed by different levels, and a set of boxes - where each box is mapped to a certain level, it selectively crops and resizes - boxes from the corresponding feature maps to generate the box features. - - We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf, - figure 3 for reference). Specifically, for each feature map, we select an - (output_size, output_size) set of pixels corresponding to the box location, - and then use bilinear interpolation to select the feature value for each - pixel. - - For performance, we perform the gather and interpolation on all layers as a - single operation. In this op the multi-level features are first stacked and - gathered into [2*output_size, 2*output_size] feature points. 
Then bilinear - interpolation is performed on the gathered feature points to generate - [output_size, output_size] RoIAlign feature map. - - Here is the step-by-step algorithm: - 1. The multi-level features are gathered into a - [batch_size, num_boxes, output_size*2, output_size*2, num_filters] - Tensor. The Tensor contains four neighboring feature points for each - vertex in the output grid. - 2. Compute the interpolation kernel of shape - [batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis - can be seen as stacking 2x2 interpolation kernels for all vertices in the - output grid. - 3. Element-wise multiply the gathered features and interpolation kernel. - Then apply 2x2 average pooling to reduce spatial dimension to - output_size. - - Args: - features: a 5-D tensor of shape [batch_size, num_levels, max_height, - max_width, num_filters] where cropping and resizing are based. - boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the - information of each box w.r.t. the corresponding feature map. - boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left - corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float) - in terms of the number of pixels of the corresponding feature map size. - box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing - the 0-based corresponding feature level index of each box. - boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing - the boundary (in (y, x)) of the corresponding feature map for each box. - Any resampled grid points that go beyond the bounary will be clipped. - output_size: a scalar indicating the output crop size. - sample_offset: a float number in [0, 1] indicates the subpixel sample offset - from grid point. - use_einsum_gather: use einsum to replace gather or not. Replacing einsum - with gather can improve performance when feature size is not large, einsum - is friendly with model partition as well. Gather's performance is better - when feature size is very large and there are multiple box levels. - - Returns: - features_per_box: a 5-D tensor of shape - [batch_size, num_boxes, output_size, output_size, num_filters] - representing the cropped features. - """ - (batch_size, num_levels, max_feature_height, max_feature_width, - num_filters) = features.get_shape().as_list() - if batch_size is None: - batch_size = tf.shape(features)[0] - _, num_boxes, _ = boxes.get_shape().as_list() - - kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = _compute_grid_positions( - boxes, boundaries, output_size, sample_offset) - x_indices = tf.cast( - tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]), - dtype=tf.int32) - y_indices = tf.cast( - tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]), - dtype=tf.int32) - - if use_einsum_gather: - # Blinear interpolation is done during the last two gathers: - # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T - # [f10, f11]] - # [[f00, f01], - # [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot) - # where [hy, ly] and [hx, lx] are the bilinear interpolation kernel. 
- y_indices = tf.cast( - tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size, 2]), - dtype=tf.int32) - x_indices = tf.cast( - tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size, 2]), - dtype=tf.int32) - - # shape is [batch_size, num_boxes, output_size, 2, height] - grid_y_one_hot = tf.one_hot( - tf.cast(y_indices, tf.int32), max_feature_height, dtype=kernel_y.dtype) - # shape is [batch_size, num_boxes, output_size, 2, width] - grid_x_one_hot = tf.one_hot( - tf.cast(x_indices, tf.int32), max_feature_width, dtype=kernel_x.dtype) - - # shape is [batch_size, num_boxes, output_size, height] - grid_y_weight = tf.reduce_sum( - tf.multiply(grid_y_one_hot, kernel_y), axis=-2) - # shape is [batch_size, num_boxes, output_size, width] - grid_x_weight = tf.reduce_sum( - tf.multiply(grid_x_one_hot, kernel_x), axis=-2) - - # Gather for y_axis. - # shape is [batch_size, num_boxes, output_size, width, features] - features_per_box = tf.einsum('bmhwf,bmoh->bmowf', features, - tf.cast(grid_y_weight, features.dtype)) - # Gather for x_axis. - # shape is [batch_size, num_boxes, output_size, output_size, features] - features_per_box = tf.einsum('bmhwf,bmow->bmhof', features_per_box, - tf.cast(grid_x_weight, features.dtype)) - else: - height_dim_offset = max_feature_width - level_dim_offset = max_feature_height * height_dim_offset - batch_dim_offset = num_levels * level_dim_offset - - batch_size_offset = tf.tile( - tf.reshape( - tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]), - [1, num_boxes, output_size * 2, output_size * 2]) - box_levels_offset = tf.tile( - tf.reshape(box_levels * level_dim_offset, - [batch_size, num_boxes, 1, 1]), - [1, 1, output_size * 2, output_size * 2]) - y_indices_offset = tf.tile( - tf.reshape(y_indices * height_dim_offset, - [batch_size, num_boxes, output_size * 2, 1]), - [1, 1, 1, output_size * 2]) - x_indices_offset = tf.tile( - tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]), - [1, 1, output_size * 2, 1]) - - indices = tf.reshape( - batch_size_offset + box_levels_offset + y_indices_offset + - x_indices_offset, [-1]) - - features = tf.reshape(features, [-1, num_filters]) - # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar - # performance. - features_per_box = tf.reshape( - tf.gather(features, indices), - [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters]) - features_per_box = _feature_bilinear_interpolation( - features_per_box, kernel_y, kernel_x) - - return features_per_box - - -def crop_mask_in_target_box(masks, - boxes, - target_boxes, - output_size, - sample_offset=0, - use_einsum=True): - """Crop masks in target boxes. - - Args: - masks: A tensor with a shape of [batch_size, num_masks, height, width]. - boxes: a float tensor representing box cooridnates that tightly enclose - masks with a shape of [batch_size, num_masks, 4] in un-normalized - coordinates. A box is represented by [ymin, xmin, ymax, xmax]. - target_boxes: a float tensor representing target box cooridnates for masks - with a shape of [batch_size, num_masks, 4] in un-normalized coordinates. A - box is represented by [ymin, xmin, ymax, xmax]. - output_size: A scalar to indicate the output crop size. It currently only - supports to output a square shape outputs. - sample_offset: a float number in [0, 1] indicates the subpixel sample offset - from grid point. - use_einsum: Use einsum to replace gather in selective_crop_and_resize. 
- - Returns: - A 4-D tensor representing feature crop of shape - [batch_size, num_boxes, output_size, output_size]. - """ - with tf.name_scope('crop_mask_in_target_box'): - # Cast to float32, as the y_transform and other transform variables may - # overflow in float16 - masks = tf.cast(masks, tf.float32) - boxes = tf.cast(boxes, tf.float32) - target_boxes = tf.cast(target_boxes, tf.float32) - - batch_size, num_masks, height, width = masks.get_shape().as_list() - if batch_size is None: - batch_size = tf.shape(masks)[0] - masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1]) - # Pad zeros on the boundary of masks. - masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4, width + 4) - masks = tf.reshape(masks, [batch_size, num_masks, height+4, width+4, 1]) - - # Projects target box locations and sizes to corresponding cropped - # mask coordinates. - gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split( - value=boxes, num_or_size_splits=4, axis=2) - bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split( - value=target_boxes, num_or_size_splits=4, axis=2) - y_transform = (bb_y_min - gt_y_min) * height / ( - gt_y_max - gt_y_min + _EPSILON) + 2 - x_transform = (bb_x_min - gt_x_min) * height / ( - gt_x_max - gt_x_min + _EPSILON) + 2 - h_transform = (bb_y_max - bb_y_min) * width / ( - gt_y_max - gt_y_min + _EPSILON) - w_transform = (bb_x_max - bb_x_min) * width / ( - gt_x_max - gt_x_min + _EPSILON) - - boundaries = tf.concat( - [tf.ones_like(y_transform) * ((height + 4) - 1), - tf.ones_like(x_transform) * ((width + 4) - 1)], - axis=-1) - boundaries = tf.cast(boundaries, dtype=y_transform.dtype) - - # Reshape tensors to have the right shape for selective_crop_and_resize. - trasnformed_boxes = tf.concat( - [y_transform, x_transform, h_transform, w_transform], -1) - levels = tf.tile(tf.reshape(tf.range(num_masks), [1, num_masks]), - [batch_size, 1]) - - cropped_masks = _selective_crop_and_resize( - masks, - trasnformed_boxes, - levels, - boundaries, - output_size, - sample_offset=sample_offset, - use_einsum_gather=use_einsum) - cropped_masks = tf.squeeze(cropped_masks, axis=-1) - - return cropped_masks - - -def nearest_upsampling(data, scale, use_keras_layer=False): - """Nearest neighbor upsampling implementation. - - Args: - data: A tensor with a shape of [batch, height_in, width_in, channels]. - scale: An integer multiple to scale resolution of input data. - use_keras_layer: If True, use keras Upsampling2D layer. - - Returns: - data_up: A tensor with a shape of - [batch, height_in*scale, width_in*scale, channels]. Same dtype as input - data. - """ - if use_keras_layer: - return tf.keras.layers.UpSampling2D(size=(scale, scale), - interpolation='nearest')(data) - with tf.name_scope('nearest_upsampling'): - bs, _, _, c = data.get_shape().as_list() - shape = tf.shape(input=data) - h = shape[1] - w = shape[2] - bs = -1 if bs is None else bs - # Uses reshape to quickly upsample the input. The nearest pixel is selected - # via tiling. - data = tf.tile( - tf.reshape(data, [bs, h, 1, w, 1, c]), [1, 1, scale, 1, scale, 1]) - return tf.reshape(data, [bs, h * scale, w * scale, c]) diff --git a/official/vision/beta/ops/target_gather.py b/official/vision/beta/ops/target_gather.py deleted file mode 100644 index 3c8c3a0a4..000000000 --- a/official/vision/beta/ops/target_gather.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Definition of target gather, which gathers targets from indices.""" - -import tensorflow as tf - - -class TargetGather: - """Targer gather for dense object detector.""" - - def __call__(self, labels, match_indices, mask=None, mask_val=0.0): - """Labels anchors with ground truth inputs. - - B: batch_size - N: number of groundtruth boxes. - - Args: - labels: An integer tensor with shape [N, dims] or [B, N, ...] representing - groundtruth labels. - match_indices: An integer tensor with shape [M] or [B, M] representing - match label index. - mask: An boolean tensor with shape [M, dims] or [B, M,...] representing - match labels. - mask_val: An integer to fill in for mask. - - Returns: - target: An integer Tensor with shape [M] or [B, M] - Raises: - ValueError: If `labels` is higher than rank 3. - """ - if len(labels.shape) <= 2: - return self._gather_unbatched(labels, match_indices, mask, mask_val) - elif len(labels.shape) == 3: - return self._gather_batched(labels, match_indices, mask, mask_val) - else: - raise ValueError("`TargetGather` does not support `labels` with rank " - "larger than 3, got {}".format(len(labels.shape))) - - def _gather_unbatched(self, labels, match_indices, mask, mask_val): - """Gather based on unbatched labels and boxes.""" - num_gt_boxes = tf.shape(labels)[0] - - def _assign_when_rows_empty(): - if len(labels.shape) > 1: - mask_shape = [match_indices.shape[0], labels.shape[-1]] - else: - mask_shape = [match_indices.shape[0]] - return tf.cast(mask_val, labels.dtype) * tf.ones( - mask_shape, dtype=labels.dtype) - - def _assign_when_rows_not_empty(): - targets = tf.gather(labels, match_indices) - if mask is None: - return targets - else: - masked_targets = tf.cast(mask_val, labels.dtype) * tf.ones_like( - mask, dtype=labels.dtype) - return tf.where(mask, masked_targets, targets) - - return tf.cond(tf.greater(num_gt_boxes, 0), - _assign_when_rows_not_empty, - _assign_when_rows_empty) - - def _gather_batched(self, labels, match_indices, mask, mask_val): - """Gather based on batched labels.""" - batch_size = labels.shape[0] - if batch_size == 1: - if mask is not None: - result = self._gather_unbatched( - tf.squeeze(labels, axis=0), tf.squeeze(match_indices, axis=0), - tf.squeeze(mask, axis=0), mask_val) - else: - result = self._gather_unbatched( - tf.squeeze(labels, axis=0), tf.squeeze(match_indices, axis=0), - None, mask_val) - return tf.expand_dims(result, axis=0) - else: - indices_shape = tf.shape(match_indices) - indices_dtype = match_indices.dtype - batch_indices = (tf.expand_dims( - tf.range(indices_shape[0], dtype=indices_dtype), axis=-1) * - tf.ones([1, indices_shape[-1]], dtype=indices_dtype)) - gather_nd_indices = tf.stack( - [batch_indices, match_indices], axis=-1) - targets = tf.gather_nd(labels, gather_nd_indices) - if mask is None: - return targets - else: - masked_targets = tf.cast(mask_val, labels.dtype) * tf.ones_like( - mask, dtype=labels.dtype) - return tf.where(mask, masked_targets, targets) diff --git 
a/official/vision/beta/ops/target_gather_test.py b/official/vision/beta/ops/target_gather_test.py deleted file mode 100644 index 962a31d88..000000000 --- a/official/vision/beta/ops/target_gather_test.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for target_gather.py.""" - -import tensorflow as tf - -from official.vision.beta.ops import target_gather - - -class TargetGatherTest(tf.test.TestCase): - - def test_target_gather_batched(self): - gt_boxes = tf.constant( - [[ - [0, 0, 5, 5], - [0, 5, 5, 10], - [5, 0, 10, 5], - [5, 5, 10, 10], - ]], - dtype=tf.float32) - gt_classes = tf.constant([[[2], [10], [3], [-1]]], dtype=tf.int32) - - labeler = target_gather.TargetGather() - - match_indices = tf.constant([[0, 2]], dtype=tf.int32) - match_indicators = tf.constant([[-2, 1]]) - mask = tf.less_equal(match_indicators, 0) - cls_mask = tf.expand_dims(mask, -1) - matched_gt_classes = labeler(gt_classes, match_indices, cls_mask) - box_mask = tf.tile(cls_mask, [1, 1, 4]) - matched_gt_boxes = labeler(gt_boxes, match_indices, box_mask) - - self.assertAllEqual( - matched_gt_classes.numpy(), [[[0], [3]]]) - self.assertAllClose( - matched_gt_boxes.numpy(), [[[0, 0, 0, 0], [5, 0, 10, 5]]]) - - def test_target_gather_unbatched(self): - gt_boxes = tf.constant( - [ - [0, 0, 5, 5], - [0, 5, 5, 10], - [5, 0, 10, 5], - [5, 5, 10, 10], - ], - dtype=tf.float32) - gt_classes = tf.constant([[2], [10], [3], [-1]], dtype=tf.int32) - - labeler = target_gather.TargetGather() - - match_indices = tf.constant([0, 2], dtype=tf.int32) - match_indicators = tf.constant([-2, 1]) - mask = tf.less_equal(match_indicators, 0) - cls_mask = tf.expand_dims(mask, -1) - matched_gt_classes = labeler(gt_classes, match_indices, cls_mask) - box_mask = tf.tile(cls_mask, [1, 4]) - matched_gt_boxes = labeler(gt_boxes, match_indices, box_mask) - - self.assertAllEqual( - matched_gt_classes.numpy(), [[0], [3]]) - self.assertAllClose( - matched_gt_boxes.numpy(), [[0, 0, 0, 0], [5, 0, 10, 5]]) - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/serving/__init__.py b/official/vision/beta/serving/__init__.py deleted file mode 100644 index 310bfb28f..000000000 --- a/official/vision/beta/serving/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- diff --git a/official/vision/beta/serving/detection.py b/official/vision/beta/serving/detection.py deleted file mode 100644 index 617979c64..000000000 --- a/official/vision/beta/serving/detection.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Detection input and model functions for serving/inference.""" - -from typing import Mapping, Text -import tensorflow as tf - -from official.vision.beta import configs -from official.vision.beta.modeling import factory -from official.vision.beta.ops import anchor -from official.vision.beta.ops import box_ops -from official.vision.beta.ops import preprocess_ops -from official.vision.beta.serving import export_base - - -MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) -STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) - - -class DetectionModule(export_base.ExportModule): - """Detection Module.""" - - def _build_model(self): - - if self._batch_size is None: - raise ValueError('batch_size cannot be None for detection models.') - input_specs = tf.keras.layers.InputSpec(shape=[self._batch_size] + - self._input_image_size + [3]) - - if isinstance(self.params.task.model, configs.maskrcnn.MaskRCNN): - model = factory.build_maskrcnn( - input_specs=input_specs, model_config=self.params.task.model) - elif isinstance(self.params.task.model, configs.retinanet.RetinaNet): - model = factory.build_retinanet( - input_specs=input_specs, model_config=self.params.task.model) - else: - raise ValueError('Detection module not implemented for {} model.'.format( - type(self.params.task.model))) - - return model - - def _build_anchor_boxes(self): - """Builds and returns anchor boxes.""" - model_params = self.params.task.model - input_anchor = anchor.build_anchor_generator( - min_level=model_params.min_level, - max_level=model_params.max_level, - num_scales=model_params.anchor.num_scales, - aspect_ratios=model_params.anchor.aspect_ratios, - anchor_size=model_params.anchor.anchor_size) - return input_anchor( - image_size=(self._input_image_size[0], self._input_image_size[1])) - - def _build_inputs(self, image): - """Builds detection model inputs for serving.""" - model_params = self.params.task.model - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image(image, - offset=MEAN_RGB, - scale=STDDEV_RGB) - - image, image_info = preprocess_ops.resize_and_crop_image( - image, - self._input_image_size, - padded_size=preprocess_ops.compute_padded_size( - self._input_image_size, 2**model_params.max_level), - aug_scale_min=1.0, - aug_scale_max=1.0) - anchor_boxes = self._build_anchor_boxes() - - return image, anchor_boxes, image_info - - def preprocess(self, images: tf.Tensor) -> ( - tf.Tensor, Mapping[Text, tf.Tensor], tf.Tensor): - """Preprocess inputs to be suitable for the model. - - Args: - images: The images tensor. - Returns: - images: The images tensor cast to float. - anchor_boxes: Dict mapping anchor levels to anchor boxes. 
- image_info: Tensor containing the details of the image resizing. - - """ - model_params = self.params.task.model - with tf.device('cpu:0'): - images = tf.cast(images, dtype=tf.float32) - - # Tensor Specs for map_fn outputs (images, anchor_boxes, and image_info). - images_spec = tf.TensorSpec(shape=self._input_image_size + [3], - dtype=tf.float32) - - num_anchors = model_params.anchor.num_scales * len( - model_params.anchor.aspect_ratios) * 4 - anchor_shapes = [] - for level in range(model_params.min_level, model_params.max_level + 1): - anchor_level_spec = tf.TensorSpec( - shape=[ - self._input_image_size[0] // 2**level, - self._input_image_size[1] // 2**level, num_anchors - ], - dtype=tf.float32) - anchor_shapes.append((str(level), anchor_level_spec)) - - image_info_spec = tf.TensorSpec(shape=[4, 2], dtype=tf.float32) - - images, anchor_boxes, image_info = tf.nest.map_structure( - tf.identity, - tf.map_fn( - self._build_inputs, - elems=images, - fn_output_signature=(images_spec, dict(anchor_shapes), - image_info_spec), - parallel_iterations=32)) - - return images, anchor_boxes, image_info - - def serve(self, images: tf.Tensor): - """Cast image to float and run inference. - - Args: - images: uint8 Tensor of shape [batch_size, None, None, 3] - Returns: - Tensor holding detection output logits. - """ - - # Skip image preprocessing when input_type is tflite so it is compatible - # with TFLite quantization. - if self._input_type != 'tflite': - images, anchor_boxes, image_info = self.preprocess(images) - else: - with tf.device('cpu:0'): - anchor_boxes = self._build_anchor_boxes() - # image_info is a 3D tensor of shape [batch_size, 4, 2]. It is in the - # format of [[original_height, original_width], - # [desired_height, desired_width], [y_scale, x_scale], - # [y_offset, x_offset]]. When input_type is tflite, input image is - # supposed to be preprocessed already. - image_info = tf.convert_to_tensor([[ - self._input_image_size, self._input_image_size, [1.0, 1.0], [0, 0] - ]], - dtype=tf.float32) - input_image_shape = image_info[:, 1, :] - - # To overcome keras.Model extra limitation to save a model with layers that - # have multiple inputs, we use `model.call` here to trigger the forward - # path. Note that, this disables some keras magics happens in `__call__`. - detections = self.model.call( - images=images, - image_shape=input_image_shape, - anchor_boxes=anchor_boxes, - training=False) - - if self.params.task.model.detection_generator.apply_nms: - # For RetinaNet model, apply export_config. - # TODO(huizhongc): Add export_config to fasterrcnn and maskrcnn as needed. - if isinstance(self.params.task.model, configs.retinanet.RetinaNet): - export_config = self.params.task.export_config - # Normalize detection box coordinates to [0, 1]. - if export_config.output_normalized_coordinates: - detection_boxes = ( - detections['detection_boxes'] / - tf.tile(image_info[:, 2:3, :], [1, 1, 2])) - detections['detection_boxes'] = box_ops.normalize_boxes( - detection_boxes, image_info[:, 0:1, :]) - - # Cast num_detections and detection_classes to float. This allows the - # model inference to work on chain (go/chain) as chain requires floating - # point outputs. 
- if export_config.cast_num_detections_to_float: - detections['num_detections'] = tf.cast( - detections['num_detections'], dtype=tf.float32) - if export_config.cast_detection_classes_to_float: - detections['detection_classes'] = tf.cast( - detections['detection_classes'], dtype=tf.float32) - - final_outputs = { - 'detection_boxes': detections['detection_boxes'], - 'detection_scores': detections['detection_scores'], - 'detection_classes': detections['detection_classes'], - 'num_detections': detections['num_detections'] - } - else: - final_outputs = { - 'decoded_boxes': detections['decoded_boxes'], - 'decoded_box_scores': detections['decoded_box_scores'] - } - - if 'detection_masks' in detections.keys(): - final_outputs['detection_masks'] = detections['detection_masks'] - - final_outputs.update({'image_info': image_info}) - return final_outputs diff --git a/official/vision/beta/serving/detection_test.py b/official/vision/beta/serving/detection_test.py deleted file mode 100644 index 865f291c7..000000000 --- a/official/vision/beta/serving/detection_test.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Test for image detection export lib.""" - -import io -import os - -from absl.testing import parameterized -import numpy as np -from PIL import Image -import tensorflow as tf - -from official.core import exp_factory -from official.vision.beta import configs # pylint: disable=unused-import -from official.vision.beta.serving import detection - - -class DetectionExportTest(tf.test.TestCase, parameterized.TestCase): - - def _get_detection_module(self, experiment_name, input_type): - params = exp_factory.get_exp_config(experiment_name) - params.task.model.backbone.resnet.model_id = 18 - params.task.model.detection_generator.nms_version = 'batched' - detection_module = detection.DetectionModule( - params, - batch_size=1, - input_image_size=[640, 640], - input_type=input_type) - return detection_module - - def _export_from_module(self, module, input_type, save_directory): - signatures = module.get_inference_signatures( - {input_type: 'serving_default'}) - tf.saved_model.save(module, save_directory, signatures=signatures) - - def _get_dummy_input(self, input_type, batch_size, image_size): - """Get dummy input for the given input type.""" - h, w = image_size - - if input_type == 'image_tensor': - return tf.zeros((batch_size, h, w, 3), dtype=np.uint8) - elif input_type == 'image_bytes': - image = Image.fromarray(np.zeros((h, w, 3), dtype=np.uint8)) - byte_io = io.BytesIO() - image.save(byte_io, 'PNG') - return [byte_io.getvalue() for b in range(batch_size)] - elif input_type == 'tf_example': - image_tensor = tf.zeros((h, w, 3), dtype=tf.uint8) - encoded_jpeg = tf.image.encode_jpeg(tf.constant(image_tensor)).numpy() - example = tf.train.Example( - features=tf.train.Features( - feature={ - 'image/encoded': - tf.train.Feature( - bytes_list=tf.train.BytesList(value=[encoded_jpeg])), - })).SerializeToString() - 
return [example for b in range(batch_size)] - elif input_type == 'tflite': - return tf.zeros((batch_size, h, w, 3), dtype=np.float32) - - @parameterized.parameters( - ('image_tensor', 'fasterrcnn_resnetfpn_coco', [384, 384]), - ('image_bytes', 'fasterrcnn_resnetfpn_coco', [640, 640]), - ('tf_example', 'fasterrcnn_resnetfpn_coco', [640, 640]), - ('tflite', 'fasterrcnn_resnetfpn_coco', [640, 640]), - ('image_tensor', 'maskrcnn_resnetfpn_coco', [640, 640]), - ('image_bytes', 'maskrcnn_resnetfpn_coco', [640, 384]), - ('tf_example', 'maskrcnn_resnetfpn_coco', [640, 640]), - ('tflite', 'maskrcnn_resnetfpn_coco', [640, 640]), - ('image_tensor', 'retinanet_resnetfpn_coco', [640, 640]), - ('image_bytes', 'retinanet_resnetfpn_coco', [640, 640]), - ('tf_example', 'retinanet_resnetfpn_coco', [384, 640]), - ('tflite', 'retinanet_resnetfpn_coco', [640, 640]), - ('image_tensor', 'retinanet_resnetfpn_coco', [384, 384]), - ('image_bytes', 'retinanet_spinenet_coco', [640, 640]), - ('tf_example', 'retinanet_spinenet_coco', [640, 384]), - ('tflite', 'retinanet_spinenet_coco', [640, 640]), - ) - def test_export(self, input_type, experiment_name, image_size): - tmp_dir = self.get_temp_dir() - module = self._get_detection_module(experiment_name, input_type) - - self._export_from_module(module, input_type, tmp_dir) - - self.assertTrue(os.path.exists(os.path.join(tmp_dir, 'saved_model.pb'))) - self.assertTrue( - os.path.exists(os.path.join(tmp_dir, 'variables', 'variables.index'))) - self.assertTrue( - os.path.exists( - os.path.join(tmp_dir, 'variables', - 'variables.data-00000-of-00001'))) - - imported = tf.saved_model.load(tmp_dir) - detection_fn = imported.signatures['serving_default'] - - images = self._get_dummy_input( - input_type, batch_size=1, image_size=image_size) - - if input_type == 'tflite': - processed_images = tf.zeros(image_size + [3], dtype=tf.float32) - anchor_boxes = module._build_anchor_boxes() - image_info = tf.convert_to_tensor( - [image_size, image_size, [1.0, 1.0], [0, 0]], dtype=tf.float32) - else: - processed_images, anchor_boxes, image_info = module._build_inputs( - tf.zeros((224, 224, 3), dtype=tf.uint8)) - image_shape = image_info[1, :] - image_shape = tf.expand_dims(image_shape, 0) - processed_images = tf.expand_dims(processed_images, 0) - for l, l_boxes in anchor_boxes.items(): - anchor_boxes[l] = tf.expand_dims(l_boxes, 0) - - expected_outputs = module.model( - images=processed_images, - image_shape=image_shape, - anchor_boxes=anchor_boxes, - training=False) - outputs = detection_fn(tf.constant(images)) - - self.assertAllClose(outputs['num_detections'].numpy(), - expected_outputs['num_detections'].numpy()) - - def test_build_model_fail_with_none_batch_size(self): - params = exp_factory.get_exp_config('retinanet_resnetfpn_coco') - with self.assertRaisesRegex( - ValueError, 'batch_size cannot be None for detection models.'): - detection.DetectionModule( - params, batch_size=None, input_image_size=[640, 640]) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/serving/export_base.py b/official/vision/beta/serving/export_base.py deleted file mode 100644 index d7dcd61ea..000000000 --- a/official/vision/beta/serving/export_base.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Base class for model export."""
-
-import abc
-from typing import Dict, List, Mapping, Optional, Text
-
-import tensorflow as tf
-from official.core import config_definitions as cfg
-from official.core import export_base
-
-
-class ExportModule(export_base.ExportModule, metaclass=abc.ABCMeta):
- """Base Export Module."""
-
- def __init__(self,
- params: cfg.ExperimentConfig,
- *,
- batch_size: int,
- input_image_size: List[int],
- input_type: str = 'image_tensor',
- num_channels: int = 3,
- model: Optional[tf.keras.Model] = None):
- """Initializes a module for export.
-
- Args:
- params: Experiment params.
- batch_size: The batch size of the model input. Can be `int` or None.
- input_image_size: List or Tuple of the input image size. For a 2D image,
- it is [height, width].
- input_type: The input signature type.
- num_channels: The number of image channels.
- model: A tf.keras.Model instance to be exported.
- """
- self.params = params
- self._batch_size = batch_size
- self._input_image_size = input_image_size
- self._num_channels = num_channels
- self._input_type = input_type
- if model is None:
- model = self._build_model() # pylint: disable=assignment-from-none
- super().__init__(params=params, model=model)
-
- def _decode_image(self, encoded_image_bytes: str) -> tf.Tensor:
- """Decodes image bytes into an image tensor.
-
- Use `tf.image.decode_image` to decode an image if the input is expected to
- be a 2D image; otherwise use `tf.io.decode_raw` to convert the raw bytes to
- a tensor and reshape it to the desired shape.
-
- Args:
- encoded_image_bytes: An encoded image string to be decoded.
-
- Returns:
- A decoded image tensor.
- """
- if len(self._input_image_size) == 2:
- # Decode an image if 2D input is expected.
- image_tensor = tf.image.decode_image(
- encoded_image_bytes, channels=self._num_channels)
- image_tensor.set_shape((None, None, self._num_channels))
- else:
- # Convert raw bytes into a tensor and reshape it, if not 2D input.
- image_tensor = tf.io.decode_raw(encoded_image_bytes, out_type=tf.uint8)
- image_tensor = tf.reshape(image_tensor,
- self._input_image_size + [self._num_channels])
- return image_tensor
-
- def _decode_tf_example(
- self, tf_example_string_tensor: tf.train.Example) -> tf.Tensor:
- """Decodes a TF Example to an image tensor.
-
- Args:
- tf_example_string_tensor: A tf.train.Example of encoded image and other
- information.
-
- Returns:
- A decoded image tensor.
- """ - keys_to_features = {'image/encoded': tf.io.FixedLenFeature((), tf.string)} - parsed_tensors = tf.io.parse_single_example( - serialized=tf_example_string_tensor, features=keys_to_features) - image_tensor = self._decode_image(parsed_tensors['image/encoded']) - return image_tensor - - def _build_model(self, **kwargs): - """Returns a model built from the params.""" - return None - - @tf.function - def inference_from_image_tensors( - self, inputs: tf.Tensor) -> Mapping[str, tf.Tensor]: - return self.serve(inputs) - - @tf.function - def inference_for_tflite(self, inputs: tf.Tensor) -> Mapping[str, tf.Tensor]: - return self.serve(inputs) - - @tf.function - def inference_from_image_bytes(self, inputs: tf.Tensor): - with tf.device('cpu:0'): - images = tf.nest.map_structure( - tf.identity, - tf.map_fn( - self._decode_image, - elems=inputs, - fn_output_signature=tf.TensorSpec( - shape=[None] * len(self._input_image_size) + - [self._num_channels], - dtype=tf.uint8), - parallel_iterations=32)) - images = tf.stack(images) - return self.serve(images) - - @tf.function - def inference_from_tf_example(self, - inputs: tf.Tensor) -> Mapping[str, tf.Tensor]: - with tf.device('cpu:0'): - images = tf.nest.map_structure( - tf.identity, - tf.map_fn( - self._decode_tf_example, - elems=inputs, - # Height/width of the shape of input images is unspecified (None) - # at the time of decoding the example, but the shape will - # be adjusted to conform to the input layer of the model, - # by _run_inference_on_image_tensors() below. - fn_output_signature=tf.TensorSpec( - shape=[None] * len(self._input_image_size) + - [self._num_channels], - dtype=tf.uint8), - dtype=tf.uint8, - parallel_iterations=32)) - images = tf.stack(images) - return self.serve(images) - - def get_inference_signatures(self, function_keys: Dict[Text, Text]): - """Gets defined function signatures. - - Args: - function_keys: A dictionary with keys as the function to create signature - for and values as the signature keys when returns. - - Returns: - A dictionary with key as signature key and value as concrete functions - that can be used for tf.saved_model.save. - """ - signatures = {} - for key, def_name in function_keys.items(): - if key == 'image_tensor': - input_signature = tf.TensorSpec( - shape=[self._batch_size] + [None] * len(self._input_image_size) + - [self._num_channels], - dtype=tf.uint8) - signatures[ - def_name] = self.inference_from_image_tensors.get_concrete_function( - input_signature) - elif key == 'image_bytes': - input_signature = tf.TensorSpec( - shape=[self._batch_size], dtype=tf.string) - signatures[ - def_name] = self.inference_from_image_bytes.get_concrete_function( - input_signature) - elif key == 'serve_examples' or key == 'tf_example': - input_signature = tf.TensorSpec( - shape=[self._batch_size], dtype=tf.string) - signatures[ - def_name] = self.inference_from_tf_example.get_concrete_function( - input_signature) - elif key == 'tflite': - input_signature = tf.TensorSpec( - shape=[self._batch_size] + self._input_image_size + - [self._num_channels], - dtype=tf.float32) - signatures[def_name] = self.inference_for_tflite.get_concrete_function( - input_signature) - else: - raise ValueError('Unrecognized `input_type`') - return signatures diff --git a/official/vision/beta/serving/export_base_v2.py b/official/vision/beta/serving/export_base_v2.py deleted file mode 100644 index 25469b1bb..000000000 --- a/official/vision/beta/serving/export_base_v2.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Base class for model export.""" - -from typing import Dict, Optional, Text, Callable, Any, Union - -import tensorflow as tf - -from official.core import export_base - - -class ExportModule(export_base.ExportModule): - """Base Export Module.""" - - def __init__(self, - params, - model: tf.keras.Model, - input_signature: Union[tf.TensorSpec, Dict[str, tf.TensorSpec]], - preprocessor: Optional[Callable[..., Any]] = None, - inference_step: Optional[Callable[..., Any]] = None, - postprocessor: Optional[Callable[..., Any]] = None): - """Initializes a module for export. - - Args: - params: A dataclass for parameters to the module. - model: A tf.keras.Model instance to be exported. - input_signature: tf.TensorSpec, e.g. - tf.TensorSpec(shape=[None, 224, 224, 3], dtype=tf.uint8) - preprocessor: An optional callable to preprocess the inputs. - inference_step: An optional callable to forward-pass the model. - postprocessor: An optional callable to postprocess the model outputs. - """ - super().__init__( - params, - model=model, - preprocessor=preprocessor, - inference_step=inference_step, - postprocessor=postprocessor) - self.input_signature = input_signature - - @tf.function - def serve(self, inputs): - x = self.preprocessor(inputs=inputs) if self.preprocessor else inputs - x = self.inference_step(x) - x = self.postprocessor(x) if self.postprocessor else x - return x - - def get_inference_signatures(self, function_keys: Dict[Text, Text]): - """Gets defined function signatures. - - Args: - function_keys: A dictionary with keys as the function to create signature - for and values as the signature keys when returns. - - Returns: - A dictionary with key as signature key and value as concrete functions - that can be used for tf.saved_model.save. - """ - signatures = {} - for _, def_name in function_keys.items(): - signatures[def_name] = self.serve.get_concrete_function( - self.input_signature) - return signatures diff --git a/official/vision/beta/serving/export_base_v2_test.py b/official/vision/beta/serving/export_base_v2_test.py deleted file mode 100644 index 4d88fe8d6..000000000 --- a/official/vision/beta/serving/export_base_v2_test.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Tests for official.core.export_base_v2.""" -import os - -import tensorflow as tf - -from official.core import export_base -from official.vision.beta.serving import export_base_v2 - - -class TestModel(tf.keras.Model): - - def __init__(self): - super().__init__() - self._dense = tf.keras.layers.Dense(2) - - def call(self, inputs): - return {'outputs': self._dense(inputs)} - - -class ExportBaseTest(tf.test.TestCase): - - def test_preprocessor(self): - tmp_dir = self.get_temp_dir() - model = TestModel() - inputs = tf.ones([2, 4], tf.float32) - - preprocess_fn = lambda inputs: 2 * inputs - - module = export_base_v2.ExportModule( - params=None, - input_signature=tf.TensorSpec(shape=[2, 4]), - model=model, - preprocessor=preprocess_fn) - expected_output = model(preprocess_fn(inputs)) - ckpt_path = tf.train.Checkpoint(model=model).save( - os.path.join(tmp_dir, 'ckpt')) - export_dir = export_base.export( - module, ['serving_default'], - export_savedmodel_dir=tmp_dir, - checkpoint_path=ckpt_path, - timestamped=False) - imported = tf.saved_model.load(export_dir) - output = imported.signatures['serving_default'](inputs) - print('output', output) - self.assertAllClose( - output['outputs'].numpy(), expected_output['outputs'].numpy()) - - def test_postprocessor(self): - tmp_dir = self.get_temp_dir() - model = TestModel() - inputs = tf.ones([2, 4], tf.float32) - - postprocess_fn = lambda logits: {'outputs': 2 * logits['outputs']} - - module = export_base_v2.ExportModule( - params=None, - model=model, - input_signature=tf.TensorSpec(shape=[2, 4]), - postprocessor=postprocess_fn) - expected_output = postprocess_fn(model(inputs)) - ckpt_path = tf.train.Checkpoint(model=model).save( - os.path.join(tmp_dir, 'ckpt')) - export_dir = export_base.export( - module, ['serving_default'], - export_savedmodel_dir=tmp_dir, - checkpoint_path=ckpt_path, - timestamped=False) - imported = tf.saved_model.load(export_dir) - output = imported.signatures['serving_default'](inputs) - self.assertAllClose( - output['outputs'].numpy(), expected_output['outputs'].numpy()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/serving/export_module_factory.py b/official/vision/beta/serving/export_module_factory.py deleted file mode 100644 index 73f1abffc..000000000 --- a/official/vision/beta/serving/export_module_factory.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Factory for vision export modules.""" - -from typing import List, Optional - -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.vision.beta import configs -from official.vision.beta.dataloaders import classification_input -from official.vision.beta.modeling import factory -from official.vision.beta.serving import export_base_v2 as export_base -from official.vision.beta.serving import export_utils - - -def create_classification_export_module(params: cfg.ExperimentConfig, - input_type: str, - batch_size: int, - input_image_size: List[int], - num_channels: int = 3): - """Creats classification export module.""" - input_signature = export_utils.get_image_input_signatures( - input_type, batch_size, input_image_size, num_channels) - input_specs = tf.keras.layers.InputSpec( - shape=[batch_size] + input_image_size + [num_channels]) - - model = factory.build_classification_model( - input_specs=input_specs, - model_config=params.task.model, - l2_regularizer=None) - - def preprocess_fn(inputs): - image_tensor = export_utils.parse_image(inputs, input_type, - input_image_size, num_channels) - # If input_type is `tflite`, do not apply image preprocessing. - if input_type == 'tflite': - return image_tensor - - def preprocess_image_fn(inputs): - return classification_input.Parser.inference_fn( - inputs, input_image_size, num_channels) - - images = tf.map_fn( - preprocess_image_fn, elems=image_tensor, - fn_output_signature=tf.TensorSpec( - shape=input_image_size + [num_channels], - dtype=tf.float32)) - - return images - - def postprocess_fn(logits): - probs = tf.nn.softmax(logits) - return {'logits': logits, 'probs': probs} - - export_module = export_base.ExportModule(params, - model=model, - input_signature=input_signature, - preprocessor=preprocess_fn, - postprocessor=postprocess_fn) - return export_module - - -def get_export_module(params: cfg.ExperimentConfig, - input_type: str, - batch_size: Optional[int], - input_image_size: List[int], - num_channels: int = 3) -> export_base.ExportModule: - """Factory for export modules.""" - if isinstance(params.task, - configs.image_classification.ImageClassificationTask): - export_module = create_classification_export_module( - params, input_type, batch_size, input_image_size, num_channels) - else: - raise ValueError('Export module not implemented for {} task.'.format( - type(params.task))) - return export_module diff --git a/official/vision/beta/serving/export_module_factory_test.py b/official/vision/beta/serving/export_module_factory_test.py deleted file mode 100644 index f3ffed10f..000000000 --- a/official/vision/beta/serving/export_module_factory_test.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Test for vision modules.""" - -import io -import os - -from absl.testing import parameterized -import numpy as np -from PIL import Image -import tensorflow as tf - -from official.common import registry_imports # pylint: disable=unused-import -from official.core import exp_factory -from official.core import export_base -from official.vision.beta.dataloaders import classification_input -from official.vision.beta.serving import export_module_factory - - -class ImageClassificationExportTest(tf.test.TestCase, parameterized.TestCase): - - def _get_classification_module(self, input_type, input_image_size): - params = exp_factory.get_exp_config('resnet_imagenet') - params.task.model.backbone.resnet.model_id = 18 - module = export_module_factory.create_classification_export_module( - params, input_type, batch_size=1, input_image_size=input_image_size) - return module - - def _get_dummy_input(self, input_type): - """Get dummy input for the given input type.""" - - if input_type == 'image_tensor': - return tf.zeros((1, 32, 32, 3), dtype=np.uint8) - elif input_type == 'image_bytes': - image = Image.fromarray(np.zeros((32, 32, 3), dtype=np.uint8)) - byte_io = io.BytesIO() - image.save(byte_io, 'PNG') - return [byte_io.getvalue()] - elif input_type == 'tf_example': - image_tensor = tf.zeros((32, 32, 3), dtype=tf.uint8) - encoded_jpeg = tf.image.encode_jpeg(tf.constant(image_tensor)).numpy() - example = tf.train.Example( - features=tf.train.Features( - feature={ - 'image/encoded': - tf.train.Feature( - bytes_list=tf.train.BytesList(value=[encoded_jpeg])), - })).SerializeToString() - return [example] - - @parameterized.parameters( - {'input_type': 'image_tensor'}, - {'input_type': 'image_bytes'}, - {'input_type': 'tf_example'}, - ) - def test_export(self, input_type='image_tensor'): - input_image_size = [32, 32] - tmp_dir = self.get_temp_dir() - module = self._get_classification_module(input_type, input_image_size) - # Test that the model restores any attrs that are trackable objects - # (eg: tables, resource variables, keras models/layers, tf.hub modules). - module.model.test_trackable = tf.keras.layers.InputLayer(input_shape=(4,)) - ckpt_path = tf.train.Checkpoint(model=module.model).save( - os.path.join(tmp_dir, 'ckpt')) - export_dir = export_base.export( - module, [input_type], - export_savedmodel_dir=tmp_dir, - checkpoint_path=ckpt_path, - timestamped=False) - - self.assertTrue(os.path.exists(os.path.join(tmp_dir, 'saved_model.pb'))) - self.assertTrue(os.path.exists( - os.path.join(tmp_dir, 'variables', 'variables.index'))) - self.assertTrue(os.path.exists( - os.path.join(tmp_dir, 'variables', 'variables.data-00000-of-00001'))) - - imported = tf.saved_model.load(export_dir) - classification_fn = imported.signatures['serving_default'] - - images = self._get_dummy_input(input_type) - - def preprocess_image_fn(inputs): - return classification_input.Parser.inference_fn( - inputs, input_image_size, num_channels=3) - - processed_images = tf.map_fn( - preprocess_image_fn, - elems=tf.zeros([1] + input_image_size + [3], dtype=tf.uint8), - fn_output_signature=tf.TensorSpec( - shape=input_image_size + [3], dtype=tf.float32)) - expected_logits = module.model(processed_images, training=False) - expected_prob = tf.nn.softmax(expected_logits) - out = classification_fn(tf.constant(images)) - - # The imported model should contain any trackable attrs that the original - # model had. 
- self.assertTrue(hasattr(imported.model, 'test_trackable')) - self.assertAllClose( - out['logits'].numpy(), expected_logits.numpy(), rtol=1e-04, atol=1e-04) - self.assertAllClose( - out['probs'].numpy(), expected_prob.numpy(), rtol=1e-04, atol=1e-04) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/serving/export_saved_model.py b/official/vision/beta/serving/export_saved_model.py deleted file mode 100644 index 95027be13..000000000 --- a/official/vision/beta/serving/export_saved_model.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Vision models export binary for serving/inference. - -To export a trained checkpoint in saved_model format (shell script): - -EXPERIMENT_TYPE = XX -CHECKPOINT_PATH = XX -EXPORT_DIR_PATH = XX -export_saved_model --experiment=${EXPERIMENT_TYPE} \ - --export_dir=${EXPORT_DIR_PATH}/ \ - --checkpoint_path=${CHECKPOINT_PATH} \ - --batch_size=2 \ - --input_image_size=224,224 - -To serve (python): - -export_dir_path = XX -input_type = XX -input_images = XX -imported = tf.saved_model.load(export_dir_path) -model_fn = imported.signatures['serving_default'] -output = model_fn(input_images) -""" - -from absl import app -from absl import flags - -from official.common import registry_imports # pylint: disable=unused-import -from official.core import exp_factory -from official.modeling import hyperparams -from official.vision.beta.serving import export_saved_model_lib - -FLAGS = flags.FLAGS - - -flags.DEFINE_string( - 'experiment', None, 'experiment type, e.g. retinanet_resnetfpn_coco') -flags.DEFINE_string('export_dir', None, 'The export directory.') -flags.DEFINE_string('checkpoint_path', None, 'Checkpoint path.') -flags.DEFINE_multi_string( - 'config_file', - default=None, - help='YAML/JSON files which specifies overrides. The override order ' - 'follows the order of args. Note that each file ' - 'can be used as an override template to override the default parameters ' - 'specified in Python. 
If the same parameter is specified in both ' - '`--config_file` and `--params_override`, `config_file` will be used ' - 'first, followed by params_override.') -flags.DEFINE_string( - 'params_override', '', - 'The JSON/YAML file or string which specifies the parameter to be overriden' - ' on top of `config_file` template.') -flags.DEFINE_integer( - 'batch_size', None, 'The batch size.') -flags.DEFINE_string( - 'input_type', 'image_tensor', - 'One of `image_tensor`, `image_bytes`, `tf_example` and `tflite`.') -flags.DEFINE_string( - 'input_image_size', '224,224', - 'The comma-separated string of two integers representing the height,width ' - 'of the input to the model.') -flags.DEFINE_string('export_checkpoint_subdir', 'checkpoint', - 'The subdirectory for checkpoints.') -flags.DEFINE_string('export_saved_model_subdir', 'saved_model', - 'The subdirectory for saved model.') - - -def main(_): - - params = exp_factory.get_exp_config(FLAGS.experiment) - for config_file in FLAGS.config_file or []: - params = hyperparams.override_params_dict( - params, config_file, is_strict=True) - if FLAGS.params_override: - params = hyperparams.override_params_dict( - params, FLAGS.params_override, is_strict=True) - - params.validate() - params.lock() - - export_saved_model_lib.export_inference_graph( - input_type=FLAGS.input_type, - batch_size=FLAGS.batch_size, - input_image_size=[int(x) for x in FLAGS.input_image_size.split(',')], - params=params, - checkpoint_path=FLAGS.checkpoint_path, - export_dir=FLAGS.export_dir, - export_checkpoint_subdir=FLAGS.export_checkpoint_subdir, - export_saved_model_subdir=FLAGS.export_saved_model_subdir) - - -if __name__ == '__main__': - app.run(main) diff --git a/official/vision/beta/serving/export_saved_model_lib.py b/official/vision/beta/serving/export_saved_model_lib.py deleted file mode 100644 index e3848d4e3..000000000 --- a/official/vision/beta/serving/export_saved_model_lib.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -r"""Vision models export utility function for serving/inference.""" - -import os -from typing import Optional, List - -from absl import logging -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import export_base -from official.core import train_utils -from official.vision.beta import configs -from official.vision.beta.serving import detection -from official.vision.beta.serving import image_classification -from official.vision.beta.serving import semantic_segmentation -from official.vision.beta.serving import video_classification - - -def export_inference_graph( - input_type: str, - batch_size: Optional[int], - input_image_size: List[int], - params: cfg.ExperimentConfig, - checkpoint_path: str, - export_dir: str, - num_channels: Optional[int] = 3, - export_module: Optional[export_base.ExportModule] = None, - export_checkpoint_subdir: Optional[str] = None, - export_saved_model_subdir: Optional[str] = None, - save_options: Optional[tf.saved_model.SaveOptions] = None, - log_model_flops_and_params: bool = False): - """Exports inference graph for the model specified in the exp config. - - Saved model is stored at export_dir/saved_model, checkpoint is saved - at export_dir/checkpoint, and params is saved at export_dir/params.yaml. - - Args: - input_type: One of `image_tensor`, `image_bytes`, `tf_example` or `tflite`. - batch_size: 'int', or None. - input_image_size: List or Tuple of height and width. - params: Experiment params. - checkpoint_path: Trained checkpoint path or directory. - export_dir: Export directory path. - num_channels: The number of input image channels. - export_module: Optional export module to be used instead of using params - to create one. If None, the params will be used to create an export - module. - export_checkpoint_subdir: Optional subdirectory under export_dir - to store checkpoint. - export_saved_model_subdir: Optional subdirectory under export_dir - to store saved model. - save_options: `SaveOptions` for `tf.saved_model.save`. - log_model_flops_and_params: If True, writes model FLOPs to model_flops.txt - and model parameters to model_params.txt. - """ - - if export_checkpoint_subdir: - output_checkpoint_directory = os.path.join( - export_dir, export_checkpoint_subdir) - else: - output_checkpoint_directory = None - - if export_saved_model_subdir: - output_saved_model_directory = os.path.join( - export_dir, export_saved_model_subdir) - else: - output_saved_model_directory = export_dir - - # TODO(arashwan): Offers a direct path to use ExportModule with Task objects. 
- if not export_module: - if isinstance(params.task, - configs.image_classification.ImageClassificationTask): - export_module = image_classification.ClassificationModule( - params=params, - batch_size=batch_size, - input_image_size=input_image_size, - input_type=input_type, - num_channels=num_channels) - elif isinstance(params.task, configs.retinanet.RetinaNetTask) or isinstance( - params.task, configs.maskrcnn.MaskRCNNTask): - export_module = detection.DetectionModule( - params=params, - batch_size=batch_size, - input_image_size=input_image_size, - input_type=input_type, - num_channels=num_channels) - elif isinstance(params.task, - configs.semantic_segmentation.SemanticSegmentationTask): - export_module = semantic_segmentation.SegmentationModule( - params=params, - batch_size=batch_size, - input_image_size=input_image_size, - input_type=input_type, - num_channels=num_channels) - elif isinstance(params.task, - configs.video_classification.VideoClassificationTask): - export_module = video_classification.VideoClassificationModule( - params=params, - batch_size=batch_size, - input_image_size=input_image_size, - input_type=input_type, - num_channels=num_channels) - else: - raise ValueError('Export module not implemented for {} task.'.format( - type(params.task))) - - export_base.export( - export_module, - function_keys=[input_type], - export_savedmodel_dir=output_saved_model_directory, - checkpoint_path=checkpoint_path, - timestamped=False, - save_options=save_options) - - if output_checkpoint_directory: - ckpt = tf.train.Checkpoint(model=export_module.model) - ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt')) - train_utils.serialize_config(params, export_dir) - - if log_model_flops_and_params: - inputs_kwargs = None - if isinstance( - params.task, - (configs.retinanet.RetinaNetTask, configs.maskrcnn.MaskRCNNTask)): - # We need to create inputs_kwargs argument to specify the input shapes for - # subclass model that overrides model.call to take multiple inputs, - # e.g., RetinaNet model. - inputs_kwargs = { - 'images': - tf.TensorSpec([1] + input_image_size + [num_channels], - tf.float32), - 'image_shape': - tf.TensorSpec([1, 2], tf.float32) - } - dummy_inputs = { - k: tf.ones(v.shape.as_list(), tf.float32) - for k, v in inputs_kwargs.items() - } - # Must do forward pass to build the model. - export_module.model(**dummy_inputs) - else: - logging.info( - 'Logging model flops and params not implemented for %s task.', - type(params.task)) - return - train_utils.try_count_flops(export_module.model, inputs_kwargs, - os.path.join(export_dir, 'model_flops.txt')) - train_utils.write_model_params(export_module.model, - os.path.join(export_dir, 'model_params.txt')) diff --git a/official/vision/beta/serving/export_saved_model_lib_test.py b/official/vision/beta/serving/export_saved_model_lib_test.py deleted file mode 100644 index ff65cf7a1..000000000 --- a/official/vision/beta/serving/export_saved_model_lib_test.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for official.core.export_saved_model_lib.""" - -import os -from unittest import mock - -import tensorflow as tf - -from official.core import export_base -from official.vision.beta import configs -from official.vision.beta.serving import export_saved_model_lib - - -class WriteModelFlopsAndParamsTest(tf.test.TestCase): - - def setUp(self): - super().setUp() - self.tempdir = self.create_tempdir() - self.enter_context( - mock.patch.object(export_base, 'export', autospec=True, spec_set=True)) - - def _export_model_with_log_model_flops_and_params(self, params): - export_saved_model_lib.export_inference_graph( - input_type='image_tensor', - batch_size=1, - input_image_size=[64, 64], - params=params, - checkpoint_path=os.path.join(self.tempdir, 'unused-ckpt'), - export_dir=self.tempdir, - log_model_flops_and_params=True) - - def assertModelAnalysisFilesExist(self): - self.assertTrue( - tf.io.gfile.exists(os.path.join(self.tempdir, 'model_params.txt'))) - self.assertTrue( - tf.io.gfile.exists(os.path.join(self.tempdir, 'model_flops.txt'))) - - def test_retinanet_task(self): - params = configs.retinanet.retinanet_resnetfpn_coco() - params.task.model.backbone.resnet.model_id = 18 - params.task.model.num_classes = 2 - params.task.model.max_level = 6 - self._export_model_with_log_model_flops_and_params(params) - self.assertModelAnalysisFilesExist() - - def test_maskrcnn_task(self): - params = configs.maskrcnn.maskrcnn_resnetfpn_coco() - params.task.model.backbone.resnet.model_id = 18 - params.task.model.num_classes = 2 - params.task.model.max_level = 6 - self._export_model_with_log_model_flops_and_params(params) - self.assertModelAnalysisFilesExist() - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/serving/export_saved_model_lib_v2.py b/official/vision/beta/serving/export_saved_model_lib_v2.py deleted file mode 100644 index aa9e22778..000000000 --- a/official/vision/beta/serving/export_saved_model_lib_v2.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Vision models export utility function for serving/inference.""" - -import os -from typing import Optional, List - -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import export_base -from official.core import train_utils -from official.vision.beta.serving import export_module_factory - - -def export( - input_type: str, - batch_size: Optional[int], - input_image_size: List[int], - params: cfg.ExperimentConfig, - checkpoint_path: str, - export_dir: str, - num_channels: Optional[int] = 3, - export_module: Optional[export_base.ExportModule] = None, - export_checkpoint_subdir: Optional[str] = None, - export_saved_model_subdir: Optional[str] = None, - save_options: Optional[tf.saved_model.SaveOptions] = None): - """Exports the model specified in the exp config. 
- - Saved model is stored at export_dir/saved_model, checkpoint is saved - at export_dir/checkpoint, and params is saved at export_dir/params.yaml. - - Args: - input_type: One of `image_tensor`, `image_bytes`, `tf_example`. - batch_size: 'int', or None. - input_image_size: List or Tuple of height and width. - params: Experiment params. - checkpoint_path: Trained checkpoint path or directory. - export_dir: Export directory path. - num_channels: The number of input image channels. - export_module: Optional export module to be used instead of using params - to create one. If None, the params will be used to create an export - module. - export_checkpoint_subdir: Optional subdirectory under export_dir - to store checkpoint. - export_saved_model_subdir: Optional subdirectory under export_dir - to store saved model. - save_options: `SaveOptions` for `tf.saved_model.save`. - """ - - if export_checkpoint_subdir: - output_checkpoint_directory = os.path.join( - export_dir, export_checkpoint_subdir) - else: - output_checkpoint_directory = None - - if export_saved_model_subdir: - output_saved_model_directory = os.path.join( - export_dir, export_saved_model_subdir) - else: - output_saved_model_directory = export_dir - - export_module = export_module_factory.get_export_module( - params, - input_type=input_type, - batch_size=batch_size, - input_image_size=input_image_size, - num_channels=num_channels) - - export_base.export( - export_module, - function_keys=[input_type], - export_savedmodel_dir=output_saved_model_directory, - checkpoint_path=checkpoint_path, - timestamped=False, - save_options=save_options) - - if output_checkpoint_directory: - ckpt = tf.train.Checkpoint(model=export_module.model) - ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt')) - train_utils.serialize_config(params, export_dir) diff --git a/official/vision/beta/serving/export_tfhub.py b/official/vision/beta/serving/export_tfhub.py deleted file mode 100644 index 3d999a2ab..000000000 --- a/official/vision/beta/serving/export_tfhub.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""A script to export the image classification as a TF-Hub SavedModel.""" - -# Import libraries -from absl import app -from absl import flags - -import tensorflow as tf - -from official.common import registry_imports # pylint: disable=unused-import -from official.core import exp_factory -from official.modeling import hyperparams -from official.vision.beta.modeling import factory - - -FLAGS = flags.FLAGS - -flags.DEFINE_string( - 'experiment', None, 'experiment type, e.g. resnet_imagenet') -flags.DEFINE_string( - 'checkpoint_path', None, 'Checkpoint path.') -flags.DEFINE_string( - 'export_path', None, 'The export directory.') -flags.DEFINE_multi_string( - 'config_file', - None, - 'A YAML/JSON files which specifies overrides. The override order ' - 'follows the order of args. 
Note that each file ' - 'can be used as an override template to override the default parameters ' - 'specified in Python. If the same parameter is specified in both ' - '`--config_file` and `--params_override`, `config_file` will be used ' - 'first, followed by params_override.') -flags.DEFINE_string( - 'params_override', '', - 'The JSON/YAML file or string which specifies the parameter to be overriden' - ' on top of `config_file` template.') -flags.DEFINE_integer( - 'batch_size', None, 'The batch size.') -flags.DEFINE_string( - 'input_image_size', - '224,224', - 'The comma-separated string of two integers representing the height,width ' - 'of the input to the model.') -flags.DEFINE_boolean( - 'skip_logits_layer', - False, - 'Whether to skip the prediction layer and only output the feature vector.') - - -def export_model_to_tfhub(params, - batch_size, - input_image_size, - skip_logits_layer, - checkpoint_path, - export_path): - """Export an image classification model to TF-Hub.""" - input_specs = tf.keras.layers.InputSpec(shape=[batch_size] + - input_image_size + [3]) - - model = factory.build_classification_model( - input_specs=input_specs, - model_config=params.task.model, - l2_regularizer=None, - skip_logits_layer=skip_logits_layer) - checkpoint = tf.train.Checkpoint(model=model) - checkpoint.restore(checkpoint_path).assert_existing_objects_matched() - model.save(export_path, include_optimizer=False, save_format='tf') - - -def main(_): - params = exp_factory.get_exp_config(FLAGS.experiment) - for config_file in FLAGS.config_file or []: - params = hyperparams.override_params_dict( - params, config_file, is_strict=True) - if FLAGS.params_override: - params = hyperparams.override_params_dict( - params, FLAGS.params_override, is_strict=True) - params.validate() - params.lock() - - export_model_to_tfhub( - params=params, - batch_size=FLAGS.batch_size, - input_image_size=[int(x) for x in FLAGS.input_image_size.split(',')], - skip_logits_layer=FLAGS.skip_logits_layer, - checkpoint_path=FLAGS.checkpoint_path, - export_path=FLAGS.export_path) - - -if __name__ == '__main__': - app.run(main) diff --git a/official/vision/beta/serving/export_tflite.py b/official/vision/beta/serving/export_tflite.py deleted file mode 100644 index eb1bfd690..000000000 --- a/official/vision/beta/serving/export_tflite.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Binary to convert a saved model to tflite model. - -It requires a SavedModel exported using export_saved_model.py with batch size 1 -and input type `tflite`, and using the same config file used for exporting saved -model. It includes optional post-training quantization. When using integer -quantization, calibration steps need to be provided to calibrate model input. 
-
-To convert a SavedModel to a TFLite model:
-
-EXPERIMENT_TYPE = XX
-TFLITE_PATH = XX
-SAVED_MODEL_DIR = XX
-CONFIG_FILE = XX
-export_tflite --experiment=${EXPERIMENT_TYPE} \
-  --saved_model_dir=${SAVED_MODEL_DIR} \
-  --tflite_path=${TFLITE_PATH} \
-  --config_file=${CONFIG_FILE} \
-  --quant_type=fp16 \
-  --calibration_steps=500
-"""
-from absl import app
-from absl import flags
-from absl import logging
-
-import tensorflow as tf
-from official.common import registry_imports  # pylint: disable=unused-import
-from official.core import exp_factory
-from official.modeling import hyperparams
-from official.vision.beta.serving import export_tflite_lib
-
-FLAGS = flags.FLAGS
-
-flags.DEFINE_string(
-    'experiment',
-    None,
-    'experiment type, e.g. retinanet_resnetfpn_coco',
-    required=True)
-flags.DEFINE_multi_string(
-    'config_file',
-    default='',
-    help='YAML/JSON files which specifies overrides. The override order '
-    'follows the order of args. Note that each file '
-    'can be used as an override template to override the default parameters '
-    'specified in Python. If the same parameter is specified in both '
-    '`--config_file` and `--params_override`, `config_file` will be used '
-    'first, followed by params_override.')
-flags.DEFINE_string(
-    'params_override', '',
-    'The JSON/YAML file or string which specifies the parameter to be overridden'
-    ' on top of `config_file` template.')
-flags.DEFINE_string(
-    'saved_model_dir', None, 'The directory to the saved model.', required=True)
-flags.DEFINE_string(
-    'tflite_path', None, 'The path to the output tflite model.', required=True)
-flags.DEFINE_string(
-    'quant_type',
-    default=None,
-    help='Post training quantization type. Support `int8`, `int8_full`, '
-    '`fp16`, and `default`. See '
-    'https://www.tensorflow.org/lite/performance/post_training_quantization '
-    'for more details.')
-flags.DEFINE_integer('calibration_steps', 500,
-                     'The number of calibration steps for integer model.')
-
-
-def main(_) -> None:
-  params = exp_factory.get_exp_config(FLAGS.experiment)
-  if FLAGS.config_file is not None:
-    for config_file in FLAGS.config_file:
-      params = hyperparams.override_params_dict(
-          params, config_file, is_strict=True)
-  if FLAGS.params_override:
-    params = hyperparams.override_params_dict(
-        params, FLAGS.params_override, is_strict=True)
-
-  params.validate()
-  params.lock()
-
-  logging.info('Converting SavedModel from %s to TFLite model...',
-               FLAGS.saved_model_dir)
-  tflite_model = export_tflite_lib.convert_tflite_model(
-      saved_model_dir=FLAGS.saved_model_dir,
-      quant_type=FLAGS.quant_type,
-      params=params,
-      calibration_steps=FLAGS.calibration_steps)
-
-  with tf.io.gfile.GFile(FLAGS.tflite_path, 'wb') as fw:
-    fw.write(tflite_model)
-
-  logging.info('TFLite model converted and saved to %s.', FLAGS.tflite_path)
-
-
-if __name__ == '__main__':
-  app.run(main)
diff --git a/official/vision/beta/serving/export_tflite_lib.py b/official/vision/beta/serving/export_tflite_lib.py
deleted file mode 100644
index d5535a924..000000000
--- a/official/vision/beta/serving/export_tflite_lib.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Library to facilitate TFLite model conversion."""
-import functools
-from typing import Iterator, List, Optional
-
-from absl import logging
-import tensorflow as tf
-
-from official.core import config_definitions as cfg
-from official.vision.beta import configs
-from official.vision.beta import tasks
-
-
-def create_representative_dataset(
-    params: cfg.ExperimentConfig) -> tf.data.Dataset:
-  """Creates a tf.data.Dataset to load images for representative dataset.
-
-  Args:
-    params: An ExperimentConfig.
-
-  Returns:
-    A tf.data.Dataset instance.
-
-  Raises:
-    ValueError: If task is not supported.
-  """
-  if isinstance(params.task,
-                configs.image_classification.ImageClassificationTask):
-
-    task = tasks.image_classification.ImageClassificationTask(params.task)
-  elif isinstance(params.task, configs.retinanet.RetinaNetTask):
-    task = tasks.retinanet.RetinaNetTask(params.task)
-  elif isinstance(params.task, configs.maskrcnn.MaskRCNNTask):
-    task = tasks.maskrcnn.MaskRCNNTask(params.task)
-  elif isinstance(params.task,
-                  configs.semantic_segmentation.SemanticSegmentationTask):
-    task = tasks.semantic_segmentation.SemanticSegmentationTask(params.task)
-  else:
-    raise ValueError('Task {} not supported.'.format(type(params.task)))
-  # Ensure batch size is 1 for TFLite model.
-  params.task.train_data.global_batch_size = 1
-  params.task.train_data.dtype = 'float32'
-  logging.info('Task config: %s', params.task.as_dict())
-  return task.build_inputs(params=params.task.train_data)
-
-
-def representative_dataset(
-    params: cfg.ExperimentConfig,
-    calibration_steps: int = 2000) -> Iterator[List[tf.Tensor]]:
-  """Creates representative dataset for input calibration.
-
-  Args:
-    params: An ExperimentConfig.
-    calibration_steps: The steps to do calibration.
-
-  Yields:
-    An input image tensor.
-  """
-  dataset = create_representative_dataset(params=params)
-  for image, _ in dataset.take(calibration_steps):
-    # Skip images that do not have 3 channels.
-    if image.shape[-1] != 3:
-      continue
-    yield [image]
-
-
-def convert_tflite_model(saved_model_dir: str,
-                         quant_type: Optional[str] = None,
-                         params: Optional[cfg.ExperimentConfig] = None,
-                         calibration_steps: Optional[int] = 2000) -> bytes:
-  """Converts and returns a TFLite model.
-
-  Args:
-    saved_model_dir: The directory to the SavedModel.
-    quant_type: The post training quantization (PTQ) method. It can be one of
-      `default` (dynamic range), `fp16` (float16), `int8` (integer with float
-      fallback), `int8_full` (integer only) and None (no quantization).
-    params: An optional ExperimentConfig to load and preprocess input images to
-      do calibration for integer quantization.
-    calibration_steps: The steps to do calibration.
-
-  Returns:
-    A converted TFLite model with optional PTQ.
-
-  Raises:
-    ValueError: If `representative_dataset_path` is not present when integer
-      quantization is requested.
- """ - converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir) - if quant_type: - if quant_type.startswith('int8'): - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.representative_dataset = functools.partial( - representative_dataset, - params=params, - calibration_steps=calibration_steps) - if quant_type == 'int8_full': - converter.target_spec.supported_ops = [ - tf.lite.OpsSet.TFLITE_BUILTINS_INT8 - ] - converter.inference_input_type = tf.uint8 # or tf.int8 - converter.inference_output_type = tf.uint8 # or tf.int8 - elif quant_type == 'fp16': - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_types = [tf.float16] - elif quant_type == 'default': - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - return converter.convert() diff --git a/official/vision/beta/serving/export_tflite_lib_test.py b/official/vision/beta/serving/export_tflite_lib_test.py deleted file mode 100644 index 138dde6ef..000000000 --- a/official/vision/beta/serving/export_tflite_lib_test.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for export_tflite_lib.""" -import os - -from absl.testing import parameterized -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from official.core import exp_factory -from official.vision.beta import configs # pylint: disable=unused-import -from official.vision.beta.dataloaders import tfexample_utils -from official.vision.beta.serving import detection as detection_serving -from official.vision.beta.serving import export_tflite_lib -from official.vision.beta.serving import image_classification as image_classification_serving -from official.vision.beta.serving import semantic_segmentation as semantic_segmentation_serving - - -class ExportTfliteLibTest(tf.test.TestCase, parameterized.TestCase): - - def _create_test_tfrecord(self, tfrecord_file, example, num_samples): - examples = [example] * num_samples - tfexample_utils.dump_to_tfrecord( - record_file=tfrecord_file, tf_examples=examples) - - def _export_from_module(self, module, input_type, saved_model_dir): - signatures = module.get_inference_signatures( - {input_type: 'serving_default'}) - tf.saved_model.save(module, saved_model_dir, signatures=signatures) - - @combinations.generate( - combinations.combine( - experiment=['mobilenet_imagenet'], - quant_type=[None, 'default', 'fp16', 'int8', 'int8_full'], - input_image_size=[[224, 224]])) - def test_export_tflite_image_classification(self, experiment, quant_type, - input_image_size): - test_tfrecord_file = os.path.join(self.get_temp_dir(), 'cls_test.tfrecord') - example = tf.train.Example.FromString( - tfexample_utils.create_classification_example( - image_height=input_image_size[0], image_width=input_image_size[1])) - self._create_test_tfrecord( - tfrecord_file=test_tfrecord_file, example=example, num_samples=10) - params = exp_factory.get_exp_config(experiment) - 
params.task.validation_data.input_path = test_tfrecord_file - params.task.train_data.input_path = test_tfrecord_file - temp_dir = self.get_temp_dir() - module = image_classification_serving.ClassificationModule( - params=params, - batch_size=1, - input_image_size=input_image_size, - input_type='tflite') - self._export_from_module( - module=module, - input_type='tflite', - saved_model_dir=os.path.join(temp_dir, 'saved_model')) - - tflite_model = export_tflite_lib.convert_tflite_model( - saved_model_dir=os.path.join(temp_dir, 'saved_model'), - quant_type=quant_type, - params=params, - calibration_steps=5) - - self.assertIsInstance(tflite_model, bytes) - - @combinations.generate( - combinations.combine( - experiment=['retinanet_mobile_coco'], - quant_type=[None, 'default', 'fp16'], - input_image_size=[[384, 384]])) - def test_export_tflite_detection(self, experiment, quant_type, - input_image_size): - test_tfrecord_file = os.path.join(self.get_temp_dir(), 'det_test.tfrecord') - example = tfexample_utils.create_detection_test_example( - image_height=input_image_size[0], - image_width=input_image_size[1], - image_channel=3, - num_instances=10) - self._create_test_tfrecord( - tfrecord_file=test_tfrecord_file, example=example, num_samples=10) - params = exp_factory.get_exp_config(experiment) - params.task.validation_data.input_path = test_tfrecord_file - params.task.train_data.input_path = test_tfrecord_file - temp_dir = self.get_temp_dir() - module = detection_serving.DetectionModule( - params=params, - batch_size=1, - input_image_size=input_image_size, - input_type='tflite') - self._export_from_module( - module=module, - input_type='tflite', - saved_model_dir=os.path.join(temp_dir, 'saved_model')) - - tflite_model = export_tflite_lib.convert_tflite_model( - saved_model_dir=os.path.join(temp_dir, 'saved_model'), - quant_type=quant_type, - params=params, - calibration_steps=5) - - self.assertIsInstance(tflite_model, bytes) - - @combinations.generate( - combinations.combine( - experiment=['mnv2_deeplabv3_pascal'], - quant_type=[None, 'default', 'fp16', 'int8', 'int8_full'], - input_image_size=[[512, 512]])) - def test_export_tflite_semantic_segmentation(self, experiment, quant_type, - input_image_size): - test_tfrecord_file = os.path.join(self.get_temp_dir(), 'seg_test.tfrecord') - example = tfexample_utils.create_segmentation_test_example( - image_height=input_image_size[0], - image_width=input_image_size[1], - image_channel=3) - self._create_test_tfrecord( - tfrecord_file=test_tfrecord_file, example=example, num_samples=10) - params = exp_factory.get_exp_config(experiment) - params.task.validation_data.input_path = test_tfrecord_file - params.task.train_data.input_path = test_tfrecord_file - temp_dir = self.get_temp_dir() - module = semantic_segmentation_serving.SegmentationModule( - params=params, - batch_size=1, - input_image_size=input_image_size, - input_type='tflite') - self._export_from_module( - module=module, - input_type='tflite', - saved_model_dir=os.path.join(temp_dir, 'saved_model')) - - tflite_model = export_tflite_lib.convert_tflite_model( - saved_model_dir=os.path.join(temp_dir, 'saved_model'), - quant_type=quant_type, - params=params, - calibration_steps=5) - - self.assertIsInstance(tflite_model, bytes) - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/serving/export_utils.py b/official/vision/beta/serving/export_utils.py deleted file mode 100644 index 5c9c5ea5e..000000000 --- a/official/vision/beta/serving/export_utils.py +++ /dev/null @@ -1,121 
+0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Helper utils for export library.""" - -from typing import List, Optional -import tensorflow as tf - -# pylint: disable=g-long-lambda - - -def get_image_input_signatures(input_type: str, - batch_size: Optional[int], - input_image_size: List[int], - num_channels: int = 3): - """Gets input signatures for an image. - - Args: - input_type: A `str`, can be either tf_example, image_bytes, or image_tensor. - batch_size: `int` for batch size or None. - input_image_size: List[int] for the height and width of the input image. - num_channels: `int` for number of channels in the input image. - Returns: - tf.TensorSpec of the input tensor. - """ - if input_type == 'image_tensor': - input_signature = tf.TensorSpec( - shape=[batch_size] + [None] * len(input_image_size) + [num_channels], - dtype=tf.uint8) - elif input_type in ['image_bytes', 'serve_examples', 'tf_example']: - input_signature = tf.TensorSpec(shape=[batch_size], dtype=tf.string) - elif input_type == 'tflite': - input_signature = tf.TensorSpec( - shape=[1] + input_image_size + [num_channels], dtype=tf.float32) - else: - raise ValueError('Unrecognized `input_type`') - return input_signature - - -def decode_image(encoded_image_bytes: str, - input_image_size: List[int], - num_channels: int = 3,) -> tf.Tensor: - """Decodes an image bytes to an image tensor. - - Use `tf.image.decode_image` to decode an image if input is expected to be 2D - image; otherwise use `tf.io.decode_raw` to convert the raw bytes to tensor - and reshape it to desire shape. - - Args: - encoded_image_bytes: An encoded image string to be decoded. - input_image_size: List[int] for the desired input size. This will be used to - infer whether the image is 2d or 3d. - num_channels: `int` for number of image channels. - - Returns: - A decoded image tensor. - """ - if len(input_image_size) == 2: - # Decode an image if 2D input is expected. - image_tensor = tf.image.decode_image( - encoded_image_bytes, channels=num_channels) - else: - # Convert raw bytes into a tensor and reshape it, if not 2D input. 
- image_tensor = tf.io.decode_raw(encoded_image_bytes, out_type=tf.uint8) - image_tensor.set_shape([None] * len(input_image_size) + [num_channels]) - return image_tensor - - -def decode_image_tf_example(tf_example_string_tensor: tf.train.Example, - input_image_size: List[int], - num_channels: int = 3, - encoded_key: str = 'image/encoded' - ) -> tf.Tensor: - """Decodes a TF Example to an image tensor.""" - - keys_to_features = { - encoded_key: tf.io.FixedLenFeature((), tf.string, default_value=''), - } - parsed_tensors = tf.io.parse_single_example( - serialized=tf_example_string_tensor, features=keys_to_features) - image_tensor = decode_image( - parsed_tensors[encoded_key], - input_image_size=input_image_size, - num_channels=num_channels) - return image_tensor - - -def parse_image( - inputs, input_type: str, input_image_size: List[int], num_channels: int): - """Parses image.""" - if input_type in ['tf_example', 'serve_examples']: - decode_image_tf_example_fn = ( - lambda x: decode_image_tf_example(x, input_image_size, num_channels)) - image_tensor = tf.map_fn( - decode_image_tf_example_fn, - elems=inputs, - fn_output_signature=tf.TensorSpec( - shape=[None] * len(input_image_size) + [num_channels], - dtype=tf.uint8), - ) - elif input_type == 'image_bytes': - decode_image_fn = lambda x: decode_image(x, input_image_size, num_channels) - image_tensor = tf.map_fn( - decode_image_fn, elems=inputs, - fn_output_signature=tf.TensorSpec( - shape=[None] * len(input_image_size) + [num_channels], - dtype=tf.uint8),) - else: - image_tensor = inputs - return image_tensor diff --git a/official/vision/beta/serving/image_classification.py b/official/vision/beta/serving/image_classification.py deleted file mode 100644 index f4220a897..000000000 --- a/official/vision/beta/serving/image_classification.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Image classification input and model functions for serving/inference.""" - -import tensorflow as tf - -from official.vision.beta.modeling import factory -from official.vision.beta.ops import preprocess_ops -from official.vision.beta.serving import export_base - - -MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) -STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) - - -class ClassificationModule(export_base.ExportModule): - """classification Module.""" - - def _build_model(self): - input_specs = tf.keras.layers.InputSpec( - shape=[self._batch_size] + self._input_image_size + [3]) - - return factory.build_classification_model( - input_specs=input_specs, - model_config=self.params.task.model, - l2_regularizer=None) - - def _build_inputs(self, image): - """Builds classification model inputs for serving.""" - # Center crops and resizes image. 
- image = preprocess_ops.center_crop_image(image) - - image = tf.image.resize( - image, self._input_image_size, method=tf.image.ResizeMethod.BILINEAR) - - image = tf.reshape( - image, [self._input_image_size[0], self._input_image_size[1], 3]) - - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image(image, - offset=MEAN_RGB, - scale=STDDEV_RGB) - return image - - def serve(self, images): - """Cast image to float and run inference. - - Args: - images: uint8 Tensor of shape [batch_size, None, None, 3] - Returns: - Tensor holding classification output logits. - """ - # Skip image preprocessing when input_type is tflite so it is compatible - # with TFLite quantization. - if self._input_type != 'tflite': - with tf.device('cpu:0'): - images = tf.cast(images, dtype=tf.float32) - - images = tf.nest.map_structure( - tf.identity, - tf.map_fn( - self._build_inputs, - elems=images, - fn_output_signature=tf.TensorSpec( - shape=self._input_image_size + [3], dtype=tf.float32), - parallel_iterations=32)) - - logits = self.inference_step(images) - probs = tf.nn.softmax(logits) - - return {'logits': logits, 'probs': probs} diff --git a/official/vision/beta/serving/image_classification_test.py b/official/vision/beta/serving/image_classification_test.py deleted file mode 100644 index 4469ef3a8..000000000 --- a/official/vision/beta/serving/image_classification_test.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Test for image classification export lib.""" - -import io -import os - -from absl.testing import parameterized -import numpy as np -from PIL import Image -import tensorflow as tf - -from official.core import exp_factory -from official.vision.beta import configs # pylint: disable=unused-import -from official.vision.beta.serving import image_classification - - -class ImageClassificationExportTest(tf.test.TestCase, parameterized.TestCase): - - def _get_classification_module(self, input_type): - params = exp_factory.get_exp_config('resnet_imagenet') - params.task.model.backbone.resnet.model_id = 18 - classification_module = image_classification.ClassificationModule( - params, - batch_size=1, - input_image_size=[224, 224], - input_type=input_type) - return classification_module - - def _export_from_module(self, module, input_type, save_directory): - signatures = module.get_inference_signatures( - {input_type: 'serving_default'}) - tf.saved_model.save(module, - save_directory, - signatures=signatures) - - def _get_dummy_input(self, input_type): - """Get dummy input for the given input type.""" - - if input_type == 'image_tensor': - return tf.zeros((1, 224, 224, 3), dtype=np.uint8) - elif input_type == 'image_bytes': - image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8)) - byte_io = io.BytesIO() - image.save(byte_io, 'PNG') - return [byte_io.getvalue()] - elif input_type == 'tf_example': - image_tensor = tf.zeros((224, 224, 3), dtype=tf.uint8) - encoded_jpeg = tf.image.encode_jpeg(tf.constant(image_tensor)).numpy() - example = tf.train.Example( - features=tf.train.Features( - feature={ - 'image/encoded': - tf.train.Feature( - bytes_list=tf.train.BytesList(value=[encoded_jpeg])), - })).SerializeToString() - return [example] - elif input_type == 'tflite': - return tf.zeros((1, 224, 224, 3), dtype=np.float32) - - @parameterized.parameters( - {'input_type': 'image_tensor'}, - {'input_type': 'image_bytes'}, - {'input_type': 'tf_example'}, - {'input_type': 'tflite'}, - ) - def test_export(self, input_type='image_tensor'): - tmp_dir = self.get_temp_dir() - module = self._get_classification_module(input_type) - # Test that the model restores any attrs that are trackable objects - # (eg: tables, resource variables, keras models/layers, tf.hub modules). - module.model.test_trackable = tf.keras.layers.InputLayer(input_shape=(4,)) - - self._export_from_module(module, input_type, tmp_dir) - - self.assertTrue(os.path.exists(os.path.join(tmp_dir, 'saved_model.pb'))) - self.assertTrue(os.path.exists( - os.path.join(tmp_dir, 'variables', 'variables.index'))) - self.assertTrue(os.path.exists( - os.path.join(tmp_dir, 'variables', 'variables.data-00000-of-00001'))) - - imported = tf.saved_model.load(tmp_dir) - classification_fn = imported.signatures['serving_default'] - - images = self._get_dummy_input(input_type) - if input_type != 'tflite': - processed_images = tf.nest.map_structure( - tf.stop_gradient, - tf.map_fn( - module._build_inputs, - elems=tf.zeros((1, 224, 224, 3), dtype=tf.uint8), - fn_output_signature=tf.TensorSpec( - shape=[224, 224, 3], dtype=tf.float32))) - else: - processed_images = images - expected_logits = module.model(processed_images, training=False) - expected_prob = tf.nn.softmax(expected_logits) - out = classification_fn(tf.constant(images)) - - # The imported model should contain any trackable attrs that the original - # model had. 
- self.assertTrue(hasattr(imported.model, 'test_trackable')) - self.assertAllClose(out['logits'].numpy(), expected_logits.numpy()) - self.assertAllClose(out['probs'].numpy(), expected_prob.numpy()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/serving/semantic_segmentation.py b/official/vision/beta/serving/semantic_segmentation.py deleted file mode 100644 index b8d50367e..000000000 --- a/official/vision/beta/serving/semantic_segmentation.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Semantic segmentation input and model functions for serving/inference.""" - -import tensorflow as tf - -from official.vision.beta.modeling import factory -from official.vision.beta.ops import preprocess_ops -from official.vision.beta.serving import export_base - - -MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) -STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) - - -class SegmentationModule(export_base.ExportModule): - """Segmentation Module.""" - - def _build_model(self): - input_specs = tf.keras.layers.InputSpec( - shape=[self._batch_size] + self._input_image_size + [3]) - - return factory.build_segmentation_model( - input_specs=input_specs, - model_config=self.params.task.model, - l2_regularizer=None) - - def _build_inputs(self, image): - """Builds classification model inputs for serving.""" - - # Normalizes image with mean and std pixel values. - image = preprocess_ops.normalize_image(image, - offset=MEAN_RGB, - scale=STDDEV_RGB) - - image, image_info = preprocess_ops.resize_and_crop_image( - image, - self._input_image_size, - padded_size=self._input_image_size, - aug_scale_min=1.0, - aug_scale_max=1.0) - return image, image_info - - def serve(self, images): - """Cast image to float and run inference. - - Args: - images: uint8 Tensor of shape [batch_size, None, None, 3] - Returns: - Tensor holding classification output logits. - """ - # Skip image preprocessing when input_type is tflite so it is compatible - # with TFLite quantization. 
- image_info = None - if self._input_type != 'tflite': - with tf.device('cpu:0'): - images = tf.cast(images, dtype=tf.float32) - images_spec = tf.TensorSpec( - shape=self._input_image_size + [3], dtype=tf.float32) - image_info_spec = tf.TensorSpec(shape=[4, 2], dtype=tf.float32) - - images, image_info = tf.nest.map_structure( - tf.identity, - tf.map_fn( - self._build_inputs, - elems=images, - fn_output_signature=(images_spec, image_info_spec), - parallel_iterations=32)) - - outputs = self.inference_step(images) - outputs['logits'] = tf.image.resize( - outputs['logits'], self._input_image_size, method='bilinear') - - if image_info is not None: - outputs.update({'image_info': image_info}) - - return outputs diff --git a/official/vision/beta/serving/semantic_segmentation_test.py b/official/vision/beta/serving/semantic_segmentation_test.py deleted file mode 100644 index 24a2df048..000000000 --- a/official/vision/beta/serving/semantic_segmentation_test.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Test for semantic segmentation export lib.""" - -import io -import os - -from absl.testing import parameterized -import numpy as np -from PIL import Image -import tensorflow as tf - -from official.core import exp_factory -from official.vision.beta import configs # pylint: disable=unused-import -from official.vision.beta.serving import semantic_segmentation - - -class SemanticSegmentationExportTest(tf.test.TestCase, parameterized.TestCase): - - def _get_segmentation_module(self, input_type): - params = exp_factory.get_exp_config('mnv2_deeplabv3_pascal') - segmentation_module = semantic_segmentation.SegmentationModule( - params, - batch_size=1, - input_image_size=[112, 112], - input_type=input_type) - return segmentation_module - - def _export_from_module(self, module, input_type, save_directory): - signatures = module.get_inference_signatures( - {input_type: 'serving_default'}) - tf.saved_model.save(module, save_directory, signatures=signatures) - - def _get_dummy_input(self, input_type): - """Get dummy input for the given input type.""" - - if input_type == 'image_tensor': - return tf.zeros((1, 112, 112, 3), dtype=np.uint8) - elif input_type == 'image_bytes': - image = Image.fromarray(np.zeros((112, 112, 3), dtype=np.uint8)) - byte_io = io.BytesIO() - image.save(byte_io, 'PNG') - return [byte_io.getvalue()] - elif input_type == 'tf_example': - image_tensor = tf.zeros((112, 112, 3), dtype=tf.uint8) - encoded_jpeg = tf.image.encode_jpeg(tf.constant(image_tensor)).numpy() - example = tf.train.Example( - features=tf.train.Features( - feature={ - 'image/encoded': - tf.train.Feature( - bytes_list=tf.train.BytesList(value=[encoded_jpeg])), - })).SerializeToString() - return [example] - elif input_type == 'tflite': - return tf.zeros((1, 112, 112, 3), dtype=np.float32) - - @parameterized.parameters( - {'input_type': 'image_tensor'}, - {'input_type': 'image_bytes'}, - {'input_type': 'tf_example'}, - 
{'input_type': 'tflite'}, - ) - def test_export(self, input_type='image_tensor'): - tmp_dir = self.get_temp_dir() - module = self._get_segmentation_module(input_type) - - self._export_from_module(module, input_type, tmp_dir) - - self.assertTrue(os.path.exists(os.path.join(tmp_dir, 'saved_model.pb'))) - self.assertTrue( - os.path.exists(os.path.join(tmp_dir, 'variables', 'variables.index'))) - self.assertTrue( - os.path.exists( - os.path.join(tmp_dir, 'variables', - 'variables.data-00000-of-00001'))) - - imported = tf.saved_model.load(tmp_dir) - segmentation_fn = imported.signatures['serving_default'] - - images = self._get_dummy_input(input_type) - if input_type != 'tflite': - processed_images, _ = tf.nest.map_structure( - tf.stop_gradient, - tf.map_fn( - module._build_inputs, - elems=tf.zeros((1, 112, 112, 3), dtype=tf.uint8), - fn_output_signature=(tf.TensorSpec( - shape=[112, 112, 3], dtype=tf.float32), - tf.TensorSpec( - shape=[4, 2], dtype=tf.float32)))) - else: - processed_images = images - expected_output = tf.image.resize( - module.model(processed_images, training=False)['logits'], [112, 112], - method='bilinear') - out = segmentation_fn(tf.constant(images)) - self.assertAllClose(out['logits'].numpy(), expected_output.numpy()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/serving/video_classification.py b/official/vision/beta/serving/video_classification.py deleted file mode 100644 index e86fd41c3..000000000 --- a/official/vision/beta/serving/video_classification.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Video classification input and model functions for serving/inference.""" -from typing import Mapping, Dict, Text - -import tensorflow as tf - -from official.vision.beta.dataloaders import video_input -from official.vision.beta.serving import export_base -from official.vision.beta.tasks import video_classification - -MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) -STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) - - -class VideoClassificationModule(export_base.ExportModule): - """Video classification Module.""" - - def _build_model(self): - input_params = self.params.task.train_data - self._num_frames = input_params.feature_shape[0] - self._stride = input_params.temporal_stride - self._min_resize = input_params.min_image_size - self._crop_size = input_params.feature_shape[1] - - self._output_audio = input_params.output_audio - task = video_classification.VideoClassificationTask(self.params.task) - return task.build_model() - - def _decode_tf_example(self, encoded_inputs: tf.Tensor): - sequence_description = { - # Each image is a string encoding JPEG. 
- video_input.IMAGE_KEY: - tf.io.FixedLenSequenceFeature((), tf.string), - } - if self._output_audio: - sequence_description[self._params.task.validation_data.audio_feature] = ( - tf.io.VarLenFeature(dtype=tf.float32)) - _, decoded_tensors = tf.io.parse_single_sequence_example( - encoded_inputs, {}, sequence_description) - for key, value in decoded_tensors.items(): - if isinstance(value, tf.SparseTensor): - decoded_tensors[key] = tf.sparse.to_dense(value) - return decoded_tensors - - def _preprocess_image(self, image): - image = video_input.process_image( - image=image, - is_training=False, - num_frames=self._num_frames, - stride=self._stride, - num_test_clips=1, - min_resize=self._min_resize, - crop_size=self._crop_size, - num_crops=1) - image = tf.cast(image, tf.float32) # Use config. - features = {'image': image} - return features - - def _preprocess_audio(self, audio): - features = {} - audio = tf.cast(audio, dtype=tf.float32) # Use config. - audio = video_input.preprocess_ops_3d.sample_sequence( - audio, 20, random=False, stride=1) - audio = tf.ensure_shape( - audio, self._params.task.validation_data.audio_feature_shape) - features['audio'] = audio - return features - - @tf.function - def inference_from_tf_example( - self, encoded_inputs: tf.Tensor) -> Mapping[str, tf.Tensor]: - with tf.device('cpu:0'): - if self._output_audio: - inputs = tf.map_fn( - self._decode_tf_example, (encoded_inputs), - fn_output_signature={ - video_input.IMAGE_KEY: tf.string, - self._params.task.validation_data.audio_feature: tf.float32 - }) - return self.serve(inputs['image'], inputs['audio']) - else: - inputs = tf.map_fn( - self._decode_tf_example, (encoded_inputs), - fn_output_signature={ - video_input.IMAGE_KEY: tf.string, - }) - return self.serve(inputs[video_input.IMAGE_KEY], tf.zeros([1, 1])) - - @tf.function - def inference_from_image_tensors( - self, input_frames: tf.Tensor) -> Mapping[str, tf.Tensor]: - return self.serve(input_frames, tf.zeros([1, 1])) - - @tf.function - def inference_from_image_audio_tensors( - self, input_frames: tf.Tensor, - input_audio: tf.Tensor) -> Mapping[str, tf.Tensor]: - return self.serve(input_frames, input_audio) - - @tf.function - def inference_from_image_bytes(self, inputs: tf.Tensor): - raise NotImplementedError( - 'Video classification do not support image bytes input.') - - def serve(self, input_frames: tf.Tensor, input_audio: tf.Tensor): - """Cast image to float and run inference. - - Args: - input_frames: uint8 Tensor of shape [batch_size, None, None, 3] - input_audio: float32 - - Returns: - Tensor holding classification output logits. - """ - with tf.device('cpu:0'): - inputs = tf.map_fn( - self._preprocess_image, (input_frames), - fn_output_signature={ - 'image': tf.float32, - }) - if self._output_audio: - inputs.update( - tf.map_fn( - self._preprocess_audio, (input_audio), - fn_output_signature={'audio': tf.float32})) - logits = self.inference_step(inputs) - if self.params.task.train_data.is_multilabel: - probs = tf.math.sigmoid(logits) - else: - probs = tf.nn.softmax(logits) - return {'logits': logits, 'probs': probs} - - def get_inference_signatures(self, function_keys: Dict[Text, Text]): - """Gets defined function signatures. - - Args: - function_keys: A dictionary with keys as the function to create signature - for and values as the signature keys when returns. - - Returns: - A dictionary with key as signature key and value as concrete functions - that can be used for tf.saved_model.save. 
- """ - signatures = {} - for key, def_name in function_keys.items(): - if key == 'image_tensor': - input_signature = tf.TensorSpec( - shape=[self._batch_size] + self._input_image_size + [3], - dtype=tf.uint8, - name='INPUT_FRAMES') - signatures[ - def_name] = self.inference_from_image_tensors.get_concrete_function( - input_signature) - elif key == 'frames_audio': - input_signature = [ - tf.TensorSpec( - shape=[self._batch_size] + self._input_image_size + [3], - dtype=tf.uint8, - name='INPUT_FRAMES'), - tf.TensorSpec( - shape=[self._batch_size] + - self.params.task.train_data.audio_feature_shape, - dtype=tf.float32, - name='INPUT_AUDIO') - ] - signatures[ - def_name] = self.inference_from_image_audio_tensors.get_concrete_function( - input_signature) - elif key == 'serve_examples' or key == 'tf_example': - input_signature = tf.TensorSpec( - shape=[self._batch_size], dtype=tf.string) - signatures[ - def_name] = self.inference_from_tf_example.get_concrete_function( - input_signature) - else: - raise ValueError('Unrecognized `input_type`') - return signatures diff --git a/official/vision/beta/serving/video_classification_test.py b/official/vision/beta/serving/video_classification_test.py deleted file mode 100644 index fe348d9dc..000000000 --- a/official/vision/beta/serving/video_classification_test.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# import io -import os -import random - -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.core import exp_factory -from official.vision.beta import configs # pylint: disable=unused-import -from official.vision.beta.dataloaders import tfexample_utils -from official.vision.beta.serving import video_classification - - -class VideoClassificationTest(tf.test.TestCase, parameterized.TestCase): - - def _get_classification_module(self): - params = exp_factory.get_exp_config('video_classification_ucf101') - params.task.train_data.feature_shape = (8, 64, 64, 3) - params.task.validation_data.feature_shape = (8, 64, 64, 3) - params.task.model.backbone.resnet_3d.model_id = 50 - classification_module = video_classification.VideoClassificationModule( - params, batch_size=1, input_image_size=[8, 64, 64]) - return classification_module - - def _export_from_module(self, module, input_type, save_directory): - signatures = module.get_inference_signatures( - {input_type: 'serving_default'}) - tf.saved_model.save(module, save_directory, signatures=signatures) - - def _get_dummy_input(self, input_type, module=None): - """Get dummy input for the given input type.""" - - if input_type == 'image_tensor': - images = np.random.randint( - low=0, high=255, size=(1, 8, 64, 64, 3), dtype=np.uint8) - # images = np.zeros((1, 8, 64, 64, 3), dtype=np.uint8) - return images, images - elif input_type == 'tf_example': - example = tfexample_utils.make_video_test_example( - image_shape=(64, 64, 3), - audio_shape=(20, 128), - label=random.randint(0, 100)).SerializeToString() - images = tf.nest.map_structure( - tf.stop_gradient, - tf.map_fn( - module._decode_tf_example, - elems=tf.constant([example]), - fn_output_signature={ - video_classification.video_input.IMAGE_KEY: tf.string, - })) - images = images[video_classification.video_input.IMAGE_KEY] - return [example], images - else: - raise ValueError(f'{input_type}') - - @parameterized.parameters( - {'input_type': 'image_tensor'}, - {'input_type': 'tf_example'}, - ) - def test_export(self, input_type): - tmp_dir = self.get_temp_dir() - module = self._get_classification_module() - - self._export_from_module(module, input_type, tmp_dir) - - self.assertTrue(os.path.exists(os.path.join(tmp_dir, 'saved_model.pb'))) - self.assertTrue( - os.path.exists(os.path.join(tmp_dir, 'variables', 'variables.index'))) - self.assertTrue( - os.path.exists( - os.path.join(tmp_dir, 'variables', - 'variables.data-00000-of-00001'))) - - imported = tf.saved_model.load(tmp_dir) - classification_fn = imported.signatures['serving_default'] - - images, images_tensor = self._get_dummy_input(input_type, module) - processed_images = tf.nest.map_structure( - tf.stop_gradient, - tf.map_fn( - module._preprocess_image, - elems=images_tensor, - fn_output_signature={ - 'image': tf.float32, - })) - expected_logits = module.model(processed_images, training=False) - expected_prob = tf.nn.softmax(expected_logits) - out = classification_fn(tf.constant(images)) - - # The imported model should contain any trackable attrs that the original - # model had. 
- self.assertAllClose(out['logits'].numpy(), expected_logits.numpy()) - self.assertAllClose(out['probs'].numpy(), expected_prob.numpy()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/official/vision/beta/tasks/__init__.py b/official/vision/beta/tasks/__init__.py deleted file mode 100644 index 4c10fc510..000000000 --- a/official/vision/beta/tasks/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tasks package definition.""" - -from official.vision.beta.tasks import image_classification -from official.vision.beta.tasks import maskrcnn -from official.vision.beta.tasks import retinanet -from official.vision.beta.tasks import semantic_segmentation -from official.vision.beta.tasks import video_classification diff --git a/official/vision/beta/tasks/image_classification.py b/official/vision/beta/tasks/image_classification.py deleted file mode 100644 index 748e9ab04..000000000 --- a/official/vision/beta/tasks/image_classification.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Image classification task definition.""" -from typing import Any, Optional, List, Tuple -from absl import logging -import tensorflow as tf - -from official.common import dataset_fn -from official.core import base_task -from official.core import task_factory -from official.modeling import tf_utils -from official.vision.beta.configs import image_classification as exp_cfg -from official.vision.beta.dataloaders import classification_input -from official.vision.beta.dataloaders import input_reader_factory -from official.vision.beta.dataloaders import tfds_factory -from official.vision.beta.modeling import factory -from official.vision.beta.ops import augment - - -@task_factory.register_task_cls(exp_cfg.ImageClassificationTask) -class ImageClassificationTask(base_task.Task): - """A task for image classification.""" - - def build_model(self): - """Builds classification model.""" - input_specs = tf.keras.layers.InputSpec( - shape=[None] + self.task_config.model.input_size) - - l2_weight_decay = self.task_config.losses.l2_weight_decay - # Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss. 
- # (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2) - # (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss) - l2_regularizer = (tf.keras.regularizers.l2( - l2_weight_decay / 2.0) if l2_weight_decay else None) - - model = factory.build_classification_model( - input_specs=input_specs, - model_config=self.task_config.model, - l2_regularizer=l2_regularizer) - return model - - def initialize(self, model: tf.keras.Model): - """Loads pretrained checkpoint.""" - if not self.task_config.init_checkpoint: - return - - ckpt_dir_or_file = self.task_config.init_checkpoint - if tf.io.gfile.isdir(ckpt_dir_or_file): - ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) - - # Restoring checkpoint. - if self.task_config.init_checkpoint_modules == 'all': - ckpt = tf.train.Checkpoint(model=model) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - elif self.task_config.init_checkpoint_modules == 'backbone': - ckpt = tf.train.Checkpoint(backbone=model.backbone) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - else: - raise ValueError( - "Only 'all' or 'backbone' can be used to initialize the model.") - - logging.info('Finished loading pretrained checkpoint from %s', - ckpt_dir_or_file) - - def build_inputs( - self, - params: exp_cfg.DataConfig, - input_context: Optional[tf.distribute.InputContext] = None - ) -> tf.data.Dataset: - """Builds classification input.""" - - num_classes = self.task_config.model.num_classes - input_size = self.task_config.model.input_size - image_field_key = self.task_config.train_data.image_field_key - label_field_key = self.task_config.train_data.label_field_key - is_multilabel = self.task_config.train_data.is_multilabel - - if params.tfds_name: - decoder = tfds_factory.get_classification_decoder(params.tfds_name) - else: - decoder = classification_input.Decoder( - image_field_key=image_field_key, label_field_key=label_field_key, - is_multilabel=is_multilabel) - - parser = classification_input.Parser( - output_size=input_size[:2], - num_classes=num_classes, - image_field_key=image_field_key, - label_field_key=label_field_key, - decode_jpeg_only=params.decode_jpeg_only, - aug_rand_hflip=params.aug_rand_hflip, - aug_type=params.aug_type, - color_jitter=params.color_jitter, - random_erasing=params.random_erasing, - is_multilabel=is_multilabel, - dtype=params.dtype) - - postprocess_fn = None - if params.mixup_and_cutmix: - postprocess_fn = augment.MixupAndCutmix( - mixup_alpha=params.mixup_and_cutmix.mixup_alpha, - cutmix_alpha=params.mixup_and_cutmix.cutmix_alpha, - prob=params.mixup_and_cutmix.prob, - label_smoothing=params.mixup_and_cutmix.label_smoothing, - num_classes=num_classes) - - reader = input_reader_factory.input_reader_generator( - params, - dataset_fn=dataset_fn.pick_dataset_fn(params.file_type), - decoder_fn=decoder.decode, - parser_fn=parser.parse_fn(params.is_training), - postprocess_fn=postprocess_fn) - - dataset = reader.read(input_context=input_context) - - return dataset - - def build_losses(self, - labels: tf.Tensor, - model_outputs: tf.Tensor, - aux_losses: Optional[Any] = None) -> tf.Tensor: - """Builds sparse categorical cross entropy loss. - - Args: - labels: Input groundtruth labels. - model_outputs: Output logits of the classifier. - aux_losses: The auxiliarly loss tensors, i.e. `losses` in tf.keras.Model. - - Returns: - The total loss tensor. 
- """ - losses_config = self.task_config.losses - is_multilabel = self.task_config.train_data.is_multilabel - - if not is_multilabel: - if losses_config.one_hot: - total_loss = tf.keras.losses.categorical_crossentropy( - labels, - model_outputs, - from_logits=True, - label_smoothing=losses_config.label_smoothing) - elif losses_config.soft_labels: - total_loss = tf.nn.softmax_cross_entropy_with_logits( - labels, model_outputs) - else: - total_loss = tf.keras.losses.sparse_categorical_crossentropy( - labels, model_outputs, from_logits=True) - else: - # Multi-label weighted binary cross entropy loss. - total_loss = tf.nn.sigmoid_cross_entropy_with_logits( - labels=labels, logits=model_outputs) - total_loss = tf.reduce_sum(total_loss, axis=-1) - - total_loss = tf_utils.safe_mean(total_loss) - if aux_losses: - total_loss += tf.add_n(aux_losses) - - total_loss = losses_config.loss_weight * total_loss - return total_loss - - def build_metrics(self, - training: bool = True) -> List[tf.keras.metrics.Metric]: - """Gets streaming metrics for training/validation.""" - is_multilabel = self.task_config.train_data.is_multilabel - if not is_multilabel: - k = self.task_config.evaluation.top_k - if (self.task_config.losses.one_hot or - self.task_config.losses.soft_labels): - metrics = [ - tf.keras.metrics.CategoricalAccuracy(name='accuracy'), - tf.keras.metrics.TopKCategoricalAccuracy( - k=k, name='top_{}_accuracy'.format(k))] - else: - metrics = [ - tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'), - tf.keras.metrics.SparseTopKCategoricalAccuracy( - k=k, name='top_{}_accuracy'.format(k))] - else: - metrics = [] - # These metrics destablize the training if included in training. The jobs - # fail due to OOM. - # TODO(arashwan): Investigate adding following metric to train. - if not training: - metrics = [ - tf.keras.metrics.AUC( - name='globalPR-AUC', - curve='PR', - multi_label=False, - from_logits=True), - tf.keras.metrics.AUC( - name='meanPR-AUC', - curve='PR', - multi_label=True, - num_labels=self.task_config.model.num_classes, - from_logits=True), - ] - return metrics - - def train_step(self, - inputs: Tuple[Any, Any], - model: tf.keras.Model, - optimizer: tf.keras.optimizers.Optimizer, - metrics: Optional[List[Any]] = None): - """Does forward and backward. - - Args: - inputs: A tuple of of input tensors of (features, labels). - model: A tf.keras.Model instance. - optimizer: The optimizer for this training step. - metrics: A nested structure of metrics objects. - - Returns: - A dictionary of logs. - """ - features, labels = inputs - is_multilabel = self.task_config.train_data.is_multilabel - if self.task_config.losses.one_hot and not is_multilabel: - labels = tf.one_hot(labels, self.task_config.model.num_classes) - - num_replicas = tf.distribute.get_strategy().num_replicas_in_sync - with tf.GradientTape() as tape: - outputs = model(features, training=True) - # Casting output layer as float32 is necessary when mixed_precision is - # mixed_float16 or mixed_bfloat16 to ensure output is casted as float32. - outputs = tf.nest.map_structure( - lambda x: tf.cast(x, tf.float32), outputs) - - # Computes per-replica loss. - loss = self.build_losses( - model_outputs=outputs, - labels=labels, - aux_losses=model.losses) - # Scales loss as the default gradients allreduce performs sum inside the - # optimizer. - scaled_loss = loss / num_replicas - - # For mixed_precision policy, when LossScaleOptimizer is used, loss is - # scaled for numerical stability. 
-      if isinstance(
-          optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
-        scaled_loss = optimizer.get_scaled_loss(scaled_loss)
-
-    tvars = model.trainable_variables
-    grads = tape.gradient(scaled_loss, tvars)
-    # Scales back gradient before apply_gradients when LossScaleOptimizer is
-    # used.
-    if isinstance(
-        optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
-      grads = optimizer.get_unscaled_gradients(grads)
-    optimizer.apply_gradients(list(zip(grads, tvars)))
-
-    logs = {self.loss: loss}
-    if metrics:
-      self.process_metrics(metrics, labels, outputs)
-    elif model.compiled_metrics:
-      self.process_compiled_metrics(model.compiled_metrics, labels, outputs)
-      logs.update({m.name: m.result() for m in model.metrics})
-    return logs
-
-  def validation_step(self,
-                      inputs: Tuple[Any, Any],
-                      model: tf.keras.Model,
-                      metrics: Optional[List[Any]] = None):
-    """Runs validation step.
-
-    Args:
-      inputs: A tuple of input tensors of (features, labels).
-      model: A tf.keras.Model instance.
-      metrics: A nested structure of metrics objects.
-
-    Returns:
-      A dictionary of logs.
-    """
-    features, labels = inputs
-    one_hot = self.task_config.losses.one_hot
-    soft_labels = self.task_config.losses.soft_labels
-    is_multilabel = self.task_config.train_data.is_multilabel
-    if (one_hot or soft_labels) and not is_multilabel:
-      labels = tf.one_hot(labels, self.task_config.model.num_classes)
-
-    outputs = self.inference_step(features, model)
-    outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
-    loss = self.build_losses(
-        model_outputs=outputs,
-        labels=labels,
-        aux_losses=model.losses)
-
-    logs = {self.loss: loss}
-    if metrics:
-      self.process_metrics(metrics, labels, outputs)
-    elif model.compiled_metrics:
-      self.process_compiled_metrics(model.compiled_metrics, labels, outputs)
-      logs.update({m.name: m.result() for m in model.metrics})
-    return logs
-
-  def inference_step(self, inputs: tf.Tensor, model: tf.keras.Model):
-    """Performs the forward step."""
-    return model(inputs, training=False)
diff --git a/official/vision/beta/tasks/maskrcnn.py b/official/vision/beta/tasks/maskrcnn.py
deleted file mode 100644
index 5ded06de3..000000000
--- a/official/vision/beta/tasks/maskrcnn.py
+++ /dev/null
@@ -1,455 +0,0 @@
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -"""MaskRCNN task definition.""" -import os -from typing import Any, Optional, List, Tuple, Mapping - -from absl import logging -import tensorflow as tf -from official.common import dataset_fn -from official.core import base_task -from official.core import task_factory -from official.vision.beta.configs import maskrcnn as exp_cfg -from official.vision.beta.dataloaders import input_reader_factory -from official.vision.beta.dataloaders import maskrcnn_input -from official.vision.beta.dataloaders import tf_example_decoder -from official.vision.beta.dataloaders import tf_example_label_map_decoder -from official.vision.beta.evaluation import coco_evaluator -from official.vision.beta.evaluation import coco_utils -from official.vision.beta.losses import maskrcnn_losses -from official.vision.beta.modeling import factory - - -def zero_out_disallowed_class_ids(batch_class_ids: tf.Tensor, - allowed_class_ids: List[int]): - """Zero out IDs of classes not in allowed_class_ids. - - Args: - batch_class_ids: A [batch_size, num_instances] int tensor of input - class IDs. - allowed_class_ids: A python list of class IDs which we want to allow. - - Returns: - filtered_class_ids: A [batch_size, num_instances] int tensor with any - class ID not in allowed_class_ids set to 0. - """ - - allowed_class_ids = tf.constant(allowed_class_ids, - dtype=batch_class_ids.dtype) - - match_ids = (batch_class_ids[:, :, tf.newaxis] == - allowed_class_ids[tf.newaxis, tf.newaxis, :]) - - match_ids = tf.reduce_any(match_ids, axis=2) - return tf.where(match_ids, batch_class_ids, tf.zeros_like(batch_class_ids)) - - -@task_factory.register_task_cls(exp_cfg.MaskRCNNTask) -class MaskRCNNTask(base_task.Task): - """A single-replica view of training procedure. - - Mask R-CNN task provides artifacts for training/evalution procedures, - including loading/iterating over Datasets, initializing the model, calculating - the loss, post-processing, and customized metrics with reduction. - """ - - def build_model(self): - """Build Mask R-CNN model.""" - - input_specs = tf.keras.layers.InputSpec( - shape=[None] + self.task_config.model.input_size) - - l2_weight_decay = self.task_config.losses.l2_weight_decay - # Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss. - # (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2) - # (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss) - l2_regularizer = (tf.keras.regularizers.l2( - l2_weight_decay / 2.0) if l2_weight_decay else None) - - model = factory.build_maskrcnn( - input_specs=input_specs, - model_config=self.task_config.model, - l2_regularizer=l2_regularizer) - return model - - def initialize(self, model: tf.keras.Model): - """Loading pretrained checkpoint.""" - if not self.task_config.init_checkpoint: - return - - ckpt_dir_or_file = self.task_config.init_checkpoint - if tf.io.gfile.isdir(ckpt_dir_or_file): - ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) - - # Restoring checkpoint. 
- if self.task_config.init_checkpoint_modules == 'all': - ckpt = tf.train.Checkpoint(**model.checkpoint_items) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - else: - ckpt_items = {} - if 'backbone' in self.task_config.init_checkpoint_modules: - ckpt_items.update(backbone=model.backbone) - if 'decoder' in self.task_config.init_checkpoint_modules: - ckpt_items.update(decoder=model.decoder) - - ckpt = tf.train.Checkpoint(**ckpt_items) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - - logging.info('Finished loading pretrained checkpoint from %s', - ckpt_dir_or_file) - - def build_inputs(self, - params: exp_cfg.DataConfig, - input_context: Optional[tf.distribute.InputContext] = None): - """Build input dataset.""" - decoder_cfg = params.decoder.get() - if params.decoder.type == 'simple_decoder': - decoder = tf_example_decoder.TfExampleDecoder( - include_mask=self._task_config.model.include_mask, - regenerate_source_id=decoder_cfg.regenerate_source_id, - mask_binarize_threshold=decoder_cfg.mask_binarize_threshold) - elif params.decoder.type == 'label_map_decoder': - decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap( - label_map=decoder_cfg.label_map, - include_mask=self._task_config.model.include_mask, - regenerate_source_id=decoder_cfg.regenerate_source_id, - mask_binarize_threshold=decoder_cfg.mask_binarize_threshold) - else: - raise ValueError('Unknown decoder type: {}!'.format(params.decoder.type)) - - parser = maskrcnn_input.Parser( - output_size=self.task_config.model.input_size[:2], - min_level=self.task_config.model.min_level, - max_level=self.task_config.model.max_level, - num_scales=self.task_config.model.anchor.num_scales, - aspect_ratios=self.task_config.model.anchor.aspect_ratios, - anchor_size=self.task_config.model.anchor.anchor_size, - dtype=params.dtype, - rpn_match_threshold=params.parser.rpn_match_threshold, - rpn_unmatched_threshold=params.parser.rpn_unmatched_threshold, - rpn_batch_size_per_im=params.parser.rpn_batch_size_per_im, - rpn_fg_fraction=params.parser.rpn_fg_fraction, - aug_rand_hflip=params.parser.aug_rand_hflip, - aug_scale_min=params.parser.aug_scale_min, - aug_scale_max=params.parser.aug_scale_max, - skip_crowd_during_training=params.parser.skip_crowd_during_training, - max_num_instances=params.parser.max_num_instances, - include_mask=self._task_config.model.include_mask, - mask_crop_size=params.parser.mask_crop_size) - - reader = input_reader_factory.input_reader_generator( - params, - dataset_fn=dataset_fn.pick_dataset_fn(params.file_type), - decoder_fn=decoder.decode, - parser_fn=parser.parse_fn(params.is_training)) - dataset = reader.read(input_context=input_context) - - return dataset - - def build_losses(self, - outputs: Mapping[str, Any], - labels: Mapping[str, Any], - aux_losses: Optional[Any] = None): - """Build Mask R-CNN losses.""" - params = self.task_config - cascade_ious = params.model.roi_sampler.cascade_iou_thresholds - - rpn_score_loss_fn = maskrcnn_losses.RpnScoreLoss( - tf.shape(outputs['box_outputs'])[1]) - rpn_box_loss_fn = maskrcnn_losses.RpnBoxLoss( - params.losses.rpn_huber_loss_delta) - rpn_score_loss = tf.reduce_mean( - rpn_score_loss_fn( - outputs['rpn_scores'], labels['rpn_score_targets'])) - rpn_box_loss = tf.reduce_mean( - rpn_box_loss_fn( - outputs['rpn_boxes'], labels['rpn_box_targets'])) - - frcnn_cls_loss_fn = maskrcnn_losses.FastrcnnClassLoss() - frcnn_box_loss_fn = maskrcnn_losses.FastrcnnBoxLoss( - 
params.losses.frcnn_huber_loss_delta, - params.model.detection_head.class_agnostic_bbox_pred) - - # Final cls/box losses are computed as an average of all detection heads. - frcnn_cls_loss = 0.0 - frcnn_box_loss = 0.0 - num_det_heads = 1 if cascade_ious is None else 1 + len(cascade_ious) - for cas_num in range(num_det_heads): - frcnn_cls_loss_i = tf.reduce_mean( - frcnn_cls_loss_fn( - outputs['class_outputs_{}' - .format(cas_num) if cas_num else 'class_outputs'], - outputs['class_targets_{}' - .format(cas_num) if cas_num else 'class_targets'])) - frcnn_box_loss_i = tf.reduce_mean( - frcnn_box_loss_fn( - outputs['box_outputs_{}'.format(cas_num - ) if cas_num else 'box_outputs'], - outputs['class_targets_{}' - .format(cas_num) if cas_num else 'class_targets'], - outputs['box_targets_{}'.format(cas_num - ) if cas_num else 'box_targets'])) - frcnn_cls_loss += frcnn_cls_loss_i - frcnn_box_loss += frcnn_box_loss_i - frcnn_cls_loss /= num_det_heads - frcnn_box_loss /= num_det_heads - - if params.model.include_mask: - mask_loss_fn = maskrcnn_losses.MaskrcnnLoss() - mask_class_targets = outputs['mask_class_targets'] - if self._task_config.allowed_mask_class_ids is not None: - # Classes with ID=0 are ignored by mask_loss_fn in loss computation. - mask_class_targets = zero_out_disallowed_class_ids( - mask_class_targets, self._task_config.allowed_mask_class_ids) - - mask_loss = tf.reduce_mean( - mask_loss_fn( - outputs['mask_outputs'], - outputs['mask_targets'], - mask_class_targets)) - else: - mask_loss = 0.0 - - model_loss = ( - params.losses.rpn_score_weight * rpn_score_loss + - params.losses.rpn_box_weight * rpn_box_loss + - params.losses.frcnn_class_weight * frcnn_cls_loss + - params.losses.frcnn_box_weight * frcnn_box_loss + - params.losses.mask_weight * mask_loss) - - total_loss = model_loss - if aux_losses: - reg_loss = tf.reduce_sum(aux_losses) - total_loss = model_loss + reg_loss - - total_loss = params.losses.loss_weight * total_loss - losses = { - 'total_loss': total_loss, - 'rpn_score_loss': rpn_score_loss, - 'rpn_box_loss': rpn_box_loss, - 'frcnn_cls_loss': frcnn_cls_loss, - 'frcnn_box_loss': frcnn_box_loss, - 'mask_loss': mask_loss, - 'model_loss': model_loss, - } - return losses - - def _build_coco_metrics(self): - """Build COCO metrics evaluator.""" - if (not self._task_config.model.include_mask - ) or self._task_config.annotation_file: - self.coco_metric = coco_evaluator.COCOEvaluator( - annotation_file=self._task_config.annotation_file, - include_mask=self._task_config.model.include_mask, - per_category_metrics=self._task_config.per_category_metrics) - else: - # Builds COCO-style annotation file if include_mask is True, and - # annotation_file isn't provided. 
- annotation_path = os.path.join(self._logging_dir, 'annotation.json') - if tf.io.gfile.exists(annotation_path): - logging.info( - 'annotation.json file exists, skipping creating the annotation' - ' file.') - else: - if self._task_config.validation_data.num_examples <= 0: - logging.info('validation_data.num_examples needs to be > 0') - if not self._task_config.validation_data.input_path: - logging.info('Can not create annotation file for tfds.') - logging.info( - 'Creating coco-style annotation file: %s', annotation_path) - coco_utils.scan_and_generator_annotation_file( - self._task_config.validation_data.input_path, - self._task_config.validation_data.file_type, - self._task_config.validation_data.num_examples, - self.task_config.model.include_mask, annotation_path, - regenerate_source_id=self._task_config.validation_data.decoder - .simple_decoder.regenerate_source_id) - self.coco_metric = coco_evaluator.COCOEvaluator( - annotation_file=annotation_path, - include_mask=self._task_config.model.include_mask, - per_category_metrics=self._task_config.per_category_metrics) - - def build_metrics(self, training: bool = True): - """Build detection metrics.""" - metrics = [] - if training: - metric_names = [ - 'total_loss', - 'rpn_score_loss', - 'rpn_box_loss', - 'frcnn_cls_loss', - 'frcnn_box_loss', - 'mask_loss', - 'model_loss' - ] - for name in metric_names: - metrics.append(tf.keras.metrics.Mean(name, dtype=tf.float32)) - - else: - if self._task_config.use_coco_metrics: - self._build_coco_metrics() - if self._task_config.use_wod_metrics: - # To use Waymo open dataset metrics, please install one of the pip - # package `waymo-open-dataset-tf-*` from - # https://github.com/waymo-research/waymo-open-dataset/blob/master/docs/quick_start.md#use-pre-compiled-pippip3-packages-for-linux - # Note that the package is built with specific tensorflow version and - # will produce error if it does not match the tf version that is - # currently used. - try: - from official.vision.beta.evaluation import wod_detection_evaluator # pylint: disable=g-import-not-at-top - except ModuleNotFoundError: - logging.error('waymo-open-dataset should be installed to enable Waymo' - ' evaluator.') - raise - self.wod_metric = wod_detection_evaluator.WOD2dDetectionEvaluator() - - return metrics - - def train_step(self, - inputs: Tuple[Any, Any], - model: tf.keras.Model, - optimizer: tf.keras.optimizers.Optimizer, - metrics: Optional[List[Any]] = None): - """Does forward and backward. - - Args: - inputs: a dictionary of input tensors. - model: the model, forward pass definition. - optimizer: the optimizer for this training step. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. - """ - images, labels = inputs - num_replicas = tf.distribute.get_strategy().num_replicas_in_sync - with tf.GradientTape() as tape: - outputs = model( - images, - image_shape=labels['image_info'][:, 1, :], - anchor_boxes=labels['anchor_boxes'], - gt_boxes=labels['gt_boxes'], - gt_classes=labels['gt_classes'], - gt_masks=(labels['gt_masks'] if self.task_config.model.include_mask - else None), - training=True) - outputs = tf.nest.map_structure( - lambda x: tf.cast(x, tf.float32), outputs) - - # Computes per-replica loss. - losses = self.build_losses( - outputs=outputs, labels=labels, aux_losses=model.losses) - scaled_loss = losses['total_loss'] / num_replicas - - # For mixed_precision policy, when LossScaleOptimizer is used, loss is - # scaled for numerical stability. 
- if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - scaled_loss = optimizer.get_scaled_loss(scaled_loss) - - tvars = model.trainable_variables - grads = tape.gradient(scaled_loss, tvars) - # Scales back gradient when LossScaleOptimizer is used. - if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - grads = optimizer.get_unscaled_gradients(grads) - optimizer.apply_gradients(list(zip(grads, tvars))) - - logs = {self.loss: losses['total_loss']} - - if metrics: - for m in metrics: - m.update_state(losses[m.name]) - - return logs - - def validation_step(self, - inputs: Tuple[Any, Any], - model: tf.keras.Model, - metrics: Optional[List[Any]] = None): - """Validatation step. - - Args: - inputs: a dictionary of input tensors. - model: the keras.Model. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. - """ - images, labels = inputs - - outputs = model( - images, - anchor_boxes=labels['anchor_boxes'], - image_shape=labels['image_info'][:, 1, :], - training=False) - - logs = {self.loss: 0} - if self._task_config.use_coco_metrics: - coco_model_outputs = { - 'detection_boxes': outputs['detection_boxes'], - 'detection_scores': outputs['detection_scores'], - 'detection_classes': outputs['detection_classes'], - 'num_detections': outputs['num_detections'], - 'source_id': labels['groundtruths']['source_id'], - 'image_info': labels['image_info'] - } - if self.task_config.model.include_mask: - coco_model_outputs.update({ - 'detection_masks': outputs['detection_masks'], - }) - logs.update( - {self.coco_metric.name: (labels['groundtruths'], coco_model_outputs)}) - - if self.task_config.use_wod_metrics: - wod_model_outputs = { - 'detection_boxes': outputs['detection_boxes'], - 'detection_scores': outputs['detection_scores'], - 'detection_classes': outputs['detection_classes'], - 'num_detections': outputs['num_detections'], - 'source_id': labels['groundtruths']['source_id'], - 'image_info': labels['image_info'] - } - logs.update( - {self.wod_metric.name: (labels['groundtruths'], wod_model_outputs)}) - return logs - - def aggregate_logs(self, state=None, step_outputs=None): - if self._task_config.use_coco_metrics: - if state is None: - self.coco_metric.reset_states() - self.coco_metric.update_state( - step_outputs[self.coco_metric.name][0], - step_outputs[self.coco_metric.name][1]) - if self._task_config.use_wod_metrics: - if state is None: - self.wod_metric.reset_states() - self.wod_metric.update_state( - step_outputs[self.wod_metric.name][0], - step_outputs[self.wod_metric.name][1]) - if state is None: - # Create an arbitrary state to indicate it's not the first step in the - # following calls to this function. - state = True - return state - - def reduce_aggregated_logs(self, aggregated_logs, global_step=None): - logs = {} - if self._task_config.use_coco_metrics: - logs.update(self.coco_metric.result()) - if self._task_config.use_wod_metrics: - logs.update(self.wod_metric.result()) - return logs diff --git a/official/vision/beta/tasks/retinanet.py b/official/vision/beta/tasks/retinanet.py deleted file mode 100644 index e1f28e779..000000000 --- a/official/vision/beta/tasks/retinanet.py +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""RetinaNet task definition.""" -from typing import Any, List, Mapping, Optional, Tuple - -from absl import logging -import tensorflow as tf - -from official.common import dataset_fn -from official.core import base_task -from official.core import task_factory -from official.vision.beta.configs import retinanet as exp_cfg -from official.vision.beta.dataloaders import input_reader_factory -from official.vision.beta.dataloaders import retinanet_input -from official.vision.beta.dataloaders import tf_example_decoder -from official.vision.beta.dataloaders import tfds_factory -from official.vision.beta.dataloaders import tf_example_label_map_decoder -from official.vision.beta.evaluation import coco_evaluator -from official.vision.beta.losses import focal_loss -from official.vision.beta.losses import loss_utils -from official.vision.beta.modeling import factory - - -@task_factory.register_task_cls(exp_cfg.RetinaNetTask) -class RetinaNetTask(base_task.Task): - """A single-replica view of training procedure. - - RetinaNet task provides artifacts for training/evalution procedures, including - loading/iterating over Datasets, initializing the model, calculating the loss, - post-processing, and customized metrics with reduction. - """ - - def build_model(self): - """Build RetinaNet model.""" - - input_specs = tf.keras.layers.InputSpec( - shape=[None] + self.task_config.model.input_size) - - l2_weight_decay = self.task_config.losses.l2_weight_decay - # Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss. - # (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2) - # (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss) - l2_regularizer = (tf.keras.regularizers.l2( - l2_weight_decay / 2.0) if l2_weight_decay else None) - - model = factory.build_retinanet( - input_specs=input_specs, - model_config=self.task_config.model, - l2_regularizer=l2_regularizer) - return model - - def initialize(self, model: tf.keras.Model): - """Loading pretrained checkpoint.""" - if not self.task_config.init_checkpoint: - return - - ckpt_dir_or_file = self.task_config.init_checkpoint - if tf.io.gfile.isdir(ckpt_dir_or_file): - ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) - - # Restoring checkpoint. 
- if self.task_config.init_checkpoint_modules == 'all': - ckpt = tf.train.Checkpoint(**model.checkpoint_items) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - else: - ckpt_items = {} - if 'backbone' in self.task_config.init_checkpoint_modules: - ckpt_items.update(backbone=model.backbone) - if 'decoder' in self.task_config.init_checkpoint_modules: - ckpt_items.update(decoder=model.decoder) - - ckpt = tf.train.Checkpoint(**ckpt_items) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - - logging.info('Finished loading pretrained checkpoint from %s', - ckpt_dir_or_file) - - def build_inputs(self, - params: exp_cfg.DataConfig, - input_context: Optional[tf.distribute.InputContext] = None): - """Build input dataset.""" - - if params.tfds_name: - decoder = tfds_factory.get_detection_decoder(params.tfds_name) - else: - decoder_cfg = params.decoder.get() - if params.decoder.type == 'simple_decoder': - decoder = tf_example_decoder.TfExampleDecoder( - regenerate_source_id=decoder_cfg.regenerate_source_id) - elif params.decoder.type == 'label_map_decoder': - decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap( - label_map=decoder_cfg.label_map, - regenerate_source_id=decoder_cfg.regenerate_source_id) - else: - raise ValueError('Unknown decoder type: {}!'.format( - params.decoder.type)) - - parser = retinanet_input.Parser( - output_size=self.task_config.model.input_size[:2], - min_level=self.task_config.model.min_level, - max_level=self.task_config.model.max_level, - num_scales=self.task_config.model.anchor.num_scales, - aspect_ratios=self.task_config.model.anchor.aspect_ratios, - anchor_size=self.task_config.model.anchor.anchor_size, - dtype=params.dtype, - match_threshold=params.parser.match_threshold, - unmatched_threshold=params.parser.unmatched_threshold, - aug_type=params.parser.aug_type, - aug_rand_hflip=params.parser.aug_rand_hflip, - aug_scale_min=params.parser.aug_scale_min, - aug_scale_max=params.parser.aug_scale_max, - skip_crowd_during_training=params.parser.skip_crowd_during_training, - max_num_instances=params.parser.max_num_instances) - - reader = input_reader_factory.input_reader_generator( - params, - dataset_fn=dataset_fn.pick_dataset_fn(params.file_type), - decoder_fn=decoder.decode, - parser_fn=parser.parse_fn(params.is_training)) - dataset = reader.read(input_context=input_context) - - return dataset - - def build_attribute_loss(self, - attribute_heads: List[exp_cfg.AttributeHead], - outputs: Mapping[str, Any], - labels: Mapping[str, Any], - box_sample_weight: tf.Tensor) -> float: - """Computes attribute loss. - - Args: - attribute_heads: a list of attribute head configs. - outputs: RetinaNet model outputs. - labels: RetinaNet labels. - box_sample_weight: normalized bounding box sample weights. - - Returns: - Attribute loss of all attribute heads. 
- """ - attribute_loss = 0.0 - for head in attribute_heads: - if head.name not in labels['attribute_targets']: - raise ValueError(f'Attribute {head.name} not found in label targets.') - if head.name not in outputs['attribute_outputs']: - raise ValueError(f'Attribute {head.name} not found in model outputs.') - - y_true_att = loss_utils.multi_level_flatten( - labels['attribute_targets'][head.name], last_dim=head.size) - y_pred_att = loss_utils.multi_level_flatten( - outputs['attribute_outputs'][head.name], last_dim=head.size) - if head.type == 'regression': - att_loss_fn = tf.keras.losses.Huber( - 1.0, reduction=tf.keras.losses.Reduction.SUM) - att_loss = att_loss_fn( - y_true=y_true_att, - y_pred=y_pred_att, - sample_weight=box_sample_weight) - else: - raise ValueError(f'Attribute type {head.type} not supported.') - attribute_loss += att_loss - - return attribute_loss - - def build_losses(self, - outputs: Mapping[str, Any], - labels: Mapping[str, Any], - aux_losses: Optional[Any] = None): - """Build RetinaNet losses.""" - params = self.task_config - attribute_heads = self.task_config.model.head.attribute_heads - - cls_loss_fn = focal_loss.FocalLoss( - alpha=params.losses.focal_loss_alpha, - gamma=params.losses.focal_loss_gamma, - reduction=tf.keras.losses.Reduction.SUM) - box_loss_fn = tf.keras.losses.Huber( - params.losses.huber_loss_delta, reduction=tf.keras.losses.Reduction.SUM) - - # Sums all positives in a batch for normalization and avoids zero - # num_positives_sum, which would lead to inf loss during training - cls_sample_weight = labels['cls_weights'] - box_sample_weight = labels['box_weights'] - num_positives = tf.reduce_sum(box_sample_weight) + 1.0 - cls_sample_weight = cls_sample_weight / num_positives - box_sample_weight = box_sample_weight / num_positives - y_true_cls = loss_utils.multi_level_flatten( - labels['cls_targets'], last_dim=None) - y_true_cls = tf.one_hot(y_true_cls, params.model.num_classes) - y_pred_cls = loss_utils.multi_level_flatten( - outputs['cls_outputs'], last_dim=params.model.num_classes) - y_true_box = loss_utils.multi_level_flatten( - labels['box_targets'], last_dim=4) - y_pred_box = loss_utils.multi_level_flatten( - outputs['box_outputs'], last_dim=4) - - cls_loss = cls_loss_fn( - y_true=y_true_cls, y_pred=y_pred_cls, sample_weight=cls_sample_weight) - box_loss = box_loss_fn( - y_true=y_true_box, y_pred=y_pred_box, sample_weight=box_sample_weight) - - model_loss = cls_loss + params.losses.box_loss_weight * box_loss - - if attribute_heads: - model_loss += self.build_attribute_loss(attribute_heads, outputs, labels, - box_sample_weight) - - total_loss = model_loss - if aux_losses: - reg_loss = tf.reduce_sum(aux_losses) - total_loss = model_loss + reg_loss - - total_loss = params.losses.loss_weight * total_loss - - return total_loss, cls_loss, box_loss, model_loss - - def build_metrics(self, training: bool = True): - """Build detection metrics.""" - metrics = [] - metric_names = ['total_loss', 'cls_loss', 'box_loss', 'model_loss'] - for name in metric_names: - metrics.append(tf.keras.metrics.Mean(name, dtype=tf.float32)) - - if not training: - if self.task_config.validation_data.tfds_name and self.task_config.annotation_file: - raise ValueError( - "Can't evaluate using annotation file when TFDS is used.") - self.coco_metric = coco_evaluator.COCOEvaluator( - annotation_file=self.task_config.annotation_file, - include_mask=False, - per_category_metrics=self.task_config.per_category_metrics) - - return metrics - - def train_step(self, - inputs: Tuple[Any, 
Any], - model: tf.keras.Model, - optimizer: tf.keras.optimizers.Optimizer, - metrics: Optional[List[Any]] = None): - """Does forward and backward. - - Args: - inputs: a dictionary of input tensors. - model: the model, forward pass definition. - optimizer: the optimizer for this training step. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. - """ - features, labels = inputs - num_replicas = tf.distribute.get_strategy().num_replicas_in_sync - with tf.GradientTape() as tape: - outputs = model(features, training=True) - outputs = tf.nest.map_structure( - lambda x: tf.cast(x, tf.float32), outputs) - - # Computes per-replica loss. - loss, cls_loss, box_loss, model_loss = self.build_losses( - outputs=outputs, labels=labels, aux_losses=model.losses) - scaled_loss = loss / num_replicas - - # For mixed_precision policy, when LossScaleOptimizer is used, loss is - # scaled for numerical stability. - if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - scaled_loss = optimizer.get_scaled_loss(scaled_loss) - - tvars = model.trainable_variables - grads = tape.gradient(scaled_loss, tvars) - # Scales back gradient when LossScaleOptimizer is used. - if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - grads = optimizer.get_unscaled_gradients(grads) - optimizer.apply_gradients(list(zip(grads, tvars))) - - logs = {self.loss: loss} - - all_losses = { - 'total_loss': loss, - 'cls_loss': cls_loss, - 'box_loss': box_loss, - 'model_loss': model_loss, - } - if metrics: - for m in metrics: - m.update_state(all_losses[m.name]) - logs.update({m.name: m.result()}) - - return logs - - def validation_step(self, - inputs: Tuple[Any, Any], - model: tf.keras.Model, - metrics: Optional[List[Any]] = None): - """Validatation step. - - Args: - inputs: a dictionary of input tensors. - model: the keras.Model. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. 
- """ - features, labels = inputs - - outputs = model(features, anchor_boxes=labels['anchor_boxes'], - image_shape=labels['image_info'][:, 1, :], - training=False) - loss, cls_loss, box_loss, model_loss = self.build_losses( - outputs=outputs, labels=labels, aux_losses=model.losses) - logs = {self.loss: loss} - - all_losses = { - 'total_loss': loss, - 'cls_loss': cls_loss, - 'box_loss': box_loss, - 'model_loss': model_loss, - } - - coco_model_outputs = { - 'detection_boxes': outputs['detection_boxes'], - 'detection_scores': outputs['detection_scores'], - 'detection_classes': outputs['detection_classes'], - 'num_detections': outputs['num_detections'], - 'source_id': labels['groundtruths']['source_id'], - 'image_info': labels['image_info'] - } - logs.update({self.coco_metric.name: (labels['groundtruths'], - coco_model_outputs)}) - if metrics: - for m in metrics: - m.update_state(all_losses[m.name]) - logs.update({m.name: m.result()}) - return logs - - def aggregate_logs(self, state=None, step_outputs=None): - if state is None: - self.coco_metric.reset_states() - state = self.coco_metric - self.coco_metric.update_state(step_outputs[self.coco_metric.name][0], - step_outputs[self.coco_metric.name][1]) - return state - - def reduce_aggregated_logs(self, aggregated_logs, global_step=None): - return self.coco_metric.result() diff --git a/official/vision/beta/tasks/semantic_segmentation.py b/official/vision/beta/tasks/semantic_segmentation.py deleted file mode 100644 index 70d7ce192..000000000 --- a/official/vision/beta/tasks/semantic_segmentation.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Image segmentation task definition.""" -from typing import Any, Optional, List, Tuple, Mapping, Union - -from absl import logging -import tensorflow as tf -from official.common import dataset_fn -from official.core import base_task -from official.core import task_factory -from official.vision.beta.configs import semantic_segmentation as exp_cfg -from official.vision.beta.dataloaders import input_reader_factory -from official.vision.beta.dataloaders import segmentation_input -from official.vision.beta.dataloaders import tfds_factory -from official.vision.beta.evaluation import segmentation_metrics -from official.vision.beta.losses import segmentation_losses -from official.vision.beta.modeling import factory - - -@task_factory.register_task_cls(exp_cfg.SemanticSegmentationTask) -class SemanticSegmentationTask(base_task.Task): - """A task for semantic segmentation.""" - - def build_model(self): - """Builds segmentation model.""" - input_specs = tf.keras.layers.InputSpec( - shape=[None] + self.task_config.model.input_size) - - l2_weight_decay = self.task_config.losses.l2_weight_decay - # Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss. 
- # (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2) - # (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss) - l2_regularizer = (tf.keras.regularizers.l2( - l2_weight_decay / 2.0) if l2_weight_decay else None) - - model = factory.build_segmentation_model( - input_specs=input_specs, - model_config=self.task_config.model, - l2_regularizer=l2_regularizer) - return model - - def initialize(self, model: tf.keras.Model): - """Loads pretrained checkpoint.""" - if not self.task_config.init_checkpoint: - return - - ckpt_dir_or_file = self.task_config.init_checkpoint - if tf.io.gfile.isdir(ckpt_dir_or_file): - ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) - - # Restoring checkpoint. - if 'all' in self.task_config.init_checkpoint_modules: - ckpt = tf.train.Checkpoint(**model.checkpoint_items) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - else: - ckpt_items = {} - if 'backbone' in self.task_config.init_checkpoint_modules: - ckpt_items.update(backbone=model.backbone) - if 'decoder' in self.task_config.init_checkpoint_modules: - ckpt_items.update(decoder=model.decoder) - - ckpt = tf.train.Checkpoint(**ckpt_items) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - - logging.info('Finished loading pretrained checkpoint from %s', - ckpt_dir_or_file) - - def build_inputs(self, - params: exp_cfg.DataConfig, - input_context: Optional[tf.distribute.InputContext] = None): - """Builds classification input.""" - - ignore_label = self.task_config.losses.ignore_label - - if params.tfds_name: - decoder = tfds_factory.get_segmentation_decoder(params.tfds_name) - else: - decoder = segmentation_input.Decoder() - - parser = segmentation_input.Parser( - output_size=params.output_size, - crop_size=params.crop_size, - ignore_label=ignore_label, - resize_eval_groundtruth=params.resize_eval_groundtruth, - groundtruth_padded_size=params.groundtruth_padded_size, - aug_scale_min=params.aug_scale_min, - aug_scale_max=params.aug_scale_max, - aug_rand_hflip=params.aug_rand_hflip, - preserve_aspect_ratio=params.preserve_aspect_ratio, - dtype=params.dtype) - - reader = input_reader_factory.input_reader_generator( - params, - dataset_fn=dataset_fn.pick_dataset_fn(params.file_type), - decoder_fn=decoder.decode, - parser_fn=parser.parse_fn(params.is_training)) - - dataset = reader.read(input_context=input_context) - - return dataset - - def build_losses(self, - labels: Mapping[str, tf.Tensor], - model_outputs: Union[Mapping[str, tf.Tensor], tf.Tensor], - aux_losses: Optional[Any] = None): - """Segmentation loss. - - Args: - labels: labels. - model_outputs: Output logits of the classifier. - aux_losses: auxiliarly loss tensors, i.e. `losses` in keras.Model. - - Returns: - The total loss tensor. 
- """ - loss_params = self._task_config.losses - segmentation_loss_fn = segmentation_losses.SegmentationLoss( - loss_params.label_smoothing, - loss_params.class_weights, - loss_params.ignore_label, - use_groundtruth_dimension=loss_params.use_groundtruth_dimension, - top_k_percent_pixels=loss_params.top_k_percent_pixels) - - total_loss = segmentation_loss_fn(model_outputs['logits'], labels['masks']) - - if 'mask_scores' in model_outputs: - mask_scoring_loss_fn = segmentation_losses.MaskScoringLoss( - loss_params.ignore_label) - total_loss += mask_scoring_loss_fn( - model_outputs['mask_scores'], - model_outputs['logits'], - labels['masks']) - - if aux_losses: - total_loss += tf.add_n(aux_losses) - - total_loss = loss_params.loss_weight * total_loss - - return total_loss - - def process_metrics(self, metrics, labels, model_outputs, **kwargs): - """Process and update metrics. - - Called when using custom training loop API. - - Args: - metrics: a nested structure of metrics objects. The return of function - self.build_metrics. - labels: a tensor or a nested structure of tensors. - model_outputs: a tensor or a nested structure of tensors. For example, - output of the keras model built by self.build_model. - **kwargs: other args. - """ - for metric in metrics: - if 'mask_scores_mse' is metric.name: - actual_mask_scores = segmentation_losses.get_actual_mask_scores( - model_outputs['logits'], labels['masks'], - self.task_config.losses.ignore_label) - metric.update_state(actual_mask_scores, model_outputs['mask_scores']) - else: - metric.update_state(labels, model_outputs['logits']) - - def build_metrics(self, training: bool = True): - """Gets streaming metrics for training/validation.""" - metrics = [] - if training and self.task_config.evaluation.report_train_mean_iou: - metrics.append(segmentation_metrics.MeanIoU( - name='mean_iou', - num_classes=self.task_config.model.num_classes, - rescale_predictions=False, - dtype=tf.float32)) - if self.task_config.model.get('mask_scoring_head'): - metrics.append( - tf.keras.metrics.MeanSquaredError(name='mask_scores_mse')) - else: - self.iou_metric = segmentation_metrics.PerClassIoU( - name='per_class_iou', - num_classes=self.task_config.model.num_classes, - rescale_predictions=not self.task_config.validation_data - .resize_eval_groundtruth, - dtype=tf.float32) - if self.task_config.validation_data.resize_eval_groundtruth and self.task_config.model.get('mask_scoring_head'): # pylint: disable=line-too-long - # Masks scores metric can only be computed if labels are scaled to match - # preticted mask scores. - metrics.append( - tf.keras.metrics.MeanSquaredError(name='mask_scores_mse')) - - # Update state on CPU if TPUStrategy due to dynamic resizing. - self._process_iou_metric_on_cpu = isinstance( - tf.distribute.get_strategy(), tf.distribute.TPUStrategy) - - return metrics - - def train_step(self, - inputs: Tuple[Any, Any], - model: tf.keras.Model, - optimizer: tf.keras.optimizers.Optimizer, - metrics: Optional[List[Any]] = None): - """Does forward and backward. - - Args: - inputs: a dictionary of input tensors. - model: the model, forward pass definition. - optimizer: the optimizer for this training step. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. 
- """ - features, labels = inputs - - input_partition_dims = self.task_config.train_input_partition_dims - if input_partition_dims: - strategy = tf.distribute.get_strategy() - features = strategy.experimental_split_to_logical_devices( - features, input_partition_dims) - - num_replicas = tf.distribute.get_strategy().num_replicas_in_sync - with tf.GradientTape() as tape: - outputs = model(features, training=True) - if isinstance(outputs, tf.Tensor): - outputs = {'logits': outputs} - # Casting output layer as float32 is necessary when mixed_precision is - # mixed_float16 or mixed_bfloat16 to ensure output is casted as float32. - outputs = tf.nest.map_structure( - lambda x: tf.cast(x, tf.float32), outputs) - - # Computes per-replica loss. - loss = self.build_losses( - model_outputs=outputs, labels=labels, aux_losses=model.losses) - # Scales loss as the default gradients allreduce performs sum inside the - # optimizer. - scaled_loss = loss / num_replicas - - # For mixed_precision policy, when LossScaleOptimizer is used, loss is - # scaled for numerical stability. - if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - scaled_loss = optimizer.get_scaled_loss(scaled_loss) - - tvars = model.trainable_variables - grads = tape.gradient(scaled_loss, tvars) - # Scales back gradient before apply_gradients when LossScaleOptimizer is - # used. - if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - grads = optimizer.get_unscaled_gradients(grads) - optimizer.apply_gradients(list(zip(grads, tvars))) - - logs = {self.loss: loss} - if metrics: - self.process_metrics(metrics, labels, outputs) - logs.update({m.name: m.result() for m in metrics}) - - return logs - - def validation_step(self, - inputs: Tuple[Any, Any], - model: tf.keras.Model, - metrics: Optional[List[Any]] = None): - """Validatation step. - - Args: - inputs: a dictionary of input tensors. - model: the keras.Model. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. 
- """ - features, labels = inputs - - input_partition_dims = self.task_config.eval_input_partition_dims - if input_partition_dims: - strategy = tf.distribute.get_strategy() - features = strategy.experimental_split_to_logical_devices( - features, input_partition_dims) - - outputs = self.inference_step(features, model) - if isinstance(outputs, tf.Tensor): - outputs = {'logits': outputs} - outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs) - - if self.task_config.validation_data.resize_eval_groundtruth: - loss = self.build_losses(model_outputs=outputs, labels=labels, - aux_losses=model.losses) - else: - loss = 0 - - logs = {self.loss: loss} - - if self._process_iou_metric_on_cpu: - logs.update({self.iou_metric.name: (labels, outputs['logits'])}) - else: - self.iou_metric.update_state(labels, outputs['logits']) - - if metrics: - self.process_metrics(metrics, labels, outputs) - logs.update({m.name: m.result() for m in metrics}) - - return logs - - def inference_step(self, inputs: tf.Tensor, model: tf.keras.Model): - """Performs the forward step.""" - return model(inputs, training=False) - - def aggregate_logs(self, state=None, step_outputs=None): - if state is None: - self.iou_metric.reset_states() - state = self.iou_metric - if self._process_iou_metric_on_cpu: - self.iou_metric.update_state(step_outputs[self.iou_metric.name][0], - step_outputs[self.iou_metric.name][1]) - return state - - def reduce_aggregated_logs(self, aggregated_logs, global_step=None): - result = {} - ious = self.iou_metric.result() - # TODO(arashwan): support loading class name from a label map file. - if self.task_config.evaluation.report_per_class_iou: - for i, value in enumerate(ious.numpy()): - result.update({'iou/{}'.format(i): value}) - # Computes mean IoU - result.update({'mean_iou': tf.reduce_mean(ious).numpy()}) - return result diff --git a/official/vision/beta/tasks/video_classification.py b/official/vision/beta/tasks/video_classification.py deleted file mode 100644 index 599a5ee90..000000000 --- a/official/vision/beta/tasks/video_classification.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Video classification task definition.""" -from typing import Any, Optional, List, Tuple - -from absl import logging -import tensorflow as tf -from official.core import base_task -from official.core import task_factory -from official.modeling import tf_utils -from official.vision.beta.configs import video_classification as exp_cfg -from official.vision.beta.dataloaders import input_reader_factory -from official.vision.beta.dataloaders import video_input -from official.vision.beta.modeling import factory_3d - - -@task_factory.register_task_cls(exp_cfg.VideoClassificationTask) -class VideoClassificationTask(base_task.Task): - """A task for video classification.""" - - def _get_num_classes(self): - """Gets the number of classes.""" - return self.task_config.train_data.num_classes - - def _get_feature_shape(self): - """Get the common feature shape for train and eval.""" - return [ - d1 if d1 == d2 else None - for d1, d2 in zip(self.task_config.train_data.feature_shape, - self.task_config.validation_data.feature_shape) - ] - - def _get_num_test_views(self): - """Gets number of views for test.""" - num_test_clips = self.task_config.validation_data.num_test_clips - num_test_crops = self.task_config.validation_data.num_test_crops - num_test_views = num_test_clips * num_test_crops - return num_test_views - - def _is_multilabel(self): - """If the label is multi-labels.""" - return self.task_config.train_data.is_multilabel - - def build_model(self): - """Builds video classification model.""" - common_input_shape = self._get_feature_shape() - input_specs = tf.keras.layers.InputSpec(shape=[None] + common_input_shape) - logging.info('Build model input %r', common_input_shape) - - l2_weight_decay = self.task_config.losses.l2_weight_decay - # Divide weight decay by 2.0 to match the implementation of tf.nn.l2_loss. - # (https://www.tensorflow.org/api_docs/python/tf/keras/regularizers/l2) - # (https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss) - l2_regularizer = (tf.keras.regularizers.l2( - l2_weight_decay / 2.0) if l2_weight_decay else None) - - model = factory_3d.build_model( - self.task_config.model.model_type, - input_specs=input_specs, - model_config=self.task_config.model, - num_classes=self._get_num_classes(), - l2_regularizer=l2_regularizer) - return model - - def initialize(self, model: tf.keras.Model): - """Loads pretrained checkpoint.""" - if not self.task_config.init_checkpoint: - return - - ckpt_dir_or_file = self.task_config.init_checkpoint - if tf.io.gfile.isdir(ckpt_dir_or_file): - ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) - - # Restoring checkpoint. 
- if self.task_config.init_checkpoint_modules == 'all': - ckpt = tf.train.Checkpoint(**model.checkpoint_items) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - elif self.task_config.init_checkpoint_modules == 'backbone': - ckpt = tf.train.Checkpoint(backbone=model.backbone) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - else: - raise ValueError( - "Only 'all' or 'backbone' can be used to initialize the model.") - - logging.info('Finished loading pretrained checkpoint from %s', - ckpt_dir_or_file) - - def _get_dataset_fn(self, params): - if params.file_type == 'tfrecord': - return tf.data.TFRecordDataset - else: - raise ValueError('Unknown input file type {!r}'.format(params.file_type)) - - def _get_decoder_fn(self, params): - if params.tfds_name: - decoder = video_input.VideoTfdsDecoder( - image_key=params.image_field_key, label_key=params.label_field_key) - else: - decoder = video_input.Decoder( - image_key=params.image_field_key, label_key=params.label_field_key) - if self.task_config.train_data.output_audio: - assert self.task_config.train_data.audio_feature, 'audio feature is empty' - decoder.add_feature(self.task_config.train_data.audio_feature, - tf.io.VarLenFeature(dtype=tf.float32)) - return decoder.decode - - def build_inputs(self, - params: exp_cfg.DataConfig, - input_context: Optional[tf.distribute.InputContext] = None): - """Builds classification input.""" - - parser = video_input.Parser( - input_params=params, - image_key=params.image_field_key, - label_key=params.label_field_key) - postprocess_fn = video_input.PostBatchProcessor(params) - - reader = input_reader_factory.input_reader_generator( - params, - dataset_fn=self._get_dataset_fn(params), - decoder_fn=self._get_decoder_fn(params), - parser_fn=parser.parse_fn(params.is_training), - postprocess_fn=postprocess_fn) - - dataset = reader.read(input_context=input_context) - - return dataset - - def build_losses(self, - labels: Any, - model_outputs: Any, - aux_losses: Optional[Any] = None): - """Sparse categorical cross entropy loss. - - Args: - labels: labels. - model_outputs: Output logits of the classifier. - aux_losses: auxiliarly loss tensors, i.e. `losses` in keras.Model. - - Returns: - The total loss tensor. 
- """ - all_losses = {} - losses_config = self.task_config.losses - total_loss = None - if self._is_multilabel(): - entropy = -tf.reduce_mean( - tf.reduce_sum(model_outputs * tf.math.log(model_outputs + 1e-8), -1)) - total_loss = tf.keras.losses.binary_crossentropy( - labels, model_outputs, from_logits=False) - all_losses.update({ - 'class_loss': total_loss, - 'entropy': entropy, - }) - else: - if losses_config.one_hot: - total_loss = tf.keras.losses.categorical_crossentropy( - labels, - model_outputs, - from_logits=False, - label_smoothing=losses_config.label_smoothing) - else: - total_loss = tf.keras.losses.sparse_categorical_crossentropy( - labels, model_outputs, from_logits=False) - - total_loss = tf_utils.safe_mean(total_loss) - all_losses.update({ - 'class_loss': total_loss, - }) - if aux_losses: - all_losses.update({ - 'reg_loss': aux_losses, - }) - total_loss += tf.add_n(aux_losses) - all_losses[self.loss] = total_loss - - return all_losses - - def build_metrics(self, training: bool = True): - """Gets streaming metrics for training/validation.""" - if self.task_config.losses.one_hot: - metrics = [ - tf.keras.metrics.CategoricalAccuracy(name='accuracy'), - tf.keras.metrics.TopKCategoricalAccuracy(k=1, name='top_1_accuracy'), - tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top_5_accuracy') - ] - if self._is_multilabel(): - metrics.append( - tf.keras.metrics.AUC( - curve='ROC', multi_label=self._is_multilabel(), name='ROC-AUC')) - metrics.append( - tf.keras.metrics.RecallAtPrecision( - 0.95, name='RecallAtPrecision95')) - metrics.append( - tf.keras.metrics.AUC( - curve='PR', multi_label=self._is_multilabel(), name='PR-AUC')) - if self.task_config.metrics.use_per_class_recall: - for i in range(self._get_num_classes()): - metrics.append( - tf.keras.metrics.Recall(class_id=i, name=f'recall-{i}')) - else: - metrics = [ - tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'), - tf.keras.metrics.SparseTopKCategoricalAccuracy( - k=1, name='top_1_accuracy'), - tf.keras.metrics.SparseTopKCategoricalAccuracy( - k=5, name='top_5_accuracy') - ] - return metrics - - def process_metrics(self, metrics: List[Any], labels: Any, - model_outputs: Any): - """Process and update metrics. - - Called when using custom training loop API. - - Args: - metrics: a nested structure of metrics objects. The return of function - self.build_metrics. - labels: a tensor or a nested structure of tensors. - model_outputs: a tensor or a nested structure of tensors. For example, - output of the keras model built by self.build_model. - """ - for metric in metrics: - metric.update_state(labels, model_outputs) - - def train_step(self, - inputs: Tuple[Any, Any], - model: tf.keras.Model, - optimizer: tf.keras.optimizers.Optimizer, - metrics: Optional[List[Any]] = None): - """Does forward and backward. - - Args: - inputs: a dictionary of input tensors. - model: the model, forward pass definition. - optimizer: the optimizer for this training step. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. 
- """ - features, labels = inputs - input_partition_dims = self.task_config.train_input_partition_dims - if input_partition_dims: - strategy = tf.distribute.get_strategy() - features['image'] = strategy.experimental_split_to_logical_devices( - features['image'], input_partition_dims) - - num_replicas = tf.distribute.get_strategy().num_replicas_in_sync - with tf.GradientTape() as tape: - outputs = model(features, training=True) - # Casting output layer as float32 is necessary when mixed_precision is - # mixed_float16 or mixed_bfloat16 to ensure output is casted as float32. - outputs = tf.nest.map_structure( - lambda x: tf.cast(x, tf.float32), outputs) - - # Computes per-replica loss. - if self._is_multilabel(): - outputs = tf.math.sigmoid(outputs) - else: - outputs = tf.math.softmax(outputs) - all_losses = self.build_losses( - model_outputs=outputs, labels=labels, aux_losses=model.losses) - loss = all_losses[self.loss] - # Scales loss as the default gradients allreduce performs sum inside the - # optimizer. - scaled_loss = loss / num_replicas - - # For mixed_precision policy, when LossScaleOptimizer is used, loss is - # scaled for numerical stability. - if isinstance( - optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - scaled_loss = optimizer.get_scaled_loss(scaled_loss) - - tvars = model.trainable_variables - grads = tape.gradient(scaled_loss, tvars) - # Scales back gradient before apply_gradients when LossScaleOptimizer is - # used. - if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): - grads = optimizer.get_unscaled_gradients(grads) - optimizer.apply_gradients(list(zip(grads, tvars))) - - logs = all_losses - if metrics: - self.process_metrics(metrics, labels, outputs) - logs.update({m.name: m.result() for m in metrics}) - elif model.compiled_metrics: - self.process_compiled_metrics(model.compiled_metrics, labels, outputs) - logs.update({m.name: m.result() for m in model.metrics}) - return logs - - def validation_step(self, - inputs: Tuple[Any, Any], - model: tf.keras.Model, - metrics: Optional[List[Any]] = None): - """Validatation step. - - Args: - inputs: a dictionary of input tensors. - model: the keras.Model. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. - """ - features, labels = inputs - input_partition_dims = self.task_config.eval_input_partition_dims - if input_partition_dims: - strategy = tf.distribute.get_strategy() - features['image'] = strategy.experimental_split_to_logical_devices( - features['image'], input_partition_dims) - - outputs = self.inference_step(features, model) - outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs) - logs = self.build_losses(model_outputs=outputs, labels=labels, - aux_losses=model.losses) - - if metrics: - self.process_metrics(metrics, labels, outputs) - logs.update({m.name: m.result() for m in metrics}) - elif model.compiled_metrics: - self.process_compiled_metrics(model.compiled_metrics, labels, outputs) - logs.update({m.name: m.result() for m in model.metrics}) - return logs - - def inference_step(self, features: tf.Tensor, model: tf.keras.Model): - """Performs the forward step.""" - outputs = model(features, training=False) - if self._is_multilabel(): - outputs = tf.math.sigmoid(outputs) - else: - outputs = tf.math.softmax(outputs) - num_test_views = self._get_num_test_views() - if num_test_views > 1: - # Averaging output probabilities across multiples views. 
- outputs = tf.reshape(outputs, [-1, num_test_views, outputs.shape[-1]]) - outputs = tf.reduce_mean(outputs, axis=1) - return outputs diff --git a/official/vision/beta/train.py b/official/vision/beta/train.py deleted file mode 100644 index cf1501f04..000000000 --- a/official/vision/beta/train.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""TensorFlow Model Garden Vision training driver.""" - -from absl import app -from absl import flags -import gin - -# pylint: disable=unused-import -from official.common import registry_imports -# pylint: enable=unused-import -from official.common import distribute_utils -from official.common import flags as tfm_flags -from official.core import task_factory -from official.core import train_lib -from official.core import train_utils -from official.modeling import performance - -FLAGS = flags.FLAGS - - -def main(_): - gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params) - params = train_utils.parse_configuration(FLAGS) - model_dir = FLAGS.model_dir - if 'train' in FLAGS.mode: - # Pure eval modes do not output yaml files. Otherwise continuous eval job - # may race against the train job for writing the same file. - train_utils.serialize_config(params, model_dir) - - # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16' - # can have significant impact on model speeds by utilizing float16 in case of - # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when - # dtype is float16 - if params.runtime.mixed_precision_dtype: - performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype) - distribution_strategy = distribute_utils.get_distribution_strategy( - distribution_strategy=params.runtime.distribution_strategy, - all_reduce_alg=params.runtime.all_reduce_alg, - num_gpus=params.runtime.num_gpus, - tpu_address=params.runtime.tpu) - with distribution_strategy.scope(): - task = task_factory.get_task(params.task, logging_dir=model_dir) - - train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode=FLAGS.mode, - params=params, - model_dir=model_dir) - - train_utils.save_gin_config(FLAGS.mode, model_dir) - -if __name__ == '__main__': - tfm_flags.define_flags() - flags.mark_flags_as_required(['experiment', 'mode', 'model_dir']) - app.run(main) diff --git a/official/vision/beta/train_spatial_partitioning.py b/official/vision/beta/train_spatial_partitioning.py deleted file mode 100644 index 30bf604fa..000000000 --- a/official/vision/beta/train_spatial_partitioning.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""TensorFlow Model Garden Vision training driver with spatial partitioning.""" -from typing import Sequence - -from absl import app -from absl import flags -import gin -import numpy as np -import tensorflow as tf - -from official.common import registry_imports # pylint: disable=unused-import -from official.common import distribute_utils -from official.common import flags as tfm_flags -from official.core import task_factory -from official.core import train_lib -from official.core import train_utils -from official.modeling import performance - - -FLAGS = flags.FLAGS - - -def get_computation_shape_for_model_parallelism( - input_partition_dims: Sequence[int]) -> Sequence[int]: - """Returns computation shape to be used for TPUStrategy spatial partition. - - Args: - input_partition_dims: The number of partitions along each dimension. - - Returns: - A list of integers specifying the computation shape. - - Raises: - ValueError: If the number of logical devices is not supported. - """ - num_logical_devices = np.prod(input_partition_dims) - if num_logical_devices == 1: - return [1, 1, 1, 1] - elif num_logical_devices == 2: - return [1, 1, 1, 2] - elif num_logical_devices == 4: - return [1, 2, 1, 2] - elif num_logical_devices == 8: - return [2, 2, 1, 2] - elif num_logical_devices == 16: - return [4, 2, 1, 2] - else: - raise ValueError( - 'The number of logical devices %d is not supported. Supported numbers ' - 'are 1, 2, 4, 8, 16' % num_logical_devices) - - -def create_distribution_strategy(distribution_strategy, - tpu_address, - input_partition_dims=None, - num_gpus=None): - """Creates distribution strategy to use for computation.""" - - if input_partition_dims is not None: - if distribution_strategy != 'tpu': - raise ValueError('Spatial partitioning is only supported ' - 'for TPUStrategy.') - - # When `input_partition_dims` is specified create custom TPUStrategy - # instance with computation shape for model parallelism. - resolver = tf.distribute.cluster_resolver.TPUClusterResolver( - tpu=tpu_address) - if tpu_address not in ('', 'local'): - tf.config.experimental_connect_to_cluster(resolver) - - topology = tf.tpu.experimental.initialize_tpu_system(resolver) - num_replicas = resolver.get_tpu_system_metadata().num_cores // np.prod( - input_partition_dims) - device_assignment = tf.tpu.experimental.DeviceAssignment.build( - topology, - num_replicas=num_replicas, - computation_shape=input_partition_dims) - return tf.distribute.TPUStrategy( - resolver, experimental_device_assignment=device_assignment) - - return distribute_utils.get_distribution_strategy( - distribution_strategy=distribution_strategy, - tpu_address=tpu_address, - num_gpus=num_gpus) - - -def main(_): - gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params) - params = train_utils.parse_configuration(FLAGS) - model_dir = FLAGS.model_dir - if 'train' in FLAGS.mode: - # Pure eval modes do not output yaml files. Otherwise continuous eval job - # may race against the train job for writing the same file. - train_utils.serialize_config(params, model_dir) - - # Sets mixed_precision policy. 
Using 'mixed_float16' or 'mixed_bfloat16' - # can have significant impact on model speeds by utilizing float16 in case of - # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when - # dtype is float16 - if params.runtime.mixed_precision_dtype: - performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype) - - input_partition_dims = None - if FLAGS.mode == 'train_and_eval': - if np.prod(params.task.train_input_partition_dims) != np.prod( - params.task.eval_input_partition_dims): - raise ValueError('Train and eval input partition dims can not be' - 'partitioned on the same node') - else: - input_partition_dims = get_computation_shape_for_model_parallelism( - params.task.train_input_partition_dims) - elif FLAGS.mode == 'train': - if params.task.train_input_partition_dims: - input_partition_dims = get_computation_shape_for_model_parallelism( - params.task.train_input_partition_dims) - elif FLAGS.mode == 'eval' or FLAGS.mode == 'continuous_eval': - if params.task.eval_input_partition_dims: - input_partition_dims = get_computation_shape_for_model_parallelism( - params.task.eval_input_partition_dims) - - distribution_strategy = create_distribution_strategy( - distribution_strategy=params.runtime.distribution_strategy, - num_gpus=params.runtime.num_gpus, - input_partition_dims=input_partition_dims, - tpu_address=params.runtime.tpu) - with distribution_strategy.scope(): - task = task_factory.get_task(params.task, logging_dir=model_dir) - - train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode=FLAGS.mode, - params=params, - model_dir=model_dir) - -if __name__ == '__main__': - tfm_flags.define_flags() - app.run(main) -- GitLab From 357fa547057387d4f53dea0d70c2e2f64252189b Mon Sep 17 00:00:00 2001 From: Yeqing Li Date: Fri, 25 Mar 2022 10:45:53 -0700 Subject: [PATCH 54/54] Corrects the video ssl import path. PiperOrigin-RevId: 437284209 --- official/projects/video_ssl/tasks/linear_eval.py | 2 +- official/projects/video_ssl/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/official/projects/video_ssl/tasks/linear_eval.py b/official/projects/video_ssl/tasks/linear_eval.py index 16ec19696..5d7849422 100644 --- a/official/projects/video_ssl/tasks/linear_eval.py +++ b/official/projects/video_ssl/tasks/linear_eval.py @@ -19,7 +19,7 @@ import tensorflow as tf # pylint: disable=unused-import from official.core import task_factory -from official.projects.video_ssl.configs.google import video_ssl as exp_cfg +from official.projects.video_ssl.configs import video_ssl as exp_cfg from official.projects.video_ssl.modeling import video_ssl_model from official.vision.tasks import video_classification diff --git a/official/projects/video_ssl/train.py b/official/projects/video_ssl/train.py index 1a1fb5bbc..5d1f4e8f5 100644 --- a/official/projects/video_ssl/train.py +++ b/official/projects/video_ssl/train.py @@ -27,7 +27,7 @@ from official.core import train_utils from official.modeling import performance from official.projects.video_ssl.modeling import video_ssl_model from official.projects.video_ssl.tasks import linear_eval -from official.projects.video_ssl.tasks.google import pretrain +from official.projects.video_ssl.tasks import pretrain from official.vision import registry_imports # pylint: disable=unused-import -- GitLab