Unverified Commit 7d7edf6d authored by Anupam Bhatnagar, committed by GitHub

Setup pre-commit github action and apply pre-commit to all files (#849)

* adding pre-commit files

* applying pre-commit to all files

* adding no-strict-optional argument to mypy in circle ci config

* fix typo

* updating python versions

* [skip ci] remove extra args

* adding python 3.9

* [skip ci] set pre-commit version in requirements-dev.txt

* set CACHE_VERSION

* move linters from circleci to github actions

* update python version

* update python version in benchmarks_2

* moving to python 3.9.7
parent 6f3931a4
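The "adding pre-commit files" and "set pre-commit version in requirements-dev.txt" steps are not shown in the hunks below, which only contain the reformatted test code. As a point of reference, a minimal sketch of what a `.pre-commit-config.yaml` for a project like this typically contains is given here; the hook repos, ids, and pinned revisions are illustrative assumptions, not the exact file added in this PR:

```yaml
# Illustrative sketch only: hooks and revs are assumptions,
# not the exact configuration committed in this PR.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
  - repo: https://github.com/psf/black
    rev: 21.9b0
    hooks:
      - id: black
  - repo: https://github.com/pycqa/isort
    rev: 5.9.3
    hooks:
      - id: isort
```

Running `pre-commit run --all-files` once against such a config produces exactly the kind of mechanical reflow visible in the diff below: long call sites broken onto one argument per line and docstring spacing normalized.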
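The "Setup pre-commit github action" and "move linters from circleci to github actions" steps amount to a small workflow file. A minimal sketch is shown below, assuming the standard checkout and setup-python actions and the Python 3.9.7 mentioned in the commit message; the file name, job layout, and action versions are assumptions rather than the PR's actual workflow:

```yaml
# .github/workflows/pre-commit.yml -- hypothetical name and wiring.
name: pre-commit
on: [push, pull_request]
jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: "3.9.7"
      - name: Run pre-commit on all files
        run: |
          pip install pre-commit
          pre-commit run --all-files
```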
@@ -263,11 +263,19 @@ def test_deprecated_path():
     # from fairscale.nn.misc.checkpoint_activations import checkpoint_wrapper
     from fairscale.nn import checkpoint_wrapper
-    ffn = nn.Sequential(nn.Linear(32, 128), nn.Dropout(p=0.5), nn.Linear(128, 32),)
+    ffn = nn.Sequential(
+        nn.Linear(32, 128),
+        nn.Dropout(p=0.5),
+        nn.Linear(128, 32),
+    )
     ffn = checkpoint_wrapper(ffn, {})
     # Check if direct import works as before.
-    ffn = nn.Sequential(nn.Linear(32, 128), nn.Dropout(p=0.5), nn.Linear(128, 32),)
+    ffn = nn.Sequential(
+        nn.Linear(32, 128),
+        nn.Dropout(p=0.5),
+        nn.Linear(128, 32),
+    )
     ffn = deprecated_checkpoint_wrapper(ffn, {})
...
@@ -83,7 +83,16 @@ class DistributedTest(unittest.TestCase):
     @classmethod
     def _test_identical_outputs(
-        cls, model_init_fn, config, rank, group, num_steps=2, use_cuda=True, lr=0.01, ref_ddp_fn=None, norm_type=2,
+        cls,
+        model_init_fn,
+        config,
+        rank,
+        group,
+        num_steps=2,
+        use_cuda=True,
+        lr=0.01,
+        ref_ddp_fn=None,
+        norm_type=2,
     ):
         if config.get("mixed_precision", False):
             autocast = True
@@ -265,7 +274,10 @@ CONFIG_OPTIONS = [[dict(zip(keys, config))] for config in itertools.product([Tru
 def rename_test(testcase_func, param_num, param):
-    return "%s_%s" % (testcase_func.__name__, parameterized.to_safe_name(str(param.args)),)
+    return "%s_%s" % (
+        testcase_func.__name__,
+        parameterized.to_safe_name(str(param.args)),
+    )
 class TestComparisonToPyTorchDDP(DistributedTest):
@@ -373,7 +385,11 @@ class TestComparisonToPyTorchDDP(DistributedTest):
     def test_mixture_of_experts_grad_clip_breaks(self):
         config = {"mixed_precision": True}
         test_fn = functools.partial(
-            self._test_identical_outputs, MixtureOfExperts, config, ref_ddp_fn=self._dummy_ddp_fn, norm_type=2,
+            self._test_identical_outputs,
+            MixtureOfExperts,
+            config,
+            ref_ddp_fn=self._dummy_ddp_fn,
+            norm_type=2,
         )
         with self.assertRaises(Exception):
             spawn_and_init(test_fn)
@@ -386,7 +402,10 @@ class TestComparisonToPyTorchDDP(DistributedTest):
     def test_clip_norm_transformer(self, norm_type):
         config = {"mixed_precision": True}
         test_fn = functools.partial(
-            self._test_identical_outputs, TransformerWithSharedParams, config, norm_type=norm_type,
+            self._test_identical_outputs,
+            TransformerWithSharedParams,
+            config,
+            norm_type=norm_type,
         )
         spawn_and_init(test_fn)
@@ -593,7 +612,11 @@ class TransformerWithSharedParams(nn.Module):
         assert d_vocab >= 12  # we use torch.arange(12) as input
         self.embed_tokens = nn.Embedding(d_vocab, d_model)
         self.transformer = nn.Transformer(
-            d_model=d_model, num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=8, dropout=0.1,
+            d_model=d_model,
+            num_encoder_layers=2,
+            num_decoder_layers=2,
+            dim_feedforward=8,
+            dropout=0.1,
         )
         self.output_proj = nn.Linear(d_model, d_vocab)
@@ -642,7 +665,12 @@ class NestedWrappedModule(nn.Module):
         torch.manual_seed(0)  # keep everything deterministic
         self.module = nn.Sequential(
             nn.Linear(8, 4),
-            _maybe_wrap(nn.Sequential(_maybe_wrap(nn.Linear(4, 16)), nn.Linear(16, 16),)),
+            _maybe_wrap(
+                nn.Sequential(
+                    _maybe_wrap(nn.Linear(4, 16)),
+                    nn.Linear(16, 16),
+                )
+            ),
             _maybe_wrap(nn.Linear(16, 4)),
             nn.Linear(4, 8),
         )
...
@@ -43,7 +43,10 @@ class FreezeModel(nn.Module):
 def _freeze_distributed_worker(
-    gpu_id, world_size, tempfile_name, unused,
+    gpu_id,
+    world_size,
+    tempfile_name,
+    unused,
 ):
     torch.cuda.set_device(gpu_id)
@@ -88,7 +91,9 @@ def _freeze_distributed_worker(
 def test_submodule_freezing_weights(temp_files):
     world_size = 2
     mp.spawn(
-        _freeze_distributed_worker, (world_size, temp_files[0], temp_files[1]), nprocs=world_size,
+        _freeze_distributed_worker,
+        (world_size, temp_files[0], temp_files[1]),
+        nprocs=world_size,
     )
@@ -120,7 +125,11 @@ class NestedTrunkModel(nn.Module):
             self._create_block(3, 64, with_fsdp, freeze_after_wrap_fsdp),
             self._create_block(64, 64, with_fsdp, freeze_after_wrap_fsdp),
         )
-        self.head = nn.Sequential(nn.AdaptiveAvgPool2d(output_size=(1, 1)), nn.Flatten(), nn.Linear(64, 10),)
+        self.head = nn.Sequential(
+            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
+            nn.Flatten(),
+            nn.Linear(64, 10),
+        )
         if with_fsdp and freeze_after_wrap_fsdp:
             self.fsdp_wrap()
@@ -135,7 +144,10 @@ class NestedTrunkModel(nn.Module):
         return self.head(self.trunk(x))
     def _create_block(self, in_channels, out_channels, with_fsdp, freeze_after_wrap_fsdp):
-        block = nn.Sequential(nn.Conv2d(in_channels, out_channels, kernel_size=3), nn.ReLU(inplace=True),)
+        block = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, kernel_size=3),
+            nn.ReLU(inplace=True),
+        )
         return block
...
@@ -38,7 +38,8 @@ def temp_files():
 # We only test on GPU since mix-precision only works on GPU.
 @skip_if_no_cuda
 @pytest.mark.parametrize(
-    "fsdp_config", [{}, {"mixed_precision": True}],
+    "fsdp_config",
+    [{}, {"mixed_precision": True}],
 )
 @pytest.mark.parametrize("input_cls", [dict, list])
 def test_input_type(temp_files, fsdp_config, input_cls):
...
@@ -43,7 +43,9 @@ class ConvolutionalModel(nn.Module):
     @staticmethod
     def _conv_block(in_channels: int, out_channels: int):
         return nn.Sequential(
-            nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3)), nn.BatchNorm2d(out_channels), nn.ReLU(),
+            nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3)),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(),
         )
     def forward(self, x):
@@ -78,7 +80,10 @@ def _worker(gpu_id: int, sync_file: str, world_size: int, embedding_size: int, f
     torch.manual_seed(0)
     torch.cuda.set_device(gpu_id)
     torch.distributed.init_process_group(
-        backend="nccl", init_method=f"file://{sync_file}", world_size=world_size, rank=gpu_id,
+        backend="nccl",
+        init_method=f"file://{sync_file}",
+        world_size=world_size,
+        rank=gpu_id,
     )
     process_group = torch.distributed.new_group()
@@ -116,7 +121,8 @@ def _worker(gpu_id: int, sync_file: str, world_size: int, embedding_size: int, f
     # Reconstruct a full checkpoint from the sharded checkpoints
     all_checkpoints = [_load_sharded_checkpoint(rank) for rank in range(world_size)]
     consolidated_checkpoint = FullyShardedDataParallel.consolidate_shard_weights(
-        shard_weights=[c["weights"] for c in all_checkpoints], shard_metadata=[c["meta"] for c in all_checkpoints],
+        shard_weights=[c["weights"] for c in all_checkpoints],
+        shard_metadata=[c["meta"] for c in all_checkpoints],
     )
     # Check that the reconstructed parameters are correct and of the right shape
@@ -159,7 +165,8 @@ def test_consolidation(embedding_size: int, flatten_parameters: bool):
 @skip_if_single_gpu
 class TestConsolidatedWeights(DistributedTest):
     @parameterized.expand(
-        [[True], [False]], name_func=rename_test,
+        [[True], [False]],
+        name_func=rename_test,
     )
     def test_consolidate_weights(self, transformer):
         config = {"mixed_precision": True, "flatten_parameters": True, "compute_dtype": torch.float32}
@@ -186,7 +193,10 @@ class TestConsolidatedWeights(DistributedTest):
         else:
             fsdp = FullyShardedDataParallel(MixtureOfExperts(group, wrapper_config=config)).cuda()
-        optim = Adam(fsdp.parameters(), lr=0.01,)
+        optim = Adam(
+            fsdp.parameters(),
+            lr=0.01,
+        )
         optim.zero_grad()
         with torch.cuda.amp.autocast(enabled=True):
             x = fsdp.module.get_input(torch.device("cuda"))
@@ -207,7 +217,8 @@ class TestConsolidatedWeights(DistributedTest):
             return
         all_checkpoints = [torch.load(p) for p in paths]
         consolidated_checkpoint = FullyShardedDataParallel.consolidate_shard_weights(
-            shard_weights=[c["weights"] for c in all_checkpoints], shard_metadata=[c["meta"] for c in all_checkpoints],
+            shard_weights=[c["weights"] for c in all_checkpoints],
+            shard_metadata=[c["meta"] for c in all_checkpoints],
         )
         full_model_extra = set(full_model_state_dict).difference(set(consolidated_checkpoint))
         consolidated_extra = set(consolidated_checkpoint).difference(set(full_model_state_dict))
...
@@ -76,5 +76,8 @@ def test1(precision, flatten):
     # the tensor dimensions.
     world_size = 2
     mp.spawn(
-        _test_func, args=(world_size, fsdp_config, temp_file_name, unused), nprocs=world_size, join=True,
+        _test_func,
+        args=(world_size, fsdp_config, temp_file_name, unused),
+        nprocs=world_size,
+        join=True,
    )
@@ -247,7 +247,7 @@ def _get_cached_results(
     fp32_reduce_scatter,
     bucket_cap_mb,
 ):
-    """ Cache the training to save time. For DDP, flatten, wrap_bn etc. doesn't matter, so
+    """Cache the training to save time. For DDP, flatten, wrap_bn etc. doesn't matter, so
     the results can be cached.
     """
     if not with_fsdp:
...
@@ -32,7 +32,9 @@ def _test_func(rank, world_size, fsdp_config, tempfile_name, unused):
     class InnerModel(Module):
         def __init__(self):
             super().__init__()
-            self.layers = Sequential(FSDP(Linear(5, 5), **fsdp_config),)
+            self.layers = Sequential(
+                FSDP(Linear(5, 5), **fsdp_config),
+            )
         def forward(self, x):
             return self.layers(x)
@@ -85,5 +87,8 @@ def test(world_size, precision, flatten):
     }
     mp.spawn(
-        _test_func, args=(world_size, fsdp_config, temp_file_name, unused), nprocs=world_size, join=True,
+        _test_func,
+        args=(world_size, fsdp_config, temp_file_name, unused),
+        nprocs=world_size,
+        join=True,
     )
@@ -67,7 +67,15 @@ class DistributedTest(unittest.TestCase):
     @classmethod
     def _test_identical_outputs_eval(
-        cls, model_init_fn, config, rank, group, num_steps=2, use_cuda=True, lr=0.01, ref_ddp_fn=None,
+        cls,
+        model_init_fn,
+        config,
+        rank,
+        group,
+        num_steps=2,
+        use_cuda=True,
+        lr=0.01,
+        ref_ddp_fn=None,
     ):
         if config.get("mixed_precision", False):
             autocast = True
@@ -116,7 +124,10 @@ CONFIG_OPTIONS = [[dict(zip(keys, config))] for config in itertools.product([Tru
 def rename_test(testcase_func, param_num, param):
-    return "%s_%s" % (testcase_func.__name__, parameterized.to_safe_name(str(param.args)),)
+    return "%s_%s" % (
+        testcase_func.__name__,
+        parameterized.to_safe_name(str(param.args)),
+    )
 class TestSsdMemory(DistributedTest):
@@ -284,7 +295,11 @@ class TransformerWithSharedParams(nn.Module):
         assert d_vocab >= 12  # we use torch.arange(12) as input
         self.embed_tokens = nn.Embedding(d_vocab, d_model)
         self.transformer = nn.Transformer(
-            d_model=d_model, num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=8, dropout=0.1,
+            d_model=d_model,
+            num_encoder_layers=2,
+            num_decoder_layers=2,
+            dim_feedforward=8,
+            dropout=0.1,
         )
         self.output_proj = nn.Linear(d_model, d_vocab)
@@ -333,7 +348,12 @@ class NestedWrappedModule(nn.Module):
         torch.manual_seed(0)  # keep everything deterministic
         self.module = nn.Sequential(
             nn.Linear(8, 4),
-            _maybe_wrap(nn.Sequential(_maybe_wrap(nn.Linear(4, 16)), nn.Linear(16, 16),)),
+            _maybe_wrap(
+                nn.Sequential(
+                    _maybe_wrap(nn.Linear(4, 16)),
+                    nn.Linear(16, 16),
+                )
+            ),
             _maybe_wrap(nn.Linear(16, 4)),
             nn.Linear(4, 8),
         )
...
@@ -62,7 +62,10 @@ class TestOptimizerUtils(DistributedTest):
             fsdp = FullyShardedDataParallel(MixtureOfExperts(group, wrapper_config=config)).cuda()
         try:
-            fsdp_optim = optim_fn(fsdp.parameters(), lr=0.01,)
+            fsdp_optim = optim_fn(
+                fsdp.parameters(),
+                lr=0.01,
+            )
             optim_unwrapped = optim_fn(unwrapped_model.parameters(), lr=0.01)
         except TypeError:  # Adadelta
             fsdp_optim = optim_fn(fsdp.parameters())
...
@@ -82,7 +82,11 @@ class Min10:
 def _distributed_worker(
-    gpu_id, world_size, fsdp_config, tempfile, tempfile_rpc,
+    gpu_id,
+    world_size,
+    fsdp_config,
+    tempfile,
+    tempfile_rpc,
 ):
     torch.cuda.set_device(gpu_id)
@@ -252,5 +256,7 @@ def test_forward_overlap(world_size, flatten, mixed):
     }
     with temp_files_ctx(2) as temp_files:
         mp.spawn(
-            _distributed_worker, (world_size, fsdp_config, temp_files[0], temp_files[1]), nprocs=world_size,
+            _distributed_worker,
+            (world_size, fsdp_config, temp_files[0], temp_files[1]),
+            nprocs=world_size,
         )
@@ -79,7 +79,9 @@ class ResBlock(Module):
         self.bn = BatchNorm2d(width_out)
         self.f = Sequential(
             Sequential(  # block a
-                Conv2d(width_in, width_out, (1, 1), (1, 1), bias=False), BatchNorm2d(width_out), ReLU(_relu_inplace),
+                Conv2d(width_in, width_out, (1, 1), (1, 1), bias=False),
+                BatchNorm2d(width_out),
+                ReLU(_relu_inplace),
             ),
             Sequential(  # block b
                 Conv2d(width_out, width_out, (3, 3), (2, 2), (1, 1), groups=2, bias=False),
...
@@ -93,7 +93,9 @@ def test_shared_weight(temp_files, outer_flat, inner_flat, sharing):
     # Run FSDP
     mp.spawn(
-        _dist_worker, (world_size, temp_files, outer_flat, inner_flat, sharing), nprocs=world_size,
+        _dist_worker,
+        (world_size, temp_files, outer_flat, inner_flat, sharing),
+        nprocs=world_size,
     )
...
@@ -109,7 +109,9 @@ def test_shared_weight_mevo(temp_files, wrap_middle):
     # Run FSDP
     mp.spawn(
-        _dist_worker, (world_size, temp_files, wrap_middle), nprocs=world_size,
+        _dist_worker,
+        (world_size, temp_files, wrap_middle),
+        nprocs=world_size,
     )
...
@@ -80,7 +80,8 @@ def _test_func(rank, world_size, model, fsdp_config, tempfile_name, unused, test
 @skip_if_single_gpu
 @pytest.mark.parametrize("test_case", [{"inputs": [torch.rand(8, 3)], "assert_ref_out": True}])
 @pytest.mark.parametrize(
-    "fsdp_config", [{}, {"flatten_parameters": False}, {"mixed_precision": True}],
+    "fsdp_config",
+    [{}, {"flatten_parameters": False}, {"mixed_precision": True}],
 )
 @pytest.mark.parametrize("world_size", list(range(2, 9)))
 def test_one_iteration(world_size, test_case, fsdp_config):
...
@@ -35,8 +35,7 @@ def _get_mlp(tripwire: bool = False):
         return Sequential(Linear(2, 3), Linear(3, 3), Linear(3, 3), Linear(3, 3), Linear(3, 3), Linear(3, 3))
     class Tripwire(torch.nn.Module):
-        """A model made to expose possible corner cases
-        """
+        """A model made to expose possible corner cases"""
         def __init__(self) -> None:
             super().__init__()
@@ -323,7 +322,10 @@ def test_train_eval_change():
     world_size = 4
     with temp_files_ctx(num=1) as temp_files:
         mp.spawn(
-            run_test_train_eval_change, args=(world_size, temp_files[0]), nprocs=world_size, join=True,
+            run_test_train_eval_change,
+            args=(world_size, temp_files[0]),
+            nprocs=world_size,
+            join=True,
         )
...
@@ -169,7 +169,11 @@ def run_ddp_parity(
             def sharded_closure(input_tensor=input_tensor):
                 return closure(
-                    sharded_ddp_model, sharded_scaler, input_tensor, grad_accumulation, _manual_reduction=manual_reduction,
+                    sharded_ddp_model,
+                    sharded_scaler,
+                    input_tensor,
+                    grad_accumulation,
+                    _manual_reduction=manual_reduction,
                 )
             # Step/scale both
@@ -329,7 +333,9 @@ def run_ddp_parity_two_optim(rank, world_size, backend, temp_file_name, reduce_b
         torch.cuda.synchronize(device)
         check_same_model_params(
-            sharded_ddp_model, ddp_model, f"DDP parity two optim test failing, step {i}, buffers {reduce_buffer_size}",
+            sharded_ddp_model,
+            ddp_model,
+            f"DDP parity two optim test failing, step {i}, buffers {reduce_buffer_size}",
         )
     dist.destroy_process_group()
...
@@ -15,7 +15,7 @@ from fairscale.utils.testing import objects_are_equal
 class TestFlattenParams(unittest.TestCase):
-    """ Base test class and used for CPU case. """
+    """Base test class and used for CPU case."""
     def _get_module_init_fns(self):
         return [
@@ -42,7 +42,11 @@ class TestFlattenParams(unittest.TestCase):
     def _get_transformer(self, seed=0):
         torch.manual_seed(seed)  # keep everything deterministic
         module = torch.nn.Transformer(
-            d_model=32, num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=128, dropout=0.1,
+            d_model=32,
+            num_encoder_layers=2,
+            num_decoder_layers=2,
+            dim_feedforward=128,
+            dropout=0.1,
         )
         module.register_buffer("dummy_buffer", torch.tensor(1.0))
...
@@ -161,5 +161,10 @@ def test_double_stash_pop_but_isolated():
     ns2 = Namespace()
     verify_skippables(
-        nn.Sequential(Layer1().isolate(ns1), Layer2().isolate(ns1), Layer3().isolate(ns2), Layer4().isolate(ns2),)
+        nn.Sequential(
+            Layer1().isolate(ns1),
+            Layer2().isolate(ns1),
+            Layer3().isolate(ns2),
+            Layer4().isolate(ns2),
+        )
     )
@@ -97,5 +97,10 @@ def test_correctness(use_fp16, checkpoint, chunks):
     model = _get_model()
     rmodel, ropt, rloss = _train_reg_model(model)
-    pmodel, popt, ploss = _train_pipe_model(model, use_fp16=use_fp16, checkpoint=checkpoint, chunks=chunks,)
+    pmodel, popt, ploss = _train_pipe_model(
+        model,
+        use_fp16=use_fp16,
+        checkpoint=checkpoint,
+        chunks=chunks,
+    )
     _check_parity(rmodel, pmodel, ropt, popt, rloss, ploss)