Commit 040a7167 authored by Ahmed Gheith, committed by Facebook GitHub Bot

adhere to lazy import rules

Summary:
Pull Request resolved: https://github.com/facebookresearch/d2go/pull/668

Lazy Imports change `Python` import semantics, specifically around when packages and modules are initialized: https://www.internalfb.com/intern/wiki/Python/Cinder/Onboarding/Tutorial/Lazy_Imports/Troubleshooting/

For example, this pattern is not guaranteed to work:

```
import torch.optim
...
torch.optim._multi_tensor.Adam   # may fail to resolve _multi_tensor
```

And this is guaranteed to work:

```
import torch.optim._multi_tensor
...
torch.optim._multi_tensor.Adam   # will always work
```
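The diff below uses the equivalent `from`-import form, which also binds the submodule eagerly. A minimal sketch of that pattern (the `build_adam` helper and its `lr` default are illustrative only, not part of this change):

```
from torch.optim import _multi_tensor  # explicitly initializes the submodule

def build_adam(params, lr=1e-3):
    # _multi_tensor is already a fully initialized module object here, so
    # attribute access cannot fail the way it can after a lazy `import torch.optim`
    return _multi_tensor.Adam(params, lr=lr)
```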

A recent change to `PyTorch` modified module initialization logic in a way that exposed this issue.

But hasn't this code been working for years? That is the nature of undefined behavior: any change in the environment (in this case, the `PyTorch` code base) can make it fail.

Reviewed By: wat3rBro

Differential Revision: D58876582

fbshipit-source-id: c8f3f53605822517d646e57ddbf4359af54dba0d
parent 8eab506b
@@ -16,6 +16,10 @@ from detectron2.solver.build import (
from detectron2.utils.file_io import PathManager
from detectron2.utils.registry import Registry
+# lazy import doesn't guarantee that sub-packages are implicitly imported
+# https://www.internalfb.com/intern/wiki/Python/Cinder/Onboarding/Tutorial/Lazy_Imports/Troubleshooting/
+from torch.optim import _multi_tensor
D2GO_OPTIM_MAPPER_REGISTRY = Registry("D2GO_OPTIM_MAPPER")
@@ -304,7 +308,7 @@ def sgd_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.SGD)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.SGD)(
params=params,
lr=cfg.SOLVER.BASE_LR,
momentum=cfg.SOLVER.MOMENTUM,
@@ -321,7 +325,7 @@ def adamw_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.AdamW)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.AdamW)(
params=params, lr=cfg.SOLVER.BASE_LR, eps=cfg.SOLVER.EPS
)
@@ -335,7 +339,7 @@ def nadam_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.NAdam)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.NAdam)(
params=params,
lr=cfg.SOLVER.BASE_LR,
)
@@ -350,7 +354,7 @@ def radam_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.RAdam)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.RAdam)(
params=params,
lr=cfg.SOLVER.BASE_LR,
)
@@ -365,7 +369,7 @@ def rmsprop_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.RMSprop)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.RMSprop)(
params=params,
lr=cfg.SOLVER.BASE_LR,
)
@@ -380,7 +384,7 @@ def rprop_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.Rprop)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.Rprop)(
params=params,
lr=cfg.SOLVER.BASE_LR,
)
@@ -395,7 +399,7 @@ def asgd_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.ASGD)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.ASGD)(
params=params,
lr=cfg.SOLVER.BASE_LR,
)
@@ -410,7 +414,7 @@ def adamax_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.Adamax)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.Adamax)(
params=params,
lr=cfg.SOLVER.BASE_LR,
)
@@ -425,7 +429,7 @@ def adadelta_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.Adadelta)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.Adadelta)(
params=params,
lr=cfg.SOLVER.BASE_LR,
)
@@ -440,7 +444,7 @@ def adagrad_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer:
of parameter groups needs to be reduced using `reduce_param_groups`.
"""
params = get_optimizer_param_groups(model, cfg)
-return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.Adagrad)(
+return maybe_add_gradient_clipping(cfg, _multi_tensor.Adagrad)(
params=params,
lr=cfg.SOLVER.BASE_LR,
)