#!/usr/bin/env python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

""""
This file is the entry point for launching experiments with Implicitron.

Launch Training
---------------
Experiment config .yaml files are located in the
`projects/implicitron_trainer/configs` folder. To launch an experiment,
specify the name of the file. Specific config values can also be overridden
from the command line, for example:

```
./experiment.py --config-name base_config.yaml override.param.one=42 override.param.two=84
```
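
Training runs through a HuggingFace Accelerate `Accelerator` by default.
Setting the `PYTORCH3D_NO_ACCELERATE` environment variable (to any value)
skips this wrapper and runs directly on a single GPU, for example:

```
PYTORCH3D_NO_ACCELERATE=1 ./experiment.py --config-name base_config.yaml
```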

Main functions
---------------
- The Experiment class defines `run`, which creates the model, optimizer, and other
  objects used in training, and then invokes TrainingLoop's `run` function (see the
  sketch below).
- TrainingLoop takes care of the actual training logic: forward and backward passes,
  evaluation and testing, as well as model checkpointing, visualization, and metric
  printing.
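
The same flow can be driven programmatically; a minimal sketch using Hydra's
compose API (the config path and name are the defaults registered at the
bottom of this file):

```
from hydra import compose, initialize

with initialize(config_path="./configs/"):
    cfg = compose(config_name="default_config")
experiment = Experiment(**cfg)
experiment.run()
```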

Outputs
--------
The outputs of the experiment are saved and logged in multiple ways:
  - Checkpoints:
        Model, optimizer and stats are stored in the directory
        named by the `exp_dir` key from the config file / CLI parameters.
  - Stats:
        Stats are logged and plotted to the file "train_stats.pdf" in the
        same directory. The stats are also saved as part of the checkpoint file.
  - Visualizations:
        Predictions are plotted to a visdom server running at the host and
        port specified by the `visdom_server` and `visdom_port` keys in the
        config file.
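
The output location can be overridden from the command line like any other
config value, for example (the path is only illustrative):

```
./experiment.py --config-name base_config.yaml exp_dir=./data/my_experiment/
```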

"""
import logging
import os
import warnings
from dataclasses import field

import hydra
import torch
from accelerate import Accelerator
from omegaconf import DictConfig, OmegaConf
from packaging import version

from pytorch3d.implicitron.dataset.data_source import (
    DataSourceBase,
    ImplicitronDataSource,
)
from pytorch3d.implicitron.models.base_model import ImplicitronModelBase
from pytorch3d.implicitron.models.renderer.multipass_ea import (
    MultiPassEmissionAbsorptionRenderer,
)
from pytorch3d.implicitron.models.renderer.ray_sampler import AdaptiveRaySampler
from pytorch3d.implicitron.tools.config import (
    Configurable,
    expand_args_fields,
    remove_unused_components,
    run_auto_creation,
)

from .impl.model_factory import ModelFactoryBase
from .impl.optimizer_factory import OptimizerFactoryBase
from .impl.training_loop import TrainingLoopBase
from .impl.utils import seed_all_random_engines

logger = logging.getLogger(__name__)

# workaround for https://github.com/facebookresearch/hydra/issues/2262
_RUN = hydra.types.RunMode.RUN

if version.parse(hydra.__version__) < version.Version("1.1"):
    raise ValueError(
        f"Hydra version {hydra.__version__} is too old."
        " (Implicitron requires version 1.1 or later.)"
    )

try:
    # only makes sense in FAIR cluster
    import pytorch3d.implicitron.fair_cluster.slurm  # noqa: F401
except ModuleNotFoundError:
    pass

no_accelerate = os.environ.get("PYTORCH3D_NO_ACCELERATE") is not None


class Experiment(Configurable):
    """
    This class is at the top level of Implicitron's config hierarchy. Its
    members are the high-level components necessary for training an implicit
    rendering network.

    Members:
        data_source: An object that produces datasets and dataloaders.
        model_factory: An object that produces an implicit rendering model as
            well as its corresponding Stats object.
        optimizer_factory: An object that produces the optimizer and lr
            scheduler.
        training_loop: An object that runs training given the outputs produced
            by the data_source, model_factory and optimizer_factory.
        seed: A random seed to ensure reproducibility.
        detect_anomaly: Whether torch.autograd should detect anomalies. Useful
            for debugging, but might slow down the training.
        exp_dir: Root experimentation directory. Checkpoints and training stats
            will be saved here.
    """

    # pyre-fixme[13]: Attribute `data_source` is never initialized.
    data_source: DataSourceBase
    data_source_class_type: str = "ImplicitronDataSource"
    # pyre-fixme[13]: Attribute `model_factory` is never initialized.
    model_factory: ModelFactoryBase
    model_factory_class_type: str = "ImplicitronModelFactory"
    # pyre-fixme[13]: Attribute `optimizer_factory` is never initialized.
    optimizer_factory: OptimizerFactoryBase
    optimizer_factory_class_type: str = "ImplicitronOptimizerFactory"
    # pyre-fixme[13]: Attribute `training_loop` is never initialized.
    training_loop: TrainingLoopBase
    training_loop_class_type: str = "ImplicitronTrainingLoop"

    seed: int = 42
    detect_anomaly: bool = False
    exp_dir: str = "./data/default_experiment/"

    hydra: dict = field(
        default_factory=lambda: {
            "run": {"dir": "."},  # Make hydra not change the working dir.
            "output_subdir": None,  # disable storing the .hydra logs
            "mode": _RUN,
        }
    )

    def __post_init__(self):
        seed_all_random_engines(
            self.seed
        )  # Set all random engine seeds for reproducibility

        run_auto_creation(self)

    def run(self) -> None:
        # Initialize the accelerator if desired.
        if no_accelerate:
            accelerator = None
            device = torch.device("cuda:0")
        else:
            accelerator = Accelerator(device_placement=False)
            logger.info(accelerator.state)
            device = accelerator.device

        logger.info(f"Running experiment on device: {device}")
        os.makedirs(self.exp_dir, exist_ok=True)

        # set the debug mode
        if self.detect_anomaly:
            logger.info("Anomaly detection!")
        torch.autograd.set_detect_anomaly(self.detect_anomaly)

        # Initialize the datasets and dataloaders.
        datasets, dataloaders = self.data_source.get_datasets_and_dataloaders()

        # Init the model and the corresponding Stats object.
        model = self.model_factory(
            accelerator=accelerator,
            exp_dir=self.exp_dir,
        )

        stats = self.training_loop.load_stats(
            log_vars=model.log_vars,
            exp_dir=self.exp_dir,
            resume=self.model_factory.resume,
            resume_epoch=self.model_factory.resume_epoch,  # pyre-ignore [16]
        )
        start_epoch = stats.epoch + 1

        model.to(device)

        # Init the optimizer and LR scheduler.
        optimizer, scheduler = self.optimizer_factory(
            accelerator=accelerator,
            exp_dir=self.exp_dir,
            last_epoch=start_epoch,
            model=model,
            resume=self.model_factory.resume,
            resume_epoch=self.model_factory.resume_epoch,
        )

        # Wrap all modules in the distributed library
        # Note: we don't pass the scheduler to prepare as it
        # doesn't need to be stepped at each optimizer step
        train_loader = dataloaders.train
        val_loader = dataloaders.val
        test_loader = dataloaders.test
        if accelerator is not None:
            (
                model,
                optimizer,
                train_loader,
                val_loader,
            ) = accelerator.prepare(model, optimizer, train_loader, val_loader)

        # Enter the main training loop.
        self.training_loop.run(
            train_loader=train_loader,
            val_loader=val_loader,
            test_loader=test_loader,
            # pyre-ignore[6]
            train_dataset=datasets.train,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            accelerator=accelerator,
            device=device,
            exp_dir=self.exp_dir,
            stats=stats,
            seed=self.seed,
        )


def _setup_envvars_for_cluster() -> bool:
    """
    Prepares to run on cluster if relevant.
    Returns whether FAIR cluster in use.
    """
    # TODO: How much of this is needed in general?

    try:
        import submitit
    except ImportError:
        return False

    try:
        # Only needed when launching on cluster with slurm and submitit
        job_env = submitit.JobEnvironment()
    except RuntimeError:
        return False

    os.environ["LOCAL_RANK"] = str(job_env.local_rank)
    os.environ["RANK"] = str(job_env.global_rank)
    os.environ["WORLD_SIZE"] = str(job_env.num_tasks)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "42918"
    logger.info(
        "Num tasks %s, global_rank %s", job_env.num_tasks, job_env.global_rank
    )

    return True


def dump_cfg(cfg: DictConfig) -> None:
    remove_unused_components(cfg)
    # dump the exp config to the exp dir
    os.makedirs(cfg.exp_dir, exist_ok=True)
    try:
        cfg_filename = os.path.join(cfg.exp_dir, "expconfig.yaml")
        OmegaConf.save(config=cfg, f=cfg_filename)
    except PermissionError:
        warnings.warn("Can't dump config due to insufficient permissions!")


# expand_args_fields processes Experiment's Configurable fields into a proper
# dataclass so it can be registered below as Hydra's default structured config.
expand_args_fields(Experiment)
cs = hydra.core.config_store.ConfigStore.instance()
cs.store(name="default_config", node=Experiment)


@hydra.main(config_path="./configs/", config_name="default_config")
def experiment(cfg: DictConfig) -> None:
    # CUDA_VISIBLE_DEVICES must have been set.

    if "CUDA_DEVICE_ORDER" not in os.environ:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

    if not _setup_envvars_for_cluster():
        logger.info("Running locally")

    # TODO: The following may be needed for hydra/submitit to work
    expand_args_fields(ImplicitronModelBase)
    expand_args_fields(AdaptiveRaySampler)
    expand_args_fields(MultiPassEmissionAbsorptionRenderer)
    expand_args_fields(ImplicitronDataSource)

    experiment = Experiment(**cfg)
    dump_cfg(cfg)
    experiment.run()


if __name__ == "__main__":
    experiment()