"...git@developer.sourcefind.cn:chenpangpang/open-webui.git" did not exist on "e2f92f11d72b633070dbff3f15dc19325f9be8a6"
Unverified commit 070df4a0, authored by liuzhe-lz, committed by GitHub

Merge pull request #4291 from microsoft/v2.5

merge v2.5 back to master
parents 821706b8 6a082fe9
@@ -73,6 +73,7 @@ class TestModels(unittest.TestCase, ConvertMixin):

    def test_append_input_tensor(self):
        from typing import List
        class Net(nn.Module):
            def __init__(self, num_nodes):
                super().__init__()
@@ -80,6 +81,7 @@ class TestModels(unittest.TestCase, ConvertMixin):
                self.num_nodes = num_nodes
                for _ in range(num_nodes):
                    self.ops.append(nn.Linear(16, 16))

            def forward(self, x: List[torch.Tensor]):
                state = x
                for ops in self.ops:
@@ -90,5 +92,48 @@ class TestModels(unittest.TestCase, ConvertMixin):
        x = torch.rand((1, 16), dtype=torch.float)
        self.run_test(model, ([x], ))

+    def test_channels_shuffle(self):
+        class Net(nn.Module):
+            def forward(self, x):
+                bs, num_channels, height, width = x.size()
+                x = x.reshape(bs * num_channels // 2, 2, height * width)
+                x = x.permute(1, 0, 2)
+                x = x.reshape(2, -1, num_channels // 2, height, width)
+                return x[0], x[1]
+
+        model = Net()
+        x = torch.rand((1, 64, 224, 224), dtype=torch.float)
+        self.run_test(model, (x, ))
+
+    def test_identity_node(self):
+        class Net(nn.Module):
+            def forward(self, x):
+                return x
+
+        model = Net()
+        x = torch.rand((1, 64, 224, 224), dtype=torch.float)
+        self.run_test(model, (x, ))
+
+    def test_nn_sequential_inherit(self):
+        class ConvBNReLU(nn.Sequential):
+            def __init__(self):
+                super().__init__(
+                    nn.Conv2d(3, 3, 1, 1, bias=False),
+                    nn.BatchNorm2d(3),
+                    nn.ReLU(inplace=False)
+                )
+
+        class Net(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv_bn_relu = ConvBNReLU()
+
+            def forward(self, x):
+                return self.conv_bn_relu(x)
+
+        model = Net()
+        x = torch.rand((1, 3, 224, 224), dtype=torch.float)
+        self.run_test(model, (x, ))

class TestModelsWithShape(TestModels, ConvertWithShapeMixin):
    pass
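
For readers who have not seen the pattern before, the new test_channels_shuffle above exercises the ShuffleNet-style channel shuffle (reshape, transpose, reshape, then split into two branches). A minimal standalone sketch, not part of the commit, that runs the same sequence on a tiny tensor so the semantics are easy to check by hand:

import torch

x = torch.arange(8.).reshape(1, 4, 1, 2)    # bs=1, 4 channels, 1x2 spatial
bs, num_channels, height, width = x.size()
y = x.reshape(bs * num_channels // 2, 2, height * width)   # pair up the channels
y = y.permute(1, 0, 2)                                      # interleave the pairs
y = y.reshape(2, -1, num_channels // 2, height, width)      # two shuffled halves
branch_a, branch_b = y[0], y[1]
print(branch_a.shape, branch_b.shape)   # torch.Size([1, 2, 1, 2]) torch.Size([1, 2, 1, 2])

The test itself only needs this reshape/permute/reshape graph with a tuple output to survive the conversion round trip performed by run_test.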
@@ -9,7 +9,7 @@ import torch.nn.functional as F
import schema
import nni.algorithms.compression.pytorch.pruning as torch_pruner
import nni.algorithms.compression.pytorch.quantization as torch_quantizer
-from nni.compression.pytorch.quantization.utils import calculate_qmin_qmax, get_quant_shape, get_min_max_value
+from nni.compression.pytorch.quantization.utils import calculate_qmin_qmax, get_quant_shape
import math
@@ -398,11 +398,11 @@ class CompressorTestCase(TestCase):
                target_zero_point = torch.ones([2, 1, 1, 1]) * 127
        elif qscheme == 'per_tensor_symmetric':
            if dtype == 'int':
-                target_scale = torch.tensor(18. / 127)
-                target_zero_point = torch.zeros([])
+                target_scale = torch.tensor([18. / 127])
+                target_zero_point = torch.zeros([1])
            else:
-                target_scale = torch.tensor(18. / 127.5)
-                target_zero_point = torch.ones([]) * 127
+                target_scale = torch.tensor([18. / 127.5])
+                target_zero_point = torch.ones([1]) * 127
        elif qscheme == 'per_channel_affine':
            min_val = torch.tensor([0., 0.]).view([2, 1, 1, 1])
            if dtype == 'int':
@@ -413,10 +413,10 @@ class CompressorTestCase(TestCase):
                target_zero_point = 0 - torch.round(min_val / target_scale)
        else:
            if dtype == 'int':
-                target_scale = torch.tensor(18. / 254)
+                target_scale = torch.tensor([18. / 254])
                target_zero_point = -127 - torch.round(0 / target_scale)
            else:
-                target_scale = torch.tensor(18. / 255)
+                target_scale = torch.tensor([18. / 255])
                target_zero_point = 0 - torch.round(0 / target_scale)
        wrapper = getattr(model, name)
        wrapper.module.weight = weight
@@ -434,11 +434,11 @@ class CompressorTestCase(TestCase):
                target_zero_point = torch.ones([1, 1, 1, 1]) * 127
        elif qscheme == 'per_tensor_symmetric':
            if dtype == 'int':
-                target_scale = torch.tensor(15. / 127)
-                target_zero_point = torch.zeros([])
+                target_scale = torch.tensor([15. / 127])
+                target_zero_point = torch.zeros([1])
            else:
-                target_scale = torch.tensor(15. / 127.5)
-                target_zero_point = torch.ones([]) * 127
+                target_scale = torch.tensor([15. / 127.5])
+                target_zero_point = torch.ones([1]) * 127
        elif qscheme == 'per_channel_affine':
            min_val = torch.tensor([0.]).view([1, 1, 1, 1])
            if dtype == 'int':
@@ -449,10 +449,10 @@ class CompressorTestCase(TestCase):
                target_zero_point = 0 - torch.round(min_val / target_scale)
        else:
            if dtype == 'int':
-                target_scale = torch.tensor(15. / 254)
+                target_scale = torch.tensor([15. / 254])
                target_zero_point = -127 - torch.round(0 / target_scale)
            else:
-                target_scale = torch.tensor(15. / 255)
+                target_scale = torch.tensor([15. / 255])
                target_zero_point = 0 - torch.round(0 / target_scale)
        quantizer.quantize_input(inp, wrapper)
        self.assertTrue(torch.equal(getattr(model, name).module.input_scale, target_scale))
@@ -488,7 +488,7 @@ class CompressorTestCase(TestCase):
        assert model.conv2.module.weight_zero_point == 0
        quantizer.quantize_input(input, model.conv2)
        self.assertTrue(torch.allclose(model.conv2.module.input_scale, torch.tensor([4. / 255])))
-        self.assertTrue(torch.equal(model.conv2.module.input_zero_point, torch.tensor(0.)))
+        self.assertTrue(torch.equal(model.conv2.module.input_zero_point, torch.tensor([0.])))

        # range including 0
        weight = torch.tensor([[-1, 2], [3, 5]]).float()
        model.conv2.module.weight = weight
@@ -497,7 +497,7 @@ class CompressorTestCase(TestCase):
        assert model.conv2.module.weight_zero_point in (42, 43)
        quantizer.quantize_input(input, model.conv2)
        self.assertTrue(torch.allclose(model.conv2.module.input_scale, torch.tensor([4. / 255])))
-        self.assertTrue(torch.equal(model.conv2.module.input_zero_point, torch.tensor(0.)))
+        self.assertTrue(torch.equal(model.conv2.module.input_zero_point, torch.tensor([0.])))

        # test value of weight and bias after quantization
        weight = torch.tensor([[1.1287, 2.3456], [3.7814, 5.9723]])
        weight_valid = torch.tensor([[1.1242, 2.3421], [3.7707, 5.9723]])
@@ -513,14 +513,14 @@ class CompressorTestCase(TestCase):
        eps = 1e-7
        x = torch.tensor([[-0.2, 0], [0.1, 0.2]])
        model.relu(x)
-        self.assertTrue(torch.equal(model.relu.module.tracked_min_output, torch.tensor(0.)))
-        self.assertTrue(torch.equal(model.relu.module.tracked_max_output, torch.tensor(0.2)))
+        self.assertTrue(torch.equal(model.relu.module.tracked_min_output, torch.tensor([0.])))
+        self.assertTrue(torch.equal(model.relu.module.tracked_max_output, torch.tensor([0.2])))

        quantizer.step_with_optimizer()
        x = torch.tensor([[0.2, 0.4], [0.6, 0.8]])
        model.relu(x)
-        self.assertTrue(torch.equal(model.relu.module.tracked_min_output, torch.tensor(0.002)))
-        self.assertTrue(torch.equal(model.relu.module.tracked_max_output, torch.tensor(0.2060)))
+        self.assertTrue(torch.equal(model.relu.module.tracked_min_output, torch.tensor([0.002])))
+        self.assertTrue(torch.equal(model.relu.module.tracked_max_output, torch.tensor([0.2060])))

    def test_torch_quantizer_export(self):
        config_list_qat = [{
...
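
All of the expectation changes in this test file follow one pattern: values that used to be compared against zero-dimensional tensors, e.g. torch.tensor(18. / 127), are now compared against one-element tensors, e.g. torch.tensor([18. / 127]). The distinction matters because these assertions use torch.equal, which also compares shapes. A small sketch using nothing beyond stock PyTorch:

import torch

scalar = torch.tensor(0.2)     # zero-dimensional tensor, shape torch.Size([])
vector = torch.tensor([0.2])   # one-element tensor, shape torch.Size([1])

print(scalar.shape, vector.shape)      # torch.Size([]) torch.Size([1])
print(torch.equal(scalar, vector))     # False: torch.equal requires identical shapes
print(torch.allclose(scalar, vector))  # True: allclose broadcasts before comparing

So once the quantizer under test produces scale and zero-point buffers with shape [1] for per-tensor schemes, which is what the updated assertions suggest, the torch.equal checks have to switch to the bracketed literals, while the torch.allclose checks were already shape-agnostic.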
@@ -209,42 +209,39 @@ class TrialDispatcher implements TrainingService {
        }
    }

+    private getStorageService(environmentService: EnvironmentService): StorageService {
+        let storageService: StorageService;
+        if (this.useSharedStorage) {
+            this.log.debug(`TrialDispatcher: use shared storage service.`);
+            storageService = component.get<SharedStorageService>(SharedStorageService).storageService;
+        } else if (environmentService.hasStorageService) {
+            this.log.debug(`TrialDispatcher: use existing storage service.`);
+            storageService = component.get<StorageService>(StorageService);
+        } else {
+            this.log.debug(`TrialDispatcher: create temp storage service to temp folder.`);
+            storageService = new MountedStorageService();
+            const environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp");
+            storageService.initialize(this.config.trialCodeDirectory, environmentLocalTempFolder);
+        }
+        return storageService;
+    }

    public async run(): Promise<void> {
        await Promise.all(this.environmentServiceList.map(env => env.init()));
        for(const environmentService of this.environmentServiceList) {
-            const runnerSettings: RunnerSettings = new RunnerSettings();
-            runnerSettings.nniManagerIP = this.config.nniManagerIp === undefined? await getIPV4Address() : this.config.nniManagerIp;
-            runnerSettings.nniManagerPort = getBasePort() + 1;
-            runnerSettings.commandChannel = environmentService.getCommandChannel.channelName;
-            runnerSettings.enableGpuCollector = this.enableGpuScheduler;
-            runnerSettings.command = this.config.trialCommand;
-            runnerSettings.nniManagerVersion = this.enableVersionCheck ? await getVersion() : '';
-            runnerSettings.logCollection = this.logCollection;
-            runnerSettings.platform = environmentService.getName;
-            runnerSettings.experimentId = this.experimentId;

            await environmentService.getCommandChannel.start();
            this.log.info(`TrialDispatcher: started channel: ${environmentService.getCommandChannel.constructor.name}`);

-            this.log.info(`TrialDispatcher: copying code and settings.`);
-            let storageService: StorageService;
+            this.log.info(`TrialDispatcher: copying code.`);

            if (this.useSharedStorage) {
                if (this.fileCopyCompleted) {
+                    this.log.debug(`TrialDispatcher: file already copy to shared storage.`);
                    continue;
                }
-                this.log.debug(`TrialDispatcher: use shared storage service.`);
-                storageService = component.get<SharedStorageService>(SharedStorageService).storageService;
-            } else if (environmentService.hasStorageService) {
-                this.log.debug(`TrialDispatcher: use existing storage service.`);
-                storageService = component.get<StorageService>(StorageService);
-            } else {
-                this.log.debug(`TrialDispatcher: create temp storage service to temp folder.`);
-                storageService = new MountedStorageService();
-                const environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp");
-                storageService.initialize(this.config.trialCodeDirectory, environmentLocalTempFolder);
            }
+            const storageService: StorageService = this.getStorageService(environmentService);

            // Copy the compressed file to remoteDirectory and delete it
            const codeDir = path.resolve(this.config.trialCodeDirectory);
            const envDir = storageService.joinPath("envs");
@@ -256,9 +253,6 @@ class TrialDispatcher implements TrainingService {
            await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT, installFileName);
            await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT_FOR_WIN, installFileNameForWin);
-            const runnerSettingsConfig = storageService.joinPath(envDir, "settings.json");
-            await storageService.save(JSON.stringify(runnerSettings), runnerSettingsConfig);

            if (this.isDeveloping) {
                let trialToolsPath = path.join(__dirname, "../../../../../tools/nni_trial_tool");
                if (false === fs.existsSync(trialToolsPath)) {
@@ -655,6 +649,27 @@ class TrialDispatcher implements TrainingService {
        }
    }

+    private async setEnvironmentSetting(environment: EnvironmentInformation): Promise<void> {
+        if (environment.environmentService === undefined) {
+            throw new Error(`Environmentservice for ${environment.id} not initialized!`);
+        }
+        const environmentService = environment.environmentService;
+
+        const runnerSettings: RunnerSettings = new RunnerSettings();
+        runnerSettings.nniManagerIP = this.config.nniManagerIp === undefined? await getIPV4Address() : this.config.nniManagerIp;
+        runnerSettings.nniManagerPort = getBasePort() + 1;
+        runnerSettings.commandChannel = environmentService.getCommandChannel.channelName;
+        runnerSettings.enableGpuCollector = this.enableGpuScheduler;
+        runnerSettings.command = this.config.trialCommand;
+        runnerSettings.nniManagerVersion = this.enableVersionCheck ? await getVersion() : '';
+        runnerSettings.logCollection = this.logCollection;
+        runnerSettings.platform = environmentService.getName;
+        runnerSettings.experimentId = this.experimentId;
+
+        const storageService: StorageService = this.getStorageService(environmentService);
+        const envDir = storageService.joinPath("envs");
+        const runnerSettingsConfig = storageService.joinPath(envDir, environment.id, "settings.json");
+        await storageService.save(JSON.stringify(runnerSettings), runnerSettingsConfig);
+    }

    private async requestEnvironment(environmentService: EnvironmentService): Promise<void> {
        if (this.stopping) {
            this.log.info(`Experiment is stopping, stop creating new environment`);
@@ -674,6 +689,8 @@ class TrialDispatcher implements TrainingService {
        environment.command = `mkdir -p envs/${envId} && cd envs/${envId} && ${environment.command}`;
        environment.useSharedStorage = this.useSharedStorage;

+        // Generate setting.json file per environment to avoid conflict
+        await this.setEnvironmentSetting(environment);
        await environmentService.startEnvironment(environment);
        this.environments.set(environment.id, environment);
...
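
The TrialDispatcher change moves RunnerSettings serialization out of run() into the new per-environment setEnvironmentSetting(), so each environment writes its own envs/<environment id>/settings.json instead of every environment sharing a single envs/settings.json; the comment added in requestEnvironment notes this is to avoid conflicts. Purely as an illustration of that layout change, and not NNI's actual implementation (which is the TypeScript above), here is a hypothetical Python sketch with made-up names:

import json
from pathlib import Path

def save_runner_settings(storage_root: Path, environment_id: str, settings: dict) -> Path:
    # One settings.json per environment id, so concurrently started
    # environments cannot overwrite each other's runner configuration.
    settings_path = storage_root / "envs" / environment_id / "settings.json"
    settings_path.parent.mkdir(parents=True, exist_ok=True)
    settings_path.write_text(json.dumps(settings))
    return settings_path

print(save_runner_settings(Path("/tmp/exp"), "abc123", {"experimentId": "demo"}))
# /tmp/exp/envs/abc123/settings.json  (previously a single shared envs/settings.json)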