Unverified Commit 6cdf7595 authored by Ziyue Yang, committed by GitHub
Browse files

Benchmarks: Revise Code - Eliminate NUMA binding for device-to-device tests in gpu_copy (#302)

**Description**
This commit removes NUMA binding for device-to-device tests, because NUMA binding does not affect their performance, and revises the benchmark metrics accordingly.
parent 433785fd
...@@ -187,15 +187,15 @@ Measure the memory copy bandwidth performed by GPU SM/DMA engine, including devi ...@@ -187,15 +187,15 @@ Measure the memory copy bandwidth performed by GPU SM/DMA engine, including devi
#### Metrics #### Metrics
| Name | Unit | Description | | Name | Unit | Description |
|------------------------------------------------------------------------------------|------------------|------------------------------------------------------------------------------------------------------------------------------------------| |-------------------------------------------------------------|------------------|------------------------------------------------------------------------------------------------------------------------------------------|
| cpu\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading one NUMA node's host memory using DMA engine or GPU SM. | | cpu\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading one NUMA node's host memory using DMA engine or GPU SM. |
| gpu[0-9]+\_to\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU writing one NUMA node's host memory using DMA engine or GPU SM. | | gpu[0-9]+\_to\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU writing one NUMA node's host memory using DMA engine or GPU SM. |
| gpu[0-9]+\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing self's memory using DMA engine or GPU SM with peer communication enabled. | | gpu[0-9]+\_to\_gpu[0-9]+\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing self's memory using DMA engine or GPU SM. |
| gpu[0-9]+\_to\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_under\_numa[0-9]+\_uni\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. | | gpu[0-9]+\_to\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The unidirectional bandwidth of one GPU reading or writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. |
| cpu\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing one NUMA node's host memory using DMA engine or GPU SM. | | cpu\_and\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing one NUMA node's host memory using DMA engine or GPU SM. |
| gpu[0-9]+\_to\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | Same as above. | | gpu[0-9]+\_and\_cpu\_by\_(sm\|dma)\_under\_numa[0-9]+\_bw | bandwidth (GB/s) | Same as above, but generated by --dtoh --bidirectional. |
| gpu[0-9]+\_to\_gpu[0-9]+\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing self's memory using DMA engine or GPU SM with peer communication enabled. | | gpu[0-9]+\_and\_gpu[0-9]+\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing self's memory using DMA engine or GPU SM. |
| gpu[0-9]+\_to\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_under\_numa[0-9]+\_bi\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. | | gpu[0-9]+\_and\_gpu[0-9]+\_(read\|write)\_by\_(sm\|dma)\_bw | bandwidth (GB/s) | The bidirectional bandwidth of one GPU reading and writing peer GPU's memory using DMA engine or GPU SM with peer communication enabled. |
### `ib-loopback` ### `ib-loopback`
......
...@@ -544,7 +544,7 @@ void PrintResultTag(const BenchArgs &args) { ...@@ -544,7 +544,7 @@ void PrintResultTag(const BenchArgs &args) {
} else { } else {
printf("cpu"); printf("cpu");
} }
printf("_to_"); printf("%s", args.num_subs == 1 ? "_to_" : "_and_");
if (args.subs[0].is_dst_dev_gpu) { if (args.subs[0].is_dst_dev_gpu) {
printf("gpu%d", args.subs[0].dst_gpu_id); printf("gpu%d", args.subs[0].dst_gpu_id);
} else { } else {
...@@ -558,11 +558,9 @@ void PrintResultTag(const BenchArgs &args) { ...@@ -558,11 +558,9 @@ void PrintResultTag(const BenchArgs &args) {
printf("_read"); printf("_read");
} }
} }
printf("_by_%s_under_numa%lu", args.is_sm_copy ? "sm" : "dma", args.numa_id); printf("_by_%s", args.is_sm_copy ? "sm" : "dma");
if (args.num_subs == 1) { if (!args.subs[0].is_src_dev_gpu || !args.subs[0].is_dst_dev_gpu) {
printf("_uni"); printf("_under_numa%lu", args.numa_id);
} else {
printf("_bi");
} }
} }
...@@ -810,13 +808,16 @@ int main(int argc, char **argv) { ...@@ -810,13 +808,16 @@ int main(int argc, char **argv) {
args_list.push_back(args); args_list.push_back(args);
} }
} }
if (args.numa_id != 0) {
continue;
}
// Device-to-device benchmark // Device-to-device benchmark
if (opts.dtod_enabled) { if (opts.dtod_enabled) {
// Scan all peers // Scan all peers
for (int k = 0; k < gpu_count; k++) { for (int k = 0; k < gpu_count; k++) {
// Skip second half for bidirectional test // src_dev_id always <= dst_dev_id for bidirectional test
if (opts.bidirectional_enabled && k > j) { if (opts.bidirectional_enabled && j > k) {
break; continue;
} }
// P2P write // P2P write
ret = EnablePeerAccess(j, k, &can_access); ret = EnablePeerAccess(j, k, &can_access);
......
cpu_to_gpu0_by_sm_under_numa0_uni 26.1736 cpu_to_gpu0_by_sm_under_numa0 26.2409
cpu_to_gpu0_by_dma_under_numa0_uni 26.1878 cpu_to_gpu0_by_dma_under_numa0 26.2387
gpu0_to_cpu_by_sm_under_numa0_uni 5.01589 gpu0_to_cpu_by_sm_under_numa0 5.67346
gpu0_to_cpu_by_dma_under_numa0_uni 21.8659 gpu0_to_cpu_by_dma_under_numa0 25.8516
gpu0_to_gpu0_by_sm_under_numa0_uni 655.759 gpu0_to_gpu0_by_sm 682.667
gpu0_to_gpu0_by_dma_under_numa0_uni 633.325 gpu0_to_gpu0_by_dma 657.332
gpu0_to_gpu1_write_by_sm_under_numa0_uni 250.122 gpu0_to_gpu1_write_by_sm 258.397
gpu0_to_gpu1_write_by_dma_under_numa0_uni 274.951 gpu0_to_gpu1_write_by_dma 279.287
gpu0_to_gpu1_read_by_sm_under_numa0_uni 253.563 gpu0_to_gpu1_read_by_sm 261.856
gpu0_to_gpu1_read_by_dma_under_numa0_uni 264.009 gpu0_to_gpu1_read_by_dma 275.854
cpu_to_gpu1_by_sm_under_numa0_uni 26.187 cpu_to_gpu1_by_sm_under_numa0 26.2401
cpu_to_gpu1_by_dma_under_numa0_uni 26.207 cpu_to_gpu1_by_dma_under_numa0 26.2392
gpu1_to_cpu_by_sm_under_numa0_uni 5.01132 gpu1_to_cpu_by_sm_under_numa0 5.67114
gpu1_to_cpu_by_dma_under_numa0_uni 21.8635 gpu1_to_cpu_by_dma_under_numa0 26.0584
gpu1_to_gpu0_write_by_sm_under_numa0_uni 249.824 gpu1_to_gpu0_write_by_sm 258.729
gpu1_to_gpu0_write_by_dma_under_numa0_uni 275.123 gpu1_to_gpu0_write_by_dma 278.308
gpu1_to_gpu0_read_by_sm_under_numa0_uni 253.469 gpu1_to_gpu0_read_by_sm 261.804
gpu1_to_gpu0_read_by_dma_under_numa0_uni 264.908 gpu1_to_gpu0_read_by_dma 275.825
gpu1_to_gpu1_by_sm_under_numa0_uni 658.338 gpu1_to_gpu1_by_sm 682.311
gpu1_to_gpu1_by_dma_under_numa0_uni 631.148 gpu1_to_gpu1_by_dma 656.673
cpu_to_gpu0_by_sm_under_numa1_uni 26.1542 cpu_to_gpu0_by_sm_under_numa1 26.2414
cpu_to_gpu0_by_dma_under_numa1_uni 26.2007 cpu_to_gpu0_by_dma_under_numa1 26.2332
gpu0_to_cpu_by_sm_under_numa1_uni 5.67356 gpu0_to_cpu_by_sm_under_numa1 6.40701
gpu0_to_cpu_by_dma_under_numa1_uni 21.8599 gpu0_to_cpu_by_dma_under_numa1 26.104
gpu0_to_gpu0_by_sm_under_numa1_uni 656.935 cpu_to_gpu1_by_sm_under_numa1 26.2404
gpu0_to_gpu0_by_dma_under_numa1_uni 631.974 cpu_to_gpu1_by_dma_under_numa1 26.2412
gpu0_to_gpu1_write_by_sm_under_numa1_uni 250.118 gpu1_to_cpu_by_sm_under_numa1 6.40865
gpu0_to_gpu1_write_by_dma_under_numa1_uni 274.778 gpu1_to_cpu_by_dma_under_numa1 26.0804
gpu0_to_gpu1_read_by_sm_under_numa1_uni 253.625 cpu_and_gpu0_by_sm_under_numa0 9.31711
gpu0_to_gpu1_read_by_dma_under_numa1_uni 264.347 cpu_and_gpu0_by_dma_under_numa0 49.4624
cpu_to_gpu1_by_sm_under_numa1_uni 26.1905 gpu0_and_cpu_by_sm_under_numa0 9.32671
cpu_to_gpu1_by_dma_under_numa1_uni 26.2007 gpu0_and_cpu_by_dma_under_numa0 49.4572
gpu1_to_cpu_by_sm_under_numa1_uni 5.67716 gpu0_and_gpu0_by_sm 685.523
gpu1_to_cpu_by_dma_under_numa1_uni 21.8579 gpu0_and_gpu0_by_dma 666.016
gpu1_to_gpu0_write_by_sm_under_numa1_uni 250.064 gpu0_and_gpu1_write_by_sm 440.023
gpu1_to_gpu0_write_by_dma_under_numa1_uni 274.924 gpu0_and_gpu1_write_by_dma 531.244
gpu1_to_gpu0_read_by_sm_under_numa1_uni 253.746 gpu0_and_gpu1_read_by_sm 460.831
gpu1_to_gpu0_read_by_dma_under_numa1_uni 264.256 gpu0_and_gpu1_read_by_dma 526.288
gpu1_to_gpu1_by_sm_under_numa1_uni 655.623 cpu_and_gpu1_by_sm_under_numa0 9.29908
gpu1_to_gpu1_by_dma_under_numa1_uni 634.062 cpu_and_gpu1_by_dma_under_numa0 49.4357
cpu_to_gpu0_by_sm_under_numa0_bi 8.45975 gpu1_and_cpu_by_sm_under_numa0 9.32654
cpu_to_gpu0_by_dma_under_numa0_bi 36.4282 gpu1_and_cpu_by_dma_under_numa0 49.4429
gpu0_to_gpu0_by_sm_under_numa0_bi 689.063 gpu1_and_gpu1_by_sm 672.768
gpu0_to_gpu0_by_dma_under_numa0_bi 661.7 gpu1_and_gpu1_by_dma 665.763
gpu0_to_gpu1_write_by_sm_under_numa0_bi 427.446 cpu_and_gpu0_by_sm_under_numa1 10.2742
gpu0_to_gpu1_write_by_dma_under_numa0_bi 521.577 cpu_and_gpu0_by_dma_under_numa1 49.3646
gpu0_to_gpu1_read_by_sm_under_numa0_bi 446.835 gpu0_and_cpu_by_sm_under_numa1 10.2896
gpu0_to_gpu1_read_by_dma_under_numa0_bi 503.158 gpu0_and_cpu_by_dma_under_numa1 49.3639
cpu_to_gpu1_by_sm_under_numa0_bi 8.4487 cpu_and_gpu1_by_sm_under_numa1 10.2994
cpu_to_gpu1_by_dma_under_numa0_bi 36.4272 cpu_and_gpu1_by_dma_under_numa1 49.3615
cpu_to_gpu0_by_sm_under_numa1_bi 9.36164 gpu1_and_cpu_by_sm_under_numa1 10.2817
cpu_to_gpu0_by_dma_under_numa1_bi 36.411 gpu1_and_cpu_by_dma_under_numa1 49.3653
gpu0_to_gpu0_by_sm_under_numa1_bi 688.156
gpu0_to_gpu0_by_dma_under_numa1_bi 662.077
gpu0_to_gpu1_write_by_sm_under_numa1_bi 427.033
gpu0_to_gpu1_write_by_dma_under_numa1_bi 521.367
gpu0_to_gpu1_read_by_sm_under_numa1_bi 446.179
gpu0_to_gpu1_read_by_dma_under_numa1_bi 503.843
cpu_to_gpu1_by_sm_under_numa1_bi 9.37368
cpu_to_gpu1_by_dma_under_numa1_bi 36.4128
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment