% tables.tex
% Build each table as its own tightly cropped output using the standalone
% class. Per the standalone documentation, `multi` allows several cropped
% pages in one file and `convert` requests conversion of the result to an
% image format (NOTE(review): conversion normally needs shell-escape and an
% external converter -- confirm against the build setup).
\documentclass[multi,convert]{standalone}
% Provides \multirow, used for the headings that span two heading lines.
\usepackage{multirow}
% Register tabular as a standalone environment so that every tabular below
% is emitted as a separate page.
\standaloneenv{tabular}

\begin{document}

% Model configurations: one row per case, listing its architecture sizes,
% parameter count (in billions) and number of model-parallel partitions.
\begin{tabular}{*{6}{c}}
  % Header row, one cell per source line.
  Case
    & Hidden Size
    & Attention Heads
    & Layers
    & Parameters (billions)
    & Model Parallel Partitions \\
  \hline
  % Case | hidden size | heads | layers | params (B) | partitions
  1B   &   1920   &   15   &   24   &   1.16   &   1 \\
  2B   &   2304   &   18   &   30   &   2.03   &   2 \\
  4B   &   3072   &   24   &   36   &   4.24   &   4 \\
  8B   &   4096   &   32   &   42   &   8.67   &   8 \\
\end{tabular}

% Results for the 1B--8B cases on 1--8 GPUs, side by side for two systems:
% columns 3--5 are DGX-2 (V100) at batch size 8 and columns 6--8 are
% DGX-A100 at batch size 16. Vertical rules separate the two system groups.
\begin{tabular}{cc|ccc|ccc}
  % Group headings, each spanning one system's three measurement columns.
  & & \multicolumn{3}{c|}{\textbf{DGX-2 (V100) batch size 8}} & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 16}} \\
  \hline
  % Column headings take two lines; the one-word headings ("Case",
  % "Scaling") use \multirow so they span both heading lines.
  \multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\
                        & GPUs      & Time (ms) &                          & per GPU   & Time (ms) &                          & per GPU \\
  \hline
  % Scaling percentages carry one decimal place in every row, including the
  % 1B baseline, so both Scaling columns are formatted consistently.
  1B & 1 & 1121 & 100.0\% & 71.9 & 1076 & 100.0\% & 149.8 \\
  2B & 2 & 1093 & 89.6\%  & 64.2 & 1026 & 91.7\%  & 136.8 \\
  4B & 4 & 1238 & 82.5\%  & 58.5 & 1162 & 84.5\%  & 124.7 \\
  8B & 8 & 1407 & 74.3\%  & 52.2 & 1343 & 74.7\%  & 109.3 \\
\end{tabular}

% DGX-A100 runs at batch size 2048 on 128--1024 GPUs: GPU count, iteration
% time, scaling percentage and per-GPU TeraFLOPs for each case.
\begin{tabular}{cc|*{3}{c}}
  % Group heading spanning the three measurement columns.
  & & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 2048}} \\
  \hline
  % Column headings take two lines; \multirow makes the one-word headings
  % span both of them.
  \multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\
  & GPUs & Time (ms) & & per GPU \\
  \hline
  % Case | GPUs | time (ms) | scaling | TeraFLOPs per GPU
  1B &  128 & 1153 & 93.3\% & 139.8 \\
  2B &  256 & 1101 & 85.5\% & 127.5 \\
  4B &  512 & 1242 & 79.0\% & 116.7 \\
  8B & 1024 & 1380 & 72.7\% & 106.5 \\
\end{tabular}

\end{document}