arch_comparison.d2 1.43 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Lay the diagram out left-to-right.
direction: right

# Aggregated panel: a frontend feeds a router, which fans out to four workers.
aggregated: Aggregated {
  # Requested panel size (the disaggregated panel uses the same values).
  width: 600; height: 450

  # Nodes: every box is 180x60 with a 20pt label.
  frontend: Frontend {width: 180; height: 60; style.font-size: 20}
  router: Router {width: 180; height: 60; style.font-size: 20}
  w1: "W1 (TP2)" {width: 180; height: 60; style.font-size: 20}
  w2: "W2 (TP2)" {width: 180; height: 60; style.font-size: 20}
  w3: "W3 (TP2)" {width: 180; height: 60; style.font-size: 20}
  w4: "W4 (TP2)" {width: 180; height: 60; style.font-size: 20}

  # Request path: frontend -> router, then one edge from the router to each worker.
  frontend -> router -> w1
  router -> w2
  router -> w3
  router -> w4

  # Free-standing markdown caption for this panel.
  note: |md
    Each worker handles both prefill and decode.
  | {
    style: {
      font-size: 18
    }
  }
}

# Disaggregated panel: the router feeds two prefill workers, which both hand off
# to a single decode worker.
disaggregated: Disaggregated {
  # Requested panel size (the aggregated panel uses the same values).
  width: 600; height: 450

  # Nodes: frontend/router are 180x60; the worker boxes are 220x60. All labels are 20pt.
  frontend: Frontend {width: 180; height: 60; style.font-size: 20}
  router: Router {width: 180; height: 60; style.font-size: 20}
  p1: "Prefill 1 (TP2)" {width: 220; height: 60; style.font-size: 20}
  p2: "Prefill 2 (TP2)" {width: 220; height: 60; style.font-size: 20}
  decode: "Decode (TP4)" {width: 220; height: 60; style.font-size: 20}

  # Request path: frontend -> router, then one edge from the router to each prefill worker.
  frontend -> router -> p1
  router -> p2

  # Hand-off edges from each prefill worker to the decode worker; kept as
  # separate statements so each edge carries its own label.
  p1 -> decode: "KV Cache via RDMA"
  p2 -> decode: "KV Cache via RDMA"

  # Free-standing markdown caption for this panel.
  note: |md
    Prefill and decode on separate workers.
  | {
    style: {
      font-size: 18
    }
  }
}

# Title font size for the two top-level panels.
aggregated.style: {
  font-size: 24
}
disaggregated.style: {
  font-size: 24
}