net_switch.cc 15.2 KB
Newer Older
Antoine Kaufmann's avatar
Antoine Kaufmann committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Copyright 2021 Max Planck Institute for Software Systems, and
 * National University of Singapore
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

25
#include <unistd.h>
Jialin Li's avatar
Jialin Li committed
26
#include <pcap/pcap.h>
27
28
29
#include <linux/ip.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>
30
31
32
33

#include <cassert>
#include <climits>
#include <csignal>
Jialin Li's avatar
Jialin Li committed
34
35
36
37
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <unordered_map>
38
#include <vector>
Jialin Li's avatar
Jialin Li committed
39
40

extern "C" {
41
#include <simbricks/netif/netif.h>
42
#include <simbricks/nicif/nicif.h>
43
#include <simbricks/proto/base.h>
Jialin Li's avatar
Jialin Li committed
44
45
};

46
//#define NETSWITCH_DEBUG
Hejing Li's avatar
Hejing Li committed
47
#define NETSWITCH_STAT
48

Antoine Kaufmann's avatar
Antoine Kaufmann committed
49
50
static uint64_t sync_period = (500 * 1000ULL);  // 500ns
static uint64_t eth_latency = (500 * 1000ULL);  // 500ns
51
static int sync_mode = SIMBRICKS_PROTO_SYNC_SIMBRICKS;
Jialin Li's avatar
Jialin Li committed
52
static pcap_dumper_t *dumpfile = nullptr;
Jialin Li's avatar
Jialin Li committed
53

Hejing Li's avatar
Hejing Li committed
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#ifdef NETSWITCH_STAT
/* Counters over the whole run, updated by switch_pkt(). */
static uint64_t d2n_poll_total = 0;  // every poll attempt
static uint64_t d2n_poll_suc = 0;    // polls that returned a message
static uint64_t d2n_poll_sync = 0;   // returned messages that were sync

/* Same counters, but only incremented while stat_flag is set. */
static uint64_t s_d2n_poll_total = 0;
static uint64_t s_d2n_poll_suc = 0;
static uint64_t s_d2n_poll_sync = 0;

/*
 * Set to 1 by the SIGUSR2 handler. It is written from a signal handler and
 * read from the main loop, hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t stat_flag = 0;
#endif

Jialin Li's avatar
Jialin Li committed
69
70
/**
 * MAC address type.
 *
 * The six address bytes are copied into the object when it is constructed.
 * MAC values are built from pointers into received packet buffers and are
 * stored as keys of an unordered_map; a key that merely pointed into the
 * packet would change (or dangle) as soon as that buffer is released, which
 * breaks the map. Owning the bytes keeps every stored key stable.
 */
struct MAC {
  /** Address bytes in the order they appeared in the source buffer. */
  uint8_t data[6];

  /** Copy six bytes from `addr`, which must point to at least six bytes. */
  explicit MAC(const uint8_t *addr) {
    std::memcpy(data, addr, sizeof(data));
  }

  /** Two addresses are equal when all six bytes match. */
  bool operator==(const MAC &other) const {
    return std::memcmp(data, other.data, sizeof(data)) == 0;
  }
};

namespace std {
/**
 * Hash for MAC so it can be used as an unordered container key.
 *
 * Uses FNV-1a over the six bytes. Each step (xor byte, multiply by an odd
 * constant) is a bijection on size_t, so two addresses that differ in a
 * single byte always hash differently.
 */
template <>
struct hash<MAC> {
  size_t operator()(const MAC &m) const {
    // 64-bit FNV offset basis / prime, truncated if size_t is narrower.
    size_t res = static_cast<size_t>(14695981039346656037ULL);
    for (int i = 0; i < 6; i++) {
      res ^= m.data[i];
      res *= static_cast<size_t>(1099511628211ULL);
    }
    return res;
  }
};
}  // namespace std
Jialin Li's avatar
Jialin Li committed
97

98

99
/**
 * Abstract base switch port.
 *
 * The switch loop drives every port through this interface: Sync() and
 * AdvanceEpoch() once per outer iteration, then RxPacket()/RxDone() to pull
 * messages and TxPacket() to emit them on other ports.
 */
class Port {
 public:
  /** Outcome of one RxPacket() poll. */
  enum RxPollState {
    kRxPollSuccess = 0,  // a data packet was received
    kRxPollFail = 1,     // nothing available
    kRxPollSync = 2,     // a synchronization message was received
  };

  virtual ~Port() = default;

  /** Attach the port to the endpoint at `path`; returns true on success. */
  virtual bool Connect(const char *path, int sync) = 0;

  /** Whether this port takes part in time synchronization. */
  virtual bool IsSync() = 0;

  /** Send synchronization for time `cur_ts`. */
  virtual void Sync(uint64_t cur_ts) = 0;

  /** Advance the synchronization epoch to `cur_ts`. */
  virtual void AdvanceEpoch(uint64_t cur_ts) = 0;

  /** Timestamp of the next incoming message on this port. */
  virtual uint64_t NextTimestamp() = 0;

  /**
   * Poll for one incoming message. On kRxPollSuccess, `data` and `len`
   * describe the packet. For any result other than kRxPollFail the caller
   * must call RxDone() when finished with the message.
   */
  virtual enum RxPollState RxPacket(
      const void *& data, size_t &len, uint64_t cur_ts) = 0;

  /** Release the message obtained by the last successful RxPacket(). */
  virtual void RxDone() = 0;

  /** Send `len` bytes at `data`; returns false if the packet was dropped. */
  virtual bool TxPacket(const void *data, size_t len, uint64_t cur_ts) = 0;
};

121

122
123
/** Normal network switch port (conneting to a NIC) */
class NetPort : public Port {
124
125
126
127
128
129
 protected:
  struct SimbricksNetIf netif_;
  volatile union SimbricksProtoNetD2N *rx_;
  int sync_;

 public:
130
  NetPort() : rx_(nullptr), sync_(0) {
131
132
133
    memset(&netif_, 0, sizeof(netif_));
  }

134
  NetPort(const NetPort &other) : netif_(other.netif_), rx_(other.rx_),
135
136
      sync_(other.sync_) {}

137
  virtual bool Connect(const char *path, int sync) override {
138
139
140
141
    sync_ = sync;
    return SimbricksNetIfInit(&netif_, path, &sync_) == 0;
  }

142
  virtual bool IsSync() override {
143
144
145
    return sync_;
  }

146
  virtual void Sync(uint64_t cur_ts) override {
147
148
    while (SimbricksNetIfN2DSync(&netif_, cur_ts, eth_latency, sync_period,
                              sync_mode));
149
150
  }

151
  virtual void AdvanceEpoch(uint64_t cur_ts) override {
152
153
154
    SimbricksNetIfAdvanceEpoch(cur_ts, sync_period, sync_mode);
  }

155
  virtual uint64_t NextTimestamp() override {
156
157
158
159
    return SimbricksNetIfD2NTimestamp(&netif_);
  }

  virtual enum RxPollState RxPacket(
160
      const void *& data, size_t &len, uint64_t cur_ts) override {
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
    assert(rx_ == nullptr);

    rx_ = SimbricksNetIfD2NPoll(&netif_, cur_ts);
    if (!rx_)
      return kRxPollFail;

    uint8_t type = rx_->dummy.own_type & SIMBRICKS_PROTO_NET_D2N_MSG_MASK;
    if (type == SIMBRICKS_PROTO_NET_D2N_MSG_SEND) {
      data = (const void *)rx_->send.data;
      len = rx_->send.len;
      return kRxPollSuccess;
    } else if (type == SIMBRICKS_PROTO_NET_D2N_MSG_SYNC) {
      return kRxPollSync;
    } else {
      fprintf(stderr, "switch_pkt: unsupported type=%u\n", type);
      abort();
    }
  }

180
  virtual void RxDone() override {
181
182
183
184
185
186
    assert(rx_ != nullptr);

    SimbricksNetIfD2NDone(&netif_, rx_);
    rx_ = nullptr;
  }

187
188
  virtual bool TxPacket(
      const void *data, size_t len, uint64_t cur_ts) override {
189
190
    volatile union SimbricksProtoNetN2D *msg_to =
      SimbricksNetIfN2DAlloc(&netif_, cur_ts, eth_latency);
191
    if (!msg_to && !sync_) {
192
      return false;
193
194
195
196
    } else if (!msg_to && sync_) {
      while (!msg_to)
        msg_to = SimbricksNetIfN2DAlloc(&netif_, cur_ts, eth_latency);
    }
197
198
199
200
201
202
203
204
205
206
207
208
209
    volatile struct SimbricksProtoNetN2DRecv *rx;
    rx = &msg_to->recv;
    rx->len = len;
    rx->port = 0;
    memcpy((void *)rx->data, data, len);

    // WMB();
    rx->own_type =
        SIMBRICKS_PROTO_NET_N2D_MSG_RECV | SIMBRICKS_PROTO_NET_N2D_OWN_DEV;
    return true;
  }
};

210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253

/** Hosting network switch port (connected to another network) */
class NetHostPort : public Port {
 protected:
  struct SimbricksNicIf nicif_;
  volatile union SimbricksProtoNetN2D *rx_;
  int sync_;

 public:
  NetHostPort() : rx_(nullptr), sync_(0) {
    memset(&nicif_, 0, sizeof(nicif_));
  }

  NetHostPort(const NetHostPort &other) : nicif_(other.nicif_), rx_(other.rx_),
      sync_(other.sync_) {}

  virtual bool Connect(const char *path, int sync) override {
    sync_ = sync;
    std::string shm_path = path;
    shm_path += "-shm";
    struct SimbricksNicIfParams params = {
      .pci_socket_path = nullptr,
      .eth_socket_path = path,
      .shm_path = shm_path.c_str(),

      .pci_latency = 0,
      .eth_latency = eth_latency,
      .sync_delay = sync_period,

      .sync_pci = 0,
      .sync_eth = sync,
      .sync_mode = sync_mode,
    };
    struct SimbricksProtoPcieDevIntro di;
    int ret = SimbricksNicIfInit(&nicif_, &params, &di);
    sync_ = params.sync_eth;
    return ret == 0;
  }

  virtual bool IsSync() override {
    return sync_;
  }

  virtual void Sync(uint64_t cur_ts) override {
254
    while (SimbricksNicIfSync(&nicif_, cur_ts));
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
  }

  virtual void AdvanceEpoch(uint64_t cur_ts) override {
    SimbricksNicIfAdvanceEpoch(&nicif_, cur_ts);
  }

  virtual uint64_t NextTimestamp() override {
    return SimbricksNicIfNextTimestamp(&nicif_);
  }

  virtual enum RxPollState RxPacket(
      const void *& data, size_t &len, uint64_t cur_ts) override {
    assert(rx_ == nullptr);

    rx_ = SimbricksNicIfN2DPoll(&nicif_, cur_ts);
    if (!rx_)
      return kRxPollFail;

    uint8_t type = rx_->dummy.own_type & SIMBRICKS_PROTO_NET_N2D_MSG_MASK;
    if (type == SIMBRICKS_PROTO_NET_N2D_MSG_RECV) {
      data = (const void *)rx_->recv.data;
      len = rx_->recv.len;
      return kRxPollSuccess;
    } else if (type == SIMBRICKS_PROTO_NET_N2D_MSG_SYNC) {
      return kRxPollSync;
    } else {
      fprintf(stderr, "switch_pkt: unsupported type=%u\n", type);
      abort();
    }
  }

  virtual void RxDone() override {
    assert(rx_ != nullptr);

    SimbricksNicIfN2DDone(&nicif_, rx_);
    SimbricksNicIfN2DNext(&nicif_);
    rx_ = nullptr;
  }

  virtual bool TxPacket(
      const void *data, size_t len, uint64_t cur_ts) override {
    volatile union SimbricksProtoNetD2N *msg_to =
      SimbricksNicIfD2NAlloc(&nicif_, cur_ts);
298
    if (!msg_to && !sync_) {
299
      return false;
300
301
302
303
    } else if (!msg_to && sync_) {
      while (!msg_to)
        msg_to = SimbricksNicIfD2NAlloc(&nicif_, cur_ts);
    }
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318

    volatile struct SimbricksProtoNetD2NSend *rx;
    rx = &msg_to->send;
    rx->len = len;
    rx->port = 0;
    memcpy((void *)rx->data, data, len);

    // WMB();
    rx->own_type =
        SIMBRICKS_PROTO_NET_D2N_MSG_SEND | SIMBRICKS_PROTO_NET_D2N_OWN_NET;
    return true;
  }
};


Jialin Li's avatar
Jialin Li committed
319
320
321
/* Global variables */
static uint64_t cur_ts = 0;
static int exiting = 0;
322
static const uint8_t bcast[6] = {0xFF};
Jialin Li's avatar
Jialin Li committed
323
static const MAC bcast_addr(bcast);
324
static std::vector<Port *> ports;
Jialin Li's avatar
Jialin Li committed
325
326
static std::unordered_map<MAC, int> mac_table;

327
328
/* Signal handler (installed for SIGINT and SIGTERM): ask the main loop to
 * stop by setting the `exiting` flag. */
static void sigint_handler(int dummy) {
  (void)dummy;
  exiting = 1;
}

Hejing Li's avatar
Hejing Li committed
331
332
333
334
335
336
337
338
339
340
/* SIGUSR1 handler: print the current value of cur_ts to stderr.
 * NOTE(review): fprintf is not async-signal-safe; kept as-is. */
static void sigusr1_handler(int /* signo */) {
  fprintf(stderr, "main_time = %lu\n", cur_ts);
}

#ifdef NETSWITCH_STAT
/* SIGUSR2 handler: switch on the second set of statistics counters. */
static void sigusr2_handler(int /* signo */) {
  stat_flag = 1;
}
#endif

341
342
static void forward_pkt(const void *pkt_data, size_t pkt_len, size_t port_id,
                        size_t iport_id) {
Jialin Li's avatar
Jialin Li committed
343
  struct pcap_pkthdr ph;
344
  Port &dest_port = *ports[port_id];
345

Jialin Li's avatar
Jialin Li committed
346
347
348
349
350
  // log to pcap file if initialized
  if (dumpfile) {
      memset(&ph, 0, sizeof(ph));
      ph.ts.tv_sec = cur_ts / 1000000000000ULL;
      ph.ts.tv_usec = (cur_ts % 1000000000000ULL) / 1000ULL;
351
352
353
      ph.caplen = pkt_len;
      ph.len = pkt_len;
      pcap_dump((unsigned char *)dumpfile, &ph, (unsigned char *)pkt_data);
Jialin Li's avatar
Jialin Li committed
354
  }
355
  // print sending tick: [packet type] source_IP -> dest_IP len:
356

357
358
359
360
#ifdef NETSWITCH_DEBUG
  uint16_t eth_proto;
  struct ethhdr *hdr;
  struct iphdr *iph;
361
  hdr = (struct ethhdr*)pkt_data;
362
363
  eth_proto = ntohs(hdr->h_proto);
  iph = (struct iphdr *)(hdr + 1);
364
365
366
367
  uint64_t dmac = (*(uint64_t *) hdr->h_dest) & 0xFFFFFFFFFFULL;
  uint64_t smac = (*(uint64_t *) hdr->h_source) & 0xFFFFFFFFFFULL;
  fprintf(stderr, "%20lu: [P %zu -> %zu] %lx -> %lx ", cur_ts, iport_id,
          port_id, smac, dmac);
368
369
  if (eth_proto == ETH_P_IP){
    fprintf(stderr, "[ IP] ");
370
371
    fprintf(stderr, "%8X -> %8X len: %lu\n", iph->saddr, iph->daddr,
            ntohs(iph->tot_len) + sizeof(struct ethhdr));
372
373
  } 
  else if(eth_proto == ETH_P_ARP){
374
375
376
    fprintf(stderr, "[ARP] %8X -> %8X\n",
            *(uint32_t *) ((uint8_t *) pkt_data + 28),
            *(uint32_t *) ((uint8_t *) pkt_data + 38) );
377
378
  } 
  else{
379
    fprintf(stderr, "unknown eth type\n");
380
381
382
  }
#endif

383
384
  if (!dest_port.TxPacket(pkt_data, pkt_len, cur_ts))
    fprintf(stderr, "forward_pkt: dropping packet on port %zu\n", port_id);
Jialin Li's avatar
Jialin Li committed
385
386
}

387
388
389
390
/**
 * Poll `port` once and process whatever arrives.
 *
 * A data packet has its source MAC learned (unless it equals the broadcast
 * address) and is then forwarded: to the learned port of the destination MAC
 * if there is one and it differs from the ingress port, otherwise to every
 * port except the ingress port. Sync messages are only counted. In both
 * cases the message is released with RxDone() afterwards.
 *
 * @param port   port to poll
 * @param iport  index of `port` in `ports`
 */
static void switch_pkt(Port &port, size_t iport) {
  const void *pkt_data = nullptr;
  size_t pkt_len = 0;

#ifdef NETSWITCH_STAT
  d2n_poll_total += 1;
  if (stat_flag) {
    s_d2n_poll_total += 1;
  }
#endif

  enum Port::RxPollState poll = port.RxPacket(pkt_data, pkt_len, cur_ts);
  if (poll == Port::kRxPollFail) {
    // Nothing received, so there is nothing to release either.
    return;
  }

#ifdef NETSWITCH_STAT
  // Counts every received message, data and sync alike.
  d2n_poll_suc += 1;
  if (stat_flag) {
    s_d2n_poll_suc += 1;
  }
#endif

  if (poll == Port::kRxPollSuccess) {
    // Get MAC addresses: destination is bytes 0-5, source is bytes 6-11
    MAC dst((const uint8_t *)pkt_data), src((const uint8_t *)pkt_data + 6);

    // MAC learning
    if (!(src == bcast_addr)) {
      mac_table[src] = iport;
    }

    // L2 forwarding
    auto entry = mac_table.find(dst);
    if (entry != mac_table.end()) {
      size_t eport = entry->second;
      // A packet whose destination lives on the ingress port is not sent
      // back out.
      if (eport != iport)
        forward_pkt(pkt_data, pkt_len, eport, iport);
    } else {
      // Broadcast
      for (size_t eport = 0; eport < ports.size(); eport++) {
        if (eport != iport) {
          // Do not forward to ingress port
          forward_pkt(pkt_data, pkt_len, eport, iport);
        }
      }
    }
  } else if (poll == Port::kRxPollSync) {
#ifdef NETSWITCH_STAT
    d2n_poll_sync += 1;
    if (stat_flag) {
      s_d2n_poll_sync += 1;
    }
#endif
  } else {
    fprintf(stderr, "switch_pkt: unsupported poll result=%u\n", poll);
    abort();
  }
  port.RxDone();
}

446
447
448
int main(int argc, char *argv[]) {
  int c;
  int bad_option = 0;
449
  int sync_eth = 1;
Jialin Li's avatar
Jialin Li committed
450
  pcap_t *pc = nullptr;
451
452

  // Parse command line argument
453
  while ((c = getopt(argc, argv, "s:h:uS:E:m:p:")) != -1 && !bad_option) {
454
455
    switch (c) {
      case 's': {
456
        NetPort *port = new NetPort;
457
        fprintf(stderr, "Switch connecting to: %s\n", optarg);
458
        if (!port->Connect(optarg, sync_eth)) {
459
460
          fprintf(stderr, "connecting to %s failed\n", optarg);
          return EXIT_FAILURE;
Jialin Li's avatar
Jialin Li committed
461
        }
462
        ports.push_back(port);
463
464
465
        break;
      }

466
467
468
469
470
471
472
473
474
475
476
      case 'h': {
        NetHostPort *port = new NetHostPort;
        fprintf(stderr, "Switch listening on: %s\n", optarg);
        if (!port->Connect(optarg, sync_eth)) {
          fprintf(stderr, "listening on %s failed\n", optarg);
          return EXIT_FAILURE;
        }
        ports.push_back(port);
        break;
      }

477
478
479
480
      case 'u':
        sync_eth = 0;
        break;

481
482
483
484
485
486
487
488
489
490
      case 'S':
        sync_period = strtoull(optarg, NULL, 0) * 1000ULL;
        break;

      case 'E':
        eth_latency = strtoull(optarg, NULL, 0) * 1000ULL;
        break;

      case 'm':
        sync_mode = strtol(optarg, NULL, 0);
491
        assert(sync_mode == SIMBRICKS_PROTO_SYNC_SIMBRICKS ||
492
               sync_mode == SIMBRICKS_PROTO_SYNC_BARRIER);
493
494
        break;

Jialin Li's avatar
Jialin Li committed
495
496
497
498
499
500
501
502
503
504
505
      case 'p':
        pc = pcap_open_dead_with_tstamp_precision(DLT_EN10MB, 65535,
                                                  PCAP_TSTAMP_PRECISION_NANO);
        if (pc == nullptr) {
            perror("pcap_open_dead failed");
            return EXIT_FAILURE;
        }

        dumpfile = pcap_dump_open(pc, optarg);
        break;

506
507
508
509
      default:
        fprintf(stderr, "unknown option %c\n", c);
        bad_option = 1;
        break;
Jialin Li's avatar
Jialin Li committed
510
    }
511
512
  }

513
  if (ports.empty() || bad_option) {
514
515
516
517
518
519
520
521
    fprintf(stderr,
            "Usage: net_switch [-S SYNC-PERIOD] [-E ETH-LATENCY] "
            "-s SOCKET-A [-s SOCKET-B ...]\n");
    return EXIT_FAILURE;
  }

  signal(SIGINT, sigint_handler);
  signal(SIGTERM, sigint_handler);
Hejing Li's avatar
Hejing Li committed
522
523
524
525
526
527
  signal(SIGUSR1, sigusr1_handler);

#ifdef NETSWITCH_STAT
  signal(SIGUSR2, sigusr2_handler);
#endif

528
529
530
531

  printf("start polling\n");
  while (!exiting) {
    // Sync all interfaces
532
533
534
535
    for (auto port : ports)
      port->Sync(cur_ts);
    for (auto port : ports)
      port->AdvanceEpoch(cur_ts);
536
537
538
539
540

    // Switch packets
    uint64_t min_ts;
    do {
      min_ts = ULLONG_MAX;
541
      for (size_t port_i = 0; port_i < ports.size(); port_i++) {
542
        auto &port = *ports[port_i];
543
544
545
        switch_pkt(port, port_i);
        if (port.IsSync()) {
          uint64_t ts = port.NextTimestamp();
546
          min_ts = ts < min_ts ? ts : min_ts;
Jialin Li's avatar
Jialin Li committed
547
        }
548
549
550
551
552
      }
    } while (!exiting && (min_ts <= cur_ts));

    // Update cur_ts
    if (min_ts < ULLONG_MAX) {
553
      // a bit broken but should probably do
554
      cur_ts = SimbricksNetIfAdvanceTime(min_ts, sync_period, sync_mode);
Jialin Li's avatar
Jialin Li committed
555
    }
556
  }
Jialin Li's avatar
Jialin Li committed
557

Hejing Li's avatar
Hejing Li committed
558
559
560
561
562
563
564
565
566
567
568
569
570
571
#ifdef NETSWITCH_STAT
  fprintf(stderr, "%20s: %22lu %20s: %22lu  poll_suc_rate: %f\n",
          "d2n_poll_total", d2n_poll_total, "d2n_poll_suc", d2n_poll_suc,
          (double)d2n_poll_suc / d2n_poll_total);
  fprintf(stderr, "%65s: %22lu  sync_rate: %f\n", "d2n_poll_sync",
          d2n_poll_sync, (double)d2n_poll_sync / d2n_poll_suc);

  fprintf(stderr, "%20s: %22lu %20s: %22lu  poll_suc_rate: %f\n",
          "s_d2n_poll_total", s_d2n_poll_total, "s_d2n_poll_suc", s_d2n_poll_suc,
          (double)s_d2n_poll_suc / s_d2n_poll_total);
  fprintf(stderr, "%65s: %22lu  sync_rate: %f\n", "s_d2n_poll_sync",
          s_d2n_poll_sync, (double)s_d2n_poll_sync / s_d2n_poll_suc);
#endif

572
  return 0;
Jialin Li's avatar
Jialin Li committed
573
}