net_switch.cc 14.7 KB
Newer Older
Antoine Kaufmann's avatar
Antoine Kaufmann committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Copyright 2021 Max Planck Institute for Software Systems, and
 * National University of Singapore
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

25
#include <unistd.h>
Jialin Li's avatar
Jialin Li committed
26
#include <pcap/pcap.h>
27
28
29
#include <linux/ip.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>
30
31
32
33

#include <atomic>
#include <cassert>
#include <cerrno>
#include <climits>
#include <csignal>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <unordered_map>
#include <vector>
Jialin Li's avatar
Jialin Li committed
39
40

extern "C" {
41
#include <simbricks/netif/netif.h>
42
#include <simbricks/nicif/nicif.h>
43
#include <simbricks/proto/base.h>
Jialin Li's avatar
Jialin Li committed
44
45
};

46
//#define NETSWITCH_DEBUG
Hejing Li's avatar
Hejing Li committed
47
#define NETSWITCH_STAT
48

Antoine Kaufmann's avatar
Antoine Kaufmann committed
49
50
static uint64_t sync_period = (500 * 1000ULL);  // 500ns
static uint64_t eth_latency = (500 * 1000ULL);  // 500ns
51
static int sync_mode = SIMBRICKS_PROTO_SYNC_SIMBRICKS;
Jialin Li's avatar
Jialin Li committed
52
static pcap_dumper_t *dumpfile = nullptr;
Jialin Li's avatar
Jialin Li committed
53

Hejing Li's avatar
Hejing Li committed
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#ifdef NETSWITCH_STAT
/* Receive-poll counters covering the whole run. */
static uint64_t d2n_poll_total = 0;
static uint64_t d2n_poll_suc = 0;
static uint64_t d2n_poll_sync = 0;

/* The same counters, but only advanced while stat_flag is set. */
static uint64_t s_d2n_poll_total = 0;
static uint64_t s_d2n_poll_suc = 0;
static uint64_t s_d2n_poll_sync = 0;

/*
 * Set from the SIGUSR2 handler and read by the polling code, hence
 * volatile sig_atomic_t rather than a plain int.
 */
static volatile std::sig_atomic_t stat_flag = 0;
#endif

Jialin Li's avatar
Jialin Li committed
69
70
/*
 * MAC address type.
 *
 * Owns a private copy of the six address bytes. The addresses handed to the
 * constructor point into a receive buffer that is released again after the
 * packet has been processed, so a MAC used as a key in the forwarding table
 * must not keep referring to that memory.
 */
struct MAC {
  uint8_t data[6];

  /* Copies the six bytes starting at `src`; `src` must be readable for 6 bytes. */
  explicit MAC(const uint8_t *src) {
    memcpy(data, src, sizeof(data));
  }

  /* Two addresses are equal when all six bytes match. */
  bool operator==(const MAC &other) const {
    return memcmp(data, other.data, sizeof(data)) == 0;
  }
};

namespace std {
/*
 * Hash for MAC keys: 64-bit FNV-1a over the six address bytes. Every step is
 * a bijection of the running state, so addresses that differ in a single byte
 * never collide (the previous shift-and-OR scheme saturated bits and did).
 */
template <>
struct hash<MAC> {
  size_t operator()(const MAC &m) const {
    uint64_t state = 0xcbf29ce484222325ULL;  // FNV offset basis
    for (uint8_t byte : m.data) {
      state ^= byte;
      state *= 0x100000001b3ULL;  // FNV prime
    }
    return static_cast<size_t>(state);
  }
};
}  // namespace std
Jialin Li's avatar
Jialin Li committed
97

98

99
/**
 * Abstract base switch port.
 *
 * The switch only talks to its ports through this interface; the concrete
 * subclasses decide which SimBricks interface backs the port.
 */
class Port {
 public:
  /** Result of one receive poll. */
  enum RxPollState {
    kRxPollSuccess = 0,  // a data packet was returned
    kRxPollFail = 1,     // nothing was returned
    kRxPollSync = 2,     // a sync message was returned
  };

  virtual ~Port() = default;

  /** Set up the port for the socket at `path`; true on success. */
  virtual bool Connect(const char *path, int sync) = 0;

  /** Whether this port takes part in synchronization. */
  virtual bool IsSync() = 0;

  /** Run the port's sync step for time `cur_ts`. */
  virtual void Sync(uint64_t cur_ts) = 0;

  /** Advance the port's epoch for time `cur_ts`. */
  virtual void AdvanceEpoch(uint64_t cur_ts) = 0;

  /** Timestamp the port reports for its next incoming message. */
  virtual uint64_t NextTimestamp() = 0;

  /**
   * Poll for one incoming message. `data` and `len` are only filled in when
   * the result is kRxPollSuccess. Any result other than kRxPollFail must be
   * followed by RxDone().
   */
  virtual RxPollState RxPacket(const void *&data, size_t &len,
                               uint64_t cur_ts) = 0;

  /** Release the message obtained by the last RxPacket() call. */
  virtual void RxDone() = 0;

  /** Send `len` bytes from `data`; false when the packet could not be queued. */
  virtual bool TxPacket(const void *data, size_t len, uint64_t cur_ts) = 0;
};

121

122
123
/** Normal network switch port (conneting to a NIC) */
class NetPort : public Port {
124
125
126
127
128
129
 protected:
  struct SimbricksNetIf netif_;
  volatile union SimbricksProtoNetD2N *rx_;
  int sync_;

 public:
130
  NetPort() : rx_(nullptr), sync_(0) {
131
132
133
    memset(&netif_, 0, sizeof(netif_));
  }

134
  NetPort(const NetPort &other) : netif_(other.netif_), rx_(other.rx_),
135
136
      sync_(other.sync_) {}

137
  virtual bool Connect(const char *path, int sync) override {
138
139
140
141
    sync_ = sync;
    return SimbricksNetIfInit(&netif_, path, &sync_) == 0;
  }

142
  virtual bool IsSync() override {
143
144
145
    return sync_;
  }

146
  virtual void Sync(uint64_t cur_ts) override {
147
148
149
150
151
152
153
    if (SimbricksNetIfN2DSync(&netif_, cur_ts, eth_latency, sync_period,
                              sync_mode) != 0) {
      fprintf(stderr, "SimbricksNetIfN2DSync failed\n");
      abort();
    }
  }

154
  virtual void AdvanceEpoch(uint64_t cur_ts) override {
155
156
157
    SimbricksNetIfAdvanceEpoch(cur_ts, sync_period, sync_mode);
  }

158
  virtual uint64_t NextTimestamp() override {
159
160
161
162
    return SimbricksNetIfD2NTimestamp(&netif_);
  }

  virtual enum RxPollState RxPacket(
163
      const void *& data, size_t &len, uint64_t cur_ts) override {
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
    assert(rx_ == nullptr);

    rx_ = SimbricksNetIfD2NPoll(&netif_, cur_ts);
    if (!rx_)
      return kRxPollFail;

    uint8_t type = rx_->dummy.own_type & SIMBRICKS_PROTO_NET_D2N_MSG_MASK;
    if (type == SIMBRICKS_PROTO_NET_D2N_MSG_SEND) {
      data = (const void *)rx_->send.data;
      len = rx_->send.len;
      return kRxPollSuccess;
    } else if (type == SIMBRICKS_PROTO_NET_D2N_MSG_SYNC) {
      return kRxPollSync;
    } else {
      fprintf(stderr, "switch_pkt: unsupported type=%u\n", type);
      abort();
    }
  }

183
  virtual void RxDone() override {
184
185
186
187
188
189
    assert(rx_ != nullptr);

    SimbricksNetIfD2NDone(&netif_, rx_);
    rx_ = nullptr;
  }

190
191
  virtual bool TxPacket(
      const void *data, size_t len, uint64_t cur_ts) override {
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
    volatile union SimbricksProtoNetN2D *msg_to =
      SimbricksNetIfN2DAlloc(&netif_, cur_ts, eth_latency);
    if (!msg_to)
      return false;

    volatile struct SimbricksProtoNetN2DRecv *rx;
    rx = &msg_to->recv;
    rx->len = len;
    rx->port = 0;
    memcpy((void *)rx->data, data, len);

    // WMB();
    rx->own_type =
        SIMBRICKS_PROTO_NET_N2D_MSG_RECV | SIMBRICKS_PROTO_NET_N2D_OWN_DEV;
    return true;
  }
};

210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317

/** Hosting network switch port (connected to another network) */
class NetHostPort : public Port {
 protected:
  struct SimbricksNicIf nicif_;
  volatile union SimbricksProtoNetN2D *rx_;
  int sync_;

 public:
  NetHostPort() : rx_(nullptr), sync_(0) {
    memset(&nicif_, 0, sizeof(nicif_));
  }

  NetHostPort(const NetHostPort &other) : nicif_(other.nicif_), rx_(other.rx_),
      sync_(other.sync_) {}

  virtual bool Connect(const char *path, int sync) override {
    sync_ = sync;
    std::string shm_path = path;
    shm_path += "-shm";
    struct SimbricksNicIfParams params = {
      .pci_socket_path = nullptr,
      .eth_socket_path = path,
      .shm_path = shm_path.c_str(),

      .pci_latency = 0,
      .eth_latency = eth_latency,
      .sync_delay = sync_period,

      .sync_pci = 0,
      .sync_eth = sync,
      .sync_mode = sync_mode,
    };
    struct SimbricksProtoPcieDevIntro di;
    int ret = SimbricksNicIfInit(&nicif_, &params, &di);
    sync_ = params.sync_eth;
    return ret == 0;
  }

  virtual bool IsSync() override {
    return sync_;
  }

  virtual void Sync(uint64_t cur_ts) override {
    if (SimbricksNicIfSync(&nicif_, cur_ts) != 0) {
      fprintf(stderr, "SimbricksNicIfSync failed\n");
      abort();
    }
  }

  virtual void AdvanceEpoch(uint64_t cur_ts) override {
    SimbricksNicIfAdvanceEpoch(&nicif_, cur_ts);
  }

  virtual uint64_t NextTimestamp() override {
    return SimbricksNicIfNextTimestamp(&nicif_);
  }

  virtual enum RxPollState RxPacket(
      const void *& data, size_t &len, uint64_t cur_ts) override {
    assert(rx_ == nullptr);

    rx_ = SimbricksNicIfN2DPoll(&nicif_, cur_ts);
    if (!rx_)
      return kRxPollFail;

    uint8_t type = rx_->dummy.own_type & SIMBRICKS_PROTO_NET_N2D_MSG_MASK;
    if (type == SIMBRICKS_PROTO_NET_N2D_MSG_RECV) {
      data = (const void *)rx_->recv.data;
      len = rx_->recv.len;
      return kRxPollSuccess;
    } else if (type == SIMBRICKS_PROTO_NET_N2D_MSG_SYNC) {
      return kRxPollSync;
    } else {
      fprintf(stderr, "switch_pkt: unsupported type=%u\n", type);
      abort();
    }
  }

  virtual void RxDone() override {
    assert(rx_ != nullptr);

    SimbricksNicIfN2DDone(&nicif_, rx_);
    SimbricksNicIfN2DNext(&nicif_);
    rx_ = nullptr;
  }

  virtual bool TxPacket(
      const void *data, size_t len, uint64_t cur_ts) override {
    volatile union SimbricksProtoNetD2N *msg_to =
      SimbricksNicIfD2NAlloc(&nicif_, cur_ts);
    if (!msg_to)
      return false;

    volatile struct SimbricksProtoNetD2NSend *rx;
    rx = &msg_to->send;
    rx->len = len;
    rx->port = 0;
    memcpy((void *)rx->data, data, len);

    // WMB();
    rx->own_type =
        SIMBRICKS_PROTO_NET_D2N_MSG_SEND | SIMBRICKS_PROTO_NET_D2N_OWN_NET;
    return true;
  }
};


Jialin Li's avatar
Jialin Li committed
318
319
320
/* Global variables */
static uint64_t cur_ts = 0;
static int exiting = 0;
321
static const uint8_t bcast[6] = {0xFF};
Jialin Li's avatar
Jialin Li committed
322
static const MAC bcast_addr(bcast);
323
static std::vector<Port *> ports;
Jialin Li's avatar
Jialin Li committed
324
325
static std::unordered_map<MAC, int> mac_table;

326
327
/** Signal handler that asks the main polling loop to stop. */
static void sigint_handler(int /* signum */) {
  exiting = 1;
}

Hejing Li's avatar
Hejing Li committed
330
331
332
333
334
335
336
337
338
339
/**
 * SIGUSR1 handler: prints "main_time = <cur_ts>" on stderr.
 *
 * stdio is not async-signal-safe, so the line is formatted by hand into a
 * stack buffer and emitted with a single write(2). errno is preserved for
 * whatever code was interrupted.
 */
static void sigusr1_handler(int dummy) {
  (void)dummy;
  const int saved_errno = errno;

  // 12 prefix characters + at most 20 decimal digits + newline fit easily.
  char buf[40];
  char *const end = buf + sizeof(buf);
  char *pos = end;

  // Build the text back to front: newline, digits, then the prefix.
  *--pos = '\n';
  uint64_t value = cur_ts;
  do {
    *--pos = static_cast<char>('0' + value % 10);
    value /= 10;
  } while (value != 0);

  const char prefix[] = "main_time = ";
  for (size_t i = sizeof(prefix) - 1; i > 0; i--) {
    *--pos = prefix[i - 1];
  }

  // Best effort: nothing sensible can be done here if the write fails.
  ssize_t rc = write(STDERR_FILENO, pos, static_cast<size_t>(end - pos));
  (void)rc;

  errno = saved_errno;
}

#ifdef NETSWITCH_STAT
/** SIGUSR2 handler: raises stat_flag, which enables the s_* poll counters. */
static void sigusr2_handler(int /* signum */) {
  stat_flag = 1;
}
#endif

340
static void forward_pkt(const void *pkt_data, size_t pkt_len, size_t port_id) {
Jialin Li's avatar
Jialin Li committed
341
  struct pcap_pkthdr ph;
342
  Port &dest_port = *ports[port_id];
343

Jialin Li's avatar
Jialin Li committed
344
345
346
347
348
  // log to pcap file if initialized
  if (dumpfile) {
      memset(&ph, 0, sizeof(ph));
      ph.ts.tv_sec = cur_ts / 1000000000000ULL;
      ph.ts.tv_usec = (cur_ts % 1000000000000ULL) / 1000ULL;
349
350
351
      ph.caplen = pkt_len;
      ph.len = pkt_len;
      pcap_dump((unsigned char *)dumpfile, &ph, (unsigned char *)pkt_data);
Jialin Li's avatar
Jialin Li committed
352
  }
353
  // print sending tick: [packet type] source_IP -> dest_IP len:
354

355
356
357
358
#ifdef NETSWITCH_DEBUG
  uint16_t eth_proto;
  struct ethhdr *hdr;
  struct iphdr *iph;
359
  hdr = (struct ethhdr*)pkt_data;
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
  eth_proto = ntohs(hdr->h_proto);
  iph = (struct iphdr *)(hdr + 1);
  fprintf(stderr, "%20lu: ", cur_ts);
  if (eth_proto == ETH_P_IP){
    fprintf(stderr, "[ IP] ");
    
  } 
  else if(eth_proto == ETH_P_ARP){
    fprintf(stderr, "[ARP] ");
  } 
  else{
    fprintf(stderr, "unkwon eth type\n");
  }

  fprintf(stderr, "%8X -> %8X len: %lu\n ", iph->saddr, iph->daddr, iph->tot_len + sizeof(struct ethhdr));
#endif

377
378
  if (!dest_port.TxPacket(pkt_data, pkt_len, cur_ts))
    fprintf(stderr, "forward_pkt: dropping packet on port %zu\n", port_id);
Jialin Li's avatar
Jialin Li committed
379
380
}

381
382
383
384
/**
 * Poll `port` once and process whatever arrives.
 *
 * A data packet updates the MAC table with its source address and is then
 * forwarded: to the learned port when the destination is known, otherwise to
 * every port except the ingress port. Sync messages are only counted. Every
 * polled message is released with RxDone().
 *
 * @param port   port to poll
 * @param iport  index of `port` in `ports`
 */
static void switch_pkt(Port &port, size_t iport) {
  // Length of one MAC address; a frame starts with destination then source.
  const size_t kMacLen = 6;

  const void *pkt_data = nullptr;
  size_t pkt_len = 0;

#ifdef NETSWITCH_STAT
  d2n_poll_total += 1;
  if (stat_flag){
    s_d2n_poll_total += 1;
  }
#endif

  const Port::RxPollState poll = port.RxPacket(pkt_data, pkt_len, cur_ts);
  if (poll == Port::kRxPollFail) {
    // Nothing was polled, so there is nothing to release either.
    return;
  }

#ifdef NETSWITCH_STAT
  d2n_poll_suc += 1;
  if (stat_flag){
    s_d2n_poll_suc += 1;
  }
#endif

  if (poll == Port::kRxPollSuccess) {
    if (pkt_len < 2 * kMacLen) {
      // Too short to contain both MAC addresses: reading them would pick up
      // bytes that are not part of this frame.
      fprintf(stderr, "switch_pkt: dropping runt frame (%zu bytes) on port "
              "%zu\n", pkt_len, iport);
    } else {
      // Get MAC addresses
      const uint8_t *frame = static_cast<const uint8_t *>(pkt_data);
      MAC dst(frame), src(frame + kMacLen);

      // MAC learning
      if (!(src == bcast_addr)) {
        mac_table[src] = static_cast<int>(iport);
      }

      // L2 forwarding
      auto entry = mac_table.find(dst);
      if (entry != mac_table.end()) {
        size_t eport = entry->second;
        forward_pkt(pkt_data, pkt_len, eport);
      } else {
        // Broadcast
        for (size_t eport = 0; eport < ports.size(); eport++) {
          if (eport != iport) {
            // Do not forward to ingress port
            forward_pkt(pkt_data, pkt_len, eport);
          }
        }
      }
    }
  } else if (poll == Port::kRxPollSync) {
#ifdef NETSWITCH_STAT
    d2n_poll_sync += 1;
    if (stat_flag){
      s_d2n_poll_sync += 1;
    }
#endif
  } else {
    fprintf(stderr, "switch_pkt: unsupported poll result=%u\n",
            static_cast<unsigned>(poll));
    abort();
  }
  port.RxDone();
}

439
440
441
int main(int argc, char *argv[]) {
  int c;
  int bad_option = 0;
442
  int sync_eth = 1;
Jialin Li's avatar
Jialin Li committed
443
  pcap_t *pc = nullptr;
444
445

  // Parse command line argument
446
  while ((c = getopt(argc, argv, "s:h:uS:E:m:p:")) != -1 && !bad_option) {
447
448
    switch (c) {
      case 's': {
449
        NetPort *port = new NetPort;
450
        fprintf(stderr, "Switch connecting to: %s\n", optarg);
451
        if (!port->Connect(optarg, sync_eth)) {
452
453
          fprintf(stderr, "connecting to %s failed\n", optarg);
          return EXIT_FAILURE;
Jialin Li's avatar
Jialin Li committed
454
        }
455
        ports.push_back(port);
456
457
458
        break;
      }

459
460
461
462
463
464
465
466
467
468
469
      case 'h': {
        NetHostPort *port = new NetHostPort;
        fprintf(stderr, "Switch listening on: %s\n", optarg);
        if (!port->Connect(optarg, sync_eth)) {
          fprintf(stderr, "listening on %s failed\n", optarg);
          return EXIT_FAILURE;
        }
        ports.push_back(port);
        break;
      }

470
471
472
473
      case 'u':
        sync_eth = 0;
        break;

474
475
476
477
478
479
480
481
482
483
      case 'S':
        sync_period = strtoull(optarg, NULL, 0) * 1000ULL;
        break;

      case 'E':
        eth_latency = strtoull(optarg, NULL, 0) * 1000ULL;
        break;

      case 'm':
        sync_mode = strtol(optarg, NULL, 0);
484
        assert(sync_mode == SIMBRICKS_PROTO_SYNC_SIMBRICKS ||
485
               sync_mode == SIMBRICKS_PROTO_SYNC_BARRIER);
486
487
        break;

Jialin Li's avatar
Jialin Li committed
488
489
490
491
492
493
494
495
496
497
498
      case 'p':
        pc = pcap_open_dead_with_tstamp_precision(DLT_EN10MB, 65535,
                                                  PCAP_TSTAMP_PRECISION_NANO);
        if (pc == nullptr) {
            perror("pcap_open_dead failed");
            return EXIT_FAILURE;
        }

        dumpfile = pcap_dump_open(pc, optarg);
        break;

499
500
501
502
      default:
        fprintf(stderr, "unknown option %c\n", c);
        bad_option = 1;
        break;
Jialin Li's avatar
Jialin Li committed
503
    }
504
505
  }

506
  if (ports.empty() || bad_option) {
507
508
509
510
511
512
513
514
    fprintf(stderr,
            "Usage: net_switch [-S SYNC-PERIOD] [-E ETH-LATENCY] "
            "-s SOCKET-A [-s SOCKET-B ...]\n");
    return EXIT_FAILURE;
  }

  signal(SIGINT, sigint_handler);
  signal(SIGTERM, sigint_handler);
Hejing Li's avatar
Hejing Li committed
515
516
517
518
519
520
  signal(SIGUSR1, sigusr1_handler);

#ifdef NETSWITCH_STAT
  signal(SIGUSR2, sigusr2_handler);
#endif

521
522
523
524

  printf("start polling\n");
  while (!exiting) {
    // Sync all interfaces
525
526
527
528
    for (auto port : ports)
      port->Sync(cur_ts);
    for (auto port : ports)
      port->AdvanceEpoch(cur_ts);
529
530
531
532
533

    // Switch packets
    uint64_t min_ts;
    do {
      min_ts = ULLONG_MAX;
534
      for (size_t port_i = 0; port_i < ports.size(); port_i++) {
535
        auto &port = *ports[port_i];
536
537
538
        switch_pkt(port, port_i);
        if (port.IsSync()) {
          uint64_t ts = port.NextTimestamp();
539
          min_ts = ts < min_ts ? ts : min_ts;
Jialin Li's avatar
Jialin Li committed
540
        }
541
542
543
544
545
      }
    } while (!exiting && (min_ts <= cur_ts));

    // Update cur_ts
    if (min_ts < ULLONG_MAX) {
546
      // a bit broken but should probably do
547
      cur_ts = SimbricksNetIfAdvanceTime(min_ts, sync_period, sync_mode);
Jialin Li's avatar
Jialin Li committed
548
    }
549
  }
Jialin Li's avatar
Jialin Li committed
550

Hejing Li's avatar
Hejing Li committed
551
552
553
554
555
556
557
558
559
560
561
562
563
564
#ifdef NETSWITCH_STAT
  fprintf(stderr, "%20s: %22lu %20s: %22lu  poll_suc_rate: %f\n",
          "d2n_poll_total", d2n_poll_total, "d2n_poll_suc", d2n_poll_suc,
          (double)d2n_poll_suc / d2n_poll_total);
  fprintf(stderr, "%65s: %22lu  sync_rate: %f\n", "d2n_poll_sync",
          d2n_poll_sync, (double)d2n_poll_sync / d2n_poll_suc);

  fprintf(stderr, "%20s: %22lu %20s: %22lu  poll_suc_rate: %f\n",
          "s_d2n_poll_total", s_d2n_poll_total, "s_d2n_poll_suc", s_d2n_poll_suc,
          (double)s_d2n_poll_suc / s_d2n_poll_total);
  fprintf(stderr, "%65s: %22lu  sync_rate: %f\n", "s_d2n_poll_sync",
          s_d2n_poll_sync, (double)s_d2n_poll_sync / s_d2n_poll_suc);
#endif

565
  return 0;
Jialin Li's avatar
Jialin Li committed
566
}