net_switch.cc 14.9 KB
Newer Older
Antoine Kaufmann's avatar
Antoine Kaufmann committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Copyright 2021 Max Planck Institute for Software Systems, and
 * National University of Singapore
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

25
#include <unistd.h>
Jialin Li's avatar
Jialin Li committed
26
#include <pcap/pcap.h>
27
28
29
#include <linux/ip.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>
30
31
32
33

#include <cassert>
#include <climits>
#include <csignal>
Jialin Li's avatar
Jialin Li committed
34
35
36
37
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <unordered_map>
38
#include <vector>
Jialin Li's avatar
Jialin Li committed
39
40

extern "C" {
41
#include <simbricks/netif/netif.h>
42
#include <simbricks/nicif/nicif.h>
43
#include <simbricks/proto/base.h>
Jialin Li's avatar
Jialin Li committed
44
45
};

46
//#define NETSWITCH_DEBUG
Hejing Li's avatar
Hejing Li committed
47
#define NETSWITCH_STAT
48

Antoine Kaufmann's avatar
Antoine Kaufmann committed
49
50
static uint64_t sync_period = (500 * 1000ULL);  // 500ns
static uint64_t eth_latency = (500 * 1000ULL);  // 500ns
51
static int sync_mode = SIMBRICKS_PROTO_SYNC_SIMBRICKS;
Jialin Li's avatar
Jialin Li committed
52
static pcap_dumper_t *dumpfile = nullptr;
Jialin Li's avatar
Jialin Li committed
53

Hejing Li's avatar
Hejing Li committed
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#ifdef NETSWITCH_STAT
#endif

#ifdef NETSWITCH_STAT
/*
 * Poll statistics, updated by switch_pkt() and printed when main() exits.
 *   *_total: number of polls issued
 *   *_suc:   polls that returned something (packet or sync message)
 *   *_sync:  polls that returned a sync message
 */
static uint64_t d2n_poll_total = 0;
static uint64_t d2n_poll_suc = 0;
static uint64_t d2n_poll_sync = 0;

/* Same counters, but only advanced while stat_flag is set. */
static uint64_t s_d2n_poll_total = 0;
static uint64_t s_d2n_poll_suc = 0;
static uint64_t s_d2n_poll_sync = 0;

/*
 * Raised by the SIGUSR2 handler. It is written from a signal handler and read
 * from the main loop, so it has to be a volatile sig_atomic_t object.
 */
static volatile std::sig_atomic_t stat_flag = 0;
#endif

Jialin Li's avatar
Jialin Li committed
69
70
/**
 * MAC address value type, used as key of the MAC learning table.
 *
 * The six address bytes are copied into the object. A MAC must not merely
 * point at the bytes it was built from: addresses are constructed from packet
 * buffers that are released right after switching, while the learning table
 * keeps its keys around for later lookups.
 */
struct MAC {
  /** The address bytes, in the order they appear in the frame. */
  uint8_t data[6];

  /**
   * @param data pointer to at least six readable bytes holding the address;
   *             the bytes are copied, the pointer is not retained.
   */
  explicit MAC(const uint8_t *data) {
    memcpy(this->data, data, sizeof(this->data));
  }

  /** Two addresses are equal iff all six bytes match. */
  bool operator==(const MAC &other) const {
    return memcmp(data, other.data, sizeof(data)) == 0;
  }
};
namespace std {
Antoine Kaufmann's avatar
Antoine Kaufmann committed
86
template <>
87
88
struct hash<MAC> {
  size_t operator()(const MAC &m) const {
Antoine Kaufmann's avatar
Antoine Kaufmann committed
89
90
    size_t res = 0;
    for (int i = 0; i < 6; i++) {
91
      res = (res << 4) | (res ^ m.data[i]);
Antoine Kaufmann's avatar
Antoine Kaufmann committed
92
93
    }
    return res;
94
  }
Antoine Kaufmann's avatar
Antoine Kaufmann committed
95
96
};
}  // namespace std
Jialin Li's avatar
Jialin Li committed
97

98

99
/**
 * Abstract base switch port.
 *
 * Common interface the switch uses to drive a port: connect it, keep it
 * synchronized, poll it for incoming packets and transmit packets on it.
 */
class Port {
 public:
  /** Result of one RxPacket() poll. */
  enum RxPollState {
    kRxPollSuccess = 0,  // a packet was received
    kRxPollFail = 1,     // nothing was received
    kRxPollSync = 2,     // a sync message was received instead of a packet
  };

  virtual ~Port() = default;

  /** Attach the port to the endpoint at `path`; returns true on success. */
  virtual bool Connect(const char *path, int sync) = 0;

  /** Whether this port takes part in synchronization. */
  virtual bool IsSync() = 0;

  /** Run the port's synchronization step for time `cur_ts`. */
  virtual void Sync(uint64_t cur_ts) = 0;

  /** Advance the port's sync epoch for time `cur_ts`. */
  virtual void AdvanceEpoch(uint64_t cur_ts) = 0;

  /** Timestamp of the next incoming message on this port. */
  virtual uint64_t NextTimestamp() = 0;

  /**
   * Poll for one incoming message. On kRxPollSuccess `data` and `len`
   * describe the received packet. The switch calls RxDone() after every poll
   * that did not return kRxPollFail.
   */
  virtual RxPollState RxPacket(const void *&data, size_t &len,
                               uint64_t cur_ts) = 0;

  /** Release the message obtained by the last RxPacket(). */
  virtual void RxDone() = 0;

  /** Send `len` bytes at `data` out of this port; false if it was dropped. */
  virtual bool TxPacket(const void *data, size_t len, uint64_t cur_ts) = 0;
};

121

122
123
/** Normal network switch port (conneting to a NIC) */
class NetPort : public Port {
124
125
126
127
128
129
 protected:
  struct SimbricksNetIf netif_;
  volatile union SimbricksProtoNetD2N *rx_;
  int sync_;

 public:
130
  NetPort() : rx_(nullptr), sync_(0) {
131
132
133
    memset(&netif_, 0, sizeof(netif_));
  }

134
  NetPort(const NetPort &other) : netif_(other.netif_), rx_(other.rx_),
135
136
      sync_(other.sync_) {}

137
  virtual bool Connect(const char *path, int sync) override {
138
139
140
141
    sync_ = sync;
    return SimbricksNetIfInit(&netif_, path, &sync_) == 0;
  }

142
  virtual bool IsSync() override {
143
144
145
    return sync_;
  }

146
  virtual void Sync(uint64_t cur_ts) override {
147
148
    while (SimbricksNetIfN2DSync(&netif_, cur_ts, eth_latency, sync_period,
                              sync_mode));
149
150
  }

151
  virtual void AdvanceEpoch(uint64_t cur_ts) override {
152
153
154
    SimbricksNetIfAdvanceEpoch(cur_ts, sync_period, sync_mode);
  }

155
  virtual uint64_t NextTimestamp() override {
156
157
158
159
    return SimbricksNetIfD2NTimestamp(&netif_);
  }

  virtual enum RxPollState RxPacket(
160
      const void *& data, size_t &len, uint64_t cur_ts) override {
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
    assert(rx_ == nullptr);

    rx_ = SimbricksNetIfD2NPoll(&netif_, cur_ts);
    if (!rx_)
      return kRxPollFail;

    uint8_t type = rx_->dummy.own_type & SIMBRICKS_PROTO_NET_D2N_MSG_MASK;
    if (type == SIMBRICKS_PROTO_NET_D2N_MSG_SEND) {
      data = (const void *)rx_->send.data;
      len = rx_->send.len;
      return kRxPollSuccess;
    } else if (type == SIMBRICKS_PROTO_NET_D2N_MSG_SYNC) {
      return kRxPollSync;
    } else {
      fprintf(stderr, "switch_pkt: unsupported type=%u\n", type);
      abort();
    }
  }

180
  virtual void RxDone() override {
181
182
183
184
185
186
    assert(rx_ != nullptr);

    SimbricksNetIfD2NDone(&netif_, rx_);
    rx_ = nullptr;
  }

187
188
  virtual bool TxPacket(
      const void *data, size_t len, uint64_t cur_ts) override {
189
190
    volatile union SimbricksProtoNetN2D *msg_to =
      SimbricksNetIfN2DAlloc(&netif_, cur_ts, eth_latency);
191
    if (!msg_to && !sync_) {
192
      return false;
193
194
195
196
    } else if (!msg_to && sync_) {
      while (!msg_to)
        msg_to = SimbricksNetIfN2DAlloc(&netif_, cur_ts, eth_latency);
    }
197
198
199
200
201
202
203
204
205
206
207
208
209
    volatile struct SimbricksProtoNetN2DRecv *rx;
    rx = &msg_to->recv;
    rx->len = len;
    rx->port = 0;
    memcpy((void *)rx->data, data, len);

    // WMB();
    rx->own_type =
        SIMBRICKS_PROTO_NET_N2D_MSG_RECV | SIMBRICKS_PROTO_NET_N2D_OWN_DEV;
    return true;
  }
};

210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253

/** Hosting network switch port (connected to another network) */
class NetHostPort : public Port {
 protected:
  struct SimbricksNicIf nicif_;
  volatile union SimbricksProtoNetN2D *rx_;
  int sync_;

 public:
  NetHostPort() : rx_(nullptr), sync_(0) {
    memset(&nicif_, 0, sizeof(nicif_));
  }

  NetHostPort(const NetHostPort &other) : nicif_(other.nicif_), rx_(other.rx_),
      sync_(other.sync_) {}

  virtual bool Connect(const char *path, int sync) override {
    sync_ = sync;
    std::string shm_path = path;
    shm_path += "-shm";
    struct SimbricksNicIfParams params = {
      .pci_socket_path = nullptr,
      .eth_socket_path = path,
      .shm_path = shm_path.c_str(),

      .pci_latency = 0,
      .eth_latency = eth_latency,
      .sync_delay = sync_period,

      .sync_pci = 0,
      .sync_eth = sync,
      .sync_mode = sync_mode,
    };
    struct SimbricksProtoPcieDevIntro di;
    int ret = SimbricksNicIfInit(&nicif_, &params, &di);
    sync_ = params.sync_eth;
    return ret == 0;
  }

  virtual bool IsSync() override {
    return sync_;
  }

  virtual void Sync(uint64_t cur_ts) override {
254
    while (SimbricksNicIfSync(&nicif_, cur_ts));
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
  }

  virtual void AdvanceEpoch(uint64_t cur_ts) override {
    SimbricksNicIfAdvanceEpoch(&nicif_, cur_ts);
  }

  virtual uint64_t NextTimestamp() override {
    return SimbricksNicIfNextTimestamp(&nicif_);
  }

  virtual enum RxPollState RxPacket(
      const void *& data, size_t &len, uint64_t cur_ts) override {
    assert(rx_ == nullptr);

    rx_ = SimbricksNicIfN2DPoll(&nicif_, cur_ts);
    if (!rx_)
      return kRxPollFail;

    uint8_t type = rx_->dummy.own_type & SIMBRICKS_PROTO_NET_N2D_MSG_MASK;
    if (type == SIMBRICKS_PROTO_NET_N2D_MSG_RECV) {
      data = (const void *)rx_->recv.data;
      len = rx_->recv.len;
      return kRxPollSuccess;
    } else if (type == SIMBRICKS_PROTO_NET_N2D_MSG_SYNC) {
      return kRxPollSync;
    } else {
      fprintf(stderr, "switch_pkt: unsupported type=%u\n", type);
      abort();
    }
  }

  virtual void RxDone() override {
    assert(rx_ != nullptr);

    SimbricksNicIfN2DDone(&nicif_, rx_);
    SimbricksNicIfN2DNext(&nicif_);
    rx_ = nullptr;
  }

  virtual bool TxPacket(
      const void *data, size_t len, uint64_t cur_ts) override {
    volatile union SimbricksProtoNetD2N *msg_to =
      SimbricksNicIfD2NAlloc(&nicif_, cur_ts);
298
    if (!msg_to && !sync_) {
299
      return false;
300
301
302
303
    } else if (!msg_to && sync_) {
      while (!msg_to)
        msg_to = SimbricksNicIfD2NAlloc(&nicif_, cur_ts);
    }
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318

    volatile struct SimbricksProtoNetD2NSend *rx;
    rx = &msg_to->send;
    rx->len = len;
    rx->port = 0;
    memcpy((void *)rx->data, data, len);

    // WMB();
    rx->own_type =
        SIMBRICKS_PROTO_NET_D2N_MSG_SEND | SIMBRICKS_PROTO_NET_D2N_OWN_NET;
    return true;
  }
};


Jialin Li's avatar
Jialin Li committed
319
320
321
/* Global variables */
static uint64_t cur_ts = 0;
static int exiting = 0;
322
static const uint8_t bcast[6] = {0xFF};
Jialin Li's avatar
Jialin Li committed
323
static const MAC bcast_addr(bcast);
324
static std::vector<Port *> ports;
Jialin Li's avatar
Jialin Li committed
325
326
static std::unordered_map<MAC, int> mac_table;

327
328
/**
 * Handler installed for SIGINT and SIGTERM: requests termination by raising
 * `exiting`, which the polling loops in main() check.
 */
static void sigint_handler(int dummy) {
  (void)dummy;  // the signal number is not needed
  exiting = 1;
}

Hejing Li's avatar
Hejing Li committed
331
332
333
334
335
336
337
338
339
340
/**
 * SIGUSR1 handler: prints the current simulation time to stderr as
 * "main_time = <cur_ts>\n".
 *
 * stdio is not async-signal-safe (the signal may interrupt another fprintf
 * that holds the stream lock), so the line is formatted by hand and emitted
 * with a single write().
 */
static void sigusr1_handler(int dummy) {
  (void)dummy;  // the signal number is not needed

  static const char kPrefix[] = "main_time = ";
  const size_t prefix_len = sizeof(kPrefix) - 1;

  // Room for the prefix, the 20 decimal digits a 64-bit value can have at
  // most, and the trailing newline. The buffer is filled back to front.
  char line[sizeof(kPrefix) - 1 + 20 + 1];
  size_t pos = sizeof(line);

  line[--pos] = '\n';
  uint64_t value = cur_ts;
  do {
    line[--pos] = static_cast<char>('0' + value % 10);
    value /= 10;
  } while (value != 0);
  for (size_t i = prefix_len; i > 0; i--)
    line[--pos] = kPrefix[i - 1];

  // Best effort: there is nothing sensible to do here if the write fails.
  ssize_t rc = write(STDERR_FILENO, &line[pos], sizeof(line) - pos);
  (void)rc;
}

#ifdef NETSWITCH_STAT
/**
 * SIGUSR2 handler: raises stat_flag. From then on switch_pkt() also advances
 * the s_d2n_* counters.
 */
static void sigusr2_handler(int dummy) {
  (void)dummy;  // the signal number is not needed
  stat_flag = 1;
}
#endif

341
static void forward_pkt(const void *pkt_data, size_t pkt_len, size_t port_id) {
Jialin Li's avatar
Jialin Li committed
342
  struct pcap_pkthdr ph;
343
  Port &dest_port = *ports[port_id];
344

Jialin Li's avatar
Jialin Li committed
345
346
347
348
349
  // log to pcap file if initialized
  if (dumpfile) {
      memset(&ph, 0, sizeof(ph));
      ph.ts.tv_sec = cur_ts / 1000000000000ULL;
      ph.ts.tv_usec = (cur_ts % 1000000000000ULL) / 1000ULL;
350
351
352
      ph.caplen = pkt_len;
      ph.len = pkt_len;
      pcap_dump((unsigned char *)dumpfile, &ph, (unsigned char *)pkt_data);
Jialin Li's avatar
Jialin Li committed
353
  }
354
  // print sending tick: [packet type] source_IP -> dest_IP len:
355

356
357
358
359
#ifdef NETSWITCH_DEBUG
  uint16_t eth_proto;
  struct ethhdr *hdr;
  struct iphdr *iph;
360
  hdr = (struct ethhdr*)pkt_data;
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
  eth_proto = ntohs(hdr->h_proto);
  iph = (struct iphdr *)(hdr + 1);
  fprintf(stderr, "%20lu: ", cur_ts);
  if (eth_proto == ETH_P_IP){
    fprintf(stderr, "[ IP] ");
    
  } 
  else if(eth_proto == ETH_P_ARP){
    fprintf(stderr, "[ARP] ");
  } 
  else{
    fprintf(stderr, "unkwon eth type\n");
  }

  fprintf(stderr, "%8X -> %8X len: %lu\n ", iph->saddr, iph->daddr, iph->tot_len + sizeof(struct ethhdr));
#endif

378
379
  if (!dest_port.TxPacket(pkt_data, pkt_len, cur_ts))
    fprintf(stderr, "forward_pkt: dropping packet on port %zu\n", port_id);
Jialin Li's avatar
Jialin Li committed
380
381
}

382
383
384
385
/**
 * Poll one port and, if a packet arrived, switch it.
 *
 * The source address is learned (unless it equals bcast_addr). The packet is
 * then sent to the port recorded for the destination address, or to all other
 * ports if the destination is not in the table. It is never sent back out of
 * the ingress port. Sync messages are only counted. Whatever was received is
 * released with RxDone() at the end.
 *
 * @param port  port to poll
 * @param iport index of `port` in `ports`
 */
static void switch_pkt(Port &port, size_t iport) {
  const void *frame;
  size_t frame_len;

#ifdef NETSWITCH_STAT
  ++d2n_poll_total;
  if (stat_flag) {
    ++s_d2n_poll_total;
  }
#endif

  const enum Port::RxPollState poll = port.RxPacket(frame, frame_len, cur_ts);
  if (poll == Port::kRxPollFail)
    return;  // nothing received, nothing to release

#ifdef NETSWITCH_STAT
  ++d2n_poll_suc;
  if (stat_flag) {
    ++s_d2n_poll_suc;
  }
#endif

  switch (poll) {
    case Port::kRxPollSuccess: {
      // Get MAC addresses: destination first, source right behind it
      const uint8_t *bytes = (const uint8_t *)frame;
      MAC dst(bytes), src(bytes + 6);

      // MAC learning
      if (!(src == bcast_addr)) {
        mac_table[src] = iport;
      }

      // L2 forwarding
      auto entry = mac_table.find(dst);
      if (entry == mac_table.end()) {
        // Broadcast
        for (size_t eport = 0; eport < ports.size(); eport++) {
          if (eport == iport)
            continue;  // Do not forward to ingress port
          forward_pkt(frame, frame_len, eport);
        }
      } else {
        const size_t eport = entry->second;
        if (eport != iport)
          forward_pkt(frame, frame_len, eport);
      }
      break;
    }

    case Port::kRxPollSync:
#ifdef NETSWITCH_STAT
      ++d2n_poll_sync;
      if (stat_flag) {
        ++s_d2n_poll_sync;
      }
#endif
      break;

    default:
      fprintf(stderr, "switch_pkt: unsupported poll result=%u\n", poll);
      abort();
  }

  port.RxDone();
}

441
442
443
int main(int argc, char *argv[]) {
  int c;
  int bad_option = 0;
444
  int sync_eth = 1;
Jialin Li's avatar
Jialin Li committed
445
  pcap_t *pc = nullptr;
446
447

  // Parse command line argument
448
  while ((c = getopt(argc, argv, "s:h:uS:E:m:p:")) != -1 && !bad_option) {
449
450
    switch (c) {
      case 's': {
451
        NetPort *port = new NetPort;
452
        fprintf(stderr, "Switch connecting to: %s\n", optarg);
453
        if (!port->Connect(optarg, sync_eth)) {
454
455
          fprintf(stderr, "connecting to %s failed\n", optarg);
          return EXIT_FAILURE;
Jialin Li's avatar
Jialin Li committed
456
        }
457
        ports.push_back(port);
458
459
460
        break;
      }

461
462
463
464
465
466
467
468
469
470
471
      case 'h': {
        NetHostPort *port = new NetHostPort;
        fprintf(stderr, "Switch listening on: %s\n", optarg);
        if (!port->Connect(optarg, sync_eth)) {
          fprintf(stderr, "listening on %s failed\n", optarg);
          return EXIT_FAILURE;
        }
        ports.push_back(port);
        break;
      }

472
473
474
475
      case 'u':
        sync_eth = 0;
        break;

476
477
478
479
480
481
482
483
484
485
      case 'S':
        sync_period = strtoull(optarg, NULL, 0) * 1000ULL;
        break;

      case 'E':
        eth_latency = strtoull(optarg, NULL, 0) * 1000ULL;
        break;

      case 'm':
        sync_mode = strtol(optarg, NULL, 0);
486
        assert(sync_mode == SIMBRICKS_PROTO_SYNC_SIMBRICKS ||
487
               sync_mode == SIMBRICKS_PROTO_SYNC_BARRIER);
488
489
        break;

Jialin Li's avatar
Jialin Li committed
490
491
492
493
494
495
496
497
498
499
500
      case 'p':
        pc = pcap_open_dead_with_tstamp_precision(DLT_EN10MB, 65535,
                                                  PCAP_TSTAMP_PRECISION_NANO);
        if (pc == nullptr) {
            perror("pcap_open_dead failed");
            return EXIT_FAILURE;
        }

        dumpfile = pcap_dump_open(pc, optarg);
        break;

501
502
503
504
      default:
        fprintf(stderr, "unknown option %c\n", c);
        bad_option = 1;
        break;
Jialin Li's avatar
Jialin Li committed
505
    }
506
507
  }

508
  if (ports.empty() || bad_option) {
509
510
511
512
513
514
515
516
    fprintf(stderr,
            "Usage: net_switch [-S SYNC-PERIOD] [-E ETH-LATENCY] "
            "-s SOCKET-A [-s SOCKET-B ...]\n");
    return EXIT_FAILURE;
  }

  signal(SIGINT, sigint_handler);
  signal(SIGTERM, sigint_handler);
Hejing Li's avatar
Hejing Li committed
517
518
519
520
521
522
  signal(SIGUSR1, sigusr1_handler);

#ifdef NETSWITCH_STAT
  signal(SIGUSR2, sigusr2_handler);
#endif

523
524
525
526

  printf("start polling\n");
  while (!exiting) {
    // Sync all interfaces
527
528
529
530
    for (auto port : ports)
      port->Sync(cur_ts);
    for (auto port : ports)
      port->AdvanceEpoch(cur_ts);
531
532
533
534
535

    // Switch packets
    uint64_t min_ts;
    do {
      min_ts = ULLONG_MAX;
536
      for (size_t port_i = 0; port_i < ports.size(); port_i++) {
537
        auto &port = *ports[port_i];
538
539
540
        switch_pkt(port, port_i);
        if (port.IsSync()) {
          uint64_t ts = port.NextTimestamp();
541
          min_ts = ts < min_ts ? ts : min_ts;
Jialin Li's avatar
Jialin Li committed
542
        }
543
544
545
546
547
      }
    } while (!exiting && (min_ts <= cur_ts));

    // Update cur_ts
    if (min_ts < ULLONG_MAX) {
548
      // a bit broken but should probably do
549
      cur_ts = SimbricksNetIfAdvanceTime(min_ts, sync_period, sync_mode);
Jialin Li's avatar
Jialin Li committed
550
    }
551
  }
Jialin Li's avatar
Jialin Li committed
552

Hejing Li's avatar
Hejing Li committed
553
554
555
556
557
558
559
560
561
562
563
564
565
566
#ifdef NETSWITCH_STAT
  fprintf(stderr, "%20s: %22lu %20s: %22lu  poll_suc_rate: %f\n",
          "d2n_poll_total", d2n_poll_total, "d2n_poll_suc", d2n_poll_suc,
          (double)d2n_poll_suc / d2n_poll_total);
  fprintf(stderr, "%65s: %22lu  sync_rate: %f\n", "d2n_poll_sync",
          d2n_poll_sync, (double)d2n_poll_sync / d2n_poll_suc);

  fprintf(stderr, "%20s: %22lu %20s: %22lu  poll_suc_rate: %f\n",
          "s_d2n_poll_total", s_d2n_poll_total, "s_d2n_poll_suc", s_d2n_poll_suc,
          (double)s_d2n_poll_suc / s_d2n_poll_total);
  fprintf(stderr, "%65s: %22lu  sync_rate: %f\n", "s_d2n_poll_sync",
          s_d2n_poll_sync, (double)s_d2n_poll_sync / s_d2n_poll_suc);
#endif

567
  return 0;
Jialin Li's avatar
Jialin Li committed
568
}