// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

/*
 * AF_XDP user-space access library.
 *
 * Copyright(c) 2018 - 2019 Intel Corporation.
 *
 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
 */

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <asm/barrier.h>
#include <linux/compiler.h>
#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/if_xdp.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/sockios.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <linux/if_link.h>

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "xsk.h"
#include "bpf_util.h"

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

#ifndef AF_XDP
#define AF_XDP 44
#endif

#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif

#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)

enum xsk_prog {
	XSK_PROG_FALLBACK,
	XSK_PROG_REDIRECT_FLAGS,
};

struct xsk_umem {
	struct xsk_ring_prod *fill_save;
	struct xsk_ring_cons *comp_save;
	char *umem_area;
	struct xsk_umem_config config;
	int fd;
	int refcount;
	struct list_head ctx_list;
	bool rx_ring_setup_done;
	bool tx_ring_setup_done;
};

struct xsk_ctx {
	struct xsk_ring_prod *fill;
	struct xsk_ring_cons *comp;
	__u32 queue_id;
	struct xsk_umem *umem;
	int refcount;
	int ifindex;
	struct list_head list;
	int prog_fd;
	int link_fd;
	int xsks_map_fd;
	char ifname[IFNAMSIZ];
	bool has_bpf_link;
};

struct xsk_socket {
	struct xsk_ring_cons *rx;
	struct xsk_ring_prod *tx;
	__u64 outstanding_tx;
	struct xsk_ctx *ctx;
	struct xsk_socket_config config;
	int fd;
};

struct xsk_nl_info {
	bool xdp_prog_attached;
	int ifindex;
	int fd;
};

/* Up until and including Linux 5.3 */
struct xdp_ring_offset_v1 {
	__u64 producer;
	__u64 consumer;
	__u64 desc;
};

/* Up until and including Linux 5.3 */
struct xdp_mmap_offsets_v1 {
	struct xdp_ring_offset_v1 rx;
	struct xdp_ring_offset_v1 tx;
	struct xdp_ring_offset_v1 fr;
	struct xdp_ring_offset_v1 cr;
};

int xsk_umem__fd(const struct xsk_umem *umem)
{
	return umem ? umem->fd : -EINVAL;
}

int xsk_socket__fd(const struct xsk_socket *xsk)
{
	return xsk ? xsk->fd : -EINVAL;
}

static bool xsk_page_aligned(void *buffer)
{
	unsigned long addr = (unsigned long)buffer;

	return !(addr & (getpagesize() - 1));
}

static void xsk_set_umem_config(struct xsk_umem_config *cfg,
				const struct xsk_umem_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
		return;
	}

	cfg->fill_size = usr_cfg->fill_size;
	cfg->comp_size = usr_cfg->comp_size;
	cfg->frame_size = usr_cfg->frame_size;
	cfg->frame_headroom = usr_cfg->frame_headroom;
	cfg->flags = usr_cfg->flags;
}

static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
				     const struct xsk_socket_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->libbpf_flags = 0;
		cfg->xdp_flags = 0;
		cfg->bind_flags = 0;
		return 0;
	}

	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
		return -EINVAL;

	cfg->rx_size = usr_cfg->rx_size;
	cfg->tx_size = usr_cfg->tx_size;
	cfg->libbpf_flags = usr_cfg->libbpf_flags;
	cfg->xdp_flags = usr_cfg->xdp_flags;
	cfg->bind_flags = usr_cfg->bind_flags;

	return 0;
}
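
/* Illustrative sketch only (not used by this file): callers that want
 * something other than the defaults applied above pass explicit configs to
 * xsk_umem__create()/xsk_socket__create() instead of NULL. The values below
 * are arbitrary example choices, not recommendations:
 *
 *	struct xsk_umem_config ucfg = {
 *		.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2,
 *		.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 *		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
 *		.frame_headroom = 0,
 *		.flags = 0,
 *	};
 *	struct xsk_socket_config scfg = {
 *		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 *		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 *		.libbpf_flags = 0,
 *		.xdp_flags = XDP_FLAGS_SKB_MODE,
 *		.bind_flags = XDP_COPY,
 *	};
 *
 * Note that any libbpf_flags bit other than
 * XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD is rejected by
 * xsk_set_xdp_socket_config() above.
 */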

static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
{
	struct xdp_mmap_offsets_v1 off_v1;

	/* getsockopt on a kernel <= 5.3 has no flags fields.
	 * Copy over the offsets to the correct places in the >=5.4 format
	 * and put the flags where they would have been on that kernel.
	 */
	memcpy(&off_v1, off, sizeof(off_v1));

	off->rx.producer = off_v1.rx.producer;
	off->rx.consumer = off_v1.rx.consumer;
	off->rx.desc = off_v1.rx.desc;
	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);

	off->tx.producer = off_v1.tx.producer;
	off->tx.consumer = off_v1.tx.consumer;
	off->tx.desc = off_v1.tx.desc;
	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);

	off->fr.producer = off_v1.fr.producer;
	off->fr.consumer = off_v1.fr.consumer;
	off->fr.desc = off_v1.fr.desc;
	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);

	off->cr.producer = off_v1.cr.producer;
	off->cr.consumer = off_v1.cr.consumer;
	off->cr.desc = off_v1.cr.desc;
	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
}

static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
{
	socklen_t optlen;
	int err;

	optlen = sizeof(*off);
	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
	if (err)
		return err;

	if (optlen == sizeof(*off))
		return 0;

	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
		xsk_mmap_offsets_v1(off);
		return 0;
	}

	return -EINVAL;
}

static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
				 struct xsk_ring_prod *fill,
				 struct xsk_ring_cons *comp)
{
	struct xdp_mmap_offsets off;
	void *map;
	int err;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
			 &umem->config.fill_size,
			 sizeof(umem->config.fill_size));
	if (err)
		return -errno;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
			 &umem->config.comp_size,
			 sizeof(umem->config.comp_size));
	if (err)
		return -errno;

	err = xsk_get_mmap_offsets(fd, &off);
	if (err)
		return -errno;

	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_FILL_RING);
	if (map == MAP_FAILED)
		return -errno;

	fill->mask = umem->config.fill_size - 1;
	fill->size = umem->config.fill_size;
	fill->producer = map + off.fr.producer;
	fill->consumer = map + off.fr.consumer;
	fill->flags = map + off.fr.flags;
	fill->ring = map + off.fr.desc;
	fill->cached_cons = umem->config.fill_size;

	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_COMPLETION_RING);
	if (map == MAP_FAILED) {
		err = -errno;
		goto out_mmap;
	}

	comp->mask = umem->config.comp_size - 1;
	comp->size = umem->config.comp_size;
	comp->producer = map + off.cr.producer;
	comp->consumer = map + off.cr.consumer;
	comp->flags = map + off.cr.flags;
	comp->ring = map + off.cr.desc;

	return 0;

out_mmap:
	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
	return err;
}

int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
		     __u64 size, struct xsk_ring_prod *fill,
		     struct xsk_ring_cons *comp,
		     const struct xsk_umem_config *usr_config)
{
	struct xdp_umem_reg mr;
	struct xsk_umem *umem;
	int err;

	if (!umem_area || !umem_ptr || !fill || !comp)
		return -EFAULT;
	if (!size && !xsk_page_aligned(umem_area))
		return -EINVAL;

	umem = calloc(1, sizeof(*umem));
	if (!umem)
		return -ENOMEM;

	umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
	if (umem->fd < 0) {
		err = -errno;
		goto out_umem_alloc;
	}

	umem->umem_area = umem_area;
	INIT_LIST_HEAD(&umem->ctx_list);
	xsk_set_umem_config(&umem->config, usr_config);

	memset(&mr, 0, sizeof(mr));
	mr.addr = (uintptr_t)umem_area;
	mr.len = size;
	mr.chunk_size = umem->config.frame_size;
	mr.headroom = umem->config.frame_headroom;
	mr.flags = umem->config.flags;

	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
	if (err) {
		err = -errno;
		goto out_socket;
	}

	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
	if (err)
		goto out_socket;

	umem->fill_save = fill;
	umem->comp_save = comp;
	*umem_ptr = umem;
	return 0;

out_socket:
	close(umem->fd);
out_umem_alloc:
	free(umem);
	return err;
}
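
/* Illustrative sketch only: a minimal caller-side flow for setting up a UMEM
 * with xsk_umem__create() and handing buffers to the kernel via the fill
 * ring, using the ring helpers declared in xsk.h. Error handling is omitted
 * and the sizes are arbitrary assumptions:
 *
 *	__u64 size = XSK_RING_PROD__DEFAULT_NUM_DESCS * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *	struct xsk_ring_prod fill;
 *	struct xsk_ring_cons comp;
 *	struct xsk_umem *umem;
 *	void *bufs;
 *	__u32 idx, i;
 *
 *	posix_memalign(&bufs, getpagesize(), size);
 *	xsk_umem__create(&umem, bufs, size, &fill, &comp, NULL);
 *
 *	xsk_ring_prod__reserve(&fill, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx);
 *	for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
 *		*xsk_ring_prod__fill_addr(&fill, idx++) =
 *			(__u64)i * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *	xsk_ring_prod__submit(&fill, XSK_RING_PROD__DEFAULT_NUM_DESCS);
 */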

struct xsk_umem_config_v1 {
	__u32 fill_size;
	__u32 comp_size;
	__u32 frame_size;
	__u32 frame_headroom;
};

/* Detect whether the running kernel honors the flags argument of
 * bpf_redirect_map() as a default action (used by the post-5.3 program in
 * xsk_load_xdp_prog()) by test-running a minimal XDP program against an
 * empty XSKMAP: if the failed redirect falls back to XDP_PASS, the flags
 * are supported.
 */
static enum xsk_prog get_xsk_prog(void)
{
	enum xsk_prog detected = XSK_PROG_FALLBACK;
	char data_in = 0, data_out;
	struct bpf_insn insns[] = {
		BPF_LD_MAP_FD(BPF_REG_1, 0),
		BPF_MOV64_IMM(BPF_REG_2, 0),
		BPF_MOV64_IMM(BPF_REG_3, XDP_PASS),
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		BPF_EXIT_INSN(),
	};
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = &data_in,
		.data_size_in = 1,
		.data_out = &data_out,
	);

	int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);

	map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
	if (map_fd < 0)
		return detected;

	insns[0].imm = map_fd;

	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
	if (prog_fd < 0) {
		close(map_fd);
		return detected;
	}

	ret = bpf_prog_test_run_opts(prog_fd, &opts);
	if (!ret && opts.retval == XDP_PASS)
		detected = XSK_PROG_REDIRECT_FLAGS;
	close(prog_fd);
	close(map_fd);
	return detected;
}

static int xsk_load_xdp_prog(struct xsk_socket *xsk)
{
	static const int log_buf_size = 16 * 1024;
	struct xsk_ctx *ctx = xsk->ctx;
	char log_buf[log_buf_size];
	int prog_fd;

	/* This is the fallback C-program:
	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
	 * {
	 *	int ret, index = ctx->rx_queue_index;
	 *
	 *	// A set entry here means that the corresponding queue_id
	 *	// has an active AF_XDP socket bound to it.
	 *	ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
	 *	if (ret > 0)
	 *		return ret;
	 *
	 *	// Fallback for pre-5.3 kernels, not supporting default
	 *	// action in the flags parameter.
	 *	if (bpf_map_lookup_elem(&xsks_map, &index))
	 *		return bpf_redirect_map(&xsks_map, index, 0);
	 *	return XDP_PASS;
	 * }
	 */
	struct bpf_insn prog[] = {
		/* r2 = *(u32 *)(r1 + 16) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
		/* *(u32 *)(r10 - 4) = r2 */
		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_3, 2),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* if w0 != 0 goto pc+13 */
		BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
		/* r2 = r10 */
		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
		/* r2 += -4 */
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* call bpf_map_lookup_elem */
		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
		/* r1 = r0 */
		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
		/* r0 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_0, 2),
		/* if r1 == 0 goto pc+5 */
		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
		/* r2 = *(u32 *)(r10 - 4) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = 0 */
		BPF_MOV64_IMM(BPF_REG_3, 0),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* The jumps are to this instruction */
		BPF_EXIT_INSN(),
	};

	/* This is the post-5.3 kernel C-program:
	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
	 * {
	 *	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
	 * }
	 */
	struct bpf_insn prog_redirect_flags[] = {
		/* r2 = *(u32 *)(r1 + 16) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_3, 2),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		BPF_EXIT_INSN(),
	};
	size_t insns_cnt[] = {ARRAY_SIZE(prog),
			      ARRAY_SIZE(prog_redirect_flags),
	};
	struct bpf_insn *progs[] = {prog, prog_redirect_flags};
	enum xsk_prog option = get_xsk_prog();
	LIBBPF_OPTS(bpf_prog_load_opts, opts,
		.log_buf = log_buf,
		.log_size = log_buf_size,
	);

	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
				progs[option], insns_cnt[option], &opts);
	if (prog_fd < 0) {
		pr_warn("BPF log buffer:\n%s", log_buf);
		return prog_fd;
	}

	ctx->prog_fd = prog_fd;
	return 0;
}

static int xsk_create_bpf_link(struct xsk_socket *xsk)
{
	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
	struct xsk_ctx *ctx = xsk->ctx;
	__u32 prog_id = 0;
	int link_fd;
	int err;

	err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
	if (err) {
		pr_warn("getting XDP prog id failed\n");
		return err;
	}

	/* If a netlink-attached XDP program is already loaded on the
	 * interface, bail out and ask the user to remove it first.
	 */
	if (prog_id) {
		pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
		return -EINVAL;
	}

	opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);

	link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
	if (link_fd < 0) {
		pr_warn("bpf_link_create failed: %s\n", strerror(errno));
		return link_fd;
	}

	ctx->link_fd = link_fd;
	return 0;
}
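
/* Two attachment paths are used by xsk_init_xdp_res() below: on kernels
 * where bpf_link supports XDP (probed via xsk_probe_bpf_link()), the program
 * is attached with xsk_create_bpf_link() above and torn down by closing
 * ctx->link_fd; otherwise the library falls back to the classic
 * bpf_xdp_attach()/bpf_xdp_detach() path, which keeps the program on the
 * interface until it is explicitly detached.
 */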

static int xsk_get_max_queues(struct xsk_socket *xsk)
{
	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
	struct xsk_ctx *ctx = xsk->ctx;
	struct ifreq ifr = {};
	int fd, err, ret;

	fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
	if (fd < 0)
		return -errno;

	ifr.ifr_data = (void *)&channels;
	bpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ);
	err = ioctl(fd, SIOCETHTOOL, &ifr);
	if (err && errno != EOPNOTSUPP) {
		ret = -errno;
		goto out;
	}

	if (err) {
		/* If the device says it has no channels, then all traffic
		 * is sent to a single stream, so max queues = 1.
		 */
		ret = 1;
	} else {
		/* Take the max of rx, tx, combined. Drivers return
		 * the number of channels in different ways.
		 */
		ret = max(channels.max_rx, channels.max_tx);
		ret = max(ret, (int)channels.max_combined);
	}

out:
	close(fd);
	return ret;
}

static int xsk_create_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;
	int max_queues;
	int fd;

	max_queues = xsk_get_max_queues(xsk);
	if (max_queues < 0)
		return max_queues;

	fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map",
			    sizeof(int), sizeof(int), max_queues, NULL);
	if (fd < 0)
		return fd;

	ctx->xsks_map_fd = fd;

	return 0;
}

static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;

	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
	close(ctx->xsks_map_fd);
}

static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
{
	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
	__u32 map_len = sizeof(struct bpf_map_info);
	struct bpf_prog_info prog_info = {};
	struct xsk_ctx *ctx = xsk->ctx;
	struct bpf_map_info map_info;
	int fd, err;

	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
	if (err)
		return err;

	num_maps = prog_info.nr_map_ids;

	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
	if (!map_ids)
		return -ENOMEM;

	memset(&prog_info, 0, prog_len);
	prog_info.nr_map_ids = num_maps;
	prog_info.map_ids = (__u64)(unsigned long)map_ids;

	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
	if (err)
		goto out_map_ids;

	ctx->xsks_map_fd = -1;

	for (i = 0; i < prog_info.nr_map_ids; i++) {
		fd = bpf_map_get_fd_by_id(map_ids[i]);
		if (fd < 0)
			continue;

		memset(&map_info, 0, map_len);
		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
		if (err) {
			close(fd);
			continue;
		}

		if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
			ctx->xsks_map_fd = fd;
			break;
		}

		close(fd);
	}

	if (ctx->xsks_map_fd == -1)
		err = -ENOENT;

out_map_ids:
	free(map_ids);
	return err;
}

static int xsk_set_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;

	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
				   &xsk->fd, 0);
}
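
/* The xsks_map managed above is keyed by queue_id and holds the AF_XDP
 * socket fd for that queue; bpf_redirect_map() in the XDP programs loaded by
 * xsk_load_xdp_prog() uses it to steer packets from a given RX queue to the
 * socket bound to that queue.
 */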

static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
{
	struct bpf_link_info link_info;
	__u32 link_len;
	__u32 id = 0;
	int err;
	int fd;

	while (true) {
		err = bpf_link_get_next_id(id, &id);
		if (err) {
			if (errno == ENOENT) {
				err = 0;
				break;
			}
			pr_warn("can't get next link: %s\n", strerror(errno));
			break;
		}

		fd = bpf_link_get_fd_by_id(id);
		if (fd < 0) {
			if (errno == ENOENT)
				continue;
			pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
			err = -errno;
			break;
		}

		link_len = sizeof(struct bpf_link_info);
		memset(&link_info, 0, link_len);
		err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
		if (err) {
			pr_warn("can't get link info: %s\n", strerror(errno));
			close(fd);
			break;
		}
		if (link_info.type == BPF_LINK_TYPE_XDP) {
			if (link_info.xdp.ifindex == ifindex) {
				*link_fd = fd;
				if (prog_id)
					*prog_id = link_info.prog_id;
				break;
			}
		}
		close(fd);
	}

	return err;
}

static bool xsk_probe_bpf_link(void)
{
	LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE);
	struct bpf_insn insns[2] = {
		BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
		BPF_EXIT_INSN()
	};
	int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns);
	int ifindex_lo = 1;
	bool ret = false;
	int err;

	err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
	if (err)
		return ret;

	if (link_fd >= 0)
		return true;

	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
	if (prog_fd < 0)
		return ret;

	link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
	close(prog_fd);

	if (link_fd >= 0) {
		ret = true;
		close(link_fd);
	}

	return ret;
}

static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
{
	char ifname[IFNAMSIZ];
	struct xsk_ctx *ctx;
	char *interface;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return -ENOMEM;

	interface = if_indextoname(ifindex, &ifname[0]);
	if (!interface) {
		free(ctx);
		return -errno;
	}

	ctx->ifindex = ifindex;
	bpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);

	xsk->ctx = ctx;
	xsk->ctx->has_bpf_link = xsk_probe_bpf_link();

	return 0;
}

static int xsk_init_xdp_res(struct xsk_socket *xsk,
			    int *xsks_map_fd)
{
	struct xsk_ctx *ctx = xsk->ctx;
	int err;

	err = xsk_create_bpf_maps(xsk);
	if (err)
		return err;

	err = xsk_load_xdp_prog(xsk);
	if (err)
		goto err_load_xdp_prog;

	if (ctx->has_bpf_link)
		err = xsk_create_bpf_link(xsk);
	else
		err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd,
				     xsk->config.xdp_flags, NULL);

	if (err)
		goto err_attach_xdp_prog;

	if (!xsk->rx)
		return err;

	err = xsk_set_bpf_maps(xsk);
	if (err)
		goto err_set_bpf_maps;

	return err;

err_set_bpf_maps:
	if (ctx->has_bpf_link)
		close(ctx->link_fd);
	else
		bpf_xdp_detach(ctx->ifindex, 0, NULL);
err_attach_xdp_prog:
	close(ctx->prog_fd);
err_load_xdp_prog:
	xsk_delete_bpf_maps(xsk);
	return err;
}

static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
{
	struct xsk_ctx *ctx = xsk->ctx;
	int err;

	ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
	if (ctx->prog_fd < 0) {
		err = -errno;
		goto err_prog_fd;
	}
	err = xsk_lookup_bpf_maps(xsk);
	if (err)
		goto err_lookup_maps;

	if (!xsk->rx)
		return err;

	err = xsk_set_bpf_maps(xsk);
	if (err)
		goto err_set_maps;

	return err;

err_set_maps:
	close(ctx->xsks_map_fd);
err_lookup_maps:
	close(ctx->prog_fd);
err_prog_fd:
	if (ctx->has_bpf_link)
		close(ctx->link_fd);
	return err;
}

static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
{
	struct xsk_socket *xsk = _xdp;
	struct xsk_ctx *ctx = xsk->ctx;
	__u32 prog_id = 0;
	int err;

	if (ctx->has_bpf_link)
		err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
	else
		err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);

	if (err)
		return err;

	err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
			 xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);

	if (!err && xsks_map_fd)
		*xsks_map_fd = ctx->xsks_map_fd;

	return err;
}

int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd)
{
	return __xsk_setup_xdp_prog(xsk, xsks_map_fd);
}

static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
				   __u32 queue_id)
{
	struct xsk_ctx *ctx;

	if (list_empty(&umem->ctx_list))
		return NULL;

	list_for_each_entry(ctx, &umem->ctx_list, list) {
		if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
			ctx->refcount++;
			return ctx;
		}
	}

	return NULL;
}

static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
{
	struct xsk_umem *umem = ctx->umem;
	struct xdp_mmap_offsets off;
	int err;

	if (--ctx->refcount)
		return;

	if (!unmap)
		goto out_free;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (err)
		goto out_free;

	munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
	       sizeof(__u64));
	munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
	       sizeof(__u64));

out_free:
	list_del(&ctx->list);
	free(ctx);
}
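
/* A umem keeps one refcounted xsk_ctx per (ifindex, queue_id) pair on its
 * ctx_list. xsk_get_ctx() above takes a reference to an existing context so
 * that sockets sharing a umem on the same queue also share its fill and
 * completion rings; xsk_create_ctx() below builds a new context when none
 * exists yet.
 */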

static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
				      struct xsk_umem *umem, int ifindex,
				      const char *ifname, __u32 queue_id,
				      struct xsk_ring_prod *fill,
				      struct xsk_ring_cons *comp)
{
	struct xsk_ctx *ctx;
	int err;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;

	if (!umem->fill_save) {
		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
		if (err) {
			free(ctx);
			return NULL;
		}
	} else if (umem->fill_save != fill || umem->comp_save != comp) {
		/* Copy over rings to new structs. */
		memcpy(fill, umem->fill_save, sizeof(*fill));
		memcpy(comp, umem->comp_save, sizeof(*comp));
	}

	ctx->ifindex = ifindex;
	ctx->refcount = 1;
	ctx->umem = umem;
	ctx->queue_id = queue_id;
	bpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);

	ctx->fill = fill;
	ctx->comp = comp;
	list_add(&ctx->list, &umem->ctx_list);
	ctx->has_bpf_link = xsk_probe_bpf_link();
	return ctx;
}

static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
{
	free(xsk->ctx);
	free(xsk);
}

int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
{
	xsk->ctx->xsks_map_fd = fd;
	return xsk_set_bpf_maps(xsk);
}

int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
{
	struct xsk_socket *xsk;
	int res;

	xsk = calloc(1, sizeof(*xsk));
	if (!xsk)
		return -ENOMEM;

	res = xsk_create_xsk_struct(ifindex, xsk);
	if (res) {
		free(xsk);
		return -EINVAL;
	}

	res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);

	xsk_destroy_xsk_struct(xsk);

	return res;
}

int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
			      const char *ifname,
			      __u32 queue_id, struct xsk_umem *umem,
			      struct xsk_ring_cons *rx,
			      struct xsk_ring_prod *tx,
			      struct xsk_ring_prod *fill,
			      struct xsk_ring_cons *comp,
			      const struct xsk_socket_config *usr_config)
{
	bool unmap, rx_setup_done = false, tx_setup_done = false;
	void *rx_map = NULL, *tx_map = NULL;
	struct sockaddr_xdp sxdp = {};
	struct xdp_mmap_offsets off;
	struct xsk_socket *xsk;
	struct xsk_ctx *ctx;
	int err, ifindex;

	if (!umem || !xsk_ptr || !(rx || tx))
		return -EFAULT;

	unmap = umem->fill_save != fill;

	xsk = calloc(1, sizeof(*xsk));
	if (!xsk)
		return -ENOMEM;

	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
	if (err)
		goto out_xsk_alloc;

	xsk->outstanding_tx = 0;
	ifindex = if_nametoindex(ifname);
	if (!ifindex) {
		err = -errno;
		goto out_xsk_alloc;
	}

	if (umem->refcount++ > 0) {
		xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
		if (xsk->fd < 0) {
			err = -errno;
			goto out_xsk_alloc;
		}
	} else {
		xsk->fd = umem->fd;
		rx_setup_done = umem->rx_ring_setup_done;
		tx_setup_done = umem->tx_ring_setup_done;
	}

	ctx = xsk_get_ctx(umem, ifindex, queue_id);
	if (!ctx) {
		if (!fill || !comp) {
			err = -EFAULT;
			goto out_socket;
		}

		ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
				     fill, comp);
		if (!ctx) {
			err = -ENOMEM;
			goto out_socket;
		}
	}
	xsk->ctx = ctx;

	if (rx && !rx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
				 &xsk->config.rx_size,
				 sizeof(xsk->config.rx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->rx_ring_setup_done = true;
	}
	if (tx && !tx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
				 &xsk->config.tx_size,
				 sizeof(xsk->config.tx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->tx_ring_setup_done = true;
	}

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (err) {
		err = -errno;
		goto out_put_ctx;
	}

	if (rx) {
		rx_map = mmap(NULL, off.rx.desc +
			      xsk->config.rx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_RX_RING);
		if (rx_map == MAP_FAILED) {
			err = -errno;
			goto out_put_ctx;
		}

		rx->mask = xsk->config.rx_size - 1;
		rx->size = xsk->config.rx_size;
		rx->producer = rx_map + off.rx.producer;
		rx->consumer = rx_map + off.rx.consumer;
		rx->flags = rx_map + off.rx.flags;
		rx->ring = rx_map + off.rx.desc;
		rx->cached_prod = *rx->producer;
		rx->cached_cons = *rx->consumer;
	}
	xsk->rx = rx;

	if (tx) {
		tx_map = mmap(NULL, off.tx.desc +
			      xsk->config.tx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_TX_RING);
		if (tx_map == MAP_FAILED) {
			err = -errno;
			goto out_mmap_rx;
		}

		tx->mask = xsk->config.tx_size - 1;
		tx->size = xsk->config.tx_size;
		tx->producer = tx_map + off.tx.producer;
		tx->consumer = tx_map + off.tx.consumer;
		tx->flags = tx_map + off.tx.flags;
		tx->ring = tx_map + off.tx.desc;
		tx->cached_prod = *tx->producer;
		/* cached_cons is r->size bigger than the real consumer pointer
		 * See xsk_prod_nb_free
		 */
		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
	}
	xsk->tx = tx;

	sxdp.sxdp_family = PF_XDP;
	sxdp.sxdp_ifindex = ctx->ifindex;
	sxdp.sxdp_queue_id = ctx->queue_id;
	if (umem->refcount > 1) {
		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
		sxdp.sxdp_shared_umem_fd = umem->fd;
	} else {
		sxdp.sxdp_flags = xsk->config.bind_flags;
	}

	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
	if (err) {
		err = -errno;
		goto out_mmap_tx;
	}

	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
		err = __xsk_setup_xdp_prog(xsk, NULL);
		if (err)
			goto out_mmap_tx;
	}

	*xsk_ptr = xsk;
	umem->fill_save = NULL;
	umem->comp_save = NULL;
	return 0;

out_mmap_tx:
	if (tx)
		munmap(tx_map, off.tx.desc +
		       xsk->config.tx_size * sizeof(struct xdp_desc));
out_mmap_rx:
	if (rx)
		munmap(rx_map, off.rx.desc +
		       xsk->config.rx_size * sizeof(struct xdp_desc));
out_put_ctx:
	xsk_put_ctx(ctx, unmap);
out_socket:
	if (--umem->refcount)
		close(xsk->fd);
out_xsk_alloc:
	free(xsk);
	return err;
}

int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
		       __u32 queue_id, struct xsk_umem *umem,
		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
		       const struct xsk_socket_config *usr_config)
{
	if (!umem)
		return -EFAULT;

	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
					 rx, tx, umem->fill_save,
					 umem->comp_save, usr_config);
}

int xsk_umem__delete(struct xsk_umem *umem)
{
	struct xdp_mmap_offsets off;
	int err;

	if (!umem)
		return 0;

	if (umem->refcount)
		return -EBUSY;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (!err && umem->fill_save && umem->comp_save) {
		munmap(umem->fill_save->ring - off.fr.desc,
		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
		munmap(umem->comp_save->ring - off.cr.desc,
		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
	}

	close(umem->fd);
	free(umem);

	return 0;
}

void xsk_socket__delete(struct xsk_socket *xsk)
{
	size_t desc_sz = sizeof(struct xdp_desc);
	struct xdp_mmap_offsets off;
	struct xsk_umem *umem;
	struct xsk_ctx *ctx;
	int err;

	if (!xsk)
		return;

	ctx = xsk->ctx;
	umem = ctx->umem;

	if (ctx->refcount == 1) {
		xsk_delete_bpf_maps(xsk);
		close(ctx->prog_fd);
		if (ctx->has_bpf_link)
			close(ctx->link_fd);
	}

	xsk_put_ctx(ctx, true);

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (!err) {
		if (xsk->rx) {
			munmap(xsk->rx->ring - off.rx.desc,
			       off.rx.desc + xsk->config.rx_size * desc_sz);
		}
		if (xsk->tx) {
			munmap(xsk->tx->ring - off.tx.desc,
			       off.tx.desc + xsk->config.tx_size * desc_sz);
		}
	}

	umem->refcount--;
	/* Do not close an fd that also has an associated umem connected
	 * to it.
	 */
	if (xsk->fd != umem->fd)
		close(xsk->fd);
	free(xsk);
}
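
/* Illustrative sketch only: a minimal overall life cycle of the API
 * implemented above, with error handling omitted. "eth0", queue 0 and the
 * batch size are arbitrary assumptions; the ring helpers
 * (xsk_ring_cons__peek() and friends) come from xsk.h, and "umem"/"fill"
 * and the buffer area "bufs" are assumed to have been set up with
 * xsk_umem__create() as sketched earlier in this file:
 *
 *	struct xsk_socket *xsk;
 *	struct xsk_ring_cons rx;
 *	struct xsk_ring_prod tx;
 *	__u32 idx, rcvd, i;
 *
 *	xsk_socket__create(&xsk, "eth0", 0, umem, &rx, &tx, NULL);
 *
 *	// RX path: consume descriptors, then release them; buffers are
 *	// recycled to the kernel by putting their addresses back on the
 *	// fill ring.
 *	rcvd = xsk_ring_cons__peek(&rx, 64, &idx);
 *	for (i = 0; i < rcvd; i++) {
 *		const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&rx, idx + i);
 *
 *		// packet data is at xsk_umem__get_data(bufs, desc->addr),
 *		// desc->len bytes long
 *	}
 *	xsk_ring_cons__release(&rx, rcvd);
 *
 *	// Sockets must be deleted before their umem: xsk_umem__delete()
 *	// returns -EBUSY while umem->refcount is still non-zero.
 *	xsk_socket__delete(xsk);
 *	xsk_umem__delete(umem);
 */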