// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)

/*
 * AF_XDP user-space access library.
 *
 * Copyright(c) 2018 - 2019 Intel Corporation.
 *
 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
 */

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <asm/barrier.h>
#include <linux/compiler.h>
#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/if_xdp.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/sockios.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <linux/if_link.h>

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "xsk.h"

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

#ifndef AF_XDP
#define AF_XDP 44
#endif

#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif

#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)

enum xsk_prog {
	XSK_PROG_FALLBACK,
	XSK_PROG_REDIRECT_FLAGS,
};

struct xsk_umem {
	struct xsk_ring_prod *fill_save;
	struct xsk_ring_cons *comp_save;
	char *umem_area;
	struct xsk_umem_config config;
	int fd;
	int refcount;
	struct list_head ctx_list;
	bool rx_ring_setup_done;
	bool tx_ring_setup_done;
};

struct xsk_ctx {
	struct xsk_ring_prod *fill;
	struct xsk_ring_cons *comp;
	__u32 queue_id;
	struct xsk_umem *umem;
	int refcount;
	int ifindex;
	struct list_head list;
	int prog_fd;
	int link_fd;
	int xsks_map_fd;
	char ifname[IFNAMSIZ];
	bool has_bpf_link;
};

struct xsk_socket {
	struct xsk_ring_cons *rx;
	struct xsk_ring_prod *tx;
	__u64 outstanding_tx;
	struct xsk_ctx *ctx;
	struct xsk_socket_config config;
	int fd;
};

struct xsk_nl_info {
	bool xdp_prog_attached;
	int ifindex;
	int fd;
};

/* Up until and including Linux 5.3 */
struct xdp_ring_offset_v1 {
	__u64 producer;
	__u64 consumer;
	__u64 desc;
};

/* Up until and including Linux 5.3 */
struct xdp_mmap_offsets_v1 {
	struct xdp_ring_offset_v1 rx;
	struct xdp_ring_offset_v1 tx;
	struct xdp_ring_offset_v1 fr;
	struct xdp_ring_offset_v1 cr;
};

int xsk_umem__fd(const struct xsk_umem *umem)
{
	return umem ? umem->fd : -EINVAL;
}

int xsk_socket__fd(const struct xsk_socket *xsk)
{
	return xsk ? xsk->fd : -EINVAL;
}

static bool xsk_page_aligned(void *buffer)
{
	unsigned long addr = (unsigned long)buffer;

	return !(addr & (getpagesize() - 1));
}

static void xsk_set_umem_config(struct xsk_umem_config *cfg,
				const struct xsk_umem_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
		return;
	}

	cfg->fill_size = usr_cfg->fill_size;
	cfg->comp_size = usr_cfg->comp_size;
	cfg->frame_size = usr_cfg->frame_size;
	cfg->frame_headroom = usr_cfg->frame_headroom;
	cfg->flags = usr_cfg->flags;
}

static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
				     const struct xsk_socket_config *usr_cfg)
{
	if (!usr_cfg) {
		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
		cfg->libbpf_flags = 0;
		cfg->xdp_flags = 0;
		cfg->bind_flags = 0;
		return 0;
	}

	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
		return -EINVAL;

	cfg->rx_size = usr_cfg->rx_size;
	cfg->tx_size = usr_cfg->tx_size;
	cfg->libbpf_flags = usr_cfg->libbpf_flags;
	cfg->xdp_flags = usr_cfg->xdp_flags;
	cfg->bind_flags = usr_cfg->bind_flags;

	return 0;
}

static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
{
	struct xdp_mmap_offsets_v1 off_v1;

	/* getsockopt on a kernel <= 5.3 has no flags fields.
	 * Copy over the offsets to the correct places in the >=5.4 format
	 * and put the flags where they would have been on that kernel.
	 */
	memcpy(&off_v1, off, sizeof(off_v1));

	off->rx.producer = off_v1.rx.producer;
	off->rx.consumer = off_v1.rx.consumer;
	off->rx.desc = off_v1.rx.desc;
	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);

	off->tx.producer = off_v1.tx.producer;
	off->tx.consumer = off_v1.tx.consumer;
	off->tx.desc = off_v1.tx.desc;
	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);

	off->fr.producer = off_v1.fr.producer;
	off->fr.consumer = off_v1.fr.consumer;
	off->fr.desc = off_v1.fr.desc;
	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);

	off->cr.producer = off_v1.cr.producer;
	off->cr.consumer = off_v1.cr.consumer;
	off->cr.desc = off_v1.cr.desc;
	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
}

static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
{
	socklen_t optlen;
	int err;

	optlen = sizeof(*off);
	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
	if (err)
		return err;

	if (optlen == sizeof(*off))
		return 0;

	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
		xsk_mmap_offsets_v1(off);
		return 0;
	}

	return -EINVAL;
}

static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
				 struct xsk_ring_prod *fill,
				 struct xsk_ring_cons *comp)
{
	struct xdp_mmap_offsets off;
	void *map;
	int err;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
			 &umem->config.fill_size,
			 sizeof(umem->config.fill_size));
	if (err)
		return -errno;

	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
			 &umem->config.comp_size,
			 sizeof(umem->config.comp_size));
	if (err)
		return -errno;

	err = xsk_get_mmap_offsets(fd, &off);
	if (err)
		return -errno;

	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_FILL_RING);
	if (map == MAP_FAILED)
		return -errno;

	fill->mask = umem->config.fill_size - 1;
	fill->size = umem->config.fill_size;
	fill->producer = map + off.fr.producer;
	fill->consumer = map + off.fr.consumer;
	fill->flags = map + off.fr.flags;
	fill->ring = map + off.fr.desc;
	fill->cached_cons = umem->config.fill_size;

	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_COMPLETION_RING);
	if (map == MAP_FAILED) {
		err = -errno;
		goto out_mmap;
	}

	comp->mask = umem->config.comp_size - 1;
	comp->size = umem->config.comp_size;
	comp->producer = map + off.cr.producer;
	comp->consumer = map + off.cr.consumer;
	comp->flags = map + off.cr.flags;
	comp->ring = map + off.cr.desc;

	return 0;

out_mmap:
	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
	return err;
}
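
/* After the rings exist, the application hands frames to the kernel by
 * publishing UMEM addresses on the fill ring. A minimal sketch, assuming a
 * freshly created UMEM whose frames start at address 0 and use the default
 * frame size; the helpers are the static inlines from xsk.h:
 *
 *	__u32 idx;
 *	int i;
 *
 *	if (xsk_ring_prod__reserve(fill, XSK_RING_PROD__DEFAULT_NUM_DESCS,
 *				   &idx) != XSK_RING_PROD__DEFAULT_NUM_DESCS)
 *		return -ENOSPC;
 *	for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
 *		*xsk_ring_prod__fill_addr(fill, idx++) =
 *			i * XSK_UMEM__DEFAULT_FRAME_SIZE;
 *	xsk_ring_prod__submit(fill, XSK_RING_PROD__DEFAULT_NUM_DESCS);
 */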

int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
		     __u64 size, struct xsk_ring_prod *fill,
		     struct xsk_ring_cons *comp,
		     const struct xsk_umem_config *usr_config)
{
	struct xdp_umem_reg mr;
	struct xsk_umem *umem;
	int err;

	if (!umem_area || !umem_ptr || !fill || !comp)
		return -EFAULT;
	if (!size && !xsk_page_aligned(umem_area))
		return -EINVAL;

	umem = calloc(1, sizeof(*umem));
	if (!umem)
		return -ENOMEM;

	umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
	if (umem->fd < 0) {
		err = -errno;
		goto out_umem_alloc;
	}

	umem->umem_area = umem_area;
	INIT_LIST_HEAD(&umem->ctx_list);
	xsk_set_umem_config(&umem->config, usr_config);

	memset(&mr, 0, sizeof(mr));
	mr.addr = (uintptr_t)umem_area;
	mr.len = size;
	mr.chunk_size = umem->config.frame_size;
	mr.headroom = umem->config.frame_headroom;
	mr.flags = umem->config.flags;

	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
	if (err) {
		err = -errno;
		goto out_socket;
	}

	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
	if (err)
		goto out_socket;

	umem->fill_save = fill;
	umem->comp_save = comp;
	*umem_ptr = umem;
	return 0;

out_socket:
	close(umem->fd);
out_umem_alloc:
	free(umem);
	return err;
}
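
/* Typical usage, for illustration only (NUM_FRAMES is a caller-chosen
 * constant, not part of this API): the backing memory must be page aligned,
 * which posix_memalign() guarantees here; a hugepage mmap() works as well.
 *
 *	struct xsk_ring_prod fill;
 *	struct xsk_ring_cons comp;
 *	struct xsk_umem *umem;
 *	void *buf;
 *
 *	if (posix_memalign(&buf, getpagesize(),
 *			   NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE))
 *		exit(EXIT_FAILURE);
 *	if (xsk_umem__create(&umem, buf,
 *			     NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE,
 *			     &fill, &comp, NULL))
 *		exit(EXIT_FAILURE);
 */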

struct xsk_umem_config_v1 {
	__u32 fill_size;
	__u32 comp_size;
	__u32 frame_size;
	__u32 frame_headroom;
};

/* Probe whether bpf_redirect_map() honors its flags argument as a default
 * action by test-running a probe program against an empty XSKMAP: kernels
 * with the feature (5.3 and later) fall back to the XDP_PASS passed in the
 * flags when the lookup fails, older kernels do not return XDP_PASS here.
 */
static enum xsk_prog get_xsk_prog(void)
{
	enum xsk_prog detected = XSK_PROG_FALLBACK;
	char data_in = 0, data_out;
	struct bpf_insn insns[] = {
		BPF_LD_MAP_FD(BPF_REG_1, 0),
		BPF_MOV64_IMM(BPF_REG_2, 0),
		BPF_MOV64_IMM(BPF_REG_3, XDP_PASS),
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		BPF_EXIT_INSN(),
	};
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = &data_in,
		.data_size_in = 1,
		.data_out = &data_out,
	);

	int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);

	map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
	if (map_fd < 0)
		return detected;

	insns[0].imm = map_fd;

	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
	if (prog_fd < 0) {
		close(map_fd);
		return detected;
	}

	ret = bpf_prog_test_run_opts(prog_fd, &opts);
	if (!ret && opts.retval == XDP_PASS)
		detected = XSK_PROG_REDIRECT_FLAGS;
	close(prog_fd);
	close(map_fd);
	return detected;
}

static int xsk_load_xdp_prog(struct xsk_socket *xsk)
{
	static const int log_buf_size = 16 * 1024;
	struct xsk_ctx *ctx = xsk->ctx;
	char log_buf[log_buf_size];
	int prog_fd;

	/* This is the fallback C-program:
	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
	 * {
	 *     int ret, index = ctx->rx_queue_index;
	 *
	 *     // A set entry here means that the corresponding queue_id
	 *     // has an active AF_XDP socket bound to it.
	 *     ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
	 *     if (ret > 0)
	 *         return ret;
	 *
	 *     // Fallback for pre-5.3 kernels, not supporting default
	 *     // action in the flags parameter.
	 *     if (bpf_map_lookup_elem(&xsks_map, &index))
	 *         return bpf_redirect_map(&xsks_map, index, 0);
	 *     return XDP_PASS;
	 * }
	 */
	struct bpf_insn prog[] = {
		/* r2 = *(u32 *)(r1 + 16) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
		/* *(u32 *)(r10 - 4) = r2 */
		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_3, 2),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* if w0 > 0 goto pc+13 */
		BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
		/* r2 = r10 */
		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
		/* r2 += -4 */
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* call bpf_map_lookup_elem */
		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
		/* r1 = r0 */
		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
		/* r0 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_0, 2),
		/* if r1 == 0 goto pc+5 */
		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
		/* r2 = *(u32 *)(r10 - 4) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = 0 */
		BPF_MOV64_IMM(BPF_REG_3, 0),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		/* The jumps are to this instruction */
		BPF_EXIT_INSN(),
	};

	/* This is the post-5.3 kernel C-program:
	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
	 * {
	 *     return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
	 * }
	 */
	struct bpf_insn prog_redirect_flags[] = {
		/* r2 = *(u32 *)(r1 + 16) */
		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
		/* r1 = xskmap[] */
		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
		/* r3 = XDP_PASS */
		BPF_MOV64_IMM(BPF_REG_3, 2),
		/* call bpf_redirect_map */
		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
		BPF_EXIT_INSN(),
	};
	size_t insns_cnt[] = {ARRAY_SIZE(prog),
			      ARRAY_SIZE(prog_redirect_flags),
	};
	struct bpf_insn *progs[] = {prog, prog_redirect_flags};
	enum xsk_prog option = get_xsk_prog();
	LIBBPF_OPTS(bpf_prog_load_opts, opts,
		.log_buf = log_buf,
		.log_size = log_buf_size,
	);

	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
				progs[option], insns_cnt[option], &opts);
	if (prog_fd < 0) {
		pr_warn("BPF log buffer:\n%s", log_buf);
		return prog_fd;
	}

	ctx->prog_fd = prog_fd;
	return 0;
}

static int xsk_create_bpf_link(struct xsk_socket *xsk)
{
	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
	struct xsk_ctx *ctx = xsk->ctx;
	__u32 prog_id = 0;
	int link_fd;
	int err;

	err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
	if (err) {
		pr_warn("getting XDP prog id failed\n");
		return err;
	}

	/* If there's a netlink-based XDP prog loaded on the interface, bail
	 * out and ask the user to remove it first.
	 */
	if (prog_id) {
		pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
		return -EINVAL;
	}

	opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);

	link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
	if (link_fd < 0) {
		pr_warn("bpf_link_create failed: %s\n", strerror(errno));
		return link_fd;
	}

	ctx->link_fd = link_fd;
	return 0;
}
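
/* A side benefit of the bpf_link path is lifetime management and atomic
 * replacement: the program is detached automatically when the last link fd
 * is closed, and it can be swapped without a detach/attach window. A sketch,
 * assuming the caller loaded a replacement program (new_prog_fd is a
 * hypothetical fd, not part of this file):
 *
 *	if (bpf_link_update(ctx->link_fd, new_prog_fd, NULL))
 *		pr_warn("bpf_link_update failed: %s\n", strerror(errno));
 */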
522 } 523 524 /* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst 525 * is zero-terminated string no matter what (unless sz == 0, in which case 526 * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs 527 * in what is returned. Given this is internal helper, it's trivial to extend 528 * this, when necessary. Use this instead of strncpy inside libbpf source code. 529 */ 530 static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz) 531 { 532 size_t i; 533 534 if (sz == 0) 535 return; 536 537 sz--; 538 for (i = 0; i < sz && src[i]; i++) 539 dst[i] = src[i]; 540 dst[i] = '\0'; 541 } 542 543 static int xsk_get_max_queues(struct xsk_socket *xsk) 544 { 545 struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; 546 struct xsk_ctx *ctx = xsk->ctx; 547 struct ifreq ifr = {}; 548 int fd, err, ret; 549 550 fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0); 551 if (fd < 0) 552 return -errno; 553 554 ifr.ifr_data = (void *)&channels; 555 libbpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ); 556 err = ioctl(fd, SIOCETHTOOL, &ifr); 557 if (err && errno != EOPNOTSUPP) { 558 ret = -errno; 559 goto out; 560 } 561 562 if (err) { 563 /* If the device says it has no channels, then all traffic 564 * is sent to a single stream, so max queues = 1. 565 */ 566 ret = 1; 567 } else { 568 /* Take the max of rx, tx, combined. Drivers return 569 * the number of channels in different ways. 570 */ 571 ret = max(channels.max_rx, channels.max_tx); 572 ret = max(ret, (int)channels.max_combined); 573 } 574 575 out: 576 close(fd); 577 return ret; 578 } 579 580 static int xsk_create_bpf_maps(struct xsk_socket *xsk) 581 { 582 struct xsk_ctx *ctx = xsk->ctx; 583 int max_queues; 584 int fd; 585 586 max_queues = xsk_get_max_queues(xsk); 587 if (max_queues < 0) 588 return max_queues; 589 590 fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map", 591 sizeof(int), sizeof(int), max_queues, NULL); 592 if (fd < 0) 593 return fd; 594 595 ctx->xsks_map_fd = fd; 596 597 return 0; 598 } 599 600 static void xsk_delete_bpf_maps(struct xsk_socket *xsk) 601 { 602 struct xsk_ctx *ctx = xsk->ctx; 603 604 bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id); 605 close(ctx->xsks_map_fd); 606 } 607 608 static int xsk_lookup_bpf_maps(struct xsk_socket *xsk) 609 { 610 __u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info); 611 __u32 map_len = sizeof(struct bpf_map_info); 612 struct bpf_prog_info prog_info = {}; 613 struct xsk_ctx *ctx = xsk->ctx; 614 struct bpf_map_info map_info; 615 int fd, err; 616 617 err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len); 618 if (err) 619 return err; 620 621 num_maps = prog_info.nr_map_ids; 622 623 map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids)); 624 if (!map_ids) 625 return -ENOMEM; 626 627 memset(&prog_info, 0, prog_len); 628 prog_info.nr_map_ids = num_maps; 629 prog_info.map_ids = (__u64)(unsigned long)map_ids; 630 631 err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len); 632 if (err) 633 goto out_map_ids; 634 635 ctx->xsks_map_fd = -1; 636 637 for (i = 0; i < prog_info.nr_map_ids; i++) { 638 fd = bpf_map_get_fd_by_id(map_ids[i]); 639 if (fd < 0) 640 continue; 641 642 memset(&map_info, 0, map_len); 643 err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); 644 if (err) { 645 close(fd); 646 continue; 647 } 648 649 if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) { 650 ctx->xsks_map_fd = fd; 651 break; 652 } 653 654 close(fd); 655 } 656 657 if (ctx->xsks_map_fd == -1) 658 err = 

static int xsk_get_max_queues(struct xsk_socket *xsk)
{
	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
	struct xsk_ctx *ctx = xsk->ctx;
	struct ifreq ifr = {};
	int fd, err, ret;

	fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
	if (fd < 0)
		return -errno;

	ifr.ifr_data = (void *)&channels;
	libbpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ);
	err = ioctl(fd, SIOCETHTOOL, &ifr);
	if (err && errno != EOPNOTSUPP) {
		ret = -errno;
		goto out;
	}

	if (err) {
		/* If the device says it has no channels, then all traffic
		 * is sent to a single stream, so max queues = 1.
		 */
		ret = 1;
	} else {
		/* Take the max of rx, tx, combined. Drivers return
		 * the number of channels in different ways.
		 */
		ret = max(channels.max_rx, channels.max_tx);
		ret = max(ret, (int)channels.max_combined);
	}

out:
	close(fd);
	return ret;
}

static int xsk_create_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;
	int max_queues;
	int fd;

	max_queues = xsk_get_max_queues(xsk);
	if (max_queues < 0)
		return max_queues;

	fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map",
			    sizeof(int), sizeof(int), max_queues, NULL);
	if (fd < 0)
		return fd;

	ctx->xsks_map_fd = fd;

	return 0;
}

static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;

	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
	close(ctx->xsks_map_fd);
}

static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
{
	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
	__u32 map_len = sizeof(struct bpf_map_info);
	struct bpf_prog_info prog_info = {};
	struct xsk_ctx *ctx = xsk->ctx;
	struct bpf_map_info map_info;
	int fd, err;

	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
	if (err)
		return err;

	num_maps = prog_info.nr_map_ids;

	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
	if (!map_ids)
		return -ENOMEM;

	memset(&prog_info, 0, prog_len);
	prog_info.nr_map_ids = num_maps;
	prog_info.map_ids = (__u64)(unsigned long)map_ids;

	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
	if (err)
		goto out_map_ids;

	ctx->xsks_map_fd = -1;

	for (i = 0; i < prog_info.nr_map_ids; i++) {
		fd = bpf_map_get_fd_by_id(map_ids[i]);
		if (fd < 0)
			continue;

		memset(&map_info, 0, map_len);
		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
		if (err) {
			close(fd);
			continue;
		}

		if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
			ctx->xsks_map_fd = fd;
			break;
		}

		close(fd);
	}

	if (ctx->xsks_map_fd == -1)
		err = -ENOENT;

out_map_ids:
	free(map_ids);
	return err;
}

static int xsk_set_bpf_maps(struct xsk_socket *xsk)
{
	struct xsk_ctx *ctx = xsk->ctx;

	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
				   &xsk->fd, 0);
}

static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
{
	struct bpf_link_info link_info;
	__u32 link_len;
	__u32 id = 0;
	int err;
	int fd;

	while (true) {
		err = bpf_link_get_next_id(id, &id);
		if (err) {
			if (errno == ENOENT) {
				err = 0;
				break;
			}
			pr_warn("can't get next link: %s\n", strerror(errno));
			break;
		}

		fd = bpf_link_get_fd_by_id(id);
		if (fd < 0) {
			if (errno == ENOENT)
				continue;
			pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
			err = -errno;
			break;
		}

		link_len = sizeof(struct bpf_link_info);
		memset(&link_info, 0, link_len);
		err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
		if (err) {
			pr_warn("can't get link info: %s\n", strerror(errno));
			close(fd);
			break;
		}
		if (link_info.type == BPF_LINK_TYPE_XDP) {
			if (link_info.xdp.ifindex == ifindex) {
				*link_fd = fd;
				if (prog_id)
					*prog_id = link_info.prog_id;
				break;
			}
		}
		close(fd);
	}

	return err;
}

static bool xsk_probe_bpf_link(void)
{
	LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE);
	struct bpf_insn insns[2] = {
		BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
		BPF_EXIT_INSN()
	};
	int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns);
	int ifindex_lo = 1;
	bool ret = false;
	int err;

	err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
	if (err)
		return ret;

	if (link_fd >= 0)
		return true;

	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
	if (prog_fd < 0)
		return ret;

	link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
	close(prog_fd);

	if (link_fd >= 0) {
		ret = true;
		close(link_fd);
	}

	return ret;
}

static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
{
	char ifname[IFNAMSIZ];
	struct xsk_ctx *ctx;
	char *interface;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return -ENOMEM;

	interface = if_indextoname(ifindex, &ifname[0]);
	if (!interface) {
		free(ctx);
		return -errno;
	}

	ctx->ifindex = ifindex;
	libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);

	xsk->ctx = ctx;
	xsk->ctx->has_bpf_link = xsk_probe_bpf_link();

	return 0;
}

static int xsk_init_xdp_res(struct xsk_socket *xsk,
			    int *xsks_map_fd)
{
	struct xsk_ctx *ctx = xsk->ctx;
	int err;

	err = xsk_create_bpf_maps(xsk);
	if (err)
		return err;

	err = xsk_load_xdp_prog(xsk);
	if (err)
		goto err_load_xdp_prog;

	if (ctx->has_bpf_link)
		err = xsk_create_bpf_link(xsk);
	else
		err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd,
				     xsk->config.xdp_flags, NULL);

	if (err)
		goto err_attach_xdp_prog;

	if (!xsk->rx)
		return err;

	err = xsk_set_bpf_maps(xsk);
	if (err)
		goto err_set_bpf_maps;

	return err;

err_set_bpf_maps:
	if (ctx->has_bpf_link)
		close(ctx->link_fd);
	else
		bpf_xdp_detach(ctx->ifindex, 0, NULL);
err_attach_xdp_prog:
	close(ctx->prog_fd);
err_load_xdp_prog:
	xsk_delete_bpf_maps(xsk);
	return err;
}

static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
{
	struct xsk_ctx *ctx = xsk->ctx;
	int err;

	ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
	if (ctx->prog_fd < 0) {
		err = -errno;
		goto err_prog_fd;
	}
	err = xsk_lookup_bpf_maps(xsk);
	if (err)
		goto err_lookup_maps;

	if (!xsk->rx)
		return err;

	err = xsk_set_bpf_maps(xsk);
	if (err)
		goto err_set_maps;

	return err;

err_set_maps:
	close(ctx->xsks_map_fd);
err_lookup_maps:
	close(ctx->prog_fd);
err_prog_fd:
	if (ctx->has_bpf_link)
		close(ctx->link_fd);
	return err;
}

static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
{
	struct xsk_socket *xsk = _xdp;
	struct xsk_ctx *ctx = xsk->ctx;
	__u32 prog_id = 0;
	int err;

	if (ctx->has_bpf_link)
		err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
	else
		err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);

	if (err)
		return err;

	err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
			 xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);

	if (!err && xsks_map_fd)
		*xsks_map_fd = ctx->xsks_map_fd;

	return err;
}

int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd)
{
	return __xsk_setup_xdp_prog(xsk, xsks_map_fd);
}

static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
				   __u32 queue_id)
{
	struct xsk_ctx *ctx;

	if (list_empty(&umem->ctx_list))
		return NULL;

	list_for_each_entry(ctx, &umem->ctx_list, list) {
		if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
			ctx->refcount++;
			return ctx;
		}
	}

	return NULL;
}

static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
{
	struct xsk_umem *umem = ctx->umem;
	struct xdp_mmap_offsets off;
	int err;

	if (--ctx->refcount)
		return;

	if (!unmap)
		goto out_free;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (err)
		goto out_free;

	munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
	       sizeof(__u64));
	munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
	       sizeof(__u64));

out_free:
	list_del(&ctx->list);
	free(ctx);
}
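
/* A ctx represents one (ifindex, queue_id) attachment point and is shared by
 * every socket bound to that pair: xsk_get_ctx() above hands out references,
 * xsk_put_ctx() drops them, and the fill/completion ring mappings are only
 * torn down once the last reference is gone. xsk_create_ctx() below installs
 * the first reference.
 */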

static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
				      struct xsk_umem *umem, int ifindex,
				      const char *ifname, __u32 queue_id,
				      struct xsk_ring_prod *fill,
				      struct xsk_ring_cons *comp)
{
	struct xsk_ctx *ctx;
	int err;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;

	if (!umem->fill_save) {
		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
		if (err) {
			free(ctx);
			return NULL;
		}
	} else if (umem->fill_save != fill || umem->comp_save != comp) {
		/* Copy over rings to new structs. */
		memcpy(fill, umem->fill_save, sizeof(*fill));
		memcpy(comp, umem->comp_save, sizeof(*comp));
	}

	ctx->ifindex = ifindex;
	ctx->refcount = 1;
	ctx->umem = umem;
	ctx->queue_id = queue_id;
	libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);

	ctx->fill = fill;
	ctx->comp = comp;
	list_add(&ctx->list, &umem->ctx_list);
	ctx->has_bpf_link = xsk_probe_bpf_link();
	return ctx;
}

static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
{
	free(xsk->ctx);
	free(xsk);
}

int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
{
	xsk->ctx->xsks_map_fd = fd;
	return xsk_set_bpf_maps(xsk);
}

int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
{
	struct xsk_socket *xsk;
	int res;

	xsk = calloc(1, sizeof(*xsk));
	if (!xsk)
		return -ENOMEM;

	res = xsk_create_xsk_struct(ifindex, xsk);
	if (res) {
		free(xsk);
		return -EINVAL;
	}

	res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);

	xsk_destroy_xsk_struct(xsk);

	return res;
}
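
/* Applications that ship their own XDP program can bypass the built-in one
 * entirely. A sketch of that flow, where my_prog_fd and my_xskmap_fd are
 * hypothetical fds obtained from the caller's own BPF object:
 *
 *	struct xsk_socket_config cfg = {
 *		.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 *		.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
 *		.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD,
 *	};
 *
 *	err = xsk_socket__create(&xsk, ifname, queue_id, umem, &rx, &tx, &cfg);
 *	if (!err)
 *		err = bpf_xdp_attach(ifindex, my_prog_fd, 0, NULL);
 *	if (!err)
 *		err = xsk_socket__update_xskmap(xsk, my_xskmap_fd);
 *
 * XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD makes xsk_socket__create_shared() skip
 * __xsk_setup_xdp_prog(), so nothing is attached or written to any map until
 * the caller does it explicitly.
 */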

int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
			      const char *ifname,
			      __u32 queue_id, struct xsk_umem *umem,
			      struct xsk_ring_cons *rx,
			      struct xsk_ring_prod *tx,
			      struct xsk_ring_prod *fill,
			      struct xsk_ring_cons *comp,
			      const struct xsk_socket_config *usr_config)
{
	bool unmap, rx_setup_done = false, tx_setup_done = false;
	void *rx_map = NULL, *tx_map = NULL;
	struct sockaddr_xdp sxdp = {};
	struct xdp_mmap_offsets off;
	struct xsk_socket *xsk;
	struct xsk_ctx *ctx;
	int err, ifindex;

	if (!umem || !xsk_ptr || !(rx || tx))
		return -EFAULT;

	unmap = umem->fill_save != fill;

	xsk = calloc(1, sizeof(*xsk));
	if (!xsk)
		return -ENOMEM;

	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
	if (err)
		goto out_xsk_alloc;

	xsk->outstanding_tx = 0;
	ifindex = if_nametoindex(ifname);
	if (!ifindex) {
		err = -errno;
		goto out_xsk_alloc;
	}

	if (umem->refcount++ > 0) {
		xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
		if (xsk->fd < 0) {
			err = -errno;
			goto out_xsk_alloc;
		}
	} else {
		xsk->fd = umem->fd;
		rx_setup_done = umem->rx_ring_setup_done;
		tx_setup_done = umem->tx_ring_setup_done;
	}

	ctx = xsk_get_ctx(umem, ifindex, queue_id);
	if (!ctx) {
		if (!fill || !comp) {
			err = -EFAULT;
			goto out_socket;
		}

		ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
				     fill, comp);
		if (!ctx) {
			err = -ENOMEM;
			goto out_socket;
		}
	}
	xsk->ctx = ctx;

	if (rx && !rx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
				 &xsk->config.rx_size,
				 sizeof(xsk->config.rx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->rx_ring_setup_done = true;
	}
	if (tx && !tx_setup_done) {
		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
				 &xsk->config.tx_size,
				 sizeof(xsk->config.tx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		if (xsk->fd == umem->fd)
			umem->tx_ring_setup_done = true;
	}

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (err) {
		err = -errno;
		goto out_put_ctx;
	}

	if (rx) {
		rx_map = mmap(NULL, off.rx.desc +
			      xsk->config.rx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_RX_RING);
		if (rx_map == MAP_FAILED) {
			err = -errno;
			goto out_put_ctx;
		}

		rx->mask = xsk->config.rx_size - 1;
		rx->size = xsk->config.rx_size;
		rx->producer = rx_map + off.rx.producer;
		rx->consumer = rx_map + off.rx.consumer;
		rx->flags = rx_map + off.rx.flags;
		rx->ring = rx_map + off.rx.desc;
		rx->cached_prod = *rx->producer;
		rx->cached_cons = *rx->consumer;
	}
	xsk->rx = rx;

	if (tx) {
		tx_map = mmap(NULL, off.tx.desc +
			      xsk->config.tx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_TX_RING);
		if (tx_map == MAP_FAILED) {
			err = -errno;
			goto out_mmap_rx;
		}

		tx->mask = xsk->config.tx_size - 1;
		tx->size = xsk->config.tx_size;
		tx->producer = tx_map + off.tx.producer;
		tx->consumer = tx_map + off.tx.consumer;
		tx->flags = tx_map + off.tx.flags;
		tx->ring = tx_map + off.tx.desc;
		tx->cached_prod = *tx->producer;
		/* cached_cons is r->size bigger than the real consumer pointer.
		 * See xsk_prod_nb_free().
		 */
		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
	}
	xsk->tx = tx;

	sxdp.sxdp_family = PF_XDP;
	sxdp.sxdp_ifindex = ctx->ifindex;
	sxdp.sxdp_queue_id = ctx->queue_id;
	if (umem->refcount > 1) {
		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
		sxdp.sxdp_shared_umem_fd = umem->fd;
	} else {
		sxdp.sxdp_flags = xsk->config.bind_flags;
	}

	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
	if (err) {
		err = -errno;
		goto out_mmap_tx;
	}

	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
		err = __xsk_setup_xdp_prog(xsk, NULL);
		if (err)
			goto out_mmap_tx;
	}

	*xsk_ptr = xsk;
	umem->fill_save = NULL;
	umem->comp_save = NULL;
	return 0;

out_mmap_tx:
	if (tx)
		munmap(tx_map, off.tx.desc +
		       xsk->config.tx_size * sizeof(struct xdp_desc));
out_mmap_rx:
	if (rx)
		munmap(rx_map, off.rx.desc +
		       xsk->config.rx_size * sizeof(struct xdp_desc));
out_put_ctx:
	xsk_put_ctx(ctx, unmap);
out_socket:
	if (--umem->refcount)
		close(xsk->fd);
out_xsk_alloc:
	free(xsk);
	return err;
}

int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
		       __u32 queue_id, struct xsk_umem *umem,
		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
		       const struct xsk_socket_config *usr_config)
{
	if (!umem)
		return -EFAULT;

	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
					 rx, tx, umem->fill_save,
					 umem->comp_save, usr_config);
}
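
/* Once a socket is bound, the RX fast path is: consume descriptors from the
 * RX ring, read the frames out of the UMEM, then recycle the addresses via
 * the fill ring. A minimal sketch using the static inline helpers from xsk.h
 * (umem_area is the buffer registered with xsk_umem__create(),
 * process_packet() is application-defined, error handling elided):
 *
 *	__u32 idx_rx = 0, idx_fq = 0, i, rcvd;
 *
 *	rcvd = xsk_ring_cons__peek(&rx, 64, &idx_rx);
 *	if (!rcvd)
 *		return;
 *	xsk_ring_prod__reserve(&fill, rcvd, &idx_fq);
 *	for (i = 0; i < rcvd; i++) {
 *		const struct xdp_desc *desc =
 *			xsk_ring_cons__rx_desc(&rx, idx_rx++);
 *
 *		process_packet(xsk_umem__get_data(umem_area, desc->addr),
 *			       desc->len);
 *		*xsk_ring_prod__fill_addr(&fill, idx_fq++) = desc->addr;
 *	}
 *	xsk_ring_prod__submit(&fill, rcvd);
 *	xsk_ring_cons__release(&rx, rcvd);
 */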

int xsk_umem__delete(struct xsk_umem *umem)
{
	struct xdp_mmap_offsets off;
	int err;

	if (!umem)
		return 0;

	if (umem->refcount)
		return -EBUSY;

	err = xsk_get_mmap_offsets(umem->fd, &off);
	if (!err && umem->fill_save && umem->comp_save) {
		munmap(umem->fill_save->ring - off.fr.desc,
		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
		munmap(umem->comp_save->ring - off.cr.desc,
		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
	}

	close(umem->fd);
	free(umem);

	return 0;
}

void xsk_socket__delete(struct xsk_socket *xsk)
{
	size_t desc_sz = sizeof(struct xdp_desc);
	struct xdp_mmap_offsets off;
	struct xsk_umem *umem;
	struct xsk_ctx *ctx;
	int err;

	if (!xsk)
		return;

	ctx = xsk->ctx;
	umem = ctx->umem;

	/* Release the XDP resources while we still hold the last ctx
	 * reference; xsk_put_ctx() may free ctx, so it must not be
	 * dereferenced afterwards.
	 */
	if (ctx->refcount == 1) {
		xsk_delete_bpf_maps(xsk);
		close(ctx->prog_fd);
		if (ctx->has_bpf_link)
			close(ctx->link_fd);
	}

	xsk_put_ctx(ctx, true);

	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (!err) {
		if (xsk->rx) {
			munmap(xsk->rx->ring - off.rx.desc,
			       off.rx.desc + xsk->config.rx_size * desc_sz);
		}
		if (xsk->tx) {
			munmap(xsk->tx->ring - off.tx.desc,
			       off.tx.desc + xsk->config.tx_size * desc_sz);
		}
	}

	umem->refcount--;
	/* Do not close an fd that also has an associated umem connected
	 * to it.
	 */
	if (xsk->fd != umem->fd)
		close(xsk->fd);
	free(xsk);
}