xref: /openbmc/linux/tools/testing/selftests/bpf/xsk.c (revision 979ac5ef)
1 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2 
3 /*
4  * AF_XDP user-space access library.
5  *
6  * Copyright(c) 2018 - 2019 Intel Corporation.
7  *
8  * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
9  */
10 
11 #include <errno.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <arpa/inet.h>
16 #include <asm/barrier.h>
17 #include <linux/compiler.h>
18 #include <linux/ethtool.h>
19 #include <linux/filter.h>
20 #include <linux/if_ether.h>
21 #include <linux/if_packet.h>
22 #include <linux/if_xdp.h>
23 #include <linux/kernel.h>
24 #include <linux/list.h>
25 #include <linux/sockios.h>
26 #include <net/if.h>
27 #include <sys/ioctl.h>
28 #include <sys/mman.h>
29 #include <sys/socket.h>
30 #include <sys/types.h>
31 #include <linux/if_link.h>
32 
33 #include <bpf/bpf.h>
34 #include <bpf/libbpf.h>
35 #include "xsk.h"
36 #include "bpf_util.h"
37 
/* Fallback definition for build environments whose headers lack SOL_XDP. */
#ifndef SOL_XDP
 #define SOL_XDP 283
#endif

/* Fallback definition for build environments whose headers lack AF_XDP. */
#ifndef AF_XDP
 #define AF_XDP 44
#endif

/* Fallback: make PF_XDP an alias of AF_XDP when it is not already defined. */
#ifndef PF_XDP
 #define PF_XDP AF_XDP
#endif

/* Emit a printf-style warning message on stderr. */
#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
51 
/*
 * Which built-in XDP program to load. The values double as indices into the
 * program tables in xsk_load_xdp_prog(), so their order matters.
 */
enum xsk_prog {
	/* Default result of get_xsk_prog() when its probe does not succeed. */
	XSK_PROG_FALLBACK,
	/* Chosen when the bpf_redirect_map() probe run returns XDP_PASS. */
	XSK_PROG_REDIRECT_FLAGS,
};
56 
/* Library-side state for one registered umem. */
struct xsk_umem {
	/*
	 * Fill/completion rings handed to xsk_umem__create(). They are reset
	 * to NULL once a socket has been created successfully on this umem.
	 */
	struct xsk_ring_prod *fill_save;
	struct xsk_ring_cons *comp_save;
	/* Caller-provided memory area that was registered with the kernel. */
	char *umem_area;
	/* Effective configuration (user supplied or defaults). */
	struct xsk_umem_config config;
	/* AF_XDP socket fd the umem was registered on. */
	int fd;
	/* Number of sockets currently using this umem. */
	int refcount;
	/* List of struct xsk_ctx entries (linked through xsk_ctx.list). */
	struct list_head ctx_list;
	/* Set once the RX/TX ring has been configured on the umem's own fd. */
	bool rx_ring_setup_done;
	bool tx_ring_setup_done;
};
68 
/* Per (umem, ifindex, queue_id) context shared by sockets bound to that queue. */
struct xsk_ctx {
	/* Fill and completion rings used by this context. */
	struct xsk_ring_prod *fill;
	struct xsk_ring_cons *comp;
	/* Queue id used when binding and as key into the xsks map. */
	__u32 queue_id;
	/* Owning umem. */
	struct xsk_umem *umem;
	/* Number of sockets referencing this context. */
	int refcount;
	/* Interface index of the device. */
	int ifindex;
	/* Linkage into xsk_umem.ctx_list. */
	struct list_head list;
	/* fd of the XDP program in use. */
	int prog_fd;
	/* fd of the bpf_link attaching the program (only used with has_bpf_link). */
	int link_fd;
	/* fd of the "xsks_map" XSKMAP. */
	int xsks_map_fd;
	/* Interface name. */
	char ifname[IFNAMSIZ];
	/* Result of xsk_probe_bpf_link(): attach via bpf_link rather than netlink. */
	bool has_bpf_link;
};
83 
/* Library-side state for one AF_XDP socket. */
struct xsk_socket {
	/* Caller-provided RX/TX rings; either may be NULL if not requested. */
	struct xsk_ring_cons *rx;
	struct xsk_ring_prod *tx;
	/* Zeroed at creation; not otherwise updated by the code in this file. */
	__u64 outstanding_tx;
	/* Context (interface/queue/umem) this socket belongs to. */
	struct xsk_ctx *ctx;
	/* Effective socket configuration (user supplied or defaults). */
	struct xsk_socket_config config;
	/* Socket fd; may be the same fd as the umem's. */
	int fd;
};
92 
/*
 * NOTE(review): not referenced by any code visible in this file; the field
 * meanings below are inferred from their names only - confirm before relying
 * on them.
 */
struct xsk_nl_info {
	bool xdp_prog_attached;	/* presumably: an XDP program is attached */
	int ifindex;		/* presumably: interface index being queried */
	int fd;			/* presumably: netlink socket fd */
};
98 
/*
 * Per-ring mmap offsets as reported up until and including Linux 5.3.
 * This layout has no flags member; xsk_mmap_offsets_v1() synthesizes one
 * when converting to the newer format.
 */
struct xdp_ring_offset_v1 {
	__u64 producer;
	__u64 consumer;
	__u64 desc;
};
105 
/*
 * XDP_MMAP_OFFSETS result layout up until and including Linux 5.3.
 * xsk_get_mmap_offsets() recognizes it by the option length returned.
 */
struct xdp_mmap_offsets_v1 {
	struct xdp_ring_offset_v1 rx;	/* RX ring */
	struct xdp_ring_offset_v1 tx;	/* TX ring */
	struct xdp_ring_offset_v1 fr;	/* fill ring */
	struct xdp_ring_offset_v1 cr;	/* completion ring */
};
113 
114 int xsk_umem__fd(const struct xsk_umem *umem)
115 {
116 	return umem ? umem->fd : -EINVAL;
117 }
118 
119 int xsk_socket__fd(const struct xsk_socket *xsk)
120 {
121 	return xsk ? xsk->fd : -EINVAL;
122 }
123 
/* Tell whether @buffer starts on a page boundary (per getpagesize()). */
static bool xsk_page_aligned(void *buffer)
{
	unsigned long in_page_offset;

	in_page_offset = (unsigned long)buffer & (getpagesize() - 1);

	return in_page_offset == 0;
}
130 
131 static void xsk_set_umem_config(struct xsk_umem_config *cfg,
132 				const struct xsk_umem_config *usr_cfg)
133 {
134 	if (!usr_cfg) {
135 		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
136 		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
137 		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
138 		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
139 		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
140 		return;
141 	}
142 
143 	cfg->fill_size = usr_cfg->fill_size;
144 	cfg->comp_size = usr_cfg->comp_size;
145 	cfg->frame_size = usr_cfg->frame_size;
146 	cfg->frame_headroom = usr_cfg->frame_headroom;
147 	cfg->flags = usr_cfg->flags;
148 }
149 
150 static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
151 				     const struct xsk_socket_config *usr_cfg)
152 {
153 	if (!usr_cfg) {
154 		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
155 		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
156 		cfg->libbpf_flags = 0;
157 		cfg->xdp_flags = 0;
158 		cfg->bind_flags = 0;
159 		return 0;
160 	}
161 
162 	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
163 		return -EINVAL;
164 
165 	cfg->rx_size = usr_cfg->rx_size;
166 	cfg->tx_size = usr_cfg->tx_size;
167 	cfg->libbpf_flags = usr_cfg->libbpf_flags;
168 	cfg->xdp_flags = usr_cfg->xdp_flags;
169 	cfg->bind_flags = usr_cfg->bind_flags;
170 
171 	return 0;
172 }
173 
174 static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
175 {
176 	struct xdp_mmap_offsets_v1 off_v1;
177 
178 	/* getsockopt on a kernel <= 5.3 has no flags fields.
179 	 * Copy over the offsets to the correct places in the >=5.4 format
180 	 * and put the flags where they would have been on that kernel.
181 	 */
182 	memcpy(&off_v1, off, sizeof(off_v1));
183 
184 	off->rx.producer = off_v1.rx.producer;
185 	off->rx.consumer = off_v1.rx.consumer;
186 	off->rx.desc = off_v1.rx.desc;
187 	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);
188 
189 	off->tx.producer = off_v1.tx.producer;
190 	off->tx.consumer = off_v1.tx.consumer;
191 	off->tx.desc = off_v1.tx.desc;
192 	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);
193 
194 	off->fr.producer = off_v1.fr.producer;
195 	off->fr.consumer = off_v1.fr.consumer;
196 	off->fr.desc = off_v1.fr.desc;
197 	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);
198 
199 	off->cr.producer = off_v1.cr.producer;
200 	off->cr.consumer = off_v1.cr.consumer;
201 	off->cr.desc = off_v1.cr.desc;
202 	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
203 }
204 
205 static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
206 {
207 	socklen_t optlen;
208 	int err;
209 
210 	optlen = sizeof(*off);
211 	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
212 	if (err)
213 		return err;
214 
215 	if (optlen == sizeof(*off))
216 		return 0;
217 
218 	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
219 		xsk_mmap_offsets_v1(off);
220 		return 0;
221 	}
222 
223 	return -EINVAL;
224 }
225 
226 static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
227 				 struct xsk_ring_prod *fill,
228 				 struct xsk_ring_cons *comp)
229 {
230 	struct xdp_mmap_offsets off;
231 	void *map;
232 	int err;
233 
234 	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
235 			 &umem->config.fill_size,
236 			 sizeof(umem->config.fill_size));
237 	if (err)
238 		return -errno;
239 
240 	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
241 			 &umem->config.comp_size,
242 			 sizeof(umem->config.comp_size));
243 	if (err)
244 		return -errno;
245 
246 	err = xsk_get_mmap_offsets(fd, &off);
247 	if (err)
248 		return -errno;
249 
250 	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
251 		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
252 		   XDP_UMEM_PGOFF_FILL_RING);
253 	if (map == MAP_FAILED)
254 		return -errno;
255 
256 	fill->mask = umem->config.fill_size - 1;
257 	fill->size = umem->config.fill_size;
258 	fill->producer = map + off.fr.producer;
259 	fill->consumer = map + off.fr.consumer;
260 	fill->flags = map + off.fr.flags;
261 	fill->ring = map + off.fr.desc;
262 	fill->cached_cons = umem->config.fill_size;
263 
264 	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
265 		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
266 		   XDP_UMEM_PGOFF_COMPLETION_RING);
267 	if (map == MAP_FAILED) {
268 		err = -errno;
269 		goto out_mmap;
270 	}
271 
272 	comp->mask = umem->config.comp_size - 1;
273 	comp->size = umem->config.comp_size;
274 	comp->producer = map + off.cr.producer;
275 	comp->consumer = map + off.cr.consumer;
276 	comp->flags = map + off.cr.flags;
277 	comp->ring = map + off.cr.desc;
278 
279 	return 0;
280 
281 out_mmap:
282 	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
283 	return err;
284 }
285 
286 int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
287 		     __u64 size, struct xsk_ring_prod *fill,
288 		     struct xsk_ring_cons *comp,
289 		     const struct xsk_umem_config *usr_config)
290 {
291 	struct xdp_umem_reg mr;
292 	struct xsk_umem *umem;
293 	int err;
294 
295 	if (!umem_area || !umem_ptr || !fill || !comp)
296 		return -EFAULT;
297 	if (!size && !xsk_page_aligned(umem_area))
298 		return -EINVAL;
299 
300 	umem = calloc(1, sizeof(*umem));
301 	if (!umem)
302 		return -ENOMEM;
303 
304 	umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
305 	if (umem->fd < 0) {
306 		err = -errno;
307 		goto out_umem_alloc;
308 	}
309 
310 	umem->umem_area = umem_area;
311 	INIT_LIST_HEAD(&umem->ctx_list);
312 	xsk_set_umem_config(&umem->config, usr_config);
313 
314 	memset(&mr, 0, sizeof(mr));
315 	mr.addr = (uintptr_t)umem_area;
316 	mr.len = size;
317 	mr.chunk_size = umem->config.frame_size;
318 	mr.headroom = umem->config.frame_headroom;
319 	mr.flags = umem->config.flags;
320 
321 	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
322 	if (err) {
323 		err = -errno;
324 		goto out_socket;
325 	}
326 
327 	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
328 	if (err)
329 		goto out_socket;
330 
331 	umem->fill_save = fill;
332 	umem->comp_save = comp;
333 	*umem_ptr = umem;
334 	return 0;
335 
336 out_socket:
337 	close(umem->fd);
338 out_umem_alloc:
339 	free(umem);
340 	return err;
341 }
342 
/*
 * Umem configuration layout without a flags member (compare
 * xsk_set_umem_config(), which additionally handles flags).
 * NOTE(review): not referenced by the code visible in this file.
 */
struct xsk_umem_config_v1 {
	__u32 fill_size;
	__u32 comp_size;
	__u32 frame_size;
	__u32 frame_headroom;
};
349 
350 static enum xsk_prog get_xsk_prog(void)
351 {
352 	enum xsk_prog detected = XSK_PROG_FALLBACK;
353 	char data_in = 0, data_out;
354 	struct bpf_insn insns[] = {
355 		BPF_LD_MAP_FD(BPF_REG_1, 0),
356 		BPF_MOV64_IMM(BPF_REG_2, 0),
357 		BPF_MOV64_IMM(BPF_REG_3, XDP_PASS),
358 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
359 		BPF_EXIT_INSN(),
360 	};
361 	LIBBPF_OPTS(bpf_test_run_opts, opts,
362 		.data_in = &data_in,
363 		.data_size_in = 1,
364 		.data_out = &data_out,
365 	);
366 
367 	int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);
368 
369 	map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
370 	if (map_fd < 0)
371 		return detected;
372 
373 	insns[0].imm = map_fd;
374 
375 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
376 	if (prog_fd < 0) {
377 		close(map_fd);
378 		return detected;
379 	}
380 
381 	ret = bpf_prog_test_run_opts(prog_fd, &opts);
382 	if (!ret && opts.retval == XDP_PASS)
383 		detected = XSK_PROG_REDIRECT_FLAGS;
384 	close(prog_fd);
385 	close(map_fd);
386 	return detected;
387 }
388 
389 static int xsk_load_xdp_prog(struct xsk_socket *xsk)
390 {
391 	static const int log_buf_size = 16 * 1024;
392 	struct xsk_ctx *ctx = xsk->ctx;
393 	char log_buf[log_buf_size];
394 	int prog_fd;
395 
396 	/* This is the fallback C-program:
397 	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
398 	 * {
399 	 *     int ret, index = ctx->rx_queue_index;
400 	 *
401 	 *     // A set entry here means that the correspnding queue_id
402 	 *     // has an active AF_XDP socket bound to it.
403 	 *     ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
404 	 *     if (ret > 0)
405 	 *         return ret;
406 	 *
407 	 *     // Fallback for pre-5.3 kernels, not supporting default
408 	 *     // action in the flags parameter.
409 	 *     if (bpf_map_lookup_elem(&xsks_map, &index))
410 	 *         return bpf_redirect_map(&xsks_map, index, 0);
411 	 *     return XDP_PASS;
412 	 * }
413 	 */
414 	struct bpf_insn prog[] = {
415 		/* r2 = *(u32 *)(r1 + 16) */
416 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
417 		/* *(u32 *)(r10 - 4) = r2 */
418 		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
419 		/* r1 = xskmap[] */
420 		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
421 		/* r3 = XDP_PASS */
422 		BPF_MOV64_IMM(BPF_REG_3, 2),
423 		/* call bpf_redirect_map */
424 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
425 		/* if w0 != 0 goto pc+13 */
426 		BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
427 		/* r2 = r10 */
428 		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
429 		/* r2 += -4 */
430 		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
431 		/* r1 = xskmap[] */
432 		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
433 		/* call bpf_map_lookup_elem */
434 		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
435 		/* r1 = r0 */
436 		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
437 		/* r0 = XDP_PASS */
438 		BPF_MOV64_IMM(BPF_REG_0, 2),
439 		/* if r1 == 0 goto pc+5 */
440 		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
441 		/* r2 = *(u32 *)(r10 - 4) */
442 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
443 		/* r1 = xskmap[] */
444 		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
445 		/* r3 = 0 */
446 		BPF_MOV64_IMM(BPF_REG_3, 0),
447 		/* call bpf_redirect_map */
448 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
449 		/* The jumps are to this instruction */
450 		BPF_EXIT_INSN(),
451 	};
452 
453 	/* This is the post-5.3 kernel C-program:
454 	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
455 	 * {
456 	 *     return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
457 	 * }
458 	 */
459 	struct bpf_insn prog_redirect_flags[] = {
460 		/* r2 = *(u32 *)(r1 + 16) */
461 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
462 		/* r1 = xskmap[] */
463 		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
464 		/* r3 = XDP_PASS */
465 		BPF_MOV64_IMM(BPF_REG_3, 2),
466 		/* call bpf_redirect_map */
467 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
468 		BPF_EXIT_INSN(),
469 	};
470 	size_t insns_cnt[] = {ARRAY_SIZE(prog),
471 			      ARRAY_SIZE(prog_redirect_flags),
472 	};
473 	struct bpf_insn *progs[] = {prog, prog_redirect_flags};
474 	enum xsk_prog option = get_xsk_prog();
475 	LIBBPF_OPTS(bpf_prog_load_opts, opts,
476 		.log_buf = log_buf,
477 		.log_size = log_buf_size,
478 	);
479 
480 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
481 				progs[option], insns_cnt[option], &opts);
482 	if (prog_fd < 0) {
483 		pr_warn("BPF log buffer:\n%s", log_buf);
484 		return prog_fd;
485 	}
486 
487 	ctx->prog_fd = prog_fd;
488 	return 0;
489 }
490 
491 static int xsk_create_bpf_link(struct xsk_socket *xsk)
492 {
493 	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
494 	struct xsk_ctx *ctx = xsk->ctx;
495 	__u32 prog_id = 0;
496 	int link_fd;
497 	int err;
498 
499 	err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
500 	if (err) {
501 		pr_warn("getting XDP prog id failed\n");
502 		return err;
503 	}
504 
505 	/* if there's a netlink-based XDP prog loaded on interface, bail out
506 	 * and ask user to do the removal by himself
507 	 */
508 	if (prog_id) {
509 		pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
510 		return -EINVAL;
511 	}
512 
513 	opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);
514 
515 	link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
516 	if (link_fd < 0) {
517 		pr_warn("bpf_link_create failed: %s\n", strerror(errno));
518 		return link_fd;
519 	}
520 
521 	ctx->link_fd = link_fd;
522 	return 0;
523 }
524 
525 static int xsk_get_max_queues(struct xsk_socket *xsk)
526 {
527 	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
528 	struct xsk_ctx *ctx = xsk->ctx;
529 	struct ifreq ifr = {};
530 	int fd, err, ret;
531 
532 	fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
533 	if (fd < 0)
534 		return -errno;
535 
536 	ifr.ifr_data = (void *)&channels;
537 	bpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ);
538 	err = ioctl(fd, SIOCETHTOOL, &ifr);
539 	if (err && errno != EOPNOTSUPP) {
540 		ret = -errno;
541 		goto out;
542 	}
543 
544 	if (err) {
545 		/* If the device says it has no channels, then all traffic
546 		 * is sent to a single stream, so max queues = 1.
547 		 */
548 		ret = 1;
549 	} else {
550 		/* Take the max of rx, tx, combined. Drivers return
551 		 * the number of channels in different ways.
552 		 */
553 		ret = max(channels.max_rx, channels.max_tx);
554 		ret = max(ret, (int)channels.max_combined);
555 	}
556 
557 out:
558 	close(fd);
559 	return ret;
560 }
561 
562 static int xsk_create_bpf_maps(struct xsk_socket *xsk)
563 {
564 	struct xsk_ctx *ctx = xsk->ctx;
565 	int max_queues;
566 	int fd;
567 
568 	max_queues = xsk_get_max_queues(xsk);
569 	if (max_queues < 0)
570 		return max_queues;
571 
572 	fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map",
573 			    sizeof(int), sizeof(int), max_queues, NULL);
574 	if (fd < 0)
575 		return fd;
576 
577 	ctx->xsks_map_fd = fd;
578 
579 	return 0;
580 }
581 
582 static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
583 {
584 	struct xsk_ctx *ctx = xsk->ctx;
585 
586 	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
587 	close(ctx->xsks_map_fd);
588 }
589 
590 static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
591 {
592 	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
593 	__u32 map_len = sizeof(struct bpf_map_info);
594 	struct bpf_prog_info prog_info = {};
595 	struct xsk_ctx *ctx = xsk->ctx;
596 	struct bpf_map_info map_info;
597 	int fd, err;
598 
599 	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
600 	if (err)
601 		return err;
602 
603 	num_maps = prog_info.nr_map_ids;
604 
605 	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
606 	if (!map_ids)
607 		return -ENOMEM;
608 
609 	memset(&prog_info, 0, prog_len);
610 	prog_info.nr_map_ids = num_maps;
611 	prog_info.map_ids = (__u64)(unsigned long)map_ids;
612 
613 	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
614 	if (err)
615 		goto out_map_ids;
616 
617 	ctx->xsks_map_fd = -1;
618 
619 	for (i = 0; i < prog_info.nr_map_ids; i++) {
620 		fd = bpf_map_get_fd_by_id(map_ids[i]);
621 		if (fd < 0)
622 			continue;
623 
624 		memset(&map_info, 0, map_len);
625 		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
626 		if (err) {
627 			close(fd);
628 			continue;
629 		}
630 
631 		if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
632 			ctx->xsks_map_fd = fd;
633 			break;
634 		}
635 
636 		close(fd);
637 	}
638 
639 	if (ctx->xsks_map_fd == -1)
640 		err = -ENOENT;
641 
642 out_map_ids:
643 	free(map_ids);
644 	return err;
645 }
646 
647 static int xsk_set_bpf_maps(struct xsk_socket *xsk)
648 {
649 	struct xsk_ctx *ctx = xsk->ctx;
650 
651 	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
652 				   &xsk->fd, 0);
653 }
654 
655 static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
656 {
657 	struct bpf_link_info link_info;
658 	__u32 link_len;
659 	__u32 id = 0;
660 	int err;
661 	int fd;
662 
663 	while (true) {
664 		err = bpf_link_get_next_id(id, &id);
665 		if (err) {
666 			if (errno == ENOENT) {
667 				err = 0;
668 				break;
669 			}
670 			pr_warn("can't get next link: %s\n", strerror(errno));
671 			break;
672 		}
673 
674 		fd = bpf_link_get_fd_by_id(id);
675 		if (fd < 0) {
676 			if (errno == ENOENT)
677 				continue;
678 			pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
679 			err = -errno;
680 			break;
681 		}
682 
683 		link_len = sizeof(struct bpf_link_info);
684 		memset(&link_info, 0, link_len);
685 		err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
686 		if (err) {
687 			pr_warn("can't get link info: %s\n", strerror(errno));
688 			close(fd);
689 			break;
690 		}
691 		if (link_info.type == BPF_LINK_TYPE_XDP) {
692 			if (link_info.xdp.ifindex == ifindex) {
693 				*link_fd = fd;
694 				if (prog_id)
695 					*prog_id = link_info.prog_id;
696 				break;
697 			}
698 		}
699 		close(fd);
700 	}
701 
702 	return err;
703 }
704 
705 static bool xsk_probe_bpf_link(void)
706 {
707 	LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE);
708 	struct bpf_insn insns[2] = {
709 		BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
710 		BPF_EXIT_INSN()
711 	};
712 	int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns);
713 	int ifindex_lo = 1;
714 	bool ret = false;
715 	int err;
716 
717 	err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
718 	if (err)
719 		return ret;
720 
721 	if (link_fd >= 0)
722 		return true;
723 
724 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
725 	if (prog_fd < 0)
726 		return ret;
727 
728 	link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
729 	close(prog_fd);
730 
731 	if (link_fd >= 0) {
732 		ret = true;
733 		close(link_fd);
734 	}
735 
736 	return ret;
737 }
738 
739 static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
740 {
741 	char ifname[IFNAMSIZ];
742 	struct xsk_ctx *ctx;
743 	char *interface;
744 
745 	ctx = calloc(1, sizeof(*ctx));
746 	if (!ctx)
747 		return -ENOMEM;
748 
749 	interface = if_indextoname(ifindex, &ifname[0]);
750 	if (!interface) {
751 		free(ctx);
752 		return -errno;
753 	}
754 
755 	ctx->ifindex = ifindex;
756 	bpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
757 
758 	xsk->ctx = ctx;
759 	xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
760 
761 	return 0;
762 }
763 
764 static int xsk_init_xdp_res(struct xsk_socket *xsk,
765 			    int *xsks_map_fd)
766 {
767 	struct xsk_ctx *ctx = xsk->ctx;
768 	int err;
769 
770 	err = xsk_create_bpf_maps(xsk);
771 	if (err)
772 		return err;
773 
774 	err = xsk_load_xdp_prog(xsk);
775 	if (err)
776 		goto err_load_xdp_prog;
777 
778 	if (ctx->has_bpf_link)
779 		err = xsk_create_bpf_link(xsk);
780 	else
781 		err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd,
782 				     xsk->config.xdp_flags, NULL);
783 
784 	if (err)
785 		goto err_attach_xdp_prog;
786 
787 	if (!xsk->rx)
788 		return err;
789 
790 	err = xsk_set_bpf_maps(xsk);
791 	if (err)
792 		goto err_set_bpf_maps;
793 
794 	return err;
795 
796 err_set_bpf_maps:
797 	if (ctx->has_bpf_link)
798 		close(ctx->link_fd);
799 	else
800 		bpf_xdp_detach(ctx->ifindex, 0, NULL);
801 err_attach_xdp_prog:
802 	close(ctx->prog_fd);
803 err_load_xdp_prog:
804 	xsk_delete_bpf_maps(xsk);
805 	return err;
806 }
807 
808 static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
809 {
810 	struct xsk_ctx *ctx = xsk->ctx;
811 	int err;
812 
813 	ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
814 	if (ctx->prog_fd < 0) {
815 		err = -errno;
816 		goto err_prog_fd;
817 	}
818 	err = xsk_lookup_bpf_maps(xsk);
819 	if (err)
820 		goto err_lookup_maps;
821 
822 	if (!xsk->rx)
823 		return err;
824 
825 	err = xsk_set_bpf_maps(xsk);
826 	if (err)
827 		goto err_set_maps;
828 
829 	return err;
830 
831 err_set_maps:
832 	close(ctx->xsks_map_fd);
833 err_lookup_maps:
834 	close(ctx->prog_fd);
835 err_prog_fd:
836 	if (ctx->has_bpf_link)
837 		close(ctx->link_fd);
838 	return err;
839 }
840 
841 static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
842 {
843 	struct xsk_socket *xsk = _xdp;
844 	struct xsk_ctx *ctx = xsk->ctx;
845 	__u32 prog_id = 0;
846 	int err;
847 
848 	if (ctx->has_bpf_link)
849 		err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
850 	else
851 		err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
852 
853 	if (err)
854 		return err;
855 
856 	err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
857 			 xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);
858 
859 	if (!err && xsks_map_fd)
860 		*xsks_map_fd = ctx->xsks_map_fd;
861 
862 	return err;
863 }
864 
/*
 * Public entry point: set up the XDP program and xsks map for an
 * existing socket. See __xsk_setup_xdp_prog() for the semantics of
 * @xsks_map_fd and the return value.
 */
int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd)
{
	int ret = __xsk_setup_xdp_prog(xsk, xsks_map_fd);

	return ret;
}
869 
870 static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
871 				   __u32 queue_id)
872 {
873 	struct xsk_ctx *ctx;
874 
875 	if (list_empty(&umem->ctx_list))
876 		return NULL;
877 
878 	list_for_each_entry(ctx, &umem->ctx_list, list) {
879 		if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
880 			ctx->refcount++;
881 			return ctx;
882 		}
883 	}
884 
885 	return NULL;
886 }
887 
888 static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
889 {
890 	struct xsk_umem *umem = ctx->umem;
891 	struct xdp_mmap_offsets off;
892 	int err;
893 
894 	if (--ctx->refcount)
895 		return;
896 
897 	if (!unmap)
898 		goto out_free;
899 
900 	err = xsk_get_mmap_offsets(umem->fd, &off);
901 	if (err)
902 		goto out_free;
903 
904 	munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
905 	       sizeof(__u64));
906 	munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
907 	       sizeof(__u64));
908 
909 out_free:
910 	list_del(&ctx->list);
911 	free(ctx);
912 }
913 
914 static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
915 				      struct xsk_umem *umem, int ifindex,
916 				      const char *ifname, __u32 queue_id,
917 				      struct xsk_ring_prod *fill,
918 				      struct xsk_ring_cons *comp)
919 {
920 	struct xsk_ctx *ctx;
921 	int err;
922 
923 	ctx = calloc(1, sizeof(*ctx));
924 	if (!ctx)
925 		return NULL;
926 
927 	if (!umem->fill_save) {
928 		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
929 		if (err) {
930 			free(ctx);
931 			return NULL;
932 		}
933 	} else if (umem->fill_save != fill || umem->comp_save != comp) {
934 		/* Copy over rings to new structs. */
935 		memcpy(fill, umem->fill_save, sizeof(*fill));
936 		memcpy(comp, umem->comp_save, sizeof(*comp));
937 	}
938 
939 	ctx->ifindex = ifindex;
940 	ctx->refcount = 1;
941 	ctx->umem = umem;
942 	ctx->queue_id = queue_id;
943 	bpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
944 
945 	ctx->fill = fill;
946 	ctx->comp = comp;
947 	list_add(&ctx->list, &umem->ctx_list);
948 	ctx->has_bpf_link = xsk_probe_bpf_link();
949 	return ctx;
950 }
951 
952 static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
953 {
954 	free(xsk->ctx);
955 	free(xsk);
956 }
957 
958 int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
959 {
960 	xsk->ctx->xsks_map_fd = fd;
961 	return xsk_set_bpf_maps(xsk);
962 }
963 
964 int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
965 {
966 	struct xsk_socket *xsk;
967 	int res;
968 
969 	xsk = calloc(1, sizeof(*xsk));
970 	if (!xsk)
971 		return -ENOMEM;
972 
973 	res = xsk_create_xsk_struct(ifindex, xsk);
974 	if (res) {
975 		free(xsk);
976 		return -EINVAL;
977 	}
978 
979 	res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);
980 
981 	xsk_destroy_xsk_struct(xsk);
982 
983 	return res;
984 }
985 
986 int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
987 			      const char *ifname,
988 			      __u32 queue_id, struct xsk_umem *umem,
989 			      struct xsk_ring_cons *rx,
990 			      struct xsk_ring_prod *tx,
991 			      struct xsk_ring_prod *fill,
992 			      struct xsk_ring_cons *comp,
993 			      const struct xsk_socket_config *usr_config)
994 {
995 	bool unmap, rx_setup_done = false, tx_setup_done = false;
996 	void *rx_map = NULL, *tx_map = NULL;
997 	struct sockaddr_xdp sxdp = {};
998 	struct xdp_mmap_offsets off;
999 	struct xsk_socket *xsk;
1000 	struct xsk_ctx *ctx;
1001 	int err, ifindex;
1002 
1003 	if (!umem || !xsk_ptr || !(rx || tx))
1004 		return -EFAULT;
1005 
1006 	unmap = umem->fill_save != fill;
1007 
1008 	xsk = calloc(1, sizeof(*xsk));
1009 	if (!xsk)
1010 		return -ENOMEM;
1011 
1012 	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
1013 	if (err)
1014 		goto out_xsk_alloc;
1015 
1016 	xsk->outstanding_tx = 0;
1017 	ifindex = if_nametoindex(ifname);
1018 	if (!ifindex) {
1019 		err = -errno;
1020 		goto out_xsk_alloc;
1021 	}
1022 
1023 	if (umem->refcount++ > 0) {
1024 		xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
1025 		if (xsk->fd < 0) {
1026 			err = -errno;
1027 			goto out_xsk_alloc;
1028 		}
1029 	} else {
1030 		xsk->fd = umem->fd;
1031 		rx_setup_done = umem->rx_ring_setup_done;
1032 		tx_setup_done = umem->tx_ring_setup_done;
1033 	}
1034 
1035 	ctx = xsk_get_ctx(umem, ifindex, queue_id);
1036 	if (!ctx) {
1037 		if (!fill || !comp) {
1038 			err = -EFAULT;
1039 			goto out_socket;
1040 		}
1041 
1042 		ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
1043 				     fill, comp);
1044 		if (!ctx) {
1045 			err = -ENOMEM;
1046 			goto out_socket;
1047 		}
1048 	}
1049 	xsk->ctx = ctx;
1050 
1051 	if (rx && !rx_setup_done) {
1052 		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
1053 				 &xsk->config.rx_size,
1054 				 sizeof(xsk->config.rx_size));
1055 		if (err) {
1056 			err = -errno;
1057 			goto out_put_ctx;
1058 		}
1059 		if (xsk->fd == umem->fd)
1060 			umem->rx_ring_setup_done = true;
1061 	}
1062 	if (tx && !tx_setup_done) {
1063 		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
1064 				 &xsk->config.tx_size,
1065 				 sizeof(xsk->config.tx_size));
1066 		if (err) {
1067 			err = -errno;
1068 			goto out_put_ctx;
1069 		}
1070 		if (xsk->fd == umem->fd)
1071 			umem->tx_ring_setup_done = true;
1072 	}
1073 
1074 	err = xsk_get_mmap_offsets(xsk->fd, &off);
1075 	if (err) {
1076 		err = -errno;
1077 		goto out_put_ctx;
1078 	}
1079 
1080 	if (rx) {
1081 		rx_map = mmap(NULL, off.rx.desc +
1082 			      xsk->config.rx_size * sizeof(struct xdp_desc),
1083 			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
1084 			      xsk->fd, XDP_PGOFF_RX_RING);
1085 		if (rx_map == MAP_FAILED) {
1086 			err = -errno;
1087 			goto out_put_ctx;
1088 		}
1089 
1090 		rx->mask = xsk->config.rx_size - 1;
1091 		rx->size = xsk->config.rx_size;
1092 		rx->producer = rx_map + off.rx.producer;
1093 		rx->consumer = rx_map + off.rx.consumer;
1094 		rx->flags = rx_map + off.rx.flags;
1095 		rx->ring = rx_map + off.rx.desc;
1096 		rx->cached_prod = *rx->producer;
1097 		rx->cached_cons = *rx->consumer;
1098 	}
1099 	xsk->rx = rx;
1100 
1101 	if (tx) {
1102 		tx_map = mmap(NULL, off.tx.desc +
1103 			      xsk->config.tx_size * sizeof(struct xdp_desc),
1104 			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
1105 			      xsk->fd, XDP_PGOFF_TX_RING);
1106 		if (tx_map == MAP_FAILED) {
1107 			err = -errno;
1108 			goto out_mmap_rx;
1109 		}
1110 
1111 		tx->mask = xsk->config.tx_size - 1;
1112 		tx->size = xsk->config.tx_size;
1113 		tx->producer = tx_map + off.tx.producer;
1114 		tx->consumer = tx_map + off.tx.consumer;
1115 		tx->flags = tx_map + off.tx.flags;
1116 		tx->ring = tx_map + off.tx.desc;
1117 		tx->cached_prod = *tx->producer;
1118 		/* cached_cons is r->size bigger than the real consumer pointer
1119 		 * See xsk_prod_nb_free
1120 		 */
1121 		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
1122 	}
1123 	xsk->tx = tx;
1124 
1125 	sxdp.sxdp_family = PF_XDP;
1126 	sxdp.sxdp_ifindex = ctx->ifindex;
1127 	sxdp.sxdp_queue_id = ctx->queue_id;
1128 	if (umem->refcount > 1) {
1129 		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
1130 		sxdp.sxdp_shared_umem_fd = umem->fd;
1131 	} else {
1132 		sxdp.sxdp_flags = xsk->config.bind_flags;
1133 	}
1134 
1135 	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
1136 	if (err) {
1137 		err = -errno;
1138 		goto out_mmap_tx;
1139 	}
1140 
1141 	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
1142 		err = __xsk_setup_xdp_prog(xsk, NULL);
1143 		if (err)
1144 			goto out_mmap_tx;
1145 	}
1146 
1147 	*xsk_ptr = xsk;
1148 	umem->fill_save = NULL;
1149 	umem->comp_save = NULL;
1150 	return 0;
1151 
1152 out_mmap_tx:
1153 	if (tx)
1154 		munmap(tx_map, off.tx.desc +
1155 		       xsk->config.tx_size * sizeof(struct xdp_desc));
1156 out_mmap_rx:
1157 	if (rx)
1158 		munmap(rx_map, off.rx.desc +
1159 		       xsk->config.rx_size * sizeof(struct xdp_desc));
1160 out_put_ctx:
1161 	xsk_put_ctx(ctx, unmap);
1162 out_socket:
1163 	if (--umem->refcount)
1164 		close(xsk->fd);
1165 out_xsk_alloc:
1166 	free(xsk);
1167 	return err;
1168 }
1169 
1170 int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
1171 		       __u32 queue_id, struct xsk_umem *umem,
1172 		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
1173 		       const struct xsk_socket_config *usr_config)
1174 {
1175 	if (!umem)
1176 		return -EFAULT;
1177 
1178 	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
1179 					 rx, tx, umem->fill_save,
1180 					 umem->comp_save, usr_config);
1181 }
1182 
1183 int xsk_umem__delete(struct xsk_umem *umem)
1184 {
1185 	struct xdp_mmap_offsets off;
1186 	int err;
1187 
1188 	if (!umem)
1189 		return 0;
1190 
1191 	if (umem->refcount)
1192 		return -EBUSY;
1193 
1194 	err = xsk_get_mmap_offsets(umem->fd, &off);
1195 	if (!err && umem->fill_save && umem->comp_save) {
1196 		munmap(umem->fill_save->ring - off.fr.desc,
1197 		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
1198 		munmap(umem->comp_save->ring - off.cr.desc,
1199 		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
1200 	}
1201 
1202 	close(umem->fd);
1203 	free(umem);
1204 
1205 	return 0;
1206 }
1207 
1208 void xsk_socket__delete(struct xsk_socket *xsk)
1209 {
1210 	size_t desc_sz = sizeof(struct xdp_desc);
1211 	struct xdp_mmap_offsets off;
1212 	struct xsk_umem *umem;
1213 	struct xsk_ctx *ctx;
1214 	int err;
1215 
1216 	if (!xsk)
1217 		return;
1218 
1219 	ctx = xsk->ctx;
1220 	umem = ctx->umem;
1221 
1222 	if (ctx->refcount == 1) {
1223 		xsk_delete_bpf_maps(xsk);
1224 		close(ctx->prog_fd);
1225 		if (ctx->has_bpf_link)
1226 			close(ctx->link_fd);
1227 	}
1228 
1229 	xsk_put_ctx(ctx, true);
1230 
1231 	err = xsk_get_mmap_offsets(xsk->fd, &off);
1232 	if (!err) {
1233 		if (xsk->rx) {
1234 			munmap(xsk->rx->ring - off.rx.desc,
1235 			       off.rx.desc + xsk->config.rx_size * desc_sz);
1236 		}
1237 		if (xsk->tx) {
1238 			munmap(xsk->tx->ring - off.tx.desc,
1239 			       off.tx.desc + xsk->config.tx_size * desc_sz);
1240 		}
1241 	}
1242 
1243 	umem->refcount--;
1244 	/* Do not close an fd that also has an associated umem connected
1245 	 * to it.
1246 	 */
1247 	if (xsk->fd != umem->fd)
1248 		close(xsk->fd);
1249 	free(xsk);
1250 }
1251