xref: /openbmc/linux/tools/testing/selftests/bpf/xsk.c (revision 36926a7d)
1 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2 
3 /*
4  * AF_XDP user-space access library.
5  *
6  * Copyright(c) 2018 - 2019 Intel Corporation.
7  *
8  * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
9  */
10 
11 #include <errno.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <arpa/inet.h>
16 #include <asm/barrier.h>
17 #include <linux/compiler.h>
18 #include <linux/ethtool.h>
19 #include <linux/filter.h>
20 #include <linux/if_ether.h>
21 #include <linux/if_packet.h>
22 #include <linux/if_xdp.h>
23 #include <linux/kernel.h>
24 #include <linux/list.h>
25 #include <linux/sockios.h>
26 #include <net/if.h>
27 #include <sys/ioctl.h>
28 #include <sys/mman.h>
29 #include <sys/socket.h>
30 #include <sys/types.h>
31 #include <linux/if_link.h>
32 
33 #include <bpf/bpf.h>
34 #include <bpf/libbpf.h>
35 #include "xsk.h"
36 
/* Provide SOL_XDP when the included headers do not define it. */
#ifndef SOL_XDP
 #define SOL_XDP 283
#endif

/* Provide AF_XDP when the included headers do not define it. */
#ifndef AF_XDP
 #define AF_XDP 44
#endif

/* Make PF_XDP available as an alias of AF_XDP when it is missing. */
#ifndef PF_XDP
 #define PF_XDP AF_XDP
#endif

/* printf-style diagnostic that is written to stderr. */
#define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
50 
/* Identifies which built-in XDP program xsk_load_xdp_prog() loads.
 * The values are used as indices into that function's program tables,
 * so their numeric order matters.
 */
enum xsk_prog {
	/* Program that falls back to a map lookup followed by a redirect. */
	XSK_PROG_FALLBACK,
	/* Program that relies on bpf_redirect_map() taking a default action. */
	XSK_PROG_REDIRECT_FLAGS,
};
55 
/* Book-keeping for one registered UMEM area. */
struct xsk_umem {
	/* Fill/completion rings handed to xsk_umem__create(); both are reset
	 * to NULL once a socket has been created successfully on this umem.
	 */
	struct xsk_ring_prod *fill_save;
	struct xsk_ring_cons *comp_save;
	/* Caller-provided memory area registered with XDP_UMEM_REG. */
	char *umem_area;
	/* Effective configuration (defaults or a copy of the user's). */
	struct xsk_umem_config config;
	/* AF_XDP socket fd the umem was registered on. */
	int fd;
	/* Number of sockets currently using this umem. */
	int refcount;
	/* List of struct xsk_ctx entries (linked through xsk_ctx.list). */
	struct list_head ctx_list;
	/* Set once the RX/TX ring has been configured on the umem's own fd. */
	bool rx_ring_setup_done;
	bool tx_ring_setup_done;
};
67 
/* State shared by all sockets bound to the same (umem, ifindex, queue_id). */
struct xsk_ctx {
	/* Fill and completion rings used by this context. */
	struct xsk_ring_prod *fill;
	struct xsk_ring_cons *comp;
	/* Queue of the interface this context is bound to. */
	__u32 queue_id;
	/* Owning umem; NULL for the temporary ctx built by
	 * xsk_create_xsk_struct().
	 */
	struct xsk_umem *umem;
	/* Number of sockets referencing this context. */
	int refcount;
	/* Interface index the context belongs to. */
	int ifindex;
	/* Linkage into xsk_umem.ctx_list. */
	struct list_head list;
	/* fd of the XDP program (loaded here or looked up by id). */
	int prog_fd;
	/* fd of the bpf_link attaching the program; used only when
	 * has_bpf_link is true.
	 */
	int link_fd;
	/* fd of the XSKMAP named "xsks_map". */
	int xsks_map_fd;
	/* Interface name matching ifindex. */
	char ifname[IFNAMSIZ];
	/* Result of xsk_probe_bpf_link(): attach through bpf_link when true. */
	bool has_bpf_link;
};
82 
/* One AF_XDP socket as handed back by xsk_socket__create{,_shared}(). */
struct xsk_socket {
	/* Caller-owned RX/TX rings; either may be NULL when not requested. */
	struct xsk_ring_cons *rx;
	struct xsk_ring_prod *tx;
	/* Initialised to 0 at creation; not otherwise touched in this file. */
	__u64 outstanding_tx;
	/* Context shared with other sockets on the same umem/ifindex/queue. */
	struct xsk_ctx *ctx;
	/* Effective configuration (defaults or a copy of the user's). */
	struct xsk_socket_config config;
	/* Socket fd; equals the umem's fd for the first socket on a umem. */
	int fd;
};
91 
/* NOTE(review): this struct is not referenced anywhere in the visible part
 * of this file; presumably a leftover from netlink-based program lookup —
 * confirm before relying on it.
 */
struct xsk_nl_info {
	bool xdp_prog_attached;
	int ifindex;
	int fd;
};
97 
/* Per-ring offsets as reported by kernels up to and including Linux 5.3.
 * Unlike the current layout there is no flags field; see
 * xsk_mmap_offsets_v1() for the conversion.
 */
struct xdp_ring_offset_v1 {
	__u64 producer;
	__u64 consumer;
	__u64 desc;
};
104 
/* XDP_MMAP_OFFSETS result layout used by kernels up to and including
 * Linux 5.3: one flag-less offset record per ring.
 */
struct xdp_mmap_offsets_v1 {
	struct xdp_ring_offset_v1 rx;	/* RX ring */
	struct xdp_ring_offset_v1 tx;	/* TX ring */
	struct xdp_ring_offset_v1 fr;	/* fill ring */
	struct xdp_ring_offset_v1 cr;	/* completion ring */
};
112 
113 int xsk_umem__fd(const struct xsk_umem *umem)
114 {
115 	return umem ? umem->fd : -EINVAL;
116 }
117 
118 int xsk_socket__fd(const struct xsk_socket *xsk)
119 {
120 	return xsk ? xsk->fd : -EINVAL;
121 }
122 
/* Tell whether @buffer starts on a page boundary (per getpagesize()). */
static bool xsk_page_aligned(void *buffer)
{
	unsigned long page_mask = getpagesize() - 1;

	return ((unsigned long)buffer & page_mask) == 0;
}
129 
130 static void xsk_set_umem_config(struct xsk_umem_config *cfg,
131 				const struct xsk_umem_config *usr_cfg)
132 {
133 	if (!usr_cfg) {
134 		cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
135 		cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
136 		cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
137 		cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
138 		cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
139 		return;
140 	}
141 
142 	cfg->fill_size = usr_cfg->fill_size;
143 	cfg->comp_size = usr_cfg->comp_size;
144 	cfg->frame_size = usr_cfg->frame_size;
145 	cfg->frame_headroom = usr_cfg->frame_headroom;
146 	cfg->flags = usr_cfg->flags;
147 }
148 
149 static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
150 				     const struct xsk_socket_config *usr_cfg)
151 {
152 	if (!usr_cfg) {
153 		cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
154 		cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
155 		cfg->libbpf_flags = 0;
156 		cfg->xdp_flags = 0;
157 		cfg->bind_flags = 0;
158 		return 0;
159 	}
160 
161 	if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
162 		return -EINVAL;
163 
164 	cfg->rx_size = usr_cfg->rx_size;
165 	cfg->tx_size = usr_cfg->tx_size;
166 	cfg->libbpf_flags = usr_cfg->libbpf_flags;
167 	cfg->xdp_flags = usr_cfg->xdp_flags;
168 	cfg->bind_flags = usr_cfg->bind_flags;
169 
170 	return 0;
171 }
172 
173 static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
174 {
175 	struct xdp_mmap_offsets_v1 off_v1;
176 
177 	/* getsockopt on a kernel <= 5.3 has no flags fields.
178 	 * Copy over the offsets to the correct places in the >=5.4 format
179 	 * and put the flags where they would have been on that kernel.
180 	 */
181 	memcpy(&off_v1, off, sizeof(off_v1));
182 
183 	off->rx.producer = off_v1.rx.producer;
184 	off->rx.consumer = off_v1.rx.consumer;
185 	off->rx.desc = off_v1.rx.desc;
186 	off->rx.flags = off_v1.rx.consumer + sizeof(__u32);
187 
188 	off->tx.producer = off_v1.tx.producer;
189 	off->tx.consumer = off_v1.tx.consumer;
190 	off->tx.desc = off_v1.tx.desc;
191 	off->tx.flags = off_v1.tx.consumer + sizeof(__u32);
192 
193 	off->fr.producer = off_v1.fr.producer;
194 	off->fr.consumer = off_v1.fr.consumer;
195 	off->fr.desc = off_v1.fr.desc;
196 	off->fr.flags = off_v1.fr.consumer + sizeof(__u32);
197 
198 	off->cr.producer = off_v1.cr.producer;
199 	off->cr.consumer = off_v1.cr.consumer;
200 	off->cr.desc = off_v1.cr.desc;
201 	off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
202 }
203 
204 static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
205 {
206 	socklen_t optlen;
207 	int err;
208 
209 	optlen = sizeof(*off);
210 	err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
211 	if (err)
212 		return err;
213 
214 	if (optlen == sizeof(*off))
215 		return 0;
216 
217 	if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
218 		xsk_mmap_offsets_v1(off);
219 		return 0;
220 	}
221 
222 	return -EINVAL;
223 }
224 
225 static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
226 				 struct xsk_ring_prod *fill,
227 				 struct xsk_ring_cons *comp)
228 {
229 	struct xdp_mmap_offsets off;
230 	void *map;
231 	int err;
232 
233 	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
234 			 &umem->config.fill_size,
235 			 sizeof(umem->config.fill_size));
236 	if (err)
237 		return -errno;
238 
239 	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
240 			 &umem->config.comp_size,
241 			 sizeof(umem->config.comp_size));
242 	if (err)
243 		return -errno;
244 
245 	err = xsk_get_mmap_offsets(fd, &off);
246 	if (err)
247 		return -errno;
248 
249 	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
250 		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
251 		   XDP_UMEM_PGOFF_FILL_RING);
252 	if (map == MAP_FAILED)
253 		return -errno;
254 
255 	fill->mask = umem->config.fill_size - 1;
256 	fill->size = umem->config.fill_size;
257 	fill->producer = map + off.fr.producer;
258 	fill->consumer = map + off.fr.consumer;
259 	fill->flags = map + off.fr.flags;
260 	fill->ring = map + off.fr.desc;
261 	fill->cached_cons = umem->config.fill_size;
262 
263 	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
264 		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
265 		   XDP_UMEM_PGOFF_COMPLETION_RING);
266 	if (map == MAP_FAILED) {
267 		err = -errno;
268 		goto out_mmap;
269 	}
270 
271 	comp->mask = umem->config.comp_size - 1;
272 	comp->size = umem->config.comp_size;
273 	comp->producer = map + off.cr.producer;
274 	comp->consumer = map + off.cr.consumer;
275 	comp->flags = map + off.cr.flags;
276 	comp->ring = map + off.cr.desc;
277 
278 	return 0;
279 
280 out_mmap:
281 	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
282 	return err;
283 }
284 
285 int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
286 		     __u64 size, struct xsk_ring_prod *fill,
287 		     struct xsk_ring_cons *comp,
288 		     const struct xsk_umem_config *usr_config)
289 {
290 	struct xdp_umem_reg mr;
291 	struct xsk_umem *umem;
292 	int err;
293 
294 	if (!umem_area || !umem_ptr || !fill || !comp)
295 		return -EFAULT;
296 	if (!size && !xsk_page_aligned(umem_area))
297 		return -EINVAL;
298 
299 	umem = calloc(1, sizeof(*umem));
300 	if (!umem)
301 		return -ENOMEM;
302 
303 	umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
304 	if (umem->fd < 0) {
305 		err = -errno;
306 		goto out_umem_alloc;
307 	}
308 
309 	umem->umem_area = umem_area;
310 	INIT_LIST_HEAD(&umem->ctx_list);
311 	xsk_set_umem_config(&umem->config, usr_config);
312 
313 	memset(&mr, 0, sizeof(mr));
314 	mr.addr = (uintptr_t)umem_area;
315 	mr.len = size;
316 	mr.chunk_size = umem->config.frame_size;
317 	mr.headroom = umem->config.frame_headroom;
318 	mr.flags = umem->config.flags;
319 
320 	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
321 	if (err) {
322 		err = -errno;
323 		goto out_socket;
324 	}
325 
326 	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
327 	if (err)
328 		goto out_socket;
329 
330 	umem->fill_save = fill;
331 	umem->comp_save = comp;
332 	*umem_ptr = umem;
333 	return 0;
334 
335 out_socket:
336 	close(umem->fd);
337 out_umem_alloc:
338 	free(umem);
339 	return err;
340 }
341 
/* Umem configuration without a flags member: the same fields that
 * xsk_set_umem_config() copies, minus flags.
 * NOTE(review): not referenced in the visible part of this file —
 * presumably kept for an older configuration layout; confirm before removing.
 */
struct xsk_umem_config_v1 {
	__u32 fill_size;
	__u32 comp_size;
	__u32 frame_size;
	__u32 frame_headroom;
};
348 
349 static enum xsk_prog get_xsk_prog(void)
350 {
351 	enum xsk_prog detected = XSK_PROG_FALLBACK;
352 	char data_in = 0, data_out;
353 	struct bpf_insn insns[] = {
354 		BPF_LD_MAP_FD(BPF_REG_1, 0),
355 		BPF_MOV64_IMM(BPF_REG_2, 0),
356 		BPF_MOV64_IMM(BPF_REG_3, XDP_PASS),
357 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
358 		BPF_EXIT_INSN(),
359 	};
360 	LIBBPF_OPTS(bpf_test_run_opts, opts,
361 		.data_in = &data_in,
362 		.data_size_in = 1,
363 		.data_out = &data_out,
364 	);
365 
366 	int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);
367 
368 	map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
369 	if (map_fd < 0)
370 		return detected;
371 
372 	insns[0].imm = map_fd;
373 
374 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
375 	if (prog_fd < 0) {
376 		close(map_fd);
377 		return detected;
378 	}
379 
380 	ret = bpf_prog_test_run_opts(prog_fd, &opts);
381 	if (!ret && opts.retval == XDP_PASS)
382 		detected = XSK_PROG_REDIRECT_FLAGS;
383 	close(prog_fd);
384 	close(map_fd);
385 	return detected;
386 }
387 
388 static int xsk_load_xdp_prog(struct xsk_socket *xsk)
389 {
390 	static const int log_buf_size = 16 * 1024;
391 	struct xsk_ctx *ctx = xsk->ctx;
392 	char log_buf[log_buf_size];
393 	int prog_fd;
394 
395 	/* This is the fallback C-program:
396 	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
397 	 * {
398 	 *     int ret, index = ctx->rx_queue_index;
399 	 *
400 	 *     // A set entry here means that the correspnding queue_id
401 	 *     // has an active AF_XDP socket bound to it.
402 	 *     ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
403 	 *     if (ret > 0)
404 	 *         return ret;
405 	 *
406 	 *     // Fallback for pre-5.3 kernels, not supporting default
407 	 *     // action in the flags parameter.
408 	 *     if (bpf_map_lookup_elem(&xsks_map, &index))
409 	 *         return bpf_redirect_map(&xsks_map, index, 0);
410 	 *     return XDP_PASS;
411 	 * }
412 	 */
413 	struct bpf_insn prog[] = {
414 		/* r2 = *(u32 *)(r1 + 16) */
415 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
416 		/* *(u32 *)(r10 - 4) = r2 */
417 		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
418 		/* r1 = xskmap[] */
419 		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
420 		/* r3 = XDP_PASS */
421 		BPF_MOV64_IMM(BPF_REG_3, 2),
422 		/* call bpf_redirect_map */
423 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
424 		/* if w0 != 0 goto pc+13 */
425 		BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
426 		/* r2 = r10 */
427 		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
428 		/* r2 += -4 */
429 		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
430 		/* r1 = xskmap[] */
431 		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
432 		/* call bpf_map_lookup_elem */
433 		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
434 		/* r1 = r0 */
435 		BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
436 		/* r0 = XDP_PASS */
437 		BPF_MOV64_IMM(BPF_REG_0, 2),
438 		/* if r1 == 0 goto pc+5 */
439 		BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
440 		/* r2 = *(u32 *)(r10 - 4) */
441 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
442 		/* r1 = xskmap[] */
443 		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
444 		/* r3 = 0 */
445 		BPF_MOV64_IMM(BPF_REG_3, 0),
446 		/* call bpf_redirect_map */
447 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
448 		/* The jumps are to this instruction */
449 		BPF_EXIT_INSN(),
450 	};
451 
452 	/* This is the post-5.3 kernel C-program:
453 	 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
454 	 * {
455 	 *     return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
456 	 * }
457 	 */
458 	struct bpf_insn prog_redirect_flags[] = {
459 		/* r2 = *(u32 *)(r1 + 16) */
460 		BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
461 		/* r1 = xskmap[] */
462 		BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
463 		/* r3 = XDP_PASS */
464 		BPF_MOV64_IMM(BPF_REG_3, 2),
465 		/* call bpf_redirect_map */
466 		BPF_EMIT_CALL(BPF_FUNC_redirect_map),
467 		BPF_EXIT_INSN(),
468 	};
469 	size_t insns_cnt[] = {ARRAY_SIZE(prog),
470 			      ARRAY_SIZE(prog_redirect_flags),
471 	};
472 	struct bpf_insn *progs[] = {prog, prog_redirect_flags};
473 	enum xsk_prog option = get_xsk_prog();
474 	LIBBPF_OPTS(bpf_prog_load_opts, opts,
475 		.log_buf = log_buf,
476 		.log_size = log_buf_size,
477 	);
478 
479 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
480 				progs[option], insns_cnt[option], &opts);
481 	if (prog_fd < 0) {
482 		pr_warn("BPF log buffer:\n%s", log_buf);
483 		return prog_fd;
484 	}
485 
486 	ctx->prog_fd = prog_fd;
487 	return 0;
488 }
489 
490 static int xsk_create_bpf_link(struct xsk_socket *xsk)
491 {
492 	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
493 	struct xsk_ctx *ctx = xsk->ctx;
494 	__u32 prog_id = 0;
495 	int link_fd;
496 	int err;
497 
498 	err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
499 	if (err) {
500 		pr_warn("getting XDP prog id failed\n");
501 		return err;
502 	}
503 
504 	/* if there's a netlink-based XDP prog loaded on interface, bail out
505 	 * and ask user to do the removal by himself
506 	 */
507 	if (prog_id) {
508 		pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
509 		return -EINVAL;
510 	}
511 
512 	opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);
513 
514 	link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
515 	if (link_fd < 0) {
516 		pr_warn("bpf_link_create failed: %s\n", strerror(errno));
517 		return link_fd;
518 	}
519 
520 	ctx->link_fd = link_fd;
521 	return 0;
522 }
523 
/* Bounded string copy that always zero-terminates.
 *
 * At most sz - 1 bytes of the zero-terminated string @src are copied to
 * @dst, and @dst is then terminated. With sz == 0 nothing is written at
 * all. Similar in spirit to FreeBSD's strlcpy(), except that nothing is
 * returned. Use this instead of strncpy().
 */
static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz)
{
	size_t pos = 0;

	if (!sz)
		return;

	/* Keep one byte in reserve for the terminator. */
	while (pos + 1 < sz && src[pos] != '\0') {
		dst[pos] = src[pos];
		pos++;
	}

	dst[pos] = '\0';
}
542 
543 static int xsk_get_max_queues(struct xsk_socket *xsk)
544 {
545 	struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
546 	struct xsk_ctx *ctx = xsk->ctx;
547 	struct ifreq ifr = {};
548 	int fd, err, ret;
549 
550 	fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
551 	if (fd < 0)
552 		return -errno;
553 
554 	ifr.ifr_data = (void *)&channels;
555 	libbpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ);
556 	err = ioctl(fd, SIOCETHTOOL, &ifr);
557 	if (err && errno != EOPNOTSUPP) {
558 		ret = -errno;
559 		goto out;
560 	}
561 
562 	if (err) {
563 		/* If the device says it has no channels, then all traffic
564 		 * is sent to a single stream, so max queues = 1.
565 		 */
566 		ret = 1;
567 	} else {
568 		/* Take the max of rx, tx, combined. Drivers return
569 		 * the number of channels in different ways.
570 		 */
571 		ret = max(channels.max_rx, channels.max_tx);
572 		ret = max(ret, (int)channels.max_combined);
573 	}
574 
575 out:
576 	close(fd);
577 	return ret;
578 }
579 
580 static int xsk_create_bpf_maps(struct xsk_socket *xsk)
581 {
582 	struct xsk_ctx *ctx = xsk->ctx;
583 	int max_queues;
584 	int fd;
585 
586 	max_queues = xsk_get_max_queues(xsk);
587 	if (max_queues < 0)
588 		return max_queues;
589 
590 	fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map",
591 			    sizeof(int), sizeof(int), max_queues, NULL);
592 	if (fd < 0)
593 		return fd;
594 
595 	ctx->xsks_map_fd = fd;
596 
597 	return 0;
598 }
599 
600 static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
601 {
602 	struct xsk_ctx *ctx = xsk->ctx;
603 
604 	bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
605 	close(ctx->xsks_map_fd);
606 }
607 
608 static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
609 {
610 	__u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
611 	__u32 map_len = sizeof(struct bpf_map_info);
612 	struct bpf_prog_info prog_info = {};
613 	struct xsk_ctx *ctx = xsk->ctx;
614 	struct bpf_map_info map_info;
615 	int fd, err;
616 
617 	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
618 	if (err)
619 		return err;
620 
621 	num_maps = prog_info.nr_map_ids;
622 
623 	map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
624 	if (!map_ids)
625 		return -ENOMEM;
626 
627 	memset(&prog_info, 0, prog_len);
628 	prog_info.nr_map_ids = num_maps;
629 	prog_info.map_ids = (__u64)(unsigned long)map_ids;
630 
631 	err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
632 	if (err)
633 		goto out_map_ids;
634 
635 	ctx->xsks_map_fd = -1;
636 
637 	for (i = 0; i < prog_info.nr_map_ids; i++) {
638 		fd = bpf_map_get_fd_by_id(map_ids[i]);
639 		if (fd < 0)
640 			continue;
641 
642 		memset(&map_info, 0, map_len);
643 		err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
644 		if (err) {
645 			close(fd);
646 			continue;
647 		}
648 
649 		if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
650 			ctx->xsks_map_fd = fd;
651 			break;
652 		}
653 
654 		close(fd);
655 	}
656 
657 	if (ctx->xsks_map_fd == -1)
658 		err = -ENOENT;
659 
660 out_map_ids:
661 	free(map_ids);
662 	return err;
663 }
664 
665 static int xsk_set_bpf_maps(struct xsk_socket *xsk)
666 {
667 	struct xsk_ctx *ctx = xsk->ctx;
668 
669 	return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
670 				   &xsk->fd, 0);
671 }
672 
673 static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
674 {
675 	struct bpf_link_info link_info;
676 	__u32 link_len;
677 	__u32 id = 0;
678 	int err;
679 	int fd;
680 
681 	while (true) {
682 		err = bpf_link_get_next_id(id, &id);
683 		if (err) {
684 			if (errno == ENOENT) {
685 				err = 0;
686 				break;
687 			}
688 			pr_warn("can't get next link: %s\n", strerror(errno));
689 			break;
690 		}
691 
692 		fd = bpf_link_get_fd_by_id(id);
693 		if (fd < 0) {
694 			if (errno == ENOENT)
695 				continue;
696 			pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
697 			err = -errno;
698 			break;
699 		}
700 
701 		link_len = sizeof(struct bpf_link_info);
702 		memset(&link_info, 0, link_len);
703 		err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
704 		if (err) {
705 			pr_warn("can't get link info: %s\n", strerror(errno));
706 			close(fd);
707 			break;
708 		}
709 		if (link_info.type == BPF_LINK_TYPE_XDP) {
710 			if (link_info.xdp.ifindex == ifindex) {
711 				*link_fd = fd;
712 				if (prog_id)
713 					*prog_id = link_info.prog_id;
714 				break;
715 			}
716 		}
717 		close(fd);
718 	}
719 
720 	return err;
721 }
722 
723 static bool xsk_probe_bpf_link(void)
724 {
725 	LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE);
726 	struct bpf_insn insns[2] = {
727 		BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
728 		BPF_EXIT_INSN()
729 	};
730 	int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns);
731 	int ifindex_lo = 1;
732 	bool ret = false;
733 	int err;
734 
735 	err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
736 	if (err)
737 		return ret;
738 
739 	if (link_fd >= 0)
740 		return true;
741 
742 	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
743 	if (prog_fd < 0)
744 		return ret;
745 
746 	link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
747 	close(prog_fd);
748 
749 	if (link_fd >= 0) {
750 		ret = true;
751 		close(link_fd);
752 	}
753 
754 	return ret;
755 }
756 
757 static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
758 {
759 	char ifname[IFNAMSIZ];
760 	struct xsk_ctx *ctx;
761 	char *interface;
762 
763 	ctx = calloc(1, sizeof(*ctx));
764 	if (!ctx)
765 		return -ENOMEM;
766 
767 	interface = if_indextoname(ifindex, &ifname[0]);
768 	if (!interface) {
769 		free(ctx);
770 		return -errno;
771 	}
772 
773 	ctx->ifindex = ifindex;
774 	libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
775 
776 	xsk->ctx = ctx;
777 	xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
778 
779 	return 0;
780 }
781 
782 static int xsk_init_xdp_res(struct xsk_socket *xsk,
783 			    int *xsks_map_fd)
784 {
785 	struct xsk_ctx *ctx = xsk->ctx;
786 	int err;
787 
788 	err = xsk_create_bpf_maps(xsk);
789 	if (err)
790 		return err;
791 
792 	err = xsk_load_xdp_prog(xsk);
793 	if (err)
794 		goto err_load_xdp_prog;
795 
796 	if (ctx->has_bpf_link)
797 		err = xsk_create_bpf_link(xsk);
798 	else
799 		err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd,
800 				     xsk->config.xdp_flags, NULL);
801 
802 	if (err)
803 		goto err_attach_xdp_prog;
804 
805 	if (!xsk->rx)
806 		return err;
807 
808 	err = xsk_set_bpf_maps(xsk);
809 	if (err)
810 		goto err_set_bpf_maps;
811 
812 	return err;
813 
814 err_set_bpf_maps:
815 	if (ctx->has_bpf_link)
816 		close(ctx->link_fd);
817 	else
818 		bpf_xdp_detach(ctx->ifindex, 0, NULL);
819 err_attach_xdp_prog:
820 	close(ctx->prog_fd);
821 err_load_xdp_prog:
822 	xsk_delete_bpf_maps(xsk);
823 	return err;
824 }
825 
826 static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
827 {
828 	struct xsk_ctx *ctx = xsk->ctx;
829 	int err;
830 
831 	ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
832 	if (ctx->prog_fd < 0) {
833 		err = -errno;
834 		goto err_prog_fd;
835 	}
836 	err = xsk_lookup_bpf_maps(xsk);
837 	if (err)
838 		goto err_lookup_maps;
839 
840 	if (!xsk->rx)
841 		return err;
842 
843 	err = xsk_set_bpf_maps(xsk);
844 	if (err)
845 		goto err_set_maps;
846 
847 	return err;
848 
849 err_set_maps:
850 	close(ctx->xsks_map_fd);
851 err_lookup_maps:
852 	close(ctx->prog_fd);
853 err_prog_fd:
854 	if (ctx->has_bpf_link)
855 		close(ctx->link_fd);
856 	return err;
857 }
858 
859 static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
860 {
861 	struct xsk_socket *xsk = _xdp;
862 	struct xsk_ctx *ctx = xsk->ctx;
863 	__u32 prog_id = 0;
864 	int err;
865 
866 	if (ctx->has_bpf_link)
867 		err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
868 	else
869 		err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
870 
871 	if (err)
872 		return err;
873 
874 	err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
875 			 xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);
876 
877 	if (!err && xsks_map_fd)
878 		*xsks_map_fd = ctx->xsks_map_fd;
879 
880 	return err;
881 }
882 
/* Public entry point for setting up the XDP program of an existing
 * socket; forwards to __xsk_setup_xdp_prog() and returns its result.
 */
int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd)
{
	int ret = __xsk_setup_xdp_prog(xsk, xsks_map_fd);

	return ret;
}
887 
888 static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
889 				   __u32 queue_id)
890 {
891 	struct xsk_ctx *ctx;
892 
893 	if (list_empty(&umem->ctx_list))
894 		return NULL;
895 
896 	list_for_each_entry(ctx, &umem->ctx_list, list) {
897 		if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
898 			ctx->refcount++;
899 			return ctx;
900 		}
901 	}
902 
903 	return NULL;
904 }
905 
906 static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
907 {
908 	struct xsk_umem *umem = ctx->umem;
909 	struct xdp_mmap_offsets off;
910 	int err;
911 
912 	if (--ctx->refcount)
913 		return;
914 
915 	if (!unmap)
916 		goto out_free;
917 
918 	err = xsk_get_mmap_offsets(umem->fd, &off);
919 	if (err)
920 		goto out_free;
921 
922 	munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
923 	       sizeof(__u64));
924 	munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
925 	       sizeof(__u64));
926 
927 out_free:
928 	list_del(&ctx->list);
929 	free(ctx);
930 }
931 
932 static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
933 				      struct xsk_umem *umem, int ifindex,
934 				      const char *ifname, __u32 queue_id,
935 				      struct xsk_ring_prod *fill,
936 				      struct xsk_ring_cons *comp)
937 {
938 	struct xsk_ctx *ctx;
939 	int err;
940 
941 	ctx = calloc(1, sizeof(*ctx));
942 	if (!ctx)
943 		return NULL;
944 
945 	if (!umem->fill_save) {
946 		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
947 		if (err) {
948 			free(ctx);
949 			return NULL;
950 		}
951 	} else if (umem->fill_save != fill || umem->comp_save != comp) {
952 		/* Copy over rings to new structs. */
953 		memcpy(fill, umem->fill_save, sizeof(*fill));
954 		memcpy(comp, umem->comp_save, sizeof(*comp));
955 	}
956 
957 	ctx->ifindex = ifindex;
958 	ctx->refcount = 1;
959 	ctx->umem = umem;
960 	ctx->queue_id = queue_id;
961 	libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
962 
963 	ctx->fill = fill;
964 	ctx->comp = comp;
965 	list_add(&ctx->list, &umem->ctx_list);
966 	ctx->has_bpf_link = xsk_probe_bpf_link();
967 	return ctx;
968 }
969 
970 static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
971 {
972 	free(xsk->ctx);
973 	free(xsk);
974 }
975 
976 int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
977 {
978 	xsk->ctx->xsks_map_fd = fd;
979 	return xsk_set_bpf_maps(xsk);
980 }
981 
982 int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
983 {
984 	struct xsk_socket *xsk;
985 	int res;
986 
987 	xsk = calloc(1, sizeof(*xsk));
988 	if (!xsk)
989 		return -ENOMEM;
990 
991 	res = xsk_create_xsk_struct(ifindex, xsk);
992 	if (res) {
993 		free(xsk);
994 		return -EINVAL;
995 	}
996 
997 	res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);
998 
999 	xsk_destroy_xsk_struct(xsk);
1000 
1001 	return res;
1002 }
1003 
1004 int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
1005 			      const char *ifname,
1006 			      __u32 queue_id, struct xsk_umem *umem,
1007 			      struct xsk_ring_cons *rx,
1008 			      struct xsk_ring_prod *tx,
1009 			      struct xsk_ring_prod *fill,
1010 			      struct xsk_ring_cons *comp,
1011 			      const struct xsk_socket_config *usr_config)
1012 {
1013 	bool unmap, rx_setup_done = false, tx_setup_done = false;
1014 	void *rx_map = NULL, *tx_map = NULL;
1015 	struct sockaddr_xdp sxdp = {};
1016 	struct xdp_mmap_offsets off;
1017 	struct xsk_socket *xsk;
1018 	struct xsk_ctx *ctx;
1019 	int err, ifindex;
1020 
1021 	if (!umem || !xsk_ptr || !(rx || tx))
1022 		return -EFAULT;
1023 
1024 	unmap = umem->fill_save != fill;
1025 
1026 	xsk = calloc(1, sizeof(*xsk));
1027 	if (!xsk)
1028 		return -ENOMEM;
1029 
1030 	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
1031 	if (err)
1032 		goto out_xsk_alloc;
1033 
1034 	xsk->outstanding_tx = 0;
1035 	ifindex = if_nametoindex(ifname);
1036 	if (!ifindex) {
1037 		err = -errno;
1038 		goto out_xsk_alloc;
1039 	}
1040 
1041 	if (umem->refcount++ > 0) {
1042 		xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
1043 		if (xsk->fd < 0) {
1044 			err = -errno;
1045 			goto out_xsk_alloc;
1046 		}
1047 	} else {
1048 		xsk->fd = umem->fd;
1049 		rx_setup_done = umem->rx_ring_setup_done;
1050 		tx_setup_done = umem->tx_ring_setup_done;
1051 	}
1052 
1053 	ctx = xsk_get_ctx(umem, ifindex, queue_id);
1054 	if (!ctx) {
1055 		if (!fill || !comp) {
1056 			err = -EFAULT;
1057 			goto out_socket;
1058 		}
1059 
1060 		ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
1061 				     fill, comp);
1062 		if (!ctx) {
1063 			err = -ENOMEM;
1064 			goto out_socket;
1065 		}
1066 	}
1067 	xsk->ctx = ctx;
1068 
1069 	if (rx && !rx_setup_done) {
1070 		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
1071 				 &xsk->config.rx_size,
1072 				 sizeof(xsk->config.rx_size));
1073 		if (err) {
1074 			err = -errno;
1075 			goto out_put_ctx;
1076 		}
1077 		if (xsk->fd == umem->fd)
1078 			umem->rx_ring_setup_done = true;
1079 	}
1080 	if (tx && !tx_setup_done) {
1081 		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
1082 				 &xsk->config.tx_size,
1083 				 sizeof(xsk->config.tx_size));
1084 		if (err) {
1085 			err = -errno;
1086 			goto out_put_ctx;
1087 		}
1088 		if (xsk->fd == umem->fd)
1089 			umem->tx_ring_setup_done = true;
1090 	}
1091 
1092 	err = xsk_get_mmap_offsets(xsk->fd, &off);
1093 	if (err) {
1094 		err = -errno;
1095 		goto out_put_ctx;
1096 	}
1097 
1098 	if (rx) {
1099 		rx_map = mmap(NULL, off.rx.desc +
1100 			      xsk->config.rx_size * sizeof(struct xdp_desc),
1101 			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
1102 			      xsk->fd, XDP_PGOFF_RX_RING);
1103 		if (rx_map == MAP_FAILED) {
1104 			err = -errno;
1105 			goto out_put_ctx;
1106 		}
1107 
1108 		rx->mask = xsk->config.rx_size - 1;
1109 		rx->size = xsk->config.rx_size;
1110 		rx->producer = rx_map + off.rx.producer;
1111 		rx->consumer = rx_map + off.rx.consumer;
1112 		rx->flags = rx_map + off.rx.flags;
1113 		rx->ring = rx_map + off.rx.desc;
1114 		rx->cached_prod = *rx->producer;
1115 		rx->cached_cons = *rx->consumer;
1116 	}
1117 	xsk->rx = rx;
1118 
1119 	if (tx) {
1120 		tx_map = mmap(NULL, off.tx.desc +
1121 			      xsk->config.tx_size * sizeof(struct xdp_desc),
1122 			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
1123 			      xsk->fd, XDP_PGOFF_TX_RING);
1124 		if (tx_map == MAP_FAILED) {
1125 			err = -errno;
1126 			goto out_mmap_rx;
1127 		}
1128 
1129 		tx->mask = xsk->config.tx_size - 1;
1130 		tx->size = xsk->config.tx_size;
1131 		tx->producer = tx_map + off.tx.producer;
1132 		tx->consumer = tx_map + off.tx.consumer;
1133 		tx->flags = tx_map + off.tx.flags;
1134 		tx->ring = tx_map + off.tx.desc;
1135 		tx->cached_prod = *tx->producer;
1136 		/* cached_cons is r->size bigger than the real consumer pointer
1137 		 * See xsk_prod_nb_free
1138 		 */
1139 		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
1140 	}
1141 	xsk->tx = tx;
1142 
1143 	sxdp.sxdp_family = PF_XDP;
1144 	sxdp.sxdp_ifindex = ctx->ifindex;
1145 	sxdp.sxdp_queue_id = ctx->queue_id;
1146 	if (umem->refcount > 1) {
1147 		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
1148 		sxdp.sxdp_shared_umem_fd = umem->fd;
1149 	} else {
1150 		sxdp.sxdp_flags = xsk->config.bind_flags;
1151 	}
1152 
1153 	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
1154 	if (err) {
1155 		err = -errno;
1156 		goto out_mmap_tx;
1157 	}
1158 
1159 	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
1160 		err = __xsk_setup_xdp_prog(xsk, NULL);
1161 		if (err)
1162 			goto out_mmap_tx;
1163 	}
1164 
1165 	*xsk_ptr = xsk;
1166 	umem->fill_save = NULL;
1167 	umem->comp_save = NULL;
1168 	return 0;
1169 
1170 out_mmap_tx:
1171 	if (tx)
1172 		munmap(tx_map, off.tx.desc +
1173 		       xsk->config.tx_size * sizeof(struct xdp_desc));
1174 out_mmap_rx:
1175 	if (rx)
1176 		munmap(rx_map, off.rx.desc +
1177 		       xsk->config.rx_size * sizeof(struct xdp_desc));
1178 out_put_ctx:
1179 	xsk_put_ctx(ctx, unmap);
1180 out_socket:
1181 	if (--umem->refcount)
1182 		close(xsk->fd);
1183 out_xsk_alloc:
1184 	free(xsk);
1185 	return err;
1186 }
1187 
1188 int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
1189 		       __u32 queue_id, struct xsk_umem *umem,
1190 		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
1191 		       const struct xsk_socket_config *usr_config)
1192 {
1193 	if (!umem)
1194 		return -EFAULT;
1195 
1196 	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
1197 					 rx, tx, umem->fill_save,
1198 					 umem->comp_save, usr_config);
1199 }
1200 
1201 int xsk_umem__delete(struct xsk_umem *umem)
1202 {
1203 	struct xdp_mmap_offsets off;
1204 	int err;
1205 
1206 	if (!umem)
1207 		return 0;
1208 
1209 	if (umem->refcount)
1210 		return -EBUSY;
1211 
1212 	err = xsk_get_mmap_offsets(umem->fd, &off);
1213 	if (!err && umem->fill_save && umem->comp_save) {
1214 		munmap(umem->fill_save->ring - off.fr.desc,
1215 		       off.fr.desc + umem->config.fill_size * sizeof(__u64));
1216 		munmap(umem->comp_save->ring - off.cr.desc,
1217 		       off.cr.desc + umem->config.comp_size * sizeof(__u64));
1218 	}
1219 
1220 	close(umem->fd);
1221 	free(umem);
1222 
1223 	return 0;
1224 }
1225 
1226 void xsk_socket__delete(struct xsk_socket *xsk)
1227 {
1228 	size_t desc_sz = sizeof(struct xdp_desc);
1229 	struct xdp_mmap_offsets off;
1230 	struct xsk_umem *umem;
1231 	struct xsk_ctx *ctx;
1232 	int err;
1233 
1234 	if (!xsk)
1235 		return;
1236 
1237 	ctx = xsk->ctx;
1238 	umem = ctx->umem;
1239 
1240 	if (ctx->refcount == 1) {
1241 		xsk_delete_bpf_maps(xsk);
1242 		close(ctx->prog_fd);
1243 		if (ctx->has_bpf_link)
1244 			close(ctx->link_fd);
1245 	}
1246 
1247 	xsk_put_ctx(ctx, true);
1248 
1249 	err = xsk_get_mmap_offsets(xsk->fd, &off);
1250 	if (!err) {
1251 		if (xsk->rx) {
1252 			munmap(xsk->rx->ring - off.rx.desc,
1253 			       off.rx.desc + xsk->config.rx_size * desc_sz);
1254 		}
1255 		if (xsk->tx) {
1256 			munmap(xsk->tx->ring - off.tx.desc,
1257 			       off.tx.desc + xsk->config.tx_size * desc_sz);
1258 		}
1259 	}
1260 
1261 	umem->refcount--;
1262 	/* Do not close an fd that also has an associated umem connected
1263 	 * to it.
1264 	 */
1265 	if (xsk->fd != umem->fd)
1266 		close(xsk->fd);
1267 	free(xsk);
1268 }
1269