// SPDX-License-Identifier: GPL-2.0-only
/*
 * Longest prefix match list implementation
 *
 * Copyright (c) 2016,2017 Daniel Mack
 * Copyright (c) 2016 David Herrmann
 */

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>

/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)

struct lpm_trie_node;

struct lpm_trie_node {
	struct rcu_head rcu;
	struct lpm_trie_node __rcu	*child[2];
	u32				prefixlen;
	u32				flags;
	u8				data[];
};

struct lpm_trie {
	struct bpf_map			map;
	struct lpm_trie_node __rcu	*root;
	size_t				n_entries;
	size_t				max_prefixlen;
	size_t				data_size;
	spinlock_t			lock;
};

/* This trie implements a longest prefix match algorithm that can be used to
 * match IP addresses to a stored set of ranges.
 *
 * Data stored in @data of struct bpf_lpm_trie_key_u8 and struct lpm_trie_node
 * is interpreted as big endian, so data[0] stores the most significant byte.
 *
 * Match ranges are internally stored in instances of struct lpm_trie_node
 * which each contain their prefix length as well as two pointers that may
 * lead to more nodes containing more specific matches. Each node also stores
 * a value that is defined by and returned to userspace via the update_elem
 * and lookup functions.
 *
 * For instance, let's start with a trie that was created with a prefix length
 * of 32, so it can be used for IPv4 addresses, and one single element that
 * matches 192.168.0.0/16. The data array would hence contain
 * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
 * stick to IP-address notation for readability though.
 *
 * As the trie is empty initially, the new node (1) will be placed as the root
 * node, denoted as (R) in the example below. As there are no other nodes,
 * both child pointers are %NULL.
 *
 *                      +----------------+
 *                      |       (1)  (R) |
 *                      | 192.168.0.0/16 |
 *                      |    value: 1    |
 *                      |   [0]    [1]   |
 *                      +----------------+
 *
 * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
 * a node with the same data and a smaller prefix (i.e., a less specific one),
 * node (2) will become a child of (1). Its child index depends on the next bit
 * that is outside of what (1) matches, and that bit is 0, so (2) will be
 * child[0] of (1):
 *
 *                      +----------------+
 *                      |       (1)  (R) |
 *                      | 192.168.0.0/16 |
 *                      |    value: 1    |
 *                      |   [0]    [1]   |
 *                      +----------------+
 *                           |
 *            +----------------+
 *            |       (2)      |
 *            | 192.168.0.0/24 |
 *            |    value: 2    |
 *            |   [0]    [1]   |
 *            +----------------+
 *
 * The child[1] slot of (1) could be filled with another node which has bit #17
 * (the next bit after the ones that (1) matches on) set to 1. For instance,
 * 192.168.128.0/24:
 *
 *                      +----------------+
 *                      |       (1)  (R) |
 *                      | 192.168.0.0/16 |
 *                      |    value: 1    |
 *                      |   [0]    [1]   |
 *                      +----------------+
 *                           |      |
 *            +----------------+  +------------------+
 *            |       (2)      |  |        (3)       |
 *            | 192.168.0.0/24 |  | 192.168.128.0/24 |
 *            |    value: 2    |  |     value: 3     |
 *            |   [0]    [1]   |  |    [0]    [1]    |
 *            +----------------+  +------------------+
 *
 * Let's add another node (5) for 192.168.1.0/24. In order to place it,
 * node (1) is looked at first, and because of the semantics laid out above
 * (bit #17 of (5) is 0), it would normally be attached to (1) as child[0].
 * However, that slot is already taken by (2), so a new intermediate node (4)
 * is needed in between. That node does not have a value attached to it and
 * it will never be returned to users as the result of a lookup. It is only
 * there to differentiate the traversal further. It will get a prefix as wide
 * as necessary to distinguish its two children:
 *
 *                      +----------------+
 *                      |       (1)  (R) |
 *                      | 192.168.0.0/16 |
 *                      |    value: 1    |
 *                      |   [0]    [1]   |
 *                      +----------------+
 *                           |      |
 *            +----------------+  +------------------+
 *            |       (4)  (I) |  |        (3)       |
 *            | 192.168.0.0/23 |  | 192.168.128.0/24 |
 *            |   value: ---   |  |     value: 3     |
 *            |   [0]    [1]   |  |    [0]    [1]    |
 *            +----------------+  +------------------+
 *                 |      |
 *  +----------------+  +----------------+
 *  |       (2)      |  |       (5)      |
 *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
 *  |    value: 2    |  |    value: 5    |
 *  |   [0]    [1]   |  |   [0]    [1]   |
 *  +----------------+  +----------------+
 *
 * 192.168.1.1/32 would be a child of (5) etc.
 *
 * An intermediate node will be turned into a 'real' node on demand. In the
 * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
 *
 * A fully populated trie would have a height of 32 nodes, as the trie was
 * created with a prefix length of 32.
 *
 * The lookup starts at the root node. If the current node matches and if there
 * is a child that can be used to become more specific, the trie is traversed
 * downwards. The last node in the traversal that is a non-intermediate one is
 * returned.
 */
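
/* The trie is exposed to userspace as BPF_MAP_TYPE_LPM_TRIE. Purely as an
 * illustration (not part of this file's logic), a userspace program built
 * against libbpf could populate and query the example trie above roughly
 * like the sketch below; the map fd, the addresses and the omitted error
 * handling are assumptions made for the example only:
 *
 *	struct {
 *		__u32 prefixlen;
 *		__u8  data[4];		// IPv4 address, big endian
 *	} key = { .prefixlen = 16, .data = { 192, 168, 0, 0 } };
 *	__u64 value = 1;
 *
 *	// insert 192.168.0.0/16 -> 1
 *	bpf_map_update_elem(map_fd, &key, &value, BPF_ANY);
 *
 *	// longest-prefix lookup for the host 192.168.1.23
 *	key.prefixlen = 32;
 *	memcpy(key.data, (__u8[]){ 192, 168, 1, 23 }, 4);
 *	bpf_map_lookup_elem(map_fd, &key, &value);
 */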

static inline int extract_bit(const u8 *data, size_t index)
{
	return !!(data[index / 8] & (1 << (7 - (index % 8))));
}
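
/* For example, with data[] = { 0xc0, 0xa8, 0x01, 0x00 } (192.168.1.0),
 * extract_bit(data, 0) is 1 (the top bit of 0xc0), extract_bit(data, 8)
 * is 1 (the top bit of 0xa8) and extract_bit(data, 16) is 0 (the top bit
 * of 0x01). Passing a node's prefixlen as the index therefore selects the
 * first bit *after* the bits that node matches on.
 */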

/**
 * longest_prefix_match() - determine the longest prefix
 * @trie: The trie to get internal sizes from
 * @node: The node to operate on
 * @key: The key to compare to @node
 *
 * Determine the longest prefix of @node that matches the bits in @key.
 */
static size_t longest_prefix_match(const struct lpm_trie *trie,
				   const struct lpm_trie_node *node,
				   const struct bpf_lpm_trie_key_u8 *key)
{
	u32 limit = min(node->prefixlen, key->prefixlen);
	u32 prefixlen = 0, i = 0;

	BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
	BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32));

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)

	/* data_size >= 16 has very small probability.
	 * We do not use a loop for optimal code generation.
	 */
	if (trie->data_size >= 8) {
		u64 diff = be64_to_cpu(*(__be64 *)node->data ^
				       *(__be64 *)key->data);

		prefixlen = 64 - fls64(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i = 8;
	}
#endif

	while (trie->data_size >= i + 4) {
		u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
				       *(__be32 *)&key->data[i]);

		prefixlen += 32 - fls(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i += 4;
	}

	if (trie->data_size >= i + 2) {
		u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
				       *(__be16 *)&key->data[i]);

		prefixlen += 16 - fls(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i += 2;
	}

	if (trie->data_size >= i + 1) {
		prefixlen += 8 - fls(node->data[i] ^ key->data[i]);

		if (prefixlen >= limit)
			return limit;
	}

	return prefixlen;
}
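
/* Worked example (illustrative only): with data_size == 4, comparing a node
 * for 192.168.0.0/24 against a key for 192.168.1.0/24 XORs the single 32-bit
 * word to 0x00000100, so fls() returns 9 and prefixlen becomes 32 - 9 == 23.
 * Since 23 < limit (min(24, 24)) and the diff is non-zero, 23 is returned:
 * the node and the key agree only on their first 23 bits.
 */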

/* Called from syscall or from eBPF program */
static void *trie_lookup_elem(struct bpf_map *map, void *_key)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *found = NULL;
	struct bpf_lpm_trie_key_u8 *key = _key;

	if (key->prefixlen > trie->max_prefixlen)
		return NULL;

	/* Start walking the trie from the root node ... */

	for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
	     node;) {
		unsigned int next_bit;
		size_t matchlen;

		/* Determine the longest prefix of @node that matches @key.
		 * If it's the maximum possible prefix for this trie, we have
		 * an exact match and can return it directly.
		 */
		matchlen = longest_prefix_match(trie, node, key);
		if (matchlen == trie->max_prefixlen) {
			found = node;
			break;
		}

		/* If the number of bits that match is smaller than the prefix
		 * length of @node, bail out and return the node we have seen
		 * last in the traversal (ie, the parent).
		 */
		if (matchlen < node->prefixlen)
			break;

		/* Consider this node as return candidate unless it is an
		 * artificially added intermediate one.
		 */
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
			found = node;

		/* If the node match is fully satisfied, let's see if we can
		 * become more specific. Determine the next bit in the key and
		 * traverse down.
		 */
		next_bit = extract_bit(key->data, node->prefixlen);
		node = rcu_dereference_check(node->child[next_bit],
					     rcu_read_lock_bh_held());
	}

	if (!found)
		return NULL;

	return found->data + trie->data_size;
}
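
/* From a BPF program this path is reached via the bpf_map_lookup_elem()
 * helper. As an illustrative sketch only (the map name and the looked-up
 * address are assumptions, not part of this file), a program holding the
 * example trie from the comment above would see:
 *
 *	// key->prefixlen = 32, key->data = 192.168.1.23
 *	__u64 *value = bpf_map_lookup_elem(&lpm_map, key);
 *
 * which walks (1) -> (4) -> (5) and returns the value of (5), the most
 * specific non-intermediate node covering the address; (4) is never a
 * candidate because it carries LPM_TREE_NODE_FLAG_IM.
 */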

static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
						 const void *value)
{
	struct lpm_trie_node *node;
	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;

	if (value)
		size += trie->map.value_size;

	node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
				    trie->map.numa_node);
	if (!node)
		return NULL;

	node->flags = 0;

	if (value)
		memcpy(node->data + trie->data_size, value,
		       trie->map.value_size);

	return node;
}

static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
{
	if (flags == BPF_EXIST)
		return -ENOENT;
	if (trie->n_entries == trie->map.max_entries)
		return -ENOSPC;
	trie->n_entries++;
	return 0;
}

/* Called from syscall or from eBPF program */
static long trie_update_elem(struct bpf_map *map,
			     void *_key, void *value, u64 flags)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *im_node, *new_node = NULL;
	struct lpm_trie_node *free_node = NULL;
	struct lpm_trie_node __rcu **slot;
	struct bpf_lpm_trie_key_u8 *key = _key;
	unsigned long irq_flags;
	unsigned int next_bit;
	size_t matchlen = 0;
	int ret = 0;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Allocate and fill a new node */
	new_node = lpm_trie_node_alloc(trie, value);
	if (!new_node) {
		ret = -ENOMEM;
		goto out;
	}

	new_node->prefixlen = key->prefixlen;
	RCU_INIT_POINTER(new_node->child[0], NULL);
	RCU_INIT_POINTER(new_node->child[1], NULL);
	memcpy(new_node->data, key->data, trie->data_size);

	/* Now find a slot to attach the new node. To do that, walk the tree
	 * from the root and match as many bits as possible for each node until
	 * we either find an empty slot or a slot that needs to be replaced by
	 * an intermediate node.
	 */
	slot = &trie->root;

	while ((node = rcu_dereference_protected(*slot,
					lockdep_is_held(&trie->lock)))) {
		matchlen = longest_prefix_match(trie, node, key);

		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen ||
		    node->prefixlen == trie->max_prefixlen)
			break;

		next_bit = extract_bit(key->data, node->prefixlen);
		slot = &node->child[next_bit];
	}

	/* If the slot is empty (a free child pointer or an empty root),
	 * simply assign the @new_node to that slot and be done.
	 */
	if (!node) {
		ret = trie_check_add_elem(trie, flags);
		if (ret)
			goto out;

		rcu_assign_pointer(*slot, new_node);
		goto out;
	}

	/* If the slot we picked already exists, replace it with @new_node
	 * which already has the correct data array set.
	 */
	if (node->prefixlen == matchlen) {
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
			if (flags == BPF_NOEXIST) {
				ret = -EEXIST;
				goto out;
			}
		} else {
			ret = trie_check_add_elem(trie, flags);
			if (ret)
				goto out;
		}

		new_node->child[0] = node->child[0];
		new_node->child[1] = node->child[1];

		rcu_assign_pointer(*slot, new_node);
		free_node = node;

		goto out;
	}

	ret = trie_check_add_elem(trie, flags);
	if (ret)
		goto out;

	/* If the new node matches the prefix completely, it must be inserted
	 * as an ancestor. Simply insert it between @node and *@slot.
	 */
	if (matchlen == key->prefixlen) {
		next_bit = extract_bit(node->data, matchlen);
		rcu_assign_pointer(new_node->child[next_bit], node);
		rcu_assign_pointer(*slot, new_node);
		goto out;
	}

	im_node = lpm_trie_node_alloc(trie, NULL);
	if (!im_node) {
		trie->n_entries--;
		ret = -ENOMEM;
		goto out;
	}

	im_node->prefixlen = matchlen;
	im_node->flags |= LPM_TREE_NODE_FLAG_IM;
	memcpy(im_node->data, node->data, trie->data_size);

	/* Now determine which child to install in which slot */
	if (extract_bit(key->data, matchlen)) {
		rcu_assign_pointer(im_node->child[0], node);
		rcu_assign_pointer(im_node->child[1], new_node);
	} else {
		rcu_assign_pointer(im_node->child[0], new_node);
		rcu_assign_pointer(im_node->child[1], node);
	}

	/* Finally, assign the intermediate node to the determined slot */
	rcu_assign_pointer(*slot, im_node);

out:
	if (ret)
		kfree(new_node);
	spin_unlock_irqrestore(&trie->lock, irq_flags);
	kfree_rcu(free_node, rcu);

	return ret;
}
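
/* Update flags behave as for other map types. Illustrative example only
 * (the key variable name below is a stand-in for a filled-in
 * struct bpf_lpm_trie_key_u8, not something defined in this file):
 *
 *	trie_update_elem(map, key_192_168_0_0_24, &val, BPF_NOEXIST)
 *
 * fails with -EEXIST if 192.168.0.0/24 is already present as a real
 * (non-intermediate) node, while BPF_EXIST fails with -ENOENT when the
 * prefix would have to be created, and BPF_ANY accepts both cases.
 */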

/* Called from syscall or from eBPF program */
static long trie_delete_elem(struct bpf_map *map, void *_key)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *free_node = NULL, *free_parent = NULL;
	struct bpf_lpm_trie_key_u8 *key = _key;
	struct lpm_trie_node __rcu **trim, **trim2;
	struct lpm_trie_node *node, *parent;
	unsigned long irq_flags;
	unsigned int next_bit;
	size_t matchlen = 0;
	int ret = 0;

	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Walk the tree looking for an exact key/length match and keeping
	 * track of the path we traverse. We will need to know the node
	 * we wish to delete and the slot that points to it. We may also
	 * need to know the node's parent and the slot that contains it.
	 */
	trim = &trie->root;
	trim2 = trim;
	parent = NULL;
	while ((node = rcu_dereference_protected(
		       *trim, lockdep_is_held(&trie->lock)))) {
		matchlen = longest_prefix_match(trie, node, key);

		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen)
			break;

		parent = node;
		trim2 = trim;
		next_bit = extract_bit(key->data, node->prefixlen);
		trim = &node->child[next_bit];
	}

	if (!node || node->prefixlen != key->prefixlen ||
	    node->prefixlen != matchlen ||
	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
		ret = -ENOENT;
		goto out;
	}

	trie->n_entries--;

	/* If the node we are removing has two children, simply mark it
	 * as intermediate and we are done.
	 */
	if (rcu_access_pointer(node->child[0]) &&
	    rcu_access_pointer(node->child[1])) {
		node->flags |= LPM_TREE_NODE_FLAG_IM;
		goto out;
	}

	/* If the parent of the node we are about to delete is an intermediate
	 * node, and the deleted node doesn't have any children, we can delete
	 * the intermediate parent as well and promote its other child
	 * up the tree. Doing this maintains the invariant that all
	 * intermediate nodes have exactly 2 children and that there are no
	 * unnecessary intermediate nodes in the tree.
	 */
	if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
	    !node->child[0] && !node->child[1]) {
		if (node == rcu_access_pointer(parent->child[0]))
			rcu_assign_pointer(
				*trim2, rcu_access_pointer(parent->child[1]));
		else
			rcu_assign_pointer(
				*trim2, rcu_access_pointer(parent->child[0]));
		free_parent = parent;
		free_node = node;
		goto out;
	}

	/* The node we are removing has either zero or one child. If there
	 * is a child, move it into the removed node's slot then delete
	 * the node. Otherwise just clear the slot and delete the node.
	 */
	if (node->child[0])
		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
	else if (node->child[1])
		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
	else
		RCU_INIT_POINTER(*trim, NULL);
	free_node = node;

out:
	spin_unlock_irqrestore(&trie->lock, irq_flags);
	kfree_rcu(free_parent, rcu);
	kfree_rcu(free_node, rcu);

	return ret;
}
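
/* Tying this back to the example trie at the top of the file: deleting
 * 192.168.1.0/24 removes leaf (5); its parent (4) is an intermediate node
 * that would be left with a single child, so (4) is deleted as well and
 * (2) is promoted into (1)->child[0]. Deleting 192.168.0.0/16 instead only
 * flips (1) into an intermediate node, because it still has two children.
 */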

#define LPM_DATA_SIZE_MAX	256
#define LPM_DATA_SIZE_MIN	1

#define LPM_VAL_SIZE_MAX	(KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
				 sizeof(struct lpm_trie_node))
#define LPM_VAL_SIZE_MIN	1

#define LPM_KEY_SIZE(X)		(sizeof(struct bpf_lpm_trie_key_u8) + (X))
#define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)

#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\
				 BPF_F_ACCESS_MASK)

static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
	struct lpm_trie *trie;

	/* check sanity of attributes */
	if (attr->max_entries == 0 ||
	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
	    attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags) ||
	    attr->key_size < LPM_KEY_SIZE_MIN ||
	    attr->key_size > LPM_KEY_SIZE_MAX ||
	    attr->value_size < LPM_VAL_SIZE_MIN ||
	    attr->value_size > LPM_VAL_SIZE_MAX)
		return ERR_PTR(-EINVAL);

	trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
	if (!trie)
		return ERR_PTR(-ENOMEM);

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&trie->map, attr);
	trie->data_size = attr->key_size -
			  offsetof(struct bpf_lpm_trie_key_u8, data);
	trie->max_prefixlen = trie->data_size * 8;

	spin_lock_init(&trie->lock);

	return &trie->map;
}
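
/* For instance, an IPv4 trie like the one documented at the top of this
 * file would be created with attributes along these lines (the concrete
 * value size and entry count are only an illustration):
 *
 *	attr->map_type    = BPF_MAP_TYPE_LPM_TRIE;
 *	attr->key_size    = LPM_KEY_SIZE(4);	// prefixlen + 4 data bytes
 *	attr->value_size  = sizeof(__u64);	// any size within the limits
 *	attr->max_entries = 128;
 *	attr->map_flags   = BPF_F_NO_PREALLOC;	// mandatory for this map
 *
 * giving data_size == 4 and max_prefixlen == 32.
 */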

static void trie_free(struct bpf_map *map)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node __rcu **slot;
	struct lpm_trie_node *node;

	/* Always start at the root and walk down to a node that has no
	 * children. Then free that node, nullify its reference in the parent
	 * and start over.
	 */

	for (;;) {
		slot = &trie->root;

		for (;;) {
			node = rcu_dereference_protected(*slot, 1);
			if (!node)
				goto out;

			if (rcu_access_pointer(node->child[0])) {
				slot = &node->child[0];
				continue;
			}

			if (rcu_access_pointer(node->child[1])) {
				slot = &node->child[1];
				continue;
			}

			kfree(node);
			RCU_INIT_POINTER(*slot, NULL);
			break;
		}
	}

out:
	bpf_map_area_free(trie);
}

static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
{
	struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key;
	struct lpm_trie_node **node_stack = NULL;
	int err = 0, stack_ptr = -1;
	unsigned int next_bit;
	size_t matchlen = 0;

	/* The get_next_key follows postorder. For the 4 node example in
	 * the top of this file, the trie_get_next_key() returns the following
	 * one after another:
	 *   192.168.0.0/24
	 *   192.168.1.0/24
	 *   192.168.128.0/24
	 *   192.168.0.0/16
	 *
	 * The idea is to return more specific keys before less specific ones.
	 */

	/* Empty trie */
	search_root = rcu_dereference(trie->root);
	if (!search_root)
		return -ENOENT;

	/* For invalid key, find the leftmost node in the trie */
	if (!key || key->prefixlen > trie->max_prefixlen)
		goto find_leftmost;

	node_stack = kmalloc_array(trie->max_prefixlen + 1,
				   sizeof(struct lpm_trie_node *),
				   GFP_ATOMIC | __GFP_NOWARN);
	if (!node_stack)
		return -ENOMEM;

	/* Try to find the exact node for the given key */
	for (node = search_root; node;) {
		node_stack[++stack_ptr] = node;
		matchlen = longest_prefix_match(trie, node, key);
		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen)
			break;

		next_bit = extract_bit(key->data, node->prefixlen);
		node = rcu_dereference(node->child[next_bit]);
	}
	if (!node || node->prefixlen != matchlen ||
	    (node->flags & LPM_TREE_NODE_FLAG_IM))
		goto find_leftmost;

	/* The node with the exactly-matching key has been found,
	 * find the first node in postorder after the matched node.
	 */
	node = node_stack[stack_ptr];
	while (stack_ptr > 0) {
		parent = node_stack[stack_ptr - 1];
		if (rcu_dereference(parent->child[0]) == node) {
			search_root = rcu_dereference(parent->child[1]);
			if (search_root)
				goto find_leftmost;
		}
		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
			next_node = parent;
			goto do_copy;
		}

		node = parent;
		stack_ptr--;
	}

	/* did not find anything */
	err = -ENOENT;
	goto free_stack;

find_leftmost:
	/* Find the leftmost non-intermediate node; all intermediate nodes
	 * have exactly two children, so this function will never return NULL.
	 */
	for (node = search_root; node;) {
		if (node->flags & LPM_TREE_NODE_FLAG_IM) {
			node = rcu_dereference(node->child[0]);
		} else {
			next_node = node;
			node = rcu_dereference(node->child[0]);
			if (!node)
				node = rcu_dereference(next_node->child[1]);
		}
	}
do_copy:
	next_key->prefixlen = next_node->prefixlen;
	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data),
	       next_node->data, trie->data_size);
free_stack:
	kfree(node_stack);
	return err;
}

static int trie_check_btf(const struct bpf_map *map,
			  const struct btf *btf,
			  const struct btf_type *key_type,
			  const struct btf_type *value_type)
{
	/* Keys must have struct bpf_lpm_trie_key_u8 embedded. */
	return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
	       -EINVAL : 0;
}

static u64 trie_mem_usage(const struct bpf_map *map)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	u64 elem_size;

	elem_size = sizeof(struct lpm_trie_node) + trie->data_size +
		    trie->map.value_size;
	return elem_size * READ_ONCE(trie->n_entries);
}

BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
const struct bpf_map_ops trie_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = trie_alloc,
	.map_free = trie_free,
	.map_get_next_key = trie_get_next_key,
	.map_lookup_elem = trie_lookup_elem,
	.map_update_elem = trie_update_elem,
	.map_delete_elem = trie_delete_elem,
	.map_lookup_batch = generic_map_lookup_batch,
	.map_update_batch = generic_map_update_batch,
	.map_delete_batch = generic_map_delete_batch,
	.map_check_btf = trie_check_btf,
	.map_mem_usage = trie_mem_usage,
	.map_btf_id = &trie_map_btf_ids[0],
};