1 /*
2 * eBPF RSS program
3 *
4 * Developed by Daynix Computing LTD (http://www.daynix.com)
5 *
6 * Authors:
7 * Andrew Melnychenko <andrew@daynix.com>
8 * Yuri Benditovich <yuri.benditovich@daynix.com>
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2. See
11 * the COPYING file in the top-level directory.
12 *
13 * Prepare:
14 * Requires llvm, clang, bpftool, linux kernel tree
15 *
16 * Build rss.bpf.skeleton.h:
17 * make -f Makefile.ebpf clean all
18 */
19
20 #include <stddef.h>
21 #include <stdbool.h>
22 #include <linux/bpf.h>
23
24 #include <linux/in.h>
25 #include <linux/if_ether.h>
26 #include <linux/ip.h>
27 #include <linux/ipv6.h>
28
29 #include <linux/udp.h>
30 #include <linux/tcp.h>
31
32 #include <bpf/bpf_helpers.h>
33 #include <bpf/bpf_endian.h>
34 #include <linux/virtio_net.h>
35
36 #define INDIRECTION_TABLE_SIZE 128
37 #define HASH_CALCULATION_BUFFER_SIZE 36
38
/*
 * RSS steering configuration, stored as the single value of the
 * tap_rss_map_configurations map. The structure is packed, so the field
 * order below is also its exact byte layout.
 */
struct rss_config_t {
    /* When zero, the steering program returns default_queue immediately. */
    __u8 redirect;
    /* Not read anywhere in this file. */
    __u8 populate_hash;
    /* Bitmask tested against the VIRTIO_NET_RSS_HASH_TYPE_* flags. */
    __u32 hash_types;
    /* Modulus applied to the hash to select an indirection table slot. */
    __u16 indirections_len;
    /* Queue returned when no hash or no table entry is available. */
    __u16 default_queue;
} __attribute__((packed));
46
/*
 * Toeplitz key in the form consumed by net_toeplitz_add(): the initial
 * 32-bit key window followed by the bytes whose bits are shifted into that
 * window, one bit per processed input bit.
 */
struct toeplitz_key_data_t {
    __u32 leftmost_32_bits;
    __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE];
};
51
/*
 * Fields extracted from one packet by parse_packet() and consumed by
 * calculate_rss_hash().
 */
struct packet_hash_info_t {
    __u8 is_ipv4;           /* EtherType was ETH_P_IP */
    __u8 is_ipv6;           /* EtherType was ETH_P_IPV6 */
    __u8 is_udp;            /* L4 protocol was IPPROTO_UDP */
    __u8 is_tcp;            /* L4 protocol was IPPROTO_TCP */
    __u8 is_ipv6_ext_src;   /* in6_ext_src was filled from an IPV6_TLV_HAO option */
    __u8 is_ipv6_ext_dst;   /* in6_ext_dst was filled from a type 2 routing header */
    __u8 is_fragmented;     /* when set, parse_packet() skips the L4 header */

    /* Copied as-is from the TCP/UDP header (no byte-order conversion). */
    __u16 src_port;
    __u16 dst_port;

    /* IPv4 and IPv6 addresses share storage; only one family is filled in. */
    union {
        struct {
            __be32 in_src;
            __be32 in_dst;
        };

        struct {
            struct in6_addr in6_src;
            struct in6_addr in6_dst;
            struct in6_addr in6_ext_src;
            struct in6_addr in6_ext_dst;
        };
    };
};
78
/*
 * Single-entry array map holding the struct rss_config_t that the steering
 * program looks up at key 0.
 */
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(struct rss_config_t));
    __uint(max_entries, 1);
} tap_rss_map_configurations SEC(".maps");
85
/*
 * Single-entry array map holding the struct toeplitz_key_data_t that the
 * steering program looks up at key 0.
 */
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(struct toeplitz_key_data_t));
    __uint(max_entries, 1);
} tap_rss_map_toeplitz_key SEC(".maps");
92
/*
 * Indirection table: array map of INDIRECTION_TABLE_SIZE 16-bit queue
 * numbers, indexed by (hash % indirections_len) in the steering program.
 */
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(__u16));
    __uint(max_entries, INDIRECTION_TABLE_SIZE);
} tap_rss_map_indirection_table SEC(".maps");
99
/*
 * Append @size bytes taken from @ptr to the hash input buffer.
 *
 * The bytes are copied to @rss_input at offset *@bytes_written, and
 * *@bytes_written is then advanced by @size. No bounds check is done here:
 * the caller has to guarantee that the buffer can hold the extra bytes.
 */
static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written,
                                        const void *ptr, size_t size)
{
    __u8 *dst = rss_input + *bytes_written;

    __builtin_memcpy(dst, ptr, size);
    *bytes_written = *bytes_written + size;
}
105
106 static inline
net_toeplitz_add(__u32 * result,__u8 * input,__u32 len,struct toeplitz_key_data_t * key)107 void net_toeplitz_add(__u32 *result,
108 __u8 *input,
109 __u32 len
110 , struct toeplitz_key_data_t *key) {
111
112 __u32 accumulator = *result;
113 __u32 leftmost_32_bits = key->leftmost_32_bits;
114 __u32 byte;
115
116 for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) {
117 __u8 input_byte = input[byte];
118 __u8 key_byte = key->next_byte[byte];
119 __u8 bit;
120
121 for (bit = 0; bit < 8; bit++) {
122 if (input_byte & (1 << 7)) {
123 accumulator ^= leftmost_32_bits;
124 }
125
126 leftmost_32_bits =
127 (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7);
128
129 input_byte <<= 1;
130 key_byte <<= 1;
131 }
132 }
133
134 *result = accumulator;
135 }
136
137
/*
 * Tell whether @hdr_type is one of the IPv6 next-header values that
 * parse_ipv6_ext() keeps walking through.
 *
 * Returns 1 for hop-by-hop, routing, fragment, ICMPv6, "no next header",
 * destination options and mobility headers; 0 for anything else.
 *
 * NOTE(review): ICMPv6 and IPPROTO_NONE are in the list although they do not
 * look like skippable extension headers - confirm this is intended.
 */
static inline int ip6_extension_header_type(__u8 hdr_type)
{
    if (hdr_type == IPPROTO_HOPOPTS ||
        hdr_type == IPPROTO_ROUTING ||
        hdr_type == IPPROTO_FRAGMENT ||
        hdr_type == IPPROTO_ICMPV6 ||
        hdr_type == IPPROTO_NONE ||
        hdr_type == IPPROTO_DSTOPTS ||
        hdr_type == IPPROTO_MH) {
        return 1;
    }

    return 0;
}
/*
 * According to
 * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml
 * we expect that there would be no more than 11 extension headers in an IPv6
 * packet; there are also 27 TLV options for the Destination and Hop-by-hop
 * extensions.
 * We need to choose a reasonable maximum number of extensions/options to
 * check when looking for the ext src/dst addresses.
 */
161 #define IP6_EXTENSIONS_COUNT 11
162 #define IP6_OPTIONS_COUNT 30
163
parse_ipv6_ext(struct __sk_buff * skb,struct packet_hash_info_t * info,__u8 * l4_protocol,size_t * l4_offset)164 static inline int parse_ipv6_ext(struct __sk_buff *skb,
165 struct packet_hash_info_t *info,
166 __u8 *l4_protocol, size_t *l4_offset)
167 {
168 int err = 0;
169
170 if (!ip6_extension_header_type(*l4_protocol)) {
171 return 0;
172 }
173
174 struct ipv6_opt_hdr ext_hdr = {};
175
176 for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) {
177
178 err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr,
179 sizeof(ext_hdr), BPF_HDR_START_NET);
180 if (err) {
181 goto error;
182 }
183
184 if (*l4_protocol == IPPROTO_ROUTING) {
185 struct ipv6_rt_hdr ext_rt = {};
186
187 err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt,
188 sizeof(ext_rt), BPF_HDR_START_NET);
189 if (err) {
190 goto error;
191 }
192
193 if ((ext_rt.type == IPV6_SRCRT_TYPE_2) &&
194 (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) &&
195 (ext_rt.segments_left == 1)) {
196
197 err = bpf_skb_load_bytes_relative(skb,
198 *l4_offset + offsetof(struct rt2_hdr, addr),
199 &info->in6_ext_dst, sizeof(info->in6_ext_dst),
200 BPF_HDR_START_NET);
201 if (err) {
202 goto error;
203 }
204
205 info->is_ipv6_ext_dst = 1;
206 }
207
208 } else if (*l4_protocol == IPPROTO_DSTOPTS) {
209 struct ipv6_opt_t {
210 __u8 type;
211 __u8 length;
212 } __attribute__((packed)) opt = {};
213
214 size_t opt_offset = sizeof(ext_hdr);
215
216 for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) {
217 err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset,
218 &opt, sizeof(opt), BPF_HDR_START_NET);
219 if (err) {
220 goto error;
221 }
222
223 if (opt.type == IPV6_TLV_HAO) {
224 err = bpf_skb_load_bytes_relative(skb,
225 *l4_offset + opt_offset
226 + offsetof(struct ipv6_destopt_hao, addr),
227 &info->in6_ext_src, sizeof(info->in6_ext_src),
228 BPF_HDR_START_NET);
229 if (err) {
230 goto error;
231 }
232
233 info->is_ipv6_ext_src = 1;
234 break;
235 }
236
237 opt_offset += (opt.type == IPV6_TLV_PAD1) ?
238 1 : opt.length + sizeof(opt);
239
240 if (opt_offset + 1 >= ext_hdr.hdrlen * 8) {
241 break;
242 }
243 }
244 } else if (*l4_protocol == IPPROTO_FRAGMENT) {
245 info->is_fragmented = true;
246 }
247
248 *l4_protocol = ext_hdr.nexthdr;
249 *l4_offset += (ext_hdr.hdrlen + 1) * 8;
250
251 if (!ip6_extension_header_type(ext_hdr.nexthdr)) {
252 return 0;
253 }
254 }
255
256 return 0;
257 error:
258 return err;
259 }
260
parse_eth_type(struct __sk_buff * skb)261 static __be16 parse_eth_type(struct __sk_buff *skb)
262 {
263 unsigned int offset = 12;
264 __be16 ret = 0;
265 int err = 0;
266
267 err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
268 BPF_HDR_START_MAC);
269 if (err) {
270 return 0;
271 }
272
273 switch (bpf_ntohs(ret)) {
274 case ETH_P_8021AD:
275 offset += 4;
276 case ETH_P_8021Q:
277 offset += 4;
278 err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
279 BPF_HDR_START_MAC);
280 default:
281 break;
282 }
283
284 if (err) {
285 return 0;
286 }
287
288 return ret;
289 }
290
parse_packet(struct __sk_buff * skb,struct packet_hash_info_t * info)291 static inline int parse_packet(struct __sk_buff *skb,
292 struct packet_hash_info_t *info)
293 {
294 int err = 0;
295
296 if (!info || !skb) {
297 return -1;
298 }
299
300 size_t l4_offset = 0;
301 __u8 l4_protocol = 0;
302 __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb));
303 if (l3_protocol == 0) {
304 err = -1;
305 goto error;
306 }
307
308 if (l3_protocol == ETH_P_IP) {
309 info->is_ipv4 = 1;
310
311 struct iphdr ip = {};
312 err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip),
313 BPF_HDR_START_NET);
314 if (err) {
315 goto error;
316 }
317
318 info->in_src = ip.saddr;
319 info->in_dst = ip.daddr;
320 info->is_fragmented = !!ip.frag_off;
321
322 l4_protocol = ip.protocol;
323 l4_offset = ip.ihl * 4;
324 } else if (l3_protocol == ETH_P_IPV6) {
325 info->is_ipv6 = 1;
326
327 struct ipv6hdr ip6 = {};
328 err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6),
329 BPF_HDR_START_NET);
330 if (err) {
331 goto error;
332 }
333
334 info->in6_src = ip6.saddr;
335 info->in6_dst = ip6.daddr;
336
337 l4_protocol = ip6.nexthdr;
338 l4_offset = sizeof(ip6);
339
340 err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset);
341 if (err) {
342 goto error;
343 }
344 }
345
346 if (l4_protocol != 0 && !info->is_fragmented) {
347 if (l4_protocol == IPPROTO_TCP) {
348 info->is_tcp = 1;
349
350 struct tcphdr tcp = {};
351 err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp),
352 BPF_HDR_START_NET);
353 if (err) {
354 goto error;
355 }
356
357 info->src_port = tcp.source;
358 info->dst_port = tcp.dest;
359 } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */
360 info->is_udp = 1;
361
362 struct udphdr udp = {};
363 err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp),
364 BPF_HDR_START_NET);
365 if (err) {
366 goto error;
367 }
368
369 info->src_port = udp.source;
370 info->dst_port = udp.dest;
371 }
372 }
373
374 return 0;
375
376 error:
377 return err;
378 }
379
calculate_rss_hash(struct __sk_buff * skb,struct rss_config_t * config,struct toeplitz_key_data_t * toe)380 static inline __u32 calculate_rss_hash(struct __sk_buff *skb,
381 struct rss_config_t *config, struct toeplitz_key_data_t *toe)
382 {
383 __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {};
384 size_t bytes_written = 0;
385 __u32 result = 0;
386 int err = 0;
387 struct packet_hash_info_t packet_info = {};
388
389 err = parse_packet(skb, &packet_info);
390 if (err) {
391 return 0;
392 }
393
394 if (packet_info.is_ipv4) {
395 if (packet_info.is_tcp &&
396 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
397
398 net_rx_rss_add_chunk(rss_input, &bytes_written,
399 &packet_info.in_src,
400 sizeof(packet_info.in_src));
401 net_rx_rss_add_chunk(rss_input, &bytes_written,
402 &packet_info.in_dst,
403 sizeof(packet_info.in_dst));
404 net_rx_rss_add_chunk(rss_input, &bytes_written,
405 &packet_info.src_port,
406 sizeof(packet_info.src_port));
407 net_rx_rss_add_chunk(rss_input, &bytes_written,
408 &packet_info.dst_port,
409 sizeof(packet_info.dst_port));
410 } else if (packet_info.is_udp &&
411 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
412
413 net_rx_rss_add_chunk(rss_input, &bytes_written,
414 &packet_info.in_src,
415 sizeof(packet_info.in_src));
416 net_rx_rss_add_chunk(rss_input, &bytes_written,
417 &packet_info.in_dst,
418 sizeof(packet_info.in_dst));
419 net_rx_rss_add_chunk(rss_input, &bytes_written,
420 &packet_info.src_port,
421 sizeof(packet_info.src_port));
422 net_rx_rss_add_chunk(rss_input, &bytes_written,
423 &packet_info.dst_port,
424 sizeof(packet_info.dst_port));
425 } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
426 net_rx_rss_add_chunk(rss_input, &bytes_written,
427 &packet_info.in_src,
428 sizeof(packet_info.in_src));
429 net_rx_rss_add_chunk(rss_input, &bytes_written,
430 &packet_info.in_dst,
431 sizeof(packet_info.in_dst));
432 }
433 } else if (packet_info.is_ipv6) {
434 if (packet_info.is_tcp &&
435 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
436
437 if (packet_info.is_ipv6_ext_src &&
438 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
439
440 net_rx_rss_add_chunk(rss_input, &bytes_written,
441 &packet_info.in6_ext_src,
442 sizeof(packet_info.in6_ext_src));
443 } else {
444 net_rx_rss_add_chunk(rss_input, &bytes_written,
445 &packet_info.in6_src,
446 sizeof(packet_info.in6_src));
447 }
448 if (packet_info.is_ipv6_ext_dst &&
449 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
450
451 net_rx_rss_add_chunk(rss_input, &bytes_written,
452 &packet_info.in6_ext_dst,
453 sizeof(packet_info.in6_ext_dst));
454 } else {
455 net_rx_rss_add_chunk(rss_input, &bytes_written,
456 &packet_info.in6_dst,
457 sizeof(packet_info.in6_dst));
458 }
459 net_rx_rss_add_chunk(rss_input, &bytes_written,
460 &packet_info.src_port,
461 sizeof(packet_info.src_port));
462 net_rx_rss_add_chunk(rss_input, &bytes_written,
463 &packet_info.dst_port,
464 sizeof(packet_info.dst_port));
465 } else if (packet_info.is_udp &&
466 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
467
468 if (packet_info.is_ipv6_ext_src &&
469 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
470
471 net_rx_rss_add_chunk(rss_input, &bytes_written,
472 &packet_info.in6_ext_src,
473 sizeof(packet_info.in6_ext_src));
474 } else {
475 net_rx_rss_add_chunk(rss_input, &bytes_written,
476 &packet_info.in6_src,
477 sizeof(packet_info.in6_src));
478 }
479 if (packet_info.is_ipv6_ext_dst &&
480 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
481
482 net_rx_rss_add_chunk(rss_input, &bytes_written,
483 &packet_info.in6_ext_dst,
484 sizeof(packet_info.in6_ext_dst));
485 } else {
486 net_rx_rss_add_chunk(rss_input, &bytes_written,
487 &packet_info.in6_dst,
488 sizeof(packet_info.in6_dst));
489 }
490
491 net_rx_rss_add_chunk(rss_input, &bytes_written,
492 &packet_info.src_port,
493 sizeof(packet_info.src_port));
494 net_rx_rss_add_chunk(rss_input, &bytes_written,
495 &packet_info.dst_port,
496 sizeof(packet_info.dst_port));
497
498 } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
499 if (packet_info.is_ipv6_ext_src &&
500 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
501
502 net_rx_rss_add_chunk(rss_input, &bytes_written,
503 &packet_info.in6_ext_src,
504 sizeof(packet_info.in6_ext_src));
505 } else {
506 net_rx_rss_add_chunk(rss_input, &bytes_written,
507 &packet_info.in6_src,
508 sizeof(packet_info.in6_src));
509 }
510 if (packet_info.is_ipv6_ext_dst &&
511 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
512
513 net_rx_rss_add_chunk(rss_input, &bytes_written,
514 &packet_info.in6_ext_dst,
515 sizeof(packet_info.in6_ext_dst));
516 } else {
517 net_rx_rss_add_chunk(rss_input, &bytes_written,
518 &packet_info.in6_dst,
519 sizeof(packet_info.in6_dst));
520 }
521 }
522 }
523
524 if (bytes_written) {
525 net_toeplitz_add(&result, rss_input, bytes_written, toe);
526 }
527
528 return result;
529 }
530
531 SEC("tun_rss_steering")
tun_rss_steering_prog(struct __sk_buff * skb)532 int tun_rss_steering_prog(struct __sk_buff *skb)
533 {
534
535 struct rss_config_t *config;
536 struct toeplitz_key_data_t *toe;
537
538 __u32 key = 0;
539 __u32 hash = 0;
540
541 config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key);
542 toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key);
543
544 if (config && toe) {
545 if (!config->redirect) {
546 return config->default_queue;
547 }
548
549 hash = calculate_rss_hash(skb, config, toe);
550 if (hash) {
551 __u32 table_idx = hash % config->indirections_len;
552 __u16 *queue = 0;
553
554 queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table,
555 &table_idx);
556
557 if (queue) {
558 return *queue;
559 }
560 }
561
562 return config->default_queue;
563 }
564
565 return -1;
566 }
567
/* License string of this program, emitted through SEC("license"). */
char _license[] SEC("license") = "GPL v2";
569