1 /* 2 * eBPF RSS program 3 * 4 * Developed by Daynix Computing LTD (http://www.daynix.com) 5 * 6 * Authors: 7 * Andrew Melnychenko <andrew@daynix.com> 8 * Yuri Benditovich <yuri.benditovich@daynix.com> 9 * 10 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * the COPYING file in the top-level directory. 12 * 13 * Prepare: 14 * Requires llvm, clang, bpftool, linux kernel tree 15 * 16 * Build rss.bpf.skeleton.h: 17 * make -f Makefile.ebpf clean all 18 */ 19 20 #include <stddef.h> 21 #include <stdbool.h> 22 #include <linux/bpf.h> 23 24 #include <linux/in.h> 25 #include <linux/if_ether.h> 26 #include <linux/ip.h> 27 #include <linux/ipv6.h> 28 29 #include <linux/udp.h> 30 #include <linux/tcp.h> 31 32 #include <bpf/bpf_helpers.h> 33 #include <bpf/bpf_endian.h> 34 #include <linux/virtio_net.h> 35 36 #define INDIRECTION_TABLE_SIZE 128 37 #define HASH_CALCULATION_BUFFER_SIZE 36 38 39 struct rss_config_t { 40 __u8 redirect; 41 __u8 populate_hash; 42 __u32 hash_types; 43 __u16 indirections_len; 44 __u16 default_queue; 45 } __attribute__((packed)); 46 47 struct toeplitz_key_data_t { 48 __u32 leftmost_32_bits; 49 __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE]; 50 }; 51 52 struct packet_hash_info_t { 53 __u8 is_ipv4; 54 __u8 is_ipv6; 55 __u8 is_udp; 56 __u8 is_tcp; 57 __u8 is_ipv6_ext_src; 58 __u8 is_ipv6_ext_dst; 59 __u8 is_fragmented; 60 61 __u16 src_port; 62 __u16 dst_port; 63 64 union { 65 struct { 66 __be32 in_src; 67 __be32 in_dst; 68 }; 69 70 struct { 71 struct in6_addr in6_src; 72 struct in6_addr in6_dst; 73 struct in6_addr in6_ext_src; 74 struct in6_addr in6_ext_dst; 75 }; 76 }; 77 }; 78 79 struct { 80 __uint(type, BPF_MAP_TYPE_ARRAY); 81 __uint(key_size, sizeof(__u32)); 82 __uint(value_size, sizeof(struct rss_config_t)); 83 __uint(max_entries, 1); 84 } tap_rss_map_configurations SEC(".maps"); 85 86 struct { 87 __uint(type, BPF_MAP_TYPE_ARRAY); 88 __uint(key_size, sizeof(__u32)); 89 __uint(value_size, sizeof(struct toeplitz_key_data_t)); 90 __uint(max_entries, 1); 91 } tap_rss_map_toeplitz_key SEC(".maps"); 92 93 struct { 94 __uint(type, BPF_MAP_TYPE_ARRAY); 95 __uint(key_size, sizeof(__u32)); 96 __uint(value_size, sizeof(__u16)); 97 __uint(max_entries, INDIRECTION_TABLE_SIZE); 98 } tap_rss_map_indirection_table SEC(".maps"); 99 100 static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written, 101 const void *ptr, size_t size) { 102 __builtin_memcpy(&rss_input[*bytes_written], ptr, size); 103 *bytes_written += size; 104 } 105 106 static inline 107 void net_toeplitz_add(__u32 *result, 108 __u8 *input, 109 __u32 len 110 , struct toeplitz_key_data_t *key) { 111 112 __u32 accumulator = *result; 113 __u32 leftmost_32_bits = key->leftmost_32_bits; 114 __u32 byte; 115 116 for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) { 117 __u8 input_byte = input[byte]; 118 __u8 key_byte = key->next_byte[byte]; 119 __u8 bit; 120 121 for (bit = 0; bit < 8; bit++) { 122 if (input_byte & (1 << 7)) { 123 accumulator ^= leftmost_32_bits; 124 } 125 126 leftmost_32_bits = 127 (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7); 128 129 input_byte <<= 1; 130 key_byte <<= 1; 131 } 132 } 133 134 *result = accumulator; 135 } 136 137 138 static inline int ip6_extension_header_type(__u8 hdr_type) 139 { 140 switch (hdr_type) { 141 case IPPROTO_HOPOPTS: 142 case IPPROTO_ROUTING: 143 case IPPROTO_FRAGMENT: 144 case IPPROTO_ICMPV6: 145 case IPPROTO_NONE: 146 case IPPROTO_DSTOPTS: 147 case IPPROTO_MH: 148 return 1; 149 default: 150 return 0; 151 } 152 } 153 /* 154 * According to 155 * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml 156 * we expect that there are would be no more than 11 extensions in IPv6 header, 157 * also there is 27 TLV options for Destination and Hop-by-hop extensions. 158 * Need to choose reasonable amount of maximum extensions/options we may 159 * check to find ext src/dst. 160 */ 161 #define IP6_EXTENSIONS_COUNT 11 162 #define IP6_OPTIONS_COUNT 30 163 164 static inline int parse_ipv6_ext(struct __sk_buff *skb, 165 struct packet_hash_info_t *info, 166 __u8 *l4_protocol, size_t *l4_offset) 167 { 168 int err = 0; 169 170 if (!ip6_extension_header_type(*l4_protocol)) { 171 return 0; 172 } 173 174 struct ipv6_opt_hdr ext_hdr = {}; 175 176 for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) { 177 178 err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr, 179 sizeof(ext_hdr), BPF_HDR_START_NET); 180 if (err) { 181 goto error; 182 } 183 184 if (*l4_protocol == IPPROTO_ROUTING) { 185 struct ipv6_rt_hdr ext_rt = {}; 186 187 err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt, 188 sizeof(ext_rt), BPF_HDR_START_NET); 189 if (err) { 190 goto error; 191 } 192 193 if ((ext_rt.type == IPV6_SRCRT_TYPE_2) && 194 (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) && 195 (ext_rt.segments_left == 1)) { 196 197 err = bpf_skb_load_bytes_relative(skb, 198 *l4_offset + offsetof(struct rt2_hdr, addr), 199 &info->in6_ext_dst, sizeof(info->in6_ext_dst), 200 BPF_HDR_START_NET); 201 if (err) { 202 goto error; 203 } 204 205 info->is_ipv6_ext_dst = 1; 206 } 207 208 } else if (*l4_protocol == IPPROTO_DSTOPTS) { 209 struct ipv6_opt_t { 210 __u8 type; 211 __u8 length; 212 } __attribute__((packed)) opt = {}; 213 214 size_t opt_offset = sizeof(ext_hdr); 215 216 for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) { 217 err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset, 218 &opt, sizeof(opt), BPF_HDR_START_NET); 219 if (err) { 220 goto error; 221 } 222 223 if (opt.type == IPV6_TLV_HAO) { 224 err = bpf_skb_load_bytes_relative(skb, 225 *l4_offset + opt_offset 226 + offsetof(struct ipv6_destopt_hao, addr), 227 &info->in6_ext_src, sizeof(info->in6_ext_src), 228 BPF_HDR_START_NET); 229 if (err) { 230 goto error; 231 } 232 233 info->is_ipv6_ext_src = 1; 234 break; 235 } 236 237 opt_offset += (opt.type == IPV6_TLV_PAD1) ? 238 1 : opt.length + sizeof(opt); 239 240 if (opt_offset + 1 >= ext_hdr.hdrlen * 8) { 241 break; 242 } 243 } 244 } else if (*l4_protocol == IPPROTO_FRAGMENT) { 245 info->is_fragmented = true; 246 } 247 248 *l4_protocol = ext_hdr.nexthdr; 249 *l4_offset += (ext_hdr.hdrlen + 1) * 8; 250 251 if (!ip6_extension_header_type(ext_hdr.nexthdr)) { 252 return 0; 253 } 254 } 255 256 return 0; 257 error: 258 return err; 259 } 260 261 static __be16 parse_eth_type(struct __sk_buff *skb) 262 { 263 unsigned int offset = 12; 264 __be16 ret = 0; 265 int err = 0; 266 267 err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret), 268 BPF_HDR_START_MAC); 269 if (err) { 270 return 0; 271 } 272 273 switch (bpf_ntohs(ret)) { 274 case ETH_P_8021AD: 275 offset += 4; 276 case ETH_P_8021Q: 277 offset += 4; 278 err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret), 279 BPF_HDR_START_MAC); 280 default: 281 break; 282 } 283 284 if (err) { 285 return 0; 286 } 287 288 return ret; 289 } 290 291 static inline int parse_packet(struct __sk_buff *skb, 292 struct packet_hash_info_t *info) 293 { 294 int err = 0; 295 296 if (!info || !skb) { 297 return -1; 298 } 299 300 size_t l4_offset = 0; 301 __u8 l4_protocol = 0; 302 __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb)); 303 if (l3_protocol == 0) { 304 err = -1; 305 goto error; 306 } 307 308 if (l3_protocol == ETH_P_IP) { 309 info->is_ipv4 = 1; 310 311 struct iphdr ip = {}; 312 err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip), 313 BPF_HDR_START_NET); 314 if (err) { 315 goto error; 316 } 317 318 info->in_src = ip.saddr; 319 info->in_dst = ip.daddr; 320 info->is_fragmented = !!ip.frag_off; 321 322 l4_protocol = ip.protocol; 323 l4_offset = ip.ihl * 4; 324 } else if (l3_protocol == ETH_P_IPV6) { 325 info->is_ipv6 = 1; 326 327 struct ipv6hdr ip6 = {}; 328 err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6), 329 BPF_HDR_START_NET); 330 if (err) { 331 goto error; 332 } 333 334 info->in6_src = ip6.saddr; 335 info->in6_dst = ip6.daddr; 336 337 l4_protocol = ip6.nexthdr; 338 l4_offset = sizeof(ip6); 339 340 err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset); 341 if (err) { 342 goto error; 343 } 344 } 345 346 if (l4_protocol != 0 && !info->is_fragmented) { 347 if (l4_protocol == IPPROTO_TCP) { 348 info->is_tcp = 1; 349 350 struct tcphdr tcp = {}; 351 err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp), 352 BPF_HDR_START_NET); 353 if (err) { 354 goto error; 355 } 356 357 info->src_port = tcp.source; 358 info->dst_port = tcp.dest; 359 } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */ 360 info->is_udp = 1; 361 362 struct udphdr udp = {}; 363 err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp), 364 BPF_HDR_START_NET); 365 if (err) { 366 goto error; 367 } 368 369 info->src_port = udp.source; 370 info->dst_port = udp.dest; 371 } 372 } 373 374 return 0; 375 376 error: 377 return err; 378 } 379 380 static inline __u32 calculate_rss_hash(struct __sk_buff *skb, 381 struct rss_config_t *config, struct toeplitz_key_data_t *toe) 382 { 383 __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {}; 384 size_t bytes_written = 0; 385 __u32 result = 0; 386 int err = 0; 387 struct packet_hash_info_t packet_info = {}; 388 389 err = parse_packet(skb, &packet_info); 390 if (err) { 391 return 0; 392 } 393 394 if (packet_info.is_ipv4) { 395 if (packet_info.is_tcp && 396 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) { 397 398 net_rx_rss_add_chunk(rss_input, &bytes_written, 399 &packet_info.in_src, 400 sizeof(packet_info.in_src)); 401 net_rx_rss_add_chunk(rss_input, &bytes_written, 402 &packet_info.in_dst, 403 sizeof(packet_info.in_dst)); 404 net_rx_rss_add_chunk(rss_input, &bytes_written, 405 &packet_info.src_port, 406 sizeof(packet_info.src_port)); 407 net_rx_rss_add_chunk(rss_input, &bytes_written, 408 &packet_info.dst_port, 409 sizeof(packet_info.dst_port)); 410 } else if (packet_info.is_udp && 411 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) { 412 413 net_rx_rss_add_chunk(rss_input, &bytes_written, 414 &packet_info.in_src, 415 sizeof(packet_info.in_src)); 416 net_rx_rss_add_chunk(rss_input, &bytes_written, 417 &packet_info.in_dst, 418 sizeof(packet_info.in_dst)); 419 net_rx_rss_add_chunk(rss_input, &bytes_written, 420 &packet_info.src_port, 421 sizeof(packet_info.src_port)); 422 net_rx_rss_add_chunk(rss_input, &bytes_written, 423 &packet_info.dst_port, 424 sizeof(packet_info.dst_port)); 425 } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { 426 net_rx_rss_add_chunk(rss_input, &bytes_written, 427 &packet_info.in_src, 428 sizeof(packet_info.in_src)); 429 net_rx_rss_add_chunk(rss_input, &bytes_written, 430 &packet_info.in_dst, 431 sizeof(packet_info.in_dst)); 432 } 433 } else if (packet_info.is_ipv6) { 434 if (packet_info.is_tcp && 435 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) { 436 437 if (packet_info.is_ipv6_ext_src && 438 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) { 439 440 net_rx_rss_add_chunk(rss_input, &bytes_written, 441 &packet_info.in6_ext_src, 442 sizeof(packet_info.in6_ext_src)); 443 } else { 444 net_rx_rss_add_chunk(rss_input, &bytes_written, 445 &packet_info.in6_src, 446 sizeof(packet_info.in6_src)); 447 } 448 if (packet_info.is_ipv6_ext_dst && 449 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) { 450 451 net_rx_rss_add_chunk(rss_input, &bytes_written, 452 &packet_info.in6_ext_dst, 453 sizeof(packet_info.in6_ext_dst)); 454 } else { 455 net_rx_rss_add_chunk(rss_input, &bytes_written, 456 &packet_info.in6_dst, 457 sizeof(packet_info.in6_dst)); 458 } 459 net_rx_rss_add_chunk(rss_input, &bytes_written, 460 &packet_info.src_port, 461 sizeof(packet_info.src_port)); 462 net_rx_rss_add_chunk(rss_input, &bytes_written, 463 &packet_info.dst_port, 464 sizeof(packet_info.dst_port)); 465 } else if (packet_info.is_udp && 466 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) { 467 468 if (packet_info.is_ipv6_ext_src && 469 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) { 470 471 net_rx_rss_add_chunk(rss_input, &bytes_written, 472 &packet_info.in6_ext_src, 473 sizeof(packet_info.in6_ext_src)); 474 } else { 475 net_rx_rss_add_chunk(rss_input, &bytes_written, 476 &packet_info.in6_src, 477 sizeof(packet_info.in6_src)); 478 } 479 if (packet_info.is_ipv6_ext_dst && 480 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) { 481 482 net_rx_rss_add_chunk(rss_input, &bytes_written, 483 &packet_info.in6_ext_dst, 484 sizeof(packet_info.in6_ext_dst)); 485 } else { 486 net_rx_rss_add_chunk(rss_input, &bytes_written, 487 &packet_info.in6_dst, 488 sizeof(packet_info.in6_dst)); 489 } 490 491 net_rx_rss_add_chunk(rss_input, &bytes_written, 492 &packet_info.src_port, 493 sizeof(packet_info.src_port)); 494 net_rx_rss_add_chunk(rss_input, &bytes_written, 495 &packet_info.dst_port, 496 sizeof(packet_info.dst_port)); 497 498 } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) { 499 if (packet_info.is_ipv6_ext_src && 500 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) { 501 502 net_rx_rss_add_chunk(rss_input, &bytes_written, 503 &packet_info.in6_ext_src, 504 sizeof(packet_info.in6_ext_src)); 505 } else { 506 net_rx_rss_add_chunk(rss_input, &bytes_written, 507 &packet_info.in6_src, 508 sizeof(packet_info.in6_src)); 509 } 510 if (packet_info.is_ipv6_ext_dst && 511 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) { 512 513 net_rx_rss_add_chunk(rss_input, &bytes_written, 514 &packet_info.in6_ext_dst, 515 sizeof(packet_info.in6_ext_dst)); 516 } else { 517 net_rx_rss_add_chunk(rss_input, &bytes_written, 518 &packet_info.in6_dst, 519 sizeof(packet_info.in6_dst)); 520 } 521 } 522 } 523 524 if (bytes_written) { 525 net_toeplitz_add(&result, rss_input, bytes_written, toe); 526 } 527 528 return result; 529 } 530 531 SEC("tun_rss_steering") 532 int tun_rss_steering_prog(struct __sk_buff *skb) 533 { 534 535 struct rss_config_t *config; 536 struct toeplitz_key_data_t *toe; 537 538 __u32 key = 0; 539 __u32 hash = 0; 540 541 config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key); 542 toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key); 543 544 if (config && toe) { 545 if (!config->redirect) { 546 return config->default_queue; 547 } 548 549 hash = calculate_rss_hash(skb, config, toe); 550 if (hash) { 551 __u32 table_idx = hash % config->indirections_len; 552 __u16 *queue = 0; 553 554 queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table, 555 &table_idx); 556 557 if (queue) { 558 return *queue; 559 } 560 } 561 562 return config->default_queue; 563 } 564 565 return -1; 566 } 567 568 char _license[] SEC("license") = "GPL v2"; 569