1 /* 2 * eBPF RSS program 3 * 4 * Developed by Daynix Computing LTD (http://www.daynix.com) 5 * 6 * Authors: 7 * Andrew Melnychenko <andrew@daynix.com> 8 * Yuri Benditovich <yuri.benditovich@daynix.com> 9 * 10 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * the COPYING file in the top-level directory. 12 * 13 * Prepare: 14 * Requires llvm, clang, bpftool, linux kernel tree 15 * 16 * Build rss.bpf.skeleton.h: 17 * make -f Makefile.ebpf clean all 18 */ 19 20 #include <stddef.h> 21 #include <stdbool.h> 22 #include <linux/bpf.h> 23 24 #include <linux/in.h> 25 #include <linux/if_ether.h> 26 #include <linux/ip.h> 27 #include <linux/ipv6.h> 28 29 #include <linux/udp.h> 30 #include <linux/tcp.h> 31 32 #include <bpf/bpf_helpers.h> 33 #include <bpf/bpf_endian.h> 34 #include <linux/virtio_net.h> 35 36 #define INDIRECTION_TABLE_SIZE 128 37 #define HASH_CALCULATION_BUFFER_SIZE 36 38 39 struct rss_config_t { 40 __u8 redirect; 41 __u8 populate_hash; 42 __u32 hash_types; 43 __u16 indirections_len; 44 __u16 default_queue; 45 } __attribute__((packed)); 46 47 struct toeplitz_key_data_t { 48 __u32 leftmost_32_bits; 49 __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE]; 50 }; 51 52 struct packet_hash_info_t { 53 __u8 is_ipv4; 54 __u8 is_ipv6; 55 __u8 is_udp; 56 __u8 is_tcp; 57 __u8 is_ipv6_ext_src; 58 __u8 is_ipv6_ext_dst; 59 __u8 is_fragmented; 60 61 __u16 src_port; 62 __u16 dst_port; 63 64 union { 65 struct { 66 __be32 in_src; 67 __be32 in_dst; 68 }; 69 70 struct { 71 struct in6_addr in6_src; 72 struct in6_addr in6_dst; 73 struct in6_addr in6_ext_src; 74 struct in6_addr in6_ext_dst; 75 }; 76 }; 77 }; 78 79 struct { 80 __uint(type, BPF_MAP_TYPE_ARRAY); 81 __uint(key_size, sizeof(__u32)); 82 __uint(value_size, sizeof(struct rss_config_t)); 83 __uint(max_entries, 1); 84 __uint(map_flags, BPF_F_MMAPABLE); 85 } tap_rss_map_configurations SEC(".maps"); 86 87 struct { 88 __uint(type, BPF_MAP_TYPE_ARRAY); 89 __uint(key_size, sizeof(__u32)); 90 __uint(value_size, sizeof(struct toeplitz_key_data_t)); 91 __uint(max_entries, 1); 92 __uint(map_flags, BPF_F_MMAPABLE); 93 } tap_rss_map_toeplitz_key SEC(".maps"); 94 95 struct { 96 __uint(type, BPF_MAP_TYPE_ARRAY); 97 __uint(key_size, sizeof(__u32)); 98 __uint(value_size, sizeof(__u16)); 99 __uint(max_entries, INDIRECTION_TABLE_SIZE); 100 __uint(map_flags, BPF_F_MMAPABLE); 101 } tap_rss_map_indirection_table SEC(".maps"); 102 103 static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written, 104 const void *ptr, size_t size) { 105 __builtin_memcpy(&rss_input[*bytes_written], ptr, size); 106 *bytes_written += size; 107 } 108 109 static inline 110 void net_toeplitz_add(__u32 *result, 111 __u8 *input, 112 __u32 len 113 , struct toeplitz_key_data_t *key) { 114 115 __u32 accumulator = *result; 116 __u32 leftmost_32_bits = key->leftmost_32_bits; 117 __u32 byte; 118 119 for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) { 120 __u8 input_byte = input[byte]; 121 __u8 key_byte = key->next_byte[byte]; 122 __u8 bit; 123 124 for (bit = 0; bit < 8; bit++) { 125 if (input_byte & (1 << 7)) { 126 accumulator ^= leftmost_32_bits; 127 } 128 129 leftmost_32_bits = 130 (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7); 131 132 input_byte <<= 1; 133 key_byte <<= 1; 134 } 135 } 136 137 *result = accumulator; 138 } 139 140 141 static inline int ip6_extension_header_type(__u8 hdr_type) 142 { 143 switch (hdr_type) { 144 case IPPROTO_HOPOPTS: 145 case IPPROTO_ROUTING: 146 case IPPROTO_FRAGMENT: 147 case IPPROTO_ICMPV6: 148 case IPPROTO_NONE: 149 case IPPROTO_DSTOPTS: 150 case IPPROTO_MH: 151 return 1; 152 default: 153 return 0; 154 } 155 } 156 /* 157 * According to 158 * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml 159 * we expect that there are would be no more than 11 extensions in IPv6 header, 160 * also there is 27 TLV options for Destination and Hop-by-hop extensions. 161 * Need to choose reasonable amount of maximum extensions/options we may 162 * check to find ext src/dst. 163 */ 164 #define IP6_EXTENSIONS_COUNT 11 165 #define IP6_OPTIONS_COUNT 30 166 167 static inline int parse_ipv6_ext(struct __sk_buff *skb, 168 struct packet_hash_info_t *info, 169 __u8 *l4_protocol, size_t *l4_offset) 170 { 171 int err = 0; 172 173 if (!ip6_extension_header_type(*l4_protocol)) { 174 return 0; 175 } 176 177 struct ipv6_opt_hdr ext_hdr = {}; 178 179 for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) { 180 181 err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr, 182 sizeof(ext_hdr), BPF_HDR_START_NET); 183 if (err) { 184 goto error; 185 } 186 187 if (*l4_protocol == IPPROTO_ROUTING) { 188 struct ipv6_rt_hdr ext_rt = {}; 189 190 err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt, 191 sizeof(ext_rt), BPF_HDR_START_NET); 192 if (err) { 193 goto error; 194 } 195 196 if ((ext_rt.type == IPV6_SRCRT_TYPE_2) && 197 (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) && 198 (ext_rt.segments_left == 1)) { 199 200 err = bpf_skb_load_bytes_relative(skb, 201 *l4_offset + offsetof(struct rt2_hdr, addr), 202 &info->in6_ext_dst, sizeof(info->in6_ext_dst), 203 BPF_HDR_START_NET); 204 if (err) { 205 goto error; 206 } 207 208 info->is_ipv6_ext_dst = 1; 209 } 210 211 } else if (*l4_protocol == IPPROTO_DSTOPTS) { 212 struct ipv6_opt_t { 213 __u8 type; 214 __u8 length; 215 } __attribute__((packed)) opt = {}; 216 217 size_t opt_offset = sizeof(ext_hdr); 218 219 for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) { 220 err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset, 221 &opt, sizeof(opt), BPF_HDR_START_NET); 222 if (err) { 223 goto error; 224 } 225 226 if (opt.type == IPV6_TLV_HAO) { 227 err = bpf_skb_load_bytes_relative(skb, 228 *l4_offset + opt_offset 229 + offsetof(struct ipv6_destopt_hao, addr), 230 &info->in6_ext_src, sizeof(info->in6_ext_src), 231 BPF_HDR_START_NET); 232 if (err) { 233 goto error; 234 } 235 236 info->is_ipv6_ext_src = 1; 237 break; 238 } 239 240 opt_offset += (opt.type == IPV6_TLV_PAD1) ? 241 1 : opt.length + sizeof(opt); 242 243 if (opt_offset + 1 >= ext_hdr.hdrlen * 8) { 244 break; 245 } 246 } 247 } else if (*l4_protocol == IPPROTO_FRAGMENT) { 248 info->is_fragmented = true; 249 } 250 251 *l4_protocol = ext_hdr.nexthdr; 252 *l4_offset += (ext_hdr.hdrlen + 1) * 8; 253 254 if (!ip6_extension_header_type(ext_hdr.nexthdr)) { 255 return 0; 256 } 257 } 258 259 return 0; 260 error: 261 return err; 262 } 263 264 static __be16 parse_eth_type(struct __sk_buff *skb) 265 { 266 unsigned int offset = 12; 267 __be16 ret = 0; 268 int err = 0; 269 270 err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret), 271 BPF_HDR_START_MAC); 272 if (err) { 273 return 0; 274 } 275 276 switch (bpf_ntohs(ret)) { 277 case ETH_P_8021AD: 278 offset += 4; 279 case ETH_P_8021Q: 280 offset += 4; 281 err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret), 282 BPF_HDR_START_MAC); 283 default: 284 break; 285 } 286 287 if (err) { 288 return 0; 289 } 290 291 return ret; 292 } 293 294 static inline int parse_packet(struct __sk_buff *skb, 295 struct packet_hash_info_t *info) 296 { 297 int err = 0; 298 299 if (!info || !skb) { 300 return -1; 301 } 302 303 size_t l4_offset = 0; 304 __u8 l4_protocol = 0; 305 __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb)); 306 if (l3_protocol == 0) { 307 err = -1; 308 goto error; 309 } 310 311 if (l3_protocol == ETH_P_IP) { 312 info->is_ipv4 = 1; 313 314 struct iphdr ip = {}; 315 err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip), 316 BPF_HDR_START_NET); 317 if (err) { 318 goto error; 319 } 320 321 info->in_src = ip.saddr; 322 info->in_dst = ip.daddr; 323 info->is_fragmented = !!(bpf_ntohs(ip.frag_off) & (0x2000 | 0x1fff)); 324 325 l4_protocol = ip.protocol; 326 l4_offset = ip.ihl * 4; 327 } else if (l3_protocol == ETH_P_IPV6) { 328 info->is_ipv6 = 1; 329 330 struct ipv6hdr ip6 = {}; 331 err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6), 332 BPF_HDR_START_NET); 333 if (err) { 334 goto error; 335 } 336 337 info->in6_src = ip6.saddr; 338 info->in6_dst = ip6.daddr; 339 340 l4_protocol = ip6.nexthdr; 341 l4_offset = sizeof(ip6); 342 343 err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset); 344 if (err) { 345 goto error; 346 } 347 } 348 349 if (l4_protocol != 0 && !info->is_fragmented) { 350 if (l4_protocol == IPPROTO_TCP) { 351 info->is_tcp = 1; 352 353 struct tcphdr tcp = {}; 354 err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp), 355 BPF_HDR_START_NET); 356 if (err) { 357 goto error; 358 } 359 360 info->src_port = tcp.source; 361 info->dst_port = tcp.dest; 362 } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */ 363 info->is_udp = 1; 364 365 struct udphdr udp = {}; 366 err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp), 367 BPF_HDR_START_NET); 368 if (err) { 369 goto error; 370 } 371 372 info->src_port = udp.source; 373 info->dst_port = udp.dest; 374 } 375 } 376 377 return 0; 378 379 error: 380 return err; 381 } 382 383 static inline bool calculate_rss_hash(struct __sk_buff *skb, 384 struct rss_config_t *config, 385 struct toeplitz_key_data_t *toe, 386 __u32 *result) 387 { 388 __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {}; 389 size_t bytes_written = 0; 390 int err = 0; 391 struct packet_hash_info_t packet_info = {}; 392 393 err = parse_packet(skb, &packet_info); 394 if (err) { 395 return false; 396 } 397 398 if (packet_info.is_ipv4) { 399 if (packet_info.is_tcp && 400 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) { 401 402 net_rx_rss_add_chunk(rss_input, &bytes_written, 403 &packet_info.in_src, 404 sizeof(packet_info.in_src)); 405 net_rx_rss_add_chunk(rss_input, &bytes_written, 406 &packet_info.in_dst, 407 sizeof(packet_info.in_dst)); 408 net_rx_rss_add_chunk(rss_input, &bytes_written, 409 &packet_info.src_port, 410 sizeof(packet_info.src_port)); 411 net_rx_rss_add_chunk(rss_input, &bytes_written, 412 &packet_info.dst_port, 413 sizeof(packet_info.dst_port)); 414 } else if (packet_info.is_udp && 415 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) { 416 417 net_rx_rss_add_chunk(rss_input, &bytes_written, 418 &packet_info.in_src, 419 sizeof(packet_info.in_src)); 420 net_rx_rss_add_chunk(rss_input, &bytes_written, 421 &packet_info.in_dst, 422 sizeof(packet_info.in_dst)); 423 net_rx_rss_add_chunk(rss_input, &bytes_written, 424 &packet_info.src_port, 425 sizeof(packet_info.src_port)); 426 net_rx_rss_add_chunk(rss_input, &bytes_written, 427 &packet_info.dst_port, 428 sizeof(packet_info.dst_port)); 429 } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { 430 net_rx_rss_add_chunk(rss_input, &bytes_written, 431 &packet_info.in_src, 432 sizeof(packet_info.in_src)); 433 net_rx_rss_add_chunk(rss_input, &bytes_written, 434 &packet_info.in_dst, 435 sizeof(packet_info.in_dst)); 436 } 437 } else if (packet_info.is_ipv6) { 438 if (packet_info.is_tcp && 439 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) { 440 441 if (packet_info.is_ipv6_ext_src && 442 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) { 443 444 net_rx_rss_add_chunk(rss_input, &bytes_written, 445 &packet_info.in6_ext_src, 446 sizeof(packet_info.in6_ext_src)); 447 } else { 448 net_rx_rss_add_chunk(rss_input, &bytes_written, 449 &packet_info.in6_src, 450 sizeof(packet_info.in6_src)); 451 } 452 if (packet_info.is_ipv6_ext_dst && 453 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) { 454 455 net_rx_rss_add_chunk(rss_input, &bytes_written, 456 &packet_info.in6_ext_dst, 457 sizeof(packet_info.in6_ext_dst)); 458 } else { 459 net_rx_rss_add_chunk(rss_input, &bytes_written, 460 &packet_info.in6_dst, 461 sizeof(packet_info.in6_dst)); 462 } 463 net_rx_rss_add_chunk(rss_input, &bytes_written, 464 &packet_info.src_port, 465 sizeof(packet_info.src_port)); 466 net_rx_rss_add_chunk(rss_input, &bytes_written, 467 &packet_info.dst_port, 468 sizeof(packet_info.dst_port)); 469 } else if (packet_info.is_udp && 470 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) { 471 472 if (packet_info.is_ipv6_ext_src && 473 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) { 474 475 net_rx_rss_add_chunk(rss_input, &bytes_written, 476 &packet_info.in6_ext_src, 477 sizeof(packet_info.in6_ext_src)); 478 } else { 479 net_rx_rss_add_chunk(rss_input, &bytes_written, 480 &packet_info.in6_src, 481 sizeof(packet_info.in6_src)); 482 } 483 if (packet_info.is_ipv6_ext_dst && 484 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) { 485 486 net_rx_rss_add_chunk(rss_input, &bytes_written, 487 &packet_info.in6_ext_dst, 488 sizeof(packet_info.in6_ext_dst)); 489 } else { 490 net_rx_rss_add_chunk(rss_input, &bytes_written, 491 &packet_info.in6_dst, 492 sizeof(packet_info.in6_dst)); 493 } 494 495 net_rx_rss_add_chunk(rss_input, &bytes_written, 496 &packet_info.src_port, 497 sizeof(packet_info.src_port)); 498 net_rx_rss_add_chunk(rss_input, &bytes_written, 499 &packet_info.dst_port, 500 sizeof(packet_info.dst_port)); 501 502 } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) { 503 if (packet_info.is_ipv6_ext_src && 504 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) { 505 506 net_rx_rss_add_chunk(rss_input, &bytes_written, 507 &packet_info.in6_ext_src, 508 sizeof(packet_info.in6_ext_src)); 509 } else { 510 net_rx_rss_add_chunk(rss_input, &bytes_written, 511 &packet_info.in6_src, 512 sizeof(packet_info.in6_src)); 513 } 514 if (packet_info.is_ipv6_ext_dst && 515 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) { 516 517 net_rx_rss_add_chunk(rss_input, &bytes_written, 518 &packet_info.in6_ext_dst, 519 sizeof(packet_info.in6_ext_dst)); 520 } else { 521 net_rx_rss_add_chunk(rss_input, &bytes_written, 522 &packet_info.in6_dst, 523 sizeof(packet_info.in6_dst)); 524 } 525 } 526 } 527 528 if (!bytes_written) { 529 return false; 530 } 531 532 net_toeplitz_add(result, rss_input, bytes_written, toe); 533 534 return true; 535 } 536 537 SEC("socket") 538 int tun_rss_steering_prog(struct __sk_buff *skb) 539 { 540 541 struct rss_config_t *config; 542 struct toeplitz_key_data_t *toe; 543 544 __u32 key = 0; 545 __u32 hash = 0; 546 547 config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key); 548 toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key); 549 550 if (!config || !toe) { 551 return 0; 552 } 553 554 if (config->redirect && calculate_rss_hash(skb, config, toe, &hash)) { 555 __u32 table_idx = hash % config->indirections_len; 556 __u16 *queue = 0; 557 558 queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table, 559 &table_idx); 560 561 if (queue) { 562 return *queue; 563 } 564 } 565 566 return config->default_queue; 567 } 568 569 char _license[] SEC("license") = "GPL v2"; 570