1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 3 /* COMMON Applications Kept Enhanced (CAKE) discipline 4 * 5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> 6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> 7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> 8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> 9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> 10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> 11 * 12 * The CAKE Principles: 13 * (or, how to have your cake and eat it too) 14 * 15 * This is a combination of several shaping, AQM and FQ techniques into one 16 * easy-to-use package: 17 * 18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE 19 * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), 20 * eliminating the need for any sort of burst parameter (eg. token bucket 21 * depth). Burst support is limited to that necessary to overcome scheduling 22 * latency. 23 * 24 * - A Diffserv-aware priority queue, giving more priority to certain classes, 25 * up to a specified fraction of bandwidth. Above that bandwidth threshold, 26 * the priority is reduced to avoid starving other tins. 27 * 28 * - Each priority tin has a separate Flow Queue system, to isolate traffic 29 * flows from each other. This prevents a burst on one flow from increasing 30 * the delay to another. Flows are distributed to queues using a 31 * set-associative hash function. 32 * 33 * - Each queue is actively managed by Cobalt, which is a combination of the 34 * Codel and Blue AQM algorithms. This serves flows fairly, and signals 35 * congestion early via ECN (if available) and/or packet drops, to keep 36 * latency low. The codel parameters are auto-tuned based on the bandwidth 37 * setting, as is necessary at low bandwidths. 38 * 39 * The configuration parameters are kept deliberately simple for ease of use. 40 * Everything has sane defaults. Complete generality of configuration is *not* 41 * a goal. 42 * 43 * The priority queue operates according to a weighted DRR scheme, combined with 44 * a bandwidth tracker which reuses the shaper logic to detect which side of the 45 * bandwidth sharing threshold the tin is operating. This determines whether a 46 * priority-based weight (high) or a bandwidth-based weight (low) is used for 47 * that tin in the current pass. 48 * 49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly 50 * granted us permission to leverage. 
51 */ 52 53 #include <linux/module.h> 54 #include <linux/types.h> 55 #include <linux/kernel.h> 56 #include <linux/jiffies.h> 57 #include <linux/string.h> 58 #include <linux/in.h> 59 #include <linux/errno.h> 60 #include <linux/init.h> 61 #include <linux/skbuff.h> 62 #include <linux/jhash.h> 63 #include <linux/slab.h> 64 #include <linux/vmalloc.h> 65 #include <linux/reciprocal_div.h> 66 #include <net/netlink.h> 67 #include <linux/if_vlan.h> 68 #include <net/pkt_sched.h> 69 #include <net/pkt_cls.h> 70 #include <net/tcp.h> 71 #include <net/flow_dissector.h> 72 73 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 74 #include <net/netfilter/nf_conntrack_core.h> 75 #endif 76 77 #define CAKE_SET_WAYS (8) 78 #define CAKE_MAX_TINS (8) 79 #define CAKE_QUEUES (1024) 80 #define CAKE_FLOW_MASK 63 81 #define CAKE_FLOW_NAT_FLAG 64 82 83 /* struct cobalt_params - contains codel and blue parameters 84 * @interval: codel initial drop rate 85 * @target: maximum persistent sojourn time & blue update rate 86 * @mtu_time: serialisation delay of maximum-size packet 87 * @p_inc: increment of blue drop probability (0.32 fxp) 88 * @p_dec: decrement of blue drop probability (0.32 fxp) 89 */ 90 struct cobalt_params { 91 u64 interval; 92 u64 target; 93 u64 mtu_time; 94 u32 p_inc; 95 u32 p_dec; 96 }; 97 98 /* struct cobalt_vars - contains codel and blue variables 99 * @count: codel dropping frequency 100 * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 101 * @drop_next: time to drop next packet, or when we dropped last 102 * @blue_timer: Blue time to next drop 103 * @p_drop: BLUE drop probability (0.32 fxp) 104 * @dropping: set if in dropping state 105 * @ecn_marked: set if marked 106 */ 107 struct cobalt_vars { 108 u32 count; 109 u32 rec_inv_sqrt; 110 ktime_t drop_next; 111 ktime_t blue_timer; 112 u32 p_drop; 113 bool dropping; 114 bool ecn_marked; 115 }; 116 117 enum { 118 CAKE_SET_NONE = 0, 119 CAKE_SET_SPARSE, 120 CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ 121 CAKE_SET_BULK, 122 CAKE_SET_DECAYING 123 }; 124 125 struct cake_flow { 126 /* this stuff is all needed per-flow at dequeue time */ 127 struct sk_buff *head; 128 struct sk_buff *tail; 129 struct list_head flowchain; 130 s32 deficit; 131 u32 dropped; 132 struct cobalt_vars cvars; 133 u16 srchost; /* index into cake_host table */ 134 u16 dsthost; 135 u8 set; 136 }; /* please try to keep this structure <= 64 bytes */ 137 138 struct cake_host { 139 u32 srchost_tag; 140 u32 dsthost_tag; 141 u16 srchost_bulk_flow_count; 142 u16 dsthost_bulk_flow_count; 143 }; 144 145 struct cake_heap_entry { 146 u16 t:3, b:10; 147 }; 148 149 struct cake_tin_data { 150 struct cake_flow flows[CAKE_QUEUES]; 151 u32 backlogs[CAKE_QUEUES]; 152 u32 tags[CAKE_QUEUES]; /* for set association */ 153 u16 overflow_idx[CAKE_QUEUES]; 154 struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ 155 u16 flow_quantum; 156 157 struct cobalt_params cparams; 158 u32 drop_overlimit; 159 u16 bulk_flow_count; 160 u16 sparse_flow_count; 161 u16 decaying_flow_count; 162 u16 unresponsive_flow_count; 163 164 u32 max_skblen; 165 166 struct list_head new_flows; 167 struct list_head old_flows; 168 struct list_head decaying_flows; 169 170 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 171 ktime_t time_next_packet; 172 u64 tin_rate_ns; 173 u64 tin_rate_bps; 174 u16 tin_rate_shft; 175 176 u16 tin_quantum; 177 s32 tin_deficit; 178 u32 tin_backlog; 179 u32 tin_dropped; 180 u32 tin_ecn_mark; 181 182 u32 packets; 183 u64 bytes; 184 185 u32 ack_drops; 186 187 /* moving averages */ 188 
u64 avge_delay; 189 u64 peak_delay; 190 u64 base_delay; 191 192 /* hash function stats */ 193 u32 way_directs; 194 u32 way_hits; 195 u32 way_misses; 196 u32 way_collisions; 197 }; /* number of tins is small, so size of this struct doesn't matter much */ 198 199 struct cake_sched_data { 200 struct tcf_proto __rcu *filter_list; /* optional external classifier */ 201 struct tcf_block *block; 202 struct cake_tin_data *tins; 203 204 struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; 205 u16 overflow_timeout; 206 207 u16 tin_cnt; 208 u8 tin_mode; 209 u8 flow_mode; 210 u8 ack_filter; 211 u8 atm_mode; 212 213 u32 fwmark_mask; 214 u16 fwmark_shft; 215 216 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 217 u16 rate_shft; 218 ktime_t time_next_packet; 219 ktime_t failsafe_next_packet; 220 u64 rate_ns; 221 u64 rate_bps; 222 u16 rate_flags; 223 s16 rate_overhead; 224 u16 rate_mpu; 225 u64 interval; 226 u64 target; 227 228 /* resource tracking */ 229 u32 buffer_used; 230 u32 buffer_max_used; 231 u32 buffer_limit; 232 u32 buffer_config_limit; 233 234 /* indices for dequeue */ 235 u16 cur_tin; 236 u16 cur_flow; 237 238 struct qdisc_watchdog watchdog; 239 const u8 *tin_index; 240 const u8 *tin_order; 241 242 /* bandwidth capacity estimate */ 243 ktime_t last_packet_time; 244 ktime_t avg_window_begin; 245 u64 avg_packet_interval; 246 u64 avg_window_bytes; 247 u64 avg_peak_bandwidth; 248 ktime_t last_reconfig_time; 249 250 /* packet length stats */ 251 u32 avg_netoff; 252 u16 max_netlen; 253 u16 max_adjlen; 254 u16 min_netlen; 255 u16 min_adjlen; 256 }; 257 258 enum { 259 CAKE_FLAG_OVERHEAD = BIT(0), 260 CAKE_FLAG_AUTORATE_INGRESS = BIT(1), 261 CAKE_FLAG_INGRESS = BIT(2), 262 CAKE_FLAG_WASH = BIT(3), 263 CAKE_FLAG_SPLIT_GSO = BIT(4) 264 }; 265 266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to 267 * obtain the best features of each. Codel is excellent on flows which 268 * respond to congestion signals in a TCP-like way. BLUE is more effective on 269 * unresponsive flows. 
270 */ 271 272 struct cobalt_skb_cb { 273 ktime_t enqueue_time; 274 u32 adjusted_len; 275 }; 276 277 static u64 us_to_ns(u64 us) 278 { 279 return us * NSEC_PER_USEC; 280 } 281 282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) 283 { 284 qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); 285 return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; 286 } 287 288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) 289 { 290 return get_cobalt_cb(skb)->enqueue_time; 291 } 292 293 static void cobalt_set_enqueue_time(struct sk_buff *skb, 294 ktime_t now) 295 { 296 get_cobalt_cb(skb)->enqueue_time = now; 297 } 298 299 static u16 quantum_div[CAKE_QUEUES + 1] = {0}; 300 301 /* Diffserv lookup tables */ 302 303 static const u8 precedence[] = { 304 0, 0, 0, 0, 0, 0, 0, 0, 305 1, 1, 1, 1, 1, 1, 1, 1, 306 2, 2, 2, 2, 2, 2, 2, 2, 307 3, 3, 3, 3, 3, 3, 3, 3, 308 4, 4, 4, 4, 4, 4, 4, 4, 309 5, 5, 5, 5, 5, 5, 5, 5, 310 6, 6, 6, 6, 6, 6, 6, 6, 311 7, 7, 7, 7, 7, 7, 7, 7, 312 }; 313 314 static const u8 diffserv8[] = { 315 2, 5, 1, 2, 4, 2, 2, 2, 316 0, 2, 1, 2, 1, 2, 1, 2, 317 5, 2, 4, 2, 4, 2, 4, 2, 318 3, 2, 3, 2, 3, 2, 3, 2, 319 6, 2, 3, 2, 3, 2, 3, 2, 320 6, 2, 2, 2, 6, 2, 6, 2, 321 7, 2, 2, 2, 2, 2, 2, 2, 322 7, 2, 2, 2, 2, 2, 2, 2, 323 }; 324 325 static const u8 diffserv4[] = { 326 0, 2, 0, 0, 2, 0, 0, 0, 327 1, 0, 0, 0, 0, 0, 0, 0, 328 2, 0, 2, 0, 2, 0, 2, 0, 329 2, 0, 2, 0, 2, 0, 2, 0, 330 3, 0, 2, 0, 2, 0, 2, 0, 331 3, 0, 0, 0, 3, 0, 3, 0, 332 3, 0, 0, 0, 0, 0, 0, 0, 333 3, 0, 0, 0, 0, 0, 0, 0, 334 }; 335 336 static const u8 diffserv3[] = { 337 0, 0, 0, 0, 2, 0, 0, 0, 338 1, 0, 0, 0, 0, 0, 0, 0, 339 0, 0, 0, 0, 0, 0, 0, 0, 340 0, 0, 0, 0, 0, 0, 0, 0, 341 0, 0, 0, 0, 0, 0, 0, 0, 342 0, 0, 0, 0, 2, 0, 2, 0, 343 2, 0, 0, 0, 0, 0, 0, 0, 344 2, 0, 0, 0, 0, 0, 0, 0, 345 }; 346 347 static const u8 besteffort[] = { 348 0, 0, 0, 0, 0, 0, 0, 0, 349 0, 0, 0, 0, 0, 0, 0, 0, 350 0, 0, 0, 0, 0, 0, 0, 0, 351 0, 0, 0, 0, 0, 0, 0, 0, 352 0, 0, 0, 0, 0, 0, 0, 0, 353 0, 0, 0, 0, 0, 0, 0, 0, 354 0, 0, 0, 0, 0, 0, 0, 0, 355 0, 0, 0, 0, 0, 0, 0, 0, 356 }; 357 358 /* tin priority order for stats dumping */ 359 360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; 361 static const u8 bulk_order[] = {1, 0, 2, 3}; 362 363 #define REC_INV_SQRT_CACHE (16) 364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; 365 366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots 367 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) 368 * 369 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 370 */ 371 372 static void cobalt_newton_step(struct cobalt_vars *vars) 373 { 374 u32 invsqrt, invsqrt2; 375 u64 val; 376 377 invsqrt = vars->rec_inv_sqrt; 378 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; 379 val = (3LL << 32) - ((u64)vars->count * invsqrt2); 380 381 val >>= 2; /* avoid overflow in following multiply */ 382 val = (val * invsqrt) >> (32 - 2 + 1); 383 384 vars->rec_inv_sqrt = val; 385 } 386 387 static void cobalt_invsqrt(struct cobalt_vars *vars) 388 { 389 if (vars->count < REC_INV_SQRT_CACHE) 390 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; 391 else 392 cobalt_newton_step(vars); 393 } 394 395 /* There is a big difference in timing between the accurate values placed in 396 * the cache and the approximations given by a single Newton step for small 397 * count values, particularly when stepping from count 1 to 2 or vice versa. 
398 * Above 16, a single Newton step gives sufficient accuracy in either 399 * direction, given the precision stored. 400 * 401 * The magnitude of the error when stepping up to count 2 is such as to give 402 * the value that *should* have been produced at count 4. 403 */ 404 405 static void cobalt_cache_init(void) 406 { 407 struct cobalt_vars v; 408 409 memset(&v, 0, sizeof(v)); 410 v.rec_inv_sqrt = ~0U; 411 cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; 412 413 for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { 414 cobalt_newton_step(&v); 415 cobalt_newton_step(&v); 416 cobalt_newton_step(&v); 417 cobalt_newton_step(&v); 418 419 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; 420 } 421 } 422 423 static void cobalt_vars_init(struct cobalt_vars *vars) 424 { 425 memset(vars, 0, sizeof(*vars)); 426 427 if (!cobalt_rec_inv_sqrt_cache[0]) { 428 cobalt_cache_init(); 429 cobalt_rec_inv_sqrt_cache[0] = ~0; 430 } 431 } 432 433 /* CoDel control_law is t + interval/sqrt(count) 434 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid 435 * both sqrt() and divide operation. 436 */ 437 static ktime_t cobalt_control(ktime_t t, 438 u64 interval, 439 u32 rec_inv_sqrt) 440 { 441 return ktime_add_ns(t, reciprocal_scale(interval, 442 rec_inv_sqrt)); 443 } 444 445 /* Call this when a packet had to be dropped due to queue overflow. Returns 446 * true if the BLUE state was quiescent before but active after this call. 447 */ 448 static bool cobalt_queue_full(struct cobalt_vars *vars, 449 struct cobalt_params *p, 450 ktime_t now) 451 { 452 bool up = false; 453 454 if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 455 up = !vars->p_drop; 456 vars->p_drop += p->p_inc; 457 if (vars->p_drop < p->p_inc) 458 vars->p_drop = ~0; 459 vars->blue_timer = now; 460 } 461 vars->dropping = true; 462 vars->drop_next = now; 463 if (!vars->count) 464 vars->count = 1; 465 466 return up; 467 } 468 469 /* Call this when the queue was serviced but turned out to be empty. Returns 470 * true if the BLUE state was active before but quiescent after this call. 471 */ 472 static bool cobalt_queue_empty(struct cobalt_vars *vars, 473 struct cobalt_params *p, 474 ktime_t now) 475 { 476 bool down = false; 477 478 if (vars->p_drop && 479 ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 480 if (vars->p_drop < p->p_dec) 481 vars->p_drop = 0; 482 else 483 vars->p_drop -= p->p_dec; 484 vars->blue_timer = now; 485 down = !vars->p_drop; 486 } 487 vars->dropping = false; 488 489 if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { 490 vars->count--; 491 cobalt_invsqrt(vars); 492 vars->drop_next = cobalt_control(vars->drop_next, 493 p->interval, 494 vars->rec_inv_sqrt); 495 } 496 497 return down; 498 } 499 500 /* Call this with a freshly dequeued packet for possible congestion marking. 501 * Returns true as an instruction to drop the packet, false for delivery. 502 */ 503 static bool cobalt_should_drop(struct cobalt_vars *vars, 504 struct cobalt_params *p, 505 ktime_t now, 506 struct sk_buff *skb, 507 u32 bulk_flows) 508 { 509 bool next_due, over_target, drop = false; 510 ktime_t schedule; 511 u64 sojourn; 512 513 /* The 'schedule' variable records, in its sign, whether 'now' is before or 514 * after 'drop_next'. This allows 'drop_next' to be updated before the next 515 * scheduling decision is actually branched, without destroying that 516 * information. Similarly, the first 'schedule' value calculated is preserved 517 * in the boolean 'next_due'. 
518 * 519 * As for 'drop_next', we take advantage of the fact that 'interval' is both 520 * the delay between first exceeding 'target' and the first signalling event, 521 * *and* the scaling factor for the signalling frequency. It's therefore very 522 * natural to use a single mechanism for both purposes, and eliminates a 523 * significant amount of reference Codel's spaghetti code. To help with this, 524 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close 525 * as possible to 1.0 in fixed-point. 526 */ 527 528 sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 529 schedule = ktime_sub(now, vars->drop_next); 530 over_target = sojourn > p->target && 531 sojourn > p->mtu_time * bulk_flows * 2 && 532 sojourn > p->mtu_time * 4; 533 next_due = vars->count && ktime_to_ns(schedule) >= 0; 534 535 vars->ecn_marked = false; 536 537 if (over_target) { 538 if (!vars->dropping) { 539 vars->dropping = true; 540 vars->drop_next = cobalt_control(now, 541 p->interval, 542 vars->rec_inv_sqrt); 543 } 544 if (!vars->count) 545 vars->count = 1; 546 } else if (vars->dropping) { 547 vars->dropping = false; 548 } 549 550 if (next_due && vars->dropping) { 551 /* Use ECN mark if possible, otherwise drop */ 552 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); 553 554 vars->count++; 555 if (!vars->count) 556 vars->count--; 557 cobalt_invsqrt(vars); 558 vars->drop_next = cobalt_control(vars->drop_next, 559 p->interval, 560 vars->rec_inv_sqrt); 561 schedule = ktime_sub(now, vars->drop_next); 562 } else { 563 while (next_due) { 564 vars->count--; 565 cobalt_invsqrt(vars); 566 vars->drop_next = cobalt_control(vars->drop_next, 567 p->interval, 568 vars->rec_inv_sqrt); 569 schedule = ktime_sub(now, vars->drop_next); 570 next_due = vars->count && ktime_to_ns(schedule) >= 0; 571 } 572 } 573 574 /* Simple BLUE implementation. Lack of ECN is deliberate. */ 575 if (vars->p_drop) 576 drop |= (prandom_u32() < vars->p_drop); 577 578 /* Overload the drop_next field as an activity timeout */ 579 if (!vars->count) 580 vars->drop_next = ktime_add_ns(now, p->interval); 581 else if (ktime_to_ns(schedule) > 0 && !drop) 582 vars->drop_next = now; 583 584 return drop; 585 } 586 587 static bool cake_update_flowkeys(struct flow_keys *keys, 588 const struct sk_buff *skb) 589 { 590 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 591 struct nf_conntrack_tuple tuple = {}; 592 bool rev = !skb->_nfct, upd = false; 593 __be32 ip; 594 595 if (tc_skb_protocol(skb) != htons(ETH_P_IP)) 596 return false; 597 598 if (!nf_ct_get_tuple_skb(&tuple, skb)) 599 return false; 600 601 ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; 602 if (ip != keys->addrs.v4addrs.src) { 603 keys->addrs.v4addrs.src = ip; 604 upd = true; 605 } 606 ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; 607 if (ip != keys->addrs.v4addrs.dst) { 608 keys->addrs.v4addrs.dst = ip; 609 upd = true; 610 } 611 612 if (keys->ports.ports) { 613 __be16 port; 614 615 port = rev ? tuple.dst.u.all : tuple.src.u.all; 616 if (port != keys->ports.src) { 617 keys->ports.src = port; 618 upd = true; 619 } 620 port = rev ? tuple.src.u.all : tuple.dst.u.all; 621 if (port != keys->ports.dst) { 622 port = keys->ports.dst; 623 upd = true; 624 } 625 } 626 return upd; 627 #else 628 return false; 629 #endif 630 } 631 632 /* Cake has several subtle multiple bit settings. In these cases you 633 * would be matching triple isolate mode as well. 
634 */ 635 636 static bool cake_dsrc(int flow_mode) 637 { 638 return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; 639 } 640 641 static bool cake_ddst(int flow_mode) 642 { 643 return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; 644 } 645 646 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, 647 int flow_mode, u16 flow_override, u16 host_override) 648 { 649 bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); 650 bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); 651 bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); 652 u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; 653 u16 reduced_hash, srchost_idx, dsthost_idx; 654 struct flow_keys keys, host_keys; 655 bool use_skbhash = skb->l4_hash; 656 657 if (unlikely(flow_mode == CAKE_FLOW_NONE)) 658 return 0; 659 660 /* If both overrides are set, or we can use the SKB hash and nat mode is 661 * disabled, we can skip packet dissection entirely. If nat mode is 662 * enabled there's another check below after doing the conntrack lookup. 663 */ 664 if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) 665 goto skip_hash; 666 667 skb_flow_dissect_flow_keys(skb, &keys, 668 FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); 669 670 /* Don't use the SKB hash if we change the lookup keys from conntrack */ 671 if (nat_enabled && cake_update_flowkeys(&keys, skb)) 672 use_skbhash = false; 673 674 /* If we can still use the SKB hash and don't need the host hash, we can 675 * skip the rest of the hashing procedure 676 */ 677 if (use_skbhash && !hash_hosts) 678 goto skip_hash; 679 680 /* flow_hash_from_keys() sorts the addresses by value, so we have 681 * to preserve their order in a separate data structure to treat 682 * src and dst host addresses as independently selectable. 683 */ 684 host_keys = keys; 685 host_keys.ports.ports = 0; 686 host_keys.basic.ip_proto = 0; 687 host_keys.keyid.keyid = 0; 688 host_keys.tags.flow_label = 0; 689 690 switch (host_keys.control.addr_type) { 691 case FLOW_DISSECTOR_KEY_IPV4_ADDRS: 692 host_keys.addrs.v4addrs.src = 0; 693 dsthost_hash = flow_hash_from_keys(&host_keys); 694 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 695 host_keys.addrs.v4addrs.dst = 0; 696 srchost_hash = flow_hash_from_keys(&host_keys); 697 break; 698 699 case FLOW_DISSECTOR_KEY_IPV6_ADDRS: 700 memset(&host_keys.addrs.v6addrs.src, 0, 701 sizeof(host_keys.addrs.v6addrs.src)); 702 dsthost_hash = flow_hash_from_keys(&host_keys); 703 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 704 memset(&host_keys.addrs.v6addrs.dst, 0, 705 sizeof(host_keys.addrs.v6addrs.dst)); 706 srchost_hash = flow_hash_from_keys(&host_keys); 707 break; 708 709 default: 710 dsthost_hash = 0; 711 srchost_hash = 0; 712 } 713 714 /* This *must* be after the above switch, since as a 715 * side-effect it sorts the src and dst addresses. 
716 */ 717 if (hash_flows && !use_skbhash) 718 flow_hash = flow_hash_from_keys(&keys); 719 720 skip_hash: 721 if (flow_override) 722 flow_hash = flow_override - 1; 723 else if (use_skbhash) 724 flow_hash = skb->hash; 725 if (host_override) { 726 dsthost_hash = host_override - 1; 727 srchost_hash = host_override - 1; 728 } 729 730 if (!(flow_mode & CAKE_FLOW_FLOWS)) { 731 if (flow_mode & CAKE_FLOW_SRC_IP) 732 flow_hash ^= srchost_hash; 733 734 if (flow_mode & CAKE_FLOW_DST_IP) 735 flow_hash ^= dsthost_hash; 736 } 737 738 reduced_hash = flow_hash % CAKE_QUEUES; 739 740 /* set-associative hashing */ 741 /* fast path if no hash collision (direct lookup succeeds) */ 742 if (likely(q->tags[reduced_hash] == flow_hash && 743 q->flows[reduced_hash].set)) { 744 q->way_directs++; 745 } else { 746 u32 inner_hash = reduced_hash % CAKE_SET_WAYS; 747 u32 outer_hash = reduced_hash - inner_hash; 748 bool allocate_src = false; 749 bool allocate_dst = false; 750 u32 i, k; 751 752 /* check if any active queue in the set is reserved for 753 * this flow. 754 */ 755 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 756 i++, k = (k + 1) % CAKE_SET_WAYS) { 757 if (q->tags[outer_hash + k] == flow_hash) { 758 if (i) 759 q->way_hits++; 760 761 if (!q->flows[outer_hash + k].set) { 762 /* need to increment host refcnts */ 763 allocate_src = cake_dsrc(flow_mode); 764 allocate_dst = cake_ddst(flow_mode); 765 } 766 767 goto found; 768 } 769 } 770 771 /* no queue is reserved for this flow, look for an 772 * empty one. 773 */ 774 for (i = 0; i < CAKE_SET_WAYS; 775 i++, k = (k + 1) % CAKE_SET_WAYS) { 776 if (!q->flows[outer_hash + k].set) { 777 q->way_misses++; 778 allocate_src = cake_dsrc(flow_mode); 779 allocate_dst = cake_ddst(flow_mode); 780 goto found; 781 } 782 } 783 784 /* With no empty queues, default to the original 785 * queue, accept the collision, update the host tags. 
786 */ 787 q->way_collisions++; 788 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { 789 q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; 790 q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; 791 } 792 allocate_src = cake_dsrc(flow_mode); 793 allocate_dst = cake_ddst(flow_mode); 794 found: 795 /* reserve queue for future packets in same flow */ 796 reduced_hash = outer_hash + k; 797 q->tags[reduced_hash] = flow_hash; 798 799 if (allocate_src) { 800 srchost_idx = srchost_hash % CAKE_QUEUES; 801 inner_hash = srchost_idx % CAKE_SET_WAYS; 802 outer_hash = srchost_idx - inner_hash; 803 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 804 i++, k = (k + 1) % CAKE_SET_WAYS) { 805 if (q->hosts[outer_hash + k].srchost_tag == 806 srchost_hash) 807 goto found_src; 808 } 809 for (i = 0; i < CAKE_SET_WAYS; 810 i++, k = (k + 1) % CAKE_SET_WAYS) { 811 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) 812 break; 813 } 814 q->hosts[outer_hash + k].srchost_tag = srchost_hash; 815 found_src: 816 srchost_idx = outer_hash + k; 817 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 818 q->hosts[srchost_idx].srchost_bulk_flow_count++; 819 q->flows[reduced_hash].srchost = srchost_idx; 820 } 821 822 if (allocate_dst) { 823 dsthost_idx = dsthost_hash % CAKE_QUEUES; 824 inner_hash = dsthost_idx % CAKE_SET_WAYS; 825 outer_hash = dsthost_idx - inner_hash; 826 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 827 i++, k = (k + 1) % CAKE_SET_WAYS) { 828 if (q->hosts[outer_hash + k].dsthost_tag == 829 dsthost_hash) 830 goto found_dst; 831 } 832 for (i = 0; i < CAKE_SET_WAYS; 833 i++, k = (k + 1) % CAKE_SET_WAYS) { 834 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) 835 break; 836 } 837 q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; 838 found_dst: 839 dsthost_idx = outer_hash + k; 840 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 841 q->hosts[dsthost_idx].dsthost_bulk_flow_count++; 842 q->flows[reduced_hash].dsthost = dsthost_idx; 843 } 844 } 845 846 return reduced_hash; 847 } 848 849 /* helper functions : might be changed when/if skb use a standard list_head */ 850 /* remove one skb from head of slot queue */ 851 852 static struct sk_buff *dequeue_head(struct cake_flow *flow) 853 { 854 struct sk_buff *skb = flow->head; 855 856 if (skb) { 857 flow->head = skb->next; 858 skb_mark_not_on_list(skb); 859 } 860 861 return skb; 862 } 863 864 /* add skb to flow queue (tail add) */ 865 866 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) 867 { 868 if (!flow->head) 869 flow->head = skb; 870 else 871 flow->tail->next = skb; 872 flow->tail = skb; 873 skb->next = NULL; 874 } 875 876 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, 877 struct ipv6hdr *buf) 878 { 879 unsigned int offset = skb_network_offset(skb); 880 struct iphdr *iph; 881 882 iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); 883 884 if (!iph) 885 return NULL; 886 887 if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) 888 return skb_header_pointer(skb, offset + iph->ihl * 4, 889 sizeof(struct ipv6hdr), buf); 890 891 else if (iph->version == 4) 892 return iph; 893 894 else if (iph->version == 6) 895 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), 896 buf); 897 898 return NULL; 899 } 900 901 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, 902 void *buf, unsigned int bufsize) 903 { 904 unsigned int offset = skb_network_offset(skb); 905 const struct ipv6hdr *ipv6h; 906 const struct tcphdr *tcph; 907 const struct iphdr *iph; 908 
struct ipv6hdr _ipv6h; 909 struct tcphdr _tcph; 910 911 ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); 912 913 if (!ipv6h) 914 return NULL; 915 916 if (ipv6h->version == 4) { 917 iph = (struct iphdr *)ipv6h; 918 offset += iph->ihl * 4; 919 920 /* special-case 6in4 tunnelling, as that is a common way to get 921 * v6 connectivity in the home 922 */ 923 if (iph->protocol == IPPROTO_IPV6) { 924 ipv6h = skb_header_pointer(skb, offset, 925 sizeof(_ipv6h), &_ipv6h); 926 927 if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) 928 return NULL; 929 930 offset += sizeof(struct ipv6hdr); 931 932 } else if (iph->protocol != IPPROTO_TCP) { 933 return NULL; 934 } 935 936 } else if (ipv6h->version == 6) { 937 if (ipv6h->nexthdr != IPPROTO_TCP) 938 return NULL; 939 940 offset += sizeof(struct ipv6hdr); 941 } else { 942 return NULL; 943 } 944 945 tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); 946 if (!tcph) 947 return NULL; 948 949 return skb_header_pointer(skb, offset, 950 min(__tcp_hdrlen(tcph), bufsize), buf); 951 } 952 953 static const void *cake_get_tcpopt(const struct tcphdr *tcph, 954 int code, int *oplen) 955 { 956 /* inspired by tcp_parse_options in tcp_input.c */ 957 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 958 const u8 *ptr = (const u8 *)(tcph + 1); 959 960 while (length > 0) { 961 int opcode = *ptr++; 962 int opsize; 963 964 if (opcode == TCPOPT_EOL) 965 break; 966 if (opcode == TCPOPT_NOP) { 967 length--; 968 continue; 969 } 970 opsize = *ptr++; 971 if (opsize < 2 || opsize > length) 972 break; 973 974 if (opcode == code) { 975 *oplen = opsize; 976 return ptr; 977 } 978 979 ptr += opsize - 2; 980 length -= opsize; 981 } 982 983 return NULL; 984 } 985 986 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more 987 * bytes than the other. In the case where both sequences ACKs bytes that the 988 * other doesn't, A is considered greater. DSACKs in A also makes A be 989 * considered greater. 
990 * 991 * @return -1, 0 or 1 as normal compare functions 992 */ 993 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, 994 const struct tcphdr *tcph_b) 995 { 996 const struct tcp_sack_block_wire *sack_a, *sack_b; 997 u32 ack_seq_a = ntohl(tcph_a->ack_seq); 998 u32 bytes_a = 0, bytes_b = 0; 999 int oplen_a, oplen_b; 1000 bool first = true; 1001 1002 sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); 1003 sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); 1004 1005 /* pointers point to option contents */ 1006 oplen_a -= TCPOLEN_SACK_BASE; 1007 oplen_b -= TCPOLEN_SACK_BASE; 1008 1009 if (sack_a && oplen_a >= sizeof(*sack_a) && 1010 (!sack_b || oplen_b < sizeof(*sack_b))) 1011 return -1; 1012 else if (sack_b && oplen_b >= sizeof(*sack_b) && 1013 (!sack_a || oplen_a < sizeof(*sack_a))) 1014 return 1; 1015 else if ((!sack_a || oplen_a < sizeof(*sack_a)) && 1016 (!sack_b || oplen_b < sizeof(*sack_b))) 1017 return 0; 1018 1019 while (oplen_a >= sizeof(*sack_a)) { 1020 const struct tcp_sack_block_wire *sack_tmp = sack_b; 1021 u32 start_a = get_unaligned_be32(&sack_a->start_seq); 1022 u32 end_a = get_unaligned_be32(&sack_a->end_seq); 1023 int oplen_tmp = oplen_b; 1024 bool found = false; 1025 1026 /* DSACK; always considered greater to prevent dropping */ 1027 if (before(start_a, ack_seq_a)) 1028 return -1; 1029 1030 bytes_a += end_a - start_a; 1031 1032 while (oplen_tmp >= sizeof(*sack_tmp)) { 1033 u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); 1034 u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); 1035 1036 /* first time through we count the total size */ 1037 if (first) 1038 bytes_b += end_b - start_b; 1039 1040 if (!after(start_b, start_a) && !before(end_b, end_a)) { 1041 found = true; 1042 if (!first) 1043 break; 1044 } 1045 oplen_tmp -= sizeof(*sack_tmp); 1046 sack_tmp++; 1047 } 1048 1049 if (!found) 1050 return -1; 1051 1052 oplen_a -= sizeof(*sack_a); 1053 sack_a++; 1054 first = false; 1055 } 1056 1057 /* If we made it this far, all ranges SACKed by A are covered by B, so 1058 * either the SACKs are equal, or B SACKs more bytes. 1059 */ 1060 return bytes_b > bytes_a ? 
1 : 0; 1061 } 1062 1063 static void cake_tcph_get_tstamp(const struct tcphdr *tcph, 1064 u32 *tsval, u32 *tsecr) 1065 { 1066 const u8 *ptr; 1067 int opsize; 1068 1069 ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); 1070 1071 if (ptr && opsize == TCPOLEN_TIMESTAMP) { 1072 *tsval = get_unaligned_be32(ptr); 1073 *tsecr = get_unaligned_be32(ptr + 4); 1074 } 1075 } 1076 1077 static bool cake_tcph_may_drop(const struct tcphdr *tcph, 1078 u32 tstamp_new, u32 tsecr_new) 1079 { 1080 /* inspired by tcp_parse_options in tcp_input.c */ 1081 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1082 const u8 *ptr = (const u8 *)(tcph + 1); 1083 u32 tstamp, tsecr; 1084 1085 /* 3 reserved flags must be unset to avoid future breakage 1086 * ACK must be set 1087 * ECE/CWR are handled separately 1088 * All other flags URG/PSH/RST/SYN/FIN must be unset 1089 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) 1090 * 0x00C00000 = CWR/ECE (handled separately) 1091 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 1092 */ 1093 if (((tcp_flag_word(tcph) & 1094 cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) 1095 return false; 1096 1097 while (length > 0) { 1098 int opcode = *ptr++; 1099 int opsize; 1100 1101 if (opcode == TCPOPT_EOL) 1102 break; 1103 if (opcode == TCPOPT_NOP) { 1104 length--; 1105 continue; 1106 } 1107 opsize = *ptr++; 1108 if (opsize < 2 || opsize > length) 1109 break; 1110 1111 switch (opcode) { 1112 case TCPOPT_MD5SIG: /* doesn't influence state */ 1113 break; 1114 1115 case TCPOPT_SACK: /* stricter checking performed later */ 1116 if (opsize % 8 != 2) 1117 return false; 1118 break; 1119 1120 case TCPOPT_TIMESTAMP: 1121 /* only drop timestamps lower than new */ 1122 if (opsize != TCPOLEN_TIMESTAMP) 1123 return false; 1124 tstamp = get_unaligned_be32(ptr); 1125 tsecr = get_unaligned_be32(ptr + 4); 1126 if (after(tstamp, tstamp_new) || 1127 after(tsecr, tsecr_new)) 1128 return false; 1129 break; 1130 1131 case TCPOPT_MSS: /* these should only be set on SYN */ 1132 case TCPOPT_WINDOW: 1133 case TCPOPT_SACK_PERM: 1134 case TCPOPT_FASTOPEN: 1135 case TCPOPT_EXP: 1136 default: /* don't drop if any unknown options are present */ 1137 return false; 1138 } 1139 1140 ptr += opsize - 2; 1141 length -= opsize; 1142 } 1143 1144 return true; 1145 } 1146 1147 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, 1148 struct cake_flow *flow) 1149 { 1150 bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; 1151 struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; 1152 struct sk_buff *skb_check, *skb_prev = NULL; 1153 const struct ipv6hdr *ipv6h, *ipv6h_check; 1154 unsigned char _tcph[64], _tcph_check[64]; 1155 const struct tcphdr *tcph, *tcph_check; 1156 const struct iphdr *iph, *iph_check; 1157 struct ipv6hdr _iph, _iph_check; 1158 const struct sk_buff *skb; 1159 int seglen, num_found = 0; 1160 u32 tstamp = 0, tsecr = 0; 1161 __be32 elig_flags = 0; 1162 int sack_comp; 1163 1164 /* no other possible ACKs to filter */ 1165 if (flow->head == flow->tail) 1166 return NULL; 1167 1168 skb = flow->tail; 1169 tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); 1170 iph = cake_get_iphdr(skb, &_iph); 1171 if (!tcph) 1172 return NULL; 1173 1174 cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); 1175 1176 /* the 'triggering' packet need only have the ACK flag set. 1177 * also check that SYN is not set, as there won't be any previous ACKs. 
1178 */ 1179 if ((tcp_flag_word(tcph) & 1180 (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) 1181 return NULL; 1182 1183 /* the 'triggering' ACK is at the tail of the queue, we have already 1184 * returned if it is the only packet in the flow. loop through the rest 1185 * of the queue looking for pure ACKs with the same 5-tuple as the 1186 * triggering one. 1187 */ 1188 for (skb_check = flow->head; 1189 skb_check && skb_check != skb; 1190 skb_prev = skb_check, skb_check = skb_check->next) { 1191 iph_check = cake_get_iphdr(skb_check, &_iph_check); 1192 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, 1193 sizeof(_tcph_check)); 1194 1195 /* only TCP packets with matching 5-tuple are eligible, and only 1196 * drop safe headers 1197 */ 1198 if (!tcph_check || iph->version != iph_check->version || 1199 tcph_check->source != tcph->source || 1200 tcph_check->dest != tcph->dest) 1201 continue; 1202 1203 if (iph_check->version == 4) { 1204 if (iph_check->saddr != iph->saddr || 1205 iph_check->daddr != iph->daddr) 1206 continue; 1207 1208 seglen = ntohs(iph_check->tot_len) - 1209 (4 * iph_check->ihl); 1210 } else if (iph_check->version == 6) { 1211 ipv6h = (struct ipv6hdr *)iph; 1212 ipv6h_check = (struct ipv6hdr *)iph_check; 1213 1214 if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || 1215 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) 1216 continue; 1217 1218 seglen = ntohs(ipv6h_check->payload_len); 1219 } else { 1220 WARN_ON(1); /* shouldn't happen */ 1221 continue; 1222 } 1223 1224 /* If the ECE/CWR flags changed from the previous eligible 1225 * packet in the same flow, we should no longer be dropping that 1226 * previous packet as this would lose information. 1227 */ 1228 if (elig_ack && (tcp_flag_word(tcph_check) & 1229 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { 1230 elig_ack = NULL; 1231 elig_ack_prev = NULL; 1232 num_found--; 1233 } 1234 1235 /* Check TCP options and flags, don't drop ACKs with segment 1236 * data, and don't drop ACKs with a higher cumulative ACK 1237 * counter than the triggering packet. Check ACK seqno here to 1238 * avoid parsing SACK options of packets we are going to exclude 1239 * anyway. 1240 */ 1241 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || 1242 (seglen - __tcp_hdrlen(tcph_check)) != 0 || 1243 after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) 1244 continue; 1245 1246 /* Check SACK options. The triggering packet must SACK more data 1247 * than the ACK under consideration, or SACK the same range but 1248 * have a larger cumulative ACK counter. The latter is a 1249 * pathological case, but is contained in the following check 1250 * anyway, just to be safe. 1251 */ 1252 sack_comp = cake_tcph_sack_compare(tcph_check, tcph); 1253 1254 if (sack_comp < 0 || 1255 (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && 1256 sack_comp == 0)) 1257 continue; 1258 1259 /* At this point we have found an eligible pure ACK to drop; if 1260 * we are in aggressive mode, we are done. Otherwise, keep 1261 * searching unless this is the second eligible ACK we 1262 * found. 1263 * 1264 * Since we want to drop ACK closest to the head of the queue, 1265 * save the first eligible ACK we find, even if we need to loop 1266 * again. 1267 */ 1268 if (!elig_ack) { 1269 elig_ack = skb_check; 1270 elig_ack_prev = skb_prev; 1271 elig_flags = (tcp_flag_word(tcph_check) 1272 & (TCP_FLAG_ECE | TCP_FLAG_CWR)); 1273 } 1274 1275 if (num_found++ > 0) 1276 goto found; 1277 } 1278 1279 /* We made it through the queue without finding two eligible ACKs . 
If 1280 * we found a single eligible ACK we can drop it in aggressive mode if 1281 * we can guarantee that this does not interfere with ECN flag 1282 * information. We ensure this by dropping it only if the enqueued 1283 * packet is consecutive with the eligible ACK, and their flags match. 1284 */ 1285 if (elig_ack && aggressive && elig_ack->next == skb && 1286 (elig_flags == (tcp_flag_word(tcph) & 1287 (TCP_FLAG_ECE | TCP_FLAG_CWR)))) 1288 goto found; 1289 1290 return NULL; 1291 1292 found: 1293 if (elig_ack_prev) 1294 elig_ack_prev->next = elig_ack->next; 1295 else 1296 flow->head = elig_ack->next; 1297 1298 skb_mark_not_on_list(elig_ack); 1299 1300 return elig_ack; 1301 } 1302 1303 static u64 cake_ewma(u64 avg, u64 sample, u32 shift) 1304 { 1305 avg -= avg >> shift; 1306 avg += sample >> shift; 1307 return avg; 1308 } 1309 1310 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) 1311 { 1312 if (q->rate_flags & CAKE_FLAG_OVERHEAD) 1313 len -= off; 1314 1315 if (q->max_netlen < len) 1316 q->max_netlen = len; 1317 if (q->min_netlen > len) 1318 q->min_netlen = len; 1319 1320 len += q->rate_overhead; 1321 1322 if (len < q->rate_mpu) 1323 len = q->rate_mpu; 1324 1325 if (q->atm_mode == CAKE_ATM_ATM) { 1326 len += 47; 1327 len /= 48; 1328 len *= 53; 1329 } else if (q->atm_mode == CAKE_ATM_PTM) { 1330 /* Add one byte per 64 bytes or part thereof. 1331 * This is conservative and easier to calculate than the 1332 * precise value. 1333 */ 1334 len += (len + 63) / 64; 1335 } 1336 1337 if (q->max_adjlen < len) 1338 q->max_adjlen = len; 1339 if (q->min_adjlen > len) 1340 q->min_adjlen = len; 1341 1342 return len; 1343 } 1344 1345 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) 1346 { 1347 const struct skb_shared_info *shinfo = skb_shinfo(skb); 1348 unsigned int hdr_len, last_len = 0; 1349 u32 off = skb_network_offset(skb); 1350 u32 len = qdisc_pkt_len(skb); 1351 u16 segs = 1; 1352 1353 q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); 1354 1355 if (!shinfo->gso_size) 1356 return cake_calc_overhead(q, len, off); 1357 1358 /* borrowed from qdisc_pkt_len_init() */ 1359 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 1360 1361 /* + transport layer */ 1362 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | 1363 SKB_GSO_TCPV6))) { 1364 const struct tcphdr *th; 1365 struct tcphdr _tcphdr; 1366 1367 th = skb_header_pointer(skb, skb_transport_offset(skb), 1368 sizeof(_tcphdr), &_tcphdr); 1369 if (likely(th)) 1370 hdr_len += __tcp_hdrlen(th); 1371 } else { 1372 struct udphdr _udphdr; 1373 1374 if (skb_header_pointer(skb, skb_transport_offset(skb), 1375 sizeof(_udphdr), &_udphdr)) 1376 hdr_len += sizeof(struct udphdr); 1377 } 1378 1379 if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) 1380 segs = DIV_ROUND_UP(skb->len - hdr_len, 1381 shinfo->gso_size); 1382 else 1383 segs = shinfo->gso_segs; 1384 1385 len = shinfo->gso_size + hdr_len; 1386 last_len = skb->len - shinfo->gso_size * (segs - 1); 1387 1388 return (cake_calc_overhead(q, len, off) * (segs - 1) + 1389 cake_calc_overhead(q, last_len, off)); 1390 } 1391 1392 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) 1393 { 1394 struct cake_heap_entry ii = q->overflow_heap[i]; 1395 struct cake_heap_entry jj = q->overflow_heap[j]; 1396 1397 q->overflow_heap[i] = jj; 1398 q->overflow_heap[j] = ii; 1399 1400 q->tins[ii.t].overflow_idx[ii.b] = j; 1401 q->tins[jj.t].overflow_idx[jj.b] = i; 1402 } 1403 1404 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) 1405 { 
1406 struct cake_heap_entry ii = q->overflow_heap[i]; 1407 1408 return q->tins[ii.t].backlogs[ii.b]; 1409 } 1410 1411 static void cake_heapify(struct cake_sched_data *q, u16 i) 1412 { 1413 static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; 1414 u32 mb = cake_heap_get_backlog(q, i); 1415 u32 m = i; 1416 1417 while (m < a) { 1418 u32 l = m + m + 1; 1419 u32 r = l + 1; 1420 1421 if (l < a) { 1422 u32 lb = cake_heap_get_backlog(q, l); 1423 1424 if (lb > mb) { 1425 m = l; 1426 mb = lb; 1427 } 1428 } 1429 1430 if (r < a) { 1431 u32 rb = cake_heap_get_backlog(q, r); 1432 1433 if (rb > mb) { 1434 m = r; 1435 mb = rb; 1436 } 1437 } 1438 1439 if (m != i) { 1440 cake_heap_swap(q, i, m); 1441 i = m; 1442 } else { 1443 break; 1444 } 1445 } 1446 } 1447 1448 static void cake_heapify_up(struct cake_sched_data *q, u16 i) 1449 { 1450 while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { 1451 u16 p = (i - 1) >> 1; 1452 u32 ib = cake_heap_get_backlog(q, i); 1453 u32 pb = cake_heap_get_backlog(q, p); 1454 1455 if (ib > pb) { 1456 cake_heap_swap(q, i, p); 1457 i = p; 1458 } else { 1459 break; 1460 } 1461 } 1462 } 1463 1464 static int cake_advance_shaper(struct cake_sched_data *q, 1465 struct cake_tin_data *b, 1466 struct sk_buff *skb, 1467 ktime_t now, bool drop) 1468 { 1469 u32 len = get_cobalt_cb(skb)->adjusted_len; 1470 1471 /* charge packet bandwidth to this tin 1472 * and to the global shaper. 1473 */ 1474 if (q->rate_ns) { 1475 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; 1476 u64 global_dur = (len * q->rate_ns) >> q->rate_shft; 1477 u64 failsafe_dur = global_dur + (global_dur >> 1); 1478 1479 if (ktime_before(b->time_next_packet, now)) 1480 b->time_next_packet = ktime_add_ns(b->time_next_packet, 1481 tin_dur); 1482 1483 else if (ktime_before(b->time_next_packet, 1484 ktime_add_ns(now, tin_dur))) 1485 b->time_next_packet = ktime_add_ns(now, tin_dur); 1486 1487 q->time_next_packet = ktime_add_ns(q->time_next_packet, 1488 global_dur); 1489 if (!drop) 1490 q->failsafe_next_packet = \ 1491 ktime_add_ns(q->failsafe_next_packet, 1492 failsafe_dur); 1493 } 1494 return len; 1495 } 1496 1497 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) 1498 { 1499 struct cake_sched_data *q = qdisc_priv(sch); 1500 ktime_t now = ktime_get(); 1501 u32 idx = 0, tin = 0, len; 1502 struct cake_heap_entry qq; 1503 struct cake_tin_data *b; 1504 struct cake_flow *flow; 1505 struct sk_buff *skb; 1506 1507 if (!q->overflow_timeout) { 1508 int i; 1509 /* Build fresh max-heap */ 1510 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) 1511 cake_heapify(q, i); 1512 } 1513 q->overflow_timeout = 65535; 1514 1515 /* select longest queue for pruning */ 1516 qq = q->overflow_heap[0]; 1517 tin = qq.t; 1518 idx = qq.b; 1519 1520 b = &q->tins[tin]; 1521 flow = &b->flows[idx]; 1522 skb = dequeue_head(flow); 1523 if (unlikely(!skb)) { 1524 /* heap has gone wrong, rebuild it next time */ 1525 q->overflow_timeout = 0; 1526 return idx + (tin << 16); 1527 } 1528 1529 if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) 1530 b->unresponsive_flow_count++; 1531 1532 len = qdisc_pkt_len(skb); 1533 q->buffer_used -= skb->truesize; 1534 b->backlogs[idx] -= len; 1535 b->tin_backlog -= len; 1536 sch->qstats.backlog -= len; 1537 qdisc_tree_reduce_backlog(sch, 1, len); 1538 1539 flow->dropped++; 1540 b->tin_dropped++; 1541 sch->qstats.drops++; 1542 1543 if (q->rate_flags & CAKE_FLAG_INGRESS) 1544 cake_advance_shaper(q, b, skb, now, true); 1545 1546 __qdisc_drop(skb, to_free); 1547 sch->q.qlen--; 1548 1549 cake_heapify(q, 0); 
1550 1551 return idx + (tin << 16); 1552 } 1553 1554 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) 1555 { 1556 int wlen = skb_network_offset(skb); 1557 u8 dscp; 1558 1559 switch (tc_skb_protocol(skb)) { 1560 case htons(ETH_P_IP): 1561 wlen += sizeof(struct iphdr); 1562 if (!pskb_may_pull(skb, wlen) || 1563 skb_try_make_writable(skb, wlen)) 1564 return 0; 1565 1566 dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; 1567 if (wash && dscp) 1568 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); 1569 return dscp; 1570 1571 case htons(ETH_P_IPV6): 1572 wlen += sizeof(struct ipv6hdr); 1573 if (!pskb_may_pull(skb, wlen) || 1574 skb_try_make_writable(skb, wlen)) 1575 return 0; 1576 1577 dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; 1578 if (wash && dscp) 1579 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); 1580 return dscp; 1581 1582 case htons(ETH_P_ARP): 1583 return 0x38; /* CS7 - Net Control */ 1584 1585 default: 1586 /* If there is no Diffserv field, treat as best-effort */ 1587 return 0; 1588 } 1589 } 1590 1591 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, 1592 struct sk_buff *skb) 1593 { 1594 struct cake_sched_data *q = qdisc_priv(sch); 1595 u32 tin, mark; 1596 u8 dscp; 1597 1598 /* Tin selection: Default to diffserv-based selection, allow overriding 1599 * using firewall marks or skb->priority. 1600 */ 1601 dscp = cake_handle_diffserv(skb, 1602 q->rate_flags & CAKE_FLAG_WASH); 1603 mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; 1604 1605 if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) 1606 tin = 0; 1607 1608 else if (mark && mark <= q->tin_cnt) 1609 tin = q->tin_order[mark - 1]; 1610 1611 else if (TC_H_MAJ(skb->priority) == sch->handle && 1612 TC_H_MIN(skb->priority) > 0 && 1613 TC_H_MIN(skb->priority) <= q->tin_cnt) 1614 tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; 1615 1616 else { 1617 tin = q->tin_index[dscp]; 1618 1619 if (unlikely(tin >= q->tin_cnt)) 1620 tin = 0; 1621 } 1622 1623 return &q->tins[tin]; 1624 } 1625 1626 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, 1627 struct sk_buff *skb, int flow_mode, int *qerr) 1628 { 1629 struct cake_sched_data *q = qdisc_priv(sch); 1630 struct tcf_proto *filter; 1631 struct tcf_result res; 1632 u16 flow = 0, host = 0; 1633 int result; 1634 1635 filter = rcu_dereference_bh(q->filter_list); 1636 if (!filter) 1637 goto hash; 1638 1639 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 1640 result = tcf_classify(skb, filter, &res, false); 1641 1642 if (result >= 0) { 1643 #ifdef CONFIG_NET_CLS_ACT 1644 switch (result) { 1645 case TC_ACT_STOLEN: 1646 case TC_ACT_QUEUED: 1647 case TC_ACT_TRAP: 1648 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 1649 /* fall through */ 1650 case TC_ACT_SHOT: 1651 return 0; 1652 } 1653 #endif 1654 if (TC_H_MIN(res.classid) <= CAKE_QUEUES) 1655 flow = TC_H_MIN(res.classid); 1656 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) 1657 host = TC_H_MAJ(res.classid) >> 16; 1658 } 1659 hash: 1660 *t = cake_select_tin(sch, skb); 1661 return cake_hash(*t, skb, flow_mode, flow, host) + 1; 1662 } 1663 1664 static void cake_reconfigure(struct Qdisc *sch); 1665 1666 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1667 struct sk_buff **to_free) 1668 { 1669 struct cake_sched_data *q = qdisc_priv(sch); 1670 int len = qdisc_pkt_len(skb); 1671 int uninitialized_var(ret); 1672 struct sk_buff *ack = NULL; 1673 ktime_t now = ktime_get(); 1674 struct cake_tin_data *b; 1675 struct cake_flow *flow; 1676 u32 idx; 1677 1678 /* choose flow to insert into */ 1679 idx = 
cake_classify(sch, &b, skb, q->flow_mode, &ret); 1680 if (idx == 0) { 1681 if (ret & __NET_XMIT_BYPASS) 1682 qdisc_qstats_drop(sch); 1683 __qdisc_drop(skb, to_free); 1684 return ret; 1685 } 1686 idx--; 1687 flow = &b->flows[idx]; 1688 1689 /* ensure shaper state isn't stale */ 1690 if (!b->tin_backlog) { 1691 if (ktime_before(b->time_next_packet, now)) 1692 b->time_next_packet = now; 1693 1694 if (!sch->q.qlen) { 1695 if (ktime_before(q->time_next_packet, now)) { 1696 q->failsafe_next_packet = now; 1697 q->time_next_packet = now; 1698 } else if (ktime_after(q->time_next_packet, now) && 1699 ktime_after(q->failsafe_next_packet, now)) { 1700 u64 next = \ 1701 min(ktime_to_ns(q->time_next_packet), 1702 ktime_to_ns( 1703 q->failsafe_next_packet)); 1704 sch->qstats.overlimits++; 1705 qdisc_watchdog_schedule_ns(&q->watchdog, next); 1706 } 1707 } 1708 } 1709 1710 if (unlikely(len > b->max_skblen)) 1711 b->max_skblen = len; 1712 1713 if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { 1714 struct sk_buff *segs, *nskb; 1715 netdev_features_t features = netif_skb_features(skb); 1716 unsigned int slen = 0, numsegs = 0; 1717 1718 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 1719 if (IS_ERR_OR_NULL(segs)) 1720 return qdisc_drop(skb, sch, to_free); 1721 1722 skb_list_walk_safe(segs, segs, nskb) { 1723 skb_mark_not_on_list(segs); 1724 qdisc_skb_cb(segs)->pkt_len = segs->len; 1725 cobalt_set_enqueue_time(segs, now); 1726 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, 1727 segs); 1728 flow_queue_add(flow, segs); 1729 1730 sch->q.qlen++; 1731 numsegs++; 1732 slen += segs->len; 1733 q->buffer_used += segs->truesize; 1734 b->packets++; 1735 } 1736 1737 /* stats */ 1738 b->bytes += slen; 1739 b->backlogs[idx] += slen; 1740 b->tin_backlog += slen; 1741 sch->qstats.backlog += slen; 1742 q->avg_window_bytes += slen; 1743 1744 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); 1745 consume_skb(skb); 1746 } else { 1747 /* not splitting */ 1748 cobalt_set_enqueue_time(skb, now); 1749 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); 1750 flow_queue_add(flow, skb); 1751 1752 if (q->ack_filter) 1753 ack = cake_ack_filter(q, flow); 1754 1755 if (ack) { 1756 b->ack_drops++; 1757 sch->qstats.drops++; 1758 b->bytes += qdisc_pkt_len(ack); 1759 len -= qdisc_pkt_len(ack); 1760 q->buffer_used += skb->truesize - ack->truesize; 1761 if (q->rate_flags & CAKE_FLAG_INGRESS) 1762 cake_advance_shaper(q, b, ack, now, true); 1763 1764 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); 1765 consume_skb(ack); 1766 } else { 1767 sch->q.qlen++; 1768 q->buffer_used += skb->truesize; 1769 } 1770 1771 /* stats */ 1772 b->packets++; 1773 b->bytes += len; 1774 b->backlogs[idx] += len; 1775 b->tin_backlog += len; 1776 sch->qstats.backlog += len; 1777 q->avg_window_bytes += len; 1778 } 1779 1780 if (q->overflow_timeout) 1781 cake_heapify_up(q, b->overflow_idx[idx]); 1782 1783 /* incoming bandwidth capacity estimate */ 1784 if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { 1785 u64 packet_interval = \ 1786 ktime_to_ns(ktime_sub(now, q->last_packet_time)); 1787 1788 if (packet_interval > NSEC_PER_SEC) 1789 packet_interval = NSEC_PER_SEC; 1790 1791 /* filter out short-term bursts, eg. wifi aggregation */ 1792 q->avg_packet_interval = \ 1793 cake_ewma(q->avg_packet_interval, 1794 packet_interval, 1795 (packet_interval > q->avg_packet_interval ? 
1796 2 : 8)); 1797 1798 q->last_packet_time = now; 1799 1800 if (packet_interval > q->avg_packet_interval) { 1801 u64 window_interval = \ 1802 ktime_to_ns(ktime_sub(now, 1803 q->avg_window_begin)); 1804 u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; 1805 1806 b = div64_u64(b, window_interval); 1807 q->avg_peak_bandwidth = 1808 cake_ewma(q->avg_peak_bandwidth, b, 1809 b > q->avg_peak_bandwidth ? 2 : 8); 1810 q->avg_window_bytes = 0; 1811 q->avg_window_begin = now; 1812 1813 if (ktime_after(now, 1814 ktime_add_ms(q->last_reconfig_time, 1815 250))) { 1816 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; 1817 cake_reconfigure(sch); 1818 } 1819 } 1820 } else { 1821 q->avg_window_bytes = 0; 1822 q->last_packet_time = now; 1823 } 1824 1825 /* flowchain */ 1826 if (!flow->set || flow->set == CAKE_SET_DECAYING) { 1827 struct cake_host *srchost = &b->hosts[flow->srchost]; 1828 struct cake_host *dsthost = &b->hosts[flow->dsthost]; 1829 u16 host_load = 1; 1830 1831 if (!flow->set) { 1832 list_add_tail(&flow->flowchain, &b->new_flows); 1833 } else { 1834 b->decaying_flow_count--; 1835 list_move_tail(&flow->flowchain, &b->new_flows); 1836 } 1837 flow->set = CAKE_SET_SPARSE; 1838 b->sparse_flow_count++; 1839 1840 if (cake_dsrc(q->flow_mode)) 1841 host_load = max(host_load, srchost->srchost_bulk_flow_count); 1842 1843 if (cake_ddst(q->flow_mode)) 1844 host_load = max(host_load, dsthost->dsthost_bulk_flow_count); 1845 1846 flow->deficit = (b->flow_quantum * 1847 quantum_div[host_load]) >> 16; 1848 } else if (flow->set == CAKE_SET_SPARSE_WAIT) { 1849 struct cake_host *srchost = &b->hosts[flow->srchost]; 1850 struct cake_host *dsthost = &b->hosts[flow->dsthost]; 1851 1852 /* this flow was empty, accounted as a sparse flow, but actually 1853 * in the bulk rotation. 1854 */ 1855 flow->set = CAKE_SET_BULK; 1856 b->sparse_flow_count--; 1857 b->bulk_flow_count++; 1858 1859 if (cake_dsrc(q->flow_mode)) 1860 srchost->srchost_bulk_flow_count++; 1861 1862 if (cake_ddst(q->flow_mode)) 1863 dsthost->dsthost_bulk_flow_count++; 1864 1865 } 1866 1867 if (q->buffer_used > q->buffer_max_used) 1868 q->buffer_max_used = q->buffer_used; 1869 1870 if (q->buffer_used > q->buffer_limit) { 1871 u32 dropped = 0; 1872 1873 while (q->buffer_used > q->buffer_limit) { 1874 dropped++; 1875 cake_drop(sch, to_free); 1876 } 1877 b->drop_overlimit += dropped; 1878 } 1879 return NET_XMIT_SUCCESS; 1880 } 1881 1882 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) 1883 { 1884 struct cake_sched_data *q = qdisc_priv(sch); 1885 struct cake_tin_data *b = &q->tins[q->cur_tin]; 1886 struct cake_flow *flow = &b->flows[q->cur_flow]; 1887 struct sk_buff *skb = NULL; 1888 u32 len; 1889 1890 if (flow->head) { 1891 skb = dequeue_head(flow); 1892 len = qdisc_pkt_len(skb); 1893 b->backlogs[q->cur_flow] -= len; 1894 b->tin_backlog -= len; 1895 sch->qstats.backlog -= len; 1896 q->buffer_used -= skb->truesize; 1897 sch->q.qlen--; 1898 1899 if (q->overflow_timeout) 1900 cake_heapify(q, b->overflow_idx[q->cur_flow]); 1901 } 1902 return skb; 1903 } 1904 1905 /* Discard leftover packets from a tin no longer in use. 
*/ 1906 static void cake_clear_tin(struct Qdisc *sch, u16 tin) 1907 { 1908 struct cake_sched_data *q = qdisc_priv(sch); 1909 struct sk_buff *skb; 1910 1911 q->cur_tin = tin; 1912 for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) 1913 while (!!(skb = cake_dequeue_one(sch))) 1914 kfree_skb(skb); 1915 } 1916 1917 static struct sk_buff *cake_dequeue(struct Qdisc *sch) 1918 { 1919 struct cake_sched_data *q = qdisc_priv(sch); 1920 struct cake_tin_data *b = &q->tins[q->cur_tin]; 1921 struct cake_host *srchost, *dsthost; 1922 ktime_t now = ktime_get(); 1923 struct cake_flow *flow; 1924 struct list_head *head; 1925 bool first_flow = true; 1926 struct sk_buff *skb; 1927 u16 host_load; 1928 u64 delay; 1929 u32 len; 1930 1931 begin: 1932 if (!sch->q.qlen) 1933 return NULL; 1934 1935 /* global hard shaper */ 1936 if (ktime_after(q->time_next_packet, now) && 1937 ktime_after(q->failsafe_next_packet, now)) { 1938 u64 next = min(ktime_to_ns(q->time_next_packet), 1939 ktime_to_ns(q->failsafe_next_packet)); 1940 1941 sch->qstats.overlimits++; 1942 qdisc_watchdog_schedule_ns(&q->watchdog, next); 1943 return NULL; 1944 } 1945 1946 /* Choose a class to work on. */ 1947 if (!q->rate_ns) { 1948 /* In unlimited mode, can't rely on shaper timings, just balance 1949 * with DRR 1950 */ 1951 bool wrapped = false, empty = true; 1952 1953 while (b->tin_deficit < 0 || 1954 !(b->sparse_flow_count + b->bulk_flow_count)) { 1955 if (b->tin_deficit <= 0) 1956 b->tin_deficit += b->tin_quantum; 1957 if (b->sparse_flow_count + b->bulk_flow_count) 1958 empty = false; 1959 1960 q->cur_tin++; 1961 b++; 1962 if (q->cur_tin >= q->tin_cnt) { 1963 q->cur_tin = 0; 1964 b = q->tins; 1965 1966 if (wrapped) { 1967 /* It's possible for q->qlen to be 1968 * nonzero when we actually have no 1969 * packets anywhere. 1970 */ 1971 if (empty) 1972 return NULL; 1973 } else { 1974 wrapped = true; 1975 } 1976 } 1977 } 1978 } else { 1979 /* In shaped mode, choose: 1980 * - Highest-priority tin with queue and meeting schedule, or 1981 * - The earliest-scheduled tin with queue. 1982 */ 1983 ktime_t best_time = KTIME_MAX; 1984 int tin, best_tin = 0; 1985 1986 for (tin = 0; tin < q->tin_cnt; tin++) { 1987 b = q->tins + tin; 1988 if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { 1989 ktime_t time_to_pkt = \ 1990 ktime_sub(b->time_next_packet, now); 1991 1992 if (ktime_to_ns(time_to_pkt) <= 0 || 1993 ktime_compare(time_to_pkt, 1994 best_time) <= 0) { 1995 best_time = time_to_pkt; 1996 best_tin = tin; 1997 } 1998 } 1999 } 2000 2001 q->cur_tin = best_tin; 2002 b = q->tins + best_tin; 2003 2004 /* No point in going further if no packets to deliver. */ 2005 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) 2006 return NULL; 2007 } 2008 2009 retry: 2010 /* service this class */ 2011 head = &b->decaying_flows; 2012 if (!first_flow || list_empty(head)) { 2013 head = &b->new_flows; 2014 if (list_empty(head)) { 2015 head = &b->old_flows; 2016 if (unlikely(list_empty(head))) { 2017 head = &b->decaying_flows; 2018 if (unlikely(list_empty(head))) 2019 goto begin; 2020 } 2021 } 2022 } 2023 flow = list_first_entry(head, struct cake_flow, flowchain); 2024 q->cur_flow = flow - b->flows; 2025 first_flow = false; 2026 2027 /* triple isolation (modified DRR++) */ 2028 srchost = &b->hosts[flow->srchost]; 2029 dsthost = &b->hosts[flow->dsthost]; 2030 host_load = 1; 2031 2032 /* flow isolation (DRR++) */ 2033 if (flow->deficit <= 0) { 2034 /* Keep all flows with deficits out of the sparse and decaying 2035 * rotations. 
No non-empty flow can go into the decaying 2036 * rotation, so they can't get deficits 2037 */ 2038 if (flow->set == CAKE_SET_SPARSE) { 2039 if (flow->head) { 2040 b->sparse_flow_count--; 2041 b->bulk_flow_count++; 2042 2043 if (cake_dsrc(q->flow_mode)) 2044 srchost->srchost_bulk_flow_count++; 2045 2046 if (cake_ddst(q->flow_mode)) 2047 dsthost->dsthost_bulk_flow_count++; 2048 2049 flow->set = CAKE_SET_BULK; 2050 } else { 2051 /* we've moved it to the bulk rotation for 2052 * correct deficit accounting but we still want 2053 * to count it as a sparse flow, not a bulk one. 2054 */ 2055 flow->set = CAKE_SET_SPARSE_WAIT; 2056 } 2057 } 2058 2059 if (cake_dsrc(q->flow_mode)) 2060 host_load = max(host_load, srchost->srchost_bulk_flow_count); 2061 2062 if (cake_ddst(q->flow_mode)) 2063 host_load = max(host_load, dsthost->dsthost_bulk_flow_count); 2064 2065 WARN_ON(host_load > CAKE_QUEUES); 2066 2067 /* The shifted prandom_u32() is a way to apply dithering to 2068 * avoid accumulating roundoff errors 2069 */ 2070 flow->deficit += (b->flow_quantum * quantum_div[host_load] + 2071 (prandom_u32() >> 16)) >> 16; 2072 list_move_tail(&flow->flowchain, &b->old_flows); 2073 2074 goto retry; 2075 } 2076 2077 /* Retrieve a packet via the AQM */ 2078 while (1) { 2079 skb = cake_dequeue_one(sch); 2080 if (!skb) { 2081 /* this queue was actually empty */ 2082 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) 2083 b->unresponsive_flow_count--; 2084 2085 if (flow->cvars.p_drop || flow->cvars.count || 2086 ktime_before(now, flow->cvars.drop_next)) { 2087 /* keep in the flowchain until the state has 2088 * decayed to rest 2089 */ 2090 list_move_tail(&flow->flowchain, 2091 &b->decaying_flows); 2092 if (flow->set == CAKE_SET_BULK) { 2093 b->bulk_flow_count--; 2094 2095 if (cake_dsrc(q->flow_mode)) 2096 srchost->srchost_bulk_flow_count--; 2097 2098 if (cake_ddst(q->flow_mode)) 2099 dsthost->dsthost_bulk_flow_count--; 2100 2101 b->decaying_flow_count++; 2102 } else if (flow->set == CAKE_SET_SPARSE || 2103 flow->set == CAKE_SET_SPARSE_WAIT) { 2104 b->sparse_flow_count--; 2105 b->decaying_flow_count++; 2106 } 2107 flow->set = CAKE_SET_DECAYING; 2108 } else { 2109 /* remove empty queue from the flowchain */ 2110 list_del_init(&flow->flowchain); 2111 if (flow->set == CAKE_SET_SPARSE || 2112 flow->set == CAKE_SET_SPARSE_WAIT) 2113 b->sparse_flow_count--; 2114 else if (flow->set == CAKE_SET_BULK) { 2115 b->bulk_flow_count--; 2116 2117 if (cake_dsrc(q->flow_mode)) 2118 srchost->srchost_bulk_flow_count--; 2119 2120 if (cake_ddst(q->flow_mode)) 2121 dsthost->dsthost_bulk_flow_count--; 2122 2123 } else 2124 b->decaying_flow_count--; 2125 2126 flow->set = CAKE_SET_NONE; 2127 } 2128 goto begin; 2129 } 2130 2131 /* Last packet in queue may be marked, shouldn't be dropped */ 2132 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, 2133 (b->bulk_flow_count * 2134 !!(q->rate_flags & 2135 CAKE_FLAG_INGRESS))) || 2136 !flow->head) 2137 break; 2138 2139 /* drop this packet, get another one */ 2140 if (q->rate_flags & CAKE_FLAG_INGRESS) { 2141 len = cake_advance_shaper(q, b, skb, 2142 now, true); 2143 flow->deficit -= len; 2144 b->tin_deficit -= len; 2145 } 2146 flow->dropped++; 2147 b->tin_dropped++; 2148 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); 2149 qdisc_qstats_drop(sch); 2150 kfree_skb(skb); 2151 if (q->rate_flags & CAKE_FLAG_INGRESS) 2152 goto retry; 2153 } 2154 2155 b->tin_ecn_mark += !!flow->cvars.ecn_marked; 2156 qdisc_bstats_update(sch, skb); 2157 2158 /* collect delay stats */ 2159 delay = 
ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 2160 b->avge_delay = cake_ewma(b->avge_delay, delay, 8); 2161 b->peak_delay = cake_ewma(b->peak_delay, delay, 2162 delay > b->peak_delay ? 2 : 8); 2163 b->base_delay = cake_ewma(b->base_delay, delay, 2164 delay < b->base_delay ? 2 : 8); 2165 2166 len = cake_advance_shaper(q, b, skb, now, false); 2167 flow->deficit -= len; 2168 b->tin_deficit -= len; 2169 2170 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { 2171 u64 next = min(ktime_to_ns(q->time_next_packet), 2172 ktime_to_ns(q->failsafe_next_packet)); 2173 2174 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2175 } else if (!sch->q.qlen) { 2176 int i; 2177 2178 for (i = 0; i < q->tin_cnt; i++) { 2179 if (q->tins[i].decaying_flow_count) { 2180 ktime_t next = \ 2181 ktime_add_ns(now, 2182 q->tins[i].cparams.target); 2183 2184 qdisc_watchdog_schedule_ns(&q->watchdog, 2185 ktime_to_ns(next)); 2186 break; 2187 } 2188 } 2189 } 2190 2191 if (q->overflow_timeout) 2192 q->overflow_timeout--; 2193 2194 return skb; 2195 } 2196 2197 static void cake_reset(struct Qdisc *sch) 2198 { 2199 u32 c; 2200 2201 for (c = 0; c < CAKE_MAX_TINS; c++) 2202 cake_clear_tin(sch, c); 2203 } 2204 2205 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { 2206 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2207 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2208 [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2209 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2210 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2211 [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2212 [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2213 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2214 [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2215 [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2216 [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2217 [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2218 [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2219 [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2220 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2221 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, 2222 [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, 2223 }; 2224 2225 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2226 u64 target_ns, u64 rtt_est_ns) 2227 { 2228 /* convert byte-rate into time-per-byte 2229 * so it will always unwedge in reasonable time. 2230 */ 2231 static const u64 MIN_RATE = 64; 2232 u32 byte_target = mtu; 2233 u64 byte_target_ns; 2234 u8 rate_shft = 0; 2235 u64 rate_ns = 0; 2236 2237 b->flow_quantum = 1514; 2238 if (rate) { 2239 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); 2240 rate_shft = 34; 2241 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2242 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2243 while (!!(rate_ns >> 34)) { 2244 rate_ns >>= 1; 2245 rate_shft--; 2246 } 2247 } /* else unlimited, ie. 
zero delay */ 2248 2249 b->tin_rate_bps = rate; 2250 b->tin_rate_ns = rate_ns; 2251 b->tin_rate_shft = rate_shft; 2252 2253 byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2254 2255 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); 2256 b->cparams.interval = max(rtt_est_ns + 2257 b->cparams.target - target_ns, 2258 b->cparams.target * 2); 2259 b->cparams.mtu_time = byte_target_ns; 2260 b->cparams.p_inc = 1 << 24; /* 1/256 */ 2261 b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2262 } 2263 2264 static int cake_config_besteffort(struct Qdisc *sch) 2265 { 2266 struct cake_sched_data *q = qdisc_priv(sch); 2267 struct cake_tin_data *b = &q->tins[0]; 2268 u32 mtu = psched_mtu(qdisc_dev(sch)); 2269 u64 rate = q->rate_bps; 2270 2271 q->tin_cnt = 1; 2272 2273 q->tin_index = besteffort; 2274 q->tin_order = normal_order; 2275 2276 cake_set_rate(b, rate, mtu, 2277 us_to_ns(q->target), us_to_ns(q->interval)); 2278 b->tin_quantum = 65535; 2279 2280 return 0; 2281 } 2282 2283 static int cake_config_precedence(struct Qdisc *sch) 2284 { 2285 /* convert high-level (user visible) parameters into internal format */ 2286 struct cake_sched_data *q = qdisc_priv(sch); 2287 u32 mtu = psched_mtu(qdisc_dev(sch)); 2288 u64 rate = q->rate_bps; 2289 u32 quantum = 256; 2290 u32 i; 2291 2292 q->tin_cnt = 8; 2293 q->tin_index = precedence; 2294 q->tin_order = normal_order; 2295 2296 for (i = 0; i < q->tin_cnt; i++) { 2297 struct cake_tin_data *b = &q->tins[i]; 2298 2299 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2300 us_to_ns(q->interval)); 2301 2302 b->tin_quantum = max_t(u16, 1U, quantum); 2303 2304 /* calculate next class's parameters */ 2305 rate *= 7; 2306 rate >>= 3; 2307 2308 quantum *= 7; 2309 quantum >>= 3; 2310 } 2311 2312 return 0; 2313 } 2314 2315 /* List of known Diffserv codepoints: 2316 * 2317 * Least Effort (CS1) 2318 * Best Effort (CS0) 2319 * Max Reliability & LLT "Lo" (TOS1) 2320 * Max Throughput (TOS2) 2321 * Min Delay (TOS4) 2322 * LLT "La" (TOS5) 2323 * Assured Forwarding 1 (AF1x) - x3 2324 * Assured Forwarding 2 (AF2x) - x3 2325 * Assured Forwarding 3 (AF3x) - x3 2326 * Assured Forwarding 4 (AF4x) - x3 2327 * Precedence Class 2 (CS2) 2328 * Precedence Class 3 (CS3) 2329 * Precedence Class 4 (CS4) 2330 * Precedence Class 5 (CS5) 2331 * Precedence Class 6 (CS6) 2332 * Precedence Class 7 (CS7) 2333 * Voice Admit (VA) 2334 * Expedited Forwarding (EF) 2335 2336 * Total 25 codepoints. 2337 */ 2338 2339 /* List of traffic classes in RFC 4594: 2340 * (roughly descending order of contended priority) 2341 * (roughly ascending order of uncontended throughput) 2342 * 2343 * Network Control (CS6,CS7) - routing traffic 2344 * Telephony (EF,VA) - aka. VoIP streams 2345 * Signalling (CS5) - VoIP setup 2346 * Multimedia Conferencing (AF4x) - aka. video calls 2347 * Realtime Interactive (CS4) - eg. games 2348 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2349 * Broadcast Video (CS3) 2350 * Low Latency Data (AF2x,TOS4) - eg. database 2351 * Ops, Admin, Management (CS2,TOS1) - eg. ssh 2352 * Standard Service (CS0 & unrecognised codepoints) 2353 * High Throughput Data (AF1x,TOS2) - eg. web traffic 2354 * Low Priority Data (CS1) - eg. BitTorrent 2355 2356 * Total 12 traffic classes. 
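 *
 * The cake_config_diffserv8/4/3() functions below collapse these classes
 * into 8, 4 or 3 tins; the per-DSCP tin lookup itself goes through the
 * diffserv8/diffserv4/diffserv3 tables installed in q->tin_index, while
 * q->tin_order picks the order in which tins are walked for stats dumps.
 * Under diffserv4, for instance, EF lands in the Latency Sensitive tin and
 * CS1 in Background Traffic (see the per-function comments).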
2357 */ 2358 2359 static int cake_config_diffserv8(struct Qdisc *sch) 2360 { 2361 /* Pruned list of traffic classes for typical applications: 2362 * 2363 * Network Control (CS6, CS7) 2364 * Minimum Latency (EF, VA, CS5, CS4) 2365 * Interactive Shell (CS2, TOS1) 2366 * Low Latency Transactions (AF2x, TOS4) 2367 * Video Streaming (AF4x, AF3x, CS3) 2368 * Bog Standard (CS0 etc.) 2369 * High Throughput (AF1x, TOS2) 2370 * Background Traffic (CS1) 2371 * 2372 * Total 8 traffic classes. 2373 */ 2374 2375 struct cake_sched_data *q = qdisc_priv(sch); 2376 u32 mtu = psched_mtu(qdisc_dev(sch)); 2377 u64 rate = q->rate_bps; 2378 u32 quantum = 256; 2379 u32 i; 2380 2381 q->tin_cnt = 8; 2382 2383 /* codepoint to class mapping */ 2384 q->tin_index = diffserv8; 2385 q->tin_order = normal_order; 2386 2387 /* class characteristics */ 2388 for (i = 0; i < q->tin_cnt; i++) { 2389 struct cake_tin_data *b = &q->tins[i]; 2390 2391 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2392 us_to_ns(q->interval)); 2393 2394 b->tin_quantum = max_t(u16, 1U, quantum); 2395 2396 /* calculate next class's parameters */ 2397 rate *= 7; 2398 rate >>= 3; 2399 2400 quantum *= 7; 2401 quantum >>= 3; 2402 } 2403 2404 return 0; 2405 } 2406 2407 static int cake_config_diffserv4(struct Qdisc *sch) 2408 { 2409 /* Further pruned list of traffic classes for four-class system: 2410 * 2411 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2412 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) 2413 * Best Effort (CS0, AF1x, TOS2, and those not specified) 2414 * Background Traffic (CS1) 2415 * 2416 * Total 4 traffic classes. 2417 */ 2418 2419 struct cake_sched_data *q = qdisc_priv(sch); 2420 u32 mtu = psched_mtu(qdisc_dev(sch)); 2421 u64 rate = q->rate_bps; 2422 u32 quantum = 1024; 2423 2424 q->tin_cnt = 4; 2425 2426 /* codepoint to class mapping */ 2427 q->tin_index = diffserv4; 2428 q->tin_order = bulk_order; 2429 2430 /* class characteristics */ 2431 cake_set_rate(&q->tins[0], rate, mtu, 2432 us_to_ns(q->target), us_to_ns(q->interval)); 2433 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2434 us_to_ns(q->target), us_to_ns(q->interval)); 2435 cake_set_rate(&q->tins[2], rate >> 1, mtu, 2436 us_to_ns(q->target), us_to_ns(q->interval)); 2437 cake_set_rate(&q->tins[3], rate >> 2, mtu, 2438 us_to_ns(q->target), us_to_ns(q->interval)); 2439 2440 /* bandwidth-sharing weights */ 2441 q->tins[0].tin_quantum = quantum; 2442 q->tins[1].tin_quantum = quantum >> 4; 2443 q->tins[2].tin_quantum = quantum >> 1; 2444 q->tins[3].tin_quantum = quantum >> 2; 2445 2446 return 0; 2447 } 2448 2449 static int cake_config_diffserv3(struct Qdisc *sch) 2450 { 2451 /* Simplified Diffserv structure with 3 tins. 
2452 * Low Priority (CS1) 2453 * Best Effort 2454 * Latency Sensitive (TOS4, VA, EF, CS6, CS7) 2455 */ 2456 struct cake_sched_data *q = qdisc_priv(sch); 2457 u32 mtu = psched_mtu(qdisc_dev(sch)); 2458 u64 rate = q->rate_bps; 2459 u32 quantum = 1024; 2460 2461 q->tin_cnt = 3; 2462 2463 /* codepoint to class mapping */ 2464 q->tin_index = diffserv3; 2465 q->tin_order = bulk_order; 2466 2467 /* class characteristics */ 2468 cake_set_rate(&q->tins[0], rate, mtu, 2469 us_to_ns(q->target), us_to_ns(q->interval)); 2470 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2471 us_to_ns(q->target), us_to_ns(q->interval)); 2472 cake_set_rate(&q->tins[2], rate >> 2, mtu, 2473 us_to_ns(q->target), us_to_ns(q->interval)); 2474 2475 /* bandwidth-sharing weights */ 2476 q->tins[0].tin_quantum = quantum; 2477 q->tins[1].tin_quantum = quantum >> 4; 2478 q->tins[2].tin_quantum = quantum >> 2; 2479 2480 return 0; 2481 } 2482 2483 static void cake_reconfigure(struct Qdisc *sch) 2484 { 2485 struct cake_sched_data *q = qdisc_priv(sch); 2486 int c, ft; 2487 2488 switch (q->tin_mode) { 2489 case CAKE_DIFFSERV_BESTEFFORT: 2490 ft = cake_config_besteffort(sch); 2491 break; 2492 2493 case CAKE_DIFFSERV_PRECEDENCE: 2494 ft = cake_config_precedence(sch); 2495 break; 2496 2497 case CAKE_DIFFSERV_DIFFSERV8: 2498 ft = cake_config_diffserv8(sch); 2499 break; 2500 2501 case CAKE_DIFFSERV_DIFFSERV4: 2502 ft = cake_config_diffserv4(sch); 2503 break; 2504 2505 case CAKE_DIFFSERV_DIFFSERV3: 2506 default: 2507 ft = cake_config_diffserv3(sch); 2508 break; 2509 } 2510 2511 for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { 2512 cake_clear_tin(sch, c); 2513 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; 2514 } 2515 2516 q->rate_ns = q->tins[ft].tin_rate_ns; 2517 q->rate_shft = q->tins[ft].tin_rate_shft; 2518 2519 if (q->buffer_config_limit) { 2520 q->buffer_limit = q->buffer_config_limit; 2521 } else if (q->rate_bps) { 2522 u64 t = q->rate_bps * q->interval; 2523 2524 do_div(t, USEC_PER_SEC / 4); 2525 q->buffer_limit = max_t(u32, t, 4U << 20); 2526 } else { 2527 q->buffer_limit = ~0; 2528 } 2529 2530 sch->flags &= ~TCQ_F_CAN_BYPASS; 2531 2532 q->buffer_limit = min(q->buffer_limit, 2533 max(sch->limit * psched_mtu(qdisc_dev(sch)), 2534 q->buffer_config_limit)); 2535 } 2536 2537 static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2538 struct netlink_ext_ack *extack) 2539 { 2540 struct cake_sched_data *q = qdisc_priv(sch); 2541 struct nlattr *tb[TCA_CAKE_MAX + 1]; 2542 int err; 2543 2544 if (!opt) 2545 return -EINVAL; 2546 2547 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, 2548 extack); 2549 if (err < 0) 2550 return err; 2551 2552 if (tb[TCA_CAKE_NAT]) { 2553 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 2554 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2555 q->flow_mode |= CAKE_FLOW_NAT_FLAG * 2556 !!nla_get_u32(tb[TCA_CAKE_NAT]); 2557 #else 2558 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2559 "No conntrack support in kernel"); 2560 return -EOPNOTSUPP; 2561 #endif 2562 } 2563 2564 if (tb[TCA_CAKE_BASE_RATE64]) 2565 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); 2566 2567 if (tb[TCA_CAKE_DIFFSERV_MODE]) 2568 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); 2569 2570 if (tb[TCA_CAKE_WASH]) { 2571 if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2572 q->rate_flags |= CAKE_FLAG_WASH; 2573 else 2574 q->rate_flags &= ~CAKE_FLAG_WASH; 2575 } 2576 2577 if (tb[TCA_CAKE_FLOW_MODE]) 2578 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | 2579 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2580 CAKE_FLOW_MASK)); 2581 2582 if 
(tb[TCA_CAKE_ATM]) 2583 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); 2584 2585 if (tb[TCA_CAKE_OVERHEAD]) { 2586 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); 2587 q->rate_flags |= CAKE_FLAG_OVERHEAD; 2588 2589 q->max_netlen = 0; 2590 q->max_adjlen = 0; 2591 q->min_netlen = ~0; 2592 q->min_adjlen = ~0; 2593 } 2594 2595 if (tb[TCA_CAKE_RAW]) { 2596 q->rate_flags &= ~CAKE_FLAG_OVERHEAD; 2597 2598 q->max_netlen = 0; 2599 q->max_adjlen = 0; 2600 q->min_netlen = ~0; 2601 q->min_adjlen = ~0; 2602 } 2603 2604 if (tb[TCA_CAKE_MPU]) 2605 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); 2606 2607 if (tb[TCA_CAKE_RTT]) { 2608 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2609 2610 if (!q->interval) 2611 q->interval = 1; 2612 } 2613 2614 if (tb[TCA_CAKE_TARGET]) { 2615 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2616 2617 if (!q->target) 2618 q->target = 1; 2619 } 2620 2621 if (tb[TCA_CAKE_AUTORATE]) { 2622 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) 2623 q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2624 else 2625 q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2626 } 2627 2628 if (tb[TCA_CAKE_INGRESS]) { 2629 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2630 q->rate_flags |= CAKE_FLAG_INGRESS; 2631 else 2632 q->rate_flags &= ~CAKE_FLAG_INGRESS; 2633 } 2634 2635 if (tb[TCA_CAKE_ACK_FILTER]) 2636 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); 2637 2638 if (tb[TCA_CAKE_MEMORY]) 2639 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); 2640 2641 if (tb[TCA_CAKE_SPLIT_GSO]) { 2642 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) 2643 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2644 else 2645 q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2646 } 2647 2648 if (tb[TCA_CAKE_FWMARK]) { 2649 q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]); 2650 q->fwmark_shft = q->fwmark_mask ? 
__ffs(q->fwmark_mask) : 0; 2651 } 2652 2653 if (q->tins) { 2654 sch_tree_lock(sch); 2655 cake_reconfigure(sch); 2656 sch_tree_unlock(sch); 2657 } 2658 2659 return 0; 2660 } 2661 2662 static void cake_destroy(struct Qdisc *sch) 2663 { 2664 struct cake_sched_data *q = qdisc_priv(sch); 2665 2666 qdisc_watchdog_cancel(&q->watchdog); 2667 tcf_block_put(q->block); 2668 kvfree(q->tins); 2669 } 2670 2671 static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2672 struct netlink_ext_ack *extack) 2673 { 2674 struct cake_sched_data *q = qdisc_priv(sch); 2675 int i, j, err; 2676 2677 sch->limit = 10240; 2678 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2679 q->flow_mode = CAKE_FLOW_TRIPLE; 2680 2681 q->rate_bps = 0; /* unlimited by default */ 2682 2683 q->interval = 100000; /* 100ms default */ 2684 q->target = 5000; /* 5ms: codel RFC argues 2685 * for 5 to 10% of interval 2686 */ 2687 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2688 q->cur_tin = 0; 2689 q->cur_flow = 0; 2690 2691 qdisc_watchdog_init(&q->watchdog, sch); 2692 2693 if (opt) { 2694 int err = cake_change(sch, opt, extack); 2695 2696 if (err) 2697 return err; 2698 } 2699 2700 err = tcf_block_get(&q->block, &q->filter_list, sch, extack); 2701 if (err) 2702 return err; 2703 2704 quantum_div[0] = ~0; 2705 for (i = 1; i <= CAKE_QUEUES; i++) 2706 quantum_div[i] = 65535 / i; 2707 2708 q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), 2709 GFP_KERNEL); 2710 if (!q->tins) 2711 goto nomem; 2712 2713 for (i = 0; i < CAKE_MAX_TINS; i++) { 2714 struct cake_tin_data *b = q->tins + i; 2715 2716 INIT_LIST_HEAD(&b->new_flows); 2717 INIT_LIST_HEAD(&b->old_flows); 2718 INIT_LIST_HEAD(&b->decaying_flows); 2719 b->sparse_flow_count = 0; 2720 b->bulk_flow_count = 0; 2721 b->decaying_flow_count = 0; 2722 2723 for (j = 0; j < CAKE_QUEUES; j++) { 2724 struct cake_flow *flow = b->flows + j; 2725 u32 k = j * CAKE_MAX_TINS + i; 2726 2727 INIT_LIST_HEAD(&flow->flowchain); 2728 cobalt_vars_init(&flow->cvars); 2729 2730 q->overflow_heap[k].t = i; 2731 q->overflow_heap[k].b = j; 2732 b->overflow_idx[j] = k; 2733 } 2734 } 2735 2736 cake_reconfigure(sch); 2737 q->avg_peak_bandwidth = q->rate_bps; 2738 q->min_netlen = ~0; 2739 q->min_adjlen = ~0; 2740 return 0; 2741 2742 nomem: 2743 cake_destroy(sch); 2744 return -ENOMEM; 2745 } 2746 2747 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2748 { 2749 struct cake_sched_data *q = qdisc_priv(sch); 2750 struct nlattr *opts; 2751 2752 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 2753 if (!opts) 2754 goto nla_put_failure; 2755 2756 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, 2757 TCA_CAKE_PAD)) 2758 goto nla_put_failure; 2759 2760 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, 2761 q->flow_mode & CAKE_FLOW_MASK)) 2762 goto nla_put_failure; 2763 2764 if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) 2765 goto nla_put_failure; 2766 2767 if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) 2768 goto nla_put_failure; 2769 2770 if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) 2771 goto nla_put_failure; 2772 2773 if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2774 !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2775 goto nla_put_failure; 2776 2777 if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2778 !!(q->rate_flags & CAKE_FLAG_INGRESS))) 2779 goto nla_put_failure; 2780 2781 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) 2782 goto nla_put_failure; 2783 2784 if (nla_put_u32(skb, TCA_CAKE_NAT, 2785 !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) 2786 goto nla_put_failure; 2787 2788 if (nla_put_u32(skb, 
TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) 2789 goto nla_put_failure; 2790 2791 if (nla_put_u32(skb, TCA_CAKE_WASH, 2792 !!(q->rate_flags & CAKE_FLAG_WASH))) 2793 goto nla_put_failure; 2794 2795 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) 2796 goto nla_put_failure; 2797 2798 if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) 2799 if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2800 goto nla_put_failure; 2801 2802 if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) 2803 goto nla_put_failure; 2804 2805 if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) 2806 goto nla_put_failure; 2807 2808 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 2809 !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) 2810 goto nla_put_failure; 2811 2812 if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask)) 2813 goto nla_put_failure; 2814 2815 return nla_nest_end(skb, opts); 2816 2817 nla_put_failure: 2818 return -1; 2819 } 2820 2821 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 2822 { 2823 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 2824 struct cake_sched_data *q = qdisc_priv(sch); 2825 struct nlattr *tstats, *ts; 2826 int i; 2827 2828 if (!stats) 2829 return -1; 2830 2831 #define PUT_STAT_U32(attr, data) do { \ 2832 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2833 goto nla_put_failure; \ 2834 } while (0) 2835 #define PUT_STAT_U64(attr, data) do { \ 2836 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 2837 data, TCA_CAKE_STATS_PAD)) \ 2838 goto nla_put_failure; \ 2839 } while (0) 2840 2841 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); 2842 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); 2843 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); 2844 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); 2845 PUT_STAT_U32(MAX_NETLEN, q->max_netlen); 2846 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); 2847 PUT_STAT_U32(MIN_NETLEN, q->min_netlen); 2848 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); 2849 2850 #undef PUT_STAT_U32 2851 #undef PUT_STAT_U64 2852 2853 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); 2854 if (!tstats) 2855 goto nla_put_failure; 2856 2857 #define PUT_TSTAT_U32(attr, data) do { \ 2858 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 2859 goto nla_put_failure; \ 2860 } while (0) 2861 #define PUT_TSTAT_U64(attr, data) do { \ 2862 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ 2863 data, TCA_CAKE_TIN_STATS_PAD)) \ 2864 goto nla_put_failure; \ 2865 } while (0) 2866 2867 for (i = 0; i < q->tin_cnt; i++) { 2868 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 2869 2870 ts = nla_nest_start_noflag(d->skb, i + 1); 2871 if (!ts) 2872 goto nla_put_failure; 2873 2874 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); 2875 PUT_TSTAT_U64(SENT_BYTES64, b->bytes); 2876 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); 2877 2878 PUT_TSTAT_U32(TARGET_US, 2879 ktime_to_us(ns_to_ktime(b->cparams.target))); 2880 PUT_TSTAT_U32(INTERVAL_US, 2881 ktime_to_us(ns_to_ktime(b->cparams.interval))); 2882 2883 PUT_TSTAT_U32(SENT_PACKETS, b->packets); 2884 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); 2885 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); 2886 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); 2887 2888 PUT_TSTAT_U32(PEAK_DELAY_US, 2889 ktime_to_us(ns_to_ktime(b->peak_delay))); 2890 PUT_TSTAT_U32(AVG_DELAY_US, 2891 ktime_to_us(ns_to_ktime(b->avge_delay))); 2892 PUT_TSTAT_U32(BASE_DELAY_US, 2893 ktime_to_us(ns_to_ktime(b->base_delay))); 2894 2895 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); 2896 PUT_TSTAT_U32(WAY_MISSES, 
b->way_misses); 2897 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); 2898 2899 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + 2900 b->decaying_flow_count); 2901 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); 2902 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); 2903 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); 2904 2905 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); 2906 nla_nest_end(d->skb, ts); 2907 } 2908 2909 #undef PUT_TSTAT_U32 2910 #undef PUT_TSTAT_U64 2911 2912 nla_nest_end(d->skb, tstats); 2913 return nla_nest_end(d->skb, stats); 2914 2915 nla_put_failure: 2916 nla_nest_cancel(d->skb, stats); 2917 return -1; 2918 } 2919 2920 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 2921 { 2922 return NULL; 2923 } 2924 2925 static unsigned long cake_find(struct Qdisc *sch, u32 classid) 2926 { 2927 return 0; 2928 } 2929 2930 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 2931 u32 classid) 2932 { 2933 return 0; 2934 } 2935 2936 static void cake_unbind(struct Qdisc *q, unsigned long cl) 2937 { 2938 } 2939 2940 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 2941 struct netlink_ext_ack *extack) 2942 { 2943 struct cake_sched_data *q = qdisc_priv(sch); 2944 2945 if (cl) 2946 return NULL; 2947 return q->block; 2948 } 2949 2950 static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 2951 struct sk_buff *skb, struct tcmsg *tcm) 2952 { 2953 tcm->tcm_handle |= TC_H_MIN(cl); 2954 return 0; 2955 } 2956 2957 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 2958 struct gnet_dump *d) 2959 { 2960 struct cake_sched_data *q = qdisc_priv(sch); 2961 const struct cake_flow *flow = NULL; 2962 struct gnet_stats_queue qs = { 0 }; 2963 struct nlattr *stats; 2964 u32 idx = cl - 1; 2965 2966 if (idx < CAKE_QUEUES * q->tin_cnt) { 2967 const struct cake_tin_data *b = \ 2968 &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 2969 const struct sk_buff *skb; 2970 2971 flow = &b->flows[idx % CAKE_QUEUES]; 2972 2973 if (flow->head) { 2974 sch_tree_lock(sch); 2975 skb = flow->head; 2976 while (skb) { 2977 qs.qlen++; 2978 skb = skb->next; 2979 } 2980 sch_tree_unlock(sch); 2981 } 2982 qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 2983 qs.drops = flow->dropped; 2984 } 2985 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 2986 return -1; 2987 if (flow) { 2988 ktime_t now = ktime_get(); 2989 2990 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 2991 if (!stats) 2992 return -1; 2993 2994 #define PUT_STAT_U32(attr, data) do { \ 2995 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2996 goto nla_put_failure; \ 2997 } while (0) 2998 #define PUT_STAT_S32(attr, data) do { \ 2999 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3000 goto nla_put_failure; \ 3001 } while (0) 3002 3003 PUT_STAT_S32(DEFICIT, flow->deficit); 3004 PUT_STAT_U32(DROPPING, flow->cvars.dropping); 3005 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 3006 PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 3007 if (flow->cvars.p_drop) { 3008 PUT_STAT_S32(BLUE_TIMER_US, 3009 ktime_to_us( 3010 ktime_sub(now, 3011 flow->cvars.blue_timer))); 3012 } 3013 if (flow->cvars.dropping) { 3014 PUT_STAT_S32(DROP_NEXT_US, 3015 ktime_to_us( 3016 ktime_sub(now, 3017 flow->cvars.drop_next))); 3018 } 3019 3020 if (nla_nest_end(d->skb, stats) < 0) 3021 return -1; 3022 } 3023 3024 return 0; 3025 3026 nla_put_failure: 3027 nla_nest_cancel(d->skb, stats); 3028 return -1; 3029 } 3030 3031 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 3032 { 
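	/* Each (tin, flow) pair is exposed as a read-only pseudo-class with
	 * the 1-based id i * CAKE_QUEUES + j + 1, which
	 * cake_dump_class_stats() above decodes as idx = cl - 1, tin
	 * q->tin_order[idx / CAKE_QUEUES] and flow idx % CAKE_QUEUES.
	 */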
3033 struct cake_sched_data *q = qdisc_priv(sch); 3034 unsigned int i, j; 3035 3036 if (arg->stop) 3037 return; 3038 3039 for (i = 0; i < q->tin_cnt; i++) { 3040 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3041 3042 for (j = 0; j < CAKE_QUEUES; j++) { 3043 if (list_empty(&b->flows[j].flowchain) || 3044 arg->count < arg->skip) { 3045 arg->count++; 3046 continue; 3047 } 3048 if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { 3049 arg->stop = 1; 3050 break; 3051 } 3052 arg->count++; 3053 } 3054 } 3055 } 3056 3057 static const struct Qdisc_class_ops cake_class_ops = { 3058 .leaf = cake_leaf, 3059 .find = cake_find, 3060 .tcf_block = cake_tcf_block, 3061 .bind_tcf = cake_bind, 3062 .unbind_tcf = cake_unbind, 3063 .dump = cake_dump_class, 3064 .dump_stats = cake_dump_class_stats, 3065 .walk = cake_walk, 3066 }; 3067 3068 static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 3069 .cl_ops = &cake_class_ops, 3070 .id = "cake", 3071 .priv_size = sizeof(struct cake_sched_data), 3072 .enqueue = cake_enqueue, 3073 .dequeue = cake_dequeue, 3074 .peek = qdisc_peek_dequeued, 3075 .init = cake_init, 3076 .reset = cake_reset, 3077 .destroy = cake_destroy, 3078 .change = cake_change, 3079 .dump = cake_dump, 3080 .dump_stats = cake_dump_stats, 3081 .owner = THIS_MODULE, 3082 }; 3083 3084 static int __init cake_module_init(void) 3085 { 3086 return register_qdisc(&cake_qdisc_ops); 3087 } 3088 3089 static void __exit cake_module_exit(void) 3090 { 3091 unregister_qdisc(&cake_qdisc_ops); 3092 } 3093 3094 module_init(cake_module_init) 3095 module_exit(cake_module_exit) 3096 MODULE_AUTHOR("Jonathan Morton"); 3097 MODULE_LICENSE("Dual BSD/GPL"); 3098 MODULE_DESCRIPTION("The CAKE shaper."); 3099
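/* Example usage (illustrative, via the iproute2 tc tool, assuming an egress
 * interface named eth0 and a 20 Mbit/s uplink):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * Omitting "bandwidth" leaves the shaper unlimited (rate_bps == 0), so only
 * flow isolation and the COBALT AQM are active; "autorate-ingress" instead
 * drives the shaper from q->avg_peak_bandwidth as estimated in
 * cake_enqueue().
 */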