// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* COMMON Applications Kept Enhanced (CAKE) discipline
 *
 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
 *
 * The CAKE Principles:
 *		(or, how to have your cake and eat it too)
 *
 * This is a combination of several shaping, AQM and FQ techniques into one
 * easy-to-use package:
 *
 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
 *   equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
 *   eliminating the need for any sort of burst parameter (eg. token bucket
 *   depth). Burst support is limited to that necessary to overcome scheduling
 *   latency.
 *
 * - A Diffserv-aware priority queue, giving more priority to certain classes,
 *   up to a specified fraction of bandwidth. Above that bandwidth threshold,
 *   the priority is reduced to avoid starving other tins.
 *
 * - Each priority tin has a separate Flow Queue system, to isolate traffic
 *   flows from each other. This prevents a burst on one flow from increasing
 *   the delay to another. Flows are distributed to queues using a
 *   set-associative hash function.
 *
 * - Each queue is actively managed by Cobalt, which is a combination of the
 *   Codel and Blue AQM algorithms. This serves flows fairly, and signals
 *   congestion early via ECN (if available) and/or packet drops, to keep
 *   latency low. The codel parameters are auto-tuned based on the bandwidth
 *   setting, as is necessary at low bandwidths.
 *
 * The configuration parameters are kept deliberately simple for ease of use.
 * Everything has sane defaults. Complete generality of configuration is *not*
 * a goal.
 *
 * The priority queue operates according to a weighted DRR scheme, combined
 * with a bandwidth tracker which reuses the shaper logic to detect which side
 * of the bandwidth sharing threshold the tin is operating on. This determines
 * whether a priority-based weight (high) or a bandwidth-based weight (low) is
 * used for that tin in the current pass.
 *
 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
 * granted us permission to leverage.
 */
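
/* Illustrative sketch (not part of the qdisc): the "deficit mode" shaper
 * described above needs no burst-depth parameter because it simply tracks
 * each packet's serialisation time against a virtual clock. Assuming a rate
 * expressed as nanoseconds-per-byte in fixed point (the role rate_ns and
 * rate_shft play further down), the per-packet schedule advance is:
 *
 *	// hypothetical helper, for illustration only
 *	static u64 next_transmit_ns(u64 time_this_ns, u32 len,
 *				    u64 rate_ns, u16 rate_shft)
 *	{
 *		// time_next = time_this + ((len * rate_ns) >> rate_shft)
 *		return time_this_ns + ((len * rate_ns) >> rate_shft);
 *	}
 *
 * A packet may only be released once the real clock has caught up with the
 * virtual one, so accumulated burstiness is limited to scheduling latency.
 */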

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/reciprocal_div.h>
#include <net/netlink.h>
#include <linux/version.h>
#include <linux/if_vlan.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/tcp.h>
#include <net/flow_dissector.h>

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif

#define CAKE_SET_WAYS (8)
#define CAKE_MAX_TINS (8)
#define CAKE_QUEUES (1024)
#define CAKE_FLOW_MASK 63
#define CAKE_FLOW_NAT_FLAG 64
#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */

/* struct cobalt_params - contains codel and blue parameters
 * @interval:	codel initial drop rate
 * @target:	maximum persistent sojourn time & blue update rate
 * @mtu_time:	serialisation delay of maximum-size packet
 * @p_inc:	increment of blue drop probability (0.32 fxp)
 * @p_dec:	decrement of blue drop probability (0.32 fxp)
 */
struct cobalt_params {
	u64	interval;
	u64	target;
	u64	mtu_time;
	u32	p_inc;
	u32	p_dec;
};

/* struct cobalt_vars - contains codel and blue variables
 * @count:		codel dropping frequency
 * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
 * @drop_next:		time to drop next packet, or when we dropped last
 * @blue_timer:		Blue time to next drop
 * @p_drop:		BLUE drop probability (0.32 fxp)
 * @dropping:		set if in dropping state
 * @ecn_marked:		set if marked
 */
struct cobalt_vars {
	u32	count;
	u32	rec_inv_sqrt;
	ktime_t	drop_next;
	ktime_t	blue_timer;
	u32	p_drop;
	bool	dropping;
	bool	ecn_marked;
};

enum {
	CAKE_SET_NONE = 0,
	CAKE_SET_SPARSE,
	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
	CAKE_SET_BULK,
	CAKE_SET_DECAYING
};

struct cake_flow {
	/* this stuff is all needed per-flow at dequeue time */
	struct sk_buff	  *head;
	struct sk_buff	  *tail;
	struct list_head  flowchain;
	s32		  deficit;
	u32		  dropped;
	struct cobalt_vars cvars;
	u16		  srchost; /* index into cake_host table */
	u16		  dsthost;
	u8		  set;
}; /* please try to keep this structure <= 64 bytes */

struct cake_host {
	u32 srchost_tag;
	u32 dsthost_tag;
	u16 srchost_refcnt;
	u16 dsthost_refcnt;
};

struct cake_heap_entry {
	u16 t:3, b:10;
};

struct cake_tin_data {
	struct cake_flow flows[CAKE_QUEUES];
	u32	backlogs[CAKE_QUEUES];
	u32	tags[CAKE_QUEUES]; /* for set association */
	u16	overflow_idx[CAKE_QUEUES];
	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
	u16	flow_quantum;

	struct cobalt_params cparams;
	u32	drop_overlimit;
	u16	bulk_flow_count;
	u16	sparse_flow_count;
	u16	decaying_flow_count;
	u16	unresponsive_flow_count;

	u32	max_skblen;

	struct list_head new_flows;
	struct list_head old_flows;
	struct list_head decaying_flows;

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	ktime_t	time_next_packet;
	u64	tin_rate_ns;
	u64	tin_rate_bps;
	u16	tin_rate_shft;

	u16	tin_quantum_prio;
	u16	tin_quantum_band;
	s32	tin_deficit;
	u32	tin_backlog;
	u32	tin_dropped;
	u32	tin_ecn_mark;

	u32	packets;
	u64	bytes;

	u32	ack_drops;

	/* moving averages */
	u64	avge_delay;
	u64	peak_delay;
	u64	base_delay;

	/* hash function stats */
	u32	way_directs;
	u32	way_hits;
	u32	way_misses;
	u32	way_collisions;
}; /* number of tins is small, so size of this struct doesn't matter much */

struct cake_sched_data {
	struct tcf_proto __rcu *filter_list; /* optional external classifier */
	struct tcf_block *block;
	struct cake_tin_data *tins;

	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
	u16		overflow_timeout;

	u16		tin_cnt;
	u8		tin_mode;
	u8		flow_mode;
	u8		ack_filter;
	u8		atm_mode;

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	u16		rate_shft;
	ktime_t		time_next_packet;
	ktime_t		failsafe_next_packet;
	u64		rate_ns;
	u64		rate_bps;
	u16		rate_flags;
	s16		rate_overhead;
	u16		rate_mpu;
	u64		interval;
	u64		target;

	/* resource tracking */
	u32		buffer_used;
	u32		buffer_max_used;
	u32		buffer_limit;
	u32		buffer_config_limit;

	/* indices for dequeue */
	u16		cur_tin;
	u16		cur_flow;

	struct qdisc_watchdog watchdog;
	const u8	*tin_index;
	const u8	*tin_order;

	/* bandwidth capacity estimate */
	ktime_t		last_packet_time;
	ktime_t		avg_window_begin;
	u64		avg_packet_interval;
	u64		avg_window_bytes;
	u64		avg_peak_bandwidth;
	ktime_t		last_reconfig_time;

	/* packet length stats */
	u32		avg_netoff;
	u16		max_netlen;
	u16		max_adjlen;
	u16		min_netlen;
	u16		min_adjlen;
};

enum {
	CAKE_FLAG_OVERHEAD	   = BIT(0),
	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
	CAKE_FLAG_INGRESS	   = BIT(2),
	CAKE_FLAG_WASH		   = BIT(3),
	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
};

/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 * obtain the best features of each. Codel is excellent on flows which
 * respond to congestion signals in a TCP-like way. BLUE is more effective on
 * unresponsive flows.
 */
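
/* Illustrative sketch (not part of the qdisc): the BLUE half of COBALT keeps
 * a drop probability p_drop as a 0.32 fixed-point fraction of U32_MAX, and
 * nudges it by p_inc on queue overflow and by p_dec when the queue runs
 * empty, exactly as the cobalt_queue_full()/cobalt_queue_empty() helpers
 * below do. A minimal saturating update, with hypothetical names:
 *
 *	static u32 blue_increase(u32 p_drop, u32 p_inc)
 *	{
 *		u32 p = p_drop + p_inc;
 *
 *		return p < p_drop ? ~0U : p;	// saturate instead of wrapping
 *	}
 *
 *	static u32 blue_decrease(u32 p_drop, u32 p_dec)
 *	{
 *		return p_drop < p_dec ? 0 : p_drop - p_dec;
 *	}
 *
 * A packet is then dropped when a 32-bit random number falls below p_drop,
 * which is how a 0.32 fixed-point probability is sampled.
 */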

struct cobalt_skb_cb {
	ktime_t enqueue_time;
	u32	adjusted_len;
};

static u64 us_to_ns(u64 us)
{
	return us * NSEC_PER_USEC;
}

static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
}

static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
{
	return get_cobalt_cb(skb)->enqueue_time;
}

static void cobalt_set_enqueue_time(struct sk_buff *skb,
				    ktime_t now)
{
	get_cobalt_cb(skb)->enqueue_time = now;
}

static u16 quantum_div[CAKE_QUEUES + 1] = {0};

/* Diffserv lookup tables */

static const u8 precedence[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4,
	5, 5, 5, 5, 5, 5, 5, 5,
	6, 6, 6, 6, 6, 6, 6, 6,
	7, 7, 7, 7, 7, 7, 7, 7,
};

static const u8 diffserv8[] = {
	2, 5, 1, 2, 4, 2, 2, 2,
	0, 2, 1, 2, 1, 2, 1, 2,
	5, 2, 4, 2, 4, 2, 4, 2,
	3, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 2, 2, 6, 2, 6, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
};

static const u8 diffserv4[] = {
	0, 2, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 0, 0, 3, 0, 3, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
};

static const u8 diffserv3[] = {
	0, 0, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 2, 0, 2, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
};

static const u8 besteffort[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
};

/* tin priority order for stats dumping */

static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
static const u8 bulk_order[] = {1, 0, 2, 3};

#define REC_INV_SQRT_CACHE (16)
static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};

/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 *
 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 */

static void cobalt_newton_step(struct cobalt_vars *vars)
{
	u32 invsqrt, invsqrt2;
	u64 val;

	invsqrt = vars->rec_inv_sqrt;
	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
	val = (3LL << 32) - ((u64)vars->count * invsqrt2);

	val >>= 2; /* avoid overflow in following multiply */
	val = (val * invsqrt) >> (32 - 2 + 1);

	vars->rec_inv_sqrt = val;
}

static void cobalt_invsqrt(struct cobalt_vars *vars)
{
	if (vars->count < REC_INV_SQRT_CACHE)
		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
	else
		cobalt_newton_step(vars);
}
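
/* Illustrative sketch (not part of the qdisc): rec_inv_sqrt stores
 * 1/sqrt(count) in Q0.32, so ~0U stands for (almost) 1.0. One way to
 * sanity-check the cache built below is to compare it against the closed
 * form 2^32 / sqrt(count) in a userspace harness (hypothetical code):
 *
 *	#include <math.h>
 *	#include <stdint.h>
 *
 *	static uint32_t ref_inv_sqrt(uint32_t count)
 *	{
 *		// reference value, clamped to the Q0.32 range
 *		double v = 4294967296.0 / sqrt((double)count);
 *
 *		return v >= 4294967295.0 ? 4294967295u : (uint32_t)v;
 *	}
 *
 * For count = 4 this gives 0x80000000 (0.5) and for count = 16 it gives
 * 0x40000000 (0.25); the Newton-derived cache entries should land very close
 * to these reference values.
 */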

/* There is a big difference in timing between the accurate values placed in
 * the cache and the approximations given by a single Newton step for small
 * count values, particularly when stepping from count 1 to 2 or vice versa.
 * Above 16, a single Newton step gives sufficient accuracy in either
 * direction, given the precision stored.
 *
 * The magnitude of the error when stepping up to count 2 is such as to give
 * the value that *should* have been produced at count 4.
 */

static void cobalt_cache_init(void)
{
	struct cobalt_vars v;

	memset(&v, 0, sizeof(v));
	v.rec_inv_sqrt = ~0U;
	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;

	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);
		cobalt_newton_step(&v);

		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
	}
}

static void cobalt_vars_init(struct cobalt_vars *vars)
{
	memset(vars, 0, sizeof(*vars));

	if (!cobalt_rec_inv_sqrt_cache[0]) {
		cobalt_cache_init();
		cobalt_rec_inv_sqrt_cache[0] = ~0;
	}
}

/* CoDel control_law is t + interval/sqrt(count)
 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 * both sqrt() and divide operation.
 */
static ktime_t cobalt_control(ktime_t t,
			      u64 interval,
			      u32 rec_inv_sqrt)
{
	return ktime_add_ns(t, reciprocal_scale(interval,
						rec_inv_sqrt));
}

/* Call this when a packet had to be dropped due to queue overflow. Returns
 * true if the BLUE state was quiescent before but active after this call.
 */
static bool cobalt_queue_full(struct cobalt_vars *vars,
			      struct cobalt_params *p,
			      ktime_t now)
{
	bool up = false;

	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		up = !vars->p_drop;
		vars->p_drop += p->p_inc;
		if (vars->p_drop < p->p_inc)
			vars->p_drop = ~0;
		vars->blue_timer = now;
	}
	vars->dropping = true;
	vars->drop_next = now;
	if (!vars->count)
		vars->count = 1;

	return up;
}

/* Call this when the queue was serviced but turned out to be empty. Returns
 * true if the BLUE state was active before but quiescent after this call.
 */
static bool cobalt_queue_empty(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now)
{
	bool down = false;

	if (vars->p_drop &&
	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		if (vars->p_drop < p->p_dec)
			vars->p_drop = 0;
		else
			vars->p_drop -= p->p_dec;
		vars->blue_timer = now;
		down = !vars->p_drop;
	}
	vars->dropping = false;

	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
		vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
	}

	return down;
}
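
/* Illustrative sketch (not part of the qdisc): reciprocal_scale() multiplies
 * a 32-bit value by a Q0.32 fraction and keeps the high word, so with
 * rec_inv_sqrt holding 1/sqrt(count) the control law above yields the
 * familiar CoDel schedule t + interval/sqrt(count) without a divide.
 * A worked example, assuming interval = 100 ms and count = 4:
 *
 *	u64 interval = 100 * NSEC_PER_MSEC;	// 100,000,000 ns
 *	u32 rec_inv_sqrt = 0x80000000;		// 1/sqrt(4) = 0.5 in Q0.32
 *	u64 delta = ((u64)interval * rec_inv_sqrt) >> 32;
 *	// delta == 50,000,000 ns, i.e. the next signal is 50 ms later
 */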

/* Call this with a freshly dequeued packet for possible congestion marking.
 * Returns true as an instruction to drop the packet, false for delivery.
 */
static bool cobalt_should_drop(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now,
			       struct sk_buff *skb,
			       u32 bulk_flows)
{
	bool next_due, over_target, drop = false;
	ktime_t schedule;
	u64 sojourn;

/* The 'schedule' variable records, in its sign, whether 'now' is before or
 * after 'drop_next'. This allows 'drop_next' to be updated before the next
 * scheduling decision is actually branched, without destroying that
 * information. Similarly, the first 'schedule' value calculated is preserved
 * in the boolean 'next_due'.
 *
 * As for 'drop_next', we take advantage of the fact that 'interval' is both
 * the delay between first exceeding 'target' and the first signalling event,
 * *and* the scaling factor for the signalling frequency. It's therefore very
 * natural to use a single mechanism for both purposes, and eliminates a
 * significant amount of reference Codel's spaghetti code. To help with this,
 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
 * as possible to 1.0 in fixed-point.
 */

	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
	schedule = ktime_sub(now, vars->drop_next);
	over_target = sojourn > p->target &&
		      sojourn > p->mtu_time * bulk_flows * 2 &&
		      sojourn > p->mtu_time * 4;
	next_due = vars->count && ktime_to_ns(schedule) >= 0;

	vars->ecn_marked = false;

	if (over_target) {
		if (!vars->dropping) {
			vars->dropping = true;
			vars->drop_next = cobalt_control(now,
							 p->interval,
							 vars->rec_inv_sqrt);
		}
		if (!vars->count)
			vars->count = 1;
	} else if (vars->dropping) {
		vars->dropping = false;
	}

	if (next_due && vars->dropping) {
		/* Use ECN mark if possible, otherwise drop */
		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));

		vars->count++;
		if (!vars->count)
			vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
		schedule = ktime_sub(now, vars->drop_next);
	} else {
		while (next_due) {
			vars->count--;
			cobalt_invsqrt(vars);
			vars->drop_next = cobalt_control(vars->drop_next,
							 p->interval,
							 vars->rec_inv_sqrt);
			schedule = ktime_sub(now, vars->drop_next);
			next_due = vars->count && ktime_to_ns(schedule) >= 0;
		}
	}

	/* Simple BLUE implementation. Lack of ECN is deliberate. */
	if (vars->p_drop)
		drop |= (prandom_u32() < vars->p_drop);

	/* Overload the drop_next field as an activity timeout */
	if (!vars->count)
		vars->drop_next = ktime_add_ns(now, p->interval);
	else if (ktime_to_ns(schedule) > 0 && !drop)
		vars->drop_next = now;

	return drop;
}

static void cake_update_flowkeys(struct flow_keys *keys,
				 const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	struct nf_conntrack_tuple tuple = {};
	bool rev = !skb->_nfct;

	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
		return;

	if (!nf_ct_get_tuple_skb(&tuple, skb))
		return;

	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;

	if (keys->ports.ports) {
		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
	}
#endif
}

/* Cake has several subtle multiple bit settings. In these cases you
 * would be matching triple isolate mode as well.
 */

static bool cake_dsrc(int flow_mode)
{
	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
}

static bool cake_ddst(int flow_mode)
{
	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
}

static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
		     int flow_mode)
{
	u32 flow_hash = 0, srchost_hash, dsthost_hash;
	u16 reduced_hash, srchost_idx, dsthost_idx;
	struct flow_keys keys, host_keys;

	if (unlikely(flow_mode == CAKE_FLOW_NONE))
		return 0;

	skb_flow_dissect_flow_keys(skb, &keys,
				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

	if (flow_mode & CAKE_FLOW_NAT_FLAG)
		cake_update_flowkeys(&keys, skb);

	/* flow_hash_from_keys() sorts the addresses by value, so we have
	 * to preserve their order in a separate data structure to treat
	 * src and dst host addresses as independently selectable.
	 */
	host_keys = keys;
	host_keys.ports.ports = 0;
	host_keys.basic.ip_proto = 0;
	host_keys.keyid.keyid = 0;
	host_keys.tags.flow_label = 0;

	switch (host_keys.control.addr_type) {
	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
		host_keys.addrs.v4addrs.src = 0;
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		host_keys.addrs.v4addrs.dst = 0;
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
		memset(&host_keys.addrs.v6addrs.src, 0,
		       sizeof(host_keys.addrs.v6addrs.src));
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		memset(&host_keys.addrs.v6addrs.dst, 0,
		       sizeof(host_keys.addrs.v6addrs.dst));
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	default:
		dsthost_hash = 0;
		srchost_hash = 0;
	}

	/* This *must* be after the above switch, since as a
	 * side-effect it sorts the src and dst addresses.
	 */
	if (flow_mode & CAKE_FLOW_FLOWS)
		flow_hash = flow_hash_from_keys(&keys);

	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
		if (flow_mode & CAKE_FLOW_SRC_IP)
			flow_hash ^= srchost_hash;

		if (flow_mode & CAKE_FLOW_DST_IP)
			flow_hash ^= dsthost_hash;
	}

	reduced_hash = flow_hash % CAKE_QUEUES;

	/* set-associative hashing */
	/* fast path if no hash collision (direct lookup succeeds) */
	if (likely(q->tags[reduced_hash] == flow_hash &&
		   q->flows[reduced_hash].set)) {
		q->way_directs++;
	} else {
		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
		u32 outer_hash = reduced_hash - inner_hash;
		bool allocate_src = false;
		bool allocate_dst = false;
		u32 i, k;

		/* check if any active queue in the set is reserved for
		 * this flow.
		 */
		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
		     i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (q->tags[outer_hash + k] == flow_hash) {
				if (i)
					q->way_hits++;

				if (!q->flows[outer_hash + k].set) {
					/* need to increment host refcnts */
					allocate_src = cake_dsrc(flow_mode);
					allocate_dst = cake_ddst(flow_mode);
				}

				goto found;
			}
		}

		/* no queue is reserved for this flow, look for an
		 * empty one.
		 */
		for (i = 0; i < CAKE_SET_WAYS;
		     i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (!q->flows[outer_hash + k].set) {
				q->way_misses++;
				allocate_src = cake_dsrc(flow_mode);
				allocate_dst = cake_ddst(flow_mode);
				goto found;
			}
		}

		/* With no empty queues, default to the original
		 * queue, accept the collision, update the host tags.
		 */
		q->way_collisions++;
		q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
		q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
		allocate_src = cake_dsrc(flow_mode);
		allocate_dst = cake_ddst(flow_mode);
found:
		/* reserve queue for future packets in same flow */
		reduced_hash = outer_hash + k;
		q->tags[reduced_hash] = flow_hash;

		if (allocate_src) {
			srchost_idx = srchost_hash % CAKE_QUEUES;
			inner_hash = srchost_idx % CAKE_SET_WAYS;
			outer_hash = srchost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].srchost_tag ==
				    srchost_hash)
					goto found_src;
			}
			for (i = 0; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].srchost_refcnt)
					break;
			}
			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
found_src:
			srchost_idx = outer_hash + k;
			q->hosts[srchost_idx].srchost_refcnt++;
			q->flows[reduced_hash].srchost = srchost_idx;
		}

		if (allocate_dst) {
			dsthost_idx = dsthost_hash % CAKE_QUEUES;
			inner_hash = dsthost_idx % CAKE_SET_WAYS;
			outer_hash = dsthost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].dsthost_tag ==
				    dsthost_hash)
					goto found_dst;
			}
			for (i = 0; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].dsthost_refcnt)
					break;
			}
			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
found_dst:
			dsthost_idx = outer_hash + k;
			q->hosts[dsthost_idx].dsthost_refcnt++;
			q->flows[reduced_hash].dsthost = dsthost_idx;
		}
	}

	return reduced_hash;
}

/* helper functions : might be changed when/if skb use a standard list_head */
/* remove one skb from head of slot queue */

static struct sk_buff *dequeue_head(struct cake_flow *flow)
{
	struct sk_buff *skb = flow->head;

	if (skb) {
		flow->head = skb->next;
		skb->next = NULL;
	}

	return skb;
}

/* add skb to flow queue (tail add) */

static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
{
	if (!flow->head)
		flow->head = skb;
	else
		flow->tail->next = skb;
	flow->tail = skb;
	skb->next = NULL;
}

static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
				    struct ipv6hdr *buf)
{
	unsigned int offset = skb_network_offset(skb);
	struct iphdr *iph;

	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);

	if (!iph)
		return NULL;

	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
		return skb_header_pointer(skb, offset + iph->ihl * 4,
					  sizeof(struct ipv6hdr), buf);

	else if (iph->version == 4)
		return iph;

	else if (iph->version == 6)
		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
					  buf);

	return NULL;
}
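
/* Illustrative sketch (not part of the qdisc): the set-associative lookup in
 * cake_hash() above treats the 1024 queues as 128 sets of CAKE_SET_WAYS (8)
 * ways. The index arithmetic reduces to splitting the hashed queue index
 * into a set base and a starting way, e.g. (hypothetical helper):
 *
 *	static void cake_set_indices(u32 flow_hash, u32 *outer, u32 *inner)
 *	{
 *		u32 reduced = flow_hash % CAKE_QUEUES;	// 0..1023
 *
 *		*inner = reduced % CAKE_SET_WAYS;	// way to start probing
 *		*outer = reduced - *inner;		// first queue of the set
 *	}
 *
 * Probing then walks the 8 ways of that one set, so a collision costs at
 * most 8 tag comparisons instead of a search over all 1024 queues.
 */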

static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
				      void *buf, unsigned int bufsize)
{
	unsigned int offset = skb_network_offset(skb);
	const struct ipv6hdr *ipv6h;
	const struct tcphdr *tcph;
	const struct iphdr *iph;
	struct ipv6hdr _ipv6h;
	struct tcphdr _tcph;

	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);

	if (!ipv6h)
		return NULL;

	if (ipv6h->version == 4) {
		iph = (struct iphdr *)ipv6h;
		offset += iph->ihl * 4;

		/* special-case 6in4 tunnelling, as that is a common way to get
		 * v6 connectivity in the home
		 */
		if (iph->protocol == IPPROTO_IPV6) {
			ipv6h = skb_header_pointer(skb, offset,
						   sizeof(_ipv6h), &_ipv6h);

			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
				return NULL;

			offset += sizeof(struct ipv6hdr);

		} else if (iph->protocol != IPPROTO_TCP) {
			return NULL;
		}

	} else if (ipv6h->version == 6) {
		if (ipv6h->nexthdr != IPPROTO_TCP)
			return NULL;

		offset += sizeof(struct ipv6hdr);
	} else {
		return NULL;
	}

	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
	if (!tcph)
		return NULL;

	return skb_header_pointer(skb, offset,
				  min(__tcp_hdrlen(tcph), bufsize), buf);
}

static const void *cake_get_tcpopt(const struct tcphdr *tcph,
				   int code, int *oplen)
{
	/* inspired by tcp_parse_options in tcp_input.c */
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
	const u8 *ptr = (const u8 *)(tcph + 1);

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			length--;
			continue;
		}
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;

		if (opcode == code) {
			*oplen = opsize;
			return ptr;
		}

		ptr += opsize - 2;
		length -= opsize;
	}

	return NULL;
}
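
/* Illustrative sketch (not part of the qdisc): the option walk above follows
 * the standard TCP layout of (kind, length, data) triples, with NOP and EOL
 * as single-byte kinds. Fetching the timestamp option, for example, is just:
 *
 *	int oplen;
 *	const u8 *ts = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &oplen);
 *
 *	if (ts && oplen == TCPOLEN_TIMESTAMP) {
 *		u32 tsval = get_unaligned_be32(ts);	// sender's clock
 *		u32 tsecr = get_unaligned_be32(ts + 4);	// echoed value
 *	}
 *
 * which is exactly the pattern cake_tcph_get_tstamp() uses further down.
 */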

/* Compare two SACK sequences. A sequence is considered greater if it SACKs
 * more bytes than the other. In the case where both sequences ACK bytes that
 * the other doesn't, A is considered greater. DSACKs in A also make A
 * considered greater.
 *
 * @return -1, 0 or 1 as normal compare functions
 */
static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
				  const struct tcphdr *tcph_b)
{
	const struct tcp_sack_block_wire *sack_a, *sack_b;
	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
	u32 bytes_a = 0, bytes_b = 0;
	int oplen_a, oplen_b;
	bool first = true;

	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);

	/* pointers point to option contents */
	oplen_a -= TCPOLEN_SACK_BASE;
	oplen_b -= TCPOLEN_SACK_BASE;

	if (sack_a && oplen_a >= sizeof(*sack_a) &&
	    (!sack_b || oplen_b < sizeof(*sack_b)))
		return -1;
	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
		 (!sack_a || oplen_a < sizeof(*sack_a)))
		return 1;
	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
		 (!sack_b || oplen_b < sizeof(*sack_b)))
		return 0;

	while (oplen_a >= sizeof(*sack_a)) {
		const struct tcp_sack_block_wire *sack_tmp = sack_b;
		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
		int oplen_tmp = oplen_b;
		bool found = false;

		/* DSACK; always considered greater to prevent dropping */
		if (before(start_a, ack_seq_a))
			return -1;

		bytes_a += end_a - start_a;

		while (oplen_tmp >= sizeof(*sack_tmp)) {
			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);

			/* first time through we count the total size */
			if (first)
				bytes_b += end_b - start_b;

			if (!after(start_b, start_a) && !before(end_b, end_a)) {
				found = true;
				if (!first)
					break;
			}
			oplen_tmp -= sizeof(*sack_tmp);
			sack_tmp++;
		}

		if (!found)
			return -1;

		oplen_a -= sizeof(*sack_a);
		sack_a++;
		first = false;
	}

	/* If we made it this far, all ranges SACKed by A are covered by B, so
	 * either the SACKs are equal, or B SACKs more bytes.
	 */
	return bytes_b > bytes_a ? 1 : 0;
}

static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
				 u32 *tsval, u32 *tsecr)
{
	const u8 *ptr;
	int opsize;

	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);

	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
		*tsval = get_unaligned_be32(ptr);
		*tsecr = get_unaligned_be32(ptr + 4);
	}
}

static bool cake_tcph_may_drop(const struct tcphdr *tcph,
			       u32 tstamp_new, u32 tsecr_new)
{
	/* inspired by tcp_parse_options in tcp_input.c */
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
	const u8 *ptr = (const u8 *)(tcph + 1);
	u32 tstamp, tsecr;

	/* 3 reserved flags must be unset to avoid future breakage
	 * ACK must be set
	 * ECE/CWR are handled separately
	 * All other flags URG/PSH/RST/SYN/FIN must be unset
	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
	 * 0x00C00000 = CWR/ECE (handled separately)
	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
	 */
	if (((tcp_flag_word(tcph) &
	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
		return false;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			length--;
			continue;
		}
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;

		switch (opcode) {
		case TCPOPT_MD5SIG: /* doesn't influence state */
			break;

		case TCPOPT_SACK: /* stricter checking performed later */
			if (opsize % 8 != 2)
				return false;
			break;

		case TCPOPT_TIMESTAMP:
			/* only drop timestamps lower than new */
			if (opsize != TCPOLEN_TIMESTAMP)
				return false;
			tstamp = get_unaligned_be32(ptr);
			tsecr = get_unaligned_be32(ptr + 4);
			if (after(tstamp, tstamp_new) ||
			    after(tsecr, tsecr_new))
				return false;
			break;

		case TCPOPT_MSS: /* these should only be set on SYN */
		case TCPOPT_WINDOW:
		case TCPOPT_SACK_PERM:
		case TCPOPT_FASTOPEN:
		case TCPOPT_EXP:
		default: /* don't drop if any unknown options are present */
			return false;
		}

		ptr += opsize - 2;
		length -= opsize;
	}

	return true;
}
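
/* Illustrative sketch (not part of the qdisc): tcp_flag_word() exposes the
 * 4th 32-bit word of the TCP header (data offset, reserved bits and flags)
 * in network byte order, so the mask arithmetic in the comment above can be
 * spelled out directly (macro names are hypothetical):
 *
 *	#define CAKE_ALL_TCP_FLAGS	cpu_to_be32(0x0FFF0000)	// res + flags
 *	#define CAKE_ECE_CWR		cpu_to_be32(0x00C00000)
 *	// the test actually used: every flag except ECE/CWR must equal ACK
 *	bool pure_ack = (tcp_flag_word(tcph) &
 *			 (CAKE_ALL_TCP_FLAGS & ~CAKE_ECE_CWR)) == TCP_FLAG_ACK;
 *
 * 0x0FFF0000 & ~0x00C00000 is exactly the 0x0F3F0000 constant tested in
 * cake_tcph_may_drop().
 */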

static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
				       struct cake_flow *flow)
{
	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
	struct sk_buff *skb_check, *skb_prev = NULL;
	const struct ipv6hdr *ipv6h, *ipv6h_check;
	unsigned char _tcph[64], _tcph_check[64];
	const struct tcphdr *tcph, *tcph_check;
	const struct iphdr *iph, *iph_check;
	struct ipv6hdr _iph, _iph_check;
	const struct sk_buff *skb;
	int seglen, num_found = 0;
	u32 tstamp = 0, tsecr = 0;
	__be32 elig_flags = 0;
	int sack_comp;

	/* no other possible ACKs to filter */
	if (flow->head == flow->tail)
		return NULL;

	skb = flow->tail;
	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
	iph = cake_get_iphdr(skb, &_iph);
	if (!tcph)
		return NULL;

	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);

	/* the 'triggering' packet need only have the ACK flag set.
	 * also check that SYN is not set, as there won't be any previous ACKs.
	 */
	if ((tcp_flag_word(tcph) &
	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
		return NULL;

	/* the 'triggering' ACK is at the tail of the queue, we have already
	 * returned if it is the only packet in the flow. loop through the rest
	 * of the queue looking for pure ACKs with the same 5-tuple as the
	 * triggering one.
	 */
	for (skb_check = flow->head;
	     skb_check && skb_check != skb;
	     skb_prev = skb_check, skb_check = skb_check->next) {
		iph_check = cake_get_iphdr(skb_check, &_iph_check);
		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
					     sizeof(_tcph_check));

		/* only TCP packets with matching 5-tuple are eligible, and only
		 * drop safe headers
		 */
		if (!tcph_check || iph->version != iph_check->version ||
		    tcph_check->source != tcph->source ||
		    tcph_check->dest != tcph->dest)
			continue;

		if (iph_check->version == 4) {
			if (iph_check->saddr != iph->saddr ||
			    iph_check->daddr != iph->daddr)
				continue;

			seglen = ntohs(iph_check->tot_len) -
				       (4 * iph_check->ihl);
		} else if (iph_check->version == 6) {
			ipv6h = (struct ipv6hdr *)iph;
			ipv6h_check = (struct ipv6hdr *)iph_check;

			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
				continue;

			seglen = ntohs(ipv6h_check->payload_len);
		} else {
			WARN_ON(1); /* shouldn't happen */
			continue;
		}

		/* If the ECE/CWR flags changed from the previous eligible
		 * packet in the same flow, we should no longer be dropping that
		 * previous packet as this would lose information.
		 */
		if (elig_ack && (tcp_flag_word(tcph_check) &
				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
			elig_ack = NULL;
			elig_ack_prev = NULL;
			num_found--;
		}

		/* Check TCP options and flags, don't drop ACKs with segment
		 * data, and don't drop ACKs with a higher cumulative ACK
		 * counter than the triggering packet. Check ACK seqno here to
		 * avoid parsing SACK options of packets we are going to exclude
		 * anyway.
		 */
		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
			continue;

		/* Check SACK options. The triggering packet must SACK more data
		 * than the ACK under consideration, or SACK the same range but
		 * have a larger cumulative ACK counter. The latter is a
		 * pathological case, but is contained in the following check
		 * anyway, just to be safe.
		 */
		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);

		if (sack_comp < 0 ||
		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
		     sack_comp == 0))
			continue;

		/* At this point we have found an eligible pure ACK to drop; if
		 * we are in aggressive mode, we are done. Otherwise, keep
		 * searching unless this is the second eligible ACK we
		 * found.
		 *
		 * Since we want to drop ACK closest to the head of the queue,
		 * save the first eligible ACK we find, even if we need to loop
		 * again.
		 */
		if (!elig_ack) {
			elig_ack = skb_check;
			elig_ack_prev = skb_prev;
			elig_flags = (tcp_flag_word(tcph_check)
				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
		}

		if (num_found++ > 0)
			goto found;
	}

	/* We made it through the queue without finding two eligible ACKs. If
	 * we found a single eligible ACK we can drop it in aggressive mode if
	 * we can guarantee that this does not interfere with ECN flag
	 * information. We ensure this by dropping it only if the enqueued
	 * packet is consecutive with the eligible ACK, and their flags match.
	 */
	if (elig_ack && aggressive && elig_ack->next == skb &&
	    (elig_flags == (tcp_flag_word(tcph) &
			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
		goto found;

	return NULL;

found:
	if (elig_ack_prev)
		elig_ack_prev->next = elig_ack->next;
	else
		flow->head = elig_ack->next;

	elig_ack->next = NULL;

	return elig_ack;
}

static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
{
	avg -= avg >> shift;
	avg += sample >> shift;
	return avg;
}

static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
{
	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
		len -= off;

	if (q->max_netlen < len)
		q->max_netlen = len;
	if (q->min_netlen > len)
		q->min_netlen = len;

	len += q->rate_overhead;

	if (len < q->rate_mpu)
		len = q->rate_mpu;

	if (q->atm_mode == CAKE_ATM_ATM) {
		len += 47;
		len /= 48;
		len *= 53;
	} else if (q->atm_mode == CAKE_ATM_PTM) {
		/* Add one byte per 64 bytes or part thereof.
		 * This is conservative and easier to calculate than the
		 * precise value.
		 */
		len += (len + 63) / 64;
	}

	if (q->max_adjlen < len)
		q->max_adjlen = len;
	if (q->min_adjlen > len)
		q->min_adjlen = len;

	return len;
}

static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	unsigned int hdr_len, last_len = 0;
	u32 off = skb_network_offset(skb);
	u32 len = qdisc_pkt_len(skb);
	u16 segs = 1;

	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);

	if (!shinfo->gso_size)
		return cake_calc_overhead(q, len, off);

	/* borrowed from qdisc_pkt_len_init() */
	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

	/* + transport layer */
	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
				       SKB_GSO_TCPV6))) {
		const struct tcphdr *th;
		struct tcphdr _tcphdr;

		th = skb_header_pointer(skb, skb_transport_offset(skb),
					sizeof(_tcphdr), &_tcphdr);
		if (likely(th))
			hdr_len += __tcp_hdrlen(th);
	} else {
		struct udphdr _udphdr;

		if (skb_header_pointer(skb, skb_transport_offset(skb),
				       sizeof(_udphdr), &_udphdr))
			hdr_len += sizeof(struct udphdr);
	}

	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
		segs = DIV_ROUND_UP(skb->len - hdr_len,
				    shinfo->gso_size);
	else
		segs = shinfo->gso_segs;

	len = shinfo->gso_size + hdr_len;
	last_len = skb->len - shinfo->gso_size * (segs - 1);

	return (cake_calc_overhead(q, len, off) * (segs - 1) +
		cake_calc_overhead(q, last_len, off));
}
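
/* Illustrative sketch (not part of the qdisc): the ATM branch above rounds
 * each overhead-adjusted frame up to whole 48-byte cell payloads and then
 * charges 53 bytes per cell, which is how ADSL links actually occupy the
 * wire. A worked example, assuming a 1500-byte IP packet plus 8 bytes of
 * PPPoA overhead:
 *
 *	u32 len = 1500 + 8;		// 1508 bytes to carry
 *	u32 cells = (len + 47) / 48;	// 32 cells (ceiling division)
 *	u32 wire = cells * 53;		// 1696 bytes actually transmitted
 *
 * The PTM branch instead adds one byte per started 64-byte block to model
 * 64b/65b encoding, e.g. the same 1508 bytes are charged 1508 + 24 = 1532.
 */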

static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
{
	struct cake_heap_entry ii = q->overflow_heap[i];
	struct cake_heap_entry jj = q->overflow_heap[j];

	q->overflow_heap[i] = jj;
	q->overflow_heap[j] = ii;

	q->tins[ii.t].overflow_idx[ii.b] = j;
	q->tins[jj.t].overflow_idx[jj.b] = i;
}

static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
{
	struct cake_heap_entry ii = q->overflow_heap[i];

	return q->tins[ii.t].backlogs[ii.b];
}

static void cake_heapify(struct cake_sched_data *q, u16 i)
{
	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
	u32 mb = cake_heap_get_backlog(q, i);
	u32 m = i;

	while (m < a) {
		u32 l = m + m + 1;
		u32 r = l + 1;

		if (l < a) {
			u32 lb = cake_heap_get_backlog(q, l);

			if (lb > mb) {
				m = l;
				mb = lb;
			}
		}

		if (r < a) {
			u32 rb = cake_heap_get_backlog(q, r);

			if (rb > mb) {
				m = r;
				mb = rb;
			}
		}

		if (m != i) {
			cake_heap_swap(q, i, m);
			i = m;
		} else {
			break;
		}
	}
}

static void cake_heapify_up(struct cake_sched_data *q, u16 i)
{
	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
		u16 p = (i - 1) >> 1;
		u32 ib = cake_heap_get_backlog(q, i);
		u32 pb = cake_heap_get_backlog(q, p);

		if (ib > pb) {
			cake_heap_swap(q, i, p);
			i = p;
		} else {
			break;
		}
	}
}

static int cake_advance_shaper(struct cake_sched_data *q,
			       struct cake_tin_data *b,
			       struct sk_buff *skb,
			       ktime_t now, bool drop)
{
	u32 len = get_cobalt_cb(skb)->adjusted_len;

	/* charge packet bandwidth to this tin
	 * and to the global shaper.
	 */
	if (q->rate_ns) {
		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
		u64 failsafe_dur = global_dur + (global_dur >> 1);

		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = ktime_add_ns(b->time_next_packet,
							   tin_dur);

		else if (ktime_before(b->time_next_packet,
				      ktime_add_ns(now, tin_dur)))
			b->time_next_packet = ktime_add_ns(now, tin_dur);

		q->time_next_packet = ktime_add_ns(q->time_next_packet,
						   global_dur);
		if (!drop)
			q->failsafe_next_packet = \
				ktime_add_ns(q->failsafe_next_packet,
					     failsafe_dur);
	}
	return len;
}

static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	ktime_t now = ktime_get();
	u32 idx = 0, tin = 0, len;
	struct cake_heap_entry qq;
	struct cake_tin_data *b;
	struct cake_flow *flow;
	struct sk_buff *skb;

	if (!q->overflow_timeout) {
		int i;
		/* Build fresh max-heap */
		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
			cake_heapify(q, i);
	}
	q->overflow_timeout = 65535;

	/* select longest queue for pruning */
	qq = q->overflow_heap[0];
	tin = qq.t;
	idx = qq.b;

	b = &q->tins[tin];
	flow = &b->flows[idx];
	skb = dequeue_head(flow);
	if (unlikely(!skb)) {
		/* heap has gone wrong, rebuild it next time */
		q->overflow_timeout = 0;
		return idx + (tin << 16);
	}

	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
		b->unresponsive_flow_count++;

	len = qdisc_pkt_len(skb);
	q->buffer_used -= skb->truesize;
	b->backlogs[idx] -= len;
	b->tin_backlog -= len;
	sch->qstats.backlog -= len;
	qdisc_tree_reduce_backlog(sch, 1, len);

	flow->dropped++;
	b->tin_dropped++;
	sch->qstats.drops++;

	if (q->rate_flags & CAKE_FLAG_INGRESS)
		cake_advance_shaper(q, b, skb, now, true);

	__qdisc_drop(skb, to_free);
	sch->q.qlen--;

	cake_heapify(q, 0);

	return idx + (tin << 16);
}
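
/* Illustrative sketch (not part of the qdisc): the overflow heap is a
 * standard implicit binary max-heap over all CAKE_MAX_TINS * CAKE_QUEUES
 * queue backlogs, so cake_drop() can find the longest queue in O(1) and
 * repair the heap in O(log n). The index arithmetic is the textbook one:
 *
 *	static inline u32 heap_parent(u32 i) { return (i - 1) >> 1; }
 *	static inline u32 heap_left(u32 i)   { return 2 * i + 1; }
 *	static inline u32 heap_right(u32 i)  { return 2 * i + 2; }
 *
 * (hypothetical helpers; cake_heapify() and cake_heapify_up() inline the
 * same expressions). Each heap entry packs the tin and queue index into a
 * 3-bit/10-bit bitfield, which is why the heap can cover 8 * 1024 queues.
 */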

static void cake_wash_diffserv(struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_IP):
		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
		break;
	case htons(ETH_P_IPV6):
		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
		break;
	default:
		break;
	}
}

static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
{
	u8 dscp;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
		if (wash && dscp)
			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
		return dscp;

	case htons(ETH_P_IPV6):
		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
		if (wash && dscp)
			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
		return dscp;

	case htons(ETH_P_ARP):
		return 0x38; /* CS7 - Net Control */

	default:
		/* If there is no Diffserv field, treat as best-effort */
		return 0;
	}
}

static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
					     struct sk_buff *skb)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	u32 tin;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->tin_cnt) {
		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];

		if (q->rate_flags & CAKE_FLAG_WASH)
			cake_wash_diffserv(skb);
	} else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
		/* extract the Diffserv Precedence field, if it exists */
		/* and clear DSCP bits if washing */
		tin = q->tin_index[cake_handle_diffserv(skb,
				q->rate_flags & CAKE_FLAG_WASH)];
		if (unlikely(tin >= q->tin_cnt))
			tin = 0;
	} else {
		tin = 0;
		if (q->rate_flags & CAKE_FLAG_WASH)
			cake_wash_diffserv(skb);
	}

	return &q->tins[tin];
}

static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
			 struct sk_buff *skb, int flow_mode, int *qerr)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct tcf_proto *filter;
	struct tcf_result res;
	u32 flow = 0;
	int result;

	filter = rcu_dereference_bh(q->filter_list);
	if (!filter)
		goto hash;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, filter, &res, false);

	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
			flow = TC_H_MIN(res.classid);
	}
hash:
	*t = cake_select_tin(sch, skb);
	return flow ?: cake_hash(*t, skb, flow_mode) + 1;
}

static void cake_reconfigure(struct Qdisc *sch);

static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			struct sk_buff **to_free)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	int len = qdisc_pkt_len(skb);
	int uninitialized_var(ret);
	struct sk_buff *ack = NULL;
	ktime_t now = ktime_get();
	struct cake_tin_data *b;
	struct cake_flow *flow;
	u32 idx;

	/* choose flow to insert into */
	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
	if (idx == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	idx--;
	flow = &b->flows[idx];

	/* ensure shaper state isn't stale */
	if (!b->tin_backlog) {
		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = now;

		if (!sch->q.qlen) {
			if (ktime_before(q->time_next_packet, now)) {
				q->failsafe_next_packet = now;
				q->time_next_packet = now;
			} else if (ktime_after(q->time_next_packet, now) &&
				   ktime_after(q->failsafe_next_packet, now)) {
				u64 next = \
					min(ktime_to_ns(q->time_next_packet),
					    ktime_to_ns(
						   q->failsafe_next_packet));
				sch->qstats.overlimits++;
				qdisc_watchdog_schedule_ns(&q->watchdog, next);
			}
		}
	}

	if (unlikely(len > b->max_skblen))
		b->max_skblen = len;

	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
		struct sk_buff *segs, *nskb;
		netdev_features_t features = netif_skb_features(skb);
		unsigned int slen = 0;

		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
		if (IS_ERR_OR_NULL(segs))
			return qdisc_drop(skb, sch, to_free);

		while (segs) {
			nskb = segs->next;
			segs->next = NULL;
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			cobalt_set_enqueue_time(segs, now);
			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
									  segs);
			flow_queue_add(flow, segs);

			sch->q.qlen++;
			slen += segs->len;
			q->buffer_used += segs->truesize;
			b->packets++;
			segs = nskb;
		}

		/* stats */
		b->bytes += slen;
		b->backlogs[idx] += slen;
		b->tin_backlog += slen;
		sch->qstats.backlog += slen;
		q->avg_window_bytes += slen;

		qdisc_tree_reduce_backlog(sch, 1, len);
		consume_skb(skb);
	} else {
		/* not splitting */
		cobalt_set_enqueue_time(skb, now);
		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
		flow_queue_add(flow, skb);

		if (q->ack_filter)
			ack = cake_ack_filter(q, flow);

		if (ack) {
			b->ack_drops++;
			sch->qstats.drops++;
			b->bytes += qdisc_pkt_len(ack);
			len -= qdisc_pkt_len(ack);
			q->buffer_used += skb->truesize - ack->truesize;
			if (q->rate_flags & CAKE_FLAG_INGRESS)
				cake_advance_shaper(q, b, ack, now, true);

			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
			consume_skb(ack);
		} else {
			sch->q.qlen++;
			q->buffer_used += skb->truesize;
		}

		/* stats */
		b->packets++;
		b->bytes += len;
		b->backlogs[idx] += len;
		b->tin_backlog += len;
		sch->qstats.backlog += len;
		q->avg_window_bytes += len;
	}

	if (q->overflow_timeout)
		cake_heapify_up(q, b->overflow_idx[idx]);

	/* incoming bandwidth capacity estimate */
	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
		u64 packet_interval = \
			ktime_to_ns(ktime_sub(now, q->last_packet_time));

		if (packet_interval > NSEC_PER_SEC)
			packet_interval = NSEC_PER_SEC;

		/* filter out short-term bursts, eg. wifi aggregation */
		q->avg_packet_interval = \
			cake_ewma(q->avg_packet_interval,
				  packet_interval,
				  (packet_interval > q->avg_packet_interval ?
					   2 : 8));

		q->last_packet_time = now;

		if (packet_interval > q->avg_packet_interval) {
			u64 window_interval = \
				ktime_to_ns(ktime_sub(now,
						      q->avg_window_begin));
			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;

			do_div(b, window_interval);
			q->avg_peak_bandwidth =
				cake_ewma(q->avg_peak_bandwidth, b,
					  b > q->avg_peak_bandwidth ? 2 : 8);
			q->avg_window_bytes = 0;
			q->avg_window_begin = now;

			if (ktime_after(now,
					ktime_add_ms(q->last_reconfig_time,
						     250))) {
				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
				cake_reconfigure(sch);
			}
		}
	} else {
		q->avg_window_bytes = 0;
		q->last_packet_time = now;
	}

	/* flowchain */
	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
		struct cake_host *srchost = &b->hosts[flow->srchost];
		struct cake_host *dsthost = &b->hosts[flow->dsthost];
		u16 host_load = 1;

		if (!flow->set) {
			list_add_tail(&flow->flowchain, &b->new_flows);
		} else {
			b->decaying_flow_count--;
			list_move_tail(&flow->flowchain, &b->new_flows);
		}
		flow->set = CAKE_SET_SPARSE;
		b->sparse_flow_count++;

		if (cake_dsrc(q->flow_mode))
			host_load = max(host_load, srchost->srchost_refcnt);

		if (cake_ddst(q->flow_mode))
			host_load = max(host_load, dsthost->dsthost_refcnt);

		flow->deficit = (b->flow_quantum *
				 quantum_div[host_load]) >> 16;
	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
		/* this flow was empty, accounted as a sparse flow, but actually
		 * in the bulk rotation.
		 */
		flow->set = CAKE_SET_BULK;
		b->sparse_flow_count--;
		b->bulk_flow_count++;
	}

	if (q->buffer_used > q->buffer_max_used)
		q->buffer_max_used = q->buffer_used;

	if (q->buffer_used > q->buffer_limit) {
		u32 dropped = 0;

		while (q->buffer_used > q->buffer_limit) {
			dropped++;
			cake_drop(sch, to_free);
		}
		b->drop_overlimit += dropped;
	}
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct cake_tin_data *b = &q->tins[q->cur_tin];
	struct cake_flow *flow = &b->flows[q->cur_flow];
	struct sk_buff *skb = NULL;
	u32 len;

	if (flow->head) {
		skb = dequeue_head(flow);
		len = qdisc_pkt_len(skb);
		b->backlogs[q->cur_flow] -= len;
		b->tin_backlog -= len;
		sch->qstats.backlog -= len;
		q->buffer_used -= skb->truesize;
		sch->q.qlen--;

		if (q->overflow_timeout)
			cake_heapify(q, b->overflow_idx[q->cur_flow]);
	}
	return skb;
}
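
/* Illustrative sketch (not part of the qdisc): in autorate-ingress mode the
 * enqueue path above estimates the achievable receive rate from the bytes
 * seen during each burst window, then sets the shaper slightly below it so
 * the bottleneck queue stays inside the qdisc. Assuming 12,500,000 bytes
 * were observed over a 100 ms window:
 *
 *	u64 window_bytes = 12500000;
 *	u64 window_ns = 100 * NSEC_PER_MSEC;
 *	u64 est = window_bytes * NSEC_PER_SEC / window_ns; // 125,000,000 B/s
 *	u64 shaped = (est * 15) >> 4;	// 117,187,500 B/s, i.e. ~93.75%
 *
 * The asymmetric EWMA shifts (2 when the sample rises, 8 when it falls)
 * make the estimate track upward quickly but decay slowly.
 */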

/* Discard leftover packets from a tin no longer in use. */
static void cake_clear_tin(struct Qdisc *sch, u16 tin)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	q->cur_tin = tin;
	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
		while (!!(skb = cake_dequeue_one(sch)))
			kfree_skb(skb);
}

static struct sk_buff *cake_dequeue(struct Qdisc *sch)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct cake_tin_data *b = &q->tins[q->cur_tin];
	struct cake_host *srchost, *dsthost;
	ktime_t now = ktime_get();
	struct cake_flow *flow;
	struct list_head *head;
	bool first_flow = true;
	struct sk_buff *skb;
	u16 host_load;
	u64 delay;
	u32 len;

begin:
	if (!sch->q.qlen)
		return NULL;

	/* global hard shaper */
	if (ktime_after(q->time_next_packet, now) &&
	    ktime_after(q->failsafe_next_packet, now)) {
		u64 next = min(ktime_to_ns(q->time_next_packet),
			       ktime_to_ns(q->failsafe_next_packet));

		sch->qstats.overlimits++;
		qdisc_watchdog_schedule_ns(&q->watchdog, next);
		return NULL;
	}

	/* Choose a class to work on. */
	if (!q->rate_ns) {
		/* In unlimited mode, can't rely on shaper timings, just balance
		 * with DRR
		 */
		bool wrapped = false, empty = true;

		while (b->tin_deficit < 0 ||
		       !(b->sparse_flow_count + b->bulk_flow_count)) {
			if (b->tin_deficit <= 0)
				b->tin_deficit += b->tin_quantum_band;
			if (b->sparse_flow_count + b->bulk_flow_count)
				empty = false;

			q->cur_tin++;
			b++;
			if (q->cur_tin >= q->tin_cnt) {
				q->cur_tin = 0;
				b = q->tins;

				if (wrapped) {
					/* It's possible for q->qlen to be
					 * nonzero when we actually have no
					 * packets anywhere.
					 */
					if (empty)
						return NULL;
				} else {
					wrapped = true;
				}
			}
		}
	} else {
		/* In shaped mode, choose:
		 * - Highest-priority tin with queue and meeting schedule, or
		 * - The earliest-scheduled tin with queue.
		 */
		ktime_t best_time = KTIME_MAX;
		int tin, best_tin = 0;

		for (tin = 0; tin < q->tin_cnt; tin++) {
			b = q->tins + tin;
			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
				ktime_t time_to_pkt = \
					ktime_sub(b->time_next_packet, now);

				if (ktime_to_ns(time_to_pkt) <= 0 ||
				    ktime_compare(time_to_pkt,
						  best_time) <= 0) {
					best_time = time_to_pkt;
					best_tin = tin;
				}
			}
		}

		q->cur_tin = best_tin;
		b = q->tins + best_tin;

		/* No point in going further if no packets to deliver. */
		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
			return NULL;
	}

retry:
	/* service this class */
	head = &b->decaying_flows;
	if (!first_flow || list_empty(head)) {
		head = &b->new_flows;
		if (list_empty(head)) {
			head = &b->old_flows;
			if (unlikely(list_empty(head))) {
				head = &b->decaying_flows;
				if (unlikely(list_empty(head)))
					goto begin;
			}
		}
	}
	flow = list_first_entry(head, struct cake_flow, flowchain);
	q->cur_flow = flow - b->flows;
	first_flow = false;

	/* triple isolation (modified DRR++) */
	srchost = &b->hosts[flow->srchost];
	dsthost = &b->hosts[flow->dsthost];
	host_load = 1;

	if (cake_dsrc(q->flow_mode))
		host_load = max(host_load, srchost->srchost_refcnt);

	if (cake_ddst(q->flow_mode))
		host_load = max(host_load, dsthost->dsthost_refcnt);

	WARN_ON(host_load > CAKE_QUEUES);

	/* flow isolation (DRR++) */
	if (flow->deficit <= 0) {
		/* The shifted prandom_u32() is a way to apply dithering to
		 * avoid accumulating roundoff errors
		 */
		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
				  (prandom_u32() >> 16)) >> 16;
		list_move_tail(&flow->flowchain, &b->old_flows);

		/* Keep all flows with deficits out of the sparse and decaying
		 * rotations. No non-empty flow can go into the decaying
		 * rotation, so they can't get deficits
		 */
		if (flow->set == CAKE_SET_SPARSE) {
			if (flow->head) {
				b->sparse_flow_count--;
				b->bulk_flow_count++;
				flow->set = CAKE_SET_BULK;
			} else {
				/* we've moved it to the bulk rotation for
				 * correct deficit accounting but we still want
				 * to count it as a sparse flow, not a bulk one.
1994 */ 1995 flow->set = CAKE_SET_SPARSE_WAIT; 1996 } 1997 } 1998 goto retry; 1999 } 2000 2001 /* Retrieve a packet via the AQM */ 2002 while (1) { 2003 skb = cake_dequeue_one(sch); 2004 if (!skb) { 2005 /* this queue was actually empty */ 2006 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) 2007 b->unresponsive_flow_count--; 2008 2009 if (flow->cvars.p_drop || flow->cvars.count || 2010 ktime_before(now, flow->cvars.drop_next)) { 2011 /* keep in the flowchain until the state has 2012 * decayed to rest 2013 */ 2014 list_move_tail(&flow->flowchain, 2015 &b->decaying_flows); 2016 if (flow->set == CAKE_SET_BULK) { 2017 b->bulk_flow_count--; 2018 b->decaying_flow_count++; 2019 } else if (flow->set == CAKE_SET_SPARSE || 2020 flow->set == CAKE_SET_SPARSE_WAIT) { 2021 b->sparse_flow_count--; 2022 b->decaying_flow_count++; 2023 } 2024 flow->set = CAKE_SET_DECAYING; 2025 } else { 2026 /* remove empty queue from the flowchain */ 2027 list_del_init(&flow->flowchain); 2028 if (flow->set == CAKE_SET_SPARSE || 2029 flow->set == CAKE_SET_SPARSE_WAIT) 2030 b->sparse_flow_count--; 2031 else if (flow->set == CAKE_SET_BULK) 2032 b->bulk_flow_count--; 2033 else 2034 b->decaying_flow_count--; 2035 2036 flow->set = CAKE_SET_NONE; 2037 srchost->srchost_refcnt--; 2038 dsthost->dsthost_refcnt--; 2039 } 2040 goto begin; 2041 } 2042 2043 /* Last packet in queue may be marked, shouldn't be dropped */ 2044 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, 2045 (b->bulk_flow_count * 2046 !!(q->rate_flags & 2047 CAKE_FLAG_INGRESS))) || 2048 !flow->head) 2049 break; 2050 2051 /* drop this packet, get another one */ 2052 if (q->rate_flags & CAKE_FLAG_INGRESS) { 2053 len = cake_advance_shaper(q, b, skb, 2054 now, true); 2055 flow->deficit -= len; 2056 b->tin_deficit -= len; 2057 } 2058 flow->dropped++; 2059 b->tin_dropped++; 2060 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); 2061 qdisc_qstats_drop(sch); 2062 kfree_skb(skb); 2063 if (q->rate_flags & CAKE_FLAG_INGRESS) 2064 goto retry; 2065 } 2066 2067 b->tin_ecn_mark += !!flow->cvars.ecn_marked; 2068 qdisc_bstats_update(sch, skb); 2069 2070 /* collect delay stats */ 2071 delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 2072 b->avge_delay = cake_ewma(b->avge_delay, delay, 8); 2073 b->peak_delay = cake_ewma(b->peak_delay, delay, 2074 delay > b->peak_delay ? 2 : 8); 2075 b->base_delay = cake_ewma(b->base_delay, delay, 2076 delay < b->base_delay ? 
2 : 8); 2077 2078 len = cake_advance_shaper(q, b, skb, now, false); 2079 flow->deficit -= len; 2080 b->tin_deficit -= len; 2081 2082 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { 2083 u64 next = min(ktime_to_ns(q->time_next_packet), 2084 ktime_to_ns(q->failsafe_next_packet)); 2085 2086 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2087 } else if (!sch->q.qlen) { 2088 int i; 2089 2090 for (i = 0; i < q->tin_cnt; i++) { 2091 if (q->tins[i].decaying_flow_count) { 2092 ktime_t next = \ 2093 ktime_add_ns(now, 2094 q->tins[i].cparams.target); 2095 2096 qdisc_watchdog_schedule_ns(&q->watchdog, 2097 ktime_to_ns(next)); 2098 break; 2099 } 2100 } 2101 } 2102 2103 if (q->overflow_timeout) 2104 q->overflow_timeout--; 2105 2106 return skb; 2107 } 2108 2109 static void cake_reset(struct Qdisc *sch) 2110 { 2111 u32 c; 2112 2113 for (c = 0; c < CAKE_MAX_TINS; c++) 2114 cake_clear_tin(sch, c); 2115 } 2116 2117 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { 2118 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2119 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2120 [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2121 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2122 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2123 [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2124 [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2125 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2126 [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2127 [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2128 [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2129 [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2130 [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2131 [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2132 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2133 }; 2134 2135 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2136 u64 target_ns, u64 rtt_est_ns) 2137 { 2138 /* convert byte-rate into time-per-byte 2139 * so it will always unwedge in reasonable time. 2140 */ 2141 static const u64 MIN_RATE = 64; 2142 u32 byte_target = mtu; 2143 u64 byte_target_ns; 2144 u8 rate_shft = 0; 2145 u64 rate_ns = 0; 2146 2147 b->flow_quantum = 1514; 2148 if (rate) { 2149 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); 2150 rate_shft = 34; 2151 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2152 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2153 while (!!(rate_ns >> 34)) { 2154 rate_ns >>= 1; 2155 rate_shft--; 2156 } 2157 } /* else unlimited, ie. 
zero delay */ 2158 2159 b->tin_rate_bps = rate; 2160 b->tin_rate_ns = rate_ns; 2161 b->tin_rate_shft = rate_shft; 2162 2163 byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2164 2165 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); 2166 b->cparams.interval = max(rtt_est_ns + 2167 b->cparams.target - target_ns, 2168 b->cparams.target * 2); 2169 b->cparams.mtu_time = byte_target_ns; 2170 b->cparams.p_inc = 1 << 24; /* 1/256 */ 2171 b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2172 } 2173 2174 static int cake_config_besteffort(struct Qdisc *sch) 2175 { 2176 struct cake_sched_data *q = qdisc_priv(sch); 2177 struct cake_tin_data *b = &q->tins[0]; 2178 u32 mtu = psched_mtu(qdisc_dev(sch)); 2179 u64 rate = q->rate_bps; 2180 2181 q->tin_cnt = 1; 2182 2183 q->tin_index = besteffort; 2184 q->tin_order = normal_order; 2185 2186 cake_set_rate(b, rate, mtu, 2187 us_to_ns(q->target), us_to_ns(q->interval)); 2188 b->tin_quantum_band = 65535; 2189 b->tin_quantum_prio = 65535; 2190 2191 return 0; 2192 } 2193 2194 static int cake_config_precedence(struct Qdisc *sch) 2195 { 2196 /* convert high-level (user visible) parameters into internal format */ 2197 struct cake_sched_data *q = qdisc_priv(sch); 2198 u32 mtu = psched_mtu(qdisc_dev(sch)); 2199 u64 rate = q->rate_bps; 2200 u32 quantum1 = 256; 2201 u32 quantum2 = 256; 2202 u32 i; 2203 2204 q->tin_cnt = 8; 2205 q->tin_index = precedence; 2206 q->tin_order = normal_order; 2207 2208 for (i = 0; i < q->tin_cnt; i++) { 2209 struct cake_tin_data *b = &q->tins[i]; 2210 2211 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2212 us_to_ns(q->interval)); 2213 2214 b->tin_quantum_prio = max_t(u16, 1U, quantum1); 2215 b->tin_quantum_band = max_t(u16, 1U, quantum2); 2216 2217 /* calculate next class's parameters */ 2218 rate *= 7; 2219 rate >>= 3; 2220 2221 quantum1 *= 3; 2222 quantum1 >>= 1; 2223 2224 quantum2 *= 7; 2225 quantum2 >>= 3; 2226 } 2227 2228 return 0; 2229 } 2230 2231 /* List of known Diffserv codepoints: 2232 * 2233 * Least Effort (CS1) 2234 * Best Effort (CS0) 2235 * Max Reliability & LLT "Lo" (TOS1) 2236 * Max Throughput (TOS2) 2237 * Min Delay (TOS4) 2238 * LLT "La" (TOS5) 2239 * Assured Forwarding 1 (AF1x) - x3 2240 * Assured Forwarding 2 (AF2x) - x3 2241 * Assured Forwarding 3 (AF3x) - x3 2242 * Assured Forwarding 4 (AF4x) - x3 2243 * Precedence Class 2 (CS2) 2244 * Precedence Class 3 (CS3) 2245 * Precedence Class 4 (CS4) 2246 * Precedence Class 5 (CS5) 2247 * Precedence Class 6 (CS6) 2248 * Precedence Class 7 (CS7) 2249 * Voice Admit (VA) 2250 * Expedited Forwarding (EF) 2251 2252 * Total 25 codepoints. 2253 */ 2254 2255 /* List of traffic classes in RFC 4594: 2256 * (roughly descending order of contended priority) 2257 * (roughly ascending order of uncontended throughput) 2258 * 2259 * Network Control (CS6,CS7) - routing traffic 2260 * Telephony (EF,VA) - aka. VoIP streams 2261 * Signalling (CS5) - VoIP setup 2262 * Multimedia Conferencing (AF4x) - aka. video calls 2263 * Realtime Interactive (CS4) - eg. games 2264 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2265 * Broadcast Video (CS3) 2266 * Low Latency Data (AF2x,TOS4) - eg. database 2267 * Ops, Admin, Management (CS2,TOS1) - eg. ssh 2268 * Standard Service (CS0 & unrecognised codepoints) 2269 * High Throughput Data (AF1x,TOS2) - eg. web traffic 2270 * Low Priority Data (CS1) - eg. BitTorrent 2271 2272 * Total 12 traffic classes. 
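 *
 * The diffserv8, diffserv4 and diffserv3 configurations below collapse
 * these classes into 8, 4 or 3 tins respectively.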
2273 */ 2274 2275 static int cake_config_diffserv8(struct Qdisc *sch) 2276 { 2277 /* Pruned list of traffic classes for typical applications: 2278 * 2279 * Network Control (CS6, CS7) 2280 * Minimum Latency (EF, VA, CS5, CS4) 2281 * Interactive Shell (CS2, TOS1) 2282 * Low Latency Transactions (AF2x, TOS4) 2283 * Video Streaming (AF4x, AF3x, CS3) 2284 * Bog Standard (CS0 etc.) 2285 * High Throughput (AF1x, TOS2) 2286 * Background Traffic (CS1) 2287 * 2288 * Total 8 traffic classes. 2289 */ 2290 2291 struct cake_sched_data *q = qdisc_priv(sch); 2292 u32 mtu = psched_mtu(qdisc_dev(sch)); 2293 u64 rate = q->rate_bps; 2294 u32 quantum1 = 256; 2295 u32 quantum2 = 256; 2296 u32 i; 2297 2298 q->tin_cnt = 8; 2299 2300 /* codepoint to class mapping */ 2301 q->tin_index = diffserv8; 2302 q->tin_order = normal_order; 2303 2304 /* class characteristics */ 2305 for (i = 0; i < q->tin_cnt; i++) { 2306 struct cake_tin_data *b = &q->tins[i]; 2307 2308 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2309 us_to_ns(q->interval)); 2310 2311 b->tin_quantum_prio = max_t(u16, 1U, quantum1); 2312 b->tin_quantum_band = max_t(u16, 1U, quantum2); 2313 2314 /* calculate next class's parameters */ 2315 rate *= 7; 2316 rate >>= 3; 2317 2318 quantum1 *= 3; 2319 quantum1 >>= 1; 2320 2321 quantum2 *= 7; 2322 quantum2 >>= 3; 2323 } 2324 2325 return 0; 2326 } 2327 2328 static int cake_config_diffserv4(struct Qdisc *sch) 2329 { 2330 /* Further pruned list of traffic classes for four-class system: 2331 * 2332 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2333 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) 2334 * Best Effort (CS0, AF1x, TOS2, and those not specified) 2335 * Background Traffic (CS1) 2336 * 2337 * Total 4 traffic classes. 2338 */ 2339 2340 struct cake_sched_data *q = qdisc_priv(sch); 2341 u32 mtu = psched_mtu(qdisc_dev(sch)); 2342 u64 rate = q->rate_bps; 2343 u32 quantum = 1024; 2344 2345 q->tin_cnt = 4; 2346 2347 /* codepoint to class mapping */ 2348 q->tin_index = diffserv4; 2349 q->tin_order = bulk_order; 2350 2351 /* class characteristics */ 2352 cake_set_rate(&q->tins[0], rate, mtu, 2353 us_to_ns(q->target), us_to_ns(q->interval)); 2354 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2355 us_to_ns(q->target), us_to_ns(q->interval)); 2356 cake_set_rate(&q->tins[2], rate >> 1, mtu, 2357 us_to_ns(q->target), us_to_ns(q->interval)); 2358 cake_set_rate(&q->tins[3], rate >> 2, mtu, 2359 us_to_ns(q->target), us_to_ns(q->interval)); 2360 2361 /* priority weights */ 2362 q->tins[0].tin_quantum_prio = quantum; 2363 q->tins[1].tin_quantum_prio = quantum >> 4; 2364 q->tins[2].tin_quantum_prio = quantum << 2; 2365 q->tins[3].tin_quantum_prio = quantum << 4; 2366 2367 /* bandwidth-sharing weights */ 2368 q->tins[0].tin_quantum_band = quantum; 2369 q->tins[1].tin_quantum_band = quantum >> 4; 2370 q->tins[2].tin_quantum_band = quantum >> 1; 2371 q->tins[3].tin_quantum_band = quantum >> 2; 2372 2373 return 0; 2374 } 2375 2376 static int cake_config_diffserv3(struct Qdisc *sch) 2377 { 2378 /* Simplified Diffserv structure with 3 tins. 
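 * This is the default tin_mode (see cake_init() and the default case
 * in cake_reconfigure()).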
2379 * Low Priority (CS1) 2380 * Best Effort 2381 * Latency Sensitive (TOS4, VA, EF, CS6, CS7) 2382 */ 2383 struct cake_sched_data *q = qdisc_priv(sch); 2384 u32 mtu = psched_mtu(qdisc_dev(sch)); 2385 u64 rate = q->rate_bps; 2386 u32 quantum = 1024; 2387 2388 q->tin_cnt = 3; 2389 2390 /* codepoint to class mapping */ 2391 q->tin_index = diffserv3; 2392 q->tin_order = bulk_order; 2393 2394 /* class characteristics */ 2395 cake_set_rate(&q->tins[0], rate, mtu, 2396 us_to_ns(q->target), us_to_ns(q->interval)); 2397 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2398 us_to_ns(q->target), us_to_ns(q->interval)); 2399 cake_set_rate(&q->tins[2], rate >> 2, mtu, 2400 us_to_ns(q->target), us_to_ns(q->interval)); 2401 2402 /* priority weights */ 2403 q->tins[0].tin_quantum_prio = quantum; 2404 q->tins[1].tin_quantum_prio = quantum >> 4; 2405 q->tins[2].tin_quantum_prio = quantum << 4; 2406 2407 /* bandwidth-sharing weights */ 2408 q->tins[0].tin_quantum_band = quantum; 2409 q->tins[1].tin_quantum_band = quantum >> 4; 2410 q->tins[2].tin_quantum_band = quantum >> 2; 2411 2412 return 0; 2413 } 2414 2415 static void cake_reconfigure(struct Qdisc *sch) 2416 { 2417 struct cake_sched_data *q = qdisc_priv(sch); 2418 int c, ft; 2419 2420 switch (q->tin_mode) { 2421 case CAKE_DIFFSERV_BESTEFFORT: 2422 ft = cake_config_besteffort(sch); 2423 break; 2424 2425 case CAKE_DIFFSERV_PRECEDENCE: 2426 ft = cake_config_precedence(sch); 2427 break; 2428 2429 case CAKE_DIFFSERV_DIFFSERV8: 2430 ft = cake_config_diffserv8(sch); 2431 break; 2432 2433 case CAKE_DIFFSERV_DIFFSERV4: 2434 ft = cake_config_diffserv4(sch); 2435 break; 2436 2437 case CAKE_DIFFSERV_DIFFSERV3: 2438 default: 2439 ft = cake_config_diffserv3(sch); 2440 break; 2441 } 2442 2443 for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { 2444 cake_clear_tin(sch, c); 2445 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; 2446 } 2447 2448 q->rate_ns = q->tins[ft].tin_rate_ns; 2449 q->rate_shft = q->tins[ft].tin_rate_shft; 2450 2451 if (q->buffer_config_limit) { 2452 q->buffer_limit = q->buffer_config_limit; 2453 } else if (q->rate_bps) { 2454 u64 t = q->rate_bps * q->interval; 2455 2456 do_div(t, USEC_PER_SEC / 4); 2457 q->buffer_limit = max_t(u32, t, 4U << 20); 2458 } else { 2459 q->buffer_limit = ~0; 2460 } 2461 2462 sch->flags &= ~TCQ_F_CAN_BYPASS; 2463 2464 q->buffer_limit = min(q->buffer_limit, 2465 max(sch->limit * psched_mtu(qdisc_dev(sch)), 2466 q->buffer_config_limit)); 2467 } 2468 2469 static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2470 struct netlink_ext_ack *extack) 2471 { 2472 struct cake_sched_data *q = qdisc_priv(sch); 2473 struct nlattr *tb[TCA_CAKE_MAX + 1]; 2474 int err; 2475 2476 if (!opt) 2477 return -EINVAL; 2478 2479 err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack); 2480 if (err < 0) 2481 return err; 2482 2483 if (tb[TCA_CAKE_NAT]) { 2484 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 2485 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2486 q->flow_mode |= CAKE_FLOW_NAT_FLAG * 2487 !!nla_get_u32(tb[TCA_CAKE_NAT]); 2488 #else 2489 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2490 "No conntrack support in kernel"); 2491 return -EOPNOTSUPP; 2492 #endif 2493 } 2494 2495 if (tb[TCA_CAKE_BASE_RATE64]) 2496 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); 2497 2498 if (tb[TCA_CAKE_DIFFSERV_MODE]) 2499 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); 2500 2501 if (tb[TCA_CAKE_WASH]) { 2502 if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2503 q->rate_flags |= CAKE_FLAG_WASH; 2504 else 2505 q->rate_flags &= ~CAKE_FLAG_WASH; 2506 } 2507 
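	/* TCA_CAKE_FLOW_MODE carries only the flow-dissection mode in its low
	 * bits; preserve the NAT flag, which is controlled separately by
	 * TCA_CAKE_NAT above.
	 */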
2508 if (tb[TCA_CAKE_FLOW_MODE]) 2509 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | 2510 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2511 CAKE_FLOW_MASK)); 2512 2513 if (tb[TCA_CAKE_ATM]) 2514 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); 2515 2516 if (tb[TCA_CAKE_OVERHEAD]) { 2517 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); 2518 q->rate_flags |= CAKE_FLAG_OVERHEAD; 2519 2520 q->max_netlen = 0; 2521 q->max_adjlen = 0; 2522 q->min_netlen = ~0; 2523 q->min_adjlen = ~0; 2524 } 2525 2526 if (tb[TCA_CAKE_RAW]) { 2527 q->rate_flags &= ~CAKE_FLAG_OVERHEAD; 2528 2529 q->max_netlen = 0; 2530 q->max_adjlen = 0; 2531 q->min_netlen = ~0; 2532 q->min_adjlen = ~0; 2533 } 2534 2535 if (tb[TCA_CAKE_MPU]) 2536 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); 2537 2538 if (tb[TCA_CAKE_RTT]) { 2539 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2540 2541 if (!q->interval) 2542 q->interval = 1; 2543 } 2544 2545 if (tb[TCA_CAKE_TARGET]) { 2546 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2547 2548 if (!q->target) 2549 q->target = 1; 2550 } 2551 2552 if (tb[TCA_CAKE_AUTORATE]) { 2553 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) 2554 q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2555 else 2556 q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2557 } 2558 2559 if (tb[TCA_CAKE_INGRESS]) { 2560 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2561 q->rate_flags |= CAKE_FLAG_INGRESS; 2562 else 2563 q->rate_flags &= ~CAKE_FLAG_INGRESS; 2564 } 2565 2566 if (tb[TCA_CAKE_ACK_FILTER]) 2567 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); 2568 2569 if (tb[TCA_CAKE_MEMORY]) 2570 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); 2571 2572 if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD) 2573 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2574 else 2575 q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2576 2577 if (q->tins) { 2578 sch_tree_lock(sch); 2579 cake_reconfigure(sch); 2580 sch_tree_unlock(sch); 2581 } 2582 2583 return 0; 2584 } 2585 2586 static void cake_destroy(struct Qdisc *sch) 2587 { 2588 struct cake_sched_data *q = qdisc_priv(sch); 2589 2590 qdisc_watchdog_cancel(&q->watchdog); 2591 tcf_block_put(q->block); 2592 kvfree(q->tins); 2593 } 2594 2595 static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2596 struct netlink_ext_ack *extack) 2597 { 2598 struct cake_sched_data *q = qdisc_priv(sch); 2599 int i, j, err; 2600 2601 sch->limit = 10240; 2602 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2603 q->flow_mode = CAKE_FLOW_TRIPLE; 2604 2605 q->rate_bps = 0; /* unlimited by default */ 2606 2607 q->interval = 100000; /* 100ms default */ 2608 q->target = 5000; /* 5ms: codel RFC argues 2609 * for 5 to 10% of interval 2610 */ 2611 2612 q->cur_tin = 0; 2613 q->cur_flow = 0; 2614 2615 qdisc_watchdog_init(&q->watchdog, sch); 2616 2617 if (opt) { 2618 int err = cake_change(sch, opt, extack); 2619 2620 if (err) 2621 return err; 2622 } 2623 2624 err = tcf_block_get(&q->block, &q->filter_list, sch, extack); 2625 if (err) 2626 return err; 2627 2628 quantum_div[0] = ~0; 2629 for (i = 1; i <= CAKE_QUEUES; i++) 2630 quantum_div[i] = 65535 / i; 2631 2632 q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data), 2633 GFP_KERNEL); 2634 if (!q->tins) 2635 goto nomem; 2636 2637 for (i = 0; i < CAKE_MAX_TINS; i++) { 2638 struct cake_tin_data *b = q->tins + i; 2639 2640 INIT_LIST_HEAD(&b->new_flows); 2641 INIT_LIST_HEAD(&b->old_flows); 2642 INIT_LIST_HEAD(&b->decaying_flows); 2643 b->sparse_flow_count = 0; 2644 b->bulk_flow_count = 0; 2645 b->decaying_flow_count = 0; 2646 2647 for (j = 0; j < CAKE_QUEUES; j++) { 2648 struct 
cake_flow *flow = b->flows + j; 2649 u32 k = j * CAKE_MAX_TINS + i; 2650 2651 INIT_LIST_HEAD(&flow->flowchain); 2652 cobalt_vars_init(&flow->cvars); 2653 2654 q->overflow_heap[k].t = i; 2655 q->overflow_heap[k].b = j; 2656 b->overflow_idx[j] = k; 2657 } 2658 } 2659 2660 cake_reconfigure(sch); 2661 q->avg_peak_bandwidth = q->rate_bps; 2662 q->min_netlen = ~0; 2663 q->min_adjlen = ~0; 2664 return 0; 2665 2666 nomem: 2667 cake_destroy(sch); 2668 return -ENOMEM; 2669 } 2670 2671 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2672 { 2673 struct cake_sched_data *q = qdisc_priv(sch); 2674 struct nlattr *opts; 2675 2676 opts = nla_nest_start(skb, TCA_OPTIONS); 2677 if (!opts) 2678 goto nla_put_failure; 2679 2680 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, 2681 TCA_CAKE_PAD)) 2682 goto nla_put_failure; 2683 2684 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, 2685 q->flow_mode & CAKE_FLOW_MASK)) 2686 goto nla_put_failure; 2687 2688 if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) 2689 goto nla_put_failure; 2690 2691 if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) 2692 goto nla_put_failure; 2693 2694 if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) 2695 goto nla_put_failure; 2696 2697 if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2698 !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2699 goto nla_put_failure; 2700 2701 if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2702 !!(q->rate_flags & CAKE_FLAG_INGRESS))) 2703 goto nla_put_failure; 2704 2705 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) 2706 goto nla_put_failure; 2707 2708 if (nla_put_u32(skb, TCA_CAKE_NAT, 2709 !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) 2710 goto nla_put_failure; 2711 2712 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) 2713 goto nla_put_failure; 2714 2715 if (nla_put_u32(skb, TCA_CAKE_WASH, 2716 !!(q->rate_flags & CAKE_FLAG_WASH))) 2717 goto nla_put_failure; 2718 2719 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) 2720 goto nla_put_failure; 2721 2722 if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) 2723 if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2724 goto nla_put_failure; 2725 2726 if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) 2727 goto nla_put_failure; 2728 2729 if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) 2730 goto nla_put_failure; 2731 2732 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 2733 !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) 2734 goto nla_put_failure; 2735 2736 return nla_nest_end(skb, opts); 2737 2738 nla_put_failure: 2739 return -1; 2740 } 2741 2742 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 2743 { 2744 struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP); 2745 struct cake_sched_data *q = qdisc_priv(sch); 2746 struct nlattr *tstats, *ts; 2747 int i; 2748 2749 if (!stats) 2750 return -1; 2751 2752 #define PUT_STAT_U32(attr, data) do { \ 2753 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2754 goto nla_put_failure; \ 2755 } while (0) 2756 #define PUT_STAT_U64(attr, data) do { \ 2757 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 2758 data, TCA_CAKE_STATS_PAD)) \ 2759 goto nla_put_failure; \ 2760 } while (0) 2761 2762 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); 2763 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); 2764 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); 2765 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); 2766 PUT_STAT_U32(MAX_NETLEN, q->max_netlen); 2767 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); 2768 PUT_STAT_U32(MIN_NETLEN, q->min_netlen); 2769 PUT_STAT_U32(MIN_ADJLEN, 
q->min_adjlen); 2770 2771 #undef PUT_STAT_U32 2772 #undef PUT_STAT_U64 2773 2774 tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS); 2775 if (!tstats) 2776 goto nla_put_failure; 2777 2778 #define PUT_TSTAT_U32(attr, data) do { \ 2779 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 2780 goto nla_put_failure; \ 2781 } while (0) 2782 #define PUT_TSTAT_U64(attr, data) do { \ 2783 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ 2784 data, TCA_CAKE_TIN_STATS_PAD)) \ 2785 goto nla_put_failure; \ 2786 } while (0) 2787 2788 for (i = 0; i < q->tin_cnt; i++) { 2789 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 2790 2791 ts = nla_nest_start(d->skb, i + 1); 2792 if (!ts) 2793 goto nla_put_failure; 2794 2795 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); 2796 PUT_TSTAT_U64(SENT_BYTES64, b->bytes); 2797 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); 2798 2799 PUT_TSTAT_U32(TARGET_US, 2800 ktime_to_us(ns_to_ktime(b->cparams.target))); 2801 PUT_TSTAT_U32(INTERVAL_US, 2802 ktime_to_us(ns_to_ktime(b->cparams.interval))); 2803 2804 PUT_TSTAT_U32(SENT_PACKETS, b->packets); 2805 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); 2806 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); 2807 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); 2808 2809 PUT_TSTAT_U32(PEAK_DELAY_US, 2810 ktime_to_us(ns_to_ktime(b->peak_delay))); 2811 PUT_TSTAT_U32(AVG_DELAY_US, 2812 ktime_to_us(ns_to_ktime(b->avge_delay))); 2813 PUT_TSTAT_U32(BASE_DELAY_US, 2814 ktime_to_us(ns_to_ktime(b->base_delay))); 2815 2816 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); 2817 PUT_TSTAT_U32(WAY_MISSES, b->way_misses); 2818 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); 2819 2820 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + 2821 b->decaying_flow_count); 2822 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); 2823 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); 2824 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); 2825 2826 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); 2827 nla_nest_end(d->skb, ts); 2828 } 2829 2830 #undef PUT_TSTAT_U32 2831 #undef PUT_TSTAT_U64 2832 2833 nla_nest_end(d->skb, tstats); 2834 return nla_nest_end(d->skb, stats); 2835 2836 nla_put_failure: 2837 nla_nest_cancel(d->skb, stats); 2838 return -1; 2839 } 2840 2841 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 2842 { 2843 return NULL; 2844 } 2845 2846 static unsigned long cake_find(struct Qdisc *sch, u32 classid) 2847 { 2848 return 0; 2849 } 2850 2851 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 2852 u32 classid) 2853 { 2854 return 0; 2855 } 2856 2857 static void cake_unbind(struct Qdisc *q, unsigned long cl) 2858 { 2859 } 2860 2861 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 2862 struct netlink_ext_ack *extack) 2863 { 2864 struct cake_sched_data *q = qdisc_priv(sch); 2865 2866 if (cl) 2867 return NULL; 2868 return q->block; 2869 } 2870 2871 static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 2872 struct sk_buff *skb, struct tcmsg *tcm) 2873 { 2874 tcm->tcm_handle |= TC_H_MIN(cl); 2875 return 0; 2876 } 2877 2878 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 2879 struct gnet_dump *d) 2880 { 2881 struct cake_sched_data *q = qdisc_priv(sch); 2882 const struct cake_flow *flow = NULL; 2883 struct gnet_stats_queue qs = { 0 }; 2884 struct nlattr *stats; 2885 u32 idx = cl - 1; 2886 2887 if (idx < CAKE_QUEUES * q->tin_cnt) { 2888 const struct cake_tin_data *b = \ 2889 &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 2890 
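		/* class IDs are 1-based (see cake_walk()): idx selects the
		 * tin via tin_order[idx / CAKE_QUEUES] and the flow within
		 * it via idx % CAKE_QUEUES.
		 */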
const struct sk_buff *skb; 2891 2892 flow = &b->flows[idx % CAKE_QUEUES]; 2893 2894 if (flow->head) { 2895 sch_tree_lock(sch); 2896 skb = flow->head; 2897 while (skb) { 2898 qs.qlen++; 2899 skb = skb->next; 2900 } 2901 sch_tree_unlock(sch); 2902 } 2903 qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 2904 qs.drops = flow->dropped; 2905 } 2906 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 2907 return -1; 2908 if (flow) { 2909 ktime_t now = ktime_get(); 2910 2911 stats = nla_nest_start(d->skb, TCA_STATS_APP); 2912 if (!stats) 2913 return -1; 2914 2915 #define PUT_STAT_U32(attr, data) do { \ 2916 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2917 goto nla_put_failure; \ 2918 } while (0) 2919 #define PUT_STAT_S32(attr, data) do { \ 2920 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2921 goto nla_put_failure; \ 2922 } while (0) 2923 2924 PUT_STAT_S32(DEFICIT, flow->deficit); 2925 PUT_STAT_U32(DROPPING, flow->cvars.dropping); 2926 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 2927 PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 2928 if (flow->cvars.p_drop) { 2929 PUT_STAT_S32(BLUE_TIMER_US, 2930 ktime_to_us( 2931 ktime_sub(now, 2932 flow->cvars.blue_timer))); 2933 } 2934 if (flow->cvars.dropping) { 2935 PUT_STAT_S32(DROP_NEXT_US, 2936 ktime_to_us( 2937 ktime_sub(now, 2938 flow->cvars.drop_next))); 2939 } 2940 2941 if (nla_nest_end(d->skb, stats) < 0) 2942 return -1; 2943 } 2944 2945 return 0; 2946 2947 nla_put_failure: 2948 nla_nest_cancel(d->skb, stats); 2949 return -1; 2950 } 2951 2952 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 2953 { 2954 struct cake_sched_data *q = qdisc_priv(sch); 2955 unsigned int i, j; 2956 2957 if (arg->stop) 2958 return; 2959 2960 for (i = 0; i < q->tin_cnt; i++) { 2961 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 2962 2963 for (j = 0; j < CAKE_QUEUES; j++) { 2964 if (list_empty(&b->flows[j].flowchain) || 2965 arg->count < arg->skip) { 2966 arg->count++; 2967 continue; 2968 } 2969 if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { 2970 arg->stop = 1; 2971 break; 2972 } 2973 arg->count++; 2974 } 2975 } 2976 } 2977 2978 static const struct Qdisc_class_ops cake_class_ops = { 2979 .leaf = cake_leaf, 2980 .find = cake_find, 2981 .tcf_block = cake_tcf_block, 2982 .bind_tcf = cake_bind, 2983 .unbind_tcf = cake_unbind, 2984 .dump = cake_dump_class, 2985 .dump_stats = cake_dump_class_stats, 2986 .walk = cake_walk, 2987 }; 2988 2989 static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 2990 .cl_ops = &cake_class_ops, 2991 .id = "cake", 2992 .priv_size = sizeof(struct cake_sched_data), 2993 .enqueue = cake_enqueue, 2994 .dequeue = cake_dequeue, 2995 .peek = qdisc_peek_dequeued, 2996 .init = cake_init, 2997 .reset = cake_reset, 2998 .destroy = cake_destroy, 2999 .change = cake_change, 3000 .dump = cake_dump, 3001 .dump_stats = cake_dump_stats, 3002 .owner = THIS_MODULE, 3003 }; 3004 3005 static int __init cake_module_init(void) 3006 { 3007 return register_qdisc(&cake_qdisc_ops); 3008 } 3009 3010 static void __exit cake_module_exit(void) 3011 { 3012 unregister_qdisc(&cake_qdisc_ops); 3013 } 3014 3015 module_init(cake_module_init) 3016 module_exit(cake_module_exit) 3017 MODULE_AUTHOR("Jonathan Morton"); 3018 MODULE_LICENSE("Dual BSD/GPL"); 3019 MODULE_DESCRIPTION("The CAKE shaper."); 3020