1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 3 /* COMMON Applications Kept Enhanced (CAKE) discipline 4 * 5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> 6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> 7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> 8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> 9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> 10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> 11 * 12 * The CAKE Principles: 13 * (or, how to have your cake and eat it too) 14 * 15 * This is a combination of several shaping, AQM and FQ techniques into one 16 * easy-to-use package: 17 * 18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE 19 * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), 20 * eliminating the need for any sort of burst parameter (eg. token bucket 21 * depth). Burst support is limited to that necessary to overcome scheduling 22 * latency. 23 * 24 * - A Diffserv-aware priority queue, giving more priority to certain classes, 25 * up to a specified fraction of bandwidth. Above that bandwidth threshold, 26 * the priority is reduced to avoid starving other tins. 27 * 28 * - Each priority tin has a separate Flow Queue system, to isolate traffic 29 * flows from each other. This prevents a burst on one flow from increasing 30 * the delay to another. Flows are distributed to queues using a 31 * set-associative hash function. 32 * 33 * - Each queue is actively managed by Cobalt, which is a combination of the 34 * Codel and Blue AQM algorithms. This serves flows fairly, and signals 35 * congestion early via ECN (if available) and/or packet drops, to keep 36 * latency low. The codel parameters are auto-tuned based on the bandwidth 37 * setting, as is necessary at low bandwidths. 38 * 39 * The configuration parameters are kept deliberately simple for ease of use. 40 * Everything has sane defaults. Complete generality of configuration is *not* 41 * a goal. 42 * 43 * The priority queue operates according to a weighted DRR scheme, combined with 44 * a bandwidth tracker which reuses the shaper logic to detect which side of the 45 * bandwidth sharing threshold the tin is operating. This determines whether a 46 * priority-based weight (high) or a bandwidth-based weight (low) is used for 47 * that tin in the current pass. 48 * 49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly 50 * granted us permission to leverage. 51 */ 52 53 #include <linux/module.h> 54 #include <linux/types.h> 55 #include <linux/kernel.h> 56 #include <linux/jiffies.h> 57 #include <linux/string.h> 58 #include <linux/in.h> 59 #include <linux/errno.h> 60 #include <linux/init.h> 61 #include <linux/skbuff.h> 62 #include <linux/jhash.h> 63 #include <linux/slab.h> 64 #include <linux/vmalloc.h> 65 #include <linux/reciprocal_div.h> 66 #include <net/netlink.h> 67 #include <linux/if_vlan.h> 68 #include <net/gso.h> 69 #include <net/pkt_sched.h> 70 #include <net/pkt_cls.h> 71 #include <net/tcp.h> 72 #include <net/flow_dissector.h> 73 74 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 75 #include <net/netfilter/nf_conntrack_core.h> 76 #endif 77 78 #define CAKE_SET_WAYS (8) 79 #define CAKE_MAX_TINS (8) 80 #define CAKE_QUEUES (1024) 81 #define CAKE_FLOW_MASK 63 82 #define CAKE_FLOW_NAT_FLAG 64 83 84 /* struct cobalt_params - contains codel and blue parameters 85 * @interval: codel initial drop rate 86 * @target: maximum persistent sojourn time & blue update rate 87 * @mtu_time: serialisation delay of maximum-size packet 88 * @p_inc: increment of blue drop probability (0.32 fxp) 89 * @p_dec: decrement of blue drop probability (0.32 fxp) 90 */ 91 struct cobalt_params { 92 u64 interval; 93 u64 target; 94 u64 mtu_time; 95 u32 p_inc; 96 u32 p_dec; 97 }; 98 99 /* struct cobalt_vars - contains codel and blue variables 100 * @count: codel dropping frequency 101 * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 102 * @drop_next: time to drop next packet, or when we dropped last 103 * @blue_timer: Blue time to next drop 104 * @p_drop: BLUE drop probability (0.32 fxp) 105 * @dropping: set if in dropping state 106 * @ecn_marked: set if marked 107 */ 108 struct cobalt_vars { 109 u32 count; 110 u32 rec_inv_sqrt; 111 ktime_t drop_next; 112 ktime_t blue_timer; 113 u32 p_drop; 114 bool dropping; 115 bool ecn_marked; 116 }; 117 118 enum { 119 CAKE_SET_NONE = 0, 120 CAKE_SET_SPARSE, 121 CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ 122 CAKE_SET_BULK, 123 CAKE_SET_DECAYING 124 }; 125 126 struct cake_flow { 127 /* this stuff is all needed per-flow at dequeue time */ 128 struct sk_buff *head; 129 struct sk_buff *tail; 130 struct list_head flowchain; 131 s32 deficit; 132 u32 dropped; 133 struct cobalt_vars cvars; 134 u16 srchost; /* index into cake_host table */ 135 u16 dsthost; 136 u8 set; 137 }; /* please try to keep this structure <= 64 bytes */ 138 139 struct cake_host { 140 u32 srchost_tag; 141 u32 dsthost_tag; 142 u16 srchost_bulk_flow_count; 143 u16 dsthost_bulk_flow_count; 144 }; 145 146 struct cake_heap_entry { 147 u16 t:3, b:10; 148 }; 149 150 struct cake_tin_data { 151 struct cake_flow flows[CAKE_QUEUES]; 152 u32 backlogs[CAKE_QUEUES]; 153 u32 tags[CAKE_QUEUES]; /* for set association */ 154 u16 overflow_idx[CAKE_QUEUES]; 155 struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ 156 u16 flow_quantum; 157 158 struct cobalt_params cparams; 159 u32 drop_overlimit; 160 u16 bulk_flow_count; 161 u16 sparse_flow_count; 162 u16 decaying_flow_count; 163 u16 unresponsive_flow_count; 164 165 u32 max_skblen; 166 167 struct list_head new_flows; 168 struct list_head old_flows; 169 struct list_head decaying_flows; 170 171 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 172 ktime_t time_next_packet; 173 u64 tin_rate_ns; 174 u64 tin_rate_bps; 175 u16 tin_rate_shft; 176 177 u16 tin_quantum; 178 s32 tin_deficit; 179 u32 tin_backlog; 180 u32 tin_dropped; 181 u32 tin_ecn_mark; 182 183 u32 packets; 184 u64 bytes; 185 186 u32 ack_drops; 187 188 /* moving averages */ 189 u64 avge_delay; 190 u64 peak_delay; 191 u64 base_delay; 192 193 /* hash function stats */ 194 u32 way_directs; 195 u32 way_hits; 196 u32 way_misses; 197 u32 way_collisions; 198 }; /* number of tins is small, so size of this struct doesn't matter much */ 199 200 struct cake_sched_data { 201 struct tcf_proto __rcu *filter_list; /* optional external classifier */ 202 struct tcf_block *block; 203 struct cake_tin_data *tins; 204 205 struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; 206 u16 overflow_timeout; 207 208 u16 tin_cnt; 209 u8 tin_mode; 210 u8 flow_mode; 211 u8 ack_filter; 212 u8 atm_mode; 213 214 u32 fwmark_mask; 215 u16 fwmark_shft; 216 217 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 218 u16 rate_shft; 219 ktime_t time_next_packet; 220 ktime_t failsafe_next_packet; 221 u64 rate_ns; 222 u64 rate_bps; 223 u16 rate_flags; 224 s16 rate_overhead; 225 u16 rate_mpu; 226 u64 interval; 227 u64 target; 228 229 /* resource tracking */ 230 u32 buffer_used; 231 u32 buffer_max_used; 232 u32 buffer_limit; 233 u32 buffer_config_limit; 234 235 /* indices for dequeue */ 236 u16 cur_tin; 237 u16 cur_flow; 238 239 struct qdisc_watchdog watchdog; 240 const u8 *tin_index; 241 const u8 *tin_order; 242 243 /* bandwidth capacity estimate */ 244 ktime_t last_packet_time; 245 ktime_t avg_window_begin; 246 u64 avg_packet_interval; 247 u64 avg_window_bytes; 248 u64 avg_peak_bandwidth; 249 ktime_t last_reconfig_time; 250 251 /* packet length stats */ 252 u32 avg_netoff; 253 u16 max_netlen; 254 u16 max_adjlen; 255 u16 min_netlen; 256 u16 min_adjlen; 257 }; 258 259 enum { 260 CAKE_FLAG_OVERHEAD = BIT(0), 261 CAKE_FLAG_AUTORATE_INGRESS = BIT(1), 262 CAKE_FLAG_INGRESS = BIT(2), 263 CAKE_FLAG_WASH = BIT(3), 264 CAKE_FLAG_SPLIT_GSO = BIT(4) 265 }; 266 267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to 268 * obtain the best features of each. Codel is excellent on flows which 269 * respond to congestion signals in a TCP-like way. BLUE is more effective on 270 * unresponsive flows. 271 */ 272 273 struct cobalt_skb_cb { 274 ktime_t enqueue_time; 275 u32 adjusted_len; 276 }; 277 us_to_ns(u64 us)278 static u64 us_to_ns(u64 us) 279 { 280 return us * NSEC_PER_USEC; 281 } 282 get_cobalt_cb(const struct sk_buff * skb)283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) 284 { 285 qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); 286 return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; 287 } 288 cobalt_get_enqueue_time(const struct sk_buff * skb)289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) 290 { 291 return get_cobalt_cb(skb)->enqueue_time; 292 } 293 cobalt_set_enqueue_time(struct sk_buff * skb,ktime_t now)294 static void cobalt_set_enqueue_time(struct sk_buff *skb, 295 ktime_t now) 296 { 297 get_cobalt_cb(skb)->enqueue_time = now; 298 } 299 300 static u16 quantum_div[CAKE_QUEUES + 1] = {0}; 301 302 /* Diffserv lookup tables */ 303 304 static const u8 precedence[] = { 305 0, 0, 0, 0, 0, 0, 0, 0, 306 1, 1, 1, 1, 1, 1, 1, 1, 307 2, 2, 2, 2, 2, 2, 2, 2, 308 3, 3, 3, 3, 3, 3, 3, 3, 309 4, 4, 4, 4, 4, 4, 4, 4, 310 5, 5, 5, 5, 5, 5, 5, 5, 311 6, 6, 6, 6, 6, 6, 6, 6, 312 7, 7, 7, 7, 7, 7, 7, 7, 313 }; 314 315 static const u8 diffserv8[] = { 316 2, 0, 1, 2, 4, 2, 2, 2, 317 1, 2, 1, 2, 1, 2, 1, 2, 318 5, 2, 4, 2, 4, 2, 4, 2, 319 3, 2, 3, 2, 3, 2, 3, 2, 320 6, 2, 3, 2, 3, 2, 3, 2, 321 6, 2, 2, 2, 6, 2, 6, 2, 322 7, 2, 2, 2, 2, 2, 2, 2, 323 7, 2, 2, 2, 2, 2, 2, 2, 324 }; 325 326 static const u8 diffserv4[] = { 327 0, 1, 0, 0, 2, 0, 0, 0, 328 1, 0, 0, 0, 0, 0, 0, 0, 329 2, 0, 2, 0, 2, 0, 2, 0, 330 2, 0, 2, 0, 2, 0, 2, 0, 331 3, 0, 2, 0, 2, 0, 2, 0, 332 3, 0, 0, 0, 3, 0, 3, 0, 333 3, 0, 0, 0, 0, 0, 0, 0, 334 3, 0, 0, 0, 0, 0, 0, 0, 335 }; 336 337 static const u8 diffserv3[] = { 338 0, 1, 0, 0, 2, 0, 0, 0, 339 1, 0, 0, 0, 0, 0, 0, 0, 340 0, 0, 0, 0, 0, 0, 0, 0, 341 0, 0, 0, 0, 0, 0, 0, 0, 342 0, 0, 0, 0, 0, 0, 0, 0, 343 0, 0, 0, 0, 2, 0, 2, 0, 344 2, 0, 0, 0, 0, 0, 0, 0, 345 2, 0, 0, 0, 0, 0, 0, 0, 346 }; 347 348 static const u8 besteffort[] = { 349 0, 0, 0, 0, 0, 0, 0, 0, 350 0, 0, 0, 0, 0, 0, 0, 0, 351 0, 0, 0, 0, 0, 0, 0, 0, 352 0, 0, 0, 0, 0, 0, 0, 0, 353 0, 0, 0, 0, 0, 0, 0, 0, 354 0, 0, 0, 0, 0, 0, 0, 0, 355 0, 0, 0, 0, 0, 0, 0, 0, 356 0, 0, 0, 0, 0, 0, 0, 0, 357 }; 358 359 /* tin priority order for stats dumping */ 360 361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; 362 static const u8 bulk_order[] = {1, 0, 2, 3}; 363 364 #define REC_INV_SQRT_CACHE (16) 365 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; 366 367 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots 368 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) 369 * 370 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 371 */ 372 cobalt_newton_step(struct cobalt_vars * vars)373 static void cobalt_newton_step(struct cobalt_vars *vars) 374 { 375 u32 invsqrt, invsqrt2; 376 u64 val; 377 378 invsqrt = vars->rec_inv_sqrt; 379 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; 380 val = (3LL << 32) - ((u64)vars->count * invsqrt2); 381 382 val >>= 2; /* avoid overflow in following multiply */ 383 val = (val * invsqrt) >> (32 - 2 + 1); 384 385 vars->rec_inv_sqrt = val; 386 } 387 cobalt_invsqrt(struct cobalt_vars * vars)388 static void cobalt_invsqrt(struct cobalt_vars *vars) 389 { 390 if (vars->count < REC_INV_SQRT_CACHE) 391 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; 392 else 393 cobalt_newton_step(vars); 394 } 395 396 /* There is a big difference in timing between the accurate values placed in 397 * the cache and the approximations given by a single Newton step for small 398 * count values, particularly when stepping from count 1 to 2 or vice versa. 399 * Above 16, a single Newton step gives sufficient accuracy in either 400 * direction, given the precision stored. 401 * 402 * The magnitude of the error when stepping up to count 2 is such as to give 403 * the value that *should* have been produced at count 4. 404 */ 405 cobalt_cache_init(void)406 static void cobalt_cache_init(void) 407 { 408 struct cobalt_vars v; 409 410 memset(&v, 0, sizeof(v)); 411 v.rec_inv_sqrt = ~0U; 412 cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; 413 414 for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { 415 cobalt_newton_step(&v); 416 cobalt_newton_step(&v); 417 cobalt_newton_step(&v); 418 cobalt_newton_step(&v); 419 420 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; 421 } 422 } 423 cobalt_vars_init(struct cobalt_vars * vars)424 static void cobalt_vars_init(struct cobalt_vars *vars) 425 { 426 memset(vars, 0, sizeof(*vars)); 427 428 if (!cobalt_rec_inv_sqrt_cache[0]) { 429 cobalt_cache_init(); 430 cobalt_rec_inv_sqrt_cache[0] = ~0; 431 } 432 } 433 434 /* CoDel control_law is t + interval/sqrt(count) 435 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid 436 * both sqrt() and divide operation. 437 */ cobalt_control(ktime_t t,u64 interval,u32 rec_inv_sqrt)438 static ktime_t cobalt_control(ktime_t t, 439 u64 interval, 440 u32 rec_inv_sqrt) 441 { 442 return ktime_add_ns(t, reciprocal_scale(interval, 443 rec_inv_sqrt)); 444 } 445 446 /* Call this when a packet had to be dropped due to queue overflow. Returns 447 * true if the BLUE state was quiescent before but active after this call. 448 */ cobalt_queue_full(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)449 static bool cobalt_queue_full(struct cobalt_vars *vars, 450 struct cobalt_params *p, 451 ktime_t now) 452 { 453 bool up = false; 454 455 if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 456 up = !vars->p_drop; 457 vars->p_drop += p->p_inc; 458 if (vars->p_drop < p->p_inc) 459 vars->p_drop = ~0; 460 vars->blue_timer = now; 461 } 462 vars->dropping = true; 463 vars->drop_next = now; 464 if (!vars->count) 465 vars->count = 1; 466 467 return up; 468 } 469 470 /* Call this when the queue was serviced but turned out to be empty. Returns 471 * true if the BLUE state was active before but quiescent after this call. 472 */ cobalt_queue_empty(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)473 static bool cobalt_queue_empty(struct cobalt_vars *vars, 474 struct cobalt_params *p, 475 ktime_t now) 476 { 477 bool down = false; 478 479 if (vars->p_drop && 480 ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 481 if (vars->p_drop < p->p_dec) 482 vars->p_drop = 0; 483 else 484 vars->p_drop -= p->p_dec; 485 vars->blue_timer = now; 486 down = !vars->p_drop; 487 } 488 vars->dropping = false; 489 490 if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { 491 vars->count--; 492 cobalt_invsqrt(vars); 493 vars->drop_next = cobalt_control(vars->drop_next, 494 p->interval, 495 vars->rec_inv_sqrt); 496 } 497 498 return down; 499 } 500 501 /* Call this with a freshly dequeued packet for possible congestion marking. 502 * Returns true as an instruction to drop the packet, false for delivery. 503 */ cobalt_should_drop(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now,struct sk_buff * skb,u32 bulk_flows)504 static bool cobalt_should_drop(struct cobalt_vars *vars, 505 struct cobalt_params *p, 506 ktime_t now, 507 struct sk_buff *skb, 508 u32 bulk_flows) 509 { 510 bool next_due, over_target, drop = false; 511 ktime_t schedule; 512 u64 sojourn; 513 514 /* The 'schedule' variable records, in its sign, whether 'now' is before or 515 * after 'drop_next'. This allows 'drop_next' to be updated before the next 516 * scheduling decision is actually branched, without destroying that 517 * information. Similarly, the first 'schedule' value calculated is preserved 518 * in the boolean 'next_due'. 519 * 520 * As for 'drop_next', we take advantage of the fact that 'interval' is both 521 * the delay between first exceeding 'target' and the first signalling event, 522 * *and* the scaling factor for the signalling frequency. It's therefore very 523 * natural to use a single mechanism for both purposes, and eliminates a 524 * significant amount of reference Codel's spaghetti code. To help with this, 525 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close 526 * as possible to 1.0 in fixed-point. 527 */ 528 529 sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 530 schedule = ktime_sub(now, vars->drop_next); 531 over_target = sojourn > p->target && 532 sojourn > p->mtu_time * bulk_flows * 2 && 533 sojourn > p->mtu_time * 4; 534 next_due = vars->count && ktime_to_ns(schedule) >= 0; 535 536 vars->ecn_marked = false; 537 538 if (over_target) { 539 if (!vars->dropping) { 540 vars->dropping = true; 541 vars->drop_next = cobalt_control(now, 542 p->interval, 543 vars->rec_inv_sqrt); 544 } 545 if (!vars->count) 546 vars->count = 1; 547 } else if (vars->dropping) { 548 vars->dropping = false; 549 } 550 551 if (next_due && vars->dropping) { 552 /* Use ECN mark if possible, otherwise drop */ 553 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); 554 555 vars->count++; 556 if (!vars->count) 557 vars->count--; 558 cobalt_invsqrt(vars); 559 vars->drop_next = cobalt_control(vars->drop_next, 560 p->interval, 561 vars->rec_inv_sqrt); 562 schedule = ktime_sub(now, vars->drop_next); 563 } else { 564 while (next_due) { 565 vars->count--; 566 cobalt_invsqrt(vars); 567 vars->drop_next = cobalt_control(vars->drop_next, 568 p->interval, 569 vars->rec_inv_sqrt); 570 schedule = ktime_sub(now, vars->drop_next); 571 next_due = vars->count && ktime_to_ns(schedule) >= 0; 572 } 573 } 574 575 /* Simple BLUE implementation. Lack of ECN is deliberate. */ 576 if (vars->p_drop) 577 drop |= (get_random_u32() < vars->p_drop); 578 579 /* Overload the drop_next field as an activity timeout */ 580 if (!vars->count) 581 vars->drop_next = ktime_add_ns(now, p->interval); 582 else if (ktime_to_ns(schedule) > 0 && !drop) 583 vars->drop_next = now; 584 585 return drop; 586 } 587 cake_update_flowkeys(struct flow_keys * keys,const struct sk_buff * skb)588 static bool cake_update_flowkeys(struct flow_keys *keys, 589 const struct sk_buff *skb) 590 { 591 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 592 struct nf_conntrack_tuple tuple = {}; 593 bool rev = !skb->_nfct, upd = false; 594 __be32 ip; 595 596 if (skb_protocol(skb, true) != htons(ETH_P_IP)) 597 return false; 598 599 if (!nf_ct_get_tuple_skb(&tuple, skb)) 600 return false; 601 602 ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; 603 if (ip != keys->addrs.v4addrs.src) { 604 keys->addrs.v4addrs.src = ip; 605 upd = true; 606 } 607 ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; 608 if (ip != keys->addrs.v4addrs.dst) { 609 keys->addrs.v4addrs.dst = ip; 610 upd = true; 611 } 612 613 if (keys->ports.ports) { 614 __be16 port; 615 616 port = rev ? tuple.dst.u.all : tuple.src.u.all; 617 if (port != keys->ports.src) { 618 keys->ports.src = port; 619 upd = true; 620 } 621 port = rev ? tuple.src.u.all : tuple.dst.u.all; 622 if (port != keys->ports.dst) { 623 port = keys->ports.dst; 624 upd = true; 625 } 626 } 627 return upd; 628 #else 629 return false; 630 #endif 631 } 632 633 /* Cake has several subtle multiple bit settings. In these cases you 634 * would be matching triple isolate mode as well. 635 */ 636 cake_dsrc(int flow_mode)637 static bool cake_dsrc(int flow_mode) 638 { 639 return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; 640 } 641 cake_ddst(int flow_mode)642 static bool cake_ddst(int flow_mode) 643 { 644 return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; 645 } 646 cake_dec_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)647 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, 648 struct cake_flow *flow, 649 int flow_mode) 650 { 651 if (likely(cake_dsrc(flow_mode) && 652 q->hosts[flow->srchost].srchost_bulk_flow_count)) 653 q->hosts[flow->srchost].srchost_bulk_flow_count--; 654 } 655 cake_inc_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)656 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, 657 struct cake_flow *flow, 658 int flow_mode) 659 { 660 if (likely(cake_dsrc(flow_mode) && 661 q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) 662 q->hosts[flow->srchost].srchost_bulk_flow_count++; 663 } 664 cake_dec_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)665 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, 666 struct cake_flow *flow, 667 int flow_mode) 668 { 669 if (likely(cake_ddst(flow_mode) && 670 q->hosts[flow->dsthost].dsthost_bulk_flow_count)) 671 q->hosts[flow->dsthost].dsthost_bulk_flow_count--; 672 } 673 cake_inc_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)674 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, 675 struct cake_flow *flow, 676 int flow_mode) 677 { 678 if (likely(cake_ddst(flow_mode) && 679 q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) 680 q->hosts[flow->dsthost].dsthost_bulk_flow_count++; 681 } 682 cake_get_flow_quantum(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)683 static u16 cake_get_flow_quantum(struct cake_tin_data *q, 684 struct cake_flow *flow, 685 int flow_mode) 686 { 687 u16 host_load = 1; 688 689 if (cake_dsrc(flow_mode)) 690 host_load = max(host_load, 691 q->hosts[flow->srchost].srchost_bulk_flow_count); 692 693 if (cake_ddst(flow_mode)) 694 host_load = max(host_load, 695 q->hosts[flow->dsthost].dsthost_bulk_flow_count); 696 697 /* The get_random_u16() is a way to apply dithering to avoid 698 * accumulating roundoff errors 699 */ 700 return (q->flow_quantum * quantum_div[host_load] + 701 get_random_u16()) >> 16; 702 } 703 cake_hash(struct cake_tin_data * q,const struct sk_buff * skb,int flow_mode,u16 flow_override,u16 host_override)704 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, 705 int flow_mode, u16 flow_override, u16 host_override) 706 { 707 bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); 708 bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); 709 bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); 710 u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; 711 u16 reduced_hash, srchost_idx, dsthost_idx; 712 struct flow_keys keys, host_keys; 713 bool use_skbhash = skb->l4_hash; 714 715 if (unlikely(flow_mode == CAKE_FLOW_NONE)) 716 return 0; 717 718 /* If both overrides are set, or we can use the SKB hash and nat mode is 719 * disabled, we can skip packet dissection entirely. If nat mode is 720 * enabled there's another check below after doing the conntrack lookup. 721 */ 722 if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) 723 goto skip_hash; 724 725 skb_flow_dissect_flow_keys(skb, &keys, 726 FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); 727 728 /* Don't use the SKB hash if we change the lookup keys from conntrack */ 729 if (nat_enabled && cake_update_flowkeys(&keys, skb)) 730 use_skbhash = false; 731 732 /* If we can still use the SKB hash and don't need the host hash, we can 733 * skip the rest of the hashing procedure 734 */ 735 if (use_skbhash && !hash_hosts) 736 goto skip_hash; 737 738 /* flow_hash_from_keys() sorts the addresses by value, so we have 739 * to preserve their order in a separate data structure to treat 740 * src and dst host addresses as independently selectable. 741 */ 742 host_keys = keys; 743 host_keys.ports.ports = 0; 744 host_keys.basic.ip_proto = 0; 745 host_keys.keyid.keyid = 0; 746 host_keys.tags.flow_label = 0; 747 748 switch (host_keys.control.addr_type) { 749 case FLOW_DISSECTOR_KEY_IPV4_ADDRS: 750 host_keys.addrs.v4addrs.src = 0; 751 dsthost_hash = flow_hash_from_keys(&host_keys); 752 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 753 host_keys.addrs.v4addrs.dst = 0; 754 srchost_hash = flow_hash_from_keys(&host_keys); 755 break; 756 757 case FLOW_DISSECTOR_KEY_IPV6_ADDRS: 758 memset(&host_keys.addrs.v6addrs.src, 0, 759 sizeof(host_keys.addrs.v6addrs.src)); 760 dsthost_hash = flow_hash_from_keys(&host_keys); 761 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 762 memset(&host_keys.addrs.v6addrs.dst, 0, 763 sizeof(host_keys.addrs.v6addrs.dst)); 764 srchost_hash = flow_hash_from_keys(&host_keys); 765 break; 766 767 default: 768 dsthost_hash = 0; 769 srchost_hash = 0; 770 } 771 772 /* This *must* be after the above switch, since as a 773 * side-effect it sorts the src and dst addresses. 774 */ 775 if (hash_flows && !use_skbhash) 776 flow_hash = flow_hash_from_keys(&keys); 777 778 skip_hash: 779 if (flow_override) 780 flow_hash = flow_override - 1; 781 else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) 782 flow_hash = skb->hash; 783 if (host_override) { 784 dsthost_hash = host_override - 1; 785 srchost_hash = host_override - 1; 786 } 787 788 if (!(flow_mode & CAKE_FLOW_FLOWS)) { 789 if (flow_mode & CAKE_FLOW_SRC_IP) 790 flow_hash ^= srchost_hash; 791 792 if (flow_mode & CAKE_FLOW_DST_IP) 793 flow_hash ^= dsthost_hash; 794 } 795 796 reduced_hash = flow_hash % CAKE_QUEUES; 797 798 /* set-associative hashing */ 799 /* fast path if no hash collision (direct lookup succeeds) */ 800 if (likely(q->tags[reduced_hash] == flow_hash && 801 q->flows[reduced_hash].set)) { 802 q->way_directs++; 803 } else { 804 u32 inner_hash = reduced_hash % CAKE_SET_WAYS; 805 u32 outer_hash = reduced_hash - inner_hash; 806 bool allocate_src = false; 807 bool allocate_dst = false; 808 u32 i, k; 809 810 /* check if any active queue in the set is reserved for 811 * this flow. 812 */ 813 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 814 i++, k = (k + 1) % CAKE_SET_WAYS) { 815 if (q->tags[outer_hash + k] == flow_hash) { 816 if (i) 817 q->way_hits++; 818 819 if (!q->flows[outer_hash + k].set) { 820 /* need to increment host refcnts */ 821 allocate_src = cake_dsrc(flow_mode); 822 allocate_dst = cake_ddst(flow_mode); 823 } 824 825 goto found; 826 } 827 } 828 829 /* no queue is reserved for this flow, look for an 830 * empty one. 831 */ 832 for (i = 0; i < CAKE_SET_WAYS; 833 i++, k = (k + 1) % CAKE_SET_WAYS) { 834 if (!q->flows[outer_hash + k].set) { 835 q->way_misses++; 836 allocate_src = cake_dsrc(flow_mode); 837 allocate_dst = cake_ddst(flow_mode); 838 goto found; 839 } 840 } 841 842 /* With no empty queues, default to the original 843 * queue, accept the collision, update the host tags. 844 */ 845 q->way_collisions++; 846 allocate_src = cake_dsrc(flow_mode); 847 allocate_dst = cake_ddst(flow_mode); 848 849 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { 850 cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); 851 cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); 852 } 853 found: 854 /* reserve queue for future packets in same flow */ 855 reduced_hash = outer_hash + k; 856 q->tags[reduced_hash] = flow_hash; 857 858 if (allocate_src) { 859 srchost_idx = srchost_hash % CAKE_QUEUES; 860 inner_hash = srchost_idx % CAKE_SET_WAYS; 861 outer_hash = srchost_idx - inner_hash; 862 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 863 i++, k = (k + 1) % CAKE_SET_WAYS) { 864 if (q->hosts[outer_hash + k].srchost_tag == 865 srchost_hash) 866 goto found_src; 867 } 868 for (i = 0; i < CAKE_SET_WAYS; 869 i++, k = (k + 1) % CAKE_SET_WAYS) { 870 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) 871 break; 872 } 873 q->hosts[outer_hash + k].srchost_tag = srchost_hash; 874 found_src: 875 srchost_idx = outer_hash + k; 876 q->flows[reduced_hash].srchost = srchost_idx; 877 878 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 879 cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); 880 } 881 882 if (allocate_dst) { 883 dsthost_idx = dsthost_hash % CAKE_QUEUES; 884 inner_hash = dsthost_idx % CAKE_SET_WAYS; 885 outer_hash = dsthost_idx - inner_hash; 886 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 887 i++, k = (k + 1) % CAKE_SET_WAYS) { 888 if (q->hosts[outer_hash + k].dsthost_tag == 889 dsthost_hash) 890 goto found_dst; 891 } 892 for (i = 0; i < CAKE_SET_WAYS; 893 i++, k = (k + 1) % CAKE_SET_WAYS) { 894 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) 895 break; 896 } 897 q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; 898 found_dst: 899 dsthost_idx = outer_hash + k; 900 q->flows[reduced_hash].dsthost = dsthost_idx; 901 902 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 903 cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); 904 } 905 } 906 907 return reduced_hash; 908 } 909 910 /* helper functions : might be changed when/if skb use a standard list_head */ 911 /* remove one skb from head of slot queue */ 912 dequeue_head(struct cake_flow * flow)913 static struct sk_buff *dequeue_head(struct cake_flow *flow) 914 { 915 struct sk_buff *skb = flow->head; 916 917 if (skb) { 918 flow->head = skb->next; 919 skb_mark_not_on_list(skb); 920 } 921 922 return skb; 923 } 924 925 /* add skb to flow queue (tail add) */ 926 flow_queue_add(struct cake_flow * flow,struct sk_buff * skb)927 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) 928 { 929 if (!flow->head) 930 flow->head = skb; 931 else 932 flow->tail->next = skb; 933 flow->tail = skb; 934 skb->next = NULL; 935 } 936 cake_get_iphdr(const struct sk_buff * skb,struct ipv6hdr * buf)937 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, 938 struct ipv6hdr *buf) 939 { 940 unsigned int offset = skb_network_offset(skb); 941 struct iphdr *iph; 942 943 iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); 944 945 if (!iph) 946 return NULL; 947 948 if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) 949 return skb_header_pointer(skb, offset + iph->ihl * 4, 950 sizeof(struct ipv6hdr), buf); 951 952 else if (iph->version == 4) 953 return iph; 954 955 else if (iph->version == 6) 956 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), 957 buf); 958 959 return NULL; 960 } 961 cake_get_tcphdr(const struct sk_buff * skb,void * buf,unsigned int bufsize)962 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, 963 void *buf, unsigned int bufsize) 964 { 965 unsigned int offset = skb_network_offset(skb); 966 const struct ipv6hdr *ipv6h; 967 const struct tcphdr *tcph; 968 const struct iphdr *iph; 969 struct ipv6hdr _ipv6h; 970 struct tcphdr _tcph; 971 972 ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); 973 974 if (!ipv6h) 975 return NULL; 976 977 if (ipv6h->version == 4) { 978 iph = (struct iphdr *)ipv6h; 979 offset += iph->ihl * 4; 980 981 /* special-case 6in4 tunnelling, as that is a common way to get 982 * v6 connectivity in the home 983 */ 984 if (iph->protocol == IPPROTO_IPV6) { 985 ipv6h = skb_header_pointer(skb, offset, 986 sizeof(_ipv6h), &_ipv6h); 987 988 if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) 989 return NULL; 990 991 offset += sizeof(struct ipv6hdr); 992 993 } else if (iph->protocol != IPPROTO_TCP) { 994 return NULL; 995 } 996 997 } else if (ipv6h->version == 6) { 998 if (ipv6h->nexthdr != IPPROTO_TCP) 999 return NULL; 1000 1001 offset += sizeof(struct ipv6hdr); 1002 } else { 1003 return NULL; 1004 } 1005 1006 tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); 1007 if (!tcph || tcph->doff < 5) 1008 return NULL; 1009 1010 return skb_header_pointer(skb, offset, 1011 min(__tcp_hdrlen(tcph), bufsize), buf); 1012 } 1013 cake_get_tcpopt(const struct tcphdr * tcph,int code,int * oplen)1014 static const void *cake_get_tcpopt(const struct tcphdr *tcph, 1015 int code, int *oplen) 1016 { 1017 /* inspired by tcp_parse_options in tcp_input.c */ 1018 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1019 const u8 *ptr = (const u8 *)(tcph + 1); 1020 1021 while (length > 0) { 1022 int opcode = *ptr++; 1023 int opsize; 1024 1025 if (opcode == TCPOPT_EOL) 1026 break; 1027 if (opcode == TCPOPT_NOP) { 1028 length--; 1029 continue; 1030 } 1031 if (length < 2) 1032 break; 1033 opsize = *ptr++; 1034 if (opsize < 2 || opsize > length) 1035 break; 1036 1037 if (opcode == code) { 1038 *oplen = opsize; 1039 return ptr; 1040 } 1041 1042 ptr += opsize - 2; 1043 length -= opsize; 1044 } 1045 1046 return NULL; 1047 } 1048 1049 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more 1050 * bytes than the other. In the case where both sequences ACKs bytes that the 1051 * other doesn't, A is considered greater. DSACKs in A also makes A be 1052 * considered greater. 1053 * 1054 * @return -1, 0 or 1 as normal compare functions 1055 */ cake_tcph_sack_compare(const struct tcphdr * tcph_a,const struct tcphdr * tcph_b)1056 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, 1057 const struct tcphdr *tcph_b) 1058 { 1059 const struct tcp_sack_block_wire *sack_a, *sack_b; 1060 u32 ack_seq_a = ntohl(tcph_a->ack_seq); 1061 u32 bytes_a = 0, bytes_b = 0; 1062 int oplen_a, oplen_b; 1063 bool first = true; 1064 1065 sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); 1066 sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); 1067 1068 /* pointers point to option contents */ 1069 oplen_a -= TCPOLEN_SACK_BASE; 1070 oplen_b -= TCPOLEN_SACK_BASE; 1071 1072 if (sack_a && oplen_a >= sizeof(*sack_a) && 1073 (!sack_b || oplen_b < sizeof(*sack_b))) 1074 return -1; 1075 else if (sack_b && oplen_b >= sizeof(*sack_b) && 1076 (!sack_a || oplen_a < sizeof(*sack_a))) 1077 return 1; 1078 else if ((!sack_a || oplen_a < sizeof(*sack_a)) && 1079 (!sack_b || oplen_b < sizeof(*sack_b))) 1080 return 0; 1081 1082 while (oplen_a >= sizeof(*sack_a)) { 1083 const struct tcp_sack_block_wire *sack_tmp = sack_b; 1084 u32 start_a = get_unaligned_be32(&sack_a->start_seq); 1085 u32 end_a = get_unaligned_be32(&sack_a->end_seq); 1086 int oplen_tmp = oplen_b; 1087 bool found = false; 1088 1089 /* DSACK; always considered greater to prevent dropping */ 1090 if (before(start_a, ack_seq_a)) 1091 return -1; 1092 1093 bytes_a += end_a - start_a; 1094 1095 while (oplen_tmp >= sizeof(*sack_tmp)) { 1096 u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); 1097 u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); 1098 1099 /* first time through we count the total size */ 1100 if (first) 1101 bytes_b += end_b - start_b; 1102 1103 if (!after(start_b, start_a) && !before(end_b, end_a)) { 1104 found = true; 1105 if (!first) 1106 break; 1107 } 1108 oplen_tmp -= sizeof(*sack_tmp); 1109 sack_tmp++; 1110 } 1111 1112 if (!found) 1113 return -1; 1114 1115 oplen_a -= sizeof(*sack_a); 1116 sack_a++; 1117 first = false; 1118 } 1119 1120 /* If we made it this far, all ranges SACKed by A are covered by B, so 1121 * either the SACKs are equal, or B SACKs more bytes. 1122 */ 1123 return bytes_b > bytes_a ? 1 : 0; 1124 } 1125 cake_tcph_get_tstamp(const struct tcphdr * tcph,u32 * tsval,u32 * tsecr)1126 static void cake_tcph_get_tstamp(const struct tcphdr *tcph, 1127 u32 *tsval, u32 *tsecr) 1128 { 1129 const u8 *ptr; 1130 int opsize; 1131 1132 ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); 1133 1134 if (ptr && opsize == TCPOLEN_TIMESTAMP) { 1135 *tsval = get_unaligned_be32(ptr); 1136 *tsecr = get_unaligned_be32(ptr + 4); 1137 } 1138 } 1139 cake_tcph_may_drop(const struct tcphdr * tcph,u32 tstamp_new,u32 tsecr_new)1140 static bool cake_tcph_may_drop(const struct tcphdr *tcph, 1141 u32 tstamp_new, u32 tsecr_new) 1142 { 1143 /* inspired by tcp_parse_options in tcp_input.c */ 1144 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1145 const u8 *ptr = (const u8 *)(tcph + 1); 1146 u32 tstamp, tsecr; 1147 1148 /* 3 reserved flags must be unset to avoid future breakage 1149 * ACK must be set 1150 * ECE/CWR are handled separately 1151 * All other flags URG/PSH/RST/SYN/FIN must be unset 1152 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) 1153 * 0x00C00000 = CWR/ECE (handled separately) 1154 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 1155 */ 1156 if (((tcp_flag_word(tcph) & 1157 cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) 1158 return false; 1159 1160 while (length > 0) { 1161 int opcode = *ptr++; 1162 int opsize; 1163 1164 if (opcode == TCPOPT_EOL) 1165 break; 1166 if (opcode == TCPOPT_NOP) { 1167 length--; 1168 continue; 1169 } 1170 if (length < 2) 1171 break; 1172 opsize = *ptr++; 1173 if (opsize < 2 || opsize > length) 1174 break; 1175 1176 switch (opcode) { 1177 case TCPOPT_MD5SIG: /* doesn't influence state */ 1178 break; 1179 1180 case TCPOPT_SACK: /* stricter checking performed later */ 1181 if (opsize % 8 != 2) 1182 return false; 1183 break; 1184 1185 case TCPOPT_TIMESTAMP: 1186 /* only drop timestamps lower than new */ 1187 if (opsize != TCPOLEN_TIMESTAMP) 1188 return false; 1189 tstamp = get_unaligned_be32(ptr); 1190 tsecr = get_unaligned_be32(ptr + 4); 1191 if (after(tstamp, tstamp_new) || 1192 after(tsecr, tsecr_new)) 1193 return false; 1194 break; 1195 1196 case TCPOPT_MSS: /* these should only be set on SYN */ 1197 case TCPOPT_WINDOW: 1198 case TCPOPT_SACK_PERM: 1199 case TCPOPT_FASTOPEN: 1200 case TCPOPT_EXP: 1201 default: /* don't drop if any unknown options are present */ 1202 return false; 1203 } 1204 1205 ptr += opsize - 2; 1206 length -= opsize; 1207 } 1208 1209 return true; 1210 } 1211 cake_ack_filter(struct cake_sched_data * q,struct cake_flow * flow)1212 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, 1213 struct cake_flow *flow) 1214 { 1215 bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; 1216 struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; 1217 struct sk_buff *skb_check, *skb_prev = NULL; 1218 const struct ipv6hdr *ipv6h, *ipv6h_check; 1219 unsigned char _tcph[64], _tcph_check[64]; 1220 const struct tcphdr *tcph, *tcph_check; 1221 const struct iphdr *iph, *iph_check; 1222 struct ipv6hdr _iph, _iph_check; 1223 const struct sk_buff *skb; 1224 int seglen, num_found = 0; 1225 u32 tstamp = 0, tsecr = 0; 1226 __be32 elig_flags = 0; 1227 int sack_comp; 1228 1229 /* no other possible ACKs to filter */ 1230 if (flow->head == flow->tail) 1231 return NULL; 1232 1233 skb = flow->tail; 1234 tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); 1235 iph = cake_get_iphdr(skb, &_iph); 1236 if (!tcph) 1237 return NULL; 1238 1239 cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); 1240 1241 /* the 'triggering' packet need only have the ACK flag set. 1242 * also check that SYN is not set, as there won't be any previous ACKs. 1243 */ 1244 if ((tcp_flag_word(tcph) & 1245 (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) 1246 return NULL; 1247 1248 /* the 'triggering' ACK is at the tail of the queue, we have already 1249 * returned if it is the only packet in the flow. loop through the rest 1250 * of the queue looking for pure ACKs with the same 5-tuple as the 1251 * triggering one. 1252 */ 1253 for (skb_check = flow->head; 1254 skb_check && skb_check != skb; 1255 skb_prev = skb_check, skb_check = skb_check->next) { 1256 iph_check = cake_get_iphdr(skb_check, &_iph_check); 1257 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, 1258 sizeof(_tcph_check)); 1259 1260 /* only TCP packets with matching 5-tuple are eligible, and only 1261 * drop safe headers 1262 */ 1263 if (!tcph_check || iph->version != iph_check->version || 1264 tcph_check->source != tcph->source || 1265 tcph_check->dest != tcph->dest) 1266 continue; 1267 1268 if (iph_check->version == 4) { 1269 if (iph_check->saddr != iph->saddr || 1270 iph_check->daddr != iph->daddr) 1271 continue; 1272 1273 seglen = iph_totlen(skb, iph_check) - 1274 (4 * iph_check->ihl); 1275 } else if (iph_check->version == 6) { 1276 ipv6h = (struct ipv6hdr *)iph; 1277 ipv6h_check = (struct ipv6hdr *)iph_check; 1278 1279 if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || 1280 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) 1281 continue; 1282 1283 seglen = ntohs(ipv6h_check->payload_len); 1284 } else { 1285 WARN_ON(1); /* shouldn't happen */ 1286 continue; 1287 } 1288 1289 /* If the ECE/CWR flags changed from the previous eligible 1290 * packet in the same flow, we should no longer be dropping that 1291 * previous packet as this would lose information. 1292 */ 1293 if (elig_ack && (tcp_flag_word(tcph_check) & 1294 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { 1295 elig_ack = NULL; 1296 elig_ack_prev = NULL; 1297 num_found--; 1298 } 1299 1300 /* Check TCP options and flags, don't drop ACKs with segment 1301 * data, and don't drop ACKs with a higher cumulative ACK 1302 * counter than the triggering packet. Check ACK seqno here to 1303 * avoid parsing SACK options of packets we are going to exclude 1304 * anyway. 1305 */ 1306 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || 1307 (seglen - __tcp_hdrlen(tcph_check)) != 0 || 1308 after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) 1309 continue; 1310 1311 /* Check SACK options. The triggering packet must SACK more data 1312 * than the ACK under consideration, or SACK the same range but 1313 * have a larger cumulative ACK counter. The latter is a 1314 * pathological case, but is contained in the following check 1315 * anyway, just to be safe. 1316 */ 1317 sack_comp = cake_tcph_sack_compare(tcph_check, tcph); 1318 1319 if (sack_comp < 0 || 1320 (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && 1321 sack_comp == 0)) 1322 continue; 1323 1324 /* At this point we have found an eligible pure ACK to drop; if 1325 * we are in aggressive mode, we are done. Otherwise, keep 1326 * searching unless this is the second eligible ACK we 1327 * found. 1328 * 1329 * Since we want to drop ACK closest to the head of the queue, 1330 * save the first eligible ACK we find, even if we need to loop 1331 * again. 1332 */ 1333 if (!elig_ack) { 1334 elig_ack = skb_check; 1335 elig_ack_prev = skb_prev; 1336 elig_flags = (tcp_flag_word(tcph_check) 1337 & (TCP_FLAG_ECE | TCP_FLAG_CWR)); 1338 } 1339 1340 if (num_found++ > 0) 1341 goto found; 1342 } 1343 1344 /* We made it through the queue without finding two eligible ACKs . If 1345 * we found a single eligible ACK we can drop it in aggressive mode if 1346 * we can guarantee that this does not interfere with ECN flag 1347 * information. We ensure this by dropping it only if the enqueued 1348 * packet is consecutive with the eligible ACK, and their flags match. 1349 */ 1350 if (elig_ack && aggressive && elig_ack->next == skb && 1351 (elig_flags == (tcp_flag_word(tcph) & 1352 (TCP_FLAG_ECE | TCP_FLAG_CWR)))) 1353 goto found; 1354 1355 return NULL; 1356 1357 found: 1358 if (elig_ack_prev) 1359 elig_ack_prev->next = elig_ack->next; 1360 else 1361 flow->head = elig_ack->next; 1362 1363 skb_mark_not_on_list(elig_ack); 1364 1365 return elig_ack; 1366 } 1367 cake_ewma(u64 avg,u64 sample,u32 shift)1368 static u64 cake_ewma(u64 avg, u64 sample, u32 shift) 1369 { 1370 avg -= avg >> shift; 1371 avg += sample >> shift; 1372 return avg; 1373 } 1374 cake_calc_overhead(struct cake_sched_data * q,u32 len,u32 off)1375 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) 1376 { 1377 if (q->rate_flags & CAKE_FLAG_OVERHEAD) 1378 len -= off; 1379 1380 if (q->max_netlen < len) 1381 q->max_netlen = len; 1382 if (q->min_netlen > len) 1383 q->min_netlen = len; 1384 1385 len += q->rate_overhead; 1386 1387 if (len < q->rate_mpu) 1388 len = q->rate_mpu; 1389 1390 if (q->atm_mode == CAKE_ATM_ATM) { 1391 len += 47; 1392 len /= 48; 1393 len *= 53; 1394 } else if (q->atm_mode == CAKE_ATM_PTM) { 1395 /* Add one byte per 64 bytes or part thereof. 1396 * This is conservative and easier to calculate than the 1397 * precise value. 1398 */ 1399 len += (len + 63) / 64; 1400 } 1401 1402 if (q->max_adjlen < len) 1403 q->max_adjlen = len; 1404 if (q->min_adjlen > len) 1405 q->min_adjlen = len; 1406 1407 return len; 1408 } 1409 cake_overhead(struct cake_sched_data * q,const struct sk_buff * skb)1410 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) 1411 { 1412 const struct skb_shared_info *shinfo = skb_shinfo(skb); 1413 unsigned int hdr_len, last_len = 0; 1414 u32 off = skb_network_offset(skb); 1415 u32 len = qdisc_pkt_len(skb); 1416 u16 segs = 1; 1417 1418 q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); 1419 1420 if (!shinfo->gso_size) 1421 return cake_calc_overhead(q, len, off); 1422 1423 /* borrowed from qdisc_pkt_len_init() */ 1424 hdr_len = skb_transport_offset(skb); 1425 1426 /* + transport layer */ 1427 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | 1428 SKB_GSO_TCPV6))) { 1429 const struct tcphdr *th; 1430 struct tcphdr _tcphdr; 1431 1432 th = skb_header_pointer(skb, hdr_len, 1433 sizeof(_tcphdr), &_tcphdr); 1434 if (likely(th)) 1435 hdr_len += __tcp_hdrlen(th); 1436 } else { 1437 struct udphdr _udphdr; 1438 1439 if (skb_header_pointer(skb, hdr_len, 1440 sizeof(_udphdr), &_udphdr)) 1441 hdr_len += sizeof(struct udphdr); 1442 } 1443 1444 if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) 1445 segs = DIV_ROUND_UP(skb->len - hdr_len, 1446 shinfo->gso_size); 1447 else 1448 segs = shinfo->gso_segs; 1449 1450 len = shinfo->gso_size + hdr_len; 1451 last_len = skb->len - shinfo->gso_size * (segs - 1); 1452 1453 return (cake_calc_overhead(q, len, off) * (segs - 1) + 1454 cake_calc_overhead(q, last_len, off)); 1455 } 1456 cake_heap_swap(struct cake_sched_data * q,u16 i,u16 j)1457 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) 1458 { 1459 struct cake_heap_entry ii = q->overflow_heap[i]; 1460 struct cake_heap_entry jj = q->overflow_heap[j]; 1461 1462 q->overflow_heap[i] = jj; 1463 q->overflow_heap[j] = ii; 1464 1465 q->tins[ii.t].overflow_idx[ii.b] = j; 1466 q->tins[jj.t].overflow_idx[jj.b] = i; 1467 } 1468 cake_heap_get_backlog(const struct cake_sched_data * q,u16 i)1469 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) 1470 { 1471 struct cake_heap_entry ii = q->overflow_heap[i]; 1472 1473 return q->tins[ii.t].backlogs[ii.b]; 1474 } 1475 cake_heapify(struct cake_sched_data * q,u16 i)1476 static void cake_heapify(struct cake_sched_data *q, u16 i) 1477 { 1478 static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; 1479 u32 mb = cake_heap_get_backlog(q, i); 1480 u32 m = i; 1481 1482 while (m < a) { 1483 u32 l = m + m + 1; 1484 u32 r = l + 1; 1485 1486 if (l < a) { 1487 u32 lb = cake_heap_get_backlog(q, l); 1488 1489 if (lb > mb) { 1490 m = l; 1491 mb = lb; 1492 } 1493 } 1494 1495 if (r < a) { 1496 u32 rb = cake_heap_get_backlog(q, r); 1497 1498 if (rb > mb) { 1499 m = r; 1500 mb = rb; 1501 } 1502 } 1503 1504 if (m != i) { 1505 cake_heap_swap(q, i, m); 1506 i = m; 1507 } else { 1508 break; 1509 } 1510 } 1511 } 1512 cake_heapify_up(struct cake_sched_data * q,u16 i)1513 static void cake_heapify_up(struct cake_sched_data *q, u16 i) 1514 { 1515 while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { 1516 u16 p = (i - 1) >> 1; 1517 u32 ib = cake_heap_get_backlog(q, i); 1518 u32 pb = cake_heap_get_backlog(q, p); 1519 1520 if (ib > pb) { 1521 cake_heap_swap(q, i, p); 1522 i = p; 1523 } else { 1524 break; 1525 } 1526 } 1527 } 1528 cake_advance_shaper(struct cake_sched_data * q,struct cake_tin_data * b,struct sk_buff * skb,ktime_t now,bool drop)1529 static int cake_advance_shaper(struct cake_sched_data *q, 1530 struct cake_tin_data *b, 1531 struct sk_buff *skb, 1532 ktime_t now, bool drop) 1533 { 1534 u32 len = get_cobalt_cb(skb)->adjusted_len; 1535 1536 /* charge packet bandwidth to this tin 1537 * and to the global shaper. 1538 */ 1539 if (q->rate_ns) { 1540 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; 1541 u64 global_dur = (len * q->rate_ns) >> q->rate_shft; 1542 u64 failsafe_dur = global_dur + (global_dur >> 1); 1543 1544 if (ktime_before(b->time_next_packet, now)) 1545 b->time_next_packet = ktime_add_ns(b->time_next_packet, 1546 tin_dur); 1547 1548 else if (ktime_before(b->time_next_packet, 1549 ktime_add_ns(now, tin_dur))) 1550 b->time_next_packet = ktime_add_ns(now, tin_dur); 1551 1552 q->time_next_packet = ktime_add_ns(q->time_next_packet, 1553 global_dur); 1554 if (!drop) 1555 q->failsafe_next_packet = \ 1556 ktime_add_ns(q->failsafe_next_packet, 1557 failsafe_dur); 1558 } 1559 return len; 1560 } 1561 cake_drop(struct Qdisc * sch,struct sk_buff ** to_free)1562 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) 1563 { 1564 struct cake_sched_data *q = qdisc_priv(sch); 1565 ktime_t now = ktime_get(); 1566 u32 idx = 0, tin = 0, len; 1567 struct cake_heap_entry qq; 1568 struct cake_tin_data *b; 1569 struct cake_flow *flow; 1570 struct sk_buff *skb; 1571 1572 if (!q->overflow_timeout) { 1573 int i; 1574 /* Build fresh max-heap */ 1575 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) 1576 cake_heapify(q, i); 1577 } 1578 q->overflow_timeout = 65535; 1579 1580 /* select longest queue for pruning */ 1581 qq = q->overflow_heap[0]; 1582 tin = qq.t; 1583 idx = qq.b; 1584 1585 b = &q->tins[tin]; 1586 flow = &b->flows[idx]; 1587 skb = dequeue_head(flow); 1588 if (unlikely(!skb)) { 1589 /* heap has gone wrong, rebuild it next time */ 1590 q->overflow_timeout = 0; 1591 return idx + (tin << 16); 1592 } 1593 1594 if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) 1595 b->unresponsive_flow_count++; 1596 1597 len = qdisc_pkt_len(skb); 1598 q->buffer_used -= skb->truesize; 1599 b->backlogs[idx] -= len; 1600 b->tin_backlog -= len; 1601 sch->qstats.backlog -= len; 1602 1603 flow->dropped++; 1604 b->tin_dropped++; 1605 sch->qstats.drops++; 1606 1607 if (q->rate_flags & CAKE_FLAG_INGRESS) 1608 cake_advance_shaper(q, b, skb, now, true); 1609 1610 __qdisc_drop(skb, to_free); 1611 sch->q.qlen--; 1612 qdisc_tree_reduce_backlog(sch, 1, len); 1613 1614 cake_heapify(q, 0); 1615 1616 return idx + (tin << 16); 1617 } 1618 cake_handle_diffserv(struct sk_buff * skb,bool wash)1619 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) 1620 { 1621 const int offset = skb_network_offset(skb); 1622 u16 *buf, buf_; 1623 u8 dscp; 1624 1625 switch (skb_protocol(skb, true)) { 1626 case htons(ETH_P_IP): 1627 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1628 if (unlikely(!buf)) 1629 return 0; 1630 1631 /* ToS is in the second byte of iphdr */ 1632 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; 1633 1634 if (wash && dscp) { 1635 const int wlen = offset + sizeof(struct iphdr); 1636 1637 if (!pskb_may_pull(skb, wlen) || 1638 skb_try_make_writable(skb, wlen)) 1639 return 0; 1640 1641 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); 1642 } 1643 1644 return dscp; 1645 1646 case htons(ETH_P_IPV6): 1647 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1648 if (unlikely(!buf)) 1649 return 0; 1650 1651 /* Traffic class is in the first and second bytes of ipv6hdr */ 1652 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; 1653 1654 if (wash && dscp) { 1655 const int wlen = offset + sizeof(struct ipv6hdr); 1656 1657 if (!pskb_may_pull(skb, wlen) || 1658 skb_try_make_writable(skb, wlen)) 1659 return 0; 1660 1661 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); 1662 } 1663 1664 return dscp; 1665 1666 case htons(ETH_P_ARP): 1667 return 0x38; /* CS7 - Net Control */ 1668 1669 default: 1670 /* If there is no Diffserv field, treat as best-effort */ 1671 return 0; 1672 } 1673 } 1674 cake_select_tin(struct Qdisc * sch,struct sk_buff * skb)1675 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, 1676 struct sk_buff *skb) 1677 { 1678 struct cake_sched_data *q = qdisc_priv(sch); 1679 u32 tin, mark; 1680 bool wash; 1681 u8 dscp; 1682 1683 /* Tin selection: Default to diffserv-based selection, allow overriding 1684 * using firewall marks or skb->priority. Call DSCP parsing early if 1685 * wash is enabled, otherwise defer to below to skip unneeded parsing. 1686 */ 1687 mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; 1688 wash = !!(q->rate_flags & CAKE_FLAG_WASH); 1689 if (wash) 1690 dscp = cake_handle_diffserv(skb, wash); 1691 1692 if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) 1693 tin = 0; 1694 1695 else if (mark && mark <= q->tin_cnt) 1696 tin = q->tin_order[mark - 1]; 1697 1698 else if (TC_H_MAJ(skb->priority) == sch->handle && 1699 TC_H_MIN(skb->priority) > 0 && 1700 TC_H_MIN(skb->priority) <= q->tin_cnt) 1701 tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; 1702 1703 else { 1704 if (!wash) 1705 dscp = cake_handle_diffserv(skb, wash); 1706 tin = q->tin_index[dscp]; 1707 1708 if (unlikely(tin >= q->tin_cnt)) 1709 tin = 0; 1710 } 1711 1712 return &q->tins[tin]; 1713 } 1714 cake_classify(struct Qdisc * sch,struct cake_tin_data ** t,struct sk_buff * skb,int flow_mode,int * qerr)1715 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, 1716 struct sk_buff *skb, int flow_mode, int *qerr) 1717 { 1718 struct cake_sched_data *q = qdisc_priv(sch); 1719 struct tcf_proto *filter; 1720 struct tcf_result res; 1721 u16 flow = 0, host = 0; 1722 int result; 1723 1724 filter = rcu_dereference_bh(q->filter_list); 1725 if (!filter) 1726 goto hash; 1727 1728 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 1729 result = tcf_classify(skb, NULL, filter, &res, false); 1730 1731 if (result >= 0) { 1732 #ifdef CONFIG_NET_CLS_ACT 1733 switch (result) { 1734 case TC_ACT_STOLEN: 1735 case TC_ACT_QUEUED: 1736 case TC_ACT_TRAP: 1737 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 1738 fallthrough; 1739 case TC_ACT_SHOT: 1740 return 0; 1741 } 1742 #endif 1743 if (TC_H_MIN(res.classid) <= CAKE_QUEUES) 1744 flow = TC_H_MIN(res.classid); 1745 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) 1746 host = TC_H_MAJ(res.classid) >> 16; 1747 } 1748 hash: 1749 *t = cake_select_tin(sch, skb); 1750 return cake_hash(*t, skb, flow_mode, flow, host) + 1; 1751 } 1752 1753 static void cake_reconfigure(struct Qdisc *sch); 1754 cake_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)1755 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1756 struct sk_buff **to_free) 1757 { 1758 struct cake_sched_data *q = qdisc_priv(sch); 1759 int len = qdisc_pkt_len(skb); 1760 int ret; 1761 struct sk_buff *ack = NULL; 1762 ktime_t now = ktime_get(); 1763 struct cake_tin_data *b; 1764 struct cake_flow *flow; 1765 u32 idx; 1766 1767 /* choose flow to insert into */ 1768 idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); 1769 if (idx == 0) { 1770 if (ret & __NET_XMIT_BYPASS) 1771 qdisc_qstats_drop(sch); 1772 __qdisc_drop(skb, to_free); 1773 return ret; 1774 } 1775 idx--; 1776 flow = &b->flows[idx]; 1777 1778 /* ensure shaper state isn't stale */ 1779 if (!b->tin_backlog) { 1780 if (ktime_before(b->time_next_packet, now)) 1781 b->time_next_packet = now; 1782 1783 if (!sch->q.qlen) { 1784 if (ktime_before(q->time_next_packet, now)) { 1785 q->failsafe_next_packet = now; 1786 q->time_next_packet = now; 1787 } else if (ktime_after(q->time_next_packet, now) && 1788 ktime_after(q->failsafe_next_packet, now)) { 1789 u64 next = \ 1790 min(ktime_to_ns(q->time_next_packet), 1791 ktime_to_ns( 1792 q->failsafe_next_packet)); 1793 sch->qstats.overlimits++; 1794 qdisc_watchdog_schedule_ns(&q->watchdog, next); 1795 } 1796 } 1797 } 1798 1799 if (unlikely(len > b->max_skblen)) 1800 b->max_skblen = len; 1801 1802 if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { 1803 struct sk_buff *segs, *nskb; 1804 netdev_features_t features = netif_skb_features(skb); 1805 unsigned int slen = 0, numsegs = 0; 1806 1807 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 1808 if (IS_ERR_OR_NULL(segs)) 1809 return qdisc_drop(skb, sch, to_free); 1810 1811 skb_list_walk_safe(segs, segs, nskb) { 1812 skb_mark_not_on_list(segs); 1813 qdisc_skb_cb(segs)->pkt_len = segs->len; 1814 cobalt_set_enqueue_time(segs, now); 1815 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, 1816 segs); 1817 flow_queue_add(flow, segs); 1818 1819 sch->q.qlen++; 1820 numsegs++; 1821 slen += segs->len; 1822 q->buffer_used += segs->truesize; 1823 b->packets++; 1824 } 1825 1826 /* stats */ 1827 b->bytes += slen; 1828 b->backlogs[idx] += slen; 1829 b->tin_backlog += slen; 1830 sch->qstats.backlog += slen; 1831 q->avg_window_bytes += slen; 1832 1833 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); 1834 consume_skb(skb); 1835 } else { 1836 /* not splitting */ 1837 cobalt_set_enqueue_time(skb, now); 1838 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); 1839 flow_queue_add(flow, skb); 1840 1841 if (q->ack_filter) 1842 ack = cake_ack_filter(q, flow); 1843 1844 if (ack) { 1845 b->ack_drops++; 1846 sch->qstats.drops++; 1847 b->bytes += qdisc_pkt_len(ack); 1848 len -= qdisc_pkt_len(ack); 1849 q->buffer_used += skb->truesize - ack->truesize; 1850 if (q->rate_flags & CAKE_FLAG_INGRESS) 1851 cake_advance_shaper(q, b, ack, now, true); 1852 1853 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); 1854 consume_skb(ack); 1855 } else { 1856 sch->q.qlen++; 1857 q->buffer_used += skb->truesize; 1858 } 1859 1860 /* stats */ 1861 b->packets++; 1862 b->bytes += len; 1863 b->backlogs[idx] += len; 1864 b->tin_backlog += len; 1865 sch->qstats.backlog += len; 1866 q->avg_window_bytes += len; 1867 } 1868 1869 if (q->overflow_timeout) 1870 cake_heapify_up(q, b->overflow_idx[idx]); 1871 1872 /* incoming bandwidth capacity estimate */ 1873 if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { 1874 u64 packet_interval = \ 1875 ktime_to_ns(ktime_sub(now, q->last_packet_time)); 1876 1877 if (packet_interval > NSEC_PER_SEC) 1878 packet_interval = NSEC_PER_SEC; 1879 1880 /* filter out short-term bursts, eg. wifi aggregation */ 1881 q->avg_packet_interval = \ 1882 cake_ewma(q->avg_packet_interval, 1883 packet_interval, 1884 (packet_interval > q->avg_packet_interval ? 1885 2 : 8)); 1886 1887 q->last_packet_time = now; 1888 1889 if (packet_interval > q->avg_packet_interval) { 1890 u64 window_interval = \ 1891 ktime_to_ns(ktime_sub(now, 1892 q->avg_window_begin)); 1893 u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; 1894 1895 b = div64_u64(b, window_interval); 1896 q->avg_peak_bandwidth = 1897 cake_ewma(q->avg_peak_bandwidth, b, 1898 b > q->avg_peak_bandwidth ? 2 : 8); 1899 q->avg_window_bytes = 0; 1900 q->avg_window_begin = now; 1901 1902 if (ktime_after(now, 1903 ktime_add_ms(q->last_reconfig_time, 1904 250))) { 1905 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; 1906 cake_reconfigure(sch); 1907 } 1908 } 1909 } else { 1910 q->avg_window_bytes = 0; 1911 q->last_packet_time = now; 1912 } 1913 1914 /* flowchain */ 1915 if (!flow->set || flow->set == CAKE_SET_DECAYING) { 1916 if (!flow->set) { 1917 list_add_tail(&flow->flowchain, &b->new_flows); 1918 } else { 1919 b->decaying_flow_count--; 1920 list_move_tail(&flow->flowchain, &b->new_flows); 1921 } 1922 flow->set = CAKE_SET_SPARSE; 1923 b->sparse_flow_count++; 1924 1925 flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); 1926 } else if (flow->set == CAKE_SET_SPARSE_WAIT) { 1927 /* this flow was empty, accounted as a sparse flow, but actually 1928 * in the bulk rotation. 1929 */ 1930 flow->set = CAKE_SET_BULK; 1931 b->sparse_flow_count--; 1932 b->bulk_flow_count++; 1933 1934 cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); 1935 cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); 1936 } 1937 1938 if (q->buffer_used > q->buffer_max_used) 1939 q->buffer_max_used = q->buffer_used; 1940 1941 if (q->buffer_used > q->buffer_limit) { 1942 u32 dropped = 0; 1943 1944 while (q->buffer_used > q->buffer_limit) { 1945 dropped++; 1946 cake_drop(sch, to_free); 1947 } 1948 b->drop_overlimit += dropped; 1949 } 1950 return NET_XMIT_SUCCESS; 1951 } 1952 cake_dequeue_one(struct Qdisc * sch)1953 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) 1954 { 1955 struct cake_sched_data *q = qdisc_priv(sch); 1956 struct cake_tin_data *b = &q->tins[q->cur_tin]; 1957 struct cake_flow *flow = &b->flows[q->cur_flow]; 1958 struct sk_buff *skb = NULL; 1959 u32 len; 1960 1961 if (flow->head) { 1962 skb = dequeue_head(flow); 1963 len = qdisc_pkt_len(skb); 1964 b->backlogs[q->cur_flow] -= len; 1965 b->tin_backlog -= len; 1966 sch->qstats.backlog -= len; 1967 q->buffer_used -= skb->truesize; 1968 sch->q.qlen--; 1969 1970 if (q->overflow_timeout) 1971 cake_heapify(q, b->overflow_idx[q->cur_flow]); 1972 } 1973 return skb; 1974 } 1975 1976 /* Discard leftover packets from a tin no longer in use. */ cake_clear_tin(struct Qdisc * sch,u16 tin)1977 static void cake_clear_tin(struct Qdisc *sch, u16 tin) 1978 { 1979 struct cake_sched_data *q = qdisc_priv(sch); 1980 struct sk_buff *skb; 1981 1982 q->cur_tin = tin; 1983 for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) 1984 while (!!(skb = cake_dequeue_one(sch))) 1985 kfree_skb(skb); 1986 } 1987 cake_dequeue(struct Qdisc * sch)1988 static struct sk_buff *cake_dequeue(struct Qdisc *sch) 1989 { 1990 struct cake_sched_data *q = qdisc_priv(sch); 1991 struct cake_tin_data *b = &q->tins[q->cur_tin]; 1992 ktime_t now = ktime_get(); 1993 struct cake_flow *flow; 1994 struct list_head *head; 1995 bool first_flow = true; 1996 struct sk_buff *skb; 1997 u64 delay; 1998 u32 len; 1999 2000 begin: 2001 if (!sch->q.qlen) 2002 return NULL; 2003 2004 /* global hard shaper */ 2005 if (ktime_after(q->time_next_packet, now) && 2006 ktime_after(q->failsafe_next_packet, now)) { 2007 u64 next = min(ktime_to_ns(q->time_next_packet), 2008 ktime_to_ns(q->failsafe_next_packet)); 2009 2010 sch->qstats.overlimits++; 2011 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2012 return NULL; 2013 } 2014 2015 /* Choose a class to work on. */ 2016 if (!q->rate_ns) { 2017 /* In unlimited mode, can't rely on shaper timings, just balance 2018 * with DRR 2019 */ 2020 bool wrapped = false, empty = true; 2021 2022 while (b->tin_deficit < 0 || 2023 !(b->sparse_flow_count + b->bulk_flow_count)) { 2024 if (b->tin_deficit <= 0) 2025 b->tin_deficit += b->tin_quantum; 2026 if (b->sparse_flow_count + b->bulk_flow_count) 2027 empty = false; 2028 2029 q->cur_tin++; 2030 b++; 2031 if (q->cur_tin >= q->tin_cnt) { 2032 q->cur_tin = 0; 2033 b = q->tins; 2034 2035 if (wrapped) { 2036 /* It's possible for q->qlen to be 2037 * nonzero when we actually have no 2038 * packets anywhere. 2039 */ 2040 if (empty) 2041 return NULL; 2042 } else { 2043 wrapped = true; 2044 } 2045 } 2046 } 2047 } else { 2048 /* In shaped mode, choose: 2049 * - Highest-priority tin with queue and meeting schedule, or 2050 * - The earliest-scheduled tin with queue. 2051 */ 2052 ktime_t best_time = KTIME_MAX; 2053 int tin, best_tin = 0; 2054 2055 for (tin = 0; tin < q->tin_cnt; tin++) { 2056 b = q->tins + tin; 2057 if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { 2058 ktime_t time_to_pkt = \ 2059 ktime_sub(b->time_next_packet, now); 2060 2061 if (ktime_to_ns(time_to_pkt) <= 0 || 2062 ktime_compare(time_to_pkt, 2063 best_time) <= 0) { 2064 best_time = time_to_pkt; 2065 best_tin = tin; 2066 } 2067 } 2068 } 2069 2070 q->cur_tin = best_tin; 2071 b = q->tins + best_tin; 2072 2073 /* No point in going further if no packets to deliver. */ 2074 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) 2075 return NULL; 2076 } 2077 2078 retry: 2079 /* service this class */ 2080 head = &b->decaying_flows; 2081 if (!first_flow || list_empty(head)) { 2082 head = &b->new_flows; 2083 if (list_empty(head)) { 2084 head = &b->old_flows; 2085 if (unlikely(list_empty(head))) { 2086 head = &b->decaying_flows; 2087 if (unlikely(list_empty(head))) 2088 goto begin; 2089 } 2090 } 2091 } 2092 flow = list_first_entry(head, struct cake_flow, flowchain); 2093 q->cur_flow = flow - b->flows; 2094 first_flow = false; 2095 2096 /* flow isolation (DRR++) */ 2097 if (flow->deficit <= 0) { 2098 /* Keep all flows with deficits out of the sparse and decaying 2099 * rotations. No non-empty flow can go into the decaying 2100 * rotation, so they can't get deficits 2101 */ 2102 if (flow->set == CAKE_SET_SPARSE) { 2103 if (flow->head) { 2104 b->sparse_flow_count--; 2105 b->bulk_flow_count++; 2106 2107 cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); 2108 cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); 2109 2110 flow->set = CAKE_SET_BULK; 2111 } else { 2112 /* we've moved it to the bulk rotation for 2113 * correct deficit accounting but we still want 2114 * to count it as a sparse flow, not a bulk one. 2115 */ 2116 flow->set = CAKE_SET_SPARSE_WAIT; 2117 } 2118 } 2119 2120 flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); 2121 list_move_tail(&flow->flowchain, &b->old_flows); 2122 2123 goto retry; 2124 } 2125 2126 /* Retrieve a packet via the AQM */ 2127 while (1) { 2128 skb = cake_dequeue_one(sch); 2129 if (!skb) { 2130 /* this queue was actually empty */ 2131 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) 2132 b->unresponsive_flow_count--; 2133 2134 if (flow->cvars.p_drop || flow->cvars.count || 2135 ktime_before(now, flow->cvars.drop_next)) { 2136 /* keep in the flowchain until the state has 2137 * decayed to rest 2138 */ 2139 list_move_tail(&flow->flowchain, 2140 &b->decaying_flows); 2141 if (flow->set == CAKE_SET_BULK) { 2142 b->bulk_flow_count--; 2143 2144 cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); 2145 cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); 2146 2147 b->decaying_flow_count++; 2148 } else if (flow->set == CAKE_SET_SPARSE || 2149 flow->set == CAKE_SET_SPARSE_WAIT) { 2150 b->sparse_flow_count--; 2151 b->decaying_flow_count++; 2152 } 2153 flow->set = CAKE_SET_DECAYING; 2154 } else { 2155 /* remove empty queue from the flowchain */ 2156 list_del_init(&flow->flowchain); 2157 if (flow->set == CAKE_SET_SPARSE || 2158 flow->set == CAKE_SET_SPARSE_WAIT) 2159 b->sparse_flow_count--; 2160 else if (flow->set == CAKE_SET_BULK) { 2161 b->bulk_flow_count--; 2162 2163 cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); 2164 cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); 2165 } else 2166 b->decaying_flow_count--; 2167 2168 flow->set = CAKE_SET_NONE; 2169 } 2170 goto begin; 2171 } 2172 2173 /* Last packet in queue may be marked, shouldn't be dropped */ 2174 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, 2175 (b->bulk_flow_count * 2176 !!(q->rate_flags & 2177 CAKE_FLAG_INGRESS))) || 2178 !flow->head) 2179 break; 2180 2181 /* drop this packet, get another one */ 2182 if (q->rate_flags & CAKE_FLAG_INGRESS) { 2183 len = cake_advance_shaper(q, b, skb, 2184 now, true); 2185 flow->deficit -= len; 2186 b->tin_deficit -= len; 2187 } 2188 flow->dropped++; 2189 b->tin_dropped++; 2190 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); 2191 qdisc_qstats_drop(sch); 2192 kfree_skb(skb); 2193 if (q->rate_flags & CAKE_FLAG_INGRESS) 2194 goto retry; 2195 } 2196 2197 b->tin_ecn_mark += !!flow->cvars.ecn_marked; 2198 qdisc_bstats_update(sch, skb); 2199 2200 /* collect delay stats */ 2201 delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 2202 b->avge_delay = cake_ewma(b->avge_delay, delay, 8); 2203 b->peak_delay = cake_ewma(b->peak_delay, delay, 2204 delay > b->peak_delay ? 2 : 8); 2205 b->base_delay = cake_ewma(b->base_delay, delay, 2206 delay < b->base_delay ? 2 : 8); 2207 2208 len = cake_advance_shaper(q, b, skb, now, false); 2209 flow->deficit -= len; 2210 b->tin_deficit -= len; 2211 2212 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { 2213 u64 next = min(ktime_to_ns(q->time_next_packet), 2214 ktime_to_ns(q->failsafe_next_packet)); 2215 2216 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2217 } else if (!sch->q.qlen) { 2218 int i; 2219 2220 for (i = 0; i < q->tin_cnt; i++) { 2221 if (q->tins[i].decaying_flow_count) { 2222 ktime_t next = \ 2223 ktime_add_ns(now, 2224 q->tins[i].cparams.target); 2225 2226 qdisc_watchdog_schedule_ns(&q->watchdog, 2227 ktime_to_ns(next)); 2228 break; 2229 } 2230 } 2231 } 2232 2233 if (q->overflow_timeout) 2234 q->overflow_timeout--; 2235 2236 return skb; 2237 } 2238 cake_reset(struct Qdisc * sch)2239 static void cake_reset(struct Qdisc *sch) 2240 { 2241 struct cake_sched_data *q = qdisc_priv(sch); 2242 u32 c; 2243 2244 if (!q->tins) 2245 return; 2246 2247 for (c = 0; c < CAKE_MAX_TINS; c++) 2248 cake_clear_tin(sch, c); 2249 } 2250 2251 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { 2252 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2253 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2254 [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2255 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2256 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2257 [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2258 [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2259 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2260 [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2261 [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2262 [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2263 [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2264 [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2265 [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2266 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2267 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, 2268 [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, 2269 }; 2270 cake_set_rate(struct cake_tin_data * b,u64 rate,u32 mtu,u64 target_ns,u64 rtt_est_ns)2271 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2272 u64 target_ns, u64 rtt_est_ns) 2273 { 2274 /* convert byte-rate into time-per-byte 2275 * so it will always unwedge in reasonable time. 2276 */ 2277 static const u64 MIN_RATE = 64; 2278 u32 byte_target = mtu; 2279 u64 byte_target_ns; 2280 u8 rate_shft = 0; 2281 u64 rate_ns = 0; 2282 2283 b->flow_quantum = 1514; 2284 if (rate) { 2285 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); 2286 rate_shft = 34; 2287 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2288 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2289 while (!!(rate_ns >> 34)) { 2290 rate_ns >>= 1; 2291 rate_shft--; 2292 } 2293 } /* else unlimited, ie. zero delay */ 2294 2295 b->tin_rate_bps = rate; 2296 b->tin_rate_ns = rate_ns; 2297 b->tin_rate_shft = rate_shft; 2298 2299 byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2300 2301 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); 2302 b->cparams.interval = max(rtt_est_ns + 2303 b->cparams.target - target_ns, 2304 b->cparams.target * 2); 2305 b->cparams.mtu_time = byte_target_ns; 2306 b->cparams.p_inc = 1 << 24; /* 1/256 */ 2307 b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2308 } 2309 cake_config_besteffort(struct Qdisc * sch)2310 static int cake_config_besteffort(struct Qdisc *sch) 2311 { 2312 struct cake_sched_data *q = qdisc_priv(sch); 2313 struct cake_tin_data *b = &q->tins[0]; 2314 u32 mtu = psched_mtu(qdisc_dev(sch)); 2315 u64 rate = q->rate_bps; 2316 2317 q->tin_cnt = 1; 2318 2319 q->tin_index = besteffort; 2320 q->tin_order = normal_order; 2321 2322 cake_set_rate(b, rate, mtu, 2323 us_to_ns(q->target), us_to_ns(q->interval)); 2324 b->tin_quantum = 65535; 2325 2326 return 0; 2327 } 2328 cake_config_precedence(struct Qdisc * sch)2329 static int cake_config_precedence(struct Qdisc *sch) 2330 { 2331 /* convert high-level (user visible) parameters into internal format */ 2332 struct cake_sched_data *q = qdisc_priv(sch); 2333 u32 mtu = psched_mtu(qdisc_dev(sch)); 2334 u64 rate = q->rate_bps; 2335 u32 quantum = 256; 2336 u32 i; 2337 2338 q->tin_cnt = 8; 2339 q->tin_index = precedence; 2340 q->tin_order = normal_order; 2341 2342 for (i = 0; i < q->tin_cnt; i++) { 2343 struct cake_tin_data *b = &q->tins[i]; 2344 2345 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2346 us_to_ns(q->interval)); 2347 2348 b->tin_quantum = max_t(u16, 1U, quantum); 2349 2350 /* calculate next class's parameters */ 2351 rate *= 7; 2352 rate >>= 3; 2353 2354 quantum *= 7; 2355 quantum >>= 3; 2356 } 2357 2358 return 0; 2359 } 2360 2361 /* List of known Diffserv codepoints: 2362 * 2363 * Default Forwarding (DF/CS0) - Best Effort 2364 * Max Throughput (TOS2) 2365 * Min Delay (TOS4) 2366 * LLT "La" (TOS5) 2367 * Assured Forwarding 1 (AF1x) - x3 2368 * Assured Forwarding 2 (AF2x) - x3 2369 * Assured Forwarding 3 (AF3x) - x3 2370 * Assured Forwarding 4 (AF4x) - x3 2371 * Precedence Class 1 (CS1) 2372 * Precedence Class 2 (CS2) 2373 * Precedence Class 3 (CS3) 2374 * Precedence Class 4 (CS4) 2375 * Precedence Class 5 (CS5) 2376 * Precedence Class 6 (CS6) 2377 * Precedence Class 7 (CS7) 2378 * Voice Admit (VA) 2379 * Expedited Forwarding (EF) 2380 * Lower Effort (LE) 2381 * 2382 * Total 26 codepoints. 2383 */ 2384 2385 /* List of traffic classes in RFC 4594, updated by RFC 8622: 2386 * (roughly descending order of contended priority) 2387 * (roughly ascending order of uncontended throughput) 2388 * 2389 * Network Control (CS6,CS7) - routing traffic 2390 * Telephony (EF,VA) - aka. VoIP streams 2391 * Signalling (CS5) - VoIP setup 2392 * Multimedia Conferencing (AF4x) - aka. video calls 2393 * Realtime Interactive (CS4) - eg. games 2394 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2395 * Broadcast Video (CS3) 2396 * Low-Latency Data (AF2x,TOS4) - eg. database 2397 * Ops, Admin, Management (CS2) - eg. ssh 2398 * Standard Service (DF & unrecognised codepoints) 2399 * High-Throughput Data (AF1x,TOS2) - eg. web traffic 2400 * Low-Priority Data (LE,CS1) - eg. BitTorrent 2401 * 2402 * Total 12 traffic classes. 2403 */ 2404 cake_config_diffserv8(struct Qdisc * sch)2405 static int cake_config_diffserv8(struct Qdisc *sch) 2406 { 2407 /* Pruned list of traffic classes for typical applications: 2408 * 2409 * Network Control (CS6, CS7) 2410 * Minimum Latency (EF, VA, CS5, CS4) 2411 * Interactive Shell (CS2) 2412 * Low Latency Transactions (AF2x, TOS4) 2413 * Video Streaming (AF4x, AF3x, CS3) 2414 * Bog Standard (DF etc.) 2415 * High Throughput (AF1x, TOS2, CS1) 2416 * Background Traffic (LE) 2417 * 2418 * Total 8 traffic classes. 2419 */ 2420 2421 struct cake_sched_data *q = qdisc_priv(sch); 2422 u32 mtu = psched_mtu(qdisc_dev(sch)); 2423 u64 rate = q->rate_bps; 2424 u32 quantum = 256; 2425 u32 i; 2426 2427 q->tin_cnt = 8; 2428 2429 /* codepoint to class mapping */ 2430 q->tin_index = diffserv8; 2431 q->tin_order = normal_order; 2432 2433 /* class characteristics */ 2434 for (i = 0; i < q->tin_cnt; i++) { 2435 struct cake_tin_data *b = &q->tins[i]; 2436 2437 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2438 us_to_ns(q->interval)); 2439 2440 b->tin_quantum = max_t(u16, 1U, quantum); 2441 2442 /* calculate next class's parameters */ 2443 rate *= 7; 2444 rate >>= 3; 2445 2446 quantum *= 7; 2447 quantum >>= 3; 2448 } 2449 2450 return 0; 2451 } 2452 cake_config_diffserv4(struct Qdisc * sch)2453 static int cake_config_diffserv4(struct Qdisc *sch) 2454 { 2455 /* Further pruned list of traffic classes for four-class system: 2456 * 2457 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2458 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) 2459 * Best Effort (DF, AF1x, TOS2, and those not specified) 2460 * Background Traffic (LE, CS1) 2461 * 2462 * Total 4 traffic classes. 2463 */ 2464 2465 struct cake_sched_data *q = qdisc_priv(sch); 2466 u32 mtu = psched_mtu(qdisc_dev(sch)); 2467 u64 rate = q->rate_bps; 2468 u32 quantum = 1024; 2469 2470 q->tin_cnt = 4; 2471 2472 /* codepoint to class mapping */ 2473 q->tin_index = diffserv4; 2474 q->tin_order = bulk_order; 2475 2476 /* class characteristics */ 2477 cake_set_rate(&q->tins[0], rate, mtu, 2478 us_to_ns(q->target), us_to_ns(q->interval)); 2479 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2480 us_to_ns(q->target), us_to_ns(q->interval)); 2481 cake_set_rate(&q->tins[2], rate >> 1, mtu, 2482 us_to_ns(q->target), us_to_ns(q->interval)); 2483 cake_set_rate(&q->tins[3], rate >> 2, mtu, 2484 us_to_ns(q->target), us_to_ns(q->interval)); 2485 2486 /* bandwidth-sharing weights */ 2487 q->tins[0].tin_quantum = quantum; 2488 q->tins[1].tin_quantum = quantum >> 4; 2489 q->tins[2].tin_quantum = quantum >> 1; 2490 q->tins[3].tin_quantum = quantum >> 2; 2491 2492 return 0; 2493 } 2494 cake_config_diffserv3(struct Qdisc * sch)2495 static int cake_config_diffserv3(struct Qdisc *sch) 2496 { 2497 /* Simplified Diffserv structure with 3 tins. 2498 * Latency Sensitive (CS7, CS6, EF, VA, TOS4) 2499 * Best Effort 2500 * Low Priority (LE, CS1) 2501 */ 2502 struct cake_sched_data *q = qdisc_priv(sch); 2503 u32 mtu = psched_mtu(qdisc_dev(sch)); 2504 u64 rate = q->rate_bps; 2505 u32 quantum = 1024; 2506 2507 q->tin_cnt = 3; 2508 2509 /* codepoint to class mapping */ 2510 q->tin_index = diffserv3; 2511 q->tin_order = bulk_order; 2512 2513 /* class characteristics */ 2514 cake_set_rate(&q->tins[0], rate, mtu, 2515 us_to_ns(q->target), us_to_ns(q->interval)); 2516 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2517 us_to_ns(q->target), us_to_ns(q->interval)); 2518 cake_set_rate(&q->tins[2], rate >> 2, mtu, 2519 us_to_ns(q->target), us_to_ns(q->interval)); 2520 2521 /* bandwidth-sharing weights */ 2522 q->tins[0].tin_quantum = quantum; 2523 q->tins[1].tin_quantum = quantum >> 4; 2524 q->tins[2].tin_quantum = quantum >> 2; 2525 2526 return 0; 2527 } 2528 cake_reconfigure(struct Qdisc * sch)2529 static void cake_reconfigure(struct Qdisc *sch) 2530 { 2531 struct cake_sched_data *q = qdisc_priv(sch); 2532 int c, ft; 2533 2534 switch (q->tin_mode) { 2535 case CAKE_DIFFSERV_BESTEFFORT: 2536 ft = cake_config_besteffort(sch); 2537 break; 2538 2539 case CAKE_DIFFSERV_PRECEDENCE: 2540 ft = cake_config_precedence(sch); 2541 break; 2542 2543 case CAKE_DIFFSERV_DIFFSERV8: 2544 ft = cake_config_diffserv8(sch); 2545 break; 2546 2547 case CAKE_DIFFSERV_DIFFSERV4: 2548 ft = cake_config_diffserv4(sch); 2549 break; 2550 2551 case CAKE_DIFFSERV_DIFFSERV3: 2552 default: 2553 ft = cake_config_diffserv3(sch); 2554 break; 2555 } 2556 2557 for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { 2558 cake_clear_tin(sch, c); 2559 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; 2560 } 2561 2562 q->rate_ns = q->tins[ft].tin_rate_ns; 2563 q->rate_shft = q->tins[ft].tin_rate_shft; 2564 2565 if (q->buffer_config_limit) { 2566 q->buffer_limit = q->buffer_config_limit; 2567 } else if (q->rate_bps) { 2568 u64 t = q->rate_bps * q->interval; 2569 2570 do_div(t, USEC_PER_SEC / 4); 2571 q->buffer_limit = max_t(u32, t, 4U << 20); 2572 } else { 2573 q->buffer_limit = ~0; 2574 } 2575 2576 sch->flags &= ~TCQ_F_CAN_BYPASS; 2577 2578 q->buffer_limit = min(q->buffer_limit, 2579 max(sch->limit * psched_mtu(qdisc_dev(sch)), 2580 q->buffer_config_limit)); 2581 } 2582 cake_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2583 static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2584 struct netlink_ext_ack *extack) 2585 { 2586 struct cake_sched_data *q = qdisc_priv(sch); 2587 struct nlattr *tb[TCA_CAKE_MAX + 1]; 2588 int err; 2589 2590 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, 2591 extack); 2592 if (err < 0) 2593 return err; 2594 2595 if (tb[TCA_CAKE_NAT]) { 2596 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 2597 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2598 q->flow_mode |= CAKE_FLOW_NAT_FLAG * 2599 !!nla_get_u32(tb[TCA_CAKE_NAT]); 2600 #else 2601 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2602 "No conntrack support in kernel"); 2603 return -EOPNOTSUPP; 2604 #endif 2605 } 2606 2607 if (tb[TCA_CAKE_BASE_RATE64]) 2608 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); 2609 2610 if (tb[TCA_CAKE_DIFFSERV_MODE]) 2611 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); 2612 2613 if (tb[TCA_CAKE_WASH]) { 2614 if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2615 q->rate_flags |= CAKE_FLAG_WASH; 2616 else 2617 q->rate_flags &= ~CAKE_FLAG_WASH; 2618 } 2619 2620 if (tb[TCA_CAKE_FLOW_MODE]) 2621 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | 2622 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2623 CAKE_FLOW_MASK)); 2624 2625 if (tb[TCA_CAKE_ATM]) 2626 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); 2627 2628 if (tb[TCA_CAKE_OVERHEAD]) { 2629 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); 2630 q->rate_flags |= CAKE_FLAG_OVERHEAD; 2631 2632 q->max_netlen = 0; 2633 q->max_adjlen = 0; 2634 q->min_netlen = ~0; 2635 q->min_adjlen = ~0; 2636 } 2637 2638 if (tb[TCA_CAKE_RAW]) { 2639 q->rate_flags &= ~CAKE_FLAG_OVERHEAD; 2640 2641 q->max_netlen = 0; 2642 q->max_adjlen = 0; 2643 q->min_netlen = ~0; 2644 q->min_adjlen = ~0; 2645 } 2646 2647 if (tb[TCA_CAKE_MPU]) 2648 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); 2649 2650 if (tb[TCA_CAKE_RTT]) { 2651 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2652 2653 if (!q->interval) 2654 q->interval = 1; 2655 } 2656 2657 if (tb[TCA_CAKE_TARGET]) { 2658 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2659 2660 if (!q->target) 2661 q->target = 1; 2662 } 2663 2664 if (tb[TCA_CAKE_AUTORATE]) { 2665 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) 2666 q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2667 else 2668 q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2669 } 2670 2671 if (tb[TCA_CAKE_INGRESS]) { 2672 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2673 q->rate_flags |= CAKE_FLAG_INGRESS; 2674 else 2675 q->rate_flags &= ~CAKE_FLAG_INGRESS; 2676 } 2677 2678 if (tb[TCA_CAKE_ACK_FILTER]) 2679 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); 2680 2681 if (tb[TCA_CAKE_MEMORY]) 2682 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); 2683 2684 if (tb[TCA_CAKE_SPLIT_GSO]) { 2685 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) 2686 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2687 else 2688 q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2689 } 2690 2691 if (tb[TCA_CAKE_FWMARK]) { 2692 q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]); 2693 q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0; 2694 } 2695 2696 if (q->tins) { 2697 sch_tree_lock(sch); 2698 cake_reconfigure(sch); 2699 sch_tree_unlock(sch); 2700 } 2701 2702 return 0; 2703 } 2704 cake_destroy(struct Qdisc * sch)2705 static void cake_destroy(struct Qdisc *sch) 2706 { 2707 struct cake_sched_data *q = qdisc_priv(sch); 2708 2709 qdisc_watchdog_cancel(&q->watchdog); 2710 tcf_block_put(q->block); 2711 kvfree(q->tins); 2712 } 2713 cake_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2714 static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2715 struct netlink_ext_ack *extack) 2716 { 2717 struct cake_sched_data *q = qdisc_priv(sch); 2718 int i, j, err; 2719 2720 sch->limit = 10240; 2721 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2722 q->flow_mode = CAKE_FLOW_TRIPLE; 2723 2724 q->rate_bps = 0; /* unlimited by default */ 2725 2726 q->interval = 100000; /* 100ms default */ 2727 q->target = 5000; /* 5ms: codel RFC argues 2728 * for 5 to 10% of interval 2729 */ 2730 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2731 q->cur_tin = 0; 2732 q->cur_flow = 0; 2733 2734 qdisc_watchdog_init(&q->watchdog, sch); 2735 2736 if (opt) { 2737 err = cake_change(sch, opt, extack); 2738 2739 if (err) 2740 return err; 2741 } 2742 2743 err = tcf_block_get(&q->block, &q->filter_list, sch, extack); 2744 if (err) 2745 return err; 2746 2747 quantum_div[0] = ~0; 2748 for (i = 1; i <= CAKE_QUEUES; i++) 2749 quantum_div[i] = 65535 / i; 2750 2751 q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), 2752 GFP_KERNEL); 2753 if (!q->tins) 2754 return -ENOMEM; 2755 2756 for (i = 0; i < CAKE_MAX_TINS; i++) { 2757 struct cake_tin_data *b = q->tins + i; 2758 2759 INIT_LIST_HEAD(&b->new_flows); 2760 INIT_LIST_HEAD(&b->old_flows); 2761 INIT_LIST_HEAD(&b->decaying_flows); 2762 b->sparse_flow_count = 0; 2763 b->bulk_flow_count = 0; 2764 b->decaying_flow_count = 0; 2765 2766 for (j = 0; j < CAKE_QUEUES; j++) { 2767 struct cake_flow *flow = b->flows + j; 2768 u32 k = j * CAKE_MAX_TINS + i; 2769 2770 INIT_LIST_HEAD(&flow->flowchain); 2771 cobalt_vars_init(&flow->cvars); 2772 2773 q->overflow_heap[k].t = i; 2774 q->overflow_heap[k].b = j; 2775 b->overflow_idx[j] = k; 2776 } 2777 } 2778 2779 cake_reconfigure(sch); 2780 q->avg_peak_bandwidth = q->rate_bps; 2781 q->min_netlen = ~0; 2782 q->min_adjlen = ~0; 2783 return 0; 2784 } 2785 cake_dump(struct Qdisc * sch,struct sk_buff * skb)2786 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2787 { 2788 struct cake_sched_data *q = qdisc_priv(sch); 2789 struct nlattr *opts; 2790 2791 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 2792 if (!opts) 2793 goto nla_put_failure; 2794 2795 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, 2796 TCA_CAKE_PAD)) 2797 goto nla_put_failure; 2798 2799 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, 2800 q->flow_mode & CAKE_FLOW_MASK)) 2801 goto nla_put_failure; 2802 2803 if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) 2804 goto nla_put_failure; 2805 2806 if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) 2807 goto nla_put_failure; 2808 2809 if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) 2810 goto nla_put_failure; 2811 2812 if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2813 !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2814 goto nla_put_failure; 2815 2816 if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2817 !!(q->rate_flags & CAKE_FLAG_INGRESS))) 2818 goto nla_put_failure; 2819 2820 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) 2821 goto nla_put_failure; 2822 2823 if (nla_put_u32(skb, TCA_CAKE_NAT, 2824 !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) 2825 goto nla_put_failure; 2826 2827 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) 2828 goto nla_put_failure; 2829 2830 if (nla_put_u32(skb, TCA_CAKE_WASH, 2831 !!(q->rate_flags & CAKE_FLAG_WASH))) 2832 goto nla_put_failure; 2833 2834 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) 2835 goto nla_put_failure; 2836 2837 if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) 2838 if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2839 goto nla_put_failure; 2840 2841 if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) 2842 goto nla_put_failure; 2843 2844 if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) 2845 goto nla_put_failure; 2846 2847 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 2848 !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) 2849 goto nla_put_failure; 2850 2851 if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask)) 2852 goto nla_put_failure; 2853 2854 return nla_nest_end(skb, opts); 2855 2856 nla_put_failure: 2857 return -1; 2858 } 2859 cake_dump_stats(struct Qdisc * sch,struct gnet_dump * d)2860 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 2861 { 2862 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 2863 struct cake_sched_data *q = qdisc_priv(sch); 2864 struct nlattr *tstats, *ts; 2865 int i; 2866 2867 if (!stats) 2868 return -1; 2869 2870 #define PUT_STAT_U32(attr, data) do { \ 2871 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2872 goto nla_put_failure; \ 2873 } while (0) 2874 #define PUT_STAT_U64(attr, data) do { \ 2875 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 2876 data, TCA_CAKE_STATS_PAD)) \ 2877 goto nla_put_failure; \ 2878 } while (0) 2879 2880 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); 2881 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); 2882 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); 2883 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); 2884 PUT_STAT_U32(MAX_NETLEN, q->max_netlen); 2885 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); 2886 PUT_STAT_U32(MIN_NETLEN, q->min_netlen); 2887 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); 2888 2889 #undef PUT_STAT_U32 2890 #undef PUT_STAT_U64 2891 2892 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); 2893 if (!tstats) 2894 goto nla_put_failure; 2895 2896 #define PUT_TSTAT_U32(attr, data) do { \ 2897 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 2898 goto nla_put_failure; \ 2899 } while (0) 2900 #define PUT_TSTAT_U64(attr, data) do { \ 2901 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ 2902 data, TCA_CAKE_TIN_STATS_PAD)) \ 2903 goto nla_put_failure; \ 2904 } while (0) 2905 2906 for (i = 0; i < q->tin_cnt; i++) { 2907 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 2908 2909 ts = nla_nest_start_noflag(d->skb, i + 1); 2910 if (!ts) 2911 goto nla_put_failure; 2912 2913 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); 2914 PUT_TSTAT_U64(SENT_BYTES64, b->bytes); 2915 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); 2916 2917 PUT_TSTAT_U32(TARGET_US, 2918 ktime_to_us(ns_to_ktime(b->cparams.target))); 2919 PUT_TSTAT_U32(INTERVAL_US, 2920 ktime_to_us(ns_to_ktime(b->cparams.interval))); 2921 2922 PUT_TSTAT_U32(SENT_PACKETS, b->packets); 2923 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); 2924 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); 2925 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); 2926 2927 PUT_TSTAT_U32(PEAK_DELAY_US, 2928 ktime_to_us(ns_to_ktime(b->peak_delay))); 2929 PUT_TSTAT_U32(AVG_DELAY_US, 2930 ktime_to_us(ns_to_ktime(b->avge_delay))); 2931 PUT_TSTAT_U32(BASE_DELAY_US, 2932 ktime_to_us(ns_to_ktime(b->base_delay))); 2933 2934 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); 2935 PUT_TSTAT_U32(WAY_MISSES, b->way_misses); 2936 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); 2937 2938 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + 2939 b->decaying_flow_count); 2940 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); 2941 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); 2942 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); 2943 2944 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); 2945 nla_nest_end(d->skb, ts); 2946 } 2947 2948 #undef PUT_TSTAT_U32 2949 #undef PUT_TSTAT_U64 2950 2951 nla_nest_end(d->skb, tstats); 2952 return nla_nest_end(d->skb, stats); 2953 2954 nla_put_failure: 2955 nla_nest_cancel(d->skb, stats); 2956 return -1; 2957 } 2958 cake_leaf(struct Qdisc * sch,unsigned long arg)2959 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 2960 { 2961 return NULL; 2962 } 2963 cake_find(struct Qdisc * sch,u32 classid)2964 static unsigned long cake_find(struct Qdisc *sch, u32 classid) 2965 { 2966 return 0; 2967 } 2968 cake_bind(struct Qdisc * sch,unsigned long parent,u32 classid)2969 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 2970 u32 classid) 2971 { 2972 return 0; 2973 } 2974 cake_unbind(struct Qdisc * q,unsigned long cl)2975 static void cake_unbind(struct Qdisc *q, unsigned long cl) 2976 { 2977 } 2978 cake_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)2979 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 2980 struct netlink_ext_ack *extack) 2981 { 2982 struct cake_sched_data *q = qdisc_priv(sch); 2983 2984 if (cl) 2985 return NULL; 2986 return q->block; 2987 } 2988 cake_dump_class(struct Qdisc * sch,unsigned long cl,struct sk_buff * skb,struct tcmsg * tcm)2989 static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 2990 struct sk_buff *skb, struct tcmsg *tcm) 2991 { 2992 tcm->tcm_handle |= TC_H_MIN(cl); 2993 return 0; 2994 } 2995 cake_dump_class_stats(struct Qdisc * sch,unsigned long cl,struct gnet_dump * d)2996 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 2997 struct gnet_dump *d) 2998 { 2999 struct cake_sched_data *q = qdisc_priv(sch); 3000 const struct cake_flow *flow = NULL; 3001 struct gnet_stats_queue qs = { 0 }; 3002 struct nlattr *stats; 3003 u32 idx = cl - 1; 3004 3005 if (idx < CAKE_QUEUES * q->tin_cnt) { 3006 const struct cake_tin_data *b = \ 3007 &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 3008 const struct sk_buff *skb; 3009 3010 flow = &b->flows[idx % CAKE_QUEUES]; 3011 3012 if (flow->head) { 3013 sch_tree_lock(sch); 3014 skb = flow->head; 3015 while (skb) { 3016 qs.qlen++; 3017 skb = skb->next; 3018 } 3019 sch_tree_unlock(sch); 3020 } 3021 qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 3022 qs.drops = flow->dropped; 3023 } 3024 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 3025 return -1; 3026 if (flow) { 3027 ktime_t now = ktime_get(); 3028 3029 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 3030 if (!stats) 3031 return -1; 3032 3033 #define PUT_STAT_U32(attr, data) do { \ 3034 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3035 goto nla_put_failure; \ 3036 } while (0) 3037 #define PUT_STAT_S32(attr, data) do { \ 3038 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3039 goto nla_put_failure; \ 3040 } while (0) 3041 3042 PUT_STAT_S32(DEFICIT, flow->deficit); 3043 PUT_STAT_U32(DROPPING, flow->cvars.dropping); 3044 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 3045 PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 3046 if (flow->cvars.p_drop) { 3047 PUT_STAT_S32(BLUE_TIMER_US, 3048 ktime_to_us( 3049 ktime_sub(now, 3050 flow->cvars.blue_timer))); 3051 } 3052 if (flow->cvars.dropping) { 3053 PUT_STAT_S32(DROP_NEXT_US, 3054 ktime_to_us( 3055 ktime_sub(now, 3056 flow->cvars.drop_next))); 3057 } 3058 3059 if (nla_nest_end(d->skb, stats) < 0) 3060 return -1; 3061 } 3062 3063 return 0; 3064 3065 nla_put_failure: 3066 nla_nest_cancel(d->skb, stats); 3067 return -1; 3068 } 3069 cake_walk(struct Qdisc * sch,struct qdisc_walker * arg)3070 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 3071 { 3072 struct cake_sched_data *q = qdisc_priv(sch); 3073 unsigned int i, j; 3074 3075 if (arg->stop) 3076 return; 3077 3078 for (i = 0; i < q->tin_cnt; i++) { 3079 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3080 3081 for (j = 0; j < CAKE_QUEUES; j++) { 3082 if (list_empty(&b->flows[j].flowchain)) { 3083 arg->count++; 3084 continue; 3085 } 3086 if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, 3087 arg)) 3088 break; 3089 } 3090 } 3091 } 3092 3093 static const struct Qdisc_class_ops cake_class_ops = { 3094 .leaf = cake_leaf, 3095 .find = cake_find, 3096 .tcf_block = cake_tcf_block, 3097 .bind_tcf = cake_bind, 3098 .unbind_tcf = cake_unbind, 3099 .dump = cake_dump_class, 3100 .dump_stats = cake_dump_class_stats, 3101 .walk = cake_walk, 3102 }; 3103 3104 static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 3105 .cl_ops = &cake_class_ops, 3106 .id = "cake", 3107 .priv_size = sizeof(struct cake_sched_data), 3108 .enqueue = cake_enqueue, 3109 .dequeue = cake_dequeue, 3110 .peek = qdisc_peek_dequeued, 3111 .init = cake_init, 3112 .reset = cake_reset, 3113 .destroy = cake_destroy, 3114 .change = cake_change, 3115 .dump = cake_dump, 3116 .dump_stats = cake_dump_stats, 3117 .owner = THIS_MODULE, 3118 }; 3119 cake_module_init(void)3120 static int __init cake_module_init(void) 3121 { 3122 return register_qdisc(&cake_qdisc_ops); 3123 } 3124 cake_module_exit(void)3125 static void __exit cake_module_exit(void) 3126 { 3127 unregister_qdisc(&cake_qdisc_ops); 3128 } 3129 3130 module_init(cake_module_init) 3131 module_exit(cake_module_exit) 3132 MODULE_AUTHOR("Jonathan Morton"); 3133 MODULE_LICENSE("Dual BSD/GPL"); 3134 MODULE_DESCRIPTION("The CAKE shaper."); 3135