// SPDX-License-Identifier: GPL-2.0

/* net/sched/sch_taprio.c	Time Aware Priority Scheduler
 *
 * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
 *
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/sch_generic.h>

static LIST_HEAD(taprio_list);
static DEFINE_SPINLOCK(taprio_list_lock);

#define TAPRIO_ALL_GATES_OPEN -1

struct sched_entry {
	struct list_head list;

	/* The instant that this entry "closes" and the next one
	 * should open; the qdisc will make some effort so that no
	 * packet leaves after this time.
	 */
	ktime_t close_time;
	atomic_t budget;
	int index;
	u32 gate_mask;
	u32 interval;
	u8 command;
};

struct sched_gate_list {
	struct rcu_head rcu;
	struct list_head entries;
	size_t num_entries;
	ktime_t cycle_close_time;
	s64 cycle_time;
	s64 cycle_time_extension;
	s64 base_time;
};

struct taprio_sched {
	struct Qdisc **qdiscs;
	struct Qdisc *root;
	int clockid;
	atomic64_t picos_per_byte; /* Using picoseconds because at 10Gbps+
				    * speeds the time per byte is
				    * sub-nanosecond.
				    */

	/* Protects the update side of the RCU protected current_entry */
	spinlock_t current_entry_lock;
	struct sched_entry __rcu *current_entry;
	struct sched_gate_list __rcu *oper_sched;
	struct sched_gate_list __rcu *admin_sched;
	ktime_t (*get_time)(void);
	struct hrtimer advance_timer;
	struct list_head taprio_list;
};

static ktime_t sched_base_time(const struct sched_gate_list *sched)
{
	if (!sched)
		return KTIME_MAX;

	return ns_to_ktime(sched->base_time);
}

static void taprio_free_sched_cb(struct rcu_head *head)
{
	struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
	struct sched_entry *entry, *n;

	if (!sched)
		return;

	list_for_each_entry_safe(entry, n, &sched->entries, list) {
		list_del(&entry->list);
		kfree(entry);
	}

	kfree(sched);
}

static void switch_schedules(struct taprio_sched *q,
			     struct sched_gate_list **admin,
			     struct sched_gate_list **oper)
{
	rcu_assign_pointer(q->oper_sched, *admin);
	rcu_assign_pointer(q->admin_sched, NULL);

	if (*oper)
		call_rcu(&(*oper)->rcu, taprio_free_sched_cb);

	*oper = *admin;
	*admin = NULL;
}

static ktime_t get_cycle_time(struct sched_gate_list *sched)
{
	struct sched_entry *entry;
	ktime_t cycle = 0;

	if (sched->cycle_time != 0)
		return sched->cycle_time;

	list_for_each_entry(entry, &sched->entries, list)
		cycle = ktime_add_ns(cycle, entry->interval);

	sched->cycle_time = cycle;

	return cycle;
}

static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			  struct sk_buff **to_free)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct Qdisc *child;
	int queue;

	queue = skb_get_queue_mapping(skb);

	child = q->qdiscs[queue];
	if (unlikely(!child))
		return qdisc_drop(skb, sch, to_free);

	qdisc_qstats_backlog_inc(sch, skb);
	sch->q.qlen++;

	return qdisc_enqueue(skb, child, to_free);
}

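/* Descriptive note: peek the head packet of each TX queue's child qdisc,
 * in queue order, and return the first one whose traffic class has its
 * gate open in the current schedule entry. When no entry is active, all
 * gates are treated as open.
 */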
static struct sk_buff *taprio_peek(struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct sched_entry *entry;
	struct sk_buff *skb;
	u32 gate_mask;
	int i;

	rcu_read_lock();
	entry = rcu_dereference(q->current_entry);
	gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
	rcu_read_unlock();

	if (!gate_mask)
		return NULL;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct Qdisc *child = q->qdiscs[i];
		int prio;
		u8 tc;

		if (unlikely(!child))
			continue;

		skb = child->ops->peek(child);
		if (!skb)
			continue;

		prio = skb->priority;
		tc = netdev_get_prio_tc_map(dev, prio);

		if (!(gate_mask & BIT(tc)))
			continue;

		return skb;
	}

	return NULL;
}

static inline int length_to_duration(struct taprio_sched *q, int len)
{
	return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
}

static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
{
	atomic_set(&entry->budget,
		   div64_u64((u64)entry->interval * 1000,
			     atomic64_read(&q->picos_per_byte)));
}

static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct sk_buff *skb = NULL;
	struct sched_entry *entry;
	u32 gate_mask;
	int i;

	if (atomic64_read(&q->picos_per_byte) == -1) {
		WARN_ONCE(1, "taprio: dequeue() called with unknown picos per byte.");
		return NULL;
	}

	rcu_read_lock();
	entry = rcu_dereference(q->current_entry);
	/* if there's no entry, it means that the schedule didn't
	 * start yet, so force all gates to be open, this is in
	 * accordance with IEEE 802.1Qbv-2015 Section 8.6.9.4.5
	 * "AdminGateStates"
	 */
	gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;

	if (!gate_mask)
		goto done;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct Qdisc *child = q->qdiscs[i];
		ktime_t guard;
		int prio;
		int len;
		u8 tc;

		if (unlikely(!child))
			continue;

		skb = child->ops->peek(child);
		if (!skb)
			continue;

		prio = skb->priority;
		tc = netdev_get_prio_tc_map(dev, prio);

		if (!(gate_mask & BIT(tc)))
			continue;

		len = qdisc_pkt_len(skb);
		guard = ktime_add_ns(q->get_time(),
				     length_to_duration(q, len));

		/* In the case that there's no gate entry, there's no
		 * guard band ...
		 */
		if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
		    ktime_after(guard, entry->close_time))
			continue;

		/* ... and no budget. */
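		/* Descriptive note: entry->budget is the number of bytes
		 * that fit in this entry's interval at the current link
		 * speed; see taprio_set_budget().
		 */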
		if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
		    atomic_sub_return(len, &entry->budget) < 0)
			continue;

		skb = child->ops->dequeue(child);
		if (unlikely(!skb))
			goto done;

		qdisc_bstats_update(sch, skb);
		qdisc_qstats_backlog_dec(sch, skb);
		sch->q.qlen--;

		goto done;
	}

done:
	rcu_read_unlock();

	return skb;
}

static bool should_restart_cycle(const struct sched_gate_list *oper,
				 const struct sched_entry *entry)
{
	if (list_is_last(&entry->list, &oper->entries))
		return true;

	if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0)
		return true;

	return false;
}

static bool should_change_schedules(const struct sched_gate_list *admin,
				    const struct sched_gate_list *oper,
				    ktime_t close_time)
{
	ktime_t next_base_time, extension_time;

	if (!admin)
		return false;

	next_base_time = sched_base_time(admin);

	/* This is the simple case: the close_time would fall after
	 * the next schedule base_time.
	 */
	if (ktime_compare(next_base_time, close_time) <= 0)
		return true;

	/* This is the cycle_time_extension case: if the close_time
	 * plus the amount that can be extended would fall after the
	 * next schedule base_time, we can extend the current schedule
	 * by that amount.
	 */
	extension_time = ktime_add_ns(close_time, oper->cycle_time_extension);

	/* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
	 * how precisely the extension should be made. So after
	 * conformance testing, this logic may change.
	 */
	if (ktime_compare(next_base_time, extension_time) <= 0)
		return true;

	return false;
}

static enum hrtimer_restart advance_sched(struct hrtimer *timer)
{
	struct taprio_sched *q = container_of(timer, struct taprio_sched,
					      advance_timer);
	struct sched_gate_list *oper, *admin;
	struct sched_entry *entry, *next;
	struct Qdisc *sch = q->root;
	ktime_t close_time;

	spin_lock(&q->current_entry_lock);
	entry = rcu_dereference_protected(q->current_entry,
					  lockdep_is_held(&q->current_entry_lock));
	oper = rcu_dereference_protected(q->oper_sched,
					 lockdep_is_held(&q->current_entry_lock));
	admin = rcu_dereference_protected(q->admin_sched,
					  lockdep_is_held(&q->current_entry_lock));

	if (!oper)
		switch_schedules(q, &admin, &oper);

	/* This can happen in two cases: 1. this is the very first run
	 * of this function (i.e. we weren't running any schedule
	 * previously); 2. The previous schedule just ended. The first
	 * entry of each schedule is pre-calculated during schedule
	 * initialization.
	 */
	if (unlikely(!entry || entry->close_time == oper->base_time)) {
		next = list_first_entry(&oper->entries, struct sched_entry,
					list);
		close_time = next->close_time;
		goto first_run;
	}

	if (should_restart_cycle(oper, entry)) {
		next = list_first_entry(&oper->entries, struct sched_entry,
					list);
		oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time,
						      oper->cycle_time);
	} else {
		next = list_next_entry(entry, list);
	}

	close_time = ktime_add_ns(entry->close_time, next->interval);
	close_time = min_t(ktime_t, close_time, oper->cycle_close_time);

	if (should_change_schedules(admin, oper, close_time)) {
		/* Set things so the next time this runs, the new
		 * schedule runs.
		 */
		close_time = sched_base_time(admin);
		switch_schedules(q, &admin, &oper);
	}

	next->close_time = close_time;
	taprio_set_budget(q, next);

first_run:
	rcu_assign_pointer(q->current_entry, next);
	spin_unlock(&q->current_entry_lock);

	hrtimer_set_expires(&q->advance_timer, close_time);

	rcu_read_lock();
	__netif_schedule(sch);
	rcu_read_unlock();

	return HRTIMER_RESTART;
}

static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
	[TCA_TAPRIO_SCHED_ENTRY_INDEX]	   = { .type = NLA_U32 },
	[TCA_TAPRIO_SCHED_ENTRY_CMD]	   = { .type = NLA_U8 },
	[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
	[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]  = { .type = NLA_U32 },
};

static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = {
	[TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED },
};

static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
	[TCA_TAPRIO_ATTR_PRIOMAP] = {
		.len = sizeof(struct tc_mqprio_qopt)
	},
	[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]           = { .type = NLA_NESTED },
	[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]            = { .type = NLA_S64 },
	[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]         = { .type = NLA_NESTED },
	[TCA_TAPRIO_ATTR_SCHED_CLOCKID]              = { .type = NLA_S32 },
	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]           = { .type = NLA_S64 },
	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
};

static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
			    struct netlink_ext_ack *extack)
{
	u32 interval = 0;

	if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
		entry->command = nla_get_u8(
			tb[TCA_TAPRIO_SCHED_ENTRY_CMD]);

	if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK])
		entry->gate_mask = nla_get_u32(
			tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]);

	if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL])
		interval = nla_get_u32(
			tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);

	if (interval == 0) {
		NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
		return -EINVAL;
	}

	entry->interval = interval;

	return 0;
}

static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
			     int index, struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
					  entry_policy, NULL);
	if (err < 0) {
		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
		return -EINVAL;
	}

	entry->index = index;

	return fill_sched_entry(tb, entry, extack);
}

static int parse_sched_list(struct nlattr *list,
			    struct sched_gate_list *sched,
			    struct netlink_ext_ack *extack)
{
	struct nlattr *n;
	int err, rem;
	int i = 0;

	if (!list)
		return -EINVAL;

	nla_for_each_nested(n, list, rem) {
		struct sched_entry *entry;

		if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) {
			NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
			continue;
		}

		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry) {
			NL_SET_ERR_MSG(extack, "Not enough memory for entry");
			return -ENOMEM;
		}

		err = parse_sched_entry(n, entry, i, extack);
		if (err < 0) {
			kfree(entry);
			return err;
		}

		list_add_tail(&entry->list, &sched->entries);
		i++;
	}

	sched->num_entries = i;

	return i;
}

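/* Descriptive note: parse the netlink attributes that describe a complete
 * schedule: base-time, cycle-time, cycle-time-extension and the list of
 * gate control entries.
 */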
static int parse_taprio_schedule(struct nlattr **tb,
				 struct sched_gate_list *new,
				 struct netlink_ext_ack *extack)
{
	int err = 0;

	if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) {
		NL_SET_ERR_MSG(extack, "Adding a single entry is not supported");
		return -ENOTSUPP;
	}

	if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
		new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);

	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION])
		new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]);

	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME])
		new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]);

	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
		err = parse_sched_list(
			tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack);
	if (err < 0)
		return err;

	return 0;
}

static int taprio_parse_mqprio_opt(struct net_device *dev,
				   struct tc_mqprio_qopt *qopt,
				   struct netlink_ext_ack *extack)
{
	int i, j;

	if (!qopt && !dev->num_tc) {
		NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
		return -EINVAL;
	}

	/* If num_tc is already set, it means that the user already
	 * configured the mqprio part
	 */
	if (dev->num_tc)
		return 0;

	/* Verify num_tc is not out of max range */
	if (qopt->num_tc > TC_MAX_QUEUE) {
		NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
		return -EINVAL;
	}

	/* taprio imposes that traffic classes map 1:n to tx queues */
	if (qopt->num_tc > dev->num_tx_queues) {
		NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
		return -EINVAL;
	}

	/* Verify priority mapping uses valid tcs */
	for (i = 0; i < TC_BITMASK + 1; i++) {
		if (qopt->prio_tc_map[i] >= qopt->num_tc) {
			NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
			return -EINVAL;
		}
	}

	for (i = 0; i < qopt->num_tc; i++) {
		unsigned int last = qopt->offset[i] + qopt->count[i];

		/* Verify the queue count is within the tx range; 'last'
		 * being equal to real_num_tx_queues indicates that the
		 * last queue is in use.
		 */
		if (qopt->offset[i] >= dev->num_tx_queues ||
		    !qopt->count[i] ||
		    last > dev->real_num_tx_queues) {
			NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
			return -EINVAL;
		}

		/* Verify that the offset and counts do not overlap */
		for (j = i + 1; j < qopt->num_tc; j++) {
			if (last > qopt->offset[j]) {
				NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
				return -EINVAL;
			}
		}
	}

	return 0;
}

static int taprio_get_start_time(struct Qdisc *sch,
				 struct sched_gate_list *sched,
				 ktime_t *start)
{
	struct taprio_sched *q = qdisc_priv(sch);
	ktime_t now, base, cycle;
	s64 n;

	base = sched_base_time(sched);
	now = q->get_time();

	if (ktime_after(base, now)) {
		*start = base;
		return 0;
	}

	cycle = get_cycle_time(sched);

	/* The qdisc is expected to have at least one sched_entry. Moreover,
	 * any entry must have 'interval' > 0. Thus if the cycle time is zero,
	 * something went really wrong. In that case, we should warn about this
	 * inconsistent state and return an error.
	 */
	if (WARN_ON(!cycle))
		return -EFAULT;

	/* Schedule the start time for the beginning of the next
	 * cycle.
	 */
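	/* Illustrative example (made-up values): with base = 1000ns,
	 * cycle = 300ns and now = 1700ns, n = (1700 - 1000) / 300 = 2,
	 * so the schedule is started at 1000 + 3 * 300 = 1900ns, the
	 * first cycle boundary after 'now'.
	 */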
	n = div64_s64(ktime_sub_ns(now, base), cycle);
	*start = ktime_add_ns(base, (n + 1) * cycle);
	return 0;
}

static void setup_first_close_time(struct taprio_sched *q,
				   struct sched_gate_list *sched, ktime_t base)
{
	struct sched_entry *first;
	ktime_t cycle;

	first = list_first_entry(&sched->entries,
				 struct sched_entry, list);

	cycle = get_cycle_time(sched);

	/* FIXME: find a better place to do this */
	sched->cycle_close_time = ktime_add_ns(base, cycle);

	first->close_time = ktime_add_ns(base, first->interval);
	taprio_set_budget(q, first);
	rcu_assign_pointer(q->current_entry, NULL);
}

static void taprio_start_sched(struct Qdisc *sch,
			       ktime_t start, struct sched_gate_list *new)
{
	struct taprio_sched *q = qdisc_priv(sch);
	ktime_t expires;

	expires = hrtimer_get_expires(&q->advance_timer);
	if (expires == 0)
		expires = KTIME_MAX;

	/* If the new schedule starts before the next expiration,
	 * reprogram the timer to the earlier of the two, so that the
	 * admin schedule replaces the operational one at the right
	 * time.
	 */
	start = min_t(ktime_t, start, expires);

	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
}

static void taprio_set_picos_per_byte(struct net_device *dev,
				      struct taprio_sched *q)
{
	struct ethtool_link_ksettings ecmd;
	int picos_per_byte = -1;

	if (!__ethtool_get_link_ksettings(dev, &ecmd) &&
	    ecmd.base.speed != SPEED_UNKNOWN)
		picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
					   ecmd.base.speed * 1000 * 1000);

	atomic64_set(&q->picos_per_byte, picos_per_byte);
	netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
		   dev->name, (long long)atomic64_read(&q->picos_per_byte),
		   ecmd.base.speed);
}

static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
			       void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net_device *qdev;
	struct taprio_sched *q;
	bool found = false;

	ASSERT_RTNL();

	if (event != NETDEV_UP && event != NETDEV_CHANGE)
		return NOTIFY_DONE;

	spin_lock(&taprio_list_lock);
	list_for_each_entry(q, &taprio_list, taprio_list) {
		qdev = qdisc_dev(q->root);
		if (qdev == dev) {
			found = true;
			break;
		}
	}
	spin_unlock(&taprio_list_lock);

	if (found)
		taprio_set_picos_per_byte(dev, q);

	return NOTIFY_DONE;
}

static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
			 struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
	struct sched_gate_list *oper, *admin, *new_admin;
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct tc_mqprio_qopt *mqprio = NULL;
	int i, err, clockid;
	unsigned long flags;
	ktime_t start;

	err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
					  taprio_policy, extack);
	if (err < 0)
		return err;

	if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
		mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);

	err = taprio_parse_mqprio_opt(dev, mqprio, extack);
	if (err < 0)
		return err;

	new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
	if (!new_admin) {
		NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
		return -ENOMEM;
	}
	INIT_LIST_HEAD(&new_admin->entries);

	rcu_read_lock();
	oper = rcu_dereference(q->oper_sched);
	admin = rcu_dereference(q->admin_sched);
	rcu_read_unlock();

	if (mqprio && (oper || admin)) {
		NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
		err = -ENOTSUPP;
		goto free_sched;
	}

	err = parse_taprio_schedule(tb, new_admin, extack);
	if (err < 0)
		goto free_sched;

	if (new_admin->num_entries == 0) {
		NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule");
		err = -EINVAL;
		goto free_sched;
	}

	if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
		clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);

		/* We only support static clockids and we don't allow
		 * the clockid to be modified after the first init.
		 */
		if (clockid < 0 ||
		    (q->clockid != -1 && q->clockid != clockid)) {
			NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported");
			err = -ENOTSUPP;
			goto free_sched;
		}

		q->clockid = clockid;
	}

	if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
		NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
		err = -EINVAL;
		goto free_sched;
	}

	taprio_set_picos_per_byte(dev, q);

	/* Protects against enqueue()/dequeue() */
	spin_lock_bh(qdisc_lock(sch));

	if (!hrtimer_active(&q->advance_timer)) {
		hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
		q->advance_timer.function = advance_sched;
	}

	if (mqprio) {
		netdev_set_num_tc(dev, mqprio->num_tc);
		for (i = 0; i < mqprio->num_tc; i++)
			netdev_set_tc_queue(dev, i,
					    mqprio->count[i],
					    mqprio->offset[i]);

		/* Always use supplied priority mappings */
		for (i = 0; i < TC_BITMASK + 1; i++)
			netdev_set_prio_tc_map(dev, i,
					       mqprio->prio_tc_map[i]);
	}

	switch (q->clockid) {
	case CLOCK_REALTIME:
		q->get_time = ktime_get_real;
		break;
	case CLOCK_MONOTONIC:
		q->get_time = ktime_get;
		break;
	case CLOCK_BOOTTIME:
		q->get_time = ktime_get_boottime;
		break;
	case CLOCK_TAI:
		q->get_time = ktime_get_clocktai;
		break;
	default:
		NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
		err = -EINVAL;
		goto unlock;
	}

	err = taprio_get_start_time(sch, new_admin, &start);
	if (err < 0) {
		NL_SET_ERR_MSG(extack, "Internal error: failed get start time");
		goto unlock;
	}

	setup_first_close_time(q, new_admin, start);

	/* Protects against advance_sched() */
	spin_lock_irqsave(&q->current_entry_lock, flags);

	taprio_start_sched(sch, start, new_admin);

	rcu_assign_pointer(q->admin_sched, new_admin);
	if (admin)
		call_rcu(&admin->rcu, taprio_free_sched_cb);
	new_admin = NULL;

	spin_unlock_irqrestore(&q->current_entry_lock, flags);

	err = 0;

unlock:
	spin_unlock_bh(qdisc_lock(sch));

free_sched:
	kfree(new_admin);

	return err;
}

static void taprio_destroy(struct Qdisc *sch)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	unsigned int i;

	spin_lock(&taprio_list_lock);
	list_del(&q->taprio_list);
	spin_unlock(&taprio_list_lock);

	hrtimer_cancel(&q->advance_timer);

	if (q->qdiscs) {
		for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
			qdisc_put(q->qdiscs[i]);

		kfree(q->qdiscs);
	}
	q->qdiscs = NULL;

	netdev_set_num_tc(dev, 0);

	if (q->oper_sched)
		call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb);

	if (q->admin_sched)
		call_rcu(&q->admin_sched->rcu, taprio_free_sched_cb);
}

static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
		       struct netlink_ext_ack *extack)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	int i;

	spin_lock_init(&q->current_entry_lock);

	hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
	q->advance_timer.function = advance_sched;

	q->root = sch;

	/* We only support static clockids. Use an invalid value as default
	 * and get the valid one on taprio_change().
	 */
	q->clockid = -1;

	if (sch->parent != TC_H_ROOT)
		return -EOPNOTSUPP;

	if (!netif_is_multiqueue(dev))
		return -EOPNOTSUPP;

	/* pre-allocate qdisc, attachment can't fail */
	q->qdiscs = kcalloc(dev->num_tx_queues,
			    sizeof(q->qdiscs[0]),
			    GFP_KERNEL);

	if (!q->qdiscs)
		return -ENOMEM;

	if (!opt)
		return -EINVAL;

	spin_lock(&taprio_list_lock);
	list_add(&q->taprio_list, &taprio_list);
	spin_unlock(&taprio_list_lock);

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		struct Qdisc *qdisc;

		dev_queue = netdev_get_tx_queue(dev, i);
		qdisc = qdisc_create_dflt(dev_queue,
					  &pfifo_qdisc_ops,
					  TC_H_MAKE(TC_H_MAJ(sch->handle),
						    TC_H_MIN(i + 1)),
					  extack);
		if (!qdisc)
			return -ENOMEM;

		if (i < dev->real_num_tx_queues)
			qdisc_hash_add(qdisc, false);

		q->qdiscs[i] = qdisc;
	}

	return taprio_change(sch, opt, extack);
}

static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
					     unsigned long cl)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long ntx = cl - 1;

	if (ntx >= dev->num_tx_queues)
		return NULL;

	return netdev_get_tx_queue(dev, ntx);
}

static int taprio_graft(struct Qdisc *sch, unsigned long cl,
			struct Qdisc *new, struct Qdisc **old,
			struct netlink_ext_ack *extack)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);

	if (!dev_queue)
		return -EINVAL;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	*old = q->qdiscs[cl - 1];
	q->qdiscs[cl - 1] = new;

	if (new)
		new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return 0;
}

static int dump_entry(struct sk_buff *msg,
		      const struct sched_entry *entry)
{
	struct nlattr *item;

	item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY);
	if (!item)
		return -ENOSPC;

	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index))
		goto nla_put_failure;

	if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command))
		goto nla_put_failure;

	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
			entry->gate_mask))
		goto nla_put_failure;

	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
			entry->interval))
		goto nla_put_failure;

	return nla_nest_end(msg, item);

nla_put_failure:
	nla_nest_cancel(msg, item);
	return -1;
}

static int dump_schedule(struct sk_buff *msg,
			 const struct sched_gate_list *root)
{
	struct nlattr *entry_list;
	struct sched_entry *entry;

	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
			root->base_time, TCA_TAPRIO_PAD))
		return -1;

	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME,
			root->cycle_time, TCA_TAPRIO_PAD))
		return -1;

	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION,
			root->cycle_time_extension, TCA_TAPRIO_PAD))
		return -1;

	entry_list = nla_nest_start_noflag(msg,
					   TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
	if (!entry_list)
		goto error_nest;

	list_for_each_entry(entry, &root->entries, list) {
		if (dump_entry(msg, entry) < 0)
			goto error_nest;
	}

	nla_nest_end(msg, entry_list);
	return 0;

error_nest:
	nla_nest_cancel(msg, entry_list);
	return -1;
}

static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct taprio_sched *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct sched_gate_list *oper, *admin;
	struct tc_mqprio_qopt opt = { 0 };
	struct nlattr *nest, *sched_nest;
	unsigned int i;

	rcu_read_lock();
	oper = rcu_dereference(q->oper_sched);
	admin = rcu_dereference(q->admin_sched);

	opt.num_tc = netdev_get_num_tc(dev);
	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));

	for (i = 0; i < netdev_get_num_tc(dev); i++) {
		opt.count[i] = dev->tc_to_txq[i].count;
		opt.offset[i] = dev->tc_to_txq[i].offset;
	}

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (!nest)
		goto start_error;

	if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
		goto options_error;

	if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
		goto options_error;

	if (oper && dump_schedule(skb, oper))
		goto options_error;

	if (!admin)
		goto done;

	sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED);
	if (!sched_nest)
		goto options_error;

	if (dump_schedule(skb, admin))
		goto admin_error;

	nla_nest_end(skb, sched_nest);

done:
	rcu_read_unlock();

	return nla_nest_end(skb, nest);

admin_error:
	nla_nest_cancel(skb, sched_nest);

options_error:
	nla_nest_cancel(skb, nest);

start_error:
	rcu_read_unlock();
	return -ENOSPC;
}

static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
{
	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);

	if (!dev_queue)
		return NULL;

	return dev_queue->qdisc_sleeping;
}

static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
{
	unsigned int ntx = TC_H_MIN(classid);

	if (!taprio_queue_get(sch, ntx))
		return 0;
	return ntx;
}

static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
			     struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);

	tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle |= TC_H_MIN(cl);
	tcm->tcm_info = dev_queue->qdisc_sleeping->handle;

	return 0;
}

static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				   struct gnet_dump *d)
	__releases(d->lock)
	__acquires(d->lock)
{
	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);

	sch = dev_queue->qdisc_sleeping;
	if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
	    qdisc_qstats_copy(d, sch) < 0)
		return -1;
	return 0;
}

static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long ntx;

	if (arg->stop)
		return;

	arg->count = arg->skip;
	for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
		if (arg->fn(sch, ntx + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static struct netdev_queue *taprio_select_queue(struct Qdisc *sch,
						struct tcmsg *tcm)
{
	return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
}

static const struct Qdisc_class_ops taprio_class_ops = {
	.graft		= taprio_graft,
	.leaf		= taprio_leaf,
	.find		= taprio_find,
	.walk		= taprio_walk,
	.dump		= taprio_dump_class,
	.dump_stats	= taprio_dump_class_stats,
	.select_queue	= taprio_select_queue,
};

static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
	.cl_ops		= &taprio_class_ops,
	.id		= "taprio",
	.priv_size	= sizeof(struct taprio_sched),
	.init		= taprio_init,
	.change		= taprio_change,
	.destroy	= taprio_destroy,
	.peek		= taprio_peek,
	.dequeue	= taprio_dequeue,
	.enqueue	= taprio_enqueue,
	.dump		= taprio_dump,
	.owner		= THIS_MODULE,
};

static struct notifier_block taprio_device_notifier = {
	.notifier_call = taprio_dev_notifier,
};

static int __init taprio_module_init(void)
{
	int err = register_netdevice_notifier(&taprio_device_notifier);

	if (err)
		return err;

	return register_qdisc(&taprio_qdisc_ops);
}

static void __exit taprio_module_exit(void)
{
	unregister_qdisc(&taprio_qdisc_ops);
	unregister_netdevice_notifier(&taprio_device_notifier);
}

module_init(taprio_module_init);
module_exit(taprio_module_exit);
MODULE_LICENSE("GPL");
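
/* Example configuration (illustrative; the exact command-line syntax is
 * provided by the 'tc' tool from iproute2 and may differ between versions):
 *
 *   tc qdisc replace dev eth0 parent root handle 100 taprio \
 *       num_tc 3 \
 *       map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
 *       queues 1@0 1@1 2@2 \
 *       base-time 1528743495910289987 \
 *       sched-entry S 01 300000 \
 *       sched-entry S 02 300000 \
 *       sched-entry S 04 400000 \
 *       clockid CLOCK_TAI
 *
 * This maps priorities to 3 traffic classes, assigns those traffic classes
 * to dedicated TX queues and installs a schedule with three gate control
 * entries ('S' sets the gate states to the given bitmask for the given
 * interval in nanoseconds).
 */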