1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 4 * policies) 5 */ 6 7 int sched_rr_timeslice = RR_TIMESLICE; 8 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; 9 /* More than 4 hours if BW_SHIFT equals 20. */ 10 static const u64 max_rt_runtime = MAX_BW; 11 12 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 13 14 struct rt_bandwidth def_rt_bandwidth; 15 16 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 17 { 18 struct rt_bandwidth *rt_b = 19 container_of(timer, struct rt_bandwidth, rt_period_timer); 20 int idle = 0; 21 int overrun; 22 23 raw_spin_lock(&rt_b->rt_runtime_lock); 24 for (;;) { 25 overrun = hrtimer_forward_now(timer, rt_b->rt_period); 26 if (!overrun) 27 break; 28 29 raw_spin_unlock(&rt_b->rt_runtime_lock); 30 idle = do_sched_rt_period_timer(rt_b, overrun); 31 raw_spin_lock(&rt_b->rt_runtime_lock); 32 } 33 if (idle) 34 rt_b->rt_period_active = 0; 35 raw_spin_unlock(&rt_b->rt_runtime_lock); 36 37 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 38 } 39 40 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 41 { 42 rt_b->rt_period = ns_to_ktime(period); 43 rt_b->rt_runtime = runtime; 44 45 raw_spin_lock_init(&rt_b->rt_runtime_lock); 46 47 hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, 48 HRTIMER_MODE_REL_HARD); 49 rt_b->rt_period_timer.function = sched_rt_period_timer; 50 } 51 52 static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) 53 { 54 raw_spin_lock(&rt_b->rt_runtime_lock); 55 if (!rt_b->rt_period_active) { 56 rt_b->rt_period_active = 1; 57 /* 58 * SCHED_DEADLINE updates the bandwidth, as a run away 59 * RT task with a DL task could hog a CPU. But DL does 60 * not reset the period. If a deadline task was running 61 * without an RT task running, it can cause RT tasks to 62 * throttle when they start up. Kick the timer right away 63 * to update the period. 
64 */ 65 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); 66 hrtimer_start_expires(&rt_b->rt_period_timer, 67 HRTIMER_MODE_ABS_PINNED_HARD); 68 } 69 raw_spin_unlock(&rt_b->rt_runtime_lock); 70 } 71 72 static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 73 { 74 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 75 return; 76 77 do_start_rt_bandwidth(rt_b); 78 } 79 80 void init_rt_rq(struct rt_rq *rt_rq) 81 { 82 struct rt_prio_array *array; 83 int i; 84 85 array = &rt_rq->active; 86 for (i = 0; i < MAX_RT_PRIO; i++) { 87 INIT_LIST_HEAD(array->queue + i); 88 __clear_bit(i, array->bitmap); 89 } 90 /* delimiter for bitsearch: */ 91 __set_bit(MAX_RT_PRIO, array->bitmap); 92 93 #if defined CONFIG_SMP 94 rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 95 rt_rq->highest_prio.next = MAX_RT_PRIO-1; 96 rt_rq->rt_nr_migratory = 0; 97 rt_rq->overloaded = 0; 98 plist_head_init(&rt_rq->pushable_tasks); 99 #endif /* CONFIG_SMP */ 100 /* We start is dequeued state, because no RT tasks are queued */ 101 rt_rq->rt_queued = 0; 102 103 rt_rq->rt_time = 0; 104 rt_rq->rt_throttled = 0; 105 rt_rq->rt_runtime = 0; 106 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 107 } 108 109 #ifdef CONFIG_RT_GROUP_SCHED 110 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 111 { 112 hrtimer_cancel(&rt_b->rt_period_timer); 113 } 114 115 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 116 117 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 118 { 119 #ifdef CONFIG_SCHED_DEBUG 120 WARN_ON_ONCE(!rt_entity_is_task(rt_se)); 121 #endif 122 return container_of(rt_se, struct task_struct, rt); 123 } 124 125 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 126 { 127 return rt_rq->rq; 128 } 129 130 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 131 { 132 return rt_se->rt_rq; 133 } 134 135 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) 136 { 137 struct rt_rq *rt_rq = rt_se->rt_rq; 138 139 return rt_rq->rq; 140 } 141 142 void unregister_rt_sched_group(struct task_group *tg) 143 { 144 if (tg->rt_se) 145 destroy_rt_bandwidth(&tg->rt_bandwidth); 146 147 } 148 149 void free_rt_sched_group(struct task_group *tg) 150 { 151 int i; 152 153 for_each_possible_cpu(i) { 154 if (tg->rt_rq) 155 kfree(tg->rt_rq[i]); 156 if (tg->rt_se) 157 kfree(tg->rt_se[i]); 158 } 159 160 kfree(tg->rt_rq); 161 kfree(tg->rt_se); 162 } 163 164 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 165 struct sched_rt_entity *rt_se, int cpu, 166 struct sched_rt_entity *parent) 167 { 168 struct rq *rq = cpu_rq(cpu); 169 170 rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 171 rt_rq->rt_nr_boosted = 0; 172 rt_rq->rq = rq; 173 rt_rq->tg = tg; 174 175 tg->rt_rq[cpu] = rt_rq; 176 tg->rt_se[cpu] = rt_se; 177 178 if (!rt_se) 179 return; 180 181 if (!parent) 182 rt_se->rt_rq = &rq->rt; 183 else 184 rt_se->rt_rq = parent->my_q; 185 186 rt_se->my_q = rt_rq; 187 rt_se->parent = parent; 188 INIT_LIST_HEAD(&rt_se->run_list); 189 } 190 191 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 192 { 193 struct rt_rq *rt_rq; 194 struct sched_rt_entity *rt_se; 195 int i; 196 197 tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); 198 if (!tg->rt_rq) 199 goto err; 200 tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL); 201 if (!tg->rt_se) 202 goto err; 203 204 init_rt_bandwidth(&tg->rt_bandwidth, 205 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 206 207 for_each_possible_cpu(i) { 208 rt_rq = kzalloc_node(sizeof(struct rt_rq), 209 
GFP_KERNEL, cpu_to_node(i)); 210 if (!rt_rq) 211 goto err; 212 213 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 214 GFP_KERNEL, cpu_to_node(i)); 215 if (!rt_se) 216 goto err_free_rq; 217 218 init_rt_rq(rt_rq); 219 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 220 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 221 } 222 223 return 1; 224 225 err_free_rq: 226 kfree(rt_rq); 227 err: 228 return 0; 229 } 230 231 #else /* CONFIG_RT_GROUP_SCHED */ 232 233 #define rt_entity_is_task(rt_se) (1) 234 235 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 236 { 237 return container_of(rt_se, struct task_struct, rt); 238 } 239 240 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 241 { 242 return container_of(rt_rq, struct rq, rt); 243 } 244 245 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) 246 { 247 struct task_struct *p = rt_task_of(rt_se); 248 249 return task_rq(p); 250 } 251 252 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 253 { 254 struct rq *rq = rq_of_rt_se(rt_se); 255 256 return &rq->rt; 257 } 258 259 void unregister_rt_sched_group(struct task_group *tg) { } 260 261 void free_rt_sched_group(struct task_group *tg) { } 262 263 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 264 { 265 return 1; 266 } 267 #endif /* CONFIG_RT_GROUP_SCHED */ 268 269 #ifdef CONFIG_SMP 270 271 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) 272 { 273 /* Try to pull RT tasks here if we lower this rq's prio */ 274 return rq->online && rq->rt.highest_prio.curr > prev->prio; 275 } 276 277 static inline int rt_overloaded(struct rq *rq) 278 { 279 return atomic_read(&rq->rd->rto_count); 280 } 281 282 static inline void rt_set_overload(struct rq *rq) 283 { 284 if (!rq->online) 285 return; 286 287 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); 288 /* 289 * Make sure the mask is visible before we set 290 * the overload count. That is checked to determine 291 * if we should look at the mask. It would be a shame 292 * if we looked at the mask, but the mask was not 293 * updated yet. 294 * 295 * Matched by the barrier in pull_rt_task(). 
296 */ 297 smp_wmb(); 298 atomic_inc(&rq->rd->rto_count); 299 } 300 301 static inline void rt_clear_overload(struct rq *rq) 302 { 303 if (!rq->online) 304 return; 305 306 /* the order here really doesn't matter */ 307 atomic_dec(&rq->rd->rto_count); 308 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); 309 } 310 311 static void update_rt_migration(struct rt_rq *rt_rq) 312 { 313 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { 314 if (!rt_rq->overloaded) { 315 rt_set_overload(rq_of_rt_rq(rt_rq)); 316 rt_rq->overloaded = 1; 317 } 318 } else if (rt_rq->overloaded) { 319 rt_clear_overload(rq_of_rt_rq(rt_rq)); 320 rt_rq->overloaded = 0; 321 } 322 } 323 324 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 325 { 326 struct task_struct *p; 327 328 if (!rt_entity_is_task(rt_se)) 329 return; 330 331 p = rt_task_of(rt_se); 332 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 333 334 rt_rq->rt_nr_total++; 335 if (p->nr_cpus_allowed > 1) 336 rt_rq->rt_nr_migratory++; 337 338 update_rt_migration(rt_rq); 339 } 340 341 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 342 { 343 struct task_struct *p; 344 345 if (!rt_entity_is_task(rt_se)) 346 return; 347 348 p = rt_task_of(rt_se); 349 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 350 351 rt_rq->rt_nr_total--; 352 if (p->nr_cpus_allowed > 1) 353 rt_rq->rt_nr_migratory--; 354 355 update_rt_migration(rt_rq); 356 } 357 358 static inline int has_pushable_tasks(struct rq *rq) 359 { 360 return !plist_head_empty(&rq->rt.pushable_tasks); 361 } 362 363 static DEFINE_PER_CPU(struct callback_head, rt_push_head); 364 static DEFINE_PER_CPU(struct callback_head, rt_pull_head); 365 366 static void push_rt_tasks(struct rq *); 367 static void pull_rt_task(struct rq *); 368 369 static inline void rt_queue_push_tasks(struct rq *rq) 370 { 371 if (!has_pushable_tasks(rq)) 372 return; 373 374 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); 375 } 376 377 static inline void rt_queue_pull_task(struct rq *rq) 378 { 379 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); 380 } 381 382 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 383 { 384 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 385 plist_node_init(&p->pushable_tasks, p->prio); 386 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 387 388 /* Update the highest prio pushable task */ 389 if (p->prio < rq->rt.highest_prio.next) 390 rq->rt.highest_prio.next = p->prio; 391 } 392 393 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 394 { 395 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 396 397 /* Update the new highest prio pushable task */ 398 if (has_pushable_tasks(rq)) { 399 p = plist_first_entry(&rq->rt.pushable_tasks, 400 struct task_struct, pushable_tasks); 401 rq->rt.highest_prio.next = p->prio; 402 } else { 403 rq->rt.highest_prio.next = MAX_RT_PRIO-1; 404 } 405 } 406 407 #else 408 409 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 410 { 411 } 412 413 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 414 { 415 } 416 417 static inline 418 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 419 { 420 } 421 422 static inline 423 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 424 { 425 } 426 427 static inline void rt_queue_push_tasks(struct rq *rq) 428 { 429 } 430 #endif /* CONFIG_SMP */ 431 432 static void enqueue_top_rt_rq(struct rt_rq *rt_rq); 433 static void 
dequeue_top_rt_rq(struct rt_rq *rt_rq); 434 435 static inline int on_rt_rq(struct sched_rt_entity *rt_se) 436 { 437 return rt_se->on_rq; 438 } 439 440 #ifdef CONFIG_UCLAMP_TASK 441 /* 442 * Verify the fitness of task @p to run on @cpu taking into account the uclamp 443 * settings. 444 * 445 * This check is only important for heterogeneous systems where uclamp_min value 446 * is higher than the capacity of a @cpu. For non-heterogeneous system this 447 * function will always return true. 448 * 449 * The function will return true if the capacity of the @cpu is >= the 450 * uclamp_min and false otherwise. 451 * 452 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min 453 * > uclamp_max. 454 */ 455 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) 456 { 457 unsigned int min_cap; 458 unsigned int max_cap; 459 unsigned int cpu_cap; 460 461 /* Only heterogeneous systems can benefit from this check */ 462 if (!static_branch_unlikely(&sched_asym_cpucapacity)) 463 return true; 464 465 min_cap = uclamp_eff_value(p, UCLAMP_MIN); 466 max_cap = uclamp_eff_value(p, UCLAMP_MAX); 467 468 cpu_cap = capacity_orig_of(cpu); 469 470 return cpu_cap >= min(min_cap, max_cap); 471 } 472 #else 473 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) 474 { 475 return true; 476 } 477 #endif 478 479 #ifdef CONFIG_RT_GROUP_SCHED 480 481 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 482 { 483 if (!rt_rq->tg) 484 return RUNTIME_INF; 485 486 return rt_rq->rt_runtime; 487 } 488 489 static inline u64 sched_rt_period(struct rt_rq *rt_rq) 490 { 491 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 492 } 493 494 typedef struct task_group *rt_rq_iter_t; 495 496 static inline struct task_group *next_task_group(struct task_group *tg) 497 { 498 do { 499 tg = list_entry_rcu(tg->list.next, 500 typeof(struct task_group), list); 501 } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); 502 503 if (&tg->list == &task_groups) 504 tg = NULL; 505 506 return tg; 507 } 508 509 #define for_each_rt_rq(rt_rq, iter, rq) \ 510 for (iter = container_of(&task_groups, typeof(*iter), list); \ 511 (iter = next_task_group(iter)) && \ 512 (rt_rq = iter->rt_rq[cpu_of(rq)]);) 513 514 #define for_each_sched_rt_entity(rt_se) \ 515 for (; rt_se; rt_se = rt_se->parent) 516 517 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 518 { 519 return rt_se->my_q; 520 } 521 522 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); 523 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); 524 525 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 526 { 527 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 528 struct rq *rq = rq_of_rt_rq(rt_rq); 529 struct sched_rt_entity *rt_se; 530 531 int cpu = cpu_of(rq); 532 533 rt_se = rt_rq->tg->rt_se[cpu]; 534 535 if (rt_rq->rt_nr_running) { 536 if (!rt_se) 537 enqueue_top_rt_rq(rt_rq); 538 else if (!on_rt_rq(rt_se)) 539 enqueue_rt_entity(rt_se, 0); 540 541 if (rt_rq->highest_prio.curr < curr->prio) 542 resched_curr(rq); 543 } 544 } 545 546 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 547 { 548 struct sched_rt_entity *rt_se; 549 int cpu = cpu_of(rq_of_rt_rq(rt_rq)); 550 551 rt_se = rt_rq->tg->rt_se[cpu]; 552 553 if (!rt_se) { 554 dequeue_top_rt_rq(rt_rq); 555 /* Kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ 556 cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); 557 } 558 else if (on_rt_rq(rt_se)) 559 dequeue_rt_entity(rt_se, 0); 560 } 561 562 static inline int rt_rq_throttled(struct rt_rq *rt_rq) 563 { 564 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; 565 } 566 567 static int rt_se_boosted(struct sched_rt_entity *rt_se) 568 { 569 struct rt_rq *rt_rq = group_rt_rq(rt_se); 570 struct task_struct *p; 571 572 if (rt_rq) 573 return !!rt_rq->rt_nr_boosted; 574 575 p = rt_task_of(rt_se); 576 return p->prio != p->normal_prio; 577 } 578 579 #ifdef CONFIG_SMP 580 static inline const struct cpumask *sched_rt_period_mask(void) 581 { 582 return this_rq()->rd->span; 583 } 584 #else 585 static inline const struct cpumask *sched_rt_period_mask(void) 586 { 587 return cpu_online_mask; 588 } 589 #endif 590 591 static inline 592 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 593 { 594 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; 595 } 596 597 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) 598 { 599 return &rt_rq->tg->rt_bandwidth; 600 } 601 602 #else /* !CONFIG_RT_GROUP_SCHED */ 603 604 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 605 { 606 return rt_rq->rt_runtime; 607 } 608 609 static inline u64 sched_rt_period(struct rt_rq *rt_rq) 610 { 611 return ktime_to_ns(def_rt_bandwidth.rt_period); 612 } 613 614 typedef struct rt_rq *rt_rq_iter_t; 615 616 #define for_each_rt_rq(rt_rq, iter, rq) \ 617 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 618 619 #define for_each_sched_rt_entity(rt_se) \ 620 for (; rt_se; rt_se = NULL) 621 622 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 623 { 624 return NULL; 625 } 626 627 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 628 { 629 struct rq *rq = rq_of_rt_rq(rt_rq); 630 631 if (!rt_rq->rt_nr_running) 632 return; 633 634 enqueue_top_rt_rq(rt_rq); 635 resched_curr(rq); 636 } 637 638 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 639 { 640 dequeue_top_rt_rq(rt_rq); 641 } 642 643 static inline int rt_rq_throttled(struct rt_rq *rt_rq) 644 { 645 return rt_rq->rt_throttled; 646 } 647 648 static inline const struct cpumask *sched_rt_period_mask(void) 649 { 650 return cpu_online_mask; 651 } 652 653 static inline 654 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 655 { 656 return &cpu_rq(cpu)->rt; 657 } 658 659 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) 660 { 661 return &def_rt_bandwidth; 662 } 663 664 #endif /* CONFIG_RT_GROUP_SCHED */ 665 666 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) 667 { 668 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 669 670 return (hrtimer_active(&rt_b->rt_period_timer) || 671 rt_rq->rt_time < rt_b->rt_runtime); 672 } 673 674 #ifdef CONFIG_SMP 675 /* 676 * We ran out of runtime, see if we can borrow some from our neighbours. 
677 */ 678 static void do_balance_runtime(struct rt_rq *rt_rq) 679 { 680 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 681 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; 682 int i, weight; 683 u64 rt_period; 684 685 weight = cpumask_weight(rd->span); 686 687 raw_spin_lock(&rt_b->rt_runtime_lock); 688 rt_period = ktime_to_ns(rt_b->rt_period); 689 for_each_cpu(i, rd->span) { 690 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 691 s64 diff; 692 693 if (iter == rt_rq) 694 continue; 695 696 raw_spin_lock(&iter->rt_runtime_lock); 697 /* 698 * Either all rqs have inf runtime and there's nothing to steal 699 * or __disable_runtime() below sets a specific rq to inf to 700 * indicate its been disabled and disallow stealing. 701 */ 702 if (iter->rt_runtime == RUNTIME_INF) 703 goto next; 704 705 /* 706 * From runqueues with spare time, take 1/n part of their 707 * spare time, but no more than our period. 708 */ 709 diff = iter->rt_runtime - iter->rt_time; 710 if (diff > 0) { 711 diff = div_u64((u64)diff, weight); 712 if (rt_rq->rt_runtime + diff > rt_period) 713 diff = rt_period - rt_rq->rt_runtime; 714 iter->rt_runtime -= diff; 715 rt_rq->rt_runtime += diff; 716 if (rt_rq->rt_runtime == rt_period) { 717 raw_spin_unlock(&iter->rt_runtime_lock); 718 break; 719 } 720 } 721 next: 722 raw_spin_unlock(&iter->rt_runtime_lock); 723 } 724 raw_spin_unlock(&rt_b->rt_runtime_lock); 725 } 726 727 /* 728 * Ensure this RQ takes back all the runtime it lend to its neighbours. 729 */ 730 static void __disable_runtime(struct rq *rq) 731 { 732 struct root_domain *rd = rq->rd; 733 rt_rq_iter_t iter; 734 struct rt_rq *rt_rq; 735 736 if (unlikely(!scheduler_running)) 737 return; 738 739 for_each_rt_rq(rt_rq, iter, rq) { 740 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 741 s64 want; 742 int i; 743 744 raw_spin_lock(&rt_b->rt_runtime_lock); 745 raw_spin_lock(&rt_rq->rt_runtime_lock); 746 /* 747 * Either we're all inf and nobody needs to borrow, or we're 748 * already disabled and thus have nothing to do, or we have 749 * exactly the right amount of runtime to take out. 750 */ 751 if (rt_rq->rt_runtime == RUNTIME_INF || 752 rt_rq->rt_runtime == rt_b->rt_runtime) 753 goto balanced; 754 raw_spin_unlock(&rt_rq->rt_runtime_lock); 755 756 /* 757 * Calculate the difference between what we started out with 758 * and what we current have, that's the amount of runtime 759 * we lend and now have to reclaim. 760 */ 761 want = rt_b->rt_runtime - rt_rq->rt_runtime; 762 763 /* 764 * Greedy reclaim, take back as much as we can. 765 */ 766 for_each_cpu(i, rd->span) { 767 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 768 s64 diff; 769 770 /* 771 * Can't reclaim from ourselves or disabled runqueues. 772 */ 773 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 774 continue; 775 776 raw_spin_lock(&iter->rt_runtime_lock); 777 if (want > 0) { 778 diff = min_t(s64, iter->rt_runtime, want); 779 iter->rt_runtime -= diff; 780 want -= diff; 781 } else { 782 iter->rt_runtime -= want; 783 want -= want; 784 } 785 raw_spin_unlock(&iter->rt_runtime_lock); 786 787 if (!want) 788 break; 789 } 790 791 raw_spin_lock(&rt_rq->rt_runtime_lock); 792 /* 793 * We cannot be left wanting - that would mean some runtime 794 * leaked out of the system. 795 */ 796 BUG_ON(want); 797 balanced: 798 /* 799 * Disable all the borrow logic by pretending we have inf 800 * runtime - in which case borrowing doesn't make sense. 
801 */ 802 rt_rq->rt_runtime = RUNTIME_INF; 803 rt_rq->rt_throttled = 0; 804 raw_spin_unlock(&rt_rq->rt_runtime_lock); 805 raw_spin_unlock(&rt_b->rt_runtime_lock); 806 807 /* Make rt_rq available for pick_next_task() */ 808 sched_rt_rq_enqueue(rt_rq); 809 } 810 } 811 812 static void __enable_runtime(struct rq *rq) 813 { 814 rt_rq_iter_t iter; 815 struct rt_rq *rt_rq; 816 817 if (unlikely(!scheduler_running)) 818 return; 819 820 /* 821 * Reset each runqueue's bandwidth settings 822 */ 823 for_each_rt_rq(rt_rq, iter, rq) { 824 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 825 826 raw_spin_lock(&rt_b->rt_runtime_lock); 827 raw_spin_lock(&rt_rq->rt_runtime_lock); 828 rt_rq->rt_runtime = rt_b->rt_runtime; 829 rt_rq->rt_time = 0; 830 rt_rq->rt_throttled = 0; 831 raw_spin_unlock(&rt_rq->rt_runtime_lock); 832 raw_spin_unlock(&rt_b->rt_runtime_lock); 833 } 834 } 835 836 static void balance_runtime(struct rt_rq *rt_rq) 837 { 838 if (!sched_feat(RT_RUNTIME_SHARE)) 839 return; 840 841 if (rt_rq->rt_time > rt_rq->rt_runtime) { 842 raw_spin_unlock(&rt_rq->rt_runtime_lock); 843 do_balance_runtime(rt_rq); 844 raw_spin_lock(&rt_rq->rt_runtime_lock); 845 } 846 } 847 #else /* !CONFIG_SMP */ 848 static inline void balance_runtime(struct rt_rq *rt_rq) {} 849 #endif /* CONFIG_SMP */ 850 851 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 852 { 853 int i, idle = 1, throttled = 0; 854 const struct cpumask *span; 855 856 span = sched_rt_period_mask(); 857 #ifdef CONFIG_RT_GROUP_SCHED 858 /* 859 * FIXME: isolated CPUs should really leave the root task group, 860 * whether they are isolcpus or were isolated via cpusets, lest 861 * the timer run on a CPU which does not service all runqueues, 862 * potentially leaving other CPUs indefinitely throttled. If 863 * isolation is really required, the user will turn the throttle 864 * off to kill the perturbations it causes anyway. Meanwhile, 865 * this maintains functionality for boot and/or troubleshooting. 866 */ 867 if (rt_b == &root_task_group.rt_bandwidth) 868 span = cpu_online_mask; 869 #endif 870 for_each_cpu(i, span) { 871 int enqueue = 0; 872 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 873 struct rq *rq = rq_of_rt_rq(rt_rq); 874 int skip; 875 876 /* 877 * When span == cpu_online_mask, taking each rq->lock 878 * can be time-consuming. Try to avoid it when possible. 879 */ 880 raw_spin_lock(&rt_rq->rt_runtime_lock); 881 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) 882 rt_rq->rt_runtime = rt_b->rt_runtime; 883 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; 884 raw_spin_unlock(&rt_rq->rt_runtime_lock); 885 if (skip) 886 continue; 887 888 raw_spin_rq_lock(rq); 889 update_rq_clock(rq); 890 891 if (rt_rq->rt_time) { 892 u64 runtime; 893 894 raw_spin_lock(&rt_rq->rt_runtime_lock); 895 if (rt_rq->rt_throttled) 896 balance_runtime(rt_rq); 897 runtime = rt_rq->rt_runtime; 898 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); 899 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 900 rt_rq->rt_throttled = 0; 901 enqueue = 1; 902 903 /* 904 * When we're idle and a woken (rt) task is 905 * throttled check_preempt_curr() will set 906 * skip_update and the time between the wakeup 907 * and this unthrottle will get accounted as 908 * 'runtime'. 
909 */ 910 if (rt_rq->rt_nr_running && rq->curr == rq->idle) 911 rq_clock_cancel_skipupdate(rq); 912 } 913 if (rt_rq->rt_time || rt_rq->rt_nr_running) 914 idle = 0; 915 raw_spin_unlock(&rt_rq->rt_runtime_lock); 916 } else if (rt_rq->rt_nr_running) { 917 idle = 0; 918 if (!rt_rq_throttled(rt_rq)) 919 enqueue = 1; 920 } 921 if (rt_rq->rt_throttled) 922 throttled = 1; 923 924 if (enqueue) 925 sched_rt_rq_enqueue(rt_rq); 926 raw_spin_rq_unlock(rq); 927 } 928 929 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) 930 return 1; 931 932 return idle; 933 } 934 935 static inline int rt_se_prio(struct sched_rt_entity *rt_se) 936 { 937 #ifdef CONFIG_RT_GROUP_SCHED 938 struct rt_rq *rt_rq = group_rt_rq(rt_se); 939 940 if (rt_rq) 941 return rt_rq->highest_prio.curr; 942 #endif 943 944 return rt_task_of(rt_se)->prio; 945 } 946 947 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) 948 { 949 u64 runtime = sched_rt_runtime(rt_rq); 950 951 if (rt_rq->rt_throttled) 952 return rt_rq_throttled(rt_rq); 953 954 if (runtime >= sched_rt_period(rt_rq)) 955 return 0; 956 957 balance_runtime(rt_rq); 958 runtime = sched_rt_runtime(rt_rq); 959 if (runtime == RUNTIME_INF) 960 return 0; 961 962 if (rt_rq->rt_time > runtime) { 963 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 964 965 /* 966 * Don't actually throttle groups that have no runtime assigned 967 * but accrue some time due to boosting. 968 */ 969 if (likely(rt_b->rt_runtime)) { 970 rt_rq->rt_throttled = 1; 971 printk_deferred_once("sched: RT throttling activated\n"); 972 } else { 973 /* 974 * In case we did anyway, make it go away, 975 * replenishment is a joke, since it will replenish us 976 * with exactly 0 ns. 977 */ 978 rt_rq->rt_time = 0; 979 } 980 981 if (rt_rq_throttled(rt_rq)) { 982 sched_rt_rq_dequeue(rt_rq); 983 return 1; 984 } 985 } 986 987 return 0; 988 } 989 990 /* 991 * Update the current task's runtime statistics. Skip current tasks that 992 * are not in our scheduling class. 
993 */ 994 static void update_curr_rt(struct rq *rq) 995 { 996 struct task_struct *curr = rq->curr; 997 struct sched_rt_entity *rt_se = &curr->rt; 998 u64 delta_exec; 999 u64 now; 1000 1001 if (curr->sched_class != &rt_sched_class) 1002 return; 1003 1004 now = rq_clock_task(rq); 1005 delta_exec = now - curr->se.exec_start; 1006 if (unlikely((s64)delta_exec <= 0)) 1007 return; 1008 1009 schedstat_set(curr->stats.exec_max, 1010 max(curr->stats.exec_max, delta_exec)); 1011 1012 trace_sched_stat_runtime(curr, delta_exec, 0); 1013 1014 curr->se.sum_exec_runtime += delta_exec; 1015 account_group_exec_runtime(curr, delta_exec); 1016 1017 curr->se.exec_start = now; 1018 cgroup_account_cputime(curr, delta_exec); 1019 1020 if (!rt_bandwidth_enabled()) 1021 return; 1022 1023 for_each_sched_rt_entity(rt_se) { 1024 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1025 int exceeded; 1026 1027 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 1028 raw_spin_lock(&rt_rq->rt_runtime_lock); 1029 rt_rq->rt_time += delta_exec; 1030 exceeded = sched_rt_runtime_exceeded(rt_rq); 1031 if (exceeded) 1032 resched_curr(rq); 1033 raw_spin_unlock(&rt_rq->rt_runtime_lock); 1034 if (exceeded) 1035 do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); 1036 } 1037 } 1038 } 1039 1040 static void 1041 dequeue_top_rt_rq(struct rt_rq *rt_rq) 1042 { 1043 struct rq *rq = rq_of_rt_rq(rt_rq); 1044 1045 BUG_ON(&rq->rt != rt_rq); 1046 1047 if (!rt_rq->rt_queued) 1048 return; 1049 1050 BUG_ON(!rq->nr_running); 1051 1052 sub_nr_running(rq, rt_rq->rt_nr_running); 1053 rt_rq->rt_queued = 0; 1054 1055 } 1056 1057 static void 1058 enqueue_top_rt_rq(struct rt_rq *rt_rq) 1059 { 1060 struct rq *rq = rq_of_rt_rq(rt_rq); 1061 1062 BUG_ON(&rq->rt != rt_rq); 1063 1064 if (rt_rq->rt_queued) 1065 return; 1066 1067 if (rt_rq_throttled(rt_rq)) 1068 return; 1069 1070 if (rt_rq->rt_nr_running) { 1071 add_nr_running(rq, rt_rq->rt_nr_running); 1072 rt_rq->rt_queued = 1; 1073 } 1074 1075 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ 1076 cpufreq_update_util(rq, 0); 1077 } 1078 1079 #if defined CONFIG_SMP 1080 1081 static void 1082 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 1083 { 1084 struct rq *rq = rq_of_rt_rq(rt_rq); 1085 1086 #ifdef CONFIG_RT_GROUP_SCHED 1087 /* 1088 * Change rq's cpupri only if rt_rq is the top queue. 1089 */ 1090 if (&rq->rt != rt_rq) 1091 return; 1092 #endif 1093 if (rq->online && prio < prev_prio) 1094 cpupri_set(&rq->rd->cpupri, rq->cpu, prio); 1095 } 1096 1097 static void 1098 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 1099 { 1100 struct rq *rq = rq_of_rt_rq(rt_rq); 1101 1102 #ifdef CONFIG_RT_GROUP_SCHED 1103 /* 1104 * Change rq's cpupri only if rt_rq is the top queue. 
1105 */ 1106 if (&rq->rt != rt_rq) 1107 return; 1108 #endif 1109 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 1110 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 1111 } 1112 1113 #else /* CONFIG_SMP */ 1114 1115 static inline 1116 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 1117 static inline 1118 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 1119 1120 #endif /* CONFIG_SMP */ 1121 1122 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 1123 static void 1124 inc_rt_prio(struct rt_rq *rt_rq, int prio) 1125 { 1126 int prev_prio = rt_rq->highest_prio.curr; 1127 1128 if (prio < prev_prio) 1129 rt_rq->highest_prio.curr = prio; 1130 1131 inc_rt_prio_smp(rt_rq, prio, prev_prio); 1132 } 1133 1134 static void 1135 dec_rt_prio(struct rt_rq *rt_rq, int prio) 1136 { 1137 int prev_prio = rt_rq->highest_prio.curr; 1138 1139 if (rt_rq->rt_nr_running) { 1140 1141 WARN_ON(prio < prev_prio); 1142 1143 /* 1144 * This may have been our highest task, and therefore 1145 * we may have some recomputation to do 1146 */ 1147 if (prio == prev_prio) { 1148 struct rt_prio_array *array = &rt_rq->active; 1149 1150 rt_rq->highest_prio.curr = 1151 sched_find_first_bit(array->bitmap); 1152 } 1153 1154 } else { 1155 rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 1156 } 1157 1158 dec_rt_prio_smp(rt_rq, prio, prev_prio); 1159 } 1160 1161 #else 1162 1163 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} 1164 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} 1165 1166 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ 1167 1168 #ifdef CONFIG_RT_GROUP_SCHED 1169 1170 static void 1171 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1172 { 1173 if (rt_se_boosted(rt_se)) 1174 rt_rq->rt_nr_boosted++; 1175 1176 if (rt_rq->tg) 1177 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); 1178 } 1179 1180 static void 1181 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1182 { 1183 if (rt_se_boosted(rt_se)) 1184 rt_rq->rt_nr_boosted--; 1185 1186 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); 1187 } 1188 1189 #else /* CONFIG_RT_GROUP_SCHED */ 1190 1191 static void 1192 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1193 { 1194 start_rt_bandwidth(&def_rt_bandwidth); 1195 } 1196 1197 static inline 1198 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} 1199 1200 #endif /* CONFIG_RT_GROUP_SCHED */ 1201 1202 static inline 1203 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) 1204 { 1205 struct rt_rq *group_rq = group_rt_rq(rt_se); 1206 1207 if (group_rq) 1208 return group_rq->rt_nr_running; 1209 else 1210 return 1; 1211 } 1212 1213 static inline 1214 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) 1215 { 1216 struct rt_rq *group_rq = group_rt_rq(rt_se); 1217 struct task_struct *tsk; 1218 1219 if (group_rq) 1220 return group_rq->rr_nr_running; 1221 1222 tsk = rt_task_of(rt_se); 1223 1224 return (tsk->policy == SCHED_RR) ? 
1 : 0; 1225 } 1226 1227 static inline 1228 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1229 { 1230 int prio = rt_se_prio(rt_se); 1231 1232 WARN_ON(!rt_prio(prio)); 1233 rt_rq->rt_nr_running += rt_se_nr_running(rt_se); 1234 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); 1235 1236 inc_rt_prio(rt_rq, prio); 1237 inc_rt_migration(rt_se, rt_rq); 1238 inc_rt_group(rt_se, rt_rq); 1239 } 1240 1241 static inline 1242 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1243 { 1244 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1245 WARN_ON(!rt_rq->rt_nr_running); 1246 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); 1247 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); 1248 1249 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1250 dec_rt_migration(rt_se, rt_rq); 1251 dec_rt_group(rt_se, rt_rq); 1252 } 1253 1254 /* 1255 * Change rt_se->run_list location unless SAVE && !MOVE 1256 * 1257 * assumes ENQUEUE/DEQUEUE flags match 1258 */ 1259 static inline bool move_entity(unsigned int flags) 1260 { 1261 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) 1262 return false; 1263 1264 return true; 1265 } 1266 1267 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) 1268 { 1269 list_del_init(&rt_se->run_list); 1270 1271 if (list_empty(array->queue + rt_se_prio(rt_se))) 1272 __clear_bit(rt_se_prio(rt_se), array->bitmap); 1273 1274 rt_se->on_list = 0; 1275 } 1276 1277 static inline struct sched_statistics * 1278 __schedstats_from_rt_se(struct sched_rt_entity *rt_se) 1279 { 1280 #ifdef CONFIG_RT_GROUP_SCHED 1281 /* schedstats is not supported for rt group. */ 1282 if (!rt_entity_is_task(rt_se)) 1283 return NULL; 1284 #endif 1285 1286 return &rt_task_of(rt_se)->stats; 1287 } 1288 1289 static inline void 1290 update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 1291 { 1292 struct sched_statistics *stats; 1293 struct task_struct *p = NULL; 1294 1295 if (!schedstat_enabled()) 1296 return; 1297 1298 if (rt_entity_is_task(rt_se)) 1299 p = rt_task_of(rt_se); 1300 1301 stats = __schedstats_from_rt_se(rt_se); 1302 if (!stats) 1303 return; 1304 1305 __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats); 1306 } 1307 1308 static inline void 1309 update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 1310 { 1311 struct sched_statistics *stats; 1312 struct task_struct *p = NULL; 1313 1314 if (!schedstat_enabled()) 1315 return; 1316 1317 if (rt_entity_is_task(rt_se)) 1318 p = rt_task_of(rt_se); 1319 1320 stats = __schedstats_from_rt_se(rt_se); 1321 if (!stats) 1322 return; 1323 1324 __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats); 1325 } 1326 1327 static inline void 1328 update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 1329 int flags) 1330 { 1331 if (!schedstat_enabled()) 1332 return; 1333 1334 if (flags & ENQUEUE_WAKEUP) 1335 update_stats_enqueue_sleeper_rt(rt_rq, rt_se); 1336 } 1337 1338 static inline void 1339 update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 1340 { 1341 struct sched_statistics *stats; 1342 struct task_struct *p = NULL; 1343 1344 if (!schedstat_enabled()) 1345 return; 1346 1347 if (rt_entity_is_task(rt_se)) 1348 p = rt_task_of(rt_se); 1349 1350 stats = __schedstats_from_rt_se(rt_se); 1351 if (!stats) 1352 return; 1353 1354 __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats); 1355 } 1356 1357 static inline void 1358 update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 1359 int 
flags) 1360 { 1361 struct task_struct *p = NULL; 1362 1363 if (!schedstat_enabled()) 1364 return; 1365 1366 if (rt_entity_is_task(rt_se)) 1367 p = rt_task_of(rt_se); 1368 1369 if ((flags & DEQUEUE_SLEEP) && p) { 1370 unsigned int state; 1371 1372 state = READ_ONCE(p->__state); 1373 if (state & TASK_INTERRUPTIBLE) 1374 __schedstat_set(p->stats.sleep_start, 1375 rq_clock(rq_of_rt_rq(rt_rq))); 1376 1377 if (state & TASK_UNINTERRUPTIBLE) 1378 __schedstat_set(p->stats.block_start, 1379 rq_clock(rq_of_rt_rq(rt_rq))); 1380 } 1381 } 1382 1383 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1384 { 1385 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1386 struct rt_prio_array *array = &rt_rq->active; 1387 struct rt_rq *group_rq = group_rt_rq(rt_se); 1388 struct list_head *queue = array->queue + rt_se_prio(rt_se); 1389 1390 /* 1391 * Don't enqueue the group if its throttled, or when empty. 1392 * The latter is a consequence of the former when a child group 1393 * get throttled and the current group doesn't have any other 1394 * active members. 1395 */ 1396 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { 1397 if (rt_se->on_list) 1398 __delist_rt_entity(rt_se, array); 1399 return; 1400 } 1401 1402 if (move_entity(flags)) { 1403 WARN_ON_ONCE(rt_se->on_list); 1404 if (flags & ENQUEUE_HEAD) 1405 list_add(&rt_se->run_list, queue); 1406 else 1407 list_add_tail(&rt_se->run_list, queue); 1408 1409 __set_bit(rt_se_prio(rt_se), array->bitmap); 1410 rt_se->on_list = 1; 1411 } 1412 rt_se->on_rq = 1; 1413 1414 inc_rt_tasks(rt_se, rt_rq); 1415 } 1416 1417 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1418 { 1419 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1420 struct rt_prio_array *array = &rt_rq->active; 1421 1422 if (move_entity(flags)) { 1423 WARN_ON_ONCE(!rt_se->on_list); 1424 __delist_rt_entity(rt_se, array); 1425 } 1426 rt_se->on_rq = 0; 1427 1428 dec_rt_tasks(rt_se, rt_rq); 1429 } 1430 1431 /* 1432 * Because the prio of an upper entry depends on the lower 1433 * entries, we must remove entries top - down. 
1434 */ 1435 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) 1436 { 1437 struct sched_rt_entity *back = NULL; 1438 1439 for_each_sched_rt_entity(rt_se) { 1440 rt_se->back = back; 1441 back = rt_se; 1442 } 1443 1444 dequeue_top_rt_rq(rt_rq_of_se(back)); 1445 1446 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1447 if (on_rt_rq(rt_se)) 1448 __dequeue_rt_entity(rt_se, flags); 1449 } 1450 } 1451 1452 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1453 { 1454 struct rq *rq = rq_of_rt_se(rt_se); 1455 1456 update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags); 1457 1458 dequeue_rt_stack(rt_se, flags); 1459 for_each_sched_rt_entity(rt_se) 1460 __enqueue_rt_entity(rt_se, flags); 1461 enqueue_top_rt_rq(&rq->rt); 1462 } 1463 1464 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1465 { 1466 struct rq *rq = rq_of_rt_se(rt_se); 1467 1468 update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags); 1469 1470 dequeue_rt_stack(rt_se, flags); 1471 1472 for_each_sched_rt_entity(rt_se) { 1473 struct rt_rq *rt_rq = group_rt_rq(rt_se); 1474 1475 if (rt_rq && rt_rq->rt_nr_running) 1476 __enqueue_rt_entity(rt_se, flags); 1477 } 1478 enqueue_top_rt_rq(&rq->rt); 1479 } 1480 1481 /* 1482 * Adding/removing a task to/from a priority array: 1483 */ 1484 static void 1485 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1486 { 1487 struct sched_rt_entity *rt_se = &p->rt; 1488 1489 if (flags & ENQUEUE_WAKEUP) 1490 rt_se->timeout = 0; 1491 1492 check_schedstat_required(); 1493 update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se); 1494 1495 enqueue_rt_entity(rt_se, flags); 1496 1497 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1498 enqueue_pushable_task(rq, p); 1499 } 1500 1501 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1502 { 1503 struct sched_rt_entity *rt_se = &p->rt; 1504 1505 update_curr_rt(rq); 1506 dequeue_rt_entity(rt_se, flags); 1507 1508 dequeue_pushable_task(rq, p); 1509 } 1510 1511 /* 1512 * Put task to the head or the end of the run list without the overhead of 1513 * dequeue followed by enqueue. 
1514 */ 1515 static void 1516 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1517 { 1518 if (on_rt_rq(rt_se)) { 1519 struct rt_prio_array *array = &rt_rq->active; 1520 struct list_head *queue = array->queue + rt_se_prio(rt_se); 1521 1522 if (head) 1523 list_move(&rt_se->run_list, queue); 1524 else 1525 list_move_tail(&rt_se->run_list, queue); 1526 } 1527 } 1528 1529 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) 1530 { 1531 struct sched_rt_entity *rt_se = &p->rt; 1532 struct rt_rq *rt_rq; 1533 1534 for_each_sched_rt_entity(rt_se) { 1535 rt_rq = rt_rq_of_se(rt_se); 1536 requeue_rt_entity(rt_rq, rt_se, head); 1537 } 1538 } 1539 1540 static void yield_task_rt(struct rq *rq) 1541 { 1542 requeue_task_rt(rq, rq->curr, 0); 1543 } 1544 1545 #ifdef CONFIG_SMP 1546 static int find_lowest_rq(struct task_struct *task); 1547 1548 static int 1549 select_task_rq_rt(struct task_struct *p, int cpu, int flags) 1550 { 1551 struct task_struct *curr; 1552 struct rq *rq; 1553 bool test; 1554 1555 /* For anything but wake ups, just return the task_cpu */ 1556 if (!(flags & (WF_TTWU | WF_FORK))) 1557 goto out; 1558 1559 rq = cpu_rq(cpu); 1560 1561 rcu_read_lock(); 1562 curr = READ_ONCE(rq->curr); /* unlocked access */ 1563 1564 /* 1565 * If the current task on @p's runqueue is an RT task, then 1566 * try to see if we can wake this RT task up on another 1567 * runqueue. Otherwise simply start this RT task 1568 * on its current runqueue. 1569 * 1570 * We want to avoid overloading runqueues. If the woken 1571 * task is a higher priority, then it will stay on this CPU 1572 * and the lower prio task should be moved to another CPU. 1573 * Even though this will probably make the lower prio task 1574 * lose its cache, we do not want to bounce a higher task 1575 * around just because it gave up its CPU, perhaps for a 1576 * lock? 1577 * 1578 * For equal prio tasks, we just let the scheduler sort it out. 1579 * 1580 * Otherwise, just let it ride on the affined RQ and the 1581 * post-schedule router will push the preempted task away 1582 * 1583 * This test is optimistic, if we get it wrong the load-balancer 1584 * will have to sort it out. 1585 * 1586 * We take into account the capacity of the CPU to ensure it fits the 1587 * requirement of the task - which is only important on heterogeneous 1588 * systems like big.LITTLE. 1589 */ 1590 test = curr && 1591 unlikely(rt_task(curr)) && 1592 (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); 1593 1594 if (test || !rt_task_fits_capacity(p, cpu)) { 1595 int target = find_lowest_rq(p); 1596 1597 /* 1598 * Bail out if we were forcing a migration to find a better 1599 * fitting CPU but our search failed. 1600 */ 1601 if (!test && target != -1 && !rt_task_fits_capacity(p, target)) 1602 goto out_unlock; 1603 1604 /* 1605 * Don't bother moving it if the destination CPU is 1606 * not running a lower priority task. 1607 */ 1608 if (target != -1 && 1609 p->prio < cpu_rq(target)->rt.highest_prio.curr) 1610 cpu = target; 1611 } 1612 1613 out_unlock: 1614 rcu_read_unlock(); 1615 1616 out: 1617 return cpu; 1618 } 1619 1620 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1621 { 1622 /* 1623 * Current can't be migrated, useless to reschedule, 1624 * let's hope p can move out. 
1625 */ 1626 if (rq->curr->nr_cpus_allowed == 1 || 1627 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) 1628 return; 1629 1630 /* 1631 * p is migratable, so let's not schedule it and 1632 * see if it is pushed or pulled somewhere else. 1633 */ 1634 if (p->nr_cpus_allowed != 1 && 1635 cpupri_find(&rq->rd->cpupri, p, NULL)) 1636 return; 1637 1638 /* 1639 * There appear to be other CPUs that can accept 1640 * the current task but none can run 'p', so lets reschedule 1641 * to try and push the current task away: 1642 */ 1643 requeue_task_rt(rq, p, 1); 1644 resched_curr(rq); 1645 } 1646 1647 static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) 1648 { 1649 if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { 1650 /* 1651 * This is OK, because current is on_cpu, which avoids it being 1652 * picked for load-balance and preemption/IRQs are still 1653 * disabled avoiding further scheduler activity on it and we've 1654 * not yet started the picking loop. 1655 */ 1656 rq_unpin_lock(rq, rf); 1657 pull_rt_task(rq); 1658 rq_repin_lock(rq, rf); 1659 } 1660 1661 return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); 1662 } 1663 #endif /* CONFIG_SMP */ 1664 1665 /* 1666 * Preempt the current task with a newly woken task if needed: 1667 */ 1668 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) 1669 { 1670 if (p->prio < rq->curr->prio) { 1671 resched_curr(rq); 1672 return; 1673 } 1674 1675 #ifdef CONFIG_SMP 1676 /* 1677 * If: 1678 * 1679 * - the newly woken task is of equal priority to the current task 1680 * - the newly woken task is non-migratable while current is migratable 1681 * - current will be preempted on the next reschedule 1682 * 1683 * we should check to see if current can readily move to a different 1684 * cpu. If so, we will reschedule to allow the push logic to try 1685 * to move current somewhere else, making room for our non-migratable 1686 * task. 1687 */ 1688 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) 1689 check_preempt_equal_prio(rq, p); 1690 #endif 1691 } 1692 1693 static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) 1694 { 1695 struct sched_rt_entity *rt_se = &p->rt; 1696 struct rt_rq *rt_rq = &rq->rt; 1697 1698 p->se.exec_start = rq_clock_task(rq); 1699 if (on_rt_rq(&p->rt)) 1700 update_stats_wait_end_rt(rt_rq, rt_se); 1701 1702 /* The running task is never eligible for pushing */ 1703 dequeue_pushable_task(rq, p); 1704 1705 if (!first) 1706 return; 1707 1708 /* 1709 * If prev task was rt, put_prev_task() has already updated the 1710 * utilization. 
We only care of the case where we start to schedule a 1711 * rt task 1712 */ 1713 if (rq->curr->sched_class != &rt_sched_class) 1714 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); 1715 1716 rt_queue_push_tasks(rq); 1717 } 1718 1719 static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) 1720 { 1721 struct rt_prio_array *array = &rt_rq->active; 1722 struct sched_rt_entity *next = NULL; 1723 struct list_head *queue; 1724 int idx; 1725 1726 idx = sched_find_first_bit(array->bitmap); 1727 BUG_ON(idx >= MAX_RT_PRIO); 1728 1729 queue = array->queue + idx; 1730 next = list_entry(queue->next, struct sched_rt_entity, run_list); 1731 1732 return next; 1733 } 1734 1735 static struct task_struct *_pick_next_task_rt(struct rq *rq) 1736 { 1737 struct sched_rt_entity *rt_se; 1738 struct rt_rq *rt_rq = &rq->rt; 1739 1740 do { 1741 rt_se = pick_next_rt_entity(rt_rq); 1742 BUG_ON(!rt_se); 1743 rt_rq = group_rt_rq(rt_se); 1744 } while (rt_rq); 1745 1746 return rt_task_of(rt_se); 1747 } 1748 1749 static struct task_struct *pick_task_rt(struct rq *rq) 1750 { 1751 struct task_struct *p; 1752 1753 if (!sched_rt_runnable(rq)) 1754 return NULL; 1755 1756 p = _pick_next_task_rt(rq); 1757 1758 return p; 1759 } 1760 1761 static struct task_struct *pick_next_task_rt(struct rq *rq) 1762 { 1763 struct task_struct *p = pick_task_rt(rq); 1764 1765 if (p) 1766 set_next_task_rt(rq, p, true); 1767 1768 return p; 1769 } 1770 1771 static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1772 { 1773 struct sched_rt_entity *rt_se = &p->rt; 1774 struct rt_rq *rt_rq = &rq->rt; 1775 1776 if (on_rt_rq(&p->rt)) 1777 update_stats_wait_start_rt(rt_rq, rt_se); 1778 1779 update_curr_rt(rq); 1780 1781 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); 1782 1783 /* 1784 * The previous task needs to be made eligible for pushing 1785 * if it is still active 1786 */ 1787 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) 1788 enqueue_pushable_task(rq, p); 1789 } 1790 1791 #ifdef CONFIG_SMP 1792 1793 /* Only try algorithms three times */ 1794 #define RT_MAX_TRIES 3 1795 1796 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1797 { 1798 if (!task_running(rq, p) && 1799 cpumask_test_cpu(cpu, &p->cpus_mask)) 1800 return 1; 1801 1802 return 0; 1803 } 1804 1805 /* 1806 * Return the highest pushable rq's task, which is suitable to be executed 1807 * on the CPU, NULL otherwise 1808 */ 1809 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) 1810 { 1811 struct plist_head *head = &rq->rt.pushable_tasks; 1812 struct task_struct *p; 1813 1814 if (!has_pushable_tasks(rq)) 1815 return NULL; 1816 1817 plist_for_each_entry(p, head, pushable_tasks) { 1818 if (pick_rt_task(rq, p, cpu)) 1819 return p; 1820 } 1821 1822 return NULL; 1823 } 1824 1825 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1826 1827 static int find_lowest_rq(struct task_struct *task) 1828 { 1829 struct sched_domain *sd; 1830 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); 1831 int this_cpu = smp_processor_id(); 1832 int cpu = task_cpu(task); 1833 int ret; 1834 1835 /* Make sure the mask is initialized first */ 1836 if (unlikely(!lowest_mask)) 1837 return -1; 1838 1839 if (task->nr_cpus_allowed == 1) 1840 return -1; /* No other targets possible */ 1841 1842 /* 1843 * If we're on asym system ensure we consider the different capacities 1844 * of the CPUs when searching for the lowest_mask. 
1845 */ 1846 if (static_branch_unlikely(&sched_asym_cpucapacity)) { 1847 1848 ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, 1849 task, lowest_mask, 1850 rt_task_fits_capacity); 1851 } else { 1852 1853 ret = cpupri_find(&task_rq(task)->rd->cpupri, 1854 task, lowest_mask); 1855 } 1856 1857 if (!ret) 1858 return -1; /* No targets found */ 1859 1860 /* 1861 * At this point we have built a mask of CPUs representing the 1862 * lowest priority tasks in the system. Now we want to elect 1863 * the best one based on our affinity and topology. 1864 * 1865 * We prioritize the last CPU that the task executed on since 1866 * it is most likely cache-hot in that location. 1867 */ 1868 if (cpumask_test_cpu(cpu, lowest_mask)) 1869 return cpu; 1870 1871 /* 1872 * Otherwise, we consult the sched_domains span maps to figure 1873 * out which CPU is logically closest to our hot cache data. 1874 */ 1875 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1876 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1877 1878 rcu_read_lock(); 1879 for_each_domain(cpu, sd) { 1880 if (sd->flags & SD_WAKE_AFFINE) { 1881 int best_cpu; 1882 1883 /* 1884 * "this_cpu" is cheaper to preempt than a 1885 * remote processor. 1886 */ 1887 if (this_cpu != -1 && 1888 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { 1889 rcu_read_unlock(); 1890 return this_cpu; 1891 } 1892 1893 best_cpu = cpumask_any_and_distribute(lowest_mask, 1894 sched_domain_span(sd)); 1895 if (best_cpu < nr_cpu_ids) { 1896 rcu_read_unlock(); 1897 return best_cpu; 1898 } 1899 } 1900 } 1901 rcu_read_unlock(); 1902 1903 /* 1904 * And finally, if there were no matches within the domains 1905 * just give the caller *something* to work with from the compatible 1906 * locations. 1907 */ 1908 if (this_cpu != -1) 1909 return this_cpu; 1910 1911 cpu = cpumask_any_distribute(lowest_mask); 1912 if (cpu < nr_cpu_ids) 1913 return cpu; 1914 1915 return -1; 1916 } 1917 1918 /* Will lock the rq it finds */ 1919 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) 1920 { 1921 struct rq *lowest_rq = NULL; 1922 int tries; 1923 int cpu; 1924 1925 for (tries = 0; tries < RT_MAX_TRIES; tries++) { 1926 cpu = find_lowest_rq(task); 1927 1928 if ((cpu == -1) || (cpu == rq->cpu)) 1929 break; 1930 1931 lowest_rq = cpu_rq(cpu); 1932 1933 if (lowest_rq->rt.highest_prio.curr <= task->prio) { 1934 /* 1935 * Target rq has tasks of equal or higher priority, 1936 * retrying does not release any lock and is unlikely 1937 * to yield a different result. 1938 */ 1939 lowest_rq = NULL; 1940 break; 1941 } 1942 1943 /* if the prio of this runqueue changed, try again */ 1944 if (double_lock_balance(rq, lowest_rq)) { 1945 /* 1946 * We had to unlock the run queue. In 1947 * the mean time, task could have 1948 * migrated already or had its affinity changed. 1949 * Also make sure that it wasn't scheduled on its rq. 1950 */ 1951 if (unlikely(task_rq(task) != rq || 1952 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || 1953 task_running(rq, task) || 1954 !rt_task(task) || 1955 !task_on_rq_queued(task))) { 1956 1957 double_unlock_balance(rq, lowest_rq); 1958 lowest_rq = NULL; 1959 break; 1960 } 1961 } 1962 1963 /* If this rq is still suitable use it. 
*/ 1964 if (lowest_rq->rt.highest_prio.curr > task->prio) 1965 break; 1966 1967 /* try again */ 1968 double_unlock_balance(rq, lowest_rq); 1969 lowest_rq = NULL; 1970 } 1971 1972 return lowest_rq; 1973 } 1974 1975 static struct task_struct *pick_next_pushable_task(struct rq *rq) 1976 { 1977 struct task_struct *p; 1978 1979 if (!has_pushable_tasks(rq)) 1980 return NULL; 1981 1982 p = plist_first_entry(&rq->rt.pushable_tasks, 1983 struct task_struct, pushable_tasks); 1984 1985 BUG_ON(rq->cpu != task_cpu(p)); 1986 BUG_ON(task_current(rq, p)); 1987 BUG_ON(p->nr_cpus_allowed <= 1); 1988 1989 BUG_ON(!task_on_rq_queued(p)); 1990 BUG_ON(!rt_task(p)); 1991 1992 return p; 1993 } 1994 1995 /* 1996 * If the current CPU has more than one RT task, see if the non 1997 * running task can migrate over to a CPU that is running a task 1998 * of lesser priority. 1999 */ 2000 static int push_rt_task(struct rq *rq, bool pull) 2001 { 2002 struct task_struct *next_task; 2003 struct rq *lowest_rq; 2004 int ret = 0; 2005 2006 if (!rq->rt.overloaded) 2007 return 0; 2008 2009 next_task = pick_next_pushable_task(rq); 2010 if (!next_task) 2011 return 0; 2012 2013 retry: 2014 /* 2015 * It's possible that the next_task slipped in of 2016 * higher priority than current. If that's the case 2017 * just reschedule current. 2018 */ 2019 if (unlikely(next_task->prio < rq->curr->prio)) { 2020 resched_curr(rq); 2021 return 0; 2022 } 2023 2024 if (is_migration_disabled(next_task)) { 2025 struct task_struct *push_task = NULL; 2026 int cpu; 2027 2028 if (!pull || rq->push_busy) 2029 return 0; 2030 2031 /* 2032 * Invoking find_lowest_rq() on anything but an RT task doesn't 2033 * make sense. Per the above priority check, curr has to 2034 * be of higher priority than next_task, so no need to 2035 * reschedule when bailing out. 2036 * 2037 * Note that the stoppers are masqueraded as SCHED_FIFO 2038 * (cf. sched_set_stop_task()), so we can't rely on rt_task(). 2039 */ 2040 if (rq->curr->sched_class != &rt_sched_class) 2041 return 0; 2042 2043 cpu = find_lowest_rq(rq->curr); 2044 if (cpu == -1 || cpu == rq->cpu) 2045 return 0; 2046 2047 /* 2048 * Given we found a CPU with lower priority than @next_task, 2049 * therefore it should be running. However we cannot migrate it 2050 * to this other CPU, instead attempt to push the current 2051 * running task on this CPU away. 2052 */ 2053 push_task = get_push_task(rq); 2054 if (push_task) { 2055 raw_spin_rq_unlock(rq); 2056 stop_one_cpu_nowait(rq->cpu, push_cpu_stop, 2057 push_task, &rq->push_work); 2058 raw_spin_rq_lock(rq); 2059 } 2060 2061 return 0; 2062 } 2063 2064 if (WARN_ON(next_task == rq->curr)) 2065 return 0; 2066 2067 /* We might release rq lock */ 2068 get_task_struct(next_task); 2069 2070 /* find_lock_lowest_rq locks the rq if found */ 2071 lowest_rq = find_lock_lowest_rq(next_task, rq); 2072 if (!lowest_rq) { 2073 struct task_struct *task; 2074 /* 2075 * find_lock_lowest_rq releases rq->lock 2076 * so it is possible that next_task has migrated. 2077 * 2078 * We need to make sure that the task is still on the same 2079 * run-queue and is also still the next task eligible for 2080 * pushing. 2081 */ 2082 task = pick_next_pushable_task(rq); 2083 if (task == next_task) { 2084 /* 2085 * The task hasn't migrated, and is still the next 2086 * eligible task, but we failed to find a run-queue 2087 * to push it to. Do not retry in this case, since 2088 * other CPUs will pull from us when ready. 
2089 */ 2090 goto out; 2091 } 2092 2093 if (!task) 2094 /* No more tasks, just exit */ 2095 goto out; 2096 2097 /* 2098 * Something has shifted, try again. 2099 */ 2100 put_task_struct(next_task); 2101 next_task = task; 2102 goto retry; 2103 } 2104 2105 deactivate_task(rq, next_task, 0); 2106 set_task_cpu(next_task, lowest_rq->cpu); 2107 activate_task(lowest_rq, next_task, 0); 2108 resched_curr(lowest_rq); 2109 ret = 1; 2110 2111 double_unlock_balance(rq, lowest_rq); 2112 out: 2113 put_task_struct(next_task); 2114 2115 return ret; 2116 } 2117 2118 static void push_rt_tasks(struct rq *rq) 2119 { 2120 /* push_rt_task will return true if it moved an RT */ 2121 while (push_rt_task(rq, false)) 2122 ; 2123 } 2124 2125 #ifdef HAVE_RT_PUSH_IPI 2126 2127 /* 2128 * When a high priority task schedules out from a CPU and a lower priority 2129 * task is scheduled in, a check is made to see if there's any RT tasks 2130 * on other CPUs that are waiting to run because a higher priority RT task 2131 * is currently running on its CPU. In this case, the CPU with multiple RT 2132 * tasks queued on it (overloaded) needs to be notified that a CPU has opened 2133 * up that may be able to run one of its non-running queued RT tasks. 2134 * 2135 * All CPUs with overloaded RT tasks need to be notified as there is currently 2136 * no way to know which of these CPUs have the highest priority task waiting 2137 * to run. Instead of trying to take a spinlock on each of these CPUs, 2138 * which has shown to cause large latency when done on machines with many 2139 * CPUs, sending an IPI to the CPUs to have them push off the overloaded 2140 * RT tasks waiting to run. 2141 * 2142 * Just sending an IPI to each of the CPUs is also an issue, as on large 2143 * count CPU machines, this can cause an IPI storm on a CPU, especially 2144 * if its the only CPU with multiple RT tasks queued, and a large number 2145 * of CPUs scheduling a lower priority task at the same time. 2146 * 2147 * Each root domain has its own irq work function that can iterate over 2148 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT 2149 * task must be checked if there's one or many CPUs that are lowering 2150 * their priority, there's a single irq work iterator that will try to 2151 * push off RT tasks that are waiting to run. 2152 * 2153 * When a CPU schedules a lower priority task, it will kick off the 2154 * irq work iterator that will jump to each CPU with overloaded RT tasks. 2155 * As it only takes the first CPU that schedules a lower priority task 2156 * to start the process, the rto_start variable is incremented and if 2157 * the atomic result is one, then that CPU will try to take the rto_lock. 2158 * This prevents high contention on the lock as the process handles all 2159 * CPUs scheduling lower priority tasks. 2160 * 2161 * All CPUs that are scheduling a lower priority task will increment the 2162 * rt_loop_next variable. This will make sure that the irq work iterator 2163 * checks all RT overloaded CPUs whenever a CPU schedules a new lower 2164 * priority task, even if the iterator is in the middle of a scan. Incrementing 2165 * the rt_loop_next will cause the iterator to perform another scan. 2166 * 2167 */ 2168 static int rto_next_cpu(struct root_domain *rd) 2169 { 2170 int next; 2171 int cpu; 2172 2173 /* 2174 * When starting the IPI RT pushing, the rto_cpu is set to -1, 2175 * rt_next_cpu() will simply return the first CPU found in 2176 * the rto_mask. 
2177 * 2178 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it 2179 * will return the next CPU found in the rto_mask. 2180 * 2181 * If there are no more CPUs left in the rto_mask, then a check is made 2182 * against rto_loop and rto_loop_next. rto_loop is only updated with 2183 * the rto_lock held, but any CPU may increment the rto_loop_next 2184 * without any locking. 2185 */ 2186 for (;;) { 2187 2188 /* When rto_cpu is -1 this acts like cpumask_first() */ 2189 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); 2190 2191 rd->rto_cpu = cpu; 2192 2193 if (cpu < nr_cpu_ids) 2194 return cpu; 2195 2196 rd->rto_cpu = -1; 2197 2198 /* 2199 * ACQUIRE ensures we see the @rto_mask changes 2200 * made prior to the @next value observed. 2201 * 2202 * Matches WMB in rt_set_overload(). 2203 */ 2204 next = atomic_read_acquire(&rd->rto_loop_next); 2205 2206 if (rd->rto_loop == next) 2207 break; 2208 2209 rd->rto_loop = next; 2210 } 2211 2212 return -1; 2213 } 2214 2215 static inline bool rto_start_trylock(atomic_t *v) 2216 { 2217 return !atomic_cmpxchg_acquire(v, 0, 1); 2218 } 2219 2220 static inline void rto_start_unlock(atomic_t *v) 2221 { 2222 atomic_set_release(v, 0); 2223 } 2224 2225 static void tell_cpu_to_push(struct rq *rq) 2226 { 2227 int cpu = -1; 2228 2229 /* Keep the loop going if the IPI is currently active */ 2230 atomic_inc(&rq->rd->rto_loop_next); 2231 2232 /* Only one CPU can initiate a loop at a time */ 2233 if (!rto_start_trylock(&rq->rd->rto_loop_start)) 2234 return; 2235 2236 raw_spin_lock(&rq->rd->rto_lock); 2237 2238 /* 2239 * The rto_cpu is updated under the lock, if it has a valid CPU 2240 * then the IPI is still running and will continue due to the 2241 * update to loop_next, and nothing needs to be done here. 2242 * Otherwise it is finishing up and an ipi needs to be sent. 2243 */ 2244 if (rq->rd->rto_cpu < 0) 2245 cpu = rto_next_cpu(rq->rd); 2246 2247 raw_spin_unlock(&rq->rd->rto_lock); 2248 2249 rto_start_unlock(&rq->rd->rto_loop_start); 2250 2251 if (cpu >= 0) { 2252 /* Make sure the rd does not get freed while pushing */ 2253 sched_get_rd(rq->rd); 2254 irq_work_queue_on(&rq->rd->rto_push_work, cpu); 2255 } 2256 } 2257 2258 /* Called from hardirq context */ 2259 void rto_push_irq_work_func(struct irq_work *work) 2260 { 2261 struct root_domain *rd = 2262 container_of(work, struct root_domain, rto_push_work); 2263 struct rq *rq; 2264 int cpu; 2265 2266 rq = this_rq(); 2267 2268 /* 2269 * We do not need to grab the lock to check for has_pushable_tasks. 2270 * When it gets updated, a check is made if a push is possible. 
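 *
 * The rto_loop_start handshake in tell_cpu_to_push() above is what keeps
 * a single CPU in charge of (re)starting this chain. Stripped down to an
 * illustrative sketch it is the familiar atomic try-lock idiom: a
 * non-zero cmpxchg return means somebody else already owns the loop and
 * the caller simply backs off.
 *
 *	static atomic_t loop_start = ATOMIC_INIT(0);
 *
 *	static void kick_loop(void)
 *	{
 *		if (atomic_cmpxchg_acquire(&loop_start, 0, 1))
 *			return;
 *		run_loop();
 *		atomic_set_release(&loop_start, 0);
 *	}
 *
 * run_loop() is a stand-in; in the real code the release happens in
 * tell_cpu_to_push() itself once the first IPI target has been chosen.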
2271 */ 2272 if (has_pushable_tasks(rq)) { 2273 raw_spin_rq_lock(rq); 2274 while (push_rt_task(rq, true)) 2275 ; 2276 raw_spin_rq_unlock(rq); 2277 } 2278 2279 raw_spin_lock(&rd->rto_lock); 2280 2281 /* Pass the IPI to the next rt overloaded queue */ 2282 cpu = rto_next_cpu(rd); 2283 2284 raw_spin_unlock(&rd->rto_lock); 2285 2286 if (cpu < 0) { 2287 sched_put_rd(rd); 2288 return; 2289 } 2290 2291 /* Try the next RT overloaded CPU */ 2292 irq_work_queue_on(&rd->rto_push_work, cpu); 2293 } 2294 #endif /* HAVE_RT_PUSH_IPI */ 2295 2296 static void pull_rt_task(struct rq *this_rq) 2297 { 2298 int this_cpu = this_rq->cpu, cpu; 2299 bool resched = false; 2300 struct task_struct *p, *push_task; 2301 struct rq *src_rq; 2302 int rt_overload_count = rt_overloaded(this_rq); 2303 2304 if (likely(!rt_overload_count)) 2305 return; 2306 2307 /* 2308 * Match the barrier from rt_set_overloaded; this guarantees that if we 2309 * see overloaded we must also see the rto_mask bit. 2310 */ 2311 smp_rmb(); 2312 2313 /* If we are the only overloaded CPU do nothing */ 2314 if (rt_overload_count == 1 && 2315 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) 2316 return; 2317 2318 #ifdef HAVE_RT_PUSH_IPI 2319 if (sched_feat(RT_PUSH_IPI)) { 2320 tell_cpu_to_push(this_rq); 2321 return; 2322 } 2323 #endif 2324 2325 for_each_cpu(cpu, this_rq->rd->rto_mask) { 2326 if (this_cpu == cpu) 2327 continue; 2328 2329 src_rq = cpu_rq(cpu); 2330 2331 /* 2332 * Don't bother taking the src_rq->lock if the next highest 2333 * task is known to be lower-priority than our current task. 2334 * This may look racy, but if this value is about to go 2335 * logically higher, the src_rq will push this task away. 2336 * And if its going logically lower, we do not care 2337 */ 2338 if (src_rq->rt.highest_prio.next >= 2339 this_rq->rt.highest_prio.curr) 2340 continue; 2341 2342 /* 2343 * We can potentially drop this_rq's lock in 2344 * double_lock_balance, and another CPU could 2345 * alter this_rq 2346 */ 2347 push_task = NULL; 2348 double_lock_balance(this_rq, src_rq); 2349 2350 /* 2351 * We can pull only a task, which is pushable 2352 * on its rq, and no others. 2353 */ 2354 p = pick_highest_pushable_task(src_rq, this_cpu); 2355 2356 /* 2357 * Do we have an RT task that preempts 2358 * the to-be-scheduled task? 2359 */ 2360 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 2361 WARN_ON(p == src_rq->curr); 2362 WARN_ON(!task_on_rq_queued(p)); 2363 2364 /* 2365 * There's a chance that p is higher in priority 2366 * than what's currently running on its CPU. 2367 * This is just that p is waking up and hasn't 2368 * had a chance to schedule. We only pull 2369 * p if it is lower in priority than the 2370 * current task on the run queue 2371 */ 2372 if (p->prio < src_rq->curr->prio) 2373 goto skip; 2374 2375 if (is_migration_disabled(p)) { 2376 push_task = get_push_task(src_rq); 2377 } else { 2378 deactivate_task(src_rq, p, 0); 2379 set_task_cpu(p, this_cpu); 2380 activate_task(this_rq, p, 0); 2381 resched = true; 2382 } 2383 /* 2384 * We continue with the search, just in 2385 * case there's an even higher prio task 2386 * in another runqueue. 
(low likelihood 2387 * but possible) 2388 */ 2389 } 2390 skip: 2391 double_unlock_balance(this_rq, src_rq); 2392 2393 if (push_task) { 2394 raw_spin_rq_unlock(this_rq); 2395 stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, 2396 push_task, &src_rq->push_work); 2397 raw_spin_rq_lock(this_rq); 2398 } 2399 } 2400 2401 if (resched) 2402 resched_curr(this_rq); 2403 } 2404 2405 /* 2406 * If we are not running and we are not going to reschedule soon, we should 2407 * try to push tasks away now 2408 */ 2409 static void task_woken_rt(struct rq *rq, struct task_struct *p) 2410 { 2411 bool need_to_push = !task_running(rq, p) && 2412 !test_tsk_need_resched(rq->curr) && 2413 p->nr_cpus_allowed > 1 && 2414 (dl_task(rq->curr) || rt_task(rq->curr)) && 2415 (rq->curr->nr_cpus_allowed < 2 || 2416 rq->curr->prio <= p->prio); 2417 2418 if (need_to_push) 2419 push_rt_tasks(rq); 2420 } 2421 2422 /* Assumes rq->lock is held */ 2423 static void rq_online_rt(struct rq *rq) 2424 { 2425 if (rq->rt.overloaded) 2426 rt_set_overload(rq); 2427 2428 __enable_runtime(rq); 2429 2430 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); 2431 } 2432 2433 /* Assumes rq->lock is held */ 2434 static void rq_offline_rt(struct rq *rq) 2435 { 2436 if (rq->rt.overloaded) 2437 rt_clear_overload(rq); 2438 2439 __disable_runtime(rq); 2440 2441 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); 2442 } 2443 2444 /* 2445 * When switch from the rt queue, we bring ourselves to a position 2446 * that we might want to pull RT tasks from other runqueues. 2447 */ 2448 static void switched_from_rt(struct rq *rq, struct task_struct *p) 2449 { 2450 /* 2451 * If there are other RT tasks then we will reschedule 2452 * and the scheduling of the other RT tasks will handle 2453 * the balancing. But if we are the last RT task 2454 * we may need to handle the pulling of RT tasks 2455 * now. 2456 */ 2457 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) 2458 return; 2459 2460 rt_queue_pull_task(rq); 2461 } 2462 2463 void __init init_sched_rt_class(void) 2464 { 2465 unsigned int i; 2466 2467 for_each_possible_cpu(i) { 2468 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 2469 GFP_KERNEL, cpu_to_node(i)); 2470 } 2471 } 2472 #endif /* CONFIG_SMP */ 2473 2474 /* 2475 * When switching a task to RT, we may overload the runqueue 2476 * with RT tasks. In this case we try to push them off to 2477 * other runqueues. 2478 */ 2479 static void switched_to_rt(struct rq *rq, struct task_struct *p) 2480 { 2481 /* 2482 * If we are running, update the avg_rt tracking, as the running time 2483 * will now on be accounted into the latter. 2484 */ 2485 if (task_current(rq, p)) { 2486 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); 2487 return; 2488 } 2489 2490 /* 2491 * If we are not running we may need to preempt the current 2492 * running task. If that current running task is also an RT task 2493 * then see if we can move to another run queue. 2494 */ 2495 if (task_on_rq_queued(p)) { 2496 #ifdef CONFIG_SMP 2497 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2498 rt_queue_push_tasks(rq); 2499 #endif /* CONFIG_SMP */ 2500 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) 2501 resched_curr(rq); 2502 } 2503 } 2504 2505 /* 2506 * Priority of the task has changed. This may cause 2507 * us to initiate a push or pull. 
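 *
 * Reminder on the numbers being compared here and throughout this file:
 * p->prio is the kernel-internal value, MAX_RT_PRIO - 1 - sched_priority
 * for an RT task (99 - sched_priority with MAX_RT_PRIO == 100), so a
 * smaller number means a higher priority. One user-space path that ends
 * up here is a plain parameter change on an already-RT task, e.g.
 * (illustrative sketch, pid being some SCHED_FIFO/SCHED_RR task):
 *
 *	struct sched_param sp = { .sched_priority = 30 };
 *
 *	sched_setparam(pid, &sp);
 *
 * which lands the task at p->prio == 69; rt_mutex priority inheritance
 * reaches this callback as well.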
2508 */ 2509 static void 2510 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 2511 { 2512 if (!task_on_rq_queued(p)) 2513 return; 2514 2515 if (task_current(rq, p)) { 2516 #ifdef CONFIG_SMP 2517 /* 2518 * If our priority decreases while running, we 2519 * may need to pull tasks to this runqueue. 2520 */ 2521 if (oldprio < p->prio) 2522 rt_queue_pull_task(rq); 2523 2524 /* 2525 * If there's a higher priority task waiting to run 2526 * then reschedule. 2527 */ 2528 if (p->prio > rq->rt.highest_prio.curr) 2529 resched_curr(rq); 2530 #else 2531 /* For UP simply resched on drop of prio */ 2532 if (oldprio < p->prio) 2533 resched_curr(rq); 2534 #endif /* CONFIG_SMP */ 2535 } else { 2536 /* 2537 * This task is not running, but if it is 2538 * greater than the current running task 2539 * then reschedule. 2540 */ 2541 if (p->prio < rq->curr->prio) 2542 resched_curr(rq); 2543 } 2544 } 2545 2546 #ifdef CONFIG_POSIX_TIMERS 2547 static void watchdog(struct rq *rq, struct task_struct *p) 2548 { 2549 unsigned long soft, hard; 2550 2551 /* max may change after cur was read, this will be fixed next tick */ 2552 soft = task_rlimit(p, RLIMIT_RTTIME); 2553 hard = task_rlimit_max(p, RLIMIT_RTTIME); 2554 2555 if (soft != RLIM_INFINITY) { 2556 unsigned long next; 2557 2558 if (p->rt.watchdog_stamp != jiffies) { 2559 p->rt.timeout++; 2560 p->rt.watchdog_stamp = jiffies; 2561 } 2562 2563 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 2564 if (p->rt.timeout > next) { 2565 posix_cputimers_rt_watchdog(&p->posix_cputimers, 2566 p->se.sum_exec_runtime); 2567 } 2568 } 2569 } 2570 #else 2571 static inline void watchdog(struct rq *rq, struct task_struct *p) { } 2572 #endif 2573 2574 /* 2575 * scheduler tick hitting a task of our scheduling class. 2576 * 2577 * NOTE: This function can be called remotely by the tick offload that 2578 * goes along full dynticks. Therefore no local assumption can be made 2579 * and everything must be accessed through the @rq and @curr passed in 2580 * parameters. 2581 */ 2582 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 2583 { 2584 struct sched_rt_entity *rt_se = &p->rt; 2585 2586 update_curr_rt(rq); 2587 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); 2588 2589 watchdog(rq, p); 2590 2591 /* 2592 * RR tasks need a special form of timeslice management. 2593 * FIFO tasks have no timeslices. 
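 *
 * The watchdog above enforces RLIMIT_RTTIME, the per-process budget of
 * CPU time, in microseconds, that an RT task may consume without making
 * a blocking call; SIGXCPU fires at the soft limit and SIGKILL at the
 * hard limit. It is set from user space, illustrative sketch:
 *
 *	struct rlimit rl = {
 *		.rlim_cur =  500000,
 *		.rlim_max = 1000000,
 *	};
 *
 *	setrlimit(RLIMIT_RTTIME, &rl);
 *
 * i.e. half a second soft, one second hard.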
2594 */ 2595 if (p->policy != SCHED_RR) 2596 return; 2597 2598 if (--p->rt.time_slice) 2599 return; 2600 2601 p->rt.time_slice = sched_rr_timeslice; 2602 2603 /* 2604 * Requeue to the end of queue if we (and all of our ancestors) are not 2605 * the only element on the queue 2606 */ 2607 for_each_sched_rt_entity(rt_se) { 2608 if (rt_se->run_list.prev != rt_se->run_list.next) { 2609 requeue_task_rt(rq, p, 0); 2610 resched_curr(rq); 2611 return; 2612 } 2613 } 2614 } 2615 2616 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 2617 { 2618 /* 2619 * Time slice is 0 for SCHED_FIFO tasks 2620 */ 2621 if (task->policy == SCHED_RR) 2622 return sched_rr_timeslice; 2623 else 2624 return 0; 2625 } 2626 2627 DEFINE_SCHED_CLASS(rt) = { 2628 2629 .enqueue_task = enqueue_task_rt, 2630 .dequeue_task = dequeue_task_rt, 2631 .yield_task = yield_task_rt, 2632 2633 .check_preempt_curr = check_preempt_curr_rt, 2634 2635 .pick_next_task = pick_next_task_rt, 2636 .put_prev_task = put_prev_task_rt, 2637 .set_next_task = set_next_task_rt, 2638 2639 #ifdef CONFIG_SMP 2640 .balance = balance_rt, 2641 .pick_task = pick_task_rt, 2642 .select_task_rq = select_task_rq_rt, 2643 .set_cpus_allowed = set_cpus_allowed_common, 2644 .rq_online = rq_online_rt, 2645 .rq_offline = rq_offline_rt, 2646 .task_woken = task_woken_rt, 2647 .switched_from = switched_from_rt, 2648 .find_lock_rq = find_lock_lowest_rq, 2649 #endif 2650 2651 .task_tick = task_tick_rt, 2652 2653 .get_rr_interval = get_rr_interval_rt, 2654 2655 .prio_changed = prio_changed_rt, 2656 .switched_to = switched_to_rt, 2657 2658 .update_curr = update_curr_rt, 2659 2660 #ifdef CONFIG_UCLAMP_TASK 2661 .uclamp_enabled = 1, 2662 #endif 2663 }; 2664 2665 #ifdef CONFIG_RT_GROUP_SCHED 2666 /* 2667 * Ensure that the real time constraints are schedulable. 2668 */ 2669 static DEFINE_MUTEX(rt_constraints_mutex); 2670 2671 static inline int tg_has_rt_tasks(struct task_group *tg) 2672 { 2673 struct task_struct *task; 2674 struct css_task_iter it; 2675 int ret = 0; 2676 2677 /* 2678 * Autogroups do not have RT tasks; see autogroup_create(). 2679 */ 2680 if (task_group_is_autogroup(tg)) 2681 return 0; 2682 2683 css_task_iter_start(&tg->css, 0, &it); 2684 while (!ret && (task = css_task_iter_next(&it))) 2685 ret |= rt_task(task); 2686 css_task_iter_end(&it); 2687 2688 return ret; 2689 } 2690 2691 struct rt_schedulable_data { 2692 struct task_group *tg; 2693 u64 rt_period; 2694 u64 rt_runtime; 2695 }; 2696 2697 static int tg_rt_schedulable(struct task_group *tg, void *data) 2698 { 2699 struct rt_schedulable_data *d = data; 2700 struct task_group *child; 2701 unsigned long total, sum = 0; 2702 u64 period, runtime; 2703 2704 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 2705 runtime = tg->rt_bandwidth.rt_runtime; 2706 2707 if (tg == d->tg) { 2708 period = d->rt_period; 2709 runtime = d->rt_runtime; 2710 } 2711 2712 /* 2713 * Cannot have more runtime than the period. 2714 */ 2715 if (runtime > period && runtime != RUNTIME_INF) 2716 return -EINVAL; 2717 2718 /* 2719 * Ensure we don't starve existing RT tasks if runtime turns zero. 2720 */ 2721 if (rt_bandwidth_enabled() && !runtime && 2722 tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) 2723 return -EBUSY; 2724 2725 total = to_ratio(period, runtime); 2726 2727 /* 2728 * Nobody can have more than the global setting allows. 2729 */ 2730 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 2731 return -EINVAL; 2732 2733 /* 2734 * The sum of our children's runtime should not exceed our own. 
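 *
 * to_ratio() turns each runtime/period pair into a fixed-point fraction,
 * so the rule is plain arithmetic. Worked example with made-up numbers:
 * a group granted 500000us every 1000000us (ratio 0.5) can host two
 * children at 250000us/1000000us each (0.25 + 0.25 == 0.5), but a write
 * asking for 300000us each is refused with -EINVAL because 0.6 > 0.5.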
2735 */ 2736 list_for_each_entry_rcu(child, &tg->children, siblings) { 2737 period = ktime_to_ns(child->rt_bandwidth.rt_period); 2738 runtime = child->rt_bandwidth.rt_runtime; 2739 2740 if (child == d->tg) { 2741 period = d->rt_period; 2742 runtime = d->rt_runtime; 2743 } 2744 2745 sum += to_ratio(period, runtime); 2746 } 2747 2748 if (sum > total) 2749 return -EINVAL; 2750 2751 return 0; 2752 } 2753 2754 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 2755 { 2756 int ret; 2757 2758 struct rt_schedulable_data data = { 2759 .tg = tg, 2760 .rt_period = period, 2761 .rt_runtime = runtime, 2762 }; 2763 2764 rcu_read_lock(); 2765 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 2766 rcu_read_unlock(); 2767 2768 return ret; 2769 } 2770 2771 static int tg_set_rt_bandwidth(struct task_group *tg, 2772 u64 rt_period, u64 rt_runtime) 2773 { 2774 int i, err = 0; 2775 2776 /* 2777 * Disallowing the root group RT runtime is BAD, it would disallow the 2778 * kernel creating (and or operating) RT threads. 2779 */ 2780 if (tg == &root_task_group && rt_runtime == 0) 2781 return -EINVAL; 2782 2783 /* No period doesn't make any sense. */ 2784 if (rt_period == 0) 2785 return -EINVAL; 2786 2787 /* 2788 * Bound quota to defend quota against overflow during bandwidth shift. 2789 */ 2790 if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) 2791 return -EINVAL; 2792 2793 mutex_lock(&rt_constraints_mutex); 2794 err = __rt_schedulable(tg, rt_period, rt_runtime); 2795 if (err) 2796 goto unlock; 2797 2798 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 2799 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 2800 tg->rt_bandwidth.rt_runtime = rt_runtime; 2801 2802 for_each_possible_cpu(i) { 2803 struct rt_rq *rt_rq = tg->rt_rq[i]; 2804 2805 raw_spin_lock(&rt_rq->rt_runtime_lock); 2806 rt_rq->rt_runtime = rt_runtime; 2807 raw_spin_unlock(&rt_rq->rt_runtime_lock); 2808 } 2809 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 2810 unlock: 2811 mutex_unlock(&rt_constraints_mutex); 2812 2813 return err; 2814 } 2815 2816 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 2817 { 2818 u64 rt_runtime, rt_period; 2819 2820 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 2821 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 2822 if (rt_runtime_us < 0) 2823 rt_runtime = RUNTIME_INF; 2824 else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) 2825 return -EINVAL; 2826 2827 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 2828 } 2829 2830 long sched_group_rt_runtime(struct task_group *tg) 2831 { 2832 u64 rt_runtime_us; 2833 2834 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 2835 return -1; 2836 2837 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 2838 do_div(rt_runtime_us, NSEC_PER_USEC); 2839 return rt_runtime_us; 2840 } 2841 2842 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) 2843 { 2844 u64 rt_runtime, rt_period; 2845 2846 if (rt_period_us > U64_MAX / NSEC_PER_USEC) 2847 return -EINVAL; 2848 2849 rt_period = rt_period_us * NSEC_PER_USEC; 2850 rt_runtime = tg->rt_bandwidth.rt_runtime; 2851 2852 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 2853 } 2854 2855 long sched_group_rt_period(struct task_group *tg) 2856 { 2857 u64 rt_period_us; 2858 2859 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 2860 do_div(rt_period_us, NSEC_PER_USEC); 2861 return rt_period_us; 2862 } 2863 2864 static int sched_rt_global_constraints(void) 2865 { 2866 int ret = 0; 2867 2868 mutex_lock(&rt_constraints_mutex); 2869 ret = 
__rt_schedulable(NULL, 0, 0); 2870 mutex_unlock(&rt_constraints_mutex); 2871 2872 return ret; 2873 } 2874 2875 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 2876 { 2877 /* Don't accept realtime tasks when there is no way for them to run */ 2878 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 2879 return 0; 2880 2881 return 1; 2882 } 2883 2884 #else /* !CONFIG_RT_GROUP_SCHED */ 2885 static int sched_rt_global_constraints(void) 2886 { 2887 unsigned long flags; 2888 int i; 2889 2890 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 2891 for_each_possible_cpu(i) { 2892 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 2893 2894 raw_spin_lock(&rt_rq->rt_runtime_lock); 2895 rt_rq->rt_runtime = global_rt_runtime(); 2896 raw_spin_unlock(&rt_rq->rt_runtime_lock); 2897 } 2898 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 2899 2900 return 0; 2901 } 2902 #endif /* CONFIG_RT_GROUP_SCHED */ 2903 2904 static int sched_rt_global_validate(void) 2905 { 2906 if (sysctl_sched_rt_period <= 0) 2907 return -EINVAL; 2908 2909 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 2910 ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || 2911 ((u64)sysctl_sched_rt_runtime * 2912 NSEC_PER_USEC > max_rt_runtime))) 2913 return -EINVAL; 2914 2915 return 0; 2916 } 2917 2918 static void sched_rt_do_global(void) 2919 { 2920 unsigned long flags; 2921 2922 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 2923 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 2924 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 2925 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 2926 } 2927 2928 int sched_rt_handler(struct ctl_table *table, int write, void *buffer, 2929 size_t *lenp, loff_t *ppos) 2930 { 2931 int old_period, old_runtime; 2932 static DEFINE_MUTEX(mutex); 2933 int ret; 2934 2935 mutex_lock(&mutex); 2936 old_period = sysctl_sched_rt_period; 2937 old_runtime = sysctl_sched_rt_runtime; 2938 2939 ret = proc_dointvec(table, write, buffer, lenp, ppos); 2940 2941 if (!ret && write) { 2942 ret = sched_rt_global_validate(); 2943 if (ret) 2944 goto undo; 2945 2946 ret = sched_dl_global_validate(); 2947 if (ret) 2948 goto undo; 2949 2950 ret = sched_rt_global_constraints(); 2951 if (ret) 2952 goto undo; 2953 2954 sched_rt_do_global(); 2955 sched_dl_do_global(); 2956 } 2957 if (0) { 2958 undo: 2959 sysctl_sched_rt_period = old_period; 2960 sysctl_sched_rt_runtime = old_runtime; 2961 } 2962 mutex_unlock(&mutex); 2963 2964 return ret; 2965 } 2966 2967 int sched_rr_handler(struct ctl_table *table, int write, void *buffer, 2968 size_t *lenp, loff_t *ppos) 2969 { 2970 int ret; 2971 static DEFINE_MUTEX(mutex); 2972 2973 mutex_lock(&mutex); 2974 ret = proc_dointvec(table, write, buffer, lenp, ppos); 2975 /* 2976 * Make sure that internally we keep jiffies. 2977 * Also, writing zero resets the timeslice to default: 2978 */ 2979 if (!ret && write) { 2980 sched_rr_timeslice = 2981 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : 2982 msecs_to_jiffies(sysctl_sched_rr_timeslice); 2983 } 2984 mutex_unlock(&mutex); 2985 2986 return ret; 2987 } 2988 2989 #ifdef CONFIG_SCHED_DEBUG 2990 void print_rt_stats(struct seq_file *m, int cpu) 2991 { 2992 rt_rq_iter_t iter; 2993 struct rt_rq *rt_rq; 2994 2995 rcu_read_lock(); 2996 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) 2997 print_rt_rq(m, cpu, rt_rq); 2998 rcu_read_unlock(); 2999 } 3000 #endif /* CONFIG_SCHED_DEBUG */ 3001
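
/*
 * Usage note (illustrative user-space sketch, not part of the kernel):
 * the RR quantum maintained by task_tick_rt()/get_rr_interval_rt() is
 * visible to applications through sched_rr_get_interval(), and the knobs
 * handled above live in /proc/sys/kernel/sched_rr_timeslice_ms,
 * sched_rt_period_us and sched_rt_runtime_us. A minimal reader, assuming
 * the caller has the privilege to switch itself to SCHED_RR, would be:
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct sched_param sp = { .sched_priority = 10 };
 *		struct timespec ts;
 *
 *		if (sched_setscheduler(0, SCHED_RR, &sp))
 *			perror("sched_setscheduler");
 *
 *		if (!sched_rr_get_interval(0, &ts))
 *			printf("RR timeslice: %ld.%09ld s\n",
 *			       (long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}
 *
 * For a SCHED_FIFO caller the reported interval is simply 0, matching
 * get_rr_interval_rt() above.
 */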