// SPDX-License-Identifier: GPL-2.0
/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */
#include "sched.h"

#include "pelt.h"

int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	int idle = 0;
	int overrun;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
		if (!overrun)
			break;

		raw_spin_unlock(&rt_b->rt_runtime_lock);
		idle = do_sched_rt_period_timer(rt_b, overrun);
		raw_spin_lock(&rt_b->rt_runtime_lock);
	}
	if (idle)
		rt_b->rt_period_active = 0;
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_HARD);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
		/*
		 * SCHED_DEADLINE updates the bandwidth, as a runaway
		 * RT task alongside a DL task could hog a CPU. But DL does
		 * not reset the period. If a deadline task was running
		 * without an RT task running, it can cause RT tasks to
		 * throttle when they start up. Kick the timer right away
		 * to update the period.
		 */
		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
		hrtimer_start_expires(&rt_b->rt_period_timer,
				      HRTIMER_MODE_ABS_PINNED_HARD);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

void init_rt_rq(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	rt_rq->highest_prio.next = MAX_RT_PRIO-1;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
	/* We start in dequeued state, because no RT tasks are queued */
	rt_rq->rt_queued = 0;

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_se->rt_rq;

	return rt_rq->rq;
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq);
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
214 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 215 } 216 217 return 1; 218 219 err_free_rq: 220 kfree(rt_rq); 221 err: 222 return 0; 223 } 224 225 #else /* CONFIG_RT_GROUP_SCHED */ 226 227 #define rt_entity_is_task(rt_se) (1) 228 229 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 230 { 231 return container_of(rt_se, struct task_struct, rt); 232 } 233 234 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 235 { 236 return container_of(rt_rq, struct rq, rt); 237 } 238 239 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) 240 { 241 struct task_struct *p = rt_task_of(rt_se); 242 243 return task_rq(p); 244 } 245 246 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 247 { 248 struct rq *rq = rq_of_rt_se(rt_se); 249 250 return &rq->rt; 251 } 252 253 void free_rt_sched_group(struct task_group *tg) { } 254 255 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 256 { 257 return 1; 258 } 259 #endif /* CONFIG_RT_GROUP_SCHED */ 260 261 #ifdef CONFIG_SMP 262 263 static void pull_rt_task(struct rq *this_rq); 264 265 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) 266 { 267 /* Try to pull RT tasks here if we lower this rq's prio */ 268 return rq->online && rq->rt.highest_prio.curr > prev->prio; 269 } 270 271 static inline int rt_overloaded(struct rq *rq) 272 { 273 return atomic_read(&rq->rd->rto_count); 274 } 275 276 static inline void rt_set_overload(struct rq *rq) 277 { 278 if (!rq->online) 279 return; 280 281 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); 282 /* 283 * Make sure the mask is visible before we set 284 * the overload count. That is checked to determine 285 * if we should look at the mask. It would be a shame 286 * if we looked at the mask, but the mask was not 287 * updated yet. 288 * 289 * Matched by the barrier in pull_rt_task(). 
290 */ 291 smp_wmb(); 292 atomic_inc(&rq->rd->rto_count); 293 } 294 295 static inline void rt_clear_overload(struct rq *rq) 296 { 297 if (!rq->online) 298 return; 299 300 /* the order here really doesn't matter */ 301 atomic_dec(&rq->rd->rto_count); 302 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); 303 } 304 305 static void update_rt_migration(struct rt_rq *rt_rq) 306 { 307 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { 308 if (!rt_rq->overloaded) { 309 rt_set_overload(rq_of_rt_rq(rt_rq)); 310 rt_rq->overloaded = 1; 311 } 312 } else if (rt_rq->overloaded) { 313 rt_clear_overload(rq_of_rt_rq(rt_rq)); 314 rt_rq->overloaded = 0; 315 } 316 } 317 318 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 319 { 320 struct task_struct *p; 321 322 if (!rt_entity_is_task(rt_se)) 323 return; 324 325 p = rt_task_of(rt_se); 326 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 327 328 rt_rq->rt_nr_total++; 329 if (p->nr_cpus_allowed > 1) 330 rt_rq->rt_nr_migratory++; 331 332 update_rt_migration(rt_rq); 333 } 334 335 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 336 { 337 struct task_struct *p; 338 339 if (!rt_entity_is_task(rt_se)) 340 return; 341 342 p = rt_task_of(rt_se); 343 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 344 345 rt_rq->rt_nr_total--; 346 if (p->nr_cpus_allowed > 1) 347 rt_rq->rt_nr_migratory--; 348 349 update_rt_migration(rt_rq); 350 } 351 352 static inline int has_pushable_tasks(struct rq *rq) 353 { 354 return !plist_head_empty(&rq->rt.pushable_tasks); 355 } 356 357 static DEFINE_PER_CPU(struct callback_head, rt_push_head); 358 static DEFINE_PER_CPU(struct callback_head, rt_pull_head); 359 360 static void push_rt_tasks(struct rq *); 361 static void pull_rt_task(struct rq *); 362 363 static inline void rt_queue_push_tasks(struct rq *rq) 364 { 365 if (!has_pushable_tasks(rq)) 366 return; 367 368 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); 369 } 370 371 static inline void rt_queue_pull_task(struct rq *rq) 372 { 373 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); 374 } 375 376 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 377 { 378 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 379 plist_node_init(&p->pushable_tasks, p->prio); 380 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 381 382 /* Update the highest prio pushable task */ 383 if (p->prio < rq->rt.highest_prio.next) 384 rq->rt.highest_prio.next = p->prio; 385 } 386 387 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 388 { 389 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 390 391 /* Update the new highest prio pushable task */ 392 if (has_pushable_tasks(rq)) { 393 p = plist_first_entry(&rq->rt.pushable_tasks, 394 struct task_struct, pushable_tasks); 395 rq->rt.highest_prio.next = p->prio; 396 } else { 397 rq->rt.highest_prio.next = MAX_RT_PRIO-1; 398 } 399 } 400 401 #else 402 403 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 404 { 405 } 406 407 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 408 { 409 } 410 411 static inline 412 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 413 { 414 } 415 416 static inline 417 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 418 { 419 } 420 421 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) 422 { 423 return false; 424 } 425 426 static inline void pull_rt_task(struct rq *this_rq) 
427 { 428 } 429 430 static inline void rt_queue_push_tasks(struct rq *rq) 431 { 432 } 433 #endif /* CONFIG_SMP */ 434 435 static void enqueue_top_rt_rq(struct rt_rq *rt_rq); 436 static void dequeue_top_rt_rq(struct rt_rq *rt_rq); 437 438 static inline int on_rt_rq(struct sched_rt_entity *rt_se) 439 { 440 return rt_se->on_rq; 441 } 442 443 #ifdef CONFIG_UCLAMP_TASK 444 /* 445 * Verify the fitness of task @p to run on @cpu taking into account the uclamp 446 * settings. 447 * 448 * This check is only important for heterogeneous systems where uclamp_min value 449 * is higher than the capacity of a @cpu. For non-heterogeneous system this 450 * function will always return true. 451 * 452 * The function will return true if the capacity of the @cpu is >= the 453 * uclamp_min and false otherwise. 454 * 455 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min 456 * > uclamp_max. 457 */ 458 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) 459 { 460 unsigned int min_cap; 461 unsigned int max_cap; 462 unsigned int cpu_cap; 463 464 /* Only heterogeneous systems can benefit from this check */ 465 if (!static_branch_unlikely(&sched_asym_cpucapacity)) 466 return true; 467 468 min_cap = uclamp_eff_value(p, UCLAMP_MIN); 469 max_cap = uclamp_eff_value(p, UCLAMP_MAX); 470 471 cpu_cap = capacity_orig_of(cpu); 472 473 return cpu_cap >= min(min_cap, max_cap); 474 } 475 #else 476 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) 477 { 478 return true; 479 } 480 #endif 481 482 #ifdef CONFIG_RT_GROUP_SCHED 483 484 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 485 { 486 if (!rt_rq->tg) 487 return RUNTIME_INF; 488 489 return rt_rq->rt_runtime; 490 } 491 492 static inline u64 sched_rt_period(struct rt_rq *rt_rq) 493 { 494 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 495 } 496 497 typedef struct task_group *rt_rq_iter_t; 498 499 static inline struct task_group *next_task_group(struct task_group *tg) 500 { 501 do { 502 tg = list_entry_rcu(tg->list.next, 503 typeof(struct task_group), list); 504 } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); 505 506 if (&tg->list == &task_groups) 507 tg = NULL; 508 509 return tg; 510 } 511 512 #define for_each_rt_rq(rt_rq, iter, rq) \ 513 for (iter = container_of(&task_groups, typeof(*iter), list); \ 514 (iter = next_task_group(iter)) && \ 515 (rt_rq = iter->rt_rq[cpu_of(rq)]);) 516 517 #define for_each_sched_rt_entity(rt_se) \ 518 for (; rt_se; rt_se = rt_se->parent) 519 520 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 521 { 522 return rt_se->my_q; 523 } 524 525 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); 526 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); 527 528 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 529 { 530 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 531 struct rq *rq = rq_of_rt_rq(rt_rq); 532 struct sched_rt_entity *rt_se; 533 534 int cpu = cpu_of(rq); 535 536 rt_se = rt_rq->tg->rt_se[cpu]; 537 538 if (rt_rq->rt_nr_running) { 539 if (!rt_se) 540 enqueue_top_rt_rq(rt_rq); 541 else if (!on_rt_rq(rt_se)) 542 enqueue_rt_entity(rt_se, 0); 543 544 if (rt_rq->highest_prio.curr < curr->prio) 545 resched_curr(rq); 546 } 547 } 548 549 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 550 { 551 struct sched_rt_entity *rt_se; 552 int cpu = cpu_of(rq_of_rt_rq(rt_rq)); 553 554 rt_se = rt_rq->tg->rt_se[cpu]; 555 556 if (!rt_se) { 557 dequeue_top_rt_rq(rt_rq); 558 /* Kick 
cpufreq (see the comment in kernel/sched/sched.h). */ 559 cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); 560 } 561 else if (on_rt_rq(rt_se)) 562 dequeue_rt_entity(rt_se, 0); 563 } 564 565 static inline int rt_rq_throttled(struct rt_rq *rt_rq) 566 { 567 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; 568 } 569 570 static int rt_se_boosted(struct sched_rt_entity *rt_se) 571 { 572 struct rt_rq *rt_rq = group_rt_rq(rt_se); 573 struct task_struct *p; 574 575 if (rt_rq) 576 return !!rt_rq->rt_nr_boosted; 577 578 p = rt_task_of(rt_se); 579 return p->prio != p->normal_prio; 580 } 581 582 #ifdef CONFIG_SMP 583 static inline const struct cpumask *sched_rt_period_mask(void) 584 { 585 return this_rq()->rd->span; 586 } 587 #else 588 static inline const struct cpumask *sched_rt_period_mask(void) 589 { 590 return cpu_online_mask; 591 } 592 #endif 593 594 static inline 595 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 596 { 597 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; 598 } 599 600 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) 601 { 602 return &rt_rq->tg->rt_bandwidth; 603 } 604 605 #else /* !CONFIG_RT_GROUP_SCHED */ 606 607 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 608 { 609 return rt_rq->rt_runtime; 610 } 611 612 static inline u64 sched_rt_period(struct rt_rq *rt_rq) 613 { 614 return ktime_to_ns(def_rt_bandwidth.rt_period); 615 } 616 617 typedef struct rt_rq *rt_rq_iter_t; 618 619 #define for_each_rt_rq(rt_rq, iter, rq) \ 620 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 621 622 #define for_each_sched_rt_entity(rt_se) \ 623 for (; rt_se; rt_se = NULL) 624 625 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 626 { 627 return NULL; 628 } 629 630 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 631 { 632 struct rq *rq = rq_of_rt_rq(rt_rq); 633 634 if (!rt_rq->rt_nr_running) 635 return; 636 637 enqueue_top_rt_rq(rt_rq); 638 resched_curr(rq); 639 } 640 641 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 642 { 643 dequeue_top_rt_rq(rt_rq); 644 } 645 646 static inline int rt_rq_throttled(struct rt_rq *rt_rq) 647 { 648 return rt_rq->rt_throttled; 649 } 650 651 static inline const struct cpumask *sched_rt_period_mask(void) 652 { 653 return cpu_online_mask; 654 } 655 656 static inline 657 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 658 { 659 return &cpu_rq(cpu)->rt; 660 } 661 662 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) 663 { 664 return &def_rt_bandwidth; 665 } 666 667 #endif /* CONFIG_RT_GROUP_SCHED */ 668 669 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) 670 { 671 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 672 673 return (hrtimer_active(&rt_b->rt_period_timer) || 674 rt_rq->rt_time < rt_b->rt_runtime); 675 } 676 677 #ifdef CONFIG_SMP 678 /* 679 * We ran out of runtime, see if we can borrow some from our neighbours. 
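 *
 * For example, in a root domain spanning four CPUs (n = 4), a neighbour
 * that was given 950ms of runtime and has consumed only 150ms has 800ms
 * of spare time; we would take 800ms / 4 = 200ms from it, and stop
 * borrowing once our own rt_runtime reaches the period length. (The
 * numbers are purely illustrative.)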
 */
static void do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
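		 *
		 * Note that RUNTIME_INF also doubles as the "disabled" marker
		 * checked in do_balance_runtime(), which will not steal
		 * runtime from a runqueue in this state.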
804 */ 805 rt_rq->rt_runtime = RUNTIME_INF; 806 rt_rq->rt_throttled = 0; 807 raw_spin_unlock(&rt_rq->rt_runtime_lock); 808 raw_spin_unlock(&rt_b->rt_runtime_lock); 809 810 /* Make rt_rq available for pick_next_task() */ 811 sched_rt_rq_enqueue(rt_rq); 812 } 813 } 814 815 static void __enable_runtime(struct rq *rq) 816 { 817 rt_rq_iter_t iter; 818 struct rt_rq *rt_rq; 819 820 if (unlikely(!scheduler_running)) 821 return; 822 823 /* 824 * Reset each runqueue's bandwidth settings 825 */ 826 for_each_rt_rq(rt_rq, iter, rq) { 827 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 828 829 raw_spin_lock(&rt_b->rt_runtime_lock); 830 raw_spin_lock(&rt_rq->rt_runtime_lock); 831 rt_rq->rt_runtime = rt_b->rt_runtime; 832 rt_rq->rt_time = 0; 833 rt_rq->rt_throttled = 0; 834 raw_spin_unlock(&rt_rq->rt_runtime_lock); 835 raw_spin_unlock(&rt_b->rt_runtime_lock); 836 } 837 } 838 839 static void balance_runtime(struct rt_rq *rt_rq) 840 { 841 if (!sched_feat(RT_RUNTIME_SHARE)) 842 return; 843 844 if (rt_rq->rt_time > rt_rq->rt_runtime) { 845 raw_spin_unlock(&rt_rq->rt_runtime_lock); 846 do_balance_runtime(rt_rq); 847 raw_spin_lock(&rt_rq->rt_runtime_lock); 848 } 849 } 850 #else /* !CONFIG_SMP */ 851 static inline void balance_runtime(struct rt_rq *rt_rq) {} 852 #endif /* CONFIG_SMP */ 853 854 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 855 { 856 int i, idle = 1, throttled = 0; 857 const struct cpumask *span; 858 859 span = sched_rt_period_mask(); 860 #ifdef CONFIG_RT_GROUP_SCHED 861 /* 862 * FIXME: isolated CPUs should really leave the root task group, 863 * whether they are isolcpus or were isolated via cpusets, lest 864 * the timer run on a CPU which does not service all runqueues, 865 * potentially leaving other CPUs indefinitely throttled. If 866 * isolation is really required, the user will turn the throttle 867 * off to kill the perturbations it causes anyway. Meanwhile, 868 * this maintains functionality for boot and/or troubleshooting. 869 */ 870 if (rt_b == &root_task_group.rt_bandwidth) 871 span = cpu_online_mask; 872 #endif 873 for_each_cpu(i, span) { 874 int enqueue = 0; 875 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 876 struct rq *rq = rq_of_rt_rq(rt_rq); 877 int skip; 878 879 /* 880 * When span == cpu_online_mask, taking each rq->lock 881 * can be time-consuming. Try to avoid it when possible. 882 */ 883 raw_spin_lock(&rt_rq->rt_runtime_lock); 884 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) 885 rt_rq->rt_runtime = rt_b->rt_runtime; 886 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; 887 raw_spin_unlock(&rt_rq->rt_runtime_lock); 888 if (skip) 889 continue; 890 891 raw_spin_rq_lock(rq); 892 update_rq_clock(rq); 893 894 if (rt_rq->rt_time) { 895 u64 runtime; 896 897 raw_spin_lock(&rt_rq->rt_runtime_lock); 898 if (rt_rq->rt_throttled) 899 balance_runtime(rt_rq); 900 runtime = rt_rq->rt_runtime; 901 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); 902 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 903 rt_rq->rt_throttled = 0; 904 enqueue = 1; 905 906 /* 907 * When we're idle and a woken (rt) task is 908 * throttled check_preempt_curr() will set 909 * skip_update and the time between the wakeup 910 * and this unthrottle will get accounted as 911 * 'runtime'. 
912 */ 913 if (rt_rq->rt_nr_running && rq->curr == rq->idle) 914 rq_clock_cancel_skipupdate(rq); 915 } 916 if (rt_rq->rt_time || rt_rq->rt_nr_running) 917 idle = 0; 918 raw_spin_unlock(&rt_rq->rt_runtime_lock); 919 } else if (rt_rq->rt_nr_running) { 920 idle = 0; 921 if (!rt_rq_throttled(rt_rq)) 922 enqueue = 1; 923 } 924 if (rt_rq->rt_throttled) 925 throttled = 1; 926 927 if (enqueue) 928 sched_rt_rq_enqueue(rt_rq); 929 raw_spin_rq_unlock(rq); 930 } 931 932 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) 933 return 1; 934 935 return idle; 936 } 937 938 static inline int rt_se_prio(struct sched_rt_entity *rt_se) 939 { 940 #ifdef CONFIG_RT_GROUP_SCHED 941 struct rt_rq *rt_rq = group_rt_rq(rt_se); 942 943 if (rt_rq) 944 return rt_rq->highest_prio.curr; 945 #endif 946 947 return rt_task_of(rt_se)->prio; 948 } 949 950 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) 951 { 952 u64 runtime = sched_rt_runtime(rt_rq); 953 954 if (rt_rq->rt_throttled) 955 return rt_rq_throttled(rt_rq); 956 957 if (runtime >= sched_rt_period(rt_rq)) 958 return 0; 959 960 balance_runtime(rt_rq); 961 runtime = sched_rt_runtime(rt_rq); 962 if (runtime == RUNTIME_INF) 963 return 0; 964 965 if (rt_rq->rt_time > runtime) { 966 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 967 968 /* 969 * Don't actually throttle groups that have no runtime assigned 970 * but accrue some time due to boosting. 971 */ 972 if (likely(rt_b->rt_runtime)) { 973 rt_rq->rt_throttled = 1; 974 printk_deferred_once("sched: RT throttling activated\n"); 975 } else { 976 /* 977 * In case we did anyway, make it go away, 978 * replenishment is a joke, since it will replenish us 979 * with exactly 0 ns. 980 */ 981 rt_rq->rt_time = 0; 982 } 983 984 if (rt_rq_throttled(rt_rq)) { 985 sched_rt_rq_dequeue(rt_rq); 986 return 1; 987 } 988 } 989 990 return 0; 991 } 992 993 /* 994 * Update the current task's runtime statistics. Skip current tasks that 995 * are not in our scheduling class. 
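 *
 * The elapsed time is charged both to the task (sum_exec_runtime and
 * cputime accounting) and, when RT bandwidth control is enabled, to every
 * rt_rq in the entity's hierarchy so that sched_rt_runtime_exceeded() can
 * throttle a group once it overruns its budget.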
996 */ 997 static void update_curr_rt(struct rq *rq) 998 { 999 struct task_struct *curr = rq->curr; 1000 struct sched_rt_entity *rt_se = &curr->rt; 1001 u64 delta_exec; 1002 u64 now; 1003 1004 if (curr->sched_class != &rt_sched_class) 1005 return; 1006 1007 now = rq_clock_task(rq); 1008 delta_exec = now - curr->se.exec_start; 1009 if (unlikely((s64)delta_exec <= 0)) 1010 return; 1011 1012 schedstat_set(curr->stats.exec_max, 1013 max(curr->stats.exec_max, delta_exec)); 1014 1015 trace_sched_stat_runtime(curr, delta_exec, 0); 1016 1017 curr->se.sum_exec_runtime += delta_exec; 1018 account_group_exec_runtime(curr, delta_exec); 1019 1020 curr->se.exec_start = now; 1021 cgroup_account_cputime(curr, delta_exec); 1022 1023 if (!rt_bandwidth_enabled()) 1024 return; 1025 1026 for_each_sched_rt_entity(rt_se) { 1027 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1028 1029 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 1030 raw_spin_lock(&rt_rq->rt_runtime_lock); 1031 rt_rq->rt_time += delta_exec; 1032 if (sched_rt_runtime_exceeded(rt_rq)) 1033 resched_curr(rq); 1034 raw_spin_unlock(&rt_rq->rt_runtime_lock); 1035 } 1036 } 1037 } 1038 1039 static void 1040 dequeue_top_rt_rq(struct rt_rq *rt_rq) 1041 { 1042 struct rq *rq = rq_of_rt_rq(rt_rq); 1043 1044 BUG_ON(&rq->rt != rt_rq); 1045 1046 if (!rt_rq->rt_queued) 1047 return; 1048 1049 BUG_ON(!rq->nr_running); 1050 1051 sub_nr_running(rq, rt_rq->rt_nr_running); 1052 rt_rq->rt_queued = 0; 1053 1054 } 1055 1056 static void 1057 enqueue_top_rt_rq(struct rt_rq *rt_rq) 1058 { 1059 struct rq *rq = rq_of_rt_rq(rt_rq); 1060 1061 BUG_ON(&rq->rt != rt_rq); 1062 1063 if (rt_rq->rt_queued) 1064 return; 1065 1066 if (rt_rq_throttled(rt_rq)) 1067 return; 1068 1069 if (rt_rq->rt_nr_running) { 1070 add_nr_running(rq, rt_rq->rt_nr_running); 1071 rt_rq->rt_queued = 1; 1072 } 1073 1074 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ 1075 cpufreq_update_util(rq, 0); 1076 } 1077 1078 #if defined CONFIG_SMP 1079 1080 static void 1081 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 1082 { 1083 struct rq *rq = rq_of_rt_rq(rt_rq); 1084 1085 #ifdef CONFIG_RT_GROUP_SCHED 1086 /* 1087 * Change rq's cpupri only if rt_rq is the top queue. 1088 */ 1089 if (&rq->rt != rt_rq) 1090 return; 1091 #endif 1092 if (rq->online && prio < prev_prio) 1093 cpupri_set(&rq->rd->cpupri, rq->cpu, prio); 1094 } 1095 1096 static void 1097 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 1098 { 1099 struct rq *rq = rq_of_rt_rq(rt_rq); 1100 1101 #ifdef CONFIG_RT_GROUP_SCHED 1102 /* 1103 * Change rq's cpupri only if rt_rq is the top queue. 
1104 */ 1105 if (&rq->rt != rt_rq) 1106 return; 1107 #endif 1108 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 1109 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 1110 } 1111 1112 #else /* CONFIG_SMP */ 1113 1114 static inline 1115 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 1116 static inline 1117 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 1118 1119 #endif /* CONFIG_SMP */ 1120 1121 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 1122 static void 1123 inc_rt_prio(struct rt_rq *rt_rq, int prio) 1124 { 1125 int prev_prio = rt_rq->highest_prio.curr; 1126 1127 if (prio < prev_prio) 1128 rt_rq->highest_prio.curr = prio; 1129 1130 inc_rt_prio_smp(rt_rq, prio, prev_prio); 1131 } 1132 1133 static void 1134 dec_rt_prio(struct rt_rq *rt_rq, int prio) 1135 { 1136 int prev_prio = rt_rq->highest_prio.curr; 1137 1138 if (rt_rq->rt_nr_running) { 1139 1140 WARN_ON(prio < prev_prio); 1141 1142 /* 1143 * This may have been our highest task, and therefore 1144 * we may have some recomputation to do 1145 */ 1146 if (prio == prev_prio) { 1147 struct rt_prio_array *array = &rt_rq->active; 1148 1149 rt_rq->highest_prio.curr = 1150 sched_find_first_bit(array->bitmap); 1151 } 1152 1153 } else { 1154 rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 1155 } 1156 1157 dec_rt_prio_smp(rt_rq, prio, prev_prio); 1158 } 1159 1160 #else 1161 1162 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} 1163 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} 1164 1165 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ 1166 1167 #ifdef CONFIG_RT_GROUP_SCHED 1168 1169 static void 1170 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1171 { 1172 if (rt_se_boosted(rt_se)) 1173 rt_rq->rt_nr_boosted++; 1174 1175 if (rt_rq->tg) 1176 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); 1177 } 1178 1179 static void 1180 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1181 { 1182 if (rt_se_boosted(rt_se)) 1183 rt_rq->rt_nr_boosted--; 1184 1185 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); 1186 } 1187 1188 #else /* CONFIG_RT_GROUP_SCHED */ 1189 1190 static void 1191 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1192 { 1193 start_rt_bandwidth(&def_rt_bandwidth); 1194 } 1195 1196 static inline 1197 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} 1198 1199 #endif /* CONFIG_RT_GROUP_SCHED */ 1200 1201 static inline 1202 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) 1203 { 1204 struct rt_rq *group_rq = group_rt_rq(rt_se); 1205 1206 if (group_rq) 1207 return group_rq->rt_nr_running; 1208 else 1209 return 1; 1210 } 1211 1212 static inline 1213 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) 1214 { 1215 struct rt_rq *group_rq = group_rt_rq(rt_se); 1216 struct task_struct *tsk; 1217 1218 if (group_rq) 1219 return group_rq->rr_nr_running; 1220 1221 tsk = rt_task_of(rt_se); 1222 1223 return (tsk->policy == SCHED_RR) ? 
1 : 0; 1224 } 1225 1226 static inline 1227 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1228 { 1229 int prio = rt_se_prio(rt_se); 1230 1231 WARN_ON(!rt_prio(prio)); 1232 rt_rq->rt_nr_running += rt_se_nr_running(rt_se); 1233 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); 1234 1235 inc_rt_prio(rt_rq, prio); 1236 inc_rt_migration(rt_se, rt_rq); 1237 inc_rt_group(rt_se, rt_rq); 1238 } 1239 1240 static inline 1241 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1242 { 1243 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1244 WARN_ON(!rt_rq->rt_nr_running); 1245 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); 1246 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); 1247 1248 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1249 dec_rt_migration(rt_se, rt_rq); 1250 dec_rt_group(rt_se, rt_rq); 1251 } 1252 1253 /* 1254 * Change rt_se->run_list location unless SAVE && !MOVE 1255 * 1256 * assumes ENQUEUE/DEQUEUE flags match 1257 */ 1258 static inline bool move_entity(unsigned int flags) 1259 { 1260 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) 1261 return false; 1262 1263 return true; 1264 } 1265 1266 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) 1267 { 1268 list_del_init(&rt_se->run_list); 1269 1270 if (list_empty(array->queue + rt_se_prio(rt_se))) 1271 __clear_bit(rt_se_prio(rt_se), array->bitmap); 1272 1273 rt_se->on_list = 0; 1274 } 1275 1276 static inline struct sched_statistics * 1277 __schedstats_from_rt_se(struct sched_rt_entity *rt_se) 1278 { 1279 #ifdef CONFIG_RT_GROUP_SCHED 1280 /* schedstats is not supported for rt group. */ 1281 if (!rt_entity_is_task(rt_se)) 1282 return NULL; 1283 #endif 1284 1285 return &rt_task_of(rt_se)->stats; 1286 } 1287 1288 static inline void 1289 update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 1290 { 1291 struct sched_statistics *stats; 1292 struct task_struct *p = NULL; 1293 1294 if (!schedstat_enabled()) 1295 return; 1296 1297 if (rt_entity_is_task(rt_se)) 1298 p = rt_task_of(rt_se); 1299 1300 stats = __schedstats_from_rt_se(rt_se); 1301 if (!stats) 1302 return; 1303 1304 __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats); 1305 } 1306 1307 static inline void 1308 update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 1309 { 1310 struct sched_statistics *stats; 1311 struct task_struct *p = NULL; 1312 1313 if (!schedstat_enabled()) 1314 return; 1315 1316 if (rt_entity_is_task(rt_se)) 1317 p = rt_task_of(rt_se); 1318 1319 stats = __schedstats_from_rt_se(rt_se); 1320 if (!stats) 1321 return; 1322 1323 __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats); 1324 } 1325 1326 static inline void 1327 update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 1328 int flags) 1329 { 1330 if (!schedstat_enabled()) 1331 return; 1332 1333 if (flags & ENQUEUE_WAKEUP) 1334 update_stats_enqueue_sleeper_rt(rt_rq, rt_se); 1335 } 1336 1337 static inline void 1338 update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 1339 { 1340 struct sched_statistics *stats; 1341 struct task_struct *p = NULL; 1342 1343 if (!schedstat_enabled()) 1344 return; 1345 1346 if (rt_entity_is_task(rt_se)) 1347 p = rt_task_of(rt_se); 1348 1349 stats = __schedstats_from_rt_se(rt_se); 1350 if (!stats) 1351 return; 1352 1353 __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats); 1354 } 1355 1356 static inline void 1357 update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 1358 int 
flags)
{
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	if ((flags & DEQUEUE_SLEEP) && p) {
		unsigned int state;

		state = READ_ONCE(p->__state);
		if (state & TASK_INTERRUPTIBLE)
			__schedstat_set(p->stats.sleep_start,
					rq_clock(rq_of_rt_rq(rt_rq)));

		if (state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(p->stats.block_start,
					rq_clock(rq_of_rt_rq(rt_rq)));
	}
}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
		if (rt_se->on_list)
			__delist_rt_entity(rt_se, array);
		return;
	}

	if (move_entity(flags)) {
		WARN_ON_ONCE(rt_se->on_list);
		if (flags & ENQUEUE_HEAD)
			list_add(&rt_se->run_list, queue);
		else
			list_add_tail(&rt_se->run_list, queue);

		__set_bit(rt_se_prio(rt_se), array->bitmap);
		rt_se->on_list = 1;
	}
	rt_se->on_rq = 1;

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	if (move_entity(flags)) {
		WARN_ON_ONCE(!rt_se->on_list);
		__delist_rt_entity(rt_se, array);
	}
	rt_se->on_rq = 0;

	dec_rt_tasks(rt_se, rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top-down.
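 *
 * (A group entity's priority, rt_se_prio(), follows highest_prio.curr of
 * the rt_rq it owns, so dequeuing a child entity first could change the
 * priority under which its parent is still linked; removing from the top
 * down avoids delisting an entity from the wrong priority list.)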
1433 */ 1434 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) 1435 { 1436 struct sched_rt_entity *back = NULL; 1437 1438 for_each_sched_rt_entity(rt_se) { 1439 rt_se->back = back; 1440 back = rt_se; 1441 } 1442 1443 dequeue_top_rt_rq(rt_rq_of_se(back)); 1444 1445 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1446 if (on_rt_rq(rt_se)) 1447 __dequeue_rt_entity(rt_se, flags); 1448 } 1449 } 1450 1451 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1452 { 1453 struct rq *rq = rq_of_rt_se(rt_se); 1454 1455 update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags); 1456 1457 dequeue_rt_stack(rt_se, flags); 1458 for_each_sched_rt_entity(rt_se) 1459 __enqueue_rt_entity(rt_se, flags); 1460 enqueue_top_rt_rq(&rq->rt); 1461 } 1462 1463 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1464 { 1465 struct rq *rq = rq_of_rt_se(rt_se); 1466 1467 update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags); 1468 1469 dequeue_rt_stack(rt_se, flags); 1470 1471 for_each_sched_rt_entity(rt_se) { 1472 struct rt_rq *rt_rq = group_rt_rq(rt_se); 1473 1474 if (rt_rq && rt_rq->rt_nr_running) 1475 __enqueue_rt_entity(rt_se, flags); 1476 } 1477 enqueue_top_rt_rq(&rq->rt); 1478 } 1479 1480 /* 1481 * Adding/removing a task to/from a priority array: 1482 */ 1483 static void 1484 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1485 { 1486 struct sched_rt_entity *rt_se = &p->rt; 1487 1488 if (flags & ENQUEUE_WAKEUP) 1489 rt_se->timeout = 0; 1490 1491 check_schedstat_required(); 1492 update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se); 1493 1494 enqueue_rt_entity(rt_se, flags); 1495 1496 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1497 enqueue_pushable_task(rq, p); 1498 } 1499 1500 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1501 { 1502 struct sched_rt_entity *rt_se = &p->rt; 1503 1504 update_curr_rt(rq); 1505 dequeue_rt_entity(rt_se, flags); 1506 1507 dequeue_pushable_task(rq, p); 1508 } 1509 1510 /* 1511 * Put task to the head or the end of the run list without the overhead of 1512 * dequeue followed by enqueue. 
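 *
 * (Used with head == 0 by yield_task_rt() to rotate the current task to
 * the tail of its priority queue, and with head == 1 by
 * check_preempt_equal_prio() to keep the woken, less migratable task at
 * the front.)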
1513 */ 1514 static void 1515 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1516 { 1517 if (on_rt_rq(rt_se)) { 1518 struct rt_prio_array *array = &rt_rq->active; 1519 struct list_head *queue = array->queue + rt_se_prio(rt_se); 1520 1521 if (head) 1522 list_move(&rt_se->run_list, queue); 1523 else 1524 list_move_tail(&rt_se->run_list, queue); 1525 } 1526 } 1527 1528 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) 1529 { 1530 struct sched_rt_entity *rt_se = &p->rt; 1531 struct rt_rq *rt_rq; 1532 1533 for_each_sched_rt_entity(rt_se) { 1534 rt_rq = rt_rq_of_se(rt_se); 1535 requeue_rt_entity(rt_rq, rt_se, head); 1536 } 1537 } 1538 1539 static void yield_task_rt(struct rq *rq) 1540 { 1541 requeue_task_rt(rq, rq->curr, 0); 1542 } 1543 1544 #ifdef CONFIG_SMP 1545 static int find_lowest_rq(struct task_struct *task); 1546 1547 static int 1548 select_task_rq_rt(struct task_struct *p, int cpu, int flags) 1549 { 1550 struct task_struct *curr; 1551 struct rq *rq; 1552 bool test; 1553 1554 /* For anything but wake ups, just return the task_cpu */ 1555 if (!(flags & (WF_TTWU | WF_FORK))) 1556 goto out; 1557 1558 rq = cpu_rq(cpu); 1559 1560 rcu_read_lock(); 1561 curr = READ_ONCE(rq->curr); /* unlocked access */ 1562 1563 /* 1564 * If the current task on @p's runqueue is an RT task, then 1565 * try to see if we can wake this RT task up on another 1566 * runqueue. Otherwise simply start this RT task 1567 * on its current runqueue. 1568 * 1569 * We want to avoid overloading runqueues. If the woken 1570 * task is a higher priority, then it will stay on this CPU 1571 * and the lower prio task should be moved to another CPU. 1572 * Even though this will probably make the lower prio task 1573 * lose its cache, we do not want to bounce a higher task 1574 * around just because it gave up its CPU, perhaps for a 1575 * lock? 1576 * 1577 * For equal prio tasks, we just let the scheduler sort it out. 1578 * 1579 * Otherwise, just let it ride on the affined RQ and the 1580 * post-schedule router will push the preempted task away 1581 * 1582 * This test is optimistic, if we get it wrong the load-balancer 1583 * will have to sort it out. 1584 * 1585 * We take into account the capacity of the CPU to ensure it fits the 1586 * requirement of the task - which is only important on heterogeneous 1587 * systems like big.LITTLE. 1588 */ 1589 test = curr && 1590 unlikely(rt_task(curr)) && 1591 (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); 1592 1593 if (test || !rt_task_fits_capacity(p, cpu)) { 1594 int target = find_lowest_rq(p); 1595 1596 /* 1597 * Bail out if we were forcing a migration to find a better 1598 * fitting CPU but our search failed. 1599 */ 1600 if (!test && target != -1 && !rt_task_fits_capacity(p, target)) 1601 goto out_unlock; 1602 1603 /* 1604 * Don't bother moving it if the destination CPU is 1605 * not running a lower priority task. 1606 */ 1607 if (target != -1 && 1608 p->prio < cpu_rq(target)->rt.highest_prio.curr) 1609 cpu = target; 1610 } 1611 1612 out_unlock: 1613 rcu_read_unlock(); 1614 1615 out: 1616 return cpu; 1617 } 1618 1619 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1620 { 1621 /* 1622 * Current can't be migrated, useless to reschedule, 1623 * let's hope p can move out. 
	 */
	if (rq->curr->nr_cpus_allowed == 1 ||
	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
		return;

	/*
	 * p is migratable, so let's not schedule it and
	 * see if it is pushed or pulled somewhere else.
	 */
	if (p->nr_cpus_allowed != 1 &&
	    cpupri_find(&rq->rd->cpupri, p, NULL))
		return;

	/*
	 * There appear to be other CPUs that can accept
	 * the current task but none can run 'p', so let's reschedule
	 * to try and push the current task away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_curr(rq);
}

static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		pull_rt_task(rq);
		rq_repin_lock(rq, rf);
	}

	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->prio < rq->curr->prio) {
		resched_curr(rq);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu. If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}

static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq = &rq->rt;

	p->se.exec_start = rq_clock_task(rq);
	if (on_rt_rq(&p->rt))
		update_stats_wait_end_rt(rt_rq, rt_se);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);

	if (!first)
		return;

	/*
	 * If prev task was rt, put_prev_task() has already updated the
	 * utilization.
We only care of the case where we start to schedule a 1710 * rt task 1711 */ 1712 if (rq->curr->sched_class != &rt_sched_class) 1713 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); 1714 1715 rt_queue_push_tasks(rq); 1716 } 1717 1718 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 1719 struct rt_rq *rt_rq) 1720 { 1721 struct rt_prio_array *array = &rt_rq->active; 1722 struct sched_rt_entity *next = NULL; 1723 struct list_head *queue; 1724 int idx; 1725 1726 idx = sched_find_first_bit(array->bitmap); 1727 BUG_ON(idx >= MAX_RT_PRIO); 1728 1729 queue = array->queue + idx; 1730 next = list_entry(queue->next, struct sched_rt_entity, run_list); 1731 1732 return next; 1733 } 1734 1735 static struct task_struct *_pick_next_task_rt(struct rq *rq) 1736 { 1737 struct sched_rt_entity *rt_se; 1738 struct rt_rq *rt_rq = &rq->rt; 1739 1740 do { 1741 rt_se = pick_next_rt_entity(rq, rt_rq); 1742 BUG_ON(!rt_se); 1743 rt_rq = group_rt_rq(rt_se); 1744 } while (rt_rq); 1745 1746 return rt_task_of(rt_se); 1747 } 1748 1749 static struct task_struct *pick_task_rt(struct rq *rq) 1750 { 1751 struct task_struct *p; 1752 1753 if (!sched_rt_runnable(rq)) 1754 return NULL; 1755 1756 p = _pick_next_task_rt(rq); 1757 1758 return p; 1759 } 1760 1761 static struct task_struct *pick_next_task_rt(struct rq *rq) 1762 { 1763 struct task_struct *p = pick_task_rt(rq); 1764 1765 if (p) 1766 set_next_task_rt(rq, p, true); 1767 1768 return p; 1769 } 1770 1771 static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1772 { 1773 struct sched_rt_entity *rt_se = &p->rt; 1774 struct rt_rq *rt_rq = &rq->rt; 1775 1776 if (on_rt_rq(&p->rt)) 1777 update_stats_wait_start_rt(rt_rq, rt_se); 1778 1779 update_curr_rt(rq); 1780 1781 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); 1782 1783 /* 1784 * The previous task needs to be made eligible for pushing 1785 * if it is still active 1786 */ 1787 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) 1788 enqueue_pushable_task(rq, p); 1789 } 1790 1791 #ifdef CONFIG_SMP 1792 1793 /* Only try algorithms three times */ 1794 #define RT_MAX_TRIES 3 1795 1796 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1797 { 1798 if (!task_running(rq, p) && 1799 cpumask_test_cpu(cpu, &p->cpus_mask)) 1800 return 1; 1801 1802 return 0; 1803 } 1804 1805 /* 1806 * Return the highest pushable rq's task, which is suitable to be executed 1807 * on the CPU, NULL otherwise 1808 */ 1809 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) 1810 { 1811 struct plist_head *head = &rq->rt.pushable_tasks; 1812 struct task_struct *p; 1813 1814 if (!has_pushable_tasks(rq)) 1815 return NULL; 1816 1817 plist_for_each_entry(p, head, pushable_tasks) { 1818 if (pick_rt_task(rq, p, cpu)) 1819 return p; 1820 } 1821 1822 return NULL; 1823 } 1824 1825 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1826 1827 static int find_lowest_rq(struct task_struct *task) 1828 { 1829 struct sched_domain *sd; 1830 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); 1831 int this_cpu = smp_processor_id(); 1832 int cpu = task_cpu(task); 1833 int ret; 1834 1835 /* Make sure the mask is initialized first */ 1836 if (unlikely(!lowest_mask)) 1837 return -1; 1838 1839 if (task->nr_cpus_allowed == 1) 1840 return -1; /* No other targets possible */ 1841 1842 /* 1843 * If we're on asym system ensure we consider the different capacities 1844 * of the CPUs when searching for the lowest_mask. 
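	 *
	 * cpupri_find_fitness() takes rt_task_fits_capacity() as its fitness
	 * callback, so CPUs whose capacity cannot cover the task's effective
	 * uclamp_min are filtered out of lowest_mask where possible.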
1845 */ 1846 if (static_branch_unlikely(&sched_asym_cpucapacity)) { 1847 1848 ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, 1849 task, lowest_mask, 1850 rt_task_fits_capacity); 1851 } else { 1852 1853 ret = cpupri_find(&task_rq(task)->rd->cpupri, 1854 task, lowest_mask); 1855 } 1856 1857 if (!ret) 1858 return -1; /* No targets found */ 1859 1860 /* 1861 * At this point we have built a mask of CPUs representing the 1862 * lowest priority tasks in the system. Now we want to elect 1863 * the best one based on our affinity and topology. 1864 * 1865 * We prioritize the last CPU that the task executed on since 1866 * it is most likely cache-hot in that location. 1867 */ 1868 if (cpumask_test_cpu(cpu, lowest_mask)) 1869 return cpu; 1870 1871 /* 1872 * Otherwise, we consult the sched_domains span maps to figure 1873 * out which CPU is logically closest to our hot cache data. 1874 */ 1875 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1876 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1877 1878 rcu_read_lock(); 1879 for_each_domain(cpu, sd) { 1880 if (sd->flags & SD_WAKE_AFFINE) { 1881 int best_cpu; 1882 1883 /* 1884 * "this_cpu" is cheaper to preempt than a 1885 * remote processor. 1886 */ 1887 if (this_cpu != -1 && 1888 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { 1889 rcu_read_unlock(); 1890 return this_cpu; 1891 } 1892 1893 best_cpu = cpumask_any_and_distribute(lowest_mask, 1894 sched_domain_span(sd)); 1895 if (best_cpu < nr_cpu_ids) { 1896 rcu_read_unlock(); 1897 return best_cpu; 1898 } 1899 } 1900 } 1901 rcu_read_unlock(); 1902 1903 /* 1904 * And finally, if there were no matches within the domains 1905 * just give the caller *something* to work with from the compatible 1906 * locations. 1907 */ 1908 if (this_cpu != -1) 1909 return this_cpu; 1910 1911 cpu = cpumask_any_distribute(lowest_mask); 1912 if (cpu < nr_cpu_ids) 1913 return cpu; 1914 1915 return -1; 1916 } 1917 1918 /* Will lock the rq it finds */ 1919 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) 1920 { 1921 struct rq *lowest_rq = NULL; 1922 int tries; 1923 int cpu; 1924 1925 for (tries = 0; tries < RT_MAX_TRIES; tries++) { 1926 cpu = find_lowest_rq(task); 1927 1928 if ((cpu == -1) || (cpu == rq->cpu)) 1929 break; 1930 1931 lowest_rq = cpu_rq(cpu); 1932 1933 if (lowest_rq->rt.highest_prio.curr <= task->prio) { 1934 /* 1935 * Target rq has tasks of equal or higher priority, 1936 * retrying does not release any lock and is unlikely 1937 * to yield a different result. 1938 */ 1939 lowest_rq = NULL; 1940 break; 1941 } 1942 1943 /* if the prio of this runqueue changed, try again */ 1944 if (double_lock_balance(rq, lowest_rq)) { 1945 /* 1946 * We had to unlock the run queue. In 1947 * the mean time, task could have 1948 * migrated already or had its affinity changed. 1949 * Also make sure that it wasn't scheduled on its rq. 1950 */ 1951 if (unlikely(task_rq(task) != rq || 1952 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || 1953 task_running(rq, task) || 1954 !rt_task(task) || 1955 !task_on_rq_queued(task))) { 1956 1957 double_unlock_balance(rq, lowest_rq); 1958 lowest_rq = NULL; 1959 break; 1960 } 1961 } 1962 1963 /* If this rq is still suitable use it. 
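		 * (double_lock_balance() may have dropped rq->lock above, so
		 * lowest_rq's top priority can have changed while the locks
		 * were not held; re-check before pushing to it.)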
*/ 1964 if (lowest_rq->rt.highest_prio.curr > task->prio) 1965 break; 1966 1967 /* try again */ 1968 double_unlock_balance(rq, lowest_rq); 1969 lowest_rq = NULL; 1970 } 1971 1972 return lowest_rq; 1973 } 1974 1975 static struct task_struct *pick_next_pushable_task(struct rq *rq) 1976 { 1977 struct task_struct *p; 1978 1979 if (!has_pushable_tasks(rq)) 1980 return NULL; 1981 1982 p = plist_first_entry(&rq->rt.pushable_tasks, 1983 struct task_struct, pushable_tasks); 1984 1985 BUG_ON(rq->cpu != task_cpu(p)); 1986 BUG_ON(task_current(rq, p)); 1987 BUG_ON(p->nr_cpus_allowed <= 1); 1988 1989 BUG_ON(!task_on_rq_queued(p)); 1990 BUG_ON(!rt_task(p)); 1991 1992 return p; 1993 } 1994 1995 /* 1996 * If the current CPU has more than one RT task, see if the non 1997 * running task can migrate over to a CPU that is running a task 1998 * of lesser priority. 1999 */ 2000 static int push_rt_task(struct rq *rq, bool pull) 2001 { 2002 struct task_struct *next_task; 2003 struct rq *lowest_rq; 2004 int ret = 0; 2005 2006 if (!rq->rt.overloaded) 2007 return 0; 2008 2009 next_task = pick_next_pushable_task(rq); 2010 if (!next_task) 2011 return 0; 2012 2013 retry: 2014 if (is_migration_disabled(next_task)) { 2015 struct task_struct *push_task = NULL; 2016 int cpu; 2017 2018 if (!pull || rq->push_busy) 2019 return 0; 2020 2021 cpu = find_lowest_rq(rq->curr); 2022 if (cpu == -1 || cpu == rq->cpu) 2023 return 0; 2024 2025 /* 2026 * Given we found a CPU with lower priority than @next_task, 2027 * therefore it should be running. However we cannot migrate it 2028 * to this other CPU, instead attempt to push the current 2029 * running task on this CPU away. 2030 */ 2031 push_task = get_push_task(rq); 2032 if (push_task) { 2033 raw_spin_rq_unlock(rq); 2034 stop_one_cpu_nowait(rq->cpu, push_cpu_stop, 2035 push_task, &rq->push_work); 2036 raw_spin_rq_lock(rq); 2037 } 2038 2039 return 0; 2040 } 2041 2042 if (WARN_ON(next_task == rq->curr)) 2043 return 0; 2044 2045 /* 2046 * It's possible that the next_task slipped in of 2047 * higher priority than current. If that's the case 2048 * just reschedule current. 2049 */ 2050 if (unlikely(next_task->prio < rq->curr->prio)) { 2051 resched_curr(rq); 2052 return 0; 2053 } 2054 2055 /* We might release rq lock */ 2056 get_task_struct(next_task); 2057 2058 /* find_lock_lowest_rq locks the rq if found */ 2059 lowest_rq = find_lock_lowest_rq(next_task, rq); 2060 if (!lowest_rq) { 2061 struct task_struct *task; 2062 /* 2063 * find_lock_lowest_rq releases rq->lock 2064 * so it is possible that next_task has migrated. 2065 * 2066 * We need to make sure that the task is still on the same 2067 * run-queue and is also still the next task eligible for 2068 * pushing. 2069 */ 2070 task = pick_next_pushable_task(rq); 2071 if (task == next_task) { 2072 /* 2073 * The task hasn't migrated, and is still the next 2074 * eligible task, but we failed to find a run-queue 2075 * to push it to. Do not retry in this case, since 2076 * other CPUs will pull from us when ready. 2077 */ 2078 goto out; 2079 } 2080 2081 if (!task) 2082 /* No more tasks, just exit */ 2083 goto out; 2084 2085 /* 2086 * Something has shifted, try again. 
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);
	resched_curr(lowest_rq);
	ret = 1;

	double_unlock_balance(rq, lowest_rq);
out:
	put_task_struct(next_task);

	return ret;
}

static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task will return true if it moved an RT */
	while (push_rt_task(rq, false))
		;
}

#ifdef HAVE_RT_PUSH_IPI

/*
 * When a high priority task schedules out from a CPU and a lower priority
 * task is scheduled in, a check is made to see if there are any RT tasks
 * on other CPUs that are waiting to run because a higher priority RT task
 * is currently running on its CPU. In this case, the CPU with multiple RT
 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
 * up that may be able to run one of its non-running queued RT tasks.
 *
 * All CPUs with overloaded RT tasks need to be notified as there is currently
 * no way to know which of these CPUs have the highest priority task waiting
 * to run. Instead of trying to take a spinlock on each of these CPUs,
 * which has been shown to cause large latencies on machines with many
 * CPUs, an IPI is sent to the CPUs to have them push off their overloaded
 * RT tasks that are waiting to run.
 *
 * Just sending an IPI to each of the CPUs is also an issue, as on large
 * count CPU machines, this can cause an IPI storm on a CPU, especially
 * if it's the only CPU with multiple RT tasks queued, and a large number
 * of CPUs scheduling a lower priority task at the same time.
 *
 * Each root domain has its own irq work function that can iterate over
 * all CPUs with RT overloaded tasks. Since every CPU with overloaded RT
 * tasks must be checked whenever one or many CPUs are lowering their
 * priority, there's a single irq work iterator that will try to push off
 * the RT tasks that are waiting to run.
 *
 * When a CPU schedules a lower priority task, it will kick off the
 * irq work iterator that will jump to each CPU with overloaded RT tasks.
 * As it only takes the first CPU that schedules a lower priority task
 * to start the process, the rto_start variable is incremented and if
 * the atomic result is one, then that CPU will try to take the rto_lock.
 * This prevents high contention on the lock as the process handles all
 * CPUs scheduling lower priority tasks.
 *
 * All CPUs that are scheduling a lower priority task will increment the
 * rto_loop_next variable. This will make sure that the irq work iterator
 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
 * priority task, even if the iterator is in the middle of a scan.
 * Incrementing rto_loop_next will cause the iterator to perform another
 * scan.
 */
static int rto_next_cpu(struct root_domain *rd)
{
	int next;
	int cpu;

	/*
	 * When starting the IPI RT pushing, the rto_cpu is set to -1,
	 * rto_next_cpu() will simply return the first CPU found in
	 * the rto_mask.
	 *
	 * If rto_next_cpu() is called while rto_cpu is a valid CPU, it
	 * will return the next CPU found in the rto_mask.
static void tell_cpu_to_push(struct rq *rq)
{
	int cpu = -1;

	/* Keep the loop going if the IPI is currently active */
	atomic_inc(&rq->rd->rto_loop_next);

	/* Only one CPU can initiate a loop at a time */
	if (!rto_start_trylock(&rq->rd->rto_loop_start))
		return;

	raw_spin_lock(&rq->rd->rto_lock);

	/*
	 * The rto_cpu is updated under the lock. If it holds a valid CPU,
	 * then the IPI is still running and will continue due to the
	 * update to rto_loop_next, and nothing needs to be done here.
	 * Otherwise it is finishing up and an IPI needs to be sent.
	 */
	if (rq->rd->rto_cpu < 0)
		cpu = rto_next_cpu(rq->rd);

	raw_spin_unlock(&rq->rd->rto_lock);

	rto_start_unlock(&rq->rd->rto_loop_start);

	if (cpu >= 0) {
		/* Make sure the rd does not get freed while pushing */
		sched_get_rd(rq->rd);
		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
	}
}

/* Called from hardirq context */
void rto_push_irq_work_func(struct irq_work *work)
{
	struct root_domain *rd =
		container_of(work, struct root_domain, rto_push_work);
	struct rq *rq;
	int cpu;

	rq = this_rq();

	/*
	 * We do not need to grab the lock to check for has_pushable_tasks.
	 * When it gets updated, a check is made if a push is possible.
	 */
	if (has_pushable_tasks(rq)) {
		raw_spin_rq_lock(rq);
		while (push_rt_task(rq, true))
			;
		raw_spin_rq_unlock(rq);
	}

	raw_spin_lock(&rd->rto_lock);

	/* Pass the IPI to the next RT overloaded queue */
	cpu = rto_next_cpu(rd);

	raw_spin_unlock(&rd->rto_lock);

	if (cpu < 0) {
		sched_put_rd(rd);
		return;
	}

	/* Try the next RT overloaded CPU */
	irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */

static void pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, cpu;
	bool resched = false;
	struct task_struct *p, *push_task;
	struct rq *src_rq;
	int rt_overload_count = rt_overloaded(this_rq);

	if (likely(!rt_overload_count))
		return;

	/*
	 * Match the barrier from rt_set_overload(); this guarantees that if
	 * we see overloaded we must also see the rto_mask bit.
	 */
	smp_rmb();

	/* If we are the only overloaded CPU do nothing */
	if (rt_overload_count == 1 &&
	    cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
		return;

#ifdef HAVE_RT_PUSH_IPI
	if (sched_feat(RT_PUSH_IPI)) {
		tell_cpu_to_push(this_rq);
		return;
	}
#endif

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care.
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq
		 */
		push_task = NULL;
		double_lock_balance(this_rq, src_rq);

		/*
		 * We can only pull a task that is pushable on its own
		 * runqueue, and no others.
		 */
		p = pick_highest_pushable_task(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!task_on_rq_queued(p));

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its CPU.
			 * This is likely because p is waking up and hasn't
			 * had a chance to schedule yet. We only pull
			 * p if it is lower in priority than the
			 * current task on its run queue.
			 */
			if (p->prio < src_rq->curr->prio)
				goto skip;

			if (is_migration_disabled(p)) {
				push_task = get_push_task(src_rq);
			} else {
				deactivate_task(src_rq, p, 0);
				set_task_cpu(p, this_cpu);
				activate_task(this_rq, p, 0);
				resched = true;
			}
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
skip:
		double_unlock_balance(this_rq, src_rq);

		if (push_task) {
			raw_spin_rq_unlock(this_rq);
			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
					    push_task, &src_rq->push_work);
			raw_spin_rq_lock(this_rq);
		}
	}

	if (resched)
		resched_curr(this_rq);
}
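/*
 * Worked example of the priority convention used in the comparisons above
 * (a sketch for illustration; the actual conversion lives in the core
 * scheduler, not in this file): kernel-internal ->prio values are
 * inverted, so a numerically *lower* prio means a *higher* priority. For
 * an RT task, prio = MAX_RT_PRIO - 1 - rt_priority, i.e. userspace rtprio
 * 1 maps to kernel prio 98 and rtprio 99 maps to kernel prio 0. Hence
 * "p->prio < rq->curr->prio" reads as "p has higher priority than curr".
 */
#if 0	/* example only */
static int example_kernel_prio(int rt_priority)	/* rtprio 1..99 */
{
	return MAX_RT_PRIO - 1 - rt_priority;	/* 99 -> 0, 1 -> 98 */
}
#endif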
/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now.
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	bool need_to_push = !task_running(rq, p) &&
			    !test_tsk_need_resched(rq->curr) &&
			    p->nr_cpus_allowed > 1 &&
			    (dl_task(rq->curr) || rt_task(rq->curr)) &&
			    (rq->curr->nr_cpus_allowed < 2 ||
			     rq->curr->prio <= p->prio);

	if (need_to_push)
		push_rt_tasks(rq);
}

/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}

/*
 * When switching away from the RT queue, we bring ourselves to a position
 * where we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If there are other RT tasks then we will reschedule
	 * and the scheduling of the other RT tasks will handle
	 * the balancing. But if we are the last RT task
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
		return;

	rt_queue_pull_task(rq);
}

void __init init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	}
}
#endif /* CONFIG_SMP */

/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If we are running, update the avg_rt tracking, as the running
	 * time will from now on be accounted to it.
	 */
	if (task_current(rq, p)) {
		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
		return;
	}

	/*
	 * If we are not running we may need to preempt the current
	 * running task. If that current running task is also an RT task
	 * then see if we can move to another run queue.
	 */
	if (task_on_rq_queued(p)) {
#ifdef CONFIG_SMP
		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
			rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
		if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
			resched_curr(rq);
	}
}
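/*
 * Illustrative userspace sketch (not part of this file): prio_changed_rt()
 * below is reached when an already-queued RT task changes priority, for
 * example via sched_setparam() as shown here, or through a priority-
 * inheritance boost. The priority values are arbitrary example numbers.
 */
#if 0	/* example only, build as a standalone userspace program */
#include <sched.h>
#include <sys/types.h>

static int bump_rt_priority(pid_t pid)
{
	struct sched_param sp = { .sched_priority = 20 };	/* was e.g. 10 */

	/* The policy stays SCHED_FIFO/SCHED_RR; only the priority changes. */
	return sched_setparam(pid, &sp);
}
#endif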
/*
 * The priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!task_on_rq_queued(p))
		return;

	if (task_current(rq, p)) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			rt_queue_pull_task(rq);

		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule.
		 */
		if (p->prio > rq->rt.highest_prio.curr)
			resched_curr(rq);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_curr(rq);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if its priority is
		 * higher than that of the currently running task,
		 * then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}

#ifdef CONFIG_POSIX_TIMERS
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		if (p->rt.watchdog_stamp != jiffies) {
			p->rt.timeout++;
			p->rt.watchdog_stamp = jiffies;
		}

		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next) {
			posix_cputimers_rt_watchdog(&p->posix_cputimers,
						    p->se.sum_exec_runtime);
		}
	}
}
#else
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif
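/*
 * Illustrative userspace sketch (not part of this file): the watchdog()
 * above enforces RLIMIT_RTTIME, the amount of CPU time (in microseconds)
 * a SCHED_FIFO/SCHED_RR task may consume without sleeping or blocking.
 * The limit values below are arbitrary example numbers.
 */
#if 0	/* example only, build as a standalone userspace program */
#include <sys/resource.h>

static int cap_rt_cpu_time(void)
{
	struct rlimit rl = {
		.rlim_cur = 500000,	/* soft: 500 ms -> SIGXCPU */
		.rlim_max = 1000000,	/* hard: 1 s   -> SIGKILL  */
	};

	return setrlimit(RLIMIT_RTTIME, &rl);
}
#endif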
/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = sched_rr_timeslice;

	/*
	 * Requeue to the end of queue if we (and all of our ancestors) are not
	 * the only element on the queue
	 */
	for_each_sched_rt_entity(rt_se) {
		if (rt_se->run_list.prev != rt_se->run_list.next) {
			requeue_task_rt(rq, p, 0);
			resched_curr(rq);
			return;
		}
	}
}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return sched_rr_timeslice;
	else
		return 0;
}

DEFINE_SCHED_CLASS(rt) = {

	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,
	.set_next_task		= set_next_task_rt,

#ifdef CONFIG_SMP
	.balance		= balance_rt,
	.pick_task		= pick_task_rt,
	.select_task_rq		= select_task_rq_rt,
	.set_cpus_allowed	= set_cpus_allowed_common,
	.rq_online		= rq_online_rt,
	.rq_offline		= rq_offline_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
	.find_lock_rq		= find_lock_lowest_rq,
#endif

	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,

	.update_curr		= update_curr_rt,

#ifdef CONFIG_UCLAMP_TASK
	.uclamp_enabled		= 1,
#endif
};

#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real time constraints are schedulable.
 */
static DEFINE_MUTEX(rt_constraints_mutex);

static inline int tg_has_rt_tasks(struct task_group *tg)
{
	struct task_struct *task;
	struct css_task_iter it;
	int ret = 0;

	/*
	 * Autogroups do not have RT tasks; see autogroup_create().
	 */
	if (task_group_is_autogroup(tg))
		return 0;

	css_task_iter_start(&tg->css, 0, &it);
	while (!ret && (task = css_task_iter_next(&it)))
		ret |= rt_task(task);
	css_task_iter_end(&it);

	return ret;
}

struct rt_schedulable_data {
	struct task_group *tg;
	u64 rt_period;
	u64 rt_runtime;
};

static int tg_rt_schedulable(struct task_group *tg, void *data)
{
	struct rt_schedulable_data *d = data;
	struct task_group *child;
	unsigned long total, sum = 0;
	u64 period, runtime;

	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
	runtime = tg->rt_bandwidth.rt_runtime;

	if (tg == d->tg) {
		period = d->rt_period;
		runtime = d->rt_runtime;
	}

	/*
	 * Cannot have more runtime than the period.
	 */
	if (runtime > period && runtime != RUNTIME_INF)
		return -EINVAL;

	/*
	 * Ensure we don't starve existing RT tasks if runtime turns zero.
	 */
	if (rt_bandwidth_enabled() && !runtime &&
	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
		return -EBUSY;

	total = to_ratio(period, runtime);

	/*
	 * Nobody can have more than the global setting allows.
	 */
	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
		return -EINVAL;

	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
	list_for_each_entry_rcu(child, &tg->children, siblings) {
		period = ktime_to_ns(child->rt_bandwidth.rt_period);
		runtime = child->rt_bandwidth.rt_runtime;

		if (child == d->tg) {
			period = d->rt_period;
			runtime = d->rt_runtime;
		}

		sum += to_ratio(period, runtime);
	}

	if (sum > total)
		return -EINVAL;

	return 0;
}
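/*
 * Worked example for the checks above (assuming the default global values
 * of a 1,000,000 us period and 950,000 us runtime, i.e. a utilization cap
 * of 0.95; to_ratio() expresses runtime/period as a fixed-point fraction,
 * so the exact scale factor does not matter for the comparison):
 *
 *   group A:  rt_period = 1,000,000 us, rt_runtime = 600,000 us -> 0.60
 *   child A1: rt_period =   500,000 us, rt_runtime = 200,000 us -> 0.40
 *   child A2: rt_period = 1,000,000 us, rt_runtime = 300,000 us -> 0.30
 *
 * A itself passes the global check (0.60 <= 0.95), but its children sum
 * to 0.70 > 0.60, so writing these values would fail with -EINVAL until
 * either A's runtime is raised or a child's runtime is lowered.
 */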
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
	int ret;

	struct rt_schedulable_data data = {
		.tg = tg,
		.rt_period = period,
		.rt_runtime = runtime,
	};

	rcu_read_lock();
	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}

static int tg_set_rt_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{
	int i, err = 0;

	/*
	 * Disallowing the root group RT runtime is BAD; it would prevent the
	 * kernel from creating (and/or operating) RT threads.
	 */
	if (tg == &root_task_group && rt_runtime == 0)
		return -EINVAL;

	/* No period doesn't make any sense. */
	if (rt_period == 0)
		return -EINVAL;

	/*
	 * Bound the runtime to defend against overflow during the
	 * bandwidth shift.
	 */
	if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
		return -EINVAL;

	mutex_lock(&rt_constraints_mutex);
	err = __rt_schedulable(tg, rt_period, rt_runtime);
	if (err)
		goto unlock;

	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
	tg->rt_bandwidth.rt_runtime = rt_runtime;

	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = tg->rt_rq[i];

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_runtime;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
	}
	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
	mutex_unlock(&rt_constraints_mutex);

	return err;
}

int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
	u64 rt_runtime, rt_period;

	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
	if (rt_runtime_us < 0)
		rt_runtime = RUNTIME_INF;
	else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}

long sched_group_rt_runtime(struct task_group *tg)
{
	u64 rt_runtime_us;

	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
		return -1;

	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
	do_div(rt_runtime_us, NSEC_PER_USEC);
	return rt_runtime_us;
}

int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
	u64 rt_runtime, rt_period;

	if (rt_period_us > U64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	rt_period = rt_period_us * NSEC_PER_USEC;
	rt_runtime = tg->rt_bandwidth.rt_runtime;

	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}

long sched_group_rt_period(struct task_group *tg)
{
	u64 rt_period_us;

	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
	do_div(rt_period_us, NSEC_PER_USEC);
	return rt_period_us;
}
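/*
 * Illustrative userspace sketch (not part of this file): with
 * CONFIG_RT_GROUP_SCHED, sched_group_set_rt_runtime() and
 * sched_group_set_rt_period() above back the cgroup (v1 cpu controller)
 * files cpu.rt_runtime_us and cpu.rt_period_us. The mount point and group
 * name below are example assumptions for a typical v1 hierarchy.
 */
#if 0	/* example only, build as a standalone userspace program */
#include <stdio.h>

static int set_group_rt_budget(void)
{
	/* Allow the "rtgroup" cgroup 300 ms of RT time per 1 s period. */
	FILE *f;

	f = fopen("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_period_us", "w");
	if (!f)
		return -1;
	fprintf(f, "1000000\n");
	fclose(f);

	f = fopen("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_runtime_us", "w");
	if (!f)
		return -1;
	fprintf(f, "300000\n");
	fclose(f);

	return 0;
}
#endif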
static int sched_rt_global_constraints(void)
{
	int ret = 0;

	mutex_lock(&rt_constraints_mutex);
	ret = __rt_schedulable(NULL, 0, 0);
	mutex_unlock(&rt_constraints_mutex);

	return ret;
}

int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
	/* Don't accept realtime tasks when there is no way for them to run */
	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
		return 0;

	return 1;
}

#else /* !CONFIG_RT_GROUP_SCHED */
static int sched_rt_global_constraints(void)
{
	unsigned long flags;
	int i;

	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = &cpu_rq(i)->rt;

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = global_rt_runtime();
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
	}
	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

	return 0;
}
#endif /* CONFIG_RT_GROUP_SCHED */

static int sched_rt_global_validate(void)
{
	if (sysctl_sched_rt_period <= 0)
		return -EINVAL;

	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
	    ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
	     ((u64)sysctl_sched_rt_runtime *
			NSEC_PER_USEC > max_rt_runtime)))
		return -EINVAL;

	return 0;
}

static void sched_rt_do_global(void)
{
	def_rt_bandwidth.rt_runtime = global_rt_runtime();
	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
}

int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int old_period, old_runtime;
	static DEFINE_MUTEX(mutex);
	int ret;

	mutex_lock(&mutex);
	old_period = sysctl_sched_rt_period;
	old_runtime = sysctl_sched_rt_runtime;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (!ret && write) {
		ret = sched_rt_global_validate();
		if (ret)
			goto undo;

		ret = sched_dl_global_validate();
		if (ret)
			goto undo;

		ret = sched_rt_global_constraints();
		if (ret)
			goto undo;

		sched_rt_do_global();
		sched_dl_do_global();
	}
	if (0) {
undo:
		sysctl_sched_rt_period = old_period;
		sysctl_sched_rt_runtime = old_runtime;
	}
	mutex_unlock(&mutex);

	return ret;
}

int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int ret;
	static DEFINE_MUTEX(mutex);

	mutex_lock(&mutex);
	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	/*
	 * Make sure that internally we keep jiffies.
	 * Also, writing zero resets the timeslice to default:
	 */
	if (!ret && write) {
		sched_rr_timeslice =
			sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
			msecs_to_jiffies(sysctl_sched_rr_timeslice);
	}
	mutex_unlock(&mutex);

	return ret;
}

#ifdef CONFIG_SCHED_DEBUG
void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */
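/*
 * Illustrative userspace sketch (not part of this file): sched_rt_handler()
 * and sched_rr_handler() above service the procfs sysctls used below. The
 * values are arbitrary examples; writing -1 to sched_rt_runtime_us lifts
 * the RT throttling cap, and writing 0 to sched_rr_timeslice_ms restores
 * the default RR timeslice.
 */
#if 0	/* example only, build as a standalone userspace program */
#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

static void tune_rt_sysctls(void)
{
	/* 950 ms of RT time per 1 s period (the defaults). */
	write_sysctl("/proc/sys/kernel/sched_rt_period_us", "1000000");
	write_sysctl("/proc/sys/kernel/sched_rt_runtime_us", "950000");

	/* 50 ms SCHED_RR timeslice. */
	write_sysctl("/proc/sys/kernel/sched_rr_timeslice_ms", "50");
}
#endif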