1 /* 2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 3 * policies) 4 */ 5 6 #include "sched.h" 7 8 #include <linux/slab.h> 9 #include <linux/irq_work.h> 10 11 int sched_rr_timeslice = RR_TIMESLICE; 12 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; 13 14 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 15 16 struct rt_bandwidth def_rt_bandwidth; 17 18 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 19 { 20 struct rt_bandwidth *rt_b = 21 container_of(timer, struct rt_bandwidth, rt_period_timer); 22 int idle = 0; 23 int overrun; 24 25 raw_spin_lock(&rt_b->rt_runtime_lock); 26 for (;;) { 27 overrun = hrtimer_forward_now(timer, rt_b->rt_period); 28 if (!overrun) 29 break; 30 31 raw_spin_unlock(&rt_b->rt_runtime_lock); 32 idle = do_sched_rt_period_timer(rt_b, overrun); 33 raw_spin_lock(&rt_b->rt_runtime_lock); 34 } 35 if (idle) 36 rt_b->rt_period_active = 0; 37 raw_spin_unlock(&rt_b->rt_runtime_lock); 38 39 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 40 } 41 42 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 43 { 44 rt_b->rt_period = ns_to_ktime(period); 45 rt_b->rt_runtime = runtime; 46 47 raw_spin_lock_init(&rt_b->rt_runtime_lock); 48 49 hrtimer_init(&rt_b->rt_period_timer, 50 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 51 rt_b->rt_period_timer.function = sched_rt_period_timer; 52 } 53 54 static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 55 { 56 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 57 return; 58 59 raw_spin_lock(&rt_b->rt_runtime_lock); 60 if (!rt_b->rt_period_active) { 61 rt_b->rt_period_active = 1; 62 /* 63 * SCHED_DEADLINE updates the bandwidth, as a run away 64 * RT task with a DL task could hog a CPU. But DL does 65 * not reset the period. If a deadline task was running 66 * without an RT task running, it can cause RT tasks to 67 * throttle when they start up. Kick the timer right away 68 * to update the period. 69 */ 70 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); 71 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); 72 } 73 raw_spin_unlock(&rt_b->rt_runtime_lock); 74 } 75 76 #if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) 77 static void push_irq_work_func(struct irq_work *work); 78 #endif 79 80 void init_rt_rq(struct rt_rq *rt_rq) 81 { 82 struct rt_prio_array *array; 83 int i; 84 85 array = &rt_rq->active; 86 for (i = 0; i < MAX_RT_PRIO; i++) { 87 INIT_LIST_HEAD(array->queue + i); 88 __clear_bit(i, array->bitmap); 89 } 90 /* delimiter for bitsearch: */ 91 __set_bit(MAX_RT_PRIO, array->bitmap); 92 93 #if defined CONFIG_SMP 94 rt_rq->highest_prio.curr = MAX_RT_PRIO; 95 rt_rq->highest_prio.next = MAX_RT_PRIO; 96 rt_rq->rt_nr_migratory = 0; 97 rt_rq->overloaded = 0; 98 plist_head_init(&rt_rq->pushable_tasks); 99 100 #ifdef HAVE_RT_PUSH_IPI 101 rt_rq->push_flags = 0; 102 rt_rq->push_cpu = nr_cpu_ids; 103 raw_spin_lock_init(&rt_rq->push_lock); 104 init_irq_work(&rt_rq->push_work, push_irq_work_func); 105 #endif 106 #endif /* CONFIG_SMP */ 107 /* We start is dequeued state, because no RT tasks are queued */ 108 rt_rq->rt_queued = 0; 109 110 rt_rq->rt_time = 0; 111 rt_rq->rt_throttled = 0; 112 rt_rq->rt_runtime = 0; 113 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 114 } 115 116 #ifdef CONFIG_RT_GROUP_SCHED 117 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 118 { 119 hrtimer_cancel(&rt_b->rt_period_timer); 120 } 121 122 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 123 124 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 125 { 126 #ifdef CONFIG_SCHED_DEBUG 127 WARN_ON_ONCE(!rt_entity_is_task(rt_se)); 128 #endif 129 return container_of(rt_se, struct task_struct, rt); 130 } 131 132 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 133 { 134 return rt_rq->rq; 135 } 136 137 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 138 { 139 return rt_se->rt_rq; 140 } 141 142 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) 143 { 144 struct rt_rq *rt_rq = rt_se->rt_rq; 145 146 return rt_rq->rq; 147 } 148 149 void free_rt_sched_group(struct task_group *tg) 150 { 151 int i; 152 153 if (tg->rt_se) 154 destroy_rt_bandwidth(&tg->rt_bandwidth); 155 156 for_each_possible_cpu(i) { 157 if (tg->rt_rq) 158 kfree(tg->rt_rq[i]); 159 if (tg->rt_se) 160 kfree(tg->rt_se[i]); 161 } 162 163 kfree(tg->rt_rq); 164 kfree(tg->rt_se); 165 } 166 167 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 168 struct sched_rt_entity *rt_se, int cpu, 169 struct sched_rt_entity *parent) 170 { 171 struct rq *rq = cpu_rq(cpu); 172 173 rt_rq->highest_prio.curr = MAX_RT_PRIO; 174 rt_rq->rt_nr_boosted = 0; 175 rt_rq->rq = rq; 176 rt_rq->tg = tg; 177 178 tg->rt_rq[cpu] = rt_rq; 179 tg->rt_se[cpu] = rt_se; 180 181 if (!rt_se) 182 return; 183 184 if (!parent) 185 rt_se->rt_rq = &rq->rt; 186 else 187 rt_se->rt_rq = parent->my_q; 188 189 rt_se->my_q = rt_rq; 190 rt_se->parent = parent; 191 INIT_LIST_HEAD(&rt_se->run_list); 192 } 193 194 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 195 { 196 struct rt_rq *rt_rq; 197 struct sched_rt_entity *rt_se; 198 int i; 199 200 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 201 if (!tg->rt_rq) 202 goto err; 203 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 204 if (!tg->rt_se) 205 goto err; 206 207 init_rt_bandwidth(&tg->rt_bandwidth, 208 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 209 210 for_each_possible_cpu(i) { 211 rt_rq = kzalloc_node(sizeof(struct rt_rq), 212 GFP_KERNEL, cpu_to_node(i)); 213 if (!rt_rq) 214 goto err; 215 216 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 217 GFP_KERNEL, cpu_to_node(i)); 218 if (!rt_se) 219 goto err_free_rq; 220 221 init_rt_rq(rt_rq); 222 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 223 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 224 } 225 226 return 1; 227 228 err_free_rq: 229 kfree(rt_rq); 230 err: 231 return 0; 232 } 233 234 #else /* CONFIG_RT_GROUP_SCHED */ 235 236 #define rt_entity_is_task(rt_se) (1) 237 238 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 239 { 240 return container_of(rt_se, struct task_struct, rt); 241 } 242 243 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 244 { 245 return container_of(rt_rq, struct rq, rt); 246 } 247 248 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) 249 { 250 struct task_struct *p = rt_task_of(rt_se); 251 252 return task_rq(p); 253 } 254 255 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 256 { 257 struct rq *rq = rq_of_rt_se(rt_se); 258 259 return &rq->rt; 260 } 261 262 void free_rt_sched_group(struct task_group *tg) { } 263 264 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 265 { 266 return 1; 267 } 268 #endif /* CONFIG_RT_GROUP_SCHED */ 269 270 #ifdef CONFIG_SMP 271 272 static void pull_rt_task(struct rq *this_rq); 273 274 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) 275 { 276 /* Try to pull RT tasks here if we lower this rq's prio */ 277 return rq->rt.highest_prio.curr > prev->prio; 278 } 279 280 static inline int rt_overloaded(struct rq *rq) 281 { 282 return atomic_read(&rq->rd->rto_count); 283 } 284 285 static inline void rt_set_overload(struct rq *rq) 286 { 287 if (!rq->online) 288 return; 289 290 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); 291 /* 292 * Make sure the mask is visible before we set 293 * the overload count. That is checked to determine 294 * if we should look at the mask. It would be a shame 295 * if we looked at the mask, but the mask was not 296 * updated yet. 297 * 298 * Matched by the barrier in pull_rt_task(). 299 */ 300 smp_wmb(); 301 atomic_inc(&rq->rd->rto_count); 302 } 303 304 static inline void rt_clear_overload(struct rq *rq) 305 { 306 if (!rq->online) 307 return; 308 309 /* the order here really doesn't matter */ 310 atomic_dec(&rq->rd->rto_count); 311 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); 312 } 313 314 static void update_rt_migration(struct rt_rq *rt_rq) 315 { 316 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { 317 if (!rt_rq->overloaded) { 318 rt_set_overload(rq_of_rt_rq(rt_rq)); 319 rt_rq->overloaded = 1; 320 } 321 } else if (rt_rq->overloaded) { 322 rt_clear_overload(rq_of_rt_rq(rt_rq)); 323 rt_rq->overloaded = 0; 324 } 325 } 326 327 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 328 { 329 struct task_struct *p; 330 331 if (!rt_entity_is_task(rt_se)) 332 return; 333 334 p = rt_task_of(rt_se); 335 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 336 337 rt_rq->rt_nr_total++; 338 if (p->nr_cpus_allowed > 1) 339 rt_rq->rt_nr_migratory++; 340 341 update_rt_migration(rt_rq); 342 } 343 344 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 345 { 346 struct task_struct *p; 347 348 if (!rt_entity_is_task(rt_se)) 349 return; 350 351 p = rt_task_of(rt_se); 352 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 353 354 rt_rq->rt_nr_total--; 355 if (p->nr_cpus_allowed > 1) 356 rt_rq->rt_nr_migratory--; 357 358 update_rt_migration(rt_rq); 359 } 360 361 static inline int has_pushable_tasks(struct rq *rq) 362 { 363 return !plist_head_empty(&rq->rt.pushable_tasks); 364 } 365 366 static DEFINE_PER_CPU(struct callback_head, rt_push_head); 367 static DEFINE_PER_CPU(struct callback_head, rt_pull_head); 368 369 static void push_rt_tasks(struct rq *); 370 static void pull_rt_task(struct rq *); 371 372 static inline void queue_push_tasks(struct rq *rq) 373 { 374 if (!has_pushable_tasks(rq)) 375 return; 376 377 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); 378 } 379 380 static inline void queue_pull_task(struct rq *rq) 381 { 382 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); 383 } 384 385 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 386 { 387 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 388 plist_node_init(&p->pushable_tasks, p->prio); 389 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 390 391 /* Update the highest prio pushable task */ 392 if (p->prio < rq->rt.highest_prio.next) 393 rq->rt.highest_prio.next = p->prio; 394 } 395 396 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 397 { 398 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 399 400 /* Update the new highest prio pushable task */ 401 if (has_pushable_tasks(rq)) { 402 p = plist_first_entry(&rq->rt.pushable_tasks, 403 struct task_struct, pushable_tasks); 404 rq->rt.highest_prio.next = p->prio; 405 } else 406 rq->rt.highest_prio.next = MAX_RT_PRIO; 407 } 408 409 #else 410 411 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 412 { 413 } 414 415 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 416 { 417 } 418 419 static inline 420 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 421 { 422 } 423 424 static inline 425 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 426 { 427 } 428 429 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) 430 { 431 return false; 432 } 433 434 static inline void pull_rt_task(struct rq *this_rq) 435 { 436 } 437 438 static inline void queue_push_tasks(struct rq *rq) 439 { 440 } 441 #endif /* CONFIG_SMP */ 442 443 static void enqueue_top_rt_rq(struct rt_rq *rt_rq); 444 static void dequeue_top_rt_rq(struct rt_rq *rt_rq); 445 446 static inline int on_rt_rq(struct sched_rt_entity *rt_se) 447 { 448 return rt_se->on_rq; 449 } 450 451 #ifdef CONFIG_RT_GROUP_SCHED 452 453 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 454 { 455 if (!rt_rq->tg) 456 return RUNTIME_INF; 457 458 return rt_rq->rt_runtime; 459 } 460 461 static inline u64 sched_rt_period(struct rt_rq *rt_rq) 462 { 463 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 464 } 465 466 typedef struct task_group *rt_rq_iter_t; 467 468 static inline struct task_group *next_task_group(struct task_group *tg) 469 { 470 do { 471 tg = list_entry_rcu(tg->list.next, 472 typeof(struct task_group), list); 473 } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); 474 475 if (&tg->list == &task_groups) 476 tg = NULL; 477 478 return tg; 479 } 480 481 #define for_each_rt_rq(rt_rq, iter, rq) \ 482 for (iter = container_of(&task_groups, typeof(*iter), list); \ 483 (iter = next_task_group(iter)) && \ 484 (rt_rq = iter->rt_rq[cpu_of(rq)]);) 485 486 #define for_each_sched_rt_entity(rt_se) \ 487 for (; rt_se; rt_se = rt_se->parent) 488 489 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 490 { 491 return rt_se->my_q; 492 } 493 494 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); 495 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); 496 497 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 498 { 499 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 500 struct rq *rq = rq_of_rt_rq(rt_rq); 501 struct sched_rt_entity *rt_se; 502 503 int cpu = cpu_of(rq); 504 505 rt_se = rt_rq->tg->rt_se[cpu]; 506 507 if (rt_rq->rt_nr_running) { 508 if (!rt_se) 509 enqueue_top_rt_rq(rt_rq); 510 else if (!on_rt_rq(rt_se)) 511 enqueue_rt_entity(rt_se, 0); 512 513 if (rt_rq->highest_prio.curr < curr->prio) 514 resched_curr(rq); 515 } 516 } 517 518 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 519 { 520 struct sched_rt_entity *rt_se; 521 int cpu = cpu_of(rq_of_rt_rq(rt_rq)); 522 523 rt_se = rt_rq->tg->rt_se[cpu]; 524 525 if (!rt_se) 526 dequeue_top_rt_rq(rt_rq); 527 else if (on_rt_rq(rt_se)) 528 dequeue_rt_entity(rt_se, 0); 529 } 530 531 static inline int rt_rq_throttled(struct rt_rq *rt_rq) 532 { 533 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; 534 } 535 536 static int rt_se_boosted(struct sched_rt_entity *rt_se) 537 { 538 struct rt_rq *rt_rq = group_rt_rq(rt_se); 539 struct task_struct *p; 540 541 if (rt_rq) 542 return !!rt_rq->rt_nr_boosted; 543 544 p = rt_task_of(rt_se); 545 return p->prio != p->normal_prio; 546 } 547 548 #ifdef CONFIG_SMP 549 static inline const struct cpumask *sched_rt_period_mask(void) 550 { 551 return this_rq()->rd->span; 552 } 553 #else 554 static inline const struct cpumask *sched_rt_period_mask(void) 555 { 556 return cpu_online_mask; 557 } 558 #endif 559 560 static inline 561 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 562 { 563 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; 564 } 565 566 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) 567 { 568 return &rt_rq->tg->rt_bandwidth; 569 } 570 571 #else /* !CONFIG_RT_GROUP_SCHED */ 572 573 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 574 { 575 return rt_rq->rt_runtime; 576 } 577 578 static inline u64 sched_rt_period(struct rt_rq *rt_rq) 579 { 580 return ktime_to_ns(def_rt_bandwidth.rt_period); 581 } 582 583 typedef struct rt_rq *rt_rq_iter_t; 584 585 #define for_each_rt_rq(rt_rq, iter, rq) \ 586 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 587 588 #define for_each_sched_rt_entity(rt_se) \ 589 for (; rt_se; rt_se = NULL) 590 591 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 592 { 593 return NULL; 594 } 595 596 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 597 { 598 struct rq *rq = rq_of_rt_rq(rt_rq); 599 600 if (!rt_rq->rt_nr_running) 601 return; 602 603 enqueue_top_rt_rq(rt_rq); 604 resched_curr(rq); 605 } 606 607 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 608 { 609 dequeue_top_rt_rq(rt_rq); 610 } 611 612 static inline int rt_rq_throttled(struct rt_rq *rt_rq) 613 { 614 return rt_rq->rt_throttled; 615 } 616 617 static inline const struct cpumask *sched_rt_period_mask(void) 618 { 619 return cpu_online_mask; 620 } 621 622 static inline 623 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 624 { 625 return &cpu_rq(cpu)->rt; 626 } 627 628 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) 629 { 630 return &def_rt_bandwidth; 631 } 632 633 #endif /* CONFIG_RT_GROUP_SCHED */ 634 635 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) 636 { 637 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 638 639 return (hrtimer_active(&rt_b->rt_period_timer) || 640 rt_rq->rt_time < rt_b->rt_runtime); 641 } 642 643 #ifdef CONFIG_SMP 644 /* 645 * We ran out of runtime, see if we can borrow some from our neighbours. 646 */ 647 static void do_balance_runtime(struct rt_rq *rt_rq) 648 { 649 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 650 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; 651 int i, weight; 652 u64 rt_period; 653 654 weight = cpumask_weight(rd->span); 655 656 raw_spin_lock(&rt_b->rt_runtime_lock); 657 rt_period = ktime_to_ns(rt_b->rt_period); 658 for_each_cpu(i, rd->span) { 659 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 660 s64 diff; 661 662 if (iter == rt_rq) 663 continue; 664 665 raw_spin_lock(&iter->rt_runtime_lock); 666 /* 667 * Either all rqs have inf runtime and there's nothing to steal 668 * or __disable_runtime() below sets a specific rq to inf to 669 * indicate its been disabled and disalow stealing. 670 */ 671 if (iter->rt_runtime == RUNTIME_INF) 672 goto next; 673 674 /* 675 * From runqueues with spare time, take 1/n part of their 676 * spare time, but no more than our period. 677 */ 678 diff = iter->rt_runtime - iter->rt_time; 679 if (diff > 0) { 680 diff = div_u64((u64)diff, weight); 681 if (rt_rq->rt_runtime + diff > rt_period) 682 diff = rt_period - rt_rq->rt_runtime; 683 iter->rt_runtime -= diff; 684 rt_rq->rt_runtime += diff; 685 if (rt_rq->rt_runtime == rt_period) { 686 raw_spin_unlock(&iter->rt_runtime_lock); 687 break; 688 } 689 } 690 next: 691 raw_spin_unlock(&iter->rt_runtime_lock); 692 } 693 raw_spin_unlock(&rt_b->rt_runtime_lock); 694 } 695 696 /* 697 * Ensure this RQ takes back all the runtime it lend to its neighbours. 698 */ 699 static void __disable_runtime(struct rq *rq) 700 { 701 struct root_domain *rd = rq->rd; 702 rt_rq_iter_t iter; 703 struct rt_rq *rt_rq; 704 705 if (unlikely(!scheduler_running)) 706 return; 707 708 for_each_rt_rq(rt_rq, iter, rq) { 709 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 710 s64 want; 711 int i; 712 713 raw_spin_lock(&rt_b->rt_runtime_lock); 714 raw_spin_lock(&rt_rq->rt_runtime_lock); 715 /* 716 * Either we're all inf and nobody needs to borrow, or we're 717 * already disabled and thus have nothing to do, or we have 718 * exactly the right amount of runtime to take out. 719 */ 720 if (rt_rq->rt_runtime == RUNTIME_INF || 721 rt_rq->rt_runtime == rt_b->rt_runtime) 722 goto balanced; 723 raw_spin_unlock(&rt_rq->rt_runtime_lock); 724 725 /* 726 * Calculate the difference between what we started out with 727 * and what we current have, that's the amount of runtime 728 * we lend and now have to reclaim. 729 */ 730 want = rt_b->rt_runtime - rt_rq->rt_runtime; 731 732 /* 733 * Greedy reclaim, take back as much as we can. 734 */ 735 for_each_cpu(i, rd->span) { 736 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 737 s64 diff; 738 739 /* 740 * Can't reclaim from ourselves or disabled runqueues. 741 */ 742 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 743 continue; 744 745 raw_spin_lock(&iter->rt_runtime_lock); 746 if (want > 0) { 747 diff = min_t(s64, iter->rt_runtime, want); 748 iter->rt_runtime -= diff; 749 want -= diff; 750 } else { 751 iter->rt_runtime -= want; 752 want -= want; 753 } 754 raw_spin_unlock(&iter->rt_runtime_lock); 755 756 if (!want) 757 break; 758 } 759 760 raw_spin_lock(&rt_rq->rt_runtime_lock); 761 /* 762 * We cannot be left wanting - that would mean some runtime 763 * leaked out of the system. 764 */ 765 BUG_ON(want); 766 balanced: 767 /* 768 * Disable all the borrow logic by pretending we have inf 769 * runtime - in which case borrowing doesn't make sense. 770 */ 771 rt_rq->rt_runtime = RUNTIME_INF; 772 rt_rq->rt_throttled = 0; 773 raw_spin_unlock(&rt_rq->rt_runtime_lock); 774 raw_spin_unlock(&rt_b->rt_runtime_lock); 775 776 /* Make rt_rq available for pick_next_task() */ 777 sched_rt_rq_enqueue(rt_rq); 778 } 779 } 780 781 static void __enable_runtime(struct rq *rq) 782 { 783 rt_rq_iter_t iter; 784 struct rt_rq *rt_rq; 785 786 if (unlikely(!scheduler_running)) 787 return; 788 789 /* 790 * Reset each runqueue's bandwidth settings 791 */ 792 for_each_rt_rq(rt_rq, iter, rq) { 793 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 794 795 raw_spin_lock(&rt_b->rt_runtime_lock); 796 raw_spin_lock(&rt_rq->rt_runtime_lock); 797 rt_rq->rt_runtime = rt_b->rt_runtime; 798 rt_rq->rt_time = 0; 799 rt_rq->rt_throttled = 0; 800 raw_spin_unlock(&rt_rq->rt_runtime_lock); 801 raw_spin_unlock(&rt_b->rt_runtime_lock); 802 } 803 } 804 805 static void balance_runtime(struct rt_rq *rt_rq) 806 { 807 if (!sched_feat(RT_RUNTIME_SHARE)) 808 return; 809 810 if (rt_rq->rt_time > rt_rq->rt_runtime) { 811 raw_spin_unlock(&rt_rq->rt_runtime_lock); 812 do_balance_runtime(rt_rq); 813 raw_spin_lock(&rt_rq->rt_runtime_lock); 814 } 815 } 816 #else /* !CONFIG_SMP */ 817 static inline void balance_runtime(struct rt_rq *rt_rq) {} 818 #endif /* CONFIG_SMP */ 819 820 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 821 { 822 int i, idle = 1, throttled = 0; 823 const struct cpumask *span; 824 825 span = sched_rt_period_mask(); 826 #ifdef CONFIG_RT_GROUP_SCHED 827 /* 828 * FIXME: isolated CPUs should really leave the root task group, 829 * whether they are isolcpus or were isolated via cpusets, lest 830 * the timer run on a CPU which does not service all runqueues, 831 * potentially leaving other CPUs indefinitely throttled. If 832 * isolation is really required, the user will turn the throttle 833 * off to kill the perturbations it causes anyway. Meanwhile, 834 * this maintains functionality for boot and/or troubleshooting. 835 */ 836 if (rt_b == &root_task_group.rt_bandwidth) 837 span = cpu_online_mask; 838 #endif 839 for_each_cpu(i, span) { 840 int enqueue = 0; 841 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 842 struct rq *rq = rq_of_rt_rq(rt_rq); 843 844 raw_spin_lock(&rq->lock); 845 if (rt_rq->rt_time) { 846 u64 runtime; 847 848 raw_spin_lock(&rt_rq->rt_runtime_lock); 849 if (rt_rq->rt_throttled) 850 balance_runtime(rt_rq); 851 runtime = rt_rq->rt_runtime; 852 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); 853 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 854 rt_rq->rt_throttled = 0; 855 enqueue = 1; 856 857 /* 858 * When we're idle and a woken (rt) task is 859 * throttled check_preempt_curr() will set 860 * skip_update and the time between the wakeup 861 * and this unthrottle will get accounted as 862 * 'runtime'. 863 */ 864 if (rt_rq->rt_nr_running && rq->curr == rq->idle) 865 rq_clock_skip_update(rq, false); 866 } 867 if (rt_rq->rt_time || rt_rq->rt_nr_running) 868 idle = 0; 869 raw_spin_unlock(&rt_rq->rt_runtime_lock); 870 } else if (rt_rq->rt_nr_running) { 871 idle = 0; 872 if (!rt_rq_throttled(rt_rq)) 873 enqueue = 1; 874 } 875 if (rt_rq->rt_throttled) 876 throttled = 1; 877 878 if (enqueue) 879 sched_rt_rq_enqueue(rt_rq); 880 raw_spin_unlock(&rq->lock); 881 } 882 883 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) 884 return 1; 885 886 return idle; 887 } 888 889 static inline int rt_se_prio(struct sched_rt_entity *rt_se) 890 { 891 #ifdef CONFIG_RT_GROUP_SCHED 892 struct rt_rq *rt_rq = group_rt_rq(rt_se); 893 894 if (rt_rq) 895 return rt_rq->highest_prio.curr; 896 #endif 897 898 return rt_task_of(rt_se)->prio; 899 } 900 901 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) 902 { 903 u64 runtime = sched_rt_runtime(rt_rq); 904 905 if (rt_rq->rt_throttled) 906 return rt_rq_throttled(rt_rq); 907 908 if (runtime >= sched_rt_period(rt_rq)) 909 return 0; 910 911 balance_runtime(rt_rq); 912 runtime = sched_rt_runtime(rt_rq); 913 if (runtime == RUNTIME_INF) 914 return 0; 915 916 if (rt_rq->rt_time > runtime) { 917 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 918 919 /* 920 * Don't actually throttle groups that have no runtime assigned 921 * but accrue some time due to boosting. 922 */ 923 if (likely(rt_b->rt_runtime)) { 924 rt_rq->rt_throttled = 1; 925 printk_deferred_once("sched: RT throttling activated\n"); 926 } else { 927 /* 928 * In case we did anyway, make it go away, 929 * replenishment is a joke, since it will replenish us 930 * with exactly 0 ns. 931 */ 932 rt_rq->rt_time = 0; 933 } 934 935 if (rt_rq_throttled(rt_rq)) { 936 sched_rt_rq_dequeue(rt_rq); 937 return 1; 938 } 939 } 940 941 return 0; 942 } 943 944 /* 945 * Update the current task's runtime statistics. Skip current tasks that 946 * are not in our scheduling class. 947 */ 948 static void update_curr_rt(struct rq *rq) 949 { 950 struct task_struct *curr = rq->curr; 951 struct sched_rt_entity *rt_se = &curr->rt; 952 u64 delta_exec; 953 954 if (curr->sched_class != &rt_sched_class) 955 return; 956 957 delta_exec = rq_clock_task(rq) - curr->se.exec_start; 958 if (unlikely((s64)delta_exec <= 0)) 959 return; 960 961 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ 962 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); 963 964 schedstat_set(curr->se.statistics.exec_max, 965 max(curr->se.statistics.exec_max, delta_exec)); 966 967 curr->se.sum_exec_runtime += delta_exec; 968 account_group_exec_runtime(curr, delta_exec); 969 970 curr->se.exec_start = rq_clock_task(rq); 971 cpuacct_charge(curr, delta_exec); 972 973 sched_rt_avg_update(rq, delta_exec); 974 975 if (!rt_bandwidth_enabled()) 976 return; 977 978 for_each_sched_rt_entity(rt_se) { 979 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 980 981 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 982 raw_spin_lock(&rt_rq->rt_runtime_lock); 983 rt_rq->rt_time += delta_exec; 984 if (sched_rt_runtime_exceeded(rt_rq)) 985 resched_curr(rq); 986 raw_spin_unlock(&rt_rq->rt_runtime_lock); 987 } 988 } 989 } 990 991 static void 992 dequeue_top_rt_rq(struct rt_rq *rt_rq) 993 { 994 struct rq *rq = rq_of_rt_rq(rt_rq); 995 996 BUG_ON(&rq->rt != rt_rq); 997 998 if (!rt_rq->rt_queued) 999 return; 1000 1001 BUG_ON(!rq->nr_running); 1002 1003 sub_nr_running(rq, rt_rq->rt_nr_running); 1004 rt_rq->rt_queued = 0; 1005 } 1006 1007 static void 1008 enqueue_top_rt_rq(struct rt_rq *rt_rq) 1009 { 1010 struct rq *rq = rq_of_rt_rq(rt_rq); 1011 1012 BUG_ON(&rq->rt != rt_rq); 1013 1014 if (rt_rq->rt_queued) 1015 return; 1016 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) 1017 return; 1018 1019 add_nr_running(rq, rt_rq->rt_nr_running); 1020 rt_rq->rt_queued = 1; 1021 } 1022 1023 #if defined CONFIG_SMP 1024 1025 static void 1026 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 1027 { 1028 struct rq *rq = rq_of_rt_rq(rt_rq); 1029 1030 #ifdef CONFIG_RT_GROUP_SCHED 1031 /* 1032 * Change rq's cpupri only if rt_rq is the top queue. 1033 */ 1034 if (&rq->rt != rt_rq) 1035 return; 1036 #endif 1037 if (rq->online && prio < prev_prio) 1038 cpupri_set(&rq->rd->cpupri, rq->cpu, prio); 1039 } 1040 1041 static void 1042 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 1043 { 1044 struct rq *rq = rq_of_rt_rq(rt_rq); 1045 1046 #ifdef CONFIG_RT_GROUP_SCHED 1047 /* 1048 * Change rq's cpupri only if rt_rq is the top queue. 1049 */ 1050 if (&rq->rt != rt_rq) 1051 return; 1052 #endif 1053 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 1054 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 1055 } 1056 1057 #else /* CONFIG_SMP */ 1058 1059 static inline 1060 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 1061 static inline 1062 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 1063 1064 #endif /* CONFIG_SMP */ 1065 1066 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 1067 static void 1068 inc_rt_prio(struct rt_rq *rt_rq, int prio) 1069 { 1070 int prev_prio = rt_rq->highest_prio.curr; 1071 1072 if (prio < prev_prio) 1073 rt_rq->highest_prio.curr = prio; 1074 1075 inc_rt_prio_smp(rt_rq, prio, prev_prio); 1076 } 1077 1078 static void 1079 dec_rt_prio(struct rt_rq *rt_rq, int prio) 1080 { 1081 int prev_prio = rt_rq->highest_prio.curr; 1082 1083 if (rt_rq->rt_nr_running) { 1084 1085 WARN_ON(prio < prev_prio); 1086 1087 /* 1088 * This may have been our highest task, and therefore 1089 * we may have some recomputation to do 1090 */ 1091 if (prio == prev_prio) { 1092 struct rt_prio_array *array = &rt_rq->active; 1093 1094 rt_rq->highest_prio.curr = 1095 sched_find_first_bit(array->bitmap); 1096 } 1097 1098 } else 1099 rt_rq->highest_prio.curr = MAX_RT_PRIO; 1100 1101 dec_rt_prio_smp(rt_rq, prio, prev_prio); 1102 } 1103 1104 #else 1105 1106 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} 1107 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} 1108 1109 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ 1110 1111 #ifdef CONFIG_RT_GROUP_SCHED 1112 1113 static void 1114 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1115 { 1116 if (rt_se_boosted(rt_se)) 1117 rt_rq->rt_nr_boosted++; 1118 1119 if (rt_rq->tg) 1120 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); 1121 } 1122 1123 static void 1124 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1125 { 1126 if (rt_se_boosted(rt_se)) 1127 rt_rq->rt_nr_boosted--; 1128 1129 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); 1130 } 1131 1132 #else /* CONFIG_RT_GROUP_SCHED */ 1133 1134 static void 1135 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1136 { 1137 start_rt_bandwidth(&def_rt_bandwidth); 1138 } 1139 1140 static inline 1141 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} 1142 1143 #endif /* CONFIG_RT_GROUP_SCHED */ 1144 1145 static inline 1146 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) 1147 { 1148 struct rt_rq *group_rq = group_rt_rq(rt_se); 1149 1150 if (group_rq) 1151 return group_rq->rt_nr_running; 1152 else 1153 return 1; 1154 } 1155 1156 static inline 1157 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) 1158 { 1159 struct rt_rq *group_rq = group_rt_rq(rt_se); 1160 struct task_struct *tsk; 1161 1162 if (group_rq) 1163 return group_rq->rr_nr_running; 1164 1165 tsk = rt_task_of(rt_se); 1166 1167 return (tsk->policy == SCHED_RR) ? 1 : 0; 1168 } 1169 1170 static inline 1171 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1172 { 1173 int prio = rt_se_prio(rt_se); 1174 1175 WARN_ON(!rt_prio(prio)); 1176 rt_rq->rt_nr_running += rt_se_nr_running(rt_se); 1177 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); 1178 1179 inc_rt_prio(rt_rq, prio); 1180 inc_rt_migration(rt_se, rt_rq); 1181 inc_rt_group(rt_se, rt_rq); 1182 } 1183 1184 static inline 1185 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1186 { 1187 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1188 WARN_ON(!rt_rq->rt_nr_running); 1189 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); 1190 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); 1191 1192 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1193 dec_rt_migration(rt_se, rt_rq); 1194 dec_rt_group(rt_se, rt_rq); 1195 } 1196 1197 /* 1198 * Change rt_se->run_list location unless SAVE && !MOVE 1199 * 1200 * assumes ENQUEUE/DEQUEUE flags match 1201 */ 1202 static inline bool move_entity(unsigned int flags) 1203 { 1204 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) 1205 return false; 1206 1207 return true; 1208 } 1209 1210 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) 1211 { 1212 list_del_init(&rt_se->run_list); 1213 1214 if (list_empty(array->queue + rt_se_prio(rt_se))) 1215 __clear_bit(rt_se_prio(rt_se), array->bitmap); 1216 1217 rt_se->on_list = 0; 1218 } 1219 1220 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1221 { 1222 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1223 struct rt_prio_array *array = &rt_rq->active; 1224 struct rt_rq *group_rq = group_rt_rq(rt_se); 1225 struct list_head *queue = array->queue + rt_se_prio(rt_se); 1226 1227 /* 1228 * Don't enqueue the group if its throttled, or when empty. 1229 * The latter is a consequence of the former when a child group 1230 * get throttled and the current group doesn't have any other 1231 * active members. 1232 */ 1233 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { 1234 if (rt_se->on_list) 1235 __delist_rt_entity(rt_se, array); 1236 return; 1237 } 1238 1239 if (move_entity(flags)) { 1240 WARN_ON_ONCE(rt_se->on_list); 1241 if (flags & ENQUEUE_HEAD) 1242 list_add(&rt_se->run_list, queue); 1243 else 1244 list_add_tail(&rt_se->run_list, queue); 1245 1246 __set_bit(rt_se_prio(rt_se), array->bitmap); 1247 rt_se->on_list = 1; 1248 } 1249 rt_se->on_rq = 1; 1250 1251 inc_rt_tasks(rt_se, rt_rq); 1252 } 1253 1254 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1255 { 1256 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1257 struct rt_prio_array *array = &rt_rq->active; 1258 1259 if (move_entity(flags)) { 1260 WARN_ON_ONCE(!rt_se->on_list); 1261 __delist_rt_entity(rt_se, array); 1262 } 1263 rt_se->on_rq = 0; 1264 1265 dec_rt_tasks(rt_se, rt_rq); 1266 } 1267 1268 /* 1269 * Because the prio of an upper entry depends on the lower 1270 * entries, we must remove entries top - down. 1271 */ 1272 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) 1273 { 1274 struct sched_rt_entity *back = NULL; 1275 1276 for_each_sched_rt_entity(rt_se) { 1277 rt_se->back = back; 1278 back = rt_se; 1279 } 1280 1281 dequeue_top_rt_rq(rt_rq_of_se(back)); 1282 1283 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1284 if (on_rt_rq(rt_se)) 1285 __dequeue_rt_entity(rt_se, flags); 1286 } 1287 } 1288 1289 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1290 { 1291 struct rq *rq = rq_of_rt_se(rt_se); 1292 1293 dequeue_rt_stack(rt_se, flags); 1294 for_each_sched_rt_entity(rt_se) 1295 __enqueue_rt_entity(rt_se, flags); 1296 enqueue_top_rt_rq(&rq->rt); 1297 } 1298 1299 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1300 { 1301 struct rq *rq = rq_of_rt_se(rt_se); 1302 1303 dequeue_rt_stack(rt_se, flags); 1304 1305 for_each_sched_rt_entity(rt_se) { 1306 struct rt_rq *rt_rq = group_rt_rq(rt_se); 1307 1308 if (rt_rq && rt_rq->rt_nr_running) 1309 __enqueue_rt_entity(rt_se, flags); 1310 } 1311 enqueue_top_rt_rq(&rq->rt); 1312 } 1313 1314 /* 1315 * Adding/removing a task to/from a priority array: 1316 */ 1317 static void 1318 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1319 { 1320 struct sched_rt_entity *rt_se = &p->rt; 1321 1322 if (flags & ENQUEUE_WAKEUP) 1323 rt_se->timeout = 0; 1324 1325 enqueue_rt_entity(rt_se, flags); 1326 1327 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1328 enqueue_pushable_task(rq, p); 1329 } 1330 1331 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1332 { 1333 struct sched_rt_entity *rt_se = &p->rt; 1334 1335 update_curr_rt(rq); 1336 dequeue_rt_entity(rt_se, flags); 1337 1338 dequeue_pushable_task(rq, p); 1339 } 1340 1341 /* 1342 * Put task to the head or the end of the run list without the overhead of 1343 * dequeue followed by enqueue. 1344 */ 1345 static void 1346 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1347 { 1348 if (on_rt_rq(rt_se)) { 1349 struct rt_prio_array *array = &rt_rq->active; 1350 struct list_head *queue = array->queue + rt_se_prio(rt_se); 1351 1352 if (head) 1353 list_move(&rt_se->run_list, queue); 1354 else 1355 list_move_tail(&rt_se->run_list, queue); 1356 } 1357 } 1358 1359 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) 1360 { 1361 struct sched_rt_entity *rt_se = &p->rt; 1362 struct rt_rq *rt_rq; 1363 1364 for_each_sched_rt_entity(rt_se) { 1365 rt_rq = rt_rq_of_se(rt_se); 1366 requeue_rt_entity(rt_rq, rt_se, head); 1367 } 1368 } 1369 1370 static void yield_task_rt(struct rq *rq) 1371 { 1372 requeue_task_rt(rq, rq->curr, 0); 1373 } 1374 1375 #ifdef CONFIG_SMP 1376 static int find_lowest_rq(struct task_struct *task); 1377 1378 static int 1379 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) 1380 { 1381 struct task_struct *curr; 1382 struct rq *rq; 1383 1384 /* For anything but wake ups, just return the task_cpu */ 1385 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1386 goto out; 1387 1388 rq = cpu_rq(cpu); 1389 1390 rcu_read_lock(); 1391 curr = READ_ONCE(rq->curr); /* unlocked access */ 1392 1393 /* 1394 * If the current task on @p's runqueue is an RT task, then 1395 * try to see if we can wake this RT task up on another 1396 * runqueue. Otherwise simply start this RT task 1397 * on its current runqueue. 1398 * 1399 * We want to avoid overloading runqueues. If the woken 1400 * task is a higher priority, then it will stay on this CPU 1401 * and the lower prio task should be moved to another CPU. 1402 * Even though this will probably make the lower prio task 1403 * lose its cache, we do not want to bounce a higher task 1404 * around just because it gave up its CPU, perhaps for a 1405 * lock? 1406 * 1407 * For equal prio tasks, we just let the scheduler sort it out. 1408 * 1409 * Otherwise, just let it ride on the affined RQ and the 1410 * post-schedule router will push the preempted task away 1411 * 1412 * This test is optimistic, if we get it wrong the load-balancer 1413 * will have to sort it out. 1414 */ 1415 if (curr && unlikely(rt_task(curr)) && 1416 (curr->nr_cpus_allowed < 2 || 1417 curr->prio <= p->prio)) { 1418 int target = find_lowest_rq(p); 1419 1420 /* 1421 * Don't bother moving it if the destination CPU is 1422 * not running a lower priority task. 1423 */ 1424 if (target != -1 && 1425 p->prio < cpu_rq(target)->rt.highest_prio.curr) 1426 cpu = target; 1427 } 1428 rcu_read_unlock(); 1429 1430 out: 1431 return cpu; 1432 } 1433 1434 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1435 { 1436 /* 1437 * Current can't be migrated, useless to reschedule, 1438 * let's hope p can move out. 1439 */ 1440 if (rq->curr->nr_cpus_allowed == 1 || 1441 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) 1442 return; 1443 1444 /* 1445 * p is migratable, so let's not schedule it and 1446 * see if it is pushed or pulled somewhere else. 1447 */ 1448 if (p->nr_cpus_allowed != 1 1449 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1450 return; 1451 1452 /* 1453 * There appears to be other cpus that can accept 1454 * current and none to run 'p', so lets reschedule 1455 * to try and push current away: 1456 */ 1457 requeue_task_rt(rq, p, 1); 1458 resched_curr(rq); 1459 } 1460 1461 #endif /* CONFIG_SMP */ 1462 1463 /* 1464 * Preempt the current task with a newly woken task if needed: 1465 */ 1466 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) 1467 { 1468 if (p->prio < rq->curr->prio) { 1469 resched_curr(rq); 1470 return; 1471 } 1472 1473 #ifdef CONFIG_SMP 1474 /* 1475 * If: 1476 * 1477 * - the newly woken task is of equal priority to the current task 1478 * - the newly woken task is non-migratable while current is migratable 1479 * - current will be preempted on the next reschedule 1480 * 1481 * we should check to see if current can readily move to a different 1482 * cpu. If so, we will reschedule to allow the push logic to try 1483 * to move current somewhere else, making room for our non-migratable 1484 * task. 1485 */ 1486 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) 1487 check_preempt_equal_prio(rq, p); 1488 #endif 1489 } 1490 1491 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 1492 struct rt_rq *rt_rq) 1493 { 1494 struct rt_prio_array *array = &rt_rq->active; 1495 struct sched_rt_entity *next = NULL; 1496 struct list_head *queue; 1497 int idx; 1498 1499 idx = sched_find_first_bit(array->bitmap); 1500 BUG_ON(idx >= MAX_RT_PRIO); 1501 1502 queue = array->queue + idx; 1503 next = list_entry(queue->next, struct sched_rt_entity, run_list); 1504 1505 return next; 1506 } 1507 1508 static struct task_struct *_pick_next_task_rt(struct rq *rq) 1509 { 1510 struct sched_rt_entity *rt_se; 1511 struct task_struct *p; 1512 struct rt_rq *rt_rq = &rq->rt; 1513 1514 do { 1515 rt_se = pick_next_rt_entity(rq, rt_rq); 1516 BUG_ON(!rt_se); 1517 rt_rq = group_rt_rq(rt_se); 1518 } while (rt_rq); 1519 1520 p = rt_task_of(rt_se); 1521 p->se.exec_start = rq_clock_task(rq); 1522 1523 return p; 1524 } 1525 1526 static struct task_struct * 1527 pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 1528 { 1529 struct task_struct *p; 1530 struct rt_rq *rt_rq = &rq->rt; 1531 1532 if (need_pull_rt_task(rq, prev)) { 1533 /* 1534 * This is OK, because current is on_cpu, which avoids it being 1535 * picked for load-balance and preemption/IRQs are still 1536 * disabled avoiding further scheduler activity on it and we're 1537 * being very careful to re-start the picking loop. 1538 */ 1539 rq_unpin_lock(rq, rf); 1540 pull_rt_task(rq); 1541 rq_repin_lock(rq, rf); 1542 /* 1543 * pull_rt_task() can drop (and re-acquire) rq->lock; this 1544 * means a dl or stop task can slip in, in which case we need 1545 * to re-start task selection. 1546 */ 1547 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || 1548 rq->dl.dl_nr_running)) 1549 return RETRY_TASK; 1550 } 1551 1552 /* 1553 * We may dequeue prev's rt_rq in put_prev_task(). 1554 * So, we update time before rt_nr_running check. 1555 */ 1556 if (prev->sched_class == &rt_sched_class) 1557 update_curr_rt(rq); 1558 1559 if (!rt_rq->rt_queued) 1560 return NULL; 1561 1562 put_prev_task(rq, prev); 1563 1564 p = _pick_next_task_rt(rq); 1565 1566 /* The running task is never eligible for pushing */ 1567 dequeue_pushable_task(rq, p); 1568 1569 queue_push_tasks(rq); 1570 1571 return p; 1572 } 1573 1574 static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1575 { 1576 update_curr_rt(rq); 1577 1578 /* 1579 * The previous task needs to be made eligible for pushing 1580 * if it is still active 1581 */ 1582 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) 1583 enqueue_pushable_task(rq, p); 1584 } 1585 1586 #ifdef CONFIG_SMP 1587 1588 /* Only try algorithms three times */ 1589 #define RT_MAX_TRIES 3 1590 1591 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1592 { 1593 if (!task_running(rq, p) && 1594 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1595 return 1; 1596 return 0; 1597 } 1598 1599 /* 1600 * Return the highest pushable rq's task, which is suitable to be executed 1601 * on the cpu, NULL otherwise 1602 */ 1603 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) 1604 { 1605 struct plist_head *head = &rq->rt.pushable_tasks; 1606 struct task_struct *p; 1607 1608 if (!has_pushable_tasks(rq)) 1609 return NULL; 1610 1611 plist_for_each_entry(p, head, pushable_tasks) { 1612 if (pick_rt_task(rq, p, cpu)) 1613 return p; 1614 } 1615 1616 return NULL; 1617 } 1618 1619 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1620 1621 static int find_lowest_rq(struct task_struct *task) 1622 { 1623 struct sched_domain *sd; 1624 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); 1625 int this_cpu = smp_processor_id(); 1626 int cpu = task_cpu(task); 1627 1628 /* Make sure the mask is initialized first */ 1629 if (unlikely(!lowest_mask)) 1630 return -1; 1631 1632 if (task->nr_cpus_allowed == 1) 1633 return -1; /* No other targets possible */ 1634 1635 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1636 return -1; /* No targets found */ 1637 1638 /* 1639 * At this point we have built a mask of cpus representing the 1640 * lowest priority tasks in the system. Now we want to elect 1641 * the best one based on our affinity and topology. 1642 * 1643 * We prioritize the last cpu that the task executed on since 1644 * it is most likely cache-hot in that location. 1645 */ 1646 if (cpumask_test_cpu(cpu, lowest_mask)) 1647 return cpu; 1648 1649 /* 1650 * Otherwise, we consult the sched_domains span maps to figure 1651 * out which cpu is logically closest to our hot cache data. 1652 */ 1653 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1654 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1655 1656 rcu_read_lock(); 1657 for_each_domain(cpu, sd) { 1658 if (sd->flags & SD_WAKE_AFFINE) { 1659 int best_cpu; 1660 1661 /* 1662 * "this_cpu" is cheaper to preempt than a 1663 * remote processor. 1664 */ 1665 if (this_cpu != -1 && 1666 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { 1667 rcu_read_unlock(); 1668 return this_cpu; 1669 } 1670 1671 best_cpu = cpumask_first_and(lowest_mask, 1672 sched_domain_span(sd)); 1673 if (best_cpu < nr_cpu_ids) { 1674 rcu_read_unlock(); 1675 return best_cpu; 1676 } 1677 } 1678 } 1679 rcu_read_unlock(); 1680 1681 /* 1682 * And finally, if there were no matches within the domains 1683 * just give the caller *something* to work with from the compatible 1684 * locations. 1685 */ 1686 if (this_cpu != -1) 1687 return this_cpu; 1688 1689 cpu = cpumask_any(lowest_mask); 1690 if (cpu < nr_cpu_ids) 1691 return cpu; 1692 return -1; 1693 } 1694 1695 /* Will lock the rq it finds */ 1696 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) 1697 { 1698 struct rq *lowest_rq = NULL; 1699 int tries; 1700 int cpu; 1701 1702 for (tries = 0; tries < RT_MAX_TRIES; tries++) { 1703 cpu = find_lowest_rq(task); 1704 1705 if ((cpu == -1) || (cpu == rq->cpu)) 1706 break; 1707 1708 lowest_rq = cpu_rq(cpu); 1709 1710 if (lowest_rq->rt.highest_prio.curr <= task->prio) { 1711 /* 1712 * Target rq has tasks of equal or higher priority, 1713 * retrying does not release any lock and is unlikely 1714 * to yield a different result. 1715 */ 1716 lowest_rq = NULL; 1717 break; 1718 } 1719 1720 /* if the prio of this runqueue changed, try again */ 1721 if (double_lock_balance(rq, lowest_rq)) { 1722 /* 1723 * We had to unlock the run queue. In 1724 * the mean time, task could have 1725 * migrated already or had its affinity changed. 1726 * Also make sure that it wasn't scheduled on its rq. 1727 */ 1728 if (unlikely(task_rq(task) != rq || 1729 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || 1730 task_running(rq, task) || 1731 !rt_task(task) || 1732 !task_on_rq_queued(task))) { 1733 1734 double_unlock_balance(rq, lowest_rq); 1735 lowest_rq = NULL; 1736 break; 1737 } 1738 } 1739 1740 /* If this rq is still suitable use it. */ 1741 if (lowest_rq->rt.highest_prio.curr > task->prio) 1742 break; 1743 1744 /* try again */ 1745 double_unlock_balance(rq, lowest_rq); 1746 lowest_rq = NULL; 1747 } 1748 1749 return lowest_rq; 1750 } 1751 1752 static struct task_struct *pick_next_pushable_task(struct rq *rq) 1753 { 1754 struct task_struct *p; 1755 1756 if (!has_pushable_tasks(rq)) 1757 return NULL; 1758 1759 p = plist_first_entry(&rq->rt.pushable_tasks, 1760 struct task_struct, pushable_tasks); 1761 1762 BUG_ON(rq->cpu != task_cpu(p)); 1763 BUG_ON(task_current(rq, p)); 1764 BUG_ON(p->nr_cpus_allowed <= 1); 1765 1766 BUG_ON(!task_on_rq_queued(p)); 1767 BUG_ON(!rt_task(p)); 1768 1769 return p; 1770 } 1771 1772 /* 1773 * If the current CPU has more than one RT task, see if the non 1774 * running task can migrate over to a CPU that is running a task 1775 * of lesser priority. 1776 */ 1777 static int push_rt_task(struct rq *rq) 1778 { 1779 struct task_struct *next_task; 1780 struct rq *lowest_rq; 1781 int ret = 0; 1782 1783 if (!rq->rt.overloaded) 1784 return 0; 1785 1786 next_task = pick_next_pushable_task(rq); 1787 if (!next_task) 1788 return 0; 1789 1790 retry: 1791 if (unlikely(next_task == rq->curr)) { 1792 WARN_ON(1); 1793 return 0; 1794 } 1795 1796 /* 1797 * It's possible that the next_task slipped in of 1798 * higher priority than current. If that's the case 1799 * just reschedule current. 1800 */ 1801 if (unlikely(next_task->prio < rq->curr->prio)) { 1802 resched_curr(rq); 1803 return 0; 1804 } 1805 1806 /* We might release rq lock */ 1807 get_task_struct(next_task); 1808 1809 /* find_lock_lowest_rq locks the rq if found */ 1810 lowest_rq = find_lock_lowest_rq(next_task, rq); 1811 if (!lowest_rq) { 1812 struct task_struct *task; 1813 /* 1814 * find_lock_lowest_rq releases rq->lock 1815 * so it is possible that next_task has migrated. 1816 * 1817 * We need to make sure that the task is still on the same 1818 * run-queue and is also still the next task eligible for 1819 * pushing. 1820 */ 1821 task = pick_next_pushable_task(rq); 1822 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1823 /* 1824 * The task hasn't migrated, and is still the next 1825 * eligible task, but we failed to find a run-queue 1826 * to push it to. Do not retry in this case, since 1827 * other cpus will pull from us when ready. 1828 */ 1829 goto out; 1830 } 1831 1832 if (!task) 1833 /* No more tasks, just exit */ 1834 goto out; 1835 1836 /* 1837 * Something has shifted, try again. 1838 */ 1839 put_task_struct(next_task); 1840 next_task = task; 1841 goto retry; 1842 } 1843 1844 deactivate_task(rq, next_task, 0); 1845 set_task_cpu(next_task, lowest_rq->cpu); 1846 activate_task(lowest_rq, next_task, 0); 1847 ret = 1; 1848 1849 resched_curr(lowest_rq); 1850 1851 double_unlock_balance(rq, lowest_rq); 1852 1853 out: 1854 put_task_struct(next_task); 1855 1856 return ret; 1857 } 1858 1859 static void push_rt_tasks(struct rq *rq) 1860 { 1861 /* push_rt_task will return true if it moved an RT */ 1862 while (push_rt_task(rq)) 1863 ; 1864 } 1865 1866 #ifdef HAVE_RT_PUSH_IPI 1867 /* 1868 * The search for the next cpu always starts at rq->cpu and ends 1869 * when we reach rq->cpu again. It will never return rq->cpu. 1870 * This returns the next cpu to check, or nr_cpu_ids if the loop 1871 * is complete. 1872 * 1873 * rq->rt.push_cpu holds the last cpu returned by this function, 1874 * or if this is the first instance, it must hold rq->cpu. 1875 */ 1876 static int rto_next_cpu(struct rq *rq) 1877 { 1878 int prev_cpu = rq->rt.push_cpu; 1879 int cpu; 1880 1881 cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); 1882 1883 /* 1884 * If the previous cpu is less than the rq's CPU, then it already 1885 * passed the end of the mask, and has started from the beginning. 1886 * We end if the next CPU is greater or equal to rq's CPU. 1887 */ 1888 if (prev_cpu < rq->cpu) { 1889 if (cpu >= rq->cpu) 1890 return nr_cpu_ids; 1891 1892 } else if (cpu >= nr_cpu_ids) { 1893 /* 1894 * We passed the end of the mask, start at the beginning. 1895 * If the result is greater or equal to the rq's CPU, then 1896 * the loop is finished. 1897 */ 1898 cpu = cpumask_first(rq->rd->rto_mask); 1899 if (cpu >= rq->cpu) 1900 return nr_cpu_ids; 1901 } 1902 rq->rt.push_cpu = cpu; 1903 1904 /* Return cpu to let the caller know if the loop is finished or not */ 1905 return cpu; 1906 } 1907 1908 static int find_next_push_cpu(struct rq *rq) 1909 { 1910 struct rq *next_rq; 1911 int cpu; 1912 1913 while (1) { 1914 cpu = rto_next_cpu(rq); 1915 if (cpu >= nr_cpu_ids) 1916 break; 1917 next_rq = cpu_rq(cpu); 1918 1919 /* Make sure the next rq can push to this rq */ 1920 if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) 1921 break; 1922 } 1923 1924 return cpu; 1925 } 1926 1927 #define RT_PUSH_IPI_EXECUTING 1 1928 #define RT_PUSH_IPI_RESTART 2 1929 1930 /* 1931 * When a high priority task schedules out from a CPU and a lower priority 1932 * task is scheduled in, a check is made to see if there's any RT tasks 1933 * on other CPUs that are waiting to run because a higher priority RT task 1934 * is currently running on its CPU. In this case, the CPU with multiple RT 1935 * tasks queued on it (overloaded) needs to be notified that a CPU has opened 1936 * up that may be able to run one of its non-running queued RT tasks. 1937 * 1938 * On large CPU boxes, there's the case that several CPUs could schedule 1939 * a lower priority task at the same time, in which case it will look for 1940 * any overloaded CPUs that it could pull a task from. To do this, the runqueue 1941 * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting 1942 * for a single overloaded CPU's runqueue lock can produce a large latency. 1943 * (This has actually been observed on large boxes running cyclictest). 1944 * Instead of taking the runqueue lock of the overloaded CPU, each of the 1945 * CPUs that scheduled a lower priority task simply sends an IPI to the 1946 * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with 1947 * lots of contention. The overloaded CPU will look to push its non-running 1948 * RT task off, and if it does, it can then ignore the other IPIs coming 1949 * in, and just pass those IPIs off to any other overloaded CPU. 1950 * 1951 * When a CPU schedules a lower priority task, it only sends an IPI to 1952 * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, 1953 * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with 1954 * RT overloaded tasks, would cause 100 IPIs to go out at once. 1955 * 1956 * The overloaded RT CPU, when receiving an IPI, will try to push off its 1957 * overloaded RT tasks and then send an IPI to the next CPU that has 1958 * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks 1959 * have completed. Just because a CPU may have pushed off its own overloaded 1960 * RT task does not mean it should stop sending the IPI around to other 1961 * overloaded CPUs. There may be another RT task waiting to run on one of 1962 * those CPUs that are of higher priority than the one that was just 1963 * pushed. 1964 * 1965 * An optimization that could possibly be made is to make a CPU array similar 1966 * to the cpupri array mask of all running RT tasks, but for the overloaded 1967 * case, then the IPI could be sent to only the CPU with the highest priority 1968 * RT task waiting, and that CPU could send off further IPIs to the CPU with 1969 * the next highest waiting task. Since the overloaded case is much less likely 1970 * to happen, the complexity of this implementation may not be worth it. 1971 * Instead, just send an IPI around to all overloaded CPUs. 1972 * 1973 * The rq->rt.push_flags holds the status of the IPI that is going around. 1974 * A run queue can only send out a single IPI at a time. The possible flags 1975 * for rq->rt.push_flags are: 1976 * 1977 * (None or zero): No IPI is going around for the current rq 1978 * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around 1979 * RT_PUSH_IPI_RESTART: The priority of the running task for the rq 1980 * has changed, and the IPI should restart 1981 * circulating the overloaded CPUs again. 1982 * 1983 * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated 1984 * before sending to the next CPU. 1985 * 1986 * Instead of having all CPUs that schedule a lower priority task send 1987 * an IPI to the same "first" CPU in the RT overload mask, they send it 1988 * to the next overloaded CPU after their own CPU. This helps distribute 1989 * the work when there's more than one overloaded CPU and multiple CPUs 1990 * scheduling in lower priority tasks. 1991 * 1992 * When a rq schedules a lower priority task than what was currently 1993 * running, the next CPU with overloaded RT tasks is examined first. 1994 * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower 1995 * priority task, it will send an IPI first to CPU 5, then CPU 5 will 1996 * send to CPU 1 if it is still overloaded. CPU 1 will clear the 1997 * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. 1998 * 1999 * The first CPU to notice IPI_RESTART is set, will clear that flag and then 2000 * send an IPI to the next overloaded CPU after the rq->cpu and not the next 2001 * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 2002 * schedules a lower priority task, and the IPI_RESTART gets set while the 2003 * handling is being done on CPU 5, it will clear the flag and send it back to 2004 * CPU 4 instead of CPU 1. 2005 * 2006 * Note, the above logic can be disabled by turning off the sched_feature 2007 * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be 2008 * taken by the CPU requesting a pull and the waiting RT task will be pulled 2009 * by that CPU. This may be fine for machines with few CPUs. 2010 */ 2011 static void tell_cpu_to_push(struct rq *rq) 2012 { 2013 int cpu; 2014 2015 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { 2016 raw_spin_lock(&rq->rt.push_lock); 2017 /* Make sure it's still executing */ 2018 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { 2019 /* 2020 * Tell the IPI to restart the loop as things have 2021 * changed since it started. 2022 */ 2023 rq->rt.push_flags |= RT_PUSH_IPI_RESTART; 2024 raw_spin_unlock(&rq->rt.push_lock); 2025 return; 2026 } 2027 raw_spin_unlock(&rq->rt.push_lock); 2028 } 2029 2030 /* When here, there's no IPI going around */ 2031 2032 rq->rt.push_cpu = rq->cpu; 2033 cpu = find_next_push_cpu(rq); 2034 if (cpu >= nr_cpu_ids) 2035 return; 2036 2037 rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; 2038 2039 irq_work_queue_on(&rq->rt.push_work, cpu); 2040 } 2041 2042 /* Called from hardirq context */ 2043 static void try_to_push_tasks(void *arg) 2044 { 2045 struct rt_rq *rt_rq = arg; 2046 struct rq *rq, *src_rq; 2047 int this_cpu; 2048 int cpu; 2049 2050 this_cpu = rt_rq->push_cpu; 2051 2052 /* Paranoid check */ 2053 BUG_ON(this_cpu != smp_processor_id()); 2054 2055 rq = cpu_rq(this_cpu); 2056 src_rq = rq_of_rt_rq(rt_rq); 2057 2058 again: 2059 if (has_pushable_tasks(rq)) { 2060 raw_spin_lock(&rq->lock); 2061 push_rt_task(rq); 2062 raw_spin_unlock(&rq->lock); 2063 } 2064 2065 /* Pass the IPI to the next rt overloaded queue */ 2066 raw_spin_lock(&rt_rq->push_lock); 2067 /* 2068 * If the source queue changed since the IPI went out, 2069 * we need to restart the search from that CPU again. 2070 */ 2071 if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { 2072 rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; 2073 rt_rq->push_cpu = src_rq->cpu; 2074 } 2075 2076 cpu = find_next_push_cpu(src_rq); 2077 2078 if (cpu >= nr_cpu_ids) 2079 rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; 2080 raw_spin_unlock(&rt_rq->push_lock); 2081 2082 if (cpu >= nr_cpu_ids) 2083 return; 2084 2085 /* 2086 * It is possible that a restart caused this CPU to be 2087 * chosen again. Don't bother with an IPI, just see if we 2088 * have more to push. 2089 */ 2090 if (unlikely(cpu == rq->cpu)) 2091 goto again; 2092 2093 /* Try the next RT overloaded CPU */ 2094 irq_work_queue_on(&rt_rq->push_work, cpu); 2095 } 2096 2097 static void push_irq_work_func(struct irq_work *work) 2098 { 2099 struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); 2100 2101 try_to_push_tasks(rt_rq); 2102 } 2103 #endif /* HAVE_RT_PUSH_IPI */ 2104 2105 static void pull_rt_task(struct rq *this_rq) 2106 { 2107 int this_cpu = this_rq->cpu, cpu; 2108 bool resched = false; 2109 struct task_struct *p; 2110 struct rq *src_rq; 2111 2112 if (likely(!rt_overloaded(this_rq))) 2113 return; 2114 2115 /* 2116 * Match the barrier from rt_set_overloaded; this guarantees that if we 2117 * see overloaded we must also see the rto_mask bit. 2118 */ 2119 smp_rmb(); 2120 2121 #ifdef HAVE_RT_PUSH_IPI 2122 if (sched_feat(RT_PUSH_IPI)) { 2123 tell_cpu_to_push(this_rq); 2124 return; 2125 } 2126 #endif 2127 2128 for_each_cpu(cpu, this_rq->rd->rto_mask) { 2129 if (this_cpu == cpu) 2130 continue; 2131 2132 src_rq = cpu_rq(cpu); 2133 2134 /* 2135 * Don't bother taking the src_rq->lock if the next highest 2136 * task is known to be lower-priority than our current task. 2137 * This may look racy, but if this value is about to go 2138 * logically higher, the src_rq will push this task away. 2139 * And if its going logically lower, we do not care 2140 */ 2141 if (src_rq->rt.highest_prio.next >= 2142 this_rq->rt.highest_prio.curr) 2143 continue; 2144 2145 /* 2146 * We can potentially drop this_rq's lock in 2147 * double_lock_balance, and another CPU could 2148 * alter this_rq 2149 */ 2150 double_lock_balance(this_rq, src_rq); 2151 2152 /* 2153 * We can pull only a task, which is pushable 2154 * on its rq, and no others. 2155 */ 2156 p = pick_highest_pushable_task(src_rq, this_cpu); 2157 2158 /* 2159 * Do we have an RT task that preempts 2160 * the to-be-scheduled task? 2161 */ 2162 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 2163 WARN_ON(p == src_rq->curr); 2164 WARN_ON(!task_on_rq_queued(p)); 2165 2166 /* 2167 * There's a chance that p is higher in priority 2168 * than what's currently running on its cpu. 2169 * This is just that p is wakeing up and hasn't 2170 * had a chance to schedule. We only pull 2171 * p if it is lower in priority than the 2172 * current task on the run queue 2173 */ 2174 if (p->prio < src_rq->curr->prio) 2175 goto skip; 2176 2177 resched = true; 2178 2179 deactivate_task(src_rq, p, 0); 2180 set_task_cpu(p, this_cpu); 2181 activate_task(this_rq, p, 0); 2182 /* 2183 * We continue with the search, just in 2184 * case there's an even higher prio task 2185 * in another runqueue. (low likelihood 2186 * but possible) 2187 */ 2188 } 2189 skip: 2190 double_unlock_balance(this_rq, src_rq); 2191 } 2192 2193 if (resched) 2194 resched_curr(this_rq); 2195 } 2196 2197 /* 2198 * If we are not running and we are not going to reschedule soon, we should 2199 * try to push tasks away now 2200 */ 2201 static void task_woken_rt(struct rq *rq, struct task_struct *p) 2202 { 2203 if (!task_running(rq, p) && 2204 !test_tsk_need_resched(rq->curr) && 2205 p->nr_cpus_allowed > 1 && 2206 (dl_task(rq->curr) || rt_task(rq->curr)) && 2207 (rq->curr->nr_cpus_allowed < 2 || 2208 rq->curr->prio <= p->prio)) 2209 push_rt_tasks(rq); 2210 } 2211 2212 /* Assumes rq->lock is held */ 2213 static void rq_online_rt(struct rq *rq) 2214 { 2215 if (rq->rt.overloaded) 2216 rt_set_overload(rq); 2217 2218 __enable_runtime(rq); 2219 2220 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); 2221 } 2222 2223 /* Assumes rq->lock is held */ 2224 static void rq_offline_rt(struct rq *rq) 2225 { 2226 if (rq->rt.overloaded) 2227 rt_clear_overload(rq); 2228 2229 __disable_runtime(rq); 2230 2231 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); 2232 } 2233 2234 /* 2235 * When switch from the rt queue, we bring ourselves to a position 2236 * that we might want to pull RT tasks from other runqueues. 2237 */ 2238 static void switched_from_rt(struct rq *rq, struct task_struct *p) 2239 { 2240 /* 2241 * If there are other RT tasks then we will reschedule 2242 * and the scheduling of the other RT tasks will handle 2243 * the balancing. But if we are the last RT task 2244 * we may need to handle the pulling of RT tasks 2245 * now. 2246 */ 2247 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) 2248 return; 2249 2250 queue_pull_task(rq); 2251 } 2252 2253 void __init init_sched_rt_class(void) 2254 { 2255 unsigned int i; 2256 2257 for_each_possible_cpu(i) { 2258 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 2259 GFP_KERNEL, cpu_to_node(i)); 2260 } 2261 } 2262 #endif /* CONFIG_SMP */ 2263 2264 /* 2265 * When switching a task to RT, we may overload the runqueue 2266 * with RT tasks. In this case we try to push them off to 2267 * other runqueues. 2268 */ 2269 static void switched_to_rt(struct rq *rq, struct task_struct *p) 2270 { 2271 /* 2272 * If we are already running, then there's nothing 2273 * that needs to be done. But if we are not running 2274 * we may need to preempt the current running task. 2275 * If that current running task is also an RT task 2276 * then see if we can move to another run queue. 2277 */ 2278 if (task_on_rq_queued(p) && rq->curr != p) { 2279 #ifdef CONFIG_SMP 2280 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2281 queue_push_tasks(rq); 2282 #endif /* CONFIG_SMP */ 2283 if (p->prio < rq->curr->prio) 2284 resched_curr(rq); 2285 } 2286 } 2287 2288 /* 2289 * Priority of the task has changed. This may cause 2290 * us to initiate a push or pull. 2291 */ 2292 static void 2293 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 2294 { 2295 if (!task_on_rq_queued(p)) 2296 return; 2297 2298 if (rq->curr == p) { 2299 #ifdef CONFIG_SMP 2300 /* 2301 * If our priority decreases while running, we 2302 * may need to pull tasks to this runqueue. 2303 */ 2304 if (oldprio < p->prio) 2305 queue_pull_task(rq); 2306 2307 /* 2308 * If there's a higher priority task waiting to run 2309 * then reschedule. 2310 */ 2311 if (p->prio > rq->rt.highest_prio.curr) 2312 resched_curr(rq); 2313 #else 2314 /* For UP simply resched on drop of prio */ 2315 if (oldprio < p->prio) 2316 resched_curr(rq); 2317 #endif /* CONFIG_SMP */ 2318 } else { 2319 /* 2320 * This task is not running, but if it is 2321 * greater than the current running task 2322 * then reschedule. 2323 */ 2324 if (p->prio < rq->curr->prio) 2325 resched_curr(rq); 2326 } 2327 } 2328 2329 #ifdef CONFIG_POSIX_TIMERS 2330 static void watchdog(struct rq *rq, struct task_struct *p) 2331 { 2332 unsigned long soft, hard; 2333 2334 /* max may change after cur was read, this will be fixed next tick */ 2335 soft = task_rlimit(p, RLIMIT_RTTIME); 2336 hard = task_rlimit_max(p, RLIMIT_RTTIME); 2337 2338 if (soft != RLIM_INFINITY) { 2339 unsigned long next; 2340 2341 if (p->rt.watchdog_stamp != jiffies) { 2342 p->rt.timeout++; 2343 p->rt.watchdog_stamp = jiffies; 2344 } 2345 2346 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 2347 if (p->rt.timeout > next) 2348 p->cputime_expires.sched_exp = p->se.sum_exec_runtime; 2349 } 2350 } 2351 #else 2352 static inline void watchdog(struct rq *rq, struct task_struct *p) { } 2353 #endif 2354 2355 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 2356 { 2357 struct sched_rt_entity *rt_se = &p->rt; 2358 2359 update_curr_rt(rq); 2360 2361 watchdog(rq, p); 2362 2363 /* 2364 * RR tasks need a special form of timeslice management. 2365 * FIFO tasks have no timeslices. 2366 */ 2367 if (p->policy != SCHED_RR) 2368 return; 2369 2370 if (--p->rt.time_slice) 2371 return; 2372 2373 p->rt.time_slice = sched_rr_timeslice; 2374 2375 /* 2376 * Requeue to the end of queue if we (and all of our ancestors) are not 2377 * the only element on the queue 2378 */ 2379 for_each_sched_rt_entity(rt_se) { 2380 if (rt_se->run_list.prev != rt_se->run_list.next) { 2381 requeue_task_rt(rq, p, 0); 2382 resched_curr(rq); 2383 return; 2384 } 2385 } 2386 } 2387 2388 static void set_curr_task_rt(struct rq *rq) 2389 { 2390 struct task_struct *p = rq->curr; 2391 2392 p->se.exec_start = rq_clock_task(rq); 2393 2394 /* The running task is never eligible for pushing */ 2395 dequeue_pushable_task(rq, p); 2396 } 2397 2398 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 2399 { 2400 /* 2401 * Time slice is 0 for SCHED_FIFO tasks 2402 */ 2403 if (task->policy == SCHED_RR) 2404 return sched_rr_timeslice; 2405 else 2406 return 0; 2407 } 2408 2409 const struct sched_class rt_sched_class = { 2410 .next = &fair_sched_class, 2411 .enqueue_task = enqueue_task_rt, 2412 .dequeue_task = dequeue_task_rt, 2413 .yield_task = yield_task_rt, 2414 2415 .check_preempt_curr = check_preempt_curr_rt, 2416 2417 .pick_next_task = pick_next_task_rt, 2418 .put_prev_task = put_prev_task_rt, 2419 2420 #ifdef CONFIG_SMP 2421 .select_task_rq = select_task_rq_rt, 2422 2423 .set_cpus_allowed = set_cpus_allowed_common, 2424 .rq_online = rq_online_rt, 2425 .rq_offline = rq_offline_rt, 2426 .task_woken = task_woken_rt, 2427 .switched_from = switched_from_rt, 2428 #endif 2429 2430 .set_curr_task = set_curr_task_rt, 2431 .task_tick = task_tick_rt, 2432 2433 .get_rr_interval = get_rr_interval_rt, 2434 2435 .prio_changed = prio_changed_rt, 2436 .switched_to = switched_to_rt, 2437 2438 .update_curr = update_curr_rt, 2439 }; 2440 2441 #ifdef CONFIG_SCHED_DEBUG 2442 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2443 2444 void print_rt_stats(struct seq_file *m, int cpu) 2445 { 2446 rt_rq_iter_t iter; 2447 struct rt_rq *rt_rq; 2448 2449 rcu_read_lock(); 2450 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) 2451 print_rt_rq(m, cpu, rt_rq); 2452 rcu_read_unlock(); 2453 } 2454 #endif /* CONFIG_SCHED_DEBUG */ 2455