/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#include "sched.h"

#include <linux/slab.h>

int sched_rr_timeslice = RR_TIMESLICE;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->highest_prio.next = MAX_RT_PRIO;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq, cpu_rq(i));
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);
	struct rq *rq = task_rq(p);

	return &rq->rt;
}

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 */
	wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}

static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total++;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory++;

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total--;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory--;

	update_rt_migration(rt_rq);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else
		rq->rt.highest_prio.next = MAX_RT_PRIO;
}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

#endif /* CONFIG_SMP */

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return !list_empty(&rt_se->run_list);
}

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups &&
		 task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
	     (iter = next_task_group(iter)) &&				\
	     (rt_rq = iter->rt_rq[cpu_of(rq)]);)

static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_add_rcu(&rt_rq->leaf_rt_rq_list,
			&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
}

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_del_rcu(&rt_rq->leaf_rt_rq_list);
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);

static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (rt_se && !on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, false);
		if (rt_rq->highest_prio.curr < curr->prio)
			resched_task(curr);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_se && on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_rq(smp_processor_id())->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)
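
/*
 * Note: without CONFIG_RT_GROUP_SCHED every sched_rt_entity is a task and
 * each CPU has exactly one rt_rq, so the iterators above visit a single
 * element and the group enqueue/dequeue hooks below reduce to a resched
 * or a no-op.
 */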

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_running)
		resched_task(rq_of_rt_rq(rt_rq)->curr);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight, more = 0;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			more = 1;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return more;
}

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent out and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void disable_runtime(struct rq *rq)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	__disable_runtime(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void enable_runtime(struct rq *rq)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	__enable_runtime(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		disable_runtime(cpu_rq(cpu));
		return NOTIFY_OK;

	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		enable_runtime(cpu_rq(cpu));
		return NOTIFY_OK;

	default:
		return NOTIFY_DONE;
	}
}

static int balance_runtime(struct rt_rq *rt_rq)
{
	int more = 0;

	if (!sched_feat(RT_RUNTIME_SHARE))
		return more;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		more = do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}

	return more;
}
#else /* !CONFIG_SMP */
static inline int balance_runtime(struct rt_rq *rt_rq)
{
	return 0;
}
#endif /* CONFIG_SMP */

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * FIXME: isolated CPUs should really leave the root task group,
	 * whether they are isolcpus or were isolated via cpusets, lest
	 * the timer run on a CPU which does not service all runqueues,
	 * potentially leaving other CPUs indefinitely throttled. If
	 * isolation is really required, the user will turn the throttle
	 * off to kill the perturbations it causes anyway. Meanwhile,
	 * this maintains functionality for boot and/or troubleshooting.
	 */
	if (rt_b == &root_task_group.rt_bandwidth)
		span = cpu_online_mask;
#endif
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);

		raw_spin_lock(&rq->lock);
		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * Force a clock update if the CPU was idle,
				 * lest wakeup -> unthrottle time accumulate.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq->skip_clock_update = -1;
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}
		if (rt_rq->rt_throttled)
			throttled = 1;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		raw_spin_unlock(&rq->lock);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
		return 1;

	return idle;
}

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}

static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			static bool once = false;

			rt_rq->rt_throttled = 1;

			if (!once) {
				once = true;
				printk_sched("sched: RT throttling activated\n");
			}
		} else {
			/*
			 * In case we did anyway, make it go away,
			 * replenishment is a joke, since it will replenish us
			 * with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}
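
/*
 * Putting the above together: update_curr_rt() below charges execution time
 * to rt_rq->rt_time; once rt_time exceeds the (possibly borrowed) rt_runtime,
 * sched_rt_runtime_exceeded() throttles and dequeues the rt_rq, and
 * do_sched_rt_period_timer() later subtracts up to one period's worth of
 * runtime per overrun and re-enqueues the rt_rq once rt_time drops below
 * rt_runtime again.
 */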

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	u64 delta_exec;

	if (curr->sched_class != &rt_sched_class)
		return;

	delta_exec = rq->clock_task - curr->se.exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	schedstat_set(curr->se.statistics.exec_max,
		      max(curr->se.statistics.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = rq->clock_task;
	cpuacct_charge(curr, delta_exec);

	sched_rt_avg_update(rq, delta_exec);

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			if (sched_rt_runtime_exceeded(rt_rq))
				resched_task(curr);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		}
	}
}

#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else
		rt_rq->highest_prio.curr = MAX_RT_PRIO;

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running++;

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running--;

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
		return;

	if (!rt_rq->rt_nr_running)
		list_add_leaf_rt_rq(rt_rq);

	if (head)
		list_add(&rt_se->run_list, queue);
	else
		list_add_tail(&rt_se->run_list, queue);
	__set_bit(rt_se_prio(rt_se), array->bitmap);

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	list_del_init(&rt_se->run_list);
	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	dec_rt_tasks(rt_se, rt_rq);
	if (!rt_rq->rt_nr_running)
		list_del_leaf_rt_rq(rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top-down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
	struct sched_rt_entity *back = NULL;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se);
	}
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	dequeue_rt_stack(rt_se);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, head);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	dequeue_rt_stack(rt_se);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, false);
	}
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	if (flags & ENQUEUE_WAKEUP)
		rt_se->timeout = 0;

	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);

	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);

	inc_nr_running(rq);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se);

	dequeue_pushable_task(rq, p);

	dec_nr_running(rq);
}

/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int
select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
{
	struct task_struct *curr;
	struct rq *rq;
	int cpu;

	cpu = task_cpu(p);

	if (p->nr_cpus_allowed == 1)
		goto out;

	/* For anything but wake ups, just return the task_cpu */
	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = ACCESS_ONCE(rq->curr); /* unlocked access */

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is of higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher prio task
	 * around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away.
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 */
	if (curr && unlikely(rt_task(curr)) &&
	    (curr->nr_cpus_allowed < 2 ||
	     curr->prio <= p->prio) &&
	    (p->nr_cpus_allowed > 1)) {
		int target = find_lowest_rq(p);

		if (target != -1)
			cpu = target;
	}
	rcu_read_unlock();

out:
	return cpu;
}

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	if (rq->curr->nr_cpus_allowed == 1)
		return;

	if (p->nr_cpus_allowed != 1
	    && cpupri_find(&rq->rd->cpupri, p, NULL))
		return;

	if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
		return;

	/*
	 * There appear to be other CPUs that can accept
	 * current and none to run 'p', so let's reschedule
	 * to try and push current away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_task(rq->curr);
}

#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->prio < rq->curr->prio) {
		resched_task(rq->curr);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu. If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}

static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
						   struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct task_struct *p;
	struct rt_rq *rt_rq;

	rt_rq = &rq->rt;

	if (!rt_rq->rt_nr_running)
		return NULL;

	if (rt_rq_throttled(rt_rq))
		return NULL;

	do {
		rt_se = pick_next_rt_entity(rq, rt_rq);
		BUG_ON(!rt_se);
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	p = rt_task_of(rt_se);
	p->se.exec_start = rq->clock_task;

	return p;
}

static struct task_struct *pick_next_task_rt(struct rq *rq)
{
	struct task_struct *p = _pick_next_task_rt(rq);

	/* The running task is never eligible for pushing */
	if (p)
		dequeue_pushable_task(rq, p);

#ifdef CONFIG_SMP
	/*
	 * We detect this state here so that we can avoid taking the RQ
	 * lock again later if there is no need to push
	 */
	rq->post_schedule = has_pushable_tasks(rq);
#endif

	return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);

	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (!task_running(rq, p) &&
	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
		return 1;
	return 0;
}

/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
	struct task_struct *next = NULL;
	struct sched_rt_entity *rt_se;
	struct rt_prio_array *array;
	struct rt_rq *rt_rq;
	int idx;

	for_each_leaf_rt_rq(rt_rq, rq) {
		array = &rt_rq->active;
		idx = sched_find_first_bit(array->bitmap);
next_idx:
		if (idx >= MAX_RT_PRIO)
			continue;
		if (next && next->prio <= idx)
			continue;
		list_for_each_entry(rt_se, array->queue + idx, run_list) {
			struct task_struct *p;

			if (!rt_entity_is_task(rt_se))
				continue;

			p = rt_task_of(rt_se);
			if (pick_rt_task(rq, p, cpu)) {
				next = p;
				break;
			}
		}
		if (!next) {
			idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
			goto next_idx;
		}
	}

	return next;
}

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu = task_cpu(task);

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask))
		return -1;

	if (task->nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1; /* No targets found */

	/*
	 * At this point we have built a mask of cpus representing the
	 * lowest priority tasks in the system.  Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last cpu that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which cpu is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_first_and(lowest_mask,
						     sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;
	return -1;
}
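
/*
 * The push/pull machinery below keeps the highest-priority runnable RT tasks
 * spread across CPUs: push_rt_task() hands this rq's highest-priority
 * pushable (queued, migratable, not running) task to the CPU chosen by
 * find_lowest_rq() above, while pull_rt_task() runs when this rq is about to
 * schedule something of lower priority and steals a higher-priority queued
 * task from an overloaded neighbour.
 */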

/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In the
			 * meantime, the task could have migrated already
			 * or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu,
						       tsk_cpus_allowed(task)) ||
				     task_running(rq, task) ||
				     !task->on_rq)) {

				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	p = plist_first_entry(&rq->rt.pushable_tasks,
			      struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->nr_cpus_allowed <= 1);

	BUG_ON(!p->on_rq);
	BUG_ON(!rt_task(p));

	return p;
}

/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	if (!rq->rt.overloaded)
		return 0;

	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

retry:
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that the next_task slipped in with
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_task(rq->curr);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task_cpu(next_task) == rq->cpu && task == next_task) {
			/*
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to. Do not retry in this case, since
			 * other cpus will pull from us when ready.
			 */
			goto out;
		}

		if (!task)
			/* No more tasks, just exit */
			goto out;

		/*
		 * Something has shifted, try again.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);
	ret = 1;

	resched_task(lowest_rq->curr);

	double_unlock_balance(rq, lowest_rq);

out:
	put_task_struct(next_task);

	return ret;
}

static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task() will return true if it moved an RT task */
	while (push_rt_task(rq))
		;
}

static int pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, ret = 0, cpu;
	struct task_struct *p;
	struct rq *src_rq;

	if (likely(!rt_overloaded(this_rq)))
		return 0;

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care.
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq
		 */
		double_lock_balance(this_rq, src_rq);

		/*
		 * Are there still pullable RT tasks?
		 */
		if (src_rq->rt.rt_nr_running <= 1)
			goto skip;

		p = pick_next_highest_task_rt(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!p->on_rq);

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its cpu.
			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue
			 */
			if (p->prio < src_rq->curr->prio)
				goto skip;

			ret = 1;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
skip:
		double_unlock_balance(this_rq, src_rq);
	}

	return ret;
}

static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	if (rq->rt.highest_prio.curr > prev->prio)
		pull_rt_task(rq);
}

static void post_schedule_rt(struct rq *rq)
{
	push_rt_tasks(rq);
}

/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    has_pushable_tasks(rq) &&
	    p->nr_cpus_allowed > 1 &&
	    rt_task(rq->curr) &&
	    (rq->curr->nr_cpus_allowed < 2 ||
	     rq->curr->prio <= p->prio))
		push_rt_tasks(rq);
}

static void set_cpus_allowed_rt(struct task_struct *p,
				const struct cpumask *new_mask)
{
	struct rq *rq;
	int weight;

	BUG_ON(!rt_task(p));

	if (!p->on_rq)
		return;

	weight = cpumask_weight(new_mask);

	/*
	 * Only update if the process changes whether it
	 * can migrate or not.
1835 */ 1836 if ((p->nr_cpus_allowed > 1) == (weight > 1)) 1837 return; 1838 1839 rq = task_rq(p); 1840 1841 /* 1842 * The process used to be able to migrate OR it can now migrate 1843 */ 1844 if (weight <= 1) { 1845 if (!task_current(rq, p)) 1846 dequeue_pushable_task(rq, p); 1847 BUG_ON(!rq->rt.rt_nr_migratory); 1848 rq->rt.rt_nr_migratory--; 1849 } else { 1850 if (!task_current(rq, p)) 1851 enqueue_pushable_task(rq, p); 1852 rq->rt.rt_nr_migratory++; 1853 } 1854 1855 update_rt_migration(&rq->rt); 1856 } 1857 1858 /* Assumes rq->lock is held */ 1859 static void rq_online_rt(struct rq *rq) 1860 { 1861 if (rq->rt.overloaded) 1862 rt_set_overload(rq); 1863 1864 __enable_runtime(rq); 1865 1866 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); 1867 } 1868 1869 /* Assumes rq->lock is held */ 1870 static void rq_offline_rt(struct rq *rq) 1871 { 1872 if (rq->rt.overloaded) 1873 rt_clear_overload(rq); 1874 1875 __disable_runtime(rq); 1876 1877 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); 1878 } 1879 1880 /* 1881 * When switch from the rt queue, we bring ourselves to a position 1882 * that we might want to pull RT tasks from other runqueues. 1883 */ 1884 static void switched_from_rt(struct rq *rq, struct task_struct *p) 1885 { 1886 /* 1887 * If there are other RT tasks then we will reschedule 1888 * and the scheduling of the other RT tasks will handle 1889 * the balancing. But if we are the last RT task 1890 * we may need to handle the pulling of RT tasks 1891 * now. 1892 */ 1893 if (!p->on_rq || rq->rt.rt_nr_running) 1894 return; 1895 1896 if (pull_rt_task(rq)) 1897 resched_task(rq->curr); 1898 } 1899 1900 void init_sched_rt_class(void) 1901 { 1902 unsigned int i; 1903 1904 for_each_possible_cpu(i) { 1905 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1906 GFP_KERNEL, cpu_to_node(i)); 1907 } 1908 } 1909 #endif /* CONFIG_SMP */ 1910 1911 /* 1912 * When switching a task to RT, we may overload the runqueue 1913 * with RT tasks. In this case we try to push them off to 1914 * other runqueues. 1915 */ 1916 static void switched_to_rt(struct rq *rq, struct task_struct *p) 1917 { 1918 int check_resched = 1; 1919 1920 /* 1921 * If we are already running, then there's nothing 1922 * that needs to be done. But if we are not running 1923 * we may need to preempt the current running task. 1924 * If that current running task is also an RT task 1925 * then see if we can move to another run queue. 1926 */ 1927 if (p->on_rq && rq->curr != p) { 1928 #ifdef CONFIG_SMP 1929 if (rq->rt.overloaded && push_rt_task(rq) && 1930 /* Don't resched if we changed runqueues */ 1931 rq != task_rq(p)) 1932 check_resched = 0; 1933 #endif /* CONFIG_SMP */ 1934 if (check_resched && p->prio < rq->curr->prio) 1935 resched_task(rq->curr); 1936 } 1937 } 1938 1939 /* 1940 * Priority of the task has changed. This may cause 1941 * us to initiate a push or pull. 1942 */ 1943 static void 1944 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1945 { 1946 if (!p->on_rq) 1947 return; 1948 1949 if (rq->curr == p) { 1950 #ifdef CONFIG_SMP 1951 /* 1952 * If our priority decreases while running, we 1953 * may need to pull tasks to this runqueue. 1954 */ 1955 if (oldprio < p->prio) 1956 pull_rt_task(rq); 1957 /* 1958 * If there's a higher priority task waiting to run 1959 * then reschedule. Note, the above pull_rt_task 1960 * can release the rq lock and p could migrate. 1961 * Only reschedule if p is still on the same runqueue. 
		 */
		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
			resched_task(p);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_task(p);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if it is
		 * greater than the current running task
		 * then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_task(rq->curr);
	}
}

static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		if (p->rt.watchdog_stamp != jiffies) {
			p->rt.timeout++;
			p->rt.watchdog_stamp = jiffies;
		}

		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next)
			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
	}
}

static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = sched_rr_timeslice;

	/*
	 * Requeue to the end of queue if we (and all of our ancestors) are the
	 * only element on the queue
	 */
	for_each_sched_rt_entity(rt_se) {
		if (rt_se->run_list.prev != rt_se->run_list.next) {
			requeue_task_rt(rq, p, 0);
			set_tsk_need_resched(p);
			return;
		}
	}
}

static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq->clock_task;

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);
}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return sched_rr_timeslice;
	else
		return 0;
}

const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,

	.set_cpus_allowed	= set_cpus_allowed_rt,
	.rq_online		= rq_online_rt,
	.rq_offline		= rq_offline_rt,
	.pre_schedule		= pre_schedule_rt,
	.post_schedule		= post_schedule_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task		= set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,
};

#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);

void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */