/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#include "sched.h"

#include <linux/slab.h>
#include <linux/irq_work.h>

int sched_rr_timeslice = RR_TIMESLICE;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_SMP
static void push_irq_work_func(struct irq_work *work);
#endif

void init_rt_rq(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->highest_prio.next = MAX_RT_PRIO;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);

#ifdef HAVE_RT_PUSH_IPI
	rt_rq->push_flags = 0;
	rt_rq->push_cpu = nr_cpu_ids;
	raw_spin_lock_init(&rt_rq->push_lock);
	init_irq_work(&rt_rq->push_work, push_irq_work_func);
#endif
#endif /* CONFIG_SMP */
	/* We start in dequeued state, because no RT tasks are queued */
	rt_rq->rt_queued = 0;

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_se->rt_rq;

	return rt_rq->rq;
}
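
/*
 * Note on the group-scheduling pointers used above: a group entity owns
 * a child runqueue via rt_se->my_q, while rt_se->rt_rq points at the
 * runqueue the entity itself is queued on (its parent's rt_rq, or the
 * root rq->rt).  A plain task has my_q == NULL, which is what
 * rt_entity_is_task() tests.
 */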
void free_rt_sched_group(struct task_group *tg)
{
	int i;

	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq);
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);

	return task_rq(p);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	return &rq->rt;
}

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */
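
/*
 * SMP balancing of RT tasks below works with a push/pull scheme driven
 * by the per-root-domain overload state: a runqueue with more than one
 * runnable RT task (at least one of them migratable) marks itself
 * overloaded and tries to push its lower-priority queued tasks to CPUs
 * running something of lesser priority, while a runqueue whose own
 * priority drops tries to pull a higher-priority task from an
 * overloaded neighbour.
 */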
#ifdef CONFIG_SMP

static int pull_rt_task(struct rq *this_rq);

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	return rq->rt.highest_prio.curr > prev->prio;
}

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 *
	 * Matched by the barrier in pull_rt_task().
	 */
	smp_wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}

static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total++;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory++;

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total--;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory--;

	update_rt_migration(rt_rq);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static inline void set_post_schedule(struct rq *rq)
{
	/*
	 * We detect this state here so that we can avoid taking the RQ
	 * lock again later if there is no need to push
	 */
	rq->post_schedule = has_pushable_tasks(rq);
}

static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else
		rq->rt.highest_prio.next = MAX_RT_PRIO;
}
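
/*
 * The pushable_tasks plist above is kept sorted by task priority (the
 * node is (re)initialized with p->prio), so plist_first_entry() is
 * always the highest-priority task that may be pushed away from this
 * runqueue; that is what keeps highest_prio.next cheap to maintain.
 */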
#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	return false;
}

static inline int pull_rt_task(struct rq *this_rq)
{
	return 0;
}

static inline void set_post_schedule(struct rq *rq)
{
}
#endif /* CONFIG_SMP */

static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq);

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return !list_empty(&rt_se->run_list);
}

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
		(iter = next_task_group(iter)) &&			\
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);

static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct rq *rq = rq_of_rt_rq(rt_rq);
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq);

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (!rt_se)
			enqueue_top_rt_rq(rt_rq);
		else if (!on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, false);

		if (rt_rq->highest_prio.curr < curr->prio)
			resched_curr(rq);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (!rt_se)
		dequeue_top_rt_rq(rt_rq);
	else if (on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)
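
/*
 * With CONFIG_RT_GROUP_SCHED the for_each_sched_rt_entity() loop above
 * walks the entity hierarchy from the task's own sched_rt_entity up
 * through its group entities to the root (the rt_se->parent chain);
 * without group scheduling there is no hierarchy and the loop visits
 * only the task's entity.
 */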
static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (!rt_rq->rt_nr_running)
		return;

	enqueue_top_rt_rq(rt_rq);
	resched_curr(rq);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	dequeue_top_rt_rq(rt_rq);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

	return (hrtimer_active(&rt_b->rt_period_timer) ||
		rt_rq->rt_time < rt_b->rt_runtime);
}

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight, more = 0;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			more = 1;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return more;
}
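
/*
 * Example of the borrowing above, assuming a 4-CPU root domain and the
 * default 950ms runtime in a 1s period: a throttled rt_rq may take up
 * to (spare / 4) from each neighbour per attempt, and it never grows
 * its local rt_runtime beyond the period itself.
 */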
/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);

		/* Make rt_rq available for pick_next_task() */
		sched_rt_rq_enqueue(rt_rq);
	}
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static int balance_runtime(struct rt_rq *rt_rq)
{
	int more = 0;

	if (!sched_feat(RT_RUNTIME_SHARE))
		return more;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		more = do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}

	return more;
}
#else /* !CONFIG_SMP */
static inline int balance_runtime(struct rt_rq *rt_rq)
{
	return 0;
}
#endif /* CONFIG_SMP */
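
/*
 * do_sched_rt_period_timer() below runs once per rt_period; it subtracts
 * up to one period's worth of runtime from the accumulated rt_time of
 * every rt_rq covered by the bandwidth object and unthrottles those
 * that have budget again.  With the usual defaults
 * (sysctl_sched_rt_runtime = 950000us in a 1000000us period) RT tasks
 * may consume at most 95% of each second before being throttled until
 * the next refill.
 */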
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * FIXME: isolated CPUs should really leave the root task group,
	 * whether they are isolcpus or were isolated via cpusets, lest
	 * the timer run on a CPU which does not service all runqueues,
	 * potentially leaving other CPUs indefinitely throttled.  If
	 * isolation is really required, the user will turn the throttle
	 * off to kill the perturbations it causes anyway.  Meanwhile,
	 * this maintains functionality for boot and/or troubleshooting.
	 */
	if (rt_b == &root_task_group.rt_bandwidth)
		span = cpu_online_mask;
#endif
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);

		raw_spin_lock(&rq->lock);
		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * When we're idle and a woken (rt) task is
				 * throttled, check_preempt_curr() will set
				 * skip_update and the time between the wakeup
				 * and this unthrottle will get accounted as
				 * 'runtime'.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq_clock_skip_update(rq, false);
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}
		if (rt_rq->rt_throttled)
			throttled = 1;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		raw_spin_unlock(&rq->lock);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
		return 1;

	return idle;
}

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}

static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			rt_rq->rt_throttled = 1;
			printk_deferred_once("sched: RT throttling activated\n");
		} else {
			/*
			 * In case we did anyway, make it go away;
			 * replenishment is pointless, since it would
			 * replenish us with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}
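
/*
 * Once rt_time exceeds the available runtime above, the whole rt_rq is
 * throttled and taken off the runqueue; it stays dequeued until
 * do_sched_rt_period_timer() hands it fresh budget.  Groups that are
 * priority-boosted (rt_nr_boosted) are exempt, so priority inheritance
 * is never blocked by the throttle.
 */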
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	u64 delta_exec;

	if (curr->sched_class != &rt_sched_class)
		return;

	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	schedstat_set(curr->se.statistics.exec_max,
		      max(curr->se.statistics.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = rq_clock_task(rq);
	cpuacct_charge(curr, delta_exec);

	sched_rt_avg_update(rq, delta_exec);

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			if (sched_rt_runtime_exceeded(rt_rq))
				resched_curr(rq);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		}
	}
}

static void
dequeue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (!rt_rq->rt_queued)
		return;

	BUG_ON(!rq->nr_running);

	sub_nr_running(rq, rt_rq->rt_nr_running);
	rt_rq->rt_queued = 0;
}

static void
enqueue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (rt_rq->rt_queued)
		return;
	if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
		return;

	add_nr_running(rq, rt_rq->rt_nr_running);
	rt_rq->rt_queued = 1;
}
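
/*
 * inc/dec_rt_prio_smp() below publish the runqueue's highest RT priority
 * into the root domain's cpupri structure; find_lowest_rq() consults the
 * same structure when it searches for a CPU running something of lower
 * priority to push a task to.
 */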
#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else
		rt_rq->highest_prio.curr = MAX_RT_PRIO;

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);

	if (group_rq)
		return group_rq->rt_nr_running;
	else
		return 1;
}

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}
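
/*
 * The enqueue/dequeue helpers below keep one FIFO list per priority in
 * rt_rq->active.queue[] plus a bitmap of non-empty lists; the extra bit
 * set at MAX_RT_PRIO in init_rt_rq() acts as a sentinel, so
 * sched_find_first_bit() always terminates and picking the next entity
 * is O(1).
 */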
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
		return;

	if (head)
		list_add(&rt_se->run_list, queue);
	else
		list_add_tail(&rt_se->run_list, queue);
	__set_bit(rt_se_prio(rt_se), array->bitmap);

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	list_del_init(&rt_se->run_list);
	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	dec_rt_tasks(rt_se, rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top-down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
	struct sched_rt_entity *back = NULL;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	dequeue_top_rt_rq(rt_rq_of_se(back));

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se);
	}
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	dequeue_rt_stack(rt_se);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, head);
	enqueue_top_rt_rq(&rq->rt);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	dequeue_rt_stack(rt_se);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, false);
	}
	enqueue_top_rt_rq(&rq->rt);
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	if (flags & ENQUEUE_WAKEUP)
		rt_se->timeout = 0;

	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);

	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se);

	dequeue_pushable_task(rq, p);
}

/*
 * Put a task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}
static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	struct task_struct *curr;
	struct rq *rq;

	/* For anything but wake ups, just return the task_cpu */
	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = ACCESS_ONCE(rq->curr); /* unlocked access */

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher priority
	 * task around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away.
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 */
	if (curr && unlikely(rt_task(curr)) &&
	    (curr->nr_cpus_allowed < 2 ||
	     curr->prio <= p->prio)) {
		int target = find_lowest_rq(p);

		/*
		 * Don't bother moving it if the destination CPU is
		 * not running a lower priority task.
		 */
		if (target != -1 &&
		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
			cpu = target;
	}
	rcu_read_unlock();

out:
	return cpu;
}

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	/*
	 * Current can't be migrated, useless to reschedule,
	 * let's hope p can move out.
	 */
	if (rq->curr->nr_cpus_allowed == 1 ||
	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
		return;

	/*
	 * p is migratable, so let's not schedule it and
	 * see if it is pushed or pulled somewhere else.
	 */
	if (p->nr_cpus_allowed != 1
	    && cpupri_find(&rq->rd->cpupri, p, NULL))
		return;

	/*
	 * There appear to be other CPUs that can accept
	 * current and none to run 'p', so let's reschedule
	 * to try and push current away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_curr(rq);
}

#endif /* CONFIG_SMP */
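
/*
 * Reminder on the priority comparisons used throughout: p->prio is the
 * kernel-internal value where a lower number means a higher priority.
 * An RT task with user-visible rt_priority N maps to
 * prio = MAX_RT_PRIO - 1 - N, so prio 0 is the highest RT priority and
 * MAX_RT_PRIO - 1 the lowest.
 */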
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->prio < rq->curr->prio) {
		resched_curr(rq);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu.  If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}

static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
						   struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}
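
/*
 * pick_next_rt_entity() returns the first entity on the highest
 * non-empty priority list.  _pick_next_task_rt() below then descends
 * through group entities (via my_q) until it reaches an entity that
 * represents an actual task.
 */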
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct task_struct *p;
	struct rt_rq *rt_rq = &rq->rt;

	do {
		rt_se = pick_next_rt_entity(rq, rt_rq);
		BUG_ON(!rt_se);
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	p = rt_task_of(rt_se);
	p->se.exec_start = rq_clock_task(rq);

	return p;
}

static struct task_struct *
pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
	struct task_struct *p;
	struct rt_rq *rt_rq = &rq->rt;

	if (need_pull_rt_task(rq, prev)) {
		pull_rt_task(rq);
		/*
		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
		 * means a dl or stop task can slip in, in which case we need
		 * to re-start task selection.
		 */
		if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
			     rq->dl.dl_nr_running))
			return RETRY_TASK;
	}

	/*
	 * We may dequeue prev's rt_rq in put_prev_task().
	 * So, we update time before rt_nr_running check.
	 */
	if (prev->sched_class == &rt_sched_class)
		update_curr_rt(rq);

	if (!rt_rq->rt_queued)
		return NULL;

	put_prev_task(rq, prev);

	p = _pick_next_task_rt(rq);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);

	set_post_schedule(rq);

	return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);

	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (!task_running(rq, p) &&
	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
		return 1;
	return 0;
}

/*
 * Return the highest pushable rq's task, which is suitable to be executed
 * on the cpu, NULL otherwise
 */
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
	struct plist_head *head = &rq->rt.pushable_tasks;
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	plist_for_each_entry(p, head, pushable_tasks) {
		if (pick_rt_task(rq, p, cpu))
			return p;
	}

	return NULL;
}

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu      = task_cpu(task);

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask))
		return -1;

	if (task->nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1; /* No targets found */

	/*
	 * At this point we have built a mask of cpus representing the
	 * lowest priority tasks in the system.  Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last cpu that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which cpu is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_first_and(lowest_mask,
						     sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;
	return -1;
}
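
/*
 * find_lock_lowest_rq() below may drop rq->lock inside
 * double_lock_balance(); by the time both locks are held the candidate
 * CPU, the task's affinity or the task itself may have changed, which
 * is why the search is retried up to RT_MAX_TRIES times and every
 * assumption is re-validated under the locks.
 */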
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		if (lowest_rq->rt.highest_prio.curr <= task->prio) {
			/*
			 * Target rq has tasks of equal or higher priority,
			 * retrying does not release any lock and is unlikely
			 * to yield a different result.
			 */
			lowest_rq = NULL;
			break;
		}

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the meantime, the task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu,
						       tsk_cpus_allowed(task)) ||
				     task_running(rq, task) ||
				     !task_on_rq_queued(task))) {

				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	p = plist_first_entry(&rq->rt.pushable_tasks,
			      struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->nr_cpus_allowed <= 1);

	BUG_ON(!task_on_rq_queued(p));
	BUG_ON(!rt_task(p));

	return p;
}
/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	if (!rq->rt.overloaded)
		return 0;

	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

retry:
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that the next_task slipped in with a
	 * higher priority than current.  If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_curr(rq);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task_cpu(next_task) == rq->cpu && task == next_task) {
			/*
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to.  Do not retry in this case, since
			 * other cpus will pull from us when ready.
			 */
			goto out;
		}

		if (!task)
			/* No more tasks, just exit */
			goto out;

		/*
		 * Something has shifted, try again.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);
	ret = 1;

	resched_curr(lowest_rq);

	double_unlock_balance(rq, lowest_rq);

out:
	put_task_struct(next_task);

	return ret;
}

static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task will return true if it moved an RT */
	while (push_rt_task(rq))
		;
}

#ifdef HAVE_RT_PUSH_IPI
/*
 * The search for the next cpu always starts at rq->cpu and ends
 * when we reach rq->cpu again. It will never return rq->cpu.
 * This returns the next cpu to check, or nr_cpu_ids if the loop
 * is complete.
 *
 * rq->rt.push_cpu holds the last cpu returned by this function,
 * or if this is the first instance, it must hold rq->cpu.
 */
static int rto_next_cpu(struct rq *rq)
{
	int prev_cpu = rq->rt.push_cpu;
	int cpu;

	cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);

	/*
	 * If the previous cpu is less than the rq's CPU, then it already
	 * passed the end of the mask, and has started from the beginning.
	 * We end if the next CPU is greater or equal to rq's CPU.
	 */
	if (prev_cpu < rq->cpu) {
		if (cpu >= rq->cpu)
			return nr_cpu_ids;

	} else if (cpu >= nr_cpu_ids) {
		/*
		 * We passed the end of the mask, start at the beginning.
		 * If the result is greater or equal to the rq's CPU, then
		 * the loop is finished.
		 */
		cpu = cpumask_first(rq->rd->rto_mask);
		if (cpu >= rq->cpu)
			return nr_cpu_ids;
	}
	rq->rt.push_cpu = cpu;

	/* Return cpu to let the caller know if the loop is finished or not */
	return cpu;
}

static int find_next_push_cpu(struct rq *rq)
{
	struct rq *next_rq;
	int cpu;

	while (1) {
		cpu = rto_next_cpu(rq);
		if (cpu >= nr_cpu_ids)
			break;
		next_rq = cpu_rq(cpu);

		/* Make sure the next rq can push to this rq */
		if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
			break;
	}

	return cpu;
}
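
/*
 * Meaning of the IPI-chain flags below: RT_PUSH_IPI_EXECUTING means an
 * irq_work is currently hopping along the rto_mask asking overloaded
 * CPUs to push; RT_PUSH_IPI_RESTART asks that chain to start its scan
 * over from the source CPU because conditions changed while it was in
 * flight.
 */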
#define RT_PUSH_IPI_EXECUTING		1
#define RT_PUSH_IPI_RESTART		2

static void tell_cpu_to_push(struct rq *rq)
{
	int cpu;

	if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
		raw_spin_lock(&rq->rt.push_lock);
		/* Make sure it's still executing */
		if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
			/*
			 * Tell the IPI to restart the loop as things have
			 * changed since it started.
			 */
			rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
			raw_spin_unlock(&rq->rt.push_lock);
			return;
		}
		raw_spin_unlock(&rq->rt.push_lock);
	}

	/* When here, there's no IPI going around */

	rq->rt.push_cpu = rq->cpu;
	cpu = find_next_push_cpu(rq);
	if (cpu >= nr_cpu_ids)
		return;

	rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;

	irq_work_queue_on(&rq->rt.push_work, cpu);
}

/* Called from hardirq context */
static void try_to_push_tasks(void *arg)
{
	struct rt_rq *rt_rq = arg;
	struct rq *rq, *src_rq;
	int this_cpu;
	int cpu;

	this_cpu = rt_rq->push_cpu;

	/* Paranoid check */
	BUG_ON(this_cpu != smp_processor_id());

	rq = cpu_rq(this_cpu);
	src_rq = rq_of_rt_rq(rt_rq);

again:
	if (has_pushable_tasks(rq)) {
		raw_spin_lock(&rq->lock);
		push_rt_task(rq);
		raw_spin_unlock(&rq->lock);
	}

	/* Pass the IPI to the next rt overloaded queue */
	raw_spin_lock(&rt_rq->push_lock);
	/*
	 * If the source queue changed since the IPI went out,
	 * we need to restart the search from that CPU again.
	 */
	if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
		rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
		rt_rq->push_cpu = src_rq->cpu;
	}

	cpu = find_next_push_cpu(src_rq);

	if (cpu >= nr_cpu_ids)
		rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
	raw_spin_unlock(&rt_rq->push_lock);

	if (cpu >= nr_cpu_ids)
		return;

	/*
	 * It is possible that a restart caused this CPU to be
	 * chosen again. Don't bother with an IPI, just see if we
	 * have more to push.
	 */
	if (unlikely(cpu == rq->cpu))
		goto again;

	/* Try the next RT overloaded CPU */
	irq_work_queue_on(&rt_rq->push_work, cpu);
}

static void push_irq_work_func(struct irq_work *work)
{
	struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);

	try_to_push_tasks(rt_rq);
}
#endif /* HAVE_RT_PUSH_IPI */
static int pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, ret = 0, cpu;
	struct task_struct *p;
	struct rq *src_rq;

	if (likely(!rt_overloaded(this_rq)))
		return 0;

	/*
	 * Match the barrier from rt_set_overload(); this guarantees that if we
	 * see overloaded we must also see the rto_mask bit.
	 */
	smp_rmb();

#ifdef HAVE_RT_PUSH_IPI
	if (sched_feat(RT_PUSH_IPI)) {
		tell_cpu_to_push(this_rq);
		return 0;
	}
#endif

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq
		 */
		double_lock_balance(this_rq, src_rq);

		/*
		 * We can pull only a task that is pushable on its rq,
		 * and no others.
		 */
		p = pick_highest_pushable_task(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!task_on_rq_queued(p));

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its cpu.
			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue
			 */
			if (p->prio < src_rq->curr->prio)
				goto skip;

			ret = 1;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
skip:
		double_unlock_balance(this_rq, src_rq);
	}

	return ret;
}

static void post_schedule_rt(struct rq *rq)
{
	push_rt_tasks(rq);
}

/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    has_pushable_tasks(rq) &&
	    p->nr_cpus_allowed > 1 &&
	    (dl_task(rq->curr) || rt_task(rq->curr)) &&
	    (rq->curr->nr_cpus_allowed < 2 ||
	     rq->curr->prio <= p->prio))
		push_rt_tasks(rq);
}

static void set_cpus_allowed_rt(struct task_struct *p,
				const struct cpumask *new_mask)
{
	struct rq *rq;
	int weight;

	BUG_ON(!rt_task(p));

	if (!task_on_rq_queued(p))
		return;

	weight = cpumask_weight(new_mask);

	/*
	 * Only update if the process changes whether or not it can migrate.
	 */
	if ((p->nr_cpus_allowed > 1) == (weight > 1))
		return;

	rq = task_rq(p);

	/*
	 * The process used to be able to migrate OR it can now migrate
	 */
	if (weight <= 1) {
		if (!task_current(rq, p))
			dequeue_pushable_task(rq, p);
		BUG_ON(!rq->rt.rt_nr_migratory);
		rq->rt.rt_nr_migratory--;
	} else {
		if (!task_current(rq, p))
			enqueue_pushable_task(rq, p);
		rq->rt.rt_nr_migratory++;
	}

	update_rt_migration(&rq->rt);
}

/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}
/*
 * When switching away from the RT queue, we bring ourselves to a position
 * where we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If there are other RT tasks then we will reschedule
	 * and the scheduling of the other RT tasks will handle
	 * the balancing. But if we are the last RT task
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
		return;

	if (pull_rt_task(rq))
		resched_curr(rq);
}

void __init init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	}
}
#endif /* CONFIG_SMP */

/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
	int check_resched = 1;

	/*
	 * If we are already running, then there's nothing
	 * that needs to be done. But if we are not running
	 * we may need to preempt the current running task.
	 * If that current running task is also an RT task
	 * then see if we can move to another run queue.
	 */
	if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
		    /* Don't resched if we changed runqueues */
		    push_rt_task(rq) && rq != task_rq(p))
			check_resched = 0;
#endif /* CONFIG_SMP */
		if (check_resched && p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}

/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!task_on_rq_queued(p))
		return;

	if (rq->curr == p) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			pull_rt_task(rq);
		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule. Note, the above pull_rt_task
		 * can release the rq lock and p could migrate.
		 * Only reschedule if p is still on the same runqueue.
		 */
		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
			resched_curr(rq);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_curr(rq);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if its priority is higher
		 * than that of the current running task then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}

static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		if (p->rt.watchdog_stamp != jiffies) {
			p->rt.timeout++;
			p->rt.watchdog_stamp = jiffies;
		}

		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next)
			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
	}
}
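
/*
 * A note on the watchdog above: RLIMIT_RTTIME is expressed in
 * microseconds of CPU time, while p->rt.timeout counts scheduler ticks,
 * hence the DIV_ROUND_UP() conversion by USEC_PER_SEC/HZ.  Posting
 * sched_exp arms the posix-cpu-timer path, which is expected to deliver
 * SIGXCPU to runaway RT tasks that exceed their limit.
 */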
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = sched_rr_timeslice;

	/*
	 * Requeue to the end of the queue if we (and all of our ancestors)
	 * are not the only element on the queue
	 */
	for_each_sched_rt_entity(rt_se) {
		if (rt_se->run_list.prev != rt_se->run_list.next) {
			requeue_task_rt(rq, p, 0);
			resched_curr(rq);
			return;
		}
	}
}

static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq_clock_task(rq);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);
}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return sched_rr_timeslice;
	else
		return 0;
}

const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,

	.set_cpus_allowed       = set_cpus_allowed_rt,
	.rq_online              = rq_online_rt,
	.rq_offline             = rq_offline_rt,
	.post_schedule		= post_schedule_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task          = set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,

	.update_curr		= update_curr_rt,
};

#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);

void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */