/* sched.c - SPU scheduler.
 *
 * Copyright (C) IBM 2005
 * Author: Mark Nutter <mnutter@us.ibm.com>
 *
 * 2006-03-31	NUMA domains added.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#undef DEBUG

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/completion.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/numa.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/pid_namespace.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/spu.h>
#include <asm/spu_csa.h>
#include <asm/spu_priv1.h>
#include "spufs.h"

struct spu_prio_array {
	DECLARE_BITMAP(bitmap, MAX_PRIO);
	struct list_head runq[MAX_PRIO];
	spinlock_t runq_lock;
	int nr_waiting;
};

static unsigned long spu_avenrun[3];
static struct spu_prio_array *spu_prio;
static struct task_struct *spusched_task;
static struct timer_list spusched_timer;

/*
 * Priority of a normal, non-rt, non-niced process (aka nice level 0).
 */
#define NORMAL_PRIO		120

/*
 * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
 * tick for every 10 CPU scheduler ticks.
 */
#define SPUSCHED_TICK		(10)

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 spu scheduler tick, whichever is
 * larger), default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 */
#define MIN_SPU_TIMESLICE	max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
#define DEF_SPU_TIMESLICE	(100 * HZ / (1000 * SPUSCHED_TICK))

#define MAX_USER_PRIO		(MAX_PRIO - MAX_RT_PRIO)
#define SCALE_PRIO(x, prio) \
	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)

/*
 * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
 * [800ms ... 100ms ... 5ms]
 *
 * The higher a thread's priority, the bigger timeslices
 * it gets during one round of execution.  But even the lowest
 * priority thread gets MIN_SPU_TIMESLICE worth of execution time.
 */
void spu_set_timeslice(struct spu_context *ctx)
{
	if (ctx->prio < NORMAL_PRIO)
		ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE * 4, ctx->prio);
	else
		ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE, ctx->prio);
}

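/*
 * Worked example of the scaling above (ignoring integer rounding): with
 * MAX_PRIO 140 and MAX_RT_PRIO 100, MAX_USER_PRIO / 2 is 20.  A nice -20
 * thread (prio 100) gets SCALE_PRIO(4 * DEF_SPU_TIMESLICE, 100) =
 * 4 * DEF * 40 / 20 = 8 * DEF, i.e. about 800 msecs.  A nice 0 thread
 * (prio 120) gets DEF * 20 / 20 = DEF = 100 msecs, and a nice 19 thread
 * (prio 139) gets DEF / 20, i.e. about 5 msecs, never less than
 * MIN_SPU_TIMESLICE.
 */
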
/*
 * Update scheduling information from the owning thread.
 */
void __spu_update_sched_info(struct spu_context *ctx)
{
	/*
	 * 32-bit assignments are atomic on powerpc, and we don't care about
	 * memory ordering here because retrieving the controlling thread is
	 * by definition racy.
	 */
	ctx->tid = current->pid;

	/*
	 * We do our own priority calculations, so we normally want
	 * ->static_prio to start with.  Unfortunately this field
	 * contains junk for threads with a realtime scheduling
	 * policy so we have to look at ->prio in this case.
	 */
	if (rt_prio(current->prio))
		ctx->prio = current->prio;
	else
		ctx->prio = current->static_prio;
	ctx->policy = current->policy;

	/*
	 * A lot of places that don't hold list_mutex poke into
	 * cpus_allowed, including grab_runnable_context which
	 * already holds the runq_lock.  So abuse runq_lock
	 * to protect this field as well.
	 */
	spin_lock(&spu_prio->runq_lock);
	ctx->cpus_allowed = current->cpus_allowed;
	spin_unlock(&spu_prio->runq_lock);
}

void spu_update_sched_info(struct spu_context *ctx)
{
	int node = ctx->spu->node;

	mutex_lock(&cbe_spu_info[node].list_mutex);
	__spu_update_sched_info(ctx);
	mutex_unlock(&cbe_spu_info[node].list_mutex);
}

static int __node_allowed(struct spu_context *ctx, int node)
{
	if (nr_cpus_node(node)) {
		cpumask_t mask = node_to_cpumask(node);

		if (cpus_intersects(mask, ctx->cpus_allowed))
			return 1;
	}

	return 0;
}

static int node_allowed(struct spu_context *ctx, int node)
{
	int rval;

	spin_lock(&spu_prio->runq_lock);
	rval = __node_allowed(ctx, node);
	spin_unlock(&spu_prio->runq_lock);

	return rval;
}

static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);

void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
{
	blocking_notifier_call_chain(&spu_switch_notifier,
			ctx ? ctx->object_id : 0, spu);
}

static void notify_spus_active(void)
{
	int node;

	/*
	 * Wake up the active spu_contexts.
	 *
	 * When the awakened processes see their "notify_active" flag is set,
	 * they will call spu_switch_notify();
	 */
	for_each_online_node(node) {
		struct spu *spu;

		mutex_lock(&cbe_spu_info[node].list_mutex);
		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
			if (spu->alloc_state != SPU_FREE) {
				struct spu_context *ctx = spu->ctx;
				set_bit(SPU_SCHED_NOTIFY_ACTIVE,
					&ctx->sched_flags);
				mb();
				wake_up_all(&ctx->stop_wq);
			}
		}
		mutex_unlock(&cbe_spu_info[node].list_mutex);
	}
}

int spu_switch_event_register(struct notifier_block *n)
{
	int ret;
	ret = blocking_notifier_chain_register(&spu_switch_notifier, n);
	if (!ret)
		notify_spus_active();
	return ret;
}
EXPORT_SYMBOL_GPL(spu_switch_event_register);

int spu_switch_event_unregister(struct notifier_block *n)
{
	return blocking_notifier_chain_unregister(&spu_switch_notifier, n);
}
EXPORT_SYMBOL_GPL(spu_switch_event_unregister);

/**
 * spu_bind_context - bind spu context to physical spu
 * @spu:	physical spu to bind to
 * @ctx:	context to bind
 */
static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
{
	pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid,
		 spu->number, spu->node);
	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);

	if (ctx->flags & SPU_CREATE_NOSCHED)
		atomic_inc(&cbe_spu_info[spu->node].reserved_spus);

	ctx->stats.slb_flt_base = spu->stats.slb_flt;
	ctx->stats.class2_intr_base = spu->stats.class2_intr;

	spu->ctx = ctx;
	spu->flags = 0;
	ctx->spu = spu;
	ctx->ops = &spu_hw_ops;
	spu->pid = current->pid;
	spu->tgid = current->tgid;
	spu_associate_mm(spu, ctx->owner);
	spu->ibox_callback = spufs_ibox_callback;
	spu->wbox_callback = spufs_wbox_callback;
	spu->stop_callback = spufs_stop_callback;
	spu->mfc_callback = spufs_mfc_callback;
	spu->dma_callback = spufs_dma_callback;
	mb();
	spu_unmap_mappings(ctx);
	spu_restore(&ctx->csa, spu);
	spu->timestamp = jiffies;
	spu_cpu_affinity_set(spu, raw_smp_processor_id());
	spu_switch_notify(spu, ctx);
	ctx->state = SPU_STATE_RUNNABLE;

	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
}

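/*
 * Gang affinity placement, as implemented by the helpers below:
 * aff_set_offsets() gives the gang's reference context offset 0,
 * contexts following it in the gang's aff_list get offsets 1, 2, ...
 * and contexts preceding it get -1, -2, ...  ctx_location() later walks
 * the reference SPU's aff_list by that many schedulable SPUs to find
 * the SPU a context should be bound to.
 */
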
/*
 * Must be used with the list_mutex held.
 */
static inline int sched_spu(struct spu *spu)
{
	BUG_ON(!mutex_is_locked(&cbe_spu_info[spu->node].list_mutex));

	return (!spu->ctx || !(spu->ctx->flags & SPU_CREATE_NOSCHED));
}

static void aff_merge_remaining_ctxs(struct spu_gang *gang)
{
	struct spu_context *ctx;

	list_for_each_entry(ctx, &gang->aff_list_head, aff_list) {
		if (list_empty(&ctx->aff_list))
			list_add(&ctx->aff_list, &gang->aff_list_head);
	}
	gang->aff_flags |= AFF_MERGED;
}

static void aff_set_offsets(struct spu_gang *gang)
{
	struct spu_context *ctx;
	int offset;

	offset = -1;
	list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
				    aff_list) {
		if (&ctx->aff_list == &gang->aff_list_head)
			break;
		ctx->aff_offset = offset--;
	}

	offset = 0;
	list_for_each_entry(ctx, gang->aff_ref_ctx->aff_list.prev, aff_list) {
		if (&ctx->aff_list == &gang->aff_list_head)
			break;
		ctx->aff_offset = offset++;
	}

	gang->aff_flags |= AFF_OFFSETS_SET;
}

static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
		int group_size, int lowest_offset)
{
	struct spu *spu;
	int node, n;

	/*
	 * TODO: A better algorithm could be used to find a good spu to be
	 *       used as reference location for the ctxs chain.
	 */
	node = cpu_to_node(raw_smp_processor_id());
	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(ctx, node))
			continue;
		mutex_lock(&cbe_spu_info[node].list_mutex);
		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
			if ((!mem_aff || spu->has_mem_affinity) &&
			    sched_spu(spu)) {
				mutex_unlock(&cbe_spu_info[node].list_mutex);
				return spu;
			}
		}
		mutex_unlock(&cbe_spu_info[node].list_mutex);
	}
	return NULL;
}

static void aff_set_ref_point_location(struct spu_gang *gang)
{
	int mem_aff, gs, lowest_offset;
	struct spu_context *ctx;
	struct spu *tmp;

	mem_aff = gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM;
	lowest_offset = 0;
	gs = 0;

	list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
		gs++;

	list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
				    aff_list) {
		if (&ctx->aff_list == &gang->aff_list_head)
			break;
		lowest_offset = ctx->aff_offset;
	}

	gang->aff_ref_spu = aff_ref_location(gang->aff_ref_ctx, mem_aff, gs,
					     lowest_offset);
}

static struct spu *ctx_location(struct spu *ref, int offset, int node)
{
	struct spu *spu;

	spu = NULL;
	if (offset >= 0) {
		list_for_each_entry(spu, ref->aff_list.prev, aff_list) {
			BUG_ON(spu->node != node);
			if (offset == 0)
				break;
			if (sched_spu(spu))
				offset--;
		}
	} else {
		list_for_each_entry_reverse(spu, ref->aff_list.next, aff_list) {
			BUG_ON(spu->node != node);
			if (offset == 0)
				break;
			if (sched_spu(spu))
				offset++;
		}
	}

	return spu;
}

/*
 * has_affinity is called each time a context is going to be scheduled.
 * It returns true if the context requested SPU affinity and the gang's
 * affinity reference spu, relative to which the context must be placed,
 * has been determined.
 */
static int has_affinity(struct spu_context *ctx)
{
	struct spu_gang *gang = ctx->gang;

	if (list_empty(&ctx->aff_list))
		return 0;

	if (!gang->aff_ref_spu) {
		if (!(gang->aff_flags & AFF_MERGED))
			aff_merge_remaining_ctxs(gang);
		if (!(gang->aff_flags & AFF_OFFSETS_SET))
			aff_set_offsets(gang);
		aff_set_ref_point_location(gang);
	}

	return gang->aff_ref_spu != NULL;
}

/**
 * spu_unbind_context - unbind spu context from physical spu
 * @spu:	physical spu to unbind from
 * @ctx:	context to unbind
 */
static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
{
	pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__,
		 spu->pid, spu->number, spu->node);
	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);

	if (spu->ctx->flags & SPU_CREATE_NOSCHED)
		atomic_dec(&cbe_spu_info[spu->node].reserved_spus);

	if (ctx->gang) {
		mutex_lock(&ctx->gang->aff_mutex);
		if (has_affinity(ctx)) {
			if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
				ctx->gang->aff_ref_spu = NULL;
		}
		mutex_unlock(&ctx->gang->aff_mutex);
	}

	spu_switch_notify(spu, NULL);
	spu_unmap_mappings(ctx);
	spu_save(&ctx->csa, spu);
	spu->timestamp = jiffies;
	ctx->state = SPU_STATE_SAVED;
	spu->ibox_callback = NULL;
	spu->wbox_callback = NULL;
	spu->stop_callback = NULL;
	spu->mfc_callback = NULL;
	spu->dma_callback = NULL;
	spu_associate_mm(spu, NULL);
	spu->pid = 0;
	spu->tgid = 0;
	ctx->ops = &spu_backing_ops;
	spu->flags = 0;
	spu->ctx = NULL;

	ctx->stats.slb_flt +=
		(spu->stats.slb_flt - ctx->stats.slb_flt_base);
	ctx->stats.class2_intr +=
		(spu->stats.class2_intr - ctx->stats.class2_intr_base);

	/* This maps the underlying spu state to idle */
	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
	ctx->spu = NULL;
}

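/*
 * A context sits on at most one runqueue list, indexed by its priority.
 * spu_prio->nr_waiting counts the queued contexts; the scheduler tick
 * timer is armed when that count goes from zero to one and stopped
 * again when the last waiter is removed.
 */
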
/**
 * __spu_add_to_rq - add a context to the runqueue
 * @ctx:	context to add
 */
static void __spu_add_to_rq(struct spu_context *ctx)
{
	/*
	 * Unfortunately this code path can be called from multiple threads
	 * on behalf of a single context due to the way the problem state
	 * mmap support works.
	 *
	 * Fortunately we need to wake up all these threads at the same time
	 * and can simply skip the runqueue addition for all but the first
	 * thread getting into this codepath.
	 *
	 * It's still quite hacky, and long-term we should proxy all other
	 * threads through the owner thread so that spu_run is in control
	 * of all the scheduling activity for a given context.
	 */
	if (list_empty(&ctx->rq)) {
		list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);
		set_bit(ctx->prio, spu_prio->bitmap);
		if (!spu_prio->nr_waiting++)
			__mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
	}
}

static void __spu_del_from_rq(struct spu_context *ctx)
{
	int prio = ctx->prio;

	if (!list_empty(&ctx->rq)) {
		if (!--spu_prio->nr_waiting)
			del_timer(&spusched_timer);
		list_del_init(&ctx->rq);

		if (list_empty(&spu_prio->runq[prio]))
			clear_bit(prio, spu_prio->bitmap);
	}
}

static void spu_prio_wait(struct spu_context *ctx)
{
	DEFINE_WAIT(wait);

	spin_lock(&spu_prio->runq_lock);
	prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
	if (!signal_pending(current)) {
		__spu_add_to_rq(ctx);
		spin_unlock(&spu_prio->runq_lock);
		mutex_unlock(&ctx->state_mutex);
		schedule();
		mutex_lock(&ctx->state_mutex);
		spin_lock(&spu_prio->runq_lock);
		__spu_del_from_rq(ctx);
	}
	spin_unlock(&spu_prio->runq_lock);
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&ctx->stop_wq, &wait);
}

static struct spu *spu_get_idle(struct spu_context *ctx)
{
	struct spu *spu, *aff_ref_spu;
	int node, n;

	if (ctx->gang) {
		mutex_lock(&ctx->gang->aff_mutex);
		if (has_affinity(ctx)) {
			aff_ref_spu = ctx->gang->aff_ref_spu;
			atomic_inc(&ctx->gang->aff_sched_count);
			mutex_unlock(&ctx->gang->aff_mutex);
			node = aff_ref_spu->node;

			mutex_lock(&cbe_spu_info[node].list_mutex);
			spu = ctx_location(aff_ref_spu, ctx->aff_offset, node);
			if (spu && spu->alloc_state == SPU_FREE)
				goto found;
			mutex_unlock(&cbe_spu_info[node].list_mutex);

			mutex_lock(&ctx->gang->aff_mutex);
			if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
				ctx->gang->aff_ref_spu = NULL;
			mutex_unlock(&ctx->gang->aff_mutex);

			return NULL;
		}
		mutex_unlock(&ctx->gang->aff_mutex);
	}
	node = cpu_to_node(raw_smp_processor_id());
	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(ctx, node))
			continue;

		mutex_lock(&cbe_spu_info[node].list_mutex);
		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
			if (spu->alloc_state == SPU_FREE)
				goto found;
		}
		mutex_unlock(&cbe_spu_info[node].list_mutex);
	}

	return NULL;

found:
	spu->alloc_state = SPU_USED;
	mutex_unlock(&cbe_spu_info[node].list_mutex);
	pr_debug("Got SPU %d %d\n", spu->number, spu->node);
	spu_init_channels(spu);
	return spu;
}

/**
 * find_victim - find a lower priority context to preempt
 * @ctx:	candidate context for running
 *
 * Returns the freed physical spu to run the new context on.
 */
static struct spu *find_victim(struct spu_context *ctx)
{
	struct spu_context *victim = NULL;
	struct spu *spu;
	int node, n;

	/*
	 * Look for a possible preemption candidate on the local node first.
	 * If there is no candidate look at the other nodes.  This isn't
	 * exactly fair, but so far the whole spu scheduler tries to keep
	 * a strong node affinity.  We might want to fine-tune this in
	 * the future.
	 */
restart:
	node = cpu_to_node(raw_smp_processor_id());
	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(ctx, node))
			continue;

		mutex_lock(&cbe_spu_info[node].list_mutex);
		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
			struct spu_context *tmp = spu->ctx;

			if (tmp && tmp->prio > ctx->prio &&
			    (!victim || tmp->prio > victim->prio))
				victim = spu->ctx;
		}
		mutex_unlock(&cbe_spu_info[node].list_mutex);

		if (victim) {
			/*
			 * This nests ctx->state_mutex, but we always lock
			 * higher priority contexts before lower priority
			 * ones, so this is safe until we introduce
			 * priority inheritance schemes.
			 */
			if (!mutex_trylock(&victim->state_mutex)) {
				victim = NULL;
				goto restart;
			}

			spu = victim->spu;
			if (!spu) {
				/*
				 * This race can happen because we've dropped
				 * the active list mutex.  Not a problem, just
				 * restart the search.
				 */
				mutex_unlock(&victim->state_mutex);
				victim = NULL;
				goto restart;
			}

			mutex_lock(&cbe_spu_info[node].list_mutex);
			cbe_spu_info[node].nr_active--;
			spu_unbind_context(spu, victim);
			mutex_unlock(&cbe_spu_info[node].list_mutex);

			victim->stats.invol_ctx_switch++;
			spu->stats.invol_ctx_switch++;
			mutex_unlock(&victim->state_mutex);
			/*
			 * We need to break out of the wait loop in spu_run
			 * manually to ensure this context gets put on the
			 * runqueue again ASAP.
			 */
			wake_up(&victim->stop_wq);
			return spu;
		}
	}

	return NULL;
}

/**
 * spu_activate - find a free spu for a context and execute it
 * @ctx:	spu context to schedule
 * @flags:	flags (currently ignored)
 *
 * Tries to find a free spu to run @ctx.  If no free spu is available
 * add the context to the runqueue so it gets woken up once an spu
 * is available.
 */
int spu_activate(struct spu_context *ctx, unsigned long flags)
{
	do {
		struct spu *spu;

		/*
		 * If there are multiple threads waiting for a single context
		 * only one actually binds the context while the others will
		 * only be able to acquire the state_mutex once the context
		 * already is in runnable state.
		 */
		if (ctx->spu)
			return 0;

		spu = spu_get_idle(ctx);
		/*
		 * If this is a realtime thread we try to get it running by
		 * preempting a lower priority thread.
		 */
		if (!spu && rt_prio(ctx->prio))
			spu = find_victim(ctx);
		if (spu) {
			int node = spu->node;

			mutex_lock(&cbe_spu_info[node].list_mutex);
			spu_bind_context(spu, ctx);
			cbe_spu_info[node].nr_active++;
			mutex_unlock(&cbe_spu_info[node].list_mutex);
			return 0;
		}

		spu_prio_wait(ctx);
	} while (!signal_pending(current));

	return -ERESTARTSYS;
}

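/*
 * The runqueue is an array of per-priority lists plus a bitmap of the
 * non-empty lists; grab_runnable_context() below uses find_first_bit()
 * to pick the highest-priority (lowest-numbered) runnable context whose
 * priority is better than the given cutoff.
 */
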
/**
 * grab_runnable_context - try to find a runnable context
 *
 * Remove the highest priority context on the runqueue and return it
 * to the caller.  Returns %NULL if no runnable context was found.
 */
static struct spu_context *grab_runnable_context(int prio, int node)
{
	struct spu_context *ctx;
	int best;

	spin_lock(&spu_prio->runq_lock);
	best = find_first_bit(spu_prio->bitmap, prio);
	while (best < prio) {
		struct list_head *rq = &spu_prio->runq[best];

		list_for_each_entry(ctx, rq, rq) {
			/* XXX(hch): check for affinity here as well */
			if (__node_allowed(ctx, node)) {
				__spu_del_from_rq(ctx);
				goto found;
			}
		}
		best++;
	}
	ctx = NULL;
found:
	spin_unlock(&spu_prio->runq_lock);
	return ctx;
}

static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
{
	struct spu *spu = ctx->spu;
	struct spu_context *new = NULL;

	if (spu) {
		new = grab_runnable_context(max_prio, spu->node);
		if (new || force) {
			int node = spu->node;

			mutex_lock(&cbe_spu_info[node].list_mutex);
			spu_unbind_context(spu, ctx);
			spu->alloc_state = SPU_FREE;
			cbe_spu_info[node].nr_active--;
			mutex_unlock(&cbe_spu_info[node].list_mutex);

			ctx->stats.vol_ctx_switch++;
			spu->stats.vol_ctx_switch++;

			if (new)
				wake_up(&new->stop_wq);
		}
	}

	return new != NULL;
}

/**
 * spu_deactivate - unbind a context from its physical spu
 * @ctx:	spu context to unbind
 *
 * Unbind @ctx from the physical spu it is running on and schedule
 * the highest priority context to run on the freed physical spu.
 */
void spu_deactivate(struct spu_context *ctx)
{
	__spu_deactivate(ctx, 1, MAX_PRIO);
}

/**
 * spu_yield - yield a physical spu if others are waiting
 * @ctx:	spu context to yield
 *
 * Check if there is a higher priority context waiting and if yes
 * unbind @ctx from the physical spu and schedule the highest
 * priority context to run on the freed physical spu instead.
 */
void spu_yield(struct spu_context *ctx)
{
	if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
		mutex_lock(&ctx->state_mutex);
		__spu_deactivate(ctx, 0, MAX_PRIO);
		mutex_unlock(&ctx->state_mutex);
	}
}

static noinline void spusched_tick(struct spu_context *ctx)
{
	if (ctx->flags & SPU_CREATE_NOSCHED)
		return;
	if (ctx->policy == SCHED_FIFO)
		return;

	if (--ctx->time_slice)
		return;

	/*
	 * Unfortunately list_mutex ranks outside of state_mutex, so
	 * we have to trylock here.  If we fail give the context another
	 * tick and try again.
	 */
	if (mutex_trylock(&ctx->state_mutex)) {
		struct spu *spu = ctx->spu;
		struct spu_context *new;

		new = grab_runnable_context(ctx->prio + 1, spu->node);
		if (new) {
			spu_unbind_context(spu, ctx);
			ctx->stats.invol_ctx_switch++;
			spu->stats.invol_ctx_switch++;
			spu->alloc_state = SPU_FREE;
			cbe_spu_info[spu->node].nr_active--;
			wake_up(&new->stop_wq);
			/*
			 * We need to break out of the wait loop in
			 * spu_run manually to ensure this context
			 * gets put on the runqueue again ASAP.
			 */
			wake_up(&ctx->stop_wq);
		}
		spu_set_timeslice(ctx);
		mutex_unlock(&ctx->state_mutex);
	} else {
		ctx->time_slice++;
	}
}

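/*
 * The spu_avenrun[] load estimates below mirror the CPU loadavg: 1, 5
 * and 15 minute exponentially decayed averages of the number of running
 * or waiting contexts, recomputed every LOAD_FREQ ticks with the same
 * CALC_LOAD() fixed-point helpers the CPU scheduler uses.
 */
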
/**
 * count_active_contexts - count nr of active tasks
 *
 * Return the number of tasks currently running or waiting to run.
 *
 * Note that we don't take runq_lock / list_mutex here.  Reading
 * a single 32-bit value is atomic on powerpc, and we don't care
 * about memory ordering issues here.
 */
static unsigned long count_active_contexts(void)
{
	int nr_active = 0, node;

	for (node = 0; node < MAX_NUMNODES; node++)
		nr_active += cbe_spu_info[node].nr_active;
	nr_active += spu_prio->nr_waiting;

	return nr_active;
}

/**
 * spu_calc_load - given tick count, update the avenrun load estimates.
 * @ticks:	tick count
 *
 * No locking against reading these values from userspace, as for
 * the CPU loadavg code.
 */
static void spu_calc_load(unsigned long ticks)
{
	unsigned long active_tasks; /* fixed-point */
	static int count = LOAD_FREQ;

	count -= ticks;

	if (unlikely(count < 0)) {
		active_tasks = count_active_contexts() * FIXED_1;
		do {
			CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks);
			CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks);
			CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
			count += LOAD_FREQ;
		} while (count < 0);
	}
}

static void spusched_wake(unsigned long data)
{
	mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
	wake_up_process(spusched_task);
	spu_calc_load(SPUSCHED_TICK);
}

static int spusched_thread(void *unused)
{
	struct spu *spu;
	int node;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
		for (node = 0; node < MAX_NUMNODES; node++) {
			mutex_lock(&cbe_spu_info[node].list_mutex);
			list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
				if (spu->ctx)
					spusched_tick(spu->ctx);
			mutex_unlock(&cbe_spu_info[node].list_mutex);
		}
	}

	return 0;
}

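/*
 * spu_avenrun[] values are fixed point with FSHIFT fractional bits.
 * LOAD_INT() extracts the integer part, LOAD_FRAC() the first two
 * decimal places; adding FIXED_1/200 below rounds to the nearest
 * hundredth before printing.
 */
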
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)

static int show_spu_loadavg(struct seq_file *s, void *private)
{
	int a, b, c;

	a = spu_avenrun[0] + (FIXED_1/200);
	b = spu_avenrun[1] + (FIXED_1/200);
	c = spu_avenrun[2] + (FIXED_1/200);

	/*
	 * Note that last_pid doesn't really make much sense for the
	 * SPU loadavg (it even seems very odd on the CPU side..),
	 * but we include it here to have a 100% compatible interface.
	 */
	seq_printf(s, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		count_active_contexts(),
		atomic_read(&nr_spu_contexts),
		current->nsproxy->pid_ns->last_pid);
	return 0;
}

static int spu_loadavg_open(struct inode *inode, struct file *file)
{
	return single_open(file, show_spu_loadavg, NULL);
}

static const struct file_operations spu_loadavg_fops = {
	.open		= spu_loadavg_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

int __init spu_sched_init(void)
{
	struct proc_dir_entry *entry;
	int err = -ENOMEM, i;

	spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
	if (!spu_prio)
		goto out;

	for (i = 0; i < MAX_PRIO; i++) {
		INIT_LIST_HEAD(&spu_prio->runq[i]);
		__clear_bit(i, spu_prio->bitmap);
	}
	spin_lock_init(&spu_prio->runq_lock);

	setup_timer(&spusched_timer, spusched_wake, 0);

	spusched_task = kthread_run(spusched_thread, NULL, "spusched");
	if (IS_ERR(spusched_task)) {
		err = PTR_ERR(spusched_task);
		goto out_free_spu_prio;
	}

	entry = create_proc_entry("spu_loadavg", 0, NULL);
	if (!entry)
		goto out_stop_kthread;
	entry->proc_fops = &spu_loadavg_fops;

	pr_debug("spusched: tick: %d, min ticks: %d, default ticks: %d\n",
			SPUSCHED_TICK, MIN_SPU_TIMESLICE, DEF_SPU_TIMESLICE);
	return 0;

out_stop_kthread:
	kthread_stop(spusched_task);
out_free_spu_prio:
	kfree(spu_prio);
out:
	return err;
}

void spu_sched_exit(void)
{
	struct spu *spu;
	int node;

	remove_proc_entry("spu_loadavg", NULL);

	del_timer_sync(&spusched_timer);
	kthread_stop(spusched_task);

	for (node = 0; node < MAX_NUMNODES; node++) {
		mutex_lock(&cbe_spu_info[node].list_mutex);
		list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
			if (spu->alloc_state != SPU_FREE)
				spu->alloc_state = SPU_FREE;
		mutex_unlock(&cbe_spu_info[node].list_mutex);
	}
	kfree(spu_prio);
}