/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (c) 2004-2009 Silicon Graphics, Inc.  All Rights Reserved.
 */

/*
 * Cross Partition Communication (XPC) support - standard version.
 *
 *	XPC provides a message passing capability that crosses partition
 *	boundaries. This module is made up of two parts:
 *
 *	    partition	This part detects the presence/absence of other
 *			partitions. It provides a heartbeat and monitors
 *			the heartbeats of other partitions.
 *
 *	    channel	This part manages the channels and sends/receives
 *			messages across them to/from other partitions.
 *
 *	There are a couple of additional functions residing in XP, which
 *	provide an interface to XPC for its users.
 *
 *
 *	Caveats:
 *
 *	  . Currently on sn2, we have no way to determine which nasid an IRQ
 *	    came from. Thus, xpc_send_IRQ_sn2() does a remote amo write
 *	    followed by an IPI. The amo indicates where data is to be pulled
 *	    from, so after the IPI arrives, the remote partition checks the amo
 *	    word. The IPI can actually arrive before the amo however, so other
 *	    code must periodically check for this case. Also, remote amo
 *	    operations do not reliably time out. Thus we do a remote PIO read
 *	    solely to know whether the remote partition is down and whether we
 *	    should stop sending IPIs to it. This remote PIO read operation is
 *	    set up in a special nofault region so SAL knows to ignore (and
 *	    cleanup) any errors due to the remote amo write, PIO read, and/or
 *	    PIO write operations.
 *
 *	    If/when new hardware solves this IPI problem, we should abandon
 *	    the current approach.
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/kdebug.h>
#include <linux/kthread.h>
#include "xpc.h"

#ifdef CONFIG_X86_64
#include <asm/traps.h>
#endif

/* define two XPC debug device structures to be used with dev_dbg() et al */

struct device_driver xpc_dbg_name = {
	.name = "xpc"
};

struct device xpc_part_dbg_subname = {
	.init_name = "",	/* set to "part" at xpc_init() time */
	.driver = &xpc_dbg_name
};

struct device xpc_chan_dbg_subname = {
	.init_name = "",	/* set to "chan" at xpc_init() time */
	.driver = &xpc_dbg_name
};

struct device *xpc_part = &xpc_part_dbg_subname;
struct device *xpc_chan = &xpc_chan_dbg_subname;

static int xpc_kdebug_ignore;

/* systune related variables for /proc/sys directories */

static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
static int xpc_hb_min_interval = 1;
static int xpc_hb_max_interval = 10;

static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
static int xpc_hb_check_min_interval = 10;
static int xpc_hb_check_max_interval = 120;

int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
static int xpc_disengage_min_timelimit;	/* = 0 */
static int xpc_disengage_max_timelimit = 120;

static struct ctl_table xpc_sys_xpc_hb_dir[] = {
	{
		.procname = "hb_interval",
		.data = &xpc_hb_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = &xpc_hb_min_interval,
		.extra2 = &xpc_hb_max_interval},
	{
		.procname = "hb_check_interval",
		.data = &xpc_hb_check_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = &xpc_hb_check_min_interval,
		.extra2 = &xpc_hb_check_max_interval},
	{}
};
static struct ctl_table xpc_sys_xpc_dir[] = {
	{
		.procname = "hb",
		.mode = 0555,
		.child = xpc_sys_xpc_hb_dir},
	{
		.procname = "disengage_timelimit",
		.data = &xpc_disengage_timelimit,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = &xpc_disengage_min_timelimit,
		.extra2 = &xpc_disengage_max_timelimit},
	{}
};
static struct ctl_table xpc_sys_dir[] = {
	{
		.procname = "xpc",
		.mode = 0555,
		.child = xpc_sys_xpc_dir},
	{}
};
static struct ctl_table_header *xpc_sysctl;

/* non-zero if any remote partition disengage was timed out */
int xpc_disengage_timedout;

/* #of activate IRQs received and not yet processed */
int xpc_activate_IRQ_rcvd;
DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock);

/* IRQ handler notifies this wait queue on receipt of an IRQ */
DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);

static unsigned long xpc_hb_check_timeout;
static struct timer_list xpc_hb_timer;

/* notification that the xpc_hb_checker thread has exited */
static DECLARE_COMPLETION(xpc_hb_checker_exited);

/* notification that the xpc_discovery thread has exited */
static DECLARE_COMPLETION(xpc_discovery_exited);

static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);

static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_reboot_notifier = {
	.notifier_call = xpc_system_reboot,
};

static int xpc_system_die(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_die_notifier = {
	.notifier_call = xpc_system_die,
};

struct xpc_arch_operations xpc_arch_ops;

/*
 * Timer function to enforce the timelimit on the partition disengage.
 */
static void
xpc_timeout_partition_disengage(struct timer_list *t)
{
	struct xpc_partition *part = from_timer(part, t, disengage_timer);

	DBUG_ON(time_is_after_jiffies(part->disengage_timeout));

	(void)xpc_partition_disengaged(part);

	DBUG_ON(part->disengage_timeout != 0);
	DBUG_ON(xpc_arch_ops.partition_engaged(XPC_PARTID(part)));
}

/*
 * Timer to produce the heartbeat.  The timer structure's function is
 * already set when this is initially called.  A tunable is used to
 * specify when the next timeout should occur.
 */
static void
xpc_hb_beater(struct timer_list *unused)
{
	xpc_arch_ops.increment_heartbeat();

	if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
		wake_up_interruptible(&xpc_activate_IRQ_wq);

	xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
	add_timer(&xpc_hb_timer);
}

static void
xpc_start_hb_beater(void)
{
	xpc_arch_ops.heartbeat_init();
	timer_setup(&xpc_hb_timer, xpc_hb_beater, 0);
	xpc_hb_beater(NULL);
}

static void
xpc_stop_hb_beater(void)
{
	del_timer_sync(&xpc_hb_timer);
	xpc_arch_ops.heartbeat_exit();
}

/*
 * At periodic intervals, scan through all active partitions and ensure
 * their heartbeat is still active.  If not, the partition is deactivated.
 */
static void
xpc_check_remote_hb(void)
{
	struct xpc_partition *part;
	short partid;
	enum xp_retval ret;

	for (partid = 0; partid < xp_max_npartitions; partid++) {

		if (xpc_exiting)
			break;

		if (partid == xp_partition_id)
			continue;

		part = &xpc_partitions[partid];

		if (part->act_state == XPC_P_AS_INACTIVE ||
		    part->act_state == XPC_P_AS_DEACTIVATING) {
			continue;
		}

		ret = xpc_arch_ops.get_remote_heartbeat(part);
		if (ret != xpSuccess)
			XPC_DEACTIVATE_PARTITION(part, ret);
	}
}
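
/*
 * A note on the relationship between the two heartbeat tunables:
 * xpc_hb_interval governs how often xpc_hb_beater() increments the local
 * heartbeat, while xpc_hb_check_interval governs how often xpc_hb_checker()
 * (below) samples remote heartbeats via xpc_check_remote_hb().  The beater
 * never checks anything itself; it merely wakes the checker once
 * xpc_hb_check_timeout has passed.
 */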

/*
 * This thread is responsible for nearly all of the partition
 * activation/deactivation.
 */
static int
xpc_hb_checker(void *ignore)
{
	int force_IRQ = 0;

	/* this thread was marked active by xpc_hb_init() */

	set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));

	/* set our heartbeating to other partitions into motion */
	xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
	xpc_start_hb_beater();

	while (!xpc_exiting) {

		dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
			"been received\n",
			(int)(xpc_hb_check_timeout - jiffies),
			xpc_activate_IRQ_rcvd);

		/* checking of remote heartbeats is skewed by IRQ handling */
		if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
			xpc_hb_check_timeout = jiffies +
			    (xpc_hb_check_interval * HZ);

			dev_dbg(xpc_part, "checking remote heartbeats\n");
			xpc_check_remote_hb();
		}

		/* check for outstanding IRQs */
		if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) {
			force_IRQ = 0;
			dev_dbg(xpc_part, "processing activate IRQs "
				"received\n");
			xpc_arch_ops.process_activate_IRQ_rcvd();
		}

		/* wait for IRQ or timeout */
		(void)wait_event_interruptible(xpc_activate_IRQ_wq,
					       (time_is_before_eq_jiffies(
						xpc_hb_check_timeout) ||
						xpc_activate_IRQ_rcvd > 0 ||
						xpc_exiting));
	}

	xpc_stop_hb_beater();

	dev_dbg(xpc_part, "heartbeat checker is exiting\n");

	/* mark this thread as having exited */
	complete(&xpc_hb_checker_exited);
	return 0;
}

/*
 * This thread will attempt to discover other partitions to activate
 * based on info provided by SAL.  This new thread is short lived and
 * will exit once discovery is complete.
 */
static int
xpc_initiate_discovery(void *ignore)
{
	xpc_discovery();

	dev_dbg(xpc_part, "discovery thread is exiting\n");

	/* mark this thread as having exited */
	complete(&xpc_discovery_exited);
	return 0;
}

/*
 * The first kthread assigned to a newly activated partition is the one
 * created by XPC HB with which it calls xpc_activating().  XPC hangs on to
 * that kthread until the partition is brought down, at which time that
 * kthread returns back to XPC HB.  (The return of that kthread will signify
 * to XPC HB that XPC has dismantled all communication infrastructure for the
 * associated partition.)  This kthread becomes the channel manager for that
 * partition.
 *
 * Each active partition has a channel manager, who, besides connecting and
 * disconnecting channels, will ensure that each of the partition's connected
 * channels has the required number of assigned kthreads to get the work done.
 */
static void
xpc_channel_mgr(struct xpc_partition *part)
{
	while (part->act_state != XPC_P_AS_DEACTIVATING ||
	       atomic_read(&part->nchannels_active) > 0 ||
	       !xpc_partition_disengaged(part)) {

		xpc_process_sent_chctl_flags(part);

		/*
		 * Wait until we've been requested to activate kthreads or
		 * all of the channel's message queues have been torn down or
		 * a signal is pending.
		 *
		 * The channel_mgr_requests is set to 1 after being awakened.
		 * This is done to prevent the channel mgr from making one
		 * pass through the loop for each request, since he will
		 * be servicing all the requests in one pass.  The reason it's
		 * set to 1 instead of 0 is so that other kthreads will know
		 * that the channel mgr is running and won't bother trying to
		 * wake him up.
		 */
		atomic_dec(&part->channel_mgr_requests);
		(void)wait_event_interruptible(part->channel_mgr_wq,
			(atomic_read(&part->channel_mgr_requests) > 0 ||
			 part->chctl.all_flags != 0 ||
			 (part->act_state == XPC_P_AS_DEACTIVATING &&
			  atomic_read(&part->nchannels_active) == 0 &&
			  xpc_partition_disengaged(part))));
		atomic_set(&part->channel_mgr_requests, 1);
	}
}

/*
 * Guarantee that the kzalloc'd memory is cacheline aligned.
 */
void *
xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
{
	/* see if kzalloc will give us cacheline aligned memory by default */
	*base = kzalloc(size, flags);
	if (*base == NULL)
		return NULL;

	if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
		return *base;

	kfree(*base);

	/* nope, we'll have to do it ourselves */
	*base = kzalloc(size + L1_CACHE_BYTES, flags);
	if (*base == NULL)
		return NULL;

	return (void *)L1_CACHE_ALIGN((u64)*base);
}
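
/*
 * For example, assuming 128-byte cachelines: if the first kzalloc() returns
 * 0x...1040, the L1_CACHE_ALIGN() test fails, so the buffer is reallocated
 * with L1_CACHE_BYTES of slack and the returned pointer is rounded up to
 * 0x...1080.  Callers must kfree(*base) rather than the returned pointer,
 * as xpc_teardown_ch_structures() does with remote_openclose_args_base.
 */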

/*
 * Setup the channel structures necessary to support XPartition Communication
 * between the specified remote partition and the local one.
 */
static enum xp_retval
xpc_setup_ch_structures(struct xpc_partition *part)
{
	enum xp_retval ret;
	int ch_number;
	struct xpc_channel *ch;
	short partid = XPC_PARTID(part);

	/*
	 * Allocate all of the channel structures as a contiguous chunk of
	 * memory.
	 */
	DBUG_ON(part->channels != NULL);
	part->channels = kcalloc(XPC_MAX_NCHANNELS,
				 sizeof(struct xpc_channel),
				 GFP_KERNEL);
	if (part->channels == NULL) {
		dev_err(xpc_chan, "can't get memory for channels\n");
		return xpNoMemory;
	}

	/* allocate the remote open and close args */

	part->remote_openclose_args =
	    xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
					  &part->remote_openclose_args_base);
	if (part->remote_openclose_args == NULL) {
		dev_err(xpc_chan, "can't get memory for remote connect args\n");
		ret = xpNoMemory;
		goto out_1;
	}

	part->chctl.all_flags = 0;
	spin_lock_init(&part->chctl_lock);

	atomic_set(&part->channel_mgr_requests, 1);
	init_waitqueue_head(&part->channel_mgr_wq);

	part->nchannels = XPC_MAX_NCHANNELS;

	atomic_set(&part->nchannels_active, 0);
	atomic_set(&part->nchannels_engaged, 0);

	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
		ch = &part->channels[ch_number];

		ch->partid = partid;
		ch->number = ch_number;
		ch->flags = XPC_C_DISCONNECTED;

		atomic_set(&ch->kthreads_assigned, 0);
		atomic_set(&ch->kthreads_idle, 0);
		atomic_set(&ch->kthreads_active, 0);

		atomic_set(&ch->references, 0);
		atomic_set(&ch->n_to_notify, 0);

		spin_lock_init(&ch->lock);
		init_completion(&ch->wdisconnect_wait);

		atomic_set(&ch->n_on_msg_allocate_wq, 0);
		init_waitqueue_head(&ch->msg_allocate_wq);
		init_waitqueue_head(&ch->idle_wq);
	}

	ret = xpc_arch_ops.setup_ch_structures(part);
	if (ret != xpSuccess)
		goto out_2;

	/*
	 * With the setting of the partition setup_state to XPC_P_SS_SETUP,
	 * we're declaring that this partition is ready to go.
	 */
	part->setup_state = XPC_P_SS_SETUP;

	return xpSuccess;

	/* setup of ch structures failed */
out_2:
	kfree(part->remote_openclose_args_base);
	part->remote_openclose_args = NULL;
out_1:
	kfree(part->channels);
	part->channels = NULL;
	return ret;
}

/*
 * Teardown the channel structures necessary to support XPartition
 * Communication between the specified remote partition and the local one.
 */
static void
xpc_teardown_ch_structures(struct xpc_partition *part)
{
	DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
	DBUG_ON(atomic_read(&part->nchannels_active) != 0);

	/*
	 * Make this partition inaccessible to local processes by marking it
	 * as no longer setup.  Then wait before proceeding with the teardown
	 * until all existing references cease.
	 */
	DBUG_ON(part->setup_state != XPC_P_SS_SETUP);
	part->setup_state = XPC_P_SS_WTEARDOWN;

	wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));

	/* now we can begin tearing down the infrastructure */

	xpc_arch_ops.teardown_ch_structures(part);

	kfree(part->remote_openclose_args_base);
	part->remote_openclose_args = NULL;
	kfree(part->channels);
	part->channels = NULL;

	part->setup_state = XPC_P_SS_TORNDOWN;
}
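
/*
 * Between them, the two routines above walk a partition's setup_state
 * through its full lifecycle: XPC_P_SS_UNSET (assigned in
 * xpc_setup_partitions()) -> XPC_P_SS_SETUP -> XPC_P_SS_WTEARDOWN (while
 * existing references drain) -> XPC_P_SS_TORNDOWN.
 */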

/*
 * When XPC HB determines that a partition has come up, it will create a new
 * kthread and that kthread will call this function to attempt to set up the
 * basic infrastructure used for Cross Partition Communication with the newly
 * upped partition.
 *
 * The kthread that was created by XPC HB and which setup the XPC
 * infrastructure will remain assigned to the partition becoming the channel
 * manager for that partition until the partition is deactivating, at which
 * time the kthread will teardown the XPC infrastructure and then exit.
 */
static int
xpc_activating(void *__partid)
{
	short partid = (u64)__partid;
	struct xpc_partition *part = &xpc_partitions[partid];
	unsigned long irq_flags;

	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);

	spin_lock_irqsave(&part->act_lock, irq_flags);

	if (part->act_state == XPC_P_AS_DEACTIVATING) {
		part->act_state = XPC_P_AS_INACTIVE;
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
		part->remote_rp_pa = 0;
		return 0;
	}

	/* indicate the thread is activating */
	DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ);
	part->act_state = XPC_P_AS_ACTIVATING;

	XPC_SET_REASON(part, 0, 0);
	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	dev_dbg(xpc_part, "activating partition %d\n", partid);

	xpc_arch_ops.allow_hb(partid);

	if (xpc_setup_ch_structures(part) == xpSuccess) {
		(void)xpc_part_ref(part);	/* this will always succeed */

		if (xpc_arch_ops.make_first_contact(part) == xpSuccess) {
			xpc_mark_partition_active(part);
			xpc_channel_mgr(part);
			/* won't return until partition is deactivating */
		}

		xpc_part_deref(part);
		xpc_teardown_ch_structures(part);
	}

	xpc_arch_ops.disallow_hb(partid);
	xpc_mark_partition_inactive(part);

	if (part->reason == xpReactivating) {
		/* interrupting ourselves results in activating partition */
		xpc_arch_ops.request_partition_reactivation(part);
	}

	return 0;
}

void
xpc_activate_partition(struct xpc_partition *part)
{
	short partid = XPC_PARTID(part);
	unsigned long irq_flags;
	struct task_struct *kthread;

	spin_lock_irqsave(&part->act_lock, irq_flags);

	DBUG_ON(part->act_state != XPC_P_AS_INACTIVE);

	part->act_state = XPC_P_AS_ACTIVATION_REQ;
	XPC_SET_REASON(part, xpCloneKThread, __LINE__);

	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
			      partid);
	if (IS_ERR(kthread)) {
		spin_lock_irqsave(&part->act_lock, irq_flags);
		part->act_state = XPC_P_AS_INACTIVE;
		XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
	}
}

void
xpc_activate_kthreads(struct xpc_channel *ch, int needed)
{
	int idle = atomic_read(&ch->kthreads_idle);
	int assigned = atomic_read(&ch->kthreads_assigned);
	int wakeup;

	DBUG_ON(needed <= 0);

	if (idle > 0) {
		wakeup = (needed > idle) ? idle : needed;
		needed -= wakeup;

		dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
			"channel=%d\n", wakeup, ch->partid, ch->number);

		/* only wakeup the requested number of kthreads */
		wake_up_nr(&ch->idle_wq, wakeup);
	}

	if (needed <= 0)
		return;

	if (needed + assigned > ch->kthreads_assigned_limit) {
		needed = ch->kthreads_assigned_limit - assigned;
		if (needed <= 0)
			return;
	}

	dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
		needed, ch->partid, ch->number);

	xpc_create_kthreads(ch, needed, 0);
}
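
/*
 * A worked example of the bookkeeping in xpc_activate_kthreads() above:
 * with needed = 5, kthreads_idle = 2, kthreads_assigned = 3 and
 * kthreads_assigned_limit = 4, two idle kthreads are woken and needed drops
 * to 3; since 3 + 3 would exceed the limit of 4, only one new kthread is
 * created.
 */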

/*
 * This function is where XPC's kthreads wait for messages to deliver.
 */
static void
xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
{
	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
	    xpc_arch_ops.n_of_deliverable_payloads;

	do {
		/* deliver messages to their intended recipients */

		while (n_of_deliverable_payloads(ch) > 0 &&
		       !(ch->flags & XPC_C_DISCONNECTING)) {
			xpc_deliver_payload(ch);
		}

		if (atomic_inc_return(&ch->kthreads_idle) >
		    ch->kthreads_idle_limit) {
			/* too many idle kthreads on this channel */
			atomic_dec(&ch->kthreads_idle);
			break;
		}

		dev_dbg(xpc_chan, "idle kthread calling "
			"wait_event_interruptible_exclusive()\n");

		(void)wait_event_interruptible_exclusive(ch->idle_wq,
				(n_of_deliverable_payloads(ch) > 0 ||
				 (ch->flags & XPC_C_DISCONNECTING)));

		atomic_dec(&ch->kthreads_idle);

	} while (!(ch->flags & XPC_C_DISCONNECTING));
}

static int
xpc_kthread_start(void *args)
{
	short partid = XPC_UNPACK_ARG1(args);
	u16 ch_number = XPC_UNPACK_ARG2(args);
	struct xpc_partition *part = &xpc_partitions[partid];
	struct xpc_channel *ch;
	int n_needed;
	unsigned long irq_flags;
	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
	    xpc_arch_ops.n_of_deliverable_payloads;

	dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
		partid, ch_number);

	ch = &part->channels[ch_number];

	if (!(ch->flags & XPC_C_DISCONNECTING)) {

		/* let registerer know that connection has been established */

		spin_lock_irqsave(&ch->lock, irq_flags);
		if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
			ch->flags |= XPC_C_CONNECTEDCALLOUT;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			xpc_connected_callout(ch);

			spin_lock_irqsave(&ch->lock, irq_flags);
			ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			/*
			 * It is possible that while the callout was being
			 * made that the remote partition sent some messages.
			 * If that is the case, we may need to activate
			 * additional kthreads to help deliver them.  We only
			 * need one less than total #of messages to deliver.
			 */
			n_needed = n_of_deliverable_payloads(ch) - 1;
			if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
				xpc_activate_kthreads(ch, n_needed);

		} else {
			spin_unlock_irqrestore(&ch->lock, irq_flags);
		}

		xpc_kthread_waitmsgs(part, ch);
	}

	/* let registerer know that connection is disconnecting */

	spin_lock_irqsave(&ch->lock, irq_flags);
	if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
	    !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
		ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
		spin_unlock_irqrestore(&ch->lock, irq_flags);

		xpc_disconnect_callout(ch, xpDisconnecting);

		spin_lock_irqsave(&ch->lock, irq_flags);
		ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
	}
	spin_unlock_irqrestore(&ch->lock, irq_flags);

	if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
	    atomic_dec_return(&part->nchannels_engaged) == 0) {
		xpc_arch_ops.indicate_partition_disengaged(part);
	}

	xpc_msgqueue_deref(ch);

	dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
		partid, ch_number);

	xpc_part_deref(part);
	return 0;
}
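
/*
 * Note the pairing between xpc_kthread_start() above and
 * xpc_create_kthreads() below: the creator increments ch->kthreads_assigned
 * (and, for a channel's first kthread, part->nchannels_engaged) on behalf
 * of each new kthread, and the kthread decrements both on exit, indicating
 * partition disengagement once the last engaged channel drops to zero.
 */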

/*
 * For each partition that XPC has established communications with, there is
 * a minimum of one kernel thread assigned to perform any operation that
 * may potentially sleep or block (basically the callouts to the asynchronous
 * functions registered via xpc_connect()).
 *
 * Additional kthreads are created and destroyed by XPC as the workload
 * demands.
 *
 * A kthread is assigned to one of the active channels that exists for a given
 * partition.
 */
void
xpc_create_kthreads(struct xpc_channel *ch, int needed,
		    int ignore_disconnecting)
{
	unsigned long irq_flags;
	u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
	struct xpc_partition *part = &xpc_partitions[ch->partid];
	struct task_struct *kthread;
	void (*indicate_partition_disengaged) (struct xpc_partition *) =
	    xpc_arch_ops.indicate_partition_disengaged;

	while (needed-- > 0) {

		/*
		 * The following is done on behalf of the newly created
		 * kthread. That kthread is responsible for doing the
		 * counterpart to the following before it exits.
		 */
		if (ignore_disconnecting) {
			if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
				/* kthreads assigned had gone to zero */
				BUG_ON(!(ch->flags &
					 XPC_C_DISCONNECTINGCALLOUT_MADE));
				break;
			}

		} else if (ch->flags & XPC_C_DISCONNECTING) {
			break;

		} else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
			   atomic_inc_return(&part->nchannels_engaged) == 1) {
			xpc_arch_ops.indicate_partition_engaged(part);
		}
		(void)xpc_part_ref(part);
		xpc_msgqueue_ref(ch);

		kthread = kthread_run(xpc_kthread_start, (void *)args,
				      "xpc%02dc%d", ch->partid, ch->number);
		if (IS_ERR(kthread)) {
			/* the fork failed */

			/*
			 * NOTE: if (ignore_disconnecting &&
			 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
			 * then we'll deadlock if all other kthreads assigned
			 * to this channel are blocked in the channel's
			 * registerer, because the only thing that will unblock
			 * them is the xpDisconnecting callout that this
			 * failed kthread_run() would have made.
			 */

			if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
			    atomic_dec_return(&part->nchannels_engaged) == 0) {
				indicate_partition_disengaged(part);
			}
			xpc_msgqueue_deref(ch);
			xpc_part_deref(part);

			if (atomic_read(&ch->kthreads_assigned) <
			    ch->kthreads_idle_limit) {
				/*
				 * Flag this as an error only if we have an
				 * insufficient #of kthreads for the channel
				 * to function.
				 */
				spin_lock_irqsave(&ch->lock, irq_flags);
				XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources,
						       &irq_flags);
				spin_unlock_irqrestore(&ch->lock, irq_flags);
			}
			break;
		}
	}
}

void
xpc_disconnect_wait(int ch_number)
{
	unsigned long irq_flags;
	short partid;
	struct xpc_partition *part;
	struct xpc_channel *ch;
	int wakeup_channel_mgr;

	/* now wait for all callouts to the caller's function to cease */
	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		if (!xpc_part_ref(part))
			continue;

		ch = &part->channels[ch_number];

		if (!(ch->flags & XPC_C_WDISCONNECT)) {
			xpc_part_deref(part);
			continue;
		}

		wait_for_completion(&ch->wdisconnect_wait);

		spin_lock_irqsave(&ch->lock, irq_flags);
		DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
		wakeup_channel_mgr = 0;

		if (ch->delayed_chctl_flags) {
			if (part->act_state != XPC_P_AS_DEACTIVATING) {
				spin_lock(&part->chctl_lock);
				part->chctl.flags[ch->number] |=
				    ch->delayed_chctl_flags;
				spin_unlock(&part->chctl_lock);
				wakeup_channel_mgr = 1;
			}
			ch->delayed_chctl_flags = 0;
		}

		ch->flags &= ~XPC_C_WDISCONNECT;
		spin_unlock_irqrestore(&ch->lock, irq_flags);

		if (wakeup_channel_mgr)
			xpc_wakeup_channel_mgr(part);

		xpc_part_deref(part);
	}
}

static int
xpc_setup_partitions(void)
{
	short partid;
	struct xpc_partition *part;

	xpc_partitions = kcalloc(xp_max_npartitions,
				 sizeof(struct xpc_partition),
				 GFP_KERNEL);
	if (xpc_partitions == NULL) {
		dev_err(xpc_part, "can't get memory for partition structure\n");
		return -ENOMEM;
	}

	/*
	 * The first few fields of each entry of xpc_partitions[] need to
	 * be initialized now so that calls to xpc_connect() and
	 * xpc_disconnect() can be made prior to the activation of any remote
	 * partition.  NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
	 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
	 * PARTITION HAS BEEN ACTIVATED.
	 */
	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));

		part->activate_IRQ_rcvd = 0;
		spin_lock_init(&part->act_lock);
		part->act_state = XPC_P_AS_INACTIVE;
		XPC_SET_REASON(part, 0, 0);

		timer_setup(&part->disengage_timer,
			    xpc_timeout_partition_disengage, 0);

		part->setup_state = XPC_P_SS_UNSET;
		init_waitqueue_head(&part->teardown_wq);
		atomic_set(&part->references, 0);
	}

	return xpc_arch_ops.setup_partitions();
}

static void
xpc_teardown_partitions(void)
{
	xpc_arch_ops.teardown_partitions();
	kfree(xpc_partitions);
}

static void
xpc_do_exit(enum xp_retval reason)
{
	short partid;
	int active_part_count, printed_waiting_msg = 0;
	struct xpc_partition *part;
	unsigned long printmsg_time, disengage_timeout = 0;

	/* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
	DBUG_ON(xpc_exiting == 1);

	/*
	 * Let the heartbeat checker thread and the discovery thread
	 * (if one is running) know that they should exit.  Also wake up
	 * the heartbeat checker thread in case it's sleeping.
	 */
	xpc_exiting = 1;
	wake_up_interruptible(&xpc_activate_IRQ_wq);

	/* wait for the discovery thread to exit */
	wait_for_completion(&xpc_discovery_exited);

	/* wait for the heartbeat checker thread to exit */
	wait_for_completion(&xpc_hb_checker_exited);

	/* sleep for a 1/3 of a second or so */
	(void)msleep_interruptible(300);

	/* wait for all partitions to become inactive */

	printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
	xpc_disengage_timedout = 0;

	do {
		active_part_count = 0;

		for (partid = 0; partid < xp_max_npartitions; partid++) {
			part = &xpc_partitions[partid];

			if (xpc_partition_disengaged(part) &&
			    part->act_state == XPC_P_AS_INACTIVE) {
				continue;
			}

			active_part_count++;

			XPC_DEACTIVATE_PARTITION(part, reason);

			if (part->disengage_timeout > disengage_timeout)
				disengage_timeout = part->disengage_timeout;
		}

		if (xpc_arch_ops.any_partition_engaged()) {
			if (time_is_before_jiffies(printmsg_time)) {
				dev_info(xpc_part, "waiting for remote "
					 "partitions to deactivate, timeout in "
					 "%ld seconds\n", (disengage_timeout -
					 jiffies) / HZ);
				printmsg_time = jiffies +
				    (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
				printed_waiting_msg = 1;
			}

		} else if (active_part_count > 0) {
			if (printed_waiting_msg) {
				dev_info(xpc_part, "waiting for local partition"
					 " to deactivate\n");
				printed_waiting_msg = 0;
			}

		} else {
			if (!xpc_disengage_timedout) {
				dev_info(xpc_part, "all partitions have "
					 "deactivated\n");
			}
			break;
		}

		/* sleep for a 1/3 of a second or so */
		(void)msleep_interruptible(300);

	} while (1);

	DBUG_ON(xpc_arch_ops.any_partition_engaged());

	xpc_teardown_rsvd_page();

	if (reason == xpUnloading) {
		(void)unregister_die_notifier(&xpc_die_notifier);
		(void)unregister_reboot_notifier(&xpc_reboot_notifier);
	}

	/* clear the interface to XPC's functions */
	xpc_clear_interface();

	if (xpc_sysctl)
		unregister_sysctl_table(xpc_sysctl);

	xpc_teardown_partitions();

	if (is_uv())
		xpc_exit_uv();
}

/*
 * This function is called when the system is being rebooted.
 */
static int
xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
{
	enum xp_retval reason;

	switch (event) {
	case SYS_RESTART:
		reason = xpSystemReboot;
		break;
	case SYS_HALT:
		reason = xpSystemHalt;
		break;
	case SYS_POWER_OFF:
		reason = xpSystemPoweroff;
		break;
	default:
		reason = xpSystemGoingDown;
	}

	xpc_do_exit(reason);
	return NOTIFY_DONE;
}

/* Used to only allow one cpu to complete disconnect */
static unsigned int xpc_die_disconnecting;

/*
 * Notify other partitions to deactivate from us by first disengaging from all
 * references to our memory.
 */
static void
xpc_die_deactivate(void)
{
	struct xpc_partition *part;
	short partid;
	int any_engaged;
	long keep_waiting;
	long wait_to_print;

	if (cmpxchg(&xpc_die_disconnecting, 0, 1))
		return;

	/* keep xpc_hb_checker thread from doing anything (just in case) */
	xpc_exiting = 1;

	xpc_arch_ops.disallow_all_hbs();	/* indicate we're deactivated */

	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		if (xpc_arch_ops.partition_engaged(partid) ||
		    part->act_state != XPC_P_AS_INACTIVE) {
			xpc_arch_ops.request_partition_deactivation(part);
			xpc_arch_ops.indicate_partition_disengaged(part);
		}
	}

	/*
	 * Though we requested that all other partitions deactivate from us,
	 * we only wait until they've all disengaged or we've reached the
	 * defined timelimit.
	 *
	 * Given that one iteration through the following while-loop takes
	 * approximately 200 microseconds, calculate the #of loops to take
	 * before bailing and the #of loops before printing a waiting message.
	 */
	keep_waiting = xpc_disengage_timelimit * 1000 * 5;
	wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;
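
	/*
	 * Worked example of the calibration above: at roughly 200 us per
	 * iteration there are about 5000 iterations per second, so a
	 * 30 second xpc_disengage_timelimit yields 30 * 1000 * 5 = 150000
	 * loops before we stop waiting.
	 */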
	while (1) {
		any_engaged = xpc_arch_ops.any_partition_engaged();
		if (!any_engaged) {
			dev_info(xpc_part, "all partitions have deactivated\n");
			break;
		}

		if (!keep_waiting--) {
			for (partid = 0; partid < xp_max_npartitions;
			     partid++) {
				if (xpc_arch_ops.partition_engaged(partid)) {
					dev_info(xpc_part, "deactivate from "
						 "remote partition %d timed "
						 "out\n", partid);
				}
			}
			break;
		}

		if (!wait_to_print--) {
			dev_info(xpc_part, "waiting for remote partitions to "
				 "deactivate, timeout in %ld seconds\n",
				 keep_waiting / (1000 * 5));
			wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
			    1000 * 5;
		}

		udelay(200);
	}
}

/*
 * This function is called when the system is being restarted or halted due
 * to some sort of system failure.  If this is the case we need to notify the
 * other partitions to disengage from all references to our memory.
 * This function can also be called when our heartbeater is to be offlined
 * for a time.  In this case we need to notify other partitions to not worry
 * about the lack of a heartbeat.
 */
static int
xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
{
#ifdef CONFIG_IA64		/* !!! temporary kludge */
	switch (event) {
	case DIE_MACHINE_RESTART:
	case DIE_MACHINE_HALT:
		xpc_die_deactivate();
		break;

	case DIE_KDEBUG_ENTER:
		/* Should lack of heartbeat be ignored by other partitions? */
		if (!xpc_kdebug_ignore)
			break;

		/* fall through */
	case DIE_MCA_MONARCH_ENTER:
	case DIE_INIT_MONARCH_ENTER:
		xpc_arch_ops.offline_heartbeat();
		break;

	case DIE_KDEBUG_LEAVE:
		/* Is lack of heartbeat being ignored by other partitions? */
		if (!xpc_kdebug_ignore)
			break;

		/* fall through */
	case DIE_MCA_MONARCH_LEAVE:
	case DIE_INIT_MONARCH_LEAVE:
		xpc_arch_ops.online_heartbeat();
		break;
	}
#else
	struct die_args *die_args = _die_args;

	switch (event) {
	case DIE_TRAP:
		if (die_args->trapnr == X86_TRAP_DF)
			xpc_die_deactivate();

		if (((die_args->trapnr == X86_TRAP_MF) ||
		     (die_args->trapnr == X86_TRAP_XF)) &&
		    !user_mode(die_args->regs))
			xpc_die_deactivate();

		break;
	case DIE_INT3:
	case DIE_DEBUG:
		break;
	case DIE_OOPS:
	case DIE_GPF:
	default:
		xpc_die_deactivate();
	}
#endif

	return NOTIFY_DONE;
}

int __init
xpc_init(void)
{
	int ret;
	struct task_struct *kthread;

	dev_set_name(xpc_part, "part");
	dev_set_name(xpc_chan, "chan");

	if (is_uv()) {
		ret = xpc_init_uv();

	} else {
		ret = -ENODEV;
	}

	if (ret != 0)
		return ret;

	ret = xpc_setup_partitions();
	if (ret != 0) {
		dev_err(xpc_part, "can't get memory for partition structure\n");
		goto out_1;
	}

	xpc_sysctl = register_sysctl_table(xpc_sys_dir);

	/*
	 * Fill the partition reserved page with the information needed by
	 * other partitions to discover we are alive and establish initial
	 * communications.
	 */
	ret = xpc_setup_rsvd_page();
	if (ret != 0) {
		dev_err(xpc_part, "can't setup our reserved page\n");
		goto out_2;
	}

	/* add ourselves to the reboot_notifier_list */
	ret = register_reboot_notifier(&xpc_reboot_notifier);
	if (ret != 0)
		dev_warn(xpc_part, "can't register reboot notifier\n");

	/* add ourselves to the die_notifier list */
	ret = register_die_notifier(&xpc_die_notifier);
	if (ret != 0)
		dev_warn(xpc_part, "can't register die notifier\n");

	/*
	 * The real work-horse behind xpc.  This processes incoming
	 * interrupts and monitors remote heartbeats.
	 */
	kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
	if (IS_ERR(kthread)) {
		dev_err(xpc_part, "failed while forking hb check thread\n");
		ret = -EBUSY;
		goto out_3;
	}

	/*
	 * Start up a thread that will attempt to discover other partitions to
	 * activate based on info provided by SAL.  This new thread is short
	 * lived and will exit once discovery is complete.
	 */
	kthread = kthread_run(xpc_initiate_discovery, NULL,
			      XPC_DISCOVERY_THREAD_NAME);
	if (IS_ERR(kthread)) {
		dev_err(xpc_part, "failed while forking discovery thread\n");

		/* mark this new thread as a non-starter */
		complete(&xpc_discovery_exited);

		xpc_do_exit(xpUnloading);
		return -EBUSY;
	}

	/* set the interface to point at XPC's functions */
	xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
			  xpc_initiate_send, xpc_initiate_send_notify,
			  xpc_initiate_received, xpc_initiate_partid_to_nasids);

	return 0;

	/* initialization was not successful */
out_3:
	xpc_teardown_rsvd_page();

	(void)unregister_die_notifier(&xpc_die_notifier);
	(void)unregister_reboot_notifier(&xpc_reboot_notifier);
out_2:
	if (xpc_sysctl)
		unregister_sysctl_table(xpc_sysctl);

	xpc_teardown_partitions();
out_1:
	if (is_uv())
		xpc_exit_uv();
	return ret;
}

module_init(xpc_init);

void __exit
xpc_exit(void)
{
	xpc_do_exit(xpUnloading);
}

module_exit(xpc_exit);

MODULE_AUTHOR("Silicon Graphics, Inc.");
MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
MODULE_LICENSE("GPL");

module_param(xpc_hb_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
		 "heartbeat increments.");

module_param(xpc_hb_check_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
		 "heartbeat checks.");

module_param(xpc_disengage_timelimit, int, 0);
MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
		 "for disengage to complete.");

module_param(xpc_kdebug_ignore, int, 0);
MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
		 "other partitions when dropping into kdebug.");
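
/*
 * Example (hypothetical values): the module parameters above are set at load
 * time, e.g.
 *
 *	modprobe xpc xpc_hb_interval=2 xpc_hb_check_interval=20
 *
 * while the sysctl entries registered from xpc_sys_dir expose the same
 * variables at runtime under /proc/sys/xpc/, clamped to the min/max bounds
 * declared near the top of this file.
 */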