/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
 * Copyright (c) 2004-2009 Silicon Graphics, Inc.  All Rights Reserved.
 */

/*
 * Cross Partition Communication (XPC) support - standard version.
 *
 *	XPC provides a message passing capability that crosses partition
 *	boundaries.  This module is made up of two parts:
 *
 *	    partition	This part detects the presence/absence of other
 *			partitions.  It provides a heartbeat and monitors
 *			the heartbeats of other partitions.
 *
 *	    channel	This part manages the channels and sends/receives
 *			messages across them to/from other partitions.
 *
 *	There are a couple of additional functions residing in XP, which
 *	provide an interface to XPC for its users.
 *
 *
 *	Caveats:
 *
 *	  . Currently on sn2, we have no way to determine which nasid an IRQ
 *	    came from.  Thus, xpc_send_IRQ_sn2() does a remote amo write
 *	    followed by an IPI.  The amo indicates where data is to be pulled
 *	    from, so after the IPI arrives, the remote partition checks the amo
 *	    word.  The IPI can actually arrive before the amo however, so other
 *	    code must periodically check for this case.  Also, remote amo
 *	    operations do not reliably time out.  Thus we do a remote PIO read
 *	    solely to know whether the remote partition is down and whether we
 *	    should stop sending IPIs to it.  This remote PIO read operation is
 *	    set up in a special nofault region so SAL knows to ignore (and
 *	    cleanup) any errors due to the remote amo write, PIO read, and/or
 *	    PIO write operations.
 *
 *	    If/when new hardware solves this IPI problem, we should abandon
 *	    the current approach.
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/kdebug.h>
#include <linux/kthread.h>
#include "xpc.h"

#ifdef CONFIG_X86_64
#include <asm/traps.h>
#endif

/* define two XPC debug device structures to be used with dev_dbg() et al */

static struct device_driver xpc_dbg_name = {
	.name = "xpc"
};

static struct device xpc_part_dbg_subname = {
	.init_name = "",	/* set to "part" at xpc_init() time */
	.driver = &xpc_dbg_name
};

static struct device xpc_chan_dbg_subname = {
	.init_name = "",	/* set to "chan" at xpc_init() time */
	.driver = &xpc_dbg_name
};

struct device *xpc_part = &xpc_part_dbg_subname;
struct device *xpc_chan = &xpc_chan_dbg_subname;

static int xpc_kdebug_ignore;

/* systune related variables for /proc/sys directories */

static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
static int xpc_hb_min_interval = 1;
static int xpc_hb_max_interval = 10;

static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
static int xpc_hb_check_min_interval = 10;
static int xpc_hb_check_max_interval = 120;

int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
static int xpc_disengage_min_timelimit;	/* = 0 */
static int xpc_disengage_max_timelimit = 120;

static struct ctl_table xpc_sys_xpc_hb[] = {
	{
	 .procname = "hb_interval",
	 .data = &xpc_hb_interval,
	 .maxlen = sizeof(int),
	 .mode = 0644,
	 .proc_handler = proc_dointvec_minmax,
	 .extra1 = &xpc_hb_min_interval,
	 .extra2 = &xpc_hb_max_interval},
	{
	 .procname = "hb_check_interval",
	 .data = &xpc_hb_check_interval,
	 .maxlen = sizeof(int),
	 .mode = 0644,
	 .proc_handler = proc_dointvec_minmax,
	 .extra1 = &xpc_hb_check_min_interval,
	 .extra2 = &xpc_hb_check_max_interval},
	{}
};
static struct ctl_table xpc_sys_xpc[] = {
	{
	 .procname = "disengage_timelimit",
	 .data = &xpc_disengage_timelimit,
	 .maxlen = sizeof(int),
	 .mode = 0644,
	 .proc_handler = proc_dointvec_minmax,
	 .extra1 = &xpc_disengage_min_timelimit,
	 .extra2 = &xpc_disengage_max_timelimit},
	{}
};

static struct ctl_table_header *xpc_sysctl;
static struct ctl_table_header *xpc_sysctl_hb;
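
/*
 * Note: xpc_init() registers xpc_sys_xpc under "xpc" and xpc_sys_xpc_hb
 * under "xpc/hb", so the tunables above surface (mode 0644) as, e.g.:
 *
 *	/proc/sys/xpc/disengage_timelimit
 *	/proc/sys/xpc/hb/hb_interval
 *	/proc/sys/xpc/hb/hb_check_interval
 */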

/* non-zero if any remote partition disengage was timed out */
int xpc_disengage_timedout;

/* #of activate IRQs received and not yet processed */
int xpc_activate_IRQ_rcvd;
DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock);

/* IRQ handler notifies this wait queue on receipt of an IRQ */
DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);

static unsigned long xpc_hb_check_timeout;
static struct timer_list xpc_hb_timer;

/* notification that the xpc_hb_checker thread has exited */
static DECLARE_COMPLETION(xpc_hb_checker_exited);

/* notification that the xpc_discovery thread has exited */
static DECLARE_COMPLETION(xpc_discovery_exited);

static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);

static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_reboot_notifier = {
	.notifier_call = xpc_system_reboot,
};

static int xpc_system_die(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_die_notifier = {
	.notifier_call = xpc_system_die,
};

struct xpc_arch_operations xpc_arch_ops;

/*
 * Timer function to enforce the timelimit on the partition disengage.
 */
static void
xpc_timeout_partition_disengage(struct timer_list *t)
{
	struct xpc_partition *part = from_timer(part, t, disengage_timer);

	DBUG_ON(time_is_after_jiffies(part->disengage_timeout));

	xpc_partition_disengaged_from_timer(part);

	DBUG_ON(part->disengage_timeout != 0);
	DBUG_ON(xpc_arch_ops.partition_engaged(XPC_PARTID(part)));
}

/*
 * Timer to produce the heartbeat.  The timer structure's function is
 * already set when this is initially called.  A tunable is used to
 * specify when the next timeout should occur.
 */
static void
xpc_hb_beater(struct timer_list *unused)
{
	xpc_arch_ops.increment_heartbeat();

	if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
		wake_up_interruptible(&xpc_activate_IRQ_wq);

	xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
	add_timer(&xpc_hb_timer);
}

static void
xpc_start_hb_beater(void)
{
	xpc_arch_ops.heartbeat_init();
	timer_setup(&xpc_hb_timer, xpc_hb_beater, 0);
	xpc_hb_beater(NULL);
}

static void
xpc_stop_hb_beater(void)
{
	del_timer_sync(&xpc_hb_timer);
	xpc_arch_ops.heartbeat_exit();
}
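
/*
 * Heartbeat lifecycle, in short: xpc_start_hb_beater() primes the loop by
 * calling xpc_hb_beater() directly; each invocation increments the local
 * heartbeat and re-arms xpc_hb_timer for xpc_hb_interval seconds out, so
 * the timer is self-rearming until del_timer_sync() in xpc_stop_hb_beater()
 * breaks the cycle.  The beater also kicks xpc_activate_IRQ_wq whenever the
 * remote-heartbeat check deadline has lapsed, which wakes xpc_hb_checker().
 */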

/*
 * At periodic intervals, scan through all active partitions and ensure
 * their heartbeat is still active.  If not, the partition is deactivated.
 */
static void
xpc_check_remote_hb(void)
{
	struct xpc_partition *part;
	short partid;
	enum xp_retval ret;

	for (partid = 0; partid < xp_max_npartitions; partid++) {

		if (xpc_exiting)
			break;

		if (partid == xp_partition_id)
			continue;

		part = &xpc_partitions[partid];

		if (part->act_state == XPC_P_AS_INACTIVE ||
		    part->act_state == XPC_P_AS_DEACTIVATING) {
			continue;
		}

		ret = xpc_arch_ops.get_remote_heartbeat(part);
		if (ret != xpSuccess)
			XPC_DEACTIVATE_PARTITION(part, ret);
	}
}

/*
 * This thread is responsible for nearly all of the partition
 * activation/deactivation.
 */
static int
xpc_hb_checker(void *ignore)
{
	int force_IRQ = 0;

	/* this thread was marked active by xpc_hb_init() */

	set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));

	/* set our heartbeating to other partitions into motion */
	xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
	xpc_start_hb_beater();

	while (!xpc_exiting) {

		dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
			"been received\n",
			(int)(xpc_hb_check_timeout - jiffies),
			xpc_activate_IRQ_rcvd);

		/* checking of remote heartbeats is skewed by IRQ handling */
		if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
			xpc_hb_check_timeout = jiffies +
			    (xpc_hb_check_interval * HZ);

			dev_dbg(xpc_part, "checking remote heartbeats\n");
			xpc_check_remote_hb();
		}

		/* check for outstanding IRQs */
		if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) {
			force_IRQ = 0;
			dev_dbg(xpc_part, "processing activate IRQs "
				"received\n");
			xpc_arch_ops.process_activate_IRQ_rcvd();
		}

		/* wait for IRQ or timeout */
		(void)wait_event_interruptible(xpc_activate_IRQ_wq,
					       (time_is_before_eq_jiffies(
						xpc_hb_check_timeout) ||
						xpc_activate_IRQ_rcvd > 0 ||
						xpc_exiting));
	}

	xpc_stop_hb_beater();

	dev_dbg(xpc_part, "heartbeat checker is exiting\n");

	/* mark this thread as having exited */
	complete(&xpc_hb_checker_exited);
	return 0;
}
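
/*
 * In effect, xpc_hb_checker() sleeps on xpc_activate_IRQ_wq and is woken
 * for exactly three reasons: the hb_check_interval deadline has passed
 * (time to call xpc_check_remote_hb()), an activate IRQ has arrived
 * (xpc_activate_IRQ_rcvd > 0), or XPC is unloading (xpc_exiting).
 */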

/*
 * This thread will attempt to discover other partitions to activate
 * based on info provided by SAL.  This new thread is short lived and
 * will exit once discovery is complete.
 */
static int
xpc_initiate_discovery(void *ignore)
{
	xpc_discovery();

	dev_dbg(xpc_part, "discovery thread is exiting\n");

	/* mark this thread as having exited */
	complete(&xpc_discovery_exited);
	return 0;
}

/*
 * The first kthread assigned to a newly activated partition is the one
 * created by XPC HB with which it calls xpc_activating().  XPC hangs on to
 * that kthread until the partition is brought down, at which time that
 * kthread returns back to XPC HB.  (The return of that kthread will signify
 * to XPC HB that XPC has dismantled all communication infrastructure for the
 * associated partition.)  This kthread becomes the channel manager for that
 * partition.
 *
 * Each active partition has a channel manager, who, besides connecting and
 * disconnecting channels, will ensure that each of the partition's connected
 * channels has the required number of assigned kthreads to get the work done.
 */
static void
xpc_channel_mgr(struct xpc_partition *part)
{
	while (part->act_state != XPC_P_AS_DEACTIVATING ||
	       atomic_read(&part->nchannels_active) > 0 ||
	       !xpc_partition_disengaged(part)) {

		xpc_process_sent_chctl_flags(part);

		/*
		 * Wait until we've been requested to activate kthreads or
		 * all of the channel's message queues have been torn down or
		 * a signal is pending.
		 *
		 * The channel_mgr_requests is set to 1 after being awakened.
		 * This is done to prevent the channel mgr from making one pass
		 * through the loop for each request, since it will
		 * be servicing all the requests in one pass.  The reason it's
		 * set to 1 instead of 0 is so that other kthreads will know
		 * that the channel mgr is running and won't bother trying to
		 * wake it up.
		 */
		atomic_dec(&part->channel_mgr_requests);
		(void)wait_event_interruptible(part->channel_mgr_wq,
				(atomic_read(&part->channel_mgr_requests) > 0 ||
				 part->chctl.all_flags != 0 ||
				 (part->act_state == XPC_P_AS_DEACTIVATING &&
				 atomic_read(&part->nchannels_active) == 0 &&
				 xpc_partition_disengaged(part))));
		atomic_set(&part->channel_mgr_requests, 1);
	}
}

/*
 * Guarantee that the kzalloc'd memory is cacheline aligned.
 */
void *
xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
{
	/* see if kzalloc will give us cacheline aligned memory by default */
	*base = kzalloc(size, flags);
	if (*base == NULL)
		return NULL;

	if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
		return *base;

	kfree(*base);

	/* nope, we'll have to do it ourselves */
	*base = kzalloc(size + L1_CACHE_BYTES, flags);
	if (*base == NULL)
		return NULL;

	return (void *)L1_CACHE_ALIGN((u64)*base);
}
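
/*
 * Usage sketch for the helper above: the caller keeps two pointers, the
 * aligned pointer it actually uses and the raw '*base' cookie, and it is
 * the raw cookie that must eventually be passed to kfree().  For example
 * (hypothetical local names):
 *
 *	void *buf_base;
 *	struct foo *buf = xpc_kzalloc_cacheline_aligned(sizeof(*buf),
 *							GFP_KERNEL, &buf_base);
 *	if (buf == NULL)
 *		return xpNoMemory;
 *	...
 *	kfree(buf_base);
 *
 * This mirrors how remote_openclose_args/remote_openclose_args_base are
 * handled in xpc_setup_ch_structures() below.
 */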

/*
 * Set up the channel structures necessary to support XPartition Communication
 * between the specified remote partition and the local one.
 */
static enum xp_retval
xpc_setup_ch_structures(struct xpc_partition *part)
{
	enum xp_retval ret;
	int ch_number;
	struct xpc_channel *ch;
	short partid = XPC_PARTID(part);

	/*
	 * Allocate all of the channel structures as a contiguous chunk of
	 * memory.
	 */
	DBUG_ON(part->channels != NULL);
	part->channels = kcalloc(XPC_MAX_NCHANNELS,
				 sizeof(struct xpc_channel),
				 GFP_KERNEL);
	if (part->channels == NULL) {
		dev_err(xpc_chan, "can't get memory for channels\n");
		return xpNoMemory;
	}

	/* allocate the remote open and close args */

	part->remote_openclose_args =
	    xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
					  GFP_KERNEL,
					  &part->remote_openclose_args_base);
	if (part->remote_openclose_args == NULL) {
		dev_err(xpc_chan, "can't get memory for remote connect args\n");
		ret = xpNoMemory;
		goto out_1;
	}

	part->chctl.all_flags = 0;
	spin_lock_init(&part->chctl_lock);

	atomic_set(&part->channel_mgr_requests, 1);
	init_waitqueue_head(&part->channel_mgr_wq);

	part->nchannels = XPC_MAX_NCHANNELS;

	atomic_set(&part->nchannels_active, 0);
	atomic_set(&part->nchannels_engaged, 0);

	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
		ch = &part->channels[ch_number];

		ch->partid = partid;
		ch->number = ch_number;
		ch->flags = XPC_C_DISCONNECTED;

		atomic_set(&ch->kthreads_assigned, 0);
		atomic_set(&ch->kthreads_idle, 0);
		atomic_set(&ch->kthreads_active, 0);

		atomic_set(&ch->references, 0);
		atomic_set(&ch->n_to_notify, 0);

		spin_lock_init(&ch->lock);
		init_completion(&ch->wdisconnect_wait);

		atomic_set(&ch->n_on_msg_allocate_wq, 0);
		init_waitqueue_head(&ch->msg_allocate_wq);
		init_waitqueue_head(&ch->idle_wq);
	}

	ret = xpc_arch_ops.setup_ch_structures(part);
	if (ret != xpSuccess)
		goto out_2;

	/*
	 * With the setting of the partition setup_state to XPC_P_SS_SETUP,
	 * we're declaring that this partition is ready to go.
	 */
	part->setup_state = XPC_P_SS_SETUP;

	return xpSuccess;

	/* setup of ch structures failed */
out_2:
	kfree(part->remote_openclose_args_base);
	part->remote_openclose_args = NULL;
out_1:
	kfree(part->channels);
	part->channels = NULL;
	return ret;
}

/*
 * Tear down the channel structures necessary to support XPartition
 * Communication between the specified remote partition and the local one.
 */
static void
xpc_teardown_ch_structures(struct xpc_partition *part)
{
	DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
	DBUG_ON(atomic_read(&part->nchannels_active) != 0);

	/*
	 * Make this partition inaccessible to local processes by marking it
	 * as no longer setup.  Then wait before proceeding with the teardown
	 * until all existing references cease.
	 */
	DBUG_ON(part->setup_state != XPC_P_SS_SETUP);
	part->setup_state = XPC_P_SS_WTEARDOWN;

	wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));

	/* now we can begin tearing down the infrastructure */

	xpc_arch_ops.teardown_ch_structures(part);

	kfree(part->remote_openclose_args_base);
	part->remote_openclose_args = NULL;
	kfree(part->channels);
	part->channels = NULL;

	part->setup_state = XPC_P_SS_TORNDOWN;
}
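
/*
 * Taken together, the two functions above walk part->setup_state through
 * its full lifecycle:
 *
 *	XPC_P_SS_UNSET -> XPC_P_SS_SETUP -> XPC_P_SS_WTEARDOWN -> XPC_P_SS_TORNDOWN
 *
 * XPC_P_SS_WTEARDOWN is the gate: the partition is first marked as no
 * longer setup, then teardown_wq waits for part->references to drain to
 * zero before any infrastructure is actually freed.
 */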

/*
 * When XPC HB determines that a partition has come up, it will create a new
 * kthread and that kthread will call this function to attempt to set up the
 * basic infrastructure used for Cross Partition Communication with the newly
 * upped partition.
 *
 * The kthread that was created by XPC HB and which set up the XPC
 * infrastructure will remain assigned to the partition, becoming the channel
 * manager for that partition, until the partition is deactivating, at which
 * time the kthread will tear down the XPC infrastructure and then exit.
 */
static int
xpc_activating(void *__partid)
{
	short partid = (u64)__partid;
	struct xpc_partition *part = &xpc_partitions[partid];
	unsigned long irq_flags;

	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);

	spin_lock_irqsave(&part->act_lock, irq_flags);

	if (part->act_state == XPC_P_AS_DEACTIVATING) {
		part->act_state = XPC_P_AS_INACTIVE;
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
		part->remote_rp_pa = 0;
		return 0;
	}

	/* indicate the thread is activating */
	DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ);
	part->act_state = XPC_P_AS_ACTIVATING;

	XPC_SET_REASON(part, 0, 0);
	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	dev_dbg(xpc_part, "activating partition %d\n", partid);

	xpc_arch_ops.allow_hb(partid);

	if (xpc_setup_ch_structures(part) == xpSuccess) {
		(void)xpc_part_ref(part);	/* this will always succeed */

		if (xpc_arch_ops.make_first_contact(part) == xpSuccess) {
			xpc_mark_partition_active(part);
			xpc_channel_mgr(part);
			/* won't return until partition is deactivating */
		}

		xpc_part_deref(part);
		xpc_teardown_ch_structures(part);
	}

	xpc_arch_ops.disallow_hb(partid);
	xpc_mark_partition_inactive(part);

	if (part->reason == xpReactivating) {
		/* interrupting ourselves results in activating partition */
		xpc_arch_ops.request_partition_reactivation(part);
	}

	return 0;
}

void
xpc_activate_partition(struct xpc_partition *part)
{
	short partid = XPC_PARTID(part);
	unsigned long irq_flags;
	struct task_struct *kthread;

	spin_lock_irqsave(&part->act_lock, irq_flags);

	DBUG_ON(part->act_state != XPC_P_AS_INACTIVE);

	part->act_state = XPC_P_AS_ACTIVATION_REQ;
	XPC_SET_REASON(part, xpCloneKThread, __LINE__);

	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
			      partid);
	if (IS_ERR(kthread)) {
		spin_lock_irqsave(&part->act_lock, irq_flags);
		part->act_state = XPC_P_AS_INACTIVE;
		XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
	}
}

void
xpc_activate_kthreads(struct xpc_channel *ch, int needed)
{
	int idle = atomic_read(&ch->kthreads_idle);
	int assigned = atomic_read(&ch->kthreads_assigned);
	int wakeup;

	DBUG_ON(needed <= 0);

	if (idle > 0) {
		wakeup = (needed > idle) ? idle : needed;
		needed -= wakeup;

		dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
			"channel=%d\n", wakeup, ch->partid, ch->number);

		/* only wakeup the requested number of kthreads */
		wake_up_nr(&ch->idle_wq, wakeup);
	}

	if (needed <= 0)
		return;

	if (needed + assigned > ch->kthreads_assigned_limit) {
		needed = ch->kthreads_assigned_limit - assigned;
		if (needed <= 0)
			return;
	}

	dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
		needed, ch->partid, ch->number);

	xpc_create_kthreads(ch, needed, 0);
}
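
/*
 * A worked example of the accounting above: with needed = 5, idle = 2,
 * assigned = 6 and ch->kthreads_assigned_limit = 8, the function wakes the
 * 2 idle kthreads (needed drops to 3), then clamps the remaining request
 * to the limit (8 - 6 = 2) and creates 2 new kthreads via
 * xpc_create_kthreads(ch, 2, 0).
 */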

/*
 * This function is where XPC's kthreads wait for messages to deliver.
 */
static void
xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
{
	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
		xpc_arch_ops.n_of_deliverable_payloads;

	do {
		/* deliver messages to their intended recipients */

		while (n_of_deliverable_payloads(ch) > 0 &&
		       !(ch->flags & XPC_C_DISCONNECTING)) {
			xpc_deliver_payload(ch);
		}

		if (atomic_inc_return(&ch->kthreads_idle) >
		    ch->kthreads_idle_limit) {
			/* too many idle kthreads on this channel */
			atomic_dec(&ch->kthreads_idle);
			break;
		}

		dev_dbg(xpc_chan, "idle kthread calling "
			"wait_event_interruptible_exclusive()\n");

		(void)wait_event_interruptible_exclusive(ch->idle_wq,
				(n_of_deliverable_payloads(ch) > 0 ||
				 (ch->flags & XPC_C_DISCONNECTING)));

		atomic_dec(&ch->kthreads_idle);

	} while (!(ch->flags & XPC_C_DISCONNECTING));
}
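
/*
 * Two details of the loop above are worth calling out.  First, the idle
 * count is bumped with atomic_inc_return() *before* the limit test, so a
 * kthread that would push the channel past kthreads_idle_limit immediately
 * backs the count out and exits rather than idling.  Second, the wait is
 * _exclusive, which pairs with wake_up_nr() in xpc_activate_kthreads() so
 * that only the requested number of idle kthreads are awakened.
 */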

static int
xpc_kthread_start(void *args)
{
	short partid = XPC_UNPACK_ARG1(args);
	u16 ch_number = XPC_UNPACK_ARG2(args);
	struct xpc_partition *part = &xpc_partitions[partid];
	struct xpc_channel *ch;
	int n_needed;
	unsigned long irq_flags;
	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
		xpc_arch_ops.n_of_deliverable_payloads;

	dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
		partid, ch_number);

	ch = &part->channels[ch_number];

	if (!(ch->flags & XPC_C_DISCONNECTING)) {

		/* let registerer know that connection has been established */

		spin_lock_irqsave(&ch->lock, irq_flags);
		if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
			ch->flags |= XPC_C_CONNECTEDCALLOUT;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			xpc_connected_callout(ch);

			spin_lock_irqsave(&ch->lock, irq_flags);
			ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			/*
			 * It is possible that while the callout was being
			 * made the remote partition sent some messages.
			 * If that is the case, we may need to activate
			 * additional kthreads to help deliver them.  We only
			 * need one less than total #of messages to deliver.
			 */
			n_needed = n_of_deliverable_payloads(ch) - 1;
			if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
				xpc_activate_kthreads(ch, n_needed);

		} else {
			spin_unlock_irqrestore(&ch->lock, irq_flags);
		}

		xpc_kthread_waitmsgs(part, ch);
	}

	/* let registerer know that connection is disconnecting */

	spin_lock_irqsave(&ch->lock, irq_flags);
	if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
	    !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
		ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
		spin_unlock_irqrestore(&ch->lock, irq_flags);

		xpc_disconnect_callout(ch, xpDisconnecting);

		spin_lock_irqsave(&ch->lock, irq_flags);
		ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
	}
	spin_unlock_irqrestore(&ch->lock, irq_flags);

	if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
	    atomic_dec_return(&part->nchannels_engaged) == 0) {
		xpc_arch_ops.indicate_partition_disengaged(part);
	}

	xpc_msgqueue_deref(ch);

	dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
		partid, ch_number);

	xpc_part_deref(part);
	return 0;
}
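
/*
 * Note the symmetry with xpc_create_kthreads() below: the creator bumps
 * ch->kthreads_assigned (and, for a channel's first kthread,
 * part->nchannels_engaged) and takes a part and a msgqueue reference on
 * behalf of each new kthread; xpc_kthread_start() undoes each of these on
 * its way out.  The last kthread to exit on the last engaged channel is
 * the one that indicates the partition has disengaged.
 */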

/*
 * For each partition that XPC has established communications with, there is
 * a minimum of one kernel thread assigned to perform any operation that
 * may potentially sleep or block (basically the callouts to the asynchronous
 * functions registered via xpc_connect()).
 *
 * Additional kthreads are created and destroyed by XPC as the workload
 * demands.
 *
 * A kthread is assigned to one of the active channels that exists for a given
 * partition.
 */
void
xpc_create_kthreads(struct xpc_channel *ch, int needed,
		    int ignore_disconnecting)
{
	unsigned long irq_flags;
	u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
	struct xpc_partition *part = &xpc_partitions[ch->partid];
	struct task_struct *kthread;
	void (*indicate_partition_disengaged) (struct xpc_partition *) =
		xpc_arch_ops.indicate_partition_disengaged;

	while (needed-- > 0) {

		/*
		 * The following is done on behalf of the newly created
		 * kthread. That kthread is responsible for doing the
		 * counterpart to the following before it exits.
		 */
		if (ignore_disconnecting) {
			if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
				/* kthreads assigned had gone to zero */
				BUG_ON(!(ch->flags &
					 XPC_C_DISCONNECTINGCALLOUT_MADE));
				break;
			}

		} else if (ch->flags & XPC_C_DISCONNECTING) {
			break;

		} else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
			   atomic_inc_return(&part->nchannels_engaged) == 1) {
			xpc_arch_ops.indicate_partition_engaged(part);
		}
		(void)xpc_part_ref(part);
		xpc_msgqueue_ref(ch);

		kthread = kthread_run(xpc_kthread_start, (void *)args,
				      "xpc%02dc%d", ch->partid, ch->number);
		if (IS_ERR(kthread)) {
			/* the fork failed */

			/*
			 * NOTE: if (ignore_disconnecting &&
			 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
			 * then we'll deadlock if all other kthreads assigned
			 * to this channel are blocked in the channel's
			 * registerer, because the only thing that will unblock
			 * them is the xpDisconnecting callout that this
			 * failed kthread_run() would have made.
			 */

			if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
			    atomic_dec_return(&part->nchannels_engaged) == 0) {
				indicate_partition_disengaged(part);
			}
			xpc_msgqueue_deref(ch);
			xpc_part_deref(part);

			if (atomic_read(&ch->kthreads_assigned) <
			    ch->kthreads_idle_limit) {
				/*
				 * Flag this as an error only if we have an
				 * insufficient #of kthreads for the channel
				 * to function.
				 */
				spin_lock_irqsave(&ch->lock, irq_flags);
				XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources,
						       &irq_flags);
				spin_unlock_irqrestore(&ch->lock, irq_flags);
			}
			break;
		}
	}
}

void
xpc_disconnect_wait(int ch_number)
{
	unsigned long irq_flags;
	short partid;
	struct xpc_partition *part;
	struct xpc_channel *ch;
	int wakeup_channel_mgr;

	/* now wait for all callouts to the caller's function to cease */
	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		if (!xpc_part_ref(part))
			continue;

		ch = &part->channels[ch_number];

		if (!(ch->flags & XPC_C_WDISCONNECT)) {
			xpc_part_deref(part);
			continue;
		}

		wait_for_completion(&ch->wdisconnect_wait);

		spin_lock_irqsave(&ch->lock, irq_flags);
		DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
		wakeup_channel_mgr = 0;

		if (ch->delayed_chctl_flags) {
			if (part->act_state != XPC_P_AS_DEACTIVATING) {
				spin_lock(&part->chctl_lock);
				part->chctl.flags[ch->number] |=
				    ch->delayed_chctl_flags;
				spin_unlock(&part->chctl_lock);
				wakeup_channel_mgr = 1;
			}
			ch->delayed_chctl_flags = 0;
		}

		ch->flags &= ~XPC_C_WDISCONNECT;
		spin_unlock_irqrestore(&ch->lock, irq_flags);

		if (wakeup_channel_mgr)
			xpc_wakeup_channel_mgr(part);

		xpc_part_deref(part);
	}
}
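
/*
 * In xpc_disconnect_wait() above, any channel-control flags that arrived
 * while the disconnect was in flight (ch->delayed_chctl_flags) are folded
 * back into part->chctl and the channel manager is woken to process them,
 * unless the whole partition is already deactivating, in which case they
 * are simply discarded.
 */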

static int
xpc_setup_partitions(void)
{
	short partid;
	struct xpc_partition *part;

	xpc_partitions = kcalloc(xp_max_npartitions,
				 sizeof(struct xpc_partition),
				 GFP_KERNEL);
	if (xpc_partitions == NULL) {
		dev_err(xpc_part, "can't get memory for partition structure\n");
		return -ENOMEM;
	}

	/*
	 * The first few fields of each entry of xpc_partitions[] need to
	 * be initialized now so that calls to xpc_connect() and
	 * xpc_disconnect() can be made prior to the activation of any remote
	 * partition.  NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
	 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
	 * PARTITION HAS BEEN ACTIVATED.
	 */
	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));

		part->activate_IRQ_rcvd = 0;
		spin_lock_init(&part->act_lock);
		part->act_state = XPC_P_AS_INACTIVE;
		XPC_SET_REASON(part, 0, 0);

		timer_setup(&part->disengage_timer,
			    xpc_timeout_partition_disengage, 0);

		part->setup_state = XPC_P_SS_UNSET;
		init_waitqueue_head(&part->teardown_wq);
		atomic_set(&part->references, 0);
	}

	return xpc_arch_ops.setup_partitions();
}

static void
xpc_teardown_partitions(void)
{
	xpc_arch_ops.teardown_partitions();
	kfree(xpc_partitions);
}

static void
xpc_do_exit(enum xp_retval reason)
{
	short partid;
	int active_part_count, printed_waiting_msg = 0;
	struct xpc_partition *part;
	unsigned long printmsg_time, disengage_timeout = 0;

	/* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
	DBUG_ON(xpc_exiting == 1);

	/*
	 * Let the heartbeat checker thread and the discovery thread
	 * (if one is running) know that they should exit.  Also wake up
	 * the heartbeat checker thread in case it's sleeping.
	 */
	xpc_exiting = 1;
	wake_up_interruptible(&xpc_activate_IRQ_wq);

	/* wait for the discovery thread to exit */
	wait_for_completion(&xpc_discovery_exited);

	/* wait for the heartbeat checker thread to exit */
	wait_for_completion(&xpc_hb_checker_exited);

	/* sleep for 1/3 of a second or so */
	(void)msleep_interruptible(300);

	/* wait for all partitions to become inactive */

	printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
	xpc_disengage_timedout = 0;

	do {
		active_part_count = 0;

		for (partid = 0; partid < xp_max_npartitions; partid++) {
			part = &xpc_partitions[partid];

			if (xpc_partition_disengaged(part) &&
			    part->act_state == XPC_P_AS_INACTIVE) {
				continue;
			}

			active_part_count++;

			XPC_DEACTIVATE_PARTITION(part, reason);

			if (part->disengage_timeout > disengage_timeout)
				disengage_timeout = part->disengage_timeout;
		}

		if (xpc_arch_ops.any_partition_engaged()) {
			if (time_is_before_jiffies(printmsg_time)) {
				dev_info(xpc_part, "waiting for remote "
					 "partitions to deactivate, timeout in "
					 "%ld seconds\n", (disengage_timeout -
					 jiffies) / HZ);
				printmsg_time = jiffies +
				    (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
				printed_waiting_msg = 1;
			}

		} else if (active_part_count > 0) {
			if (printed_waiting_msg) {
				dev_info(xpc_part, "waiting for local partition"
					 " to deactivate\n");
				printed_waiting_msg = 0;
			}

		} else {
			if (!xpc_disengage_timedout) {
				dev_info(xpc_part, "all partitions have "
					 "deactivated\n");
			}
			break;
		}

		/* sleep for 1/3 of a second or so */
		(void)msleep_interruptible(300);

	} while (1);

	DBUG_ON(xpc_arch_ops.any_partition_engaged());

	xpc_teardown_rsvd_page();

	if (reason == xpUnloading) {
		(void)unregister_die_notifier(&xpc_die_notifier);
		(void)unregister_reboot_notifier(&xpc_reboot_notifier);
	}

	/* clear the interface to XPC's functions */
	xpc_clear_interface();

	if (xpc_sysctl)
		unregister_sysctl_table(xpc_sysctl);
	if (xpc_sysctl_hb)
		unregister_sysctl_table(xpc_sysctl_hb);

	xpc_teardown_partitions();

	if (is_uv_system())
		xpc_exit_uv();
}
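
/*
 * The teardown order in xpc_do_exit() deliberately mirrors xpc_init() in
 * reverse: stop the discovery and heartbeat-checker threads, wait for every
 * partition to deactivate and disengage, tear down the reserved page, drop
 * the notifiers (only when actually unloading), clear the XP interface,
 * unregister the sysctl tables, free the partition structures, and finally
 * let the arch-specific (UV) layer exit.
 */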

/*
 * This function is called when the system is being rebooted.
 */
static int
xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
{
	enum xp_retval reason;

	switch (event) {
	case SYS_RESTART:
		reason = xpSystemReboot;
		break;
	case SYS_HALT:
		reason = xpSystemHalt;
		break;
	case SYS_POWER_OFF:
		reason = xpSystemPoweroff;
		break;
	default:
		reason = xpSystemGoingDown;
	}

	xpc_do_exit(reason);
	return NOTIFY_DONE;
}

/* Used to only allow one cpu to complete disconnect */
static unsigned int xpc_die_disconnecting;

/*
 * Notify other partitions to deactivate from us by first disengaging from all
 * references to our memory.
 */
static void
xpc_die_deactivate(void)
{
	struct xpc_partition *part;
	short partid;
	int any_engaged;
	long keep_waiting;
	long wait_to_print;

	if (cmpxchg(&xpc_die_disconnecting, 0, 1))
		return;

	/* keep xpc_hb_checker thread from doing anything (just in case) */
	xpc_exiting = 1;

	xpc_arch_ops.disallow_all_hbs();	/* indicate we're deactivated */

	for (partid = 0; partid < xp_max_npartitions; partid++) {
		part = &xpc_partitions[partid];

		if (xpc_arch_ops.partition_engaged(partid) ||
		    part->act_state != XPC_P_AS_INACTIVE) {
			xpc_arch_ops.request_partition_deactivation(part);
			xpc_arch_ops.indicate_partition_disengaged(part);
		}
	}

	/*
	 * Though we requested that all other partitions deactivate from us,
	 * we only wait until they've all disengaged or we've reached the
	 * defined timelimit.
	 *
	 * Given that one iteration through the following while-loop takes
	 * approximately 200 microseconds, calculate the #of loops to take
	 * before bailing and the #of loops before printing a waiting message.
	 */
	keep_waiting = xpc_disengage_timelimit * 1000 * 5;
	wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;

	while (1) {
		any_engaged = xpc_arch_ops.any_partition_engaged();
		if (!any_engaged) {
			dev_info(xpc_part, "all partitions have deactivated\n");
			break;
		}

		if (!keep_waiting--) {
			for (partid = 0; partid < xp_max_npartitions;
			     partid++) {
				if (xpc_arch_ops.partition_engaged(partid)) {
					dev_info(xpc_part, "deactivate from "
						 "remote partition %d timed "
						 "out\n", partid);
				}
			}
			break;
		}

		if (!wait_to_print--) {
			dev_info(xpc_part, "waiting for remote partitions to "
				 "deactivate, timeout in %ld seconds\n",
				 keep_waiting / (1000 * 5));
			wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
			    1000 * 5;
		}

		udelay(200);
	}
}
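
/*
 * The loop-count arithmetic above works out as follows: each pass costs
 * roughly 200us (the udelay), so there are about 5 passes per millisecond,
 * i.e. 1000 * 5 passes per second.  A timelimit of N seconds therefore
 * becomes keep_waiting = N * 1000 * 5 iterations, and dividing the passes
 * left by (1000 * 5) converts back to the seconds reported in the
 * "timeout in %ld seconds" message.
 */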

/*
 * This function is called when the system is being restarted or halted due
 * to some sort of system failure.  If this is the case we need to notify the
 * other partitions to disengage from all references to our memory.
 * This function can also be called when our heartbeater may be offlined
 * for a time.  In this case we need to notify other partitions to not worry
 * about the lack of a heartbeat.
 */
static int
xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
{
#ifdef CONFIG_IA64		/* !!! temporary kludge */
	switch (event) {
	case DIE_MACHINE_RESTART:
	case DIE_MACHINE_HALT:
		xpc_die_deactivate();
		break;

	case DIE_KDEBUG_ENTER:
		/* Should lack of heartbeat be ignored by other partitions? */
		if (!xpc_kdebug_ignore)
			break;

		fallthrough;
	case DIE_MCA_MONARCH_ENTER:
	case DIE_INIT_MONARCH_ENTER:
		xpc_arch_ops.offline_heartbeat();
		break;

	case DIE_KDEBUG_LEAVE:
		/* Is lack of heartbeat being ignored by other partitions? */
		if (!xpc_kdebug_ignore)
			break;

		fallthrough;
	case DIE_MCA_MONARCH_LEAVE:
	case DIE_INIT_MONARCH_LEAVE:
		xpc_arch_ops.online_heartbeat();
		break;
	}
#else
	struct die_args *die_args = _die_args;

	switch (event) {
	case DIE_TRAP:
		if (die_args->trapnr == X86_TRAP_DF)
			xpc_die_deactivate();

		if (((die_args->trapnr == X86_TRAP_MF) ||
		     (die_args->trapnr == X86_TRAP_XF)) &&
		    !user_mode(die_args->regs))
			xpc_die_deactivate();

		break;
	case DIE_INT3:
	case DIE_DEBUG:
		break;
	case DIE_OOPS:
	case DIE_GPF:
	default:
		xpc_die_deactivate();
	}
#endif

	return NOTIFY_DONE;
}

static int __init
xpc_init(void)
{
	int ret;
	struct task_struct *kthread;

	dev_set_name(xpc_part, "part");
	dev_set_name(xpc_chan, "chan");

	if (is_uv_system()) {
		ret = xpc_init_uv();

	} else {
		ret = -ENODEV;
	}

	if (ret != 0)
		return ret;

	ret = xpc_setup_partitions();
	if (ret != 0) {
		dev_err(xpc_part, "can't get memory for partition structure\n");
		goto out_1;
	}

	xpc_sysctl = register_sysctl("xpc", xpc_sys_xpc);
	xpc_sysctl_hb = register_sysctl("xpc/hb", xpc_sys_xpc_hb);

	/*
	 * Fill the partition reserved page with the information needed by
	 * other partitions to discover we are alive and establish initial
	 * communications.
	 */
	ret = xpc_setup_rsvd_page();
	if (ret != 0) {
		dev_err(xpc_part, "can't setup our reserved page\n");
		goto out_2;
	}

	/* add ourselves to the reboot_notifier_list */
	ret = register_reboot_notifier(&xpc_reboot_notifier);
	if (ret != 0)
		dev_warn(xpc_part, "can't register reboot notifier\n");

	/* add ourselves to the die_notifier list */
	ret = register_die_notifier(&xpc_die_notifier);
	if (ret != 0)
		dev_warn(xpc_part, "can't register die notifier\n");

	/*
	 * The real work-horse behind xpc.  This processes incoming
	 * interrupts and monitors remote heartbeats.
	 */
	kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
	if (IS_ERR(kthread)) {
		dev_err(xpc_part, "failed while forking hb check thread\n");
		ret = -EBUSY;
		goto out_3;
	}
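
	/*
	 * Note that from this point on a failure can no longer simply
	 * unwind through the gotos below: the heartbeat checker is running,
	 * so teardown has to go through xpc_do_exit(), which waits for that
	 * thread to exit.
	 */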

	/*
	 * Start a thread that will attempt to discover other partitions to
	 * activate based on info provided by SAL.  This new thread is short
	 * lived and will exit once discovery is complete.
	 */
	kthread = kthread_run(xpc_initiate_discovery, NULL,
			      XPC_DISCOVERY_THREAD_NAME);
	if (IS_ERR(kthread)) {
		dev_err(xpc_part, "failed while forking discovery thread\n");

		/* mark this new thread as a non-starter */
		complete(&xpc_discovery_exited);

		xpc_do_exit(xpUnloading);
		return -EBUSY;
	}

	/* set the interface to point at XPC's functions */
	xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
			  xpc_initiate_send, xpc_initiate_send_notify,
			  xpc_initiate_received, xpc_initiate_partid_to_nasids);

	return 0;

	/* initialization was not successful */
out_3:
	xpc_teardown_rsvd_page();

	(void)unregister_die_notifier(&xpc_die_notifier);
	(void)unregister_reboot_notifier(&xpc_reboot_notifier);
out_2:
	if (xpc_sysctl_hb)
		unregister_sysctl_table(xpc_sysctl_hb);
	if (xpc_sysctl)
		unregister_sysctl_table(xpc_sysctl);

	xpc_teardown_partitions();
out_1:
	if (is_uv_system())
		xpc_exit_uv();
	return ret;
}

module_init(xpc_init);

static void __exit
xpc_exit(void)
{
	xpc_do_exit(xpUnloading);
}

module_exit(xpc_exit);

MODULE_AUTHOR("Silicon Graphics, Inc.");
MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
MODULE_LICENSE("GPL");

module_param(xpc_hb_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
		 "heartbeat increments.");

module_param(xpc_hb_check_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
		 "heartbeat checks.");

module_param(xpc_disengage_timelimit, int, 0);
MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
		 "for disengage to complete.");

module_param(xpc_kdebug_ignore, int, 0);
MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
		 "other partitions when dropping into kdebug.");
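
/*
 * The parameters above all use permission 0, so they are load-time only
 * (not writable via /sys/module/xpc/parameters/).  A hypothetical load
 * might look like:
 *
 *	modprobe xpc xpc_hb_interval=2 xpc_hb_check_interval=20 \
 *		xpc_kdebug_ignore=1
 *
 * Runtime adjustment of the heartbeat and disengage tunables goes through
 * the /proc/sys/xpc/ sysctl entries registered in xpc_init().
 */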