1 /* 2 * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. 3 * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. 4 * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 
*/

#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/delay.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/printk.h>
#ifdef CONFIG_INFINIBAND_QIB_DCA
#include <linux/dca.h>
#endif
#include <rdma/rdma_vt.h>

#include "qib.h"
#include "qib_common.h"
#include "qib_mad.h"
#ifdef CONFIG_DEBUG_FS
#include "qib_debugfs.h"
#include "qib_verbs.h"
#endif

/* Prefix every pr_*() message emitted from this file with the driver name. */
#undef pr_fmt
#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt

/*
 * Minimum number of buffers we want to have per user context.
 * NOTE(review): the original comment ended mid-sentence ("after driver");
 * presumably "after the driver has reserved its own" - confirm.
 */
#define QIB_MIN_USER_CTXT_BUFCNT 7

/*
 * Fields of dd->revision: loadtime_init() compares the SOFTWARE field
 * against QIB_CHIP_SWVERSION and tests the EMULATOR bit.
 */
#define QLOGIC_IB_R_SOFTWARE_MASK 0xFF
#define QLOGIC_IB_R_SOFTWARE_SHIFT 24
#define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62)

/*
 * Number of ctxts we are configured to use (to allow for more pio
 * buffers per ctxt, etc.)  Zero means use chip value.
 */
ushort qib_cfgctxts;
module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO);
MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use");

/* Read-only "numa_aware" module parameter; meaning is given in its description. */
unsigned qib_numa_aware;
module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO);
MODULE_PARM_DESC(numa_aware,
	"0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process");

/*
 * If set, do not write to any regs if avoidable, hack to allow
 * check for deranged default register values.
83 */ 84 ushort qib_mini_init; 85 module_param_named(mini_init, qib_mini_init, ushort, S_IRUGO); 86 MODULE_PARM_DESC(mini_init, "If set, do minimal diag init"); 87 88 unsigned qib_n_krcv_queues; 89 module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO); 90 MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port"); 91 92 unsigned qib_cc_table_size; 93 module_param_named(cc_table_size, qib_cc_table_size, uint, S_IRUGO); 94 MODULE_PARM_DESC(cc_table_size, "Congestion control table entries 0 (CCA disabled - default), min = 128, max = 1984"); 95 96 static void verify_interrupt(unsigned long); 97 98 static struct idr qib_unit_table; 99 u32 qib_cpulist_count; 100 unsigned long *qib_cpulist; 101 102 /* set number of contexts we'll actually use */ 103 void qib_set_ctxtcnt(struct qib_devdata *dd) 104 { 105 if (!qib_cfgctxts) { 106 dd->cfgctxts = dd->first_user_ctxt + num_online_cpus(); 107 if (dd->cfgctxts > dd->ctxtcnt) 108 dd->cfgctxts = dd->ctxtcnt; 109 } else if (qib_cfgctxts < dd->num_pports) 110 dd->cfgctxts = dd->ctxtcnt; 111 else if (qib_cfgctxts <= dd->ctxtcnt) 112 dd->cfgctxts = qib_cfgctxts; 113 else 114 dd->cfgctxts = dd->ctxtcnt; 115 dd->freectxts = (dd->first_user_ctxt > dd->cfgctxts) ? 0 : 116 dd->cfgctxts - dd->first_user_ctxt; 117 } 118 119 /* 120 * Common code for creating the receive context array. 121 */ 122 int qib_create_ctxts(struct qib_devdata *dd) 123 { 124 unsigned i; 125 int local_node_id = pcibus_to_node(dd->pcidev->bus); 126 127 if (local_node_id < 0) 128 local_node_id = numa_node_id(); 129 dd->assigned_node_id = local_node_id; 130 131 /* 132 * Allocate full ctxtcnt array, rather than just cfgctxts, because 133 * cleanup iterates across all possible ctxts. 
134 */ 135 dd->rcd = kcalloc(dd->ctxtcnt, sizeof(*dd->rcd), GFP_KERNEL); 136 if (!dd->rcd) { 137 qib_dev_err(dd, 138 "Unable to allocate ctxtdata array, failing\n"); 139 return -ENOMEM; 140 } 141 142 /* create (one or more) kctxt */ 143 for (i = 0; i < dd->first_user_ctxt; ++i) { 144 struct qib_pportdata *ppd; 145 struct qib_ctxtdata *rcd; 146 147 if (dd->skip_kctxt_mask & (1 << i)) 148 continue; 149 150 ppd = dd->pport + (i % dd->num_pports); 151 152 rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id); 153 if (!rcd) { 154 qib_dev_err(dd, 155 "Unable to allocate ctxtdata for Kernel ctxt, failing\n"); 156 kfree(dd->rcd); 157 dd->rcd = NULL; 158 return -ENOMEM; 159 } 160 rcd->pkeys[0] = QIB_DEFAULT_P_KEY; 161 rcd->seq_cnt = 1; 162 } 163 return 0; 164 } 165 166 /* 167 * Common code for user and kernel context setup. 168 */ 169 struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt, 170 int node_id) 171 { 172 struct qib_devdata *dd = ppd->dd; 173 struct qib_ctxtdata *rcd; 174 175 rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id); 176 if (rcd) { 177 INIT_LIST_HEAD(&rcd->qp_wait_list); 178 rcd->node_id = node_id; 179 rcd->ppd = ppd; 180 rcd->dd = dd; 181 rcd->cnt = 1; 182 rcd->ctxt = ctxt; 183 dd->rcd[ctxt] = rcd; 184 #ifdef CONFIG_DEBUG_FS 185 if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */ 186 rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), 187 GFP_KERNEL, node_id); 188 if (!rcd->opstats) { 189 kfree(rcd); 190 qib_dev_err(dd, 191 "Unable to allocate per ctxt stats buffer\n"); 192 return NULL; 193 } 194 } 195 #endif 196 dd->f_init_ctxt(rcd); 197 198 /* 199 * To avoid wasting a lot of memory, we allocate 32KB chunks 200 * of physically contiguous memory, advance through it until 201 * used up and then allocate more. Of course, we need 202 * memory to store those extra pointers, now. 
32KB seems to 203 * be the most that is "safe" under memory pressure 204 * (creating large files and then copying them over 205 * NFS while doing lots of MPI jobs). The OOM killer can 206 * get invoked, even though we say we can sleep and this can 207 * cause significant system problems.... 208 */ 209 rcd->rcvegrbuf_size = 0x8000; 210 rcd->rcvegrbufs_perchunk = 211 rcd->rcvegrbuf_size / dd->rcvegrbufsize; 212 rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt + 213 rcd->rcvegrbufs_perchunk - 1) / 214 rcd->rcvegrbufs_perchunk; 215 BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk)); 216 rcd->rcvegrbufs_perchunk_shift = 217 ilog2(rcd->rcvegrbufs_perchunk); 218 } 219 return rcd; 220 } 221 222 /* 223 * Common code for initializing the physical port structure. 224 */ 225 int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, 226 u8 hw_pidx, u8 port) 227 { 228 int size; 229 230 ppd->dd = dd; 231 ppd->hw_pidx = hw_pidx; 232 ppd->port = port; /* IB port number, not index */ 233 234 spin_lock_init(&ppd->sdma_lock); 235 spin_lock_init(&ppd->lflags_lock); 236 spin_lock_init(&ppd->cc_shadow_lock); 237 init_waitqueue_head(&ppd->state_wait); 238 239 init_timer(&ppd->symerr_clear_timer); 240 ppd->symerr_clear_timer.function = qib_clear_symerror_on_linkup; 241 ppd->symerr_clear_timer.data = (unsigned long)ppd; 242 243 ppd->qib_wq = NULL; 244 ppd->ibport_data.pmastats = 245 alloc_percpu(struct qib_pma_counters); 246 if (!ppd->ibport_data.pmastats) 247 return -ENOMEM; 248 ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64); 249 ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64); 250 ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64); 251 if (!(ppd->ibport_data.rvp.rc_acks) || 252 !(ppd->ibport_data.rvp.rc_qacks) || 253 !(ppd->ibport_data.rvp.rc_delayed_comp)) 254 return -ENOMEM; 255 256 if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) 257 goto bail; 258 259 ppd->cc_supported_table_entries = min(max_t(int, qib_cc_table_size, 260 IB_CCT_MIN_ENTRIES), 
IB_CCT_ENTRIES*IB_CC_TABLE_CAP_DEFAULT); 261 262 ppd->cc_max_table_entries = 263 ppd->cc_supported_table_entries/IB_CCT_ENTRIES; 264 265 size = IB_CC_TABLE_CAP_DEFAULT * sizeof(struct ib_cc_table_entry) 266 * IB_CCT_ENTRIES; 267 ppd->ccti_entries = kzalloc(size, GFP_KERNEL); 268 if (!ppd->ccti_entries) { 269 qib_dev_err(dd, 270 "failed to allocate congestion control table for port %d!\n", 271 port); 272 goto bail; 273 } 274 275 size = IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry); 276 ppd->congestion_entries = kzalloc(size, GFP_KERNEL); 277 if (!ppd->congestion_entries) { 278 qib_dev_err(dd, 279 "failed to allocate congestion setting list for port %d!\n", 280 port); 281 goto bail_1; 282 } 283 284 size = sizeof(struct cc_table_shadow); 285 ppd->ccti_entries_shadow = kzalloc(size, GFP_KERNEL); 286 if (!ppd->ccti_entries_shadow) { 287 qib_dev_err(dd, 288 "failed to allocate shadow ccti list for port %d!\n", 289 port); 290 goto bail_2; 291 } 292 293 size = sizeof(struct ib_cc_congestion_setting_attr); 294 ppd->congestion_entries_shadow = kzalloc(size, GFP_KERNEL); 295 if (!ppd->congestion_entries_shadow) { 296 qib_dev_err(dd, 297 "failed to allocate shadow congestion setting list for port %d!\n", 298 port); 299 goto bail_3; 300 } 301 302 return 0; 303 304 bail_3: 305 kfree(ppd->ccti_entries_shadow); 306 ppd->ccti_entries_shadow = NULL; 307 bail_2: 308 kfree(ppd->congestion_entries); 309 ppd->congestion_entries = NULL; 310 bail_1: 311 kfree(ppd->ccti_entries); 312 ppd->ccti_entries = NULL; 313 bail: 314 /* User is intentionally disabling the congestion control agent */ 315 if (!qib_cc_table_size) 316 return 0; 317 318 if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) { 319 qib_cc_table_size = 0; 320 qib_dev_err(dd, 321 "Congestion Control table size %d less than minimum %d for port %d\n", 322 qib_cc_table_size, IB_CCT_MIN_ENTRIES, port); 323 } 324 325 qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n", 326 port); 327 return 0; 328 } 329 330 static 
int init_pioavailregs(struct qib_devdata *dd) 331 { 332 int ret, pidx; 333 u64 *status_page; 334 335 dd->pioavailregs_dma = dma_alloc_coherent( 336 &dd->pcidev->dev, PAGE_SIZE, &dd->pioavailregs_phys, 337 GFP_KERNEL); 338 if (!dd->pioavailregs_dma) { 339 qib_dev_err(dd, 340 "failed to allocate PIOavail reg area in memory\n"); 341 ret = -ENOMEM; 342 goto done; 343 } 344 345 /* 346 * We really want L2 cache aligned, but for current CPUs of 347 * interest, they are the same. 348 */ 349 status_page = (u64 *) 350 ((char *) dd->pioavailregs_dma + 351 ((2 * L1_CACHE_BYTES + 352 dd->pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); 353 /* device status comes first, for backwards compatibility */ 354 dd->devstatusp = status_page; 355 *status_page++ = 0; 356 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 357 dd->pport[pidx].statusp = status_page; 358 *status_page++ = 0; 359 } 360 361 /* 362 * Setup buffer to hold freeze and other messages, accessible to 363 * apps, following statusp. This is per-unit, not per port. 364 */ 365 dd->freezemsg = (char *) status_page; 366 *dd->freezemsg = 0; 367 /* length of msg buffer is "whatever is left" */ 368 ret = (char *) status_page - (char *) dd->pioavailregs_dma; 369 dd->freezelen = PAGE_SIZE - ret; 370 371 ret = 0; 372 373 done: 374 return ret; 375 } 376 377 /** 378 * init_shadow_tids - allocate the shadow TID array 379 * @dd: the qlogic_ib device 380 * 381 * allocate the shadow TID array, so we can qib_munlock previous 382 * entries. It may make more sense to move the pageshadow to the 383 * ctxt data structure, so we only allocate memory for ctxts actually 384 * in use, since we at 8k per ctxt, now. 385 * We don't want failures here to prevent use of the driver/chip, 386 * so no return value. 
387 */ 388 static void init_shadow_tids(struct qib_devdata *dd) 389 { 390 struct page **pages; 391 dma_addr_t *addrs; 392 393 pages = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(struct page *)); 394 if (!pages) { 395 qib_dev_err(dd, 396 "failed to allocate shadow page * array, no expected sends!\n"); 397 goto bail; 398 } 399 400 addrs = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(dma_addr_t)); 401 if (!addrs) { 402 qib_dev_err(dd, 403 "failed to allocate shadow dma handle array, no expected sends!\n"); 404 goto bail_free; 405 } 406 407 dd->pageshadow = pages; 408 dd->physshadow = addrs; 409 return; 410 411 bail_free: 412 vfree(pages); 413 bail: 414 dd->pageshadow = NULL; 415 } 416 417 /* 418 * Do initialization for device that is only needed on 419 * first detect, not on resets. 420 */ 421 static int loadtime_init(struct qib_devdata *dd) 422 { 423 int ret = 0; 424 425 if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) & 426 QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) { 427 qib_dev_err(dd, 428 "Driver only handles version %d, chip swversion is %d (%llx), failng\n", 429 QIB_CHIP_SWVERSION, 430 (int)(dd->revision >> 431 QLOGIC_IB_R_SOFTWARE_SHIFT) & 432 QLOGIC_IB_R_SOFTWARE_MASK, 433 (unsigned long long) dd->revision); 434 ret = -ENOSYS; 435 goto done; 436 } 437 438 if (dd->revision & QLOGIC_IB_R_EMULATOR_MASK) 439 qib_devinfo(dd->pcidev, "%s", dd->boardversion); 440 441 spin_lock_init(&dd->pioavail_lock); 442 spin_lock_init(&dd->sendctrl_lock); 443 spin_lock_init(&dd->uctxt_lock); 444 spin_lock_init(&dd->qib_diag_trans_lock); 445 spin_lock_init(&dd->eep_st_lock); 446 mutex_init(&dd->eep_lock); 447 448 if (qib_mini_init) 449 goto done; 450 451 ret = init_pioavailregs(dd); 452 init_shadow_tids(dd); 453 454 qib_get_eeprom_info(dd); 455 456 /* setup time (don't start yet) to verify we got interrupt */ 457 init_timer(&dd->intrchk_timer); 458 dd->intrchk_timer.function = verify_interrupt; 459 dd->intrchk_timer.data = (unsigned long) dd; 460 done: 461 return ret; 
462 } 463 464 /** 465 * init_after_reset - re-initialize after a reset 466 * @dd: the qlogic_ib device 467 * 468 * sanity check at least some of the values after reset, and 469 * ensure no receive or transmit (explicitly, in case reset 470 * failed 471 */ 472 static int init_after_reset(struct qib_devdata *dd) 473 { 474 int i; 475 476 /* 477 * Ensure chip does no sends or receives, tail updates, or 478 * pioavail updates while we re-initialize. This is mostly 479 * for the driver data structures, not chip registers. 480 */ 481 for (i = 0; i < dd->num_pports; ++i) { 482 /* 483 * ctxt == -1 means "all contexts". Only really safe for 484 * _dis_abling things, as here. 485 */ 486 dd->f_rcvctrl(dd->pport + i, QIB_RCVCTRL_CTXT_DIS | 487 QIB_RCVCTRL_INTRAVAIL_DIS | 488 QIB_RCVCTRL_TAILUPD_DIS, -1); 489 /* Redundant across ports for some, but no big deal. */ 490 dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_DIS | 491 QIB_SENDCTRL_AVAIL_DIS); 492 } 493 494 return 0; 495 } 496 497 static void enable_chip(struct qib_devdata *dd) 498 { 499 u64 rcvmask; 500 int i; 501 502 /* 503 * Enable PIO send, and update of PIOavail regs to memory. 504 */ 505 for (i = 0; i < dd->num_pports; ++i) 506 dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_ENB | 507 QIB_SENDCTRL_AVAIL_ENB); 508 /* 509 * Enable kernel ctxts' receive and receive interrupt. 510 * Other ctxts done as user opens and inits them. 511 */ 512 rcvmask = QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB; 513 rcvmask |= (dd->flags & QIB_NODMA_RTAIL) ? 
514 QIB_RCVCTRL_TAILUPD_DIS : QIB_RCVCTRL_TAILUPD_ENB; 515 for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { 516 struct qib_ctxtdata *rcd = dd->rcd[i]; 517 518 if (rcd) 519 dd->f_rcvctrl(rcd->ppd, rcvmask, i); 520 } 521 } 522 523 static void verify_interrupt(unsigned long opaque) 524 { 525 struct qib_devdata *dd = (struct qib_devdata *) opaque; 526 u64 int_counter; 527 528 if (!dd) 529 return; /* being torn down */ 530 531 /* 532 * If we don't have a lid or any interrupts, let the user know and 533 * don't bother checking again. 534 */ 535 int_counter = qib_int_counter(dd) - dd->z_int_counter; 536 if (int_counter == 0) { 537 if (!dd->f_intr_fallback(dd)) 538 dev_err(&dd->pcidev->dev, 539 "No interrupts detected, not usable.\n"); 540 else /* re-arm the timer to see if fallback works */ 541 mod_timer(&dd->intrchk_timer, jiffies + HZ/2); 542 } 543 } 544 545 static void init_piobuf_state(struct qib_devdata *dd) 546 { 547 int i, pidx; 548 u32 uctxts; 549 550 /* 551 * Ensure all buffers are free, and fifos empty. Buffers 552 * are common, so only do once for port 0. 553 * 554 * After enable and qib_chg_pioavailkernel so we can safely 555 * enable pioavail updates and PIOENABLE. After this, packets 556 * are ready and able to go out. 557 */ 558 dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_ALL); 559 for (pidx = 0; pidx < dd->num_pports; ++pidx) 560 dd->f_sendctrl(dd->pport + pidx, QIB_SENDCTRL_FLUSH); 561 562 /* 563 * If not all sendbufs are used, add the one to each of the lower 564 * numbered contexts. pbufsctxt and lastctxt_piobuf are 565 * calculated in chip-specific code because it may cause some 566 * chip-specific adjustments to be made. 567 */ 568 uctxts = dd->cfgctxts - dd->first_user_ctxt; 569 dd->ctxts_extrabuf = dd->pbufsctxt ? 
570 dd->lastctxt_piobuf - (dd->pbufsctxt * uctxts) : 0; 571 572 /* 573 * Set up the shadow copies of the piobufavail registers, 574 * which we compare against the chip registers for now, and 575 * the in memory DMA'ed copies of the registers. 576 * By now pioavail updates to memory should have occurred, so 577 * copy them into our working/shadow registers; this is in 578 * case something went wrong with abort, but mostly to get the 579 * initial values of the generation bit correct. 580 */ 581 for (i = 0; i < dd->pioavregs; i++) { 582 __le64 tmp; 583 584 tmp = dd->pioavailregs_dma[i]; 585 /* 586 * Don't need to worry about pioavailkernel here 587 * because we will call qib_chg_pioavailkernel() later 588 * in initialization, to busy out buffers as needed. 589 */ 590 dd->pioavailshadow[i] = le64_to_cpu(tmp); 591 } 592 while (i < ARRAY_SIZE(dd->pioavailshadow)) 593 dd->pioavailshadow[i++] = 0; /* for debugging sanity */ 594 595 /* after pioavailshadow is setup */ 596 qib_chg_pioavailkernel(dd, 0, dd->piobcnt2k + dd->piobcnt4k, 597 TXCHK_CHG_TYPE_KERN, NULL); 598 dd->f_initvl15_bufs(dd); 599 } 600 601 /** 602 * qib_create_workqueues - create per port workqueues 603 * @dd: the qlogic_ib device 604 */ 605 static int qib_create_workqueues(struct qib_devdata *dd) 606 { 607 int pidx; 608 struct qib_pportdata *ppd; 609 610 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 611 ppd = dd->pport + pidx; 612 if (!ppd->qib_wq) { 613 char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */ 614 615 snprintf(wq_name, sizeof(wq_name), "qib%d_%d", 616 dd->unit, pidx); 617 ppd->qib_wq = alloc_ordered_workqueue(wq_name, 618 WQ_MEM_RECLAIM); 619 if (!ppd->qib_wq) 620 goto wq_error; 621 } 622 } 623 return 0; 624 wq_error: 625 pr_err("create_singlethread_workqueue failed for port %d\n", 626 pidx + 1); 627 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 628 ppd = dd->pport + pidx; 629 if (ppd->qib_wq) { 630 destroy_workqueue(ppd->qib_wq); 631 ppd->qib_wq = NULL; 632 } 633 } 634 return -ENOMEM; 635 } 636 637 
static void qib_free_pportdata(struct qib_pportdata *ppd) 638 { 639 free_percpu(ppd->ibport_data.pmastats); 640 free_percpu(ppd->ibport_data.rvp.rc_acks); 641 free_percpu(ppd->ibport_data.rvp.rc_qacks); 642 free_percpu(ppd->ibport_data.rvp.rc_delayed_comp); 643 ppd->ibport_data.pmastats = NULL; 644 } 645 646 /** 647 * qib_init - do the actual initialization sequence on the chip 648 * @dd: the qlogic_ib device 649 * @reinit: reinitializing, so don't allocate new memory 650 * 651 * Do the actual initialization sequence on the chip. This is done 652 * both from the init routine called from the PCI infrastructure, and 653 * when we reset the chip, or detect that it was reset internally, 654 * or it's administratively re-enabled. 655 * 656 * Memory allocation here and in called routines is only done in 657 * the first case (reinit == 0). We have to be careful, because even 658 * without memory allocation, we need to re-write all the chip registers 659 * TIDs, etc. after the reset or enable has completed. 660 */ 661 int qib_init(struct qib_devdata *dd, int reinit) 662 { 663 int ret = 0, pidx, lastfail = 0; 664 u32 portok = 0; 665 unsigned i; 666 struct qib_ctxtdata *rcd; 667 struct qib_pportdata *ppd; 668 unsigned long flags; 669 670 /* Set linkstate to unknown, so we can watch for a transition. 
*/ 671 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 672 ppd = dd->pport + pidx; 673 spin_lock_irqsave(&ppd->lflags_lock, flags); 674 ppd->lflags &= ~(QIBL_LINKACTIVE | QIBL_LINKARMED | 675 QIBL_LINKDOWN | QIBL_LINKINIT | 676 QIBL_LINKV); 677 spin_unlock_irqrestore(&ppd->lflags_lock, flags); 678 } 679 680 if (reinit) 681 ret = init_after_reset(dd); 682 else 683 ret = loadtime_init(dd); 684 if (ret) 685 goto done; 686 687 /* Bypass most chip-init, to get to device creation */ 688 if (qib_mini_init) 689 return 0; 690 691 ret = dd->f_late_initreg(dd); 692 if (ret) 693 goto done; 694 695 /* dd->rcd can be NULL if early init failed */ 696 for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { 697 /* 698 * Set up the (kernel) rcvhdr queue and egr TIDs. If doing 699 * re-init, the simplest way to handle this is to free 700 * existing, and re-allocate. 701 * Need to re-create rest of ctxt 0 ctxtdata as well. 702 */ 703 rcd = dd->rcd[i]; 704 if (!rcd) 705 continue; 706 707 lastfail = qib_create_rcvhdrq(dd, rcd); 708 if (!lastfail) 709 lastfail = qib_setup_eagerbufs(rcd); 710 if (lastfail) { 711 qib_dev_err(dd, 712 "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); 713 continue; 714 } 715 } 716 717 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 718 int mtu; 719 720 if (lastfail) 721 ret = lastfail; 722 ppd = dd->pport + pidx; 723 mtu = ib_mtu_enum_to_int(qib_ibmtu); 724 if (mtu == -1) { 725 mtu = QIB_DEFAULT_MTU; 726 qib_ibmtu = 0; /* don't leave invalid value */ 727 } 728 /* set max we can ever have for this driver load */ 729 ppd->init_ibmaxlen = min(mtu > 2048 ? 730 dd->piosize4k : dd->piosize2k, 731 dd->rcvegrbufsize + 732 (dd->rcvhdrentsize << 2)); 733 /* 734 * Have to initialize ibmaxlen, but this will normally 735 * change immediately in qib_set_mtu(). 
736 */ 737 ppd->ibmaxlen = ppd->init_ibmaxlen; 738 qib_set_mtu(ppd, mtu); 739 740 spin_lock_irqsave(&ppd->lflags_lock, flags); 741 ppd->lflags |= QIBL_IB_LINK_DISABLED; 742 spin_unlock_irqrestore(&ppd->lflags_lock, flags); 743 744 lastfail = dd->f_bringup_serdes(ppd); 745 if (lastfail) { 746 qib_devinfo(dd->pcidev, 747 "Failed to bringup IB port %u\n", ppd->port); 748 lastfail = -ENETDOWN; 749 continue; 750 } 751 752 portok++; 753 } 754 755 if (!portok) { 756 /* none of the ports initialized */ 757 if (!ret && lastfail) 758 ret = lastfail; 759 else if (!ret) 760 ret = -ENETDOWN; 761 /* but continue on, so we can debug cause */ 762 } 763 764 enable_chip(dd); 765 766 init_piobuf_state(dd); 767 768 done: 769 if (!ret) { 770 /* chip is OK for user apps; mark it as initialized */ 771 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 772 ppd = dd->pport + pidx; 773 /* 774 * Set status even if port serdes is not initialized 775 * so that diags will work. 776 */ 777 *ppd->statusp |= QIB_STATUS_CHIP_PRESENT | 778 QIB_STATUS_INITTED; 779 if (!ppd->link_speed_enabled) 780 continue; 781 if (dd->flags & QIB_HAS_SEND_DMA) 782 ret = qib_setup_sdma(ppd); 783 init_timer(&ppd->hol_timer); 784 ppd->hol_timer.function = qib_hol_event; 785 ppd->hol_timer.data = (unsigned long)ppd; 786 ppd->hol_state = QIB_HOL_UP; 787 } 788 789 /* now we can enable all interrupts from the chip */ 790 dd->f_set_intr_state(dd, 1); 791 792 /* 793 * Setup to verify we get an interrupt, and fallback 794 * to an alternate if necessary and possible. 795 */ 796 mod_timer(&dd->intrchk_timer, jiffies + HZ/2); 797 /* start stats retrieval timer */ 798 mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); 799 } 800 801 /* if ret is non-zero, we probably should do some cleanup here... */ 802 return ret; 803 } 804 805 /* 806 * These next two routines are placeholders in case we don't have per-arch 807 * code for controlling write combining. 
If explicit control of write 808 * combining is not available, performance will probably be awful. 809 */ 810 811 int __attribute__((weak)) qib_enable_wc(struct qib_devdata *dd) 812 { 813 return -EOPNOTSUPP; 814 } 815 816 void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd) 817 { 818 } 819 820 static inline struct qib_devdata *__qib_lookup(int unit) 821 { 822 return idr_find(&qib_unit_table, unit); 823 } 824 825 struct qib_devdata *qib_lookup(int unit) 826 { 827 struct qib_devdata *dd; 828 unsigned long flags; 829 830 spin_lock_irqsave(&qib_devs_lock, flags); 831 dd = __qib_lookup(unit); 832 spin_unlock_irqrestore(&qib_devs_lock, flags); 833 834 return dd; 835 } 836 837 /* 838 * Stop the timers during unit shutdown, or after an error late 839 * in initialization. 840 */ 841 static void qib_stop_timers(struct qib_devdata *dd) 842 { 843 struct qib_pportdata *ppd; 844 int pidx; 845 846 if (dd->stats_timer.data) { 847 del_timer_sync(&dd->stats_timer); 848 dd->stats_timer.data = 0; 849 } 850 if (dd->intrchk_timer.data) { 851 del_timer_sync(&dd->intrchk_timer); 852 dd->intrchk_timer.data = 0; 853 } 854 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 855 ppd = dd->pport + pidx; 856 if (ppd->hol_timer.data) 857 del_timer_sync(&ppd->hol_timer); 858 if (ppd->led_override_timer.data) { 859 del_timer_sync(&ppd->led_override_timer); 860 atomic_set(&ppd->led_override_timer_active, 0); 861 } 862 if (ppd->symerr_clear_timer.data) 863 del_timer_sync(&ppd->symerr_clear_timer); 864 } 865 } 866 867 /** 868 * qib_shutdown_device - shut down a device 869 * @dd: the qlogic_ib device 870 * 871 * This is called to make the device quiet when we are about to 872 * unload the driver, and also when the device is administratively 873 * disabled. It does not free any data structures. 
874 * Everything it does has to be setup again by qib_init(dd, 1) 875 */ 876 static void qib_shutdown_device(struct qib_devdata *dd) 877 { 878 struct qib_pportdata *ppd; 879 unsigned pidx; 880 881 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 882 ppd = dd->pport + pidx; 883 884 spin_lock_irq(&ppd->lflags_lock); 885 ppd->lflags &= ~(QIBL_LINKDOWN | QIBL_LINKINIT | 886 QIBL_LINKARMED | QIBL_LINKACTIVE | 887 QIBL_LINKV); 888 spin_unlock_irq(&ppd->lflags_lock); 889 *ppd->statusp &= ~(QIB_STATUS_IB_CONF | QIB_STATUS_IB_READY); 890 } 891 dd->flags &= ~QIB_INITTED; 892 893 /* mask interrupts, but not errors */ 894 dd->f_set_intr_state(dd, 0); 895 896 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 897 ppd = dd->pport + pidx; 898 dd->f_rcvctrl(ppd, QIB_RCVCTRL_TAILUPD_DIS | 899 QIB_RCVCTRL_CTXT_DIS | 900 QIB_RCVCTRL_INTRAVAIL_DIS | 901 QIB_RCVCTRL_PKEY_ENB, -1); 902 /* 903 * Gracefully stop all sends allowing any in progress to 904 * trickle out first. 905 */ 906 dd->f_sendctrl(ppd, QIB_SENDCTRL_CLEAR); 907 } 908 909 /* 910 * Enough for anything that's going to trickle out to have actually 911 * done so. 912 */ 913 udelay(20); 914 915 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 916 ppd = dd->pport + pidx; 917 dd->f_setextled(ppd, 0); /* make sure LEDs are off */ 918 919 if (dd->flags & QIB_HAS_SEND_DMA) 920 qib_teardown_sdma(ppd); 921 922 dd->f_sendctrl(ppd, QIB_SENDCTRL_AVAIL_DIS | 923 QIB_SENDCTRL_SEND_DIS); 924 /* 925 * Clear SerdesEnable. 926 * We can't count on interrupts since we are stopping. 
927 */ 928 dd->f_quiet_serdes(ppd); 929 930 if (ppd->qib_wq) { 931 destroy_workqueue(ppd->qib_wq); 932 ppd->qib_wq = NULL; 933 } 934 qib_free_pportdata(ppd); 935 } 936 937 } 938 939 /** 940 * qib_free_ctxtdata - free a context's allocated data 941 * @dd: the qlogic_ib device 942 * @rcd: the ctxtdata structure 943 * 944 * free up any allocated data for a context 945 * This should not touch anything that would affect a simultaneous 946 * re-allocation of context data, because it is called after qib_mutex 947 * is released (and can be called from reinit as well). 948 * It should never change any chip state, or global driver state. 949 */ 950 void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) 951 { 952 if (!rcd) 953 return; 954 955 if (rcd->rcvhdrq) { 956 dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, 957 rcd->rcvhdrq, rcd->rcvhdrq_phys); 958 rcd->rcvhdrq = NULL; 959 if (rcd->rcvhdrtail_kvaddr) { 960 dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, 961 rcd->rcvhdrtail_kvaddr, 962 rcd->rcvhdrqtailaddr_phys); 963 rcd->rcvhdrtail_kvaddr = NULL; 964 } 965 } 966 if (rcd->rcvegrbuf) { 967 unsigned e; 968 969 for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { 970 void *base = rcd->rcvegrbuf[e]; 971 size_t size = rcd->rcvegrbuf_size; 972 973 dma_free_coherent(&dd->pcidev->dev, size, 974 base, rcd->rcvegrbuf_phys[e]); 975 } 976 kfree(rcd->rcvegrbuf); 977 rcd->rcvegrbuf = NULL; 978 kfree(rcd->rcvegrbuf_phys); 979 rcd->rcvegrbuf_phys = NULL; 980 rcd->rcvegrbuf_chunks = 0; 981 } 982 983 kfree(rcd->tid_pg_list); 984 vfree(rcd->user_event_mask); 985 vfree(rcd->subctxt_uregbase); 986 vfree(rcd->subctxt_rcvegrbuf); 987 vfree(rcd->subctxt_rcvhdr_base); 988 #ifdef CONFIG_DEBUG_FS 989 kfree(rcd->opstats); 990 rcd->opstats = NULL; 991 #endif 992 kfree(rcd); 993 } 994 995 /* 996 * Perform a PIO buffer bandwidth write test, to verify proper system 997 * configuration. 
Even when all the setup calls work, occasionally 998 * BIOS or other issues can prevent write combining from working, or 999 * can cause other bandwidth problems to the chip. 1000 * 1001 * This test simply writes the same buffer over and over again, and 1002 * measures close to the peak bandwidth to the chip (not testing 1003 * data bandwidth to the wire). On chips that use an address-based 1004 * trigger to send packets to the wire, this is easy. On chips that 1005 * use a count to trigger, we want to make sure that the packet doesn't 1006 * go out on the wire, or trigger flow control checks. 1007 */ 1008 static void qib_verify_pioperf(struct qib_devdata *dd) 1009 { 1010 u32 pbnum, cnt, lcnt; 1011 u32 __iomem *piobuf; 1012 u32 *addr; 1013 u64 msecs, emsecs; 1014 1015 piobuf = dd->f_getsendbuf(dd->pport, 0ULL, &pbnum); 1016 if (!piobuf) { 1017 qib_devinfo(dd->pcidev, 1018 "No PIObufs for checking perf, skipping\n"); 1019 return; 1020 } 1021 1022 /* 1023 * Enough to give us a reasonable test, less than piobuf size, and 1024 * likely multiple of store buffer length. 1025 */ 1026 cnt = 1024; 1027 1028 addr = vmalloc(cnt); 1029 if (!addr) { 1030 qib_devinfo(dd->pcidev, 1031 "Couldn't get memory for checking PIO perf, skipping\n"); 1032 goto done; 1033 } 1034 1035 preempt_disable(); /* we want reasonably accurate elapsed time */ 1036 msecs = 1 + jiffies_to_msecs(jiffies); 1037 for (lcnt = 0; lcnt < 10000U; lcnt++) { 1038 /* wait until we cross msec boundary */ 1039 if (jiffies_to_msecs(jiffies) >= msecs) 1040 break; 1041 udelay(1); 1042 } 1043 1044 dd->f_set_armlaunch(dd, 0); 1045 1046 /* 1047 * length 0, no dwords actually sent 1048 */ 1049 writeq(0, piobuf); 1050 qib_flush_wc(); 1051 1052 /* 1053 * This is only roughly accurate, since even with preempt we 1054 * still take interrupts that could take a while. Running for 1055 * >= 5 msec seems to get us "close enough" to accurate values. 
1056 */ 1057 msecs = jiffies_to_msecs(jiffies); 1058 for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { 1059 qib_pio_copy(piobuf + 64, addr, cnt >> 2); 1060 emsecs = jiffies_to_msecs(jiffies) - msecs; 1061 } 1062 1063 /* 1 GiB/sec, slightly over IB SDR line rate */ 1064 if (lcnt < (emsecs * 1024U)) 1065 qib_dev_err(dd, 1066 "Performance problem: bandwidth to PIO buffers is only %u MiB/sec\n", 1067 lcnt / (u32) emsecs); 1068 1069 preempt_enable(); 1070 1071 vfree(addr); 1072 1073 done: 1074 /* disarm piobuf, so it's available again */ 1075 dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbnum)); 1076 qib_sendbuf_done(dd, pbnum); 1077 dd->f_set_armlaunch(dd, 1); 1078 } 1079 1080 void qib_free_devdata(struct qib_devdata *dd) 1081 { 1082 unsigned long flags; 1083 1084 spin_lock_irqsave(&qib_devs_lock, flags); 1085 idr_remove(&qib_unit_table, dd->unit); 1086 list_del(&dd->list); 1087 spin_unlock_irqrestore(&qib_devs_lock, flags); 1088 1089 #ifdef CONFIG_DEBUG_FS 1090 qib_dbg_ibdev_exit(&dd->verbs_dev); 1091 #endif 1092 free_percpu(dd->int_counter); 1093 rvt_dealloc_device(&dd->verbs_dev.rdi); 1094 } 1095 1096 u64 qib_int_counter(struct qib_devdata *dd) 1097 { 1098 int cpu; 1099 u64 int_counter = 0; 1100 1101 for_each_possible_cpu(cpu) 1102 int_counter += *per_cpu_ptr(dd->int_counter, cpu); 1103 return int_counter; 1104 } 1105 1106 u64 qib_sps_ints(void) 1107 { 1108 unsigned long flags; 1109 struct qib_devdata *dd; 1110 u64 sps_ints = 0; 1111 1112 spin_lock_irqsave(&qib_devs_lock, flags); 1113 list_for_each_entry(dd, &qib_dev_list, list) { 1114 sps_ints += qib_int_counter(dd); 1115 } 1116 spin_unlock_irqrestore(&qib_devs_lock, flags); 1117 return sps_ints; 1118 } 1119 1120 /* 1121 * Allocate our primary per-unit data structure. Must be done via verbs 1122 * allocator, because the verbs cleanup process both does cleanup and 1123 * free of the data structure. 1124 * "extra" is for chip-specific data. 1125 * 1126 * Use the idr mechanism to get a unit number for this unit. 
1127 */ 1128 struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) 1129 { 1130 unsigned long flags; 1131 struct qib_devdata *dd; 1132 int ret, nports; 1133 1134 /* extra is * number of ports */ 1135 nports = extra / sizeof(struct qib_pportdata); 1136 dd = (struct qib_devdata *)rvt_alloc_device(sizeof(*dd) + extra, 1137 nports); 1138 if (!dd) 1139 return ERR_PTR(-ENOMEM); 1140 1141 INIT_LIST_HEAD(&dd->list); 1142 1143 idr_preload(GFP_KERNEL); 1144 spin_lock_irqsave(&qib_devs_lock, flags); 1145 1146 ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT); 1147 if (ret >= 0) { 1148 dd->unit = ret; 1149 list_add(&dd->list, &qib_dev_list); 1150 } 1151 1152 spin_unlock_irqrestore(&qib_devs_lock, flags); 1153 idr_preload_end(); 1154 1155 if (ret < 0) { 1156 qib_early_err(&pdev->dev, 1157 "Could not allocate unit ID: error %d\n", -ret); 1158 goto bail; 1159 } 1160 dd->int_counter = alloc_percpu(u64); 1161 if (!dd->int_counter) { 1162 ret = -ENOMEM; 1163 qib_early_err(&pdev->dev, 1164 "Could not allocate per-cpu int_counter\n"); 1165 goto bail; 1166 } 1167 1168 if (!qib_cpulist_count) { 1169 u32 count = num_online_cpus(); 1170 1171 qib_cpulist = kzalloc(BITS_TO_LONGS(count) * 1172 sizeof(long), GFP_KERNEL); 1173 if (qib_cpulist) 1174 qib_cpulist_count = count; 1175 else 1176 qib_early_err(&pdev->dev, 1177 "Could not alloc cpulist info, cpu affinity might be wrong\n"); 1178 } 1179 #ifdef CONFIG_DEBUG_FS 1180 qib_dbg_ibdev_init(&dd->verbs_dev); 1181 #endif 1182 return dd; 1183 bail: 1184 if (!list_empty(&dd->list)) 1185 list_del_init(&dd->list); 1186 rvt_dealloc_device(&dd->verbs_dev.rdi); 1187 return ERR_PTR(ret); 1188 } 1189 1190 /* 1191 * Called from freeze mode handlers, and from PCI error 1192 * reporting code. Should be paranoid about state of 1193 * system and data structures. 
1194 */ 1195 void qib_disable_after_error(struct qib_devdata *dd) 1196 { 1197 if (dd->flags & QIB_INITTED) { 1198 u32 pidx; 1199 1200 dd->flags &= ~QIB_INITTED; 1201 if (dd->pport) 1202 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1203 struct qib_pportdata *ppd; 1204 1205 ppd = dd->pport + pidx; 1206 if (dd->flags & QIB_PRESENT) { 1207 qib_set_linkstate(ppd, 1208 QIB_IB_LINKDOWN_DISABLE); 1209 dd->f_setextled(ppd, 0); 1210 } 1211 *ppd->statusp &= ~QIB_STATUS_IB_READY; 1212 } 1213 } 1214 1215 /* 1216 * Mark as having had an error for driver, and also 1217 * for /sys and status word mapped to user programs. 1218 * This marks unit as not usable, until reset. 1219 */ 1220 if (dd->devstatusp) 1221 *dd->devstatusp |= QIB_STATUS_HWERROR; 1222 } 1223 1224 static void qib_remove_one(struct pci_dev *); 1225 static int qib_init_one(struct pci_dev *, const struct pci_device_id *); 1226 1227 #define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: " 1228 #define PFX QIB_DRV_NAME ": " 1229 1230 static const struct pci_device_id qib_pci_tbl[] = { 1231 { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_QLOGIC_IB_6120) }, 1232 { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7220) }, 1233 { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7322) }, 1234 { 0, } 1235 }; 1236 1237 MODULE_DEVICE_TABLE(pci, qib_pci_tbl); 1238 1239 static struct pci_driver qib_driver = { 1240 .name = QIB_DRV_NAME, 1241 .probe = qib_init_one, 1242 .remove = qib_remove_one, 1243 .id_table = qib_pci_tbl, 1244 .err_handler = &qib_pci_err_handler, 1245 }; 1246 1247 #ifdef CONFIG_INFINIBAND_QIB_DCA 1248 1249 static int qib_notify_dca(struct notifier_block *, unsigned long, void *); 1250 static struct notifier_block dca_notifier = { 1251 .notifier_call = qib_notify_dca, 1252 .next = NULL, 1253 .priority = 0 1254 }; 1255 1256 static int qib_notify_dca_device(struct device *device, void *data) 1257 { 1258 struct qib_devdata *dd = dev_get_drvdata(device); 1259 unsigned long event = *(unsigned 
long *)data; 1260 1261 return dd->f_notify_dca(dd, event); 1262 } 1263 1264 static int qib_notify_dca(struct notifier_block *nb, unsigned long event, 1265 void *p) 1266 { 1267 int rval; 1268 1269 rval = driver_for_each_device(&qib_driver.driver, NULL, 1270 &event, qib_notify_dca_device); 1271 return rval ? NOTIFY_BAD : NOTIFY_DONE; 1272 } 1273 1274 #endif 1275 1276 /* 1277 * Do all the generic driver unit- and chip-independent memory 1278 * allocation and initialization. 1279 */ 1280 static int __init qib_ib_init(void) 1281 { 1282 int ret; 1283 1284 ret = qib_dev_init(); 1285 if (ret) 1286 goto bail; 1287 1288 /* 1289 * These must be called before the driver is registered with 1290 * the PCI subsystem. 1291 */ 1292 idr_init(&qib_unit_table); 1293 1294 #ifdef CONFIG_INFINIBAND_QIB_DCA 1295 dca_register_notify(&dca_notifier); 1296 #endif 1297 #ifdef CONFIG_DEBUG_FS 1298 qib_dbg_init(); 1299 #endif 1300 ret = pci_register_driver(&qib_driver); 1301 if (ret < 0) { 1302 pr_err("Unable to register driver: error %d\n", -ret); 1303 goto bail_dev; 1304 } 1305 1306 /* not fatal if it doesn't work */ 1307 if (qib_init_qibfs()) 1308 pr_err("Unable to register ipathfs\n"); 1309 goto bail; /* all OK */ 1310 1311 bail_dev: 1312 #ifdef CONFIG_INFINIBAND_QIB_DCA 1313 dca_unregister_notify(&dca_notifier); 1314 #endif 1315 #ifdef CONFIG_DEBUG_FS 1316 qib_dbg_exit(); 1317 #endif 1318 idr_destroy(&qib_unit_table); 1319 qib_dev_cleanup(); 1320 bail: 1321 return ret; 1322 } 1323 1324 module_init(qib_ib_init); 1325 1326 /* 1327 * Do the non-unit driver cleanup, memory free, etc. at unload. 
1328 */ 1329 static void __exit qib_ib_cleanup(void) 1330 { 1331 int ret; 1332 1333 ret = qib_exit_qibfs(); 1334 if (ret) 1335 pr_err( 1336 "Unable to cleanup counter filesystem: error %d\n", 1337 -ret); 1338 1339 #ifdef CONFIG_INFINIBAND_QIB_DCA 1340 dca_unregister_notify(&dca_notifier); 1341 #endif 1342 pci_unregister_driver(&qib_driver); 1343 #ifdef CONFIG_DEBUG_FS 1344 qib_dbg_exit(); 1345 #endif 1346 1347 qib_cpulist_count = 0; 1348 kfree(qib_cpulist); 1349 1350 idr_destroy(&qib_unit_table); 1351 qib_dev_cleanup(); 1352 } 1353 1354 module_exit(qib_ib_cleanup); 1355 1356 /* this can only be called after a successful initialization */ 1357 static void cleanup_device_data(struct qib_devdata *dd) 1358 { 1359 int ctxt; 1360 int pidx; 1361 struct qib_ctxtdata **tmp; 1362 unsigned long flags; 1363 1364 /* users can't do anything more with chip */ 1365 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1366 if (dd->pport[pidx].statusp) 1367 *dd->pport[pidx].statusp &= ~QIB_STATUS_CHIP_PRESENT; 1368 1369 spin_lock(&dd->pport[pidx].cc_shadow_lock); 1370 1371 kfree(dd->pport[pidx].congestion_entries); 1372 dd->pport[pidx].congestion_entries = NULL; 1373 kfree(dd->pport[pidx].ccti_entries); 1374 dd->pport[pidx].ccti_entries = NULL; 1375 kfree(dd->pport[pidx].ccti_entries_shadow); 1376 dd->pport[pidx].ccti_entries_shadow = NULL; 1377 kfree(dd->pport[pidx].congestion_entries_shadow); 1378 dd->pport[pidx].congestion_entries_shadow = NULL; 1379 1380 spin_unlock(&dd->pport[pidx].cc_shadow_lock); 1381 } 1382 1383 qib_disable_wc(dd); 1384 1385 if (dd->pioavailregs_dma) { 1386 dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, 1387 (void *) dd->pioavailregs_dma, 1388 dd->pioavailregs_phys); 1389 dd->pioavailregs_dma = NULL; 1390 } 1391 1392 if (dd->pageshadow) { 1393 struct page **tmpp = dd->pageshadow; 1394 dma_addr_t *tmpd = dd->physshadow; 1395 int i; 1396 1397 for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) { 1398 int ctxt_tidbase = ctxt * dd->rcvtidcnt; 1399 int maxtid = ctxt_tidbase + 
dd->rcvtidcnt; 1400 1401 for (i = ctxt_tidbase; i < maxtid; i++) { 1402 if (!tmpp[i]) 1403 continue; 1404 pci_unmap_page(dd->pcidev, tmpd[i], 1405 PAGE_SIZE, PCI_DMA_FROMDEVICE); 1406 qib_release_user_pages(&tmpp[i], 1); 1407 tmpp[i] = NULL; 1408 } 1409 } 1410 1411 dd->pageshadow = NULL; 1412 vfree(tmpp); 1413 dd->physshadow = NULL; 1414 vfree(tmpd); 1415 } 1416 1417 /* 1418 * Free any resources still in use (usually just kernel contexts) 1419 * at unload; we do for ctxtcnt, because that's what we allocate. 1420 * We acquire lock to be really paranoid that rcd isn't being 1421 * accessed from some interrupt-related code (that should not happen, 1422 * but best to be sure). 1423 */ 1424 spin_lock_irqsave(&dd->uctxt_lock, flags); 1425 tmp = dd->rcd; 1426 dd->rcd = NULL; 1427 spin_unlock_irqrestore(&dd->uctxt_lock, flags); 1428 for (ctxt = 0; tmp && ctxt < dd->ctxtcnt; ctxt++) { 1429 struct qib_ctxtdata *rcd = tmp[ctxt]; 1430 1431 tmp[ctxt] = NULL; /* debugging paranoia */ 1432 qib_free_ctxtdata(dd, rcd); 1433 } 1434 kfree(tmp); 1435 kfree(dd->boardname); 1436 } 1437 1438 /* 1439 * Clean up on unit shutdown, or error during unit load after 1440 * successful initialization. 1441 */ 1442 static void qib_postinit_cleanup(struct qib_devdata *dd) 1443 { 1444 /* 1445 * Clean up chip-specific stuff. 1446 * We check for NULL here, because it's outside 1447 * the kregbase check, and we need to call it 1448 * after the free_irq. Thus it's possible that 1449 * the function pointers were never initialized. 
1450 */ 1451 if (dd->f_cleanup) 1452 dd->f_cleanup(dd); 1453 1454 qib_pcie_ddcleanup(dd); 1455 1456 cleanup_device_data(dd); 1457 1458 qib_free_devdata(dd); 1459 } 1460 1461 static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) 1462 { 1463 int ret, j, pidx, initfail; 1464 struct qib_devdata *dd = NULL; 1465 1466 ret = qib_pcie_init(pdev, ent); 1467 if (ret) 1468 goto bail; 1469 1470 /* 1471 * Do device-specific initialiation, function table setup, dd 1472 * allocation, etc. 1473 */ 1474 switch (ent->device) { 1475 case PCI_DEVICE_ID_QLOGIC_IB_6120: 1476 #ifdef CONFIG_PCI_MSI 1477 dd = qib_init_iba6120_funcs(pdev, ent); 1478 #else 1479 qib_early_err(&pdev->dev, 1480 "Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n", 1481 ent->device); 1482 dd = ERR_PTR(-ENODEV); 1483 #endif 1484 break; 1485 1486 case PCI_DEVICE_ID_QLOGIC_IB_7220: 1487 dd = qib_init_iba7220_funcs(pdev, ent); 1488 break; 1489 1490 case PCI_DEVICE_ID_QLOGIC_IB_7322: 1491 dd = qib_init_iba7322_funcs(pdev, ent); 1492 break; 1493 1494 default: 1495 qib_early_err(&pdev->dev, 1496 "Failing on unknown Intel deviceid 0x%x\n", 1497 ent->device); 1498 ret = -ENODEV; 1499 } 1500 1501 if (IS_ERR(dd)) 1502 ret = PTR_ERR(dd); 1503 if (ret) 1504 goto bail; /* error already printed */ 1505 1506 ret = qib_create_workqueues(dd); 1507 if (ret) 1508 goto bail; 1509 1510 /* do the generic initialization */ 1511 initfail = qib_init(dd, 0); 1512 1513 ret = qib_register_ib_device(dd); 1514 1515 /* 1516 * Now ready for use. this should be cleared whenever we 1517 * detect a reset, or initiate one. If earlier failure, 1518 * we still create devices, so diags, etc. can be used 1519 * to determine cause of problem. 
1520 */ 1521 if (!qib_mini_init && !initfail && !ret) 1522 dd->flags |= QIB_INITTED; 1523 1524 j = qib_device_create(dd); 1525 if (j) 1526 qib_dev_err(dd, "Failed to create /dev devices: %d\n", -j); 1527 j = qibfs_add(dd); 1528 if (j) 1529 qib_dev_err(dd, "Failed filesystem setup for counters: %d\n", 1530 -j); 1531 1532 if (qib_mini_init || initfail || ret) { 1533 qib_stop_timers(dd); 1534 flush_workqueue(ib_wq); 1535 for (pidx = 0; pidx < dd->num_pports; ++pidx) 1536 dd->f_quiet_serdes(dd->pport + pidx); 1537 if (qib_mini_init) 1538 goto bail; 1539 if (!j) { 1540 (void) qibfs_remove(dd); 1541 qib_device_remove(dd); 1542 } 1543 if (!ret) 1544 qib_unregister_ib_device(dd); 1545 qib_postinit_cleanup(dd); 1546 if (initfail) 1547 ret = initfail; 1548 goto bail; 1549 } 1550 1551 ret = qib_enable_wc(dd); 1552 if (ret) { 1553 qib_dev_err(dd, 1554 "Write combining not enabled (err %d): performance may be poor\n", 1555 -ret); 1556 ret = 0; 1557 } 1558 1559 qib_verify_pioperf(dd); 1560 bail: 1561 return ret; 1562 } 1563 1564 static void qib_remove_one(struct pci_dev *pdev) 1565 { 1566 struct qib_devdata *dd = pci_get_drvdata(pdev); 1567 int ret; 1568 1569 /* unregister from IB core */ 1570 qib_unregister_ib_device(dd); 1571 1572 /* 1573 * Disable the IB link, disable interrupts on the device, 1574 * clear dma engines, etc. 
1575 */ 1576 if (!qib_mini_init) 1577 qib_shutdown_device(dd); 1578 1579 qib_stop_timers(dd); 1580 1581 /* wait until all of our (qsfp) queue_work() calls complete */ 1582 flush_workqueue(ib_wq); 1583 1584 ret = qibfs_remove(dd); 1585 if (ret) 1586 qib_dev_err(dd, "Failed counters filesystem cleanup: %d\n", 1587 -ret); 1588 1589 qib_device_remove(dd); 1590 1591 qib_postinit_cleanup(dd); 1592 } 1593 1594 /** 1595 * qib_create_rcvhdrq - create a receive header queue 1596 * @dd: the qlogic_ib device 1597 * @rcd: the context data 1598 * 1599 * This must be contiguous memory (from an i/o perspective), and must be 1600 * DMA'able (which means for some systems, it will go through an IOMMU, 1601 * or be forced into a low address range). 1602 */ 1603 int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd) 1604 { 1605 unsigned amt; 1606 int old_node_id; 1607 1608 if (!rcd->rcvhdrq) { 1609 dma_addr_t phys_hdrqtail; 1610 gfp_t gfp_flags; 1611 1612 amt = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize * 1613 sizeof(u32), PAGE_SIZE); 1614 gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ? 
1615 GFP_USER : GFP_KERNEL; 1616 1617 old_node_id = dev_to_node(&dd->pcidev->dev); 1618 set_dev_node(&dd->pcidev->dev, rcd->node_id); 1619 rcd->rcvhdrq = dma_alloc_coherent( 1620 &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys, 1621 gfp_flags | __GFP_COMP); 1622 set_dev_node(&dd->pcidev->dev, old_node_id); 1623 1624 if (!rcd->rcvhdrq) { 1625 qib_dev_err(dd, 1626 "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", 1627 amt, rcd->ctxt); 1628 goto bail; 1629 } 1630 1631 if (rcd->ctxt >= dd->first_user_ctxt) { 1632 rcd->user_event_mask = vmalloc_user(PAGE_SIZE); 1633 if (!rcd->user_event_mask) 1634 goto bail_free_hdrq; 1635 } 1636 1637 if (!(dd->flags & QIB_NODMA_RTAIL)) { 1638 set_dev_node(&dd->pcidev->dev, rcd->node_id); 1639 rcd->rcvhdrtail_kvaddr = dma_alloc_coherent( 1640 &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, 1641 gfp_flags); 1642 set_dev_node(&dd->pcidev->dev, old_node_id); 1643 if (!rcd->rcvhdrtail_kvaddr) 1644 goto bail_free; 1645 rcd->rcvhdrqtailaddr_phys = phys_hdrqtail; 1646 } 1647 1648 rcd->rcvhdrq_size = amt; 1649 } 1650 1651 /* clear for security and sanity on each use */ 1652 memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size); 1653 if (rcd->rcvhdrtail_kvaddr) 1654 memset(rcd->rcvhdrtail_kvaddr, 0, PAGE_SIZE); 1655 return 0; 1656 1657 bail_free: 1658 qib_dev_err(dd, 1659 "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", 1660 rcd->ctxt); 1661 vfree(rcd->user_event_mask); 1662 rcd->user_event_mask = NULL; 1663 bail_free_hdrq: 1664 dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, 1665 rcd->rcvhdrq_phys); 1666 rcd->rcvhdrq = NULL; 1667 bail: 1668 return -ENOMEM; 1669 } 1670 1671 /** 1672 * allocate eager buffers, both kernel and user contexts. 1673 * @rcd: the context we are setting up. 1674 * 1675 * Allocate the eager TID buffers and program them into hip. 1676 * They are no longer completely contiguous, we do multiple allocation 1677 * calls. 
Otherwise we get the OOM code involved, by asking for too 1678 * much per call, with disastrous results on some kernels. 1679 */ 1680 int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) 1681 { 1682 struct qib_devdata *dd = rcd->dd; 1683 unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; 1684 size_t size; 1685 gfp_t gfp_flags; 1686 int old_node_id; 1687 1688 /* 1689 * GFP_USER, but without GFP_FS, so buffer cache can be 1690 * coalesced (we hope); otherwise, even at order 4, 1691 * heavy filesystem activity makes these fail, and we can 1692 * use compound pages. 1693 */ 1694 gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; 1695 1696 egrcnt = rcd->rcvegrcnt; 1697 egroff = rcd->rcvegr_tid_base; 1698 egrsize = dd->rcvegrbufsize; 1699 1700 chunk = rcd->rcvegrbuf_chunks; 1701 egrperchunk = rcd->rcvegrbufs_perchunk; 1702 size = rcd->rcvegrbuf_size; 1703 if (!rcd->rcvegrbuf) { 1704 rcd->rcvegrbuf = 1705 kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]), 1706 GFP_KERNEL, rcd->node_id); 1707 if (!rcd->rcvegrbuf) 1708 goto bail; 1709 } 1710 if (!rcd->rcvegrbuf_phys) { 1711 rcd->rcvegrbuf_phys = 1712 kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), 1713 GFP_KERNEL, rcd->node_id); 1714 if (!rcd->rcvegrbuf_phys) 1715 goto bail_rcvegrbuf; 1716 } 1717 for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { 1718 if (rcd->rcvegrbuf[e]) 1719 continue; 1720 1721 old_node_id = dev_to_node(&dd->pcidev->dev); 1722 set_dev_node(&dd->pcidev->dev, rcd->node_id); 1723 rcd->rcvegrbuf[e] = 1724 dma_alloc_coherent(&dd->pcidev->dev, size, 1725 &rcd->rcvegrbuf_phys[e], 1726 gfp_flags); 1727 set_dev_node(&dd->pcidev->dev, old_node_id); 1728 if (!rcd->rcvegrbuf[e]) 1729 goto bail_rcvegrbuf_phys; 1730 } 1731 1732 rcd->rcvegr_phys = rcd->rcvegrbuf_phys[0]; 1733 1734 for (e = chunk = 0; chunk < rcd->rcvegrbuf_chunks; chunk++) { 1735 dma_addr_t pa = rcd->rcvegrbuf_phys[chunk]; 1736 unsigned i; 1737 1738 /* clear for security and sanity on each use */ 1739 memset(rcd->rcvegrbuf[chunk], 0, size); 1740 1741 
for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { 1742 dd->f_put_tid(dd, e + egroff + 1743 (u64 __iomem *) 1744 ((char __iomem *) 1745 dd->kregbase + 1746 dd->rcvegrbase), 1747 RCVHQ_RCV_TYPE_EAGER, pa); 1748 pa += egrsize; 1749 } 1750 cond_resched(); /* don't hog the cpu */ 1751 } 1752 1753 return 0; 1754 1755 bail_rcvegrbuf_phys: 1756 for (e = 0; e < rcd->rcvegrbuf_chunks && rcd->rcvegrbuf[e]; e++) 1757 dma_free_coherent(&dd->pcidev->dev, size, 1758 rcd->rcvegrbuf[e], rcd->rcvegrbuf_phys[e]); 1759 kfree(rcd->rcvegrbuf_phys); 1760 rcd->rcvegrbuf_phys = NULL; 1761 bail_rcvegrbuf: 1762 kfree(rcd->rcvegrbuf); 1763 rcd->rcvegrbuf = NULL; 1764 bail: 1765 return -ENOMEM; 1766 } 1767 1768 /* 1769 * Note: Changes to this routine should be mirrored 1770 * for the diagnostics routine qib_remap_ioaddr32(). 1771 * There is also related code for VL15 buffers in qib_init_7322_variables(). 1772 * The teardown code that unmaps is in qib_pcie_ddcleanup() 1773 */ 1774 int init_chip_wc_pat(struct qib_devdata *dd, u32 vl15buflen) 1775 { 1776 u64 __iomem *qib_kregbase = NULL; 1777 void __iomem *qib_piobase = NULL; 1778 u64 __iomem *qib_userbase = NULL; 1779 u64 qib_kreglen; 1780 u64 qib_pio2koffset = dd->piobufbase & 0xffffffff; 1781 u64 qib_pio4koffset = dd->piobufbase >> 32; 1782 u64 qib_pio2klen = dd->piobcnt2k * dd->palign; 1783 u64 qib_pio4klen = dd->piobcnt4k * dd->align4k; 1784 u64 qib_physaddr = dd->physaddr; 1785 u64 qib_piolen; 1786 u64 qib_userlen = 0; 1787 1788 /* 1789 * Free the old mapping because the kernel will try to reuse the 1790 * old mapping and not create a new mapping with the 1791 * write combining attribute. 
1792 */ 1793 iounmap(dd->kregbase); 1794 dd->kregbase = NULL; 1795 1796 /* 1797 * Assumes chip address space looks like: 1798 * - kregs + sregs + cregs + uregs (in any order) 1799 * - piobufs (2K and 4K bufs in either order) 1800 * or: 1801 * - kregs + sregs + cregs (in any order) 1802 * - piobufs (2K and 4K bufs in either order) 1803 * - uregs 1804 */ 1805 if (dd->piobcnt4k == 0) { 1806 qib_kreglen = qib_pio2koffset; 1807 qib_piolen = qib_pio2klen; 1808 } else if (qib_pio2koffset < qib_pio4koffset) { 1809 qib_kreglen = qib_pio2koffset; 1810 qib_piolen = qib_pio4koffset + qib_pio4klen - qib_kreglen; 1811 } else { 1812 qib_kreglen = qib_pio4koffset; 1813 qib_piolen = qib_pio2koffset + qib_pio2klen - qib_kreglen; 1814 } 1815 qib_piolen += vl15buflen; 1816 /* Map just the configured ports (not all hw ports) */ 1817 if (dd->uregbase > qib_kreglen) 1818 qib_userlen = dd->ureg_align * dd->cfgctxts; 1819 1820 /* Sanity checks passed, now create the new mappings */ 1821 qib_kregbase = ioremap_nocache(qib_physaddr, qib_kreglen); 1822 if (!qib_kregbase) 1823 goto bail; 1824 1825 qib_piobase = ioremap_wc(qib_physaddr + qib_kreglen, qib_piolen); 1826 if (!qib_piobase) 1827 goto bail_kregbase; 1828 1829 if (qib_userlen) { 1830 qib_userbase = ioremap_nocache(qib_physaddr + dd->uregbase, 1831 qib_userlen); 1832 if (!qib_userbase) 1833 goto bail_piobase; 1834 } 1835 1836 dd->kregbase = qib_kregbase; 1837 dd->kregend = (u64 __iomem *) 1838 ((char __iomem *) qib_kregbase + qib_kreglen); 1839 dd->piobase = qib_piobase; 1840 dd->pio2kbase = (void __iomem *) 1841 (((char __iomem *) dd->piobase) + 1842 qib_pio2koffset - qib_kreglen); 1843 if (dd->piobcnt4k) 1844 dd->pio4kbase = (void __iomem *) 1845 (((char __iomem *) dd->piobase) + 1846 qib_pio4koffset - qib_kreglen); 1847 if (qib_userlen) 1848 /* ureg will now be accessed relative to dd->userbase */ 1849 dd->userbase = qib_userbase; 1850 return 0; 1851 1852 bail_piobase: 1853 iounmap(qib_piobase); 1854 bail_kregbase: 1855 
iounmap(qib_kregbase); 1856 bail: 1857 return -ENOMEM; 1858 } 1859