1 /* 2 * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. 3 * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. 4 * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 33 */ 34 35 #include <linux/pci.h> 36 #include <linux/netdevice.h> 37 #include <linux/vmalloc.h> 38 #include <linux/delay.h> 39 #include <linux/idr.h> 40 #include <linux/module.h> 41 #include <linux/printk.h> 42 #ifdef CONFIG_INFINIBAND_QIB_DCA 43 #include <linux/dca.h> 44 #endif 45 46 #include "qib.h" 47 #include "qib_common.h" 48 #include "qib_mad.h" 49 #ifdef CONFIG_DEBUG_FS 50 #include "qib_debugfs.h" 51 #include "qib_verbs.h" 52 #endif 53 54 #undef pr_fmt 55 #define pr_fmt(fmt) QIB_DRV_NAME ": " fmt 56 57 /* 58 * min buffers we want to have per context, after driver 59 */ 60 #define QIB_MIN_USER_CTXT_BUFCNT 7 61 62 #define QLOGIC_IB_R_SOFTWARE_MASK 0xFF 63 #define QLOGIC_IB_R_SOFTWARE_SHIFT 24 64 #define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62) 65 66 /* 67 * Number of ctxts we are configured to use (to allow for more pio 68 * buffers per ctxt, etc.) Zero means use chip value. 69 */ 70 ushort qib_cfgctxts; 71 module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO); 72 MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use"); 73 74 unsigned qib_numa_aware; 75 module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO); 76 MODULE_PARM_DESC(numa_aware, 77 "0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process"); 78 79 /* 80 * If set, do not write to any regs if avoidable, hack to allow 81 * check for deranged default register values. 
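 * (This is exposed below as the mini_init module parameter and is intended
 * for diagnostics only.)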
82 */ 83 ushort qib_mini_init; 84 module_param_named(mini_init, qib_mini_init, ushort, S_IRUGO); 85 MODULE_PARM_DESC(mini_init, "If set, do minimal diag init"); 86 87 unsigned qib_n_krcv_queues; 88 module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO); 89 MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port"); 90 91 unsigned qib_cc_table_size; 92 module_param_named(cc_table_size, qib_cc_table_size, uint, S_IRUGO); 93 MODULE_PARM_DESC(cc_table_size, "Congestion control table entries 0 (CCA disabled - default), min = 128, max = 1984"); 94 /* 95 * qib_wc_pat parameter: 96 * 0 is WC via MTRR 97 * 1 is WC via PAT 98 * If PAT initialization fails, code reverts back to MTRR 99 */ 100 unsigned qib_wc_pat = 1; /* default (1) is to use PAT, not MTRR */ 101 module_param_named(wc_pat, qib_wc_pat, uint, S_IRUGO); 102 MODULE_PARM_DESC(wc_pat, "enable write-combining via PAT mechanism"); 103 104 static void verify_interrupt(unsigned long); 105 106 static struct idr qib_unit_table; 107 u32 qib_cpulist_count; 108 unsigned long *qib_cpulist; 109 110 /* set number of contexts we'll actually use */ 111 void qib_set_ctxtcnt(struct qib_devdata *dd) 112 { 113 if (!qib_cfgctxts) { 114 dd->cfgctxts = dd->first_user_ctxt + num_online_cpus(); 115 if (dd->cfgctxts > dd->ctxtcnt) 116 dd->cfgctxts = dd->ctxtcnt; 117 } else if (qib_cfgctxts < dd->num_pports) 118 dd->cfgctxts = dd->ctxtcnt; 119 else if (qib_cfgctxts <= dd->ctxtcnt) 120 dd->cfgctxts = qib_cfgctxts; 121 else 122 dd->cfgctxts = dd->ctxtcnt; 123 dd->freectxts = (dd->first_user_ctxt > dd->cfgctxts) ? 0 : 124 dd->cfgctxts - dd->first_user_ctxt; 125 } 126 127 /* 128 * Common code for creating the receive context array. 129 */ 130 int qib_create_ctxts(struct qib_devdata *dd) 131 { 132 unsigned i; 133 int local_node_id = pcibus_to_node(dd->pcidev->bus); 134 135 if (local_node_id < 0) 136 local_node_id = numa_node_id(); 137 dd->assigned_node_id = local_node_id; 138 139 /* 140 * Allocate full ctxtcnt array, rather than just cfgctxts, because 141 * cleanup iterates across all possible ctxts. 142 */ 143 dd->rcd = kzalloc(sizeof(*dd->rcd) * dd->ctxtcnt, GFP_KERNEL); 144 if (!dd->rcd) { 145 qib_dev_err(dd, 146 "Unable to allocate ctxtdata array, failing\n"); 147 return -ENOMEM; 148 } 149 150 /* create (one or more) kctxt */ 151 for (i = 0; i < dd->first_user_ctxt; ++i) { 152 struct qib_pportdata *ppd; 153 struct qib_ctxtdata *rcd; 154 155 if (dd->skip_kctxt_mask & (1 << i)) 156 continue; 157 158 ppd = dd->pport + (i % dd->num_pports); 159 160 rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id); 161 if (!rcd) { 162 qib_dev_err(dd, 163 "Unable to allocate ctxtdata for Kernel ctxt, failing\n"); 164 kfree(dd->rcd); 165 dd->rcd = NULL; 166 return -ENOMEM; 167 } 168 rcd->pkeys[0] = QIB_DEFAULT_P_KEY; 169 rcd->seq_cnt = 1; 170 } 171 return 0; 172 } 173 174 /* 175 * Common code for user and kernel context setup. 
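 * Returns the new qib_ctxtdata on success, or NULL if the allocation fails.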
176 */ 177 struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt, 178 int node_id) 179 { 180 struct qib_devdata *dd = ppd->dd; 181 struct qib_ctxtdata *rcd; 182 183 rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id); 184 if (rcd) { 185 INIT_LIST_HEAD(&rcd->qp_wait_list); 186 rcd->node_id = node_id; 187 rcd->ppd = ppd; 188 rcd->dd = dd; 189 rcd->cnt = 1; 190 rcd->ctxt = ctxt; 191 dd->rcd[ctxt] = rcd; 192 #ifdef CONFIG_DEBUG_FS 193 if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */ 194 rcd->opstats = kzalloc_node(sizeof(*rcd->opstats), 195 GFP_KERNEL, node_id); 196 if (!rcd->opstats) { 197 kfree(rcd); 198 qib_dev_err(dd, 199 "Unable to allocate per ctxt stats buffer\n"); 200 return NULL; 201 } 202 } 203 #endif 204 dd->f_init_ctxt(rcd); 205 206 /* 207 * To avoid wasting a lot of memory, we allocate 32KB chunks 208 * of physically contiguous memory, advance through it until 209 * used up and then allocate more. Of course, we need 210 * memory to store those extra pointers, now. 32KB seems to 211 * be the most that is "safe" under memory pressure 212 * (creating large files and then copying them over 213 * NFS while doing lots of MPI jobs). The OOM killer can 214 * get invoked, even though we say we can sleep and this can 215 * cause significant system problems.... 216 */ 217 rcd->rcvegrbuf_size = 0x8000; 218 rcd->rcvegrbufs_perchunk = 219 rcd->rcvegrbuf_size / dd->rcvegrbufsize; 220 rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt + 221 rcd->rcvegrbufs_perchunk - 1) / 222 rcd->rcvegrbufs_perchunk; 223 BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk)); 224 rcd->rcvegrbufs_perchunk_shift = 225 ilog2(rcd->rcvegrbufs_perchunk); 226 } 227 return rcd; 228 } 229 230 /* 231 * Common code for initializing the physical port structure. 232 */ 233 int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd, 234 u8 hw_pidx, u8 port) 235 { 236 int size; 237 ppd->dd = dd; 238 ppd->hw_pidx = hw_pidx; 239 ppd->port = port; /* IB port number, not index */ 240 241 spin_lock_init(&ppd->sdma_lock); 242 spin_lock_init(&ppd->lflags_lock); 243 spin_lock_init(&ppd->cc_shadow_lock); 244 init_waitqueue_head(&ppd->state_wait); 245 246 init_timer(&ppd->symerr_clear_timer); 247 ppd->symerr_clear_timer.function = qib_clear_symerror_on_linkup; 248 ppd->symerr_clear_timer.data = (unsigned long)ppd; 249 250 ppd->qib_wq = NULL; 251 ppd->ibport_data.pmastats = 252 alloc_percpu(struct qib_pma_counters); 253 if (!ppd->ibport_data.pmastats) 254 return -ENOMEM; 255 256 if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) 257 goto bail; 258 259 ppd->cc_supported_table_entries = min(max_t(int, qib_cc_table_size, 260 IB_CCT_MIN_ENTRIES), IB_CCT_ENTRIES*IB_CC_TABLE_CAP_DEFAULT); 261 262 ppd->cc_max_table_entries = 263 ppd->cc_supported_table_entries/IB_CCT_ENTRIES; 264 265 size = IB_CC_TABLE_CAP_DEFAULT * sizeof(struct ib_cc_table_entry) 266 * IB_CCT_ENTRIES; 267 ppd->ccti_entries = kzalloc(size, GFP_KERNEL); 268 if (!ppd->ccti_entries) { 269 qib_dev_err(dd, 270 "failed to allocate congestion control table for port %d!\n", 271 port); 272 goto bail; 273 } 274 275 size = IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry); 276 ppd->congestion_entries = kzalloc(size, GFP_KERNEL); 277 if (!ppd->congestion_entries) { 278 qib_dev_err(dd, 279 "failed to allocate congestion setting list for port %d!\n", 280 port); 281 goto bail_1; 282 } 283 284 size = sizeof(struct cc_table_shadow); 285 ppd->ccti_entries_shadow = kzalloc(size, GFP_KERNEL); 286 if (!ppd->ccti_entries_shadow) { 287 qib_dev_err(dd, 288 "failed 
to allocate shadow ccti list for port %d!\n", 289 port); 290 goto bail_2; 291 } 292 293 size = sizeof(struct ib_cc_congestion_setting_attr); 294 ppd->congestion_entries_shadow = kzalloc(size, GFP_KERNEL); 295 if (!ppd->congestion_entries_shadow) { 296 qib_dev_err(dd, 297 "failed to allocate shadow congestion setting list for port %d!\n", 298 port); 299 goto bail_3; 300 } 301 302 return 0; 303 304 bail_3: 305 kfree(ppd->ccti_entries_shadow); 306 ppd->ccti_entries_shadow = NULL; 307 bail_2: 308 kfree(ppd->congestion_entries); 309 ppd->congestion_entries = NULL; 310 bail_1: 311 kfree(ppd->ccti_entries); 312 ppd->ccti_entries = NULL; 313 bail: 314 /* User is intentionally disabling the congestion control agent */ 315 if (!qib_cc_table_size) 316 return 0; 317 318 if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) { 319 qib_cc_table_size = 0; 320 qib_dev_err(dd, 321 "Congestion Control table size %d less than minimum %d for port %d\n", 322 qib_cc_table_size, IB_CCT_MIN_ENTRIES, port); 323 } 324 325 qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n", 326 port); 327 return 0; 328 } 329 330 static int init_pioavailregs(struct qib_devdata *dd) 331 { 332 int ret, pidx; 333 u64 *status_page; 334 335 dd->pioavailregs_dma = dma_alloc_coherent( 336 &dd->pcidev->dev, PAGE_SIZE, &dd->pioavailregs_phys, 337 GFP_KERNEL); 338 if (!dd->pioavailregs_dma) { 339 qib_dev_err(dd, 340 "failed to allocate PIOavail reg area in memory\n"); 341 ret = -ENOMEM; 342 goto done; 343 } 344 345 /* 346 * We really want L2 cache aligned, but for current CPUs of 347 * interest, they are the same. 348 */ 349 status_page = (u64 *) 350 ((char *) dd->pioavailregs_dma + 351 ((2 * L1_CACHE_BYTES + 352 dd->pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); 353 /* device status comes first, for backwards compatibility */ 354 dd->devstatusp = status_page; 355 *status_page++ = 0; 356 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 357 dd->pport[pidx].statusp = status_page; 358 *status_page++ = 0; 359 } 360 361 /* 362 * Setup buffer to hold freeze and other messages, accessible to 363 * apps, following statusp. This is per-unit, not per port. 364 */ 365 dd->freezemsg = (char *) status_page; 366 *dd->freezemsg = 0; 367 /* length of msg buffer is "whatever is left" */ 368 ret = (char *) status_page - (char *) dd->pioavailregs_dma; 369 dd->freezelen = PAGE_SIZE - ret; 370 371 ret = 0; 372 373 done: 374 return ret; 375 } 376 377 /** 378 * init_shadow_tids - allocate the shadow TID array 379 * @dd: the qlogic_ib device 380 * 381 * allocate the shadow TID array, so we can qib_munlock previous 382 * entries. It may make more sense to move the pageshadow to the 383 * ctxt data structure, so we only allocate memory for ctxts actually 384 * in use, since we at 8k per ctxt, now. 385 * We don't want failures here to prevent use of the driver/chip, 386 * so no return value. 
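 * On any failure, dd->pageshadow is simply left NULL and expected sends
 * are not used.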
 */
static void init_shadow_tids(struct qib_devdata *dd)
{
	struct page **pages;
	dma_addr_t *addrs;

	pages = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(struct page *));
	if (!pages) {
		qib_dev_err(dd,
			"failed to allocate shadow page * array, no expected sends!\n");
		goto bail;
	}

	addrs = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(dma_addr_t));
	if (!addrs) {
		qib_dev_err(dd,
			"failed to allocate shadow dma handle array, no expected sends!\n");
		goto bail_free;
	}

	dd->pageshadow = pages;
	dd->physshadow = addrs;
	return;

bail_free:
	vfree(pages);
bail:
	dd->pageshadow = NULL;
}

/*
 * Do initialization for device that is only needed on
 * first detect, not on resets.
 */
static int loadtime_init(struct qib_devdata *dd)
{
	int ret = 0;

	if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) &
	     QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) {
		qib_dev_err(dd,
			"Driver only handles version %d, chip swversion is %d (%llx), failing\n",
			QIB_CHIP_SWVERSION,
			(int)(dd->revision >>
				QLOGIC_IB_R_SOFTWARE_SHIFT) &
				QLOGIC_IB_R_SOFTWARE_MASK,
			(unsigned long long) dd->revision);
		ret = -ENOSYS;
		goto done;
	}

	if (dd->revision & QLOGIC_IB_R_EMULATOR_MASK)
		qib_devinfo(dd->pcidev, "%s", dd->boardversion);

	spin_lock_init(&dd->pioavail_lock);
	spin_lock_init(&dd->sendctrl_lock);
	spin_lock_init(&dd->uctxt_lock);
	spin_lock_init(&dd->qib_diag_trans_lock);
	spin_lock_init(&dd->eep_st_lock);
	mutex_init(&dd->eep_lock);

	if (qib_mini_init)
		goto done;

	ret = init_pioavailregs(dd);
	init_shadow_tids(dd);

	qib_get_eeprom_info(dd);

	/* setup time (don't start yet) to verify we got interrupt */
	init_timer(&dd->intrchk_timer);
	dd->intrchk_timer.function = verify_interrupt;
	dd->intrchk_timer.data = (unsigned long) dd;

	ret = qib_cq_init(dd);
done:
	return ret;
}

/**
 * init_after_reset - re-initialize after a reset
 * @dd: the qlogic_ib device
 *
 * sanity check at least some of the values after reset, and
 * ensure no receive or transmit (explicitly, in case reset failed)
 */
static int init_after_reset(struct qib_devdata *dd)
{
	int i;

	/*
	 * Ensure chip does no sends or receives, tail updates, or
	 * pioavail updates while we re-initialize.  This is mostly
	 * for the driver data structures, not chip registers.
	 */
	for (i = 0; i < dd->num_pports; ++i) {
		/*
		 * ctxt == -1 means "all contexts". Only really safe for
		 * _dis_abling things, as here.
		 */
		dd->f_rcvctrl(dd->pport + i, QIB_RCVCTRL_CTXT_DIS |
			      QIB_RCVCTRL_INTRAVAIL_DIS |
			      QIB_RCVCTRL_TAILUPD_DIS, -1);
		/* Redundant across ports for some, but no big deal.  */
		dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_DIS |
			       QIB_SENDCTRL_AVAIL_DIS);
	}

	return 0;
}

static void enable_chip(struct qib_devdata *dd)
{
	u64 rcvmask;
	int i;

	/*
	 * Enable PIO send, and update of PIOavail regs to memory.
	 */
	for (i = 0; i < dd->num_pports; ++i)
		dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_ENB |
			       QIB_SENDCTRL_AVAIL_ENB);
	/*
	 * Enable kernel ctxts' receive and receive interrupt.
	 * Other ctxts done as user opens and inits them.
	 */
	rcvmask = QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB;
	rcvmask |= (dd->flags & QIB_NODMA_RTAIL) ?
516 QIB_RCVCTRL_TAILUPD_DIS : QIB_RCVCTRL_TAILUPD_ENB; 517 for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { 518 struct qib_ctxtdata *rcd = dd->rcd[i]; 519 520 if (rcd) 521 dd->f_rcvctrl(rcd->ppd, rcvmask, i); 522 } 523 } 524 525 static void verify_interrupt(unsigned long opaque) 526 { 527 struct qib_devdata *dd = (struct qib_devdata *) opaque; 528 u64 int_counter; 529 530 if (!dd) 531 return; /* being torn down */ 532 533 /* 534 * If we don't have a lid or any interrupts, let the user know and 535 * don't bother checking again. 536 */ 537 int_counter = qib_int_counter(dd) - dd->z_int_counter; 538 if (int_counter == 0) { 539 if (!dd->f_intr_fallback(dd)) 540 dev_err(&dd->pcidev->dev, 541 "No interrupts detected, not usable.\n"); 542 else /* re-arm the timer to see if fallback works */ 543 mod_timer(&dd->intrchk_timer, jiffies + HZ/2); 544 } 545 } 546 547 static void init_piobuf_state(struct qib_devdata *dd) 548 { 549 int i, pidx; 550 u32 uctxts; 551 552 /* 553 * Ensure all buffers are free, and fifos empty. Buffers 554 * are common, so only do once for port 0. 555 * 556 * After enable and qib_chg_pioavailkernel so we can safely 557 * enable pioavail updates and PIOENABLE. After this, packets 558 * are ready and able to go out. 559 */ 560 dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_ALL); 561 for (pidx = 0; pidx < dd->num_pports; ++pidx) 562 dd->f_sendctrl(dd->pport + pidx, QIB_SENDCTRL_FLUSH); 563 564 /* 565 * If not all sendbufs are used, add the one to each of the lower 566 * numbered contexts. pbufsctxt and lastctxt_piobuf are 567 * calculated in chip-specific code because it may cause some 568 * chip-specific adjustments to be made. 569 */ 570 uctxts = dd->cfgctxts - dd->first_user_ctxt; 571 dd->ctxts_extrabuf = dd->pbufsctxt ? 572 dd->lastctxt_piobuf - (dd->pbufsctxt * uctxts) : 0; 573 574 /* 575 * Set up the shadow copies of the piobufavail registers, 576 * which we compare against the chip registers for now, and 577 * the in memory DMA'ed copies of the registers. 578 * By now pioavail updates to memory should have occurred, so 579 * copy them into our working/shadow registers; this is in 580 * case something went wrong with abort, but mostly to get the 581 * initial values of the generation bit correct. 582 */ 583 for (i = 0; i < dd->pioavregs; i++) { 584 __le64 tmp; 585 586 tmp = dd->pioavailregs_dma[i]; 587 /* 588 * Don't need to worry about pioavailkernel here 589 * because we will call qib_chg_pioavailkernel() later 590 * in initialization, to busy out buffers as needed. 
591 */ 592 dd->pioavailshadow[i] = le64_to_cpu(tmp); 593 } 594 while (i < ARRAY_SIZE(dd->pioavailshadow)) 595 dd->pioavailshadow[i++] = 0; /* for debugging sanity */ 596 597 /* after pioavailshadow is setup */ 598 qib_chg_pioavailkernel(dd, 0, dd->piobcnt2k + dd->piobcnt4k, 599 TXCHK_CHG_TYPE_KERN, NULL); 600 dd->f_initvl15_bufs(dd); 601 } 602 603 /** 604 * qib_create_workqueues - create per port workqueues 605 * @dd: the qlogic_ib device 606 */ 607 static int qib_create_workqueues(struct qib_devdata *dd) 608 { 609 int pidx; 610 struct qib_pportdata *ppd; 611 612 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 613 ppd = dd->pport + pidx; 614 if (!ppd->qib_wq) { 615 char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */ 616 snprintf(wq_name, sizeof(wq_name), "qib%d_%d", 617 dd->unit, pidx); 618 ppd->qib_wq = 619 create_singlethread_workqueue(wq_name); 620 if (!ppd->qib_wq) 621 goto wq_error; 622 } 623 } 624 return 0; 625 wq_error: 626 pr_err("create_singlethread_workqueue failed for port %d\n", 627 pidx + 1); 628 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 629 ppd = dd->pport + pidx; 630 if (ppd->qib_wq) { 631 destroy_workqueue(ppd->qib_wq); 632 ppd->qib_wq = NULL; 633 } 634 } 635 return -ENOMEM; 636 } 637 638 static void qib_free_pportdata(struct qib_pportdata *ppd) 639 { 640 free_percpu(ppd->ibport_data.pmastats); 641 ppd->ibport_data.pmastats = NULL; 642 } 643 644 /** 645 * qib_init - do the actual initialization sequence on the chip 646 * @dd: the qlogic_ib device 647 * @reinit: reinitializing, so don't allocate new memory 648 * 649 * Do the actual initialization sequence on the chip. This is done 650 * both from the init routine called from the PCI infrastructure, and 651 * when we reset the chip, or detect that it was reset internally, 652 * or it's administratively re-enabled. 653 * 654 * Memory allocation here and in called routines is only done in 655 * the first case (reinit == 0). We have to be careful, because even 656 * without memory allocation, we need to re-write all the chip registers 657 * TIDs, etc. after the reset or enable has completed. 658 */ 659 int qib_init(struct qib_devdata *dd, int reinit) 660 { 661 int ret = 0, pidx, lastfail = 0; 662 u32 portok = 0; 663 unsigned i; 664 struct qib_ctxtdata *rcd; 665 struct qib_pportdata *ppd; 666 unsigned long flags; 667 668 /* Set linkstate to unknown, so we can watch for a transition. */ 669 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 670 ppd = dd->pport + pidx; 671 spin_lock_irqsave(&ppd->lflags_lock, flags); 672 ppd->lflags &= ~(QIBL_LINKACTIVE | QIBL_LINKARMED | 673 QIBL_LINKDOWN | QIBL_LINKINIT | 674 QIBL_LINKV); 675 spin_unlock_irqrestore(&ppd->lflags_lock, flags); 676 } 677 678 if (reinit) 679 ret = init_after_reset(dd); 680 else 681 ret = loadtime_init(dd); 682 if (ret) 683 goto done; 684 685 /* Bypass most chip-init, to get to device creation */ 686 if (qib_mini_init) 687 return 0; 688 689 ret = dd->f_late_initreg(dd); 690 if (ret) 691 goto done; 692 693 /* dd->rcd can be NULL if early init failed */ 694 for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { 695 /* 696 * Set up the (kernel) rcvhdr queue and egr TIDs. If doing 697 * re-init, the simplest way to handle this is to free 698 * existing, and re-allocate. 699 * Need to re-create rest of ctxt 0 ctxtdata as well. 
700 */ 701 rcd = dd->rcd[i]; 702 if (!rcd) 703 continue; 704 705 lastfail = qib_create_rcvhdrq(dd, rcd); 706 if (!lastfail) 707 lastfail = qib_setup_eagerbufs(rcd); 708 if (lastfail) { 709 qib_dev_err(dd, 710 "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); 711 continue; 712 } 713 } 714 715 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 716 int mtu; 717 if (lastfail) 718 ret = lastfail; 719 ppd = dd->pport + pidx; 720 mtu = ib_mtu_enum_to_int(qib_ibmtu); 721 if (mtu == -1) { 722 mtu = QIB_DEFAULT_MTU; 723 qib_ibmtu = 0; /* don't leave invalid value */ 724 } 725 /* set max we can ever have for this driver load */ 726 ppd->init_ibmaxlen = min(mtu > 2048 ? 727 dd->piosize4k : dd->piosize2k, 728 dd->rcvegrbufsize + 729 (dd->rcvhdrentsize << 2)); 730 /* 731 * Have to initialize ibmaxlen, but this will normally 732 * change immediately in qib_set_mtu(). 733 */ 734 ppd->ibmaxlen = ppd->init_ibmaxlen; 735 qib_set_mtu(ppd, mtu); 736 737 spin_lock_irqsave(&ppd->lflags_lock, flags); 738 ppd->lflags |= QIBL_IB_LINK_DISABLED; 739 spin_unlock_irqrestore(&ppd->lflags_lock, flags); 740 741 lastfail = dd->f_bringup_serdes(ppd); 742 if (lastfail) { 743 qib_devinfo(dd->pcidev, 744 "Failed to bringup IB port %u\n", ppd->port); 745 lastfail = -ENETDOWN; 746 continue; 747 } 748 749 portok++; 750 } 751 752 if (!portok) { 753 /* none of the ports initialized */ 754 if (!ret && lastfail) 755 ret = lastfail; 756 else if (!ret) 757 ret = -ENETDOWN; 758 /* but continue on, so we can debug cause */ 759 } 760 761 enable_chip(dd); 762 763 init_piobuf_state(dd); 764 765 done: 766 if (!ret) { 767 /* chip is OK for user apps; mark it as initialized */ 768 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 769 ppd = dd->pport + pidx; 770 /* 771 * Set status even if port serdes is not initialized 772 * so that diags will work. 773 */ 774 *ppd->statusp |= QIB_STATUS_CHIP_PRESENT | 775 QIB_STATUS_INITTED; 776 if (!ppd->link_speed_enabled) 777 continue; 778 if (dd->flags & QIB_HAS_SEND_DMA) 779 ret = qib_setup_sdma(ppd); 780 init_timer(&ppd->hol_timer); 781 ppd->hol_timer.function = qib_hol_event; 782 ppd->hol_timer.data = (unsigned long)ppd; 783 ppd->hol_state = QIB_HOL_UP; 784 } 785 786 /* now we can enable all interrupts from the chip */ 787 dd->f_set_intr_state(dd, 1); 788 789 /* 790 * Setup to verify we get an interrupt, and fallback 791 * to an alternate if necessary and possible. 792 */ 793 mod_timer(&dd->intrchk_timer, jiffies + HZ/2); 794 /* start stats retrieval timer */ 795 mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER); 796 } 797 798 /* if ret is non-zero, we probably should do some cleanup here... */ 799 return ret; 800 } 801 802 /* 803 * These next two routines are placeholders in case we don't have per-arch 804 * code for controlling write combining. If explicit control of write 805 * combining is not available, performance will probably be awful. 
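 * Per-architecture implementations elsewhere in the driver may override
 * these weak stubs.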
806 */ 807 808 int __attribute__((weak)) qib_enable_wc(struct qib_devdata *dd) 809 { 810 return -EOPNOTSUPP; 811 } 812 813 void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd) 814 { 815 } 816 817 static inline struct qib_devdata *__qib_lookup(int unit) 818 { 819 return idr_find(&qib_unit_table, unit); 820 } 821 822 struct qib_devdata *qib_lookup(int unit) 823 { 824 struct qib_devdata *dd; 825 unsigned long flags; 826 827 spin_lock_irqsave(&qib_devs_lock, flags); 828 dd = __qib_lookup(unit); 829 spin_unlock_irqrestore(&qib_devs_lock, flags); 830 831 return dd; 832 } 833 834 /* 835 * Stop the timers during unit shutdown, or after an error late 836 * in initialization. 837 */ 838 static void qib_stop_timers(struct qib_devdata *dd) 839 { 840 struct qib_pportdata *ppd; 841 int pidx; 842 843 if (dd->stats_timer.data) { 844 del_timer_sync(&dd->stats_timer); 845 dd->stats_timer.data = 0; 846 } 847 if (dd->intrchk_timer.data) { 848 del_timer_sync(&dd->intrchk_timer); 849 dd->intrchk_timer.data = 0; 850 } 851 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 852 ppd = dd->pport + pidx; 853 if (ppd->hol_timer.data) 854 del_timer_sync(&ppd->hol_timer); 855 if (ppd->led_override_timer.data) { 856 del_timer_sync(&ppd->led_override_timer); 857 atomic_set(&ppd->led_override_timer_active, 0); 858 } 859 if (ppd->symerr_clear_timer.data) 860 del_timer_sync(&ppd->symerr_clear_timer); 861 } 862 } 863 864 /** 865 * qib_shutdown_device - shut down a device 866 * @dd: the qlogic_ib device 867 * 868 * This is called to make the device quiet when we are about to 869 * unload the driver, and also when the device is administratively 870 * disabled. It does not free any data structures. 871 * Everything it does has to be setup again by qib_init(dd, 1) 872 */ 873 static void qib_shutdown_device(struct qib_devdata *dd) 874 { 875 struct qib_pportdata *ppd; 876 unsigned pidx; 877 878 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 879 ppd = dd->pport + pidx; 880 881 spin_lock_irq(&ppd->lflags_lock); 882 ppd->lflags &= ~(QIBL_LINKDOWN | QIBL_LINKINIT | 883 QIBL_LINKARMED | QIBL_LINKACTIVE | 884 QIBL_LINKV); 885 spin_unlock_irq(&ppd->lflags_lock); 886 *ppd->statusp &= ~(QIB_STATUS_IB_CONF | QIB_STATUS_IB_READY); 887 } 888 dd->flags &= ~QIB_INITTED; 889 890 /* mask interrupts, but not errors */ 891 dd->f_set_intr_state(dd, 0); 892 893 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 894 ppd = dd->pport + pidx; 895 dd->f_rcvctrl(ppd, QIB_RCVCTRL_TAILUPD_DIS | 896 QIB_RCVCTRL_CTXT_DIS | 897 QIB_RCVCTRL_INTRAVAIL_DIS | 898 QIB_RCVCTRL_PKEY_ENB, -1); 899 /* 900 * Gracefully stop all sends allowing any in progress to 901 * trickle out first. 902 */ 903 dd->f_sendctrl(ppd, QIB_SENDCTRL_CLEAR); 904 } 905 906 /* 907 * Enough for anything that's going to trickle out to have actually 908 * done so. 909 */ 910 udelay(20); 911 912 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 913 ppd = dd->pport + pidx; 914 dd->f_setextled(ppd, 0); /* make sure LEDs are off */ 915 916 if (dd->flags & QIB_HAS_SEND_DMA) 917 qib_teardown_sdma(ppd); 918 919 dd->f_sendctrl(ppd, QIB_SENDCTRL_AVAIL_DIS | 920 QIB_SENDCTRL_SEND_DIS); 921 /* 922 * Clear SerdesEnable. 923 * We can't count on interrupts since we are stopping. 
924 */ 925 dd->f_quiet_serdes(ppd); 926 927 if (ppd->qib_wq) { 928 destroy_workqueue(ppd->qib_wq); 929 ppd->qib_wq = NULL; 930 } 931 qib_free_pportdata(ppd); 932 } 933 934 qib_update_eeprom_log(dd); 935 } 936 937 /** 938 * qib_free_ctxtdata - free a context's allocated data 939 * @dd: the qlogic_ib device 940 * @rcd: the ctxtdata structure 941 * 942 * free up any allocated data for a context 943 * This should not touch anything that would affect a simultaneous 944 * re-allocation of context data, because it is called after qib_mutex 945 * is released (and can be called from reinit as well). 946 * It should never change any chip state, or global driver state. 947 */ 948 void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) 949 { 950 if (!rcd) 951 return; 952 953 if (rcd->rcvhdrq) { 954 dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, 955 rcd->rcvhdrq, rcd->rcvhdrq_phys); 956 rcd->rcvhdrq = NULL; 957 if (rcd->rcvhdrtail_kvaddr) { 958 dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, 959 rcd->rcvhdrtail_kvaddr, 960 rcd->rcvhdrqtailaddr_phys); 961 rcd->rcvhdrtail_kvaddr = NULL; 962 } 963 } 964 if (rcd->rcvegrbuf) { 965 unsigned e; 966 967 for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { 968 void *base = rcd->rcvegrbuf[e]; 969 size_t size = rcd->rcvegrbuf_size; 970 971 dma_free_coherent(&dd->pcidev->dev, size, 972 base, rcd->rcvegrbuf_phys[e]); 973 } 974 kfree(rcd->rcvegrbuf); 975 rcd->rcvegrbuf = NULL; 976 kfree(rcd->rcvegrbuf_phys); 977 rcd->rcvegrbuf_phys = NULL; 978 rcd->rcvegrbuf_chunks = 0; 979 } 980 981 kfree(rcd->tid_pg_list); 982 vfree(rcd->user_event_mask); 983 vfree(rcd->subctxt_uregbase); 984 vfree(rcd->subctxt_rcvegrbuf); 985 vfree(rcd->subctxt_rcvhdr_base); 986 #ifdef CONFIG_DEBUG_FS 987 kfree(rcd->opstats); 988 rcd->opstats = NULL; 989 #endif 990 kfree(rcd); 991 } 992 993 /* 994 * Perform a PIO buffer bandwidth write test, to verify proper system 995 * configuration. Even when all the setup calls work, occasionally 996 * BIOS or other issues can prevent write combining from working, or 997 * can cause other bandwidth problems to the chip. 998 * 999 * This test simply writes the same buffer over and over again, and 1000 * measures close to the peak bandwidth to the chip (not testing 1001 * data bandwidth to the wire). On chips that use an address-based 1002 * trigger to send packets to the wire, this is easy. On chips that 1003 * use a count to trigger, we want to make sure that the packet doesn't 1004 * go out on the wire, or trigger flow control checks. 1005 */ 1006 static void qib_verify_pioperf(struct qib_devdata *dd) 1007 { 1008 u32 pbnum, cnt, lcnt; 1009 u32 __iomem *piobuf; 1010 u32 *addr; 1011 u64 msecs, emsecs; 1012 1013 piobuf = dd->f_getsendbuf(dd->pport, 0ULL, &pbnum); 1014 if (!piobuf) { 1015 qib_devinfo(dd->pcidev, 1016 "No PIObufs for checking perf, skipping\n"); 1017 return; 1018 } 1019 1020 /* 1021 * Enough to give us a reasonable test, less than piobuf size, and 1022 * likely multiple of store buffer length. 
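	 * With cnt bytes (1024 here) copied per iteration, lcnt iterations in
	 * emsecs msec works out to roughly lcnt/emsecs MiB/sec, so the check
	 * further below (lcnt < emsecs * 1024) flags anything under about
	 * 1 GiB/sec.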
1023 */ 1024 cnt = 1024; 1025 1026 addr = vmalloc(cnt); 1027 if (!addr) { 1028 qib_devinfo(dd->pcidev, 1029 "Couldn't get memory for checking PIO perf," 1030 " skipping\n"); 1031 goto done; 1032 } 1033 1034 preempt_disable(); /* we want reasonably accurate elapsed time */ 1035 msecs = 1 + jiffies_to_msecs(jiffies); 1036 for (lcnt = 0; lcnt < 10000U; lcnt++) { 1037 /* wait until we cross msec boundary */ 1038 if (jiffies_to_msecs(jiffies) >= msecs) 1039 break; 1040 udelay(1); 1041 } 1042 1043 dd->f_set_armlaunch(dd, 0); 1044 1045 /* 1046 * length 0, no dwords actually sent 1047 */ 1048 writeq(0, piobuf); 1049 qib_flush_wc(); 1050 1051 /* 1052 * This is only roughly accurate, since even with preempt we 1053 * still take interrupts that could take a while. Running for 1054 * >= 5 msec seems to get us "close enough" to accurate values. 1055 */ 1056 msecs = jiffies_to_msecs(jiffies); 1057 for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { 1058 qib_pio_copy(piobuf + 64, addr, cnt >> 2); 1059 emsecs = jiffies_to_msecs(jiffies) - msecs; 1060 } 1061 1062 /* 1 GiB/sec, slightly over IB SDR line rate */ 1063 if (lcnt < (emsecs * 1024U)) 1064 qib_dev_err(dd, 1065 "Performance problem: bandwidth to PIO buffers is only %u MiB/sec\n", 1066 lcnt / (u32) emsecs); 1067 1068 preempt_enable(); 1069 1070 vfree(addr); 1071 1072 done: 1073 /* disarm piobuf, so it's available again */ 1074 dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbnum)); 1075 qib_sendbuf_done(dd, pbnum); 1076 dd->f_set_armlaunch(dd, 1); 1077 } 1078 1079 void qib_free_devdata(struct qib_devdata *dd) 1080 { 1081 unsigned long flags; 1082 1083 spin_lock_irqsave(&qib_devs_lock, flags); 1084 idr_remove(&qib_unit_table, dd->unit); 1085 list_del(&dd->list); 1086 spin_unlock_irqrestore(&qib_devs_lock, flags); 1087 1088 #ifdef CONFIG_DEBUG_FS 1089 qib_dbg_ibdev_exit(&dd->verbs_dev); 1090 #endif 1091 free_percpu(dd->int_counter); 1092 ib_dealloc_device(&dd->verbs_dev.ibdev); 1093 } 1094 1095 u64 qib_int_counter(struct qib_devdata *dd) 1096 { 1097 int cpu; 1098 u64 int_counter = 0; 1099 1100 for_each_possible_cpu(cpu) 1101 int_counter += *per_cpu_ptr(dd->int_counter, cpu); 1102 return int_counter; 1103 } 1104 1105 u64 qib_sps_ints(void) 1106 { 1107 unsigned long flags; 1108 struct qib_devdata *dd; 1109 u64 sps_ints = 0; 1110 1111 spin_lock_irqsave(&qib_devs_lock, flags); 1112 list_for_each_entry(dd, &qib_dev_list, list) { 1113 sps_ints += qib_int_counter(dd); 1114 } 1115 spin_unlock_irqrestore(&qib_devs_lock, flags); 1116 return sps_ints; 1117 } 1118 1119 /* 1120 * Allocate our primary per-unit data structure. Must be done via verbs 1121 * allocator, because the verbs cleanup process both does cleanup and 1122 * free of the data structure. 1123 * "extra" is for chip-specific data. 1124 * 1125 * Use the idr mechanism to get a unit number for this unit. 
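 * Returns a pointer to the new devdata, or an ERR_PTR() on failure.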
 */
struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
{
	unsigned long flags;
	struct qib_devdata *dd;
	int ret;

	dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra);
	if (!dd)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dd->list);

	idr_preload(GFP_KERNEL);
	spin_lock_irqsave(&qib_devs_lock, flags);

	ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT);
	if (ret >= 0) {
		dd->unit = ret;
		list_add(&dd->list, &qib_dev_list);
	}

	spin_unlock_irqrestore(&qib_devs_lock, flags);
	idr_preload_end();

	if (ret < 0) {
		qib_early_err(&pdev->dev,
			      "Could not allocate unit ID: error %d\n", -ret);
		goto bail;
	}
	dd->int_counter = alloc_percpu(u64);
	if (!dd->int_counter) {
		ret = -ENOMEM;
		qib_early_err(&pdev->dev,
			      "Could not allocate per-cpu int_counter\n");
		goto bail;
	}

	if (!qib_cpulist_count) {
		u32 count = num_online_cpus();

		qib_cpulist = kzalloc(BITS_TO_LONGS(count) *
				      sizeof(long), GFP_KERNEL);
		if (qib_cpulist)
			qib_cpulist_count = count;
		else
			qib_early_err(&pdev->dev,
				"Could not alloc cpulist info, cpu affinity might be wrong\n");
	}
#ifdef CONFIG_DEBUG_FS
	qib_dbg_ibdev_init(&dd->verbs_dev);
#endif
	return dd;
bail:
	if (!list_empty(&dd->list))
		list_del_init(&dd->list);
	ib_dealloc_device(&dd->verbs_dev.ibdev);
	return ERR_PTR(ret);
}

/*
 * Called from freeze mode handlers, and from PCI error
 * reporting code.  Should be paranoid about state of
 * system and data structures.
 */
void qib_disable_after_error(struct qib_devdata *dd)
{
	if (dd->flags & QIB_INITTED) {
		u32 pidx;

		dd->flags &= ~QIB_INITTED;
		if (dd->pport)
			for (pidx = 0; pidx < dd->num_pports; ++pidx) {
				struct qib_pportdata *ppd;

				ppd = dd->pport + pidx;
				if (dd->flags & QIB_PRESENT) {
					qib_set_linkstate(ppd,
						QIB_IB_LINKDOWN_DISABLE);
					dd->f_setextled(ppd, 0);
				}
				*ppd->statusp &= ~QIB_STATUS_IB_READY;
			}
	}

	/*
	 * Mark as having had an error for driver, and also
	 * for /sys and status word mapped to user programs.
	 * This marks unit as not usable, until reset.
1214 */ 1215 if (dd->devstatusp) 1216 *dd->devstatusp |= QIB_STATUS_HWERROR; 1217 } 1218 1219 static void qib_remove_one(struct pci_dev *); 1220 static int qib_init_one(struct pci_dev *, const struct pci_device_id *); 1221 1222 #define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: " 1223 #define PFX QIB_DRV_NAME ": " 1224 1225 static const struct pci_device_id qib_pci_tbl[] = { 1226 { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_QLOGIC_IB_6120) }, 1227 { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7220) }, 1228 { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7322) }, 1229 { 0, } 1230 }; 1231 1232 MODULE_DEVICE_TABLE(pci, qib_pci_tbl); 1233 1234 static struct pci_driver qib_driver = { 1235 .name = QIB_DRV_NAME, 1236 .probe = qib_init_one, 1237 .remove = qib_remove_one, 1238 .id_table = qib_pci_tbl, 1239 .err_handler = &qib_pci_err_handler, 1240 }; 1241 1242 #ifdef CONFIG_INFINIBAND_QIB_DCA 1243 1244 static int qib_notify_dca(struct notifier_block *, unsigned long, void *); 1245 static struct notifier_block dca_notifier = { 1246 .notifier_call = qib_notify_dca, 1247 .next = NULL, 1248 .priority = 0 1249 }; 1250 1251 static int qib_notify_dca_device(struct device *device, void *data) 1252 { 1253 struct qib_devdata *dd = dev_get_drvdata(device); 1254 unsigned long event = *(unsigned long *)data; 1255 1256 return dd->f_notify_dca(dd, event); 1257 } 1258 1259 static int qib_notify_dca(struct notifier_block *nb, unsigned long event, 1260 void *p) 1261 { 1262 int rval; 1263 1264 rval = driver_for_each_device(&qib_driver.driver, NULL, 1265 &event, qib_notify_dca_device); 1266 return rval ? NOTIFY_BAD : NOTIFY_DONE; 1267 } 1268 1269 #endif 1270 1271 /* 1272 * Do all the generic driver unit- and chip-independent memory 1273 * allocation and initialization. 1274 */ 1275 static int __init qib_ib_init(void) 1276 { 1277 int ret; 1278 1279 ret = qib_dev_init(); 1280 if (ret) 1281 goto bail; 1282 1283 /* 1284 * These must be called before the driver is registered with 1285 * the PCI subsystem. 1286 */ 1287 idr_init(&qib_unit_table); 1288 1289 #ifdef CONFIG_INFINIBAND_QIB_DCA 1290 dca_register_notify(&dca_notifier); 1291 #endif 1292 #ifdef CONFIG_DEBUG_FS 1293 qib_dbg_init(); 1294 #endif 1295 ret = pci_register_driver(&qib_driver); 1296 if (ret < 0) { 1297 pr_err("Unable to register driver: error %d\n", -ret); 1298 goto bail_dev; 1299 } 1300 1301 /* not fatal if it doesn't work */ 1302 if (qib_init_qibfs()) 1303 pr_err("Unable to register ipathfs\n"); 1304 goto bail; /* all OK */ 1305 1306 bail_dev: 1307 #ifdef CONFIG_INFINIBAND_QIB_DCA 1308 dca_unregister_notify(&dca_notifier); 1309 #endif 1310 #ifdef CONFIG_DEBUG_FS 1311 qib_dbg_exit(); 1312 #endif 1313 idr_destroy(&qib_unit_table); 1314 qib_dev_cleanup(); 1315 bail: 1316 return ret; 1317 } 1318 1319 module_init(qib_ib_init); 1320 1321 /* 1322 * Do the non-unit driver cleanup, memory free, etc. at unload. 
1323 */ 1324 static void __exit qib_ib_cleanup(void) 1325 { 1326 int ret; 1327 1328 ret = qib_exit_qibfs(); 1329 if (ret) 1330 pr_err( 1331 "Unable to cleanup counter filesystem: error %d\n", 1332 -ret); 1333 1334 #ifdef CONFIG_INFINIBAND_QIB_DCA 1335 dca_unregister_notify(&dca_notifier); 1336 #endif 1337 pci_unregister_driver(&qib_driver); 1338 #ifdef CONFIG_DEBUG_FS 1339 qib_dbg_exit(); 1340 #endif 1341 1342 qib_cpulist_count = 0; 1343 kfree(qib_cpulist); 1344 1345 idr_destroy(&qib_unit_table); 1346 qib_dev_cleanup(); 1347 } 1348 1349 module_exit(qib_ib_cleanup); 1350 1351 /* this can only be called after a successful initialization */ 1352 static void cleanup_device_data(struct qib_devdata *dd) 1353 { 1354 int ctxt; 1355 int pidx; 1356 struct qib_ctxtdata **tmp; 1357 unsigned long flags; 1358 1359 /* users can't do anything more with chip */ 1360 for (pidx = 0; pidx < dd->num_pports; ++pidx) { 1361 if (dd->pport[pidx].statusp) 1362 *dd->pport[pidx].statusp &= ~QIB_STATUS_CHIP_PRESENT; 1363 1364 spin_lock(&dd->pport[pidx].cc_shadow_lock); 1365 1366 kfree(dd->pport[pidx].congestion_entries); 1367 dd->pport[pidx].congestion_entries = NULL; 1368 kfree(dd->pport[pidx].ccti_entries); 1369 dd->pport[pidx].ccti_entries = NULL; 1370 kfree(dd->pport[pidx].ccti_entries_shadow); 1371 dd->pport[pidx].ccti_entries_shadow = NULL; 1372 kfree(dd->pport[pidx].congestion_entries_shadow); 1373 dd->pport[pidx].congestion_entries_shadow = NULL; 1374 1375 spin_unlock(&dd->pport[pidx].cc_shadow_lock); 1376 } 1377 1378 if (!qib_wc_pat) 1379 qib_disable_wc(dd); 1380 1381 if (dd->pioavailregs_dma) { 1382 dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, 1383 (void *) dd->pioavailregs_dma, 1384 dd->pioavailregs_phys); 1385 dd->pioavailregs_dma = NULL; 1386 } 1387 1388 if (dd->pageshadow) { 1389 struct page **tmpp = dd->pageshadow; 1390 dma_addr_t *tmpd = dd->physshadow; 1391 int i; 1392 1393 for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) { 1394 int ctxt_tidbase = ctxt * dd->rcvtidcnt; 1395 int maxtid = ctxt_tidbase + dd->rcvtidcnt; 1396 1397 for (i = ctxt_tidbase; i < maxtid; i++) { 1398 if (!tmpp[i]) 1399 continue; 1400 pci_unmap_page(dd->pcidev, tmpd[i], 1401 PAGE_SIZE, PCI_DMA_FROMDEVICE); 1402 qib_release_user_pages(&tmpp[i], 1); 1403 tmpp[i] = NULL; 1404 } 1405 } 1406 1407 dd->pageshadow = NULL; 1408 vfree(tmpp); 1409 dd->physshadow = NULL; 1410 vfree(tmpd); 1411 } 1412 1413 /* 1414 * Free any resources still in use (usually just kernel contexts) 1415 * at unload; we do for ctxtcnt, because that's what we allocate. 1416 * We acquire lock to be really paranoid that rcd isn't being 1417 * accessed from some interrupt-related code (that should not happen, 1418 * but best to be sure). 1419 */ 1420 spin_lock_irqsave(&dd->uctxt_lock, flags); 1421 tmp = dd->rcd; 1422 dd->rcd = NULL; 1423 spin_unlock_irqrestore(&dd->uctxt_lock, flags); 1424 for (ctxt = 0; tmp && ctxt < dd->ctxtcnt; ctxt++) { 1425 struct qib_ctxtdata *rcd = tmp[ctxt]; 1426 1427 tmp[ctxt] = NULL; /* debugging paranoia */ 1428 qib_free_ctxtdata(dd, rcd); 1429 } 1430 kfree(tmp); 1431 kfree(dd->boardname); 1432 qib_cq_exit(dd); 1433 } 1434 1435 /* 1436 * Clean up on unit shutdown, or error during unit load after 1437 * successful initialization. 1438 */ 1439 static void qib_postinit_cleanup(struct qib_devdata *dd) 1440 { 1441 /* 1442 * Clean up chip-specific stuff. 1443 * We check for NULL here, because it's outside 1444 * the kregbase check, and we need to call it 1445 * after the free_irq. 
	 * Thus it's possible that
	 * the function pointers were never initialized.
	 */
	if (dd->f_cleanup)
		dd->f_cleanup(dd);

	qib_pcie_ddcleanup(dd);

	cleanup_device_data(dd);

	qib_free_devdata(dd);
}

static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	int ret, j, pidx, initfail;
	struct qib_devdata *dd = NULL;

	ret = qib_pcie_init(pdev, ent);
	if (ret)
		goto bail;

	/*
	 * Do device-specific initialization, function table setup, dd
	 * allocation, etc.
	 */
	switch (ent->device) {
	case PCI_DEVICE_ID_QLOGIC_IB_6120:
#ifdef CONFIG_PCI_MSI
		dd = qib_init_iba6120_funcs(pdev, ent);
#else
		qib_early_err(&pdev->dev,
			"Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n",
			ent->device);
		dd = ERR_PTR(-ENODEV);
#endif
		break;

	case PCI_DEVICE_ID_QLOGIC_IB_7220:
		dd = qib_init_iba7220_funcs(pdev, ent);
		break;

	case PCI_DEVICE_ID_QLOGIC_IB_7322:
		dd = qib_init_iba7322_funcs(pdev, ent);
		break;

	default:
		qib_early_err(&pdev->dev,
			"Failing on unknown Intel deviceid 0x%x\n",
			ent->device);
		ret = -ENODEV;
	}

	if (IS_ERR(dd))
		ret = PTR_ERR(dd);
	if (ret)
		goto bail; /* error already printed */

	ret = qib_create_workqueues(dd);
	if (ret)
		goto bail;

	/* do the generic initialization */
	initfail = qib_init(dd, 0);

	ret = qib_register_ib_device(dd);

	/*
	 * Now ready for use.  This should be cleared whenever we
	 * detect a reset, or initiate one.  If earlier failure,
	 * we still create devices, so diags, etc. can be used
	 * to determine cause of problem.
	 */
	if (!qib_mini_init && !initfail && !ret)
		dd->flags |= QIB_INITTED;

	j = qib_device_create(dd);
	if (j)
		qib_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
	j = qibfs_add(dd);
	if (j)
		qib_dev_err(dd, "Failed filesystem setup for counters: %d\n",
			    -j);

	if (qib_mini_init || initfail || ret) {
		qib_stop_timers(dd);
		flush_workqueue(ib_wq);
		for (pidx = 0; pidx < dd->num_pports; ++pidx)
			dd->f_quiet_serdes(dd->pport + pidx);
		if (qib_mini_init)
			goto bail;
		if (!j) {
			(void) qibfs_remove(dd);
			qib_device_remove(dd);
		}
		if (!ret)
			qib_unregister_ib_device(dd);
		qib_postinit_cleanup(dd);
		if (initfail)
			ret = initfail;
		goto bail;
	}

	if (!qib_wc_pat) {
		ret = qib_enable_wc(dd);
		if (ret) {
			qib_dev_err(dd,
				"Write combining not enabled (err %d): performance may be poor\n",
				-ret);
			ret = 0;
		}
	}

	qib_verify_pioperf(dd);
bail:
	return ret;
}

static void qib_remove_one(struct pci_dev *pdev)
{
	struct qib_devdata *dd = pci_get_drvdata(pdev);
	int ret;

	/* unregister from IB core */
	qib_unregister_ib_device(dd);

	/*
	 * Disable the IB link, disable interrupts on the device,
	 * clear dma engines, etc.
	 */
	if (!qib_mini_init)
		qib_shutdown_device(dd);

	qib_stop_timers(dd);

	/* wait until all of our (qsfp) queue_work() calls complete */
	flush_workqueue(ib_wq);

	ret = qibfs_remove(dd);
	if (ret)
		qib_dev_err(dd, "Failed counters filesystem cleanup: %d\n",
			    -ret);

	qib_device_remove(dd);

	qib_postinit_cleanup(dd);
}

/**
 * qib_create_rcvhdrq - create a receive header queue
 * @dd: the qlogic_ib device
 * @rcd: the context data
 *
 * This must be contiguous memory (from an i/o perspective), and must be
 * DMA'able (which means for some systems, it will go through an IOMMU,
 * or be forced into a low address range).
 */
int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
{
	unsigned amt;
	int old_node_id;

	if (!rcd->rcvhdrq) {
		dma_addr_t phys_hdrqtail;
		gfp_t gfp_flags;

		amt = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize *
			    sizeof(u32), PAGE_SIZE);
		gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
			GFP_USER : GFP_KERNEL;

		old_node_id = dev_to_node(&dd->pcidev->dev);
		set_dev_node(&dd->pcidev->dev, rcd->node_id);
		rcd->rcvhdrq = dma_alloc_coherent(
			&dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
			gfp_flags | __GFP_COMP);
		set_dev_node(&dd->pcidev->dev, old_node_id);

		if (!rcd->rcvhdrq) {
			qib_dev_err(dd,
				"attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
				amt, rcd->ctxt);
			goto bail;
		}

		if (rcd->ctxt >= dd->first_user_ctxt) {
			rcd->user_event_mask = vmalloc_user(PAGE_SIZE);
			if (!rcd->user_event_mask)
				goto bail_free_hdrq;
		}

		if (!(dd->flags & QIB_NODMA_RTAIL)) {
			set_dev_node(&dd->pcidev->dev, rcd->node_id);
			rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(
				&dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
				gfp_flags);
			set_dev_node(&dd->pcidev->dev, old_node_id);
			if (!rcd->rcvhdrtail_kvaddr)
				goto bail_free;
			rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
		}

		rcd->rcvhdrq_size = amt;
	}

	/* clear for security and sanity on each use */
	memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
	if (rcd->rcvhdrtail_kvaddr)
		memset(rcd->rcvhdrtail_kvaddr, 0, PAGE_SIZE);
	return 0;

bail_free:
	qib_dev_err(dd,
		"attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
		rcd->ctxt);
	vfree(rcd->user_event_mask);
	rcd->user_event_mask = NULL;
bail_free_hdrq:
	dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
			  rcd->rcvhdrq_phys);
	rcd->rcvhdrq = NULL;
bail:
	return -ENOMEM;
}

/**
 * qib_setup_eagerbufs - allocate eager buffers, both kernel and user contexts.
 * @rcd: the context we are setting up.
 *
 * Allocate the eager TID buffers and program them into the chip.
 * They are no longer completely contiguous, we do multiple allocation
 * calls.  Otherwise we get the OOM code involved, by asking for too
 * much per call, with disastrous results on some kernels.
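 * (Each chunk is rcd->rcvegrbuf_size bytes, set to 32KB in
 * qib_create_ctxtdata() above.)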
1678 */ 1679 int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) 1680 { 1681 struct qib_devdata *dd = rcd->dd; 1682 unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; 1683 size_t size; 1684 gfp_t gfp_flags; 1685 int old_node_id; 1686 1687 /* 1688 * GFP_USER, but without GFP_FS, so buffer cache can be 1689 * coalesced (we hope); otherwise, even at order 4, 1690 * heavy filesystem activity makes these fail, and we can 1691 * use compound pages. 1692 */ 1693 gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; 1694 1695 egrcnt = rcd->rcvegrcnt; 1696 egroff = rcd->rcvegr_tid_base; 1697 egrsize = dd->rcvegrbufsize; 1698 1699 chunk = rcd->rcvegrbuf_chunks; 1700 egrperchunk = rcd->rcvegrbufs_perchunk; 1701 size = rcd->rcvegrbuf_size; 1702 if (!rcd->rcvegrbuf) { 1703 rcd->rcvegrbuf = 1704 kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]), 1705 GFP_KERNEL, rcd->node_id); 1706 if (!rcd->rcvegrbuf) 1707 goto bail; 1708 } 1709 if (!rcd->rcvegrbuf_phys) { 1710 rcd->rcvegrbuf_phys = 1711 kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), 1712 GFP_KERNEL, rcd->node_id); 1713 if (!rcd->rcvegrbuf_phys) 1714 goto bail_rcvegrbuf; 1715 } 1716 for (e = 0; e < rcd->rcvegrbuf_chunks; e++) { 1717 if (rcd->rcvegrbuf[e]) 1718 continue; 1719 1720 old_node_id = dev_to_node(&dd->pcidev->dev); 1721 set_dev_node(&dd->pcidev->dev, rcd->node_id); 1722 rcd->rcvegrbuf[e] = 1723 dma_alloc_coherent(&dd->pcidev->dev, size, 1724 &rcd->rcvegrbuf_phys[e], 1725 gfp_flags); 1726 set_dev_node(&dd->pcidev->dev, old_node_id); 1727 if (!rcd->rcvegrbuf[e]) 1728 goto bail_rcvegrbuf_phys; 1729 } 1730 1731 rcd->rcvegr_phys = rcd->rcvegrbuf_phys[0]; 1732 1733 for (e = chunk = 0; chunk < rcd->rcvegrbuf_chunks; chunk++) { 1734 dma_addr_t pa = rcd->rcvegrbuf_phys[chunk]; 1735 unsigned i; 1736 1737 /* clear for security and sanity on each use */ 1738 memset(rcd->rcvegrbuf[chunk], 0, size); 1739 1740 for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { 1741 dd->f_put_tid(dd, e + egroff + 1742 (u64 __iomem *) 1743 ((char __iomem *) 1744 dd->kregbase + 1745 dd->rcvegrbase), 1746 RCVHQ_RCV_TYPE_EAGER, pa); 1747 pa += egrsize; 1748 } 1749 cond_resched(); /* don't hog the cpu */ 1750 } 1751 1752 return 0; 1753 1754 bail_rcvegrbuf_phys: 1755 for (e = 0; e < rcd->rcvegrbuf_chunks && rcd->rcvegrbuf[e]; e++) 1756 dma_free_coherent(&dd->pcidev->dev, size, 1757 rcd->rcvegrbuf[e], rcd->rcvegrbuf_phys[e]); 1758 kfree(rcd->rcvegrbuf_phys); 1759 rcd->rcvegrbuf_phys = NULL; 1760 bail_rcvegrbuf: 1761 kfree(rcd->rcvegrbuf); 1762 rcd->rcvegrbuf = NULL; 1763 bail: 1764 return -ENOMEM; 1765 } 1766 1767 /* 1768 * Note: Changes to this routine should be mirrored 1769 * for the diagnostics routine qib_remap_ioaddr32(). 1770 * There is also related code for VL15 buffers in qib_init_7322_variables(). 1771 * The teardown code that unmaps is in qib_pcie_ddcleanup() 1772 */ 1773 int init_chip_wc_pat(struct qib_devdata *dd, u32 vl15buflen) 1774 { 1775 u64 __iomem *qib_kregbase = NULL; 1776 void __iomem *qib_piobase = NULL; 1777 u64 __iomem *qib_userbase = NULL; 1778 u64 qib_kreglen; 1779 u64 qib_pio2koffset = dd->piobufbase & 0xffffffff; 1780 u64 qib_pio4koffset = dd->piobufbase >> 32; 1781 u64 qib_pio2klen = dd->piobcnt2k * dd->palign; 1782 u64 qib_pio4klen = dd->piobcnt4k * dd->align4k; 1783 u64 qib_physaddr = dd->physaddr; 1784 u64 qib_piolen; 1785 u64 qib_userlen = 0; 1786 1787 /* 1788 * Free the old mapping because the kernel will try to reuse the 1789 * old mapping and not create a new mapping with the 1790 * write combining attribute. 
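	 * The mappings are re-established below: the chip registers with
	 * ioremap_nocache() and the PIO buffers with ioremap_wc().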
1791 */ 1792 iounmap(dd->kregbase); 1793 dd->kregbase = NULL; 1794 1795 /* 1796 * Assumes chip address space looks like: 1797 * - kregs + sregs + cregs + uregs (in any order) 1798 * - piobufs (2K and 4K bufs in either order) 1799 * or: 1800 * - kregs + sregs + cregs (in any order) 1801 * - piobufs (2K and 4K bufs in either order) 1802 * - uregs 1803 */ 1804 if (dd->piobcnt4k == 0) { 1805 qib_kreglen = qib_pio2koffset; 1806 qib_piolen = qib_pio2klen; 1807 } else if (qib_pio2koffset < qib_pio4koffset) { 1808 qib_kreglen = qib_pio2koffset; 1809 qib_piolen = qib_pio4koffset + qib_pio4klen - qib_kreglen; 1810 } else { 1811 qib_kreglen = qib_pio4koffset; 1812 qib_piolen = qib_pio2koffset + qib_pio2klen - qib_kreglen; 1813 } 1814 qib_piolen += vl15buflen; 1815 /* Map just the configured ports (not all hw ports) */ 1816 if (dd->uregbase > qib_kreglen) 1817 qib_userlen = dd->ureg_align * dd->cfgctxts; 1818 1819 /* Sanity checks passed, now create the new mappings */ 1820 qib_kregbase = ioremap_nocache(qib_physaddr, qib_kreglen); 1821 if (!qib_kregbase) 1822 goto bail; 1823 1824 qib_piobase = ioremap_wc(qib_physaddr + qib_kreglen, qib_piolen); 1825 if (!qib_piobase) 1826 goto bail_kregbase; 1827 1828 if (qib_userlen) { 1829 qib_userbase = ioremap_nocache(qib_physaddr + dd->uregbase, 1830 qib_userlen); 1831 if (!qib_userbase) 1832 goto bail_piobase; 1833 } 1834 1835 dd->kregbase = qib_kregbase; 1836 dd->kregend = (u64 __iomem *) 1837 ((char __iomem *) qib_kregbase + qib_kreglen); 1838 dd->piobase = qib_piobase; 1839 dd->pio2kbase = (void __iomem *) 1840 (((char __iomem *) dd->piobase) + 1841 qib_pio2koffset - qib_kreglen); 1842 if (dd->piobcnt4k) 1843 dd->pio4kbase = (void __iomem *) 1844 (((char __iomem *) dd->piobase) + 1845 qib_pio4koffset - qib_kreglen); 1846 if (qib_userlen) 1847 /* ureg will now be accessed relative to dd->userbase */ 1848 dd->userbase = qib_userbase; 1849 return 0; 1850 1851 bail_piobase: 1852 iounmap(qib_piobase); 1853 bail_kregbase: 1854 iounmap(qib_kregbase); 1855 bail: 1856 return -ENOMEM; 1857 } 1858