/*
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $
 */

#include <linux/delay.h>
#include <linux/dma-mapping.h>

#include <rdma/ib_cache.h>
#include <linux/ip.h>
#include <linux/tcp.h>

#include "ipoib.h"

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level,
		 "Enable data path debug tracing if > 0");
#endif

static DEFINE_MUTEX(pkey_mutex);

struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
				 struct ib_pd *pd, struct ib_ah_attr *attr)
{
	struct ipoib_ah *ah;

	ah = kmalloc(sizeof *ah, GFP_KERNEL);
	if (!ah)
		return NULL;

	ah->dev = dev;
	ah->last_send = 0;
	kref_init(&ah->ref);

	ah->ah = ib_create_ah(pd, attr);
	if (IS_ERR(ah->ah)) {
		kfree(ah);
		ah = NULL;
	} else
		ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah);

	return ah;
}

void ipoib_free_ah(struct kref *kref)
{
	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
	struct ipoib_dev_priv *priv = netdev_priv(ah->dev);

	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	list_add_tail(&ah->list, &priv->dead_ahs);
	spin_unlock_irqrestore(&priv->lock, flags);
}

static int ipoib_ib_post_receive(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_sge list;
	struct ib_recv_wr param;
	struct ib_recv_wr *bad_wr;
	int ret;

	list.addr = priv->rx_ring[id].mapping;
	list.length = IPOIB_BUF_SIZE;
	list.lkey = priv->mr->lkey;

	param.next = NULL;
	param.wr_id = id | IPOIB_OP_RECV;
	param.sg_list = &list;
	param.num_sge = 1;

	ret = ib_post_recv(priv->qp, &param, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
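		/*
		 * Posting failed: unmap and free this receive buffer so the
		 * ring slot is left empty instead of pointing at a stale skb.
		 */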
		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
		dev_kfree_skb_any(priv->rx_ring[id].skb);
		priv->rx_ring[id].skb = NULL;
	}

	return ret;
}

static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;
	u64 addr;

	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
	if (!skb)
		return -ENOMEM;

	/*
	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
	 * header.  So we need 4 more bytes to get to 48 and align the
	 * IP header to a multiple of 16.
	 */
	skb_reserve(skb, 4);

	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
				 DMA_FROM_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
		dev_kfree_skb_any(skb);
		return -EIO;
	}

	priv->rx_ring[id].skb = skb;
	priv->rx_ring[id].mapping = addr;

	return 0;
}

static int ipoib_ib_post_receives(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (ipoib_alloc_rx_skb(dev, i)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			return -ENOMEM;
		}
		if (ipoib_ib_post_receive(dev, i)) {
			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
			return -EIO;
		}
	}

	return 0;
}

static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
	struct sk_buff *skb;
	u64 addr;

	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_recvq_size);
		return;
	}

	skb = priv->rx_ring[wr_id].skb;
	addr = priv->rx_ring[wr_id].mapping;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			ipoib_warn(priv, "failed recv event "
				   "(status=%d, wrid=%d vend_err %x)\n",
				   wc->status, wr_id, wc->vendor_err);
		ib_dma_unmap_single(priv->ca, addr,
				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
		dev_kfree_skb_any(skb);
		priv->rx_ring[wr_id].skb = NULL;
		return;
	}

	/*
	 * Drop packets that this interface sent, i.e. multicast packets
	 * that the HCA has replicated.
	 */
	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
		goto repost;

	/*
	 * If we can't allocate a new RX buffer, dump
	 * this packet and reuse the old buffer.
	 */
	if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
		++dev->stats.rx_dropped;
		goto repost;
	}

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);

	skb_put(skb, wc->byte_len);
	skb_pull(skb, IB_GRH_BYTES);

	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
	skb_reset_mac_header(skb);
	skb_pull(skb, IPOIB_ENCAP_LEN);

	dev->last_rx = jiffies;
	++dev->stats.rx_packets;
	dev->stats.rx_bytes += skb->len;

	skb->dev = dev;
	/* XXX get correct PACKET_ type here */
	skb->pkt_type = PACKET_HOST;

	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	netif_receive_skb(skb);

repost:
	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
		ipoib_warn(priv, "ipoib_ib_post_receive failed "
			   "for buf %d\n", wr_id);
}

static int ipoib_dma_map_tx(struct ib_device *ca,
			    struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
					       DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
			return -EIO;

		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		mapping[i + off] = ib_dma_map_page(ca, frag->page,
						   frag->page_offset, frag->size,
						   DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
			goto partial_error;
	}
	return 0;

partial_error:
	for (; i > 0; --i) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
		ib_dma_unmap_page(ca, mapping[i - !off], frag->size, DMA_TO_DEVICE);
	}

	if (off)
		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);

	return -EIO;
}

static void ipoib_dma_unmap_tx(struct ib_device *ca,
			       struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		ib_dma_unmap_page(ca, mapping[i + off], frag->size,
				  DMA_TO_DEVICE);
	}
}

static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	unsigned int wr_id = wc->wr_id;
	struct ipoib_tx_buf *tx_req;
	unsigned long flags;

	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &priv->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv->ca, tx_req);

	++dev->stats.tx_packets;
	dev->stats.tx_bytes += tx_req->skb->len;

	dev_kfree_skb_any(tx_req->skb);

	spin_lock_irqsave(&priv->tx_lock, flags);
	++priv->tx_tail;
	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
	    netif_queue_stopped(dev) &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		netif_wake_queue(dev);
	spin_unlock_irqrestore(&priv->tx_lock, flags);

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR)
		ipoib_warn(priv, "failed send event "
			   "(status=%d, wrid=%d vend_err %x)\n",
			   wc->status, wr_id, wc->vendor_err);
}
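
/*
 * NAPI poll routine.  Receive completions count against @budget; send
 * completions are handled as they are encountered but do not consume
 * budget.  Once the CQ has been emptied we re-arm it, and if
 * IB_CQ_REPORT_MISSED_EVENTS indicates completions slipped in while we
 * were re-arming, we reschedule ourselves rather than lose them.
 */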
int ipoib_poll(struct napi_struct *napi, int budget)
{
	struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi);
	struct net_device *dev = priv->dev;
	int done;
	int t;
	int n, i;

	done = 0;

poll_more:
	while (done < budget) {
		int max = (budget - done);

		t = min(IPOIB_NUM_WC, max);
		n = ib_poll_cq(priv->cq, t, priv->ibwc);

		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;

			if (wc->wr_id & IPOIB_OP_RECV) {
				++done;
				if (wc->wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, wc);
				else
					ipoib_ib_handle_rx_wc(dev, wc);
			} else {
				if (wc->wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_tx_wc(dev, wc);
				else
					ipoib_ib_handle_tx_wc(dev, wc);
			}
		}

		if (n != t)
			break;
	}

	if (done < budget) {
		netif_rx_complete(dev, napi);
		if (unlikely(ib_req_notify_cq(priv->cq,
					      IB_CQ_NEXT_COMP |
					      IB_CQ_REPORT_MISSED_EVENTS)) &&
		    netif_rx_reschedule(dev, napi))
			goto poll_more;
	}

	return done;
}

void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
{
	struct net_device *dev = dev_ptr;
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	netif_rx_schedule(dev, &priv->napi);
}
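
/*
 * Build the gather list for one UD send: SGE 0 covers the skb's linear
 * data (if any) and the remaining SGEs map its page fragments.  When a
 * pulled-out header is supplied, post an LSO work request so the HCA
 * segments the payload using gso_size as the MSS; otherwise post a
 * plain send.
 */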
static inline int post_send(struct ipoib_dev_priv *priv,
			    unsigned int wr_id,
			    struct ib_ah *address, u32 qpn,
			    struct ipoib_tx_buf *tx_req,
			    void *head, int hlen)
{
	struct ib_send_wr *bad_wr;
	int i, off;
	struct sk_buff *skb = tx_req->skb;
	skb_frag_t *frags = skb_shinfo(skb)->frags;
	int nr_frags = skb_shinfo(skb)->nr_frags;
	u64 *mapping = tx_req->mapping;

	if (skb_headlen(skb)) {
		priv->tx_sge[0].addr = mapping[0];
		priv->tx_sge[0].length = skb_headlen(skb);
		off = 1;
	} else
		off = 0;

	for (i = 0; i < nr_frags; ++i) {
		priv->tx_sge[i + off].addr = mapping[i + off];
		priv->tx_sge[i + off].length = frags[i].size;
	}
	priv->tx_wr.num_sge = nr_frags + off;
	priv->tx_wr.wr_id = wr_id;
	priv->tx_wr.wr.ud.remote_qpn = qpn;
	priv->tx_wr.wr.ud.ah = address;

	if (head) {
		priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size;
		priv->tx_wr.wr.ud.header = head;
		priv->tx_wr.wr.ud.hlen = hlen;
		priv->tx_wr.opcode = IB_WR_LSO;
	} else
		priv->tx_wr.opcode = IB_WR_SEND;

	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
}

void ipoib_send(struct net_device *dev, struct sk_buff *skb,
		struct ipoib_ah *address, u32 qpn)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_tx_buf *tx_req;
	int hlen;
	void *phead;

	if (skb_is_gso(skb)) {
		hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
		phead = skb->data;
		if (unlikely(!skb_pull(skb, hlen))) {
			ipoib_warn(priv, "linear data too small\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return;
		}
	} else {
		if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
				   skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
			return;
		}
		phead = NULL;
		hlen = 0;
	}

	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
		       skb->len, address, qpn);

	/*
	 * We put the skb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
	tx_req->skb = skb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
		++dev->stats.tx_errors;
		dev_kfree_skb_any(skb);
		return;
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
	else
		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;

	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
			       address->ah, qpn, tx_req, phead, hlen))) {
		ipoib_warn(priv, "post_send failed\n");
		++dev->stats.tx_errors;
		ipoib_dma_unmap_tx(priv->ca, tx_req);
		dev_kfree_skb_any(skb);
	} else {
		dev->trans_start = jiffies;

		address->last_send = priv->tx_head;
		++priv->tx_head;

		if (++priv->tx_outstanding == ipoib_sendq_size) {
			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
			netif_stop_queue(dev);
		}
	}
}
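
/*
 * Address handles cannot be destroyed while a send that references them
 * may still be outstanding, so ipoib_free_ah() only moves them onto the
 * dead_ahs list.  Destroy any AH whose last_send counter has been passed
 * by tx_tail, i.e. whose sends have all completed.
 */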
static void __ipoib_reap_ah(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_ah *ah, *tah;
	LIST_HEAD(remove_list);

	spin_lock_irq(&priv->tx_lock);
	spin_lock(&priv->lock);
	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
			list_del(&ah->list);
			ib_destroy_ah(ah->ah);
			kfree(ah);
		}
	spin_unlock(&priv->lock);
	spin_unlock_irq(&priv->tx_lock);
}

void ipoib_reap_ah(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
	struct net_device *dev = priv->dev;

	__ipoib_reap_ah(dev);

	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
				   round_jiffies_relative(HZ));
}

int ipoib_ib_dev_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int ret;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
		ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
		return -1;
	}
	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);

	ret = ipoib_init_qp(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
		return -1;
	}

	ret = ipoib_ib_post_receives(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
		ipoib_ib_dev_stop(dev, 1);
		return -1;
	}

	ret = ipoib_cm_dev_open(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
		ipoib_ib_dev_stop(dev, 1);
		return -1;
	}

	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
			   round_jiffies_relative(HZ));

	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	return 0;
}

static void ipoib_pkey_dev_check_presence(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	u16 pkey_index = 0;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	else
		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
}

int ipoib_ib_dev_up(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_pkey_dev_check_presence(dev);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_dbg(priv, "PKEY is not assigned.\n");
		return 0;
	}

	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);

	return ipoib_mcast_start_thread(dev);
}

int ipoib_ib_dev_down(struct net_device *dev, int flush)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "downing ib_dev\n");

	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
	netif_carrier_off(dev);

	/* Shutdown the P_Key thread if still active */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		set_bit(IPOIB_PKEY_STOP, &priv->flags);
		cancel_delayed_work(&priv->pkey_poll_task);
		mutex_unlock(&pkey_mutex);
		if (flush)
			flush_workqueue(ipoib_workqueue);
	}

	ipoib_mcast_stop_thread(dev, flush);
	ipoib_mcast_dev_flush(dev);

	ipoib_flush_paths(dev);

	return 0;
}

static int recvs_pending(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int pending = 0;
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (priv->rx_ring[i].skb)
			++pending;

	return pending;
}

void ipoib_drain_cq(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int i, n;
	do {
		n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; ++i) {
			/*
			 * Convert any successful completions to flush
			 * errors to avoid passing packets up the
			 * stack after bringing the device down.
			 */
			if (priv->ibwc[i].status == IB_WC_SUCCESS)
				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;

			if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
				else
					ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
			} else {
				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
				else
					ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
			}
		}
	} while (n == IPOIB_NUM_WC);
}
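
/*
 * Tear down the data path: move the QP to the error state, give posted
 * sends and receives up to five seconds to be flushed back to us, then
 * reset the QP and reap any remaining address handles.
 */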
int ipoib_ib_dev_stop(struct net_device *dev, int flush)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_attr qp_attr;
	unsigned long begin;
	struct ipoib_tx_buf *tx_req;
	int i;

	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	ipoib_cm_dev_stop(dev);

	/*
	 * Move our QP to the error state and then reinitialize when
	 * all work requests have completed or have been flushed.
	 */
	qp_attr.qp_state = IB_QPS_ERR;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");

	/* Wait for all sends and receives to complete */
	begin = jiffies;

	while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
				   priv->tx_head - priv->tx_tail, recvs_pending(dev));

			/*
			 * assume the HW is wedged and just free up
			 * all our pending work requests.
			 */
			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
				tx_req = &priv->tx_ring[priv->tx_tail &
							(ipoib_sendq_size - 1)];
				ipoib_dma_unmap_tx(priv->ca, tx_req);
				dev_kfree_skb_any(tx_req->skb);
				++priv->tx_tail;
				--priv->tx_outstanding;
			}

			for (i = 0; i < ipoib_recvq_size; ++i) {
				struct ipoib_rx_buf *rx_req;

				rx_req = &priv->rx_ring[i];
				if (!rx_req->skb)
					continue;
				ib_dma_unmap_single(priv->ca,
						    rx_req->mapping,
						    IPOIB_BUF_SIZE,
						    DMA_FROM_DEVICE);
				dev_kfree_skb_any(rx_req->skb);
				rx_req->skb = NULL;
			}

			goto timeout;
		}

		ipoib_drain_cq(dev);

		msleep(1);
	}

	ipoib_dbg(priv, "All sends and receives done.\n");

timeout:
	qp_attr.qp_state = IB_QPS_RESET;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to RESET state\n");

	/* Wait for all AHs to be reaped */
	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	cancel_delayed_work(&priv->ah_reap_task);
	if (flush)
		flush_workqueue(ipoib_workqueue);

	begin = jiffies;

	while (!list_empty(&priv->dead_ahs)) {
		__ipoib_reap_ah(dev);

		if (time_after(jiffies, begin + HZ)) {
			ipoib_warn(priv, "timing out; will leak address handles\n");
			break;
		}

		msleep(1);
	}

	ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP);

	return 0;
}

int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	priv->ca = ca;
	priv->port = port;
	priv->qp = NULL;

	if (ipoib_transport_dev_init(dev, ca)) {
		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
		return -ENODEV;
	}

	if (dev->flags & IFF_UP) {
		if (ipoib_ib_dev_open(dev)) {
			ipoib_transport_dev_cleanup(dev);
			return -ENODEV;
		}
	}

	return 0;
}
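
/*
 * Bring the data path down and, on a P_Key event, fully restart the QP;
 * then bring everything back up if the interface is still
 * administratively up.  Child (VLAN) interfaces are flushed first,
 * under vlan_mutex.
 */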
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
{
	struct ipoib_dev_priv *cpriv;
	struct net_device *dev = priv->dev;
	u16 new_index;

	mutex_lock(&priv->vlan_mutex);

	/*
	 * Flush any child interfaces too -- they might be up even if
	 * the parent is down.
	 */
	list_for_each_entry(cpriv, &priv->child_intfs, list)
		__ipoib_ib_dev_flush(cpriv, pkey_event);

	mutex_unlock(&priv->vlan_mutex);

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
		return;
	}

	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
		return;
	}

	if (pkey_event) {
		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
			ipoib_ib_dev_down(dev, 0);
			ipoib_ib_dev_stop(dev, 0);
			if (ipoib_pkey_dev_delay_open(dev))
				return;
		}

		/* restart QP only if P_Key index is changed */
		if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
		    new_index == priv->pkey_index) {
			ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
			return;
		}
		priv->pkey_index = new_index;
	}

	ipoib_dbg(priv, "flushing\n");

	ipoib_ib_dev_down(dev, 0);

	if (pkey_event) {
		ipoib_ib_dev_stop(dev, 0);
		ipoib_ib_dev_open(dev);
	}

	/*
	 * The device could have been brought down between the start and when
	 * we get here; don't bring it back up if it's not configured up.
	 */
	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		ipoib_ib_dev_up(dev);
		ipoib_mcast_restart_task(&priv->restart_task);
	}
}

void ipoib_ib_dev_flush(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_task);

	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
	__ipoib_ib_dev_flush(priv, 0);
}

void ipoib_pkey_event(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, pkey_event_task);

	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
	__ipoib_ib_dev_flush(priv, 1);
}

void ipoib_ib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "cleaning up ib_dev\n");

	ipoib_mcast_stop_thread(dev, 1);
	ipoib_mcast_dev_flush(dev);

	ipoib_transport_dev_cleanup(dev);
}

/*
 * Delayed P_Key Assignment Interim Support
 *
 * The following is an initial implementation of the delayed P_Key
 * assignment mechanism.  It uses the same approach implemented for the
 * multicast group join.  The single goal of this implementation is to
 * quickly address Bug #2507.  This implementation will probably be
 * removed when the P_Key change async notification is available.
 */
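
/*
 * Poll once a second until the P_Key shows up in the local port's
 * P_Key table, then finish bringing the interface up via ipoib_open().
 */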
void ipoib_pkey_poll(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
	struct net_device *dev = priv->dev;

	ipoib_pkey_dev_check_presence(dev);

	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
		ipoib_open(dev);
	else {
		mutex_lock(&pkey_mutex);
		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
			queue_delayed_work(ipoib_workqueue,
					   &priv->pkey_poll_task,
					   HZ);
		mutex_unlock(&pkey_mutex);
	}
}

int ipoib_pkey_dev_delay_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/*
	 * Look for the interface pkey value in the IB Port P_Key table
	 * and set the interface pkey assignment flag.
	 */
	ipoib_pkey_dev_check_presence(dev);

	/* P_Key value not assigned yet - start polling */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
		queue_delayed_work(ipoib_workqueue,
				   &priv->pkey_poll_task,
				   HZ);
		mutex_unlock(&pkey_mutex);
		return 1;
	}

	return 0;
}