1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 33 * 34 * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $ 35 */ 36 37 #include "ipoib.h" 38 39 #include <linux/module.h> 40 41 #include <linux/init.h> 42 #include <linux/slab.h> 43 #include <linux/vmalloc.h> 44 45 #include <linux/if_arp.h> /* For ARPHRD_xxx */ 46 47 #include <linux/ip.h> 48 #include <linux/in.h> 49 50 #include <net/dst.h> 51 52 MODULE_AUTHOR("Roland Dreier"); 53 MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); 54 MODULE_LICENSE("Dual BSD/GPL"); 55 56 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 57 int ipoib_debug_level; 58 59 module_param_named(debug_level, ipoib_debug_level, int, 0644); 60 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); 61 #endif 62 63 struct ipoib_path_iter { 64 struct net_device *dev; 65 struct ipoib_path path; 66 }; 67 68 static const u8 ipv4_bcast_addr[] = { 69 0x00, 0xff, 0xff, 0xff, 70 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 71 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff 72 }; 73 74 struct workqueue_struct *ipoib_workqueue; 75 76 static void ipoib_add_one(struct ib_device *device); 77 static void ipoib_remove_one(struct ib_device *device); 78 79 static struct ib_client ipoib_client = { 80 .name = "ipoib", 81 .add = ipoib_add_one, 82 .remove = ipoib_remove_one 83 }; 84 85 int ipoib_open(struct net_device *dev) 86 { 87 struct ipoib_dev_priv *priv = netdev_priv(dev); 88 89 ipoib_dbg(priv, "bringing up interface\n"); 90 91 set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 92 93 if (ipoib_pkey_dev_delay_open(dev)) 94 return 0; 95 96 if (ipoib_ib_dev_open(dev)) 97 return -EINVAL; 98 99 if (ipoib_ib_dev_up(dev)) { 100 ipoib_ib_dev_stop(dev); 101 return -EINVAL; 102 } 103 104 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 105 struct ipoib_dev_priv *cpriv; 106 107 /* Bring up any child interfaces too */ 108 down(&priv->vlan_mutex); 109 list_for_each_entry(cpriv, &priv->child_intfs, list) { 110 int flags; 111 112 flags = cpriv->dev->flags; 113 if (flags & IFF_UP) 114 continue; 115 116 dev_change_flags(cpriv->dev, flags | IFF_UP); 117 } 118 up(&priv->vlan_mutex); 119 } 120 121 netif_start_queue(dev); 122 123 return 0; 124 } 125 126 static int ipoib_stop(struct net_device *dev) 127 { 128 struct ipoib_dev_priv *priv = netdev_priv(dev); 129 130 ipoib_dbg(priv, "stopping interface\n"); 131 132 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 133 134 netif_stop_queue(dev); 135 136 ipoib_ib_dev_down(dev); 137 ipoib_ib_dev_stop(dev); 138 139 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 140 struct ipoib_dev_priv *cpriv; 141 142 /* Bring down any child interfaces too */ 143 down(&priv->vlan_mutex); 144 list_for_each_entry(cpriv, &priv->child_intfs, list) { 145 int flags; 146 147 flags = cpriv->dev->flags; 148 if (!(flags & IFF_UP)) 149 continue; 150 151 dev_change_flags(cpriv->dev, flags & ~IFF_UP); 152 } 153 up(&priv->vlan_mutex); 154 } 155 156 return 0; 157 } 158 159 static int ipoib_change_mtu(struct net_device *dev, int new_mtu) 160 { 161 struct ipoib_dev_priv *priv = netdev_priv(dev); 162 163 if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) 164 return -EINVAL; 165 166 priv->admin_mtu = new_mtu; 167 168 dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); 169 170 return 0; 171 } 172 173 static struct ipoib_path *__path_find(struct net_device *dev, 174 union ib_gid *gid) 175 { 176 struct ipoib_dev_priv *priv = netdev_priv(dev); 177 struct rb_node *n = priv->path_tree.rb_node; 178 struct ipoib_path *path; 179 int ret; 180 181 while (n) { 182 path = rb_entry(n, struct ipoib_path, rb_node); 183 184 ret = memcmp(gid->raw, path->pathrec.dgid.raw, 185 sizeof (union ib_gid)); 186 187 if (ret < 0) 188 n = n->rb_left; 189 else if (ret > 0) 190 n = n->rb_right; 191 else 192 return path; 193 } 194 195 return NULL; 196 } 197 198 static int __path_add(struct net_device *dev, struct ipoib_path *path) 199 { 200 struct ipoib_dev_priv *priv = netdev_priv(dev); 201 struct rb_node **n = &priv->path_tree.rb_node; 202 struct rb_node *pn = NULL; 203 struct ipoib_path *tpath; 204 int ret; 205 206 while (*n) { 207 pn = *n; 208 tpath = rb_entry(pn, struct ipoib_path, rb_node); 209 210 ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, 211 sizeof (union ib_gid)); 212 if (ret < 0) 213 n = &pn->rb_left; 214 else if (ret > 0) 215 n = &pn->rb_right; 216 else 217 return -EEXIST; 218 } 219 220 rb_link_node(&path->rb_node, pn, n); 221 rb_insert_color(&path->rb_node, &priv->path_tree); 222 223 list_add_tail(&path->list, &priv->path_list); 224 225 return 0; 226 } 227 228 static void path_free(struct net_device *dev, struct ipoib_path *path) 229 { 230 struct ipoib_dev_priv *priv = netdev_priv(dev); 231 struct ipoib_neigh *neigh, *tn; 232 struct sk_buff *skb; 233 unsigned long flags; 234 235 while ((skb = __skb_dequeue(&path->queue))) 236 dev_kfree_skb_irq(skb); 237 238 spin_lock_irqsave(&priv->lock, flags); 239 240 list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { 241 /* 242 * It's safe to call ipoib_put_ah() inside priv->lock 243 * here, because we know that path->ah will always 244 * hold one more reference, so ipoib_put_ah() will 245 * never do more than decrement the ref count. 246 */ 247 if (neigh->ah) 248 ipoib_put_ah(neigh->ah); 249 *to_ipoib_neigh(neigh->neighbour) = NULL; 250 neigh->neighbour->ops->destructor = NULL; 251 kfree(neigh); 252 } 253 254 spin_unlock_irqrestore(&priv->lock, flags); 255 256 if (path->ah) 257 ipoib_put_ah(path->ah); 258 259 kfree(path); 260 } 261 262 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 263 264 struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) 265 { 266 struct ipoib_path_iter *iter; 267 268 iter = kmalloc(sizeof *iter, GFP_KERNEL); 269 if (!iter) 270 return NULL; 271 272 iter->dev = dev; 273 memset(iter->path.pathrec.dgid.raw, 0, 16); 274 275 if (ipoib_path_iter_next(iter)) { 276 kfree(iter); 277 return NULL; 278 } 279 280 return iter; 281 } 282 283 int ipoib_path_iter_next(struct ipoib_path_iter *iter) 284 { 285 struct ipoib_dev_priv *priv = netdev_priv(iter->dev); 286 struct rb_node *n; 287 struct ipoib_path *path; 288 int ret = 1; 289 290 spin_lock_irq(&priv->lock); 291 292 n = rb_first(&priv->path_tree); 293 294 while (n) { 295 path = rb_entry(n, struct ipoib_path, rb_node); 296 297 if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, 298 sizeof (union ib_gid)) < 0) { 299 iter->path = *path; 300 ret = 0; 301 break; 302 } 303 304 n = rb_next(n); 305 } 306 307 spin_unlock_irq(&priv->lock); 308 309 return ret; 310 } 311 312 void ipoib_path_iter_read(struct ipoib_path_iter *iter, 313 struct ipoib_path *path) 314 { 315 *path = iter->path; 316 } 317 318 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ 319 320 void ipoib_flush_paths(struct net_device *dev) 321 { 322 struct ipoib_dev_priv *priv = netdev_priv(dev); 323 struct ipoib_path *path, *tp; 324 LIST_HEAD(remove_list); 325 unsigned long flags; 326 327 spin_lock_irqsave(&priv->lock, flags); 328 329 list_splice(&priv->path_list, &remove_list); 330 INIT_LIST_HEAD(&priv->path_list); 331 332 list_for_each_entry(path, &remove_list, list) 333 rb_erase(&path->rb_node, &priv->path_tree); 334 335 spin_unlock_irqrestore(&priv->lock, flags); 336 337 list_for_each_entry_safe(path, tp, &remove_list, list) { 338 if (path->query) 339 ib_sa_cancel_query(path->query_id, path->query); 340 wait_for_completion(&path->done); 341 path_free(dev, path); 342 } 343 } 344 345 static void path_rec_completion(int status, 346 struct ib_sa_path_rec *pathrec, 347 void *path_ptr) 348 { 349 struct ipoib_path *path = path_ptr; 350 struct net_device *dev = path->dev; 351 struct ipoib_dev_priv *priv = netdev_priv(dev); 352 struct ipoib_ah *ah = NULL; 353 struct ipoib_neigh *neigh; 354 struct sk_buff_head skqueue; 355 struct sk_buff *skb; 356 unsigned long flags; 357 358 if (pathrec) 359 ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n", 360 be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid)); 361 else 362 ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n", 363 status, IPOIB_GID_ARG(path->pathrec.dgid)); 364 365 skb_queue_head_init(&skqueue); 366 367 if (!status) { 368 struct ib_ah_attr av = { 369 .dlid = be16_to_cpu(pathrec->dlid), 370 .sl = pathrec->sl, 371 .port_num = priv->port 372 }; 373 int path_rate = ib_sa_rate_enum_to_int(pathrec->rate); 374 375 if (path_rate > 0 && priv->local_rate > path_rate) 376 av.static_rate = (priv->local_rate - 1) / path_rate; 377 378 ipoib_dbg(priv, "static_rate %d for local port %dX, path %dX\n", 379 av.static_rate, priv->local_rate, 380 ib_sa_rate_enum_to_int(pathrec->rate)); 381 382 ah = ipoib_create_ah(dev, priv->pd, &av); 383 } 384 385 spin_lock_irqsave(&priv->lock, flags); 386 387 path->ah = ah; 388 389 if (ah) { 390 path->pathrec = *pathrec; 391 392 ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", 393 ah, be16_to_cpu(pathrec->dlid), pathrec->sl); 394 395 while ((skb = __skb_dequeue(&path->queue))) 396 __skb_queue_tail(&skqueue, skb); 397 398 list_for_each_entry(neigh, &path->neigh_list, list) { 399 kref_get(&path->ah->ref); 400 neigh->ah = path->ah; 401 402 while ((skb = __skb_dequeue(&neigh->queue))) 403 __skb_queue_tail(&skqueue, skb); 404 } 405 } 406 407 path->query = NULL; 408 complete(&path->done); 409 410 spin_unlock_irqrestore(&priv->lock, flags); 411 412 while ((skb = __skb_dequeue(&skqueue))) { 413 skb->dev = dev; 414 if (dev_queue_xmit(skb)) 415 ipoib_warn(priv, "dev_queue_xmit failed " 416 "to requeue packet\n"); 417 } 418 } 419 420 static struct ipoib_path *path_rec_create(struct net_device *dev, 421 union ib_gid *gid) 422 { 423 struct ipoib_dev_priv *priv = netdev_priv(dev); 424 struct ipoib_path *path; 425 426 path = kzalloc(sizeof *path, GFP_ATOMIC); 427 if (!path) 428 return NULL; 429 430 path->dev = dev; 431 432 skb_queue_head_init(&path->queue); 433 434 INIT_LIST_HEAD(&path->neigh_list); 435 436 memcpy(path->pathrec.dgid.raw, gid->raw, sizeof (union ib_gid)); 437 path->pathrec.sgid = priv->local_gid; 438 path->pathrec.pkey = cpu_to_be16(priv->pkey); 439 path->pathrec.numb_path = 1; 440 441 return path; 442 } 443 444 static int path_rec_start(struct net_device *dev, 445 struct ipoib_path *path) 446 { 447 struct ipoib_dev_priv *priv = netdev_priv(dev); 448 449 ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT "\n", 450 IPOIB_GID_ARG(path->pathrec.dgid)); 451 452 init_completion(&path->done); 453 454 path->query_id = 455 ib_sa_path_rec_get(priv->ca, priv->port, 456 &path->pathrec, 457 IB_SA_PATH_REC_DGID | 458 IB_SA_PATH_REC_SGID | 459 IB_SA_PATH_REC_NUMB_PATH | 460 IB_SA_PATH_REC_PKEY, 461 1000, GFP_ATOMIC, 462 path_rec_completion, 463 path, &path->query); 464 if (path->query_id < 0) { 465 ipoib_warn(priv, "ib_sa_path_rec_get failed\n"); 466 path->query = NULL; 467 return path->query_id; 468 } 469 470 return 0; 471 } 472 473 static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) 474 { 475 struct ipoib_dev_priv *priv = netdev_priv(dev); 476 struct ipoib_path *path; 477 struct ipoib_neigh *neigh; 478 479 neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); 480 if (!neigh) { 481 ++priv->stats.tx_dropped; 482 dev_kfree_skb_any(skb); 483 return; 484 } 485 486 skb_queue_head_init(&neigh->queue); 487 neigh->neighbour = skb->dst->neighbour; 488 *to_ipoib_neigh(skb->dst->neighbour) = neigh; 489 490 /* 491 * We can only be called from ipoib_start_xmit, so we're 492 * inside tx_lock -- no need to save/restore flags. 493 */ 494 spin_lock(&priv->lock); 495 496 path = __path_find(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4)); 497 if (!path) { 498 path = path_rec_create(dev, 499 (union ib_gid *) (skb->dst->neighbour->ha + 4)); 500 if (!path) 501 goto err; 502 503 __path_add(dev, path); 504 } 505 506 list_add_tail(&neigh->list, &path->neigh_list); 507 508 if (path->pathrec.dlid) { 509 kref_get(&path->ah->ref); 510 neigh->ah = path->ah; 511 512 ipoib_send(dev, skb, path->ah, 513 be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); 514 } else { 515 neigh->ah = NULL; 516 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 517 __skb_queue_tail(&neigh->queue, skb); 518 } else { 519 ++priv->stats.tx_dropped; 520 dev_kfree_skb_any(skb); 521 } 522 523 if (!path->query && path_rec_start(dev, path)) 524 goto err; 525 } 526 527 spin_unlock(&priv->lock); 528 return; 529 530 err: 531 *to_ipoib_neigh(skb->dst->neighbour) = NULL; 532 list_del(&neigh->list); 533 neigh->neighbour->ops->destructor = NULL; 534 kfree(neigh); 535 536 ++priv->stats.tx_dropped; 537 dev_kfree_skb_any(skb); 538 539 spin_unlock(&priv->lock); 540 } 541 542 static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) 543 { 544 struct ipoib_dev_priv *priv = netdev_priv(skb->dev); 545 546 /* Look up path record for unicasts */ 547 if (skb->dst->neighbour->ha[4] != 0xff) { 548 neigh_add_path(skb, dev); 549 return; 550 } 551 552 /* Add in the P_Key for multicasts */ 553 skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; 554 skb->dst->neighbour->ha[9] = priv->pkey & 0xff; 555 ipoib_mcast_send(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4), skb); 556 } 557 558 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, 559 struct ipoib_pseudoheader *phdr) 560 { 561 struct ipoib_dev_priv *priv = netdev_priv(dev); 562 struct ipoib_path *path; 563 564 /* 565 * We can only be called from ipoib_start_xmit, so we're 566 * inside tx_lock -- no need to save/restore flags. 567 */ 568 spin_lock(&priv->lock); 569 570 path = __path_find(dev, (union ib_gid *) (phdr->hwaddr + 4)); 571 if (!path) { 572 path = path_rec_create(dev, 573 (union ib_gid *) (phdr->hwaddr + 4)); 574 if (path) { 575 /* put pseudoheader back on for next time */ 576 skb_push(skb, sizeof *phdr); 577 __skb_queue_tail(&path->queue, skb); 578 579 if (path_rec_start(dev, path)) { 580 spin_unlock(&priv->lock); 581 path_free(dev, path); 582 return; 583 } else 584 __path_add(dev, path); 585 } else { 586 ++priv->stats.tx_dropped; 587 dev_kfree_skb_any(skb); 588 } 589 590 spin_unlock(&priv->lock); 591 return; 592 } 593 594 if (path->pathrec.dlid) { 595 ipoib_dbg(priv, "Send unicast ARP to %04x\n", 596 be16_to_cpu(path->pathrec.dlid)); 597 598 ipoib_send(dev, skb, path->ah, 599 be32_to_cpup((__be32 *) phdr->hwaddr)); 600 } else if ((path->query || !path_rec_start(dev, path)) && 601 skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 602 /* put pseudoheader back on for next time */ 603 skb_push(skb, sizeof *phdr); 604 __skb_queue_tail(&path->queue, skb); 605 } else { 606 ++priv->stats.tx_dropped; 607 dev_kfree_skb_any(skb); 608 } 609 610 spin_unlock(&priv->lock); 611 } 612 613 static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) 614 { 615 struct ipoib_dev_priv *priv = netdev_priv(dev); 616 struct ipoib_neigh *neigh; 617 unsigned long flags; 618 619 if (!spin_trylock_irqsave(&priv->tx_lock, flags)) 620 return NETDEV_TX_LOCKED; 621 622 /* 623 * Check if our queue is stopped. Since we have the LLTX bit 624 * set, we can't rely on netif_stop_queue() preventing our 625 * xmit function from being called with a full queue. 626 */ 627 if (unlikely(netif_queue_stopped(dev))) { 628 spin_unlock_irqrestore(&priv->tx_lock, flags); 629 return NETDEV_TX_BUSY; 630 } 631 632 if (skb->dst && skb->dst->neighbour) { 633 if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) { 634 ipoib_path_lookup(skb, dev); 635 goto out; 636 } 637 638 neigh = *to_ipoib_neigh(skb->dst->neighbour); 639 640 if (likely(neigh->ah)) { 641 ipoib_send(dev, skb, neigh->ah, 642 be32_to_cpup((__be32 *) skb->dst->neighbour->ha)); 643 goto out; 644 } 645 646 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 647 spin_lock(&priv->lock); 648 __skb_queue_tail(&neigh->queue, skb); 649 spin_unlock(&priv->lock); 650 } else { 651 ++priv->stats.tx_dropped; 652 dev_kfree_skb_any(skb); 653 } 654 } else { 655 struct ipoib_pseudoheader *phdr = 656 (struct ipoib_pseudoheader *) skb->data; 657 skb_pull(skb, sizeof *phdr); 658 659 if (phdr->hwaddr[4] == 0xff) { 660 /* Add in the P_Key for multicast*/ 661 phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; 662 phdr->hwaddr[9] = priv->pkey & 0xff; 663 664 ipoib_mcast_send(dev, (union ib_gid *) (phdr->hwaddr + 4), skb); 665 } else { 666 /* unicast GID -- should be ARP or RARP reply */ 667 668 if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && 669 (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { 670 ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x " 671 IPOIB_GID_FMT "\n", 672 skb->dst ? "neigh" : "dst", 673 be16_to_cpup((__be16 *) skb->data), 674 be32_to_cpup((__be32 *) phdr->hwaddr), 675 IPOIB_GID_ARG(*(union ib_gid *) (phdr->hwaddr + 4))); 676 dev_kfree_skb_any(skb); 677 ++priv->stats.tx_dropped; 678 goto out; 679 } 680 681 unicast_arp_send(skb, dev, phdr); 682 } 683 } 684 685 out: 686 spin_unlock_irqrestore(&priv->tx_lock, flags); 687 688 return NETDEV_TX_OK; 689 } 690 691 static struct net_device_stats *ipoib_get_stats(struct net_device *dev) 692 { 693 struct ipoib_dev_priv *priv = netdev_priv(dev); 694 695 return &priv->stats; 696 } 697 698 static void ipoib_timeout(struct net_device *dev) 699 { 700 struct ipoib_dev_priv *priv = netdev_priv(dev); 701 702 ipoib_warn(priv, "transmit timeout: latency %d msecs\n", 703 jiffies_to_msecs(jiffies - dev->trans_start)); 704 ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n", 705 netif_queue_stopped(dev), 706 priv->tx_head, priv->tx_tail); 707 /* XXX reset QP, etc. */ 708 } 709 710 static int ipoib_hard_header(struct sk_buff *skb, 711 struct net_device *dev, 712 unsigned short type, 713 void *daddr, void *saddr, unsigned len) 714 { 715 struct ipoib_header *header; 716 717 header = (struct ipoib_header *) skb_push(skb, sizeof *header); 718 719 header->proto = htons(type); 720 header->reserved = 0; 721 722 /* 723 * If we don't have a neighbour structure, stuff the 724 * destination address onto the front of the skb so we can 725 * figure out where to send the packet later. 726 */ 727 if (!skb->dst || !skb->dst->neighbour) { 728 struct ipoib_pseudoheader *phdr = 729 (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); 730 memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); 731 } 732 733 return 0; 734 } 735 736 static void ipoib_set_mcast_list(struct net_device *dev) 737 { 738 struct ipoib_dev_priv *priv = netdev_priv(dev); 739 740 queue_work(ipoib_workqueue, &priv->restart_task); 741 } 742 743 static void ipoib_neigh_destructor(struct neighbour *n) 744 { 745 struct ipoib_neigh *neigh; 746 struct ipoib_dev_priv *priv = netdev_priv(n->dev); 747 unsigned long flags; 748 struct ipoib_ah *ah = NULL; 749 750 ipoib_dbg(priv, 751 "neigh_destructor for %06x " IPOIB_GID_FMT "\n", 752 be32_to_cpup((__be32 *) n->ha), 753 IPOIB_GID_ARG(*((union ib_gid *) (n->ha + 4)))); 754 755 spin_lock_irqsave(&priv->lock, flags); 756 757 neigh = *to_ipoib_neigh(n); 758 if (neigh) { 759 if (neigh->ah) 760 ah = neigh->ah; 761 list_del(&neigh->list); 762 *to_ipoib_neigh(n) = NULL; 763 kfree(neigh); 764 } 765 766 spin_unlock_irqrestore(&priv->lock, flags); 767 768 if (ah) 769 ipoib_put_ah(ah); 770 } 771 772 static int ipoib_neigh_setup(struct neighbour *neigh) 773 { 774 /* 775 * Is this kosher? I can't find anybody in the kernel that 776 * sets neigh->destructor, so we should be able to set it here 777 * without trouble. 778 */ 779 neigh->ops->destructor = ipoib_neigh_destructor; 780 781 return 0; 782 } 783 784 static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) 785 { 786 parms->neigh_setup = ipoib_neigh_setup; 787 788 return 0; 789 } 790 791 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) 792 { 793 struct ipoib_dev_priv *priv = netdev_priv(dev); 794 795 /* Allocate RX/TX "rings" to hold queued skbs */ 796 797 priv->rx_ring = kzalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_rx_buf), 798 GFP_KERNEL); 799 if (!priv->rx_ring) { 800 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", 801 ca->name, IPOIB_RX_RING_SIZE); 802 goto out; 803 } 804 805 priv->tx_ring = kzalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_tx_buf), 806 GFP_KERNEL); 807 if (!priv->tx_ring) { 808 printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", 809 ca->name, IPOIB_TX_RING_SIZE); 810 goto out_rx_ring_cleanup; 811 } 812 813 /* priv->tx_head & tx_tail are already 0 */ 814 815 if (ipoib_ib_dev_init(dev, ca, port)) 816 goto out_tx_ring_cleanup; 817 818 return 0; 819 820 out_tx_ring_cleanup: 821 kfree(priv->tx_ring); 822 823 out_rx_ring_cleanup: 824 kfree(priv->rx_ring); 825 826 out: 827 return -ENOMEM; 828 } 829 830 void ipoib_dev_cleanup(struct net_device *dev) 831 { 832 struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; 833 834 ipoib_delete_debug_files(dev); 835 836 /* Delete any child interfaces first */ 837 list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { 838 unregister_netdev(cpriv->dev); 839 ipoib_dev_cleanup(cpriv->dev); 840 free_netdev(cpriv->dev); 841 } 842 843 ipoib_ib_dev_cleanup(dev); 844 845 kfree(priv->rx_ring); 846 kfree(priv->tx_ring); 847 848 priv->rx_ring = NULL; 849 priv->tx_ring = NULL; 850 } 851 852 static void ipoib_setup(struct net_device *dev) 853 { 854 struct ipoib_dev_priv *priv = netdev_priv(dev); 855 856 dev->open = ipoib_open; 857 dev->stop = ipoib_stop; 858 dev->change_mtu = ipoib_change_mtu; 859 dev->hard_start_xmit = ipoib_start_xmit; 860 dev->get_stats = ipoib_get_stats; 861 dev->tx_timeout = ipoib_timeout; 862 dev->hard_header = ipoib_hard_header; 863 dev->set_multicast_list = ipoib_set_mcast_list; 864 dev->neigh_setup = ipoib_neigh_setup_dev; 865 866 dev->watchdog_timeo = HZ; 867 868 dev->flags |= IFF_BROADCAST | IFF_MULTICAST; 869 870 /* 871 * We add in INFINIBAND_ALEN to allow for the destination 872 * address "pseudoheader" for skbs without neighbour struct. 873 */ 874 dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN; 875 dev->addr_len = INFINIBAND_ALEN; 876 dev->type = ARPHRD_INFINIBAND; 877 dev->tx_queue_len = IPOIB_TX_RING_SIZE * 2; 878 dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX; 879 880 /* MTU will be reset when mcast join happens */ 881 dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN; 882 priv->mcast_mtu = priv->admin_mtu = dev->mtu; 883 884 memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); 885 886 netif_carrier_off(dev); 887 888 SET_MODULE_OWNER(dev); 889 890 priv->dev = dev; 891 892 spin_lock_init(&priv->lock); 893 spin_lock_init(&priv->tx_lock); 894 895 init_MUTEX(&priv->mcast_mutex); 896 init_MUTEX(&priv->vlan_mutex); 897 898 INIT_LIST_HEAD(&priv->path_list); 899 INIT_LIST_HEAD(&priv->child_intfs); 900 INIT_LIST_HEAD(&priv->dead_ahs); 901 INIT_LIST_HEAD(&priv->multicast_list); 902 903 INIT_WORK(&priv->pkey_task, ipoib_pkey_poll, priv->dev); 904 INIT_WORK(&priv->mcast_task, ipoib_mcast_join_task, priv->dev); 905 INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush, priv->dev); 906 INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task, priv->dev); 907 INIT_WORK(&priv->ah_reap_task, ipoib_reap_ah, priv->dev); 908 } 909 910 struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) 911 { 912 struct net_device *dev; 913 914 dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name, 915 ipoib_setup); 916 if (!dev) 917 return NULL; 918 919 return netdev_priv(dev); 920 } 921 922 static ssize_t show_pkey(struct class_device *cdev, char *buf) 923 { 924 struct ipoib_dev_priv *priv = 925 netdev_priv(container_of(cdev, struct net_device, class_dev)); 926 927 return sprintf(buf, "0x%04x\n", priv->pkey); 928 } 929 static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); 930 931 static ssize_t create_child(struct class_device *cdev, 932 const char *buf, size_t count) 933 { 934 int pkey; 935 int ret; 936 937 if (sscanf(buf, "%i", &pkey) != 1) 938 return -EINVAL; 939 940 if (pkey < 0 || pkey > 0xffff) 941 return -EINVAL; 942 943 /* 944 * Set the full membership bit, so that we join the right 945 * broadcast group, etc. 946 */ 947 pkey |= 0x8000; 948 949 ret = ipoib_vlan_add(container_of(cdev, struct net_device, class_dev), 950 pkey); 951 952 return ret ? ret : count; 953 } 954 static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child); 955 956 static ssize_t delete_child(struct class_device *cdev, 957 const char *buf, size_t count) 958 { 959 int pkey; 960 int ret; 961 962 if (sscanf(buf, "%i", &pkey) != 1) 963 return -EINVAL; 964 965 if (pkey < 0 || pkey > 0xffff) 966 return -EINVAL; 967 968 ret = ipoib_vlan_delete(container_of(cdev, struct net_device, class_dev), 969 pkey); 970 971 return ret ? ret : count; 972 973 } 974 static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child); 975 976 int ipoib_add_pkey_attr(struct net_device *dev) 977 { 978 return class_device_create_file(&dev->class_dev, 979 &class_device_attr_pkey); 980 } 981 982 static struct net_device *ipoib_add_port(const char *format, 983 struct ib_device *hca, u8 port) 984 { 985 struct ipoib_dev_priv *priv; 986 int result = -ENOMEM; 987 988 priv = ipoib_intf_alloc(format); 989 if (!priv) 990 goto alloc_mem_failed; 991 992 SET_NETDEV_DEV(priv->dev, hca->dma_device); 993 994 result = ib_query_pkey(hca, port, 0, &priv->pkey); 995 if (result) { 996 printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", 997 hca->name, port, result); 998 goto alloc_mem_failed; 999 } 1000 1001 /* 1002 * Set the full membership bit, so that we join the right 1003 * broadcast group, etc. 1004 */ 1005 priv->pkey |= 0x8000; 1006 1007 priv->dev->broadcast[8] = priv->pkey >> 8; 1008 priv->dev->broadcast[9] = priv->pkey & 0xff; 1009 1010 result = ib_query_gid(hca, port, 0, &priv->local_gid); 1011 if (result) { 1012 printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", 1013 hca->name, port, result); 1014 goto alloc_mem_failed; 1015 } else 1016 memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); 1017 1018 1019 result = ipoib_dev_init(priv->dev, hca, port); 1020 if (result < 0) { 1021 printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", 1022 hca->name, port, result); 1023 goto device_init_failed; 1024 } 1025 1026 INIT_IB_EVENT_HANDLER(&priv->event_handler, 1027 priv->ca, ipoib_event); 1028 result = ib_register_event_handler(&priv->event_handler); 1029 if (result < 0) { 1030 printk(KERN_WARNING "%s: ib_register_event_handler failed for " 1031 "port %d (ret = %d)\n", 1032 hca->name, port, result); 1033 goto event_failed; 1034 } 1035 1036 result = register_netdev(priv->dev); 1037 if (result) { 1038 printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n", 1039 hca->name, port, result); 1040 goto register_failed; 1041 } 1042 1043 ipoib_create_debug_files(priv->dev); 1044 1045 if (ipoib_add_pkey_attr(priv->dev)) 1046 goto sysfs_failed; 1047 if (class_device_create_file(&priv->dev->class_dev, 1048 &class_device_attr_create_child)) 1049 goto sysfs_failed; 1050 if (class_device_create_file(&priv->dev->class_dev, 1051 &class_device_attr_delete_child)) 1052 goto sysfs_failed; 1053 1054 return priv->dev; 1055 1056 sysfs_failed: 1057 ipoib_delete_debug_files(priv->dev); 1058 unregister_netdev(priv->dev); 1059 1060 register_failed: 1061 ib_unregister_event_handler(&priv->event_handler); 1062 flush_scheduled_work(); 1063 1064 event_failed: 1065 ipoib_dev_cleanup(priv->dev); 1066 1067 device_init_failed: 1068 free_netdev(priv->dev); 1069 1070 alloc_mem_failed: 1071 return ERR_PTR(result); 1072 } 1073 1074 static void ipoib_add_one(struct ib_device *device) 1075 { 1076 struct list_head *dev_list; 1077 struct net_device *dev; 1078 struct ipoib_dev_priv *priv; 1079 int s, e, p; 1080 1081 dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); 1082 if (!dev_list) 1083 return; 1084 1085 INIT_LIST_HEAD(dev_list); 1086 1087 if (device->node_type == IB_NODE_SWITCH) { 1088 s = 0; 1089 e = 0; 1090 } else { 1091 s = 1; 1092 e = device->phys_port_cnt; 1093 } 1094 1095 for (p = s; p <= e; ++p) { 1096 dev = ipoib_add_port("ib%d", device, p); 1097 if (!IS_ERR(dev)) { 1098 priv = netdev_priv(dev); 1099 list_add_tail(&priv->list, dev_list); 1100 } 1101 } 1102 1103 ib_set_client_data(device, &ipoib_client, dev_list); 1104 } 1105 1106 static void ipoib_remove_one(struct ib_device *device) 1107 { 1108 struct ipoib_dev_priv *priv, *tmp; 1109 struct list_head *dev_list; 1110 1111 dev_list = ib_get_client_data(device, &ipoib_client); 1112 1113 list_for_each_entry_safe(priv, tmp, dev_list, list) { 1114 ib_unregister_event_handler(&priv->event_handler); 1115 flush_scheduled_work(); 1116 1117 unregister_netdev(priv->dev); 1118 ipoib_dev_cleanup(priv->dev); 1119 free_netdev(priv->dev); 1120 } 1121 1122 kfree(dev_list); 1123 } 1124 1125 static int __init ipoib_init_module(void) 1126 { 1127 int ret; 1128 1129 ret = ipoib_register_debugfs(); 1130 if (ret) 1131 return ret; 1132 1133 /* 1134 * We create our own workqueue mainly because we want to be 1135 * able to flush it when devices are being removed. We can't 1136 * use schedule_work()/flush_scheduled_work() because both 1137 * unregister_netdev() and linkwatch_event take the rtnl lock, 1138 * so flush_scheduled_work() can deadlock during device 1139 * removal. 1140 */ 1141 ipoib_workqueue = create_singlethread_workqueue("ipoib"); 1142 if (!ipoib_workqueue) { 1143 ret = -ENOMEM; 1144 goto err_fs; 1145 } 1146 1147 ret = ib_register_client(&ipoib_client); 1148 if (ret) 1149 goto err_wq; 1150 1151 return 0; 1152 1153 err_wq: 1154 destroy_workqueue(ipoib_workqueue); 1155 1156 err_fs: 1157 ipoib_unregister_debugfs(); 1158 1159 return ret; 1160 } 1161 1162 static void __exit ipoib_cleanup_module(void) 1163 { 1164 ib_unregister_client(&ipoib_client); 1165 ipoib_unregister_debugfs(); 1166 destroy_workqueue(ipoib_workqueue); 1167 } 1168 1169 module_init(ipoib_init_module); 1170 module_exit(ipoib_cleanup_module); 1171