/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>

#include <linux/if_arp.h>	/* For ARPHRD_xxx */

#include <linux/ip.h>
#include <linux/in.h>

#include <linux/jhash.h>
#include <net/arp.h>

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");

int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
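/*
 * Both queue-size parameters are read-only after load (mode 0444), so
 * they must be given at module load time, e.g. (module name assumed
 * from the standard kernel build):  modprobe ib_ipoib send_queue_size=256
 */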

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

struct ipoib_path_iter {
	struct net_device *dev;
	struct ipoib_path  path;
};

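/*
 * Default 20-byte IPoIB broadcast address: the 4-byte multicast QPN
 * 0x00ffffff followed by the IPv4 broadcast MGID.  Bytes 8 and 9 are
 * overwritten with the port's full-membership P_Key in ipoib_add_port().
 */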
static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
};

struct workqueue_struct *ipoib_workqueue;

struct ib_sa_client ipoib_sa_client;

static void ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device);
static void ipoib_neigh_reclaim(struct rcu_head *rp);

static struct ib_client ipoib_client = {
	.name   = "ipoib",
	.add    = ipoib_add_one,
	.remove = ipoib_remove_one
};

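/*
 * ndo_open handler.  If the interface's P_Key is not yet present in the
 * port's P_Key table, bring-up is deferred to the P_Key polling task and
 * we still return 0 so the interface stays administratively up.
 */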
int ipoib_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "bringing up interface\n");

	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	if (ipoib_pkey_dev_delay_open(dev))
		return 0;

	if (ipoib_ib_dev_open(dev))
		goto err_disable;

	if (ipoib_ib_dev_up(dev))
		goto err_stop;

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring up any child interfaces too */
		mutex_lock(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (flags & IFF_UP)
				continue;

			dev_change_flags(cpriv->dev, flags | IFF_UP);
		}
		mutex_unlock(&priv->vlan_mutex);
	}

	netif_start_queue(dev);

	return 0;

err_stop:
	ipoib_ib_dev_stop(dev, 1);

err_disable:
	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	return -EINVAL;
}

static int ipoib_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "stopping interface\n");

	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	netif_stop_queue(dev);

	ipoib_ib_dev_down(dev, 0);
	ipoib_ib_dev_stop(dev, 0);

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring down any child interfaces too */
		mutex_lock(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (!(flags & IFF_UP))
				continue;

			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
		}
		mutex_unlock(&priv->vlan_mutex);
	}

	return 0;
}

static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);

	return features;
}

static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* dev->mtu > 2K ==> connected mode */
	if (ipoib_cm_admin_enabled(dev)) {
		if (new_mtu > ipoib_cm_max_mtu(dev))
			return -EINVAL;

		if (new_mtu > priv->mcast_mtu)
			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
				   priv->mcast_mtu);

		dev->mtu = new_mtu;
		return 0;
	}

	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
		return -EINVAL;

	priv->admin_mtu = new_mtu;

	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);

	return 0;
}

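/*
 * Look up a path by destination GID in the per-device red-black tree.
 * The caller must hold priv->lock.
 */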
static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node *n = priv->path_tree.rb_node;
	struct ipoib_path *path;
	int ret;

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		ret = memcmp(gid, path->pathrec.dgid.raw,
			     sizeof (union ib_gid));

		if (ret < 0)
			n = n->rb_left;
		else if (ret > 0)
			n = n->rb_right;
		else
			return path;
	}

	return NULL;
}

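/*
 * Insert a path into the rb-tree and the path list, or return -EEXIST
 * if a path with the same destination GID is already present.  The
 * caller must hold priv->lock.
 */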
static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node **n = &priv->path_tree.rb_node;
	struct rb_node *pn = NULL;
	struct ipoib_path *tpath;
	int ret;

	while (*n) {
		pn = *n;
		tpath = rb_entry(pn, struct ipoib_path, rb_node);

		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
			     sizeof (union ib_gid));
		if (ret < 0)
			n = &pn->rb_left;
		else if (ret > 0)
			n = &pn->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&path->rb_node, pn, n);
	rb_insert_color(&path->rb_node, &priv->path_tree);

	list_add_tail(&path->list, &priv->path_list);

	return 0;
}

static void path_free(struct net_device *dev, struct ipoib_path *path)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(&path->queue)))
		dev_kfree_skb_irq(skb);

	ipoib_dbg(netdev_priv(dev), "path_free\n");

	/* remove all neigh connected to this path */
	ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

	if (path->ah)
		ipoib_put_ah(path->ah);

	kfree(path);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
	struct ipoib_path_iter *iter;

	iter = kmalloc(sizeof *iter, GFP_KERNEL);
	if (!iter)
		return NULL;

	iter->dev = dev;
	memset(iter->path.pathrec.dgid.raw, 0, 16);

	if (ipoib_path_iter_next(iter)) {
		kfree(iter);
		return NULL;
	}

	return iter;
}

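/*
 * Advance the iterator: the last-returned DGID acts as a cursor, and we
 * copy out the first path that sorts after it, so iteration stays safe
 * even if the tree changes between calls.  Returns nonzero at the end.
 */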
int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
	struct rb_node *n;
	struct ipoib_path *path;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	n = rb_first(&priv->path_tree);

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->path = *path;
			ret = 0;
			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}

void ipoib_path_iter_read(struct ipoib_path_iter *iter,
			  struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

void ipoib_mark_paths_invalid(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;

	spin_lock_irq(&priv->lock);

	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
		ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
			be16_to_cpu(path->pathrec.dlid),
			path->pathrec.dgid.raw);
		path->valid = 0;
	}

	spin_unlock_irq(&priv->lock);
}

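/*
 * Tear down all paths.  Outstanding SA queries are cancelled, and both
 * the TX lock and priv->lock are dropped around wait_for_completion()
 * because path_rec_completion() takes priv->lock itself.
 */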
void ipoib_flush_paths(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);
	unsigned long flags;

	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);

	list_splice_init(&priv->path_list, &remove_list);

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);
		wait_for_completion(&path->done);
		path_free(dev, path);
		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
}

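/*
 * Completion handler for ib_sa_path_rec_get().  On success, create a
 * fresh address handle, repoint every neighbour waiting on this path at
 * it, and retransmit the skbs queued up while the lookup was in flight.
 */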
static void path_rec_completion(int status,
				struct ib_sa_path_rec *pathrec,
				void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct net_device *dev = path->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_ah *ah = NULL;
	struct ipoib_ah *old_ah = NULL;
	struct ipoib_neigh *neigh, *tn;
	struct sk_buff_head skqueue;
	struct sk_buff *skb;
	unsigned long flags;

	if (!status)
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
	else
		ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
			  status, path->pathrec.dgid.raw);

	skb_queue_head_init(&skqueue);

	if (!status) {
		struct ib_ah_attr av;

		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
			ah = ipoib_create_ah(dev, priv->pd, &av);
	}

	spin_lock_irqsave(&priv->lock, flags);

	if (!IS_ERR_OR_NULL(ah)) {
		path->pathrec = *pathrec;

		old_ah   = path->ah;
		path->ah = ah;

		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

		while ((skb = __skb_dequeue(&path->queue)))
			__skb_queue_tail(&skqueue, skb);

		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
			if (neigh->ah) {
				WARN_ON(neigh->ah != old_ah);
				/*
				 * Dropping the ah reference inside
				 * priv->lock is safe here, because we
				 * will hold one more reference from
				 * the original value of path->ah (ie
				 * old_ah).
				 */
				ipoib_put_ah(neigh->ah);
			}
			kref_get(&path->ah->ref);
			neigh->ah = path->ah;

			if (ipoib_cm_enabled(dev, neigh->daddr)) {
				if (!ipoib_cm_get(neigh))
					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
									       path,
									       neigh));
				if (!ipoib_cm_get(neigh)) {
					list_del(&neigh->list);
					ipoib_neigh_free(neigh);
					continue;
				}
			}

			while ((skb = __skb_dequeue(&neigh->queue)))
				__skb_queue_tail(&skqueue, skb);
		}
		path->valid = 1;
	}

	path->query = NULL;
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

	if (old_ah)
		ipoib_put_ah(old_ah);

	while ((skb = __skb_dequeue(&skqueue))) {
		skb->dev = dev;
		if (dev_queue_xmit(skb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}
}

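/*
 * Allocate and initialise a path entry for a destination GID.  Called
 * with priv->lock held, hence GFP_ATOMIC; returns NULL until the
 * broadcast group (which supplies the traffic class) has been joined.
 */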
static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;

	if (!priv->broadcast)
		return NULL;

	path = kzalloc(sizeof *path, GFP_ATOMIC);
	if (!path)
		return NULL;

	path->dev = dev;

	skb_queue_head_init(&path->queue);

	INIT_LIST_HEAD(&path->neigh_list);

	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
	path->pathrec.sgid	    = priv->local_gid;
	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
	path->pathrec.numb_path     = 1;
	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;

	return path;
}

static int path_rec_start(struct net_device *dev,
			  struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "Start path record lookup for %pI6\n",
		  path->pathrec.dgid.raw);

	init_completion(&path->done);

	path->query_id =
		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
				   &path->pathrec,
				   IB_SA_PATH_REC_DGID		|
				   IB_SA_PATH_REC_SGID		|
				   IB_SA_PATH_REC_NUMB_PATH	|
				   IB_SA_PATH_REC_TRAFFIC_CLASS |
				   IB_SA_PATH_REC_PKEY,
				   1000, GFP_ATOMIC,
				   path_rec_completion,
				   path, &path->query);
	if (path->query_id < 0) {
		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
		path->query = NULL;
		complete(&path->done);
		return path->query_id;
	}

	return 0;
}

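/*
 * Transmit slow path for unicast IP/IPv6 with no cached neighbour:
 * create the neighbour, attach it to a (possibly new) path, then either
 * send immediately or queue the skb until the path record resolves.
 */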
static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
			   struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
	struct ipoib_neigh *neigh;
	unsigned long flags;

	neigh = ipoib_neigh_alloc(daddr, dev);
	if (!neigh) {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return;
	}

	spin_lock_irqsave(&priv->lock, flags);

	path = __path_find(dev, daddr + 4);
	if (!path) {
		path = path_rec_create(dev, daddr + 4);
		if (!path)
			goto err_path;

		__path_add(dev, path);
	}

	list_add_tail(&neigh->list, &path->neigh_list);

	if (path->ah) {
		kref_get(&path->ah->ref);
		neigh->ah = path->ah;

		if (ipoib_cm_enabled(dev, neigh->daddr)) {
			if (!ipoib_cm_get(neigh))
				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
			if (!ipoib_cm_get(neigh)) {
				list_del(&neigh->list);
				ipoib_neigh_free(neigh);
				goto err_drop;
			}
			if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
				__skb_queue_tail(&neigh->queue, skb);
			else {
				ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
					   skb_queue_len(&neigh->queue));
				goto err_drop;
			}
		} else {
			spin_unlock_irqrestore(&priv->lock, flags);
			ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
			ipoib_neigh_put(neigh);
			return;
		}
	} else {
		neigh->ah  = NULL;

		if (!path->query && path_rec_start(dev, path))
			goto err_list;

		__skb_queue_tail(&neigh->queue, skb);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
	return;

err_list:
	list_del(&neigh->list);

err_path:
	ipoib_neigh_free(neigh);
err_drop:
	++dev->stats.tx_dropped;
	dev_kfree_skb_any(skb);

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
}

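/*
 * Unicast ARP/RARP always goes through a path lookup so that replies
 * reflect the current path state; skbs wait on the path's own queue
 * rather than on a neighbour.
 */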
static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
			     struct ipoib_cb *cb)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	path = __path_find(dev, cb->hwaddr + 4);
	if (!path || !path->valid) {
		int new_path = 0;

		if (!path) {
			path = path_rec_create(dev, cb->hwaddr + 4);
			new_path = 1;
		}
		if (path) {
			__skb_queue_tail(&path->queue, skb);

			if (!path->query && path_rec_start(dev, path)) {
				spin_unlock_irqrestore(&priv->lock, flags);
				if (new_path)
					path_free(dev, path);
				return;
			} else
				__path_add(dev, path);
		} else {
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
		}

		spin_unlock_irqrestore(&priv->lock, flags);
		return;
	}

	if (path->ah) {
		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
			  be16_to_cpu(path->pathrec.dlid));

		spin_unlock_irqrestore(&priv->lock, flags);
		ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
		return;
	} else if ((path->query || !path_rec_start(dev, path)) &&
		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		__skb_queue_tail(&path->queue, skb);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}

static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh *neigh;
	struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
	struct ipoib_header *header;
	unsigned long flags;

	header = (struct ipoib_header *) skb->data;

	if (unlikely(cb->hwaddr[4] == 0xff)) {
		/* multicast, arrange "if" according to probability */
		if ((header->proto != htons(ETH_P_IP)) &&
		    (header->proto != htons(ETH_P_IPV6)) &&
		    (header->proto != htons(ETH_P_ARP)) &&
		    (header->proto != htons(ETH_P_RARP))) {
			/* ethertype not supported by IPoIB */
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
			return NETDEV_TX_OK;
		}
		/* Add in the P_Key for multicast */
		cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
		cb->hwaddr[9] = priv->pkey & 0xff;

		neigh = ipoib_neigh_get(dev, cb->hwaddr);
		if (likely(neigh))
			goto send_using_neigh;
		ipoib_mcast_send(dev, cb->hwaddr, skb);
		return NETDEV_TX_OK;
	}

	/* unicast, arrange "switch" according to probability */
	switch (header->proto) {
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
		neigh = ipoib_neigh_get(dev, cb->hwaddr);
		if (unlikely(!neigh)) {
			neigh_add_path(skb, cb->hwaddr, dev);
			return NETDEV_TX_OK;
		}
		break;
	case htons(ETH_P_ARP):
	case htons(ETH_P_RARP):
		/* unicast ARP and RARP always perform a path lookup */
		unicast_arp_send(skb, dev, cb);
		return NETDEV_TX_OK;
	default:
		/* ethertype not supported by IPoIB */
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

send_using_neigh:
	/* note we now hold a ref to neigh */
	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	} else if (neigh->ah) {
		ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
		goto unref;
	}

	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

unref:
	ipoib_neigh_put(neigh);

	return NETDEV_TX_OK;
}

static void ipoib_timeout(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
		   jiffies_to_msecs(jiffies - dev->trans_start));
	ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
		   netif_queue_stopped(dev),
		   priv->tx_head, priv->tx_tail);
	/* XXX reset QP, etc. */
}

static int ipoib_hard_header(struct sk_buff *skb,
			     struct net_device *dev,
			     unsigned short type,
			     const void *daddr, const void *saddr, unsigned len)
{
	struct ipoib_header *header;
	struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;

	header = (struct ipoib_header *) skb_push(skb, sizeof *header);

	header->proto = htons(type);
	header->reserved = 0;

	/*
	 * We don't rely on the dst_entry structure; we always stuff the
	 * destination address into skb->cb so we can figure out where
	 * to send the packet later.
	 */
	memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);

	return 0;
}

static void ipoib_set_mcast_list(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
		ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
		return;
	}

	queue_work(ipoib_workqueue, &priv->restart_task);
}

static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
{
	/*
	 * Use only the address parts that contribute to spreading.
	 * The subnet prefix is not used, as one cannot connect to the
	 * same remote port (GUID) with the same remote QPN via two
	 * different subnets.
	 */
	 /* qpn octets[1:4) & port GUID octets[12:20) */
	u32 *daddr_32 = (u32 *) daddr;
	u32 hv;

	hv = jhash_3words(daddr_32[3], daddr_32[4], 0xFFFFFF & daddr_32[0], 0);
	return hv & htbl->mask;
}

struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh = NULL;
	u32 hash_val;

	rcu_read_lock_bh();

	htbl = rcu_dereference_bh(ntbl->htbl);

	if (!htbl)
		goto out_unlock;

	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
	     neigh != NULL;
	     neigh = rcu_dereference_bh(neigh->hnext)) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (!atomic_inc_not_zero(&neigh->refcnt)) {
				/* deleted */
				neigh = NULL;
				goto out_unlock;
			}
			neigh->alive = jiffies;
			goto out_unlock;
		}
	}

out_unlock:
	rcu_read_unlock_bh();
	return neigh;
}

static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long neigh_obsolete;
	unsigned long dt;
	unsigned long flags;
	int i;

	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		return;

	write_lock_bh(&ntbl->rwlock);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&ntbl->rwlock));

	if (!htbl)
		goto out_unlock;

	/* neigh is obsolete if it was idle for two GC periods */
	dt = 2 * arp_tbl.gc_interval;
	neigh_obsolete = jiffies - dt;
	/* handle possible race condition */
	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&ntbl->rwlock))) != NULL) {
			/* was the neigh idle for two GC periods? */
			if (time_after(neigh_obsolete, neigh->alive)) {
				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
									     lockdep_is_held(&ntbl->rwlock)));
				/* remove from path/mc list */
				spin_lock_irqsave(&priv->lock, flags);
				list_del(&neigh->list);
				spin_unlock_irqrestore(&priv->lock, flags);
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}
		}
	}

out_unlock:
	write_unlock_bh(&ntbl->rwlock);
}

static void ipoib_reap_neigh(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);

	__ipoib_reap_neigh(priv);

	if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
				   arp_tbl.gc_interval);
}

static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_neigh *neigh;

	neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
	if (!neigh)
		return NULL;

	neigh->dev = dev;
	memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
	skb_queue_head_init(&neigh->queue);
	INIT_LIST_HEAD(&neigh->list);
	ipoib_cm_set(neigh, NULL);
	/* one ref on behalf of the caller */
	atomic_set(&neigh->refcnt, 1);

	return neigh;
}

struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh;
	u32 hash_val;

	write_lock_bh(&ntbl->rwlock);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&ntbl->rwlock));
	if (!htbl) {
		neigh = NULL;
		goto out_unlock;
	}

	/*
	 * We need to add a new neigh, but another thread may have beaten
	 * us to it; recompute the hash (a resize may have taken place)
	 * and search again before allocating.
	 */
	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
					       lockdep_is_held(&ntbl->rwlock));
	     neigh != NULL;
	     neigh = rcu_dereference_protected(neigh->hnext,
					       lockdep_is_held(&ntbl->rwlock))) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (!atomic_inc_not_zero(&neigh->refcnt)) {
				/* deleted */
				neigh = NULL;
				break;
			}
			neigh->alive = jiffies;
			goto out_unlock;
		}
	}

	neigh = ipoib_neigh_ctor(daddr, dev);
	if (!neigh)
		goto out_unlock;

	/* one ref on behalf of the hash table */
	atomic_inc(&neigh->refcnt);
	neigh->alive = jiffies;
	/* put in hash */
	rcu_assign_pointer(neigh->hnext,
			   rcu_dereference_protected(htbl->buckets[hash_val],
						     lockdep_is_held(&ntbl->rwlock)));
	rcu_assign_pointer(htbl->buckets[hash_val], neigh);
	atomic_inc(&ntbl->entries);

out_unlock:
	write_unlock_bh(&ntbl->rwlock);

	return neigh;
}

void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
{
	/* neigh reference count was dropped to zero */
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;
	if (neigh->ah)
		ipoib_put_ah(neigh->ah);
	while ((skb = __skb_dequeue(&neigh->queue))) {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}
	if (ipoib_cm_get(neigh))
		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
	ipoib_dbg(netdev_priv(dev),
		  "neigh free for %06x %pI6\n",
		  IPOIB_QPN(neigh->daddr),
		  neigh->daddr + 4);
	kfree(neigh);
	if (atomic_dec_and_test(&priv->ntbl.entries)) {
		if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
			complete(&priv->ntbl.flushed);
	}
}

static void ipoib_neigh_reclaim(struct rcu_head *rp)
{
	/* Called as a result of removal from hash table */
	struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
	/* note TX context may hold another ref */
	ipoib_neigh_put(neigh);
}

void ipoib_neigh_free(struct ipoib_neigh *neigh)
{
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh __rcu **np;
	struct ipoib_neigh *n;
	u32 hash_val;

	write_lock_bh(&ntbl->rwlock);

	htbl = rcu_dereference_protected(ntbl->htbl,
					lockdep_is_held(&ntbl->rwlock));
	if (!htbl)
		goto out_unlock;

	hash_val = ipoib_addr_hash(htbl, neigh->daddr);
	np = &htbl->buckets[hash_val];
	for (n = rcu_dereference_protected(*np,
					    lockdep_is_held(&ntbl->rwlock));
	     n != NULL;
	     n = rcu_dereference_protected(*np,
					lockdep_is_held(&ntbl->rwlock))) {
		if (n == neigh) {
			/* found */
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
								     lockdep_is_held(&ntbl->rwlock)));
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			goto out_unlock;
		} else {
			np = &n->hnext;
		}
	}

out_unlock:
	write_unlock_bh(&ntbl->rwlock);
}

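/*
 * Size the neighbour hash from arp_tbl.gc_thresh3, rounded up to a
 * power of two so ipoib_addr_hash() can mask instead of taking a
 * modulus.  GC stays stopped until the table is fully set up.
 */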
static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh **buckets;
	u32 size;

	clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
	ntbl->htbl = NULL;
	rwlock_init(&ntbl->rwlock);
	htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
	if (!htbl)
		return -ENOMEM;
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	size = roundup_pow_of_two(arp_tbl.gc_thresh3);
	buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
	if (!buckets) {
		kfree(htbl);
		return -ENOMEM;
	}
	htbl->size = size;
	htbl->mask = (size - 1);
	htbl->buckets = buckets;
	ntbl->htbl = htbl;
	atomic_set(&ntbl->entries, 0);

	/* start garbage collection */
	clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
			   arp_tbl.gc_interval);

	return 0;
}

static void neigh_hash_free_rcu(struct rcu_head *head)
{
	struct ipoib_neigh_hash *htbl = container_of(head,
						    struct ipoib_neigh_hash,
						    rcu);
	struct ipoib_neigh __rcu **buckets = htbl->buckets;

	kfree(buckets);
	kfree(htbl);
}

void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i;

	/* remove all neigh connected to a given path or mcast */
	write_lock_bh(&ntbl->rwlock);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&ntbl->rwlock));

	if (!htbl)
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&ntbl->rwlock))) != NULL) {
			/* delete neighs belonging to this parent */
			if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
									     lockdep_is_held(&ntbl->rwlock)));
				/* remove from parent list */
				spin_lock_irqsave(&priv->lock, flags);
				list_del(&neigh->list);
				spin_unlock_irqrestore(&priv->lock, flags);
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}
		}
	}
out_unlock:
	write_unlock_bh(&ntbl->rwlock);
}

static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i;

	write_lock_bh(&ntbl->rwlock);

	htbl = rcu_dereference_protected(ntbl->htbl,
					lockdep_is_held(&ntbl->rwlock));
	if (!htbl)
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&ntbl->rwlock))) != NULL) {
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
								     lockdep_is_held(&ntbl->rwlock)));
			/* remove from path/mc list */
			spin_lock_irqsave(&priv->lock, flags);
			list_del(&neigh->list);
			spin_unlock_irqrestore(&priv->lock, flags);
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
		}
	}

	rcu_assign_pointer(ntbl->htbl, NULL);
	call_rcu(&htbl->rcu, neigh_hash_free_rcu);

out_unlock:
	write_unlock_bh(&ntbl->rwlock);
}

static void ipoib_neigh_hash_uninit(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int stopped;

	ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
	init_completion(&priv->ntbl.flushed);
	set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);

	/* Stop GC; even if we are called because init failed, the
	 * delayed work still needs to be cancelled */
	stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	if (!stopped)
		cancel_delayed_work(&priv->neigh_reap_task);

	if (atomic_read(&priv->ntbl.entries)) {
		ipoib_flush_neighs(priv);
		wait_for_completion(&priv->ntbl.flushed);
	}
}

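/*
 * Per-port initialisation: neighbour hash table, RX ring (kzalloc) and
 * TX ring (vzalloc, as the send queue can be large), then the
 * underlying IB resources.
 */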
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (ipoib_neigh_hash_init(priv) < 0)
		goto out;
	/* Allocate RX/TX "rings" to hold queued skbs */
	priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
				GFP_KERNEL);
	if (!priv->rx_ring) {
		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
		       ca->name, ipoib_recvq_size);
		goto out_neigh_hash_cleanup;
	}

	priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
	if (!priv->tx_ring) {
		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
		       ca->name, ipoib_sendq_size);
		goto out_rx_ring_cleanup;
	}

	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */

	if (ipoib_ib_dev_init(dev, ca, port))
		goto out_tx_ring_cleanup;

	return 0;

out_tx_ring_cleanup:
	vfree(priv->tx_ring);

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out_neigh_hash_cleanup:
	ipoib_neigh_hash_uninit(dev);
out:
	return -ENOMEM;
}

void ipoib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;

	ipoib_delete_debug_files(dev);

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
		/* Stop GC on child */
		set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
		cancel_delayed_work(&cpriv->neigh_reap_task);
		unregister_netdev(cpriv->dev);
		ipoib_dev_cleanup(cpriv->dev);
		free_netdev(cpriv->dev);
	}

	ipoib_ib_dev_cleanup(dev);

	kfree(priv->rx_ring);
	vfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;

	ipoib_neigh_hash_uninit(dev);
}

static const struct header_ops ipoib_header_ops = {
	.create	= ipoib_hard_header,
};

static const struct net_device_ops ipoib_netdev_ops = {
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
	.ndo_fix_features	 = ipoib_fix_features,
	.ndo_start_xmit		 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
};

static void ipoib_setup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	dev->netdev_ops		 = &ipoib_netdev_ops;
	dev->header_ops		 = &ipoib_header_ops;

	ipoib_set_ethtool_ops(dev);

	netif_napi_add(dev, &priv->napi, ipoib_poll, 100);

	dev->watchdog_timeo	 = HZ;

	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;

	dev->hard_header_len	 = IPOIB_ENCAP_LEN;
	dev->addr_len		 = INFINIBAND_ALEN;
	dev->type		 = ARPHRD_INFINIBAND;
	dev->tx_queue_len	 = ipoib_sendq_size * 2;
	dev->features		 = (NETIF_F_VLAN_CHALLENGED	|
				    NETIF_F_HIGHDMA);
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;

	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);

	netif_carrier_off(dev);

	priv->dev = dev;

	spin_lock_init(&priv->lock);

	mutex_init(&priv->vlan_mutex);

	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);

	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
	INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
}

struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
{
	struct net_device *dev;

	dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
			   ipoib_setup);
	if (!dev)
		return NULL;

	return netdev_priv(dev);
}

static ssize_t show_pkey(struct device *dev,
			 struct device_attribute *attr, char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));

	return sprintf(buf, "0x%04x\n", priv->pkey);
}
static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);

static ssize_t show_umcast(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));

	return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
}

static ssize_t set_umcast(struct device *dev,
			  struct device_attribute *attr,
			  const char *buf, size_t count)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
	unsigned long umcast_val = simple_strtoul(buf, NULL, 0);

	if (umcast_val > 0) {
		set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
		ipoib_warn(priv, "ignoring multicast groups joined directly "
				"by userspace\n");
	} else
		clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);

	return count;
}
static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);

int ipoib_add_umcast_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_umcast);
}

static ssize_t create_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	pkey |= 0x8000;

	ret = ipoib_vlan_add(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);

static ssize_t delete_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	ret = ipoib_vlan_delete(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);

int ipoib_add_pkey_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_pkey);
}

int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
	struct ib_device_attr *device_attr;
	int result = -ENOMEM;

	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
	if (!device_attr) {
		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
		       hca->name, sizeof *device_attr);
		return result;
	}

	result = ib_query_device(hca, device_attr);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
		       hca->name, result);
		kfree(device_attr);
		return result;
	}
	priv->hca_caps = device_attr->device_cap_flags;

	kfree(device_attr);

	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
		priv->dev->hw_features = NETIF_F_SG |
			NETIF_F_IP_CSUM | NETIF_F_RXCSUM;

		if (priv->hca_caps & IB_DEVICE_UD_TSO)
			priv->dev->hw_features |= NETIF_F_TSO;

		priv->dev->features |= priv->dev->hw_features;
	}

	return 0;
}

static struct net_device *ipoib_add_port(const char *format,
					 struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	SET_NETDEV_DEV(priv->dev, hca->dma_device);
	priv->dev->dev_id = port - 1;

	if (!ib_query_port(hca, port, &attr))
		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
	else {
		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
		       hca->name, port);
		goto device_init_failed;
	}

	/* MTU will be reset when mcast join happens */
	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;

	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	if (ipoib_set_dev_features(priv, hca))
		goto device_init_failed;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->dev->broadcast[8] = priv->pkey >> 8;
	priv->dev->broadcast[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	} else
		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));

	result = ipoib_dev_init(priv->dev, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	INIT_IB_EVENT_HANDLER(&priv->event_handler,
			      priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		       "port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}

	result = register_netdev(priv->dev);
	if (result) {
		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
		       hca->name, port, result);
		goto register_failed;
	}

	ipoib_create_debug_files(priv->dev);

	if (ipoib_cm_add_mode_attr(priv->dev))
		goto sysfs_failed;
	if (ipoib_add_pkey_attr(priv->dev))
		goto sysfs_failed;
	if (ipoib_add_umcast_attr(priv->dev))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
		goto sysfs_failed;

	return priv->dev;

sysfs_failed:
	ipoib_delete_debug_files(priv->dev);
	unregister_netdev(priv->dev);

register_failed:
	ib_unregister_event_handler(&priv->event_handler);
	/* Stop GC if started before flush */
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	cancel_delayed_work(&priv->neigh_reap_task);
	flush_workqueue(ipoib_workqueue);

event_failed:
	ipoib_dev_cleanup(priv->dev);

device_init_failed:
	free_netdev(priv->dev);

alloc_mem_failed:
	return ERR_PTR(result);
}

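/*
 * ib_client add callback: create one IPoIB interface per InfiniBand
 * port (a switch exposes only its management port 0) and stash the
 * resulting list as the client data for this device.
 */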
static void ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct net_device *dev;
	struct ipoib_dev_priv *priv;
	int s, e, p;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

	if (device->node_type == RDMA_NODE_IB_SWITCH) {
		s = 0;
		e = 0;
	} else {
		s = 1;
		e = device->phys_port_cnt;
	}

	for (p = s; p <= e; ++p) {
		if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
			continue;
		dev = ipoib_add_port("ib%d", device, p);
		if (!IS_ERR(dev)) {
			priv = netdev_priv(dev);
			list_add_tail(&priv->list, dev_list);
		}
	}

	ib_set_client_data(device, &ipoib_client, dev_list);
}

static void ipoib_remove_one(struct ib_device *device)
{
	struct ipoib_dev_priv *priv, *tmp;
	struct list_head *dev_list;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = ib_get_client_data(device, &ipoib_client);

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		ib_unregister_event_handler(&priv->event_handler);

		rtnl_lock();
		dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
		rtnl_unlock();

		/* Stop GC */
		set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
		cancel_delayed_work(&priv->neigh_reap_task);
		flush_workqueue(ipoib_workqueue);

		unregister_netdev(priv->dev);
		ipoib_dev_cleanup(priv->dev);
		free_netdev(priv->dev);
	}

	kfree(dev_list);
}

static int __init ipoib_init_module(void)
{
	int ret;

	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif

	/*
	 * When copying small received packets, we only copy from the
	 * linear data part of the SKB, so we rely on this condition.
	 */
	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);

	ret = ipoib_register_debugfs();
	if (ret)
		return ret;

	/*
	 * We create our own workqueue mainly because we want to be
	 * able to flush it when devices are being removed.  We can't
	 * use schedule_work()/flush_scheduled_work() because both
	 * unregister_netdev() and linkwatch_event take the rtnl lock,
	 * so flush_scheduled_work() can deadlock during device
	 * removal.
	 */
	ipoib_workqueue = create_singlethread_workqueue("ipoib");
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	return 0;

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	ipoib_unregister_debugfs();

	return ret;
}

static void __exit ipoib_cleanup_module(void)
{
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);
	ipoib_unregister_debugfs();
	destroy_workqueue(ipoib_workqueue);
}

module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);