1 /*
2  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
4  * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenIB.org BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  *
34  * $Id: ipoib_multicast.c 1362 2004-12-18 15:56:29Z roland $
35  */
36 
37 #include <linux/skbuff.h>
38 #include <linux/rtnetlink.h>
39 #include <linux/ip.h>
40 #include <linux/in.h>
41 #include <linux/igmp.h>
42 #include <linux/inetdevice.h>
43 #include <linux/delay.h>
44 #include <linux/completion.h>
45 
46 #include <net/dst.h>
47 
48 #include "ipoib.h"
49 
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/* Verbosity knob for ipoib_dbg_mcast(); runtime-writable via sysfs (0644). */
static int mcast_debug_level;

module_param(mcast_debug_level, int, 0644);
MODULE_PARM_DESC(mcast_debug_level,
		 "Enable multicast debug tracing if > 0");
#endif

/*
 * Serializes starting/stopping the multicast join task and the
 * queue/cancel of priv->mcast_task across all IPoIB devices.
 * (A semaphore used as a mutex — this predates struct mutex.)
 */
static DECLARE_MUTEX(mcast_mutex);
59 
/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
struct ipoib_mcast {
	struct ib_sa_mcmember_rec mcmember;	/* SA member record from the last join */
	struct ipoib_ah          *ah;		/* address handle; non-NULL once usable */

	struct rb_node    rb_node;	/* in priv->multicast_tree, keyed by MGID */
	struct list_head  list;		/* in priv->multicast_list */
	struct completion done;		/* signalled when an SA query callback runs */

	int                 query_id;	/* ID for ib_sa_cancel_query() */
	struct ib_sa_query *query;	/* outstanding SA query, or NULL */

	unsigned long created;	/* jiffies at allocation (shown in debugfs) */
	unsigned long backoff;	/* join retry delay in seconds; doubles per failure */

	unsigned long flags;	/* IPOIB_MCAST_FLAG_* bits */
	unsigned char logcount;	/* rate-limits join-failure messages */

	struct list_head  neigh_list;	/* ipoib_neighs holding a ref on our AH */

	struct sk_buff_head pkt_queue;	/* packets held until the join completes */

	struct net_device *dev;
};
84 
/* Snapshot of one group's state for the debugfs iterator. */
struct ipoib_mcast_iter {
	struct net_device *dev;
	union ib_gid       mgid;	/* last MGID returned; iteration cursor */
	unsigned long      created;
	unsigned int       queuelen;	/* packets still queued for this group */
	unsigned int       complete;	/* non-zero once the group has an AH */
	unsigned int       send_only;
};
93 
/*
 * Free a multicast group entry and everything hanging off it.
 *
 * Under priv->lock, detach and free every ipoib_neigh pointing at this
 * group; their address handles are collected on a private list so the
 * ipoib_put_ah() calls can happen after the lock is dropped.  Any
 * packets still queued waiting for the join are dropped.
 */
static void ipoib_mcast_free(struct ipoib_mcast *mcast)
{
	struct net_device *dev = mcast->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh *neigh, *tmp;
	unsigned long flags;
	LIST_HEAD(ah_list);
	struct ipoib_ah *ah, *tah;

	ipoib_dbg_mcast(netdev_priv(dev),
			"deleting multicast group " IPOIB_GID_FMT "\n",
			IPOIB_GID_ARG(mcast->mcmember.mgid));

	spin_lock_irqsave(&priv->lock, flags);

	list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) {
		if (neigh->ah)
			list_add_tail(&neigh->ah->list, &ah_list);
		/* Unhook from the core neighbour so its destructor skips us. */
		*to_ipoib_neigh(neigh->neighbour) = NULL;
		neigh->neighbour->ops->destructor = NULL;
		kfree(neigh);
	}

	spin_unlock_irqrestore(&priv->lock, flags);

	/* Drop the collected AH references outside the spinlock. */
	list_for_each_entry_safe(ah, tah, &ah_list, list)
		ipoib_put_ah(ah);

	if (mcast->ah)
		ipoib_put_ah(mcast->ah);

	/* Packets that never got a completed join are simply discarded. */
	while (!skb_queue_empty(&mcast->pkt_queue))
		dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));

	kfree(mcast);
}
130 
131 static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
132 					     int can_sleep)
133 {
134 	struct ipoib_mcast *mcast;
135 
136 	mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC);
137 	if (!mcast)
138 		return NULL;
139 
140 	mcast->dev = dev;
141 	mcast->created = jiffies;
142 	mcast->backoff = 1;
143 
144 	INIT_LIST_HEAD(&mcast->list);
145 	INIT_LIST_HEAD(&mcast->neigh_list);
146 	skb_queue_head_init(&mcast->pkt_queue);
147 
148 	return mcast;
149 }
150 
151 static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, union ib_gid *mgid)
152 {
153 	struct ipoib_dev_priv *priv = netdev_priv(dev);
154 	struct rb_node *n = priv->multicast_tree.rb_node;
155 
156 	while (n) {
157 		struct ipoib_mcast *mcast;
158 		int ret;
159 
160 		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
161 
162 		ret = memcmp(mgid->raw, mcast->mcmember.mgid.raw,
163 			     sizeof (union ib_gid));
164 		if (ret < 0)
165 			n = n->rb_left;
166 		else if (ret > 0)
167 			n = n->rb_right;
168 		else
169 			return mcast;
170 	}
171 
172 	return NULL;
173 }
174 
175 static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast)
176 {
177 	struct ipoib_dev_priv *priv = netdev_priv(dev);
178 	struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL;
179 
180 	while (*n) {
181 		struct ipoib_mcast *tmcast;
182 		int ret;
183 
184 		pn = *n;
185 		tmcast = rb_entry(pn, struct ipoib_mcast, rb_node);
186 
187 		ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw,
188 			     sizeof (union ib_gid));
189 		if (ret < 0)
190 			n = &pn->rb_left;
191 		else if (ret > 0)
192 			n = &pn->rb_right;
193 		else
194 			return -EEXIST;
195 	}
196 
197 	rb_link_node(&mcast->rb_node, pn, n);
198 	rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
199 
200 	return 0;
201 }
202 
/*
 * Second half of a successful join: save the returned SA member
 * record, attach our QP to the group (unless it is send-only), build
 * an address handle from the SA-supplied path parameters, and push out
 * any packets that were queued while the join was outstanding.
 *
 * Returns 0 on success (including the "already attached" case) or the
 * negative error from ipoib_mcast_attach().
 */
static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
				   struct ib_sa_mcmember_rec *mcmember)
{
	struct net_device *dev = mcast->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int ret;

	mcast->mcmember = *mcmember;

	/* Set the cached Q_Key before we attach if it's the broadcast group */
	if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
		    sizeof (union ib_gid))) {
		/*
		 * NOTE(review): reads the Q_Key through priv->broadcast;
		 * for the broadcast group that should be this same mcast
		 * whose record was copied just above — confirm the two
		 * cannot diverge here.
		 */
		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
	}

	/* Only receive (non-send-only) groups get attached to our QP. */
	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
		if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
			ipoib_warn(priv, "multicast group " IPOIB_GID_FMT
				   " already attached\n",
				   IPOIB_GID_ARG(mcast->mcmember.mgid));

			return 0;
		}

		ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
					 &mcast->mcmember.mgid);
		if (ret < 0) {
			ipoib_warn(priv, "couldn't attach QP to multicast group "
				   IPOIB_GID_FMT "\n",
				   IPOIB_GID_ARG(mcast->mcmember.mgid));

			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
			return ret;
		}
	}

	{
		/* Build the address vector from the SA's member record. */
		struct ib_ah_attr av = {
			.dlid	       = be16_to_cpu(mcast->mcmember.mlid),
			.port_num      = priv->port,
			.sl	       = mcast->mcmember.sl,
			.ah_flags      = IB_AH_GRH,
			.grh	       = {
				.flow_label    = be32_to_cpu(mcast->mcmember.flow_label),
				.hop_limit     = mcast->mcmember.hop_limit,
				.sgid_index    = 0,
				.traffic_class = mcast->mcmember.traffic_class
			}
		};
		int path_rate = ib_sa_rate_enum_to_int(mcast->mcmember.rate);

		av.grh.dgid = mcast->mcmember.mgid;

		/*
		 * If the local link is faster than the group's rate,
		 * set an inter-packet delay so we don't overrun slower
		 * members.
		 */
		if (path_rate > 0 && priv->local_rate > path_rate)
			av.static_rate = (priv->local_rate - 1) / path_rate;

		ipoib_dbg_mcast(priv, "static_rate %d for local port %dX, mcmember %dX\n",
				av.static_rate, priv->local_rate,
				ib_sa_rate_enum_to_int(mcast->mcmember.rate));

		mcast->ah = ipoib_create_ah(dev, priv->pd, &av);
		if (!mcast->ah) {
			ipoib_warn(priv, "ib_address_create failed\n");
		} else {
			ipoib_dbg_mcast(priv, "MGID " IPOIB_GID_FMT
					" AV %p, LID 0x%04x, SL %d\n",
					IPOIB_GID_ARG(mcast->mcmember.mgid),
					mcast->ah->ah,
					be16_to_cpu(mcast->mcmember.mlid),
					mcast->mcmember.sl);
		}
	}

	/* actually send any queued packets */
	while (!skb_queue_empty(&mcast->pkt_queue)) {
		struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);

		skb->dev = dev;

		if (!skb->dst || !skb->dst->neighbour) {
			/* put pseudoheader back on for next time */
			skb_push(skb, sizeof (struct ipoib_pseudoheader));
		}

		/* Re-enter the stack's xmit path rather than ipoib_send()
		 * directly, so the normal mcast lookup runs again. */
		if (dev_queue_xmit(skb))
			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
	}

	return 0;
}
294 
/*
 * SA completion callback for a send-only join.  On success, finish
 * setting up the group (create the AH, flush queued packets).  On
 * failure, drop anything queued and clear the BUSY flag so a later
 * transmit retries the join.  Always signals mcast->done for
 * ipoib_mcast_stop_thread().
 */
static void
ipoib_mcast_sendonly_join_complete(int status,
				   struct ib_sa_mcmember_rec *mcmember,
				   void *mcast_ptr)
{
	struct ipoib_mcast *mcast = mcast_ptr;
	struct net_device *dev = mcast->dev;

	if (!status)
		ipoib_mcast_join_finish(mcast, mcmember);
	else {
		/* Rate-limit failure logging to the first 20 occurrences. */
		if (mcast->logcount++ < 20)
			ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for "
					IPOIB_GID_FMT ", status %d\n",
					IPOIB_GID_ARG(mcast->mcmember.mgid), status);

		/* Flush out any queued packets */
		while (!skb_queue_empty(&mcast->pkt_queue))
			dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));

		/* Clear the busy flag so we try again */
		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
	}

	complete(&mcast->done);
}
321 
322 static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
323 {
324 	struct net_device *dev = mcast->dev;
325 	struct ipoib_dev_priv *priv = netdev_priv(dev);
326 	struct ib_sa_mcmember_rec rec = {
327 #if 0				/* Some SMs don't support send-only yet */
328 		.join_state = 4
329 #else
330 		.join_state = 1
331 #endif
332 	};
333 	int ret = 0;
334 
335 	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
336 		ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
337 		return -ENODEV;
338 	}
339 
340 	if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
341 		ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
342 		return -EBUSY;
343 	}
344 
345 	rec.mgid     = mcast->mcmember.mgid;
346 	rec.port_gid = priv->local_gid;
347 	rec.pkey     = cpu_to_be16(priv->pkey);
348 
349 	init_completion(&mcast->done);
350 
351 	ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec,
352 				     IB_SA_MCMEMBER_REC_MGID		|
353 				     IB_SA_MCMEMBER_REC_PORT_GID	|
354 				     IB_SA_MCMEMBER_REC_PKEY		|
355 				     IB_SA_MCMEMBER_REC_JOIN_STATE,
356 				     1000, GFP_ATOMIC,
357 				     ipoib_mcast_sendonly_join_complete,
358 				     mcast, &mcast->query);
359 	if (ret < 0) {
360 		ipoib_warn(priv, "ib_sa_mcmember_rec_set failed (ret = %d)\n",
361 			   ret);
362 	} else {
363 		ipoib_dbg_mcast(priv, "no multicast record for " IPOIB_GID_FMT
364 				", starting join\n",
365 				IPOIB_GID_ARG(mcast->mcmember.mgid));
366 
367 		mcast->query_id = ret;
368 	}
369 
370 	return ret;
371 }
372 
/*
 * SA completion callback for joins issued by ipoib_mcast_join().
 *
 * Success: reset the backoff, requeue the join task (so the next
 * unjoined group is processed) and signal completion.  Cancellation
 * (-EINTR): just signal completion.  Other failures: double the
 * backoff (capped at IPOIB_MAX_BACKOFF_SECONDS) and reschedule a retry
 * — immediately for a timeout, after the backoff otherwise — or signal
 * completion if the thread is being stopped.
 */
static void ipoib_mcast_join_complete(int status,
				      struct ib_sa_mcmember_rec *mcmember,
				      void *mcast_ptr)
{
	struct ipoib_mcast *mcast = mcast_ptr;
	struct net_device *dev = mcast->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg_mcast(priv, "join completion for " IPOIB_GID_FMT
			" (status %d)\n",
			IPOIB_GID_ARG(mcast->mcmember.mgid), status);

	if (!status && !ipoib_mcast_join_finish(mcast, mcmember)) {
		mcast->backoff = 1;
		down(&mcast_mutex);
		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
			queue_work(ipoib_workqueue, &priv->mcast_task);
		up(&mcast_mutex);
		complete(&mcast->done);
		return;
	}

	/* Query was cancelled, e.g. by ipoib_mcast_stop_thread(). */
	if (status == -EINTR) {
		complete(&mcast->done);
		return;
	}

	/* Rate-limit failure logging to the first 20 occurrences. */
	if (status && mcast->logcount++ < 20) {
		if (status == -ETIMEDOUT || status == -EINTR) {
			ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT
					", status %d\n",
					IPOIB_GID_ARG(mcast->mcmember.mgid),
					status);
		} else {
			ipoib_warn(priv, "multicast join failed for "
				   IPOIB_GID_FMT ", status %d\n",
				   IPOIB_GID_ARG(mcast->mcmember.mgid),
				   status);
		}
	}

	mcast->backoff *= 2;
	if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
		mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;

	mcast->query = NULL;

	down(&mcast_mutex);
	if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) {
		/* Retry: immediately after a timeout, else after the backoff. */
		if (status == -ETIMEDOUT)
			queue_work(ipoib_workqueue, &priv->mcast_task);
		else
			queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
					   mcast->backoff * HZ);
	} else
		complete(&mcast->done);
	up(&mcast_mutex);

	return;
}
433 
/*
 * Issue an asynchronous SA join for @mcast as a full member.
 *
 * When @create is set, also supply Q_Key/SL/flow-label/traffic-class
 * copied from the broadcast group so the SA may create the group —
 * priv->broadcast must therefore already be joined in that case.  On
 * submission failure the backoff is doubled and the join task is
 * rescheduled; on success the query ID is saved for cancellation.
 * Completion is handled in ipoib_mcast_join_complete().
 */
static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
			     int create)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_sa_mcmember_rec rec = {
		.join_state = 1
	};
	ib_sa_comp_mask comp_mask;
	int ret = 0;

	ipoib_dbg_mcast(priv, "joining MGID " IPOIB_GID_FMT "\n",
			IPOIB_GID_ARG(mcast->mcmember.mgid));

	rec.mgid     = mcast->mcmember.mgid;
	rec.port_gid = priv->local_gid;
	rec.pkey     = cpu_to_be16(priv->pkey);

	comp_mask =
		IB_SA_MCMEMBER_REC_MGID		|
		IB_SA_MCMEMBER_REC_PORT_GID	|
		IB_SA_MCMEMBER_REC_PKEY		|
		IB_SA_MCMEMBER_REC_JOIN_STATE;

	if (create) {
		/* Seed group-creation parameters from the broadcast group. */
		comp_mask |=
			IB_SA_MCMEMBER_REC_QKEY		|
			IB_SA_MCMEMBER_REC_SL		|
			IB_SA_MCMEMBER_REC_FLOW_LABEL	|
			IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;

		rec.qkey	  = priv->broadcast->mcmember.qkey;
		rec.sl		  = priv->broadcast->mcmember.sl;
		rec.flow_label	  = priv->broadcast->mcmember.flow_label;
		rec.traffic_class = priv->broadcast->mcmember.traffic_class;
	}

	init_completion(&mcast->done);

	ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec, comp_mask,
				     mcast->backoff * 1000, GFP_ATOMIC,
				     ipoib_mcast_join_complete,
				     mcast, &mcast->query);

	if (ret < 0) {
		ipoib_warn(priv, "ib_sa_mcmember_rec_set failed, status %d\n", ret);

		/* Back off and let the join task retry later. */
		mcast->backoff *= 2;
		if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
			mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;

		down(&mcast_mutex);
		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
			queue_delayed_work(ipoib_workqueue,
					   &priv->mcast_task,
					   mcast->backoff * HZ);
		up(&mcast_mutex);
	} else
		mcast->query_id = ret;
}
493 
/*
 * Workqueue handler driving the join state machine: refresh the local
 * GID/LID/link rate, make sure the broadcast group exists and is
 * joined first (its parameters seed all other groups), then join the
 * remaining groups one at a time — each join's completion requeues
 * this task.  Once everything is joined, set the multicast MTU and
 * turn the carrier on.
 */
void ipoib_mcast_join_task(void *dev_ptr)
{
	struct net_device *dev = dev_ptr;
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
		return;

	/* Refresh the port GID, which also forms part of our HW address. */
	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
		ipoib_warn(priv, "ib_gid_entry_get() failed\n");
	else
		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));

	{
		struct ib_port_attr attr;

		if (!ib_query_port(priv->ca, priv->port, &attr)) {
			priv->local_lid  = attr.lid;
			priv->local_rate = attr.active_speed *
				ib_width_enum_to_int(attr.active_width);
		} else
			ipoib_warn(priv, "ib_query_port failed\n");
	}

	if (!priv->broadcast) {
		priv->broadcast = ipoib_mcast_alloc(dev, 1);
		if (!priv->broadcast) {
			ipoib_warn(priv, "failed to allocate broadcast group\n");
			/* Retry in a second if we are still supposed to run. */
			down(&mcast_mutex);
			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
				queue_delayed_work(ipoib_workqueue,
						   &priv->mcast_task, HZ);
			up(&mcast_mutex);
			return;
		}

		/* Broadcast MGID lives at offset 4 of the HW broadcast address. */
		memcpy(priv->broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
		       sizeof (union ib_gid));

		spin_lock_irq(&priv->lock);
		__ipoib_mcast_add(dev, priv->broadcast);
		spin_unlock_irq(&priv->lock);
	}

	if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
		/* Join broadcast first; its completion requeues this task. */
		ipoib_mcast_join(dev, priv->broadcast, 0);
		return;
	}

	while (1) {
		struct ipoib_mcast *mcast = NULL;

		spin_lock_irq(&priv->lock);
		list_for_each_entry(mcast, &priv->multicast_list, list) {
			if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
			    && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
			    && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
				/* Found the next unjoined group */
				break;
			}
		}
		spin_unlock_irq(&priv->lock);

		/* list_for_each_entry leaves mcast at the head if none matched. */
		if (&mcast->list == &priv->multicast_list) {
			/* All done */
			break;
		}

		/* Join one group; its completion requeues this task. */
		ipoib_mcast_join(dev, mcast, 1);
		return;
	}

	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
		IPOIB_ENCAP_LEN;
	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);

	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");

	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
	netif_carrier_on(dev);
}
575 
/*
 * Start the multicast join task for @dev.  The RUN bit is
 * tested-and-set under mcast_mutex so concurrent callers queue the
 * work at most once.  Always returns 0.
 */
int ipoib_mcast_start_thread(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg_mcast(priv, "starting multicast thread\n");

	down(&mcast_mutex);
	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
		queue_work(ipoib_workqueue, &priv->mcast_task);
	up(&mcast_mutex);

	return 0;
}
589 
/*
 * Stop the multicast join machinery: clear RUN and cancel pending
 * (delayed) work under mcast_mutex, optionally flush the workqueue so
 * an in-flight task finishes (@flush), then cancel every outstanding
 * SA query and wait for its completion callback to signal done.
 * Always returns 0.
 */
int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_mcast *mcast;

	ipoib_dbg_mcast(priv, "stopping multicast thread\n");

	down(&mcast_mutex);
	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
	cancel_delayed_work(&priv->mcast_task);
	up(&mcast_mutex);

	if (flush)
		flush_workqueue(ipoib_workqueue);

	/* The broadcast group is tracked separately from multicast_list. */
	if (priv->broadcast && priv->broadcast->query) {
		ib_sa_cancel_query(priv->broadcast->query_id, priv->broadcast->query);
		priv->broadcast->query = NULL;
		ipoib_dbg_mcast(priv, "waiting for bcast\n");
		wait_for_completion(&priv->broadcast->done);
	}

	list_for_each_entry(mcast, &priv->multicast_list, list) {
		if (mcast->query) {
			ib_sa_cancel_query(mcast->query_id, mcast->query);
			mcast->query = NULL;
			ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
					IPOIB_GID_ARG(mcast->mcmember.mgid));
			wait_for_completion(&mcast->done);
		}
	}

	return 0;
}
624 
/*
 * Leave a multicast group: detach the QP and fire a single,
 * best-effort SA delete with no completion callback.  A group that was
 * never attached is a no-op.  Always returns 0 (failures are only
 * logged).
 */
static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_sa_mcmember_rec rec = {
		.join_state = 1
	};
	int ret = 0;

	if (!test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags))
		return 0;

	ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n",
			IPOIB_GID_ARG(mcast->mcmember.mgid));

	rec.mgid     = mcast->mcmember.mgid;
	rec.port_gid = priv->local_gid;
	rec.pkey     = cpu_to_be16(priv->pkey);

	/* Remove ourselves from the multicast group */
	ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
				 &mcast->mcmember.mgid);
	if (ret)
		ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);

	/*
	 * Just make one shot at leaving and don't wait for a reply;
	 * if we fail, too bad.
	 */
	ret = ib_sa_mcmember_rec_delete(priv->ca, priv->port, &rec,
					IB_SA_MCMEMBER_REC_MGID		|
					IB_SA_MCMEMBER_REC_PORT_GID	|
					IB_SA_MCMEMBER_REC_PKEY		|
					IB_SA_MCMEMBER_REC_JOIN_STATE,
					0, GFP_ATOMIC, NULL,
					mcast, &mcast->query);
	if (ret < 0)
		ipoib_warn(priv, "ib_sa_mcmember_rec_delete failed "
			   "for leave (result = %d)\n", ret);

	return 0;
}
666 
/*
 * Transmit @skb to the multicast group @mgid, creating a send-only
 * group entry on demand.  While the group has no address handle yet,
 * up to IPOIB_MAX_MCAST_QUEUE packets are queued (excess is dropped)
 * and a send-only join is kicked off.  Once an AH exists, the packet
 * is sent immediately and the skb's neighbour is cached for the fast
 * path.  Runs under priv->tx_lock (see comment below).
 */
void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid,
		      struct sk_buff *skb)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_mcast *mcast;

	/*
	 * We can only be called from ipoib_start_xmit, so we're
	 * inside tx_lock -- no need to save/restore flags.
	 */
	spin_lock(&priv->lock);

	mcast = __ipoib_mcast_find(dev, mgid);
	if (!mcast) {
		/* Let's create a new send only group now */
		ipoib_dbg_mcast(priv, "setting up send only multicast group for "
				IPOIB_GID_FMT "\n", IPOIB_GID_ARG(*mgid));

		/* Atomic context: cannot sleep for the allocation. */
		mcast = ipoib_mcast_alloc(dev, 0);
		if (!mcast) {
			ipoib_warn(priv, "unable to allocate memory for "
				   "multicast structure\n");
			dev_kfree_skb_any(skb);
			goto out;
		}

		set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
		mcast->mcmember.mgid = *mgid;
		__ipoib_mcast_add(dev, mcast);
		list_add_tail(&mcast->list, &priv->multicast_list);
	}

	if (!mcast->ah) {
		/* No AH yet: queue the packet (bounded) until the join lands. */
		if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
			skb_queue_tail(&mcast->pkt_queue, skb);
		else
			dev_kfree_skb_any(skb);

		if (mcast->query)
			ipoib_dbg_mcast(priv, "no address vector, "
					"but multicast join already started\n");
		else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
			ipoib_mcast_sendonly_join(mcast);

		/*
		 * If lookup completes between here and out:, don't
		 * want to send packet twice.
		 */
		mcast = NULL;
	}

out:
	if (mcast && mcast->ah) {
		/* Cache the AH on the skb's neighbour for the fast path. */
		if (skb->dst            &&
		    skb->dst->neighbour &&
		    !*to_ipoib_neigh(skb->dst->neighbour)) {
			struct ipoib_neigh *neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);

			if (neigh) {
				kref_get(&mcast->ah->ref);
				neigh->ah  	= mcast->ah;
				neigh->neighbour = skb->dst->neighbour;
				*to_ipoib_neigh(skb->dst->neighbour) = neigh;
				list_add_tail(&neigh->list, &mcast->neigh_list);
			}
		}

		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
	}

	spin_unlock(&priv->lock);
}
739 
/*
 * Flush the device's multicast state: replace every current group —
 * including broadcast — in the rb-tree and list with a freshly
 * allocated, unjoined entry carrying over only the MGID (and, for
 * ordinary groups, the send-only flag).  The old entries are moved to
 * a private list and left/freed after the lock is dropped.  On
 * allocation failure the old entry is kept (or priv->broadcast becomes
 * NULL for the broadcast group, to be recreated by the join task).
 */
void ipoib_mcast_dev_flush(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	LIST_HEAD(remove_list);
	struct ipoib_mcast *mcast, *tmcast, *nmcast;
	unsigned long flags;

	ipoib_dbg_mcast(priv, "flushing multicast list\n");

	spin_lock_irqsave(&priv->lock, flags);
	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
		nmcast = ipoib_mcast_alloc(dev, 0);
		if (nmcast) {
			/* Carry over only the send-only flag, nothing else. */
			nmcast->flags =
				mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY);

			nmcast->mcmember.mgid = mcast->mcmember.mgid;

			/* Add the new group in before the to-be-destroyed group */
			list_add_tail(&nmcast->list, &mcast->list);
			list_del_init(&mcast->list);

			rb_replace_node(&mcast->rb_node, &nmcast->rb_node,
					&priv->multicast_tree);

			list_add_tail(&mcast->list, &remove_list);
		} else {
			ipoib_warn(priv, "could not reallocate multicast group "
				   IPOIB_GID_FMT "\n",
				   IPOIB_GID_ARG(mcast->mcmember.mgid));
		}
	}

	if (priv->broadcast) {
		nmcast = ipoib_mcast_alloc(dev, 0);
		if (nmcast) {
			nmcast->mcmember.mgid = priv->broadcast->mcmember.mgid;

			rb_replace_node(&priv->broadcast->rb_node,
					&nmcast->rb_node,
					&priv->multicast_tree);

			list_add_tail(&priv->broadcast->list, &remove_list);
		}

		/* May be NULL on allocation failure; join task recreates it. */
		priv->broadcast = nmcast;
	}

	spin_unlock_irqrestore(&priv->lock, flags);

	/* Leave and free outside the lock (these may sleep/log). */
	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
		ipoib_mcast_leave(dev, mcast);
		ipoib_mcast_free(mcast);
	}
}
795 
/*
 * Tear down the broadcast group on interface down.  It is erased from
 * the rb-tree under the lock, then left and freed outside it; the join
 * task recreates it on the next interface up.
 */
void ipoib_mcast_dev_down(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	unsigned long flags;

	/* Delete broadcast since it will be recreated */
	if (priv->broadcast) {
		ipoib_dbg_mcast(priv, "deleting broadcast group\n");

		spin_lock_irqsave(&priv->lock, flags);
		rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
		spin_unlock_irqrestore(&priv->lock, flags);
		ipoib_mcast_leave(dev, priv->broadcast);
		ipoib_mcast_free(priv->broadcast);
		priv->broadcast = NULL;
	}
}
813 
/*
 * Workqueue handler run when the device's multicast address list
 * changes.  Diffs the stack's dev->mc_list against our group list
 * using a FOUND flag: groups newly present are allocated and inserted
 * (a stale send-only entry for the same MGID is replaced), groups no
 * longer present (and not send-only) are removed.  Removed groups are
 * left and freed outside the lock, then the join task is restarted.
 */
void ipoib_mcast_restart_task(void *dev_ptr)
{
	struct net_device *dev = dev_ptr;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct dev_mc_list *mclist;
	struct ipoib_mcast *mcast, *tmcast;
	LIST_HEAD(remove_list);
	unsigned long flags;

	ipoib_dbg_mcast(priv, "restarting multicast task\n");

	/* Quiesce join activity; restarted at the bottom if admin-up. */
	ipoib_mcast_stop_thread(dev, 0);

	spin_lock_irqsave(&priv->lock, flags);

	/*
	 * Unfortunately, the networking core only gives us a list of all of
	 * the multicast hardware addresses. We need to figure out which ones
	 * are new and which ones have been removed
	 */

	/* Clear out the found flag */
	list_for_each_entry(mcast, &priv->multicast_list, list)
		clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);

	/* Mark all of the entries that are found or don't exist */
	for (mclist = dev->mc_list; mclist; mclist = mclist->next) {
		union ib_gid mgid;

		/* The MGID sits at offset 4 of the 20-byte HW address. */
		memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid);

		/* Add in the P_Key */
		mgid.raw[4] = (priv->pkey >> 8) & 0xff;
		mgid.raw[5] = priv->pkey & 0xff;

		mcast = __ipoib_mcast_find(dev, &mgid);
		if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
			struct ipoib_mcast *nmcast;

			/* Not found or send-only group, let's add a new entry */
			ipoib_dbg_mcast(priv, "adding multicast entry for mgid "
					IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid));

			nmcast = ipoib_mcast_alloc(dev, 0);
			if (!nmcast) {
				ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
				continue;
			}

			set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);

			nmcast->mcmember.mgid = mgid;

			if (mcast) {
				/* Destroy the send only entry */
				list_del(&mcast->list);
				list_add_tail(&mcast->list, &remove_list);

				rb_replace_node(&mcast->rb_node,
						&nmcast->rb_node,
						&priv->multicast_tree);
			} else
				__ipoib_mcast_add(dev, nmcast);

			list_add_tail(&nmcast->list, &priv->multicast_list);
		}

		if (mcast)
			set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
	}

	/* Remove all of the entries don't exist anymore */
	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
		if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
		    !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
			ipoib_dbg_mcast(priv, "deleting multicast group " IPOIB_GID_FMT "\n",
					IPOIB_GID_ARG(mcast->mcmember.mgid));

			rb_erase(&mcast->rb_node, &priv->multicast_tree);

			/* Move to the remove list */
			list_del(&mcast->list);
			list_add_tail(&mcast->list, &remove_list);
		}
	}
	spin_unlock_irqrestore(&priv->lock, flags);

	/* We have to cancel outside of the spinlock */
	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
		ipoib_mcast_leave(mcast->dev, mcast);
		ipoib_mcast_free(mcast);
	}

	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		ipoib_mcast_start_thread(dev);
}
910 
911 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
912 
913 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)
914 {
915 	struct ipoib_mcast_iter *iter;
916 
917 	iter = kmalloc(sizeof *iter, GFP_KERNEL);
918 	if (!iter)
919 		return NULL;
920 
921 	iter->dev = dev;
922 	memset(iter->mgid.raw, 0, 16);
923 
924 	if (ipoib_mcast_iter_next(iter)) {
925 		kfree(iter);
926 		return NULL;
927 	}
928 
929 	return iter;
930 }
931 
932 int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
933 {
934 	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
935 	struct rb_node *n;
936 	struct ipoib_mcast *mcast;
937 	int ret = 1;
938 
939 	spin_lock_irq(&priv->lock);
940 
941 	n = rb_first(&priv->multicast_tree);
942 
943 	while (n) {
944 		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
945 
946 		if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
947 			   sizeof (union ib_gid)) < 0) {
948 			iter->mgid      = mcast->mcmember.mgid;
949 			iter->created   = mcast->created;
950 			iter->queuelen  = skb_queue_len(&mcast->pkt_queue);
951 			iter->complete  = !!mcast->ah;
952 			iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
953 
954 			ret = 0;
955 
956 			break;
957 		}
958 
959 		n = rb_next(n);
960 	}
961 
962 	spin_unlock_irq(&priv->lock);
963 
964 	return ret;
965 }
966 
967 void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
968 			   union ib_gid *mgid,
969 			   unsigned long *created,
970 			   unsigned int *queuelen,
971 			   unsigned int *complete,
972 			   unsigned int *send_only)
973 {
974 	*mgid      = iter->mgid;
975 	*created   = iter->created;
976 	*queuelen  = iter->queuelen;
977 	*complete  = iter->complete;
978 	*send_only = iter->send_only;
979 }
980 
981 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
982