/*
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $
 */

#include <linux/delay.h>
#include <linux/dma-mapping.h>

#include <rdma/ib_cache.h>
#include <linux/ip.h>
#include <linux/tcp.h>

#include "ipoib.h"

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level,
		 "Enable data path debug tracing if > 0");
#endif

static DEFINE_MUTEX(pkey_mutex);

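/*
 * Allocate an ipoib_ah that wraps a hardware address handle.  The
 * wrapper is reference counted; the final kref_put() ends up in
 * ipoib_free_ah() below, which queues the AH on the dead_ahs list for
 * the reaper instead of destroying it while sends may still use it.
 */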
struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
				 struct ib_pd *pd, struct ib_ah_attr *attr)
{
	struct ipoib_ah *ah;

	ah = kmalloc(sizeof *ah, GFP_KERNEL);
	if (!ah)
		return NULL;

	ah->dev       = dev;
	ah->last_send = 0;
	kref_init(&ah->ref);

	ah->ah = ib_create_ah(pd, attr);
	if (IS_ERR(ah->ah)) {
		kfree(ah);
		ah = NULL;
	} else
		ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah);

	return ah;
}

void ipoib_free_ah(struct kref *kref)
{
	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
	struct ipoib_dev_priv *priv = netdev_priv(ah->dev);

	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	list_add_tail(&ah->list, &priv->dead_ahs);
	spin_unlock_irqrestore(&priv->lock, flags);
}

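/*
 * Post a single receive work request for ring slot @id.  On failure the
 * receive buffer is unmapped and freed so the slot is left empty.
 */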
static int ipoib_ib_post_receive(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_sge list;
	struct ib_recv_wr param;
	struct ib_recv_wr *bad_wr;
	int ret;

	list.addr     = priv->rx_ring[id].mapping;
	list.length   = IPOIB_BUF_SIZE;
	list.lkey     = priv->mr->lkey;

	param.next    = NULL;
	param.wr_id   = id | IPOIB_OP_RECV;
	param.sg_list = &list;
	param.num_sge = 1;

	ret = ib_post_recv(priv->qp, &param, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
		dev_kfree_skb_any(priv->rx_ring[id].skb);
		priv->rx_ring[id].skb = NULL;
	}

	return ret;
}

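/* Allocate and DMA-map a fresh receive skb for ring slot @id. */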
static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;
	u64 addr;

	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
	if (!skb)
		return -ENOMEM;

	/*
	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
	 * header.  So we need 4 more bytes to get to 48 and align the
	 * IP header to a multiple of 16.
	 */
	skb_reserve(skb, 4);

	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
				 DMA_FROM_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
		dev_kfree_skb_any(skb);
		return -EIO;
	}

	priv->rx_ring[id].skb     = skb;
	priv->rx_ring[id].mapping = addr;

	return 0;
}

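/* Fill the whole receive ring when the interface is brought up. */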
static int ipoib_ib_post_receives(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (ipoib_alloc_rx_skb(dev, i)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			return -ENOMEM;
		}
		if (ipoib_ib_post_receive(dev, i)) {
			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
			return -EIO;
		}
	}

	return 0;
}

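/*
 * Handle a datagram receive completion: validate the work request ID,
 * drop flushed or errored completions, strip the GRH, hand the packet
 * to the stack and repost a receive buffer for this ring slot.
 */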
static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
	struct sk_buff *skb;
	u64 addr;

	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_recvq_size);
		return;
	}

	skb  = priv->rx_ring[wr_id].skb;
	addr = priv->rx_ring[wr_id].mapping;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			ipoib_warn(priv, "failed recv event "
				   "(status=%d, wrid=%d vend_err %x)\n",
				   wc->status, wr_id, wc->vendor_err);
		ib_dma_unmap_single(priv->ca, addr,
				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
		dev_kfree_skb_any(skb);
		priv->rx_ring[wr_id].skb = NULL;
		return;
	}

	/*
	 * Drop packets that this interface sent, i.e. multicast packets
	 * that the HCA has replicated.
	 */
	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
		goto repost;

	/*
	 * If we can't allocate a new RX buffer, dump
	 * this packet and reuse the old buffer.
	 */
	if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
		++dev->stats.rx_dropped;
		goto repost;
	}

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);

	skb_put(skb, wc->byte_len);
	skb_pull(skb, IB_GRH_BYTES);

	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
	skb_reset_mac_header(skb);
	skb_pull(skb, IPOIB_ENCAP_LEN);

	dev->last_rx = jiffies;
	++dev->stats.rx_packets;
	dev->stats.rx_bytes += skb->len;

	skb->dev = dev;
	/* XXX get correct PACKET_ type here */
	skb->pkt_type = PACKET_HOST;

	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	netif_receive_skb(skb);

repost:
	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
		ipoib_warn(priv, "ipoib_ib_post_receive failed "
			   "for buf %d\n", wr_id);
}

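/*
 * DMA-map the linear head and all page fragments of a TX skb.  If a
 * later mapping fails, everything mapped so far is unwound.
 */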
static int ipoib_dma_map_tx(struct ib_device *ca,
			    struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
					       DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
			return -EIO;

		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		mapping[i + off] = ib_dma_map_page(ca, frag->page,
						   frag->page_offset, frag->size,
						   DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
			goto partial_error;
	}
	return 0;

partial_error:
	for (; i > 0; --i) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
		ib_dma_unmap_page(ca, mapping[i - !off], frag->size, DMA_TO_DEVICE);
	}

	if (off)
		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);

	return -EIO;
}

static void ipoib_dma_unmap_tx(struct ib_device *ca,
			       struct ipoib_tx_buf *tx_req)
{
	struct sk_buff *skb = tx_req->skb;
	u64 *mapping = tx_req->mapping;
	int i;
	int off;

	if (skb_headlen(skb)) {
		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
		off = 1;
	} else
		off = 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		ib_dma_unmap_page(ca, mapping[i + off], frag->size,
				  DMA_TO_DEVICE);
	}
}

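/*
 * Handle a datagram send completion: unmap the buffers, update the
 * statistics, free the skb and wake the netdev queue once the number
 * of outstanding sends drops back to half the ring size.
 */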
static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	unsigned int wr_id = wc->wr_id;
	struct ipoib_tx_buf *tx_req;
	unsigned long flags;

	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &priv->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv->ca, tx_req);

	++dev->stats.tx_packets;
	dev->stats.tx_bytes += tx_req->skb->len;

	dev_kfree_skb_any(tx_req->skb);

	spin_lock_irqsave(&priv->tx_lock, flags);
	++priv->tx_tail;
	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
	    netif_queue_stopped(dev) &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		netif_wake_queue(dev);
	spin_unlock_irqrestore(&priv->tx_lock, flags);

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR)
		ipoib_warn(priv, "failed send event "
			   "(status=%d, wrid=%d vend_err %x)\n",
			   wc->status, wr_id, wc->vendor_err);
}

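/*
 * NAPI poll routine: completions are polled in batches of IPOIB_NUM_WC
 * and dispatched to the connected-mode or datagram RX/TX handlers.
 * Only receive completions count against the budget.
 */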
int ipoib_poll(struct napi_struct *napi, int budget)
{
	struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi);
	struct net_device *dev = priv->dev;
	int done;
	int t;
	int n, i;

	done  = 0;

poll_more:
	while (done < budget) {
		int max = (budget - done);

		t = min(IPOIB_NUM_WC, max);
		n = ib_poll_cq(priv->cq, t, priv->ibwc);

		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;

			if (wc->wr_id & IPOIB_OP_RECV) {
				++done;
				if (wc->wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, wc);
				else
					ipoib_ib_handle_rx_wc(dev, wc);
			} else {
				if (wc->wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_tx_wc(dev, wc);
				else
					ipoib_ib_handle_tx_wc(dev, wc);
			}
		}

		if (n != t)
			break;
	}

	if (done < budget) {
		netif_rx_complete(dev, napi);
		if (unlikely(ib_req_notify_cq(priv->cq,
					      IB_CQ_NEXT_COMP |
					      IB_CQ_REPORT_MISSED_EVENTS)) &&
		    netif_rx_reschedule(dev, napi))
			goto poll_more;
	}

	return done;
}

void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
{
	struct net_device *dev = dev_ptr;
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	netif_rx_schedule(dev, &priv->napi);
}

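/*
 * Build the gather list from the already-mapped buffers and post one UD
 * send work request, using the LSO opcode when a separate header for
 * hardware segmentation is supplied.
 */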
static inline int post_send(struct ipoib_dev_priv *priv,
			    unsigned int wr_id,
			    struct ib_ah *address, u32 qpn,
			    struct ipoib_tx_buf *tx_req,
			    void *head, int hlen)
{
	struct ib_send_wr *bad_wr;
	int i, off;
	struct sk_buff *skb = tx_req->skb;
	skb_frag_t *frags = skb_shinfo(skb)->frags;
	int nr_frags = skb_shinfo(skb)->nr_frags;
	u64 *mapping = tx_req->mapping;

	if (skb_headlen(skb)) {
		priv->tx_sge[0].addr   = mapping[0];
		priv->tx_sge[0].length = skb_headlen(skb);
		off = 1;
	} else
		off = 0;

	for (i = 0; i < nr_frags; ++i) {
		priv->tx_sge[i + off].addr   = mapping[i + off];
		priv->tx_sge[i + off].length = frags[i].size;
	}
	priv->tx_wr.num_sge	     = nr_frags + off;
	priv->tx_wr.wr_id	     = wr_id;
	priv->tx_wr.wr.ud.remote_qpn = qpn;
	priv->tx_wr.wr.ud.ah	     = address;

	if (head) {
		priv->tx_wr.wr.ud.mss	 = skb_shinfo(skb)->gso_size;
		priv->tx_wr.wr.ud.header = head;
		priv->tx_wr.wr.ud.hlen	 = hlen;
		priv->tx_wr.opcode	 = IB_WR_LSO;
	} else
		priv->tx_wr.opcode	 = IB_WR_SEND;

	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
}

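/*
 * Transmit one skb on the datagram QP.  For GSO skbs the TCP/IP headers
 * are pulled out of the linear data and handed to the hardware for LSO;
 * otherwise packets longer than mcast_mtu + IPOIB_ENCAP_LEN are dropped.
 */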
void ipoib_send(struct net_device *dev, struct sk_buff *skb,
		struct ipoib_ah *address, u32 qpn)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_tx_buf *tx_req;
	int hlen;
	void *phead;

	if (skb_is_gso(skb)) {
		hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
		phead = skb->data;
		if (unlikely(!skb_pull(skb, hlen))) {
			ipoib_warn(priv, "linear data too small\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return;
		}
	} else {
		if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
				   skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
			return;
		}
		phead = NULL;
		hlen  = 0;
	}

	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
		       skb->len, address, qpn);

	/*
	 * We put the skb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
	tx_req->skb = skb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
		++dev->stats.tx_errors;
		dev_kfree_skb_any(skb);
		return;
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
	else
		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;

	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
			       address->ah, qpn, tx_req, phead, hlen))) {
		ipoib_warn(priv, "post_send failed\n");
		++dev->stats.tx_errors;
		ipoib_dma_unmap_tx(priv->ca, tx_req);
		dev_kfree_skb_any(skb);
	} else {
		dev->trans_start = jiffies;

		address->last_send = priv->tx_head;
		++priv->tx_head;

		if (++priv->tx_outstanding == ipoib_sendq_size) {
			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
			netif_stop_queue(dev);
		}
	}
}

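/*
 * Destroy address handles on the dead_ahs list whose last use has
 * completed, i.e. whose last_send counter has been passed by tx_tail.
 */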
static void __ipoib_reap_ah(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_ah *ah, *tah;
	LIST_HEAD(remove_list);

	spin_lock_irq(&priv->tx_lock);
	spin_lock(&priv->lock);
	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
			list_del(&ah->list);
			ib_destroy_ah(ah->ah);
			kfree(ah);
		}
	spin_unlock(&priv->lock);
	spin_unlock_irq(&priv->tx_lock);
}

void ipoib_reap_ah(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
	struct net_device *dev = priv->dev;

	__ipoib_reap_ah(dev);

	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
				   round_jiffies_relative(HZ));
}

int ipoib_ib_dev_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int ret;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
		ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
		return -1;
	}
	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);

	ret = ipoib_init_qp(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
		return -1;
	}

	ret = ipoib_ib_post_receives(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
		ipoib_ib_dev_stop(dev, 1);
		return -1;
	}

	ret = ipoib_cm_dev_open(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
		ipoib_ib_dev_stop(dev, 1);
		return -1;
	}

	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
			   round_jiffies_relative(HZ));

	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	return 0;
}

static void ipoib_pkey_dev_check_presence(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	u16 pkey_index = 0;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	else
		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
}

int ipoib_ib_dev_up(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_pkey_dev_check_presence(dev);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_dbg(priv, "PKEY is not assigned.\n");
		return 0;
	}

	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);

	return ipoib_mcast_start_thread(dev);
}

int ipoib_ib_dev_down(struct net_device *dev, int flush)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "downing ib_dev\n");

	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
	netif_carrier_off(dev);

	/* Shutdown the P_Key thread if still active */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		set_bit(IPOIB_PKEY_STOP, &priv->flags);
		cancel_delayed_work(&priv->pkey_poll_task);
		mutex_unlock(&pkey_mutex);
		if (flush)
			flush_workqueue(ipoib_workqueue);
	}

	ipoib_mcast_stop_thread(dev, flush);
	ipoib_mcast_dev_flush(dev);

	ipoib_flush_paths(dev);

	return 0;
}

static int recvs_pending(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int pending = 0;
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (priv->rx_ring[i].skb)
			++pending;

	return pending;
}

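/*
 * Poll the CQ until it is empty so that all outstanding completions are
 * consumed while the device is being brought down.
 */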
void ipoib_drain_cq(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int i, n;
	do {
		n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; ++i) {
			/*
			 * Convert any successful completions to flush
			 * errors to avoid passing packets up the
			 * stack after bringing the device down.
			 */
			if (priv->ibwc[i].status == IB_WC_SUCCESS)
				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;

			if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
				else
					ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
			} else {
				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
					ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
				else
					ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
			}
		}
	} while (n == IPOIB_NUM_WC);
}

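/*
 * Quiesce the datagram QP: move it to the error state, wait up to five
 * seconds for all sends and receives to complete, then reset the QP and
 * reap any remaining address handles.
 */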
int ipoib_ib_dev_stop(struct net_device *dev, int flush)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ib_qp_attr qp_attr;
	unsigned long begin;
	struct ipoib_tx_buf *tx_req;
	int i;

	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	ipoib_cm_dev_stop(dev);

	/*
	 * Move our QP to the error state and then reinitialize it
	 * when all work requests have completed or have been flushed.
	 */
	qp_attr.qp_state = IB_QPS_ERR;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");

	/* Wait for all sends and receives to complete */
	begin = jiffies;

	while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
				   priv->tx_head - priv->tx_tail, recvs_pending(dev));

			/*
			 * assume the HW is wedged and just free up
			 * all our pending work requests.
			 */
			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
				tx_req = &priv->tx_ring[priv->tx_tail &
							(ipoib_sendq_size - 1)];
				ipoib_dma_unmap_tx(priv->ca, tx_req);
				dev_kfree_skb_any(tx_req->skb);
				++priv->tx_tail;
				--priv->tx_outstanding;
			}

			for (i = 0; i < ipoib_recvq_size; ++i) {
				struct ipoib_rx_buf *rx_req;

				rx_req = &priv->rx_ring[i];
				if (!rx_req->skb)
					continue;
				ib_dma_unmap_single(priv->ca,
						    rx_req->mapping,
						    IPOIB_BUF_SIZE,
						    DMA_FROM_DEVICE);
				dev_kfree_skb_any(rx_req->skb);
				rx_req->skb = NULL;
			}

			goto timeout;
		}

		ipoib_drain_cq(dev);

		msleep(1);
	}

	ipoib_dbg(priv, "All sends and receives done.\n");

timeout:
	qp_attr.qp_state = IB_QPS_RESET;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to RESET state\n");

	/* Wait for all AHs to be reaped */
	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	cancel_delayed_work(&priv->ah_reap_task);
	if (flush)
		flush_workqueue(ipoib_workqueue);

	begin = jiffies;

	while (!list_empty(&priv->dead_ahs)) {
		__ipoib_reap_ah(dev);

		if (time_after(jiffies, begin + HZ)) {
			ipoib_warn(priv, "timing out; will leak address handles\n");
			break;
		}

		msleep(1);
	}

	ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP);

	return 0;
}

int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	priv->ca = ca;
	priv->port = port;
	priv->qp = NULL;

	if (ipoib_transport_dev_init(dev, ca)) {
		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
		return -ENODEV;
	}

	if (dev->flags & IFF_UP) {
		if (ipoib_ib_dev_open(dev)) {
			ipoib_transport_dev_cleanup(dev);
			return -ENODEV;
		}
	}

	return 0;
}

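/*
 * Flush this device and any child interfaces after a port state or
 * P_Key table change.  For P_Key events the QP is restarted if the
 * P_Key index has changed.
 */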
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
{
	struct ipoib_dev_priv *cpriv;
	struct net_device *dev = priv->dev;
	u16 new_index;

	mutex_lock(&priv->vlan_mutex);

	/*
	 * Flush any child interfaces too -- they might be up even if
	 * the parent is down.
	 */
	list_for_each_entry(cpriv, &priv->child_intfs, list)
		__ipoib_ib_dev_flush(cpriv, pkey_event);

	mutex_unlock(&priv->vlan_mutex);

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
		return;
	}

	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
		return;
	}

	if (pkey_event) {
		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
			ipoib_ib_dev_down(dev, 0);
			ipoib_ib_dev_stop(dev, 0);
			if (ipoib_pkey_dev_delay_open(dev))
				return;
		}

		/* restart QP only if P_Key index is changed */
		if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
		    new_index == priv->pkey_index) {
			ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
			return;
		}
		priv->pkey_index = new_index;
	}

	ipoib_dbg(priv, "flushing\n");

	ipoib_ib_dev_down(dev, 0);

	if (pkey_event) {
		ipoib_ib_dev_stop(dev, 0);
		ipoib_ib_dev_open(dev);
	}

	/*
	 * The device could have been brought down between the start and when
	 * we get here; don't bring it back up if it's not configured up.
	 */
	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		ipoib_ib_dev_up(dev);
		ipoib_mcast_restart_task(&priv->restart_task);
	}
}

void ipoib_ib_dev_flush(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_task);

	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
	__ipoib_ib_dev_flush(priv, 0);
}

void ipoib_pkey_event(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, pkey_event_task);

	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
	__ipoib_ib_dev_flush(priv, 1);
}

void ipoib_ib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "cleaning up ib_dev\n");

	ipoib_mcast_stop_thread(dev, 1);
	ipoib_mcast_dev_flush(dev);

	ipoib_transport_dev_cleanup(dev);
}

/*
 * Delayed P_Key Assignment Interim Support
 *
 * The following is an initial implementation of the delayed P_Key
 * assignment mechanism.  It uses the same approach implemented for the
 * multicast group join.  The single goal of this implementation is to
 * quickly address Bug #2507.  This implementation will probably be
 * removed when the P_Key change async notification is available.
 */

void ipoib_pkey_poll(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
	struct net_device *dev = priv->dev;

	ipoib_pkey_dev_check_presence(dev);

	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
		ipoib_open(dev);
	else {
		mutex_lock(&pkey_mutex);
		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
			queue_delayed_work(ipoib_workqueue,
					   &priv->pkey_poll_task,
					   HZ);
		mutex_unlock(&pkey_mutex);
	}
}

int ipoib_pkey_dev_delay_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/*
	 * Look for the interface P_Key value in the IB port P_Key table
	 * and set the interface P_Key assignment flag.
	 */
	ipoib_pkey_dev_check_presence(dev);

	/* P_Key value not assigned yet - start polling */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
		queue_delayed_work(ipoib_workqueue,
				   &priv->pkey_poll_task,
				   HZ);
		mutex_unlock(&pkey_mutex);
		return 1;
	}

	return 0;
}