xref: /openbmc/linux/drivers/infiniband/hw/hfi1/vnic_main.c (revision 4da722ca19f30f7db250db808d1ab1703607a932)
1 /*
2  * Copyright(c) 2017 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47 
48 /*
49  * This file contains HFI1 support for VNIC functionality
50  */
51 
52 #include <linux/io.h>
53 #include <linux/if_vlan.h>
54 
55 #include "vnic.h"
56 
/* Transmit watchdog timeout, in milliseconds (used for netdev->watchdog_timeo) */
#define HFI_TX_TIMEOUT_MS 1000

/* Receive queue depth above which newly received packets are dropped */
#define HFI1_VNIC_RCV_Q_SIZE   1024

/* Bit number in vinfo->flags that is set while the vnic port is up */
#define HFI1_VNIC_UP 0

/* Guards the rx_nohandler update made for packets with an unknown vesw id */
static DEFINE_SPINLOCK(vport_cntr_lock);
64 
65 static int setup_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt)
66 {
67 	unsigned int rcvctrl_ops = 0;
68 	int ret;
69 
70 	hfi1_init_ctxt(uctxt->sc);
71 
72 	uctxt->do_interrupt = &handle_receive_interrupt;
73 
74 	/* Now allocate the RcvHdr queue and eager buffers. */
75 	ret = hfi1_create_rcvhdrq(dd, uctxt);
76 	if (ret)
77 		goto done;
78 
79 	ret = hfi1_setup_eagerbufs(uctxt);
80 	if (ret)
81 		goto done;
82 
83 	if (uctxt->rcvhdrtail_kvaddr)
84 		clear_rcvhdrtail(uctxt);
85 
86 	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
87 	rcvctrl_ops |= HFI1_RCVCTRL_INTRAVAIL_ENB;
88 
89 	if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
90 		rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
91 	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
92 		rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
93 	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
94 		rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
95 	if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
96 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
97 
98 	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
99 
100 	uctxt->is_vnic = true;
101 done:
102 	return ret;
103 }
104 
105 static int allocate_vnic_ctxt(struct hfi1_devdata *dd,
106 			      struct hfi1_ctxtdata **vnic_ctxt)
107 {
108 	struct hfi1_ctxtdata *uctxt;
109 	unsigned int ctxt;
110 	int ret;
111 
112 	if (dd->flags & HFI1_FROZEN)
113 		return -EIO;
114 
115 	for (ctxt = dd->first_dyn_alloc_ctxt;
116 	     ctxt < dd->num_rcv_contexts; ctxt++)
117 		if (!dd->rcd[ctxt])
118 			break;
119 
120 	if (ctxt == dd->num_rcv_contexts)
121 		return -EBUSY;
122 
123 	uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, dd->node);
124 	if (!uctxt) {
125 		dd_dev_err(dd, "Unable to create ctxtdata, failing open\n");
126 		return -ENOMEM;
127 	}
128 
129 	uctxt->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
130 			HFI1_CAP_KGET(NODROP_RHQ_FULL) |
131 			HFI1_CAP_KGET(NODROP_EGR_FULL) |
132 			HFI1_CAP_KGET(DMA_RTAIL);
133 	uctxt->seq_cnt = 1;
134 
135 	/* Allocate and enable a PIO send context */
136 	uctxt->sc = sc_alloc(dd, SC_VNIC, uctxt->rcvhdrqentsize,
137 			     uctxt->numa_id);
138 
139 	ret = uctxt->sc ? 0 : -ENOMEM;
140 	if (ret)
141 		goto bail;
142 
143 	dd_dev_dbg(dd, "allocated vnic send context %u(%u)\n",
144 		   uctxt->sc->sw_index, uctxt->sc->hw_context);
145 	ret = sc_enable(uctxt->sc);
146 	if (ret)
147 		goto bail;
148 
149 	if (dd->num_msix_entries)
150 		hfi1_set_vnic_msix_info(uctxt);
151 
152 	hfi1_stats.sps_ctxts++;
153 	dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt);
154 	*vnic_ctxt = uctxt;
155 
156 	return ret;
157 bail:
158 	/*
159 	 * hfi1_free_ctxtdata() also releases send_context
160 	 * structure if uctxt->sc is not null
161 	 */
162 	dd->rcd[uctxt->ctxt] = NULL;
163 	hfi1_free_ctxtdata(dd, uctxt);
164 	dd_dev_dbg(dd, "vnic allocation failed. rc %d\n", ret);
165 	return ret;
166 }
167 
168 static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
169 				 struct hfi1_ctxtdata *uctxt)
170 {
171 	unsigned long flags;
172 
173 	dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt);
174 	flush_wc();
175 
176 	if (dd->num_msix_entries)
177 		hfi1_reset_vnic_msix_info(uctxt);
178 
179 	spin_lock_irqsave(&dd->uctxt_lock, flags);
180 	/*
181 	 * Disable receive context and interrupt available, reset all
182 	 * RcvCtxtCtrl bits to default values.
183 	 */
184 	hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
185 		     HFI1_RCVCTRL_TIDFLOW_DIS |
186 		     HFI1_RCVCTRL_INTRAVAIL_DIS |
187 		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
188 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
189 		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
190 	/*
191 	 * VNIC contexts are allocated from user context pool.
192 	 * Release them back to user context pool.
193 	 *
194 	 * Reset context integrity checks to default.
195 	 * (writes to CSRs probably belong in chip.c)
196 	 */
197 	write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
198 			hfi1_pkt_default_send_ctxt_mask(dd, SC_USER));
199 	sc_disable(uctxt->sc);
200 
201 	dd->send_contexts[uctxt->sc->sw_index].type = SC_USER;
202 	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
203 
204 	dd->rcd[uctxt->ctxt] = NULL;
205 	uctxt->event_flags = 0;
206 
207 	hfi1_clear_tids(uctxt);
208 	hfi1_clear_ctxt_pkey(dd, uctxt);
209 
210 	hfi1_stats.sps_ctxts--;
211 	hfi1_free_ctxtdata(dd, uctxt);
212 }
213 
214 void hfi1_vnic_setup(struct hfi1_devdata *dd)
215 {
216 	idr_init(&dd->vnic.vesw_idr);
217 }
218 
219 void hfi1_vnic_cleanup(struct hfi1_devdata *dd)
220 {
221 	idr_destroy(&dd->vnic.vesw_idr);
222 }
223 
/*
 * SUM_GRP_COUNTERS - add one queue's group counters into a running total
 * @stats:  pointer to the destination stats (accumulated into)
 * @qstats: pointer to the per-queue stats (only read)
 * @x_grp:  name of the group member to sum (e.g. tx_grp or rx_grp)
 *
 * Walks the group's counters as a flat u64 array, from 'unicast' up to
 * and including 's_1519_max', so it relies on those two members
 * delimiting a run of contiguous u64 counters.
 *
 * The pointer arguments are parenthesized so that any pointer expression
 * (for example &local) expands correctly. They are evaluated more than
 * once, so they must not have side effects.
 */
#define SUM_GRP_COUNTERS(stats, qstats, x_grp) do {              \
		u64 *src64, *dst64;                              \
		for (src64 = &(qstats)->x_grp.unicast,           \
			dst64 = &(stats)->x_grp.unicast;         \
			dst64 <= &(stats)->x_grp.s_1519_max;) {  \
			*dst64++ += *src64++;                    \
		}                                                \
	} while (0)
232 
233 /* hfi1_vnic_update_stats - update statistics */
234 static void hfi1_vnic_update_stats(struct hfi1_vnic_vport_info *vinfo,
235 				   struct opa_vnic_stats *stats)
236 {
237 	struct net_device *netdev = vinfo->netdev;
238 	u8 i;
239 
240 	/* add tx counters on different queues */
241 	for (i = 0; i < vinfo->num_tx_q; i++) {
242 		struct opa_vnic_stats *qstats = &vinfo->stats[i];
243 		struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats;
244 
245 		stats->netstats.tx_fifo_errors += qnstats->tx_fifo_errors;
246 		stats->netstats.tx_carrier_errors += qnstats->tx_carrier_errors;
247 		stats->tx_drop_state += qstats->tx_drop_state;
248 		stats->tx_dlid_zero += qstats->tx_dlid_zero;
249 
250 		SUM_GRP_COUNTERS(stats, qstats, tx_grp);
251 		stats->netstats.tx_packets += qnstats->tx_packets;
252 		stats->netstats.tx_bytes += qnstats->tx_bytes;
253 	}
254 
255 	/* add rx counters on different queues */
256 	for (i = 0; i < vinfo->num_rx_q; i++) {
257 		struct opa_vnic_stats *qstats = &vinfo->stats[i];
258 		struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats;
259 
260 		stats->netstats.rx_fifo_errors += qnstats->rx_fifo_errors;
261 		stats->netstats.rx_nohandler += qnstats->rx_nohandler;
262 		stats->rx_drop_state += qstats->rx_drop_state;
263 		stats->rx_oversize += qstats->rx_oversize;
264 		stats->rx_runt += qstats->rx_runt;
265 
266 		SUM_GRP_COUNTERS(stats, qstats, rx_grp);
267 		stats->netstats.rx_packets += qnstats->rx_packets;
268 		stats->netstats.rx_bytes += qnstats->rx_bytes;
269 	}
270 
271 	stats->netstats.tx_errors = stats->netstats.tx_fifo_errors +
272 				    stats->netstats.tx_carrier_errors +
273 				    stats->tx_drop_state + stats->tx_dlid_zero;
274 	stats->netstats.tx_dropped = stats->netstats.tx_errors;
275 
276 	stats->netstats.rx_errors = stats->netstats.rx_fifo_errors +
277 				    stats->netstats.rx_nohandler +
278 				    stats->rx_drop_state + stats->rx_oversize +
279 				    stats->rx_runt;
280 	stats->netstats.rx_dropped = stats->netstats.rx_errors;
281 
282 	netdev->stats.tx_packets = stats->netstats.tx_packets;
283 	netdev->stats.tx_bytes = stats->netstats.tx_bytes;
284 	netdev->stats.tx_fifo_errors = stats->netstats.tx_fifo_errors;
285 	netdev->stats.tx_carrier_errors = stats->netstats.tx_carrier_errors;
286 	netdev->stats.tx_errors = stats->netstats.tx_errors;
287 	netdev->stats.tx_dropped = stats->netstats.tx_dropped;
288 
289 	netdev->stats.rx_packets = stats->netstats.rx_packets;
290 	netdev->stats.rx_bytes = stats->netstats.rx_bytes;
291 	netdev->stats.rx_fifo_errors = stats->netstats.rx_fifo_errors;
292 	netdev->stats.multicast = stats->rx_grp.mcastbcast;
293 	netdev->stats.rx_length_errors = stats->rx_oversize + stats->rx_runt;
294 	netdev->stats.rx_errors = stats->netstats.rx_errors;
295 	netdev->stats.rx_dropped = stats->netstats.rx_dropped;
296 }
297 
298 /* update_len_counters - update pkt's len histogram counters */
299 static inline void update_len_counters(struct opa_vnic_grp_stats *grp,
300 				       int len)
301 {
302 	/* account for 4 byte FCS */
303 	if (len >= 1515)
304 		grp->s_1519_max++;
305 	else if (len >= 1020)
306 		grp->s_1024_1518++;
307 	else if (len >= 508)
308 		grp->s_512_1023++;
309 	else if (len >= 252)
310 		grp->s_256_511++;
311 	else if (len >= 124)
312 		grp->s_128_255++;
313 	else if (len >= 61)
314 		grp->s_65_127++;
315 	else
316 		grp->s_64++;
317 }
318 
319 /* hfi1_vnic_update_tx_counters - update transmit counters */
320 static void hfi1_vnic_update_tx_counters(struct hfi1_vnic_vport_info *vinfo,
321 					 u8 q_idx, struct sk_buff *skb, int err)
322 {
323 	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
324 	struct opa_vnic_stats *stats = &vinfo->stats[q_idx];
325 	struct opa_vnic_grp_stats *tx_grp = &stats->tx_grp;
326 	u16 vlan_tci;
327 
328 	stats->netstats.tx_packets++;
329 	stats->netstats.tx_bytes += skb->len + ETH_FCS_LEN;
330 
331 	update_len_counters(tx_grp, skb->len);
332 
333 	/* rest of the counts are for good packets only */
334 	if (unlikely(err))
335 		return;
336 
337 	if (is_multicast_ether_addr(mac_hdr->h_dest))
338 		tx_grp->mcastbcast++;
339 	else
340 		tx_grp->unicast++;
341 
342 	if (!__vlan_get_tag(skb, &vlan_tci))
343 		tx_grp->vlan++;
344 	else
345 		tx_grp->untagged++;
346 }
347 
348 /* hfi1_vnic_update_rx_counters - update receive counters */
349 static void hfi1_vnic_update_rx_counters(struct hfi1_vnic_vport_info *vinfo,
350 					 u8 q_idx, struct sk_buff *skb, int err)
351 {
352 	struct ethhdr *mac_hdr = (struct ethhdr *)skb->data;
353 	struct opa_vnic_stats *stats = &vinfo->stats[q_idx];
354 	struct opa_vnic_grp_stats *rx_grp = &stats->rx_grp;
355 	u16 vlan_tci;
356 
357 	stats->netstats.rx_packets++;
358 	stats->netstats.rx_bytes += skb->len + ETH_FCS_LEN;
359 
360 	update_len_counters(rx_grp, skb->len);
361 
362 	/* rest of the counts are for good packets only */
363 	if (unlikely(err))
364 		return;
365 
366 	if (is_multicast_ether_addr(mac_hdr->h_dest))
367 		rx_grp->mcastbcast++;
368 	else
369 		rx_grp->unicast++;
370 
371 	if (!__vlan_get_tag(skb, &vlan_tci))
372 		rx_grp->vlan++;
373 	else
374 		rx_grp->untagged++;
375 }
376 
/*
 * hfi1_vnic_get_stats64 - ndo_get_stats64 hook, overloaded for opa_vnic
 * @netdev: the vnic net device
 * @stats: treated as a struct opa_vnic_stats, not just the
 *         rtnl_link_stats64 the prototype names
 *
 * NOTE(review): the cast presumes every caller hands in storage that is
 * really a struct opa_vnic_stats - confirm against the opa_vnic caller.
 */
static void hfi1_vnic_get_stats64(struct net_device *netdev,
				  struct rtnl_link_stats64 *stats)
{
	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);

	hfi1_vnic_update_stats(vinfo, (struct opa_vnic_stats *)stats);
}
386 
387 static u64 create_bypass_pbc(u32 vl, u32 dw_len)
388 {
389 	u64 pbc;
390 
391 	pbc = ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
392 		| PBC_INSERT_BYPASS_ICRC | PBC_CREDIT_RETURN
393 		| PBC_PACKET_BYPASS
394 		| ((vl & PBC_VL_MASK) << PBC_VL_SHIFT)
395 		| (dw_len & PBC_LENGTH_DWS_MASK) << PBC_LENGTH_DWS_SHIFT;
396 
397 	return pbc;
398 }
399 
400 /* hfi1_vnic_maybe_stop_tx - stop tx queue if required */
401 static void hfi1_vnic_maybe_stop_tx(struct hfi1_vnic_vport_info *vinfo,
402 				    u8 q_idx)
403 {
404 	netif_stop_subqueue(vinfo->netdev, q_idx);
405 	if (!hfi1_vnic_sdma_write_avail(vinfo, q_idx))
406 		return;
407 
408 	netif_start_subqueue(vinfo->netdev, q_idx);
409 }
410 
411 static netdev_tx_t hfi1_netdev_start_xmit(struct sk_buff *skb,
412 					  struct net_device *netdev)
413 {
414 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
415 	u8 pad_len, q_idx = skb->queue_mapping;
416 	struct hfi1_devdata *dd = vinfo->dd;
417 	struct opa_vnic_skb_mdata *mdata;
418 	u32 pkt_len, total_len;
419 	int err = -EINVAL;
420 	u64 pbc;
421 
422 	v_dbg("xmit: queue %d skb len %d\n", q_idx, skb->len);
423 	if (unlikely(!netif_oper_up(netdev))) {
424 		vinfo->stats[q_idx].tx_drop_state++;
425 		goto tx_finish;
426 	}
427 
428 	/* take out meta data */
429 	mdata = (struct opa_vnic_skb_mdata *)skb->data;
430 	skb_pull(skb, sizeof(*mdata));
431 	if (unlikely(mdata->flags & OPA_VNIC_SKB_MDATA_ENCAP_ERR)) {
432 		vinfo->stats[q_idx].tx_dlid_zero++;
433 		goto tx_finish;
434 	}
435 
436 	/* add tail padding (for 8 bytes size alignment) and icrc */
437 	pad_len = -(skb->len + OPA_VNIC_ICRC_TAIL_LEN) & 0x7;
438 	pad_len += OPA_VNIC_ICRC_TAIL_LEN;
439 
440 	/*
441 	 * pkt_len is how much data we have to write, includes header and data.
442 	 * total_len is length of the packet in Dwords plus the PBC should not
443 	 * include the CRC.
444 	 */
445 	pkt_len = (skb->len + pad_len) >> 2;
446 	total_len = pkt_len + 2; /* PBC + packet */
447 
448 	pbc = create_bypass_pbc(mdata->vl, total_len);
449 
450 	skb_get(skb);
451 	v_dbg("pbc 0x%016llX len %d pad_len %d\n", pbc, skb->len, pad_len);
452 	err = dd->process_vnic_dma_send(dd, q_idx, vinfo, skb, pbc, pad_len);
453 	if (unlikely(err)) {
454 		if (err == -ENOMEM)
455 			vinfo->stats[q_idx].netstats.tx_fifo_errors++;
456 		else if (err != -EBUSY)
457 			vinfo->stats[q_idx].netstats.tx_carrier_errors++;
458 	}
459 	/* remove the header before updating tx counters */
460 	skb_pull(skb, OPA_VNIC_HDR_LEN);
461 
462 	if (unlikely(err == -EBUSY)) {
463 		hfi1_vnic_maybe_stop_tx(vinfo, q_idx);
464 		dev_kfree_skb_any(skb);
465 		return NETDEV_TX_BUSY;
466 	}
467 
468 tx_finish:
469 	/* update tx counters */
470 	hfi1_vnic_update_tx_counters(vinfo, q_idx, skb, err);
471 	dev_kfree_skb_any(skb);
472 	return NETDEV_TX_OK;
473 }
474 
475 static u16 hfi1_vnic_select_queue(struct net_device *netdev,
476 				  struct sk_buff *skb,
477 				  void *accel_priv,
478 				  select_queue_fallback_t fallback)
479 {
480 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
481 	struct opa_vnic_skb_mdata *mdata;
482 	struct sdma_engine *sde;
483 
484 	mdata = (struct opa_vnic_skb_mdata *)skb->data;
485 	sde = sdma_select_engine_vl(vinfo->dd, mdata->entropy, mdata->vl);
486 	return sde->this_idx;
487 }
488 
489 /* hfi1_vnic_decap_skb - strip OPA header from the skb (ethernet) packet */
490 static inline int hfi1_vnic_decap_skb(struct hfi1_vnic_rx_queue *rxq,
491 				      struct sk_buff *skb)
492 {
493 	struct hfi1_vnic_vport_info *vinfo = rxq->vinfo;
494 	int max_len = vinfo->netdev->mtu + VLAN_ETH_HLEN;
495 	int rc = -EFAULT;
496 
497 	skb_pull(skb, OPA_VNIC_HDR_LEN);
498 
499 	/* Validate Packet length */
500 	if (unlikely(skb->len > max_len))
501 		vinfo->stats[rxq->idx].rx_oversize++;
502 	else if (unlikely(skb->len < ETH_ZLEN))
503 		vinfo->stats[rxq->idx].rx_runt++;
504 	else
505 		rc = 0;
506 	return rc;
507 }
508 
509 static inline struct sk_buff *hfi1_vnic_get_skb(struct hfi1_vnic_rx_queue *rxq)
510 {
511 	unsigned char *pad_info;
512 	struct sk_buff *skb;
513 
514 	skb = skb_dequeue(&rxq->skbq);
515 	if (unlikely(!skb))
516 		return NULL;
517 
518 	/* remove tail padding and icrc */
519 	pad_info = skb->data + skb->len - 1;
520 	skb_trim(skb, (skb->len - OPA_VNIC_ICRC_TAIL_LEN -
521 		       ((*pad_info) & 0x7)));
522 
523 	return skb;
524 }
525 
526 /* hfi1_vnic_handle_rx - handle skb receive */
527 static void hfi1_vnic_handle_rx(struct hfi1_vnic_rx_queue *rxq,
528 				int *work_done, int work_to_do)
529 {
530 	struct hfi1_vnic_vport_info *vinfo = rxq->vinfo;
531 	struct sk_buff *skb;
532 	int rc;
533 
534 	while (1) {
535 		if (*work_done >= work_to_do)
536 			break;
537 
538 		skb = hfi1_vnic_get_skb(rxq);
539 		if (unlikely(!skb))
540 			break;
541 
542 		rc = hfi1_vnic_decap_skb(rxq, skb);
543 		/* update rx counters */
544 		hfi1_vnic_update_rx_counters(vinfo, rxq->idx, skb, rc);
545 		if (unlikely(rc)) {
546 			dev_kfree_skb_any(skb);
547 			continue;
548 		}
549 
550 		skb_checksum_none_assert(skb);
551 		skb->protocol = eth_type_trans(skb, rxq->netdev);
552 
553 		napi_gro_receive(&rxq->napi, skb);
554 		(*work_done)++;
555 	}
556 }
557 
558 /* hfi1_vnic_napi - napi receive polling callback function */
559 static int hfi1_vnic_napi(struct napi_struct *napi, int budget)
560 {
561 	struct hfi1_vnic_rx_queue *rxq = container_of(napi,
562 					      struct hfi1_vnic_rx_queue, napi);
563 	struct hfi1_vnic_vport_info *vinfo = rxq->vinfo;
564 	int work_done = 0;
565 
566 	v_dbg("napi %d budget %d\n", rxq->idx, budget);
567 	hfi1_vnic_handle_rx(rxq, &work_done, budget);
568 
569 	v_dbg("napi %d work_done %d\n", rxq->idx, work_done);
570 	if (work_done < budget)
571 		napi_complete(napi);
572 
573 	return work_done;
574 }
575 
576 void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet)
577 {
578 	struct hfi1_devdata *dd = packet->rcd->dd;
579 	struct hfi1_vnic_vport_info *vinfo = NULL;
580 	struct hfi1_vnic_rx_queue *rxq;
581 	struct sk_buff *skb;
582 	int l4_type, vesw_id = -1;
583 	u8 q_idx;
584 
585 	l4_type = HFI1_GET_L4_TYPE(packet->ebuf);
586 	if (likely(l4_type == OPA_VNIC_L4_ETHR)) {
587 		vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf);
588 		vinfo = idr_find(&dd->vnic.vesw_idr, vesw_id);
589 
590 		/*
591 		 * In case of invalid vesw id, count the error on
592 		 * the first available vport.
593 		 */
594 		if (unlikely(!vinfo)) {
595 			struct hfi1_vnic_vport_info *vinfo_tmp;
596 			int id_tmp = 0;
597 
598 			vinfo_tmp =  idr_get_next(&dd->vnic.vesw_idr, &id_tmp);
599 			if (vinfo_tmp) {
600 				spin_lock(&vport_cntr_lock);
601 				vinfo_tmp->stats[0].netstats.rx_nohandler++;
602 				spin_unlock(&vport_cntr_lock);
603 			}
604 		}
605 	}
606 
607 	if (unlikely(!vinfo)) {
608 		dd_dev_warn(dd, "vnic rcv err: l4 %d vesw id %d ctx %d\n",
609 			    l4_type, vesw_id, packet->rcd->ctxt);
610 		return;
611 	}
612 
613 	q_idx = packet->rcd->vnic_q_idx;
614 	rxq = &vinfo->rxq[q_idx];
615 	if (unlikely(!netif_oper_up(vinfo->netdev))) {
616 		vinfo->stats[q_idx].rx_drop_state++;
617 		skb_queue_purge(&rxq->skbq);
618 		return;
619 	}
620 
621 	if (unlikely(skb_queue_len(&rxq->skbq) > HFI1_VNIC_RCV_Q_SIZE)) {
622 		vinfo->stats[q_idx].netstats.rx_fifo_errors++;
623 		return;
624 	}
625 
626 	skb = netdev_alloc_skb(vinfo->netdev, packet->tlen);
627 	if (unlikely(!skb)) {
628 		vinfo->stats[q_idx].netstats.rx_fifo_errors++;
629 		return;
630 	}
631 
632 	memcpy(skb->data, packet->ebuf, packet->tlen);
633 	skb_put(skb, packet->tlen);
634 	skb_queue_tail(&rxq->skbq, skb);
635 
636 	if (napi_schedule_prep(&rxq->napi)) {
637 		v_dbg("napi %d scheduling\n", q_idx);
638 		__napi_schedule(&rxq->napi);
639 	}
640 }
641 
642 static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo)
643 {
644 	struct hfi1_devdata *dd = vinfo->dd;
645 	struct net_device *netdev = vinfo->netdev;
646 	int i, rc;
647 
648 	/* ensure virtual eth switch id is valid */
649 	if (!vinfo->vesw_id)
650 		return -EINVAL;
651 
652 	rc = idr_alloc(&dd->vnic.vesw_idr, vinfo, vinfo->vesw_id,
653 		       vinfo->vesw_id + 1, GFP_NOWAIT);
654 	if (rc < 0)
655 		return rc;
656 
657 	for (i = 0; i < vinfo->num_rx_q; i++) {
658 		struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i];
659 
660 		skb_queue_head_init(&rxq->skbq);
661 		napi_enable(&rxq->napi);
662 	}
663 
664 	netif_carrier_on(netdev);
665 	netif_tx_start_all_queues(netdev);
666 	set_bit(HFI1_VNIC_UP, &vinfo->flags);
667 
668 	return 0;
669 }
670 
671 static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo)
672 {
673 	struct hfi1_devdata *dd = vinfo->dd;
674 	u8 i;
675 
676 	clear_bit(HFI1_VNIC_UP, &vinfo->flags);
677 	netif_carrier_off(vinfo->netdev);
678 	netif_tx_disable(vinfo->netdev);
679 	idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id);
680 
681 	/* ensure irqs see the change */
682 	hfi1_vnic_synchronize_irq(dd);
683 
684 	/* remove unread skbs */
685 	for (i = 0; i < vinfo->num_rx_q; i++) {
686 		struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i];
687 
688 		napi_disable(&rxq->napi);
689 		skb_queue_purge(&rxq->skbq);
690 	}
691 }
692 
693 static int hfi1_netdev_open(struct net_device *netdev)
694 {
695 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
696 	int rc;
697 
698 	mutex_lock(&vinfo->lock);
699 	rc = hfi1_vnic_up(vinfo);
700 	mutex_unlock(&vinfo->lock);
701 	return rc;
702 }
703 
704 static int hfi1_netdev_close(struct net_device *netdev)
705 {
706 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
707 
708 	mutex_lock(&vinfo->lock);
709 	if (test_bit(HFI1_VNIC_UP, &vinfo->flags))
710 		hfi1_vnic_down(vinfo);
711 	mutex_unlock(&vinfo->lock);
712 	return 0;
713 }
714 
715 static int hfi1_vnic_allot_ctxt(struct hfi1_devdata *dd,
716 				struct hfi1_ctxtdata **vnic_ctxt)
717 {
718 	int rc;
719 
720 	rc = allocate_vnic_ctxt(dd, vnic_ctxt);
721 	if (rc) {
722 		dd_dev_err(dd, "vnic ctxt alloc failed %d\n", rc);
723 		return rc;
724 	}
725 
726 	rc = setup_vnic_ctxt(dd, *vnic_ctxt);
727 	if (rc) {
728 		dd_dev_err(dd, "vnic ctxt setup failed %d\n", rc);
729 		deallocate_vnic_ctxt(dd, *vnic_ctxt);
730 		*vnic_ctxt = NULL;
731 	}
732 
733 	return rc;
734 }
735 
736 static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo)
737 {
738 	struct hfi1_devdata *dd = vinfo->dd;
739 	int i, rc = 0;
740 
741 	mutex_lock(&hfi1_mutex);
742 	if (!dd->vnic.num_vports) {
743 		rc = hfi1_vnic_txreq_init(dd);
744 		if (rc)
745 			goto txreq_fail;
746 
747 		dd->vnic.msix_idx = dd->first_dyn_msix_idx;
748 	}
749 
750 	for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) {
751 		rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]);
752 		if (rc)
753 			break;
754 		dd->vnic.ctxt[i]->vnic_q_idx = i;
755 	}
756 
757 	if (i < vinfo->num_rx_q) {
758 		/*
759 		 * If required amount of contexts is not
760 		 * allocated successfully then remaining contexts
761 		 * are released.
762 		 */
763 		while (i-- > dd->vnic.num_ctxt) {
764 			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
765 			dd->vnic.ctxt[i] = NULL;
766 		}
767 		goto alloc_fail;
768 	}
769 
770 	if (dd->vnic.num_ctxt != i) {
771 		dd->vnic.num_ctxt = i;
772 		hfi1_init_vnic_rsm(dd);
773 	}
774 
775 	dd->vnic.num_vports++;
776 	hfi1_vnic_sdma_init(vinfo);
777 alloc_fail:
778 	if (!dd->vnic.num_vports)
779 		hfi1_vnic_txreq_deinit(dd);
780 txreq_fail:
781 	mutex_unlock(&hfi1_mutex);
782 	return rc;
783 }
784 
785 static void hfi1_vnic_deinit(struct hfi1_vnic_vport_info *vinfo)
786 {
787 	struct hfi1_devdata *dd = vinfo->dd;
788 	int i;
789 
790 	mutex_lock(&hfi1_mutex);
791 	if (--dd->vnic.num_vports == 0) {
792 		for (i = 0; i < dd->vnic.num_ctxt; i++) {
793 			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
794 			dd->vnic.ctxt[i] = NULL;
795 		}
796 		hfi1_deinit_vnic_rsm(dd);
797 		dd->vnic.num_ctxt = 0;
798 		hfi1_vnic_txreq_deinit(dd);
799 	}
800 	mutex_unlock(&hfi1_mutex);
801 }
802 
803 static void hfi1_vnic_set_vesw_id(struct net_device *netdev, int id)
804 {
805 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
806 	bool reopen = false;
807 
808 	/*
809 	 * If vesw_id is being changed, and if the vnic port is up,
810 	 * reset the vnic port to ensure new vesw_id gets picked up
811 	 */
812 	if (id != vinfo->vesw_id) {
813 		mutex_lock(&vinfo->lock);
814 		if (test_bit(HFI1_VNIC_UP, &vinfo->flags)) {
815 			hfi1_vnic_down(vinfo);
816 			reopen = true;
817 		}
818 
819 		vinfo->vesw_id = id;
820 		if (reopen)
821 			hfi1_vnic_up(vinfo);
822 
823 		mutex_unlock(&vinfo->lock);
824 	}
825 }
826 
827 /* netdev ops */
828 static const struct net_device_ops hfi1_netdev_ops = {
829 	.ndo_open = hfi1_netdev_open,
830 	.ndo_stop = hfi1_netdev_close,
831 	.ndo_start_xmit = hfi1_netdev_start_xmit,
832 	.ndo_select_queue = hfi1_vnic_select_queue,
833 	.ndo_get_stats64 = hfi1_vnic_get_stats64,
834 };
835 
836 static void hfi1_vnic_free_rn(struct net_device *netdev)
837 {
838 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
839 
840 	hfi1_vnic_deinit(vinfo);
841 	mutex_destroy(&vinfo->lock);
842 	free_netdev(netdev);
843 }
844 
845 struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device,
846 				      u8 port_num,
847 				      enum rdma_netdev_t type,
848 				      const char *name,
849 				      unsigned char name_assign_type,
850 				      void (*setup)(struct net_device *))
851 {
852 	struct hfi1_devdata *dd = dd_from_ibdev(device);
853 	struct hfi1_vnic_vport_info *vinfo;
854 	struct net_device *netdev;
855 	struct rdma_netdev *rn;
856 	int i, size, rc;
857 
858 	if (!port_num || (port_num > dd->num_pports))
859 		return ERR_PTR(-EINVAL);
860 
861 	if (type != RDMA_NETDEV_OPA_VNIC)
862 		return ERR_PTR(-EOPNOTSUPP);
863 
864 	size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo);
865 	netdev = alloc_netdev_mqs(size, name, name_assign_type, setup,
866 				  dd->chip_sdma_engines, HFI1_NUM_VNIC_CTXT);
867 	if (!netdev)
868 		return ERR_PTR(-ENOMEM);
869 
870 	rn = netdev_priv(netdev);
871 	vinfo = opa_vnic_dev_priv(netdev);
872 	vinfo->dd = dd;
873 	vinfo->num_tx_q = dd->chip_sdma_engines;
874 	vinfo->num_rx_q = HFI1_NUM_VNIC_CTXT;
875 	vinfo->netdev = netdev;
876 	rn->free_rdma_netdev = hfi1_vnic_free_rn;
877 	rn->set_id = hfi1_vnic_set_vesw_id;
878 
879 	netdev->features = NETIF_F_HIGHDMA | NETIF_F_SG;
880 	netdev->hw_features = netdev->features;
881 	netdev->vlan_features = netdev->features;
882 	netdev->watchdog_timeo = msecs_to_jiffies(HFI_TX_TIMEOUT_MS);
883 	netdev->netdev_ops = &hfi1_netdev_ops;
884 	mutex_init(&vinfo->lock);
885 
886 	for (i = 0; i < vinfo->num_rx_q; i++) {
887 		struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i];
888 
889 		rxq->idx = i;
890 		rxq->vinfo = vinfo;
891 		rxq->netdev = netdev;
892 		netif_napi_add(netdev, &rxq->napi, hfi1_vnic_napi, 64);
893 	}
894 
895 	rc = hfi1_vnic_init(vinfo);
896 	if (rc)
897 		goto init_fail;
898 
899 	return netdev;
900 init_fail:
901 	mutex_destroy(&vinfo->lock);
902 	free_netdev(netdev);
903 	return ERR_PTR(rc);
904 }
905