xref: /openbmc/linux/drivers/infiniband/hw/hfi1/vnic_main.c (revision 023e41632e065d49bcbe31b3c4b336217f96a271)
1 /*
2  * Copyright(c) 2017 - 2018 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47 
48 /*
49  * This file contains HFI1 support for VNIC functionality
50  */
51 
52 #include <linux/io.h>
53 #include <linux/if_vlan.h>
54 
55 #include "vnic.h"
56 
/* Tx watchdog timeout in ms; converted to jiffies for netdev->watchdog_timeo */
#define HFI_TX_TIMEOUT_MS 1000

/*
 * Bound on the per-queue list of received skbs awaiting NAPI processing;
 * hfi1_vnic_bypass_rcv() drops packets once the list is longer than this.
 */
#define HFI1_VNIC_RCV_Q_SIZE   1024

/* Bit number in vinfo->flags that is set while the vport is up */
#define HFI1_VNIC_UP 0

/*
 * Taken around the rx_nohandler increment that hfi1_vnic_bypass_rcv()
 * performs when a packet carries an unknown vesw id.
 */
static DEFINE_SPINLOCK(vport_cntr_lock);
64 
65 static int setup_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt)
66 {
67 	unsigned int rcvctrl_ops = 0;
68 	int ret;
69 
70 	uctxt->do_interrupt = &handle_receive_interrupt;
71 
72 	/* Now allocate the RcvHdr queue and eager buffers. */
73 	ret = hfi1_create_rcvhdrq(dd, uctxt);
74 	if (ret)
75 		goto done;
76 
77 	ret = hfi1_setup_eagerbufs(uctxt);
78 	if (ret)
79 		goto done;
80 
81 	if (uctxt->rcvhdrtail_kvaddr)
82 		clear_rcvhdrtail(uctxt);
83 
84 	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
85 	rcvctrl_ops |= HFI1_RCVCTRL_INTRAVAIL_ENB;
86 
87 	if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
88 		rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
89 	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
90 		rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
91 	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
92 		rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
93 	if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
94 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
95 
96 	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt);
97 done:
98 	return ret;
99 }
100 
101 static int allocate_vnic_ctxt(struct hfi1_devdata *dd,
102 			      struct hfi1_ctxtdata **vnic_ctxt)
103 {
104 	struct hfi1_ctxtdata *uctxt;
105 	int ret;
106 
107 	if (dd->flags & HFI1_FROZEN)
108 		return -EIO;
109 
110 	ret = hfi1_create_ctxtdata(dd->pport, dd->node, &uctxt);
111 	if (ret < 0) {
112 		dd_dev_err(dd, "Unable to create ctxtdata, failing open\n");
113 		return -ENOMEM;
114 	}
115 
116 	uctxt->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
117 			HFI1_CAP_KGET(NODROP_RHQ_FULL) |
118 			HFI1_CAP_KGET(NODROP_EGR_FULL) |
119 			HFI1_CAP_KGET(DMA_RTAIL);
120 	uctxt->seq_cnt = 1;
121 	uctxt->is_vnic = true;
122 
123 	msix_request_rcd_irq(uctxt);
124 
125 	hfi1_stats.sps_ctxts++;
126 	dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt);
127 	*vnic_ctxt = uctxt;
128 
129 	return 0;
130 }
131 
132 static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
133 				 struct hfi1_ctxtdata *uctxt)
134 {
135 	dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt);
136 	flush_wc();
137 
138 	/*
139 	 * Disable receive context and interrupt available, reset all
140 	 * RcvCtxtCtrl bits to default values.
141 	 */
142 	hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
143 		     HFI1_RCVCTRL_TIDFLOW_DIS |
144 		     HFI1_RCVCTRL_INTRAVAIL_DIS |
145 		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
146 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
147 		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
148 
149 	/* msix_intr will always be > 0, only clean up if this is true */
150 	if (uctxt->msix_intr)
151 		msix_free_irq(dd, uctxt->msix_intr);
152 
153 	uctxt->event_flags = 0;
154 
155 	hfi1_clear_tids(uctxt);
156 	hfi1_clear_ctxt_pkey(dd, uctxt);
157 
158 	hfi1_stats.sps_ctxts--;
159 
160 	hfi1_free_ctxt(uctxt);
161 }
162 
163 void hfi1_vnic_setup(struct hfi1_devdata *dd)
164 {
165 	idr_init(&dd->vnic.vesw_idr);
166 }
167 
168 void hfi1_vnic_cleanup(struct hfi1_devdata *dd)
169 {
170 	idr_destroy(&dd->vnic.vesw_idr);
171 }
172 
/*
 * SUM_GRP_COUNTERS - add one group of counters from qstats into stats
 *
 * Walks the u64 members of the x_grp group from 'unicast' up to and
 * including 's_1519_max' and adds each source counter to the matching
 * destination counter.  The walk treats those members as consecutive u64
 * values.
 */
#define SUM_GRP_COUNTERS(stats, qstats, x_grp) do {            \
		u64 *src64 = &qstats->x_grp.unicast;           \
		u64 *dst64 = &stats->x_grp.unicast;            \
		while (dst64 <= &stats->x_grp.s_1519_max)      \
			*dst64++ += *src64++;                  \
	} while (0)
181 
182 /* hfi1_vnic_update_stats - update statistics */
183 static void hfi1_vnic_update_stats(struct hfi1_vnic_vport_info *vinfo,
184 				   struct opa_vnic_stats *stats)
185 {
186 	struct net_device *netdev = vinfo->netdev;
187 	u8 i;
188 
189 	/* add tx counters on different queues */
190 	for (i = 0; i < vinfo->num_tx_q; i++) {
191 		struct opa_vnic_stats *qstats = &vinfo->stats[i];
192 		struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats;
193 
194 		stats->netstats.tx_fifo_errors += qnstats->tx_fifo_errors;
195 		stats->netstats.tx_carrier_errors += qnstats->tx_carrier_errors;
196 		stats->tx_drop_state += qstats->tx_drop_state;
197 		stats->tx_dlid_zero += qstats->tx_dlid_zero;
198 
199 		SUM_GRP_COUNTERS(stats, qstats, tx_grp);
200 		stats->netstats.tx_packets += qnstats->tx_packets;
201 		stats->netstats.tx_bytes += qnstats->tx_bytes;
202 	}
203 
204 	/* add rx counters on different queues */
205 	for (i = 0; i < vinfo->num_rx_q; i++) {
206 		struct opa_vnic_stats *qstats = &vinfo->stats[i];
207 		struct rtnl_link_stats64 *qnstats = &vinfo->stats[i].netstats;
208 
209 		stats->netstats.rx_fifo_errors += qnstats->rx_fifo_errors;
210 		stats->netstats.rx_nohandler += qnstats->rx_nohandler;
211 		stats->rx_drop_state += qstats->rx_drop_state;
212 		stats->rx_oversize += qstats->rx_oversize;
213 		stats->rx_runt += qstats->rx_runt;
214 
215 		SUM_GRP_COUNTERS(stats, qstats, rx_grp);
216 		stats->netstats.rx_packets += qnstats->rx_packets;
217 		stats->netstats.rx_bytes += qnstats->rx_bytes;
218 	}
219 
220 	stats->netstats.tx_errors = stats->netstats.tx_fifo_errors +
221 				    stats->netstats.tx_carrier_errors +
222 				    stats->tx_drop_state + stats->tx_dlid_zero;
223 	stats->netstats.tx_dropped = stats->netstats.tx_errors;
224 
225 	stats->netstats.rx_errors = stats->netstats.rx_fifo_errors +
226 				    stats->netstats.rx_nohandler +
227 				    stats->rx_drop_state + stats->rx_oversize +
228 				    stats->rx_runt;
229 	stats->netstats.rx_dropped = stats->netstats.rx_errors;
230 
231 	netdev->stats.tx_packets = stats->netstats.tx_packets;
232 	netdev->stats.tx_bytes = stats->netstats.tx_bytes;
233 	netdev->stats.tx_fifo_errors = stats->netstats.tx_fifo_errors;
234 	netdev->stats.tx_carrier_errors = stats->netstats.tx_carrier_errors;
235 	netdev->stats.tx_errors = stats->netstats.tx_errors;
236 	netdev->stats.tx_dropped = stats->netstats.tx_dropped;
237 
238 	netdev->stats.rx_packets = stats->netstats.rx_packets;
239 	netdev->stats.rx_bytes = stats->netstats.rx_bytes;
240 	netdev->stats.rx_fifo_errors = stats->netstats.rx_fifo_errors;
241 	netdev->stats.multicast = stats->rx_grp.mcastbcast;
242 	netdev->stats.rx_length_errors = stats->rx_oversize + stats->rx_runt;
243 	netdev->stats.rx_errors = stats->netstats.rx_errors;
244 	netdev->stats.rx_dropped = stats->netstats.rx_dropped;
245 }
246 
247 /* update_len_counters - update pkt's len histogram counters */
248 static inline void update_len_counters(struct opa_vnic_grp_stats *grp,
249 				       int len)
250 {
251 	/* account for 4 byte FCS */
252 	if (len >= 1515)
253 		grp->s_1519_max++;
254 	else if (len >= 1020)
255 		grp->s_1024_1518++;
256 	else if (len >= 508)
257 		grp->s_512_1023++;
258 	else if (len >= 252)
259 		grp->s_256_511++;
260 	else if (len >= 124)
261 		grp->s_128_255++;
262 	else if (len >= 61)
263 		grp->s_65_127++;
264 	else
265 		grp->s_64++;
266 }
267 
268 /* hfi1_vnic_update_tx_counters - update transmit counters */
269 static void hfi1_vnic_update_tx_counters(struct hfi1_vnic_vport_info *vinfo,
270 					 u8 q_idx, struct sk_buff *skb, int err)
271 {
272 	struct ethhdr *mac_hdr = (struct ethhdr *)skb_mac_header(skb);
273 	struct opa_vnic_stats *stats = &vinfo->stats[q_idx];
274 	struct opa_vnic_grp_stats *tx_grp = &stats->tx_grp;
275 	u16 vlan_tci;
276 
277 	stats->netstats.tx_packets++;
278 	stats->netstats.tx_bytes += skb->len + ETH_FCS_LEN;
279 
280 	update_len_counters(tx_grp, skb->len);
281 
282 	/* rest of the counts are for good packets only */
283 	if (unlikely(err))
284 		return;
285 
286 	if (is_multicast_ether_addr(mac_hdr->h_dest))
287 		tx_grp->mcastbcast++;
288 	else
289 		tx_grp->unicast++;
290 
291 	if (!__vlan_get_tag(skb, &vlan_tci))
292 		tx_grp->vlan++;
293 	else
294 		tx_grp->untagged++;
295 }
296 
297 /* hfi1_vnic_update_rx_counters - update receive counters */
298 static void hfi1_vnic_update_rx_counters(struct hfi1_vnic_vport_info *vinfo,
299 					 u8 q_idx, struct sk_buff *skb, int err)
300 {
301 	struct ethhdr *mac_hdr = (struct ethhdr *)skb->data;
302 	struct opa_vnic_stats *stats = &vinfo->stats[q_idx];
303 	struct opa_vnic_grp_stats *rx_grp = &stats->rx_grp;
304 	u16 vlan_tci;
305 
306 	stats->netstats.rx_packets++;
307 	stats->netstats.rx_bytes += skb->len + ETH_FCS_LEN;
308 
309 	update_len_counters(rx_grp, skb->len);
310 
311 	/* rest of the counts are for good packets only */
312 	if (unlikely(err))
313 		return;
314 
315 	if (is_multicast_ether_addr(mac_hdr->h_dest))
316 		rx_grp->mcastbcast++;
317 	else
318 		rx_grp->unicast++;
319 
320 	if (!__vlan_get_tag(skb, &vlan_tci))
321 		rx_grp->vlan++;
322 	else
323 		rx_grp->untagged++;
324 }
325 
/*
 * hfi1_vnic_get_stats64 - ndo_get_stats64 handler, overloaded for opa_vnic
 * @netdev: the vnic net device
 * @stats: treated as a struct opa_vnic_stats, not a plain
 *         rtnl_link_stats64
 *
 * NOTE(review): the cast relies on callers handing in a buffer that is
 * really a struct opa_vnic_stats - confirm against the callers.
 */
static void hfi1_vnic_get_stats64(struct net_device *netdev,
				  struct rtnl_link_stats64 *stats)
{
	hfi1_vnic_update_stats(opa_vnic_dev_priv(netdev),
			       (struct opa_vnic_stats *)stats);
}
335 
336 static u64 create_bypass_pbc(u32 vl, u32 dw_len)
337 {
338 	u64 pbc;
339 
340 	pbc = ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
341 		| PBC_INSERT_BYPASS_ICRC | PBC_CREDIT_RETURN
342 		| PBC_PACKET_BYPASS
343 		| ((vl & PBC_VL_MASK) << PBC_VL_SHIFT)
344 		| (dw_len & PBC_LENGTH_DWS_MASK) << PBC_LENGTH_DWS_SHIFT;
345 
346 	return pbc;
347 }
348 
349 /* hfi1_vnic_maybe_stop_tx - stop tx queue if required */
350 static void hfi1_vnic_maybe_stop_tx(struct hfi1_vnic_vport_info *vinfo,
351 				    u8 q_idx)
352 {
353 	netif_stop_subqueue(vinfo->netdev, q_idx);
354 	if (!hfi1_vnic_sdma_write_avail(vinfo, q_idx))
355 		return;
356 
357 	netif_start_subqueue(vinfo->netdev, q_idx);
358 }
359 
/*
 * hfi1_netdev_start_xmit - ndo_start_xmit handler for a vnic port
 * @skb: packet to send; skb->data starts with a struct opa_vnic_skb_mdata
 *       followed by the packet itself
 * @netdev: the vnic net device
 *
 * Strips the metadata, computes tail padding and the PBC, and hands the
 * packet to dd->process_vnic_dma_send().  Tx counters are updated for
 * every packet that is not returned as busy.
 *
 * Return: NETDEV_TX_BUSY when the send routine reports -EBUSY, otherwise
 * NETDEV_TX_OK (including when the packet was dropped).
 */
static netdev_tx_t hfi1_netdev_start_xmit(struct sk_buff *skb,
					  struct net_device *netdev)
{
	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
	u8 pad_len, q_idx = skb->queue_mapping;
	struct hfi1_devdata *dd = vinfo->dd;
	struct opa_vnic_skb_mdata *mdata;
	u32 pkt_len, total_len;
	/* early drops below are accounted as failed transmits */
	int err = -EINVAL;
	u64 pbc;

	v_dbg("xmit: queue %d skb len %d\n", q_idx, skb->len);
	if (unlikely(!netif_oper_up(netdev))) {
		vinfo->stats[q_idx].tx_drop_state++;
		goto tx_finish;
	}

	/* take out meta data */
	mdata = (struct opa_vnic_skb_mdata *)skb->data;
	/* mdata keeps pointing at the pulled bytes and is still read below */
	skb_pull(skb, sizeof(*mdata));
	if (unlikely(mdata->flags & OPA_VNIC_SKB_MDATA_ENCAP_ERR)) {
		vinfo->stats[q_idx].tx_dlid_zero++;
		goto tx_finish;
	}

	/* add tail padding (for 8 bytes size alignment) and icrc */
	pad_len = -(skb->len + OPA_VNIC_ICRC_TAIL_LEN) & 0x7;
	pad_len += OPA_VNIC_ICRC_TAIL_LEN;

	/*
	 * pkt_len is how much data we have to write, includes header and data.
	 * total_len is length of the packet in Dwords plus the PBC should not
	 * include the CRC.
	 */
	pkt_len = (skb->len + pad_len) >> 2;
	total_len = pkt_len + 2; /* PBC + packet */

	pbc = create_bypass_pbc(mdata->vl, total_len);

	/*
	 * Take an extra reference for the duration of the send; it is
	 * dropped again by the dev_kfree_skb_any() on the busy path or at
	 * tx_finish.
	 */
	skb_get(skb);
	v_dbg("pbc 0x%016llX len %d pad_len %d\n", pbc, skb->len, pad_len);
	err = dd->process_vnic_dma_send(dd, q_idx, vinfo, skb, pbc, pad_len);
	if (unlikely(err)) {
		/* -EBUSY is not an error: the packet is handed back below */
		if (err == -ENOMEM)
			vinfo->stats[q_idx].netstats.tx_fifo_errors++;
		else if (err != -EBUSY)
			vinfo->stats[q_idx].netstats.tx_carrier_errors++;
	}
	/* remove the header before updating tx counters */
	skb_pull(skb, OPA_VNIC_HDR_LEN);

	/*
	 * NOTE(review): on the busy path the skb is returned with both the
	 * metadata and the header already pulled; presumably the layer that
	 * calls this handler re-adds them before retrying - confirm.
	 */
	if (unlikely(err == -EBUSY)) {
		hfi1_vnic_maybe_stop_tx(vinfo, q_idx);
		dev_kfree_skb_any(skb);
		return NETDEV_TX_BUSY;
	}

tx_finish:
	/* update tx counters */
	hfi1_vnic_update_tx_counters(vinfo, q_idx, skb, err);
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}
423 
424 static u16 hfi1_vnic_select_queue(struct net_device *netdev,
425 				  struct sk_buff *skb,
426 				  struct net_device *sb_dev,
427 				  select_queue_fallback_t fallback)
428 {
429 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
430 	struct opa_vnic_skb_mdata *mdata;
431 	struct sdma_engine *sde;
432 
433 	mdata = (struct opa_vnic_skb_mdata *)skb->data;
434 	sde = sdma_select_engine_vl(vinfo->dd, mdata->entropy, mdata->vl);
435 	return sde->this_idx;
436 }
437 
438 /* hfi1_vnic_decap_skb - strip OPA header from the skb (ethernet) packet */
439 static inline int hfi1_vnic_decap_skb(struct hfi1_vnic_rx_queue *rxq,
440 				      struct sk_buff *skb)
441 {
442 	struct hfi1_vnic_vport_info *vinfo = rxq->vinfo;
443 	int max_len = vinfo->netdev->mtu + VLAN_ETH_HLEN;
444 	int rc = -EFAULT;
445 
446 	skb_pull(skb, OPA_VNIC_HDR_LEN);
447 
448 	/* Validate Packet length */
449 	if (unlikely(skb->len > max_len))
450 		vinfo->stats[rxq->idx].rx_oversize++;
451 	else if (unlikely(skb->len < ETH_ZLEN))
452 		vinfo->stats[rxq->idx].rx_runt++;
453 	else
454 		rc = 0;
455 	return rc;
456 }
457 
458 static inline struct sk_buff *hfi1_vnic_get_skb(struct hfi1_vnic_rx_queue *rxq)
459 {
460 	unsigned char *pad_info;
461 	struct sk_buff *skb;
462 
463 	skb = skb_dequeue(&rxq->skbq);
464 	if (unlikely(!skb))
465 		return NULL;
466 
467 	/* remove tail padding and icrc */
468 	pad_info = skb->data + skb->len - 1;
469 	skb_trim(skb, (skb->len - OPA_VNIC_ICRC_TAIL_LEN -
470 		       ((*pad_info) & 0x7)));
471 
472 	return skb;
473 }
474 
475 /* hfi1_vnic_handle_rx - handle skb receive */
476 static void hfi1_vnic_handle_rx(struct hfi1_vnic_rx_queue *rxq,
477 				int *work_done, int work_to_do)
478 {
479 	struct hfi1_vnic_vport_info *vinfo = rxq->vinfo;
480 	struct sk_buff *skb;
481 	int rc;
482 
483 	while (1) {
484 		if (*work_done >= work_to_do)
485 			break;
486 
487 		skb = hfi1_vnic_get_skb(rxq);
488 		if (unlikely(!skb))
489 			break;
490 
491 		rc = hfi1_vnic_decap_skb(rxq, skb);
492 		/* update rx counters */
493 		hfi1_vnic_update_rx_counters(vinfo, rxq->idx, skb, rc);
494 		if (unlikely(rc)) {
495 			dev_kfree_skb_any(skb);
496 			continue;
497 		}
498 
499 		skb_checksum_none_assert(skb);
500 		skb->protocol = eth_type_trans(skb, rxq->netdev);
501 
502 		napi_gro_receive(&rxq->napi, skb);
503 		(*work_done)++;
504 	}
505 }
506 
507 /* hfi1_vnic_napi - napi receive polling callback function */
508 static int hfi1_vnic_napi(struct napi_struct *napi, int budget)
509 {
510 	struct hfi1_vnic_rx_queue *rxq = container_of(napi,
511 					      struct hfi1_vnic_rx_queue, napi);
512 	struct hfi1_vnic_vport_info *vinfo = rxq->vinfo;
513 	int work_done = 0;
514 
515 	v_dbg("napi %d budget %d\n", rxq->idx, budget);
516 	hfi1_vnic_handle_rx(rxq, &work_done, budget);
517 
518 	v_dbg("napi %d work_done %d\n", rxq->idx, work_done);
519 	if (work_done < budget)
520 		napi_complete(napi);
521 
522 	return work_done;
523 }
524 
525 void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet)
526 {
527 	struct hfi1_devdata *dd = packet->rcd->dd;
528 	struct hfi1_vnic_vport_info *vinfo = NULL;
529 	struct hfi1_vnic_rx_queue *rxq;
530 	struct sk_buff *skb;
531 	int l4_type, vesw_id = -1;
532 	u8 q_idx;
533 
534 	l4_type = hfi1_16B_get_l4(packet->ebuf);
535 	if (likely(l4_type == OPA_16B_L4_ETHR)) {
536 		vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf);
537 		vinfo = idr_find(&dd->vnic.vesw_idr, vesw_id);
538 
539 		/*
540 		 * In case of invalid vesw id, count the error on
541 		 * the first available vport.
542 		 */
543 		if (unlikely(!vinfo)) {
544 			struct hfi1_vnic_vport_info *vinfo_tmp;
545 			int id_tmp = 0;
546 
547 			vinfo_tmp =  idr_get_next(&dd->vnic.vesw_idr, &id_tmp);
548 			if (vinfo_tmp) {
549 				spin_lock(&vport_cntr_lock);
550 				vinfo_tmp->stats[0].netstats.rx_nohandler++;
551 				spin_unlock(&vport_cntr_lock);
552 			}
553 		}
554 	}
555 
556 	if (unlikely(!vinfo)) {
557 		dd_dev_warn(dd, "vnic rcv err: l4 %d vesw id %d ctx %d\n",
558 			    l4_type, vesw_id, packet->rcd->ctxt);
559 		return;
560 	}
561 
562 	q_idx = packet->rcd->vnic_q_idx;
563 	rxq = &vinfo->rxq[q_idx];
564 	if (unlikely(!netif_oper_up(vinfo->netdev))) {
565 		vinfo->stats[q_idx].rx_drop_state++;
566 		skb_queue_purge(&rxq->skbq);
567 		return;
568 	}
569 
570 	if (unlikely(skb_queue_len(&rxq->skbq) > HFI1_VNIC_RCV_Q_SIZE)) {
571 		vinfo->stats[q_idx].netstats.rx_fifo_errors++;
572 		return;
573 	}
574 
575 	skb = netdev_alloc_skb(vinfo->netdev, packet->tlen);
576 	if (unlikely(!skb)) {
577 		vinfo->stats[q_idx].netstats.rx_fifo_errors++;
578 		return;
579 	}
580 
581 	memcpy(skb->data, packet->ebuf, packet->tlen);
582 	skb_put(skb, packet->tlen);
583 	skb_queue_tail(&rxq->skbq, skb);
584 
585 	if (napi_schedule_prep(&rxq->napi)) {
586 		v_dbg("napi %d scheduling\n", q_idx);
587 		__napi_schedule(&rxq->napi);
588 	}
589 }
590 
591 static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo)
592 {
593 	struct hfi1_devdata *dd = vinfo->dd;
594 	struct net_device *netdev = vinfo->netdev;
595 	int i, rc;
596 
597 	/* ensure virtual eth switch id is valid */
598 	if (!vinfo->vesw_id)
599 		return -EINVAL;
600 
601 	rc = idr_alloc(&dd->vnic.vesw_idr, vinfo, vinfo->vesw_id,
602 		       vinfo->vesw_id + 1, GFP_NOWAIT);
603 	if (rc < 0)
604 		return rc;
605 
606 	for (i = 0; i < vinfo->num_rx_q; i++) {
607 		struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i];
608 
609 		skb_queue_head_init(&rxq->skbq);
610 		napi_enable(&rxq->napi);
611 	}
612 
613 	netif_carrier_on(netdev);
614 	netif_tx_start_all_queues(netdev);
615 	set_bit(HFI1_VNIC_UP, &vinfo->flags);
616 
617 	return 0;
618 }
619 
620 static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo)
621 {
622 	struct hfi1_devdata *dd = vinfo->dd;
623 	u8 i;
624 
625 	clear_bit(HFI1_VNIC_UP, &vinfo->flags);
626 	netif_carrier_off(vinfo->netdev);
627 	netif_tx_disable(vinfo->netdev);
628 	idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id);
629 
630 	/* ensure irqs see the change */
631 	msix_vnic_synchronize_irq(dd);
632 
633 	/* remove unread skbs */
634 	for (i = 0; i < vinfo->num_rx_q; i++) {
635 		struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i];
636 
637 		napi_disable(&rxq->napi);
638 		skb_queue_purge(&rxq->skbq);
639 	}
640 }
641 
642 static int hfi1_netdev_open(struct net_device *netdev)
643 {
644 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
645 	int rc;
646 
647 	mutex_lock(&vinfo->lock);
648 	rc = hfi1_vnic_up(vinfo);
649 	mutex_unlock(&vinfo->lock);
650 	return rc;
651 }
652 
653 static int hfi1_netdev_close(struct net_device *netdev)
654 {
655 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
656 
657 	mutex_lock(&vinfo->lock);
658 	if (test_bit(HFI1_VNIC_UP, &vinfo->flags))
659 		hfi1_vnic_down(vinfo);
660 	mutex_unlock(&vinfo->lock);
661 	return 0;
662 }
663 
664 static int hfi1_vnic_allot_ctxt(struct hfi1_devdata *dd,
665 				struct hfi1_ctxtdata **vnic_ctxt)
666 {
667 	int rc;
668 
669 	rc = allocate_vnic_ctxt(dd, vnic_ctxt);
670 	if (rc) {
671 		dd_dev_err(dd, "vnic ctxt alloc failed %d\n", rc);
672 		return rc;
673 	}
674 
675 	rc = setup_vnic_ctxt(dd, *vnic_ctxt);
676 	if (rc) {
677 		dd_dev_err(dd, "vnic ctxt setup failed %d\n", rc);
678 		deallocate_vnic_ctxt(dd, *vnic_ctxt);
679 		*vnic_ctxt = NULL;
680 	}
681 
682 	return rc;
683 }
684 
/*
 * hfi1_vnic_init - set up the device-wide vnic resources for a new vport
 * @vinfo: vport being added
 *
 * Runs with hfi1_mutex held.  For the first vport on the device the txreq
 * resources are initialized.  Receive contexts are then allocated until
 * the device has vinfo->num_rx_q of them; contexts that already exist are
 * reused.  On success the vport count is incremented and the vport's SDMA
 * state is initialized.
 *
 * Return: 0 on success, otherwise the error from hfi1_vnic_txreq_init()
 * or hfi1_vnic_allot_ctxt().
 */
static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo)
{
	struct hfi1_devdata *dd = vinfo->dd;
	int i, rc = 0;

	mutex_lock(&hfi1_mutex);
	/* first vport on this device */
	if (!dd->vnic.num_vports) {
		rc = hfi1_vnic_txreq_init(dd);
		if (rc)
			goto txreq_fail;
	}

	/* only the contexts beyond those already allocated are created */
	for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) {
		rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]);
		if (rc)
			break;
		hfi1_rcd_get(dd->vnic.ctxt[i]);
		dd->vnic.ctxt[i]->vnic_q_idx = i;
	}

	if (i < vinfo->num_rx_q) {
		/*
		 * If required amount of contexts is not
		 * allocated successfully then remaining contexts
		 * are released.
		 */
		while (i-- > dd->vnic.num_ctxt) {
			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
			hfi1_rcd_put(dd->vnic.ctxt[i]);
			dd->vnic.ctxt[i] = NULL;
		}
		goto alloc_fail;
	}

	/* the context count changed, so the vnic RSM setup is redone */
	if (dd->vnic.num_ctxt != i) {
		dd->vnic.num_ctxt = i;
		hfi1_init_vnic_rsm(dd);
	}

	dd->vnic.num_vports++;
	hfi1_vnic_sdma_init(vinfo);
	/*
	 * The success path falls through here as well; num_vports is then
	 * non-zero, so the txreq resources are kept.
	 */
alloc_fail:
	if (!dd->vnic.num_vports)
		hfi1_vnic_txreq_deinit(dd);
txreq_fail:
	mutex_unlock(&hfi1_mutex);
	return rc;
}
733 
734 static void hfi1_vnic_deinit(struct hfi1_vnic_vport_info *vinfo)
735 {
736 	struct hfi1_devdata *dd = vinfo->dd;
737 	int i;
738 
739 	mutex_lock(&hfi1_mutex);
740 	if (--dd->vnic.num_vports == 0) {
741 		for (i = 0; i < dd->vnic.num_ctxt; i++) {
742 			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
743 			hfi1_rcd_put(dd->vnic.ctxt[i]);
744 			dd->vnic.ctxt[i] = NULL;
745 		}
746 		hfi1_deinit_vnic_rsm(dd);
747 		dd->vnic.num_ctxt = 0;
748 		hfi1_vnic_txreq_deinit(dd);
749 	}
750 	mutex_unlock(&hfi1_mutex);
751 }
752 
753 static void hfi1_vnic_set_vesw_id(struct net_device *netdev, int id)
754 {
755 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
756 	bool reopen = false;
757 
758 	/*
759 	 * If vesw_id is being changed, and if the vnic port is up,
760 	 * reset the vnic port to ensure new vesw_id gets picked up
761 	 */
762 	if (id != vinfo->vesw_id) {
763 		mutex_lock(&vinfo->lock);
764 		if (test_bit(HFI1_VNIC_UP, &vinfo->flags)) {
765 			hfi1_vnic_down(vinfo);
766 			reopen = true;
767 		}
768 
769 		vinfo->vesw_id = id;
770 		if (reopen)
771 			hfi1_vnic_up(vinfo);
772 
773 		mutex_unlock(&vinfo->lock);
774 	}
775 }
776 
/*
 * netdev ops
 *
 * Installed as netdev->netdev_ops by hfi1_vnic_alloc_rn().
 */
static const struct net_device_ops hfi1_netdev_ops = {
	.ndo_open = hfi1_netdev_open,
	.ndo_stop = hfi1_netdev_close,
	.ndo_start_xmit = hfi1_netdev_start_xmit,
	.ndo_select_queue = hfi1_vnic_select_queue,
	.ndo_get_stats64 = hfi1_vnic_get_stats64,
};
785 
786 static void hfi1_vnic_free_rn(struct net_device *netdev)
787 {
788 	struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
789 
790 	hfi1_vnic_deinit(vinfo);
791 	mutex_destroy(&vinfo->lock);
792 	free_netdev(netdev);
793 }
794 
795 struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device,
796 				      u8 port_num,
797 				      enum rdma_netdev_t type,
798 				      const char *name,
799 				      unsigned char name_assign_type,
800 				      void (*setup)(struct net_device *))
801 {
802 	struct hfi1_devdata *dd = dd_from_ibdev(device);
803 	struct hfi1_vnic_vport_info *vinfo;
804 	struct net_device *netdev;
805 	struct rdma_netdev *rn;
806 	int i, size, rc;
807 
808 	if (!dd->num_vnic_contexts)
809 		return ERR_PTR(-ENOMEM);
810 
811 	if (!port_num || (port_num > dd->num_pports))
812 		return ERR_PTR(-EINVAL);
813 
814 	if (type != RDMA_NETDEV_OPA_VNIC)
815 		return ERR_PTR(-EOPNOTSUPP);
816 
817 	size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo);
818 	netdev = alloc_netdev_mqs(size, name, name_assign_type, setup,
819 				  dd->num_sdma, dd->num_vnic_contexts);
820 	if (!netdev)
821 		return ERR_PTR(-ENOMEM);
822 
823 	rn = netdev_priv(netdev);
824 	vinfo = opa_vnic_dev_priv(netdev);
825 	vinfo->dd = dd;
826 	vinfo->num_tx_q = dd->num_sdma;
827 	vinfo->num_rx_q = dd->num_vnic_contexts;
828 	vinfo->netdev = netdev;
829 	rn->free_rdma_netdev = hfi1_vnic_free_rn;
830 	rn->set_id = hfi1_vnic_set_vesw_id;
831 
832 	netdev->features = NETIF_F_HIGHDMA | NETIF_F_SG;
833 	netdev->hw_features = netdev->features;
834 	netdev->vlan_features = netdev->features;
835 	netdev->watchdog_timeo = msecs_to_jiffies(HFI_TX_TIMEOUT_MS);
836 	netdev->netdev_ops = &hfi1_netdev_ops;
837 	mutex_init(&vinfo->lock);
838 
839 	for (i = 0; i < vinfo->num_rx_q; i++) {
840 		struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i];
841 
842 		rxq->idx = i;
843 		rxq->vinfo = vinfo;
844 		rxq->netdev = netdev;
845 		netif_napi_add(netdev, &rxq->napi, hfi1_vnic_napi, 64);
846 	}
847 
848 	rc = hfi1_vnic_init(vinfo);
849 	if (rc)
850 		goto init_fail;
851 
852 	return netdev;
853 init_fail:
854 	mutex_destroy(&vinfo->lock);
855 	free_netdev(netdev);
856 	return ERR_PTR(rc);
857 }
858