1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 /* Copyright (C) 2015-2019 Netronome Systems, Inc. */
3 
4 #include <linux/bpf_trace.h>
5 #include <linux/netdevice.h>
6 #include <linux/overflow.h>
7 #include <linux/sizes.h>
8 #include <linux/bitfield.h>
9 
10 #include "../nfp_app.h"
11 #include "../nfp_net.h"
12 #include "../nfp_net_dp.h"
13 #include "../crypto/crypto.h"
14 #include "../crypto/fw.h"
15 #include "nfdk.h"
16 
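/* TX ring flow control: the queue is stopped once only about
 * NFDK_TX_DESC_STOP_CNT free descriptors remain, and is only woken again
 * once roughly twice that many are free.  The gap between the two
 * thresholds provides hysteresis so the queue does not bounce between the
 * stopped and running states on every completion.
 */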
17 static int nfp_nfdk_tx_ring_should_wake(struct nfp_net_tx_ring *tx_ring)
18 {
19 	return !nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT * 2);
20 }
21 
22 static int nfp_nfdk_tx_ring_should_stop(struct nfp_net_tx_ring *tx_ring)
23 {
24 	return nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT);
25 }
26 
27 static void nfp_nfdk_tx_ring_stop(struct netdev_queue *nd_q,
28 				  struct nfp_net_tx_ring *tx_ring)
29 {
30 	netif_tx_stop_queue(nd_q);
31 
32 	/* We can race with the TX completion out of NAPI so recheck */
33 	smp_mb();
34 	if (unlikely(nfp_nfdk_tx_ring_should_wake(tx_ring)))
35 		netif_tx_start_queue(nd_q);
36 }
37 
38 static __le64
39 nfp_nfdk_tx_tso(struct nfp_net_r_vector *r_vec, struct nfp_nfdk_tx_buf *txbuf,
40 		struct sk_buff *skb)
41 {
42 	u32 segs, hdrlen, l3_offset, l4_offset;
43 	struct nfp_nfdk_tx_desc txd;
44 	u16 mss;
45 
46 	if (!skb->encapsulation) {
47 		l3_offset = skb_network_offset(skb);
48 		l4_offset = skb_transport_offset(skb);
49 		hdrlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
50 	} else {
51 		l3_offset = skb_inner_network_offset(skb);
52 		l4_offset = skb_inner_transport_offset(skb);
53 		hdrlen = skb_inner_transport_header(skb) - skb->data +
54 			inner_tcp_hdrlen(skb);
55 	}
56 
57 	segs = skb_shinfo(skb)->gso_segs;
58 	mss = skb_shinfo(skb)->gso_size & NFDK_DESC_TX_MSS_MASK;
59 
	/* Note: TSO of a packet with metadata prepended to the skb is not
	 * supported yet, in which case l3/l4_offset and lso_hdrlen would
	 * need to be adjusted here.
63 	 * Concern:
64 	 * The driver doesn't have md_bytes easily available at this point.
65 	 * The PCI.IN PD ME won't have md_bytes bytes to add to lso_hdrlen,
66 	 * so it needs the full length there.  The app MEs might prefer
67 	 * l3_offset and l4_offset relative to the start of packet data,
68 	 * but could probably cope with it being relative to the CTM buf
69 	 * data offset.
70 	 */
71 	txd.l3_offset = l3_offset;
72 	txd.l4_offset = l4_offset;
73 	txd.lso_meta_res = 0;
74 	txd.mss = cpu_to_le16(mss);
75 	txd.lso_hdrlen = hdrlen;
76 	txd.lso_totsegs = segs;
77 
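	/* Each of the "segs" segments carries its own copy of the headers on
	 * the wire, so the byte count charged to the ring is the skb length
	 * plus one extra header copy per additional segment (e.g. 7 segments
	 * with 66 byte headers add 6 * 66 bytes on top of skb->len).
	 */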
78 	txbuf->pkt_cnt = segs;
79 	txbuf->real_len = skb->len + hdrlen * (txbuf->pkt_cnt - 1);
80 
81 	u64_stats_update_begin(&r_vec->tx_sync);
82 	r_vec->tx_lso++;
83 	u64_stats_update_end(&r_vec->tx_sync);
84 
85 	return txd.raw;
86 }
87 
static u64
89 nfp_nfdk_tx_csum(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
90 		 unsigned int pkt_cnt, struct sk_buff *skb, u64 flags)
91 {
92 	struct ipv6hdr *ipv6h;
93 	struct iphdr *iph;
94 
95 	if (!(dp->ctrl & NFP_NET_CFG_CTRL_TXCSUM))
96 		return flags;
97 
98 	if (skb->ip_summed != CHECKSUM_PARTIAL)
99 		return flags;
100 
101 	flags |= NFDK_DESC_TX_L4_CSUM;
102 
103 	iph = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb);
104 	ipv6h = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb);
105 
106 	/* L3 checksum offloading flag is not required for ipv6 */
107 	if (iph->version == 4) {
108 		flags |= NFDK_DESC_TX_L3_CSUM;
109 	} else if (ipv6h->version != 6) {
110 		nn_dp_warn(dp, "partial checksum but ipv=%x!\n", iph->version);
111 		return flags;
112 	}
113 
114 	u64_stats_update_begin(&r_vec->tx_sync);
115 	if (!skb->encapsulation) {
116 		r_vec->hw_csum_tx += pkt_cnt;
117 	} else {
118 		flags |= NFDK_DESC_TX_ENCAP;
119 		r_vec->hw_csum_tx_inner += pkt_cnt;
120 	}
121 	u64_stats_update_end(&r_vec->tx_sync);
122 
123 	return flags;
124 }
125 
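/* NFDK groups TX descriptors into fixed-size blocks of
 * NFDK_TX_DESC_BLOCK_CNT entries.  All descriptors of a single packet must
 * sit inside one block, and the payload queued within one block is capped
 * at NFDK_TX_MAX_DATA_PER_BLOCK.  When the next packet would violate either
 * rule, the remainder of the current block is filled with zeroed (no-op)
 * descriptors so that the packet starts on a fresh block boundary.
 *
 * A rough example, assuming a block of 32 descriptors: with wr_p == 60 and
 * a packet needing 6 descriptors, 60 and 66 round down to different blocks,
 * so D_BLOCK_CPL(60) == 4 no-op slots are emitted to move wr_p to 64 first.
 */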
126 static int
127 nfp_nfdk_tx_maybe_close_block(struct nfp_net_tx_ring *tx_ring,
128 			      unsigned int nr_frags, struct sk_buff *skb)
129 {
130 	unsigned int n_descs, wr_p, nop_slots;
131 	const skb_frag_t *frag, *fend;
132 	struct nfp_nfdk_tx_desc *txd;
133 	unsigned int wr_idx;
134 	int err;
135 
136 recount_descs:
137 	n_descs = nfp_nfdk_headlen_to_segs(skb_headlen(skb));
138 
139 	frag = skb_shinfo(skb)->frags;
140 	fend = frag + nr_frags;
141 	for (; frag < fend; frag++)
142 		n_descs += DIV_ROUND_UP(skb_frag_size(frag),
143 					NFDK_TX_MAX_DATA_PER_DESC);
144 
145 	if (unlikely(n_descs > NFDK_TX_DESC_GATHER_MAX)) {
146 		if (skb_is_nonlinear(skb)) {
147 			err = skb_linearize(skb);
148 			if (err)
149 				return err;
150 			goto recount_descs;
151 		}
152 		return -EINVAL;
153 	}
154 
	/* Undercount by one (the metadata descriptor is not included) so that
	 * the round_down() block-boundary check below works out.
	 */
156 	n_descs += !!skb_is_gso(skb);
157 
158 	if (round_down(tx_ring->wr_p, NFDK_TX_DESC_BLOCK_CNT) !=
159 	    round_down(tx_ring->wr_p + n_descs, NFDK_TX_DESC_BLOCK_CNT))
160 		goto close_block;
161 
162 	if ((u32)tx_ring->data_pending + skb->len > NFDK_TX_MAX_DATA_PER_BLOCK)
163 		goto close_block;
164 
165 	return 0;
166 
167 close_block:
168 	wr_p = tx_ring->wr_p;
169 	nop_slots = D_BLOCK_CPL(wr_p);
170 
171 	wr_idx = D_IDX(tx_ring, wr_p);
172 	tx_ring->ktxbufs[wr_idx].skb = NULL;
173 	txd = &tx_ring->ktxds[wr_idx];
174 
175 	memset(txd, 0, array_size(nop_slots, sizeof(struct nfp_nfdk_tx_desc)));
176 
177 	tx_ring->data_pending = 0;
178 	tx_ring->wr_p += nop_slots;
179 	tx_ring->wr_ptr_add += nop_slots;
180 
181 	return 0;
182 }
183 
184 static int nfp_nfdk_prep_port_id(struct sk_buff *skb)
185 {
186 	struct metadata_dst *md_dst = skb_metadata_dst(skb);
187 	unsigned char *data;
188 
189 	if (likely(!md_dst))
190 		return 0;
191 	if (unlikely(md_dst->type != METADATA_HW_PORT_MUX))
192 		return 0;
193 
	/* Note: TSO of an skb with metadata prepended is not supported.
	 * See the comment in nfp_nfdk_tx_tso() for details.
	 */
197 	if (unlikely(md_dst && skb_is_gso(skb)))
198 		return -EOPNOTSUPP;
199 
200 	if (unlikely(skb_cow_head(skb, sizeof(md_dst->u.port_info.port_id))))
201 		return -ENOMEM;
202 
203 	data = skb_push(skb, sizeof(md_dst->u.port_info.port_id));
204 	put_unaligned_be32(md_dst->u.port_info.port_id, data);
205 
206 	return sizeof(md_dst->u.port_info.port_id);
207 }
208 
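/* When a packet carries a port mux id, the metadata prepended in front of
 * the packet data looks roughly like this:
 *
 *   [ meta header word | port id (4B) | packet ... ]
 *
 * The header word packs the total metadata length into NFDK_META_LEN and
 * the list of field type tags into NFDK_META_FIELDS, and the TX descriptor
 * is flagged with NFDK_DESC_TX_CHAIN_META to signal that a metadata chain
 * is present.
 */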
209 static int
210 nfp_nfdk_prep_tx_meta(struct nfp_app *app, struct sk_buff *skb,
211 		      struct nfp_net_r_vector *r_vec)
212 {
213 	unsigned char *data;
214 	int res, md_bytes;
215 	u32 meta_id = 0;
216 
217 	res = nfp_nfdk_prep_port_id(skb);
218 	if (unlikely(res <= 0))
219 		return res;
220 
221 	md_bytes = res;
222 	meta_id = NFP_NET_META_PORTID;
223 
224 	if (unlikely(skb_cow_head(skb, sizeof(meta_id))))
225 		return -ENOMEM;
226 
227 	md_bytes += sizeof(meta_id);
228 
229 	meta_id = FIELD_PREP(NFDK_META_LEN, md_bytes) |
230 		  FIELD_PREP(NFDK_META_FIELDS, meta_id);
231 
232 	data = skb_push(skb, sizeof(meta_id));
233 	put_unaligned_be32(meta_id, data);
234 
235 	return NFDK_DESC_TX_CHAIN_META;
236 }
237 
238 /**
239  * nfp_nfdk_tx() - Main transmit entry point
240  * @skb:    SKB to transmit
241  * @netdev: netdev structure
242  *
243  * Return: NETDEV_TX_OK on success.
244  */
245 netdev_tx_t nfp_nfdk_tx(struct sk_buff *skb, struct net_device *netdev)
246 {
247 	struct nfp_net *nn = netdev_priv(netdev);
248 	struct nfp_nfdk_tx_buf *txbuf, *etxbuf;
249 	u32 cnt, tmp_dlen, dlen_type = 0;
250 	struct nfp_net_tx_ring *tx_ring;
251 	struct nfp_net_r_vector *r_vec;
252 	const skb_frag_t *frag, *fend;
253 	struct nfp_nfdk_tx_desc *txd;
254 	unsigned int real_len, qidx;
255 	unsigned int dma_len, type;
256 	struct netdev_queue *nd_q;
257 	struct nfp_net_dp *dp;
258 	int nr_frags, wr_idx;
259 	dma_addr_t dma_addr;
260 	u64 metadata;
261 
262 	dp = &nn->dp;
263 	qidx = skb_get_queue_mapping(skb);
264 	tx_ring = &dp->tx_rings[qidx];
265 	r_vec = tx_ring->r_vec;
266 	nd_q = netdev_get_tx_queue(dp->netdev, qidx);
267 
268 	/* Don't bother counting frags, assume the worst */
269 	if (unlikely(nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT))) {
270 		nn_dp_warn(dp, "TX ring %d busy. wrp=%u rdp=%u\n",
271 			   qidx, tx_ring->wr_p, tx_ring->rd_p);
272 		netif_tx_stop_queue(nd_q);
273 		nfp_net_tx_xmit_more_flush(tx_ring);
274 		u64_stats_update_begin(&r_vec->tx_sync);
275 		r_vec->tx_busy++;
276 		u64_stats_update_end(&r_vec->tx_sync);
277 		return NETDEV_TX_BUSY;
278 	}
279 
280 	metadata = nfp_nfdk_prep_tx_meta(nn->app, skb, r_vec);
281 	if (unlikely((int)metadata < 0))
282 		goto err_flush;
283 
284 	nr_frags = skb_shinfo(skb)->nr_frags;
285 	if (nfp_nfdk_tx_maybe_close_block(tx_ring, nr_frags, skb))
286 		goto err_flush;
287 
288 	/* DMA map all */
289 	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
290 	txd = &tx_ring->ktxds[wr_idx];
291 	txbuf = &tx_ring->ktxbufs[wr_idx];
292 
293 	dma_len = skb_headlen(skb);
294 	if (skb_is_gso(skb))
295 		type = NFDK_DESC_TX_TYPE_TSO;
296 	else if (!nr_frags && dma_len < NFDK_TX_MAX_DATA_PER_HEAD)
297 		type = NFDK_DESC_TX_TYPE_SIMPLE;
298 	else
299 		type = NFDK_DESC_TX_TYPE_GATHER;
300 
301 	dma_addr = dma_map_single(dp->dev, skb->data, dma_len, DMA_TO_DEVICE);
302 	if (dma_mapping_error(dp->dev, dma_addr))
303 		goto err_warn_dma;
304 
305 	txbuf->skb = skb;
306 	txbuf++;
307 
308 	txbuf->dma_addr = dma_addr;
309 	txbuf++;
310 
311 	/* FIELD_PREP() implicitly truncates to chunk */
312 	dma_len -= 1;
313 	dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN_HEAD, dma_len) |
314 		    FIELD_PREP(NFDK_DESC_TX_TYPE_HEAD, type);
315 
316 	txd->dma_len_type = cpu_to_le16(dlen_type);
317 	nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
318 
319 	/* starts at bit 0 */
320 	BUILD_BUG_ON(!(NFDK_DESC_TX_DMA_LEN_HEAD & 1));
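	/* Because NFDK_DESC_TX_DMA_LEN_HEAD starts at bit 0, the value packed
	 * above is simply (length - 1) of the head chunk truncated to the
	 * field width, i.e. the head descriptor covers at most field-max + 1
	 * bytes and anything beyond that is left for the gather loop below.
	 */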
321 
	/* Preserve the original dlen_type so that the EOP logic below can
	 * reuse it.
	 */
325 	tmp_dlen = dlen_type & NFDK_DESC_TX_DMA_LEN_HEAD;
326 	dma_len -= tmp_dlen;
327 	dma_addr += tmp_dlen + 1;
328 	txd++;
329 
	/* The rest of the data (if any) will be in larger DMA descriptors
	 * and is handled with the fragment loop below.
	 */
333 	frag = skb_shinfo(skb)->frags;
334 	fend = frag + nr_frags;
335 
336 	while (true) {
337 		while (dma_len > 0) {
338 			dma_len -= 1;
339 			dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN, dma_len);
340 
341 			txd->dma_len_type = cpu_to_le16(dlen_type);
342 			nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
343 
344 			dma_len -= dlen_type;
345 			dma_addr += dlen_type + 1;
346 			txd++;
347 		}
348 
349 		if (frag >= fend)
350 			break;
351 
352 		dma_len = skb_frag_size(frag);
353 		dma_addr = skb_frag_dma_map(dp->dev, frag, 0, dma_len,
354 					    DMA_TO_DEVICE);
355 		if (dma_mapping_error(dp->dev, dma_addr))
356 			goto err_unmap;
357 
358 		txbuf->dma_addr = dma_addr;
359 		txbuf++;
360 
361 		frag++;
362 	}
363 
364 	(txd - 1)->dma_len_type = cpu_to_le16(dlen_type | NFDK_DESC_TX_EOP);
365 
366 	if (!skb_is_gso(skb)) {
367 		real_len = skb->len;
368 		/* Metadata desc */
369 		metadata = nfp_nfdk_tx_csum(dp, r_vec, 1, skb, metadata);
370 		txd->raw = cpu_to_le64(metadata);
371 		txd++;
372 	} else {
		/* The LSO descriptor is placed after the metadata descriptor */
374 		(txd + 1)->raw = nfp_nfdk_tx_tso(r_vec, txbuf, skb);
375 		real_len = txbuf->real_len;
376 		/* Metadata desc */
377 		metadata = nfp_nfdk_tx_csum(dp, r_vec, txbuf->pkt_cnt, skb, metadata);
378 		txd->raw = cpu_to_le64(metadata);
379 		txd += 2;
380 		txbuf++;
381 	}
382 
383 	cnt = txd - tx_ring->ktxds - wr_idx;
384 	if (unlikely(round_down(wr_idx, NFDK_TX_DESC_BLOCK_CNT) !=
385 		     round_down(wr_idx + cnt - 1, NFDK_TX_DESC_BLOCK_CNT)))
386 		goto err_warn_overflow;
387 
388 	skb_tx_timestamp(skb);
389 
390 	tx_ring->wr_p += cnt;
391 	if (tx_ring->wr_p % NFDK_TX_DESC_BLOCK_CNT)
392 		tx_ring->data_pending += skb->len;
393 	else
394 		tx_ring->data_pending = 0;
395 
396 	if (nfp_nfdk_tx_ring_should_stop(tx_ring))
397 		nfp_nfdk_tx_ring_stop(nd_q, tx_ring);
398 
399 	tx_ring->wr_ptr_add += cnt;
400 	if (__netdev_tx_sent_queue(nd_q, real_len, netdev_xmit_more()))
401 		nfp_net_tx_xmit_more_flush(tx_ring);
402 
403 	return NETDEV_TX_OK;
404 
405 err_warn_overflow:
406 	WARN_ONCE(1, "unable to fit packet into a descriptor wr_idx:%d head:%d frags:%d cnt:%d",
407 		  wr_idx, skb_headlen(skb), nr_frags, cnt);
408 	if (skb_is_gso(skb))
409 		txbuf--;
410 err_unmap:
	/* txbuf points to the next-to-use entry */
412 	etxbuf = txbuf;
413 	/* first txbuf holds the skb */
414 	txbuf = &tx_ring->ktxbufs[wr_idx + 1];
415 	if (txbuf < etxbuf) {
416 		dma_unmap_single(dp->dev, txbuf->dma_addr,
417 				 skb_headlen(skb), DMA_TO_DEVICE);
418 		txbuf->raw = 0;
419 		txbuf++;
420 	}
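	/* Unmap and clear any fragment buffers that were mapped before the
	 * failure; txbuf walks from the first fragment entry up to, but not
	 * including, the next-to-use entry saved in etxbuf.
	 */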
421 	frag = skb_shinfo(skb)->frags;
	while (txbuf < etxbuf) {
423 		dma_unmap_page(dp->dev, txbuf->dma_addr,
424 			       skb_frag_size(frag), DMA_TO_DEVICE);
425 		txbuf->raw = 0;
426 		frag++;
427 		txbuf++;
428 	}
429 err_warn_dma:
430 	nn_dp_warn(dp, "Failed to map DMA TX buffer\n");
431 err_flush:
432 	nfp_net_tx_xmit_more_flush(tx_ring);
433 	u64_stats_update_begin(&r_vec->tx_sync);
434 	r_vec->tx_errors++;
435 	u64_stats_update_end(&r_vec->tx_sync);
436 	dev_kfree_skb_any(skb);
437 	return NETDEV_TX_OK;
438 }
439 
440 /**
 * nfp_nfdk_tx_complete() - Handle completed TX packets
442  * @tx_ring:	TX ring structure
443  * @budget:	NAPI budget (only used as bool to determine if in NAPI context)
444  */
445 static void nfp_nfdk_tx_complete(struct nfp_net_tx_ring *tx_ring, int budget)
446 {
447 	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
448 	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
449 	u32 done_pkts = 0, done_bytes = 0;
450 	struct nfp_nfdk_tx_buf *ktxbufs;
451 	struct device *dev = dp->dev;
452 	struct netdev_queue *nd_q;
453 	u32 rd_p, qcp_rd_p;
454 	int todo;
455 
456 	rd_p = tx_ring->rd_p;
457 	if (tx_ring->wr_p == rd_p)
458 		return;
459 
460 	/* Work out how many descriptors have been transmitted */
461 	qcp_rd_p = nfp_net_read_tx_cmpl(tx_ring, dp);
462 
463 	if (qcp_rd_p == tx_ring->qcp_rd_p)
464 		return;
465 
466 	todo = D_IDX(tx_ring, qcp_rd_p - tx_ring->qcp_rd_p);
467 	ktxbufs = tx_ring->ktxbufs;
468 
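	/* Walk completed packets, reconstructing how many descriptors each
	 * one consumed: one metadata descriptor, the head data descriptors,
	 * one descriptor per fragment chunk and, for TSO packets, one extra
	 * LSO descriptor.  Ring entries with no skb mark a block that was
	 * closed with no-op padding.
	 */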
469 	while (todo > 0) {
470 		const skb_frag_t *frag, *fend;
471 		unsigned int size, n_descs = 1;
472 		struct nfp_nfdk_tx_buf *txbuf;
473 		struct sk_buff *skb;
474 
475 		txbuf = &ktxbufs[D_IDX(tx_ring, rd_p)];
476 		skb = txbuf->skb;
477 		txbuf++;
478 
479 		/* Closed block */
480 		if (!skb) {
481 			n_descs = D_BLOCK_CPL(rd_p);
482 			goto next;
483 		}
484 
485 		/* Unmap head */
486 		size = skb_headlen(skb);
487 		n_descs += nfp_nfdk_headlen_to_segs(size);
488 		dma_unmap_single(dev, txbuf->dma_addr, size, DMA_TO_DEVICE);
489 		txbuf++;
490 
491 		/* Unmap frags */
492 		frag = skb_shinfo(skb)->frags;
493 		fend = frag + skb_shinfo(skb)->nr_frags;
494 		for (; frag < fend; frag++) {
495 			size = skb_frag_size(frag);
496 			n_descs += DIV_ROUND_UP(size,
497 						NFDK_TX_MAX_DATA_PER_DESC);
498 			dma_unmap_page(dev, txbuf->dma_addr,
499 				       skb_frag_size(frag), DMA_TO_DEVICE);
500 			txbuf++;
501 		}
502 
503 		if (!skb_is_gso(skb)) {
504 			done_bytes += skb->len;
505 			done_pkts++;
506 		} else {
507 			done_bytes += txbuf->real_len;
508 			done_pkts += txbuf->pkt_cnt;
509 			n_descs++;
510 		}
511 
512 		napi_consume_skb(skb, budget);
513 next:
514 		rd_p += n_descs;
515 		todo -= n_descs;
516 	}
517 
518 	tx_ring->rd_p = rd_p;
519 	tx_ring->qcp_rd_p = qcp_rd_p;
520 
521 	u64_stats_update_begin(&r_vec->tx_sync);
522 	r_vec->tx_bytes += done_bytes;
523 	r_vec->tx_pkts += done_pkts;
524 	u64_stats_update_end(&r_vec->tx_sync);
525 
526 	if (!dp->netdev)
527 		return;
528 
529 	nd_q = netdev_get_tx_queue(dp->netdev, tx_ring->idx);
530 	netdev_tx_completed_queue(nd_q, done_pkts, done_bytes);
531 	if (nfp_nfdk_tx_ring_should_wake(tx_ring)) {
532 		/* Make sure TX thread will see updated tx_ring->rd_p */
533 		smp_mb();
534 
535 		if (unlikely(netif_tx_queue_stopped(nd_q)))
536 			netif_tx_wake_queue(nd_q);
537 	}
538 
539 	WARN_ONCE(tx_ring->wr_p - tx_ring->rd_p > tx_ring->cnt,
540 		  "TX ring corruption rd_p=%u wr_p=%u cnt=%u\n",
541 		  tx_ring->rd_p, tx_ring->wr_p, tx_ring->cnt);
542 }
543 
544 /* Receive processing */
545 static void *
546 nfp_nfdk_napi_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
547 {
548 	void *frag;
549 
550 	if (!dp->xdp_prog) {
551 		frag = napi_alloc_frag(dp->fl_bufsz);
552 		if (unlikely(!frag))
553 			return NULL;
554 	} else {
555 		struct page *page;
556 
557 		page = dev_alloc_page();
558 		if (unlikely(!page))
559 			return NULL;
560 		frag = page_address(page);
561 	}
562 
563 	*dma_addr = nfp_net_dma_map_rx(dp, frag);
564 	if (dma_mapping_error(dp->dev, *dma_addr)) {
565 		nfp_net_free_frag(frag, dp->xdp_prog);
566 		nn_dp_warn(dp, "Failed to map DMA RX buffer\n");
567 		return NULL;
568 	}
569 
570 	return frag;
571 }
572 
573 /**
 * nfp_nfdk_rx_give_one() - Put a mapped buffer back on the software and hardware rings
 * @dp:		NFP Net data path struct
 * @rx_ring:	RX ring structure
 * @frag:	page fragment buffer
 * @dma_addr:	DMA address of the buffer mapping
579  */
580 static void
581 nfp_nfdk_rx_give_one(const struct nfp_net_dp *dp,
582 		     struct nfp_net_rx_ring *rx_ring,
583 		     void *frag, dma_addr_t dma_addr)
584 {
585 	unsigned int wr_idx;
586 
587 	wr_idx = D_IDX(rx_ring, rx_ring->wr_p);
588 
589 	nfp_net_dma_sync_dev_rx(dp, dma_addr);
590 
	/* Stash the frag and DMA address away */
592 	rx_ring->rxbufs[wr_idx].frag = frag;
593 	rx_ring->rxbufs[wr_idx].dma_addr = dma_addr;
594 
595 	/* Fill freelist descriptor */
596 	rx_ring->rxds[wr_idx].fld.reserved = 0;
597 	rx_ring->rxds[wr_idx].fld.meta_len_dd = 0;
598 	nfp_desc_set_dma_addr(&rx_ring->rxds[wr_idx].fld,
599 			      dma_addr + dp->rx_dma_off);
600 
601 	rx_ring->wr_p++;
602 	if (!(rx_ring->wr_p % NFP_NET_FL_BATCH)) {
603 		/* Update write pointer of the freelist queue. Make
604 		 * sure all writes are flushed before telling the hardware.
605 		 */
606 		wmb();
607 		nfp_qcp_wr_ptr_add(rx_ring->qcp_fl, NFP_NET_FL_BATCH);
608 	}
609 }
610 
611 /**
612  * nfp_nfdk_rx_ring_fill_freelist() - Give buffers from the ring to FW
613  * @dp:	     NFP Net data path struct
614  * @rx_ring: RX ring to fill
615  */
616 void nfp_nfdk_rx_ring_fill_freelist(struct nfp_net_dp *dp,
617 				    struct nfp_net_rx_ring *rx_ring)
618 {
619 	unsigned int i;
620 
621 	for (i = 0; i < rx_ring->cnt - 1; i++)
622 		nfp_nfdk_rx_give_one(dp, rx_ring, rx_ring->rxbufs[i].frag,
623 				     rx_ring->rxbufs[i].dma_addr);
624 }
625 
/**
 * nfp_nfdk_rx_csum_has_errors() - check if the RX descriptor reports any csum errors
 * @flags: RX descriptor flags field in CPU byte order
 *
 * Return: non-zero if any checksum the descriptor reports as checked is not
 *	   also reported as OK.
 */
630 static int nfp_nfdk_rx_csum_has_errors(u16 flags)
631 {
632 	u16 csum_all_checked, csum_all_ok;
633 
634 	csum_all_checked = flags & __PCIE_DESC_RX_CSUM_ALL;
635 	csum_all_ok = flags & __PCIE_DESC_RX_CSUM_ALL_OK;
636 
637 	return csum_all_checked != (csum_all_ok << PCIE_DESC_RX_CSUM_OK_SHIFT);
638 }
639 
640 /**
641  * nfp_nfdk_rx_csum() - set SKB checksum field based on RX descriptor flags
642  * @dp:  NFP Net data path struct
643  * @r_vec: per-ring structure
644  * @rxd: Pointer to RX descriptor
645  * @meta: Parsed metadata prepend
646  * @skb: Pointer to SKB
647  */
648 static void
649 nfp_nfdk_rx_csum(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
650 		 struct nfp_net_rx_desc *rxd, struct nfp_meta_parsed *meta,
651 		 struct sk_buff *skb)
652 {
653 	skb_checksum_none_assert(skb);
654 
655 	if (!(dp->netdev->features & NETIF_F_RXCSUM))
656 		return;
657 
658 	if (meta->csum_type) {
659 		skb->ip_summed = meta->csum_type;
660 		skb->csum = meta->csum;
661 		u64_stats_update_begin(&r_vec->rx_sync);
662 		r_vec->hw_csum_rx_complete++;
663 		u64_stats_update_end(&r_vec->rx_sync);
664 		return;
665 	}
666 
667 	if (nfp_nfdk_rx_csum_has_errors(le16_to_cpu(rxd->rxd.flags))) {
668 		u64_stats_update_begin(&r_vec->rx_sync);
669 		r_vec->hw_csum_rx_error++;
670 		u64_stats_update_end(&r_vec->rx_sync);
671 		return;
672 	}
673 
674 	/* Assume that the firmware will never report inner CSUM_OK unless outer
675 	 * L4 headers were successfully parsed. FW will always report zero UDP
676 	 * checksum as CSUM_OK.
677 	 */
678 	if (rxd->rxd.flags & PCIE_DESC_RX_TCP_CSUM_OK ||
679 	    rxd->rxd.flags & PCIE_DESC_RX_UDP_CSUM_OK) {
680 		__skb_incr_checksum_unnecessary(skb);
681 		u64_stats_update_begin(&r_vec->rx_sync);
682 		r_vec->hw_csum_rx_ok++;
683 		u64_stats_update_end(&r_vec->rx_sync);
684 	}
685 
686 	if (rxd->rxd.flags & PCIE_DESC_RX_I_TCP_CSUM_OK ||
687 	    rxd->rxd.flags & PCIE_DESC_RX_I_UDP_CSUM_OK) {
688 		__skb_incr_checksum_unnecessary(skb);
689 		u64_stats_update_begin(&r_vec->rx_sync);
690 		r_vec->hw_csum_rx_inner_ok++;
691 		u64_stats_update_end(&r_vec->rx_sync);
692 	}
693 }
694 
695 static void
696 nfp_nfdk_set_hash(struct net_device *netdev, struct nfp_meta_parsed *meta,
697 		  unsigned int type, __be32 *hash)
698 {
699 	if (!(netdev->features & NETIF_F_RXHASH))
700 		return;
701 
702 	switch (type) {
703 	case NFP_NET_RSS_IPV4:
704 	case NFP_NET_RSS_IPV6:
705 	case NFP_NET_RSS_IPV6_EX:
706 		meta->hash_type = PKT_HASH_TYPE_L3;
707 		break;
708 	default:
709 		meta->hash_type = PKT_HASH_TYPE_L4;
710 		break;
711 	}
712 
713 	meta->hash = get_unaligned_be32(hash);
714 }
715 
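/* The RX metadata prepend starts with a 32-bit word holding a sequence of
 * small field-type tags, consumed from the least significant bits upwards.
 * Each tag is followed in the buffer by its value (4 bytes for hash, mark,
 * port id and checksum; a TLS resync request is larger).  Parsing stops
 * once the tag word is exhausted; an unknown tag aborts the parse.
 */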
716 static bool
717 nfp_nfdk_parse_meta(struct net_device *netdev, struct nfp_meta_parsed *meta,
718 		    void *data, void *pkt, unsigned int pkt_len, int meta_len)
719 {
720 	u32 meta_info;
721 
722 	meta_info = get_unaligned_be32(data);
723 	data += 4;
724 
725 	while (meta_info) {
726 		switch (meta_info & NFP_NET_META_FIELD_MASK) {
727 		case NFP_NET_META_HASH:
728 			meta_info >>= NFP_NET_META_FIELD_SIZE;
729 			nfp_nfdk_set_hash(netdev, meta,
730 					  meta_info & NFP_NET_META_FIELD_MASK,
731 					  (__be32 *)data);
732 			data += 4;
733 			break;
734 		case NFP_NET_META_MARK:
735 			meta->mark = get_unaligned_be32(data);
736 			data += 4;
737 			break;
738 		case NFP_NET_META_PORTID:
739 			meta->portid = get_unaligned_be32(data);
740 			data += 4;
741 			break;
742 		case NFP_NET_META_CSUM:
743 			meta->csum_type = CHECKSUM_COMPLETE;
744 			meta->csum =
745 				(__force __wsum)__get_unaligned_cpu32(data);
746 			data += 4;
747 			break;
748 		case NFP_NET_META_RESYNC_INFO:
749 			if (nfp_net_tls_rx_resync_req(netdev, data, pkt,
750 						      pkt_len))
751 				return false;
752 			data += sizeof(struct nfp_net_tls_resync_req);
753 			break;
754 		default:
755 			return true;
756 		}
757 
758 		meta_info >>= NFP_NET_META_FIELD_SIZE;
759 	}
760 
761 	return data != pkt;
762 }
763 
764 static void
765 nfp_nfdk_rx_drop(const struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
766 		 struct nfp_net_rx_ring *rx_ring, struct nfp_net_rx_buf *rxbuf,
767 		 struct sk_buff *skb)
768 {
769 	u64_stats_update_begin(&r_vec->rx_sync);
770 	r_vec->rx_drops++;
771 	/* If we have both skb and rxbuf the replacement buffer allocation
772 	 * must have failed, count this as an alloc failure.
773 	 */
774 	if (skb && rxbuf)
775 		r_vec->rx_replace_buf_alloc_fail++;
776 	u64_stats_update_end(&r_vec->rx_sync);
777 
	/* The skb is built around the frag, so freeing the skb would also
	 * free the frag.  Take an extra reference to be able to reuse it.
	 */
781 	if (skb && rxbuf && skb->head == rxbuf->frag)
782 		page_ref_inc(virt_to_head_page(rxbuf->frag));
783 	if (rxbuf)
784 		nfp_nfdk_rx_give_one(dp, rx_ring, rxbuf->frag, rxbuf->dma_addr);
785 	if (skb)
786 		dev_kfree_skb_any(skb);
787 }
788 
789 static bool nfp_nfdk_xdp_complete(struct nfp_net_tx_ring *tx_ring)
790 {
791 	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
792 	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
793 	struct nfp_net_rx_ring *rx_ring;
794 	u32 qcp_rd_p, done = 0;
795 	bool done_all;
796 	int todo;
797 
798 	/* Work out how many descriptors have been transmitted */
799 	qcp_rd_p = nfp_net_read_tx_cmpl(tx_ring, dp);
800 	if (qcp_rd_p == tx_ring->qcp_rd_p)
801 		return true;
802 
803 	todo = D_IDX(tx_ring, qcp_rd_p - tx_ring->qcp_rd_p);
804 
805 	done_all = todo <= NFP_NET_XDP_MAX_COMPLETE;
806 	todo = min(todo, NFP_NET_XDP_MAX_COMPLETE);
807 
808 	rx_ring = r_vec->rx_ring;
809 	while (todo > 0) {
810 		int idx = D_IDX(tx_ring, tx_ring->rd_p + done);
811 		struct nfp_nfdk_tx_buf *txbuf;
812 		unsigned int step = 1;
813 
814 		txbuf = &tx_ring->ktxbufs[idx];
815 		if (!txbuf->raw)
816 			goto next;
817 
818 		if (NFDK_TX_BUF_INFO(txbuf->val) != NFDK_TX_BUF_INFO_SOP) {
819 			WARN_ONCE(1, "Unexpected TX buffer in XDP TX ring\n");
820 			goto next;
821 		}
822 
		/* Two successive txbufs are used to stash the virtual and DMA
		 * addresses respectively; recycle and clean them here.
		 */
826 		nfp_nfdk_rx_give_one(dp, rx_ring,
827 				     (void *)NFDK_TX_BUF_PTR(txbuf[0].val),
828 				     txbuf[1].dma_addr);
829 		txbuf[0].raw = 0;
830 		txbuf[1].raw = 0;
831 		step = 2;
832 
833 		u64_stats_update_begin(&r_vec->tx_sync);
834 		/* Note: tx_bytes not accumulated. */
835 		r_vec->tx_pkts++;
836 		u64_stats_update_end(&r_vec->tx_sync);
837 next:
838 		todo -= step;
839 		done += step;
840 	}
841 
842 	tx_ring->qcp_rd_p = D_IDX(tx_ring, tx_ring->qcp_rd_p + done);
843 	tx_ring->rd_p += done;
844 
845 	WARN_ONCE(tx_ring->wr_p - tx_ring->rd_p > tx_ring->cnt,
846 		  "XDP TX ring corruption rd_p=%u wr_p=%u cnt=%u\n",
847 		  tx_ring->rd_p, tx_ring->wr_p, tx_ring->cnt);
848 
849 	return done_all;
850 }
851 
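/* XDP_TX path: the received page fragment itself is reflected back out on
 * the dedicated XDP TX ring without copying.  Two consecutive txbuf entries
 * are used per frame, the first stashing the virtual address (tagged with
 * NFDK_TX_BUF_INFO_SOP) and the second the DMA address, so that
 * nfp_nfdk_xdp_complete() can hand the buffer straight back to the RX
 * freelist once the transmit has completed.
 */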
852 static bool
853 nfp_nfdk_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
854 		    struct nfp_net_tx_ring *tx_ring,
855 		    struct nfp_net_rx_buf *rxbuf, unsigned int dma_off,
856 		    unsigned int pkt_len, bool *completed)
857 {
858 	unsigned int dma_map_sz = dp->fl_bufsz - NFP_NET_RX_BUF_NON_DATA;
859 	unsigned int dma_len, type, cnt, dlen_type, tmp_dlen;
860 	struct nfp_nfdk_tx_buf *txbuf;
861 	struct nfp_nfdk_tx_desc *txd;
862 	unsigned int n_descs;
863 	dma_addr_t dma_addr;
864 	int wr_idx;
865 
	/* Reject if xdp_adjust_tail grew the packet beyond the DMA-mapped area */
867 	if (pkt_len + dma_off > dma_map_sz)
868 		return false;
869 
870 	/* Make sure there's still at least one block available after
871 	 * aligning to block boundary, so that the txds used below
872 	 * won't wrap around the tx_ring.
873 	 */
874 	if (unlikely(nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT))) {
875 		if (!*completed) {
876 			nfp_nfdk_xdp_complete(tx_ring);
877 			*completed = true;
878 		}
879 
880 		if (unlikely(nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT))) {
881 			nfp_nfdk_rx_drop(dp, rx_ring->r_vec, rx_ring, rxbuf,
882 					 NULL);
883 			return false;
884 		}
885 	}
886 
	/* Pad to the next block if the frame would cross a block boundary
	 * or exceed the per-block data budget.
	 */
888 	n_descs = nfp_nfdk_headlen_to_segs(pkt_len);
889 	if ((round_down(tx_ring->wr_p, NFDK_TX_DESC_BLOCK_CNT) !=
890 	     round_down(tx_ring->wr_p + n_descs, NFDK_TX_DESC_BLOCK_CNT)) ||
891 	    ((u32)tx_ring->data_pending + pkt_len >
892 	     NFDK_TX_MAX_DATA_PER_BLOCK)) {
893 		unsigned int nop_slots = D_BLOCK_CPL(tx_ring->wr_p);
894 
895 		wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
896 		txd = &tx_ring->ktxds[wr_idx];
897 		memset(txd, 0,
898 		       array_size(nop_slots, sizeof(struct nfp_nfdk_tx_desc)));
899 
900 		tx_ring->data_pending = 0;
901 		tx_ring->wr_p += nop_slots;
902 		tx_ring->wr_ptr_add += nop_slots;
903 	}
904 
905 	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
906 
907 	txbuf = &tx_ring->ktxbufs[wr_idx];
908 
909 	txbuf[0].val = (unsigned long)rxbuf->frag | NFDK_TX_BUF_INFO_SOP;
910 	txbuf[1].dma_addr = rxbuf->dma_addr;
911 	/* Note: pkt len not stored */
912 
913 	dma_sync_single_for_device(dp->dev, rxbuf->dma_addr + dma_off,
914 				   pkt_len, DMA_BIDIRECTIONAL);
915 
916 	/* Build TX descriptor */
917 	txd = &tx_ring->ktxds[wr_idx];
918 	dma_len = pkt_len;
919 	dma_addr = rxbuf->dma_addr + dma_off;
920 
921 	if (dma_len < NFDK_TX_MAX_DATA_PER_HEAD)
922 		type = NFDK_DESC_TX_TYPE_SIMPLE;
923 	else
924 		type = NFDK_DESC_TX_TYPE_GATHER;
925 
926 	/* FIELD_PREP() implicitly truncates to chunk */
927 	dma_len -= 1;
928 	dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN_HEAD, dma_len) |
929 		    FIELD_PREP(NFDK_DESC_TX_TYPE_HEAD, type);
930 
931 	txd->dma_len_type = cpu_to_le16(dlen_type);
932 	nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
933 
934 	tmp_dlen = dlen_type & NFDK_DESC_TX_DMA_LEN_HEAD;
935 	dma_len -= tmp_dlen;
936 	dma_addr += tmp_dlen + 1;
937 	txd++;
938 
939 	while (dma_len > 0) {
940 		dma_len -= 1;
941 		dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN, dma_len);
942 		txd->dma_len_type = cpu_to_le16(dlen_type);
943 		nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
944 
945 		dlen_type &= NFDK_DESC_TX_DMA_LEN;
946 		dma_len -= dlen_type;
947 		dma_addr += dlen_type + 1;
948 		txd++;
949 	}
950 
951 	(txd - 1)->dma_len_type = cpu_to_le16(dlen_type | NFDK_DESC_TX_EOP);
952 
953 	/* Metadata desc */
954 	txd->raw = 0;
955 	txd++;
956 
957 	cnt = txd - tx_ring->ktxds - wr_idx;
958 	tx_ring->wr_p += cnt;
959 	if (tx_ring->wr_p % NFDK_TX_DESC_BLOCK_CNT)
960 		tx_ring->data_pending += pkt_len;
961 	else
962 		tx_ring->data_pending = 0;
963 
964 	tx_ring->wr_ptr_add += cnt;
965 	return true;
966 }
967 
968 /**
969  * nfp_nfdk_rx() - receive up to @budget packets on @rx_ring
970  * @rx_ring:   RX ring to receive from
971  * @budget:    NAPI budget
972  *
973  * Note, this function is separated out from the napi poll function to
974  * more cleanly separate packet receive code from other bookkeeping
975  * functions performed in the napi poll function.
976  *
977  * Return: Number of packets received.
978  */
979 static int nfp_nfdk_rx(struct nfp_net_rx_ring *rx_ring, int budget)
980 {
981 	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
982 	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
983 	struct nfp_net_tx_ring *tx_ring;
984 	struct bpf_prog *xdp_prog;
985 	bool xdp_tx_cmpl = false;
986 	unsigned int true_bufsz;
987 	struct sk_buff *skb;
988 	int pkts_polled = 0;
989 	struct xdp_buff xdp;
990 	int idx;
991 
992 	xdp_prog = READ_ONCE(dp->xdp_prog);
993 	true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz;
994 	xdp_init_buff(&xdp, PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM,
995 		      &rx_ring->xdp_rxq);
996 	tx_ring = r_vec->xdp_ring;
997 
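	/* With an XDP program attached the RX buffers are allocated as full
	 * pages (true_bufsz is PAGE_SIZE), and XDP_TX frames are reflected
	 * onto the per-vector XDP TX ring without copying.
	 */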
998 	while (pkts_polled < budget) {
999 		unsigned int meta_len, data_len, meta_off, pkt_len, pkt_off;
1000 		struct nfp_net_rx_buf *rxbuf;
1001 		struct nfp_net_rx_desc *rxd;
1002 		struct nfp_meta_parsed meta;
1003 		bool redir_egress = false;
1004 		struct net_device *netdev;
1005 		dma_addr_t new_dma_addr;
1006 		u32 meta_len_xdp = 0;
1007 		void *new_frag;
1008 
1009 		idx = D_IDX(rx_ring, rx_ring->rd_p);
1010 
1011 		rxd = &rx_ring->rxds[idx];
1012 		if (!(rxd->rxd.meta_len_dd & PCIE_DESC_RX_DD))
1013 			break;
1014 
1015 		/* Memory barrier to ensure that we won't do other reads
1016 		 * before the DD bit.
1017 		 */
1018 		dma_rmb();
1019 
1020 		memset(&meta, 0, sizeof(meta));
1021 
1022 		rx_ring->rd_p++;
1023 		pkts_polled++;
1024 
1025 		rxbuf =	&rx_ring->rxbufs[idx];
1026 		/*         < meta_len >
1027 		 *  <-- [rx_offset] -->
1028 		 *  ---------------------------------------------------------
1029 		 * | [XX] |  metadata  |             packet           | XXXX |
1030 		 *  ---------------------------------------------------------
1031 		 *         <---------------- data_len --------------->
1032 		 *
1033 		 * The rx_offset is fixed for all packets, the meta_len can vary
1034 		 * on a packet by packet basis. If rx_offset is set to zero
1035 		 * (_RX_OFFSET_DYNAMIC) metadata starts at the beginning of the
1036 		 * buffer and is immediately followed by the packet (no [XX]).
1037 		 */
1038 		meta_len = rxd->rxd.meta_len_dd & PCIE_DESC_RX_META_LEN_MASK;
1039 		data_len = le16_to_cpu(rxd->rxd.data_len);
1040 		pkt_len = data_len - meta_len;
1041 
1042 		pkt_off = NFP_NET_RX_BUF_HEADROOM + dp->rx_dma_off;
1043 		if (dp->rx_offset == NFP_NET_CFG_RX_OFFSET_DYNAMIC)
1044 			pkt_off += meta_len;
1045 		else
1046 			pkt_off += dp->rx_offset;
1047 		meta_off = pkt_off - meta_len;
1048 
1049 		/* Stats update */
1050 		u64_stats_update_begin(&r_vec->rx_sync);
1051 		r_vec->rx_pkts++;
1052 		r_vec->rx_bytes += pkt_len;
1053 		u64_stats_update_end(&r_vec->rx_sync);
1054 
1055 		if (unlikely(meta_len > NFP_NET_MAX_PREPEND ||
1056 			     (dp->rx_offset && meta_len > dp->rx_offset))) {
1057 			nn_dp_warn(dp, "oversized RX packet metadata %u\n",
1058 				   meta_len);
1059 			nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, NULL);
1060 			continue;
1061 		}
1062 
1063 		nfp_net_dma_sync_cpu_rx(dp, rxbuf->dma_addr + meta_off,
1064 					data_len);
1065 
1066 		if (meta_len) {
1067 			if (unlikely(nfp_nfdk_parse_meta(dp->netdev, &meta,
1068 							 rxbuf->frag + meta_off,
1069 							 rxbuf->frag + pkt_off,
1070 							 pkt_len, meta_len))) {
1071 				nn_dp_warn(dp, "invalid RX packet metadata\n");
1072 				nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf,
1073 						 NULL);
1074 				continue;
1075 			}
1076 		}
1077 
1078 		if (xdp_prog && !meta.portid) {
1079 			void *orig_data = rxbuf->frag + pkt_off;
1080 			unsigned int dma_off;
1081 			int act;
1082 
1083 			xdp_prepare_buff(&xdp,
1084 					 rxbuf->frag + NFP_NET_RX_BUF_HEADROOM,
1085 					 pkt_off - NFP_NET_RX_BUF_HEADROOM,
1086 					 pkt_len, true);
1087 
1088 			act = bpf_prog_run_xdp(xdp_prog, &xdp);
1089 
1090 			pkt_len = xdp.data_end - xdp.data;
1091 			pkt_off += xdp.data - orig_data;
1092 
1093 			switch (act) {
1094 			case XDP_PASS:
1095 				meta_len_xdp = xdp.data - xdp.data_meta;
1096 				break;
1097 			case XDP_TX:
1098 				dma_off = pkt_off - NFP_NET_RX_BUF_HEADROOM;
1099 				if (unlikely(!nfp_nfdk_tx_xdp_buf(dp, rx_ring,
1100 								  tx_ring,
1101 								  rxbuf,
1102 								  dma_off,
1103 								  pkt_len,
1104 								  &xdp_tx_cmpl)))
1105 					trace_xdp_exception(dp->netdev,
1106 							    xdp_prog, act);
1107 				continue;
1108 			default:
1109 				bpf_warn_invalid_xdp_action(dp->netdev, xdp_prog, act);
1110 				fallthrough;
1111 			case XDP_ABORTED:
1112 				trace_xdp_exception(dp->netdev, xdp_prog, act);
1113 				fallthrough;
1114 			case XDP_DROP:
1115 				nfp_nfdk_rx_give_one(dp, rx_ring, rxbuf->frag,
1116 						     rxbuf->dma_addr);
1117 				continue;
1118 			}
1119 		}
1120 
1121 		if (likely(!meta.portid)) {
1122 			netdev = dp->netdev;
1123 		} else if (meta.portid == NFP_META_PORT_ID_CTRL) {
1124 			struct nfp_net *nn = netdev_priv(dp->netdev);
1125 
1126 			nfp_app_ctrl_rx_raw(nn->app, rxbuf->frag + pkt_off,
1127 					    pkt_len);
1128 			nfp_nfdk_rx_give_one(dp, rx_ring, rxbuf->frag,
1129 					     rxbuf->dma_addr);
1130 			continue;
1131 		} else {
1132 			struct nfp_net *nn;
1133 
1134 			nn = netdev_priv(dp->netdev);
1135 			netdev = nfp_app_dev_get(nn->app, meta.portid,
1136 						 &redir_egress);
1137 			if (unlikely(!netdev)) {
1138 				nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf,
1139 						 NULL);
1140 				continue;
1141 			}
1142 
1143 			if (nfp_netdev_is_nfp_repr(netdev))
1144 				nfp_repr_inc_rx_stats(netdev, pkt_len);
1145 		}
1146 
1147 		skb = build_skb(rxbuf->frag, true_bufsz);
1148 		if (unlikely(!skb)) {
1149 			nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, NULL);
1150 			continue;
1151 		}
1152 		new_frag = nfp_nfdk_napi_alloc_one(dp, &new_dma_addr);
1153 		if (unlikely(!new_frag)) {
1154 			nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, skb);
1155 			continue;
1156 		}
1157 
1158 		nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr);
1159 
1160 		nfp_nfdk_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
1161 
1162 		skb_reserve(skb, pkt_off);
1163 		skb_put(skb, pkt_len);
1164 
1165 		skb->mark = meta.mark;
1166 		skb_set_hash(skb, meta.hash, meta.hash_type);
1167 
1168 		skb_record_rx_queue(skb, rx_ring->idx);
1169 		skb->protocol = eth_type_trans(skb, netdev);
1170 
1171 		nfp_nfdk_rx_csum(dp, r_vec, rxd, &meta, skb);
1172 
1173 		if (rxd->rxd.flags & PCIE_DESC_RX_VLAN)
1174 			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
1175 					       le16_to_cpu(rxd->rxd.vlan));
1176 		if (meta_len_xdp)
1177 			skb_metadata_set(skb, meta_len_xdp);
1178 
1179 		if (likely(!redir_egress)) {
1180 			napi_gro_receive(&rx_ring->r_vec->napi, skb);
1181 		} else {
1182 			skb->dev = netdev;
1183 			skb_reset_network_header(skb);
1184 			__skb_push(skb, ETH_HLEN);
1185 			dev_queue_xmit(skb);
1186 		}
1187 	}
1188 
1189 	if (xdp_prog) {
1190 		if (tx_ring->wr_ptr_add)
1191 			nfp_net_tx_xmit_more_flush(tx_ring);
1192 		else if (unlikely(tx_ring->wr_p != tx_ring->rd_p) &&
1193 			 !xdp_tx_cmpl)
1194 			if (!nfp_nfdk_xdp_complete(tx_ring))
1195 				pkts_polled = budget;
1196 	}
1197 
1198 	return pkts_polled;
1199 }
1200 
1201 /**
1202  * nfp_nfdk_poll() - napi poll function
1203  * @napi:    NAPI structure
1204  * @budget:  NAPI budget
1205  *
1206  * Return: number of packets polled.
1207  */
1208 int nfp_nfdk_poll(struct napi_struct *napi, int budget)
1209 {
1210 	struct nfp_net_r_vector *r_vec =
1211 		container_of(napi, struct nfp_net_r_vector, napi);
1212 	unsigned int pkts_polled = 0;
1213 
1214 	if (r_vec->tx_ring)
1215 		nfp_nfdk_tx_complete(r_vec->tx_ring, budget);
1216 	if (r_vec->rx_ring)
1217 		pkts_polled = nfp_nfdk_rx(r_vec->rx_ring, budget);
1218 
1219 	if (pkts_polled < budget)
1220 		if (napi_complete_done(napi, pkts_polled))
1221 			nfp_net_irq_unmask(r_vec->nfp_net, r_vec->irq_entry);
1222 
1223 	if (r_vec->nfp_net->rx_coalesce_adapt_on && r_vec->rx_ring) {
1224 		struct dim_sample dim_sample = {};
1225 		unsigned int start;
1226 		u64 pkts, bytes;
1227 
1228 		do {
1229 			start = u64_stats_fetch_begin(&r_vec->rx_sync);
1230 			pkts = r_vec->rx_pkts;
1231 			bytes = r_vec->rx_bytes;
1232 		} while (u64_stats_fetch_retry(&r_vec->rx_sync, start));
1233 
1234 		dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample);
1235 		net_dim(&r_vec->rx_dim, dim_sample);
1236 	}
1237 
1238 	if (r_vec->nfp_net->tx_coalesce_adapt_on && r_vec->tx_ring) {
1239 		struct dim_sample dim_sample = {};
1240 		unsigned int start;
1241 		u64 pkts, bytes;
1242 
1243 		do {
1244 			start = u64_stats_fetch_begin(&r_vec->tx_sync);
1245 			pkts = r_vec->tx_pkts;
1246 			bytes = r_vec->tx_bytes;
1247 		} while (u64_stats_fetch_retry(&r_vec->tx_sync, start));
1248 
1249 		dim_update_sample(r_vec->event_ctr, pkts, bytes, &dim_sample);
1250 		net_dim(&r_vec->tx_dim, dim_sample);
1251 	}
1252 
1253 	return pkts_polled;
1254 }
1255 
1256 /* Control device data path
1257  */
1258 
1259 bool
1260 nfp_nfdk_ctrl_tx_one(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
1261 		     struct sk_buff *skb, bool old)
1262 {
1263 	u32 cnt, tmp_dlen, dlen_type = 0;
1264 	struct nfp_net_tx_ring *tx_ring;
1265 	struct nfp_nfdk_tx_buf *txbuf;
1266 	struct nfp_nfdk_tx_desc *txd;
1267 	unsigned int dma_len, type;
1268 	struct nfp_net_dp *dp;
1269 	dma_addr_t dma_addr;
1270 	u64 metadata = 0;
1271 	int wr_idx;
1272 
1273 	dp = &r_vec->nfp_net->dp;
1274 	tx_ring = r_vec->tx_ring;
1275 
1276 	if (WARN_ON_ONCE(skb_shinfo(skb)->nr_frags)) {
1277 		nn_dp_warn(dp, "Driver's CTRL TX does not implement gather\n");
1278 		goto err_free;
1279 	}
1280 
1281 	/* Don't bother counting frags, assume the worst */
1282 	if (unlikely(nfp_net_tx_full(tx_ring, NFDK_TX_DESC_STOP_CNT))) {
1283 		u64_stats_update_begin(&r_vec->tx_sync);
1284 		r_vec->tx_busy++;
1285 		u64_stats_update_end(&r_vec->tx_sync);
1286 		if (!old)
1287 			__skb_queue_tail(&r_vec->queue, skb);
1288 		else
1289 			__skb_queue_head(&r_vec->queue, skb);
1290 		return NETDEV_TX_BUSY;
1291 	}
1292 
1293 	if (nfp_app_ctrl_has_meta(nn->app)) {
1294 		if (unlikely(skb_headroom(skb) < 8)) {
1295 			nn_dp_warn(dp, "CTRL TX on skb without headroom\n");
1296 			goto err_free;
1297 		}
1298 		metadata = NFDK_DESC_TX_CHAIN_META;
1299 		put_unaligned_be32(NFP_META_PORT_ID_CTRL, skb_push(skb, 4));
1300 		put_unaligned_be32(FIELD_PREP(NFDK_META_LEN, 8) |
1301 				   FIELD_PREP(NFDK_META_FIELDS,
1302 					      NFP_NET_META_PORTID),
1303 				   skb_push(skb, 4));
1304 	}
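	/* The two skb_push() calls above lay out an 8 byte metadata prepend:
	 * the port id value is pushed first and the meta header word second,
	 * so the header word (length 8, field NFP_NET_META_PORTID) ends up
	 * at the start of the buffer, ahead of NFP_META_PORT_ID_CTRL.
	 */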
1305 
1306 	if (nfp_nfdk_tx_maybe_close_block(tx_ring, 0, skb))
1307 		goto err_free;
1308 
1309 	/* DMA map all */
1310 	wr_idx = D_IDX(tx_ring, tx_ring->wr_p);
1311 	txd = &tx_ring->ktxds[wr_idx];
1312 	txbuf = &tx_ring->ktxbufs[wr_idx];
1313 
1314 	dma_len = skb_headlen(skb);
1315 	if (dma_len < NFDK_TX_MAX_DATA_PER_HEAD)
1316 		type = NFDK_DESC_TX_TYPE_SIMPLE;
1317 	else
1318 		type = NFDK_DESC_TX_TYPE_GATHER;
1319 
1320 	dma_addr = dma_map_single(dp->dev, skb->data, dma_len, DMA_TO_DEVICE);
1321 	if (dma_mapping_error(dp->dev, dma_addr))
1322 		goto err_warn_dma;
1323 
1324 	txbuf->skb = skb;
1325 	txbuf++;
1326 
1327 	txbuf->dma_addr = dma_addr;
1328 	txbuf++;
1329 
1330 	dma_len -= 1;
1331 	dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN_HEAD, dma_len) |
1332 		    FIELD_PREP(NFDK_DESC_TX_TYPE_HEAD, type);
1333 
1334 	txd->dma_len_type = cpu_to_le16(dlen_type);
1335 	nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
1336 
1337 	tmp_dlen = dlen_type & NFDK_DESC_TX_DMA_LEN_HEAD;
1338 	dma_len -= tmp_dlen;
1339 	dma_addr += tmp_dlen + 1;
1340 	txd++;
1341 
1342 	while (dma_len > 0) {
1343 		dma_len -= 1;
1344 		dlen_type = FIELD_PREP(NFDK_DESC_TX_DMA_LEN, dma_len);
1345 		txd->dma_len_type = cpu_to_le16(dlen_type);
1346 		nfp_nfdk_tx_desc_set_dma_addr(txd, dma_addr);
1347 
1348 		dlen_type &= NFDK_DESC_TX_DMA_LEN;
1349 		dma_len -= dlen_type;
1350 		dma_addr += dlen_type + 1;
1351 		txd++;
1352 	}
1353 
1354 	(txd - 1)->dma_len_type = cpu_to_le16(dlen_type | NFDK_DESC_TX_EOP);
1355 
1356 	/* Metadata desc */
1357 	txd->raw = cpu_to_le64(metadata);
1358 	txd++;
1359 
1360 	cnt = txd - tx_ring->ktxds - wr_idx;
1361 	if (unlikely(round_down(wr_idx, NFDK_TX_DESC_BLOCK_CNT) !=
1362 		     round_down(wr_idx + cnt - 1, NFDK_TX_DESC_BLOCK_CNT)))
1363 		goto err_warn_overflow;
1364 
1365 	tx_ring->wr_p += cnt;
1366 	if (tx_ring->wr_p % NFDK_TX_DESC_BLOCK_CNT)
1367 		tx_ring->data_pending += skb->len;
1368 	else
1369 		tx_ring->data_pending = 0;
1370 
1371 	tx_ring->wr_ptr_add += cnt;
1372 	nfp_net_tx_xmit_more_flush(tx_ring);
1373 
1374 	return NETDEV_TX_OK;
1375 
1376 err_warn_overflow:
1377 	WARN_ONCE(1, "unable to fit packet into a descriptor wr_idx:%d head:%d frags:%d cnt:%d",
1378 		  wr_idx, skb_headlen(skb), 0, cnt);
1379 	txbuf--;
1380 	dma_unmap_single(dp->dev, txbuf->dma_addr,
1381 			 skb_headlen(skb), DMA_TO_DEVICE);
1382 	txbuf->raw = 0;
1383 err_warn_dma:
1384 	nn_dp_warn(dp, "Failed to map DMA TX buffer\n");
1385 err_free:
1386 	u64_stats_update_begin(&r_vec->tx_sync);
1387 	r_vec->tx_errors++;
1388 	u64_stats_update_end(&r_vec->tx_sync);
1389 	dev_kfree_skb_any(skb);
1390 	return NETDEV_TX_OK;
1391 }
1392 
1393 static void __nfp_ctrl_tx_queued(struct nfp_net_r_vector *r_vec)
1394 {
1395 	struct sk_buff *skb;
1396 
1397 	while ((skb = __skb_dequeue(&r_vec->queue)))
1398 		if (nfp_nfdk_ctrl_tx_one(r_vec->nfp_net, r_vec, skb, true))
1399 			return;
1400 }
1401 
1402 static bool
1403 nfp_ctrl_meta_ok(struct nfp_net *nn, void *data, unsigned int meta_len)
1404 {
1405 	u32 meta_type, meta_tag;
1406 
1407 	if (!nfp_app_ctrl_has_meta(nn->app))
1408 		return !meta_len;
1409 
1410 	if (meta_len != 8)
1411 		return false;
1412 
1413 	meta_type = get_unaligned_be32(data);
1414 	meta_tag = get_unaligned_be32(data + 4);
1415 
1416 	return (meta_type == NFP_NET_META_PORTID &&
1417 		meta_tag == NFP_META_PORT_ID_CTRL);
1418 }
1419 
1420 static bool
1421 nfp_ctrl_rx_one(struct nfp_net *nn, struct nfp_net_dp *dp,
1422 		struct nfp_net_r_vector *r_vec, struct nfp_net_rx_ring *rx_ring)
1423 {
1424 	unsigned int meta_len, data_len, meta_off, pkt_len, pkt_off;
1425 	struct nfp_net_rx_buf *rxbuf;
1426 	struct nfp_net_rx_desc *rxd;
1427 	dma_addr_t new_dma_addr;
1428 	struct sk_buff *skb;
1429 	void *new_frag;
1430 	int idx;
1431 
1432 	idx = D_IDX(rx_ring, rx_ring->rd_p);
1433 
1434 	rxd = &rx_ring->rxds[idx];
1435 	if (!(rxd->rxd.meta_len_dd & PCIE_DESC_RX_DD))
1436 		return false;
1437 
1438 	/* Memory barrier to ensure that we won't do other reads
1439 	 * before the DD bit.
1440 	 */
1441 	dma_rmb();
1442 
1443 	rx_ring->rd_p++;
1444 
1445 	rxbuf =	&rx_ring->rxbufs[idx];
1446 	meta_len = rxd->rxd.meta_len_dd & PCIE_DESC_RX_META_LEN_MASK;
1447 	data_len = le16_to_cpu(rxd->rxd.data_len);
1448 	pkt_len = data_len - meta_len;
1449 
1450 	pkt_off = NFP_NET_RX_BUF_HEADROOM + dp->rx_dma_off;
1451 	if (dp->rx_offset == NFP_NET_CFG_RX_OFFSET_DYNAMIC)
1452 		pkt_off += meta_len;
1453 	else
1454 		pkt_off += dp->rx_offset;
1455 	meta_off = pkt_off - meta_len;
1456 
1457 	/* Stats update */
1458 	u64_stats_update_begin(&r_vec->rx_sync);
1459 	r_vec->rx_pkts++;
1460 	r_vec->rx_bytes += pkt_len;
1461 	u64_stats_update_end(&r_vec->rx_sync);
1462 
	nfp_net_dma_sync_cpu_rx(dp, rxbuf->dma_addr + meta_off, data_len);
1464 
1465 	if (unlikely(!nfp_ctrl_meta_ok(nn, rxbuf->frag + meta_off, meta_len))) {
1466 		nn_dp_warn(dp, "incorrect metadata for ctrl packet (%d)\n",
1467 			   meta_len);
1468 		nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, NULL);
1469 		return true;
1470 	}
1471 
1472 	skb = build_skb(rxbuf->frag, dp->fl_bufsz);
1473 	if (unlikely(!skb)) {
1474 		nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, NULL);
1475 		return true;
1476 	}
1477 	new_frag = nfp_nfdk_napi_alloc_one(dp, &new_dma_addr);
1478 	if (unlikely(!new_frag)) {
1479 		nfp_nfdk_rx_drop(dp, r_vec, rx_ring, rxbuf, skb);
1480 		return true;
1481 	}
1482 
1483 	nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr);
1484 
1485 	nfp_nfdk_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
1486 
1487 	skb_reserve(skb, pkt_off);
1488 	skb_put(skb, pkt_len);
1489 
1490 	nfp_app_ctrl_rx(nn->app, skb);
1491 
1492 	return true;
1493 }
1494 
1495 static bool nfp_ctrl_rx(struct nfp_net_r_vector *r_vec)
1496 {
1497 	struct nfp_net_rx_ring *rx_ring = r_vec->rx_ring;
1498 	struct nfp_net *nn = r_vec->nfp_net;
1499 	struct nfp_net_dp *dp = &nn->dp;
1500 	unsigned int budget = 512;
1501 
1502 	while (nfp_ctrl_rx_one(nn, dp, r_vec, rx_ring) && budget--)
1503 		continue;
1504 
1505 	return budget;
1506 }
1507 
1508 void nfp_nfdk_ctrl_poll(struct tasklet_struct *t)
1509 {
1510 	struct nfp_net_r_vector *r_vec = from_tasklet(r_vec, t, tasklet);
1511 
1512 	spin_lock(&r_vec->lock);
1513 	nfp_nfdk_tx_complete(r_vec->tx_ring, 0);
1514 	__nfp_ctrl_tx_queued(r_vec);
1515 	spin_unlock(&r_vec->lock);
1516 
1517 	if (nfp_ctrl_rx(r_vec)) {
1518 		nfp_net_irq_unmask(r_vec->nfp_net, r_vec->irq_entry);
1519 	} else {
1520 		tasklet_schedule(&r_vec->tasklet);
1521 		nn_dp_warn(&r_vec->nfp_net->dp,
1522 			   "control message budget exceeded!\n");
1523 	}
1524 }
1525