xref: /openbmc/linux/drivers/net/ethernet/google/gve/gve_tx_dqo.c (revision b1c3d2beed8ef3699fab106340e33a79052df116)
1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2021 Google, Inc.
5  */
6 
7 #include "gve.h"
8 #include "gve_adminq.h"
9 #include "gve_utils.h"
10 #include "gve_dqo.h"
11 #include <linux/tcp.h>
12 #include <linux/slab.h>
13 #include <linux/skbuff.h>
14 
15 /* Returns true if a gve_tx_pending_packet_dqo object is available. */
16 static bool gve_has_pending_packet(struct gve_tx_ring *tx)
17 {
18 	/* Check TX path's list. */
19 	if (tx->dqo_tx.free_pending_packets != -1)
20 		return true;
21 
22 	/* Check completion handler's list. */
23 	if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
24 		return true;
25 
26 	return false;
27 }
28 
29 static struct gve_tx_pending_packet_dqo *
30 gve_alloc_pending_packet(struct gve_tx_ring *tx)
31 {
32 	struct gve_tx_pending_packet_dqo *pending_packet;
33 	s16 index;
34 
35 	index = tx->dqo_tx.free_pending_packets;
36 
37 	/* No pending_packets available, try to steal the list from the
38 	 * completion handler.
39 	 */
40 	if (unlikely(index == -1)) {
41 		tx->dqo_tx.free_pending_packets =
42 			atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
43 		index = tx->dqo_tx.free_pending_packets;
44 
45 		if (unlikely(index == -1))
46 			return NULL;
47 	}
48 
49 	pending_packet = &tx->dqo.pending_packets[index];
50 
51 	/* Remove pending_packet from free list */
52 	tx->dqo_tx.free_pending_packets = pending_packet->next;
53 	pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
54 
55 	return pending_packet;
56 }
57 
58 static void
59 gve_free_pending_packet(struct gve_tx_ring *tx,
60 			struct gve_tx_pending_packet_dqo *pending_packet)
61 {
62 	s16 index = pending_packet - tx->dqo.pending_packets;
63 
64 	pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
65 	while (true) {
66 		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);
67 
68 		pending_packet->next = old_head;
69 		if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
70 				   old_head, index) == old_head) {
71 			break;
72 		}
73 	}
74 }
75 
76 /* gve_tx_free_desc - Cleans up all pending tx requests and buffers.
77  */
78 static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
79 {
80 	int i;
81 
82 	for (i = 0; i < tx->dqo.num_pending_packets; i++) {
83 		struct gve_tx_pending_packet_dqo *cur_state =
84 			&tx->dqo.pending_packets[i];
85 		int j;
86 
87 		for (j = 0; j < cur_state->num_bufs; j++) {
88 			if (j == 0) {
89 				dma_unmap_single(tx->dev,
90 					dma_unmap_addr(cur_state, dma[j]),
91 					dma_unmap_len(cur_state, len[j]),
92 					DMA_TO_DEVICE);
93 			} else {
94 				dma_unmap_page(tx->dev,
95 					dma_unmap_addr(cur_state, dma[j]),
96 					dma_unmap_len(cur_state, len[j]),
97 					DMA_TO_DEVICE);
98 			}
99 		}
100 		if (cur_state->skb) {
101 			dev_consume_skb_any(cur_state->skb);
102 			cur_state->skb = NULL;
103 		}
104 	}
105 }
106 
107 static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
108 {
109 	struct gve_tx_ring *tx = &priv->tx[idx];
110 	struct device *hdev = &priv->pdev->dev;
111 	size_t bytes;
112 
113 	gve_tx_remove_from_block(priv, idx);
114 
115 	if (tx->q_resources) {
116 		dma_free_coherent(hdev, sizeof(*tx->q_resources),
117 				  tx->q_resources, tx->q_resources_bus);
118 		tx->q_resources = NULL;
119 	}
120 
121 	if (tx->dqo.compl_ring) {
122 		bytes = sizeof(tx->dqo.compl_ring[0]) *
123 			(tx->dqo.complq_mask + 1);
124 		dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
125 				  tx->complq_bus_dqo);
126 		tx->dqo.compl_ring = NULL;
127 	}
128 
129 	if (tx->dqo.tx_ring) {
130 		bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
131 		dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
132 		tx->dqo.tx_ring = NULL;
133 	}
134 
135 	kvfree(tx->dqo.pending_packets);
136 	tx->dqo.pending_packets = NULL;
137 
138 	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
139 }
140 
141 static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
142 {
143 	struct gve_tx_ring *tx = &priv->tx[idx];
144 	struct device *hdev = &priv->pdev->dev;
145 	int num_pending_packets;
146 	size_t bytes;
147 	int i;
148 
149 	memset(tx, 0, sizeof(*tx));
150 	tx->q_num = idx;
151 	tx->dev = &priv->pdev->dev;
152 	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
153 	atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);
154 
155 	/* Queue sizes must be a power of 2 */
156 	tx->mask = priv->tx_desc_cnt - 1;
157 	tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;
158 
159 	/* The max number of pending packets determines the maximum number of
160 	 * descriptors which maybe written to the completion queue.
161 	 *
162 	 * We must set the number small enough to make sure we never overrun the
163 	 * completion queue.
164 	 */
165 	num_pending_packets = tx->dqo.complq_mask + 1;
166 
167 	/* Reserve space for descriptor completions, which will be reported at
168 	 * most every GVE_TX_MIN_RE_INTERVAL packets.
169 	 */
170 	num_pending_packets -=
171 		(tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;
172 
173 	/* Each packet may have at most 2 buffer completions if it receives both
174 	 * a miss and reinjection completion.
175 	 */
176 	num_pending_packets /= 2;
177 
178 	tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
179 	tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
180 					   sizeof(tx->dqo.pending_packets[0]),
181 					   GFP_KERNEL);
182 	if (!tx->dqo.pending_packets)
183 		goto err;
184 
185 	/* Set up linked list of pending packets */
186 	for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
187 		tx->dqo.pending_packets[i].next = i + 1;
188 
189 	tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
190 	atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
191 	tx->dqo_compl.miss_completions.head = -1;
192 	tx->dqo_compl.miss_completions.tail = -1;
193 	tx->dqo_compl.timed_out_completions.head = -1;
194 	tx->dqo_compl.timed_out_completions.tail = -1;
195 
196 	bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
197 	tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
198 	if (!tx->dqo.tx_ring)
199 		goto err;
200 
201 	bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
202 	tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
203 						&tx->complq_bus_dqo,
204 						GFP_KERNEL);
205 	if (!tx->dqo.compl_ring)
206 		goto err;
207 
208 	tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
209 					     &tx->q_resources_bus, GFP_KERNEL);
210 	if (!tx->q_resources)
211 		goto err;
212 
213 	gve_tx_add_to_block(priv, idx);
214 
215 	return 0;
216 
217 err:
218 	gve_tx_free_ring_dqo(priv, idx);
219 	return -ENOMEM;
220 }
221 
222 int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
223 {
224 	int err = 0;
225 	int i;
226 
227 	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
228 		err = gve_tx_alloc_ring_dqo(priv, i);
229 		if (err) {
230 			netif_err(priv, drv, priv->dev,
231 				  "Failed to alloc tx ring=%d: err=%d\n",
232 				  i, err);
233 			goto err;
234 		}
235 	}
236 
237 	return 0;
238 
239 err:
240 	for (i--; i >= 0; i--)
241 		gve_tx_free_ring_dqo(priv, i);
242 
243 	return err;
244 }
245 
246 void gve_tx_free_rings_dqo(struct gve_priv *priv)
247 {
248 	int i;
249 
250 	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
251 		struct gve_tx_ring *tx = &priv->tx[i];
252 
253 		gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
254 		netdev_tx_reset_queue(tx->netdev_txq);
255 		gve_tx_clean_pending_packets(tx);
256 
257 		gve_tx_free_ring_dqo(priv, i);
258 	}
259 }
260 
261 /* Returns the number of slots available in the ring */
262 static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
263 {
264 	u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;
265 
266 	return tx->mask - num_used;
267 }
268 
269 /* Stops the queue if available descriptors is less than 'count'.
270  * Return: 0 if stop is not required.
271  */
272 static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
273 {
274 	if (likely(gve_has_pending_packet(tx) &&
275 		   num_avail_tx_slots(tx) >= count))
276 		return 0;
277 
278 	/* Update cached TX head pointer */
279 	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
280 
281 	if (likely(gve_has_pending_packet(tx) &&
282 		   num_avail_tx_slots(tx) >= count))
283 		return 0;
284 
285 	/* No space, so stop the queue */
286 	tx->stop_queue++;
287 	netif_tx_stop_queue(tx->netdev_txq);
288 
289 	/* Sync with restarting queue in `gve_tx_poll_dqo()` */
290 	mb();
291 
292 	/* After stopping queue, check if we can transmit again in order to
293 	 * avoid TOCTOU bug.
294 	 */
295 	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
296 
297 	if (likely(!gve_has_pending_packet(tx) ||
298 		   num_avail_tx_slots(tx) < count))
299 		return -EBUSY;
300 
301 	netif_tx_start_queue(tx->netdev_txq);
302 	tx->wake_queue++;
303 	return 0;
304 }
305 
306 static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
307 					struct gve_tx_metadata_dqo *metadata)
308 {
309 	memset(metadata, 0, sizeof(*metadata));
310 	metadata->version = GVE_TX_METADATA_VERSION_DQO;
311 
312 	if (skb->l4_hash) {
313 		u16 path_hash = skb->hash ^ (skb->hash >> 16);
314 
315 		path_hash &= (1 << 15) - 1;
316 		if (unlikely(path_hash == 0))
317 			path_hash = ~path_hash;
318 
319 		metadata->path_hash = path_hash;
320 	}
321 }
322 
323 static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
324 				     struct sk_buff *skb, u32 len, u64 addr,
325 				     s16 compl_tag, bool eop, bool is_gso)
326 {
327 	const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;
328 
329 	while (len > 0) {
330 		struct gve_tx_pkt_desc_dqo *desc =
331 			&tx->dqo.tx_ring[*desc_idx].pkt;
332 		u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
333 		bool cur_eop = eop && cur_len == len;
334 
335 		*desc = (struct gve_tx_pkt_desc_dqo){
336 			.buf_addr = cpu_to_le64(addr),
337 			.dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
338 			.end_of_packet = cur_eop,
339 			.checksum_offload_enable = checksum_offload_en,
340 			.compl_tag = cpu_to_le16(compl_tag),
341 			.buf_size = cur_len,
342 		};
343 
344 		addr += cur_len;
345 		len -= cur_len;
346 		*desc_idx = (*desc_idx + 1) & tx->mask;
347 	}
348 }
349 
350 /* Validates and prepares `skb` for TSO.
351  *
352  * Returns header length, or < 0 if invalid.
353  */
354 static int gve_prep_tso(struct sk_buff *skb)
355 {
356 	struct tcphdr *tcp;
357 	int header_len;
358 	u32 paylen;
359 	int err;
360 
361 	/* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
362 	 * of the TSO to be <= 262143.
363 	 *
364 	 * However, we don't validate these because:
365 	 * - Hypervisor enforces a limit of 9K MTU
366 	 * - Kernel will not produce a TSO larger than 64k
367 	 */
368 
369 	if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
370 		return -1;
371 
372 	/* Needed because we will modify header. */
373 	err = skb_cow_head(skb, 0);
374 	if (err < 0)
375 		return err;
376 
377 	tcp = tcp_hdr(skb);
378 
379 	/* Remove payload length from checksum. */
380 	paylen = skb->len - skb_transport_offset(skb);
381 
382 	switch (skb_shinfo(skb)->gso_type) {
383 	case SKB_GSO_TCPV4:
384 	case SKB_GSO_TCPV6:
385 		csum_replace_by_diff(&tcp->check,
386 				     (__force __wsum)htonl(paylen));
387 
388 		/* Compute length of segmentation header. */
389 		header_len = skb_tcp_all_headers(skb);
390 		break;
391 	default:
392 		return -EINVAL;
393 	}
394 
395 	if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
396 		return -EINVAL;
397 
398 	return header_len;
399 }
400 
401 static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
402 				     const struct sk_buff *skb,
403 				     const struct gve_tx_metadata_dqo *metadata,
404 				     int header_len)
405 {
406 	*desc = (struct gve_tx_tso_context_desc_dqo){
407 		.header_len = header_len,
408 		.cmd_dtype = {
409 			.dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
410 			.tso = 1,
411 		},
412 		.flex0 = metadata->bytes[0],
413 		.flex5 = metadata->bytes[5],
414 		.flex6 = metadata->bytes[6],
415 		.flex7 = metadata->bytes[7],
416 		.flex8 = metadata->bytes[8],
417 		.flex9 = metadata->bytes[9],
418 		.flex10 = metadata->bytes[10],
419 		.flex11 = metadata->bytes[11],
420 	};
421 	desc->tso_total_len = skb->len - header_len;
422 	desc->mss = skb_shinfo(skb)->gso_size;
423 }
424 
425 static void
426 gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
427 			     const struct gve_tx_metadata_dqo *metadata)
428 {
429 	*desc = (struct gve_tx_general_context_desc_dqo){
430 		.flex0 = metadata->bytes[0],
431 		.flex1 = metadata->bytes[1],
432 		.flex2 = metadata->bytes[2],
433 		.flex3 = metadata->bytes[3],
434 		.flex4 = metadata->bytes[4],
435 		.flex5 = metadata->bytes[5],
436 		.flex6 = metadata->bytes[6],
437 		.flex7 = metadata->bytes[7],
438 		.flex8 = metadata->bytes[8],
439 		.flex9 = metadata->bytes[9],
440 		.flex10 = metadata->bytes[10],
441 		.flex11 = metadata->bytes[11],
442 		.cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
443 	};
444 }
445 
446 /* Returns 0 on success, or < 0 on error.
447  *
448  * Before this function is called, the caller must ensure
449  * gve_has_pending_packet(tx) returns true.
450  */
451 static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
452 				      struct sk_buff *skb)
453 {
454 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
455 	const bool is_gso = skb_is_gso(skb);
456 	u32 desc_idx = tx->dqo_tx.tail;
457 
458 	struct gve_tx_pending_packet_dqo *pkt;
459 	struct gve_tx_metadata_dqo metadata;
460 	s16 completion_tag;
461 	int i;
462 
463 	pkt = gve_alloc_pending_packet(tx);
464 	pkt->skb = skb;
465 	pkt->num_bufs = 0;
466 	completion_tag = pkt - tx->dqo.pending_packets;
467 
468 	gve_extract_tx_metadata_dqo(skb, &metadata);
469 	if (is_gso) {
470 		int header_len = gve_prep_tso(skb);
471 
472 		if (unlikely(header_len < 0))
473 			goto err;
474 
475 		gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
476 					 skb, &metadata, header_len);
477 		desc_idx = (desc_idx + 1) & tx->mask;
478 	}
479 
480 	gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
481 				     &metadata);
482 	desc_idx = (desc_idx + 1) & tx->mask;
483 
484 	/* Note: HW requires that the size of a non-TSO packet be within the
485 	 * range of [17, 9728].
486 	 *
487 	 * We don't double check because
488 	 * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
489 	 * - Hypervisor won't allow MTU larger than 9216.
490 	 */
491 
492 	/* Map the linear portion of skb */
493 	{
494 		u32 len = skb_headlen(skb);
495 		dma_addr_t addr;
496 
497 		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
498 		if (unlikely(dma_mapping_error(tx->dev, addr)))
499 			goto err;
500 
501 		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
502 		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
503 		++pkt->num_bufs;
504 
505 		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
506 					 completion_tag,
507 					 /*eop=*/shinfo->nr_frags == 0, is_gso);
508 	}
509 
510 	for (i = 0; i < shinfo->nr_frags; i++) {
511 		const skb_frag_t *frag = &shinfo->frags[i];
512 		bool is_eop = i == (shinfo->nr_frags - 1);
513 		u32 len = skb_frag_size(frag);
514 		dma_addr_t addr;
515 
516 		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
517 		if (unlikely(dma_mapping_error(tx->dev, addr)))
518 			goto err;
519 
520 		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
521 		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
522 		++pkt->num_bufs;
523 
524 		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
525 					 completion_tag, is_eop, is_gso);
526 	}
527 
528 	/* Commit the changes to our state */
529 	tx->dqo_tx.tail = desc_idx;
530 
531 	/* Request a descriptor completion on the last descriptor of the
532 	 * packet if we are allowed to by the HW enforced interval.
533 	 */
534 	{
535 		u32 last_desc_idx = (desc_idx - 1) & tx->mask;
536 		u32 last_report_event_interval =
537 			(last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;
538 
539 		if (unlikely(last_report_event_interval >=
540 			     GVE_TX_MIN_RE_INTERVAL)) {
541 			tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
542 			tx->dqo_tx.last_re_idx = last_desc_idx;
543 		}
544 	}
545 
546 	return 0;
547 
548 err:
549 	for (i = 0; i < pkt->num_bufs; i++) {
550 		if (i == 0) {
551 			dma_unmap_single(tx->dev,
552 					 dma_unmap_addr(pkt, dma[i]),
553 					 dma_unmap_len(pkt, len[i]),
554 					 DMA_TO_DEVICE);
555 		} else {
556 			dma_unmap_page(tx->dev,
557 				       dma_unmap_addr(pkt, dma[i]),
558 				       dma_unmap_len(pkt, len[i]),
559 				       DMA_TO_DEVICE);
560 		}
561 	}
562 
563 	pkt->skb = NULL;
564 	pkt->num_bufs = 0;
565 	gve_free_pending_packet(tx, pkt);
566 
567 	return -1;
568 }
569 
570 static int gve_num_descs_per_buf(size_t size)
571 {
572 	return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
573 }
574 
575 static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
576 {
577 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
578 	int num_descs;
579 	int i;
580 
581 	num_descs = gve_num_descs_per_buf(skb_headlen(skb));
582 
583 	for (i = 0; i < shinfo->nr_frags; i++) {
584 		unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);
585 
586 		num_descs += gve_num_descs_per_buf(frag_size);
587 	}
588 
589 	return num_descs;
590 }
591 
592 /* Returns true if HW is capable of sending TSO represented by `skb`.
593  *
594  * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
595  * - The header is counted as one buffer for every single segment.
596  * - A buffer which is split between two segments is counted for both.
597  * - If a buffer contains both header and payload, it is counted as two buffers.
598  */
599 static bool gve_can_send_tso(const struct sk_buff *skb)
600 {
601 	const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
602 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
603 	const int header_len = skb_tcp_all_headers(skb);
604 	const int gso_size = shinfo->gso_size;
605 	int cur_seg_num_bufs;
606 	int cur_seg_size;
607 	int i;
608 
609 	cur_seg_size = skb_headlen(skb) - header_len;
610 	cur_seg_num_bufs = cur_seg_size > 0;
611 
612 	for (i = 0; i < shinfo->nr_frags; i++) {
613 		if (cur_seg_size >= gso_size) {
614 			cur_seg_size %= gso_size;
615 			cur_seg_num_bufs = cur_seg_size > 0;
616 		}
617 
618 		if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
619 			return false;
620 
621 		cur_seg_size += skb_frag_size(&shinfo->frags[i]);
622 	}
623 
624 	return true;
625 }
626 
627 /* Attempt to transmit specified SKB.
628  *
629  * Returns 0 if the SKB was transmitted or dropped.
630  * Returns -1 if there is not currently enough space to transmit the SKB.
631  */
632 static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
633 			  struct sk_buff *skb)
634 {
635 	int num_buffer_descs;
636 	int total_num_descs;
637 
638 	if (skb_is_gso(skb)) {
639 		/* If TSO doesn't meet HW requirements, attempt to linearize the
640 		 * packet.
641 		 */
642 		if (unlikely(!gve_can_send_tso(skb) &&
643 			     skb_linearize(skb) < 0)) {
644 			net_err_ratelimited("%s: Failed to transmit TSO packet\n",
645 					    priv->dev->name);
646 			goto drop;
647 		}
648 
649 		num_buffer_descs = gve_num_buffer_descs_needed(skb);
650 	} else {
651 		num_buffer_descs = gve_num_buffer_descs_needed(skb);
652 
653 		if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
654 			if (unlikely(skb_linearize(skb) < 0))
655 				goto drop;
656 
657 			num_buffer_descs = 1;
658 		}
659 	}
660 
661 	/* Metadata + (optional TSO) + data descriptors. */
662 	total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
663 	if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
664 			GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
665 		return -1;
666 	}
667 
668 	if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
669 		goto drop;
670 
671 	netdev_tx_sent_queue(tx->netdev_txq, skb->len);
672 	skb_tx_timestamp(skb);
673 	return 0;
674 
675 drop:
676 	tx->dropped_pkt++;
677 	dev_kfree_skb_any(skb);
678 	return 0;
679 }
680 
681 /* Transmit a given skb and ring the doorbell. */
682 netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
683 {
684 	struct gve_priv *priv = netdev_priv(dev);
685 	struct gve_tx_ring *tx;
686 
687 	tx = &priv->tx[skb_get_queue_mapping(skb)];
688 	if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
689 		/* We need to ring the txq doorbell -- we have stopped the Tx
690 		 * queue for want of resources, but prior calls to gve_tx()
691 		 * may have added descriptors without ringing the doorbell.
692 		 */
693 		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
694 		return NETDEV_TX_BUSY;
695 	}
696 
697 	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
698 		return NETDEV_TX_OK;
699 
700 	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
701 	return NETDEV_TX_OK;
702 }
703 
704 static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
705 			struct gve_tx_pending_packet_dqo *pending_packet)
706 {
707 	s16 old_tail, index;
708 
709 	index = pending_packet - tx->dqo.pending_packets;
710 	old_tail = list->tail;
711 	list->tail = index;
712 	if (old_tail == -1)
713 		list->head = index;
714 	else
715 		tx->dqo.pending_packets[old_tail].next = index;
716 
717 	pending_packet->next = -1;
718 	pending_packet->prev = old_tail;
719 }
720 
721 static void remove_from_list(struct gve_tx_ring *tx,
722 			     struct gve_index_list *list,
723 			     struct gve_tx_pending_packet_dqo *pkt)
724 {
725 	s16 prev_index, next_index;
726 
727 	prev_index = pkt->prev;
728 	next_index = pkt->next;
729 
730 	if (prev_index == -1) {
731 		/* Node is head */
732 		list->head = next_index;
733 	} else {
734 		tx->dqo.pending_packets[prev_index].next = next_index;
735 	}
736 	if (next_index == -1) {
737 		/* Node is tail */
738 		list->tail = prev_index;
739 	} else {
740 		tx->dqo.pending_packets[next_index].prev = prev_index;
741 	}
742 }
743 
744 static void gve_unmap_packet(struct device *dev,
745 			     struct gve_tx_pending_packet_dqo *pkt)
746 {
747 	int i;
748 
749 	/* SKB linear portion is guaranteed to be mapped */
750 	dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
751 			 dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
752 	for (i = 1; i < pkt->num_bufs; i++) {
753 		dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
754 			       dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
755 	}
756 	pkt->num_bufs = 0;
757 }
758 
759 /* Completion types and expected behavior:
760  * No Miss compl + Packet compl = Packet completed normally.
761  * Miss compl + Re-inject compl = Packet completed normally.
762  * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
763  * Miss compl + Packet compl = Skipped i.e. packet not completed.
764  */
765 static void gve_handle_packet_completion(struct gve_priv *priv,
766 					 struct gve_tx_ring *tx, bool is_napi,
767 					 u16 compl_tag, u64 *bytes, u64 *pkts,
768 					 bool is_reinjection)
769 {
770 	struct gve_tx_pending_packet_dqo *pending_packet;
771 
772 	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
773 		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
774 				    priv->dev->name, (int)compl_tag);
775 		return;
776 	}
777 
778 	pending_packet = &tx->dqo.pending_packets[compl_tag];
779 
780 	if (unlikely(is_reinjection)) {
781 		if (unlikely(pending_packet->state ==
782 			     GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
783 			net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
784 					    priv->dev->name, (int)compl_tag);
785 			/* Packet was already completed as a result of timeout,
786 			 * so just remove from list and free pending packet.
787 			 */
788 			remove_from_list(tx,
789 					 &tx->dqo_compl.timed_out_completions,
790 					 pending_packet);
791 			gve_free_pending_packet(tx, pending_packet);
792 			return;
793 		}
794 		if (unlikely(pending_packet->state !=
795 			     GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
796 			/* No outstanding miss completion but packet allocated
797 			 * implies packet receives a re-injection completion
798 			 * without a prior miss completion. Return without
799 			 * completing the packet.
800 			 */
801 			net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
802 					    priv->dev->name, (int)compl_tag);
803 			return;
804 		}
805 		remove_from_list(tx, &tx->dqo_compl.miss_completions,
806 				 pending_packet);
807 	} else {
808 		/* Packet is allocated but not a pending data completion. */
809 		if (unlikely(pending_packet->state !=
810 			     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
811 			net_err_ratelimited("%s: No pending data completion: %d\n",
812 					    priv->dev->name, (int)compl_tag);
813 			return;
814 		}
815 	}
816 	gve_unmap_packet(tx->dev, pending_packet);
817 
818 	*bytes += pending_packet->skb->len;
819 	(*pkts)++;
820 	napi_consume_skb(pending_packet->skb, is_napi);
821 	pending_packet->skb = NULL;
822 	gve_free_pending_packet(tx, pending_packet);
823 }
824 
825 static void gve_handle_miss_completion(struct gve_priv *priv,
826 				       struct gve_tx_ring *tx, u16 compl_tag,
827 				       u64 *bytes, u64 *pkts)
828 {
829 	struct gve_tx_pending_packet_dqo *pending_packet;
830 
831 	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
832 		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
833 				    priv->dev->name, (int)compl_tag);
834 		return;
835 	}
836 
837 	pending_packet = &tx->dqo.pending_packets[compl_tag];
838 	if (unlikely(pending_packet->state !=
839 				GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
840 		net_err_ratelimited("%s: Unexpected packet state: %d for completion tag : %d\n",
841 				    priv->dev->name, (int)pending_packet->state,
842 				    (int)compl_tag);
843 		return;
844 	}
845 
846 	pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
847 	/* jiffies can wraparound but time comparisons can handle overflows. */
848 	pending_packet->timeout_jiffies =
849 			jiffies +
850 			msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
851 					 MSEC_PER_SEC);
852 	add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);
853 
854 	*bytes += pending_packet->skb->len;
855 	(*pkts)++;
856 }
857 
858 static void remove_miss_completions(struct gve_priv *priv,
859 				    struct gve_tx_ring *tx)
860 {
861 	struct gve_tx_pending_packet_dqo *pending_packet;
862 	s16 next_index;
863 
864 	next_index = tx->dqo_compl.miss_completions.head;
865 	while (next_index != -1) {
866 		pending_packet = &tx->dqo.pending_packets[next_index];
867 		next_index = pending_packet->next;
868 		/* Break early because packets should timeout in order. */
869 		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
870 			break;
871 
872 		remove_from_list(tx, &tx->dqo_compl.miss_completions,
873 				 pending_packet);
874 		/* Unmap buffers and free skb but do not unallocate packet i.e.
875 		 * the completion tag is not freed to ensure that the driver
876 		 * can take appropriate action if a corresponding valid
877 		 * completion is received later.
878 		 */
879 		gve_unmap_packet(tx->dev, pending_packet);
880 		/* This indicates the packet was dropped. */
881 		dev_kfree_skb_any(pending_packet->skb);
882 		pending_packet->skb = NULL;
883 		tx->dropped_pkt++;
884 		net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
885 				    priv->dev->name,
886 				    (int)(pending_packet - tx->dqo.pending_packets));
887 
888 		pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
889 		pending_packet->timeout_jiffies =
890 				jiffies +
891 				msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
892 						 MSEC_PER_SEC);
893 		/* Maintain pending packet in another list so the packet can be
894 		 * unallocated at a later time.
895 		 */
896 		add_to_list(tx, &tx->dqo_compl.timed_out_completions,
897 			    pending_packet);
898 	}
899 }
900 
901 static void remove_timed_out_completions(struct gve_priv *priv,
902 					 struct gve_tx_ring *tx)
903 {
904 	struct gve_tx_pending_packet_dqo *pending_packet;
905 	s16 next_index;
906 
907 	next_index = tx->dqo_compl.timed_out_completions.head;
908 	while (next_index != -1) {
909 		pending_packet = &tx->dqo.pending_packets[next_index];
910 		next_index = pending_packet->next;
911 		/* Break early because packets should timeout in order. */
912 		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
913 			break;
914 
915 		remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
916 				 pending_packet);
917 		gve_free_pending_packet(tx, pending_packet);
918 	}
919 }
920 
921 int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
922 			  struct napi_struct *napi)
923 {
924 	u64 reinject_compl_bytes = 0;
925 	u64 reinject_compl_pkts = 0;
926 	int num_descs_cleaned = 0;
927 	u64 miss_compl_bytes = 0;
928 	u64 miss_compl_pkts = 0;
929 	u64 pkt_compl_bytes = 0;
930 	u64 pkt_compl_pkts = 0;
931 
932 	/* Limit in order to avoid blocking for too long */
933 	while (!napi || pkt_compl_pkts < napi->weight) {
934 		struct gve_tx_compl_desc *compl_desc =
935 			&tx->dqo.compl_ring[tx->dqo_compl.head];
936 		u16 type;
937 
938 		if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
939 			break;
940 
941 		/* Prefetch the next descriptor. */
942 		prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
943 				tx->dqo.complq_mask]);
944 
945 		/* Do not read data until we own the descriptor */
946 		dma_rmb();
947 		type = compl_desc->type;
948 
949 		if (type == GVE_COMPL_TYPE_DQO_DESC) {
950 			/* This is the last descriptor fetched by HW plus one */
951 			u16 tx_head = le16_to_cpu(compl_desc->tx_head);
952 
953 			atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
954 		} else if (type == GVE_COMPL_TYPE_DQO_PKT) {
955 			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
956 			if (compl_tag & GVE_ALT_MISS_COMPL_BIT) {
957 				compl_tag &= ~GVE_ALT_MISS_COMPL_BIT;
958 				gve_handle_miss_completion(priv, tx, compl_tag,
959 							   &miss_compl_bytes,
960 							   &miss_compl_pkts);
961 			} else {
962 				gve_handle_packet_completion(priv, tx, !!napi,
963 							     compl_tag,
964 							     &pkt_compl_bytes,
965 							     &pkt_compl_pkts,
966 							     false);
967 			}
968 		} else if (type == GVE_COMPL_TYPE_DQO_MISS) {
969 			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
970 
971 			gve_handle_miss_completion(priv, tx, compl_tag,
972 						   &miss_compl_bytes,
973 						   &miss_compl_pkts);
974 		} else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
975 			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);
976 
977 			gve_handle_packet_completion(priv, tx, !!napi,
978 						     compl_tag,
979 						     &reinject_compl_bytes,
980 						     &reinject_compl_pkts,
981 						     true);
982 		}
983 
984 		tx->dqo_compl.head =
985 			(tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
986 		/* Flip the generation bit when we wrap around */
987 		tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
988 		num_descs_cleaned++;
989 	}
990 
991 	netdev_tx_completed_queue(tx->netdev_txq,
992 				  pkt_compl_pkts + miss_compl_pkts,
993 				  pkt_compl_bytes + miss_compl_bytes);
994 
995 	remove_miss_completions(priv, tx);
996 	remove_timed_out_completions(priv, tx);
997 
998 	u64_stats_update_begin(&tx->statss);
999 	tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
1000 	tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
1001 	u64_stats_update_end(&tx->statss);
1002 	return num_descs_cleaned;
1003 }
1004 
1005 bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
1006 {
1007 	struct gve_tx_compl_desc *compl_desc;
1008 	struct gve_tx_ring *tx = block->tx;
1009 	struct gve_priv *priv = block->priv;
1010 
1011 	if (do_clean) {
1012 		int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
1013 							      &block->napi);
1014 
1015 		/* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
1016 		mb();
1017 
1018 		if (netif_tx_queue_stopped(tx->netdev_txq) &&
1019 		    num_descs_cleaned > 0) {
1020 			tx->wake_queue++;
1021 			netif_tx_wake_queue(tx->netdev_txq);
1022 		}
1023 	}
1024 
1025 	/* Return true if we still have work. */
1026 	compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
1027 	return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
1028 }
1029