// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include "gve_dqo.h"
#include <net/ip.h>
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/skbuff.h>

/* Returns true if a gve_tx_pending_packet_dqo object is available. */
static bool gve_has_pending_packet(struct gve_tx_ring *tx)
{
	/* Check TX path's list. */
	if (tx->dqo_tx.free_pending_packets != -1)
		return true;

	/* Check completion handler's list. */
	if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
		return true;

	return false;
}

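/* Pop a pending packet off the TX path's free list, stealing the completion
 * handler's free list first if the TX path's list is empty. Returns NULL if
 * neither list has an entry.
 */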
static struct gve_tx_pending_packet_dqo *
gve_alloc_pending_packet(struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 index;

	index = tx->dqo_tx.free_pending_packets;

	/* No pending_packets available, try to steal the list from the
	 * completion handler.
	 */
	if (unlikely(index == -1)) {
		tx->dqo_tx.free_pending_packets =
			atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
		index = tx->dqo_tx.free_pending_packets;

		if (unlikely(index == -1))
			return NULL;
	}

	pending_packet = &tx->dqo.pending_packets[index];

	/* Remove pending_packet from free list */
	tx->dqo_tx.free_pending_packets = pending_packet->next;
	pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;

	return pending_packet;
}

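/* Push a pending packet onto the completion handler's free list with a
 * lock-free cmpxchg loop, since the TX path may concurrently steal the list.
 */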
static void
gve_free_pending_packet(struct gve_tx_ring *tx,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 index = pending_packet - tx->dqo.pending_packets;

	pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
	while (true) {
		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);

		pending_packet->next = old_head;
		if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
				   old_head, index) == old_head) {
			break;
		}
	}
}

/* gve_tx_clean_pending_packets - Cleans up all pending TX requests and
 * buffers.
 */
static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
{
	int i;

	for (i = 0; i < tx->dqo.num_pending_packets; i++) {
		struct gve_tx_pending_packet_dqo *cur_state =
			&tx->dqo.pending_packets[i];
		int j;

		for (j = 0; j < cur_state->num_bufs; j++) {
			if (j == 0) {
				dma_unmap_single(tx->dev,
					dma_unmap_addr(cur_state, dma[j]),
					dma_unmap_len(cur_state, len[j]),
					DMA_TO_DEVICE);
			} else {
				dma_unmap_page(tx->dev,
					dma_unmap_addr(cur_state, dma[j]),
					dma_unmap_len(cur_state, len[j]),
					DMA_TO_DEVICE);
			}
		}
		if (cur_state->skb) {
			dev_consume_skb_any(cur_state->skb);
			cur_state->skb = NULL;
		}
	}
}

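/* Detach TX ring `idx` from its notify block and free its queue resources,
 * completion ring, descriptor ring, and pending-packet array.
 */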
static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	size_t bytes;

	gve_tx_remove_from_block(priv, idx);

	if (tx->q_resources) {
		dma_free_coherent(hdev, sizeof(*tx->q_resources),
				  tx->q_resources, tx->q_resources_bus);
		tx->q_resources = NULL;
	}

	if (tx->dqo.compl_ring) {
		bytes = sizeof(tx->dqo.compl_ring[0]) *
			(tx->dqo.complq_mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
				  tx->complq_bus_dqo);
		tx->dqo.compl_ring = NULL;
	}

	if (tx->dqo.tx_ring) {
		bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
		tx->dqo.tx_ring = NULL;
	}

	kvfree(tx->dqo.pending_packets);
	tx->dqo.pending_packets = NULL;

	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

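/* Allocate the descriptor ring, completion ring, queue resources, and
 * pending-packet array for TX ring `idx`. Any partial allocations are freed
 * on failure.
 */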
static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	int num_pending_packets;
	size_t bytes;
	int i;

	memset(tx, 0, sizeof(*tx));
	tx->q_num = idx;
	tx->dev = &priv->pdev->dev;
	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
	atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);

	/* Queue sizes must be a power of 2 */
	tx->mask = priv->tx_desc_cnt - 1;
	tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;

	/* The max number of pending packets determines the maximum number of
	 * descriptors which may be written to the completion queue.
	 *
	 * We must set the number small enough to make sure we never overrun the
	 * completion queue.
	 */
	num_pending_packets = tx->dqo.complq_mask + 1;

	/* Reserve space for descriptor completions, which will be reported at
	 * most every GVE_TX_MIN_RE_INTERVAL packets.
	 */
	num_pending_packets -=
		(tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;

	/* Each packet may have at most 2 buffer completions if it receives both
	 * a miss and reinjection completion.
	 */
	num_pending_packets /= 2;

	tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
	tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
					   sizeof(tx->dqo.pending_packets[0]),
					   GFP_KERNEL);
	if (!tx->dqo.pending_packets)
		goto err;

	/* Set up linked list of pending packets */
	for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
		tx->dqo.pending_packets[i].next = i + 1;

	tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
	atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
	tx->dqo_compl.miss_completions.head = -1;
	tx->dqo_compl.miss_completions.tail = -1;
	tx->dqo_compl.timed_out_completions.head = -1;
	tx->dqo_compl.timed_out_completions.tail = -1;

	bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
	tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
	if (!tx->dqo.tx_ring)
		goto err;

	bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
	tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
						&tx->complq_bus_dqo,
						GFP_KERNEL);
	if (!tx->dqo.compl_ring)
		goto err;

	tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
					     &tx->q_resources_bus, GFP_KERNEL);
	if (!tx->q_resources)
		goto err;

	gve_tx_add_to_block(priv, idx);

	return 0;

err:
	gve_tx_free_ring_dqo(priv, idx);
	return -ENOMEM;
}

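/* Allocate every configured TX ring; on failure, free the rings allocated so
 * far and return the error.
 */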
int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
{
	int err = 0;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		err = gve_tx_alloc_ring_dqo(priv, i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	return 0;

err:
	for (i--; i >= 0; i--)
		gve_tx_free_ring_dqo(priv, i);

	return err;
}

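/* Drain outstanding completions, reclaim any still-pending packets, and free
 * every TX ring.
 */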
void gve_tx_free_rings_dqo(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		struct gve_tx_ring *tx = &priv->tx[i];

		gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
		netdev_tx_reset_queue(tx->netdev_txq);
		gve_tx_clean_pending_packets(tx);

		gve_tx_free_ring_dqo(priv, i);
	}
}

/* Returns the number of slots available in the ring */
static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
{
	u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;

	return tx->mask - num_used;
}

/* Stops the queue if the number of available descriptors is less than 'count'.
 * Return: 0 if stopping the queue is not required.
 */
static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
{
	if (likely(gve_has_pending_packet(tx) &&
		   num_avail_tx_slots(tx) >= count))
		return 0;

	/* Update cached TX head pointer */
	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

	if (likely(gve_has_pending_packet(tx) &&
		   num_avail_tx_slots(tx) >= count))
		return 0;

	/* No space, so stop the queue */
	tx->stop_queue++;
	netif_tx_stop_queue(tx->netdev_txq);

	/* Sync with restarting queue in `gve_tx_poll_dqo()` */
	mb();

	/* After stopping the queue, check if we can transmit again in order to
	 * avoid a TOCTOU bug.
	 */
	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

	if (likely(!gve_has_pending_packet(tx) ||
		   num_avail_tx_slots(tx) < count))
		return -EBUSY;

	netif_tx_start_queue(tx->netdev_txq);
	tx->wake_queue++;
	return 0;
}

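/* Derive TX metadata from the skb. Currently only a non-zero, 15-bit path
 * hash is filled in, and only when the skb carries an L4 hash.
 */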
static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
					struct gve_tx_metadata_dqo *metadata)
{
	memset(metadata, 0, sizeof(*metadata));
	metadata->version = GVE_TX_METADATA_VERSION_DQO;

	if (skb->l4_hash) {
		u16 path_hash = skb->hash ^ (skb->hash >> 16);

		path_hash &= (1 << 15) - 1;
		if (unlikely(path_hash == 0))
			path_hash = ~path_hash;

		metadata->path_hash = path_hash;
	}
}

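/* Write one or more packet descriptors for a single buffer, splitting it into
 * chunks of at most GVE_TX_MAX_BUF_SIZE_DQO bytes. Advances *desc_idx past
 * the descriptors written.
 */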
static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
				     struct sk_buff *skb, u32 len, u64 addr,
				     s16 compl_tag, bool eop, bool is_gso)
{
	const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;

	while (len > 0) {
		struct gve_tx_pkt_desc_dqo *desc =
			&tx->dqo.tx_ring[*desc_idx].pkt;
		u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
		bool cur_eop = eop && cur_len == len;

		*desc = (struct gve_tx_pkt_desc_dqo){
			.buf_addr = cpu_to_le64(addr),
			.dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
			.end_of_packet = cur_eop,
			.checksum_offload_enable = checksum_offload_en,
			.compl_tag = cpu_to_le16(compl_tag),
			.buf_size = cur_len,
		};

		addr += cur_len;
		len -= cur_len;
		*desc_idx = (*desc_idx + 1) & tx->mask;
	}
}

/* Validates and prepares `skb` for TSO.
 *
 * Returns header length, or < 0 if invalid.
 */
static int gve_prep_tso(struct sk_buff *skb)
{
	struct tcphdr *tcp;
	int header_len;
	u32 paylen;
	int err;

	/* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
	 * of the TSO to be <= 262143.
	 *
	 * However, we don't validate these because:
	 * - Hypervisor enforces a limit of 9K MTU
	 * - Kernel will not produce a TSO larger than 64k
	 */

	if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
		return -1;

	/* Needed because we will modify header. */
	err = skb_cow_head(skb, 0);
	if (err < 0)
		return err;

	tcp = tcp_hdr(skb);

	/* Remove payload length from checksum. */
	paylen = skb->len - skb_transport_offset(skb);

	switch (skb_shinfo(skb)->gso_type) {
	case SKB_GSO_TCPV4:
	case SKB_GSO_TCPV6:
		csum_replace_by_diff(&tcp->check,
				     (__force __wsum)htonl(paylen));

		/* Compute length of segmentation header. */
		header_len = skb_tcp_all_headers(skb);
		break;
	default:
		return -EINVAL;
	}

	if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
		return -EINVAL;

	return header_len;
}

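/* Fill the TSO context descriptor from the skb's GSO state and the extracted
 * metadata bytes.
 */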
static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
				     const struct sk_buff *skb,
				     const struct gve_tx_metadata_dqo *metadata,
				     int header_len)
{
	*desc = (struct gve_tx_tso_context_desc_dqo){
		.header_len = header_len,
		.cmd_dtype = {
			.dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
			.tso = 1,
		},
		.flex0 = metadata->bytes[0],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
	};
	desc->tso_total_len = skb->len - header_len;
	desc->mss = skb_shinfo(skb)->gso_size;
}

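/* Fill the general context descriptor, which carries only metadata bytes. */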
static void
gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
			     const struct gve_tx_metadata_dqo *metadata)
{
	*desc = (struct gve_tx_general_context_desc_dqo){
		.flex0 = metadata->bytes[0],
		.flex1 = metadata->bytes[1],
		.flex2 = metadata->bytes[2],
		.flex3 = metadata->bytes[3],
		.flex4 = metadata->bytes[4],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
		.cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
	};
}

/* Returns 0 on success, or < 0 on error.
 *
 * Before this function is called, the caller must ensure
 * gve_has_pending_packet(tx) returns true.
 */
static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
				      struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	const bool is_gso = skb_is_gso(skb);
	u32 desc_idx = tx->dqo_tx.tail;

	struct gve_tx_pending_packet_dqo *pkt;
	struct gve_tx_metadata_dqo metadata;
	s16 completion_tag;
	int i;

	pkt = gve_alloc_pending_packet(tx);
	pkt->skb = skb;
	pkt->num_bufs = 0;
	completion_tag = pkt - tx->dqo.pending_packets;

	gve_extract_tx_metadata_dqo(skb, &metadata);
	if (is_gso) {
		int header_len = gve_prep_tso(skb);

		if (unlikely(header_len < 0))
			goto err;

		gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
					 skb, &metadata, header_len);
		desc_idx = (desc_idx + 1) & tx->mask;
	}

	gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
				     &metadata);
	desc_idx = (desc_idx + 1) & tx->mask;

	/* Note: HW requires that the size of a non-TSO packet be within the
	 * range of [17, 9728].
	 *
	 * We don't double check because
	 * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
	 * - Hypervisor won't allow MTU larger than 9216.
	 */

	/* Map the linear portion of skb */
	{
		u32 len = skb_headlen(skb);
		dma_addr_t addr;

		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
		++pkt->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
					 completion_tag,
					 /*eop=*/shinfo->nr_frags == 0, is_gso);
	}

	for (i = 0; i < shinfo->nr_frags; i++) {
		const skb_frag_t *frag = &shinfo->frags[i];
		bool is_eop = i == (shinfo->nr_frags - 1);
		u32 len = skb_frag_size(frag);
		dma_addr_t addr;

		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
		++pkt->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
					 completion_tag, is_eop, is_gso);
	}

	/* Commit the changes to our state */
	tx->dqo_tx.tail = desc_idx;

	/* Request a descriptor completion on the last descriptor of the
	 * packet if we are allowed to by the HW enforced interval.
	 */
	{
		u32 last_desc_idx = (desc_idx - 1) & tx->mask;
		u32 last_report_event_interval =
			(last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;

		if (unlikely(last_report_event_interval >=
			     GVE_TX_MIN_RE_INTERVAL)) {
			tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
			tx->dqo_tx.last_re_idx = last_desc_idx;
		}
	}

	return 0;

err:
	for (i = 0; i < pkt->num_bufs; i++) {
		if (i == 0) {
			dma_unmap_single(tx->dev,
					 dma_unmap_addr(pkt, dma[i]),
					 dma_unmap_len(pkt, len[i]),
					 DMA_TO_DEVICE);
		} else {
			dma_unmap_page(tx->dev,
				       dma_unmap_addr(pkt, dma[i]),
				       dma_unmap_len(pkt, len[i]),
				       DMA_TO_DEVICE);
		}
	}

	pkt->skb = NULL;
	pkt->num_bufs = 0;
	gve_free_pending_packet(tx, pkt);

	return -1;
}

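/* Number of packet descriptors needed to cover a buffer of `size` bytes. */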
static int gve_num_descs_per_buf(size_t size)
{
	return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
}

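/* Total packet descriptors needed for the skb's linear data plus all frags. */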
static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int num_descs;
	int i;

	num_descs = gve_num_descs_per_buf(skb_headlen(skb));

	for (i = 0; i < shinfo->nr_frags; i++) {
		unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);

		num_descs += gve_num_descs_per_buf(frag_size);
	}

	return num_descs;
}

/* Returns true if HW is capable of sending TSO represented by `skb`.
 *
 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
 * - The header is counted as one buffer for every single segment.
 * - A buffer which is split between two segments is counted for both.
 * - If a buffer contains both header and payload, it is counted as two buffers.
 */
static bool gve_can_send_tso(const struct sk_buff *skb)
{
	const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	const int header_len = skb_tcp_all_headers(skb);
	const int gso_size = shinfo->gso_size;
	int cur_seg_num_bufs;
	int cur_seg_size;
	int i;

	cur_seg_size = skb_headlen(skb) - header_len;
	cur_seg_num_bufs = cur_seg_size > 0;

	for (i = 0; i < shinfo->nr_frags; i++) {
		if (cur_seg_size >= gso_size) {
			cur_seg_size %= gso_size;
			cur_seg_num_bufs = cur_seg_size > 0;
		}

		if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
			return false;

		cur_seg_size += skb_frag_size(&shinfo->frags[i]);
	}

	return true;
}

/* Attempt to transmit specified SKB.
 *
 * Returns 0 if the SKB was transmitted or dropped.
 * Returns -1 if there is not currently enough space to transmit the SKB.
 */
static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
			  struct sk_buff *skb)
{
	int num_buffer_descs;
	int total_num_descs;

	if (skb_is_gso(skb)) {
		/* If TSO doesn't meet HW requirements, attempt to linearize the
		 * packet.
		 */
		if (unlikely(!gve_can_send_tso(skb) &&
			     skb_linearize(skb) < 0)) {
			net_err_ratelimited("%s: Failed to transmit TSO packet\n",
					    priv->dev->name);
			goto drop;
		}

		if (unlikely(ipv6_hopopt_jumbo_remove(skb)))
			goto drop;

		num_buffer_descs = gve_num_buffer_descs_needed(skb);
	} else {
		num_buffer_descs = gve_num_buffer_descs_needed(skb);

		if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
			if (unlikely(skb_linearize(skb) < 0))
				goto drop;

			num_buffer_descs = 1;
		}
	}

	/* Metadata + (optional TSO) + data descriptors. */
	total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
	if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
			GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
		return -1;
	}

	if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
		goto drop;

	netdev_tx_sent_queue(tx->netdev_txq, skb->len);
	skb_tx_timestamp(skb);
	return 0;

drop:
	tx->dropped_pkt++;
	dev_kfree_skb_any(skb);
	return 0;
}

/* Transmit a given skb and ring the doorbell. */
netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;

	tx = &priv->tx[skb_get_queue_mapping(skb)];
	if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
		/* We need to ring the txq doorbell -- we have stopped the Tx
		 * queue for want of resources, but prior calls to gve_tx()
		 * may have added descriptors without ringing the doorbell.
		 */
		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
		return NETDEV_TX_BUSY;
	}

	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
		return NETDEV_TX_OK;

	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
	return NETDEV_TX_OK;
}

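/* Append a pending packet to the tail of an index-based list. */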
static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 old_tail, index;

	index = pending_packet - tx->dqo.pending_packets;
	old_tail = list->tail;
	list->tail = index;
	if (old_tail == -1)
		list->head = index;
	else
		tx->dqo.pending_packets[old_tail].next = index;

	pending_packet->next = -1;
	pending_packet->prev = old_tail;
}

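/* Unlink a pending packet from an index-based list, fixing up head/tail. */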
static void remove_from_list(struct gve_tx_ring *tx,
			     struct gve_index_list *list,
			     struct gve_tx_pending_packet_dqo *pkt)
{
	s16 prev_index, next_index;

	prev_index = pkt->prev;
	next_index = pkt->next;

	if (prev_index == -1) {
		/* Node is head */
		list->head = next_index;
	} else {
		tx->dqo.pending_packets[prev_index].next = next_index;
	}
	if (next_index == -1) {
		/* Node is tail */
		list->tail = prev_index;
	} else {
		tx->dqo.pending_packets[next_index].prev = prev_index;
	}
}

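/* Unmap every DMA buffer recorded for the packet and reset its buffer count. */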
static void gve_unmap_packet(struct device *dev,
			     struct gve_tx_pending_packet_dqo *pkt)
{
	int i;

	/* SKB linear portion is guaranteed to be mapped */
	dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
			 dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
	for (i = 1; i < pkt->num_bufs; i++) {
		dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
			       dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
	}
	pkt->num_bufs = 0;
}

/* Completion types and expected behavior:
 * No Miss compl + Packet compl = Packet completed normally.
 * Miss compl + Re-inject compl = Packet completed normally.
 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
 * Miss compl + Packet compl = Skipped i.e. packet not completed.
 */
static void gve_handle_packet_completion(struct gve_priv *priv,
					 struct gve_tx_ring *tx, bool is_napi,
					 u16 compl_tag, u64 *bytes, u64 *pkts,
					 bool is_reinjection)
{
	struct gve_tx_pending_packet_dqo *pending_packet;

	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
				    priv->dev->name, (int)compl_tag);
		return;
	}

	pending_packet = &tx->dqo.pending_packets[compl_tag];

	if (unlikely(is_reinjection)) {
		if (unlikely(pending_packet->state ==
			     GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
			net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
					    priv->dev->name, (int)compl_tag);
			/* Packet was already completed as a result of timeout,
			 * so just remove from list and free pending packet.
			 */
			remove_from_list(tx,
					 &tx->dqo_compl.timed_out_completions,
					 pending_packet);
			gve_free_pending_packet(tx, pending_packet);
			return;
		}
		if (unlikely(pending_packet->state !=
			     GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
			/* The packet is allocated but has no outstanding miss
			 * completion, which means it received a re-injection
			 * completion without a prior miss completion. Return
			 * without completing the packet.
			 */
			net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
					    priv->dev->name, (int)compl_tag);
			return;
		}
		remove_from_list(tx, &tx->dqo_compl.miss_completions,
				 pending_packet);
	} else {
		/* Packet is allocated but not pending a data completion. */
		if (unlikely(pending_packet->state !=
			     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
			net_err_ratelimited("%s: No pending data completion: %d\n",
					    priv->dev->name, (int)compl_tag);
			return;
		}
	}
	gve_unmap_packet(tx->dev, pending_packet);

	*bytes += pending_packet->skb->len;
	(*pkts)++;
	napi_consume_skb(pending_packet->skb, is_napi);
	pending_packet->skb = NULL;
	gve_free_pending_packet(tx, pending_packet);
}

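/* Handle a miss completion: mark the packet as waiting for a re-injection
 * completion and place it on the miss_completions list with a timeout.
 */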
static void gve_handle_miss_completion(struct gve_priv *priv,
				       struct gve_tx_ring *tx, u16 compl_tag,
				       u64 *bytes, u64 *pkts)
{
	struct gve_tx_pending_packet_dqo *pending_packet;

	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
				    priv->dev->name, (int)compl_tag);
		return;
	}

	pending_packet = &tx->dqo.pending_packets[compl_tag];
	if (unlikely(pending_packet->state !=
				GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
		net_err_ratelimited("%s: Unexpected packet state: %d for completion tag: %d\n",
				    priv->dev->name, (int)pending_packet->state,
				    (int)compl_tag);
		return;
	}

	pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
	/* jiffies can wraparound but time comparisons can handle overflows. */
	pending_packet->timeout_jiffies =
			jiffies +
			msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
					 MSEC_PER_SEC);
	add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);

	*bytes += pending_packet->skb->len;
	(*pkts)++;
}

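/* Walk the miss_completions list and drop packets whose re-injection
 * completion never arrived before the timeout. Their completion tags stay
 * allocated, on the timed_out_completions list, in case a late completion
 * still shows up.
 */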
static void remove_miss_completions(struct gve_priv *priv,
				    struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 next_index;

	next_index = tx->dqo_compl.miss_completions.head;
	while (next_index != -1) {
		pending_packet = &tx->dqo.pending_packets[next_index];
		next_index = pending_packet->next;
		/* Break early because packets should time out in order. */
		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
			break;

		remove_from_list(tx, &tx->dqo_compl.miss_completions,
				 pending_packet);
		/* Unmap buffers and free the skb but do not unallocate the
		 * packet, i.e. the completion tag is not freed, so that the
		 * driver can take appropriate action if a corresponding valid
		 * completion is received later.
		 */
		gve_unmap_packet(tx->dev, pending_packet);
		/* This indicates the packet was dropped. */
		dev_kfree_skb_any(pending_packet->skb);
		pending_packet->skb = NULL;
		tx->dropped_pkt++;
		net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
				    priv->dev->name,
				    (int)(pending_packet - tx->dqo.pending_packets));

		pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
		pending_packet->timeout_jiffies =
				jiffies +
				msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
						 MSEC_PER_SEC);
		/* Maintain pending packet in another list so the packet can be
		 * unallocated at a later time.
		 */
		add_to_list(tx, &tx->dqo_compl.timed_out_completions,
			    pending_packet);
	}
}

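/* Release the completion tags of timed-out packets once their grace period
 * for a late completion has also expired.
 */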
static void remove_timed_out_completions(struct gve_priv *priv,
					 struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 next_index;

	next_index = tx->dqo_compl.timed_out_completions.head;
	while (next_index != -1) {
		pending_packet = &tx->dqo.pending_packets[next_index];
		next_index = pending_packet->next;
		/* Break early because packets should time out in order. */
		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
			break;

		remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
				 pending_packet);
		gve_free_pending_packet(tx, pending_packet);
	}
}

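/* Process TX completion descriptors, stopping once packet completions reach
 * napi->weight to avoid blocking for too long. Returns the number of
 * completion descriptors cleaned.
 */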
int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
			  struct napi_struct *napi)
{
	u64 reinject_compl_bytes = 0;
	u64 reinject_compl_pkts = 0;
	int num_descs_cleaned = 0;
	u64 miss_compl_bytes = 0;
	u64 miss_compl_pkts = 0;
	u64 pkt_compl_bytes = 0;
	u64 pkt_compl_pkts = 0;

	/* Limit in order to avoid blocking for too long */
	while (!napi || pkt_compl_pkts < napi->weight) {
		struct gve_tx_compl_desc *compl_desc =
			&tx->dqo.compl_ring[tx->dqo_compl.head];
		u16 type;

		if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
			break;

		/* Prefetch the next descriptor. */
		prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
				tx->dqo.complq_mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();
		type = compl_desc->type;

		if (type == GVE_COMPL_TYPE_DQO_DESC) {
			/* This is the last descriptor fetched by HW plus one */
			u16 tx_head = le16_to_cpu(compl_desc->tx_head);

			atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
		} else if (type == GVE_COMPL_TYPE_DQO_PKT) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			if (compl_tag & GVE_ALT_MISS_COMPL_BIT) {
				compl_tag &= ~GVE_ALT_MISS_COMPL_BIT;
				gve_handle_miss_completion(priv, tx, compl_tag,
							   &miss_compl_bytes,
							   &miss_compl_pkts);
			} else {
				gve_handle_packet_completion(priv, tx, !!napi,
							     compl_tag,
							     &pkt_compl_bytes,
							     &pkt_compl_pkts,
							     false);
			}
		} else if (type == GVE_COMPL_TYPE_DQO_MISS) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_miss_completion(priv, tx, compl_tag,
						   &miss_compl_bytes,
						   &miss_compl_pkts);
		} else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_packet_completion(priv, tx, !!napi,
						     compl_tag,
						     &reinject_compl_bytes,
						     &reinject_compl_pkts,
						     true);
		}

		tx->dqo_compl.head =
			(tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
		/* Flip the generation bit when we wrap around */
		tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
		num_descs_cleaned++;
	}

	netdev_tx_completed_queue(tx->netdev_txq,
				  pkt_compl_pkts + miss_compl_pkts,
				  pkt_compl_bytes + miss_compl_bytes);

	remove_miss_completions(priv, tx);
	remove_timed_out_completions(priv, tx);

	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
	tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
	u64_stats_update_end(&tx->statss);
	return num_descs_cleaned;
}

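/* TX NAPI poll helper: optionally clean completions, wake the queue if it was
 * stopped and descriptors were freed, and report whether more work remains.
 */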
bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
{
	struct gve_tx_compl_desc *compl_desc;
	struct gve_tx_ring *tx = block->tx;
	struct gve_priv *priv = block->priv;

	if (do_clean) {
		int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
							      &block->napi);

		/* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
		mb();

		if (netif_tx_queue_stopped(tx->netdev_txq) &&
		    num_descs_cleaned > 0) {
			tx->wake_queue++;
			netif_tx_wake_queue(tx->netdev_txq);
		}
	}

	/* Return true if we still have work. */
	compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
	return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
}