1 // SPDX-License-Identifier: (GPL-2.0 OR MIT)
2 /* Google virtual Ethernet (gve) driver
3  *
4  * Copyright (C) 2015-2021 Google, Inc.
5  */
6 
7 #include <linux/bpf.h>
8 #include <linux/cpumask.h>
9 #include <linux/etherdevice.h>
10 #include <linux/filter.h>
11 #include <linux/interrupt.h>
12 #include <linux/module.h>
13 #include <linux/pci.h>
14 #include <linux/sched.h>
15 #include <linux/timer.h>
16 #include <linux/workqueue.h>
17 #include <linux/utsname.h>
18 #include <linux/version.h>
19 #include <net/sch_generic.h>
20 #include <net/xdp_sock_drv.h>
21 #include "gve.h"
22 #include "gve_dqo.h"
23 #include "gve_adminq.h"
24 #include "gve_register.h"
25 
26 #define GVE_DEFAULT_RX_COPYBREAK	(256)
27 
28 #define DEFAULT_MSG_LEVEL	(NETIF_MSG_DRV | NETIF_MSG_LINK)
29 #define GVE_VERSION		"1.0.0"
30 #define GVE_VERSION_PREFIX	"GVE-"
31 
/* Minimum amount of time between queue kicks in msec (10 seconds) */
33 #define MIN_TX_TIMEOUT_GAP (1000 * 10)
34 #define DQO_TX_MAX	0x3FFFF
35 
36 char gve_driver_name[] = "gve";
37 const char gve_version_str[] = GVE_VERSION;
38 static const char gve_version_prefix[] = GVE_VERSION_PREFIX;
39 
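/* Share driver and OS identification (version, capability flags) with the
 * device over the admin queue so it can check compatibility. A device that
 * does not implement the command (-EOPNOTSUPP) is treated as compatible.
 */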
40 static int gve_verify_driver_compatibility(struct gve_priv *priv)
41 {
42 	int err;
43 	struct gve_driver_info *driver_info;
44 	dma_addr_t driver_info_bus;
45 
46 	driver_info = dma_alloc_coherent(&priv->pdev->dev,
47 					 sizeof(struct gve_driver_info),
48 					 &driver_info_bus, GFP_KERNEL);
49 	if (!driver_info)
50 		return -ENOMEM;
51 
52 	*driver_info = (struct gve_driver_info) {
53 		.os_type = 1, /* Linux */
54 		.os_version_major = cpu_to_be32(LINUX_VERSION_MAJOR),
55 		.os_version_minor = cpu_to_be32(LINUX_VERSION_SUBLEVEL),
56 		.os_version_sub = cpu_to_be32(LINUX_VERSION_PATCHLEVEL),
57 		.driver_capability_flags = {
58 			cpu_to_be64(GVE_DRIVER_CAPABILITY_FLAGS1),
59 			cpu_to_be64(GVE_DRIVER_CAPABILITY_FLAGS2),
60 			cpu_to_be64(GVE_DRIVER_CAPABILITY_FLAGS3),
61 			cpu_to_be64(GVE_DRIVER_CAPABILITY_FLAGS4),
62 		},
63 	};
64 	strscpy(driver_info->os_version_str1, utsname()->release,
65 		sizeof(driver_info->os_version_str1));
66 	strscpy(driver_info->os_version_str2, utsname()->version,
67 		sizeof(driver_info->os_version_str2));
68 
69 	err = gve_adminq_verify_driver_compatibility(priv,
70 						     sizeof(struct gve_driver_info),
71 						     driver_info_bus);
72 
73 	/* It's ok if the device doesn't support this */
74 	if (err == -EOPNOTSUPP)
75 		err = 0;
76 
77 	dma_free_coherent(&priv->pdev->dev,
78 			  sizeof(struct gve_driver_info),
79 			  driver_info, driver_info_bus);
80 	return err;
81 }
82 
83 static netdev_tx_t gve_start_xmit(struct sk_buff *skb, struct net_device *dev)
84 {
85 	struct gve_priv *priv = netdev_priv(dev);
86 
87 	if (gve_is_gqi(priv))
88 		return gve_tx(skb, dev);
89 	else
90 		return gve_tx_dqo(skb, dev);
91 }
92 
93 static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s)
94 {
95 	struct gve_priv *priv = netdev_priv(dev);
96 	unsigned int start;
97 	u64 packets, bytes;
98 	int num_tx_queues;
99 	int ring;
100 
101 	num_tx_queues = gve_num_tx_queues(priv);
102 	if (priv->rx) {
103 		for (ring = 0; ring < priv->rx_cfg.num_queues; ring++) {
104 			do {
105 				start =
106 				  u64_stats_fetch_begin(&priv->rx[ring].statss);
107 				packets = priv->rx[ring].rpackets;
108 				bytes = priv->rx[ring].rbytes;
109 			} while (u64_stats_fetch_retry(&priv->rx[ring].statss,
110 						       start));
111 			s->rx_packets += packets;
112 			s->rx_bytes += bytes;
113 		}
114 	}
115 	if (priv->tx) {
116 		for (ring = 0; ring < num_tx_queues; ring++) {
117 			do {
118 				start =
119 				  u64_stats_fetch_begin(&priv->tx[ring].statss);
120 				packets = priv->tx[ring].pkt_done;
121 				bytes = priv->tx[ring].bytes_done;
122 			} while (u64_stats_fetch_retry(&priv->tx[ring].statss,
123 						       start));
124 			s->tx_packets += packets;
125 			s->tx_bytes += bytes;
126 		}
127 	}
128 }
129 
130 static int gve_alloc_counter_array(struct gve_priv *priv)
131 {
132 	priv->counter_array =
133 		dma_alloc_coherent(&priv->pdev->dev,
134 				   priv->num_event_counters *
135 				   sizeof(*priv->counter_array),
136 				   &priv->counter_array_bus, GFP_KERNEL);
137 	if (!priv->counter_array)
138 		return -ENOMEM;
139 
140 	return 0;
141 }
142 
143 static void gve_free_counter_array(struct gve_priv *priv)
144 {
145 	if (!priv->counter_array)
146 		return;
147 
148 	dma_free_coherent(&priv->pdev->dev,
149 			  priv->num_event_counters *
150 			  sizeof(*priv->counter_array),
151 			  priv->counter_array, priv->counter_array_bus);
152 	priv->counter_array = NULL;
153 }
154 
155 /* NIC requests to report stats */
156 static void gve_stats_report_task(struct work_struct *work)
157 {
158 	struct gve_priv *priv = container_of(work, struct gve_priv,
159 					     stats_report_task);
160 	if (gve_get_do_report_stats(priv)) {
161 		gve_handle_report_stats(priv);
162 		gve_clear_do_report_stats(priv);
163 	}
164 }
165 
166 static void gve_stats_report_schedule(struct gve_priv *priv)
167 {
168 	if (!gve_get_probe_in_progress(priv) &&
169 	    !gve_get_reset_in_progress(priv)) {
170 		gve_set_do_report_stats(priv);
171 		queue_work(priv->gve_wq, &priv->stats_report_task);
172 	}
173 }
174 
175 static void gve_stats_report_timer(struct timer_list *t)
176 {
177 	struct gve_priv *priv = from_timer(priv, t, stats_report_timer);
178 
179 	mod_timer(&priv->stats_report_timer,
180 		  round_jiffies(jiffies +
181 		  msecs_to_jiffies(priv->stats_report_timer_period)));
182 	gve_stats_report_schedule(priv);
183 }
184 
185 static int gve_alloc_stats_report(struct gve_priv *priv)
186 {
187 	int tx_stats_num, rx_stats_num;
188 
189 	tx_stats_num = (GVE_TX_STATS_REPORT_NUM + NIC_TX_STATS_REPORT_NUM) *
190 		       gve_num_tx_queues(priv);
191 	rx_stats_num = (GVE_RX_STATS_REPORT_NUM + NIC_RX_STATS_REPORT_NUM) *
192 		       priv->rx_cfg.num_queues;
193 	priv->stats_report_len = struct_size(priv->stats_report, stats,
194 					     tx_stats_num + rx_stats_num);
195 	priv->stats_report =
196 		dma_alloc_coherent(&priv->pdev->dev, priv->stats_report_len,
197 				   &priv->stats_report_bus, GFP_KERNEL);
198 	if (!priv->stats_report)
199 		return -ENOMEM;
200 	/* Set up timer for the report-stats task */
201 	timer_setup(&priv->stats_report_timer, gve_stats_report_timer, 0);
202 	priv->stats_report_timer_period = GVE_STATS_REPORT_TIMER_PERIOD;
203 	return 0;
204 }
205 
206 static void gve_free_stats_report(struct gve_priv *priv)
207 {
208 	if (!priv->stats_report)
209 		return;
210 
211 	del_timer_sync(&priv->stats_report_timer);
212 	dma_free_coherent(&priv->pdev->dev, priv->stats_report_len,
213 			  priv->stats_report, priv->stats_report_bus);
214 	priv->stats_report = NULL;
215 }
216 
217 static irqreturn_t gve_mgmnt_intr(int irq, void *arg)
218 {
219 	struct gve_priv *priv = arg;
220 
221 	queue_work(priv->gve_wq, &priv->service_task);
222 	return IRQ_HANDLED;
223 }
224 
225 static irqreturn_t gve_intr(int irq, void *arg)
226 {
227 	struct gve_notify_block *block = arg;
228 	struct gve_priv *priv = block->priv;
229 
230 	iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block));
231 	napi_schedule_irqoff(&block->napi);
232 	return IRQ_HANDLED;
233 }
234 
235 static irqreturn_t gve_intr_dqo(int irq, void *arg)
236 {
237 	struct gve_notify_block *block = arg;
238 
239 	/* Interrupts are automatically masked */
240 	napi_schedule_irqoff(&block->napi);
241 	return IRQ_HANDLED;
242 }
243 
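/* NAPI poll handler for the GQI queue formats. Services the block's TX (or
 * XDP TX) and RX rings, and only re-arms the interrupt via the IRQ doorbell
 * once all work is done, rechecking for work that raced with the ACK.
 */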
244 static int gve_napi_poll(struct napi_struct *napi, int budget)
245 {
246 	struct gve_notify_block *block;
247 	__be32 __iomem *irq_doorbell;
248 	bool reschedule = false;
249 	struct gve_priv *priv;
250 	int work_done = 0;
251 
252 	block = container_of(napi, struct gve_notify_block, napi);
253 	priv = block->priv;
254 
255 	if (block->tx) {
256 		if (block->tx->q_num < priv->tx_cfg.num_queues)
257 			reschedule |= gve_tx_poll(block, budget);
258 		else
259 			reschedule |= gve_xdp_poll(block, budget);
260 	}
261 
262 	if (block->rx) {
263 		work_done = gve_rx_poll(block, budget);
264 		reschedule |= work_done == budget;
265 	}
266 
267 	if (reschedule)
268 		return budget;
269 
	/* Complete processing - don't unmask irq if busy polling is enabled */
271 	if (likely(napi_complete_done(napi, work_done))) {
272 		irq_doorbell = gve_irq_doorbell(priv, block);
273 		iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell);
274 
		/* Ensure the IRQ ACK is visible before we check for pending work.
		 * If the queue has issued any updates, they will be visible by now.
		 */
278 		mb();
279 
280 		if (block->tx)
281 			reschedule |= gve_tx_clean_pending(priv, block->tx);
282 		if (block->rx)
283 			reschedule |= gve_rx_work_pending(block->rx);
284 
285 		if (reschedule && napi_reschedule(napi))
286 			iowrite32be(GVE_IRQ_MASK, irq_doorbell);
287 	}
288 	return work_done;
289 }
290 
291 static int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
292 {
293 	struct gve_notify_block *block =
294 		container_of(napi, struct gve_notify_block, napi);
295 	struct gve_priv *priv = block->priv;
296 	bool reschedule = false;
297 	int work_done = 0;
298 
299 	if (block->tx)
300 		reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true);
301 
302 	if (block->rx) {
303 		work_done = gve_rx_poll_dqo(block, budget);
304 		reschedule |= work_done == budget;
305 	}
306 
307 	if (reschedule)
308 		return budget;
309 
310 	if (likely(napi_complete_done(napi, work_done))) {
311 		/* Enable interrupts again.
312 		 *
313 		 * We don't need to repoll afterwards because HW supports the
314 		 * PCI MSI-X PBA feature.
315 		 *
316 		 * Another interrupt would be triggered if a new event came in
317 		 * since the last one.
318 		 */
319 		gve_write_irq_doorbell_dqo(priv, block,
320 					   GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO);
321 	}
322 
323 	return work_done;
324 }
325 
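/* Allocate one MSI-X vector per notification block plus one management
 * vector. If fewer vectors are granted, shrink the number of notification
 * blocks and cap the TX/RX queue counts to match, then request the IRQs and
 * spread their affinity across the online CPUs.
 */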
326 static int gve_alloc_notify_blocks(struct gve_priv *priv)
327 {
328 	int num_vecs_requested = priv->num_ntfy_blks + 1;
329 	unsigned int active_cpus;
330 	int vecs_enabled;
331 	int i, j;
332 	int err;
333 
334 	priv->msix_vectors = kvcalloc(num_vecs_requested,
335 				      sizeof(*priv->msix_vectors), GFP_KERNEL);
336 	if (!priv->msix_vectors)
337 		return -ENOMEM;
338 	for (i = 0; i < num_vecs_requested; i++)
339 		priv->msix_vectors[i].entry = i;
340 	vecs_enabled = pci_enable_msix_range(priv->pdev, priv->msix_vectors,
341 					     GVE_MIN_MSIX, num_vecs_requested);
342 	if (vecs_enabled < 0) {
343 		dev_err(&priv->pdev->dev, "Could not enable min msix %d/%d\n",
344 			GVE_MIN_MSIX, vecs_enabled);
345 		err = vecs_enabled;
346 		goto abort_with_msix_vectors;
347 	}
348 	if (vecs_enabled != num_vecs_requested) {
349 		int new_num_ntfy_blks = (vecs_enabled - 1) & ~0x1;
350 		int vecs_per_type = new_num_ntfy_blks / 2;
351 		int vecs_left = new_num_ntfy_blks % 2;
352 
353 		priv->num_ntfy_blks = new_num_ntfy_blks;
354 		priv->mgmt_msix_idx = priv->num_ntfy_blks;
355 		priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues,
356 						vecs_per_type);
357 		priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues,
358 						vecs_per_type + vecs_left);
359 		dev_err(&priv->pdev->dev,
360 			"Could not enable desired msix, only enabled %d, adjusting tx max queues to %d, and rx max queues to %d\n",
361 			vecs_enabled, priv->tx_cfg.max_queues,
362 			priv->rx_cfg.max_queues);
363 		if (priv->tx_cfg.num_queues > priv->tx_cfg.max_queues)
364 			priv->tx_cfg.num_queues = priv->tx_cfg.max_queues;
365 		if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues)
366 			priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
367 	}
368 	/* Half the notification blocks go to TX and half to RX */
369 	active_cpus = min_t(int, priv->num_ntfy_blks / 2, num_online_cpus());
370 
	/* Setup Management Vector - the last vector */
372 	snprintf(priv->mgmt_msix_name, sizeof(priv->mgmt_msix_name), "gve-mgmnt@pci:%s",
373 		 pci_name(priv->pdev));
374 	err = request_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector,
375 			  gve_mgmnt_intr, 0, priv->mgmt_msix_name, priv);
376 	if (err) {
377 		dev_err(&priv->pdev->dev, "Did not receive management vector.\n");
378 		goto abort_with_msix_enabled;
379 	}
380 	priv->irq_db_indices =
381 		dma_alloc_coherent(&priv->pdev->dev,
382 				   priv->num_ntfy_blks *
383 				   sizeof(*priv->irq_db_indices),
384 				   &priv->irq_db_indices_bus, GFP_KERNEL);
385 	if (!priv->irq_db_indices) {
386 		err = -ENOMEM;
387 		goto abort_with_mgmt_vector;
388 	}
389 
390 	priv->ntfy_blocks = kvzalloc(priv->num_ntfy_blks *
391 				     sizeof(*priv->ntfy_blocks), GFP_KERNEL);
392 	if (!priv->ntfy_blocks) {
393 		err = -ENOMEM;
394 		goto abort_with_irq_db_indices;
395 	}
396 
397 	/* Setup the other blocks - the first n-1 vectors */
398 	for (i = 0; i < priv->num_ntfy_blks; i++) {
399 		struct gve_notify_block *block = &priv->ntfy_blocks[i];
400 		int msix_idx = i;
401 
402 		snprintf(block->name, sizeof(block->name), "gve-ntfy-blk%d@pci:%s",
403 			 i, pci_name(priv->pdev));
404 		block->priv = priv;
405 		err = request_irq(priv->msix_vectors[msix_idx].vector,
406 				  gve_is_gqi(priv) ? gve_intr : gve_intr_dqo,
407 				  0, block->name, block);
408 		if (err) {
409 			dev_err(&priv->pdev->dev,
410 				"Failed to receive msix vector %d\n", i);
411 			goto abort_with_some_ntfy_blocks;
412 		}
413 		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
414 				      get_cpu_mask(i % active_cpus));
415 		block->irq_db_index = &priv->irq_db_indices[i].index;
416 	}
417 	return 0;
418 abort_with_some_ntfy_blocks:
419 	for (j = 0; j < i; j++) {
420 		struct gve_notify_block *block = &priv->ntfy_blocks[j];
421 		int msix_idx = j;
422 
423 		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
424 				      NULL);
425 		free_irq(priv->msix_vectors[msix_idx].vector, block);
426 	}
427 	kvfree(priv->ntfy_blocks);
428 	priv->ntfy_blocks = NULL;
429 abort_with_irq_db_indices:
430 	dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
431 			  sizeof(*priv->irq_db_indices),
432 			  priv->irq_db_indices, priv->irq_db_indices_bus);
433 	priv->irq_db_indices = NULL;
434 abort_with_mgmt_vector:
435 	free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
436 abort_with_msix_enabled:
437 	pci_disable_msix(priv->pdev);
438 abort_with_msix_vectors:
439 	kvfree(priv->msix_vectors);
440 	priv->msix_vectors = NULL;
441 	return err;
442 }
443 
444 static void gve_free_notify_blocks(struct gve_priv *priv)
445 {
446 	int i;
447 
448 	if (!priv->msix_vectors)
449 		return;
450 
451 	/* Free the irqs */
452 	for (i = 0; i < priv->num_ntfy_blks; i++) {
453 		struct gve_notify_block *block = &priv->ntfy_blocks[i];
454 		int msix_idx = i;
455 
456 		irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector,
457 				      NULL);
458 		free_irq(priv->msix_vectors[msix_idx].vector, block);
459 	}
460 	free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv);
461 	kvfree(priv->ntfy_blocks);
462 	priv->ntfy_blocks = NULL;
463 	dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks *
464 			  sizeof(*priv->irq_db_indices),
465 			  priv->irq_db_indices, priv->irq_db_indices_bus);
466 	priv->irq_db_indices = NULL;
467 	pci_disable_msix(priv->pdev);
468 	kvfree(priv->msix_vectors);
469 	priv->msix_vectors = NULL;
470 }
471 
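/* Allocate the event counter array, notification blocks and stats report,
 * then pass their DMA addresses to the device via the admin queue. For the
 * DQO RDA format this also fetches the packet type lookup table.
 */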
472 static int gve_setup_device_resources(struct gve_priv *priv)
473 {
474 	int err;
475 
476 	err = gve_alloc_counter_array(priv);
477 	if (err)
478 		return err;
479 	err = gve_alloc_notify_blocks(priv);
480 	if (err)
481 		goto abort_with_counter;
482 	err = gve_alloc_stats_report(priv);
483 	if (err)
484 		goto abort_with_ntfy_blocks;
485 	err = gve_adminq_configure_device_resources(priv,
486 						    priv->counter_array_bus,
487 						    priv->num_event_counters,
488 						    priv->irq_db_indices_bus,
489 						    priv->num_ntfy_blks);
490 	if (unlikely(err)) {
491 		dev_err(&priv->pdev->dev,
492 			"could not setup device_resources: err=%d\n", err);
493 		err = -ENXIO;
494 		goto abort_with_stats_report;
495 	}
496 
497 	if (priv->queue_format == GVE_DQO_RDA_FORMAT) {
498 		priv->ptype_lut_dqo = kvzalloc(sizeof(*priv->ptype_lut_dqo),
499 					       GFP_KERNEL);
500 		if (!priv->ptype_lut_dqo) {
501 			err = -ENOMEM;
502 			goto abort_with_stats_report;
503 		}
504 		err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo);
505 		if (err) {
506 			dev_err(&priv->pdev->dev,
507 				"Failed to get ptype map: err=%d\n", err);
508 			goto abort_with_ptype_lut;
509 		}
510 	}
511 
512 	err = gve_adminq_report_stats(priv, priv->stats_report_len,
513 				      priv->stats_report_bus,
514 				      GVE_STATS_REPORT_TIMER_PERIOD);
515 	if (err)
516 		dev_err(&priv->pdev->dev,
517 			"Failed to report stats: err=%d\n", err);
518 	gve_set_device_resources_ok(priv);
519 	return 0;
520 
521 abort_with_ptype_lut:
522 	kvfree(priv->ptype_lut_dqo);
523 	priv->ptype_lut_dqo = NULL;
524 abort_with_stats_report:
525 	gve_free_stats_report(priv);
526 abort_with_ntfy_blocks:
527 	gve_free_notify_blocks(priv);
528 abort_with_counter:
529 	gve_free_counter_array(priv);
530 
531 	return err;
532 }
533 
534 static void gve_trigger_reset(struct gve_priv *priv);
535 
536 static void gve_teardown_device_resources(struct gve_priv *priv)
537 {
538 	int err;
539 
540 	/* Tell device its resources are being freed */
541 	if (gve_get_device_resources_ok(priv)) {
542 		/* detach the stats report */
543 		err = gve_adminq_report_stats(priv, 0, 0x0, GVE_STATS_REPORT_TIMER_PERIOD);
544 		if (err) {
545 			dev_err(&priv->pdev->dev,
546 				"Failed to detach stats report: err=%d\n", err);
547 			gve_trigger_reset(priv);
548 		}
549 		err = gve_adminq_deconfigure_device_resources(priv);
550 		if (err) {
551 			dev_err(&priv->pdev->dev,
552 				"Could not deconfigure device resources: err=%d\n",
553 				err);
554 			gve_trigger_reset(priv);
555 		}
556 	}
557 
558 	kvfree(priv->ptype_lut_dqo);
559 	priv->ptype_lut_dqo = NULL;
560 
561 	gve_free_counter_array(priv);
562 	gve_free_notify_blocks(priv);
563 	gve_free_stats_report(priv);
564 	gve_clear_device_resources_ok(priv);
565 }
566 
567 static void gve_add_napi(struct gve_priv *priv, int ntfy_idx,
568 			 int (*gve_poll)(struct napi_struct *, int))
569 {
570 	struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
571 
572 	netif_napi_add(priv->dev, &block->napi, gve_poll);
573 }
574 
575 static void gve_remove_napi(struct gve_priv *priv, int ntfy_idx)
576 {
577 	struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
578 
579 	netif_napi_del(&block->napi);
580 }
581 
582 static int gve_register_xdp_qpls(struct gve_priv *priv)
583 {
584 	int start_id;
585 	int err;
586 	int i;
587 
588 	start_id = gve_tx_qpl_id(priv, gve_xdp_tx_start_queue_id(priv));
589 	for (i = start_id; i < start_id + gve_num_xdp_qpls(priv); i++) {
590 		err = gve_adminq_register_page_list(priv, &priv->qpls[i]);
591 		if (err) {
592 			netif_err(priv, drv, priv->dev,
593 				  "failed to register queue page list %d\n",
594 				  priv->qpls[i].id);
595 			/* This failure will trigger a reset - no need to clean
596 			 * up
597 			 */
598 			return err;
599 		}
600 	}
601 	return 0;
602 }
603 
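/* Register every TX and RX queue page list with the device. A failure here
 * will trigger a reset, so no unwinding is needed.
 */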
604 static int gve_register_qpls(struct gve_priv *priv)
605 {
606 	int start_id;
607 	int err;
608 	int i;
609 
610 	start_id = gve_tx_start_qpl_id(priv);
611 	for (i = start_id; i < start_id + gve_num_tx_qpls(priv); i++) {
612 		err = gve_adminq_register_page_list(priv, &priv->qpls[i]);
613 		if (err) {
614 			netif_err(priv, drv, priv->dev,
615 				  "failed to register queue page list %d\n",
616 				  priv->qpls[i].id);
617 			/* This failure will trigger a reset - no need to clean
618 			 * up
619 			 */
620 			return err;
621 		}
622 	}
623 
624 	start_id = gve_rx_start_qpl_id(priv);
625 	for (i = start_id; i < start_id + gve_num_rx_qpls(priv); i++) {
626 		err = gve_adminq_register_page_list(priv, &priv->qpls[i]);
627 		if (err) {
628 			netif_err(priv, drv, priv->dev,
629 				  "failed to register queue page list %d\n",
630 				  priv->qpls[i].id);
631 			/* This failure will trigger a reset - no need to clean
632 			 * up
633 			 */
634 			return err;
635 		}
636 	}
637 	return 0;
638 }
639 
640 static int gve_unregister_xdp_qpls(struct gve_priv *priv)
641 {
642 	int start_id;
643 	int err;
644 	int i;
645 
646 	start_id = gve_tx_qpl_id(priv, gve_xdp_tx_start_queue_id(priv));
647 	for (i = start_id; i < start_id + gve_num_xdp_qpls(priv); i++) {
648 		err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id);
649 		/* This failure will trigger a reset - no need to clean up */
650 		if (err) {
651 			netif_err(priv, drv, priv->dev,
652 				  "Failed to unregister queue page list %d\n",
653 				  priv->qpls[i].id);
654 			return err;
655 		}
656 	}
657 	return 0;
658 }
659 
660 static int gve_unregister_qpls(struct gve_priv *priv)
661 {
662 	int start_id;
663 	int err;
664 	int i;
665 
666 	start_id = gve_tx_start_qpl_id(priv);
667 	for (i = start_id; i < start_id + gve_num_tx_qpls(priv); i++) {
668 		err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id);
669 		/* This failure will trigger a reset - no need to clean up */
670 		if (err) {
671 			netif_err(priv, drv, priv->dev,
672 				  "Failed to unregister queue page list %d\n",
673 				  priv->qpls[i].id);
674 			return err;
675 		}
676 	}
677 
678 	start_id = gve_rx_start_qpl_id(priv);
679 	for (i = start_id; i < start_id + gve_num_rx_qpls(priv); i++) {
680 		err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id);
681 		/* This failure will trigger a reset - no need to clean up */
682 		if (err) {
683 			netif_err(priv, drv, priv->dev,
684 				  "Failed to unregister queue page list %d\n",
685 				  priv->qpls[i].id);
686 			return err;
687 		}
688 	}
689 	return 0;
690 }
691 
692 static int gve_create_xdp_rings(struct gve_priv *priv)
693 {
694 	int err;
695 
696 	err = gve_adminq_create_tx_queues(priv,
697 					  gve_xdp_tx_start_queue_id(priv),
698 					  priv->num_xdp_queues);
699 	if (err) {
700 		netif_err(priv, drv, priv->dev, "failed to create %d XDP tx queues\n",
701 			  priv->num_xdp_queues);
702 		/* This failure will trigger a reset - no need to clean
703 		 * up
704 		 */
705 		return err;
706 	}
707 	netif_dbg(priv, drv, priv->dev, "created %d XDP tx queues\n",
708 		  priv->num_xdp_queues);
709 
710 	return 0;
711 }
712 
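/* Ask the device to create all TX and RX queues, then hand it the initial RX
 * buffers (a doorbell write for GQI, explicit buffer posting for DQO).
 */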
713 static int gve_create_rings(struct gve_priv *priv)
714 {
715 	int num_tx_queues = gve_num_tx_queues(priv);
716 	int err;
717 	int i;
718 
719 	err = gve_adminq_create_tx_queues(priv, 0, num_tx_queues);
720 	if (err) {
721 		netif_err(priv, drv, priv->dev, "failed to create %d tx queues\n",
722 			  num_tx_queues);
723 		/* This failure will trigger a reset - no need to clean
724 		 * up
725 		 */
726 		return err;
727 	}
728 	netif_dbg(priv, drv, priv->dev, "created %d tx queues\n",
729 		  num_tx_queues);
730 
731 	err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues);
732 	if (err) {
733 		netif_err(priv, drv, priv->dev, "failed to create %d rx queues\n",
734 			  priv->rx_cfg.num_queues);
735 		/* This failure will trigger a reset - no need to clean
736 		 * up
737 		 */
738 		return err;
739 	}
740 	netif_dbg(priv, drv, priv->dev, "created %d rx queues\n",
741 		  priv->rx_cfg.num_queues);
742 
743 	if (gve_is_gqi(priv)) {
744 		/* Rx data ring has been prefilled with packet buffers at queue
745 		 * allocation time.
746 		 *
747 		 * Write the doorbell to provide descriptor slots and packet
748 		 * buffers to the NIC.
749 		 */
750 		for (i = 0; i < priv->rx_cfg.num_queues; i++)
751 			gve_rx_write_doorbell(priv, &priv->rx[i]);
752 	} else {
753 		for (i = 0; i < priv->rx_cfg.num_queues; i++) {
754 			/* Post buffers and ring doorbell. */
755 			gve_rx_post_buffers_dqo(&priv->rx[i]);
756 		}
757 	}
758 
759 	return 0;
760 }
761 
762 static void add_napi_init_xdp_sync_stats(struct gve_priv *priv,
763 					 int (*napi_poll)(struct napi_struct *napi,
764 							  int budget))
765 {
766 	int start_id = gve_xdp_tx_start_queue_id(priv);
767 	int i;
768 
	/* Add xdp tx napi & init sync stats */
770 	for (i = start_id; i < start_id + priv->num_xdp_queues; i++) {
771 		int ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
772 
773 		u64_stats_init(&priv->tx[i].statss);
774 		priv->tx[i].ntfy_id = ntfy_idx;
775 		gve_add_napi(priv, ntfy_idx, napi_poll);
776 	}
777 }
778 
779 static void add_napi_init_sync_stats(struct gve_priv *priv,
780 				     int (*napi_poll)(struct napi_struct *napi,
781 						      int budget))
782 {
783 	int i;
784 
	/* Add tx napi & init sync stats */
786 	for (i = 0; i < gve_num_tx_queues(priv); i++) {
787 		int ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
788 
789 		u64_stats_init(&priv->tx[i].statss);
790 		priv->tx[i].ntfy_id = ntfy_idx;
791 		gve_add_napi(priv, ntfy_idx, napi_poll);
792 	}
	/* Add rx napi & init sync stats */
794 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
795 		int ntfy_idx = gve_rx_idx_to_ntfy(priv, i);
796 
797 		u64_stats_init(&priv->rx[i].statss);
798 		priv->rx[i].ntfy_id = ntfy_idx;
799 		gve_add_napi(priv, ntfy_idx, napi_poll);
800 	}
801 }
802 
803 static void gve_tx_free_rings(struct gve_priv *priv, int start_id, int num_rings)
804 {
805 	if (gve_is_gqi(priv)) {
806 		gve_tx_free_rings_gqi(priv, start_id, num_rings);
807 	} else {
808 		gve_tx_free_rings_dqo(priv);
809 	}
810 }
811 
812 static int gve_alloc_xdp_rings(struct gve_priv *priv)
813 {
814 	int start_id;
815 	int err = 0;
816 
817 	if (!priv->num_xdp_queues)
818 		return 0;
819 
820 	start_id = gve_xdp_tx_start_queue_id(priv);
821 	err = gve_tx_alloc_rings(priv, start_id, priv->num_xdp_queues);
822 	if (err)
823 		return err;
824 	add_napi_init_xdp_sync_stats(priv, gve_napi_poll);
825 
826 	return 0;
827 }
828 
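/* Allocate the host-side TX and RX ring state for the configured queue
 * counts and attach a napi poll handler to each ring's notification block.
 */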
829 static int gve_alloc_rings(struct gve_priv *priv)
830 {
831 	int err;
832 
833 	/* Setup tx rings */
834 	priv->tx = kvcalloc(priv->tx_cfg.max_queues, sizeof(*priv->tx),
835 			    GFP_KERNEL);
836 	if (!priv->tx)
837 		return -ENOMEM;
838 
839 	if (gve_is_gqi(priv))
840 		err = gve_tx_alloc_rings(priv, 0, gve_num_tx_queues(priv));
841 	else
842 		err = gve_tx_alloc_rings_dqo(priv);
843 	if (err)
844 		goto free_tx;
845 
846 	/* Setup rx rings */
847 	priv->rx = kvcalloc(priv->rx_cfg.max_queues, sizeof(*priv->rx),
848 			    GFP_KERNEL);
849 	if (!priv->rx) {
850 		err = -ENOMEM;
851 		goto free_tx_queue;
852 	}
853 
854 	if (gve_is_gqi(priv))
855 		err = gve_rx_alloc_rings(priv);
856 	else
857 		err = gve_rx_alloc_rings_dqo(priv);
858 	if (err)
859 		goto free_rx;
860 
861 	if (gve_is_gqi(priv))
862 		add_napi_init_sync_stats(priv, gve_napi_poll);
863 	else
864 		add_napi_init_sync_stats(priv, gve_napi_poll_dqo);
865 
866 	return 0;
867 
868 free_rx:
869 	kvfree(priv->rx);
870 	priv->rx = NULL;
871 free_tx_queue:
872 	gve_tx_free_rings(priv, 0, gve_num_tx_queues(priv));
873 free_tx:
874 	kvfree(priv->tx);
875 	priv->tx = NULL;
876 	return err;
877 }
878 
879 static int gve_destroy_xdp_rings(struct gve_priv *priv)
880 {
881 	int start_id;
882 	int err;
883 
884 	start_id = gve_xdp_tx_start_queue_id(priv);
885 	err = gve_adminq_destroy_tx_queues(priv,
886 					   start_id,
887 					   priv->num_xdp_queues);
888 	if (err) {
889 		netif_err(priv, drv, priv->dev,
890 			  "failed to destroy XDP queues\n");
891 		/* This failure will trigger a reset - no need to clean up */
892 		return err;
893 	}
894 	netif_dbg(priv, drv, priv->dev, "destroyed XDP queues\n");
895 
896 	return 0;
897 }
898 
899 static int gve_destroy_rings(struct gve_priv *priv)
900 {
901 	int num_tx_queues = gve_num_tx_queues(priv);
902 	int err;
903 
904 	err = gve_adminq_destroy_tx_queues(priv, 0, num_tx_queues);
905 	if (err) {
906 		netif_err(priv, drv, priv->dev,
907 			  "failed to destroy tx queues\n");
908 		/* This failure will trigger a reset - no need to clean up */
909 		return err;
910 	}
911 	netif_dbg(priv, drv, priv->dev, "destroyed tx queues\n");
912 	err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues);
913 	if (err) {
914 		netif_err(priv, drv, priv->dev,
915 			  "failed to destroy rx queues\n");
916 		/* This failure will trigger a reset - no need to clean up */
917 		return err;
918 	}
919 	netif_dbg(priv, drv, priv->dev, "destroyed rx queues\n");
920 	return 0;
921 }
922 
923 static void gve_rx_free_rings(struct gve_priv *priv)
924 {
925 	if (gve_is_gqi(priv))
926 		gve_rx_free_rings_gqi(priv);
927 	else
928 		gve_rx_free_rings_dqo(priv);
929 }
930 
931 static void gve_free_xdp_rings(struct gve_priv *priv)
932 {
933 	int ntfy_idx, start_id;
934 	int i;
935 
936 	start_id = gve_xdp_tx_start_queue_id(priv);
937 	if (priv->tx) {
		for (i = start_id; i < start_id + priv->num_xdp_queues; i++) {
939 			ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
940 			gve_remove_napi(priv, ntfy_idx);
941 		}
942 		gve_tx_free_rings(priv, start_id, priv->num_xdp_queues);
943 	}
944 }
945 
946 static void gve_free_rings(struct gve_priv *priv)
947 {
948 	int num_tx_queues = gve_num_tx_queues(priv);
949 	int ntfy_idx;
950 	int i;
951 
952 	if (priv->tx) {
953 		for (i = 0; i < num_tx_queues; i++) {
954 			ntfy_idx = gve_tx_idx_to_ntfy(priv, i);
955 			gve_remove_napi(priv, ntfy_idx);
956 		}
957 		gve_tx_free_rings(priv, 0, num_tx_queues);
958 		kvfree(priv->tx);
959 		priv->tx = NULL;
960 	}
961 	if (priv->rx) {
962 		for (i = 0; i < priv->rx_cfg.num_queues; i++) {
963 			ntfy_idx = gve_rx_idx_to_ntfy(priv, i);
964 			gve_remove_napi(priv, ntfy_idx);
965 		}
966 		gve_rx_free_rings(priv);
967 		kvfree(priv->rx);
968 		priv->rx = NULL;
969 	}
970 }
971 
972 int gve_alloc_page(struct gve_priv *priv, struct device *dev,
973 		   struct page **page, dma_addr_t *dma,
974 		   enum dma_data_direction dir, gfp_t gfp_flags)
975 {
976 	*page = alloc_page(gfp_flags);
977 	if (!*page) {
978 		priv->page_alloc_fail++;
979 		return -ENOMEM;
980 	}
981 	*dma = dma_map_page(dev, *page, 0, PAGE_SIZE, dir);
982 	if (dma_mapping_error(dev, *dma)) {
983 		priv->dma_mapping_error++;
984 		put_page(*page);
985 		return -ENOMEM;
986 	}
987 	return 0;
988 }
989 
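/* Allocate and DMA-map the pages backing a single queue page list, while
 * enforcing the device's limit on the total number of registered pages.
 */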
990 static int gve_alloc_queue_page_list(struct gve_priv *priv, u32 id,
991 				     int pages)
992 {
993 	struct gve_queue_page_list *qpl = &priv->qpls[id];
994 	int err;
995 	int i;
996 
997 	if (pages + priv->num_registered_pages > priv->max_registered_pages) {
998 		netif_err(priv, drv, priv->dev,
999 			  "Reached max number of registered pages %llu > %llu\n",
1000 			  pages + priv->num_registered_pages,
1001 			  priv->max_registered_pages);
1002 		return -EINVAL;
1003 	}
1004 
1005 	qpl->id = id;
1006 	qpl->num_entries = 0;
1007 	qpl->pages = kvcalloc(pages, sizeof(*qpl->pages), GFP_KERNEL);
1008 	/* caller handles clean up */
1009 	if (!qpl->pages)
1010 		return -ENOMEM;
1011 	qpl->page_buses = kvcalloc(pages, sizeof(*qpl->page_buses), GFP_KERNEL);
1012 	/* caller handles clean up */
1013 	if (!qpl->page_buses)
1014 		return -ENOMEM;
1015 
1016 	for (i = 0; i < pages; i++) {
1017 		err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i],
1018 				     &qpl->page_buses[i],
1019 				     gve_qpl_dma_dir(priv, id), GFP_KERNEL);
1020 		/* caller handles clean up */
1021 		if (err)
1022 			return -ENOMEM;
1023 		qpl->num_entries++;
1024 	}
1025 	priv->num_registered_pages += pages;
1026 
1027 	return 0;
1028 }
1029 
1030 void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma,
1031 		   enum dma_data_direction dir)
1032 {
1033 	if (!dma_mapping_error(dev, dma))
1034 		dma_unmap_page(dev, dma, PAGE_SIZE, dir);
1035 	if (page)
1036 		put_page(page);
1037 }
1038 
1039 static void gve_free_queue_page_list(struct gve_priv *priv, u32 id)
1040 {
1041 	struct gve_queue_page_list *qpl = &priv->qpls[id];
1042 	int i;
1043 
1044 	if (!qpl->pages)
1045 		return;
1046 	if (!qpl->page_buses)
1047 		goto free_pages;
1048 
1049 	for (i = 0; i < qpl->num_entries; i++)
1050 		gve_free_page(&priv->pdev->dev, qpl->pages[i],
1051 			      qpl->page_buses[i], gve_qpl_dma_dir(priv, id));
1052 
1053 	kvfree(qpl->page_buses);
1054 	qpl->page_buses = NULL;
1055 free_pages:
1056 	kvfree(qpl->pages);
1057 	qpl->pages = NULL;
1058 	priv->num_registered_pages -= qpl->num_entries;
1059 }
1060 
1061 static int gve_alloc_xdp_qpls(struct gve_priv *priv)
1062 {
1063 	int start_id;
1064 	int i, j;
1065 	int err;
1066 
1067 	start_id = gve_tx_qpl_id(priv, gve_xdp_tx_start_queue_id(priv));
1068 	for (i = start_id; i < start_id + gve_num_xdp_qpls(priv); i++) {
1069 		err = gve_alloc_queue_page_list(priv, i,
1070 						priv->tx_pages_per_qpl);
1071 		if (err)
1072 			goto free_qpls;
1073 	}
1074 
1075 	return 0;
1076 
1077 free_qpls:
1078 	for (j = start_id; j <= i; j++)
1079 		gve_free_queue_page_list(priv, j);
1080 	return err;
1081 }
1082 
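/* Allocate queue page lists for all TX and RX queues along with the bitmap
 * used to track which QPLs are in use. Only the GQI QPL queue format uses
 * QPLs; other formats return early.
 */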
1083 static int gve_alloc_qpls(struct gve_priv *priv)
1084 {
1085 	int max_queues = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues;
1086 	int start_id;
1087 	int i, j;
1088 	int err;
1089 
1090 	if (priv->queue_format != GVE_GQI_QPL_FORMAT)
1091 		return 0;
1092 
1093 	priv->qpls = kvcalloc(max_queues, sizeof(*priv->qpls), GFP_KERNEL);
1094 	if (!priv->qpls)
1095 		return -ENOMEM;
1096 
1097 	start_id = gve_tx_start_qpl_id(priv);
1098 	for (i = start_id; i < start_id + gve_num_tx_qpls(priv); i++) {
1099 		err = gve_alloc_queue_page_list(priv, i,
1100 						priv->tx_pages_per_qpl);
1101 		if (err)
1102 			goto free_qpls;
1103 	}
1104 
1105 	start_id = gve_rx_start_qpl_id(priv);
1106 	for (i = start_id; i < start_id + gve_num_rx_qpls(priv); i++) {
1107 		err = gve_alloc_queue_page_list(priv, i,
1108 						priv->rx_data_slot_cnt);
1109 		if (err)
1110 			goto free_qpls;
1111 	}
1112 
1113 	priv->qpl_cfg.qpl_map_size = BITS_TO_LONGS(max_queues) *
1114 				     sizeof(unsigned long) * BITS_PER_BYTE;
1115 	priv->qpl_cfg.qpl_id_map = kvcalloc(BITS_TO_LONGS(max_queues),
1116 					    sizeof(unsigned long), GFP_KERNEL);
1117 	if (!priv->qpl_cfg.qpl_id_map) {
1118 		err = -ENOMEM;
1119 		goto free_qpls;
1120 	}
1121 
1122 	return 0;
1123 
1124 free_qpls:
1125 	for (j = 0; j <= i; j++)
1126 		gve_free_queue_page_list(priv, j);
1127 	kvfree(priv->qpls);
1128 	priv->qpls = NULL;
1129 	return err;
1130 }
1131 
1132 static void gve_free_xdp_qpls(struct gve_priv *priv)
1133 {
1134 	int start_id;
1135 	int i;
1136 
1137 	start_id = gve_tx_qpl_id(priv, gve_xdp_tx_start_queue_id(priv));
1138 	for (i = start_id; i < start_id + gve_num_xdp_qpls(priv); i++)
1139 		gve_free_queue_page_list(priv, i);
1140 }
1141 
1142 static void gve_free_qpls(struct gve_priv *priv)
1143 {
1144 	int max_queues = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues;
1145 	int i;
1146 
1147 	if (!priv->qpls)
1148 		return;
1149 
1150 	kvfree(priv->qpl_cfg.qpl_id_map);
1151 	priv->qpl_cfg.qpl_id_map = NULL;
1152 
1153 	for (i = 0; i < max_queues; i++)
1154 		gve_free_queue_page_list(priv, i);
1155 
1156 	kvfree(priv->qpls);
1157 	priv->qpls = NULL;
1158 }
1159 
1160 /* Use this to schedule a reset when the device is capable of continuing
1161  * to handle other requests in its current state. If it is not, do a reset
1162  * in thread instead.
1163  */
1164 void gve_schedule_reset(struct gve_priv *priv)
1165 {
1166 	gve_set_do_reset(priv);
1167 	queue_work(priv->gve_wq, &priv->service_task);
1168 }
1169 
1170 static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up);
1171 static int gve_reset_recovery(struct gve_priv *priv, bool was_up);
1172 static void gve_turndown(struct gve_priv *priv);
1173 static void gve_turnup(struct gve_priv *priv);
1174 
1175 static int gve_reg_xdp_info(struct gve_priv *priv, struct net_device *dev)
1176 {
1177 	struct napi_struct *napi;
1178 	struct gve_rx_ring *rx;
1179 	int err = 0;
1180 	int i, j;
1181 	u32 tx_qid;
1182 
1183 	if (!priv->num_xdp_queues)
1184 		return 0;
1185 
1186 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
1187 		rx = &priv->rx[i];
1188 		napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
1189 
1190 		err = xdp_rxq_info_reg(&rx->xdp_rxq, dev, i,
1191 				       napi->napi_id);
1192 		if (err)
1193 			goto err;
1194 		err = xdp_rxq_info_reg_mem_model(&rx->xdp_rxq,
1195 						 MEM_TYPE_PAGE_SHARED, NULL);
1196 		if (err)
1197 			goto err;
1198 		rx->xsk_pool = xsk_get_pool_from_qid(dev, i);
1199 		if (rx->xsk_pool) {
1200 			err = xdp_rxq_info_reg(&rx->xsk_rxq, dev, i,
1201 					       napi->napi_id);
1202 			if (err)
1203 				goto err;
1204 			err = xdp_rxq_info_reg_mem_model(&rx->xsk_rxq,
1205 							 MEM_TYPE_XSK_BUFF_POOL, NULL);
1206 			if (err)
1207 				goto err;
1208 			xsk_pool_set_rxq_info(rx->xsk_pool,
1209 					      &rx->xsk_rxq);
1210 		}
1211 	}
1212 
1213 	for (i = 0; i < priv->num_xdp_queues; i++) {
1214 		tx_qid = gve_xdp_tx_queue_id(priv, i);
1215 		priv->tx[tx_qid].xsk_pool = xsk_get_pool_from_qid(dev, i);
1216 	}
1217 	return 0;
1218 
1219 err:
1220 	for (j = i; j >= 0; j--) {
1221 		rx = &priv->rx[j];
1222 		if (xdp_rxq_info_is_reg(&rx->xdp_rxq))
1223 			xdp_rxq_info_unreg(&rx->xdp_rxq);
1224 		if (xdp_rxq_info_is_reg(&rx->xsk_rxq))
1225 			xdp_rxq_info_unreg(&rx->xsk_rxq);
1226 	}
1227 	return err;
1228 }
1229 
1230 static void gve_unreg_xdp_info(struct gve_priv *priv)
1231 {
1232 	int i, tx_qid;
1233 
1234 	if (!priv->num_xdp_queues)
1235 		return;
1236 
1237 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
1238 		struct gve_rx_ring *rx = &priv->rx[i];
1239 
1240 		xdp_rxq_info_unreg(&rx->xdp_rxq);
1241 		if (rx->xsk_pool) {
1242 			xdp_rxq_info_unreg(&rx->xsk_rxq);
1243 			rx->xsk_pool = NULL;
1244 		}
1245 	}
1246 
1247 	for (i = 0; i < priv->num_xdp_queues; i++) {
1248 		tx_qid = gve_xdp_tx_queue_id(priv, i);
1249 		priv->tx[tx_qid].xsk_pool = NULL;
1250 	}
1251 }
1252 
1253 static void gve_drain_page_cache(struct gve_priv *priv)
1254 {
1255 	struct page_frag_cache *nc;
1256 	int i;
1257 
1258 	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
1259 		nc = &priv->rx[i].page_cache;
1260 		if (nc->va) {
1261 			__page_frag_cache_drain(virt_to_page(nc->va),
1262 						nc->pagecnt_bias);
1263 			nc->va = NULL;
1264 		}
1265 	}
1266 }
1267 
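/* ndo_open: allocate QPLs and rings, register them with the device, create
 * the device queues and bring napi and interrupts up. If the device has
 * already been touched when a step fails, recover with a full reset.
 */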
1268 static int gve_open(struct net_device *dev)
1269 {
1270 	struct gve_priv *priv = netdev_priv(dev);
1271 	int err;
1272 
1273 	if (priv->xdp_prog)
1274 		priv->num_xdp_queues = priv->rx_cfg.num_queues;
1275 	else
1276 		priv->num_xdp_queues = 0;
1277 
1278 	err = gve_alloc_qpls(priv);
1279 	if (err)
1280 		return err;
1281 
1282 	err = gve_alloc_rings(priv);
1283 	if (err)
1284 		goto free_qpls;
1285 
1286 	err = netif_set_real_num_tx_queues(dev, priv->tx_cfg.num_queues);
1287 	if (err)
1288 		goto free_rings;
1289 	err = netif_set_real_num_rx_queues(dev, priv->rx_cfg.num_queues);
1290 	if (err)
1291 		goto free_rings;
1292 
1293 	err = gve_reg_xdp_info(priv, dev);
1294 	if (err)
1295 		goto free_rings;
1296 
1297 	err = gve_register_qpls(priv);
1298 	if (err)
1299 		goto reset;
1300 
1301 	if (!gve_is_gqi(priv)) {
1302 		/* Hard code this for now. This may be tuned in the future for
1303 		 * performance.
1304 		 */
1305 		priv->data_buffer_size_dqo = GVE_RX_BUFFER_SIZE_DQO;
1306 	}
1307 	err = gve_create_rings(priv);
1308 	if (err)
1309 		goto reset;
1310 
1311 	gve_set_device_rings_ok(priv);
1312 
1313 	if (gve_get_report_stats(priv))
1314 		mod_timer(&priv->stats_report_timer,
1315 			  round_jiffies(jiffies +
1316 				msecs_to_jiffies(priv->stats_report_timer_period)));
1317 
1318 	gve_turnup(priv);
1319 	queue_work(priv->gve_wq, &priv->service_task);
1320 	priv->interface_up_cnt++;
1321 	return 0;
1322 
1323 free_rings:
1324 	gve_free_rings(priv);
1325 free_qpls:
1326 	gve_free_qpls(priv);
1327 	return err;
1328 
1329 reset:
1330 	/* This must have been called from a reset due to the rtnl lock
1331 	 * so just return at this point.
1332 	 */
1333 	if (gve_get_reset_in_progress(priv))
1334 		return err;
1335 	/* Otherwise reset before returning */
1336 	gve_reset_and_teardown(priv, true);
1337 	/* if this fails there is nothing we can do so just ignore the return */
1338 	gve_reset_recovery(priv, false);
1339 	/* return the original error */
1340 	return err;
1341 }
1342 
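/* ndo_stop: quiesce napi and TX, destroy the device queues, unregister the
 * QPLs and free all host-side ring state.
 */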
1343 static int gve_close(struct net_device *dev)
1344 {
1345 	struct gve_priv *priv = netdev_priv(dev);
1346 	int err;
1347 
1348 	netif_carrier_off(dev);
1349 	if (gve_get_device_rings_ok(priv)) {
1350 		gve_turndown(priv);
1351 		gve_drain_page_cache(priv);
1352 		err = gve_destroy_rings(priv);
1353 		if (err)
1354 			goto err;
1355 		err = gve_unregister_qpls(priv);
1356 		if (err)
1357 			goto err;
1358 		gve_clear_device_rings_ok(priv);
1359 	}
1360 	del_timer_sync(&priv->stats_report_timer);
1361 
1362 	gve_unreg_xdp_info(priv);
1363 	gve_free_rings(priv);
1364 	gve_free_qpls(priv);
1365 	priv->interface_down_cnt++;
1366 	return 0;
1367 
1368 err:
1369 	/* This must have been called from a reset due to the rtnl lock
1370 	 * so just return at this point.
1371 	 */
1372 	if (gve_get_reset_in_progress(priv))
1373 		return err;
1374 	/* Otherwise reset before returning */
1375 	gve_reset_and_teardown(priv, true);
1376 	return gve_reset_recovery(priv, false);
1377 }
1378 
1379 static int gve_remove_xdp_queues(struct gve_priv *priv)
1380 {
1381 	int err;
1382 
1383 	err = gve_destroy_xdp_rings(priv);
1384 	if (err)
1385 		return err;
1386 
1387 	err = gve_unregister_xdp_qpls(priv);
1388 	if (err)
1389 		return err;
1390 
1391 	gve_unreg_xdp_info(priv);
1392 	gve_free_xdp_rings(priv);
1393 	gve_free_xdp_qpls(priv);
1394 	priv->num_xdp_queues = 0;
1395 	return 0;
1396 }
1397 
1398 static int gve_add_xdp_queues(struct gve_priv *priv)
1399 {
1400 	int err;
1401 
1402 	priv->num_xdp_queues = priv->tx_cfg.num_queues;
1403 
1404 	err = gve_alloc_xdp_qpls(priv);
1405 	if (err)
1406 		goto err;
1407 
1408 	err = gve_alloc_xdp_rings(priv);
1409 	if (err)
1410 		goto free_xdp_qpls;
1411 
1412 	err = gve_reg_xdp_info(priv, priv->dev);
1413 	if (err)
1414 		goto free_xdp_rings;
1415 
1416 	err = gve_register_xdp_qpls(priv);
1417 	if (err)
1418 		goto free_xdp_rings;
1419 
1420 	err = gve_create_xdp_rings(priv);
1421 	if (err)
1422 		goto free_xdp_rings;
1423 
1424 	return 0;
1425 
1426 free_xdp_rings:
1427 	gve_free_xdp_rings(priv);
1428 free_xdp_qpls:
1429 	gve_free_xdp_qpls(priv);
1430 err:
1431 	priv->num_xdp_queues = 0;
1432 	return err;
1433 }
1434 
1435 static void gve_handle_link_status(struct gve_priv *priv, bool link_status)
1436 {
1437 	if (!gve_get_napi_enabled(priv))
1438 		return;
1439 
1440 	if (link_status == netif_carrier_ok(priv->dev))
1441 		return;
1442 
1443 	if (link_status) {
1444 		netdev_info(priv->dev, "Device link is up.\n");
1445 		netif_carrier_on(priv->dev);
1446 	} else {
1447 		netdev_info(priv->dev, "Device link is down.\n");
1448 		netif_carrier_off(priv->dev);
1449 	}
1450 }
1451 
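/* Install or remove an XDP program. If the carrier is up, the data path is
 * quiesced while the dedicated XDP TX queues are added or removed.
 */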
1452 static int gve_set_xdp(struct gve_priv *priv, struct bpf_prog *prog,
1453 		       struct netlink_ext_ack *extack)
1454 {
1455 	struct bpf_prog *old_prog;
1456 	int err = 0;
1457 	u32 status;
1458 
1459 	old_prog = READ_ONCE(priv->xdp_prog);
1460 	if (!netif_carrier_ok(priv->dev)) {
1461 		WRITE_ONCE(priv->xdp_prog, prog);
1462 		if (old_prog)
1463 			bpf_prog_put(old_prog);
1464 		return 0;
1465 	}
1466 
1467 	gve_turndown(priv);
1468 	if (!old_prog && prog) {
		/* Allocate XDP TX queues if an XDP program is
		 * being installed.
		 */
1471 		err = gve_add_xdp_queues(priv);
1472 		if (err)
1473 			goto out;
1474 	} else if (old_prog && !prog) {
		/* Remove XDP TX queues if an XDP program is
		 * being uninstalled.
		 */
1477 		err = gve_remove_xdp_queues(priv);
1478 		if (err)
1479 			goto out;
1480 	}
1481 	WRITE_ONCE(priv->xdp_prog, prog);
1482 	if (old_prog)
1483 		bpf_prog_put(old_prog);
1484 
1485 out:
1486 	gve_turnup(priv);
1487 	status = ioread32be(&priv->reg_bar0->device_status);
1488 	gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);
1489 	return err;
1490 }
1491 
1492 static int gve_xsk_pool_enable(struct net_device *dev,
1493 			       struct xsk_buff_pool *pool,
1494 			       u16 qid)
1495 {
1496 	struct gve_priv *priv = netdev_priv(dev);
1497 	struct napi_struct *napi;
1498 	struct gve_rx_ring *rx;
1499 	int tx_qid;
1500 	int err;
1501 
1502 	if (qid >= priv->rx_cfg.num_queues) {
1503 		dev_err(&priv->pdev->dev, "xsk pool invalid qid %d", qid);
1504 		return -EINVAL;
1505 	}
1506 	if (xsk_pool_get_rx_frame_size(pool) <
1507 	     priv->dev->max_mtu + sizeof(struct ethhdr)) {
1508 		dev_err(&priv->pdev->dev, "xsk pool frame_len too small");
1509 		return -EINVAL;
1510 	}
1511 
1512 	err = xsk_pool_dma_map(pool, &priv->pdev->dev,
1513 			       DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
1514 	if (err)
1515 		return err;
1516 
1517 	/* If XDP prog is not installed, return */
1518 	if (!priv->xdp_prog)
1519 		return 0;
1520 
1521 	rx = &priv->rx[qid];
1522 	napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
1523 	err = xdp_rxq_info_reg(&rx->xsk_rxq, dev, qid, napi->napi_id);
1524 	if (err)
1525 		goto err;
1526 
1527 	err = xdp_rxq_info_reg_mem_model(&rx->xsk_rxq,
1528 					 MEM_TYPE_XSK_BUFF_POOL, NULL);
1529 	if (err)
1530 		goto err;
1531 
1532 	xsk_pool_set_rxq_info(pool, &rx->xsk_rxq);
1533 	rx->xsk_pool = pool;
1534 
1535 	tx_qid = gve_xdp_tx_queue_id(priv, qid);
1536 	priv->tx[tx_qid].xsk_pool = pool;
1537 
1538 	return 0;
1539 err:
1540 	if (xdp_rxq_info_is_reg(&rx->xsk_rxq))
1541 		xdp_rxq_info_unreg(&rx->xsk_rxq);
1542 
1543 	xsk_pool_dma_unmap(pool,
1544 			   DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
1545 	return err;
1546 }
1547 
1548 static int gve_xsk_pool_disable(struct net_device *dev,
1549 				u16 qid)
1550 {
1551 	struct gve_priv *priv = netdev_priv(dev);
1552 	struct napi_struct *napi_rx;
1553 	struct napi_struct *napi_tx;
1554 	struct xsk_buff_pool *pool;
1555 	int tx_qid;
1556 
1557 	pool = xsk_get_pool_from_qid(dev, qid);
1558 	if (!pool)
1559 		return -EINVAL;
1560 	if (qid >= priv->rx_cfg.num_queues)
1561 		return -EINVAL;
1562 
1563 	/* If XDP prog is not installed, unmap DMA and return */
1564 	if (!priv->xdp_prog)
1565 		goto done;
1566 
1567 	tx_qid = gve_xdp_tx_queue_id(priv, qid);
1568 	if (!netif_running(dev)) {
1569 		priv->rx[qid].xsk_pool = NULL;
1570 		xdp_rxq_info_unreg(&priv->rx[qid].xsk_rxq);
1571 		priv->tx[tx_qid].xsk_pool = NULL;
1572 		goto done;
1573 	}
1574 
1575 	napi_rx = &priv->ntfy_blocks[priv->rx[qid].ntfy_id].napi;
1576 	napi_disable(napi_rx); /* make sure current rx poll is done */
1577 
1578 	napi_tx = &priv->ntfy_blocks[priv->tx[tx_qid].ntfy_id].napi;
1579 	napi_disable(napi_tx); /* make sure current tx poll is done */
1580 
1581 	priv->rx[qid].xsk_pool = NULL;
1582 	xdp_rxq_info_unreg(&priv->rx[qid].xsk_rxq);
1583 	priv->tx[tx_qid].xsk_pool = NULL;
1584 	smp_mb(); /* Make sure it is visible to the workers on datapath */
1585 
1586 	napi_enable(napi_rx);
1587 	if (gve_rx_work_pending(&priv->rx[qid]))
1588 		napi_schedule(napi_rx);
1589 
1590 	napi_enable(napi_tx);
1591 	if (gve_tx_clean_pending(priv, &priv->tx[tx_qid]))
1592 		napi_schedule(napi_tx);
1593 
1594 done:
1595 	xsk_pool_dma_unmap(pool,
1596 			   DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
1597 	return 0;
1598 }
1599 
1600 static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
1601 {
1602 	struct gve_priv *priv = netdev_priv(dev);
1603 	int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id);
1604 
1605 	if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog)
1606 		return -EINVAL;
1607 
1608 	if (flags & XDP_WAKEUP_TX) {
1609 		struct gve_tx_ring *tx = &priv->tx[tx_queue_id];
1610 		struct napi_struct *napi =
1611 			&priv->ntfy_blocks[tx->ntfy_id].napi;
1612 
1613 		if (!napi_if_scheduled_mark_missed(napi)) {
1614 			/* Call local_bh_enable to trigger SoftIRQ processing */
1615 			local_bh_disable();
1616 			napi_schedule(napi);
1617 			local_bh_enable();
1618 		}
1619 
1620 		tx->xdp_xsk_wakeup++;
1621 	}
1622 
1623 	return 0;
1624 }
1625 
1626 static int verify_xdp_configuration(struct net_device *dev)
1627 {
1628 	struct gve_priv *priv = netdev_priv(dev);
1629 
1630 	if (dev->features & NETIF_F_LRO) {
1631 		netdev_warn(dev, "XDP is not supported when LRO is on.\n");
1632 		return -EOPNOTSUPP;
1633 	}
1634 
1635 	if (priv->queue_format != GVE_GQI_QPL_FORMAT) {
1636 		netdev_warn(dev, "XDP is not supported in mode %d.\n",
1637 			    priv->queue_format);
1638 		return -EOPNOTSUPP;
1639 	}
1640 
1641 	if (dev->mtu > (PAGE_SIZE / 2) - sizeof(struct ethhdr) - GVE_RX_PAD) {
1642 		netdev_warn(dev, "XDP is not supported for mtu %d.\n",
1643 			    dev->mtu);
1644 		return -EOPNOTSUPP;
1645 	}
1646 
1647 	if (priv->rx_cfg.num_queues != priv->tx_cfg.num_queues ||
1648 	    (2 * priv->tx_cfg.num_queues > priv->tx_cfg.max_queues)) {
1649 		netdev_warn(dev, "XDP load failed: The number of configured RX queues %d should be equal to the number of configured TX queues %d and the number of configured RX/TX queues should be less than or equal to half the maximum number of RX/TX queues %d",
1650 			    priv->rx_cfg.num_queues,
1651 			    priv->tx_cfg.num_queues,
1652 			    priv->tx_cfg.max_queues);
1653 		return -EINVAL;
1654 	}
1655 	return 0;
1656 }
1657 
1658 static int gve_xdp(struct net_device *dev, struct netdev_bpf *xdp)
1659 {
1660 	struct gve_priv *priv = netdev_priv(dev);
1661 	int err;
1662 
1663 	err = verify_xdp_configuration(dev);
1664 	if (err)
1665 		return err;
1666 	switch (xdp->command) {
1667 	case XDP_SETUP_PROG:
1668 		return gve_set_xdp(priv, xdp->prog, xdp->extack);
1669 	case XDP_SETUP_XSK_POOL:
1670 		if (xdp->xsk.pool)
1671 			return gve_xsk_pool_enable(dev, xdp->xsk.pool, xdp->xsk.queue_id);
1672 		else
1673 			return gve_xsk_pool_disable(dev, xdp->xsk.queue_id);
1674 	default:
1675 		return -EINVAL;
1676 	}
1677 }
1678 
1679 int gve_adjust_queues(struct gve_priv *priv,
1680 		      struct gve_queue_config new_rx_config,
1681 		      struct gve_queue_config new_tx_config)
1682 {
1683 	int err;
1684 
1685 	if (netif_carrier_ok(priv->dev)) {
1686 		/* To make this process as simple as possible we teardown the
1687 		 * device, set the new configuration, and then bring the device
1688 		 * up again.
1689 		 */
1690 		err = gve_close(priv->dev);
1691 		/* we have already tried to reset in close,
1692 		 * just fail at this point
1693 		 */
1694 		if (err)
1695 			return err;
1696 		priv->tx_cfg = new_tx_config;
1697 		priv->rx_cfg = new_rx_config;
1698 
1699 		err = gve_open(priv->dev);
1700 		if (err)
1701 			goto err;
1702 
1703 		return 0;
1704 	}
1705 	/* Set the config for the next up. */
1706 	priv->tx_cfg = new_tx_config;
1707 	priv->rx_cfg = new_rx_config;
1708 
1709 	return 0;
1710 err:
1711 	netif_err(priv, drv, priv->dev,
1712 		  "Adjust queues failed! !!! DISABLING ALL QUEUES !!!\n");
1713 	gve_turndown(priv);
1714 	return err;
1715 }
1716 
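/* Quiesce the data path: mark the carrier off, disable every napi instance
 * and stop the TX queues so no new work is generated.
 */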
1717 static void gve_turndown(struct gve_priv *priv)
1718 {
1719 	int idx;
1720 
1721 	if (netif_carrier_ok(priv->dev))
1722 		netif_carrier_off(priv->dev);
1723 
1724 	if (!gve_get_napi_enabled(priv))
1725 		return;
1726 
1727 	/* Disable napi to prevent more work from coming in */
1728 	for (idx = 0; idx < gve_num_tx_queues(priv); idx++) {
1729 		int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
1730 		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1731 
1732 		napi_disable(&block->napi);
1733 	}
1734 	for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
1735 		int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
1736 		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1737 
1738 		napi_disable(&block->napi);
1739 	}
1740 
1741 	/* Stop tx queues */
1742 	netif_tx_disable(priv->dev);
1743 
1744 	gve_clear_napi_enabled(priv);
1745 	gve_clear_report_stats(priv);
1746 }
1747 
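/* Resume the data path: restart the TX queues, re-enable napi and unmask or
 * re-arm the interrupt for every queue.
 */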
1748 static void gve_turnup(struct gve_priv *priv)
1749 {
1750 	int idx;
1751 
1752 	/* Start the tx queues */
1753 	netif_tx_start_all_queues(priv->dev);
1754 
1755 	/* Enable napi and unmask interrupts for all queues */
1756 	for (idx = 0; idx < gve_num_tx_queues(priv); idx++) {
1757 		int ntfy_idx = gve_tx_idx_to_ntfy(priv, idx);
1758 		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1759 
1760 		napi_enable(&block->napi);
1761 		if (gve_is_gqi(priv)) {
1762 			iowrite32be(0, gve_irq_doorbell(priv, block));
1763 		} else {
1764 			gve_set_itr_coalesce_usecs_dqo(priv, block,
1765 						       priv->tx_coalesce_usecs);
1766 		}
1767 	}
1768 	for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
1769 		int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx);
1770 		struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx];
1771 
1772 		napi_enable(&block->napi);
1773 		if (gve_is_gqi(priv)) {
1774 			iowrite32be(0, gve_irq_doorbell(priv, block));
1775 		} else {
1776 			gve_set_itr_coalesce_usecs_dqo(priv, block,
1777 						       priv->rx_coalesce_usecs);
1778 		}
1779 	}
1780 
1781 	gve_set_napi_enabled(priv);
1782 }
1783 
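/* ndo_tx_timeout: if the device has made progress that we merely missed,
 * kick the queue by rescheduling napi; otherwise schedule a reset. Kicks are
 * rate-limited by MIN_TX_TIMEOUT_GAP.
 */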
1784 static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue)
1785 {
1786 	struct gve_notify_block *block;
1787 	struct gve_tx_ring *tx = NULL;
1788 	struct gve_priv *priv;
1789 	u32 last_nic_done;
1790 	u32 current_time;
1791 	u32 ntfy_idx;
1792 
	netdev_info(dev, "Timeout on tx queue %d", txqueue);
1794 	priv = netdev_priv(dev);
	if (txqueue >= priv->tx_cfg.num_queues)
1796 		goto reset;
1797 
1798 	ntfy_idx = gve_tx_idx_to_ntfy(priv, txqueue);
1799 	if (ntfy_idx >= priv->num_ntfy_blks)
1800 		goto reset;
1801 
1802 	block = &priv->ntfy_blocks[ntfy_idx];
1803 	tx = block->tx;
1804 
1805 	current_time = jiffies_to_msecs(jiffies);
1806 	if (tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time)
1807 		goto reset;
1808 
1809 	/* Check to see if there are missed completions, which will allow us to
1810 	 * kick the queue.
1811 	 */
1812 	last_nic_done = gve_tx_load_event_counter(priv, tx);
1813 	if (last_nic_done - tx->done) {
1814 		netdev_info(dev, "Kicking queue %d", txqueue);
1815 		iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block));
1816 		napi_schedule(&block->napi);
1817 		tx->last_kick_msec = current_time;
1818 		goto out;
	} /* Else reset. */
1820 
1821 reset:
1822 	gve_schedule_reset(priv);
1823 
1824 out:
1825 	if (tx)
1826 		tx->queue_timeout++;
1827 	priv->tx_timeo_cnt++;
1828 }
1829 
1830 static int gve_set_features(struct net_device *netdev,
1831 			    netdev_features_t features)
1832 {
1833 	const netdev_features_t orig_features = netdev->features;
1834 	struct gve_priv *priv = netdev_priv(netdev);
1835 	int err;
1836 
1837 	if ((netdev->features & NETIF_F_LRO) != (features & NETIF_F_LRO)) {
1838 		netdev->features ^= NETIF_F_LRO;
1839 		if (netif_carrier_ok(netdev)) {
1840 			/* To make this process as simple as possible we
1841 			 * teardown the device, set the new configuration,
1842 			 * and then bring the device up again.
1843 			 */
1844 			err = gve_close(netdev);
1845 			/* We have already tried to reset in close, just fail
1846 			 * at this point.
1847 			 */
1848 			if (err)
1849 				goto err;
1850 
1851 			err = gve_open(netdev);
1852 			if (err)
1853 				goto err;
1854 		}
1855 	}
1856 
1857 	return 0;
1858 err:
1859 	/* Reverts the change on error. */
1860 	netdev->features = orig_features;
1861 	netif_err(priv, drv, netdev,
1862 		  "Set features failed! !!! DISABLING ALL QUEUES !!!\n");
1863 	return err;
1864 }
1865 
1866 static const struct net_device_ops gve_netdev_ops = {
1867 	.ndo_start_xmit		=	gve_start_xmit,
1868 	.ndo_open		=	gve_open,
1869 	.ndo_stop		=	gve_close,
1870 	.ndo_get_stats64	=	gve_get_stats,
1871 	.ndo_tx_timeout         =       gve_tx_timeout,
1872 	.ndo_set_features	=	gve_set_features,
1873 	.ndo_bpf		=	gve_xdp,
1874 	.ndo_xdp_xmit		=	gve_xdp_xmit,
1875 	.ndo_xsk_wakeup		=	gve_xsk_wakeup,
1876 };
1877 
1878 static void gve_handle_status(struct gve_priv *priv, u32 status)
1879 {
1880 	if (GVE_DEVICE_STATUS_RESET_MASK & status) {
1881 		dev_info(&priv->pdev->dev, "Device requested reset.\n");
1882 		gve_set_do_reset(priv);
1883 	}
1884 	if (GVE_DEVICE_STATUS_REPORT_STATS_MASK & status) {
1885 		priv->stats_report_trigger_cnt++;
1886 		gve_set_do_report_stats(priv);
1887 	}
1888 }
1889 
1890 static void gve_handle_reset(struct gve_priv *priv)
1891 {
1892 	/* A service task will be scheduled at the end of probe to catch any
1893 	 * resets that need to happen, and we don't want to reset until
1894 	 * probe is done.
1895 	 */
1896 	if (gve_get_probe_in_progress(priv))
1897 		return;
1898 
1899 	if (gve_get_do_reset(priv)) {
1900 		rtnl_lock();
1901 		gve_reset(priv, false);
1902 		rtnl_unlock();
1903 	}
1904 }
1905 
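/* Populate the stats report buffer shared with the device with per-queue TX
 * and RX statistics.
 */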
1906 void gve_handle_report_stats(struct gve_priv *priv)
1907 {
1908 	struct stats *stats = priv->stats_report->stats;
1909 	int idx, stats_idx = 0;
1910 	unsigned int start = 0;
1911 	u64 tx_bytes;
1912 
1913 	if (!gve_get_report_stats(priv))
1914 		return;
1915 
1916 	be64_add_cpu(&priv->stats_report->written_count, 1);
1917 	/* tx stats */
1918 	if (priv->tx) {
1919 		for (idx = 0; idx < gve_num_tx_queues(priv); idx++) {
1920 			u32 last_completion = 0;
1921 			u32 tx_frames = 0;
1922 
1923 			/* DQO doesn't currently support these metrics. */
1924 			if (gve_is_gqi(priv)) {
1925 				last_completion = priv->tx[idx].done;
1926 				tx_frames = priv->tx[idx].req;
1927 			}
1928 
1929 			do {
1930 				start = u64_stats_fetch_begin(&priv->tx[idx].statss);
1931 				tx_bytes = priv->tx[idx].bytes_done;
1932 			} while (u64_stats_fetch_retry(&priv->tx[idx].statss, start));
1933 			stats[stats_idx++] = (struct stats) {
1934 				.stat_name = cpu_to_be32(TX_WAKE_CNT),
1935 				.value = cpu_to_be64(priv->tx[idx].wake_queue),
1936 				.queue_id = cpu_to_be32(idx),
1937 			};
1938 			stats[stats_idx++] = (struct stats) {
1939 				.stat_name = cpu_to_be32(TX_STOP_CNT),
1940 				.value = cpu_to_be64(priv->tx[idx].stop_queue),
1941 				.queue_id = cpu_to_be32(idx),
1942 			};
1943 			stats[stats_idx++] = (struct stats) {
1944 				.stat_name = cpu_to_be32(TX_FRAMES_SENT),
1945 				.value = cpu_to_be64(tx_frames),
1946 				.queue_id = cpu_to_be32(idx),
1947 			};
1948 			stats[stats_idx++] = (struct stats) {
1949 				.stat_name = cpu_to_be32(TX_BYTES_SENT),
1950 				.value = cpu_to_be64(tx_bytes),
1951 				.queue_id = cpu_to_be32(idx),
1952 			};
1953 			stats[stats_idx++] = (struct stats) {
1954 				.stat_name = cpu_to_be32(TX_LAST_COMPLETION_PROCESSED),
1955 				.value = cpu_to_be64(last_completion),
1956 				.queue_id = cpu_to_be32(idx),
1957 			};
1958 			stats[stats_idx++] = (struct stats) {
1959 				.stat_name = cpu_to_be32(TX_TIMEOUT_CNT),
1960 				.value = cpu_to_be64(priv->tx[idx].queue_timeout),
1961 				.queue_id = cpu_to_be32(idx),
1962 			};
1963 		}
1964 	}
1965 	/* rx stats */
1966 	if (priv->rx) {
1967 		for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) {
1968 			stats[stats_idx++] = (struct stats) {
1969 				.stat_name = cpu_to_be32(RX_NEXT_EXPECTED_SEQUENCE),
1970 				.value = cpu_to_be64(priv->rx[idx].desc.seqno),
1971 				.queue_id = cpu_to_be32(idx),
1972 			};
1973 			stats[stats_idx++] = (struct stats) {
1974 				.stat_name = cpu_to_be32(RX_BUFFERS_POSTED),
				.value = cpu_to_be64(priv->rx[idx].fill_cnt),
1976 				.queue_id = cpu_to_be32(idx),
1977 			};
1978 		}
1979 	}
1980 }
1981 
1982 /* Handle NIC status register changes, reset requests and report stats */
1983 static void gve_service_task(struct work_struct *work)
1984 {
1985 	struct gve_priv *priv = container_of(work, struct gve_priv,
1986 					     service_task);
1987 	u32 status = ioread32be(&priv->reg_bar0->device_status);
1988 
1989 	gve_handle_status(priv, status);
1990 
1991 	gve_handle_reset(priv);
1992 	gve_handle_link_status(priv, GVE_DEVICE_STATUS_LINK_STATUS_MASK & status);
1993 }
1994 
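/* XDP (basic, redirect, ndo_xmit and AF_XDP zero-copy) is only advertised
 * for the GQI QPL queue format; other queue formats report no XDP features.
 */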
1995 static void gve_set_netdev_xdp_features(struct gve_priv *priv)
1996 {
1997 	if (priv->queue_format == GVE_GQI_QPL_FORMAT) {
1998 		priv->dev->xdp_features = NETDEV_XDP_ACT_BASIC;
1999 		priv->dev->xdp_features |= NETDEV_XDP_ACT_REDIRECT;
2000 		priv->dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT;
2001 		priv->dev->xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY;
2002 	} else {
2003 		priv->dev->xdp_features = 0;
2004 	}
2005 }
2006 
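/* Initialize the driver's view of the device: allocate the admin queue,
 * verify driver compatibility and, unless skip_describe_device is set, query
 * the device description and size the notification blocks and queue counts
 * before setting up the remaining device resources. The admin queue is freed
 * on any failure.
 */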
2007 static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device)
2008 {
2009 	int num_ntfy;
2010 	int err;
2011 
2012 	/* Set up the adminq */
2013 	err = gve_adminq_alloc(&priv->pdev->dev, priv);
2014 	if (err) {
2015 		dev_err(&priv->pdev->dev,
2016 			"Failed to alloc admin queue: err=%d\n", err);
2017 		return err;
2018 	}
2019 
2020 	err = gve_verify_driver_compatibility(priv);
2021 	if (err) {
2022 		dev_err(&priv->pdev->dev,
2023 			"Could not verify driver compatibility: err=%d\n", err);
2024 		goto err;
2025 	}
2026 
2027 	if (skip_describe_device)
2028 		goto setup_device;
2029 
2030 	priv->queue_format = GVE_QUEUE_FORMAT_UNSPECIFIED;
2031 	/* Get the initial information we need from the device */
2032 	err = gve_adminq_describe_device(priv);
2033 	if (err) {
2034 		dev_err(&priv->pdev->dev,
2035 			"Could not get device information: err=%d\n", err);
2036 		goto err;
2037 	}
2038 	priv->dev->mtu = priv->dev->max_mtu;
2039 	num_ntfy = pci_msix_vec_count(priv->pdev);
2040 	if (num_ntfy <= 0) {
2041 		dev_err(&priv->pdev->dev,
2042 			"could not count MSI-x vectors: err=%d\n", num_ntfy);
2043 		err = num_ntfy;
2044 		goto err;
2045 	} else if (num_ntfy < GVE_MIN_MSIX) {
2046 		dev_err(&priv->pdev->dev, "gve needs at least %d MSI-x vectors, but only has %d\n",
2047 			GVE_MIN_MSIX, num_ntfy);
2048 		err = -EINVAL;
2049 		goto err;
2050 	}
2051 
	/* Big TCP is only supported on DQ */
2053 	if (!gve_is_gqi(priv))
2054 		netif_set_tso_max_size(priv->dev, DQO_TX_MAX);
2055 
2056 	priv->num_registered_pages = 0;
2057 	priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK;
	/* gvnic has one Notification Block per MSI-x vector, except for the
	 * management vector. Reserve that vector and round the remaining
	 * vectors down to an even count so they split evenly between TX and
	 * RX queues.
	 */
2061 	priv->num_ntfy_blks = (num_ntfy - 1) & ~0x1;
2062 	priv->mgmt_msix_idx = priv->num_ntfy_blks;
2063 
2064 	priv->tx_cfg.max_queues =
2065 		min_t(int, priv->tx_cfg.max_queues, priv->num_ntfy_blks / 2);
2066 	priv->rx_cfg.max_queues =
2067 		min_t(int, priv->rx_cfg.max_queues, priv->num_ntfy_blks / 2);
2068 
2069 	priv->tx_cfg.num_queues = priv->tx_cfg.max_queues;
2070 	priv->rx_cfg.num_queues = priv->rx_cfg.max_queues;
2071 	if (priv->default_num_queues > 0) {
2072 		priv->tx_cfg.num_queues = min_t(int, priv->default_num_queues,
2073 						priv->tx_cfg.num_queues);
2074 		priv->rx_cfg.num_queues = min_t(int, priv->default_num_queues,
2075 						priv->rx_cfg.num_queues);
2076 	}
2077 
2078 	dev_info(&priv->pdev->dev, "TX queues %d, RX queues %d\n",
2079 		 priv->tx_cfg.num_queues, priv->rx_cfg.num_queues);
2080 	dev_info(&priv->pdev->dev, "Max TX queues %d, Max RX queues %d\n",
2081 		 priv->tx_cfg.max_queues, priv->rx_cfg.max_queues);
2082 
2083 	if (!gve_is_gqi(priv)) {
2084 		priv->tx_coalesce_usecs = GVE_TX_IRQ_RATELIMIT_US_DQO;
2085 		priv->rx_coalesce_usecs = GVE_RX_IRQ_RATELIMIT_US_DQO;
2086 	}
2087 
2088 setup_device:
2089 	gve_set_netdev_xdp_features(priv);
2090 	err = gve_setup_device_resources(priv);
2091 	if (!err)
2092 		return 0;
2093 err:
2094 	gve_adminq_free(&priv->pdev->dev, priv);
2095 	return err;
2096 }
2097 
2098 static void gve_teardown_priv_resources(struct gve_priv *priv)
2099 {
2100 	gve_teardown_device_resources(priv);
2101 	gve_adminq_free(&priv->pdev->dev, priv);
2102 }
2103 
2104 static void gve_trigger_reset(struct gve_priv *priv)
2105 {
2106 	/* Reset the device by releasing the AQ */
2107 	gve_adminq_release(priv);
2108 }
2109 
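/* Force a reset by releasing the admin queue, then close the netdev if it
 * was up and free the remaining private resources.
 */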
2110 static void gve_reset_and_teardown(struct gve_priv *priv, bool was_up)
2111 {
2112 	gve_trigger_reset(priv);
2113 	/* With the reset having already happened, close cannot fail */
2114 	if (was_up)
2115 		gve_close(priv->dev);
2116 	gve_teardown_priv_resources(priv);
2117 }
2118 
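/* Bring the device back after a reset: re-run init and, if the interface was
 * up before the reset, reopen it. On failure all queues are turned down.
 */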
2119 static int gve_reset_recovery(struct gve_priv *priv, bool was_up)
2120 {
2121 	int err;
2122 
2123 	err = gve_init_priv(priv, true);
2124 	if (err)
2125 		goto err;
2126 	if (was_up) {
2127 		err = gve_open(priv->dev);
2128 		if (err)
2129 			goto err;
2130 	}
2131 	return 0;
2132 err:
2133 	dev_err(&priv->pdev->dev, "Reset failed! !!! DISABLING ALL QUEUES !!!\n");
2134 	gve_turndown(priv);
2135 	return err;
2136 }
2137 
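/* Reset the device. With attempt_teardown, try a normal close first and fall
 * back to the forced reset path only if that close fails; otherwise turn the
 * queues down and reset immediately. Recovery then re-initializes the device
 * and reopens it if it was up before the reset.
 */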
2138 int gve_reset(struct gve_priv *priv, bool attempt_teardown)
2139 {
2140 	bool was_up = netif_carrier_ok(priv->dev);
2141 	int err;
2142 
2143 	dev_info(&priv->pdev->dev, "Performing reset\n");
2144 	gve_clear_do_reset(priv);
2145 	gve_set_reset_in_progress(priv);
2146 	/* If we aren't attempting to teardown normally, just go turndown and
2147 	 * reset right away.
2148 	 */
2149 	if (!attempt_teardown) {
2150 		gve_turndown(priv);
2151 		gve_reset_and_teardown(priv, was_up);
2152 	} else {
2153 		/* Otherwise attempt to close normally */
2154 		if (was_up) {
2155 			err = gve_close(priv->dev);
2156 			/* If that fails reset as we did above */
2157 			if (err)
2158 				gve_reset_and_teardown(priv, was_up);
2159 		}
2160 		/* Clean up any remaining resources */
2161 		gve_teardown_priv_resources(priv);
2162 	}
2163 
2164 	/* Set it all back up */
2165 	err = gve_reset_recovery(priv, was_up);
2166 	gve_clear_reset_in_progress(priv);
2167 	priv->reset_cnt++;
2168 	priv->interface_up_cnt = 0;
2169 	priv->interface_down_cnt = 0;
2170 	priv->stats_report_trigger_cnt = 0;
2171 	return err;
2172 }
2173 
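/* Report the driver version to the device by writing the version prefix and
 * version string to the driver version register one byte at a time, followed
 * by a terminating newline.
 */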
2174 static void gve_write_version(u8 __iomem *driver_version_register)
2175 {
2176 	const char *c = gve_version_prefix;
2177 
2178 	while (*c) {
2179 		writeb(*c, driver_version_register);
2180 		c++;
2181 	}
2182 
2183 	c = gve_version_str;
2184 	while (*c) {
2185 		writeb(*c, driver_version_register);
2186 		c++;
2187 	}
2188 	writeb('\n', driver_version_register);
2189 }
2190 
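/* PCI probe: enable the device, map the register and doorbell BARs, read the
 * maximum queue counts, allocate and configure the netdev and private state,
 * initialize the device, and register the netdev. The service task is queued
 * at the end to handle any reset requested while probe was in progress.
 */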
2191 static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
2192 {
2193 	int max_tx_queues, max_rx_queues;
2194 	struct net_device *dev;
2195 	__be32 __iomem *db_bar;
2196 	struct gve_registers __iomem *reg_bar;
2197 	struct gve_priv *priv;
2198 	int err;
2199 
2200 	err = pci_enable_device(pdev);
2201 	if (err)
2202 		return err;
2203 
2204 	err = pci_request_regions(pdev, gve_driver_name);
2205 	if (err)
2206 		goto abort_with_enabled;
2207 
2208 	pci_set_master(pdev);
2209 
2210 	err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
2211 	if (err) {
2212 		dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err);
2213 		goto abort_with_pci_region;
2214 	}
2215 
2216 	reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0);
2217 	if (!reg_bar) {
2218 		dev_err(&pdev->dev, "Failed to map pci bar!\n");
2219 		err = -ENOMEM;
2220 		goto abort_with_pci_region;
2221 	}
2222 
2223 	db_bar = pci_iomap(pdev, GVE_DOORBELL_BAR, 0);
2224 	if (!db_bar) {
2225 		dev_err(&pdev->dev, "Failed to map doorbell bar!\n");
2226 		err = -ENOMEM;
2227 		goto abort_with_reg_bar;
2228 	}
2229 
2230 	gve_write_version(&reg_bar->driver_version);
2231 	/* Get max queues to alloc etherdev */
2232 	max_tx_queues = ioread32be(&reg_bar->max_tx_queues);
2233 	max_rx_queues = ioread32be(&reg_bar->max_rx_queues);
2234 	/* Alloc and setup the netdev and priv */
2235 	dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues);
2236 	if (!dev) {
2237 		dev_err(&pdev->dev, "could not allocate netdev\n");
2238 		err = -ENOMEM;
2239 		goto abort_with_db_bar;
2240 	}
2241 	SET_NETDEV_DEV(dev, &pdev->dev);
2242 	pci_set_drvdata(pdev, dev);
2243 	dev->ethtool_ops = &gve_ethtool_ops;
2244 	dev->netdev_ops = &gve_netdev_ops;
2245 
2246 	/* Set default and supported features.
2247 	 *
2248 	 * Features might be set in other locations as well (such as
2249 	 * `gve_adminq_describe_device`).
2250 	 */
2251 	dev->hw_features = NETIF_F_HIGHDMA;
2252 	dev->hw_features |= NETIF_F_SG;
2253 	dev->hw_features |= NETIF_F_HW_CSUM;
2254 	dev->hw_features |= NETIF_F_TSO;
2255 	dev->hw_features |= NETIF_F_TSO6;
2256 	dev->hw_features |= NETIF_F_TSO_ECN;
2257 	dev->hw_features |= NETIF_F_RXCSUM;
2258 	dev->hw_features |= NETIF_F_RXHASH;
2259 	dev->features = dev->hw_features;
2260 	dev->watchdog_timeo = 5 * HZ;
2261 	dev->min_mtu = ETH_MIN_MTU;
2262 	netif_carrier_off(dev);
2263 
2264 	priv = netdev_priv(dev);
2265 	priv->dev = dev;
2266 	priv->pdev = pdev;
2267 	priv->msg_enable = DEFAULT_MSG_LEVEL;
2268 	priv->reg_bar0 = reg_bar;
2269 	priv->db_bar2 = db_bar;
2270 	priv->service_task_flags = 0x0;
2271 	priv->state_flags = 0x0;
2272 	priv->ethtool_flags = 0x0;
2273 
2274 	gve_set_probe_in_progress(priv);
2275 	priv->gve_wq = alloc_ordered_workqueue("gve", 0);
2276 	if (!priv->gve_wq) {
2277 		dev_err(&pdev->dev, "Could not allocate workqueue");
2278 		err = -ENOMEM;
2279 		goto abort_with_netdev;
2280 	}
2281 	INIT_WORK(&priv->service_task, gve_service_task);
2282 	INIT_WORK(&priv->stats_report_task, gve_stats_report_task);
2283 	priv->tx_cfg.max_queues = max_tx_queues;
2284 	priv->rx_cfg.max_queues = max_rx_queues;
2285 
2286 	err = gve_init_priv(priv, false);
2287 	if (err)
2288 		goto abort_with_wq;
2289 
2290 	err = register_netdev(dev);
2291 	if (err)
2292 		goto abort_with_gve_init;
2293 
2294 	dev_info(&pdev->dev, "GVE version %s\n", gve_version_str);
2295 	dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format);
2296 	gve_clear_probe_in_progress(priv);
2297 	queue_work(priv->gve_wq, &priv->service_task);
2298 	return 0;
2299 
2300 abort_with_gve_init:
2301 	gve_teardown_priv_resources(priv);
2302 
2303 abort_with_wq:
2304 	destroy_workqueue(priv->gve_wq);
2305 
2306 abort_with_netdev:
2307 	free_netdev(dev);
2308 
2309 abort_with_db_bar:
2310 	pci_iounmap(pdev, db_bar);
2311 
2312 abort_with_reg_bar:
2313 	pci_iounmap(pdev, reg_bar);
2314 
2315 abort_with_pci_region:
2316 	pci_release_regions(pdev);
2317 
2318 abort_with_enabled:
2319 	pci_disable_device(pdev);
2320 	return err;
2321 }
2322 
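/* PCI remove: unwind probe in reverse order, unregistering the netdev and
 * freeing device resources before releasing the BARs and the PCI device.
 */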
2323 static void gve_remove(struct pci_dev *pdev)
2324 {
2325 	struct net_device *netdev = pci_get_drvdata(pdev);
2326 	struct gve_priv *priv = netdev_priv(netdev);
2327 	__be32 __iomem *db_bar = priv->db_bar2;
2328 	void __iomem *reg_bar = priv->reg_bar0;
2329 
2330 	unregister_netdev(netdev);
2331 	gve_teardown_priv_resources(priv);
2332 	destroy_workqueue(priv->gve_wq);
2333 	free_netdev(netdev);
2334 	pci_iounmap(pdev, db_bar);
2335 	pci_iounmap(pdev, reg_bar);
2336 	pci_release_regions(pdev);
2337 	pci_disable_device(pdev);
2338 }
2339 
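/* On shutdown, tear the device down under the rtnl lock, resetting first
 * only if the device was up and a normal close fails.
 */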
2340 static void gve_shutdown(struct pci_dev *pdev)
2341 {
2342 	struct net_device *netdev = pci_get_drvdata(pdev);
2343 	struct gve_priv *priv = netdev_priv(netdev);
2344 	bool was_up = netif_carrier_ok(priv->dev);
2345 
2346 	rtnl_lock();
2347 	if (was_up && gve_close(priv->dev)) {
2348 		/* If the dev was up, attempt to close, if close fails, reset */
2349 		gve_reset_and_teardown(priv, was_up);
2350 	} else {
2351 		/* If the dev wasn't up or close worked, finish tearing down */
2352 		gve_teardown_priv_resources(priv);
2353 	}
2354 	rtnl_unlock();
2355 }
2356 
2357 #ifdef CONFIG_PM
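/* Legacy PCI power-management hooks: suspend tears the device down, recording
 * whether it was up, and resume reuses the reset recovery path to bring it
 * back.
 */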
2358 static int gve_suspend(struct pci_dev *pdev, pm_message_t state)
2359 {
2360 	struct net_device *netdev = pci_get_drvdata(pdev);
2361 	struct gve_priv *priv = netdev_priv(netdev);
2362 	bool was_up = netif_carrier_ok(priv->dev);
2363 
2364 	priv->suspend_cnt++;
2365 	rtnl_lock();
2366 	if (was_up && gve_close(priv->dev)) {
2367 		/* If the dev was up, attempt to close, if close fails, reset */
2368 		gve_reset_and_teardown(priv, was_up);
2369 	} else {
2370 		/* If the dev wasn't up or close worked, finish tearing down */
2371 		gve_teardown_priv_resources(priv);
2372 	}
2373 	priv->up_before_suspend = was_up;
2374 	rtnl_unlock();
2375 	return 0;
2376 }
2377 
2378 static int gve_resume(struct pci_dev *pdev)
2379 {
2380 	struct net_device *netdev = pci_get_drvdata(pdev);
2381 	struct gve_priv *priv = netdev_priv(netdev);
2382 	int err;
2383 
2384 	priv->resume_cnt++;
2385 	rtnl_lock();
2386 	err = gve_reset_recovery(priv, priv->up_before_suspend);
2387 	rtnl_unlock();
2388 	return err;
2389 }
2390 #endif /* CONFIG_PM */
2391 
2392 static const struct pci_device_id gve_id_table[] = {
2393 	{ PCI_DEVICE(PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC) },
2394 	{ }
2395 };
2396 
2397 static struct pci_driver gve_driver = {
2398 	.name		= gve_driver_name,
2399 	.id_table	= gve_id_table,
2400 	.probe		= gve_probe,
2401 	.remove		= gve_remove,
2402 	.shutdown	= gve_shutdown,
2403 #ifdef CONFIG_PM
2404 	.suspend        = gve_suspend,
2405 	.resume         = gve_resume,
2406 #endif
2407 };
2408 
2409 module_pci_driver(gve_driver);
2410 
2411 MODULE_DEVICE_TABLE(pci, gve_id_table);
2412 MODULE_AUTHOR("Google, Inc.");
2413 MODULE_DESCRIPTION("Google Virtual NIC Driver");
2414 MODULE_LICENSE("Dual MIT/GPL");
2415 MODULE_VERSION(GVE_VERSION);
2416