1 /*
2  * Back-end of the driver for virtual network devices. This portion of the
3  * driver exports a 'unified' network-device interface that can be accessed
4  * by any operating system that implements a compatible front end. A
5  * reference front-end implementation can be found in:
6  *  drivers/net/xen-netfront.c
7  *
8  * Copyright (c) 2002-2005, K A Fraser
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public License version 2
12  * as published by the Free Software Foundation; or, when distributed
13  * separately from the Linux kernel or incorporated into other
14  * software packages, subject to the following license:
15  *
16  * Permission is hereby granted, free of charge, to any person obtaining a copy
17  * of this source file (the "Software"), to deal in the Software without
18  * restriction, including without limitation the rights to use, copy, modify,
19  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
20  * and to permit persons to whom the Software is furnished to do so, subject to
21  * the following conditions:
22  *
23  * The above copyright notice and this permission notice shall be included in
24  * all copies or substantial portions of the Software.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
31  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
32  * IN THE SOFTWARE.
33  */
34 
35 #include "common.h"
36 
37 #include <linux/kthread.h>
38 #include <linux/if_vlan.h>
39 #include <linux/udp.h>
40 #include <linux/highmem.h>
41 
42 #include <net/tcp.h>
43 
44 #include <xen/xen.h>
45 #include <xen/events.h>
46 #include <xen/interface/memory.h>
47 #include <xen/page.h>
48 
49 #include <asm/xen/hypercall.h>
50 
/* Provide an option to disable split event channels at load time as
 * event channels are a limited resource. Split event channels are
 * enabled by default.
 */
55 bool separate_tx_rx_irq = true;
56 module_param(separate_tx_rx_irq, bool, 0644);
57 
58 /* The time that packets can stay on the guest Rx internal queue
59  * before they are dropped.
60  */
61 unsigned int rx_drain_timeout_msecs = 10000;
62 module_param(rx_drain_timeout_msecs, uint, 0444);
63 
64 /* The length of time before the frontend is considered unresponsive
65  * because it isn't providing Rx slots.
66  */
67 unsigned int rx_stall_timeout_msecs = 60000;
68 module_param(rx_stall_timeout_msecs, uint, 0444);
69 
70 #define MAX_QUEUES_DEFAULT 8
71 unsigned int xenvif_max_queues;
72 module_param_named(max_queues, xenvif_max_queues, uint, 0644);
73 MODULE_PARM_DESC(max_queues,
74 		 "Maximum number of queues per virtual interface");
75 
/*
 * This is the maximum number of slots an skb can have. If a guest sends
 * an skb which exceeds this limit it is considered malicious.
 */
80 #define FATAL_SKB_SLOTS_DEFAULT 20
81 static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
82 module_param(fatal_skb_slots, uint, 0444);
83 
84 /* The amount to copy out of the first guest Tx slot into the skb's
85  * linear area.  If the first slot has more data, it will be mapped
86  * and put into the first frag.
87  *
88  * This is sized to avoid pulling headers from the frags for most
89  * TCP/IP packets.
90  */
91 #define XEN_NETBACK_TX_COPY_LEN 128
92 
93 /* This is the maximum number of flows in the hash cache. */
94 #define XENVIF_HASH_CACHE_SIZE_DEFAULT 64
95 unsigned int xenvif_hash_cache_size = XENVIF_HASH_CACHE_SIZE_DEFAULT;
96 module_param_named(hash_cache_size, xenvif_hash_cache_size, uint, 0644);
97 MODULE_PARM_DESC(hash_cache_size, "Number of flows in the hash cache");
98 
/* This module parameter indicates whether data destined for xen-netfront
 * must be placed at the XDP_PACKET_HEADROOM offset required for XDP
 * processing.
 */
103 bool provides_xdp_headroom = true;
104 module_param(provides_xdp_headroom, bool, 0644);
105 
106 static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
107 			       u8 status);
108 
109 static void make_tx_response(struct xenvif_queue *queue,
110 			     struct xen_netif_tx_request *txp,
111 			     unsigned int extra_count,
112 			     s8       st);
113 static void push_tx_responses(struct xenvif_queue *queue);
114 
115 static void xenvif_idx_unmap(struct xenvif_queue *queue, u16 pending_idx);
116 
117 static inline int tx_work_todo(struct xenvif_queue *queue);
118 
119 static inline unsigned long idx_to_pfn(struct xenvif_queue *queue,
120 				       u16 idx)
121 {
122 	return page_to_pfn(queue->mmap_pages[idx]);
123 }
124 
125 static inline unsigned long idx_to_kaddr(struct xenvif_queue *queue,
126 					 u16 idx)
127 {
128 	return (unsigned long)pfn_to_kaddr(idx_to_pfn(queue, idx));
129 }
130 
131 #define callback_param(vif, pending_idx) \
132 	(vif->pending_tx_info[pending_idx].callback_struct)
133 
/* Find the containing queue's structure from a pointer into the
 * pending_tx_info array.
 */
136 static inline struct xenvif_queue *ubuf_to_queue(const struct ubuf_info *ubuf)
137 {
138 	u16 pending_idx = ubuf->desc;
139 	struct pending_tx_info *temp =
140 		container_of(ubuf, struct pending_tx_info, callback_struct);
141 	return container_of(temp - pending_idx,
142 			    struct xenvif_queue,
143 			    pending_tx_info[0]);
144 }
145 
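/* A frag's pending index is stashed in its (otherwise unused) page offset
 * field until the real page backing it is filled in.
 */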
146 static u16 frag_get_pending_idx(skb_frag_t *frag)
147 {
148 	return (u16)skb_frag_off(frag);
149 }
150 
151 static void frag_set_pending_idx(skb_frag_t *frag, u16 pending_idx)
152 {
153 	skb_frag_off_set(frag, pending_idx);
154 }
155 
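/* Map a free-running index onto the fixed-size pending ring.
 * MAX_PENDING_REQS is a power of two, so masking wraps correctly.
 */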
156 static inline pending_ring_idx_t pending_index(unsigned i)
157 {
158 	return i & (MAX_PENDING_REQS-1);
159 }
160 
161 void xenvif_kick_thread(struct xenvif_queue *queue)
162 {
163 	wake_up(&queue->wq);
164 }
165 
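/* If the frontend has queued further tx requests, schedule NAPI; otherwise
 * signal end-of-interrupt so the next event from the frontend gets through.
 */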
166 void xenvif_napi_schedule_or_enable_events(struct xenvif_queue *queue)
167 {
168 	int more_to_do;
169 
170 	RING_FINAL_CHECK_FOR_REQUESTS(&queue->tx, more_to_do);
171 
172 	if (more_to_do)
173 		napi_schedule(&queue->napi);
174 	else if (atomic_fetch_andnot(NETBK_TX_EOI | NETBK_COMMON_EOI,
175 				     &queue->eoi_pending) &
176 		 (NETBK_TX_EOI | NETBK_COMMON_EOI))
177 		xen_irq_lateeoi(queue->tx_irq, 0);
178 }
179 
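/* Top up the queue's transmit credit, allowing a burst large enough for a
 * jumbo packet and guarding against wrap-around of the credit counter.
 */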
180 static void tx_add_credit(struct xenvif_queue *queue)
181 {
182 	unsigned long max_burst, max_credit;
183 
184 	/*
185 	 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
186 	 * Otherwise the interface can seize up due to insufficient credit.
187 	 */
188 	max_burst = max(131072UL, queue->credit_bytes);
189 
190 	/* Take care that adding a new chunk of credit doesn't wrap to zero. */
191 	max_credit = queue->remaining_credit + queue->credit_bytes;
192 	if (max_credit < queue->remaining_credit)
193 		max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
194 
195 	queue->remaining_credit = min(max_credit, max_burst);
196 	queue->rate_limited = false;
197 }
198 
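/* Timer callback: the credit window has elapsed, so replenish credit and
 * restart transmit processing.
 */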
199 void xenvif_tx_credit_callback(struct timer_list *t)
200 {
201 	struct xenvif_queue *queue = from_timer(queue, t, credit_timeout);
202 	tx_add_credit(queue);
203 	xenvif_napi_schedule_or_enable_events(queue);
204 }
205 
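/* Respond to the frontend with an error for this request and for any
 * remaining slots of the packet up to 'end'.
 */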
206 static void xenvif_tx_err(struct xenvif_queue *queue,
207 			  struct xen_netif_tx_request *txp,
208 			  unsigned int extra_count, RING_IDX end)
209 {
210 	RING_IDX cons = queue->tx.req_cons;
211 	unsigned long flags;
212 
213 	do {
214 		spin_lock_irqsave(&queue->response_lock, flags);
215 		make_tx_response(queue, txp, extra_count, XEN_NETIF_RSP_ERROR);
216 		push_tx_responses(queue);
217 		spin_unlock_irqrestore(&queue->response_lock, flags);
218 		if (cons == end)
219 			break;
220 		RING_COPY_REQUEST(&queue->tx, cons++, txp);
221 		extra_count = 0; /* only the first frag can have extras */
222 	} while (1);
223 	queue->tx.req_cons = cons;
224 }
225 
226 static void xenvif_fatal_tx_err(struct xenvif *vif)
227 {
228 	netdev_err(vif->dev, "fatal error; disabling device\n");
229 	vif->disabled = true;
230 	/* Disable the vif from queue 0's kthread */
231 	if (vif->num_queues)
232 		xenvif_kick_thread(&vif->queues[0]);
233 }
234 
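/* Count the extra slots making up a packet, copying each request into
 * txp[]. Returns the number of slots following the first, or a negative
 * errno if the frontend misbehaved (fatal) or the packet must be dropped.
 */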
235 static int xenvif_count_requests(struct xenvif_queue *queue,
236 				 struct xen_netif_tx_request *first,
237 				 unsigned int extra_count,
238 				 struct xen_netif_tx_request *txp,
239 				 int work_to_do)
240 {
241 	RING_IDX cons = queue->tx.req_cons;
242 	int slots = 0;
243 	int drop_err = 0;
244 	int more_data;
245 
246 	if (!(first->flags & XEN_NETTXF_more_data))
247 		return 0;
248 
249 	do {
250 		struct xen_netif_tx_request dropped_tx = { 0 };
251 
252 		if (slots >= work_to_do) {
253 			netdev_err(queue->vif->dev,
254 				   "Asked for %d slots but exceeds this limit\n",
255 				   work_to_do);
256 			xenvif_fatal_tx_err(queue->vif);
257 			return -ENODATA;
258 		}
259 
		/* This guest is really using too many slots and is
		 * considered malicious.
		 */
263 		if (unlikely(slots >= fatal_skb_slots)) {
264 			netdev_err(queue->vif->dev,
265 				   "Malicious frontend using %d slots, threshold %u\n",
266 				   slots, fatal_skb_slots);
267 			xenvif_fatal_tx_err(queue->vif);
268 			return -E2BIG;
269 		}
270 
		/* The Xen network protocol has an implicit dependency on
		 * MAX_SKB_FRAGS. XEN_NETBK_LEGACY_SLOTS_MAX is set to the
		 * historical MAX_SKB_FRAGS value 18 to honor the same
		 * behavior as before. Any packet using more than 18 slots
		 * but fewer than fatal_skb_slots slots is dropped.
		 */
278 		if (!drop_err && slots >= XEN_NETBK_LEGACY_SLOTS_MAX) {
279 			if (net_ratelimit())
280 				netdev_dbg(queue->vif->dev,
281 					   "Too many slots (%d) exceeding limit (%d), dropping packet\n",
282 					   slots, XEN_NETBK_LEGACY_SLOTS_MAX);
283 			drop_err = -E2BIG;
284 		}
285 
286 		if (drop_err)
287 			txp = &dropped_tx;
288 
289 		RING_COPY_REQUEST(&queue->tx, cons + slots, txp);
290 
		/* If the guest submitted a frame >= 64 KiB then
		 * first->size overflowed and following slots will
		 * appear to be larger than the frame.
		 *
		 * This cannot be a fatal error as there are buggy
		 * frontends that do this.
		 *
		 * Consume all slots and drop the packet.
		 */
300 		if (!drop_err && txp->size > first->size) {
301 			if (net_ratelimit())
302 				netdev_dbg(queue->vif->dev,
303 					   "Invalid tx request, slot size %u > remaining size %u\n",
304 					   txp->size, first->size);
305 			drop_err = -EIO;
306 		}
307 
308 		first->size -= txp->size;
309 		slots++;
310 
311 		if (unlikely((txp->offset + txp->size) > XEN_PAGE_SIZE)) {
312 			netdev_err(queue->vif->dev, "Cross page boundary, txp->offset: %u, size: %u\n",
313 				 txp->offset, txp->size);
314 			xenvif_fatal_tx_err(queue->vif);
315 			return -EINVAL;
316 		}
317 
318 		more_data = txp->flags & XEN_NETTXF_more_data;
319 
320 		if (!drop_err)
321 			txp++;
322 
323 	} while (more_data);
324 
325 	if (drop_err) {
326 		xenvif_tx_err(queue, first, extra_count, cons + slots);
327 		return drop_err;
328 	}
329 
330 	return slots;
331 }
332 
333 
334 struct xenvif_tx_cb {
335 	u16 pending_idx;
336 };
337 
338 #define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)
339 
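/* Prepare a grant-map operation for one tx slot and stash the request in
 * pending_tx_info so the slot can be completed later.
 */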
340 static inline void xenvif_tx_create_map_op(struct xenvif_queue *queue,
341 					   u16 pending_idx,
342 					   struct xen_netif_tx_request *txp,
343 					   unsigned int extra_count,
344 					   struct gnttab_map_grant_ref *mop)
345 {
346 	queue->pages_to_map[mop-queue->tx_map_ops] = queue->mmap_pages[pending_idx];
347 	gnttab_set_map_op(mop, idx_to_kaddr(queue, pending_idx),
348 			  GNTMAP_host_map | GNTMAP_readonly,
349 			  txp->gref, queue->vif->domid);
350 
351 	memcpy(&queue->pending_tx_info[pending_idx].req, txp,
352 	       sizeof(*txp));
353 	queue->pending_tx_info[pending_idx].extra_count = extra_count;
354 }
355 
356 static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
357 {
358 	struct sk_buff *skb =
359 		alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
360 			  GFP_ATOMIC | __GFP_NOWARN);
361 	if (unlikely(skb == NULL))
362 		return NULL;
363 
364 	/* Packets passed to netif_rx() must have some headroom. */
365 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
366 
367 	/* Initialize it here to avoid later surprises */
368 	skb_shinfo(skb)->destructor_arg = NULL;
369 
370 	return skb;
371 }
372 
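/* Build grant-map operations for the remaining slots of a packet, filling
 * in the skb's frags and, if the packet needs more than MAX_SKB_FRAGS
 * slots, the frags of the overflow skb chained on the frag_list. Returns
 * the next unused map operation.
 */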
373 static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif_queue *queue,
374 							struct sk_buff *skb,
375 							struct xen_netif_tx_request *txp,
376 							struct gnttab_map_grant_ref *gop,
377 							unsigned int frag_overflow,
378 							struct sk_buff *nskb)
379 {
380 	struct skb_shared_info *shinfo = skb_shinfo(skb);
381 	skb_frag_t *frags = shinfo->frags;
382 	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
383 	int start;
384 	pending_ring_idx_t index;
385 	unsigned int nr_slots;
386 
387 	nr_slots = shinfo->nr_frags;
388 
389 	/* Skip first skb fragment if it is on same page as header fragment. */
390 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
391 
392 	for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
393 	     shinfo->nr_frags++, txp++, gop++) {
394 		index = pending_index(queue->pending_cons++);
395 		pending_idx = queue->pending_ring[index];
396 		xenvif_tx_create_map_op(queue, pending_idx, txp, 0, gop);
397 		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
398 	}
399 
400 	if (frag_overflow) {
401 
402 		shinfo = skb_shinfo(nskb);
403 		frags = shinfo->frags;
404 
405 		for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
406 		     shinfo->nr_frags++, txp++, gop++) {
407 			index = pending_index(queue->pending_cons++);
408 			pending_idx = queue->pending_ring[index];
409 			xenvif_tx_create_map_op(queue, pending_idx, txp, 0,
410 						gop);
411 			frag_set_pending_idx(&frags[shinfo->nr_frags],
412 					     pending_idx);
413 		}
414 
415 		skb_shinfo(skb)->frag_list = nskb;
416 	}
417 
418 	return gop;
419 }
420 
421 static inline void xenvif_grant_handle_set(struct xenvif_queue *queue,
422 					   u16 pending_idx,
423 					   grant_handle_t handle)
424 {
425 	if (unlikely(queue->grant_tx_handle[pending_idx] !=
426 		     NETBACK_INVALID_HANDLE)) {
427 		netdev_err(queue->vif->dev,
428 			   "Trying to overwrite active handle! pending_idx: 0x%x\n",
429 			   pending_idx);
430 		BUG();
431 	}
432 	queue->grant_tx_handle[pending_idx] = handle;
433 }
434 
435 static inline void xenvif_grant_handle_reset(struct xenvif_queue *queue,
436 					     u16 pending_idx)
437 {
438 	if (unlikely(queue->grant_tx_handle[pending_idx] ==
439 		     NETBACK_INVALID_HANDLE)) {
440 		netdev_err(queue->vif->dev,
441 			   "Trying to unmap invalid handle! pending_idx: 0x%x\n",
442 			   pending_idx);
443 		BUG();
444 	}
445 	queue->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
446 }
447 
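/* Check the status of the grant copy (header) and grant map (frag)
 * operations for an skb. On error, unmap and release everything that was
 * already set up. Returns 0 on success or the first error encountered.
 */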
448 static int xenvif_tx_check_gop(struct xenvif_queue *queue,
449 			       struct sk_buff *skb,
450 			       struct gnttab_map_grant_ref **gopp_map,
451 			       struct gnttab_copy **gopp_copy)
452 {
453 	struct gnttab_map_grant_ref *gop_map = *gopp_map;
454 	u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
455 	/* This always points to the shinfo of the skb being checked, which
456 	 * could be either the first or the one on the frag_list
457 	 */
458 	struct skb_shared_info *shinfo = skb_shinfo(skb);
459 	/* If this is non-NULL, we are currently checking the frag_list skb, and
460 	 * this points to the shinfo of the first one
461 	 */
462 	struct skb_shared_info *first_shinfo = NULL;
463 	int nr_frags = shinfo->nr_frags;
464 	const bool sharedslot = nr_frags &&
465 				frag_get_pending_idx(&shinfo->frags[0]) == pending_idx;
466 	int i, err;
467 
468 	/* Check status of header. */
469 	err = (*gopp_copy)->status;
470 	if (unlikely(err)) {
471 		if (net_ratelimit())
472 			netdev_dbg(queue->vif->dev,
473 				   "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n",
474 				   (*gopp_copy)->status,
475 				   pending_idx,
476 				   (*gopp_copy)->source.u.ref);
477 		/* The first frag might still have this slot mapped */
478 		if (!sharedslot)
479 			xenvif_idx_release(queue, pending_idx,
480 					   XEN_NETIF_RSP_ERROR);
481 	}
482 	(*gopp_copy)++;
483 
484 check_frags:
485 	for (i = 0; i < nr_frags; i++, gop_map++) {
486 		int j, newerr;
487 
488 		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
489 
490 		/* Check error status: if okay then remember grant handle. */
491 		newerr = gop_map->status;
492 
493 		if (likely(!newerr)) {
494 			xenvif_grant_handle_set(queue,
495 						pending_idx,
496 						gop_map->handle);
497 			/* Had a previous error? Invalidate this fragment. */
498 			if (unlikely(err)) {
499 				xenvif_idx_unmap(queue, pending_idx);
500 				/* If the mapping of the first frag was OK, but
501 				 * the header's copy failed, and they are
502 				 * sharing a slot, send an error
503 				 */
504 				if (i == 0 && !first_shinfo && sharedslot)
505 					xenvif_idx_release(queue, pending_idx,
506 							   XEN_NETIF_RSP_ERROR);
507 				else
508 					xenvif_idx_release(queue, pending_idx,
509 							   XEN_NETIF_RSP_OKAY);
510 			}
511 			continue;
512 		}
513 
514 		/* Error on this fragment: respond to client with an error. */
515 		if (net_ratelimit())
516 			netdev_dbg(queue->vif->dev,
517 				   "Grant map of %d. frag failed! status: %d pending_idx: %u ref: %u\n",
518 				   i,
519 				   gop_map->status,
520 				   pending_idx,
521 				   gop_map->ref);
522 
523 		xenvif_idx_release(queue, pending_idx, XEN_NETIF_RSP_ERROR);
524 
525 		/* Not the first error? Preceding frags already invalidated. */
526 		if (err)
527 			continue;
528 
		/* First error: if the header hasn't shared a slot with the
		 * first frag, release it as well.
		 */
532 		if (!sharedslot)
533 			xenvif_idx_release(queue,
534 					   XENVIF_TX_CB(skb)->pending_idx,
535 					   XEN_NETIF_RSP_OKAY);
536 
537 		/* Invalidate preceding fragments of this skb. */
538 		for (j = 0; j < i; j++) {
539 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
540 			xenvif_idx_unmap(queue, pending_idx);
541 			xenvif_idx_release(queue, pending_idx,
542 					   XEN_NETIF_RSP_OKAY);
543 		}
544 
545 		/* And if we found the error while checking the frag_list, unmap
546 		 * the first skb's frags
547 		 */
548 		if (first_shinfo) {
549 			for (j = 0; j < first_shinfo->nr_frags; j++) {
550 				pending_idx = frag_get_pending_idx(&first_shinfo->frags[j]);
551 				xenvif_idx_unmap(queue, pending_idx);
552 				xenvif_idx_release(queue, pending_idx,
553 						   XEN_NETIF_RSP_OKAY);
554 			}
555 		}
556 
557 		/* Remember the error: invalidate all subsequent fragments. */
558 		err = newerr;
559 	}
560 
561 	if (skb_has_frag_list(skb) && !first_shinfo) {
562 		first_shinfo = shinfo;
563 		shinfo = skb_shinfo(shinfo->frag_list);
564 		nr_frags = shinfo->nr_frags;
565 
566 		goto check_frags;
567 	}
568 
569 	*gopp_map = gop_map;
570 	return err;
571 }
572 
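/* Replace the pending-index placeholders in the skb's frags with the real
 * pages backing them, and chain the zerocopy callback contexts together.
 */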
573 static void xenvif_fill_frags(struct xenvif_queue *queue, struct sk_buff *skb)
574 {
575 	struct skb_shared_info *shinfo = skb_shinfo(skb);
576 	int nr_frags = shinfo->nr_frags;
577 	int i;
578 	u16 prev_pending_idx = INVALID_PENDING_IDX;
579 
580 	for (i = 0; i < nr_frags; i++) {
581 		skb_frag_t *frag = shinfo->frags + i;
582 		struct xen_netif_tx_request *txp;
583 		struct page *page;
584 		u16 pending_idx;
585 
586 		pending_idx = frag_get_pending_idx(frag);
587 
		/* If this is not the first frag, chain it to the previous. */
589 		if (prev_pending_idx == INVALID_PENDING_IDX)
590 			skb_shinfo(skb)->destructor_arg =
591 				&callback_param(queue, pending_idx);
592 		else
593 			callback_param(queue, prev_pending_idx).ctx =
594 				&callback_param(queue, pending_idx);
595 
596 		callback_param(queue, pending_idx).ctx = NULL;
597 		prev_pending_idx = pending_idx;
598 
599 		txp = &queue->pending_tx_info[pending_idx].req;
600 		page = virt_to_page(idx_to_kaddr(queue, pending_idx));
601 		__skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
602 		skb->len += txp->size;
603 		skb->data_len += txp->size;
604 		skb->truesize += txp->size;
605 
606 		/* Take an extra reference to offset network stack's put_page */
607 		get_page(queue->mmap_pages[pending_idx]);
608 	}
609 }
610 
611 static int xenvif_get_extras(struct xenvif_queue *queue,
612 			     struct xen_netif_extra_info *extras,
613 			     unsigned int *extra_count,
614 			     int work_to_do)
615 {
616 	struct xen_netif_extra_info extra;
617 	RING_IDX cons = queue->tx.req_cons;
618 
619 	do {
620 		if (unlikely(work_to_do-- <= 0)) {
621 			netdev_err(queue->vif->dev, "Missing extra info\n");
622 			xenvif_fatal_tx_err(queue->vif);
623 			return -EBADR;
624 		}
625 
626 		RING_COPY_REQUEST(&queue->tx, cons, &extra);
627 
628 		queue->tx.req_cons = ++cons;
629 		(*extra_count)++;
630 
631 		if (unlikely(!extra.type ||
632 			     extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
633 			netdev_err(queue->vif->dev,
634 				   "Invalid extra type: %d\n", extra.type);
635 			xenvif_fatal_tx_err(queue->vif);
636 			return -EINVAL;
637 		}
638 
639 		memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
640 	} while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
641 
642 	return work_to_do;
643 }
644 
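/* Translate a GSO extra-info segment from the frontend into skb GSO
 * metadata.
 */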
645 static int xenvif_set_skb_gso(struct xenvif *vif,
646 			      struct sk_buff *skb,
647 			      struct xen_netif_extra_info *gso)
648 {
649 	if (!gso->u.gso.size) {
650 		netdev_err(vif->dev, "GSO size must not be zero.\n");
651 		xenvif_fatal_tx_err(vif);
652 		return -EINVAL;
653 	}
654 
655 	switch (gso->u.gso.type) {
656 	case XEN_NETIF_GSO_TYPE_TCPV4:
657 		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
658 		break;
659 	case XEN_NETIF_GSO_TYPE_TCPV6:
660 		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
661 		break;
662 	default:
663 		netdev_err(vif->dev, "Bad GSO type %d.\n", gso->u.gso.type);
664 		xenvif_fatal_tx_err(vif);
665 		return -EINVAL;
666 	}
667 
668 	skb_shinfo(skb)->gso_size = gso->u.gso.size;
669 	/* gso_segs will be calculated later */
670 
671 	return 0;
672 }
673 
674 static int checksum_setup(struct xenvif_queue *queue, struct sk_buff *skb)
675 {
676 	bool recalculate_partial_csum = false;
677 
678 	/* A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
679 	 * peers can fail to set NETRXF_csum_blank when sending a GSO
680 	 * frame. In this case force the SKB to CHECKSUM_PARTIAL and
681 	 * recalculate the partial checksum.
682 	 */
683 	if (skb->ip_summed != CHECKSUM_PARTIAL && skb_is_gso(skb)) {
684 		queue->stats.rx_gso_checksum_fixup++;
685 		skb->ip_summed = CHECKSUM_PARTIAL;
686 		recalculate_partial_csum = true;
687 	}
688 
689 	/* A non-CHECKSUM_PARTIAL SKB does not require setup. */
690 	if (skb->ip_summed != CHECKSUM_PARTIAL)
691 		return 0;
692 
693 	return skb_checksum_setup(skb, recalculate_partial_csum);
694 }
695 
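/* Credit-based rate limiting: return true if this packet must wait until
 * the credit timer replenishes the queue's allowance.
 */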
696 static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size)
697 {
698 	u64 now = get_jiffies_64();
699 	u64 next_credit = queue->credit_window_start +
700 		msecs_to_jiffies(queue->credit_usec / 1000);
701 
702 	/* Timer could already be pending in rare cases. */
703 	if (timer_pending(&queue->credit_timeout)) {
704 		queue->rate_limited = true;
705 		return true;
706 	}
707 
708 	/* Passed the point where we can replenish credit? */
709 	if (time_after_eq64(now, next_credit)) {
710 		queue->credit_window_start = now;
711 		tx_add_credit(queue);
712 	}
713 
714 	/* Still too big to send right now? Set a callback. */
715 	if (size > queue->remaining_credit) {
716 		mod_timer(&queue->credit_timeout,
717 			  next_credit);
718 		queue->credit_window_start = next_credit;
719 		queue->rate_limited = true;
720 
721 		return true;
722 	}
723 
724 	return false;
725 }
726 
727 /* No locking is required in xenvif_mcast_add/del() as they are
728  * only ever invoked from NAPI poll. An RCU list is used because
729  * xenvif_mcast_match() is called asynchronously, during start_xmit.
730  */
731 
732 static int xenvif_mcast_add(struct xenvif *vif, const u8 *addr)
733 {
734 	struct xenvif_mcast_addr *mcast;
735 
736 	if (vif->fe_mcast_count == XEN_NETBK_MCAST_MAX) {
737 		if (net_ratelimit())
738 			netdev_err(vif->dev,
739 				   "Too many multicast addresses\n");
740 		return -ENOSPC;
741 	}
742 
743 	mcast = kzalloc(sizeof(*mcast), GFP_ATOMIC);
744 	if (!mcast)
745 		return -ENOMEM;
746 
747 	ether_addr_copy(mcast->addr, addr);
748 	list_add_tail_rcu(&mcast->entry, &vif->fe_mcast_addr);
749 	vif->fe_mcast_count++;
750 
751 	return 0;
752 }
753 
754 static void xenvif_mcast_del(struct xenvif *vif, const u8 *addr)
755 {
756 	struct xenvif_mcast_addr *mcast;
757 
758 	list_for_each_entry_rcu(mcast, &vif->fe_mcast_addr, entry) {
759 		if (ether_addr_equal(addr, mcast->addr)) {
760 			--vif->fe_mcast_count;
761 			list_del_rcu(&mcast->entry);
762 			kfree_rcu(mcast, rcu);
763 			break;
764 		}
765 	}
766 }
767 
768 bool xenvif_mcast_match(struct xenvif *vif, const u8 *addr)
769 {
770 	struct xenvif_mcast_addr *mcast;
771 
772 	rcu_read_lock();
773 	list_for_each_entry_rcu(mcast, &vif->fe_mcast_addr, entry) {
774 		if (ether_addr_equal(addr, mcast->addr)) {
775 			rcu_read_unlock();
776 			return true;
777 		}
778 	}
779 	rcu_read_unlock();
780 
781 	return false;
782 }
783 
784 void xenvif_mcast_addr_list_free(struct xenvif *vif)
785 {
786 	/* No need for locking or RCU here. NAPI poll and TX queue
787 	 * are stopped.
788 	 */
789 	while (!list_empty(&vif->fe_mcast_addr)) {
790 		struct xenvif_mcast_addr *mcast;
791 
792 		mcast = list_first_entry(&vif->fe_mcast_addr,
793 					 struct xenvif_mcast_addr,
794 					 entry);
795 		--vif->fe_mcast_count;
796 		list_del(&mcast->entry);
797 		kfree(mcast);
798 	}
799 }
800 
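/* Consume tx requests from the shared ring, building grant-copy operations
 * for the linear (header) area and grant-map operations for the frags.
 * The resulting skbs are queued on tx_queue for xenvif_tx_submit().
 */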
801 static void xenvif_tx_build_gops(struct xenvif_queue *queue,
802 				     int budget,
803 				     unsigned *copy_ops,
804 				     unsigned *map_ops)
805 {
806 	struct gnttab_map_grant_ref *gop = queue->tx_map_ops;
807 	struct sk_buff *skb, *nskb;
808 	int ret;
809 	unsigned int frag_overflow;
810 
811 	while (skb_queue_len(&queue->tx_queue) < budget) {
812 		struct xen_netif_tx_request txreq;
813 		struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
814 		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
815 		unsigned int extra_count;
816 		u16 pending_idx;
817 		RING_IDX idx;
818 		int work_to_do;
819 		unsigned int data_len;
820 		pending_ring_idx_t index;
821 
822 		if (queue->tx.sring->req_prod - queue->tx.req_cons >
823 		    XEN_NETIF_TX_RING_SIZE) {
824 			netdev_err(queue->vif->dev,
825 				   "Impossible number of requests. "
826 				   "req_prod %d, req_cons %d, size %ld\n",
827 				   queue->tx.sring->req_prod, queue->tx.req_cons,
828 				   XEN_NETIF_TX_RING_SIZE);
829 			xenvif_fatal_tx_err(queue->vif);
830 			break;
831 		}
832 
833 		work_to_do = XEN_RING_NR_UNCONSUMED_REQUESTS(&queue->tx);
834 		if (!work_to_do)
835 			break;
836 
837 		idx = queue->tx.req_cons;
838 		rmb(); /* Ensure that we see the request before we copy it. */
839 		RING_COPY_REQUEST(&queue->tx, idx, &txreq);
840 
841 		/* Credit-based scheduling. */
842 		if (txreq.size > queue->remaining_credit &&
843 		    tx_credit_exceeded(queue, txreq.size))
844 			break;
845 
846 		queue->remaining_credit -= txreq.size;
847 
848 		work_to_do--;
849 		queue->tx.req_cons = ++idx;
850 
851 		memset(extras, 0, sizeof(extras));
852 		extra_count = 0;
853 		if (txreq.flags & XEN_NETTXF_extra_info) {
854 			work_to_do = xenvif_get_extras(queue, extras,
855 						       &extra_count,
856 						       work_to_do);
857 			idx = queue->tx.req_cons;
858 			if (unlikely(work_to_do < 0))
859 				break;
860 		}
861 
862 		if (extras[XEN_NETIF_EXTRA_TYPE_MCAST_ADD - 1].type) {
863 			struct xen_netif_extra_info *extra;
864 
865 			extra = &extras[XEN_NETIF_EXTRA_TYPE_MCAST_ADD - 1];
866 			ret = xenvif_mcast_add(queue->vif, extra->u.mcast.addr);
867 
868 			make_tx_response(queue, &txreq, extra_count,
869 					 (ret == 0) ?
870 					 XEN_NETIF_RSP_OKAY :
871 					 XEN_NETIF_RSP_ERROR);
872 			push_tx_responses(queue);
873 			continue;
874 		}
875 
876 		if (extras[XEN_NETIF_EXTRA_TYPE_MCAST_DEL - 1].type) {
877 			struct xen_netif_extra_info *extra;
878 
879 			extra = &extras[XEN_NETIF_EXTRA_TYPE_MCAST_DEL - 1];
880 			xenvif_mcast_del(queue->vif, extra->u.mcast.addr);
881 
882 			make_tx_response(queue, &txreq, extra_count,
883 					 XEN_NETIF_RSP_OKAY);
884 			push_tx_responses(queue);
885 			continue;
886 		}
887 
888 		ret = xenvif_count_requests(queue, &txreq, extra_count,
889 					    txfrags, work_to_do);
890 		if (unlikely(ret < 0))
891 			break;
892 
893 		idx += ret;
894 
895 		if (unlikely(txreq.size < ETH_HLEN)) {
896 			netdev_dbg(queue->vif->dev,
897 				   "Bad packet size: %d\n", txreq.size);
898 			xenvif_tx_err(queue, &txreq, extra_count, idx);
899 			break;
900 		}
901 
		/* No crossing a page boundary, as the payload mustn't fragment. */
903 		if (unlikely((txreq.offset + txreq.size) > XEN_PAGE_SIZE)) {
904 			netdev_err(queue->vif->dev,
905 				   "txreq.offset: %u, size: %u, end: %lu\n",
906 				   txreq.offset, txreq.size,
907 				   (unsigned long)(txreq.offset&~XEN_PAGE_MASK) + txreq.size);
908 			xenvif_fatal_tx_err(queue->vif);
909 			break;
910 		}
911 
912 		index = pending_index(queue->pending_cons);
913 		pending_idx = queue->pending_ring[index];
914 
915 		data_len = (txreq.size > XEN_NETBACK_TX_COPY_LEN &&
916 			    ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
917 			XEN_NETBACK_TX_COPY_LEN : txreq.size;
918 
919 		skb = xenvif_alloc_skb(data_len);
920 		if (unlikely(skb == NULL)) {
921 			netdev_dbg(queue->vif->dev,
922 				   "Can't allocate a skb in start_xmit.\n");
923 			xenvif_tx_err(queue, &txreq, extra_count, idx);
924 			break;
925 		}
926 
927 		skb_shinfo(skb)->nr_frags = ret;
928 		if (data_len < txreq.size)
929 			skb_shinfo(skb)->nr_frags++;
930 		/* At this point shinfo->nr_frags is in fact the number of
931 		 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
932 		 */
933 		frag_overflow = 0;
934 		nskb = NULL;
935 		if (skb_shinfo(skb)->nr_frags > MAX_SKB_FRAGS) {
936 			frag_overflow = skb_shinfo(skb)->nr_frags - MAX_SKB_FRAGS;
937 			BUG_ON(frag_overflow > MAX_SKB_FRAGS);
938 			skb_shinfo(skb)->nr_frags = MAX_SKB_FRAGS;
939 			nskb = xenvif_alloc_skb(0);
940 			if (unlikely(nskb == NULL)) {
941 				skb_shinfo(skb)->nr_frags = 0;
942 				kfree_skb(skb);
943 				xenvif_tx_err(queue, &txreq, extra_count, idx);
944 				if (net_ratelimit())
945 					netdev_err(queue->vif->dev,
946 						   "Can't allocate the frag_list skb.\n");
947 				break;
948 			}
949 		}
950 
951 		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
952 			struct xen_netif_extra_info *gso;
953 			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
954 
955 			if (xenvif_set_skb_gso(queue->vif, skb, gso)) {
956 				/* Failure in xenvif_set_skb_gso is fatal. */
957 				skb_shinfo(skb)->nr_frags = 0;
958 				kfree_skb(skb);
959 				kfree_skb(nskb);
960 				break;
961 			}
962 		}
963 
964 		if (extras[XEN_NETIF_EXTRA_TYPE_HASH - 1].type) {
965 			struct xen_netif_extra_info *extra;
966 			enum pkt_hash_types type = PKT_HASH_TYPE_NONE;
967 
968 			extra = &extras[XEN_NETIF_EXTRA_TYPE_HASH - 1];
969 
970 			switch (extra->u.hash.type) {
971 			case _XEN_NETIF_CTRL_HASH_TYPE_IPV4:
972 			case _XEN_NETIF_CTRL_HASH_TYPE_IPV6:
973 				type = PKT_HASH_TYPE_L3;
974 				break;
975 
976 			case _XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP:
977 			case _XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP:
978 				type = PKT_HASH_TYPE_L4;
979 				break;
980 
981 			default:
982 				break;
983 			}
984 
985 			if (type != PKT_HASH_TYPE_NONE)
986 				skb_set_hash(skb,
987 					     *(u32 *)extra->u.hash.value,
988 					     type);
989 		}
990 
991 		XENVIF_TX_CB(skb)->pending_idx = pending_idx;
992 
993 		__skb_put(skb, data_len);
994 		queue->tx_copy_ops[*copy_ops].source.u.ref = txreq.gref;
995 		queue->tx_copy_ops[*copy_ops].source.domid = queue->vif->domid;
996 		queue->tx_copy_ops[*copy_ops].source.offset = txreq.offset;
997 
998 		queue->tx_copy_ops[*copy_ops].dest.u.gmfn =
999 			virt_to_gfn(skb->data);
1000 		queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
1001 		queue->tx_copy_ops[*copy_ops].dest.offset =
1002 			offset_in_page(skb->data) & ~XEN_PAGE_MASK;
1003 
1004 		queue->tx_copy_ops[*copy_ops].len = data_len;
1005 		queue->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref;
1006 
1007 		(*copy_ops)++;
1008 
1009 		if (data_len < txreq.size) {
1010 			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
1011 					     pending_idx);
1012 			xenvif_tx_create_map_op(queue, pending_idx, &txreq,
1013 						extra_count, gop);
1014 			gop++;
1015 		} else {
1016 			frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
1017 					     INVALID_PENDING_IDX);
1018 			memcpy(&queue->pending_tx_info[pending_idx].req,
1019 			       &txreq, sizeof(txreq));
1020 			queue->pending_tx_info[pending_idx].extra_count =
1021 				extra_count;
1022 		}
1023 
1024 		queue->pending_cons++;
1025 
1026 		gop = xenvif_get_requests(queue, skb, txfrags, gop,
1027 				          frag_overflow, nskb);
1028 
1029 		__skb_queue_tail(&queue->tx_queue, skb);
1030 
1031 		queue->tx.req_cons = idx;
1032 
1033 		if (((gop-queue->tx_map_ops) >= ARRAY_SIZE(queue->tx_map_ops)) ||
1034 		    (*copy_ops >= ARRAY_SIZE(queue->tx_copy_ops)))
1035 			break;
1036 	}
1037 
1038 	(*map_ops) = gop - queue->tx_map_ops;
1039 	return;
1040 }
1041 
/* Consolidate an skb with a frag_list into a brand new one with local pages
 * on its frags. Returns 0 or -ENOMEM if new pages cannot be allocated.
 */
1045 static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *skb)
1046 {
1047 	unsigned int offset = skb_headlen(skb);
1048 	skb_frag_t frags[MAX_SKB_FRAGS];
1049 	int i, f;
1050 	struct ubuf_info *uarg;
1051 	struct sk_buff *nskb = skb_shinfo(skb)->frag_list;
1052 
1053 	queue->stats.tx_zerocopy_sent += 2;
1054 	queue->stats.tx_frag_overflow++;
1055 
1056 	xenvif_fill_frags(queue, nskb);
1057 	/* Subtract frags size, we will correct it later */
1058 	skb->truesize -= skb->data_len;
1059 	skb->len += nskb->len;
1060 	skb->data_len += nskb->len;
1061 
1062 	/* create a brand new frags array and coalesce there */
1063 	for (i = 0; offset < skb->len; i++) {
1064 		struct page *page;
1065 		unsigned int len;
1066 
1067 		BUG_ON(i >= MAX_SKB_FRAGS);
1068 		page = alloc_page(GFP_ATOMIC);
1069 		if (!page) {
1070 			int j;
1071 			skb->truesize += skb->data_len;
1072 			for (j = 0; j < i; j++)
1073 				put_page(skb_frag_page(&frags[j]));
1074 			return -ENOMEM;
1075 		}
1076 
1077 		if (offset + PAGE_SIZE < skb->len)
1078 			len = PAGE_SIZE;
1079 		else
1080 			len = skb->len - offset;
1081 		if (skb_copy_bits(skb, offset, page_address(page), len))
1082 			BUG();
1083 
1084 		offset += len;
1085 		__skb_frag_set_page(&frags[i], page);
1086 		skb_frag_off_set(&frags[i], 0);
1087 		skb_frag_size_set(&frags[i], len);
1088 	}
1089 
1090 	/* Release all the original (foreign) frags. */
1091 	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
1092 		skb_frag_unref(skb, f);
1093 	uarg = skb_shinfo(skb)->destructor_arg;
1094 	/* increase inflight counter to offset decrement in callback */
1095 	atomic_inc(&queue->inflight_packets);
1096 	uarg->callback(NULL, uarg, true);
1097 	skb_shinfo(skb)->destructor_arg = NULL;
1098 
1099 	/* Fill the skb with the new (local) frags. */
1100 	memcpy(skb_shinfo(skb)->frags, frags, i * sizeof(skb_frag_t));
1101 	skb_shinfo(skb)->nr_frags = i;
1102 	skb->truesize += i * PAGE_SIZE;
1103 
1104 	return 0;
1105 }
1106 
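/* Second half of the tx path: verify the grant operations for each queued
 * skb, fill in its frags, fix up checksum and GSO metadata, and hand the
 * packet to the network stack.
 */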
1107 static int xenvif_tx_submit(struct xenvif_queue *queue)
1108 {
1109 	struct gnttab_map_grant_ref *gop_map = queue->tx_map_ops;
1110 	struct gnttab_copy *gop_copy = queue->tx_copy_ops;
1111 	struct sk_buff *skb;
1112 	int work_done = 0;
1113 
1114 	while ((skb = __skb_dequeue(&queue->tx_queue)) != NULL) {
1115 		struct xen_netif_tx_request *txp;
1116 		u16 pending_idx;
1117 		unsigned data_len;
1118 
1119 		pending_idx = XENVIF_TX_CB(skb)->pending_idx;
1120 		txp = &queue->pending_tx_info[pending_idx].req;
1121 
1122 		/* Check the remap error code. */
1123 		if (unlikely(xenvif_tx_check_gop(queue, skb, &gop_map, &gop_copy))) {
1124 			/* If there was an error, xenvif_tx_check_gop is
1125 			 * expected to release all the frags which were mapped,
1126 			 * so kfree_skb shouldn't do it again
1127 			 */
1128 			skb_shinfo(skb)->nr_frags = 0;
1129 			if (skb_has_frag_list(skb)) {
1130 				struct sk_buff *nskb =
1131 						skb_shinfo(skb)->frag_list;
1132 				skb_shinfo(nskb)->nr_frags = 0;
1133 			}
1134 			kfree_skb(skb);
1135 			continue;
1136 		}
1137 
1138 		data_len = skb->len;
1139 		callback_param(queue, pending_idx).ctx = NULL;
1140 		if (data_len < txp->size) {
1141 			/* Append the packet payload as a fragment. */
1142 			txp->offset += data_len;
1143 			txp->size -= data_len;
1144 		} else {
1145 			/* Schedule a response immediately. */
1146 			xenvif_idx_release(queue, pending_idx,
1147 					   XEN_NETIF_RSP_OKAY);
1148 		}
1149 
1150 		if (txp->flags & XEN_NETTXF_csum_blank)
1151 			skb->ip_summed = CHECKSUM_PARTIAL;
1152 		else if (txp->flags & XEN_NETTXF_data_validated)
1153 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1154 
1155 		xenvif_fill_frags(queue, skb);
1156 
1157 		if (unlikely(skb_has_frag_list(skb))) {
1158 			struct sk_buff *nskb = skb_shinfo(skb)->frag_list;
1159 			xenvif_skb_zerocopy_prepare(queue, nskb);
1160 			if (xenvif_handle_frag_list(queue, skb)) {
1161 				if (net_ratelimit())
1162 					netdev_err(queue->vif->dev,
1163 						   "Not enough memory to consolidate frag_list!\n");
1164 				xenvif_skb_zerocopy_prepare(queue, skb);
1165 				kfree_skb(skb);
1166 				continue;
1167 			}
1168 			/* Copied all the bits from the frag list -- free it. */
1169 			skb_frag_list_init(skb);
1170 			kfree_skb(nskb);
1171 		}
1172 
1173 		skb->dev      = queue->vif->dev;
1174 		skb->protocol = eth_type_trans(skb, skb->dev);
1175 		skb_reset_network_header(skb);
1176 
1177 		if (checksum_setup(queue, skb)) {
1178 			netdev_dbg(queue->vif->dev,
1179 				   "Can't setup checksum in net_tx_action\n");
1180 			/* We have to set this flag to trigger the callback */
1181 			if (skb_shinfo(skb)->destructor_arg)
1182 				xenvif_skb_zerocopy_prepare(queue, skb);
1183 			kfree_skb(skb);
1184 			continue;
1185 		}
1186 
1187 		skb_probe_transport_header(skb);
1188 
1189 		/* If the packet is GSO then we will have just set up the
1190 		 * transport header offset in checksum_setup so it's now
1191 		 * straightforward to calculate gso_segs.
1192 		 */
1193 		if (skb_is_gso(skb)) {
1194 			int mss, hdrlen;
1195 
1196 			/* GSO implies having the L4 header. */
1197 			WARN_ON_ONCE(!skb_transport_header_was_set(skb));
1198 			if (unlikely(!skb_transport_header_was_set(skb))) {
1199 				kfree_skb(skb);
1200 				continue;
1201 			}
1202 
1203 			mss = skb_shinfo(skb)->gso_size;
1204 			hdrlen = skb_tcp_all_headers(skb);
1205 
1206 			skb_shinfo(skb)->gso_segs =
1207 				DIV_ROUND_UP(skb->len - hdrlen, mss);
1208 		}
1209 
1210 		queue->stats.rx_bytes += skb->len;
1211 		queue->stats.rx_packets++;
1212 
1213 		work_done++;
1214 
		/* Set this flag right before netif_receive_skb, otherwise
		 * someone might think this packet has already left netback,
		 * and do a skb_copy_ubufs while we are still in control of
		 * the skb. E.g. the __pskb_pull_tail earlier can do such a
		 * thing.
		 */
1220 		if (skb_shinfo(skb)->destructor_arg) {
1221 			xenvif_skb_zerocopy_prepare(queue, skb);
1222 			queue->stats.tx_zerocopy_sent++;
1223 		}
1224 
1225 		netif_receive_skb(skb);
1226 	}
1227 
1228 	return work_done;
1229 }
1230 
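/* Callback invoked when the network stack is done with a zerocopy skb:
 * queue its pending indices for the dealloc thread to unmap.
 */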
1231 void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf,
1232 			      bool zerocopy_success)
1233 {
1234 	unsigned long flags;
1235 	pending_ring_idx_t index;
1236 	struct xenvif_queue *queue = ubuf_to_queue(ubuf);
1237 
1238 	/* This is the only place where we grab this lock, to protect callbacks
1239 	 * from each other.
1240 	 */
1241 	spin_lock_irqsave(&queue->callback_lock, flags);
1242 	do {
1243 		u16 pending_idx = ubuf->desc;
1244 		ubuf = (struct ubuf_info *) ubuf->ctx;
1245 		BUG_ON(queue->dealloc_prod - queue->dealloc_cons >=
1246 			MAX_PENDING_REQS);
1247 		index = pending_index(queue->dealloc_prod);
1248 		queue->dealloc_ring[index] = pending_idx;
1249 		/* Sync with xenvif_tx_dealloc_action:
1250 		 * insert idx then incr producer.
1251 		 */
1252 		smp_wmb();
1253 		queue->dealloc_prod++;
1254 	} while (ubuf);
1255 	spin_unlock_irqrestore(&queue->callback_lock, flags);
1256 
1257 	if (likely(zerocopy_success))
1258 		queue->stats.tx_zerocopy_success++;
1259 	else
1260 		queue->stats.tx_zerocopy_fail++;
1261 	xenvif_skb_zerocopy_complete(queue);
1262 }
1263 
1264 static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue)
1265 {
1266 	struct gnttab_unmap_grant_ref *gop;
1267 	pending_ring_idx_t dc, dp;
1268 	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
1269 	unsigned int i = 0;
1270 
1271 	dc = queue->dealloc_cons;
1272 	gop = queue->tx_unmap_ops;
1273 
1274 	/* Free up any grants we have finished using */
1275 	do {
1276 		dp = queue->dealloc_prod;
1277 
1278 		/* Ensure we see all indices enqueued by all
1279 		 * xenvif_zerocopy_callback().
1280 		 */
1281 		smp_rmb();
1282 
1283 		while (dc != dp) {
1284 			BUG_ON(gop - queue->tx_unmap_ops >= MAX_PENDING_REQS);
1285 			pending_idx =
1286 				queue->dealloc_ring[pending_index(dc++)];
1287 
1288 			pending_idx_release[gop - queue->tx_unmap_ops] =
1289 				pending_idx;
1290 			queue->pages_to_unmap[gop - queue->tx_unmap_ops] =
1291 				queue->mmap_pages[pending_idx];
1292 			gnttab_set_unmap_op(gop,
1293 					    idx_to_kaddr(queue, pending_idx),
1294 					    GNTMAP_host_map,
1295 					    queue->grant_tx_handle[pending_idx]);
1296 			xenvif_grant_handle_reset(queue, pending_idx);
1297 			++gop;
1298 		}
1299 
1300 	} while (dp != queue->dealloc_prod);
1301 
1302 	queue->dealloc_cons = dc;
1303 
1304 	if (gop - queue->tx_unmap_ops > 0) {
1305 		int ret;
1306 		ret = gnttab_unmap_refs(queue->tx_unmap_ops,
1307 					NULL,
1308 					queue->pages_to_unmap,
1309 					gop - queue->tx_unmap_ops);
1310 		if (ret) {
1311 			netdev_err(queue->vif->dev, "Unmap fail: nr_ops %tu ret %d\n",
1312 				   gop - queue->tx_unmap_ops, ret);
1313 			for (i = 0; i < gop - queue->tx_unmap_ops; ++i) {
1314 				if (gop[i].status != GNTST_okay)
1315 					netdev_err(queue->vif->dev,
1316 						   " host_addr: 0x%llx handle: 0x%x status: %d\n",
1317 						   gop[i].host_addr,
1318 						   gop[i].handle,
1319 						   gop[i].status);
1320 			}
1321 			BUG();
1322 		}
1323 	}
1324 
1325 	for (i = 0; i < gop - queue->tx_unmap_ops; ++i)
1326 		xenvif_idx_release(queue, pending_idx_release[i],
1327 				   XEN_NETIF_RSP_OKAY);
1328 }
1329 
1330 
1331 /* Called after netfront has transmitted */
1332 int xenvif_tx_action(struct xenvif_queue *queue, int budget)
1333 {
1334 	unsigned nr_mops, nr_cops = 0;
1335 	int work_done, ret;
1336 
1337 	if (unlikely(!tx_work_todo(queue)))
1338 		return 0;
1339 
1340 	xenvif_tx_build_gops(queue, budget, &nr_cops, &nr_mops);
1341 
1342 	if (nr_cops == 0)
1343 		return 0;
1344 
1345 	gnttab_batch_copy(queue->tx_copy_ops, nr_cops);
1346 	if (nr_mops != 0) {
1347 		ret = gnttab_map_refs(queue->tx_map_ops,
1348 				      NULL,
1349 				      queue->pages_to_map,
1350 				      nr_mops);
1351 		if (ret) {
1352 			unsigned int i;
1353 
1354 			netdev_err(queue->vif->dev, "Map fail: nr %u ret %d\n",
1355 				   nr_mops, ret);
1356 			for (i = 0; i < nr_mops; ++i)
1357 				WARN_ON_ONCE(queue->tx_map_ops[i].status ==
1358 				             GNTST_okay);
1359 		}
1360 	}
1361 
1362 	work_done = xenvif_tx_submit(queue);
1363 
1364 	return work_done;
1365 }
1366 
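/* Return a pending slot to the ring and send the corresponding tx response
 * to the frontend.
 */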
1367 static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
1368 			       u8 status)
1369 {
1370 	struct pending_tx_info *pending_tx_info;
1371 	pending_ring_idx_t index;
1372 	unsigned long flags;
1373 
1374 	pending_tx_info = &queue->pending_tx_info[pending_idx];
1375 
1376 	spin_lock_irqsave(&queue->response_lock, flags);
1377 
1378 	make_tx_response(queue, &pending_tx_info->req,
1379 			 pending_tx_info->extra_count, status);
1380 
	/* Release the pending index before pushing the Tx response so
	 * it's available before a new Tx request is pushed by the
	 * frontend.
	 */
1385 	index = pending_index(queue->pending_prod++);
1386 	queue->pending_ring[index] = pending_idx;
1387 
1388 	push_tx_responses(queue);
1389 
1390 	spin_unlock_irqrestore(&queue->response_lock, flags);
1391 }
1392 
1393 
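/* Write a tx response onto the shared ring, plus NULL responses for any
 * extra-info slots consumed by the request.
 */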
1394 static void make_tx_response(struct xenvif_queue *queue,
1395 			     struct xen_netif_tx_request *txp,
1396 			     unsigned int extra_count,
1397 			     s8       st)
1398 {
1399 	RING_IDX i = queue->tx.rsp_prod_pvt;
1400 	struct xen_netif_tx_response *resp;
1401 
1402 	resp = RING_GET_RESPONSE(&queue->tx, i);
1403 	resp->id     = txp->id;
1404 	resp->status = st;
1405 
1406 	while (extra_count-- != 0)
1407 		RING_GET_RESPONSE(&queue->tx, ++i)->status = XEN_NETIF_RSP_NULL;
1408 
1409 	queue->tx.rsp_prod_pvt = ++i;
1410 }
1411 
1412 static void push_tx_responses(struct xenvif_queue *queue)
1413 {
1414 	int notify;
1415 
1416 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&queue->tx, notify);
1417 	if (notify)
1418 		notify_remote_via_irq(queue->tx_irq);
1419 }
1420 
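/* Unmap the grant backing a single pending slot. */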
1421 static void xenvif_idx_unmap(struct xenvif_queue *queue, u16 pending_idx)
1422 {
1423 	int ret;
1424 	struct gnttab_unmap_grant_ref tx_unmap_op;
1425 
1426 	gnttab_set_unmap_op(&tx_unmap_op,
1427 			    idx_to_kaddr(queue, pending_idx),
1428 			    GNTMAP_host_map,
1429 			    queue->grant_tx_handle[pending_idx]);
1430 	xenvif_grant_handle_reset(queue, pending_idx);
1431 
1432 	ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
1433 				&queue->mmap_pages[pending_idx], 1);
1434 	if (ret) {
1435 		netdev_err(queue->vif->dev,
1436 			   "Unmap fail: ret: %d pending_idx: %d host_addr: %llx handle: 0x%x status: %d\n",
1437 			   ret,
1438 			   pending_idx,
1439 			   tx_unmap_op.host_addr,
1440 			   tx_unmap_op.handle,
1441 			   tx_unmap_op.status);
1442 		BUG();
1443 	}
1444 }
1445 
1446 static inline int tx_work_todo(struct xenvif_queue *queue)
1447 {
1448 	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&queue->tx)))
1449 		return 1;
1450 
1451 	return 0;
1452 }
1453 
1454 static inline bool tx_dealloc_work_todo(struct xenvif_queue *queue)
1455 {
1456 	return queue->dealloc_cons != queue->dealloc_prod;
1457 }
1458 
1459 void xenvif_unmap_frontend_data_rings(struct xenvif_queue *queue)
1460 {
1461 	if (queue->tx.sring)
1462 		xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue->vif),
1463 					queue->tx.sring);
1464 	if (queue->rx.sring)
1465 		xenbus_unmap_ring_vfree(xenvif_to_xenbus_device(queue->vif),
1466 					queue->rx.sring);
1467 }
1468 
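/* Map the tx and rx shared rings granted by the frontend and attach to
 * them, sanity checking the producer/consumer indices.
 */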
1469 int xenvif_map_frontend_data_rings(struct xenvif_queue *queue,
1470 				   grant_ref_t tx_ring_ref,
1471 				   grant_ref_t rx_ring_ref)
1472 {
1473 	void *addr;
1474 	struct xen_netif_tx_sring *txs;
1475 	struct xen_netif_rx_sring *rxs;
1476 	RING_IDX rsp_prod, req_prod;
1477 	int err;
1478 
1479 	err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif),
1480 				     &tx_ring_ref, 1, &addr);
1481 	if (err)
1482 		goto err;
1483 
1484 	txs = (struct xen_netif_tx_sring *)addr;
1485 	rsp_prod = READ_ONCE(txs->rsp_prod);
1486 	req_prod = READ_ONCE(txs->req_prod);
1487 
1488 	BACK_RING_ATTACH(&queue->tx, txs, rsp_prod, XEN_PAGE_SIZE);
1489 
1490 	err = -EIO;
1491 	if (req_prod - rsp_prod > RING_SIZE(&queue->tx))
1492 		goto err;
1493 
1494 	err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif),
1495 				     &rx_ring_ref, 1, &addr);
1496 	if (err)
1497 		goto err;
1498 
1499 	rxs = (struct xen_netif_rx_sring *)addr;
1500 	rsp_prod = READ_ONCE(rxs->rsp_prod);
1501 	req_prod = READ_ONCE(rxs->req_prod);
1502 
1503 	BACK_RING_ATTACH(&queue->rx, rxs, rsp_prod, XEN_PAGE_SIZE);
1504 
1505 	err = -EIO;
1506 	if (req_prod - rsp_prod > RING_SIZE(&queue->rx))
1507 		goto err;
1508 
1509 	return 0;
1510 
1511 err:
1512 	xenvif_unmap_frontend_data_rings(queue);
1513 	return err;
1514 }
1515 
1516 static bool xenvif_dealloc_kthread_should_stop(struct xenvif_queue *queue)
1517 {
1518 	/* Dealloc thread must remain running until all inflight
1519 	 * packets complete.
1520 	 */
1521 	return kthread_should_stop() &&
1522 		!atomic_read(&queue->inflight_packets);
1523 }
1524 
1525 int xenvif_dealloc_kthread(void *data)
1526 {
1527 	struct xenvif_queue *queue = data;
1528 
1529 	for (;;) {
1530 		wait_event_interruptible(queue->dealloc_wq,
1531 					 tx_dealloc_work_todo(queue) ||
1532 					 xenvif_dealloc_kthread_should_stop(queue));
1533 		if (xenvif_dealloc_kthread_should_stop(queue))
1534 			break;
1535 
1536 		xenvif_tx_dealloc_action(queue);
1537 		cond_resched();
1538 	}
1539 
	/* Unmap anything remaining. */
1541 	if (tx_dealloc_work_todo(queue))
1542 		xenvif_tx_dealloc_action(queue);
1543 
1544 	return 0;
1545 }
1546 
1547 static void make_ctrl_response(struct xenvif *vif,
1548 			       const struct xen_netif_ctrl_request *req,
1549 			       u32 status, u32 data)
1550 {
1551 	RING_IDX idx = vif->ctrl.rsp_prod_pvt;
1552 	struct xen_netif_ctrl_response rsp = {
1553 		.id = req->id,
1554 		.type = req->type,
1555 		.status = status,
1556 		.data = data,
1557 	};
1558 
1559 	*RING_GET_RESPONSE(&vif->ctrl, idx) = rsp;
1560 	vif->ctrl.rsp_prod_pvt = ++idx;
1561 }
1562 
1563 static void push_ctrl_response(struct xenvif *vif)
1564 {
1565 	int notify;
1566 
1567 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->ctrl, notify);
1568 	if (notify)
1569 		notify_remote_via_irq(vif->ctrl_irq);
1570 }
1571 
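/* Handle a single control-ring request and push its response. */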
1572 static void process_ctrl_request(struct xenvif *vif,
1573 				 const struct xen_netif_ctrl_request *req)
1574 {
1575 	u32 status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED;
1576 	u32 data = 0;
1577 
1578 	switch (req->type) {
1579 	case XEN_NETIF_CTRL_TYPE_SET_HASH_ALGORITHM:
1580 		status = xenvif_set_hash_alg(vif, req->data[0]);
1581 		break;
1582 
1583 	case XEN_NETIF_CTRL_TYPE_GET_HASH_FLAGS:
1584 		status = xenvif_get_hash_flags(vif, &data);
1585 		break;
1586 
1587 	case XEN_NETIF_CTRL_TYPE_SET_HASH_FLAGS:
1588 		status = xenvif_set_hash_flags(vif, req->data[0]);
1589 		break;
1590 
1591 	case XEN_NETIF_CTRL_TYPE_SET_HASH_KEY:
1592 		status = xenvif_set_hash_key(vif, req->data[0],
1593 					     req->data[1]);
1594 		break;
1595 
1596 	case XEN_NETIF_CTRL_TYPE_GET_HASH_MAPPING_SIZE:
1597 		status = XEN_NETIF_CTRL_STATUS_SUCCESS;
1598 		data = XEN_NETBK_MAX_HASH_MAPPING_SIZE;
1599 		break;
1600 
1601 	case XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE:
1602 		status = xenvif_set_hash_mapping_size(vif,
1603 						      req->data[0]);
1604 		break;
1605 
1606 	case XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING:
1607 		status = xenvif_set_hash_mapping(vif, req->data[0],
1608 						 req->data[1],
1609 						 req->data[2]);
1610 		break;
1611 
1612 	default:
1613 		break;
1614 	}
1615 
1616 	make_ctrl_response(vif, req, status, data);
1617 	push_ctrl_response(vif);
1618 }
1619 
1620 static void xenvif_ctrl_action(struct xenvif *vif)
1621 {
1622 	for (;;) {
1623 		RING_IDX req_prod, req_cons;
1624 
1625 		req_prod = vif->ctrl.sring->req_prod;
1626 		req_cons = vif->ctrl.req_cons;
1627 
1628 		/* Make sure we can see requests before we process them. */
1629 		rmb();
1630 
1631 		if (req_cons == req_prod)
1632 			break;
1633 
1634 		while (req_cons != req_prod) {
1635 			struct xen_netif_ctrl_request req;
1636 
1637 			RING_COPY_REQUEST(&vif->ctrl, req_cons, &req);
1638 			req_cons++;
1639 
1640 			process_ctrl_request(vif, &req);
1641 		}
1642 
1643 		vif->ctrl.req_cons = req_cons;
1644 		vif->ctrl.sring->req_event = req_cons + 1;
1645 	}
1646 }
1647 
1648 static bool xenvif_ctrl_work_todo(struct xenvif *vif)
1649 {
1650 	if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->ctrl)))
1651 		return true;
1652 
1653 	return false;
1654 }
1655 
1656 irqreturn_t xenvif_ctrl_irq_fn(int irq, void *data)
1657 {
1658 	struct xenvif *vif = data;
1659 	unsigned int eoi_flag = XEN_EOI_FLAG_SPURIOUS;
1660 
1661 	while (xenvif_ctrl_work_todo(vif)) {
1662 		xenvif_ctrl_action(vif);
1663 		eoi_flag = 0;
1664 	}
1665 
1666 	xen_irq_lateeoi(irq, eoi_flag);
1667 
1668 	return IRQ_HANDLED;
1669 }
1670 
1671 static int __init netback_init(void)
1672 {
1673 	int rc = 0;
1674 
1675 	if (!xen_domain())
1676 		return -ENODEV;
1677 
	/* Allow as many queues as there are CPUs, but at most 8, if the user
	 * has not specified a value.
	 */
1681 	if (xenvif_max_queues == 0)
1682 		xenvif_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT,
1683 					  num_online_cpus());
1684 
1685 	if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) {
1686 		pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n",
1687 			fatal_skb_slots, XEN_NETBK_LEGACY_SLOTS_MAX);
1688 		fatal_skb_slots = XEN_NETBK_LEGACY_SLOTS_MAX;
1689 	}
1690 
1691 	rc = xenvif_xenbus_init();
1692 	if (rc)
1693 		goto failed_init;
1694 
1695 #ifdef CONFIG_DEBUG_FS
1696 	xen_netback_dbg_root = debugfs_create_dir("xen-netback", NULL);
1697 #endif /* CONFIG_DEBUG_FS */
1698 
1699 	return 0;
1700 
1701 failed_init:
1702 	return rc;
1703 }
1704 
1705 module_init(netback_init);
1706 
1707 static void __exit netback_fini(void)
1708 {
1709 #ifdef CONFIG_DEBUG_FS
1710 	debugfs_remove_recursive(xen_netback_dbg_root);
1711 #endif /* CONFIG_DEBUG_FS */
1712 	xenvif_xenbus_fini();
1713 }
1714 module_exit(netback_fini);
1715 
1716 MODULE_LICENSE("Dual BSD/GPL");
1717 MODULE_ALIAS("xen-backend:vif");
1718