xref: /openbmc/linux/drivers/net/xen-netfront.c (revision 384740dc)
1 /*
2  * Virtual network driver for conversing with remote driver backends.
3  *
4  * Copyright (c) 2002-2005, K A Fraser
5  * Copyright (c) 2005, XenSource Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License version 2
9  * as published by the Free Software Foundation; or, when distributed
10  * separately from the Linux kernel or incorporated into other
11  * software packages, subject to the following license:
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this source file (the "Software"), to deal in the Software without
15  * restriction, including without limitation the rights to use, copy, modify,
16  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17  * and to permit persons to whom the Software is furnished to do so, subject to
18  * the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
29  * IN THE SOFTWARE.
30  */
31 
32 #include <linux/module.h>
33 #include <linux/kernel.h>
34 #include <linux/netdevice.h>
35 #include <linux/etherdevice.h>
36 #include <linux/skbuff.h>
37 #include <linux/ethtool.h>
38 #include <linux/if_ether.h>
39 #include <linux/tcp.h>
40 #include <linux/udp.h>
41 #include <linux/moduleparam.h>
42 #include <linux/mm.h>
43 #include <net/ip.h>
44 
45 #include <xen/xenbus.h>
46 #include <xen/events.h>
47 #include <xen/page.h>
48 #include <xen/grant_table.h>
49 
50 #include <xen/interface/io/netif.h>
51 #include <xen/interface/memory.h>
52 #include <xen/interface/grant_table.h>
53 
54 static struct ethtool_ops xennet_ethtool_ops;
55 
56 struct netfront_cb {
57 	struct page *page;
58 	unsigned offset;
59 };
60 
61 #define NETFRONT_SKB_CB(skb)	((struct netfront_cb *)((skb)->cb))
62 
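/* Received data up to this many bytes is copied into the skb linear area;
 * any remainder stays in the page granted by the backend and is attached as
 * a fragment (see xennet_poll() and handle_incoming_queue()). */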
63 #define RX_COPY_THRESHOLD 256
64 
65 #define GRANT_INVALID_REF	0
66 
67 #define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
68 #define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
69 #define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
70 
71 struct netfront_info {
72 	struct list_head list;
73 	struct net_device *netdev;
74 
75 	struct napi_struct napi;
76 
77 	unsigned int evtchn;
78 	struct xenbus_device *xbdev;
79 
80 	spinlock_t   tx_lock;
81 	struct xen_netif_tx_front_ring tx;
82 	int tx_ring_ref;
83 
84 	/*
85 	 * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries
86 	 * are linked from tx_skb_freelist through skb_entry.link.
87 	 *
88 	 *  NB. Freelist index entries are always going to be less than
89 	 *  PAGE_OFFSET, whereas pointers to skbs will always be equal to or
90 	 *  greater than PAGE_OFFSET: we use this property to distinguish
91 	 *  them.
92 	 */
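	/* Example of the encoding: if tx_skb_freelist == 2 and tx_skbs[2].link == 5,
	 * get_id_from_freelist() hands out slot 2 and advances the head to 5;
	 * add_id_to_freelist() relinks a freed slot onto the head of the chain. */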
93 	union skb_entry {
94 		struct sk_buff *skb;
95 		unsigned long link;
96 	} tx_skbs[NET_TX_RING_SIZE];
97 	grant_ref_t gref_tx_head;
98 	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
99 	unsigned tx_skb_freelist;
100 
101 	spinlock_t   rx_lock ____cacheline_aligned_in_smp;
102 	struct xen_netif_rx_front_ring rx;
103 	int rx_ring_ref;
104 
105 	/* Receive-ring batched refills. */
106 #define RX_MIN_TARGET 8
107 #define RX_DFL_MIN_TARGET 64
108 #define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
109 	unsigned rx_min_target, rx_max_target, rx_target;
110 	struct sk_buff_head rx_batch;
111 
112 	struct timer_list rx_refill_timer;
113 
114 	struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
115 	grant_ref_t gref_rx_head;
116 	grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
117 
118 	unsigned long rx_pfn_array[NET_RX_RING_SIZE];
119 	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
120 	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
121 };
122 
123 struct netfront_rx_info {
124 	struct xen_netif_rx_response rx;
125 	struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
126 };
127 
128 static void skb_entry_set_link(union skb_entry *list, unsigned short id)
129 {
130 	list->link = id;
131 }
132 
133 static int skb_entry_is_link(const union skb_entry *list)
134 {
135 	BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link));
136 	return ((unsigned long)list->skb < PAGE_OFFSET);
137 }
138 
139 /*
140  * Helpers for acquiring and freeing slots in tx_skbs[].
141  */
142 
143 static void add_id_to_freelist(unsigned *head, union skb_entry *list,
144 			       unsigned short id)
145 {
146 	skb_entry_set_link(&list[id], *head);
147 	*head = id;
148 }
149 
150 static unsigned short get_id_from_freelist(unsigned *head,
151 					   union skb_entry *list)
152 {
153 	unsigned int id = *head;
154 	*head = list[id].link;
155 	return id;
156 }
157 
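/* The ring sizes from __RING_SIZE() are powers of two, so masking a ring
 * index yields its slot in rx_skbs[] / grant_rx_ref[]. */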
158 static int xennet_rxidx(RING_IDX idx)
159 {
160 	return idx & (NET_RX_RING_SIZE - 1);
161 }
162 
163 static struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
164 					 RING_IDX ri)
165 {
166 	int i = xennet_rxidx(ri);
167 	struct sk_buff *skb = np->rx_skbs[i];
168 	np->rx_skbs[i] = NULL;
169 	return skb;
170 }
171 
172 static grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
173 					    RING_IDX ri)
174 {
175 	int i = xennet_rxidx(ri);
176 	grant_ref_t ref = np->grant_rx_ref[i];
177 	np->grant_rx_ref[i] = GRANT_INVALID_REF;
178 	return ref;
179 }
180 
181 #ifdef CONFIG_SYSFS
182 static int xennet_sysfs_addif(struct net_device *netdev);
183 static void xennet_sysfs_delif(struct net_device *netdev);
184 #else /* !CONFIG_SYSFS */
185 #define xennet_sysfs_addif(dev) (0)
186 #define xennet_sysfs_delif(dev) do { } while (0)
187 #endif
188 
189 static int xennet_can_sg(struct net_device *dev)
190 {
191 	return dev->features & NETIF_F_SG;
192 }
193 
194 
195 static void rx_refill_timeout(unsigned long data)
196 {
197 	struct net_device *dev = (struct net_device *)data;
198 	struct netfront_info *np = netdev_priv(dev);
199 	netif_rx_schedule(dev, &np->napi);
200 }
201 
202 static int netfront_tx_slot_available(struct netfront_info *np)
203 {
204 	return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
205 		(TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
206 }
207 
208 static void xennet_maybe_wake_tx(struct net_device *dev)
209 {
210 	struct netfront_info *np = netdev_priv(dev);
211 
212 	if (unlikely(netif_queue_stopped(dev)) &&
213 	    netfront_tx_slot_available(np) &&
214 	    likely(netif_running(dev)))
215 		netif_wake_queue(dev);
216 }
217 
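/*
 * Refill the receive ring: allocate skbs and backing pages into rx_batch,
 * then grant each page to the backend and queue one request per buffer.
 * If no memory is available, rx_refill_timer retries the refill later.
 */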
218 static void xennet_alloc_rx_buffers(struct net_device *dev)
219 {
220 	unsigned short id;
221 	struct netfront_info *np = netdev_priv(dev);
222 	struct sk_buff *skb;
223 	struct page *page;
224 	int i, batch_target, notify;
225 	RING_IDX req_prod = np->rx.req_prod_pvt;
226 	grant_ref_t ref;
227 	unsigned long pfn;
228 	void *vaddr;
229 	struct xen_netif_rx_request *req;
230 
231 	if (unlikely(!netif_carrier_ok(dev)))
232 		return;
233 
234 	/*
235 	 * Allocate skbuffs greedily, even though we batch updates to the
236 	 * receive ring. This creates a less bursty demand on the memory
237 	 * allocator, so should reduce the chance of failed allocation requests
238 	 * both for ourselves and for other kernel subsystems.
239 	 */
240 	batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
241 	for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
242 		skb = __netdev_alloc_skb(dev, RX_COPY_THRESHOLD,
243 					 GFP_ATOMIC | __GFP_NOWARN);
244 		if (unlikely(!skb))
245 			goto no_skb;
246 
247 		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
248 		if (!page) {
249 			kfree_skb(skb);
250 no_skb:
251 			/* Any skbuffs queued for refill? Force them out. */
252 			if (i != 0)
253 				goto refill;
254 			/* Could not allocate any skbuffs. Try again later. */
255 			mod_timer(&np->rx_refill_timer,
256 				  jiffies + (HZ/10));
257 			break;
258 		}
259 
260 		skb_shinfo(skb)->frags[0].page = page;
261 		skb_shinfo(skb)->nr_frags = 1;
262 		__skb_queue_tail(&np->rx_batch, skb);
263 	}
264 
265 	/* Is the batch large enough to be worthwhile? */
266 	if (i < (np->rx_target/2)) {
267 		if (req_prod > np->rx.sring->req_prod)
268 			goto push;
269 		return;
270 	}
271 
272 	/* Adjust our fill target if we risked running out of buffers. */
273 	if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
274 	    ((np->rx_target *= 2) > np->rx_max_target))
275 		np->rx_target = np->rx_max_target;
276 
277  refill:
278 	for (i = 0; ; i++) {
279 		skb = __skb_dequeue(&np->rx_batch);
280 		if (skb == NULL)
281 			break;
282 
283 		skb->dev = dev;
284 
285 		id = xennet_rxidx(req_prod + i);
286 
287 		BUG_ON(np->rx_skbs[id]);
288 		np->rx_skbs[id] = skb;
289 
290 		ref = gnttab_claim_grant_reference(&np->gref_rx_head);
291 		BUG_ON((signed short)ref < 0);
292 		np->grant_rx_ref[id] = ref;
293 
294 		pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
295 		vaddr = page_address(skb_shinfo(skb)->frags[0].page);
296 
297 		req = RING_GET_REQUEST(&np->rx, req_prod + i);
298 		gnttab_grant_foreign_access_ref(ref,
299 						np->xbdev->otherend_id,
300 						pfn_to_mfn(pfn),
301 						0);
302 
303 		req->id = id;
304 		req->gref = ref;
305 	}
306 
307 	wmb();		/* barrier so backend sees requests */
308 
309 	/* The barrier above orders the request writes before the producer-index update below. */
310 	np->rx.req_prod_pvt = req_prod + i;
311  push:
312 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
313 	if (notify)
314 		notify_remote_via_irq(np->netdev->irq);
315 }
316 
317 static int xennet_open(struct net_device *dev)
318 {
319 	struct netfront_info *np = netdev_priv(dev);
320 
321 	napi_enable(&np->napi);
322 
323 	spin_lock_bh(&np->rx_lock);
324 	if (netif_carrier_ok(dev)) {
325 		xennet_alloc_rx_buffers(dev);
326 		np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
327 		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
328 			netif_rx_schedule(dev, &np->napi);
329 	}
330 	spin_unlock_bh(&np->rx_lock);
331 
332 	netif_start_queue(dev);
333 
334 	return 0;
335 }
336 
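/*
 * Reclaim transmit slots for which the backend has produced responses:
 * end and release each grant, return the slot id to the freelist, free the
 * skb, and re-arm rsp_event so further completions raise an interrupt.
 */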
337 static void xennet_tx_buf_gc(struct net_device *dev)
338 {
339 	RING_IDX cons, prod;
340 	unsigned short id;
341 	struct netfront_info *np = netdev_priv(dev);
342 	struct sk_buff *skb;
343 
344 	BUG_ON(!netif_carrier_ok(dev));
345 
346 	do {
347 		prod = np->tx.sring->rsp_prod;
348 		rmb(); /* Ensure we see responses up to 'prod'. */
349 
350 		for (cons = np->tx.rsp_cons; cons != prod; cons++) {
351 			struct xen_netif_tx_response *txrsp;
352 
353 			txrsp = RING_GET_RESPONSE(&np->tx, cons);
354 			if (txrsp->status == NETIF_RSP_NULL)
355 				continue;
356 
357 			id  = txrsp->id;
358 			skb = np->tx_skbs[id].skb;
359 			if (unlikely(gnttab_query_foreign_access(
360 				np->grant_tx_ref[id]) != 0)) {
361 				printk(KERN_ALERT "xennet_tx_buf_gc: warning "
362 				       "-- grant still in use by backend "
363 				       "domain.\n");
364 				BUG();
365 			}
366 			gnttab_end_foreign_access_ref(
367 				np->grant_tx_ref[id], GNTMAP_readonly);
368 			gnttab_release_grant_reference(
369 				&np->gref_tx_head, np->grant_tx_ref[id]);
370 			np->grant_tx_ref[id] = GRANT_INVALID_REF;
371 			add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id);
372 			dev_kfree_skb_irq(skb);
373 		}
374 
375 		np->tx.rsp_cons = prod;
376 
377 		/*
378 		 * Set a new event, then check for race with update of tx_cons.
379 		 * Note that it is essential to schedule a callback, no matter
380 		 * how few buffers are pending. Even if there is space in the
381 		 * transmit ring, higher layers may be blocked because too much
382 		 * data is outstanding: in such cases notification from Xen is
383 		 * likely to be the only kick that we'll get.
384 		 */
385 		np->tx.sring->rsp_event =
386 			prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
387 		mb();		/* update shared area */
388 	} while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
389 
390 	xennet_maybe_wake_tx(dev);
391 }
392 
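/*
 * Queue additional TX requests for any part of the linear header that
 * crosses a page boundary and for each page fragment; every request takes
 * a fresh slot id and a read-only grant on its page.
 */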
393 static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
394 			      struct xen_netif_tx_request *tx)
395 {
396 	struct netfront_info *np = netdev_priv(dev);
397 	char *data = skb->data;
398 	unsigned long mfn;
399 	RING_IDX prod = np->tx.req_prod_pvt;
400 	int frags = skb_shinfo(skb)->nr_frags;
401 	unsigned int offset = offset_in_page(data);
402 	unsigned int len = skb_headlen(skb);
403 	unsigned int id;
404 	grant_ref_t ref;
405 	int i;
406 
407 	/* While the header overlaps a page boundary (including being
408 	   larger than a page), split it into page-sized chunks. */
409 	while (len > PAGE_SIZE - offset) {
410 		tx->size = PAGE_SIZE - offset;
411 		tx->flags |= NETTXF_more_data;
412 		len -= tx->size;
413 		data += tx->size;
414 		offset = 0;
415 
416 		id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
417 		np->tx_skbs[id].skb = skb_get(skb);
418 		tx = RING_GET_REQUEST(&np->tx, prod++);
419 		tx->id = id;
420 		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
421 		BUG_ON((signed short)ref < 0);
422 
423 		mfn = virt_to_mfn(data);
424 		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
425 						mfn, GNTMAP_readonly);
426 
427 		tx->gref = np->grant_tx_ref[id] = ref;
428 		tx->offset = offset;
429 		tx->size = len;
430 		tx->flags = 0;
431 	}
432 
433 	/* Grant backend access to each skb fragment page. */
434 	for (i = 0; i < frags; i++) {
435 		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
436 
437 		tx->flags |= NETTXF_more_data;
438 
439 		id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
440 		np->tx_skbs[id].skb = skb_get(skb);
441 		tx = RING_GET_REQUEST(&np->tx, prod++);
442 		tx->id = id;
443 		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
444 		BUG_ON((signed short)ref < 0);
445 
446 		mfn = pfn_to_mfn(page_to_pfn(frag->page));
447 		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
448 						mfn, GNTMAP_readonly);
449 
450 		tx->gref = np->grant_tx_ref[id] = ref;
451 		tx->offset = frag->page_offset;
452 		tx->size = frag->size;
453 		tx->flags = 0;
454 	}
455 
456 	np->tx.req_prod_pvt = prod;
457 }
458 
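/*
 * Transmit entry point: grant the linear header to the backend, append a
 * GSO extra-info slot if needed, emit further requests for the remaining
 * data via xennet_make_frags(), then push the ring and notify the backend.
 */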
459 static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
460 {
461 	unsigned short id;
462 	struct netfront_info *np = netdev_priv(dev);
463 	struct xen_netif_tx_request *tx;
464 	struct xen_netif_extra_info *extra;
465 	char *data = skb->data;
466 	RING_IDX i;
467 	grant_ref_t ref;
468 	unsigned long mfn;
469 	int notify;
470 	int frags = skb_shinfo(skb)->nr_frags;
471 	unsigned int offset = offset_in_page(data);
472 	unsigned int len = skb_headlen(skb);
473 
474 	frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
475 	if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
476 		printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
477 		       frags);
478 		dump_stack();
479 		goto drop;
480 	}
481 
482 	spin_lock_irq(&np->tx_lock);
483 
484 	if (unlikely(!netif_carrier_ok(dev) ||
485 		     (frags > 1 && !xennet_can_sg(dev)) ||
486 		     netif_needs_gso(dev, skb))) {
487 		spin_unlock_irq(&np->tx_lock);
488 		goto drop;
489 	}
490 
491 	i = np->tx.req_prod_pvt;
492 
493 	id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
494 	np->tx_skbs[id].skb = skb;
495 
496 	tx = RING_GET_REQUEST(&np->tx, i);
497 
498 	tx->id   = id;
499 	ref = gnttab_claim_grant_reference(&np->gref_tx_head);
500 	BUG_ON((signed short)ref < 0);
501 	mfn = virt_to_mfn(data);
502 	gnttab_grant_foreign_access_ref(
503 		ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
504 	tx->gref = np->grant_tx_ref[id] = ref;
505 	tx->offset = offset;
506 	tx->size = len;
507 	extra = NULL;
508 
509 	tx->flags = 0;
510 	if (skb->ip_summed == CHECKSUM_PARTIAL)
511 		/* local packet? */
512 		tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
513 	else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
514 		/* remote but checksummed. */
515 		tx->flags |= NETTXF_data_validated;
516 
517 	if (skb_shinfo(skb)->gso_size) {
518 		struct xen_netif_extra_info *gso;
519 
520 		gso = (struct xen_netif_extra_info *)
521 			RING_GET_REQUEST(&np->tx, ++i);
522 
523 		if (extra)
524 			extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
525 		else
526 			tx->flags |= NETTXF_extra_info;
527 
528 		gso->u.gso.size = skb_shinfo(skb)->gso_size;
529 		gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
530 		gso->u.gso.pad = 0;
531 		gso->u.gso.features = 0;
532 
533 		gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
534 		gso->flags = 0;
535 		extra = gso;
536 	}
537 
538 	np->tx.req_prod_pvt = i + 1;
539 
540 	xennet_make_frags(skb, dev, tx);
541 	tx->size = skb->len;
542 
543 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
544 	if (notify)
545 		notify_remote_via_irq(np->netdev->irq);
546 
547 	dev->stats.tx_bytes += skb->len;
548 	dev->stats.tx_packets++;
549 
550 	/* Note: It is not safe to access skb after xennet_tx_buf_gc()! */
551 	xennet_tx_buf_gc(dev);
552 
553 	if (!netfront_tx_slot_available(np))
554 		netif_stop_queue(dev);
555 
556 	spin_unlock_irq(&np->tx_lock);
557 
558 	return 0;
559 
560  drop:
561 	dev->stats.tx_dropped++;
562 	dev_kfree_skb(skb);
563 	return 0;
564 }
565 
566 static int xennet_close(struct net_device *dev)
567 {
568 	struct netfront_info *np = netdev_priv(dev);
569 	netif_stop_queue(np->netdev);
570 	napi_disable(&np->napi);
571 	return 0;
572 }
573 
574 static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
575 				grant_ref_t ref)
576 {
577 	int new = xennet_rxidx(np->rx.req_prod_pvt);
578 
579 	BUG_ON(np->rx_skbs[new]);
580 	np->rx_skbs[new] = skb;
581 	np->grant_rx_ref[new] = ref;
582 	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
583 	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
584 	np->rx.req_prod_pvt++;
585 }
586 
587 static int xennet_get_extras(struct netfront_info *np,
588 			     struct xen_netif_extra_info *extras,
589 			     RING_IDX rp)
590 
591 {
592 	struct xen_netif_extra_info *extra;
593 	struct device *dev = &np->netdev->dev;
594 	RING_IDX cons = np->rx.rsp_cons;
595 	int err = 0;
596 
597 	do {
598 		struct sk_buff *skb;
599 		grant_ref_t ref;
600 
601 		if (unlikely(cons + 1 == rp)) {
602 			if (net_ratelimit())
603 				dev_warn(dev, "Missing extra info\n");
604 			err = -EBADR;
605 			break;
606 		}
607 
608 		extra = (struct xen_netif_extra_info *)
609 			RING_GET_RESPONSE(&np->rx, ++cons);
610 
611 		if (unlikely(!extra->type ||
612 			     extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
613 			if (net_ratelimit())
614 				dev_warn(dev, "Invalid extra type: %d\n",
615 					extra->type);
616 			err = -EINVAL;
617 		} else {
618 			memcpy(&extras[extra->type - 1], extra,
619 			       sizeof(*extra));
620 		}
621 
622 		skb = xennet_get_rx_skb(np, cons);
623 		ref = xennet_get_rx_ref(np, cons);
624 		xennet_move_rx_slot(np, skb, ref);
625 	} while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
626 
627 	np->rx.rsp_cons = cons;
628 	return err;
629 }
630 
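/*
 * Gather the chain of responses (and any extra-info slots) that make up a
 * single received packet, ending the grant on each page and queueing the
 * per-page skbs on 'list' for xennet_fill_frags().
 */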
631 static int xennet_get_responses(struct netfront_info *np,
632 				struct netfront_rx_info *rinfo, RING_IDX rp,
633 				struct sk_buff_head *list)
634 {
635 	struct xen_netif_rx_response *rx = &rinfo->rx;
636 	struct xen_netif_extra_info *extras = rinfo->extras;
637 	struct device *dev = &np->netdev->dev;
638 	RING_IDX cons = np->rx.rsp_cons;
639 	struct sk_buff *skb = xennet_get_rx_skb(np, cons);
640 	grant_ref_t ref = xennet_get_rx_ref(np, cons);
641 	int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
642 	int frags = 1;
643 	int err = 0;
644 	unsigned long ret;
645 
646 	if (rx->flags & NETRXF_extra_info) {
647 		err = xennet_get_extras(np, extras, rp);
648 		cons = np->rx.rsp_cons;
649 	}
650 
651 	for (;;) {
652 		if (unlikely(rx->status < 0 ||
653 			     rx->offset + rx->status > PAGE_SIZE)) {
654 			if (net_ratelimit())
655 				dev_warn(dev, "rx->offset: %x, size: %u\n",
656 					 rx->offset, rx->status);
657 			xennet_move_rx_slot(np, skb, ref);
658 			err = -EINVAL;
659 			goto next;
660 		}
661 
662 		/*
663 		 * This definitely indicates a bug, either in this driver or in
664 		 * the backend driver. In the future this should flag the bad
665 		 * situation to the system controller so it can reboot the backend.
666 		 */
667 		if (ref == GRANT_INVALID_REF) {
668 			if (net_ratelimit())
669 				dev_warn(dev, "Bad rx response id %d.\n",
670 					 rx->id);
671 			err = -EINVAL;
672 			goto next;
673 		}
674 
675 		ret = gnttab_end_foreign_access_ref(ref, 0);
676 		BUG_ON(!ret);
677 
678 		gnttab_release_grant_reference(&np->gref_rx_head, ref);
679 
680 		__skb_queue_tail(list, skb);
681 
682 next:
683 		if (!(rx->flags & NETRXF_more_data))
684 			break;
685 
686 		if (cons + frags == rp) {
687 			if (net_ratelimit())
688 				dev_warn(dev, "Need more frags\n");
689 			err = -ENOENT;
690 			break;
691 		}
692 
693 		rx = RING_GET_RESPONSE(&np->rx, cons + frags);
694 		skb = xennet_get_rx_skb(np, cons + frags);
695 		ref = xennet_get_rx_ref(np, cons + frags);
696 		frags++;
697 	}
698 
699 	if (unlikely(frags > max)) {
700 		if (net_ratelimit())
701 			dev_warn(dev, "Too many frags\n");
702 		err = -E2BIG;
703 	}
704 
705 	if (unlikely(err))
706 		np->rx.rsp_cons = cons + frags;
707 
708 	return err;
709 }
710 
711 static int xennet_set_skb_gso(struct sk_buff *skb,
712 			      struct xen_netif_extra_info *gso)
713 {
714 	if (!gso->u.gso.size) {
715 		if (net_ratelimit())
716 			printk(KERN_WARNING "GSO size must not be zero.\n");
717 		return -EINVAL;
718 	}
719 
720 	/* Currently only TCPv4 S.O. is supported. */
721 	if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
722 		if (net_ratelimit())
723 			printk(KERN_WARNING "Bad GSO type %d.\n", gso->u.gso.type);
724 		return -EINVAL;
725 	}
726 
727 	skb_shinfo(skb)->gso_size = gso->u.gso.size;
728 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
729 
730 	/* Header must be checked, and gso_segs computed. */
731 	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
732 	skb_shinfo(skb)->gso_segs = 0;
733 
734 	return 0;
735 }
736 
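/*
 * Convert the per-page skbs collected by xennet_get_responses() into page
 * fragments of the head skb, consuming one RX response per page; returns
 * the updated response-consumer index.
 */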
737 static RING_IDX xennet_fill_frags(struct netfront_info *np,
738 				  struct sk_buff *skb,
739 				  struct sk_buff_head *list)
740 {
741 	struct skb_shared_info *shinfo = skb_shinfo(skb);
742 	int nr_frags = shinfo->nr_frags;
743 	RING_IDX cons = np->rx.rsp_cons;
744 	skb_frag_t *frag = shinfo->frags + nr_frags;
745 	struct sk_buff *nskb;
746 
747 	while ((nskb = __skb_dequeue(list))) {
748 		struct xen_netif_rx_response *rx =
749 			RING_GET_RESPONSE(&np->rx, ++cons);
750 
751 		frag->page = skb_shinfo(nskb)->frags[0].page;
752 		frag->page_offset = rx->offset;
753 		frag->size = rx->status;
754 
755 		skb->data_len += rx->status;
756 
757 		skb_shinfo(nskb)->nr_frags = 0;
758 		kfree_skb(nskb);
759 
760 		frag++;
761 		nr_frags++;
762 	}
763 
764 	shinfo->nr_frags = nr_frags;
765 	return cons;
766 }
767 
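/*
 * For CHECKSUM_PARTIAL packets (csum_blank from the backend), locate the
 * TCP/UDP header and set csum_start/csum_offset so the checksum can be
 * completed later. Only IPv4 TCP and UDP are handled here.
 */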
768 static int skb_checksum_setup(struct sk_buff *skb)
769 {
770 	struct iphdr *iph;
771 	unsigned char *th;
772 	int err = -EPROTO;
773 
774 	if (skb->protocol != htons(ETH_P_IP))
775 		goto out;
776 
777 	iph = (void *)skb->data;
778 	th = skb->data + 4 * iph->ihl;
779 	if (th >= skb_tail_pointer(skb))
780 		goto out;
781 
782 	skb->csum_start = th - skb->head;
783 	switch (iph->protocol) {
784 	case IPPROTO_TCP:
785 		skb->csum_offset = offsetof(struct tcphdr, check);
786 		break;
787 	case IPPROTO_UDP:
788 		skb->csum_offset = offsetof(struct udphdr, check);
789 		break;
790 	default:
791 		if (net_ratelimit())
792 			printk(KERN_ERR "Attempting to checksum a non-"
793 			       "TCP/UDP packet, dropping a protocol"
794 			       " %d packet\n", iph->protocol);
795 		goto out;
796 	}
797 
798 	if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
799 		goto out;
800 
801 	err = 0;
802 
803 out:
804 	return err;
805 }
806 
807 static int handle_incoming_queue(struct net_device *dev,
808 				 struct sk_buff_head *rxq)
809 {
810 	int packets_dropped = 0;
811 	struct sk_buff *skb;
812 
813 	while ((skb = __skb_dequeue(rxq)) != NULL) {
814 		struct page *page = NETFRONT_SKB_CB(skb)->page;
815 		void *vaddr = page_address(page);
816 		unsigned offset = NETFRONT_SKB_CB(skb)->offset;
817 
818 		memcpy(skb->data, vaddr + offset,
819 		       skb_headlen(skb));
820 
821 		if (page != skb_shinfo(skb)->frags[0].page)
822 			__free_page(page);
823 
824 		/* Ethernet work: delayed to here as it peeks at the header. */
825 		skb->protocol = eth_type_trans(skb, dev);
826 
827 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
828 			if (skb_checksum_setup(skb)) {
829 				kfree_skb(skb);
830 				packets_dropped++;
831 				dev->stats.rx_errors++;
832 				continue;
833 			}
834 		}
835 
836 		dev->stats.rx_packets++;
837 		dev->stats.rx_bytes += skb->len;
838 
839 		/* Pass it up. */
840 		netif_receive_skb(skb);
841 		dev->last_rx = jiffies;
842 	}
843 
844 	return packets_dropped;
845 }
846 
847 static int xennet_poll(struct napi_struct *napi, int budget)
848 {
849 	struct netfront_info *np = container_of(napi, struct netfront_info, napi);
850 	struct net_device *dev = np->netdev;
851 	struct sk_buff *skb;
852 	struct netfront_rx_info rinfo;
853 	struct xen_netif_rx_response *rx = &rinfo.rx;
854 	struct xen_netif_extra_info *extras = rinfo.extras;
855 	RING_IDX i, rp;
856 	int work_done;
857 	struct sk_buff_head rxq;
858 	struct sk_buff_head errq;
859 	struct sk_buff_head tmpq;
860 	unsigned long flags;
861 	unsigned int len;
862 	int err;
863 
864 	spin_lock(&np->rx_lock);
865 
866 	skb_queue_head_init(&rxq);
867 	skb_queue_head_init(&errq);
868 	skb_queue_head_init(&tmpq);
869 
870 	rp = np->rx.sring->rsp_prod;
871 	rmb(); /* Ensure we see queued responses up to 'rp'. */
872 
873 	i = np->rx.rsp_cons;
874 	work_done = 0;
875 	while ((i != rp) && (work_done < budget)) {
876 		memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
877 		memset(extras, 0, sizeof(rinfo.extras));
878 
879 		err = xennet_get_responses(np, &rinfo, rp, &tmpq);
880 
881 		if (unlikely(err)) {
882 err:
883 			while ((skb = __skb_dequeue(&tmpq)))
884 				__skb_queue_tail(&errq, skb);
885 			dev->stats.rx_errors++;
886 			i = np->rx.rsp_cons;
887 			continue;
888 		}
889 
890 		skb = __skb_dequeue(&tmpq);
891 
892 		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
893 			struct xen_netif_extra_info *gso;
894 			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
895 
896 			if (unlikely(xennet_set_skb_gso(skb, gso))) {
897 				__skb_queue_head(&tmpq, skb);
898 				np->rx.rsp_cons += skb_queue_len(&tmpq);
899 				goto err;
900 			}
901 		}
902 
903 		NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
904 		NETFRONT_SKB_CB(skb)->offset = rx->offset;
905 
906 		len = rx->status;
907 		if (len > RX_COPY_THRESHOLD)
908 			len = RX_COPY_THRESHOLD;
909 		skb_put(skb, len);
910 
911 		if (rx->status > len) {
912 			skb_shinfo(skb)->frags[0].page_offset =
913 				rx->offset + len;
914 			skb_shinfo(skb)->frags[0].size = rx->status - len;
915 			skb->data_len = rx->status - len;
916 		} else {
917 			skb_shinfo(skb)->frags[0].page = NULL;
918 			skb_shinfo(skb)->nr_frags = 0;
919 		}
920 
921 		i = xennet_fill_frags(np, skb, &tmpq);
922 
923 		/*
924 		 * Truesize approximates the size of true data plus
925 		 * any supervisor overheads. Adding hypervisor
926 		 * overheads has been shown to significantly reduce
927 		 * achievable bandwidth with the default receive
928 		 * buffer size. It is therefore not wise to account
929 		 * for it here.
930 		 *
931 		 * After alloc_skb(RX_COPY_THRESHOLD), truesize is set
932 		 * to RX_COPY_THRESHOLD + the supervisor
933 		 * overheads. Here, we add the size of the data pulled
934 		 * in xennet_fill_frags().
935 		 *
936 		 * We also adjust for any unused space in the main
937 		 * data area by subtracting (RX_COPY_THRESHOLD -
938 		 * len). This is especially important with drivers
939 		 * which split incoming packets into header and data,
940 		 * using only 66 bytes of the main data area (see the
941 		 * e1000 driver for example.)  On such systems,
942 		 * without this last adjustment, our achievable
943 		 * receive throughput using the standard receive
944 		 * buffer size was cut by 25%(!!!).
945 		 */
946 		skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
947 		skb->len += skb->data_len;
948 
949 		if (rx->flags & NETRXF_csum_blank)
950 			skb->ip_summed = CHECKSUM_PARTIAL;
951 		else if (rx->flags & NETRXF_data_validated)
952 			skb->ip_summed = CHECKSUM_UNNECESSARY;
953 
954 		__skb_queue_tail(&rxq, skb);
955 
956 		np->rx.rsp_cons = ++i;
957 		work_done++;
958 	}
959 
960 	__skb_queue_purge(&errq);
961 
962 	work_done -= handle_incoming_queue(dev, &rxq);
963 
964 	/* If we get a callback with very few responses, reduce fill target. */
965 	/* NB. Note exponential increase, linear decrease. */
966 	if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
967 	     ((3*np->rx_target) / 4)) &&
968 	    (--np->rx_target < np->rx_min_target))
969 		np->rx_target = np->rx_min_target;
970 
971 	xennet_alloc_rx_buffers(dev);
972 
973 	if (work_done < budget) {
974 		int more_to_do = 0;
975 
976 		local_irq_save(flags);
977 
978 		RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
979 		if (!more_to_do)
980 			__netif_rx_complete(dev, napi);
981 
982 		local_irq_restore(flags);
983 	}
984 
985 	spin_unlock(&np->rx_lock);
986 
987 	return work_done;
988 }
989 
990 static int xennet_change_mtu(struct net_device *dev, int mtu)
991 {
992 	int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
993 
994 	if (mtu > max)
995 		return -EINVAL;
996 	dev->mtu = mtu;
997 	return 0;
998 }
999 
1000 static void xennet_release_tx_bufs(struct netfront_info *np)
1001 {
1002 	struct sk_buff *skb;
1003 	int i;
1004 
1005 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
1006 		/* Skip over entries which are actually freelist references */
1007 		if (skb_entry_is_link(&np->tx_skbs[i]))
1008 			continue;
1009 
1010 		skb = np->tx_skbs[i].skb;
1011 		gnttab_end_foreign_access_ref(np->grant_tx_ref[i],
1012 					      GNTMAP_readonly);
1013 		gnttab_release_grant_reference(&np->gref_tx_head,
1014 					       np->grant_tx_ref[i]);
1015 		np->grant_tx_ref[i] = GRANT_INVALID_REF;
1016 		add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, i);
1017 		dev_kfree_skb_irq(skb);
1018 	}
1019 }
1020 
1021 static void xennet_release_rx_bufs(struct netfront_info *np)
1022 {
1023 	struct mmu_update      *mmu = np->rx_mmu;
1024 	struct multicall_entry *mcl = np->rx_mcl;
1025 	struct sk_buff_head free_list;
1026 	struct sk_buff *skb;
1027 	unsigned long mfn;
1028 	int xfer = 0, noxfer = 0, unused = 0;
1029 	int id, ref;
1030 
1031 	dev_warn(&np->netdev->dev, "%s: fix me for copying receiver.\n",
1032 			 __func__);
1033 	return;
1034 
1035 	skb_queue_head_init(&free_list);
1036 
1037 	spin_lock_bh(&np->rx_lock);
1038 
1039 	for (id = 0; id < NET_RX_RING_SIZE; id++) {
1040 		ref = np->grant_rx_ref[id];
1041 		if (ref == GRANT_INVALID_REF) {
1042 			unused++;
1043 			continue;
1044 		}
1045 
1046 		skb = np->rx_skbs[id];
1047 		mfn = gnttab_end_foreign_transfer_ref(ref);
1048 		gnttab_release_grant_reference(&np->gref_rx_head, ref);
1049 		np->grant_rx_ref[id] = GRANT_INVALID_REF;
1050 
1051 		if (0 == mfn) {
1052 			skb_shinfo(skb)->nr_frags = 0;
1053 			dev_kfree_skb(skb);
1054 			noxfer++;
1055 			continue;
1056 		}
1057 
1058 		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1059 			/* Remap the page. */
1060 			struct page *page = skb_shinfo(skb)->frags[0].page;
1061 			unsigned long pfn = page_to_pfn(page);
1062 			void *vaddr = page_address(page);
1063 
1064 			MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
1065 						mfn_pte(mfn, PAGE_KERNEL),
1066 						0);
1067 			mcl++;
1068 			mmu->ptr = ((u64)mfn << PAGE_SHIFT)
1069 				| MMU_MACHPHYS_UPDATE;
1070 			mmu->val = pfn;
1071 			mmu++;
1072 
1073 			set_phys_to_machine(pfn, mfn);
1074 		}
1075 		__skb_queue_tail(&free_list, skb);
1076 		xfer++;
1077 	}
1078 
1079 	dev_info(&np->netdev->dev, "%s: %d xfer, %d noxfer, %d unused\n",
1080 		 __func__, xfer, noxfer, unused);
1081 
1082 	if (xfer) {
1083 		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1084 			/* Do all the remapping work and M2P updates. */
1085 			MULTI_mmu_update(mcl, np->rx_mmu, mmu - np->rx_mmu,
1086 					 NULL, DOMID_SELF);
1087 			mcl++;
1088 			HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
1089 		}
1090 	}
1091 
1092 	__skb_queue_purge(&free_list);
1093 
1094 	spin_unlock_bh(&np->rx_lock);
1095 }
1096 
1097 static void xennet_uninit(struct net_device *dev)
1098 {
1099 	struct netfront_info *np = netdev_priv(dev);
1100 	xennet_release_tx_bufs(np);
1101 	xennet_release_rx_bufs(np);
1102 	gnttab_free_grant_references(np->gref_tx_head);
1103 	gnttab_free_grant_references(np->gref_rx_head);
1104 }
1105 
1106 static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev)
1107 {
1108 	int i, err;
1109 	struct net_device *netdev;
1110 	struct netfront_info *np;
1111 
1112 	netdev = alloc_etherdev(sizeof(struct netfront_info));
1113 	if (!netdev) {
1114 		printk(KERN_WARNING "%s: alloc_etherdev failed.\n",
1115 		       __func__);
1116 		return ERR_PTR(-ENOMEM);
1117 	}
1118 
1119 	np                   = netdev_priv(netdev);
1120 	np->xbdev            = dev;
1121 
1122 	spin_lock_init(&np->tx_lock);
1123 	spin_lock_init(&np->rx_lock);
1124 
1125 	skb_queue_head_init(&np->rx_batch);
1126 	np->rx_target     = RX_DFL_MIN_TARGET;
1127 	np->rx_min_target = RX_DFL_MIN_TARGET;
1128 	np->rx_max_target = RX_MAX_TARGET;
1129 
1130 	init_timer(&np->rx_refill_timer);
1131 	np->rx_refill_timer.data = (unsigned long)netdev;
1132 	np->rx_refill_timer.function = rx_refill_timeout;
1133 
1134 	/* Initialise tx_skbs as a free chain containing every entry. */
1135 	np->tx_skb_freelist = 0;
1136 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
1137 		skb_entry_set_link(&np->tx_skbs[i], i+1);
1138 		np->grant_tx_ref[i] = GRANT_INVALID_REF;
1139 	}
1140 
1141 	/* Clear out rx_skbs */
1142 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
1143 		np->rx_skbs[i] = NULL;
1144 		np->grant_rx_ref[i] = GRANT_INVALID_REF;
1145 	}
1146 
1147 	/* A grant for every tx ring slot */
1148 	if (gnttab_alloc_grant_references(TX_MAX_TARGET,
1149 					  &np->gref_tx_head) < 0) {
1150 		printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
1151 		err = -ENOMEM;
1152 		goto exit;
1153 	}
1154 	/* A grant for every rx ring slot */
1155 	if (gnttab_alloc_grant_references(RX_MAX_TARGET,
1156 					  &np->gref_rx_head) < 0) {
1157 		printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
1158 		err = -ENOMEM;
1159 		goto exit_free_tx;
1160 	}
1161 
1162 	netdev->open            = xennet_open;
1163 	netdev->hard_start_xmit = xennet_start_xmit;
1164 	netdev->stop            = xennet_close;
1165 	netif_napi_add(netdev, &np->napi, xennet_poll, 64);
1166 	netdev->uninit          = xennet_uninit;
1167 	netdev->change_mtu	= xennet_change_mtu;
1168 	netdev->features        = NETIF_F_IP_CSUM;
1169 
1170 	SET_ETHTOOL_OPS(netdev, &xennet_ethtool_ops);
1171 	SET_NETDEV_DEV(netdev, &dev->dev);
1172 
1173 	np->netdev = netdev;
1174 
1175 	netif_carrier_off(netdev);
1176 
1177 	return netdev;
1178 
1179  exit_free_tx:
1180 	gnttab_free_grant_references(np->gref_tx_head);
1181  exit:
1182 	free_netdev(netdev);
1183 	return ERR_PTR(err);
1184 }
1185 
1186 /**
1187  * Entry point to this code when a new device is created.  Allocate the basic
1188  * structures and the ring buffers for communication with the backend, and
1189  * inform the backend of the appropriate details for those.
1190  */
1191 static int __devinit netfront_probe(struct xenbus_device *dev,
1192 				    const struct xenbus_device_id *id)
1193 {
1194 	int err;
1195 	struct net_device *netdev;
1196 	struct netfront_info *info;
1197 
1198 	netdev = xennet_create_dev(dev);
1199 	if (IS_ERR(netdev)) {
1200 		err = PTR_ERR(netdev);
1201 		xenbus_dev_fatal(dev, err, "creating netdev");
1202 		return err;
1203 	}
1204 
1205 	info = netdev_priv(netdev);
1206 	dev->dev.driver_data = info;
1207 
1208 	err = register_netdev(info->netdev);
1209 	if (err) {
1210 		printk(KERN_WARNING "%s: register_netdev err=%d\n",
1211 		       __func__, err);
1212 		goto fail;
1213 	}
1214 
1215 	err = xennet_sysfs_addif(info->netdev);
1216 	if (err) {
1217 		unregister_netdev(info->netdev);
1218 		printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
1219 		       __func__, err);
1220 		goto fail;
1221 	}
1222 
1223 	return 0;
1224 
1225  fail:
1226 	free_netdev(netdev);
1227 	dev->dev.driver_data = NULL;
1228 	return err;
1229 }
1230 
1231 static void xennet_end_access(int ref, void *page)
1232 {
1233 	/* This frees the page as a side-effect */
1234 	if (ref != GRANT_INVALID_REF)
1235 		gnttab_end_foreign_access(ref, 0, (unsigned long)page);
1236 }
1237 
1238 static void xennet_disconnect_backend(struct netfront_info *info)
1239 {
1240 	/* Stop old i/f to prevent errors whilst we rebuild the state. */
1241 	spin_lock_bh(&info->rx_lock);
1242 	spin_lock_irq(&info->tx_lock);
1243 	netif_carrier_off(info->netdev);
1244 	spin_unlock_irq(&info->tx_lock);
1245 	spin_unlock_bh(&info->rx_lock);
1246 
1247 	if (info->netdev->irq)
1248 		unbind_from_irqhandler(info->netdev->irq, info->netdev);
1249 	info->evtchn = info->netdev->irq = 0;
1250 
1251 	/* End access and free the pages */
1252 	xennet_end_access(info->tx_ring_ref, info->tx.sring);
1253 	xennet_end_access(info->rx_ring_ref, info->rx.sring);
1254 
1255 	info->tx_ring_ref = GRANT_INVALID_REF;
1256 	info->rx_ring_ref = GRANT_INVALID_REF;
1257 	info->tx.sring = NULL;
1258 	info->rx.sring = NULL;
1259 }
1260 
1261 /**
1262  * We are reconnecting to the backend, due to a suspend/resume, or a backend
1263  * driver restart.  We tear down our netif structure and recreate it, but
1264  * leave the device-layer structures intact so that this is transparent to the
1265  * rest of the kernel.
1266  */
1267 static int netfront_resume(struct xenbus_device *dev)
1268 {
1269 	struct netfront_info *info = dev->dev.driver_data;
1270 
1271 	dev_dbg(&dev->dev, "%s\n", dev->nodename);
1272 
1273 	xennet_disconnect_backend(info);
1274 	return 0;
1275 }
1276 
1277 static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
1278 {
1279 	char *s, *e, *macstr;
1280 	int i;
1281 
1282 	macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
1283 	if (IS_ERR(macstr))
1284 		return PTR_ERR(macstr);
1285 
1286 	for (i = 0; i < ETH_ALEN; i++) {
1287 		mac[i] = simple_strtoul(s, &e, 16);
1288 		if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
1289 			kfree(macstr);
1290 			return -ENOENT;
1291 		}
1292 		s = e+1;
1293 	}
1294 
1295 	kfree(macstr);
1296 	return 0;
1297 }
1298 
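/*
 * Event-channel interrupt handler: garbage-collect completed TX slots and
 * schedule the NAPI poll if RX responses are pending.
 */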
1299 static irqreturn_t xennet_interrupt(int irq, void *dev_id)
1300 {
1301 	struct net_device *dev = dev_id;
1302 	struct netfront_info *np = netdev_priv(dev);
1303 	unsigned long flags;
1304 
1305 	spin_lock_irqsave(&np->tx_lock, flags);
1306 
1307 	if (likely(netif_carrier_ok(dev))) {
1308 		xennet_tx_buf_gc(dev);
1309 		/* Under tx_lock: protects access to rx shared-ring indexes. */
1310 		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
1311 			netif_rx_schedule(dev, &np->napi);
1312 	}
1313 
1314 	spin_unlock_irqrestore(&np->tx_lock, flags);
1315 
1316 	return IRQ_HANDLED;
1317 }
1318 
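/*
 * Allocate and grant the shared TX/RX ring pages and allocate an event
 * channel bound to xennet_interrupt(); talk_to_backend() advertises the
 * resulting grant references and event channel via xenstore.
 */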
1319 static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
1320 {
1321 	struct xen_netif_tx_sring *txs;
1322 	struct xen_netif_rx_sring *rxs;
1323 	int err;
1324 	struct net_device *netdev = info->netdev;
1325 
1326 	info->tx_ring_ref = GRANT_INVALID_REF;
1327 	info->rx_ring_ref = GRANT_INVALID_REF;
1328 	info->rx.sring = NULL;
1329 	info->tx.sring = NULL;
1330 	netdev->irq = 0;
1331 
1332 	err = xen_net_read_mac(dev, netdev->dev_addr);
1333 	if (err) {
1334 		xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
1335 		goto fail;
1336 	}
1337 
1338 	txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
1339 	if (!txs) {
1340 		err = -ENOMEM;
1341 		xenbus_dev_fatal(dev, err, "allocating tx ring page");
1342 		goto fail;
1343 	}
1344 	SHARED_RING_INIT(txs);
1345 	FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
1346 
1347 	err = xenbus_grant_ring(dev, virt_to_mfn(txs));
1348 	if (err < 0) {
1349 		free_page((unsigned long)txs);
1350 		goto fail;
1351 	}
1352 
1353 	info->tx_ring_ref = err;
1354 	rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
1355 	if (!rxs) {
1356 		err = -ENOMEM;
1357 		xenbus_dev_fatal(dev, err, "allocating rx ring page");
1358 		goto fail;
1359 	}
1360 	SHARED_RING_INIT(rxs);
1361 	FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
1362 
1363 	err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
1364 	if (err < 0) {
1365 		free_page((unsigned long)rxs);
1366 		goto fail;
1367 	}
1368 	info->rx_ring_ref = err;
1369 
1370 	err = xenbus_alloc_evtchn(dev, &info->evtchn);
1371 	if (err)
1372 		goto fail;
1373 
1374 	err = bind_evtchn_to_irqhandler(info->evtchn, xennet_interrupt,
1375 					IRQF_SAMPLE_RANDOM, netdev->name,
1376 					netdev);
1377 	if (err < 0)
1378 		goto fail;
1379 	netdev->irq = err;
1380 	return 0;
1381 
1382  fail:
1383 	return err;
1384 }
1385 
1386 /* Common code used when first setting up, and when resuming. */
1387 static int talk_to_backend(struct xenbus_device *dev,
1388 			   struct netfront_info *info)
1389 {
1390 	const char *message;
1391 	struct xenbus_transaction xbt;
1392 	int err;
1393 
1394 	/* Create shared ring, alloc event channel. */
1395 	err = setup_netfront(dev, info);
1396 	if (err)
1397 		goto out;
1398 
1399 again:
1400 	err = xenbus_transaction_start(&xbt);
1401 	if (err) {
1402 		xenbus_dev_fatal(dev, err, "starting transaction");
1403 		goto destroy_ring;
1404 	}
1405 
1406 	err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref", "%u",
1407 			    info->tx_ring_ref);
1408 	if (err) {
1409 		message = "writing tx ring-ref";
1410 		goto abort_transaction;
1411 	}
1412 	err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref", "%u",
1413 			    info->rx_ring_ref);
1414 	if (err) {
1415 		message = "writing rx ring-ref";
1416 		goto abort_transaction;
1417 	}
1418 	err = xenbus_printf(xbt, dev->nodename,
1419 			    "event-channel", "%u", info->evtchn);
1420 	if (err) {
1421 		message = "writing event-channel";
1422 		goto abort_transaction;
1423 	}
1424 
1425 	err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
1426 			    1);
1427 	if (err) {
1428 		message = "writing request-rx-copy";
1429 		goto abort_transaction;
1430 	}
1431 
1432 	err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
1433 	if (err) {
1434 		message = "writing feature-rx-notify";
1435 		goto abort_transaction;
1436 	}
1437 
1438 	err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
1439 	if (err) {
1440 		message = "writing feature-sg";
1441 		goto abort_transaction;
1442 	}
1443 
1444 	err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
1445 	if (err) {
1446 		message = "writing feature-gso-tcpv4";
1447 		goto abort_transaction;
1448 	}
1449 
1450 	err = xenbus_transaction_end(xbt, 0);
1451 	if (err) {
1452 		if (err == -EAGAIN)
1453 			goto again;
1454 		xenbus_dev_fatal(dev, err, "completing transaction");
1455 		goto destroy_ring;
1456 	}
1457 
1458 	return 0;
1459 
1460  abort_transaction:
1461 	xenbus_transaction_end(xbt, 1);
1462 	xenbus_dev_fatal(dev, err, "%s", message);
1463  destroy_ring:
1464 	xennet_disconnect_backend(info);
1465  out:
1466 	return err;
1467 }
1468 
1469 static int xennet_set_sg(struct net_device *dev, u32 data)
1470 {
1471 	if (data) {
1472 		struct netfront_info *np = netdev_priv(dev);
1473 		int val;
1474 
1475 		if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
1476 				 "%d", &val) < 0)
1477 			val = 0;
1478 		if (!val)
1479 			return -ENOSYS;
1480 	} else if (dev->mtu > ETH_DATA_LEN)
1481 		dev->mtu = ETH_DATA_LEN;
1482 
1483 	return ethtool_op_set_sg(dev, data);
1484 }
1485 
1486 static int xennet_set_tso(struct net_device *dev, u32 data)
1487 {
1488 	if (data) {
1489 		struct netfront_info *np = netdev_priv(dev);
1490 		int val;
1491 
1492 		if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1493 				 "feature-gso-tcpv4", "%d", &val) < 0)
1494 			val = 0;
1495 		if (!val)
1496 			return -ENOSYS;
1497 	}
1498 
1499 	return ethtool_op_set_tso(dev, data);
1500 }
1501 
1502 static void xennet_set_features(struct net_device *dev)
1503 {
1504 	/* Turn off all GSO bits except ROBUST. */
1505 	dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
1506 	dev->features |= NETIF_F_GSO_ROBUST;
1507 	xennet_set_sg(dev, 0);
1508 
1509 	/* We need checksum offload to enable scatter/gather and TSO. */
1510 	if (!(dev->features & NETIF_F_IP_CSUM))
1511 		return;
1512 
1513 	if (!xennet_set_sg(dev, 1))
1514 		xennet_set_tso(dev, 1);
1515 }
1516 
1517 static int xennet_connect(struct net_device *dev)
1518 {
1519 	struct netfront_info *np = netdev_priv(dev);
1520 	int i, requeue_idx, err;
1521 	struct sk_buff *skb;
1522 	grant_ref_t ref;
1523 	struct xen_netif_rx_request *req;
1524 	unsigned int feature_rx_copy;
1525 
1526 	err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1527 			   "feature-rx-copy", "%u", &feature_rx_copy);
1528 	if (err != 1)
1529 		feature_rx_copy = 0;
1530 
1531 	if (!feature_rx_copy) {
1532 		dev_info(&dev->dev,
1533 			 "backend does not support copying receive path\n");
1534 		return -ENODEV;
1535 	}
1536 
1537 	err = talk_to_backend(np->xbdev, np);
1538 	if (err)
1539 		return err;
1540 
1541 	xennet_set_features(dev);
1542 
1543 	spin_lock_bh(&np->rx_lock);
1544 	spin_lock_irq(&np->tx_lock);
1545 
1546 	/* Step 1: Discard all pending TX packet fragments. */
1547 	xennet_release_tx_bufs(np);
1548 
1549 	/* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
1550 	for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
1551 		if (!np->rx_skbs[i])
1552 			continue;
1553 
1554 		skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
1555 		ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
1556 		req = RING_GET_REQUEST(&np->rx, requeue_idx);
1557 
1558 		gnttab_grant_foreign_access_ref(
1559 			ref, np->xbdev->otherend_id,
1560 			pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
1561 					       frags->page)),
1562 			0);
1563 		req->gref = ref;
1564 		req->id   = requeue_idx;
1565 
1566 		requeue_idx++;
1567 	}
1568 
1569 	np->rx.req_prod_pvt = requeue_idx;
1570 
1571 	/*
1572 	 * Step 3: All public and private state should now be sane.  Get
1573 	 * ready to start sending and receiving packets and give the driver
1574 	 * domain a kick because we've probably just requeued some
1575 	 * packets.
1576 	 */
1577 	netif_carrier_on(np->netdev);
1578 	notify_remote_via_irq(np->netdev->irq);
1579 	xennet_tx_buf_gc(dev);
1580 	xennet_alloc_rx_buffers(dev);
1581 
1582 	spin_unlock_irq(&np->tx_lock);
1583 	spin_unlock_bh(&np->rx_lock);
1584 
1585 	return 0;
1586 }
1587 
1588 /**
1589  * Callback received when the backend's state changes.
1590  */
1591 static void backend_changed(struct xenbus_device *dev,
1592 			    enum xenbus_state backend_state)
1593 {
1594 	struct netfront_info *np = dev->dev.driver_data;
1595 	struct net_device *netdev = np->netdev;
1596 
1597 	dev_dbg(&dev->dev, "%s\n", xenbus_strstate(backend_state));
1598 
1599 	switch (backend_state) {
1600 	case XenbusStateInitialising:
1601 	case XenbusStateInitialised:
1602 	case XenbusStateConnected:
1603 	case XenbusStateUnknown:
1604 	case XenbusStateClosed:
1605 		break;
1606 
1607 	case XenbusStateInitWait:
1608 		if (dev->state != XenbusStateInitialising)
1609 			break;
1610 		if (xennet_connect(netdev) != 0)
1611 			break;
1612 		xenbus_switch_state(dev, XenbusStateConnected);
1613 		break;
1614 
1615 	case XenbusStateClosing:
1616 		xenbus_frontend_closed(dev);
1617 		break;
1618 	}
1619 }
1620 
1621 static struct ethtool_ops xennet_ethtool_ops =
1622 {
1623 	.set_tx_csum = ethtool_op_set_tx_csum,
1624 	.set_sg = xennet_set_sg,
1625 	.set_tso = xennet_set_tso,
1626 	.get_link = ethtool_op_get_link,
1627 };
1628 
1629 #ifdef CONFIG_SYSFS
1630 static ssize_t show_rxbuf_min(struct device *dev,
1631 			      struct device_attribute *attr, char *buf)
1632 {
1633 	struct net_device *netdev = to_net_dev(dev);
1634 	struct netfront_info *info = netdev_priv(netdev);
1635 
1636 	return sprintf(buf, "%u\n", info->rx_min_target);
1637 }
1638 
1639 static ssize_t store_rxbuf_min(struct device *dev,
1640 			       struct device_attribute *attr,
1641 			       const char *buf, size_t len)
1642 {
1643 	struct net_device *netdev = to_net_dev(dev);
1644 	struct netfront_info *np = netdev_priv(netdev);
1645 	char *endp;
1646 	unsigned long target;
1647 
1648 	if (!capable(CAP_NET_ADMIN))
1649 		return -EPERM;
1650 
1651 	target = simple_strtoul(buf, &endp, 0);
1652 	if (endp == buf)
1653 		return -EBADMSG;
1654 
1655 	if (target < RX_MIN_TARGET)
1656 		target = RX_MIN_TARGET;
1657 	if (target > RX_MAX_TARGET)
1658 		target = RX_MAX_TARGET;
1659 
1660 	spin_lock_bh(&np->rx_lock);
1661 	if (target > np->rx_max_target)
1662 		np->rx_max_target = target;
1663 	np->rx_min_target = target;
1664 	if (target > np->rx_target)
1665 		np->rx_target = target;
1666 
1667 	xennet_alloc_rx_buffers(netdev);
1668 
1669 	spin_unlock_bh(&np->rx_lock);
1670 	return len;
1671 }
1672 
1673 static ssize_t show_rxbuf_max(struct device *dev,
1674 			      struct device_attribute *attr, char *buf)
1675 {
1676 	struct net_device *netdev = to_net_dev(dev);
1677 	struct netfront_info *info = netdev_priv(netdev);
1678 
1679 	return sprintf(buf, "%u\n", info->rx_max_target);
1680 }
1681 
1682 static ssize_t store_rxbuf_max(struct device *dev,
1683 			       struct device_attribute *attr,
1684 			       const char *buf, size_t len)
1685 {
1686 	struct net_device *netdev = to_net_dev(dev);
1687 	struct netfront_info *np = netdev_priv(netdev);
1688 	char *endp;
1689 	unsigned long target;
1690 
1691 	if (!capable(CAP_NET_ADMIN))
1692 		return -EPERM;
1693 
1694 	target = simple_strtoul(buf, &endp, 0);
1695 	if (endp == buf)
1696 		return -EBADMSG;
1697 
1698 	if (target < RX_MIN_TARGET)
1699 		target = RX_MIN_TARGET;
1700 	if (target > RX_MAX_TARGET)
1701 		target = RX_MAX_TARGET;
1702 
1703 	spin_lock_bh(&np->rx_lock);
1704 	if (target < np->rx_min_target)
1705 		np->rx_min_target = target;
1706 	np->rx_max_target = target;
1707 	if (target < np->rx_target)
1708 		np->rx_target = target;
1709 
1710 	xennet_alloc_rx_buffers(netdev);
1711 
1712 	spin_unlock_bh(&np->rx_lock);
1713 	return len;
1714 }
1715 
1716 static ssize_t show_rxbuf_cur(struct device *dev,
1717 			      struct device_attribute *attr, char *buf)
1718 {
1719 	struct net_device *netdev = to_net_dev(dev);
1720 	struct netfront_info *info = netdev_priv(netdev);
1721 
1722 	return sprintf(buf, "%u\n", info->rx_target);
1723 }
1724 
1725 static struct device_attribute xennet_attrs[] = {
1726 	__ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
1727 	__ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
1728 	__ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
1729 };
1730 
1731 static int xennet_sysfs_addif(struct net_device *netdev)
1732 {
1733 	int i;
1734 	int err;
1735 
1736 	for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
1737 		err = device_create_file(&netdev->dev,
1738 					   &xennet_attrs[i]);
1739 		if (err)
1740 			goto fail;
1741 	}
1742 	return 0;
1743 
1744  fail:
1745 	while (--i >= 0)
1746 		device_remove_file(&netdev->dev, &xennet_attrs[i]);
1747 	return err;
1748 }
1749 
1750 static void xennet_sysfs_delif(struct net_device *netdev)
1751 {
1752 	int i;
1753 
1754 	for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++)
1755 		device_remove_file(&netdev->dev, &xennet_attrs[i]);
1756 }
1757 
1758 #endif /* CONFIG_SYSFS */
1759 
1760 static struct xenbus_device_id netfront_ids[] = {
1761 	{ "vif" },
1762 	{ "" }
1763 };
1764 
1765 
1766 static int __devexit xennet_remove(struct xenbus_device *dev)
1767 {
1768 	struct netfront_info *info = dev->dev.driver_data;
1769 
1770 	dev_dbg(&dev->dev, "%s\n", dev->nodename);
1771 
1772 	unregister_netdev(info->netdev);
1773 
1774 	xennet_disconnect_backend(info);
1775 
1776 	del_timer_sync(&info->rx_refill_timer);
1777 
1778 	xennet_sysfs_delif(info->netdev);
1779 
1780 	free_netdev(info->netdev);
1781 
1782 	return 0;
1783 }
1784 
1785 static struct xenbus_driver netfront = {
1786 	.name = "vif",
1787 	.owner = THIS_MODULE,
1788 	.ids = netfront_ids,
1789 	.probe = netfront_probe,
1790 	.remove = __devexit_p(xennet_remove),
1791 	.resume = netfront_resume,
1792 	.otherend_changed = backend_changed,
1793 };
1794 
1795 static int __init netif_init(void)
1796 {
1797 	if (!is_running_on_xen())
1798 		return -ENODEV;
1799 
1800 	if (is_initial_xendomain())
1801 		return 0;
1802 
1803 	printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n");
1804 
1805 	return xenbus_register_frontend(&netfront);
1806 }
1807 module_init(netif_init);
1808 
1809 
1810 static void __exit netif_exit(void)
1811 {
1812 	if (is_initial_xendomain())
1813 		return;
1814 
1815 	xenbus_unregister_driver(&netfront);
1816 }
1817 module_exit(netif_exit);
1818 
1819 MODULE_DESCRIPTION("Xen virtual network device frontend");
1820 MODULE_LICENSE("GPL");
1821 MODULE_ALIAS("xen:vif");
1822 MODULE_ALIAS("xennet");
1823