xref: /openbmc/linux/drivers/net/hyperv/netvsc_drv.c (revision a8fe58ce)
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

#include "hyperv_net.h"

#define RING_SIZE_MIN 64
#define LINKCHANGE_INT (2 * HZ)
static int ring_size = 128;
module_param(ring_size, int, S_IRUGO);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
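/*
 * ring_size is read-only after load (S_IRUGO), so it can only be set on
 * the modprobe/insmod command line. A hypothetical load-time example
 * (module name assumed to be hv_netvsc, as built from this directory):
 *
 *   modprobe hv_netvsc ring_size=256
 */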

static int max_num_vrss_chns = 8;

static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
				NETIF_MSG_LINK | NETIF_MSG_IFUP |
				NETIF_MSG_IFDOWN | NETIF_MSG_RX_ERR |
				NETIF_MSG_TX_ERR;

static int debug = -1;
module_param(debug, int, S_IRUGO);
MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");

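/*
 * Apply the current RX filter mode to the RNDIS device. This runs from a
 * work queue because the RNDIS set-filter request can sleep, while
 * ndo_set_rx_mode is called in atomic context.
 */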
static void do_set_multicast(struct work_struct *w)
{
	struct net_device_context *ndevctx =
		container_of(w, struct net_device_context, work);
	struct netvsc_device *nvdev;
	struct rndis_device *rdev;

	nvdev = hv_get_drvdata(ndevctx->device_ctx);
	if (nvdev == NULL || nvdev->ndev == NULL)
		return;

	rdev = nvdev->extension;
	if (rdev == NULL)
		return;

	if (nvdev->ndev->flags & IFF_PROMISC)
		rndis_filter_set_packet_filter(rdev,
			NDIS_PACKET_TYPE_PROMISCUOUS);
	else
		rndis_filter_set_packet_filter(rdev,
			NDIS_PACKET_TYPE_BROADCAST |
			NDIS_PACKET_TYPE_ALL_MULTICAST |
			NDIS_PACKET_TYPE_DIRECTED);
}

static void netvsc_set_multicast_list(struct net_device *net)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);

	schedule_work(&net_device_ctx->work);
}

static int netvsc_open(struct net_device *net)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct hv_device *device_obj = net_device_ctx->device_ctx;
	struct netvsc_device *nvdev;
	struct rndis_device *rdev;
	int ret = 0;

	netif_carrier_off(net);

	/* Open up the device */
	ret = rndis_filter_open(device_obj);
	if (ret != 0) {
		netdev_err(net, "unable to open device (ret %d).\n", ret);
		return ret;
	}

	netif_tx_wake_all_queues(net);

	nvdev = hv_get_drvdata(device_obj);
	rdev = nvdev->extension;
	if (!rdev->link_state)
		netif_carrier_on(net);

	return ret;
}

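/*
 * Shut down the RNDIS device and wait for the host to drain both ring
 * buffers of every channel. The wait backs off exponentially (10 ms,
 * doubling up to 1 s) and gives up with -ETIMEDOUT after retry_max
 * attempts.
 */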
static int netvsc_close(struct net_device *net)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct hv_device *device_obj = net_device_ctx->device_ctx;
	struct netvsc_device *nvdev = hv_get_drvdata(device_obj);
	int ret;
	u32 aread, awrite, i, msec = 10, retry = 0, retry_max = 20;
	struct vmbus_channel *chn;

	netif_tx_disable(net);

	/* Make sure netvsc_set_multicast_list doesn't re-enable filter! */
	cancel_work_sync(&net_device_ctx->work);
	ret = rndis_filter_close(device_obj);
	if (ret != 0) {
		netdev_err(net, "unable to close device (ret %d).\n", ret);
		return ret;
	}

	/* Ensure pending bytes in ring are read */
	while (true) {
		aread = 0;
		for (i = 0; i < nvdev->num_chn; i++) {
			chn = nvdev->chn_table[i];
			if (!chn)
				continue;

			hv_get_ringbuffer_availbytes(&chn->inbound, &aread,
						     &awrite);

			if (aread)
				break;

			hv_get_ringbuffer_availbytes(&chn->outbound, &aread,
						     &awrite);

			if (aread)
				break;
		}

		retry++;
		if (retry > retry_max || aread == 0)
			break;

		msleep(msec);

		if (msec < 1000)
			msec *= 2;
	}

	if (aread) {
		netdev_err(net, "Ring buffer not empty after closing rndis\n");
		ret = -ETIMEDOUT;
	}

	return ret;
}

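/*
 * Append a per-packet-info (PPI) record to the RNDIS packet message and
 * return a pointer to it so the caller can fill in the payload (hash,
 * VLAN, checksum, or LSO data).
 */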
static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size,
				int pkt_type)
{
	struct rndis_packet *rndis_pkt;
	struct rndis_per_packet_info *ppi;

	rndis_pkt = &msg->msg.pkt;
	rndis_pkt->data_offset += ppi_size;

	ppi = (struct rndis_per_packet_info *)((void *)rndis_pkt +
		rndis_pkt->per_pkt_info_offset + rndis_pkt->per_pkt_info_len);

	ppi->size = ppi_size;
	ppi->type = pkt_type;
	ppi->ppi_offset = sizeof(struct rndis_per_packet_info);

	rndis_pkt->per_pkt_info_len += ppi_size;

	return ppi;
}

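/*
 * Pick the transmit queue for an skb by hashing it and indexing the send
 * indirection table provided by the host, modulo the number of real TX
 * queues. Falls back to queue 0 if the chosen subchannel has not been
 * opened yet.
 */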
static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
			void *accel_priv, select_queue_fallback_t fallback)
{
	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	struct hv_device *hdev = net_device_ctx->device_ctx;
	struct netvsc_device *nvsc_dev = hv_get_drvdata(hdev);
	u32 hash;
	u16 q_idx = 0;

	if (nvsc_dev == NULL || ndev->real_num_tx_queues <= 1)
		return 0;

	hash = skb_get_hash(skb);
	q_idx = nvsc_dev->send_table[hash % VRSS_SEND_TAB_SIZE] %
		ndev->real_num_tx_queues;

	if (!nvsc_dev->chn_table[q_idx])
		q_idx = 0;

	return q_idx;
}

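/*
 * Describe a buffer (page + offset + length) as a list of PFN-based
 * hv_page_buffer entries, splitting it at page boundaries. Returns the
 * number of entries used.
 */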
static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
			struct hv_page_buffer *pb)
{
	int j = 0;

	/* Deal with compound pages by ignoring the unused part
	 * of the page.
	 */
	page += (offset >> PAGE_SHIFT);
	offset &= ~PAGE_MASK;

	while (len > 0) {
		unsigned long bytes;

		bytes = PAGE_SIZE - offset;
		if (bytes > len)
			bytes = len;
		pb[j].pfn = page_to_pfn(page);
		pb[j].offset = offset;
		pb[j].len = bytes;

		offset += bytes;
		len -= bytes;

		if (offset == PAGE_SIZE && len) {
			page++;
			offset = 0;
			j++;
		}
	}

	return j + 1;
}

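/*
 * Build the page-buffer array for a send: first the RNDIS header and
 * PPIs, then the skb linear data, then each skb fragment. Returns the
 * total number of slots used.
 */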
static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
			   struct hv_netvsc_packet *packet,
			   struct hv_page_buffer **page_buf)
{
	struct hv_page_buffer *pb = *page_buf;
	u32 slots_used = 0;
	char *data = skb->data;
	int frags = skb_shinfo(skb)->nr_frags;
	int i;

	/* The packet is laid out thus:
	 * 1. hdr: RNDIS header and PPI
	 * 2. skb linear data
	 * 3. skb fragment data
	 */
	if (hdr != NULL)
		slots_used += fill_pg_buf(virt_to_page(hdr),
					offset_in_page(hdr),
					len, &pb[slots_used]);

	packet->rmsg_size = len;
	packet->rmsg_pgcnt = slots_used;

	slots_used += fill_pg_buf(virt_to_page(data),
				offset_in_page(data),
				skb_headlen(skb), &pb[slots_used]);

	for (i = 0; i < frags; i++) {
		skb_frag_t *frag = skb_shinfo(skb)->frags + i;

		slots_used += fill_pg_buf(skb_frag_page(frag),
					frag->page_offset,
					skb_frag_size(frag), &pb[slots_used]);
	}
	return slots_used;
}

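/*
 * Count how many page-buffer slots the skb fragments will need, rounding
 * each fragment up to whole pages.
 */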
static int count_skb_frag_slots(struct sk_buff *skb)
{
	int i, frags = skb_shinfo(skb)->nr_frags;
	int pages = 0;

	for (i = 0; i < frags; i++) {
		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
		unsigned long size = skb_frag_size(frag);
		unsigned long offset = frag->page_offset;

		/* Skip unused pages at the start of the fragment */
		offset &= ~PAGE_MASK;
		pages += PFN_UP(offset + size);
	}
	return pages;
}

static int netvsc_get_slots(struct sk_buff *skb)
{
	char *data = skb->data;
	unsigned int offset = offset_in_page(data);
	unsigned int len = skb_headlen(skb);
	int slots;
	int frag_slots;

	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE);
	frag_slots = count_skb_frag_slots(skb);
	return slots + frag_slots;
}

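/*
 * Classify the packet for offload purposes: returns one of the
 * TRANSPORT_INFO_* values for IPv4/IPv6 TCP or UDP (or NOT_IP); for IP
 * packets, the transport header offset is stored in *trans_off.
 */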
static u32 get_net_transport_info(struct sk_buff *skb, u32 *trans_off)
{
	u32 ret_val = TRANSPORT_INFO_NOT_IP;

	if ((eth_hdr(skb)->h_proto != htons(ETH_P_IP)) &&
		(eth_hdr(skb)->h_proto != htons(ETH_P_IPV6))) {
		goto not_ip;
	}

	*trans_off = skb_transport_offset(skb);

	if ((eth_hdr(skb)->h_proto == htons(ETH_P_IP))) {
		struct iphdr *iphdr = ip_hdr(skb);

		if (iphdr->protocol == IPPROTO_TCP)
			ret_val = TRANSPORT_INFO_IPV4_TCP;
		else if (iphdr->protocol == IPPROTO_UDP)
			ret_val = TRANSPORT_INFO_IPV4_UDP;
	} else {
		if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
			ret_val = TRANSPORT_INFO_IPV6_TCP;
		else if (ipv6_hdr(skb)->nexthdr == IPPROTO_UDP)
			ret_val = TRANSPORT_INFO_IPV6_UDP;
	}

not_ip:
	return ret_val;
}

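/*
 * Transmit path: build an RNDIS packet message (plus any hash, VLAN,
 * checksum, or LSO PPIs) in the skb headroom, describe the whole frame
 * as a page-buffer array, and hand it to netvsc_send(). On -EAGAIN the
 * packet is kept and NETDEV_TX_BUSY is returned so the stack retries it.
 */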
static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct hv_netvsc_packet *packet = NULL;
	int ret;
	unsigned int num_data_pgs;
	struct rndis_message *rndis_msg;
	struct rndis_packet *rndis_pkt;
	u32 rndis_msg_size;
	bool isvlan;
	bool linear = false;
	struct rndis_per_packet_info *ppi;
	struct ndis_tcp_ip_checksum_info *csum_info;
	struct ndis_tcp_lso_info *lso_info;
	int hdr_offset;
	u32 net_trans_info;
	u32 hash;
	u32 skb_length;
	struct hv_page_buffer page_buf[MAX_PAGE_BUFFER_COUNT];
	struct hv_page_buffer *pb = page_buf;
	struct netvsc_stats *tx_stats = this_cpu_ptr(net_device_ctx->tx_stats);

	/* We will need at most two pages to describe the RNDIS header.
	 * We can only transmit MAX_PAGE_BUFFER_COUNT pages in a single
	 * packet. If the skb is scattered across more pages, try
	 * linearizing it.
	 */

check_size:
	skb_length = skb->len;
	num_data_pgs = netvsc_get_slots(skb) + 2;
	if (num_data_pgs > MAX_PAGE_BUFFER_COUNT && linear) {
		net_alert_ratelimited("packet too big: %u pages (%u bytes)\n",
				      num_data_pgs, skb->len);
		ret = -EFAULT;
		goto drop;
	} else if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) {
		if (skb_linearize(skb)) {
			net_alert_ratelimited("failed to linearize skb\n");
			ret = -ENOMEM;
			goto drop;
		}
		linear = true;
		goto check_size;
	}

	/*
	 * Place the RNDIS header in the skb headroom; skb->cb is used
	 * for the hv_netvsc_packet structure.
	 */
	ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE);
	if (ret) {
		netdev_err(net, "unable to alloc hv_netvsc_packet\n");
		ret = -ENOMEM;
		goto drop;
	}
	/* Use the skb control buffer for building up the packet */
	BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
			FIELD_SIZEOF(struct sk_buff, cb));
	packet = (struct hv_netvsc_packet *)skb->cb;

	packet->q_idx = skb_get_queue_mapping(skb);

	packet->total_data_buflen = skb->len;

	rndis_msg = (struct rndis_message *)skb->head;

	memset(rndis_msg, 0, RNDIS_AND_PPI_SIZE);

	isvlan = skb->vlan_tci & VLAN_TAG_PRESENT;

	/* Add the rndis header */
	rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET;
	rndis_msg->msg_len = packet->total_data_buflen;
	rndis_pkt = &rndis_msg->msg.pkt;
	rndis_pkt->data_offset = sizeof(struct rndis_packet);
	rndis_pkt->data_len = packet->total_data_buflen;
	rndis_pkt->per_pkt_info_offset = sizeof(struct rndis_packet);

	rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet);

	hash = skb_get_hash_raw(skb);
	if (hash != 0 && net->real_num_tx_queues > 1) {
		rndis_msg_size += NDIS_HASH_PPI_SIZE;
		ppi = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE,
				    NBL_HASH_VALUE);
		*(u32 *)((void *)ppi + ppi->ppi_offset) = hash;
	}

	if (isvlan) {
		struct ndis_pkt_8021q_info *vlan;

		rndis_msg_size += NDIS_VLAN_PPI_SIZE;
		ppi = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
					IEEE_8021Q_INFO);
		vlan = (struct ndis_pkt_8021q_info *)((void *)ppi +
						ppi->ppi_offset);
		vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK;
		vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >>
				VLAN_PRIO_SHIFT;
	}

	net_trans_info = get_net_transport_info(skb, &hdr_offset);
	if (net_trans_info == TRANSPORT_INFO_NOT_IP)
		goto do_send;

	/*
	 * Set up the send-side checksum offload only if this is not a
	 * GSO packet.
	 */
	if (skb_is_gso(skb))
		goto do_lso;

	if ((skb->ip_summed == CHECKSUM_NONE) ||
	    (skb->ip_summed == CHECKSUM_UNNECESSARY))
		goto do_send;

	rndis_msg_size += NDIS_CSUM_PPI_SIZE;
	ppi = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
			    TCPIP_CHKSUM_PKTINFO);

	csum_info = (struct ndis_tcp_ip_checksum_info *)((void *)ppi +
			ppi->ppi_offset);

	if (net_trans_info & (INFO_IPV4 << 16))
		csum_info->transmit.is_ipv4 = 1;
	else
		csum_info->transmit.is_ipv6 = 1;

	if (net_trans_info & INFO_TCP) {
		csum_info->transmit.tcp_checksum = 1;
		csum_info->transmit.tcp_header_offset = hdr_offset;
	} else if (net_trans_info & INFO_UDP) {
		/* UDP checksum offload is not supported on ws2008r2.
		 * Furthermore, on ws2012 and ws2012r2 there are some
		 * issues with UDP checksum offload from Linux guests
		 * (these are host issues). For now, compute the
		 * checksum here.
		 */
		struct udphdr *uh;
		u16 udp_len;

		ret = skb_cow_head(skb, 0);
		if (ret)
			goto drop;

		uh = udp_hdr(skb);
		udp_len = ntohs(uh->len);
		uh->check = 0;
		uh->check = csum_tcpudp_magic(ip_hdr(skb)->saddr,
					      ip_hdr(skb)->daddr,
					      udp_len, IPPROTO_UDP,
					      csum_partial(uh, udp_len, 0));
		if (uh->check == 0)
			uh->check = CSUM_MANGLED_0;

		csum_info->transmit.udp_checksum = 0;
	}
	goto do_send;

do_lso:
	rndis_msg_size += NDIS_LSO_PPI_SIZE;
	ppi = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
			    TCP_LARGESEND_PKTINFO);

	lso_info = (struct ndis_tcp_lso_info *)((void *)ppi +
			ppi->ppi_offset);

	lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
	if (net_trans_info & (INFO_IPV4 << 16)) {
		lso_info->lso_v2_transmit.ip_version =
			NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
		ip_hdr(skb)->tot_len = 0;
		ip_hdr(skb)->check = 0;
		tcp_hdr(skb)->check =
		~csum_tcpudp_magic(ip_hdr(skb)->saddr,
				   ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
	} else {
		lso_info->lso_v2_transmit.ip_version =
			NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
		ipv6_hdr(skb)->payload_len = 0;
		tcp_hdr(skb)->check =
		~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
				&ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
	}
	lso_info->lso_v2_transmit.tcp_header_offset = hdr_offset;
	lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;

do_send:
	/* Start filling in the page buffers with the rndis hdr */
	rndis_msg->msg_len += rndis_msg_size;
	packet->total_data_buflen = rndis_msg->msg_len;
	packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
					       skb, packet, &pb);

	ret = netvsc_send(net_device_ctx->device_ctx, packet,
			  rndis_msg, &pb, skb);

drop:
	if (ret == 0) {
		u64_stats_update_begin(&tx_stats->syncp);
		tx_stats->packets++;
		tx_stats->bytes += skb_length;
		u64_stats_update_end(&tx_stats->syncp);
	} else {
		if (ret != -EAGAIN) {
			dev_kfree_skb_any(skb);
			net->stats.tx_dropped++;
		}
	}

	return (ret == -EAGAIN) ? NETDEV_TX_BUSY : NETDEV_TX_OK;
}

/*
 * netvsc_linkstatus_callback - Link up/down notification
 */
void netvsc_linkstatus_callback(struct hv_device *device_obj,
				struct rndis_message *resp)
{
	struct rndis_indicate_status *indicate = &resp->msg.indicate_status;
	struct net_device *net;
	struct net_device_context *ndev_ctx;
	struct netvsc_device *net_device;
	struct netvsc_reconfig *event;
	unsigned long flags;

	/* Handle link change statuses only */
	if (indicate->status != RNDIS_STATUS_NETWORK_CHANGE &&
	    indicate->status != RNDIS_STATUS_MEDIA_CONNECT &&
	    indicate->status != RNDIS_STATUS_MEDIA_DISCONNECT)
		return;

	net_device = hv_get_drvdata(device_obj);
	net = net_device->ndev;

	if (!net || net->reg_state != NETREG_REGISTERED)
		return;

	ndev_ctx = netdev_priv(net);

	event = kzalloc(sizeof(*event), GFP_ATOMIC);
	if (!event)
		return;
	event->event = indicate->status;

	spin_lock_irqsave(&ndev_ctx->lock, flags);
	list_add_tail(&event->list, &ndev_ctx->reconfig_events);
	spin_unlock_irqrestore(&ndev_ctx->lock, flags);

	schedule_delayed_work(&ndev_ctx->dwork, 0);
}

/*
 * netvsc_recv_callback - Callback when we receive a packet from the
 * "wire" on the specified device.
 */
int netvsc_recv_callback(struct hv_device *device_obj,
				struct hv_netvsc_packet *packet,
				void **data,
				struct ndis_tcp_ip_checksum_info *csum_info,
				struct vmbus_channel *channel,
				u16 vlan_tci)
{
	struct net_device *net;
	struct net_device_context *net_device_ctx;
	struct sk_buff *skb;
	struct netvsc_stats *rx_stats;

	net = ((struct netvsc_device *)hv_get_drvdata(device_obj))->ndev;
	if (!net || net->reg_state != NETREG_REGISTERED)
		return NVSP_STAT_FAIL;

	net_device_ctx = netdev_priv(net);
	rx_stats = this_cpu_ptr(net_device_ctx->rx_stats);

	/* Allocate a skb - TODO direct I/O to pages? */
	skb = netdev_alloc_skb_ip_align(net, packet->total_data_buflen);
	if (unlikely(!skb)) {
		++net->stats.rx_dropped;
		return NVSP_STAT_FAIL;
	}

	/*
	 * Copy to skb. This copy is needed here since the memory pointed
	 * to by hv_netvsc_packet cannot be deallocated.
	 */
	memcpy(skb_put(skb, packet->total_data_buflen), *data,
		packet->total_data_buflen);

	skb->protocol = eth_type_trans(skb, net);
	if (csum_info) {
		/* We only look at the IP checksum here.
		 * Should we be dropping the packet if checksum
		 * failed? How do we deal with other checksums - TCP/UDP?
		 */
		if (csum_info->receive.ip_checksum_succeeded)
			skb->ip_summed = CHECKSUM_UNNECESSARY;
		else
			skb->ip_summed = CHECKSUM_NONE;
	}

	if (vlan_tci & VLAN_TAG_PRESENT)
		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
				       vlan_tci);

	skb_record_rx_queue(skb, channel->
			    offermsg.offer.sub_channel_index);

	u64_stats_update_begin(&rx_stats->syncp);
	rx_stats->packets++;
	rx_stats->bytes += packet->total_data_buflen;
	u64_stats_update_end(&rx_stats->syncp);

	/*
	 * Pass the skb back up. Network stack will deallocate the skb when it
	 * is done.
	 * TODO - use NAPI?
	 */
	netif_rx(skb);

	return 0;
}

static void netvsc_get_drvinfo(struct net_device *net,
			       struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
}

static void netvsc_get_channels(struct net_device *net,
				struct ethtool_channels *channel)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct hv_device *dev = net_device_ctx->device_ctx;
	struct netvsc_device *nvdev = hv_get_drvdata(dev);

	if (nvdev) {
		channel->max_combined	= nvdev->max_chn;
		channel->combined_count = nvdev->num_chn;
	}
}

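/*
 * ethtool -L handler: change the number of combined channels by tearing
 * down the RNDIS device and re-adding it with the new channel count. If
 * that fails, the same path is retried once with the original count.
 * Example invocation from userspace (interface name hypothetical):
 *
 *   ethtool -L eth0 combined 4
 */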
static int netvsc_set_channels(struct net_device *net,
			       struct ethtool_channels *channels)
{
	struct net_device_context *net_device_ctx = netdev_priv(net);
	struct hv_device *dev = net_device_ctx->device_ctx;
	struct netvsc_device *nvdev = hv_get_drvdata(dev);
	struct netvsc_device_info device_info;
	u32 num_chn;
	u32 max_chn;
	int ret = 0;
	bool recovering = false;

	if (!nvdev || nvdev->destroy)
		return -ENODEV;

	num_chn = nvdev->num_chn;
	max_chn = min_t(u32, nvdev->max_chn, num_online_cpus());

	if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5) {
		pr_info("vRSS unsupported before NVSP Version 5\n");
		return -EINVAL;
	}

	/* We do not support rx, tx, or other */
	if (!channels ||
	    channels->rx_count ||
	    channels->tx_count ||
	    channels->other_count ||
	    (channels->combined_count < 1))
		return -EINVAL;

	if (channels->combined_count > max_chn) {
		pr_info("combined channels too high, using %d\n", max_chn);
		channels->combined_count = max_chn;
	}

	ret = netvsc_close(net);
	if (ret)
		goto out;

 do_set:
	nvdev->start_remove = true;
	rndis_filter_device_remove(dev);

	nvdev->num_chn = channels->combined_count;

	net_device_ctx->device_ctx = dev;
	hv_set_drvdata(dev, net);

	memset(&device_info, 0, sizeof(device_info));
	device_info.num_chn = nvdev->num_chn; /* passed to RNDIS */
	device_info.ring_size = ring_size;
	device_info.max_num_vrss_chns = max_num_vrss_chns;

	ret = rndis_filter_device_add(dev, &device_info);
	if (ret) {
		if (recovering) {
			netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
			return ret;
		}
		goto recover;
	}

	nvdev = hv_get_drvdata(dev);

	ret = netif_set_real_num_tx_queues(net, nvdev->num_chn);
	if (ret) {
		if (recovering) {
			netdev_err(net, "could not set tx queue count (ret %d)\n", ret);
			return ret;
		}
		goto recover;
	}

	ret = netif_set_real_num_rx_queues(net, nvdev->num_chn);
	if (ret) {
		if (recovering) {
			netdev_err(net, "could not set rx queue count (ret %d)\n", ret);
			return ret;
		}
		goto recover;
	}

 out:
	netvsc_open(net);

	return ret;

 recover:
	/* If the above failed, we attempt to recover through the same
	 * process but with the original number of channels.
	 */
	netdev_err(net, "could not set channels, recovering\n");
	recovering = true;
	channels->combined_count = num_chn;
	goto do_set;
}

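/*
 * Change the MTU. Like the channel count, the MTU can only be changed by
 * removing and re-adding the RNDIS device, so the interface is closed,
 * rebuilt with the new MTU, and reopened. NVSP versions before 2 are
 * limited to the standard Ethernet MTU.
 */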
static int netvsc_change_mtu(struct net_device *ndev, int mtu)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct hv_device *hdev = ndevctx->device_ctx;
	struct netvsc_device *nvdev = hv_get_drvdata(hdev);
	struct netvsc_device_info device_info;
	int limit = ETH_DATA_LEN;
	int ret = 0;

	if (nvdev == NULL || nvdev->destroy)
		return -ENODEV;

	if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
		limit = NETVSC_MTU - ETH_HLEN;

	if (mtu < NETVSC_MTU_MIN || mtu > limit)
		return -EINVAL;

	ret = netvsc_close(ndev);
	if (ret)
		goto out;

	nvdev->start_remove = true;
	rndis_filter_device_remove(hdev);

	ndev->mtu = mtu;

	ndevctx->device_ctx = hdev;
	hv_set_drvdata(hdev, ndev);

	memset(&device_info, 0, sizeof(device_info));
	device_info.ring_size = ring_size;
	device_info.num_chn = nvdev->num_chn;
	device_info.max_num_vrss_chns = max_num_vrss_chns;
	rndis_filter_device_add(hdev, &device_info);

out:
	netvsc_open(ndev);

	return ret;
}

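/*
 * Aggregate the per-CPU RX/TX counters into rtnl_link_stats64, using the
 * u64_stats seqcount retry loops so the 64-bit counters read consistently
 * on 32-bit systems.
 */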
static struct rtnl_link_stats64 *netvsc_get_stats64(struct net_device *net,
						    struct rtnl_link_stats64 *t)
{
	struct net_device_context *ndev_ctx = netdev_priv(net);
	int cpu;

	for_each_possible_cpu(cpu) {
		struct netvsc_stats *tx_stats = per_cpu_ptr(ndev_ctx->tx_stats,
							    cpu);
		struct netvsc_stats *rx_stats = per_cpu_ptr(ndev_ctx->rx_stats,
							    cpu);
		u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
			tx_packets = tx_stats->packets;
			tx_bytes = tx_stats->bytes;
		} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
			rx_packets = rx_stats->packets;
			rx_bytes = rx_stats->bytes;
		} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));

		t->tx_bytes	+= tx_bytes;
		t->tx_packets	+= tx_packets;
		t->rx_bytes	+= rx_bytes;
		t->rx_packets	+= rx_packets;
	}

	t->tx_dropped	= net->stats.tx_dropped;
	t->tx_errors	= net->stats.tx_dropped;

	t->rx_dropped	= net->stats.rx_dropped;
	t->rx_errors	= net->stats.rx_errors;

	return t;
}

static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
{
	struct net_device_context *ndevctx = netdev_priv(ndev);
	struct hv_device *hdev = ndevctx->device_ctx;
	struct sockaddr *addr = p;
	char save_adr[ETH_ALEN];
	unsigned char save_aatype;
	int err;

	memcpy(save_adr, ndev->dev_addr, ETH_ALEN);
	save_aatype = ndev->addr_assign_type;

	err = eth_mac_addr(ndev, p);
	if (err != 0)
		return err;

	err = rndis_filter_set_device_mac(hdev, addr->sa_data);
	if (err != 0) {
		/* roll back to saved MAC */
		memcpy(ndev->dev_addr, save_adr, ETH_ALEN);
		ndev->addr_assign_type = save_aatype;
	}

	return err;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void netvsc_poll_controller(struct net_device *net)
{
	/* As netvsc_start_xmit() works synchronously, we don't have to
	 * trigger anything here.
	 */
}
#endif

static const struct ethtool_ops ethtool_ops = {
	.get_drvinfo	= netvsc_get_drvinfo,
	.get_link	= ethtool_op_get_link,
	.get_channels   = netvsc_get_channels,
	.set_channels   = netvsc_set_channels,
};

static const struct net_device_ops device_ops = {
	.ndo_open =			netvsc_open,
	.ndo_stop =			netvsc_close,
	.ndo_start_xmit =		netvsc_start_xmit,
	.ndo_set_rx_mode =		netvsc_set_multicast_list,
	.ndo_change_mtu =		netvsc_change_mtu,
	.ndo_validate_addr =		eth_validate_addr,
	.ndo_set_mac_address =		netvsc_set_mac_addr,
	.ndo_select_queue =		netvsc_select_queue,
	.ndo_get_stats64 =		netvsc_get_stats64,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller =		netvsc_poll_controller,
#endif
};

/*
 * Handle link status changes. For RNDIS_STATUS_NETWORK_CHANGE, emulate a
 * link down/up sequence. For RNDIS_STATUS_MEDIA_CONNECT, when the carrier
 * is already present, send a GARP packet to network peers with
 * netdev_notify_peers().
 */
static void netvsc_link_change(struct work_struct *w)
{
	struct net_device_context *ndev_ctx;
	struct net_device *net;
	struct netvsc_device *net_device;
	struct rndis_device *rdev;
	struct netvsc_reconfig *event = NULL;
	bool notify = false, reschedule = false;
	unsigned long flags, next_reconfig, delay;

	ndev_ctx = container_of(w, struct net_device_context, dwork.work);
	net_device = hv_get_drvdata(ndev_ctx->device_ctx);
	rdev = net_device->extension;
	net = net_device->ndev;

	next_reconfig = ndev_ctx->last_reconfig + LINKCHANGE_INT;
	if (time_is_after_jiffies(next_reconfig)) {
		/* link_watch only sends one notification with current state
		 * per second; avoid doing reconfig more frequently. Handle
		 * wrap around.
		 */
		delay = next_reconfig - jiffies;
		delay = delay < LINKCHANGE_INT ? delay : LINKCHANGE_INT;
		schedule_delayed_work(&ndev_ctx->dwork, delay);
		return;
	}
	ndev_ctx->last_reconfig = jiffies;

	spin_lock_irqsave(&ndev_ctx->lock, flags);
	if (!list_empty(&ndev_ctx->reconfig_events)) {
		event = list_first_entry(&ndev_ctx->reconfig_events,
					 struct netvsc_reconfig, list);
		list_del(&event->list);
		reschedule = !list_empty(&ndev_ctx->reconfig_events);
	}
	spin_unlock_irqrestore(&ndev_ctx->lock, flags);

	if (!event)
		return;

	rtnl_lock();

	switch (event->event) {
		/* Only the following events are possible due to the check in
		 * netvsc_linkstatus_callback()
		 */
	case RNDIS_STATUS_MEDIA_CONNECT:
		if (rdev->link_state) {
			rdev->link_state = false;
			netif_carrier_on(net);
			netif_tx_wake_all_queues(net);
		} else {
			notify = true;
		}
		kfree(event);
		break;
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		if (!rdev->link_state) {
			rdev->link_state = true;
			netif_carrier_off(net);
			netif_tx_stop_all_queues(net);
		}
		kfree(event);
		break;
	case RNDIS_STATUS_NETWORK_CHANGE:
		/* Only makes sense if carrier is present */
		if (!rdev->link_state) {
			rdev->link_state = true;
			netif_carrier_off(net);
			netif_tx_stop_all_queues(net);
			event->event = RNDIS_STATUS_MEDIA_CONNECT;
			spin_lock_irqsave(&ndev_ctx->lock, flags);
			list_add_tail(&event->list, &ndev_ctx->reconfig_events);
			spin_unlock_irqrestore(&ndev_ctx->lock, flags);
			reschedule = true;
		}
		break;
	}

	rtnl_unlock();

	if (notify)
		netdev_notify_peers(net);

	/* link_watch only sends one notification with current state per
	 * second; handle the next reconfig event in 2 seconds.
	 */
	if (reschedule)
		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
}

static void netvsc_free_netdev(struct net_device *netdev)
{
	struct net_device_context *net_device_ctx = netdev_priv(netdev);

	free_percpu(net_device_ctx->tx_stats);
	free_percpu(net_device_ctx->rx_stats);
	free_netdev(netdev);
}

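/*
 * Probe: allocate the multi-queue net_device and per-CPU stats, bring up
 * the RNDIS filter (which also reports the device's MAC address), size
 * the real TX/RX queue counts to the channel count the host granted, and
 * register the netdev.
 */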
static int netvsc_probe(struct hv_device *dev,
			const struct hv_vmbus_device_id *dev_id)
{
	struct net_device *net = NULL;
	struct net_device_context *net_device_ctx;
	struct netvsc_device_info device_info;
	struct netvsc_device *nvdev;
	int ret;

	net = alloc_etherdev_mq(sizeof(struct net_device_context),
				num_online_cpus());
	if (!net)
		return -ENOMEM;

	netif_carrier_off(net);

	net_device_ctx = netdev_priv(net);
	net_device_ctx->device_ctx = dev;
	net_device_ctx->msg_enable = netif_msg_init(debug, default_msg);
	if (netif_msg_probe(net_device_ctx))
		netdev_dbg(net, "netvsc msg_enable: %d\n",
			   net_device_ctx->msg_enable);

	net_device_ctx->tx_stats = netdev_alloc_pcpu_stats(struct netvsc_stats);
	if (!net_device_ctx->tx_stats) {
		free_netdev(net);
		return -ENOMEM;
	}
	net_device_ctx->rx_stats = netdev_alloc_pcpu_stats(struct netvsc_stats);
	if (!net_device_ctx->rx_stats) {
		free_percpu(net_device_ctx->tx_stats);
		free_netdev(net);
		return -ENOMEM;
	}

	hv_set_drvdata(dev, net);
	INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
	INIT_WORK(&net_device_ctx->work, do_set_multicast);

	spin_lock_init(&net_device_ctx->lock);
	INIT_LIST_HEAD(&net_device_ctx->reconfig_events);

	net->netdev_ops = &device_ops;

	net->hw_features = NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_IP_CSUM |
				NETIF_F_TSO;
	net->features = NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_SG | NETIF_F_RXCSUM |
			NETIF_F_IP_CSUM | NETIF_F_TSO;

	net->ethtool_ops = &ethtool_ops;
	SET_NETDEV_DEV(net, &dev->device);

	/* Notify the netvsc driver of the new device */
	memset(&device_info, 0, sizeof(device_info));
	device_info.ring_size = ring_size;
	device_info.max_num_vrss_chns = max_num_vrss_chns;
	ret = rndis_filter_device_add(dev, &device_info);
	if (ret != 0) {
		netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
		netvsc_free_netdev(net);
		hv_set_drvdata(dev, NULL);
		return ret;
	}
	memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);

	nvdev = hv_get_drvdata(dev);
	netif_set_real_num_tx_queues(net, nvdev->num_chn);
	netif_set_real_num_rx_queues(net, nvdev->num_chn);

	ret = register_netdev(net);
	if (ret != 0) {
		pr_err("Unable to register netdev.\n");
		rndis_filter_device_remove(dev);
		netvsc_free_netdev(net);
	}

	return ret;
}

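/*
 * Remove: flag the device as going away, cancel outstanding work, stop
 * the TX queues, unregister the netdev, and tear down the RNDIS device
 * before freeing the net_device.
 */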
static int netvsc_remove(struct hv_device *dev)
{
	struct net_device *net;
	struct net_device_context *ndev_ctx;
	struct netvsc_device *net_device;

	net_device = hv_get_drvdata(dev);
	net = net_device->ndev;

	if (net == NULL) {
		dev_err(&dev->device, "No net device to remove\n");
		return 0;
	}

	net_device->start_remove = true;

	ndev_ctx = netdev_priv(net);
	cancel_delayed_work_sync(&ndev_ctx->dwork);
	cancel_work_sync(&ndev_ctx->work);

	/* Stop outbound asap */
	netif_tx_disable(net);

	unregister_netdev(net);

	/*
	 * Call to the vsc driver to let it know that the device is being
	 * removed
	 */
	rndis_filter_device_remove(dev);

	netvsc_free_netdev(net);
	return 0;
}

static const struct hv_vmbus_device_id id_table[] = {
	/* Network guid */
	{ HV_NIC_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, id_table);

/* The one and only one */
static struct hv_driver netvsc_drv = {
	.name = KBUILD_MODNAME,
	.id_table = id_table,
	.probe = netvsc_probe,
	.remove = netvsc_remove,
};

static void __exit netvsc_drv_exit(void)
{
	vmbus_driver_unregister(&netvsc_drv);
}

static int __init netvsc_drv_init(void)
{
	if (ring_size < RING_SIZE_MIN) {
		ring_size = RING_SIZE_MIN;
		pr_info("Increased ring_size to %d (min allowed)\n",
			ring_size);
	}
	return vmbus_driver_register(&netvsc_drv);
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V network driver");

module_init(netvsc_drv_init);
module_exit(netvsc_drv_exit);