xref: /openbmc/linux/drivers/net/hyperv/netvsc_drv.c (revision 94085fe570e7b87597d4695e6fa77d4256efd29e)
1 /*
2  * Copyright (c) 2009, Microsoft Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, see <http://www.gnu.org/licenses/>.
15  *
16  * Authors:
17  *   Haiyang Zhang <haiyangz@microsoft.com>
18  *   Hank Janssen  <hjanssen@microsoft.com>
19  */
20 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
21 
22 #include <linux/init.h>
23 #include <linux/atomic.h>
24 #include <linux/module.h>
25 #include <linux/highmem.h>
26 #include <linux/device.h>
27 #include <linux/io.h>
28 #include <linux/delay.h>
29 #include <linux/netdevice.h>
30 #include <linux/inetdevice.h>
31 #include <linux/etherdevice.h>
32 #include <linux/skbuff.h>
33 #include <linux/if_vlan.h>
34 #include <linux/in.h>
35 #include <linux/slab.h>
36 #include <net/arp.h>
37 #include <net/route.h>
38 #include <net/sock.h>
39 #include <net/pkt_sched.h>
40 
41 #include "hyperv_net.h"
42 
43 
44 #define RING_SIZE_MIN 64
45 #define LINKCHANGE_INT (2 * HZ)
46 static int ring_size = 128;
47 module_param(ring_size, int, S_IRUGO);
48 MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
49 
50 static int max_num_vrss_chns = 8;
51 
52 static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
53 				NETIF_MSG_LINK | NETIF_MSG_IFUP |
54 				NETIF_MSG_IFDOWN | NETIF_MSG_RX_ERR |
55 				NETIF_MSG_TX_ERR;
56 
57 static int debug = -1;
58 module_param(debug, int, S_IRUGO);
59 MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
60 
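/* Deferred work item for netvsc_set_multicast_list(): ndo_set_rx_mode runs
 * in atomic context, so the RNDIS filter update is pushed to a workqueue.
 * Promiscuous mode maps to NDIS_PACKET_TYPE_PROMISCUOUS; otherwise accept
 * directed, broadcast and all-multicast frames.
 */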
61 static void do_set_multicast(struct work_struct *w)
62 {
63 	struct net_device_context *ndevctx =
64 		container_of(w, struct net_device_context, work);
65 	struct netvsc_device *nvdev;
66 	struct rndis_device *rdev;
67 
68 	nvdev = hv_get_drvdata(ndevctx->device_ctx);
69 	if (nvdev == NULL || nvdev->ndev == NULL)
70 		return;
71 
72 	rdev = nvdev->extension;
73 	if (rdev == NULL)
74 		return;
75 
76 	if (nvdev->ndev->flags & IFF_PROMISC)
77 		rndis_filter_set_packet_filter(rdev,
78 			NDIS_PACKET_TYPE_PROMISCUOUS);
79 	else
80 		rndis_filter_set_packet_filter(rdev,
81 			NDIS_PACKET_TYPE_BROADCAST |
82 			NDIS_PACKET_TYPE_ALL_MULTICAST |
83 			NDIS_PACKET_TYPE_DIRECTED);
84 }
85 
86 static void netvsc_set_multicast_list(struct net_device *net)
87 {
88 	struct net_device_context *net_device_ctx = netdev_priv(net);
89 
90 	schedule_work(&net_device_ctx->work);
91 }
92 
93 static int netvsc_open(struct net_device *net)
94 {
95 	struct net_device_context *net_device_ctx = netdev_priv(net);
96 	struct hv_device *device_obj = net_device_ctx->device_ctx;
97 	struct netvsc_device *nvdev;
98 	struct rndis_device *rdev;
99 	int ret = 0;
100 
101 	netif_carrier_off(net);
102 
103 	/* Open up the device */
104 	ret = rndis_filter_open(device_obj);
105 	if (ret != 0) {
106 		netdev_err(net, "unable to open device (ret %d).\n", ret);
107 		return ret;
108 	}
109 
110 	netif_tx_wake_all_queues(net);
111 
112 	nvdev = hv_get_drvdata(device_obj);
113 	rdev = nvdev->extension;
114 	if (!rdev->link_state)
115 		netif_carrier_on(net);
116 
117 	return ret;
118 }
119 
120 static int netvsc_close(struct net_device *net)
121 {
122 	struct net_device_context *net_device_ctx = netdev_priv(net);
123 	struct hv_device *device_obj = net_device_ctx->device_ctx;
124 	struct netvsc_device *nvdev = hv_get_drvdata(device_obj);
125 	int ret;
126 	u32 aread, awrite, i, msec = 10, retry = 0, retry_max = 20;
127 	struct vmbus_channel *chn;
128 
129 	netif_tx_disable(net);
130 
131 	/* Make sure netvsc_set_multicast_list doesn't re-enable filter! */
132 	cancel_work_sync(&net_device_ctx->work);
133 	ret = rndis_filter_close(device_obj);
134 	if (ret != 0) {
135 		netdev_err(net, "unable to close device (ret %d).\n", ret);
136 		return ret;
137 	}
138 
139 	/* Ensure pending bytes in ring are read */
140 	while (true) {
141 		aread = 0;
142 		for (i = 0; i < nvdev->num_chn; i++) {
143 			chn = nvdev->chn_table[i];
144 			if (!chn)
145 				continue;
146 
147 			hv_get_ringbuffer_availbytes(&chn->inbound, &aread,
148 						     &awrite);
149 
150 			if (aread)
151 				break;
152 
153 			hv_get_ringbuffer_availbytes(&chn->outbound, &aread,
154 						     &awrite);
155 
156 			if (aread)
157 				break;
158 		}
159 
160 		retry++;
161 		if (retry > retry_max || aread == 0)
162 			break;
163 
164 		msleep(msec);
165 
166 		if (msec < 1000)
167 			msec *= 2;
168 	}
169 
170 	if (aread) {
171 		netdev_err(net, "Ring buffer not empty after closing rndis\n");
172 		ret = -ETIMEDOUT;
173 	}
174 
175 	return ret;
176 }
177 
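/* Reserve a per-packet info (PPI) element of the given type in the RNDIS
 * message and return a pointer to it; callers write the payload at
 * ppi->ppi_offset.
 */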
178 static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size,
179 				int pkt_type)
180 {
181 	struct rndis_packet *rndis_pkt;
182 	struct rndis_per_packet_info *ppi;
183 
184 	rndis_pkt = &msg->msg.pkt;
185 	rndis_pkt->data_offset += ppi_size;
186 
187 	ppi = (struct rndis_per_packet_info *)((void *)rndis_pkt +
188 		rndis_pkt->per_pkt_info_offset + rndis_pkt->per_pkt_info_len);
189 
190 	ppi->size = ppi_size;
191 	ppi->type = pkt_type;
192 	ppi->ppi_offset = sizeof(struct rndis_per_packet_info);
193 
194 	rndis_pkt->per_pkt_info_len += ppi_size;
195 
196 	return ppi;
197 }
198 
199 union sub_key {
200 	u64 k;
201 	struct {
202 		u8 pad[3];
203 		u8 kb;
204 		u32 ka;
205 	};
206 };
207 
208 /* Toeplitz hash function
209  * data: network byte order
210  * return: host byte order
211  */
212 static u32 comp_hash(u8 *key, int klen, void *data, int dlen)
213 {
214 	union sub_key subk;
215 	int k_next = 4;
216 	u8 dt;
217 	int i, j;
218 	u32 ret = 0;
219 
220 	subk.k = 0;
221 	subk.ka = ntohl(*(u32 *)key);
222 
223 	for (i = 0; i < dlen; i++) {
224 		subk.kb = key[k_next];
225 		k_next = (k_next + 1) % klen;
226 		dt = ((u8 *)data)[i];
227 		for (j = 0; j < 8; j++) {
228 			if (dt & 0x80)
229 				ret ^= subk.ka;
230 			dt <<= 1;
231 			subk.k <<= 1;
232 		}
233 	}
234 
235 	return ret;
236 }
237 
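/* Dissect the flow and compute a Toeplitz hash over the flow keys. Only
 * IPv4/IPv6 packets are hashed; TCP flows feed more bytes of the flow keys
 * into the hash (12 vs. 8). Returns false if the packet cannot be hashed.
 */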
238 static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb)
239 {
240 	struct flow_keys flow;
241 	int data_len;
242 
243 	if (!skb_flow_dissect_flow_keys(skb, &flow, 0) ||
244 	    !(flow.basic.n_proto == htons(ETH_P_IP) ||
245 	      flow.basic.n_proto == htons(ETH_P_IPV6)))
246 		return false;
247 
248 	if (flow.basic.ip_proto == IPPROTO_TCP)
249 		data_len = 12;
250 	else
251 		data_len = 8;
252 
253 	*hash = comp_hash(netvsc_hash_key, HASH_KEYLEN, &flow, data_len);
254 
255 	return true;
256 }
257 
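/* Select the transmit queue for an skb: map the flow hash through the
 * host-supplied send indirection table, and fall back to queue 0 when the
 * chosen sub-channel is not open.
 */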
258 static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
259 			void *accel_priv, select_queue_fallback_t fallback)
260 {
261 	struct net_device_context *net_device_ctx = netdev_priv(ndev);
262 	struct hv_device *hdev =  net_device_ctx->device_ctx;
263 	struct netvsc_device *nvsc_dev = hv_get_drvdata(hdev);
264 	u32 hash;
265 	u16 q_idx = 0;
266 
267 	if (nvsc_dev == NULL || ndev->real_num_tx_queues <= 1)
268 		return 0;
269 
270 	if (netvsc_set_hash(&hash, skb)) {
271 		q_idx = nvsc_dev->send_table[hash % VRSS_SEND_TAB_SIZE] %
272 			ndev->real_num_tx_queues;
273 		skb_set_hash(skb, hash, PKT_HASH_TYPE_L3);
274 	}
275 
276 	if (!nvsc_dev->chn_table[q_idx])
277 		q_idx = 0;
278 
279 	return q_idx;
280 }
281 
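/* Describe a buffer as one or more hv_page_buffer entries, splitting it at
 * page boundaries. Returns the number of entries used.
 */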
282 static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
283 			struct hv_page_buffer *pb)
284 {
285 	int j = 0;
286 
287 	/* Deal with compound pages by ignoring unused part
288 	 * of the page.
289 	 */
290 	page += (offset >> PAGE_SHIFT);
291 	offset &= ~PAGE_MASK;
292 
293 	while (len > 0) {
294 		unsigned long bytes;
295 
296 		bytes = PAGE_SIZE - offset;
297 		if (bytes > len)
298 			bytes = len;
299 		pb[j].pfn = page_to_pfn(page);
300 		pb[j].offset = offset;
301 		pb[j].len = bytes;
302 
303 		offset += bytes;
304 		len -= bytes;
305 
306 		if (offset == PAGE_SIZE && len) {
307 			page++;
308 			offset = 0;
309 			j++;
310 		}
311 	}
312 
313 	return j + 1;
314 }
315 
316 static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
317 			   struct hv_netvsc_packet *packet,
318 			   struct hv_page_buffer **page_buf)
319 {
320 	struct hv_page_buffer *pb = *page_buf;
321 	u32 slots_used = 0;
322 	char *data = skb->data;
323 	int frags = skb_shinfo(skb)->nr_frags;
324 	int i;
325 
326 	/* The packet is laid out thus:
327 	 * 1. hdr: RNDIS header and PPI
328 	 * 2. skb linear data
329 	 * 3. skb fragment data
330 	 */
331 	if (hdr != NULL)
332 		slots_used += fill_pg_buf(virt_to_page(hdr),
333 					offset_in_page(hdr),
334 					len, &pb[slots_used]);
335 
336 	packet->rmsg_size = len;
337 	packet->rmsg_pgcnt = slots_used;
338 
339 	slots_used += fill_pg_buf(virt_to_page(data),
340 				offset_in_page(data),
341 				skb_headlen(skb), &pb[slots_used]);
342 
343 	for (i = 0; i < frags; i++) {
344 		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
345 
346 		slots_used += fill_pg_buf(skb_frag_page(frag),
347 					frag->page_offset,
348 					skb_frag_size(frag), &pb[slots_used]);
349 	}
350 	return slots_used;
351 }
352 
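/* Count how many page-sized slots are needed to describe all skb fragments,
 * accounting for fragments that start part-way into a page.
 */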
353 static int count_skb_frag_slots(struct sk_buff *skb)
354 {
355 	int i, frags = skb_shinfo(skb)->nr_frags;
356 	int pages = 0;
357 
358 	for (i = 0; i < frags; i++) {
359 		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
360 		unsigned long size = skb_frag_size(frag);
361 		unsigned long offset = frag->page_offset;
362 
363 		/* Skip unused space at the start of the page */
364 		offset &= ~PAGE_MASK;
365 		pages += PFN_UP(offset + size);
366 	}
367 	return pages;
368 }
369 
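/* Total page-buffer slots needed for the skb: linear data plus fragments. */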
370 static int netvsc_get_slots(struct sk_buff *skb)
371 {
372 	char *data = skb->data;
373 	unsigned int offset = offset_in_page(data);
374 	unsigned int len = skb_headlen(skb);
375 	int slots;
376 	int frag_slots;
377 
378 	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE);
379 	frag_slots = count_skb_frag_slots(skb);
380 	return slots + frag_slots;
381 }
382 
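/* Classify the packet as IPv4/IPv6 TCP or UDP (or not IP at all) and report
 * the transport header offset used by the checksum/LSO per-packet info.
 */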
383 static u32 get_net_transport_info(struct sk_buff *skb, u32 *trans_off)
384 {
385 	u32 ret_val = TRANSPORT_INFO_NOT_IP;
386 
387 	if ((eth_hdr(skb)->h_proto != htons(ETH_P_IP)) &&
388 		(eth_hdr(skb)->h_proto != htons(ETH_P_IPV6))) {
389 		goto not_ip;
390 	}
391 
392 	*trans_off = skb_transport_offset(skb);
393 
394 	if ((eth_hdr(skb)->h_proto == htons(ETH_P_IP))) {
395 		struct iphdr *iphdr = ip_hdr(skb);
396 
397 		if (iphdr->protocol == IPPROTO_TCP)
398 			ret_val = TRANSPORT_INFO_IPV4_TCP;
399 		else if (iphdr->protocol == IPPROTO_UDP)
400 			ret_val = TRANSPORT_INFO_IPV4_UDP;
401 	} else {
402 		if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
403 			ret_val = TRANSPORT_INFO_IPV6_TCP;
404 		else if (ipv6_hdr(skb)->nexthdr == IPPROTO_UDP)
405 			ret_val = TRANSPORT_INFO_IPV6_UDP;
406 	}
407 
408 not_ip:
409 	return ret_val;
410 }
411 
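/* Transmit path: build the RNDIS header and per-packet info in the skb
 * headroom, describe the packet as a page-buffer list and hand it to
 * netvsc_send(). -EAGAIN from the ring buffer is reported as TX_BUSY.
 */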
412 static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
413 {
414 	struct net_device_context *net_device_ctx = netdev_priv(net);
415 	struct hv_netvsc_packet *packet = NULL;
416 	int ret;
417 	unsigned int num_data_pgs;
418 	struct rndis_message *rndis_msg;
419 	struct rndis_packet *rndis_pkt;
420 	u32 rndis_msg_size;
421 	bool isvlan;
422 	bool linear = false;
423 	struct rndis_per_packet_info *ppi;
424 	struct ndis_tcp_ip_checksum_info *csum_info;
425 	struct ndis_tcp_lso_info *lso_info;
426 	int  hdr_offset;
427 	u32 net_trans_info;
428 	u32 hash;
429 	u32 skb_length;
430 	struct hv_page_buffer page_buf[MAX_PAGE_BUFFER_COUNT];
431 	struct hv_page_buffer *pb = page_buf;
432 	struct netvsc_stats *tx_stats = this_cpu_ptr(net_device_ctx->tx_stats);
433 
434 	/* We will need at most two pages to describe the RNDIS
435 	 * header. We can only transmit MAX_PAGE_BUFFER_COUNT pages
436 	 * in a single packet. If the skb is scattered across more
437 	 * pages, we try linearizing it.
438 	 */
439 
440 check_size:
441 	skb_length = skb->len;
442 	num_data_pgs = netvsc_get_slots(skb) + 2;
443 	if (num_data_pgs > MAX_PAGE_BUFFER_COUNT && linear) {
444 		net_alert_ratelimited("packet too big: %u pages (%u bytes)\n",
445 				      num_data_pgs, skb->len);
446 		ret = -EFAULT;
447 		goto drop;
448 	} else if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) {
449 		if (skb_linearize(skb)) {
450 			net_alert_ratelimited("failed to linearize skb\n");
451 			ret = -ENOMEM;
452 			goto drop;
453 		}
454 		linear = true;
455 		goto check_size;
456 	}
457 
458 	/*
459 	 * Place the rndis header in the skb headroom; the
460 	 * skb->cb area will be used for the hv_netvsc_packet
461 	 * structure.
462 	 */
463 	ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE);
464 	if (ret) {
465 		netdev_err(net, "unable to expand skb headroom for rndis header\n");
466 		ret = -ENOMEM;
467 		goto drop;
468 	}
469 	/* Use the skb control buffer for building up the packet */
470 	BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
471 			FIELD_SIZEOF(struct sk_buff, cb));
472 	packet = (struct hv_netvsc_packet *)skb->cb;
473 
474 
475 	packet->q_idx = skb_get_queue_mapping(skb);
476 
477 	packet->total_data_buflen = skb->len;
478 
479 	rndis_msg = (struct rndis_message *)skb->head;
480 
481 	memset(rndis_msg, 0, RNDIS_AND_PPI_SIZE);
482 
483 	isvlan = skb->vlan_tci & VLAN_TAG_PRESENT;
484 
485 	/* Add the rndis header */
486 	rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET;
487 	rndis_msg->msg_len = packet->total_data_buflen;
488 	rndis_pkt = &rndis_msg->msg.pkt;
489 	rndis_pkt->data_offset = sizeof(struct rndis_packet);
490 	rndis_pkt->data_len = packet->total_data_buflen;
491 	rndis_pkt->per_pkt_info_offset = sizeof(struct rndis_packet);
492 
493 	rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet);
494 
495 	hash = skb_get_hash_raw(skb);
496 	if (hash != 0 && net->real_num_tx_queues > 1) {
497 		rndis_msg_size += NDIS_HASH_PPI_SIZE;
498 		ppi = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE,
499 				    NBL_HASH_VALUE);
500 		*(u32 *)((void *)ppi + ppi->ppi_offset) = hash;
501 	}
502 
503 	if (isvlan) {
504 		struct ndis_pkt_8021q_info *vlan;
505 
506 		rndis_msg_size += NDIS_VLAN_PPI_SIZE;
507 		ppi = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
508 					IEEE_8021Q_INFO);
509 		vlan = (struct ndis_pkt_8021q_info *)((void *)ppi +
510 						ppi->ppi_offset);
511 		vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK;
512 		vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >>
513 				VLAN_PRIO_SHIFT;
514 	}
515 
516 	net_trans_info = get_net_transport_info(skb, &hdr_offset);
517 	if (net_trans_info == TRANSPORT_INFO_NOT_IP)
518 		goto do_send;
519 
520 	/*
521 	 * Set up the send-side checksum offload only if this is not a
522 	 * GSO packet.
523 	 */
524 	if (skb_is_gso(skb))
525 		goto do_lso;
526 
527 	if ((skb->ip_summed == CHECKSUM_NONE) ||
528 	    (skb->ip_summed == CHECKSUM_UNNECESSARY))
529 		goto do_send;
530 
531 	rndis_msg_size += NDIS_CSUM_PPI_SIZE;
532 	ppi = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
533 			    TCPIP_CHKSUM_PKTINFO);
534 
535 	csum_info = (struct ndis_tcp_ip_checksum_info *)((void *)ppi +
536 			ppi->ppi_offset);
537 
538 	if (net_trans_info & (INFO_IPV4 << 16))
539 		csum_info->transmit.is_ipv4 = 1;
540 	else
541 		csum_info->transmit.is_ipv6 = 1;
542 
543 	if (net_trans_info & INFO_TCP) {
544 		csum_info->transmit.tcp_checksum = 1;
545 		csum_info->transmit.tcp_header_offset = hdr_offset;
546 	} else if (net_trans_info & INFO_UDP) {
547 		/* UDP checksum offload is not supported on ws2008r2.
548 		 * Furthermore, on ws2012 and ws2012r2, there are some
549 		 * issues with UDP checksum offload from Linux guests
550 		 * (these are host-side issues).
551 		 * For now, compute the checksum here.
552 		 */
553 		struct udphdr *uh;
554 		u16 udp_len;
555 
556 		ret = skb_cow_head(skb, 0);
557 		if (ret)
558 			goto drop;
559 
560 		uh = udp_hdr(skb);
561 		udp_len = ntohs(uh->len);
562 		uh->check = 0;
563 		uh->check = csum_tcpudp_magic(ip_hdr(skb)->saddr,
564 					      ip_hdr(skb)->daddr,
565 					      udp_len, IPPROTO_UDP,
566 					      csum_partial(uh, udp_len, 0));
567 		if (uh->check == 0)
568 			uh->check = CSUM_MANGLED_0;
569 
570 		csum_info->transmit.udp_checksum = 0;
571 	}
572 	goto do_send;
573 
574 do_lso:
575 	rndis_msg_size += NDIS_LSO_PPI_SIZE;
576 	ppi = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
577 			    TCP_LARGESEND_PKTINFO);
578 
579 	lso_info = (struct ndis_tcp_lso_info *)((void *)ppi +
580 			ppi->ppi_offset);
581 
582 	lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
583 	if (net_trans_info & (INFO_IPV4 << 16)) {
584 		lso_info->lso_v2_transmit.ip_version =
585 			NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
586 		ip_hdr(skb)->tot_len = 0;
587 		ip_hdr(skb)->check = 0;
588 		tcp_hdr(skb)->check =
589 		~csum_tcpudp_magic(ip_hdr(skb)->saddr,
590 				   ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
591 	} else {
592 		lso_info->lso_v2_transmit.ip_version =
593 			NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
594 		ipv6_hdr(skb)->payload_len = 0;
595 		tcp_hdr(skb)->check =
596 		~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
597 				&ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
598 	}
599 	lso_info->lso_v2_transmit.tcp_header_offset = hdr_offset;
600 	lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;
601 
602 do_send:
603 	/* Start filling in the page buffers with the rndis hdr */
604 	rndis_msg->msg_len += rndis_msg_size;
605 	packet->total_data_buflen = rndis_msg->msg_len;
606 	packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
607 					       skb, packet, &pb);
608 
609 	ret = netvsc_send(net_device_ctx->device_ctx, packet,
610 			  rndis_msg, &pb, skb);
611 
612 drop:
613 	if (ret == 0) {
614 		u64_stats_update_begin(&tx_stats->syncp);
615 		tx_stats->packets++;
616 		tx_stats->bytes += skb_length;
617 		u64_stats_update_end(&tx_stats->syncp);
618 	} else {
619 		if (ret != -EAGAIN) {
620 			dev_kfree_skb_any(skb);
621 			net->stats.tx_dropped++;
622 		}
623 	}
624 
625 	return (ret == -EAGAIN) ? NETDEV_TX_BUSY : NETDEV_TX_OK;
626 }
627 
628 /*
629  * netvsc_linkstatus_callback - Link up/down notification
630  */
631 void netvsc_linkstatus_callback(struct hv_device *device_obj,
632 				struct rndis_message *resp)
633 {
634 	struct rndis_indicate_status *indicate = &resp->msg.indicate_status;
635 	struct net_device *net;
636 	struct net_device_context *ndev_ctx;
637 	struct netvsc_device *net_device;
638 	struct netvsc_reconfig *event;
639 	unsigned long flags;
640 
641 	/* Handle link change statuses only */
642 	if (indicate->status != RNDIS_STATUS_NETWORK_CHANGE &&
643 	    indicate->status != RNDIS_STATUS_MEDIA_CONNECT &&
644 	    indicate->status != RNDIS_STATUS_MEDIA_DISCONNECT)
645 		return;
646 
647 	net_device = hv_get_drvdata(device_obj);
648 	net = net_device->ndev;
649 
650 	if (!net || net->reg_state != NETREG_REGISTERED)
651 		return;
652 
653 	ndev_ctx = netdev_priv(net);
654 
655 	event = kzalloc(sizeof(*event), GFP_ATOMIC);
656 	if (!event)
657 		return;
658 	event->event = indicate->status;
659 
660 	spin_lock_irqsave(&ndev_ctx->lock, flags);
661 	list_add_tail(&event->list, &ndev_ctx->reconfig_events);
662 	spin_unlock_irqrestore(&ndev_ctx->lock, flags);
663 
664 	schedule_delayed_work(&ndev_ctx->dwork, 0);
665 }
666 
667 /*
668  * netvsc_recv_callback -  Callback when we receive a packet from the
669  * "wire" on the specified device.
670  */
671 int netvsc_recv_callback(struct hv_device *device_obj,
672 				struct hv_netvsc_packet *packet,
673 				void **data,
674 				struct ndis_tcp_ip_checksum_info *csum_info,
675 				struct vmbus_channel *channel,
676 				u16 vlan_tci)
677 {
678 	struct net_device *net;
679 	struct net_device_context *net_device_ctx;
680 	struct sk_buff *skb;
681 	struct netvsc_stats *rx_stats;
682 
683 	net = ((struct netvsc_device *)hv_get_drvdata(device_obj))->ndev;
684 	if (!net || net->reg_state != NETREG_REGISTERED) {
685 		return NVSP_STAT_FAIL;
686 	}
687 	net_device_ctx = netdev_priv(net);
688 	rx_stats = this_cpu_ptr(net_device_ctx->rx_stats);
689 
690 	/* Allocate a skb - TODO direct I/O to pages? */
691 	skb = netdev_alloc_skb_ip_align(net, packet->total_data_buflen);
692 	if (unlikely(!skb)) {
693 		++net->stats.rx_dropped;
694 		return NVSP_STAT_FAIL;
695 	}
696 
697 	/*
698 	 * Copy to skb. This copy is needed here since the memory pointed to
699 	 * by hv_netvsc_packet cannot be deallocated.
700 	 */
701 	memcpy(skb_put(skb, packet->total_data_buflen), *data,
702 		packet->total_data_buflen);
703 
704 	skb->protocol = eth_type_trans(skb, net);
705 	if (csum_info) {
706 		/* We only look at the IP checksum here.
707 		 * Should we be dropping the packet if checksum
708 		 * failed? How do we deal with other checksums - TCP/UDP?
709 		 */
710 		if (csum_info->receive.ip_checksum_succeeded)
711 			skb->ip_summed = CHECKSUM_UNNECESSARY;
712 		else
713 			skb->ip_summed = CHECKSUM_NONE;
714 	}
715 
716 	if (vlan_tci & VLAN_TAG_PRESENT)
717 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
718 				       vlan_tci);
719 
720 	skb_record_rx_queue(skb, channel->
721 			    offermsg.offer.sub_channel_index);
722 
723 	u64_stats_update_begin(&rx_stats->syncp);
724 	rx_stats->packets++;
725 	rx_stats->bytes += packet->total_data_buflen;
726 	u64_stats_update_end(&rx_stats->syncp);
727 
728 	/*
729 	 * Pass the skb back up. The network stack will deallocate the skb
730 	 * when it is done.
731 	 * TODO - use NAPI?
732 	 */
733 	netif_rx(skb);
734 
735 	return 0;
736 }
737 
738 static void netvsc_get_drvinfo(struct net_device *net,
739 			       struct ethtool_drvinfo *info)
740 {
741 	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
742 	strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
743 }
744 
745 static void netvsc_get_channels(struct net_device *net,
746 				struct ethtool_channels *channel)
747 {
748 	struct net_device_context *net_device_ctx = netdev_priv(net);
749 	struct hv_device *dev = net_device_ctx->device_ctx;
750 	struct netvsc_device *nvdev = hv_get_drvdata(dev);
751 
752 	if (nvdev) {
753 		channel->max_combined	= nvdev->max_chn;
754 		channel->combined_count = nvdev->num_chn;
755 	}
756 }
757 
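/* ethtool -L handler: only "combined" channels are supported. Changing the
 * count tears down the RNDIS/netvsc device and re-creates it with the new
 * number of channels, retrying with the original count if that fails.
 */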
758 static int netvsc_set_channels(struct net_device *net,
759 			       struct ethtool_channels *channels)
760 {
761 	struct net_device_context *net_device_ctx = netdev_priv(net);
762 	struct hv_device *dev = net_device_ctx->device_ctx;
763 	struct netvsc_device *nvdev = hv_get_drvdata(dev);
764 	struct netvsc_device_info device_info;
765 	u32 num_chn;
766 	u32 max_chn;
767 	int ret = 0;
768 	bool recovering = false;
769 
770 	if (!nvdev || nvdev->destroy)
771 		return -ENODEV;
772 
773 	num_chn = nvdev->num_chn;
774 	max_chn = min_t(u32, nvdev->max_chn, num_online_cpus());
775 
776 	if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5) {
777 		pr_info("vRSS unsupported before NVSP Version 5\n");
778 		return -EINVAL;
779 	}
780 
781 	/* We do not support separate rx, tx, or other channel counts */
782 	if (!channels ||
783 	    channels->rx_count ||
784 	    channels->tx_count ||
785 	    channels->other_count ||
786 	    (channels->combined_count < 1))
787 		return -EINVAL;
788 
789 	if (channels->combined_count > max_chn) {
790 		pr_info("combined channels too high, using %d\n", max_chn);
791 		channels->combined_count = max_chn;
792 	}
793 
794 	ret = netvsc_close(net);
795 	if (ret)
796 		goto out;
797 
798  do_set:
799 	nvdev->start_remove = true;
800 	rndis_filter_device_remove(dev);
801 
802 	nvdev->num_chn = channels->combined_count;
803 
804 	net_device_ctx->device_ctx = dev;
805 	hv_set_drvdata(dev, net);
806 
807 	memset(&device_info, 0, sizeof(device_info));
808 	device_info.num_chn = nvdev->num_chn; /* passed to RNDIS */
809 	device_info.ring_size = ring_size;
810 	device_info.max_num_vrss_chns = max_num_vrss_chns;
811 
812 	ret = rndis_filter_device_add(dev, &device_info);
813 	if (ret) {
814 		if (recovering) {
815 			netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
816 			return ret;
817 		}
818 		goto recover;
819 	}
820 
821 	nvdev = hv_get_drvdata(dev);
822 
823 	ret = netif_set_real_num_tx_queues(net, nvdev->num_chn);
824 	if (ret) {
825 		if (recovering) {
826 			netdev_err(net, "could not set tx queue count (ret %d)\n", ret);
827 			return ret;
828 		}
829 		goto recover;
830 	}
831 
832 	ret = netif_set_real_num_rx_queues(net, nvdev->num_chn);
833 	if (ret) {
834 		if (recovering) {
835 			netdev_err(net, "could not set rx queue count (ret %d)\n", ret);
836 			return ret;
837 		}
838 		goto recover;
839 	}
840 
841  out:
842 	netvsc_open(net);
843 
844 	return ret;
845 
846  recover:
847 	/* If the above failed, we attempt to recover through the same
848 	 * process but with the original number of channels.
849 	 */
850 	netdev_err(net, "could not set channels, recovering\n");
851 	recovering = true;
852 	channels->combined_count = num_chn;
853 	goto do_set;
854 }
855 
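/* Changing the MTU also requires tearing down and re-creating the RNDIS
 * device; the allowed maximum depends on the negotiated NVSP version.
 */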
856 static int netvsc_change_mtu(struct net_device *ndev, int mtu)
857 {
858 	struct net_device_context *ndevctx = netdev_priv(ndev);
859 	struct hv_device *hdev =  ndevctx->device_ctx;
860 	struct netvsc_device *nvdev = hv_get_drvdata(hdev);
861 	struct netvsc_device_info device_info;
862 	int limit = ETH_DATA_LEN;
863 	int ret = 0;
864 
865 	if (nvdev == NULL || nvdev->destroy)
866 		return -ENODEV;
867 
868 	if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
869 		limit = NETVSC_MTU - ETH_HLEN;
870 
871 	if (mtu < NETVSC_MTU_MIN || mtu > limit)
872 		return -EINVAL;
873 
874 	ret = netvsc_close(ndev);
875 	if (ret)
876 		goto out;
877 
878 	nvdev->start_remove = true;
879 	rndis_filter_device_remove(hdev);
880 
881 	ndev->mtu = mtu;
882 
883 	ndevctx->device_ctx = hdev;
884 	hv_set_drvdata(hdev, ndev);
885 
886 	memset(&device_info, 0, sizeof(device_info));
887 	device_info.ring_size = ring_size;
888 	device_info.num_chn = nvdev->num_chn;
889 	device_info.max_num_vrss_chns = max_num_vrss_chns;
890 	rndis_filter_device_add(hdev, &device_info);
891 
892 out:
893 	netvsc_open(ndev);
894 
895 	return ret;
896 }
897 
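/* Fold the per-cpu tx/rx counters into the rtnl_link_stats64 snapshot. */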
898 static struct rtnl_link_stats64 *netvsc_get_stats64(struct net_device *net,
899 						    struct rtnl_link_stats64 *t)
900 {
901 	struct net_device_context *ndev_ctx = netdev_priv(net);
902 	int cpu;
903 
904 	for_each_possible_cpu(cpu) {
905 		struct netvsc_stats *tx_stats = per_cpu_ptr(ndev_ctx->tx_stats,
906 							    cpu);
907 		struct netvsc_stats *rx_stats = per_cpu_ptr(ndev_ctx->rx_stats,
908 							    cpu);
909 		u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
910 		unsigned int start;
911 
912 		do {
913 			start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
914 			tx_packets = tx_stats->packets;
915 			tx_bytes = tx_stats->bytes;
916 		} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
917 
918 		do {
919 			start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
920 			rx_packets = rx_stats->packets;
921 			rx_bytes = rx_stats->bytes;
922 		} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
923 
924 		t->tx_bytes	+= tx_bytes;
925 		t->tx_packets	+= tx_packets;
926 		t->rx_bytes	+= rx_bytes;
927 		t->rx_packets	+= rx_packets;
928 	}
929 
930 	t->tx_dropped	= net->stats.tx_dropped;
931 	t->tx_errors	= net->stats.tx_dropped;
932 
933 	t->rx_dropped	= net->stats.rx_dropped;
934 	t->rx_errors	= net->stats.rx_errors;
935 
936 	return t;
937 }
938 
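/* Set the MAC address on the net device and push it to the host via RNDIS,
 * rolling back the local change if the host rejects it.
 */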
939 static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
940 {
941 	struct net_device_context *ndevctx = netdev_priv(ndev);
942 	struct hv_device *hdev =  ndevctx->device_ctx;
943 	struct sockaddr *addr = p;
944 	char save_adr[ETH_ALEN];
945 	unsigned char save_aatype;
946 	int err;
947 
948 	memcpy(save_adr, ndev->dev_addr, ETH_ALEN);
949 	save_aatype = ndev->addr_assign_type;
950 
951 	err = eth_mac_addr(ndev, p);
952 	if (err != 0)
953 		return err;
954 
955 	err = rndis_filter_set_device_mac(hdev, addr->sa_data);
956 	if (err != 0) {
957 		/* roll back to saved MAC */
958 		memcpy(ndev->dev_addr, save_adr, ETH_ALEN);
959 		ndev->addr_assign_type = save_aatype;
960 	}
961 
962 	return err;
963 }
964 
965 #ifdef CONFIG_NET_POLL_CONTROLLER
966 static void netvsc_poll_controller(struct net_device *net)
967 {
968 	/* As netvsc_start_xmit() works synchronously we don't have to
969 	 * trigger anything here.
970 	 */
971 }
972 #endif
973 
974 static const struct ethtool_ops ethtool_ops = {
975 	.get_drvinfo	= netvsc_get_drvinfo,
976 	.get_link	= ethtool_op_get_link,
977 	.get_channels   = netvsc_get_channels,
978 	.set_channels   = netvsc_set_channels,
979 };
980 
981 static const struct net_device_ops device_ops = {
982 	.ndo_open =			netvsc_open,
983 	.ndo_stop =			netvsc_close,
984 	.ndo_start_xmit =		netvsc_start_xmit,
985 	.ndo_set_rx_mode =		netvsc_set_multicast_list,
986 	.ndo_change_mtu =		netvsc_change_mtu,
987 	.ndo_validate_addr =		eth_validate_addr,
988 	.ndo_set_mac_address =		netvsc_set_mac_addr,
989 	.ndo_select_queue =		netvsc_select_queue,
990 	.ndo_get_stats64 =		netvsc_get_stats64,
991 #ifdef CONFIG_NET_POLL_CONTROLLER
992 	.ndo_poll_controller =		netvsc_poll_controller,
993 #endif
994 };
995 
996 /*
997  * Handle link status changes. For RNDIS_STATUS_NETWORK_CHANGE emulate a link
998  * down/up sequence. For RNDIS_STATUS_MEDIA_CONNECT, when the carrier is
999  * already present, send a GARP packet to network peers with netdev_notify_peers().
1000  */
1001 static void netvsc_link_change(struct work_struct *w)
1002 {
1003 	struct net_device_context *ndev_ctx;
1004 	struct net_device *net;
1005 	struct netvsc_device *net_device;
1006 	struct rndis_device *rdev;
1007 	struct netvsc_reconfig *event = NULL;
1008 	bool notify = false, reschedule = false;
1009 	unsigned long flags, next_reconfig, delay;
1010 
1011 	ndev_ctx = container_of(w, struct net_device_context, dwork.work);
1012 	net_device = hv_get_drvdata(ndev_ctx->device_ctx);
1013 	rdev = net_device->extension;
1014 	net = net_device->ndev;
1015 
1016 	next_reconfig = ndev_ctx->last_reconfig + LINKCHANGE_INT;
1017 	if (time_is_after_jiffies(next_reconfig)) {
1018 		/* link_watch only sends one notification with current state
1019 		 * per second, avoid doing reconfig more frequently. Handle
1020 		 * wrap around.
1021 		 */
1022 		delay = next_reconfig - jiffies;
1023 		delay = delay < LINKCHANGE_INT ? delay : LINKCHANGE_INT;
1024 		schedule_delayed_work(&ndev_ctx->dwork, delay);
1025 		return;
1026 	}
1027 	ndev_ctx->last_reconfig = jiffies;
1028 
1029 	spin_lock_irqsave(&ndev_ctx->lock, flags);
1030 	if (!list_empty(&ndev_ctx->reconfig_events)) {
1031 		event = list_first_entry(&ndev_ctx->reconfig_events,
1032 					 struct netvsc_reconfig, list);
1033 		list_del(&event->list);
1034 		reschedule = !list_empty(&ndev_ctx->reconfig_events);
1035 	}
1036 	spin_unlock_irqrestore(&ndev_ctx->lock, flags);
1037 
1038 	if (!event)
1039 		return;
1040 
1041 	rtnl_lock();
1042 
1043 	switch (event->event) {
1044 		/* Only the following events are possible due to the check in
1045 		 * netvsc_linkstatus_callback()
1046 		 */
1047 	case RNDIS_STATUS_MEDIA_CONNECT:
1048 		if (rdev->link_state) {
1049 			rdev->link_state = false;
1050 			netif_carrier_on(net);
1051 			netif_tx_wake_all_queues(net);
1052 		} else {
1053 			notify = true;
1054 		}
1055 		kfree(event);
1056 		break;
1057 	case RNDIS_STATUS_MEDIA_DISCONNECT:
1058 		if (!rdev->link_state) {
1059 			rdev->link_state = true;
1060 			netif_carrier_off(net);
1061 			netif_tx_stop_all_queues(net);
1062 		}
1063 		kfree(event);
1064 		break;
1065 	case RNDIS_STATUS_NETWORK_CHANGE:
1066 		/* Only makes sense if carrier is present */
1067 		if (!rdev->link_state) {
1068 			rdev->link_state = true;
1069 			netif_carrier_off(net);
1070 			netif_tx_stop_all_queues(net);
1071 			event->event = RNDIS_STATUS_MEDIA_CONNECT;
1072 			spin_lock_irqsave(&ndev_ctx->lock, flags);
1073 			list_add_tail(&event->list, &ndev_ctx->reconfig_events);
1074 			spin_unlock_irqrestore(&ndev_ctx->lock, flags);
1075 			reschedule = true;
1076 		}
1077 		break;
1078 	}
1079 
1080 	rtnl_unlock();
1081 
1082 	if (notify)
1083 		netdev_notify_peers(net);
1084 
1085 	/* link_watch only sends one notification with current state per
1086 	 * second, handle next reconfig event in 2 seconds.
1087 	 */
1088 	if (reschedule)
1089 		schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
1090 }
1091 
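/* Release the per-cpu stats and free the net device itself. */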
1092 static void netvsc_free_netdev(struct net_device *netdev)
1093 {
1094 	struct net_device_context *net_device_ctx = netdev_priv(netdev);
1095 
1096 	free_percpu(net_device_ctx->tx_stats);
1097 	free_percpu(net_device_ctx->rx_stats);
1098 	free_netdev(netdev);
1099 }
1100 
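/* Probe a new vmbus NIC device: allocate the net device, bring up the
 * RNDIS/netvsc layers, then register with the network stack.
 */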
1101 static int netvsc_probe(struct hv_device *dev,
1102 			const struct hv_vmbus_device_id *dev_id)
1103 {
1104 	struct net_device *net = NULL;
1105 	struct net_device_context *net_device_ctx;
1106 	struct netvsc_device_info device_info;
1107 	struct netvsc_device *nvdev;
1108 	int ret;
1109 
1110 	net = alloc_etherdev_mq(sizeof(struct net_device_context),
1111 				num_online_cpus());
1112 	if (!net)
1113 		return -ENOMEM;
1114 
1115 	netif_carrier_off(net);
1116 
1117 	net_device_ctx = netdev_priv(net);
1118 	net_device_ctx->device_ctx = dev;
1119 	net_device_ctx->msg_enable = netif_msg_init(debug, default_msg);
1120 	if (netif_msg_probe(net_device_ctx))
1121 		netdev_dbg(net, "netvsc msg_enable: %d\n",
1122 			   net_device_ctx->msg_enable);
1123 
1124 	net_device_ctx->tx_stats = netdev_alloc_pcpu_stats(struct netvsc_stats);
1125 	if (!net_device_ctx->tx_stats) {
1126 		free_netdev(net);
1127 		return -ENOMEM;
1128 	}
1129 	net_device_ctx->rx_stats = netdev_alloc_pcpu_stats(struct netvsc_stats);
1130 	if (!net_device_ctx->rx_stats) {
1131 		free_percpu(net_device_ctx->tx_stats);
1132 		free_netdev(net);
1133 		return -ENOMEM;
1134 	}
1135 
1136 	hv_set_drvdata(dev, net);
1137 	INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
1138 	INIT_WORK(&net_device_ctx->work, do_set_multicast);
1139 
1140 	spin_lock_init(&net_device_ctx->lock);
1141 	INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
1142 
1143 	net->netdev_ops = &device_ops;
1144 
1145 	net->hw_features = NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_IP_CSUM |
1146 				NETIF_F_TSO;
1147 	net->features = NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_SG | NETIF_F_RXCSUM |
1148 			NETIF_F_IP_CSUM | NETIF_F_TSO;
1149 
1150 	net->ethtool_ops = &ethtool_ops;
1151 	SET_NETDEV_DEV(net, &dev->device);
1152 
1153 	/* Notify the netvsc driver of the new device */
1154 	memset(&device_info, 0, sizeof(device_info));
1155 	device_info.ring_size = ring_size;
1156 	device_info.max_num_vrss_chns = max_num_vrss_chns;
1157 	ret = rndis_filter_device_add(dev, &device_info);
1158 	if (ret != 0) {
1159 		netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
1160 		netvsc_free_netdev(net);
1161 		hv_set_drvdata(dev, NULL);
1162 		return ret;
1163 	}
1164 	memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);
1165 
1166 	nvdev = hv_get_drvdata(dev);
1167 	netif_set_real_num_tx_queues(net, nvdev->num_chn);
1168 	netif_set_real_num_rx_queues(net, nvdev->num_chn);
1169 
1170 	ret = register_netdev(net);
1171 	if (ret != 0) {
1172 		pr_err("Unable to register netdev.\n");
1173 		rndis_filter_device_remove(dev);
1174 		netvsc_free_netdev(net);
1175 	}
1176 
1177 	return ret;
1178 }
1179 
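/* Tear-down path: stop the link-change and multicast work, quiesce TX,
 * unregister the net device and remove the RNDIS device.
 */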
1180 static int netvsc_remove(struct hv_device *dev)
1181 {
1182 	struct net_device *net;
1183 	struct net_device_context *ndev_ctx;
1184 	struct netvsc_device *net_device;
1185 
1186 	net_device = hv_get_drvdata(dev);
1187 	net = net_device->ndev;
1188 
1189 	if (net == NULL) {
1190 		dev_err(&dev->device, "No net device to remove\n");
1191 		return 0;
1192 	}
1193 
1194 	net_device->start_remove = true;
1195 
1196 	ndev_ctx = netdev_priv(net);
1197 	cancel_delayed_work_sync(&ndev_ctx->dwork);
1198 	cancel_work_sync(&ndev_ctx->work);
1199 
1200 	/* Stop outbound asap */
1201 	netif_tx_disable(net);
1202 
1203 	unregister_netdev(net);
1204 
1205 	/*
1206 	 * Call the vsc driver to let it know that the device is being
1207 	 * removed.
1208 	 */
1209 	rndis_filter_device_remove(dev);
1210 
1211 	netvsc_free_netdev(net);
1212 	return 0;
1213 }
1214 
1215 static const struct hv_vmbus_device_id id_table[] = {
1216 	/* Network guid */
1217 	{ HV_NIC_GUID, },
1218 	{ },
1219 };
1220 
1221 MODULE_DEVICE_TABLE(vmbus, id_table);
1222 
1223 /* The one and only one */
1224 static struct  hv_driver netvsc_drv = {
1225 	.name = KBUILD_MODNAME,
1226 	.id_table = id_table,
1227 	.probe = netvsc_probe,
1228 	.remove = netvsc_remove,
1229 };
1230 
1231 static void __exit netvsc_drv_exit(void)
1232 {
1233 	vmbus_driver_unregister(&netvsc_drv);
1234 }
1235 
1236 static int __init netvsc_drv_init(void)
1237 {
1238 	if (ring_size < RING_SIZE_MIN) {
1239 		ring_size = RING_SIZE_MIN;
1240 		pr_info("Increased ring_size to %d (min allowed)\n",
1241 			ring_size);
1242 	}
1243 	return vmbus_driver_register(&netvsc_drv);
1244 }
1245 
1246 MODULE_LICENSE("GPL");
1247 MODULE_DESCRIPTION("Microsoft Hyper-V network driver");
1248 
1249 module_init(netvsc_drv_init);
1250 module_exit(netvsc_drv_exit);
1251