1 /*******************************************************************************
2  *
3  * Intel Ethernet Controller XL710 Family Linux Driver
4  * Copyright(c) 2013 - 2016 Intel Corporation.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  * The full GNU General Public License is included in this distribution in
19  * the file called "COPYING".
20  *
21  * Contact Information:
22  * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
23  * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
24  *
25  ******************************************************************************/
26 
27 #include <linux/prefetch.h>
28 #include <net/busy_poll.h>
29 #include "i40e.h"
30 #include "i40e_prototype.h"
31 
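/**
 * build_ctob - build the cmd/type/offset/size/tag quadword of a Tx descriptor
 * @td_cmd: descriptor command bits (EOP, RS, etc.)
 * @td_offset: header offsets used for checksum offload
 * @size: size of the data buffer in bytes
 * @td_tag: L2 tag 1 (e.g. VLAN tag) to insert
 *
 * Packs the fields into the little-endian layout the hardware expects for
 * the cmd_type_offset_bsz field of a data descriptor.
 **/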
32 static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
33 				u32 td_tag)
34 {
35 	return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA |
36 			   ((u64)td_cmd  << I40E_TXD_QW1_CMD_SHIFT) |
37 			   ((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) |
38 			   ((u64)size  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) |
39 			   ((u64)td_tag  << I40E_TXD_QW1_L2TAG1_SHIFT));
40 }
41 
42 #define I40E_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
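/* Maximum number of 1 ms sleeps while waiting for two free descriptors
 * when programming a Flow Director filter.
 */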
43 #define I40E_FD_CLEAN_DELAY 10
44 /**
45  * i40e_program_fdir_filter - Program a Flow Director filter
 * @fdir_data: the flow director filter data describing the flow to match
 * @raw_packet: the pre-allocated packet buffer for FDir
 * @pf: the PF pointer
 * @add: true for add/update, false for remove
50  **/
51 int i40e_program_fdir_filter(struct i40e_fdir_filter *fdir_data, u8 *raw_packet,
52 			     struct i40e_pf *pf, bool add)
53 {
54 	struct i40e_filter_program_desc *fdir_desc;
55 	struct i40e_tx_buffer *tx_buf, *first;
56 	struct i40e_tx_desc *tx_desc;
57 	struct i40e_ring *tx_ring;
58 	unsigned int fpt, dcc;
59 	struct i40e_vsi *vsi;
60 	struct device *dev;
61 	dma_addr_t dma;
62 	u32 td_cmd = 0;
63 	u16 delay = 0;
64 	u16 i;
65 
66 	/* find existing FDIR VSI */
67 	vsi = NULL;
68 	for (i = 0; i < pf->num_alloc_vsi; i++)
69 		if (pf->vsi[i] && pf->vsi[i]->type == I40E_VSI_FDIR)
70 			vsi = pf->vsi[i];
71 	if (!vsi)
72 		return -ENOENT;
73 
74 	tx_ring = vsi->tx_rings[0];
75 	dev = tx_ring->dev;
76 
77 	/* we need two descriptors to add/del a filter and we can wait */
78 	do {
79 		if (I40E_DESC_UNUSED(tx_ring) > 1)
80 			break;
81 		msleep_interruptible(1);
82 		delay++;
83 	} while (delay < I40E_FD_CLEAN_DELAY);
84 
85 	if (!(I40E_DESC_UNUSED(tx_ring) > 1))
86 		return -EAGAIN;
87 
88 	dma = dma_map_single(dev, raw_packet,
89 			     I40E_FDIR_MAX_RAW_PACKET_SIZE, DMA_TO_DEVICE);
90 	if (dma_mapping_error(dev, dma))
91 		goto dma_fail;
92 
93 	/* grab the next descriptor */
94 	i = tx_ring->next_to_use;
95 	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);
96 	first = &tx_ring->tx_bi[i];
97 	memset(first, 0, sizeof(struct i40e_tx_buffer));
98 
99 	tx_ring->next_to_use = ((i + 1) < tx_ring->count) ? i + 1 : 0;
100 
101 	fpt = (fdir_data->q_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT) &
102 	      I40E_TXD_FLTR_QW0_QINDEX_MASK;
103 
104 	fpt |= (fdir_data->flex_off << I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT) &
105 	       I40E_TXD_FLTR_QW0_FLEXOFF_MASK;
106 
107 	fpt |= (fdir_data->pctype << I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) &
108 	       I40E_TXD_FLTR_QW0_PCTYPE_MASK;
109 
110 	/* Use LAN VSI Id if not programmed by user */
111 	if (fdir_data->dest_vsi == 0)
112 		fpt |= (pf->vsi[pf->lan_vsi]->id) <<
113 		       I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT;
114 	else
115 		fpt |= ((u32)fdir_data->dest_vsi <<
116 			I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT) &
117 		       I40E_TXD_FLTR_QW0_DEST_VSI_MASK;
118 
119 	dcc = I40E_TX_DESC_DTYPE_FILTER_PROG;
120 
121 	if (add)
122 		dcc |= I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
123 		       I40E_TXD_FLTR_QW1_PCMD_SHIFT;
124 	else
125 		dcc |= I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
126 		       I40E_TXD_FLTR_QW1_PCMD_SHIFT;
127 
128 	dcc |= (fdir_data->dest_ctl << I40E_TXD_FLTR_QW1_DEST_SHIFT) &
129 	       I40E_TXD_FLTR_QW1_DEST_MASK;
130 
131 	dcc |= (fdir_data->fd_status << I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT) &
132 	       I40E_TXD_FLTR_QW1_FD_STATUS_MASK;
133 
134 	if (fdir_data->cnt_index != 0) {
135 		dcc |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
136 		dcc |= ((u32)fdir_data->cnt_index <<
137 			I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
138 			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
139 	}
140 
141 	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(fpt);
142 	fdir_desc->rsvd = cpu_to_le32(0);
143 	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dcc);
144 	fdir_desc->fd_id = cpu_to_le32(fdir_data->fd_id);
145 
146 	/* Now program a dummy descriptor */
147 	i = tx_ring->next_to_use;
148 	tx_desc = I40E_TX_DESC(tx_ring, i);
149 	tx_buf = &tx_ring->tx_bi[i];
150 
151 	tx_ring->next_to_use = ((i + 1) < tx_ring->count) ? i + 1 : 0;
152 
153 	memset(tx_buf, 0, sizeof(struct i40e_tx_buffer));
154 
155 	/* record length, and DMA address */
156 	dma_unmap_len_set(tx_buf, len, I40E_FDIR_MAX_RAW_PACKET_SIZE);
157 	dma_unmap_addr_set(tx_buf, dma, dma);
158 
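	/* Program the dummy data descriptor: the DUMMY flag tells the
	 * hardware this packet only supplies the template for the preceding
	 * filter-programming descriptor and is not transmitted on the wire.
	 */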
159 	tx_desc->buffer_addr = cpu_to_le64(dma);
160 	td_cmd = I40E_TXD_CMD | I40E_TX_DESC_CMD_DUMMY;
161 
162 	tx_buf->tx_flags = I40E_TX_FLAGS_FD_SB;
163 	tx_buf->raw_buf = (void *)raw_packet;
164 
165 	tx_desc->cmd_type_offset_bsz =
166 		build_ctob(td_cmd, 0, I40E_FDIR_MAX_RAW_PACKET_SIZE, 0);
167 
168 	/* Force memory writes to complete before letting h/w
169 	 * know there are new descriptors to fetch.
170 	 */
171 	wmb();
172 
173 	/* Mark the data descriptor to be watched */
174 	first->next_to_watch = tx_desc;
175 
176 	writel(tx_ring->next_to_use, tx_ring->tail);
177 	return 0;
178 
dma_fail:
	return -ENOMEM;
181 }
182 
183 #define IP_HEADER_OFFSET 14
184 #define I40E_UDPIP_DUMMY_PACKET_LEN 42
185 /**
186  * i40e_add_del_fdir_udpv4 - Add/Remove UDPv4 filters
187  * @vsi: pointer to the targeted VSI
188  * @fd_data: the flow director data required for the FDir descriptor
189  * @add: true adds a filter, false removes it
190  *
191  * Returns 0 if the filters were successfully added or removed
192  **/
193 static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi,
194 				   struct i40e_fdir_filter *fd_data,
195 				   bool add)
196 {
197 	struct i40e_pf *pf = vsi->back;
198 	struct udphdr *udp;
199 	struct iphdr *ip;
200 	bool err = false;
201 	u8 *raw_packet;
202 	int ret;
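	/* Template frame: Ethernet header with EtherType 0x0800, a minimal
	 * IPv4 header with protocol UDP (0x11) and a zero UDP header; the
	 * addresses and ports are filled in below.
	 */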
203 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
204 		0x45, 0, 0, 0x1c, 0, 0, 0x40, 0, 0x40, 0x11, 0, 0, 0, 0, 0, 0,
205 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
206 
207 	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
208 	if (!raw_packet)
209 		return -ENOMEM;
210 	memcpy(raw_packet, packet, I40E_UDPIP_DUMMY_PACKET_LEN);
211 
212 	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
213 	udp = (struct udphdr *)(raw_packet + IP_HEADER_OFFSET
214 	      + sizeof(struct iphdr));
215 
216 	ip->daddr = fd_data->dst_ip[0];
217 	udp->dest = fd_data->dst_port;
218 	ip->saddr = fd_data->src_ip[0];
219 	udp->source = fd_data->src_port;
220 
221 	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
222 	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
223 	if (ret) {
224 		dev_info(&pf->pdev->dev,
225 			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
226 			 fd_data->pctype, fd_data->fd_id, ret);
227 		err = true;
228 	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
229 		if (add)
230 			dev_info(&pf->pdev->dev,
231 				 "Filter OK for PCTYPE %d loc = %d\n",
232 				 fd_data->pctype, fd_data->fd_id);
233 		else
234 			dev_info(&pf->pdev->dev,
235 				 "Filter deleted for PCTYPE %d loc = %d\n",
236 				 fd_data->pctype, fd_data->fd_id);
237 	}
238 	if (err)
239 		kfree(raw_packet);
240 
241 	return err ? -EOPNOTSUPP : 0;
242 }
243 
244 #define I40E_TCPIP_DUMMY_PACKET_LEN 54
245 /**
246  * i40e_add_del_fdir_tcpv4 - Add/Remove TCPv4 filters
247  * @vsi: pointer to the targeted VSI
248  * @fd_data: the flow director data required for the FDir descriptor
249  * @add: true adds a filter, false removes it
250  *
251  * Returns 0 if the filters were successfully added or removed
252  **/
253 static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi,
254 				   struct i40e_fdir_filter *fd_data,
255 				   bool add)
256 {
257 	struct i40e_pf *pf = vsi->back;
258 	struct tcphdr *tcp;
259 	struct iphdr *ip;
260 	bool err = false;
261 	u8 *raw_packet;
262 	int ret;
263 	/* Dummy packet */
264 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
265 		0x45, 0, 0, 0x28, 0, 0, 0x40, 0, 0x40, 0x6, 0, 0, 0, 0, 0, 0,
266 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80, 0x11,
267 		0x0, 0x72, 0, 0, 0, 0};
268 
269 	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
270 	if (!raw_packet)
271 		return -ENOMEM;
272 	memcpy(raw_packet, packet, I40E_TCPIP_DUMMY_PACKET_LEN);
273 
274 	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
275 	tcp = (struct tcphdr *)(raw_packet + IP_HEADER_OFFSET
276 	      + sizeof(struct iphdr));
277 
278 	ip->daddr = fd_data->dst_ip[0];
279 	tcp->dest = fd_data->dst_port;
280 	ip->saddr = fd_data->src_ip[0];
281 	tcp->source = fd_data->src_port;
282 
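	/* Sideband TCP/IPv4 rules conflict with ATR, so ATR is turned off
	 * while any such rule is installed and re-enabled once the last one
	 * is removed.
	 */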
283 	if (add) {
284 		pf->fd_tcp_rule++;
285 		if (pf->flags & I40E_FLAG_FD_ATR_ENABLED) {
286 			if (I40E_DEBUG_FD & pf->hw.debug_mask)
287 				dev_info(&pf->pdev->dev, "Forcing ATR off, sideband rules for TCP/IPv4 flow being applied\n");
288 			pf->flags &= ~I40E_FLAG_FD_ATR_ENABLED;
289 		}
290 	} else {
291 		pf->fd_tcp_rule = (pf->fd_tcp_rule > 0) ?
292 				  (pf->fd_tcp_rule - 1) : 0;
293 		if (pf->fd_tcp_rule == 0) {
294 			pf->flags |= I40E_FLAG_FD_ATR_ENABLED;
295 			if (I40E_DEBUG_FD & pf->hw.debug_mask)
296 				dev_info(&pf->pdev->dev, "ATR re-enabled due to no sideband TCP/IPv4 rules\n");
297 		}
298 	}
299 
300 	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP;
301 	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
302 
303 	if (ret) {
304 		dev_info(&pf->pdev->dev,
305 			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
306 			 fd_data->pctype, fd_data->fd_id, ret);
307 		err = true;
308 	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
309 		if (add)
			dev_info(&pf->pdev->dev, "Filter OK for PCTYPE %d loc = %d\n",
311 				 fd_data->pctype, fd_data->fd_id);
312 		else
313 			dev_info(&pf->pdev->dev,
314 				 "Filter deleted for PCTYPE %d loc = %d\n",
315 				 fd_data->pctype, fd_data->fd_id);
316 	}
317 
318 	if (err)
319 		kfree(raw_packet);
320 
321 	return err ? -EOPNOTSUPP : 0;
322 }
323 
324 /**
325  * i40e_add_del_fdir_sctpv4 - Add/Remove SCTPv4 Flow Director filters for
326  * a specific flow spec
327  * @vsi: pointer to the targeted VSI
328  * @fd_data: the flow director data required for the FDir descriptor
329  * @add: true adds a filter, false removes it
330  *
331  * Returns 0 if the filters were successfully added or removed
332  **/
333 static int i40e_add_del_fdir_sctpv4(struct i40e_vsi *vsi,
334 				    struct i40e_fdir_filter *fd_data,
335 				    bool add)
336 {
337 	return -EOPNOTSUPP;
338 }
339 
340 #define I40E_IP_DUMMY_PACKET_LEN 34
341 /**
342  * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow Director filters for
343  * a specific flow spec
344  * @vsi: pointer to the targeted VSI
345  * @fd_data: the flow director data required for the FDir descriptor
346  * @add: true adds a filter, false removes it
347  *
348  * Returns 0 if the filters were successfully added or removed
349  **/
350 static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi,
351 				  struct i40e_fdir_filter *fd_data,
352 				  bool add)
353 {
354 	struct i40e_pf *pf = vsi->back;
355 	struct iphdr *ip;
356 	bool err = false;
357 	u8 *raw_packet;
358 	int ret;
359 	int i;
360 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
361 		0x45, 0, 0, 0x14, 0, 0, 0x40, 0, 0x40, 0x10, 0, 0, 0, 0, 0, 0,
362 		0, 0, 0, 0};
363 
364 	for (i = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER;
365 	     i <= I40E_FILTER_PCTYPE_FRAG_IPV4;	i++) {
366 		raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
367 		if (!raw_packet)
368 			return -ENOMEM;
369 		memcpy(raw_packet, packet, I40E_IP_DUMMY_PACKET_LEN);
370 		ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
371 
372 		ip->saddr = fd_data->src_ip[0];
373 		ip->daddr = fd_data->dst_ip[0];
374 		ip->protocol = 0;
375 
376 		fd_data->pctype = i;
377 		ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
378 
379 		if (ret) {
380 			dev_info(&pf->pdev->dev,
381 				 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
382 				 fd_data->pctype, fd_data->fd_id, ret);
383 			err = true;
384 		} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
385 			if (add)
386 				dev_info(&pf->pdev->dev,
387 					 "Filter OK for PCTYPE %d loc = %d\n",
388 					 fd_data->pctype, fd_data->fd_id);
389 			else
390 				dev_info(&pf->pdev->dev,
391 					 "Filter deleted for PCTYPE %d loc = %d\n",
392 					 fd_data->pctype, fd_data->fd_id);
393 		}
394 	}
395 
396 	if (err)
397 		kfree(raw_packet);
398 
399 	return err ? -EOPNOTSUPP : 0;
400 }
401 
402 /**
403  * i40e_add_del_fdir - Build raw packets to add/del fdir filter
404  * @vsi: pointer to the targeted VSI
 * @input: the flow director filter spec to add or remove
406  * @add: true adds a filter, false removes it
407  *
408  **/
409 int i40e_add_del_fdir(struct i40e_vsi *vsi,
410 		      struct i40e_fdir_filter *input, bool add)
411 {
412 	struct i40e_pf *pf = vsi->back;
413 	int ret;
414 
415 	switch (input->flow_type & ~FLOW_EXT) {
416 	case TCP_V4_FLOW:
417 		ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
418 		break;
419 	case UDP_V4_FLOW:
420 		ret = i40e_add_del_fdir_udpv4(vsi, input, add);
421 		break;
422 	case SCTP_V4_FLOW:
423 		ret = i40e_add_del_fdir_sctpv4(vsi, input, add);
424 		break;
425 	case IPV4_FLOW:
426 		ret = i40e_add_del_fdir_ipv4(vsi, input, add);
427 		break;
428 	case IP_USER_FLOW:
429 		switch (input->ip4_proto) {
430 		case IPPROTO_TCP:
431 			ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
432 			break;
433 		case IPPROTO_UDP:
434 			ret = i40e_add_del_fdir_udpv4(vsi, input, add);
435 			break;
436 		case IPPROTO_SCTP:
437 			ret = i40e_add_del_fdir_sctpv4(vsi, input, add);
438 			break;
439 		default:
440 			ret = i40e_add_del_fdir_ipv4(vsi, input, add);
441 			break;
442 		}
443 		break;
444 	default:
		dev_info(&pf->pdev->dev, "Unsupported flow type %d\n",
446 			 input->flow_type);
447 		ret = -EINVAL;
448 	}
449 
450 	/* The buffer allocated here is freed by the i40e_clean_tx_ring() */
451 	return ret;
452 }
453 
454 /**
455  * i40e_fd_handle_status - check the Programming Status for FD
456  * @rx_ring: the Rx ring for this descriptor
 * @rx_desc: the Rx programming status descriptor (not a packet descriptor)
 * @prog_id: the id originally used for programming
 *
 * This is used to verify whether the FD programming or invalidation
 * requested of the HW by SW succeeded, and to take action accordingly.
462  **/
463 static void i40e_fd_handle_status(struct i40e_ring *rx_ring,
464 				  union i40e_rx_desc *rx_desc, u8 prog_id)
465 {
466 	struct i40e_pf *pf = rx_ring->vsi->back;
467 	struct pci_dev *pdev = pf->pdev;
468 	u32 fcnt_prog, fcnt_avail;
469 	u32 error;
470 	u64 qw;
471 
472 	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
473 	error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
474 		I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
475 
476 	if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
477 		pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
478 		if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
479 		    (I40E_DEBUG_FD & pf->hw.debug_mask))
480 			dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
481 				 pf->fd_inv);
482 
		/* Check if the programming error is for ATR.
		 * If so, auto disable ATR and set a state for a flush in
		 * progress. The next time we get here, do nothing while the
		 * flush is in progress; the state is cleared once the flush
		 * completes.
		 */
489 		if (test_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state))
490 			return;
491 
492 		pf->fd_add_err++;
493 		/* store the current atr filter count */
494 		pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);
495 
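		/* A filter ID of zero is treated as coming from an ATR
		 * (automatically added) filter rather than a sideband rule.
		 */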
496 		if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
497 		    (pf->auto_disable_flags & I40E_FLAG_FD_SB_ENABLED)) {
498 			pf->auto_disable_flags |= I40E_FLAG_FD_ATR_ENABLED;
499 			set_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state);
500 		}
501 
502 		/* filter programming failed most likely due to table full */
503 		fcnt_prog = i40e_get_global_fd_count(pf);
504 		fcnt_avail = pf->fdir_pf_filter_count;
		/* If ATR is running, fcnt_prog can change quickly. If we are
		 * very close to full, it makes sense to disable FD ATR/SB and
		 * then re-enable it when there is room.
		 */
509 		if (fcnt_prog >= (fcnt_avail - I40E_FDIR_BUFFER_FULL_MARGIN)) {
510 			if ((pf->flags & I40E_FLAG_FD_SB_ENABLED) &&
511 			    !(pf->auto_disable_flags &
512 				     I40E_FLAG_FD_SB_ENABLED)) {
513 				if (I40E_DEBUG_FD & pf->hw.debug_mask)
514 					dev_warn(&pdev->dev, "FD filter space full, new ntuple rules will not be added\n");
515 				pf->auto_disable_flags |=
516 							I40E_FLAG_FD_SB_ENABLED;
517 			}
518 		}
519 	} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
520 		if (I40E_DEBUG_FD & pf->hw.debug_mask)
521 			dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
522 				 rx_desc->wb.qword0.hi_dword.fd_id);
523 	}
524 }
525 
526 /**
527  * i40e_unmap_and_free_tx_resource - Release a Tx buffer
528  * @ring:      the ring that owns the buffer
529  * @tx_buffer: the buffer to free
530  **/
531 static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
532 					    struct i40e_tx_buffer *tx_buffer)
533 {
534 	if (tx_buffer->skb) {
535 		dev_kfree_skb_any(tx_buffer->skb);
536 		if (dma_unmap_len(tx_buffer, len))
537 			dma_unmap_single(ring->dev,
538 					 dma_unmap_addr(tx_buffer, dma),
539 					 dma_unmap_len(tx_buffer, len),
540 					 DMA_TO_DEVICE);
541 	} else if (dma_unmap_len(tx_buffer, len)) {
542 		dma_unmap_page(ring->dev,
543 			       dma_unmap_addr(tx_buffer, dma),
544 			       dma_unmap_len(tx_buffer, len),
545 			       DMA_TO_DEVICE);
546 	}
547 
548 	if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
549 		kfree(tx_buffer->raw_buf);
550 
551 	tx_buffer->next_to_watch = NULL;
552 	tx_buffer->skb = NULL;
553 	dma_unmap_len_set(tx_buffer, len, 0);
554 	/* tx_buffer must be completely set up in the transmit path */
555 }
556 
557 /**
 * i40e_clean_tx_ring - Free all Tx buffers in a ring
559  * @tx_ring: ring to be cleaned
560  **/
561 void i40e_clean_tx_ring(struct i40e_ring *tx_ring)
562 {
563 	unsigned long bi_size;
564 	u16 i;
565 
566 	/* ring already cleared, nothing to do */
567 	if (!tx_ring->tx_bi)
568 		return;
569 
570 	/* Free all the Tx ring sk_buffs */
571 	for (i = 0; i < tx_ring->count; i++)
572 		i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
573 
574 	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
575 	memset(tx_ring->tx_bi, 0, bi_size);
576 
577 	/* Zero out the descriptor ring */
578 	memset(tx_ring->desc, 0, tx_ring->size);
579 
580 	tx_ring->next_to_use = 0;
581 	tx_ring->next_to_clean = 0;
582 
583 	if (!tx_ring->netdev)
584 		return;
585 
586 	/* cleanup Tx queue statistics */
587 	netdev_tx_reset_queue(netdev_get_tx_queue(tx_ring->netdev,
588 						  tx_ring->queue_index));
589 }
590 
591 /**
592  * i40e_free_tx_resources - Free Tx resources per queue
593  * @tx_ring: Tx descriptor ring for a specific queue
594  *
595  * Free all transmit software resources
596  **/
597 void i40e_free_tx_resources(struct i40e_ring *tx_ring)
598 {
599 	i40e_clean_tx_ring(tx_ring);
600 	kfree(tx_ring->tx_bi);
601 	tx_ring->tx_bi = NULL;
602 
603 	if (tx_ring->desc) {
604 		dma_free_coherent(tx_ring->dev, tx_ring->size,
605 				  tx_ring->desc, tx_ring->dma);
606 		tx_ring->desc = NULL;
607 	}
608 }
609 
610 /**
 * i40e_get_tx_pending - how many Tx descriptors are not yet processed
 * @ring: the ring of descriptors
 * @in_sw: use the SW-tracked head instead of the HW head write-back
 *
 * Since there is no access to the ring head register
 * in XL710, we need to use our local copies.
617  **/
618 u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw)
619 {
620 	u32 head, tail;
621 
622 	if (!in_sw)
623 		head = i40e_get_head(ring);
624 	else
625 		head = ring->next_to_clean;
626 	tail = readl(ring->tail);
627 
628 	if (head != tail)
629 		return (head < tail) ?
630 			tail - head : (tail + ring->count - head);
631 
632 	return 0;
633 }
634 
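/* Force a descriptor write-back after a clean pass if no more than WB_STRIDE
 * descriptors are still pending (see the WB_ON_ITR handling in
 * i40e_clean_tx_irq()).
 */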
635 #define WB_STRIDE 0x3
636 
637 /**
638  * i40e_clean_tx_irq - Reclaim resources after transmit completes
639  * @tx_ring:  tx ring to clean
640  * @budget:   how many cleans we're allowed
641  *
 * Returns true if there's any budget left (i.e. the clean is finished)
643  **/
644 static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget)
645 {
646 	u16 i = tx_ring->next_to_clean;
647 	struct i40e_tx_buffer *tx_buf;
648 	struct i40e_tx_desc *tx_head;
649 	struct i40e_tx_desc *tx_desc;
650 	unsigned int total_packets = 0;
651 	unsigned int total_bytes = 0;
652 
653 	tx_buf = &tx_ring->tx_bi[i];
654 	tx_desc = I40E_TX_DESC(tx_ring, i);
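	/* bias i by -count (relying on u16 wrap-around) so the end-of-ring
	 * check below reduces to a simple test against zero
	 */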
655 	i -= tx_ring->count;
656 
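	/* The head write-back value lives in the u32 reserved past the end
	 * of the descriptor ring (see i40e_setup_tx_descriptors()); it tells
	 * us how far the hardware has progressed without per-descriptor
	 * write-backs.
	 */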
657 	tx_head = I40E_TX_DESC(tx_ring, i40e_get_head(tx_ring));
658 
659 	do {
660 		struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch;
661 
662 		/* if next_to_watch is not set then there is no work pending */
663 		if (!eop_desc)
664 			break;
665 
666 		/* prevent any other reads prior to eop_desc */
667 		read_barrier_depends();
668 
669 		/* we have caught up to head, no work left to do */
670 		if (tx_head == tx_desc)
671 			break;
672 
673 		/* clear next_to_watch to prevent false hangs */
674 		tx_buf->next_to_watch = NULL;
675 
676 		/* update the statistics for this packet */
677 		total_bytes += tx_buf->bytecount;
678 		total_packets += tx_buf->gso_segs;
679 
680 		/* free the skb */
681 		dev_consume_skb_any(tx_buf->skb);
682 
683 		/* unmap skb header data */
684 		dma_unmap_single(tx_ring->dev,
685 				 dma_unmap_addr(tx_buf, dma),
686 				 dma_unmap_len(tx_buf, len),
687 				 DMA_TO_DEVICE);
688 
689 		/* clear tx_buffer data */
690 		tx_buf->skb = NULL;
691 		dma_unmap_len_set(tx_buf, len, 0);
692 
693 		/* unmap remaining buffers */
694 		while (tx_desc != eop_desc) {
695 
696 			tx_buf++;
697 			tx_desc++;
698 			i++;
699 			if (unlikely(!i)) {
700 				i -= tx_ring->count;
701 				tx_buf = tx_ring->tx_bi;
702 				tx_desc = I40E_TX_DESC(tx_ring, 0);
703 			}
704 
705 			/* unmap any remaining paged data */
706 			if (dma_unmap_len(tx_buf, len)) {
707 				dma_unmap_page(tx_ring->dev,
708 					       dma_unmap_addr(tx_buf, dma),
709 					       dma_unmap_len(tx_buf, len),
710 					       DMA_TO_DEVICE);
711 				dma_unmap_len_set(tx_buf, len, 0);
712 			}
713 		}
714 
715 		/* move us one more past the eop_desc for start of next pkt */
716 		tx_buf++;
717 		tx_desc++;
718 		i++;
719 		if (unlikely(!i)) {
720 			i -= tx_ring->count;
721 			tx_buf = tx_ring->tx_bi;
722 			tx_desc = I40E_TX_DESC(tx_ring, 0);
723 		}
724 
725 		prefetch(tx_desc);
726 
727 		/* update budget accounting */
728 		budget--;
729 	} while (likely(budget));
730 
731 	i += tx_ring->count;
732 	tx_ring->next_to_clean = i;
733 	u64_stats_update_begin(&tx_ring->syncp);
734 	tx_ring->stats.bytes += total_bytes;
735 	tx_ring->stats.packets += total_packets;
736 	u64_stats_update_end(&tx_ring->syncp);
737 	tx_ring->q_vector->tx.total_bytes += total_bytes;
738 	tx_ring->q_vector->tx.total_packets += total_packets;
739 
740 	if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) {
741 		unsigned int j = 0;
742 
		/* Check whether fewer than WB_STRIDE + 1 descriptors are
		 * still waiting to be written back; if so, kick the hardware
		 * to force the write-back in case we stay in NAPI.
		 * In this mode on X722 we do not enable interrupts.
747 		 */
748 		j = i40e_get_tx_pending(tx_ring, false);
749 
750 		if (budget &&
751 		    ((j / (WB_STRIDE + 1)) == 0) && (j != 0) &&
752 		    !test_bit(__I40E_DOWN, &tx_ring->vsi->state) &&
753 		    (I40E_DESC_UNUSED(tx_ring) != tx_ring->count))
754 			tx_ring->arm_wb = true;
755 	}
756 
757 	netdev_tx_completed_queue(netdev_get_tx_queue(tx_ring->netdev,
758 						      tx_ring->queue_index),
759 				  total_packets, total_bytes);
760 
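	/* Only wake the queue once at least TX_WAKE_THRESHOLD descriptors
	 * are free, giving the next transmit some headroom before it could
	 * stop the queue again.
	 */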
761 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
762 	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
763 		     (I40E_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
764 		/* Make sure that anybody stopping the queue after this
765 		 * sees the new next_to_clean.
766 		 */
767 		smp_mb();
768 		if (__netif_subqueue_stopped(tx_ring->netdev,
769 					     tx_ring->queue_index) &&
770 		   !test_bit(__I40E_DOWN, &tx_ring->vsi->state)) {
771 			netif_wake_subqueue(tx_ring->netdev,
772 					    tx_ring->queue_index);
773 			++tx_ring->tx_stats.restart_queue;
774 		}
775 	}
776 
777 	return !!budget;
778 }
779 
780 /**
781  * i40e_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled
782  * @vsi: the VSI we care about
783  * @q_vector: the vector on which to enable writeback
784  *
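 * In WB_ON_ITR mode descriptor write-backs are deferred until the ITR
 * expires; this nudges the hardware to flush any pending write-backs
 * without enabling the interrupt.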
785  **/
786 static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi,
787 				  struct i40e_q_vector *q_vector)
788 {
789 	u16 flags = q_vector->tx.ring[0].flags;
790 	u32 val;
791 
792 	if (!(flags & I40E_TXR_FLAGS_WB_ON_ITR))
793 		return;
794 
795 	if (q_vector->arm_wb_state)
796 		return;
797 
798 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
799 		val = I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK |
800 		      I40E_PFINT_DYN_CTLN_ITR_INDX_MASK; /* set noitr */
801 
802 		wr32(&vsi->back->hw,
803 		     I40E_PFINT_DYN_CTLN(q_vector->v_idx + vsi->base_vector - 1),
804 		     val);
805 	} else {
806 		val = I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK |
807 		      I40E_PFINT_DYN_CTL0_ITR_INDX_MASK; /* set noitr */
808 
809 		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
810 	}
811 	q_vector->arm_wb_state = true;
812 }
813 
814 /**
815  * i40e_force_wb - Issue SW Interrupt so HW does a wb
816  * @vsi: the VSI we care about
 * @q_vector: the vector on which to force writeback
818  *
819  **/
820 void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector)
821 {
822 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
823 		u32 val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
824 			  I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */
825 			  I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK |
826 			  I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK;
827 			  /* allow 00 to be written to the index */
828 
829 		wr32(&vsi->back->hw,
830 		     I40E_PFINT_DYN_CTLN(q_vector->v_idx +
831 					 vsi->base_vector - 1), val);
832 	} else {
833 		u32 val = I40E_PFINT_DYN_CTL0_INTENA_MASK |
834 			  I40E_PFINT_DYN_CTL0_ITR_INDX_MASK | /* set noitr */
835 			  I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK |
836 			  I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK;
837 			/* allow 00 to be written to the index */
838 
839 		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
840 	}
841 }
842 
843 /**
844  * i40e_set_new_dynamic_itr - Find new ITR level
845  * @rc: structure containing ring performance data
846  *
847  * Returns true if ITR changed, false if not
848  *
849  * Stores a new ITR value based on packets and byte counts during
850  * the last interrupt.  The advantage of per interrupt computation
851  * is faster updates and more accurate ITR for the current traffic
852  * pattern.  Constants in this function were computed based on
853  * theoretical maximum wire speed and thresholds were set based on
854  * testing data as well as attempting to minimize response time
855  * while increasing bulk throughput.
856  **/
857 static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
858 {
859 	enum i40e_latency_range new_latency_range = rc->latency_range;
860 	struct i40e_q_vector *qv = rc->ring->q_vector;
861 	u32 new_itr = rc->itr;
862 	int bytes_per_int;
863 	int usecs;
864 
865 	if (rc->total_packets == 0 || !rc->itr)
866 		return false;
867 
	/* simple throttle rate management
	 *   0-10MB/s   lowest (50000 ints/s)
	 *  10-20MB/s   low    (20000 ints/s)
	 *  20-1249MB/s bulk   (18000 ints/s)
	 *  > 40000 Rx packets per second (8000 ints/s)
	 *
	 * The math works out because the divisor is in 10^(-6), which
	 * turns the bytes/us input value into MB/s values. Make sure to
	 * use usecs, as the values written to the ITR registers are in
	 * 2 usec increments, and use the smoothed values that the
	 * countdown timer gives us.
879 	 */
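	/* rc->itr is in 2 usec units, so (itr << 1) is the interrupt interval
	 * in usecs; multiplying by ITR_COUNTDOWN_START approximates the
	 * window over which total_bytes was accumulated.
	 */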
880 	usecs = (rc->itr << 1) * ITR_COUNTDOWN_START;
881 	bytes_per_int = rc->total_bytes / usecs;
882 
883 	switch (new_latency_range) {
884 	case I40E_LOWEST_LATENCY:
885 		if (bytes_per_int > 10)
886 			new_latency_range = I40E_LOW_LATENCY;
887 		break;
888 	case I40E_LOW_LATENCY:
889 		if (bytes_per_int > 20)
890 			new_latency_range = I40E_BULK_LATENCY;
891 		else if (bytes_per_int <= 10)
892 			new_latency_range = I40E_LOWEST_LATENCY;
893 		break;
894 	case I40E_BULK_LATENCY:
895 	case I40E_ULTRA_LATENCY:
896 	default:
897 		if (bytes_per_int <= 20)
898 			new_latency_range = I40E_LOW_LATENCY;
899 		break;
900 	}
901 
902 	/* this is to adjust RX more aggressively when streaming small
903 	 * packets.  The value of 40000 was picked as it is just beyond
904 	 * what the hardware can receive per second if in low latency
905 	 * mode.
906 	 */
907 #define RX_ULTRA_PACKET_RATE 40000
908 
909 	if ((((rc->total_packets * 1000000) / usecs) > RX_ULTRA_PACKET_RATE) &&
910 	    (&qv->rx == rc))
911 		new_latency_range = I40E_ULTRA_LATENCY;
912 
913 	rc->latency_range = new_latency_range;
914 
915 	switch (new_latency_range) {
916 	case I40E_LOWEST_LATENCY:
917 		new_itr = I40E_ITR_50K;
918 		break;
919 	case I40E_LOW_LATENCY:
920 		new_itr = I40E_ITR_20K;
921 		break;
922 	case I40E_BULK_LATENCY:
923 		new_itr = I40E_ITR_18K;
924 		break;
925 	case I40E_ULTRA_LATENCY:
926 		new_itr = I40E_ITR_8K;
927 		break;
928 	default:
929 		break;
930 	}
931 
932 	rc->total_bytes = 0;
933 	rc->total_packets = 0;
934 
935 	if (new_itr != rc->itr) {
936 		rc->itr = new_itr;
937 		return true;
938 	}
939 
940 	return false;
941 }
942 
943 /**
944  * i40e_clean_programming_status - clean the programming status descriptor
945  * @rx_ring: the rx ring that has this descriptor
946  * @rx_desc: the rx descriptor written back by HW
947  *
948  * Flow director should handle FD_FILTER_STATUS to check its filter programming
949  * status being successful or not and take actions accordingly. FCoE should
950  * handle its context/filter programming/invalidation status and take actions.
951  *
952  **/
953 static void i40e_clean_programming_status(struct i40e_ring *rx_ring,
954 					  union i40e_rx_desc *rx_desc)
955 {
956 	u64 qw;
957 	u8 id;
958 
959 	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
960 	id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
961 		  I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
962 
963 	if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
964 		i40e_fd_handle_status(rx_ring, rx_desc, id);
965 #ifdef I40E_FCOE
966 	else if ((id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_PROG_STATUS) ||
967 		 (id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_INVL_STATUS))
968 		i40e_fcoe_handle_status(rx_ring, rx_desc, id);
969 #endif
970 }
971 
972 /**
973  * i40e_setup_tx_descriptors - Allocate the Tx descriptors
974  * @tx_ring: the tx ring to set up
975  *
976  * Return 0 on success, negative on error
977  **/
978 int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
979 {
980 	struct device *dev = tx_ring->dev;
981 	int bi_size;
982 
983 	if (!dev)
984 		return -ENOMEM;
985 
986 	/* warn if we are about to overwrite the pointer */
987 	WARN_ON(tx_ring->tx_bi);
988 	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
989 	tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
990 	if (!tx_ring->tx_bi)
991 		goto err;
992 
993 	/* round up to nearest 4K */
994 	tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc);
	/* add a u32 for the head writeback; the alignment below then
	 * guarantees the ring is at least one cache line in size
997 	 */
998 	tx_ring->size += sizeof(u32);
999 	tx_ring->size = ALIGN(tx_ring->size, 4096);
1000 	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
1001 					   &tx_ring->dma, GFP_KERNEL);
1002 	if (!tx_ring->desc) {
1003 		dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
1004 			 tx_ring->size);
1005 		goto err;
1006 	}
1007 
1008 	tx_ring->next_to_use = 0;
1009 	tx_ring->next_to_clean = 0;
1010 	return 0;
1011 
1012 err:
1013 	kfree(tx_ring->tx_bi);
1014 	tx_ring->tx_bi = NULL;
1015 	return -ENOMEM;
1016 }
1017 
1018 /**
1019  * i40e_clean_rx_ring - Free Rx buffers
1020  * @rx_ring: ring to be cleaned
1021  **/
1022 void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
1023 {
1024 	struct device *dev = rx_ring->dev;
1025 	struct i40e_rx_buffer *rx_bi;
1026 	unsigned long bi_size;
1027 	u16 i;
1028 
1029 	/* ring already cleared, nothing to do */
1030 	if (!rx_ring->rx_bi)
1031 		return;
1032 
1033 	if (ring_is_ps_enabled(rx_ring)) {
1034 		int bufsz = ALIGN(rx_ring->rx_hdr_len, 256) * rx_ring->count;
1035 
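		/* The header buffers come from a single coherent allocation
		 * made in i40e_alloc_rx_headers(), so freeing the first
		 * buffer releases the whole block.
		 */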
1036 		rx_bi = &rx_ring->rx_bi[0];
1037 		if (rx_bi->hdr_buf) {
1038 			dma_free_coherent(dev,
1039 					  bufsz,
1040 					  rx_bi->hdr_buf,
1041 					  rx_bi->dma);
1042 			for (i = 0; i < rx_ring->count; i++) {
1043 				rx_bi = &rx_ring->rx_bi[i];
1044 				rx_bi->dma = 0;
1045 				rx_bi->hdr_buf = NULL;
1046 			}
1047 		}
1048 	}
1049 	/* Free all the Rx ring sk_buffs */
1050 	for (i = 0; i < rx_ring->count; i++) {
1051 		rx_bi = &rx_ring->rx_bi[i];
1052 		if (rx_bi->dma) {
1053 			dma_unmap_single(dev,
1054 					 rx_bi->dma,
1055 					 rx_ring->rx_buf_len,
1056 					 DMA_FROM_DEVICE);
1057 			rx_bi->dma = 0;
1058 		}
1059 		if (rx_bi->skb) {
1060 			dev_kfree_skb(rx_bi->skb);
1061 			rx_bi->skb = NULL;
1062 		}
1063 		if (rx_bi->page) {
1064 			if (rx_bi->page_dma) {
1065 				dma_unmap_page(dev,
1066 					       rx_bi->page_dma,
1067 					       PAGE_SIZE,
1068 					       DMA_FROM_DEVICE);
1069 				rx_bi->page_dma = 0;
1070 			}
1071 			__free_page(rx_bi->page);
1072 			rx_bi->page = NULL;
1073 			rx_bi->page_offset = 0;
1074 		}
1075 	}
1076 
1077 	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
1078 	memset(rx_ring->rx_bi, 0, bi_size);
1079 
1080 	/* Zero out the descriptor ring */
1081 	memset(rx_ring->desc, 0, rx_ring->size);
1082 
1083 	rx_ring->next_to_clean = 0;
1084 	rx_ring->next_to_use = 0;
1085 }
1086 
1087 /**
1088  * i40e_free_rx_resources - Free Rx resources
1089  * @rx_ring: ring to clean the resources from
1090  *
1091  * Free all receive software resources
1092  **/
1093 void i40e_free_rx_resources(struct i40e_ring *rx_ring)
1094 {
1095 	i40e_clean_rx_ring(rx_ring);
1096 	kfree(rx_ring->rx_bi);
1097 	rx_ring->rx_bi = NULL;
1098 
1099 	if (rx_ring->desc) {
1100 		dma_free_coherent(rx_ring->dev, rx_ring->size,
1101 				  rx_ring->desc, rx_ring->dma);
1102 		rx_ring->desc = NULL;
1103 	}
1104 }
1105 
1106 /**
1107  * i40e_alloc_rx_headers - allocate rx header buffers
1108  * @rx_ring: ring to alloc buffers
1109  *
1110  * Allocate rx header buffers for the entire ring. As these are static,
1111  * this is only called when setting up a new ring.
1112  **/
1113 void i40e_alloc_rx_headers(struct i40e_ring *rx_ring)
1114 {
1115 	struct device *dev = rx_ring->dev;
1116 	struct i40e_rx_buffer *rx_bi;
1117 	dma_addr_t dma;
1118 	void *buffer;
1119 	int buf_size;
1120 	int i;
1121 
1122 	if (rx_ring->rx_bi[0].hdr_buf)
1123 		return;
1124 	/* Make sure the buffers don't cross cache line boundaries. */
1125 	buf_size = ALIGN(rx_ring->rx_hdr_len, 256);
1126 	buffer = dma_alloc_coherent(dev, buf_size * rx_ring->count,
1127 				    &dma, GFP_KERNEL);
1128 	if (!buffer)
1129 		return;
1130 	for (i = 0; i < rx_ring->count; i++) {
1131 		rx_bi = &rx_ring->rx_bi[i];
1132 		rx_bi->dma = dma + (i * buf_size);
1133 		rx_bi->hdr_buf = buffer + (i * buf_size);
1134 	}
1135 }
1136 
1137 /**
1138  * i40e_setup_rx_descriptors - Allocate Rx descriptors
1139  * @rx_ring: Rx descriptor ring (for a specific queue) to setup
1140  *
1141  * Returns 0 on success, negative on failure
1142  **/
1143 int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
1144 {
1145 	struct device *dev = rx_ring->dev;
1146 	int bi_size;
1147 
1148 	/* warn if we are about to overwrite the pointer */
1149 	WARN_ON(rx_ring->rx_bi);
1150 	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
1151 	rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
1152 	if (!rx_ring->rx_bi)
1153 		goto err;
1154 
1155 	u64_stats_init(&rx_ring->syncp);
1156 
1157 	/* Round up to nearest 4K */
1158 	rx_ring->size = ring_is_16byte_desc_enabled(rx_ring)
1159 		? rx_ring->count * sizeof(union i40e_16byte_rx_desc)
1160 		: rx_ring->count * sizeof(union i40e_32byte_rx_desc);
1161 	rx_ring->size = ALIGN(rx_ring->size, 4096);
1162 	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
1163 					   &rx_ring->dma, GFP_KERNEL);
1164 
1165 	if (!rx_ring->desc) {
1166 		dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
1167 			 rx_ring->size);
1168 		goto err;
1169 	}
1170 
1171 	rx_ring->next_to_clean = 0;
1172 	rx_ring->next_to_use = 0;
1173 
1174 	return 0;
1175 err:
1176 	kfree(rx_ring->rx_bi);
1177 	rx_ring->rx_bi = NULL;
1178 	return -ENOMEM;
1179 }
1180 
1181 /**
 * i40e_release_rx_desc - Bump the tail to hand new descriptors to HW
 * @rx_ring: ring to bump
 * @val: the new next_to_use value to write to the tail register
1185  **/
1186 static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
1187 {
1188 	rx_ring->next_to_use = val;
1189 	/* Force memory writes to complete before letting h/w
1190 	 * know there are new descriptors to fetch.  (Only
1191 	 * applicable for weak-ordered memory model archs,
1192 	 * such as IA-64).
1193 	 */
1194 	wmb();
1195 	writel(val, rx_ring->tail);
1196 }
1197 
1198 /**
1199  * i40e_alloc_rx_buffers_ps - Replace used receive buffers; packet split
1200  * @rx_ring: ring to place buffers on
1201  * @cleaned_count: number of buffers to replace
1202  *
1203  * Returns true if any errors on allocation
1204  **/
1205 bool i40e_alloc_rx_buffers_ps(struct i40e_ring *rx_ring, u16 cleaned_count)
1206 {
1207 	u16 i = rx_ring->next_to_use;
1208 	union i40e_rx_desc *rx_desc;
1209 	struct i40e_rx_buffer *bi;
1210 	const int current_node = numa_node_id();
1211 
1212 	/* do nothing if no valid netdev defined */
1213 	if (!rx_ring->netdev || !cleaned_count)
1214 		return false;
1215 
1216 	while (cleaned_count--) {
1217 		rx_desc = I40E_RX_DESC(rx_ring, i);
1218 		bi = &rx_ring->rx_bi[i];
1219 
1220 		if (bi->skb) /* desc is in use */
1221 			goto no_buffers;
1222 
		/* If we've been moved to a different NUMA node, release the
		 * page so we can get a new one on the current node.
		 */
1226 		if (bi->page &&  page_to_nid(bi->page) != current_node) {
1227 			dma_unmap_page(rx_ring->dev,
1228 				       bi->page_dma,
1229 				       PAGE_SIZE,
1230 				       DMA_FROM_DEVICE);
1231 			__free_page(bi->page);
1232 			bi->page = NULL;
1233 			bi->page_dma = 0;
1234 			rx_ring->rx_stats.realloc_count++;
1235 		} else if (bi->page) {
1236 			rx_ring->rx_stats.page_reuse_count++;
1237 		}
1238 
1239 		if (!bi->page) {
1240 			bi->page = alloc_page(GFP_ATOMIC);
1241 			if (!bi->page) {
1242 				rx_ring->rx_stats.alloc_page_failed++;
1243 				goto no_buffers;
1244 			}
1245 			bi->page_dma = dma_map_page(rx_ring->dev,
1246 						    bi->page,
1247 						    0,
1248 						    PAGE_SIZE,
1249 						    DMA_FROM_DEVICE);
1250 			if (dma_mapping_error(rx_ring->dev, bi->page_dma)) {
1251 				rx_ring->rx_stats.alloc_page_failed++;
1252 				__free_page(bi->page);
1253 				bi->page = NULL;
1254 				bi->page_dma = 0;
1255 				bi->page_offset = 0;
1256 				goto no_buffers;
1257 			}
1258 			bi->page_offset = 0;
1259 		}
1260 
1261 		/* Refresh the desc even if buffer_addrs didn't change
1262 		 * because each write-back erases this info.
1263 		 */
1264 		rx_desc->read.pkt_addr =
1265 				cpu_to_le64(bi->page_dma + bi->page_offset);
1266 		rx_desc->read.hdr_addr = cpu_to_le64(bi->dma);
1267 		i++;
1268 		if (i == rx_ring->count)
1269 			i = 0;
1270 	}
1271 
1272 	if (rx_ring->next_to_use != i)
1273 		i40e_release_rx_desc(rx_ring, i);
1274 
1275 	return false;
1276 
1277 no_buffers:
1278 	if (rx_ring->next_to_use != i)
1279 		i40e_release_rx_desc(rx_ring, i);
1280 
1281 	/* make sure to come back via polling to try again after
1282 	 * allocation failure
1283 	 */
1284 	return true;
1285 }
1286 
1287 /**
1288  * i40e_alloc_rx_buffers_1buf - Replace used receive buffers; single buffer
1289  * @rx_ring: ring to place buffers on
1290  * @cleaned_count: number of buffers to replace
1291  *
1292  * Returns true if any errors on allocation
1293  **/
1294 bool i40e_alloc_rx_buffers_1buf(struct i40e_ring *rx_ring, u16 cleaned_count)
1295 {
1296 	u16 i = rx_ring->next_to_use;
1297 	union i40e_rx_desc *rx_desc;
1298 	struct i40e_rx_buffer *bi;
1299 	struct sk_buff *skb;
1300 
1301 	/* do nothing if no valid netdev defined */
1302 	if (!rx_ring->netdev || !cleaned_count)
1303 		return false;
1304 
1305 	while (cleaned_count--) {
1306 		rx_desc = I40E_RX_DESC(rx_ring, i);
1307 		bi = &rx_ring->rx_bi[i];
1308 		skb = bi->skb;
1309 
1310 		if (!skb) {
1311 			skb = __netdev_alloc_skb_ip_align(rx_ring->netdev,
1312 							  rx_ring->rx_buf_len,
1313 							  GFP_ATOMIC |
1314 							  __GFP_NOWARN);
1315 			if (!skb) {
1316 				rx_ring->rx_stats.alloc_buff_failed++;
1317 				goto no_buffers;
1318 			}
1319 			/* initialize queue mapping */
1320 			skb_record_rx_queue(skb, rx_ring->queue_index);
1321 			bi->skb = skb;
1322 		}
1323 
1324 		if (!bi->dma) {
1325 			bi->dma = dma_map_single(rx_ring->dev,
1326 						 skb->data,
1327 						 rx_ring->rx_buf_len,
1328 						 DMA_FROM_DEVICE);
1329 			if (dma_mapping_error(rx_ring->dev, bi->dma)) {
1330 				rx_ring->rx_stats.alloc_buff_failed++;
1331 				bi->dma = 0;
1332 				dev_kfree_skb(bi->skb);
1333 				bi->skb = NULL;
1334 				goto no_buffers;
1335 			}
1336 		}
1337 
1338 		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
1339 		rx_desc->read.hdr_addr = 0;
1340 		i++;
1341 		if (i == rx_ring->count)
1342 			i = 0;
1343 	}
1344 
1345 	if (rx_ring->next_to_use != i)
1346 		i40e_release_rx_desc(rx_ring, i);
1347 
1348 	return false;
1349 
1350 no_buffers:
1351 	if (rx_ring->next_to_use != i)
1352 		i40e_release_rx_desc(rx_ring, i);
1353 
1354 	/* make sure to come back via polling to try again after
1355 	 * allocation failure
1356 	 */
1357 	return true;
1358 }
1359 
1360 /**
1361  * i40e_receive_skb - Send a completed packet up the stack
1362  * @rx_ring:  rx ring in play
1363  * @skb: packet to send up
1364  * @vlan_tag: vlan tag for packet
1365  **/
1366 static void i40e_receive_skb(struct i40e_ring *rx_ring,
1367 			     struct sk_buff *skb, u16 vlan_tag)
1368 {
1369 	struct i40e_q_vector *q_vector = rx_ring->q_vector;
1370 
1371 	if (vlan_tag & VLAN_VID_MASK)
1372 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
1373 
1374 	napi_gro_receive(&q_vector->napi, skb);
1375 }
1376 
1377 /**
1378  * i40e_rx_checksum - Indicate in skb if hw indicated a good cksum
1379  * @vsi: the VSI we care about
1380  * @skb: skb currently being received and modified
1381  * @rx_status: status value of last descriptor in packet
1382  * @rx_error: error value of last descriptor in packet
1383  * @rx_ptype: ptype value of last descriptor in packet
1384  **/
1385 static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
1386 				    struct sk_buff *skb,
1387 				    u32 rx_status,
1388 				    u32 rx_error,
1389 				    u16 rx_ptype)
1390 {
1391 	struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(rx_ptype);
1392 	bool ipv4, ipv6, ipv4_tunnel, ipv6_tunnel;
1393 
1394 	skb->ip_summed = CHECKSUM_NONE;
1395 
	/* is Rx checksum offload enabled? */
1397 	if (!(vsi->netdev->features & NETIF_F_RXCSUM))
1398 		return;
1399 
1400 	/* did the hardware decode the packet and checksum? */
1401 	if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT)))
1402 		return;
1403 
1404 	/* both known and outer_ip must be set for the below code to work */
1405 	if (!(decoded.known && decoded.outer_ip))
1406 		return;
1407 
1408 	ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
1409 	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4);
1410 	ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
1411 	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6);
1412 
1413 	if (ipv4 &&
1414 	    (rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) |
1415 			 BIT(I40E_RX_DESC_ERROR_EIPE_SHIFT))))
1416 		goto checksum_fail;
1417 
1418 	/* likely incorrect csum if alternate IP extension headers found */
1419 	if (ipv6 &&
1420 	    rx_status & BIT(I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))
1421 		/* don't increment checksum err here, non-fatal err */
1422 		return;
1423 
1424 	/* there was some L4 error, count error and punt packet to the stack */
1425 	if (rx_error & BIT(I40E_RX_DESC_ERROR_L4E_SHIFT))
1426 		goto checksum_fail;
1427 
1428 	/* handle packets that were not able to be checksummed due
1429 	 * to arrival speed, in this case the stack can compute
1430 	 * the csum.
1431 	 */
1432 	if (rx_error & BIT(I40E_RX_DESC_ERROR_PPRS_SHIFT))
1433 		return;
1434 
	/* The hardware supported by this driver does not validate outer
	 * checksums for tunneled VXLAN or GENEVE frames.  The specification
	 * only says that you "MAY validate" them, so it is not a hard
	 * requirement; since we have validated the inner checksum, report
	 * CHECKSUM_UNNECESSARY.
1440 	 */
1441 
1442 	ipv4_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT4_MAC_PAY3) &&
1443 		     (rx_ptype <= I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4);
1444 	ipv6_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT6_MAC_PAY3) &&
1445 		     (rx_ptype <= I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4);
1446 
1447 	skb->ip_summed = CHECKSUM_UNNECESSARY;
1448 	skb->csum_level = ipv4_tunnel || ipv6_tunnel;
1449 
1450 	return;
1451 
1452 checksum_fail:
1453 	vsi->back->hw_csum_rx_error++;
1454 }
1455 
1456 /**
1457  * i40e_ptype_to_htype - get a hash type
1458  * @ptype: the ptype value from the descriptor
1459  *
1460  * Returns a hash type to be used by skb_set_hash
1461  **/
1462 static inline enum pkt_hash_types i40e_ptype_to_htype(u8 ptype)
1463 {
1464 	struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);
1465 
1466 	if (!decoded.known)
1467 		return PKT_HASH_TYPE_NONE;
1468 
1469 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1470 	    decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4)
1471 		return PKT_HASH_TYPE_L4;
1472 	else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1473 		 decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3)
1474 		return PKT_HASH_TYPE_L3;
1475 	else
1476 		return PKT_HASH_TYPE_L2;
1477 }
1478 
1479 /**
1480  * i40e_rx_hash - set the hash value in the skb
1481  * @ring: descriptor ring
1482  * @rx_desc: specific descriptor
1483  **/
1484 static inline void i40e_rx_hash(struct i40e_ring *ring,
1485 				union i40e_rx_desc *rx_desc,
1486 				struct sk_buff *skb,
1487 				u8 rx_ptype)
1488 {
1489 	u32 hash;
1490 	const __le64 rss_mask  =
1491 		cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH <<
1492 			    I40E_RX_DESC_STATUS_FLTSTAT_SHIFT);
1493 
	if (!(ring->netdev->features & NETIF_F_RXHASH))
1495 		return;
1496 
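	/* The FLTSTAT field reports what qword0.hi_dword carries; only when
	 * it equals FLTSTAT_RSS_HASH does that dword hold a valid RSS hash
	 * (otherwise it may be e.g. a Flow Director filter ID).
	 */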
1497 	if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
1498 		hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
1499 		skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype));
1500 	}
1501 }
1502 
1503 /**
1504  * i40e_clean_rx_irq_ps - Reclaim resources after receive; packet split
1505  * @rx_ring:  rx ring to clean
1506  * @budget:   how many cleans we're allowed
1507  *
 * Returns the number of packets cleaned, or the budget on allocation failure
1509  **/
1510 static int i40e_clean_rx_irq_ps(struct i40e_ring *rx_ring, const int budget)
1511 {
1512 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
1513 	u16 rx_packet_len, rx_header_len, rx_sph, rx_hbo;
1514 	u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
1515 	struct i40e_vsi *vsi = rx_ring->vsi;
1516 	u16 i = rx_ring->next_to_clean;
1517 	union i40e_rx_desc *rx_desc;
1518 	u32 rx_error, rx_status;
1519 	bool failure = false;
1520 	u8 rx_ptype;
1521 	u64 qword;
1522 	u32 copysize;
1523 
1524 	if (budget <= 0)
1525 		return 0;
1526 
1527 	do {
1528 		struct i40e_rx_buffer *rx_bi;
1529 		struct sk_buff *skb;
1530 		u16 vlan_tag;
1531 		/* return some buffers to hardware, one at a time is too slow */
1532 		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
1533 			failure = failure ||
1534 				  i40e_alloc_rx_buffers_ps(rx_ring,
1535 							   cleaned_count);
1536 			cleaned_count = 0;
1537 		}
1538 
1539 		i = rx_ring->next_to_clean;
1540 		rx_desc = I40E_RX_DESC(rx_ring, i);
1541 		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1542 		rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
1543 			I40E_RXD_QW1_STATUS_SHIFT;
1544 
1545 		if (!(rx_status & BIT(I40E_RX_DESC_STATUS_DD_SHIFT)))
1546 			break;
1547 
1548 		/* This memory barrier is needed to keep us from reading
1549 		 * any other fields out of the rx_desc until we know the
1550 		 * DD bit is set.
1551 		 */
1552 		dma_rmb();
1553 		/* sync header buffer for reading */
1554 		dma_sync_single_range_for_cpu(rx_ring->dev,
1555 					      rx_ring->rx_bi[0].dma,
1556 					      i * rx_ring->rx_hdr_len,
1557 					      rx_ring->rx_hdr_len,
1558 					      DMA_FROM_DEVICE);
1559 		if (i40e_rx_is_programming_status(qword)) {
1560 			i40e_clean_programming_status(rx_ring, rx_desc);
1561 			I40E_RX_INCREMENT(rx_ring, i);
1562 			continue;
1563 		}
1564 		rx_bi = &rx_ring->rx_bi[i];
1565 		skb = rx_bi->skb;
1566 		if (likely(!skb)) {
1567 			skb = __netdev_alloc_skb_ip_align(rx_ring->netdev,
1568 							  rx_ring->rx_hdr_len,
1569 							  GFP_ATOMIC |
1570 							  __GFP_NOWARN);
1571 			if (!skb) {
1572 				rx_ring->rx_stats.alloc_buff_failed++;
1573 				failure = true;
1574 				break;
1575 			}
1576 
1577 			/* initialize queue mapping */
1578 			skb_record_rx_queue(skb, rx_ring->queue_index);
1579 			/* we are reusing so sync this buffer for CPU use */
1580 			dma_sync_single_range_for_cpu(rx_ring->dev,
1581 						      rx_ring->rx_bi[0].dma,
1582 						      i * rx_ring->rx_hdr_len,
1583 						      rx_ring->rx_hdr_len,
1584 						      DMA_FROM_DEVICE);
1585 		}
1586 		rx_packet_len = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1587 				I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1588 		rx_header_len = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK) >>
1589 				I40E_RXD_QW1_LENGTH_HBUF_SHIFT;
1590 		rx_sph = (qword & I40E_RXD_QW1_LENGTH_SPH_MASK) >>
1591 			 I40E_RXD_QW1_LENGTH_SPH_SHIFT;
1592 
1593 		rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >>
1594 			   I40E_RXD_QW1_ERROR_SHIFT;
1595 		rx_hbo = rx_error & BIT(I40E_RX_DESC_ERROR_HBO_SHIFT);
1596 		rx_error &= ~BIT(I40E_RX_DESC_ERROR_HBO_SHIFT);
1597 
1598 		rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
1599 			   I40E_RXD_QW1_PTYPE_SHIFT;
1600 		/* sync half-page for reading */
1601 		dma_sync_single_range_for_cpu(rx_ring->dev,
1602 					      rx_bi->page_dma,
1603 					      rx_bi->page_offset,
1604 					      PAGE_SIZE / 2,
1605 					      DMA_FROM_DEVICE);
1606 		prefetch(page_address(rx_bi->page) + rx_bi->page_offset);
1607 		rx_bi->skb = NULL;
1608 		cleaned_count++;
1609 		copysize = 0;
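		/* copysize tracks how many payload bytes are copied into the
		 * skb linear area so the page fragment added below can start
		 * past them
		 */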
1610 		if (rx_hbo || rx_sph) {
1611 			int len;
1612 
1613 			if (rx_hbo)
1614 				len = I40E_RX_HDR_SIZE;
1615 			else
1616 				len = rx_header_len;
1617 			memcpy(__skb_put(skb, len), rx_bi->hdr_buf, len);
1618 		} else if (skb->len == 0) {
1619 			int len;
1620 			unsigned char *va = page_address(rx_bi->page) +
1621 					    rx_bi->page_offset;
1622 
1623 			len = min(rx_packet_len, rx_ring->rx_hdr_len);
1624 			memcpy(__skb_put(skb, len), va, len);
1625 			copysize = len;
1626 			rx_packet_len -= len;
1627 		}
1628 		/* Get the rest of the data if this was a header split */
1629 		if (rx_packet_len) {
1630 			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
1631 					rx_bi->page,
1632 					rx_bi->page_offset + copysize,
1633 					rx_packet_len, I40E_RXBUFFER_2048);
1634 
1635 			/* If the page count is more than 2, then both halves
1636 			 * of the page are used and we need to free it. Do it
1637 			 * here instead of in the alloc code. Otherwise one
1638 			 * of the half-pages might be released between now and
1639 			 * then, and we wouldn't know which one to use.
1640 			 * Don't call get_page and free_page since those are
1641 			 * both expensive atomic operations that just change
			 * the refcount in opposite directions. Just give the
			 * page to the stack; it can have our refcount.
1644 			 */
1645 			if (page_count(rx_bi->page) > 2) {
1646 				dma_unmap_page(rx_ring->dev,
1647 					       rx_bi->page_dma,
1648 					       PAGE_SIZE,
1649 					       DMA_FROM_DEVICE);
1650 				rx_bi->page = NULL;
1651 				rx_bi->page_dma = 0;
1652 				rx_ring->rx_stats.realloc_count++;
1653 			} else {
1654 				get_page(rx_bi->page);
1655 				/* switch to the other half-page here; the
1656 				 * allocation code programs the right addr
1657 				 * into HW. If we haven't used this half-page,
1658 				 * the address won't be changed, and HW can
1659 				 * just use it next time through.
1660 				 */
1661 				rx_bi->page_offset ^= PAGE_SIZE / 2;
1662 			}
1663 
1664 		}
1665 		I40E_RX_INCREMENT(rx_ring, i);
1666 
1667 		if (unlikely(
1668 		    !(rx_status & BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)))) {
1669 			struct i40e_rx_buffer *next_buffer;
1670 
1671 			next_buffer = &rx_ring->rx_bi[i];
1672 			next_buffer->skb = skb;
1673 			rx_ring->rx_stats.non_eop_descs++;
1674 			continue;
1675 		}
1676 
1677 		/* ERR_MASK will only have valid bits if EOP set */
1678 		if (unlikely(rx_error & BIT(I40E_RX_DESC_ERROR_RXE_SHIFT))) {
1679 			dev_kfree_skb_any(skb);
1680 			continue;
1681 		}
1682 
1683 		i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
1684 
1685 		if (unlikely(rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK)) {
1686 			i40e_ptp_rx_hwtstamp(vsi->back, skb, (rx_status &
1687 					   I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >>
1688 					   I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT);
1689 			rx_ring->last_rx_timestamp = jiffies;
1690 		}
1691 
1692 		/* probably a little skewed due to removing CRC */
1693 		total_rx_bytes += skb->len;
1694 		total_rx_packets++;
1695 
1696 		skb->protocol = eth_type_trans(skb, rx_ring->netdev);
1697 
1698 		i40e_rx_checksum(vsi, skb, rx_status, rx_error, rx_ptype);
1699 
1700 		vlan_tag = rx_status & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)
1701 			 ? le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1)
1702 			 : 0;
1703 #ifdef I40E_FCOE
1704 		if (!i40e_fcoe_handle_offload(rx_ring, rx_desc, skb)) {
1705 			dev_kfree_skb_any(skb);
1706 			continue;
1707 		}
1708 #endif
1709 		i40e_receive_skb(rx_ring, skb, vlan_tag);
1710 
1711 		rx_desc->wb.qword1.status_error_len = 0;
1712 
1713 	} while (likely(total_rx_packets < budget));
1714 
1715 	u64_stats_update_begin(&rx_ring->syncp);
1716 	rx_ring->stats.packets += total_rx_packets;
1717 	rx_ring->stats.bytes += total_rx_bytes;
1718 	u64_stats_update_end(&rx_ring->syncp);
1719 	rx_ring->q_vector->rx.total_packets += total_rx_packets;
1720 	rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
1721 
1722 	return failure ? budget : total_rx_packets;
1723 }
1724 
1725 /**
1726  * i40e_clean_rx_irq_1buf - Reclaim resources after receive; single buffer
1727  * @rx_ring:  rx ring to clean
1728  * @budget:   how many cleans we're allowed
1729  *
1730  * Returns number of packets cleaned
1731  **/
1732 static int i40e_clean_rx_irq_1buf(struct i40e_ring *rx_ring, int budget)
1733 {
1734 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
1735 	u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
1736 	struct i40e_vsi *vsi = rx_ring->vsi;
1737 	union i40e_rx_desc *rx_desc;
1738 	u32 rx_error, rx_status;
1739 	u16 rx_packet_len;
1740 	bool failure = false;
1741 	u8 rx_ptype;
1742 	u64 qword;
1743 	u16 i;
1744 
1745 	do {
1746 		struct i40e_rx_buffer *rx_bi;
1747 		struct sk_buff *skb;
1748 		u16 vlan_tag;
1749 		/* return some buffers to hardware, one at a time is too slow */
1750 		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
1751 			failure = failure ||
1752 				  i40e_alloc_rx_buffers_1buf(rx_ring,
1753 							     cleaned_count);
1754 			cleaned_count = 0;
1755 		}
1756 
1757 		i = rx_ring->next_to_clean;
1758 		rx_desc = I40E_RX_DESC(rx_ring, i);
1759 		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1760 		rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
1761 			I40E_RXD_QW1_STATUS_SHIFT;
1762 
1763 		if (!(rx_status & BIT(I40E_RX_DESC_STATUS_DD_SHIFT)))
1764 			break;
1765 
1766 		/* This memory barrier is needed to keep us from reading
1767 		 * any other fields out of the rx_desc until we know the
1768 		 * DD bit is set.
1769 		 */
1770 		dma_rmb();
1771 
1772 		if (i40e_rx_is_programming_status(qword)) {
1773 			i40e_clean_programming_status(rx_ring, rx_desc);
1774 			I40E_RX_INCREMENT(rx_ring, i);
1775 			continue;
1776 		}
1777 		rx_bi = &rx_ring->rx_bi[i];
1778 		skb = rx_bi->skb;
1779 		prefetch(skb->data);
1780 
1781 		rx_packet_len = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1782 				I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1783 
1784 		rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >>
1785 			   I40E_RXD_QW1_ERROR_SHIFT;
1786 		rx_error &= ~BIT(I40E_RX_DESC_ERROR_HBO_SHIFT);
1787 
1788 		rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
1789 			   I40E_RXD_QW1_PTYPE_SHIFT;
1790 		rx_bi->skb = NULL;
1791 		cleaned_count++;
1792 
1793 		/* Get the header and possibly the whole packet
1794 		 * If this is an skb from a previous receive, dma will be 0
1795 		 */
1796 		skb_put(skb, rx_packet_len);
1797 		dma_unmap_single(rx_ring->dev, rx_bi->dma, rx_ring->rx_buf_len,
1798 				 DMA_FROM_DEVICE);
1799 		rx_bi->dma = 0;
1800 
1801 		I40E_RX_INCREMENT(rx_ring, i);
1802 
1803 		if (unlikely(
1804 		    !(rx_status & BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)))) {
1805 			rx_ring->rx_stats.non_eop_descs++;
1806 			continue;
1807 		}
1808 
1809 		/* ERR_MASK will only have valid bits if EOP set */
1810 		if (unlikely(rx_error & BIT(I40E_RX_DESC_ERROR_RXE_SHIFT))) {
1811 			dev_kfree_skb_any(skb);
1812 			continue;
1813 		}
1814 
1815 		i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
1816 		if (unlikely(rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK)) {
1817 			i40e_ptp_rx_hwtstamp(vsi->back, skb, (rx_status &
1818 					   I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >>
1819 					   I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT);
1820 			rx_ring->last_rx_timestamp = jiffies;
1821 		}
1822 
1823 		/* probably a little skewed due to removing CRC */
1824 		total_rx_bytes += skb->len;
1825 		total_rx_packets++;
1826 
1827 		skb->protocol = eth_type_trans(skb, rx_ring->netdev);
1828 
1829 		i40e_rx_checksum(vsi, skb, rx_status, rx_error, rx_ptype);
1830 
1831 		vlan_tag = rx_status & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)
1832 			 ? le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1)
1833 			 : 0;
1834 #ifdef I40E_FCOE
1835 		if (!i40e_fcoe_handle_offload(rx_ring, rx_desc, skb)) {
1836 			dev_kfree_skb_any(skb);
1837 			continue;
1838 		}
1839 #endif
1840 		i40e_receive_skb(rx_ring, skb, vlan_tag);
1841 
1842 		rx_desc->wb.qword1.status_error_len = 0;
1843 	} while (likely(total_rx_packets < budget));
1844 
1845 	u64_stats_update_begin(&rx_ring->syncp);
1846 	rx_ring->stats.packets += total_rx_packets;
1847 	rx_ring->stats.bytes += total_rx_bytes;
1848 	u64_stats_update_end(&rx_ring->syncp);
1849 	rx_ring->q_vector->rx.total_packets += total_rx_packets;
1850 	rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
1851 
1852 	return failure ? budget : total_rx_packets;
1853 }
1854 
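/**
 * i40e_buildreg_itr - build a value for the dynamic ITR control register
 * @type: ITR index to update (I40E_RX_ITR, I40E_TX_ITR or I40E_ITR_NONE)
 * @itr: the new interval, in hardware ITR units
 **/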
1855 static u32 i40e_buildreg_itr(const int type, const u16 itr)
1856 {
1857 	u32 val;
1858 
1859 	val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
1860 	      /* Don't clear PBA because that can cause lost interrupts that
1861 	       * came in while we were cleaning/polling
1862 	       */
1863 	      (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) |
1864 	      (itr << I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT);
1865 
1866 	return val;
1867 }
1868 
1869 /* a small macro to shorten up some long lines */
1870 #define INTREG I40E_PFINT_DYN_CTLN
1871 
1872 /**
1873  * i40e_update_enable_itr - Update itr and re-enable MSIX interrupt
1874  * @vsi: the VSI we care about
1875  * @q_vector: q_vector for which itr is being updated and interrupt enabled
1876  *
1877  **/
1878 static inline void i40e_update_enable_itr(struct i40e_vsi *vsi,
1879 					  struct i40e_q_vector *q_vector)
1880 {
1881 	struct i40e_hw *hw = &vsi->back->hw;
1882 	bool rx = false, tx = false;
1883 	u32 rxval, txval;
1884 	int vector;
1885 	int idx = q_vector->v_idx;
1886 
1887 	vector = (q_vector->v_idx + vsi->base_vector);
1888 
1889 	/* avoid dynamic calculation if in countdown mode OR if
1890 	 * all dynamic is disabled
1891 	 */
1892 	rxval = txval = i40e_buildreg_itr(I40E_ITR_NONE, 0);
1893 
1894 	if (q_vector->itr_countdown > 0 ||
1895 	    (!ITR_IS_DYNAMIC(vsi->rx_rings[idx]->rx_itr_setting) &&
1896 	     !ITR_IS_DYNAMIC(vsi->tx_rings[idx]->tx_itr_setting))) {
1897 		goto enable_int;
1898 	}
1899 
1900 	if (ITR_IS_DYNAMIC(vsi->rx_rings[idx]->rx_itr_setting)) {
1901 		rx = i40e_set_new_dynamic_itr(&q_vector->rx);
1902 		rxval = i40e_buildreg_itr(I40E_RX_ITR, q_vector->rx.itr);
1903 	}
1904 
1905 	if (ITR_IS_DYNAMIC(vsi->tx_rings[idx]->tx_itr_setting)) {
1906 		tx = i40e_set_new_dynamic_itr(&q_vector->tx);
1907 		txval = i40e_buildreg_itr(I40E_TX_ITR, q_vector->tx.itr);
1908 	}
1909 
1910 	if (rx || tx) {
1911 		/* get the higher of the two ITR adjustments and
1912 		 * use the same value for both ITR registers
1913 		 * when in adaptive mode (Rx and/or Tx)
1914 		 */
1915 		u16 itr = max(q_vector->tx.itr, q_vector->rx.itr);
1916 
1917 		q_vector->tx.itr = q_vector->rx.itr = itr;
1918 		txval = i40e_buildreg_itr(I40E_TX_ITR, itr);
1919 		tx = true;
1920 		rxval = i40e_buildreg_itr(I40E_RX_ITR, itr);
1921 		rx = true;
1922 	}
1923 
1924 	/* only need to enable the interrupt once, but need
1925 	 * to possibly update both ITR values
1926 	 */
1927 	if (rx) {
1928 		/* set the INTENA_MSK_MASK so that this first write
1929 		 * won't actually enable the interrupt, instead just
1930 		 * updating the ITR (it's bit 31 in both the PF and VF registers)
1931 		 */
1932 		rxval |= BIT(31);
1933 		/* don't check _DOWN because interrupt isn't being enabled */
1934 		wr32(hw, INTREG(vector - 1), rxval);
1935 	}
1936 
1937 enable_int:
1938 	if (!test_bit(__I40E_DOWN, &vsi->state))
1939 		wr32(hw, INTREG(vector - 1), txval);
1940 
1941 	if (q_vector->itr_countdown)
1942 		q_vector->itr_countdown--;
1943 	else
1944 		q_vector->itr_countdown = ITR_COUNTDOWN_START;
1945 }
1946 
1947 /**
1948  * i40e_napi_poll - NAPI polling Rx/Tx cleanup routine
1949  * @napi: napi struct with our devices info in it
1950  * @budget: amount of work driver is allowed to do this pass, in packets
1951  *
1952  * This function will clean all queues associated with a q_vector.
1953  *
1954  * Returns the amount of work done
1955  **/
1956 int i40e_napi_poll(struct napi_struct *napi, int budget)
1957 {
1958 	struct i40e_q_vector *q_vector =
1959 			       container_of(napi, struct i40e_q_vector, napi);
1960 	struct i40e_vsi *vsi = q_vector->vsi;
1961 	struct i40e_ring *ring;
1962 	bool clean_complete = true;
1963 	bool arm_wb = false;
1964 	int budget_per_ring;
1965 	int work_done = 0;
1966 
1967 	if (test_bit(__I40E_DOWN, &vsi->state)) {
1968 		napi_complete(napi);
1969 		return 0;
1970 	}
1971 
1972 	/* Clear hung_detected bit */
1973 	clear_bit(I40E_Q_VECTOR_HUNG_DETECT, &q_vector->hung_detected);
1974 	/* Since the actual Tx work is minimal, we can give the Tx a larger
1975 	 * budget and be more aggressive about cleaning up the Tx descriptors.
1976 	 */
1977 	i40e_for_each_ring(ring, q_vector->tx) {
1978 		clean_complete = clean_complete &&
1979 				 i40e_clean_tx_irq(ring, vsi->work_limit);
1980 		arm_wb = arm_wb || ring->arm_wb;
1981 		ring->arm_wb = false;
1982 	}
1983 
1984 	/* Handle case where we are called by netpoll with a budget of 0 */
1985 	if (budget <= 0)
1986 		goto tx_only;
1987 
1988 	/* We attempt to distribute budget to each Rx queue fairly, but don't
1989 	 * allow the budget to go below 1 because that would exit polling early.
1990 	 */
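	/* For example, a NAPI budget of 64 spread across 4 ring pairs gives
	 * each Rx ring up to 16 packets per poll.
	 */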
1991 	budget_per_ring = max(budget / q_vector->num_ringpairs, 1);
1992 
1993 	i40e_for_each_ring(ring, q_vector->rx) {
1994 		int cleaned;
1995 
1996 		if (ring_is_ps_enabled(ring))
1997 			cleaned = i40e_clean_rx_irq_ps(ring, budget_per_ring);
1998 		else
1999 			cleaned = i40e_clean_rx_irq_1buf(ring, budget_per_ring);
2000 
2001 		work_done += cleaned;
2002 		/* if we didn't clean as many as budgeted, we must be done */
2003 		clean_complete = clean_complete && (budget_per_ring > cleaned);
2004 	}
2005 
2006 	/* If work not completed, return budget and polling will return */
2007 	if (!clean_complete) {
2008 tx_only:
2009 		if (arm_wb) {
2010 			q_vector->tx.ring[0].tx_stats.tx_force_wb++;
2011 			i40e_enable_wb_on_itr(vsi, q_vector);
2012 		}
2013 		return budget;
2014 	}
2015 
2016 	if (vsi->back->flags & I40E_TXR_FLAGS_WB_ON_ITR)
2017 		q_vector->arm_wb_state = false;
2018 
2019 	/* Work is done so exit the polling mode and re-enable the interrupt */
2020 	napi_complete_done(napi, work_done);
2021 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
2022 		i40e_update_enable_itr(vsi, q_vector);
2023 	} else { /* Legacy mode */
2024 		i40e_irq_dynamic_enable_icr0(vsi->back, false);
2025 	}
2026 	return 0;
2027 }
2028 
2029 /**
2030  * i40e_atr - Add a Flow Director ATR filter
2031  * @tx_ring:  ring to add programming descriptor to
2032  * @skb:      send buffer
2033  * @tx_flags: send tx flags
2034  **/
2035 static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
2036 		     u32 tx_flags)
2037 {
2038 	struct i40e_filter_program_desc *fdir_desc;
2039 	struct i40e_pf *pf = tx_ring->vsi->back;
2040 	union {
2041 		unsigned char *network;
2042 		struct iphdr *ipv4;
2043 		struct ipv6hdr *ipv6;
2044 	} hdr;
2045 	struct tcphdr *th;
2046 	unsigned int hlen;
2047 	u32 flex_ptype, dtype_cmd;
2048 	int l4_proto;
2049 	u16 i;
2050 
2051 	/* make sure ATR is enabled */
2052 	if (!(pf->flags & I40E_FLAG_FD_ATR_ENABLED))
2053 		return;
2054 
2055 	if ((pf->auto_disable_flags & I40E_FLAG_FD_ATR_ENABLED))
2056 		return;
2057 
2058 	/* if sampling is disabled do nothing */
2059 	if (!tx_ring->atr_sample_rate)
2060 		return;
2061 
2062 	/* Currently only IPv4/IPv6 with TCP is supported */
2063 	if (!(tx_flags & (I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6)))
2064 		return;
2065 
2066 	/* snag network header to get L4 type and address */
2067 	hdr.network = (tx_flags & I40E_TX_FLAGS_UDP_TUNNEL) ?
2068 		      skb_inner_network_header(skb) : skb_network_header(skb);
2069 
2070 	/* Note: tx_flags gets modified to reflect inner protocols in
2071 	 * tx_enable_csum function if encap is enabled.
2072 	 */
2073 	if (tx_flags & I40E_TX_FLAGS_IPV4) {
2074 		/* access ihl as u8 to avoid unaligned access on ia64 */
2075 		hlen = (hdr.network[0] & 0x0F) << 2;
2076 		l4_proto = hdr.ipv4->protocol;
2077 	} else {
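		/* hlen enters holding the network header offset from
		 * skb->data; ipv6_find_hdr() returns the TCP header offset
		 * in it, so subtract the network header offset again to get
		 * the L3 header length.
		 */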
2078 		hlen = hdr.network - skb->data;
2079 		l4_proto = ipv6_find_hdr(skb, &hlen, IPPROTO_TCP, NULL, NULL);
2080 		hlen -= hdr.network - skb->data;
2081 	}
2082 
2083 	if (l4_proto != IPPROTO_TCP)
2084 		return;
2085 
2086 	th = (struct tcphdr *)(hdr.network + hlen);
2087 
2088 	/* Due to lack of space, no more new filters can be programmed */
2089 	if (th->syn && (pf->auto_disable_flags & I40E_FLAG_FD_ATR_ENABLED))
2090 		return;
2091 	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
2092 	    (!(pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE))) {
2093 		/* HW ATR eviction will take care of removing filters on FIN
2094 		 * and RST packets.
2095 		 */
2096 		if (th->fin || th->rst)
2097 			return;
2098 	}
2099 
2100 	tx_ring->atr_count++;
2101 
2102 	/* sample on all syn/fin/rst packets or once every atr sample rate */
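	/* For example, with an atr_sample_rate of 20 a filter is programmed
	 * for roughly every 20th TCP packet on this queue, in addition to
	 * every SYN/FIN/RST that is seen.
	 */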
2103 	if (!th->fin &&
2104 	    !th->syn &&
2105 	    !th->rst &&
2106 	    (tx_ring->atr_count < tx_ring->atr_sample_rate))
2107 		return;
2108 
2109 	tx_ring->atr_count = 0;
2110 
2111 	/* grab the next descriptor */
2112 	i = tx_ring->next_to_use;
2113 	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);
2114 
2115 	i++;
2116 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2117 
2118 	flex_ptype = (tx_ring->queue_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT) &
2119 		      I40E_TXD_FLTR_QW0_QINDEX_MASK;
2120 	flex_ptype |= (tx_flags & I40E_TX_FLAGS_IPV4) ?
2121 		      (I40E_FILTER_PCTYPE_NONF_IPV4_TCP <<
2122 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) :
2123 		      (I40E_FILTER_PCTYPE_NONF_IPV6_TCP <<
2124 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
2125 
2126 	flex_ptype |= tx_ring->vsi->id << I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT;
2127 
2128 	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;
2129 
2130 	dtype_cmd |= (th->fin || th->rst) ?
2131 		     (I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
2132 		      I40E_TXD_FLTR_QW1_PCMD_SHIFT) :
2133 		     (I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
2134 		      I40E_TXD_FLTR_QW1_PCMD_SHIFT);
2135 
2136 	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX <<
2137 		     I40E_TXD_FLTR_QW1_DEST_SHIFT;
2138 
2139 	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID <<
2140 		     I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT;
2141 
2142 	dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
2143 	if (!(tx_flags & I40E_TX_FLAGS_UDP_TUNNEL))
2144 		dtype_cmd |=
2145 			((u32)I40E_FD_ATR_STAT_IDX(pf->hw.pf_id) <<
2146 			I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
2147 			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
2148 	else
2149 		dtype_cmd |=
2150 			((u32)I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id) <<
2151 			I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
2152 			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
2153 
2154 	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
2155 	    (!(pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)))
2156 		dtype_cmd |= I40E_TXD_FLTR_QW1_ATR_MASK;
2157 
2158 	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
2159 	fdir_desc->rsvd = cpu_to_le32(0);
2160 	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
2161 	fdir_desc->fd_id = cpu_to_le32(0);
2162 }
2163 
2164 /**
2165  * i40e_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
2166  * @skb:     send buffer
2167  * @tx_ring: ring to send buffer on
2168  * @flags:   the tx flags to be set
2169  *
2170  * Checks the skb and set up correspondingly several generic transmit flags
2171  * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
2172  *
2173  * Returns an error code to indicate the frame should be dropped upon error,
2174  * otherwise returns 0 to indicate the flags have been set properly.
2175  **/
2176 #ifdef I40E_FCOE
2177 inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
2178 				      struct i40e_ring *tx_ring,
2179 				      u32 *flags)
2180 #else
2181 static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
2182 					     struct i40e_ring *tx_ring,
2183 					     u32 *flags)
2184 #endif
2185 {
2186 	__be16 protocol = skb->protocol;
2187 	u32  tx_flags = 0;
2188 
2189 	if (protocol == htons(ETH_P_8021Q) &&
2190 	    !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) {
2191 		/* When HW VLAN acceleration is turned off by the user the
2192 		 * stack sets the protocol to 8021q so that the driver
2193 		 * can take any steps required to support the SW only
2194 		 * VLAN handling.  In our case the driver doesn't need
2195 		 * to take any further steps so just set the protocol
2196 		 * to the encapsulated ethertype.
2197 		 */
2198 		skb->protocol = vlan_get_protocol(skb);
2199 		goto out;
2200 	}
2201 
2202 	/* if we have a HW VLAN tag being added, default to the HW one */
2203 	if (skb_vlan_tag_present(skb)) {
2204 		tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT;
2205 		tx_flags |= I40E_TX_FLAGS_HW_VLAN;
2206 	/* else if it is a SW VLAN, check the next protocol and store the tag */
2207 	} else if (protocol == htons(ETH_P_8021Q)) {
2208 		struct vlan_hdr *vhdr, _vhdr;
2209 
2210 		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(_vhdr), &_vhdr);
2211 		if (!vhdr)
2212 			return -EINVAL;
2213 
2214 		protocol = vhdr->h_vlan_encapsulated_proto;
2215 		tx_flags |= ntohs(vhdr->h_vlan_TCI) << I40E_TX_FLAGS_VLAN_SHIFT;
2216 		tx_flags |= I40E_TX_FLAGS_SW_VLAN;
2217 	}
2218 
2219 	if (!(tx_ring->vsi->back->flags & I40E_FLAG_DCB_ENABLED))
2220 		goto out;
2221 
2222 	/* Insert 802.1p priority into VLAN header */
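	/* skb->priority values 0-7 map directly onto the 3-bit PCP field of
	 * the 802.1Q tag.
	 */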
2223 	if ((tx_flags & (I40E_TX_FLAGS_HW_VLAN | I40E_TX_FLAGS_SW_VLAN)) ||
2224 	    (skb->priority != TC_PRIO_CONTROL)) {
2225 		tx_flags &= ~I40E_TX_FLAGS_VLAN_PRIO_MASK;
2226 		tx_flags |= (skb->priority & 0x7) <<
2227 				I40E_TX_FLAGS_VLAN_PRIO_SHIFT;
2228 		if (tx_flags & I40E_TX_FLAGS_SW_VLAN) {
2229 			struct vlan_ethhdr *vhdr;
2230 			int rc;
2231 
2232 			rc = skb_cow_head(skb, 0);
2233 			if (rc < 0)
2234 				return rc;
2235 			vhdr = (struct vlan_ethhdr *)skb->data;
2236 			vhdr->h_vlan_TCI = htons(tx_flags >>
2237 						 I40E_TX_FLAGS_VLAN_SHIFT);
2238 		} else {
2239 			tx_flags |= I40E_TX_FLAGS_HW_VLAN;
2240 		}
2241 	}
2242 
2243 out:
2244 	*flags = tx_flags;
2245 	return 0;
2246 }
2247 
2248 /**
2249  * i40e_tso - set up the tso context descriptor
2250  * @tx_ring:  ptr to the ring to send
2251  * @skb:      ptr to the skb we're sending
2252  * @hdr_len:  ptr to the size of the packet header
2253  * @cd_type_cmd_tso_mss: Quad Word 1
2254  *
2255  * Returns 0 if no TSO can happen, 1 if TSO is in progress, or a negative error
2256  **/
2257 static int i40e_tso(struct i40e_ring *tx_ring, struct sk_buff *skb,
2258 		    u8 *hdr_len, u64 *cd_type_cmd_tso_mss)
2259 {
2260 	u64 cd_cmd, cd_tso_len, cd_mss;
2261 	union {
2262 		struct iphdr *v4;
2263 		struct ipv6hdr *v6;
2264 		unsigned char *hdr;
2265 	} ip;
2266 	union {
2267 		struct tcphdr *tcp;
2268 		struct udphdr *udp;
2269 		unsigned char *hdr;
2270 	} l4;
2271 	u32 paylen, l4_offset;
2272 	int err;
2273 
2274 	if (skb->ip_summed != CHECKSUM_PARTIAL)
2275 		return 0;
2276 
2277 	if (!skb_is_gso(skb))
2278 		return 0;
2279 
2280 	err = skb_cow_head(skb, 0);
2281 	if (err < 0)
2282 		return err;
2283 
2284 	ip.hdr = skb_network_header(skb);
2285 	l4.hdr = skb_transport_header(skb);
2286 
2287 	/* initialize outer IP header fields */
2288 	if (ip.v4->version == 4) {
2289 		ip.v4->tot_len = 0;
2290 		ip.v4->check = 0;
2291 	} else {
2292 		ip.v6->payload_len = 0;
2293 	}
2294 
2295 	if (skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | SKB_GSO_GRE |
2296 					 SKB_GSO_UDP_TUNNEL_CSUM)) {
2297 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) {
2298 			/* determine offset of outer transport header */
2299 			l4_offset = l4.hdr - skb->data;
2300 
2301 			/* remove payload length from outer checksum */
2302 			paylen = (__force u16)l4.udp->check;
2303 			paylen += ntohs(1) * (u16)~(skb->len - l4_offset);
2304 			l4.udp->check = ~csum_fold((__force __wsum)paylen);
2305 		}
2306 
2307 		/* reset pointers to inner headers */
2308 		ip.hdr = skb_inner_network_header(skb);
2309 		l4.hdr = skb_inner_transport_header(skb);
2310 
2311 		/* initialize inner IP header fields */
2312 		if (ip.v4->version == 4) {
2313 			ip.v4->tot_len = 0;
2314 			ip.v4->check = 0;
2315 		} else {
2316 			ip.v6->payload_len = 0;
2317 		}
2318 	}
2319 
2320 	/* determine offset of inner transport header */
2321 	l4_offset = l4.hdr - skb->data;
2322 
2323 	/* remove payload length from inner checksum */
2324 	paylen = (__force u16)l4.tcp->check;
2325 	paylen += ntohs(1) * (u16)~(skb->len - l4_offset);
2326 	l4.tcp->check = ~csum_fold((__force __wsum)paylen);
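	/* The stack seeds tcp->check with the pseudo-header checksum, which
	 * includes the payload length; the adjustment above folds that
	 * length back out of the one's complement sum so the hardware can
	 * account for each segment's own length when it recomputes the
	 * checksum for TSO.
	 */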
2327 
2328 	/* compute length of segmentation header */
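	/* (for a plain IPv4/TCP frame with no options: 14 + 20 + 20 = 54) */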
2329 	*hdr_len = (l4.tcp->doff * 4) + l4_offset;
2330 
2331 	/* find the field values */
2332 	cd_cmd = I40E_TX_CTX_DESC_TSO;
2333 	cd_tso_len = skb->len - *hdr_len;
2334 	cd_mss = skb_shinfo(skb)->gso_size;
2335 	*cd_type_cmd_tso_mss |= (cd_cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
2336 				(cd_tso_len << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
2337 				(cd_mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
2338 	return 1;
2339 }
2340 
2341 /**
2342  * i40e_tsyn - set up the tsyn context descriptor
2343  * @tx_ring:  ptr to the ring to send
2344  * @skb:      ptr to the skb we're sending
2345  * @tx_flags: the collected send information
2346  * @cd_type_cmd_tso_mss: Quad Word 1
2347  *
2348  * Returns 0 if no Tx timestamp can happen and 1 if the timestamp will happen
2349  **/
2350 static int i40e_tsyn(struct i40e_ring *tx_ring, struct sk_buff *skb,
2351 		     u32 tx_flags, u64 *cd_type_cmd_tso_mss)
2352 {
2353 	struct i40e_pf *pf;
2354 
2355 	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
2356 		return 0;
2357 
2358 	/* Tx timestamps cannot be sampled when doing TSO */
2359 	if (tx_flags & I40E_TX_FLAGS_TSO)
2360 		return 0;
2361 
2362 	/* only timestamp the outbound packet if the user has requested it and
2363 	 * we are not already transmitting a packet to be timestamped
2364 	 */
2365 	pf = i40e_netdev_to_pf(tx_ring->netdev);
2366 	if (!(pf->flags & I40E_FLAG_PTP))
2367 		return 0;
2368 
2369 	if (pf->ptp_tx &&
2370 	    !test_and_set_bit_lock(__I40E_PTP_TX_IN_PROGRESS, &pf->state)) {
2371 		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
2372 		pf->ptp_tx_skb = skb_get(skb);
2373 	} else {
2374 		return 0;
2375 	}
2376 
2377 	*cd_type_cmd_tso_mss |= (u64)I40E_TX_CTX_DESC_TSYN <<
2378 				I40E_TXD_CTX_QW1_CMD_SHIFT;
2379 
2380 	return 1;
2381 }
2382 
2383 /**
2384  * i40e_tx_enable_csum - Enable Tx checksum offloads
2385  * @skb: send buffer
2386  * @tx_flags: pointer to Tx flags currently set
2387  * @td_cmd: Tx descriptor command bits to set
2388  * @td_offset: Tx descriptor header offsets to set
2389  * @tx_ring: Tx descriptor ring
2390  * @cd_tunneling: ptr to context desc bits
2391  **/
2392 static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
2393 			       u32 *td_cmd, u32 *td_offset,
2394 			       struct i40e_ring *tx_ring,
2395 			       u32 *cd_tunneling)
2396 {
2397 	union {
2398 		struct iphdr *v4;
2399 		struct ipv6hdr *v6;
2400 		unsigned char *hdr;
2401 	} ip;
2402 	union {
2403 		struct tcphdr *tcp;
2404 		struct udphdr *udp;
2405 		unsigned char *hdr;
2406 	} l4;
2407 	unsigned char *exthdr;
2408 	u32 offset, cmd = 0, tunnel = 0;
2409 	__be16 frag_off;
2410 	u8 l4_proto = 0;
2411 
2412 	if (skb->ip_summed != CHECKSUM_PARTIAL)
2413 		return 0;
2414 
2415 	ip.hdr = skb_network_header(skb);
2416 	l4.hdr = skb_transport_header(skb);
2417 
2418 	/* compute outer L2 header size */
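	/* MACLEN is expressed in 2-byte words, so a standard 14-byte
	 * Ethernet header is encoded as 7.
	 */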
2419 	offset = ((ip.hdr - skb->data) / 2) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
2420 
2421 	if (skb->encapsulation) {
2422 		/* define outer network header type */
2423 		if (*tx_flags & I40E_TX_FLAGS_IPV4) {
2424 			tunnel |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
2425 				  I40E_TX_CTX_EXT_IP_IPV4 :
2426 				  I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM;
2427 
2428 			l4_proto = ip.v4->protocol;
2429 		} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
2430 			tunnel |= I40E_TX_CTX_EXT_IP_IPV6;
2431 
2432 			exthdr = ip.hdr + sizeof(*ip.v6);
2433 			l4_proto = ip.v6->nexthdr;
2434 			if (l4.hdr != exthdr)
2435 				ipv6_skip_exthdr(skb, exthdr - skb->data,
2436 						 &l4_proto, &frag_off);
2437 		}
2438 
2439 		/* compute outer L3 header size */
2440 		tunnel |= ((l4.hdr - ip.hdr) / 4) <<
2441 			  I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT;
2442 
2443 		/* switch IP header pointer from outer to inner header */
2444 		ip.hdr = skb_inner_network_header(skb);
2445 
2446 		/* define outer transport */
2447 		switch (l4_proto) {
2448 		case IPPROTO_UDP:
2449 			tunnel |= I40E_TXD_CTX_UDP_TUNNELING;
2450 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2451 			break;
2452 		case IPPROTO_GRE:
2453 			tunnel |= I40E_TXD_CTX_GRE_TUNNELING;
2454 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2455 			break;
2456 		default:
2457 			if (*tx_flags & I40E_TX_FLAGS_TSO)
2458 				return -1;
2459 
2460 			skb_checksum_help(skb);
2461 			return 0;
2462 		}
2463 
2464 		/* compute tunnel header size */
2465 		tunnel |= ((ip.hdr - l4.hdr) / 2) <<
2466 			  I40E_TXD_CTX_QW0_NATLEN_SHIFT;
2467 
2468 		/* indicate if we need to offload outer UDP header */
2469 		if ((*tx_flags & I40E_TX_FLAGS_TSO) &&
2470 		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
2471 			tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK;
2472 
2473 		/* record tunnel offload values */
2474 		*cd_tunneling |= tunnel;
2475 
2476 		/* switch L4 header pointer from outer to inner */
2477 		l4.hdr = skb_inner_transport_header(skb);
2478 		l4_proto = 0;
2479 
2480 		/* reset type as we transition from outer to inner headers */
2481 		*tx_flags &= ~(I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6);
2482 		if (ip.v4->version == 4)
2483 			*tx_flags |= I40E_TX_FLAGS_IPV4;
2484 		if (ip.v6->version == 6)
2485 			*tx_flags |= I40E_TX_FLAGS_IPV6;
2486 	}
2487 
2488 	/* Enable IP checksum offloads */
2489 	if (*tx_flags & I40E_TX_FLAGS_IPV4) {
2490 		l4_proto = ip.v4->protocol;
2491 		/* the stack computes the IP header already, the only time we
2492 		 * need the hardware to recompute it is in the case of TSO.
2493 		 */
2494 		cmd |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
2495 		       I40E_TX_DESC_CMD_IIPT_IPV4_CSUM :
2496 		       I40E_TX_DESC_CMD_IIPT_IPV4;
2497 	} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
2498 		cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
2499 
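		/* the protocol in the fixed IPv6 header is only the L4
		 * protocol when no extension headers are present; otherwise
		 * walk the extension header chain to find the real transport
		 * protocol
		 */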
2500 		exthdr = ip.hdr + sizeof(*ip.v6);
2501 		l4_proto = ip.v6->nexthdr;
2502 		if (l4.hdr != exthdr)
2503 			ipv6_skip_exthdr(skb, exthdr - skb->data,
2504 					 &l4_proto, &frag_off);
2505 	}
2506 
2507 	/* compute inner L3 header size */
2508 	offset |= ((l4.hdr - ip.hdr) / 4) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
2509 
2510 	/* Enable L4 checksum offloads */
2511 	switch (l4_proto) {
2512 	case IPPROTO_TCP:
2513 		/* enable checksum offloads */
2514 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
2515 		offset |= l4.tcp->doff << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2516 		break;
2517 	case IPPROTO_SCTP:
2518 		/* enable SCTP checksum offload */
2519 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
2520 		offset |= (sizeof(struct sctphdr) >> 2) <<
2521 			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2522 		break;
2523 	case IPPROTO_UDP:
2524 		/* enable UDP checksum offload */
2525 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
2526 		offset |= (sizeof(struct udphdr) >> 2) <<
2527 			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2528 		break;
2529 	default:
2530 		if (*tx_flags & I40E_TX_FLAGS_TSO)
2531 			return -1;
2532 		skb_checksum_help(skb);
2533 		return 0;
2534 	}
2535 
2536 	*td_cmd |= cmd;
2537 	*td_offset |= offset;
2538 
2539 	return 1;
2540 }
2541 
2542 /**
2543  * i40e_create_tx_ctx - Build the Tx context descriptor
2544  * @tx_ring:  ring to create the descriptor on
2545  * @cd_type_cmd_tso_mss: Quad Word 1
2546  * @cd_tunneling: Quad Word 0 - bits 0-31
2547  * @cd_l2tag2: Quad Word 0 - bits 32-63
2548  **/
2549 static void i40e_create_tx_ctx(struct i40e_ring *tx_ring,
2550 			       const u64 cd_type_cmd_tso_mss,
2551 			       const u32 cd_tunneling, const u32 cd_l2tag2)
2552 {
2553 	struct i40e_tx_context_desc *context_desc;
2554 	int i = tx_ring->next_to_use;
2555 
2556 	if ((cd_type_cmd_tso_mss == I40E_TX_DESC_DTYPE_CONTEXT) &&
2557 	    !cd_tunneling && !cd_l2tag2)
2558 		return;
2559 
2560 	/* grab the next descriptor */
2561 	context_desc = I40E_TX_CTXTDESC(tx_ring, i);
2562 
2563 	i++;
2564 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2565 
2566 	/* cpu_to_le32 and assign to struct fields */
2567 	context_desc->tunneling_params = cpu_to_le32(cd_tunneling);
2568 	context_desc->l2tag2 = cpu_to_le16(cd_l2tag2);
2569 	context_desc->rsvd = cpu_to_le16(0);
2570 	context_desc->type_cmd_tso_mss = cpu_to_le64(cd_type_cmd_tso_mss);
2571 }
2572 
2573 /**
2574  * __i40e_maybe_stop_tx - 2nd level check for tx stop conditions
2575  * @tx_ring: the ring to be checked
2576  * @size:    the size buffer we want to assure is available
2577  *
2578  * Returns -EBUSY if a stop is needed, else 0
2579  **/
2580 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
2581 {
2582 	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
2583 	/* Memory barrier before checking head and tail */
2584 	smp_mb();
2585 
2586 	/* Check again in a case another CPU has just made room available. */
2587 	if (likely(I40E_DESC_UNUSED(tx_ring) < size))
2588 		return -EBUSY;
2589 
2590 	/* A reprieve! - use start_queue because it doesn't call schedule */
2591 	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
2592 	++tx_ring->tx_stats.restart_queue;
2593 	return 0;
2594 }
2595 
2596 /**
2597  * __i40e_chk_linearize - Check if there are more than 8 buffers per packet
2598  * @skb:      send buffer
2599  *
2600  * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire
2601  * and so we need to figure out the cases where we need to linearize the skb.
2602  *
2603  * For TSO we need to count the TSO header and segment payload separately.
2604  * As such we need to check cases where we have 7 fragments or more as we
2605  * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
2606  * the segment payload in the first descriptor, and another 7 for the
2607  * fragments.
2608  **/
2609 bool __i40e_chk_linearize(struct sk_buff *skb)
2610 {
2611 	const struct skb_frag_struct *frag, *stale;
2612 	int nr_frags, sum;
2613 
2614 	/* no need to check if number of frags is less than 7 */
2615 	nr_frags = skb_shinfo(skb)->nr_frags;
2616 	if (nr_frags < (I40E_MAX_BUFFER_TXD - 1))
2617 		return false;
2618 
2619 	/* We need to walk through the list and validate that each group
2620 	 * of 6 fragments totals at least gso_size.  However we don't need
2621 	 * to perform such validation on the last 6 since the last 6 cannot
2622 	 * inherit any data from a descriptor after them.
2623 	 */
2624 	nr_frags -= I40E_MAX_BUFFER_TXD - 2;
2625 	frag = &skb_shinfo(skb)->frags[0];
2626 
2627 	/* Initialize size to the negative value of gso_size minus 1.  We
2628  * use this as the worst case scenario in which the frag ahead
2629 	 * of us only provides one byte which is why we are limited to 6
2630 	 * descriptors for a single transmit as the header and previous
2631 	 * fragment are already consuming 2 descriptors.
2632 	 */
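	/* As a concrete example: with a gso_size of 1448, six consecutive
	 * fragments of only 200 bytes each sum to 1200, the running total
	 * below goes negative, and the caller linearizes the skb.
	 */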
2633 	sum = 1 - skb_shinfo(skb)->gso_size;
2634 
2635 	/* Add size of frags 0 through 4 to create our initial sum */
2636 	sum += skb_frag_size(frag++);
2637 	sum += skb_frag_size(frag++);
2638 	sum += skb_frag_size(frag++);
2639 	sum += skb_frag_size(frag++);
2640 	sum += skb_frag_size(frag++);
2641 
2642 	/* Walk through fragments adding latest fragment, testing it, and
2643 	 * then removing stale fragments from the sum.
2644 	 */
2645 	stale = &skb_shinfo(skb)->frags[0];
2646 	for (;;) {
2647 		sum += skb_frag_size(frag++);
2648 
2649 		/* if sum is negative we failed to make sufficient progress */
2650 		if (sum < 0)
2651 			return true;
2652 
2653 		/* use pre-decrement to avoid processing last fragment */
2654 		if (!--nr_frags)
2655 			break;
2656 
2657 		sum -= skb_frag_size(stale++);
2658 	}
2659 
2660 	return false;
2661 }
2662 
2663 /**
2664  * i40e_tx_map - Build the Tx descriptor
2665  * @tx_ring:  ring to send buffer on
2666  * @skb:      send buffer
2667  * @first:    first buffer info buffer to use
2668  * @tx_flags: collected send information
2669  * @hdr_len:  size of the packet header
2670  * @td_cmd:   the command field in the descriptor
2671  * @td_offset: offset for checksum or crc
2672  **/
2673 #ifdef I40E_FCOE
2674 inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
2675 			struct i40e_tx_buffer *first, u32 tx_flags,
2676 			const u8 hdr_len, u32 td_cmd, u32 td_offset)
2677 #else
2678 static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
2679 			       struct i40e_tx_buffer *first, u32 tx_flags,
2680 			       const u8 hdr_len, u32 td_cmd, u32 td_offset)
2681 #endif
2682 {
2683 	unsigned int data_len = skb->data_len;
2684 	unsigned int size = skb_headlen(skb);
2685 	struct skb_frag_struct *frag;
2686 	struct i40e_tx_buffer *tx_bi;
2687 	struct i40e_tx_desc *tx_desc;
2688 	u16 i = tx_ring->next_to_use;
2689 	u32 td_tag = 0;
2690 	dma_addr_t dma;
2691 	u16 gso_segs;
2692 	u16 desc_count = 0;
2693 	bool tail_bump = true;
2694 	bool do_rs = false;
2695 
2696 	if (tx_flags & I40E_TX_FLAGS_HW_VLAN) {
2697 		td_cmd |= I40E_TX_DESC_CMD_IL2TAG1;
2698 		td_tag = (tx_flags & I40E_TX_FLAGS_VLAN_MASK) >>
2699 			 I40E_TX_FLAGS_VLAN_SHIFT;
2700 	}
2701 
2702 	if (tx_flags & (I40E_TX_FLAGS_TSO | I40E_TX_FLAGS_FSO))
2703 		gso_segs = skb_shinfo(skb)->gso_segs;
2704 	else
2705 		gso_segs = 1;
2706 
2707 	/* multiply data chunks by size of headers */
2708 	first->bytecount = skb->len - hdr_len + (gso_segs * hdr_len);
2709 	first->gso_segs = gso_segs;
2710 	first->skb = skb;
2711 	first->tx_flags = tx_flags;
2712 
2713 	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
2714 
2715 	tx_desc = I40E_TX_DESC(tx_ring, i);
2716 	tx_bi = first;
2717 
2718 	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
2719 		if (dma_mapping_error(tx_ring->dev, dma))
2720 			goto dma_error;
2721 
2722 		/* record length, and DMA address */
2723 		dma_unmap_len_set(tx_bi, len, size);
2724 		dma_unmap_addr_set(tx_bi, dma, dma);
2725 
2726 		tx_desc->buffer_addr = cpu_to_le64(dma);
2727 
2728 		while (unlikely(size > I40E_MAX_DATA_PER_TXD)) {
2729 			tx_desc->cmd_type_offset_bsz =
2730 				build_ctob(td_cmd, td_offset,
2731 					   I40E_MAX_DATA_PER_TXD, td_tag);
2732 
2733 			tx_desc++;
2734 			i++;
2735 			desc_count++;
2736 
2737 			if (i == tx_ring->count) {
2738 				tx_desc = I40E_TX_DESC(tx_ring, 0);
2739 				i = 0;
2740 			}
2741 
2742 			dma += I40E_MAX_DATA_PER_TXD;
2743 			size -= I40E_MAX_DATA_PER_TXD;
2744 
2745 			tx_desc->buffer_addr = cpu_to_le64(dma);
2746 		}
2747 
2748 		if (likely(!data_len))
2749 			break;
2750 
2751 		tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
2752 							  size, td_tag);
2753 
2754 		tx_desc++;
2755 		i++;
2756 		desc_count++;
2757 
2758 		if (i == tx_ring->count) {
2759 			tx_desc = I40E_TX_DESC(tx_ring, 0);
2760 			i = 0;
2761 		}
2762 
2763 		size = skb_frag_size(frag);
2764 		data_len -= size;
2765 
2766 		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
2767 				       DMA_TO_DEVICE);
2768 
2769 		tx_bi = &tx_ring->tx_bi[i];
2770 	}
2771 
2772 	/* set next_to_watch value indicating a packet is present */
2773 	first->next_to_watch = tx_desc;
2774 
2775 	i++;
2776 	if (i == tx_ring->count)
2777 		i = 0;
2778 
2779 	tx_ring->next_to_use = i;
2780 
2781 	netdev_tx_sent_queue(netdev_get_tx_queue(tx_ring->netdev,
2782 						 tx_ring->queue_index),
2783 						 first->bytecount);
2784 	i40e_maybe_stop_tx(tx_ring, DESC_NEEDED);
2785 
2786 	/* Algorithm to optimize tail and RS bit setting:
2787 	 * if xmit_more is supported
2788 	 *	if xmit_more is true
2789 	 *		do not update tail and do not mark RS bit.
2790 	 *	if xmit_more is false and last xmit_more was false
2791 	 *		if every packet spanned less than 4 desc
2792 	 *			then set RS bit on 4th packet and update tail
2793 	 *			on every packet
2794 	 *		else
2795 	 *			update tail and set RS bit on every packet.
2796 	 *	if xmit_more is false and last_xmit_more was true
2797 	 *		update tail and set RS bit.
2798 	 *
2799 	 * Optimization: wmb to be issued only in case of tail update.
2800 	 * Also optimize the Descriptor WB path for RS bit with the same
2801 	 * algorithm.
2802 	 *
2803 	 * Note: If there are fewer than 4 packets
2804 	 * pending and interrupts were disabled the service task will
2805 	 * trigger a force WB.
2806 	 */
2807 	if (skb->xmit_more &&
2808 	    !netif_xmit_stopped(netdev_get_tx_queue(tx_ring->netdev,
2809 						    tx_ring->queue_index))) {
2810 		tx_ring->flags |= I40E_TXR_FLAGS_LAST_XMIT_MORE_SET;
2811 		tail_bump = false;
2812 	} else if (!skb->xmit_more &&
2813 		   !netif_xmit_stopped(netdev_get_tx_queue(tx_ring->netdev,
2814 						       tx_ring->queue_index)) &&
2815 		   (!(tx_ring->flags & I40E_TXR_FLAGS_LAST_XMIT_MORE_SET)) &&
2816 		   (tx_ring->packet_stride < WB_STRIDE) &&
2817 		   (desc_count < WB_STRIDE)) {
2818 		tx_ring->packet_stride++;
2819 	} else {
2820 		tx_ring->packet_stride = 0;
2821 		tx_ring->flags &= ~I40E_TXR_FLAGS_LAST_XMIT_MORE_SET;
2822 		do_rs = true;
2823 	}
2824 	if (do_rs)
2825 		tx_ring->packet_stride = 0;
2826 
2827 	tx_desc->cmd_type_offset_bsz =
2828 			build_ctob(td_cmd, td_offset, size, td_tag) |
2829 			cpu_to_le64((u64)(do_rs ? I40E_TXD_CMD :
2830 						  I40E_TX_DESC_CMD_EOP) <<
2831 						  I40E_TXD_QW1_CMD_SHIFT);
2832 
2833 	/* notify HW of packet */
2834 	if (!tail_bump)
2835 		prefetchw(tx_desc + 1);
2836 
2837 	if (tail_bump) {
2838 		/* Force memory writes to complete before letting h/w
2839 		 * know there are new descriptors to fetch.  (Only
2840 		 * applicable for weak-ordered memory model archs,
2841 		 * such as IA-64).
2842 		 */
2843 		wmb();
2844 		writel(i, tx_ring->tail);
2845 	}
2846 
2847 	return;
2848 
2849 dma_error:
2850 	dev_info(tx_ring->dev, "TX DMA map failed\n");
2851 
2852 	/* clear dma mappings for failed tx_bi map */
2853 	for (;;) {
2854 		tx_bi = &tx_ring->tx_bi[i];
2855 		i40e_unmap_and_free_tx_resource(tx_ring, tx_bi);
2856 		if (tx_bi == first)
2857 			break;
2858 		if (i == 0)
2859 			i = tx_ring->count;
2860 		i--;
2861 	}
2862 
2863 	tx_ring->next_to_use = i;
2864 }
2865 
2866 /**
2867  * i40e_xmit_frame_ring - Sends buffer on Tx ring
2868  * @skb:     send buffer
2869  * @tx_ring: ring to send buffer on
2870  *
2871  * Returns NETDEV_TX_OK if sent, else an error code
2872  **/
2873 static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
2874 					struct i40e_ring *tx_ring)
2875 {
2876 	u64 cd_type_cmd_tso_mss = I40E_TX_DESC_DTYPE_CONTEXT;
2877 	u32 cd_tunneling = 0, cd_l2tag2 = 0;
2878 	struct i40e_tx_buffer *first;
2879 	u32 td_offset = 0;
2880 	u32 tx_flags = 0;
2881 	__be16 protocol;
2882 	u32 td_cmd = 0;
2883 	u8 hdr_len = 0;
2884 	int tso, count;
2885 	int tsyn;
2886 
2887 	/* prefetch the data, we'll need it later */
2888 	prefetch(skb->data);
2889 
2890 	count = i40e_xmit_descriptor_count(skb);
2891 	if (i40e_chk_linearize(skb, count)) {
2892 		if (__skb_linearize(skb))
2893 			goto out_drop;
2894 		count = TXD_USE_COUNT(skb->len);
2895 		tx_ring->tx_stats.tx_linearize++;
2896 	}
2897 
2898 	/* need: 1 descriptor per page * PAGE_SIZE/I40E_MAX_DATA_PER_TXD,
2899 	 *       + 1 desc for skb_head_len/I40E_MAX_DATA_PER_TXD,
2900 	 *       + 4 desc gap to avoid the cache line where head is,
2901 	 *       + 1 desc for context descriptor,
2902 	 * otherwise try next time
2903 	 */
2904 	if (i40e_maybe_stop_tx(tx_ring, count + 4 + 1)) {
2905 		tx_ring->tx_stats.tx_busy++;
2906 		return NETDEV_TX_BUSY;
2907 	}
2908 
2909 	/* prepare the xmit flags */
2910 	if (i40e_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags))
2911 		goto out_drop;
2912 
2913 	/* obtain protocol of skb */
2914 	protocol = vlan_get_protocol(skb);
2915 
2916 	/* record the location of the first descriptor for this packet */
2917 	first = &tx_ring->tx_bi[tx_ring->next_to_use];
2918 
2919 	/* setup IPv4/IPv6 offloads */
2920 	if (protocol == htons(ETH_P_IP))
2921 		tx_flags |= I40E_TX_FLAGS_IPV4;
2922 	else if (protocol == htons(ETH_P_IPV6))
2923 		tx_flags |= I40E_TX_FLAGS_IPV6;
2924 
2925 	tso = i40e_tso(tx_ring, skb, &hdr_len, &cd_type_cmd_tso_mss);
2926 
2927 	if (tso < 0)
2928 		goto out_drop;
2929 	else if (tso)
2930 		tx_flags |= I40E_TX_FLAGS_TSO;
2931 
2932 	/* Always offload the checksum, since it's in the data descriptor */
2933 	tso = i40e_tx_enable_csum(skb, &tx_flags, &td_cmd, &td_offset,
2934 				  tx_ring, &cd_tunneling);
2935 	if (tso < 0)
2936 		goto out_drop;
2937 
2938 	tsyn = i40e_tsyn(tx_ring, skb, tx_flags, &cd_type_cmd_tso_mss);
2939 
2940 	if (tsyn)
2941 		tx_flags |= I40E_TX_FLAGS_TSYN;
2942 
2943 	skb_tx_timestamp(skb);
2944 
2945 	/* always enable CRC insertion offload */
2946 	td_cmd |= I40E_TX_DESC_CMD_ICRC;
2947 
2948 	i40e_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss,
2949 			   cd_tunneling, cd_l2tag2);
2950 
2951 	/* Add Flow Director ATR if it's enabled.
2952 	 *
2953 	 * NOTE: this must always be directly before the data descriptor.
2954 	 */
2955 	i40e_atr(tx_ring, skb, tx_flags);
2956 
2957 	i40e_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
2958 		    td_cmd, td_offset);
2959 
2960 	return NETDEV_TX_OK;
2961 
2962 out_drop:
2963 	dev_kfree_skb_any(skb);
2964 	return NETDEV_TX_OK;
2965 }
2966 
2967 /**
2968  * i40e_lan_xmit_frame - Selects the correct VSI and Tx queue to send buffer
2969  * @skb:    send buffer
2970  * @netdev: network interface device structure
2971  *
2972  * Returns NETDEV_TX_OK if sent, else an error code
2973  **/
2974 netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
2975 {
2976 	struct i40e_netdev_priv *np = netdev_priv(netdev);
2977 	struct i40e_vsi *vsi = np->vsi;
2978 	struct i40e_ring *tx_ring = vsi->tx_rings[skb->queue_mapping];
2979 
2980 	/* hardware can't handle really short frames; hardware padding works
2981 	 * beyond this point
2982 	 */
2983 	if (skb_put_padto(skb, I40E_MIN_TX_LEN))
2984 		return NETDEV_TX_OK;
2985 
2986 	return i40e_xmit_frame_ring(skb, tx_ring);
2987 }
2988