1 /*******************************************************************************
2  *
3  * Intel Ethernet Controller XL710 Family Linux Driver
4  * Copyright(c) 2013 - 2016 Intel Corporation.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  * The full GNU General Public License is included in this distribution in
19  * the file called "COPYING".
20  *
21  * Contact Information:
22  * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
23  * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
24  *
25  ******************************************************************************/
26 
27 #include <linux/prefetch.h>
28 #include <net/busy_poll.h>
29 #include "i40e.h"
30 #include "i40e_prototype.h"
31 
32 static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
33 				u32 td_tag)
34 {
35 	return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA |
36 			   ((u64)td_cmd  << I40E_TXD_QW1_CMD_SHIFT) |
37 			   ((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) |
38 			   ((u64)size  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) |
39 			   ((u64)td_tag  << I40E_TXD_QW1_L2TAG1_SHIFT));
40 }
41 
42 #define I40E_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
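
/* For reference, build_ctob() above packs the DTYPE, CMD, OFFSET, buffer
 * size and L2TAG1 fields into the Tx data descriptor's second quad word.
 * An illustrative single-buffer use, mirroring the dummy descriptor
 * programmed in i40e_program_fdir_filter() below, would be:
 *
 *	tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TXD_CMD, 0, size, 0);
 */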
43 /**
44  * i40e_fdir - Generate a Flow Director descriptor based on fdata
45  * @tx_ring: Tx ring to send buffer on
46  * @fdata: Flow director filter data
47  * @add: Indicate if we are adding a rule or deleting one
48  *
49  **/
50 static void i40e_fdir(struct i40e_ring *tx_ring,
51 		      struct i40e_fdir_filter *fdata, bool add)
52 {
53 	struct i40e_filter_program_desc *fdir_desc;
54 	struct i40e_pf *pf = tx_ring->vsi->back;
55 	u32 flex_ptype, dtype_cmd;
56 	u16 i;
57 
58 	/* grab the next descriptor */
59 	i = tx_ring->next_to_use;
60 	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);
61 
62 	i++;
63 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
64 
65 	flex_ptype = I40E_TXD_FLTR_QW0_QINDEX_MASK &
66 		     (fdata->q_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT);
67 
68 	flex_ptype |= I40E_TXD_FLTR_QW0_FLEXOFF_MASK &
69 		      (fdata->flex_off << I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT);
70 
71 	flex_ptype |= I40E_TXD_FLTR_QW0_PCTYPE_MASK &
72 		      (fdata->pctype << I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
73 
74 	/* Use LAN VSI Id if not programmed by user */
75 	flex_ptype |= I40E_TXD_FLTR_QW0_DEST_VSI_MASK &
76 		      ((u32)(fdata->dest_vsi ? : pf->vsi[pf->lan_vsi]->id) <<
77 		       I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT);
78 
79 	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;
80 
81 	dtype_cmd |= add ?
82 		     I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
83 		     I40E_TXD_FLTR_QW1_PCMD_SHIFT :
84 		     I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
85 		     I40E_TXD_FLTR_QW1_PCMD_SHIFT;
86 
87 	dtype_cmd |= I40E_TXD_FLTR_QW1_DEST_MASK &
88 		     (fdata->dest_ctl << I40E_TXD_FLTR_QW1_DEST_SHIFT);
89 
90 	dtype_cmd |= I40E_TXD_FLTR_QW1_FD_STATUS_MASK &
91 		     (fdata->fd_status << I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT);
92 
93 	if (fdata->cnt_index) {
94 		dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
95 		dtype_cmd |= I40E_TXD_FLTR_QW1_CNTINDEX_MASK &
96 			     ((u32)fdata->cnt_index <<
97 			      I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT);
98 	}
99 
100 	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
101 	fdir_desc->rsvd = cpu_to_le32(0);
102 	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
103 	fdir_desc->fd_id = cpu_to_le32(fdata->fd_id);
104 }
105 
106 #define I40E_FD_CLEAN_DELAY 10
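/* i40e_program_fdir_filter() below sleeps in 1 ms steps while waiting for
 * ring space, so I40E_FD_CLEAN_DELAY bounds that wait at roughly 10 ms.
 */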
107 /**
108  * i40e_program_fdir_filter - Program a Flow Director filter
 * @fdir_data: Flow Director filter data describing the filter parameters
110  * @raw_packet: the pre-allocated packet buffer for FDir
111  * @pf: The PF pointer
112  * @add: True for add/update, False for remove
113  **/
114 static int i40e_program_fdir_filter(struct i40e_fdir_filter *fdir_data,
115 				    u8 *raw_packet, struct i40e_pf *pf,
116 				    bool add)
117 {
118 	struct i40e_tx_buffer *tx_buf, *first;
119 	struct i40e_tx_desc *tx_desc;
120 	struct i40e_ring *tx_ring;
121 	struct i40e_vsi *vsi;
122 	struct device *dev;
123 	dma_addr_t dma;
124 	u32 td_cmd = 0;
125 	u16 delay = 0;
126 	u16 i;
127 
128 	/* find existing FDIR VSI */
129 	vsi = NULL;
130 	for (i = 0; i < pf->num_alloc_vsi; i++)
131 		if (pf->vsi[i] && pf->vsi[i]->type == I40E_VSI_FDIR)
132 			vsi = pf->vsi[i];
133 	if (!vsi)
134 		return -ENOENT;
135 
136 	tx_ring = vsi->tx_rings[0];
137 	dev = tx_ring->dev;
138 
139 	/* we need two descriptors to add/del a filter and we can wait */
140 	do {
141 		if (I40E_DESC_UNUSED(tx_ring) > 1)
142 			break;
143 		msleep_interruptible(1);
144 		delay++;
145 	} while (delay < I40E_FD_CLEAN_DELAY);
146 
147 	if (!(I40E_DESC_UNUSED(tx_ring) > 1))
148 		return -EAGAIN;
149 
150 	dma = dma_map_single(dev, raw_packet,
151 			     I40E_FDIR_MAX_RAW_PACKET_SIZE, DMA_TO_DEVICE);
152 	if (dma_mapping_error(dev, dma))
153 		goto dma_fail;
154 
155 	/* grab the next descriptor */
156 	i = tx_ring->next_to_use;
157 	first = &tx_ring->tx_bi[i];
158 	i40e_fdir(tx_ring, fdir_data, add);
159 
160 	/* Now program a dummy descriptor */
161 	i = tx_ring->next_to_use;
162 	tx_desc = I40E_TX_DESC(tx_ring, i);
163 	tx_buf = &tx_ring->tx_bi[i];
164 
165 	tx_ring->next_to_use = ((i + 1) < tx_ring->count) ? i + 1 : 0;
166 
167 	memset(tx_buf, 0, sizeof(struct i40e_tx_buffer));
168 
169 	/* record length, and DMA address */
170 	dma_unmap_len_set(tx_buf, len, I40E_FDIR_MAX_RAW_PACKET_SIZE);
171 	dma_unmap_addr_set(tx_buf, dma, dma);
172 
173 	tx_desc->buffer_addr = cpu_to_le64(dma);
174 	td_cmd = I40E_TXD_CMD | I40E_TX_DESC_CMD_DUMMY;
175 
176 	tx_buf->tx_flags = I40E_TX_FLAGS_FD_SB;
177 	tx_buf->raw_buf = (void *)raw_packet;
178 
179 	tx_desc->cmd_type_offset_bsz =
180 		build_ctob(td_cmd, 0, I40E_FDIR_MAX_RAW_PACKET_SIZE, 0);
181 
182 	/* Force memory writes to complete before letting h/w
183 	 * know there are new descriptors to fetch.
184 	 */
185 	wmb();
186 
187 	/* Mark the data descriptor to be watched */
188 	first->next_to_watch = tx_desc;
189 
190 	writel(tx_ring->next_to_use, tx_ring->tail);
191 	return 0;
192 
193 dma_fail:
194 	return -1;
195 }
196 
197 #define IP_HEADER_OFFSET 14
198 #define I40E_UDPIP_DUMMY_PACKET_LEN 42
199 /**
200  * i40e_add_del_fdir_udpv4 - Add/Remove UDPv4 filters
201  * @vsi: pointer to the targeted VSI
202  * @fd_data: the flow director data required for the FDir descriptor
203  * @add: true adds a filter, false removes it
204  *
205  * Returns 0 if the filters were successfully added or removed
206  **/
207 static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi,
208 				   struct i40e_fdir_filter *fd_data,
209 				   bool add)
210 {
211 	struct i40e_pf *pf = vsi->back;
212 	struct udphdr *udp;
213 	struct iphdr *ip;
214 	bool err = false;
215 	u8 *raw_packet;
216 	int ret;
217 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
218 		0x45, 0, 0, 0x1c, 0, 0, 0x40, 0, 0x40, 0x11, 0, 0, 0, 0, 0, 0,
219 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
220 
221 	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
222 	if (!raw_packet)
223 		return -ENOMEM;
224 	memcpy(raw_packet, packet, I40E_UDPIP_DUMMY_PACKET_LEN);
225 
226 	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
227 	udp = (struct udphdr *)(raw_packet + IP_HEADER_OFFSET
228 	      + sizeof(struct iphdr));
229 
230 	ip->daddr = fd_data->dst_ip[0];
231 	udp->dest = fd_data->dst_port;
232 	ip->saddr = fd_data->src_ip[0];
233 	udp->source = fd_data->src_port;
234 
235 	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
236 	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
237 	if (ret) {
238 		dev_info(&pf->pdev->dev,
239 			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
240 			 fd_data->pctype, fd_data->fd_id, ret);
241 		err = true;
242 	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
243 		if (add)
244 			dev_info(&pf->pdev->dev,
245 				 "Filter OK for PCTYPE %d loc = %d\n",
246 				 fd_data->pctype, fd_data->fd_id);
247 		else
248 			dev_info(&pf->pdev->dev,
249 				 "Filter deleted for PCTYPE %d loc = %d\n",
250 				 fd_data->pctype, fd_data->fd_id);
251 	}
252 	if (err)
253 		kfree(raw_packet);
254 
255 	return err ? -EOPNOTSUPP : 0;
256 }
257 
258 #define I40E_TCPIP_DUMMY_PACKET_LEN 54
259 /**
260  * i40e_add_del_fdir_tcpv4 - Add/Remove TCPv4 filters
261  * @vsi: pointer to the targeted VSI
262  * @fd_data: the flow director data required for the FDir descriptor
263  * @add: true adds a filter, false removes it
264  *
265  * Returns 0 if the filters were successfully added or removed
266  **/
267 static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi,
268 				   struct i40e_fdir_filter *fd_data,
269 				   bool add)
270 {
271 	struct i40e_pf *pf = vsi->back;
272 	struct tcphdr *tcp;
273 	struct iphdr *ip;
274 	bool err = false;
275 	u8 *raw_packet;
276 	int ret;
277 	/* Dummy packet */
278 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
279 		0x45, 0, 0, 0x28, 0, 0, 0x40, 0, 0x40, 0x6, 0, 0, 0, 0, 0, 0,
280 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80, 0x11,
281 		0x0, 0x72, 0, 0, 0, 0};
282 
283 	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
284 	if (!raw_packet)
285 		return -ENOMEM;
286 	memcpy(raw_packet, packet, I40E_TCPIP_DUMMY_PACKET_LEN);
287 
288 	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
289 	tcp = (struct tcphdr *)(raw_packet + IP_HEADER_OFFSET
290 	      + sizeof(struct iphdr));
291 
292 	ip->daddr = fd_data->dst_ip[0];
293 	tcp->dest = fd_data->dst_port;
294 	ip->saddr = fd_data->src_ip[0];
295 	tcp->source = fd_data->src_port;
296 
297 	if (add) {
298 		pf->fd_tcp_rule++;
299 		if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) &&
300 		    I40E_DEBUG_FD & pf->hw.debug_mask)
301 			dev_info(&pf->pdev->dev, "Forcing ATR off, sideband rules for TCP/IPv4 flow being applied\n");
302 		pf->auto_disable_flags |= I40E_FLAG_FD_ATR_ENABLED;
303 	} else {
304 		pf->fd_tcp_rule = (pf->fd_tcp_rule > 0) ?
305 				  (pf->fd_tcp_rule - 1) : 0;
306 		if (pf->fd_tcp_rule == 0) {
307 			if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) &&
308 			    I40E_DEBUG_FD & pf->hw.debug_mask)
309 				dev_info(&pf->pdev->dev, "ATR re-enabled due to no sideband TCP/IPv4 rules\n");
310 			pf->auto_disable_flags &= ~I40E_FLAG_FD_ATR_ENABLED;
311 		}
312 	}
313 
314 	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP;
315 	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
316 
317 	if (ret) {
318 		dev_info(&pf->pdev->dev,
319 			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
320 			 fd_data->pctype, fd_data->fd_id, ret);
321 		err = true;
322 	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
323 		if (add)
324 			dev_info(&pf->pdev->dev, "Filter OK for PCTYPE %d loc = %d)\n",
325 				 fd_data->pctype, fd_data->fd_id);
326 		else
327 			dev_info(&pf->pdev->dev,
328 				 "Filter deleted for PCTYPE %d loc = %d\n",
329 				 fd_data->pctype, fd_data->fd_id);
330 	}
331 
332 	if (err)
333 		kfree(raw_packet);
334 
335 	return err ? -EOPNOTSUPP : 0;
336 }
337 
338 /**
339  * i40e_add_del_fdir_sctpv4 - Add/Remove SCTPv4 Flow Director filters for
340  * a specific flow spec
341  * @vsi: pointer to the targeted VSI
342  * @fd_data: the flow director data required for the FDir descriptor
343  * @add: true adds a filter, false removes it
344  *
345  * Returns 0 if the filters were successfully added or removed
346  **/
347 static int i40e_add_del_fdir_sctpv4(struct i40e_vsi *vsi,
348 				    struct i40e_fdir_filter *fd_data,
349 				    bool add)
350 {
351 	return -EOPNOTSUPP;
352 }
353 
354 #define I40E_IP_DUMMY_PACKET_LEN 34
355 /**
356  * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow Director filters for
357  * a specific flow spec
358  * @vsi: pointer to the targeted VSI
359  * @fd_data: the flow director data required for the FDir descriptor
360  * @add: true adds a filter, false removes it
361  *
362  * Returns 0 if the filters were successfully added or removed
363  **/
364 static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi,
365 				  struct i40e_fdir_filter *fd_data,
366 				  bool add)
367 {
368 	struct i40e_pf *pf = vsi->back;
369 	struct iphdr *ip;
370 	bool err = false;
371 	u8 *raw_packet;
372 	int ret;
373 	int i;
374 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
375 		0x45, 0, 0, 0x14, 0, 0, 0x40, 0, 0x40, 0x10, 0, 0, 0, 0, 0, 0,
376 		0, 0, 0, 0};
377 
378 	for (i = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER;
379 	     i <= I40E_FILTER_PCTYPE_FRAG_IPV4;	i++) {
380 		raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
381 		if (!raw_packet)
382 			return -ENOMEM;
383 		memcpy(raw_packet, packet, I40E_IP_DUMMY_PACKET_LEN);
384 		ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
385 
386 		ip->saddr = fd_data->src_ip[0];
387 		ip->daddr = fd_data->dst_ip[0];
388 		ip->protocol = 0;
389 
390 		fd_data->pctype = i;
391 		ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
392 
393 		if (ret) {
394 			dev_info(&pf->pdev->dev,
395 				 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
396 				 fd_data->pctype, fd_data->fd_id, ret);
397 			err = true;
398 		} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
399 			if (add)
400 				dev_info(&pf->pdev->dev,
401 					 "Filter OK for PCTYPE %d loc = %d\n",
402 					 fd_data->pctype, fd_data->fd_id);
403 			else
404 				dev_info(&pf->pdev->dev,
405 					 "Filter deleted for PCTYPE %d loc = %d\n",
406 					 fd_data->pctype, fd_data->fd_id);
407 		}
408 	}
409 
410 	if (err)
411 		kfree(raw_packet);
412 
413 	return err ? -EOPNOTSUPP : 0;
414 }
415 
416 /**
417  * i40e_add_del_fdir - Build raw packets to add/del fdir filter
418  * @vsi: pointer to the targeted VSI
 * @input: flow director filter spec to add or delete
420  * @add: true adds a filter, false removes it
421  *
422  **/
423 int i40e_add_del_fdir(struct i40e_vsi *vsi,
424 		      struct i40e_fdir_filter *input, bool add)
425 {
426 	struct i40e_pf *pf = vsi->back;
427 	int ret;
428 
429 	switch (input->flow_type & ~FLOW_EXT) {
430 	case TCP_V4_FLOW:
431 		ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
432 		break;
433 	case UDP_V4_FLOW:
434 		ret = i40e_add_del_fdir_udpv4(vsi, input, add);
435 		break;
436 	case SCTP_V4_FLOW:
437 		ret = i40e_add_del_fdir_sctpv4(vsi, input, add);
438 		break;
439 	case IPV4_FLOW:
440 		ret = i40e_add_del_fdir_ipv4(vsi, input, add);
441 		break;
442 	case IP_USER_FLOW:
443 		switch (input->ip4_proto) {
444 		case IPPROTO_TCP:
445 			ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
446 			break;
447 		case IPPROTO_UDP:
448 			ret = i40e_add_del_fdir_udpv4(vsi, input, add);
449 			break;
450 		case IPPROTO_SCTP:
451 			ret = i40e_add_del_fdir_sctpv4(vsi, input, add);
452 			break;
453 		default:
454 			ret = i40e_add_del_fdir_ipv4(vsi, input, add);
455 			break;
456 		}
457 		break;
458 	default:
459 		dev_info(&pf->pdev->dev, "Could not specify spec type %d\n",
460 			 input->flow_type);
461 		ret = -EINVAL;
462 	}
463 
464 	/* The buffer allocated here is freed by the i40e_clean_tx_ring() */
465 	return ret;
466 }
467 
468 /**
469  * i40e_fd_handle_status - check the Programming Status for FD
470  * @rx_ring: the Rx ring for this descriptor
 * @rx_desc: the Rx descriptor for the programming status, not a packet descriptor
 * @prog_id: the id originally used for programming
 *
 * This is used to verify whether the FD programming or invalidation
 * requested by SW was accepted by HW, and to take action accordingly.
476  **/
477 static void i40e_fd_handle_status(struct i40e_ring *rx_ring,
478 				  union i40e_rx_desc *rx_desc, u8 prog_id)
479 {
480 	struct i40e_pf *pf = rx_ring->vsi->back;
481 	struct pci_dev *pdev = pf->pdev;
482 	u32 fcnt_prog, fcnt_avail;
483 	u32 error;
484 	u64 qw;
485 
486 	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
487 	error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
488 		I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
489 
490 	if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
491 		pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
492 		if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
493 		    (I40E_DEBUG_FD & pf->hw.debug_mask))
494 			dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
495 				 pf->fd_inv);
496 
497 		/* Check if the programming error is for ATR.
498 		 * If so, auto disable ATR and set a state for
499 		 * flush in progress. Next time we come here if flush is in
500 		 * progress do nothing, once flush is complete the state will
501 		 * be cleared.
502 		 */
503 		if (test_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state))
504 			return;
505 
506 		pf->fd_add_err++;
507 		/* store the current atr filter count */
508 		pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);
509 
510 		if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
511 		    (pf->auto_disable_flags & I40E_FLAG_FD_SB_ENABLED)) {
512 			pf->auto_disable_flags |= I40E_FLAG_FD_ATR_ENABLED;
513 			set_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state);
514 		}
515 
516 		/* filter programming failed most likely due to table full */
517 		fcnt_prog = i40e_get_global_fd_count(pf);
518 		fcnt_avail = pf->fdir_pf_filter_count;
519 		/* If ATR is running fcnt_prog can quickly change,
520 		 * if we are very close to full, it makes sense to disable
521 		 * FD ATR/SB and then re-enable it when there is room.
522 		 */
523 		if (fcnt_prog >= (fcnt_avail - I40E_FDIR_BUFFER_FULL_MARGIN)) {
524 			if ((pf->flags & I40E_FLAG_FD_SB_ENABLED) &&
525 			    !(pf->auto_disable_flags &
526 				     I40E_FLAG_FD_SB_ENABLED)) {
527 				if (I40E_DEBUG_FD & pf->hw.debug_mask)
528 					dev_warn(&pdev->dev, "FD filter space full, new ntuple rules will not be added\n");
529 				pf->auto_disable_flags |=
530 							I40E_FLAG_FD_SB_ENABLED;
531 			}
532 		}
533 	} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
534 		if (I40E_DEBUG_FD & pf->hw.debug_mask)
535 			dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
536 				 rx_desc->wb.qword0.hi_dword.fd_id);
537 	}
538 }
539 
540 /**
541  * i40e_unmap_and_free_tx_resource - Release a Tx buffer
542  * @ring:      the ring that owns the buffer
543  * @tx_buffer: the buffer to free
544  **/
545 static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
546 					    struct i40e_tx_buffer *tx_buffer)
547 {
548 	if (tx_buffer->skb) {
549 		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
550 			kfree(tx_buffer->raw_buf);
551 		else
552 			dev_kfree_skb_any(tx_buffer->skb);
553 		if (dma_unmap_len(tx_buffer, len))
554 			dma_unmap_single(ring->dev,
555 					 dma_unmap_addr(tx_buffer, dma),
556 					 dma_unmap_len(tx_buffer, len),
557 					 DMA_TO_DEVICE);
558 	} else if (dma_unmap_len(tx_buffer, len)) {
559 		dma_unmap_page(ring->dev,
560 			       dma_unmap_addr(tx_buffer, dma),
561 			       dma_unmap_len(tx_buffer, len),
562 			       DMA_TO_DEVICE);
563 	}
564 
565 	tx_buffer->next_to_watch = NULL;
566 	tx_buffer->skb = NULL;
567 	dma_unmap_len_set(tx_buffer, len, 0);
568 	/* tx_buffer must be completely set up in the transmit path */
569 }
570 
571 /**
 * i40e_clean_tx_ring - Free all Tx buffers in a ring
573  * @tx_ring: ring to be cleaned
574  **/
575 void i40e_clean_tx_ring(struct i40e_ring *tx_ring)
576 {
577 	unsigned long bi_size;
578 	u16 i;
579 
580 	/* ring already cleared, nothing to do */
581 	if (!tx_ring->tx_bi)
582 		return;
583 
584 	/* Free all the Tx ring sk_buffs */
585 	for (i = 0; i < tx_ring->count; i++)
586 		i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
587 
588 	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
589 	memset(tx_ring->tx_bi, 0, bi_size);
590 
591 	/* Zero out the descriptor ring */
592 	memset(tx_ring->desc, 0, tx_ring->size);
593 
594 	tx_ring->next_to_use = 0;
595 	tx_ring->next_to_clean = 0;
596 
597 	if (!tx_ring->netdev)
598 		return;
599 
600 	/* cleanup Tx queue statistics */
601 	netdev_tx_reset_queue(txring_txq(tx_ring));
602 }
603 
604 /**
605  * i40e_free_tx_resources - Free Tx resources per queue
606  * @tx_ring: Tx descriptor ring for a specific queue
607  *
608  * Free all transmit software resources
609  **/
610 void i40e_free_tx_resources(struct i40e_ring *tx_ring)
611 {
612 	i40e_clean_tx_ring(tx_ring);
613 	kfree(tx_ring->tx_bi);
614 	tx_ring->tx_bi = NULL;
615 
616 	if (tx_ring->desc) {
617 		dma_free_coherent(tx_ring->dev, tx_ring->size,
618 				  tx_ring->desc, tx_ring->dma);
619 		tx_ring->desc = NULL;
620 	}
621 }
622 
623 /**
 * i40e_get_tx_pending - how many Tx descriptors are not yet processed
 * @ring: the ring of descriptors
 * @in_sw: use the SW-tracked head (next_to_clean) instead of the HW head
627  *
628  * Since there is no access to the ring head register
629  * in XL710, we need to use our local copies
630  **/
631 u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw)
632 {
633 	u32 head, tail;
634 
635 	if (!in_sw)
636 		head = i40e_get_head(ring);
637 	else
638 		head = ring->next_to_clean;
639 	tail = readl(ring->tail);
640 
641 	if (head != tail)
642 		return (head < tail) ?
643 			tail - head : (tail + ring->count - head);
644 
645 	return 0;
646 }
647 
648 #define WB_STRIDE 0x3
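/* i40e_clean_tx_irq() below requests a descriptor write back (arm_wb) when
 * only 1 to WB_STRIDE descriptors remain pending, see the arm_wb handling
 * there.
 */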
649 
650 /**
651  * i40e_clean_tx_irq - Reclaim resources after transmit completes
652  * @vsi: the VSI we care about
653  * @tx_ring: Tx ring to clean
654  * @napi_budget: Used to determine if we are in netpoll
655  *
 * Returns true if there's any budget left (i.e. the clean is finished)
657  **/
658 static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
659 			      struct i40e_ring *tx_ring, int napi_budget)
660 {
661 	u16 i = tx_ring->next_to_clean;
662 	struct i40e_tx_buffer *tx_buf;
663 	struct i40e_tx_desc *tx_head;
664 	struct i40e_tx_desc *tx_desc;
665 	unsigned int total_bytes = 0, total_packets = 0;
666 	unsigned int budget = vsi->work_limit;
667 
668 	tx_buf = &tx_ring->tx_bi[i];
669 	tx_desc = I40E_TX_DESC(tx_ring, i);
670 	i -= tx_ring->count;
671 
672 	tx_head = I40E_TX_DESC(tx_ring, i40e_get_head(tx_ring));
673 
674 	do {
675 		struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch;
676 
677 		/* if next_to_watch is not set then there is no work pending */
678 		if (!eop_desc)
679 			break;
680 
681 		/* prevent any other reads prior to eop_desc */
682 		read_barrier_depends();
683 
684 		/* we have caught up to head, no work left to do */
685 		if (tx_head == tx_desc)
686 			break;
687 
688 		/* clear next_to_watch to prevent false hangs */
689 		tx_buf->next_to_watch = NULL;
690 
691 		/* update the statistics for this packet */
692 		total_bytes += tx_buf->bytecount;
693 		total_packets += tx_buf->gso_segs;
694 
695 		/* free the skb */
696 		napi_consume_skb(tx_buf->skb, napi_budget);
697 
698 		/* unmap skb header data */
699 		dma_unmap_single(tx_ring->dev,
700 				 dma_unmap_addr(tx_buf, dma),
701 				 dma_unmap_len(tx_buf, len),
702 				 DMA_TO_DEVICE);
703 
704 		/* clear tx_buffer data */
705 		tx_buf->skb = NULL;
706 		dma_unmap_len_set(tx_buf, len, 0);
707 
708 		/* unmap remaining buffers */
709 		while (tx_desc != eop_desc) {
710 
711 			tx_buf++;
712 			tx_desc++;
713 			i++;
714 			if (unlikely(!i)) {
715 				i -= tx_ring->count;
716 				tx_buf = tx_ring->tx_bi;
717 				tx_desc = I40E_TX_DESC(tx_ring, 0);
718 			}
719 
720 			/* unmap any remaining paged data */
721 			if (dma_unmap_len(tx_buf, len)) {
722 				dma_unmap_page(tx_ring->dev,
723 					       dma_unmap_addr(tx_buf, dma),
724 					       dma_unmap_len(tx_buf, len),
725 					       DMA_TO_DEVICE);
726 				dma_unmap_len_set(tx_buf, len, 0);
727 			}
728 		}
729 
730 		/* move us one more past the eop_desc for start of next pkt */
731 		tx_buf++;
732 		tx_desc++;
733 		i++;
734 		if (unlikely(!i)) {
735 			i -= tx_ring->count;
736 			tx_buf = tx_ring->tx_bi;
737 			tx_desc = I40E_TX_DESC(tx_ring, 0);
738 		}
739 
740 		prefetch(tx_desc);
741 
742 		/* update budget accounting */
743 		budget--;
744 	} while (likely(budget));
745 
746 	i += tx_ring->count;
747 	tx_ring->next_to_clean = i;
748 	u64_stats_update_begin(&tx_ring->syncp);
749 	tx_ring->stats.bytes += total_bytes;
750 	tx_ring->stats.packets += total_packets;
751 	u64_stats_update_end(&tx_ring->syncp);
752 	tx_ring->q_vector->tx.total_bytes += total_bytes;
753 	tx_ring->q_vector->tx.total_packets += total_packets;
754 
755 	if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) {
		/* if fewer than 4 descriptors are waiting to be written
		 * back, kick the hardware to force the write back in case
		 * we stay in NAPI.  In this mode the X722 does not enable
		 * an interrupt.
		 */
761 		unsigned int j = i40e_get_tx_pending(tx_ring, false);
762 
763 		if (budget &&
764 		    ((j / (WB_STRIDE + 1)) == 0) && (j != 0) &&
765 		    !test_bit(__I40E_DOWN, &vsi->state) &&
766 		    (I40E_DESC_UNUSED(tx_ring) != tx_ring->count))
767 			tx_ring->arm_wb = true;
768 	}
769 
770 	/* notify netdev of completed buffers */
771 	netdev_tx_completed_queue(txring_txq(tx_ring),
772 				  total_packets, total_bytes);
773 
774 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
775 	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
776 		     (I40E_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
777 		/* Make sure that anybody stopping the queue after this
778 		 * sees the new next_to_clean.
779 		 */
780 		smp_mb();
781 		if (__netif_subqueue_stopped(tx_ring->netdev,
782 					     tx_ring->queue_index) &&
783 		   !test_bit(__I40E_DOWN, &vsi->state)) {
784 			netif_wake_subqueue(tx_ring->netdev,
785 					    tx_ring->queue_index);
786 			++tx_ring->tx_stats.restart_queue;
787 		}
788 	}
789 
790 	return !!budget;
791 }
792 
793 /**
794  * i40e_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled
795  * @vsi: the VSI we care about
796  * @q_vector: the vector on which to enable writeback
797  *
798  **/
799 static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi,
800 				  struct i40e_q_vector *q_vector)
801 {
802 	u16 flags = q_vector->tx.ring[0].flags;
803 	u32 val;
804 
805 	if (!(flags & I40E_TXR_FLAGS_WB_ON_ITR))
806 		return;
807 
808 	if (q_vector->arm_wb_state)
809 		return;
810 
811 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
812 		val = I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK |
813 		      I40E_PFINT_DYN_CTLN_ITR_INDX_MASK; /* set noitr */
814 
815 		wr32(&vsi->back->hw,
816 		     I40E_PFINT_DYN_CTLN(q_vector->v_idx + vsi->base_vector - 1),
817 		     val);
818 	} else {
819 		val = I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK |
820 		      I40E_PFINT_DYN_CTL0_ITR_INDX_MASK; /* set noitr */
821 
822 		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
823 	}
824 	q_vector->arm_wb_state = true;
825 }
826 
827 /**
828  * i40e_force_wb - Issue SW Interrupt so HW does a wb
829  * @vsi: the VSI we care about
 * @q_vector: the vector on which to force writeback
831  *
832  **/
833 void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector)
834 {
835 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
836 		u32 val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
837 			  I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */
838 			  I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK |
839 			  I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK;
840 			  /* allow 00 to be written to the index */
841 
842 		wr32(&vsi->back->hw,
843 		     I40E_PFINT_DYN_CTLN(q_vector->v_idx +
844 					 vsi->base_vector - 1), val);
845 	} else {
846 		u32 val = I40E_PFINT_DYN_CTL0_INTENA_MASK |
847 			  I40E_PFINT_DYN_CTL0_ITR_INDX_MASK | /* set noitr */
848 			  I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK |
849 			  I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK;
850 			/* allow 00 to be written to the index */
851 
852 		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
853 	}
854 }
855 
856 /**
857  * i40e_set_new_dynamic_itr - Find new ITR level
858  * @rc: structure containing ring performance data
859  *
860  * Returns true if ITR changed, false if not
861  *
862  * Stores a new ITR value based on packets and byte counts during
863  * the last interrupt.  The advantage of per interrupt computation
864  * is faster updates and more accurate ITR for the current traffic
865  * pattern.  Constants in this function were computed based on
866  * theoretical maximum wire speed and thresholds were set based on
867  * testing data as well as attempting to minimize response time
868  * while increasing bulk throughput.
869  **/
870 static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
871 {
872 	enum i40e_latency_range new_latency_range = rc->latency_range;
873 	struct i40e_q_vector *qv = rc->ring->q_vector;
874 	u32 new_itr = rc->itr;
875 	int bytes_per_int;
876 	int usecs;
877 
878 	if (rc->total_packets == 0 || !rc->itr)
879 		return false;
880 
881 	/* simple throttlerate management
882 	 *   0-10MB/s   lowest (50000 ints/s)
883 	 *  10-20MB/s   low    (20000 ints/s)
884 	 *  20-1249MB/s bulk   (18000 ints/s)
885 	 *  > 40000 Rx packets per second (8000 ints/s)
886 	 *
887 	 * The math works out because the divisor is in 10^(-6) which
888 	 * turns the bytes/us input value into MB/s values, but
889 	 * make sure to use usecs, as the register values written
890 	 * are in 2 usec increments in the ITR registers, and make sure
891 	 * to use the smoothed values that the countdown timer gives us.
892 	 */
893 	usecs = (rc->itr << 1) * ITR_COUNTDOWN_START;
894 	bytes_per_int = rc->total_bytes / usecs;
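	/* Illustrative arithmetic: ~1,000,000 bytes over an interval of
	 * ~1000 usecs gives bytes_per_int of ~1000, i.e. roughly 1000 MB/s,
	 * which lands in the bulk latency range below.
	 */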
895 
896 	switch (new_latency_range) {
897 	case I40E_LOWEST_LATENCY:
898 		if (bytes_per_int > 10)
899 			new_latency_range = I40E_LOW_LATENCY;
900 		break;
901 	case I40E_LOW_LATENCY:
902 		if (bytes_per_int > 20)
903 			new_latency_range = I40E_BULK_LATENCY;
904 		else if (bytes_per_int <= 10)
905 			new_latency_range = I40E_LOWEST_LATENCY;
906 		break;
907 	case I40E_BULK_LATENCY:
908 	case I40E_ULTRA_LATENCY:
909 	default:
910 		if (bytes_per_int <= 20)
911 			new_latency_range = I40E_LOW_LATENCY;
912 		break;
913 	}
914 
915 	/* this is to adjust RX more aggressively when streaming small
916 	 * packets.  The value of 40000 was picked as it is just beyond
917 	 * what the hardware can receive per second if in low latency
918 	 * mode.
919 	 */
920 #define RX_ULTRA_PACKET_RATE 40000
921 
922 	if ((((rc->total_packets * 1000000) / usecs) > RX_ULTRA_PACKET_RATE) &&
923 	    (&qv->rx == rc))
924 		new_latency_range = I40E_ULTRA_LATENCY;
925 
926 	rc->latency_range = new_latency_range;
927 
928 	switch (new_latency_range) {
929 	case I40E_LOWEST_LATENCY:
930 		new_itr = I40E_ITR_50K;
931 		break;
932 	case I40E_LOW_LATENCY:
933 		new_itr = I40E_ITR_20K;
934 		break;
935 	case I40E_BULK_LATENCY:
936 		new_itr = I40E_ITR_18K;
937 		break;
938 	case I40E_ULTRA_LATENCY:
939 		new_itr = I40E_ITR_8K;
940 		break;
941 	default:
942 		break;
943 	}
944 
945 	rc->total_bytes = 0;
946 	rc->total_packets = 0;
947 
948 	if (new_itr != rc->itr) {
949 		rc->itr = new_itr;
950 		return true;
951 	}
952 
953 	return false;
954 }
955 
956 /**
957  * i40e_clean_programming_status - clean the programming status descriptor
958  * @rx_ring: the rx ring that has this descriptor
959  * @rx_desc: the rx descriptor written back by HW
960  *
 * Flow Director handles FD_FILTER_STATUS to check whether its filter
 * programming succeeded and takes action accordingly.  FCoE handles its
 * context/filter programming/invalidation status the same way.
964  *
965  **/
966 static void i40e_clean_programming_status(struct i40e_ring *rx_ring,
967 					  union i40e_rx_desc *rx_desc)
968 {
969 	u64 qw;
970 	u8 id;
971 
972 	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
973 	id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
974 		  I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
975 
976 	if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
977 		i40e_fd_handle_status(rx_ring, rx_desc, id);
978 #ifdef I40E_FCOE
979 	else if ((id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_PROG_STATUS) ||
980 		 (id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_INVL_STATUS))
981 		i40e_fcoe_handle_status(rx_ring, rx_desc, id);
982 #endif
983 }
984 
985 /**
986  * i40e_setup_tx_descriptors - Allocate the Tx descriptors
987  * @tx_ring: the tx ring to set up
988  *
989  * Return 0 on success, negative on error
990  **/
991 int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
992 {
993 	struct device *dev = tx_ring->dev;
994 	int bi_size;
995 
996 	if (!dev)
997 		return -ENOMEM;
998 
999 	/* warn if we are about to overwrite the pointer */
1000 	WARN_ON(tx_ring->tx_bi);
1001 	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
1002 	tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
1003 	if (!tx_ring->tx_bi)
1004 		goto err;
1005 
1006 	/* round up to nearest 4K */
1007 	tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc);
	/* add a u32 for the head write-back; the 4K alignment below also
	 * guarantees this extra space is at least one cache line in size
	 */
1011 	tx_ring->size += sizeof(u32);
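	/* the extra u32 lives just past the last descriptor; the hardware
	 * writes the current head position there and i40e_get_head() reads
	 * it back in the Tx clean path
	 */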
1012 	tx_ring->size = ALIGN(tx_ring->size, 4096);
1013 	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
1014 					   &tx_ring->dma, GFP_KERNEL);
1015 	if (!tx_ring->desc) {
1016 		dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
1017 			 tx_ring->size);
1018 		goto err;
1019 	}
1020 
1021 	tx_ring->next_to_use = 0;
1022 	tx_ring->next_to_clean = 0;
1023 	return 0;
1024 
1025 err:
1026 	kfree(tx_ring->tx_bi);
1027 	tx_ring->tx_bi = NULL;
1028 	return -ENOMEM;
1029 }
1030 
1031 /**
1032  * i40e_clean_rx_ring - Free Rx buffers
1033  * @rx_ring: ring to be cleaned
1034  **/
1035 void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
1036 {
1037 	struct device *dev = rx_ring->dev;
1038 	unsigned long bi_size;
1039 	u16 i;
1040 
1041 	/* ring already cleared, nothing to do */
1042 	if (!rx_ring->rx_bi)
1043 		return;
1044 
1045 	/* Free all the Rx ring sk_buffs */
1046 	for (i = 0; i < rx_ring->count; i++) {
1047 		struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
1048 
1049 		if (rx_bi->skb) {
1050 			dev_kfree_skb(rx_bi->skb);
1051 			rx_bi->skb = NULL;
1052 		}
1053 		if (!rx_bi->page)
1054 			continue;
1055 
1056 		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
1057 		__free_pages(rx_bi->page, 0);
1058 
1059 		rx_bi->page = NULL;
1060 		rx_bi->page_offset = 0;
1061 	}
1062 
1063 	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
1064 	memset(rx_ring->rx_bi, 0, bi_size);
1065 
1066 	/* Zero out the descriptor ring */
1067 	memset(rx_ring->desc, 0, rx_ring->size);
1068 
1069 	rx_ring->next_to_alloc = 0;
1070 	rx_ring->next_to_clean = 0;
1071 	rx_ring->next_to_use = 0;
1072 }
1073 
1074 /**
1075  * i40e_free_rx_resources - Free Rx resources
1076  * @rx_ring: ring to clean the resources from
1077  *
1078  * Free all receive software resources
1079  **/
1080 void i40e_free_rx_resources(struct i40e_ring *rx_ring)
1081 {
1082 	i40e_clean_rx_ring(rx_ring);
1083 	kfree(rx_ring->rx_bi);
1084 	rx_ring->rx_bi = NULL;
1085 
1086 	if (rx_ring->desc) {
1087 		dma_free_coherent(rx_ring->dev, rx_ring->size,
1088 				  rx_ring->desc, rx_ring->dma);
1089 		rx_ring->desc = NULL;
1090 	}
1091 }
1092 
1093 /**
1094  * i40e_setup_rx_descriptors - Allocate Rx descriptors
1095  * @rx_ring: Rx descriptor ring (for a specific queue) to setup
1096  *
1097  * Returns 0 on success, negative on failure
1098  **/
1099 int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
1100 {
1101 	struct device *dev = rx_ring->dev;
1102 	int bi_size;
1103 
1104 	/* warn if we are about to overwrite the pointer */
1105 	WARN_ON(rx_ring->rx_bi);
1106 	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
1107 	rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
1108 	if (!rx_ring->rx_bi)
1109 		goto err;
1110 
1111 	u64_stats_init(&rx_ring->syncp);
1112 
1113 	/* Round up to nearest 4K */
1114 	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
1115 	rx_ring->size = ALIGN(rx_ring->size, 4096);
1116 	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
1117 					   &rx_ring->dma, GFP_KERNEL);
1118 
1119 	if (!rx_ring->desc) {
1120 		dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
1121 			 rx_ring->size);
1122 		goto err;
1123 	}
1124 
1125 	rx_ring->next_to_alloc = 0;
1126 	rx_ring->next_to_clean = 0;
1127 	rx_ring->next_to_use = 0;
1128 
1129 	return 0;
1130 err:
1131 	kfree(rx_ring->rx_bi);
1132 	rx_ring->rx_bi = NULL;
1133 	return -ENOMEM;
1134 }
1135 
1136 /**
 * i40e_release_rx_desc - Bump the hardware tail and ring bookkeeping
 * @rx_ring: ring to bump
 * @val: new next_to_use value to write to the hardware tail
1140  **/
1141 static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
1142 {
1143 	rx_ring->next_to_use = val;
1144 
1145 	/* update next to alloc since we have filled the ring */
1146 	rx_ring->next_to_alloc = val;
1147 
1148 	/* Force memory writes to complete before letting h/w
1149 	 * know there are new descriptors to fetch.  (Only
1150 	 * applicable for weak-ordered memory model archs,
1151 	 * such as IA-64).
1152 	 */
1153 	wmb();
1154 	writel(val, rx_ring->tail);
1155 }
1156 
1157 /**
1158  * i40e_alloc_mapped_page - recycle or make a new page
1159  * @rx_ring: ring to use
1160  * @bi: rx_buffer struct to modify
1161  *
1162  * Returns true if the page was successfully allocated or
1163  * reused.
1164  **/
1165 static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
1166 				   struct i40e_rx_buffer *bi)
1167 {
1168 	struct page *page = bi->page;
1169 	dma_addr_t dma;
1170 
1171 	/* since we are recycling buffers we should seldom need to alloc */
1172 	if (likely(page)) {
1173 		rx_ring->rx_stats.page_reuse_count++;
1174 		return true;
1175 	}
1176 
1177 	/* alloc new page for storage */
1178 	page = dev_alloc_page();
1179 	if (unlikely(!page)) {
1180 		rx_ring->rx_stats.alloc_page_failed++;
1181 		return false;
1182 	}
1183 
1184 	/* map page for use */
1185 	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
1186 
1187 	/* if mapping failed free memory back to system since
1188 	 * there isn't much point in holding memory we can't use
1189 	 */
1190 	if (dma_mapping_error(rx_ring->dev, dma)) {
1191 		__free_pages(page, 0);
1192 		rx_ring->rx_stats.alloc_page_failed++;
1193 		return false;
1194 	}
1195 
1196 	bi->dma = dma;
1197 	bi->page = page;
1198 	bi->page_offset = 0;
1199 
1200 	return true;
1201 }
1202 
1203 /**
1204  * i40e_receive_skb - Send a completed packet up the stack
1205  * @rx_ring:  rx ring in play
1206  * @skb: packet to send up
1207  * @vlan_tag: vlan tag for packet
1208  **/
1209 static void i40e_receive_skb(struct i40e_ring *rx_ring,
1210 			     struct sk_buff *skb, u16 vlan_tag)
1211 {
1212 	struct i40e_q_vector *q_vector = rx_ring->q_vector;
1213 
1214 	if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
1215 	    (vlan_tag & VLAN_VID_MASK))
1216 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
1217 
1218 	napi_gro_receive(&q_vector->napi, skb);
1219 }
1220 
1221 /**
1222  * i40e_alloc_rx_buffers - Replace used receive buffers
1223  * @rx_ring: ring to place buffers on
1224  * @cleaned_count: number of buffers to replace
1225  *
1226  * Returns false if all allocations were successful, true if any fail
1227  **/
1228 bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
1229 {
1230 	u16 ntu = rx_ring->next_to_use;
1231 	union i40e_rx_desc *rx_desc;
1232 	struct i40e_rx_buffer *bi;
1233 
1234 	/* do nothing if no valid netdev defined */
1235 	if (!rx_ring->netdev || !cleaned_count)
1236 		return false;
1237 
1238 	rx_desc = I40E_RX_DESC(rx_ring, ntu);
1239 	bi = &rx_ring->rx_bi[ntu];
1240 
1241 	do {
1242 		if (!i40e_alloc_mapped_page(rx_ring, bi))
1243 			goto no_buffers;
1244 
1245 		/* Refresh the desc even if buffer_addrs didn't change
1246 		 * because each write-back erases this info.
1247 		 */
1248 		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
1249 		rx_desc->read.hdr_addr = 0;
1250 
1251 		rx_desc++;
1252 		bi++;
1253 		ntu++;
1254 		if (unlikely(ntu == rx_ring->count)) {
1255 			rx_desc = I40E_RX_DESC(rx_ring, 0);
1256 			bi = rx_ring->rx_bi;
1257 			ntu = 0;
1258 		}
1259 
1260 		/* clear the status bits for the next_to_use descriptor */
1261 		rx_desc->wb.qword1.status_error_len = 0;
1262 
1263 		cleaned_count--;
1264 	} while (cleaned_count);
1265 
1266 	if (rx_ring->next_to_use != ntu)
1267 		i40e_release_rx_desc(rx_ring, ntu);
1268 
1269 	return false;
1270 
1271 no_buffers:
1272 	if (rx_ring->next_to_use != ntu)
1273 		i40e_release_rx_desc(rx_ring, ntu);
1274 
1275 	/* make sure to come back via polling to try again after
1276 	 * allocation failure
1277 	 */
1278 	return true;
1279 }
1280 
1281 /**
1282  * i40e_rx_checksum - Indicate in skb if hw indicated a good cksum
1283  * @vsi: the VSI we care about
1284  * @skb: skb currently being received and modified
1285  * @rx_desc: the receive descriptor
1286  *
1287  * skb->protocol must be set before this function is called
1288  **/
1289 static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
1290 				    struct sk_buff *skb,
1291 				    union i40e_rx_desc *rx_desc)
1292 {
1293 	struct i40e_rx_ptype_decoded decoded;
1294 	u32 rx_error, rx_status;
1295 	bool ipv4, ipv6;
1296 	u8 ptype;
1297 	u64 qword;
1298 
1299 	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1300 	ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT;
1301 	rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >>
1302 		   I40E_RXD_QW1_ERROR_SHIFT;
1303 	rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
1304 		    I40E_RXD_QW1_STATUS_SHIFT;
1305 	decoded = decode_rx_desc_ptype(ptype);
1306 
1307 	skb->ip_summed = CHECKSUM_NONE;
1308 
1309 	skb_checksum_none_assert(skb);
1310 
1311 	/* Rx csum enabled and ip headers found? */
1312 	if (!(vsi->netdev->features & NETIF_F_RXCSUM))
1313 		return;
1314 
1315 	/* did the hardware decode the packet and checksum? */
1316 	if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT)))
1317 		return;
1318 
1319 	/* both known and outer_ip must be set for the below code to work */
1320 	if (!(decoded.known && decoded.outer_ip))
1321 		return;
1322 
1323 	ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
1324 	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4);
1325 	ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
1326 	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6);
1327 
1328 	if (ipv4 &&
1329 	    (rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) |
1330 			 BIT(I40E_RX_DESC_ERROR_EIPE_SHIFT))))
1331 		goto checksum_fail;
1332 
1333 	/* likely incorrect csum if alternate IP extension headers found */
1334 	if (ipv6 &&
1335 	    rx_status & BIT(I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))
1336 		/* don't increment checksum err here, non-fatal err */
1337 		return;
1338 
1339 	/* there was some L4 error, count error and punt packet to the stack */
1340 	if (rx_error & BIT(I40E_RX_DESC_ERROR_L4E_SHIFT))
1341 		goto checksum_fail;
1342 
1343 	/* handle packets that were not able to be checksummed due
1344 	 * to arrival speed, in this case the stack can compute
1345 	 * the csum.
1346 	 */
1347 	if (rx_error & BIT(I40E_RX_DESC_ERROR_PPRS_SHIFT))
1348 		return;
1349 
1350 	/* If there is an outer header present that might contain a checksum
1351 	 * we need to bump the checksum level by 1 to reflect the fact that
1352 	 * we are indicating we validated the inner checksum.
1353 	 */
1354 	if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT)
1355 		skb->csum_level = 1;
1356 
1357 	/* Only report checksum unnecessary for TCP, UDP, or SCTP */
1358 	switch (decoded.inner_prot) {
1359 	case I40E_RX_PTYPE_INNER_PROT_TCP:
1360 	case I40E_RX_PTYPE_INNER_PROT_UDP:
1361 	case I40E_RX_PTYPE_INNER_PROT_SCTP:
1362 		skb->ip_summed = CHECKSUM_UNNECESSARY;
		/* fall through */
1364 	default:
1365 		break;
1366 	}
1367 
1368 	return;
1369 
1370 checksum_fail:
1371 	vsi->back->hw_csum_rx_error++;
1372 }
1373 
1374 /**
1375  * i40e_ptype_to_htype - get a hash type
1376  * @ptype: the ptype value from the descriptor
1377  *
1378  * Returns a hash type to be used by skb_set_hash
1379  **/
1380 static inline int i40e_ptype_to_htype(u8 ptype)
1381 {
1382 	struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);
1383 
1384 	if (!decoded.known)
1385 		return PKT_HASH_TYPE_NONE;
1386 
1387 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1388 	    decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4)
1389 		return PKT_HASH_TYPE_L4;
1390 	else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1391 		 decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3)
1392 		return PKT_HASH_TYPE_L3;
1393 	else
1394 		return PKT_HASH_TYPE_L2;
1395 }
1396 
1397 /**
1398  * i40e_rx_hash - set the hash value in the skb
1399  * @ring: descriptor ring
 * @rx_desc: specific descriptor
 * @skb: skb currently being populated
 * @rx_ptype: the packet type decoded by hardware
 **/
1402 static inline void i40e_rx_hash(struct i40e_ring *ring,
1403 				union i40e_rx_desc *rx_desc,
1404 				struct sk_buff *skb,
1405 				u8 rx_ptype)
1406 {
1407 	u32 hash;
1408 	const __le64 rss_mask =
1409 		cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH <<
1410 			    I40E_RX_DESC_STATUS_FLTSTAT_SHIFT);
1411 
1412 	if (!(ring->netdev->features & NETIF_F_RXHASH))
1413 		return;
1414 
1415 	if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
1416 		hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
1417 		skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype));
1418 	}
1419 }
1420 
1421 /**
1422  * i40e_process_skb_fields - Populate skb header fields from Rx descriptor
1423  * @rx_ring: rx descriptor ring packet is being transacted on
1424  * @rx_desc: pointer to the EOP Rx descriptor
1425  * @skb: pointer to current skb being populated
1426  * @rx_ptype: the packet type decoded by hardware
1427  *
1428  * This function checks the ring, descriptor, and packet information in
1429  * order to populate the hash, checksum, VLAN, protocol, and
1430  * other fields within the skb.
1431  **/
1432 static inline
1433 void i40e_process_skb_fields(struct i40e_ring *rx_ring,
1434 			     union i40e_rx_desc *rx_desc, struct sk_buff *skb,
1435 			     u8 rx_ptype)
1436 {
1437 	u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1438 	u32 rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
1439 			I40E_RXD_QW1_STATUS_SHIFT;
1440 	u32 rsyn = (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >>
1441 		   I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT;
1442 
1443 	if (unlikely(rsyn)) {
1444 		i40e_ptp_rx_hwtstamp(rx_ring->vsi->back, skb, rsyn);
1445 		rx_ring->last_rx_timestamp = jiffies;
1446 	}
1447 
1448 	i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
1449 
1450 	/* modifies the skb - consumes the enet header */
1451 	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
1452 
1453 	i40e_rx_checksum(rx_ring->vsi, skb, rx_desc);
1454 
1455 	skb_record_rx_queue(skb, rx_ring->queue_index);
1456 }
1457 
1458 /**
1459  * i40e_pull_tail - i40e specific version of skb_pull_tail
1460  * @rx_ring: rx descriptor ring packet is being transacted on
1461  * @skb: pointer to current skb being adjusted
1462  *
1463  * This function is an i40e specific version of __pskb_pull_tail.  The
1464  * main difference between this version and the original function is that
1465  * this function can make several assumptions about the state of things
1466  * that allow for significant optimizations versus the standard function.
1467  * As a result we can do things like drop a frag and maintain an accurate
1468  * truesize for the skb.
1469  */
1470 static void i40e_pull_tail(struct i40e_ring *rx_ring, struct sk_buff *skb)
1471 {
1472 	struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
1473 	unsigned char *va;
1474 	unsigned int pull_len;
1475 
1476 	/* it is valid to use page_address instead of kmap since we are
1477 	 * working with pages allocated out of the lomem pool per
1478 	 * alloc_page(GFP_ATOMIC)
1479 	 */
1480 	va = skb_frag_address(frag);
1481 
	/* the linear area needs to hold at least ETH_HLEN bytes, or 60
	 * bytes when skb->len is under 60, so that eth_skb_pad() can pad
	 * within linear data.
	 */
1485 	pull_len = eth_get_headlen(va, I40E_RX_HDR_SIZE);
1486 
1487 	/* align pull length to size of long to optimize memcpy performance */
1488 	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
1489 
1490 	/* update all of the pointers */
1491 	skb_frag_size_sub(frag, pull_len);
1492 	frag->page_offset += pull_len;
1493 	skb->data_len -= pull_len;
1494 	skb->tail += pull_len;
1495 }
1496 
1497 /**
1498  * i40e_cleanup_headers - Correct empty headers
1499  * @rx_ring: rx descriptor ring packet is being transacted on
1500  * @skb: pointer to current skb being fixed
1501  *
1502  * Also address the case where we are pulling data in on pages only
1503  * and as such no data is present in the skb header.
1504  *
1505  * In addition if skb is not at least 60 bytes we need to pad it so that
1506  * it is large enough to qualify as a valid Ethernet frame.
1507  *
1508  * Returns true if an error was encountered and skb was freed.
1509  **/
1510 static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb)
1511 {
1512 	/* place header in linear portion of buffer */
1513 	if (skb_is_nonlinear(skb))
1514 		i40e_pull_tail(rx_ring, skb);
1515 
1516 	/* if eth_skb_pad returns an error the skb was freed */
1517 	if (eth_skb_pad(skb))
1518 		return true;
1519 
1520 	return false;
1521 }
1522 
1523 /**
1524  * i40e_reuse_rx_page - page flip buffer and store it back on the ring
1525  * @rx_ring: rx descriptor ring to store buffers on
1526  * @old_buff: donor buffer to have page reused
1527  *
1528  * Synchronizes page for reuse by the adapter
1529  **/
1530 static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
1531 			       struct i40e_rx_buffer *old_buff)
1532 {
1533 	struct i40e_rx_buffer *new_buff;
1534 	u16 nta = rx_ring->next_to_alloc;
1535 
1536 	new_buff = &rx_ring->rx_bi[nta];
1537 
1538 	/* update, and store next to alloc */
1539 	nta++;
1540 	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
1541 
1542 	/* transfer page from old buffer to new buffer */
1543 	*new_buff = *old_buff;
1544 }
1545 
1546 /**
1547  * i40e_page_is_reserved - check if reuse is possible
1548  * @page: page struct to check
1549  */
1550 static inline bool i40e_page_is_reserved(struct page *page)
1551 {
1552 	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
1553 }
1554 
1555 /**
1556  * i40e_add_rx_frag - Add contents of Rx buffer to sk_buff
1557  * @rx_ring: rx descriptor ring to transact packets on
1558  * @rx_buffer: buffer containing page to add
1559  * @rx_desc: descriptor containing length of buffer written by hardware
1560  * @skb: sk_buff to place the data into
1561  *
1562  * This function will add the data contained in rx_buffer->page to the skb.
1563  * This is done either through a direct copy if the data in the buffer is
1564  * less than the skb header size, otherwise it will just attach the page as
1565  * a frag to the skb.
1566  *
1567  * The function will then update the page offset if necessary and return
1568  * true if the buffer can be reused by the adapter.
1569  **/
1570 static bool i40e_add_rx_frag(struct i40e_ring *rx_ring,
1571 			     struct i40e_rx_buffer *rx_buffer,
1572 			     union i40e_rx_desc *rx_desc,
1573 			     struct sk_buff *skb)
1574 {
1575 	struct page *page = rx_buffer->page;
1576 	u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1577 	unsigned int size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1578 			    I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1579 #if (PAGE_SIZE < 8192)
1580 	unsigned int truesize = I40E_RXBUFFER_2048;
1581 #else
1582 	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
1583 	unsigned int last_offset = PAGE_SIZE - I40E_RXBUFFER_2048;
1584 #endif
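
	/* with 4K pages the buffer is used as two fixed 2048 byte halves and
	 * page_offset simply flips between them; with larger pages the
	 * offset walks forward in truesize steps until it passes last_offset
	 */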
1585 
1586 	/* will the data fit in the skb we allocated? if so, just
1587 	 * copy it as it is pretty small anyway
1588 	 */
1589 	if ((size <= I40E_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) {
1590 		unsigned char *va = page_address(page) + rx_buffer->page_offset;
1591 
1592 		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
1593 
1594 		/* page is not reserved, we can reuse buffer as-is */
1595 		if (likely(!i40e_page_is_reserved(page)))
1596 			return true;
1597 
1598 		/* this page cannot be reused so discard it */
1599 		__free_pages(page, 0);
1600 		return false;
1601 	}
1602 
1603 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
1604 			rx_buffer->page_offset, size, truesize);
1605 
1606 	/* avoid re-using remote pages */
1607 	if (unlikely(i40e_page_is_reserved(page)))
1608 		return false;
1609 
1610 #if (PAGE_SIZE < 8192)
1611 	/* if we are only owner of page we can reuse it */
1612 	if (unlikely(page_count(page) != 1))
1613 		return false;
1614 
1615 	/* flip page offset to other buffer */
1616 	rx_buffer->page_offset ^= truesize;
1617 #else
1618 	/* move offset up to the next cache line */
1619 	rx_buffer->page_offset += truesize;
1620 
1621 	if (rx_buffer->page_offset > last_offset)
1622 		return false;
1623 #endif
1624 
1625 	/* Even if we own the page, we are not allowed to use atomic_set()
1626 	 * This would break get_page_unless_zero() users.
1627 	 */
1628 	get_page(rx_buffer->page);
1629 
1630 	return true;
1631 }
1632 
1633 /**
1634  * i40e_fetch_rx_buffer - Allocate skb and populate it
1635  * @rx_ring: rx descriptor ring to transact packets on
1636  * @rx_desc: descriptor containing info written by hardware
1637  *
1638  * This function allocates an skb on the fly, and populates it with the page
1639  * data from the current receive descriptor, taking care to set up the skb
1640  * correctly, as well as handling calling the page recycle function if
1641  * necessary.
1642  */
1643 static inline
1644 struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
1645 				     union i40e_rx_desc *rx_desc)
1646 {
1647 	struct i40e_rx_buffer *rx_buffer;
1648 	struct sk_buff *skb;
1649 	struct page *page;
1650 
1651 	rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
1652 	page = rx_buffer->page;
1653 	prefetchw(page);
1654 
1655 	skb = rx_buffer->skb;
1656 
1657 	if (likely(!skb)) {
1658 		void *page_addr = page_address(page) + rx_buffer->page_offset;
1659 
1660 		/* prefetch first cache line of first page */
1661 		prefetch(page_addr);
1662 #if L1_CACHE_BYTES < 128
1663 		prefetch(page_addr + L1_CACHE_BYTES);
1664 #endif
1665 
1666 		/* allocate a skb to store the frags */
1667 		skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
1668 				       I40E_RX_HDR_SIZE,
1669 				       GFP_ATOMIC | __GFP_NOWARN);
1670 		if (unlikely(!skb)) {
1671 			rx_ring->rx_stats.alloc_buff_failed++;
1672 			return NULL;
1673 		}
1674 
		/* we will be copying headers into skb->data when the frag
		 * is pulled/copied in, so it is in our interest to prefetch
		 * it now to avoid a possible cache miss
		 */
1679 		prefetchw(skb->data);
1680 	} else {
1681 		rx_buffer->skb = NULL;
1682 	}
1683 
1684 	/* we are reusing so sync this buffer for CPU use */
1685 	dma_sync_single_range_for_cpu(rx_ring->dev,
1686 				      rx_buffer->dma,
1687 				      rx_buffer->page_offset,
1688 				      I40E_RXBUFFER_2048,
1689 				      DMA_FROM_DEVICE);
1690 
1691 	/* pull page into skb */
1692 	if (i40e_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
1693 		/* hand second half of page back to the ring */
1694 		i40e_reuse_rx_page(rx_ring, rx_buffer);
1695 		rx_ring->rx_stats.page_reuse_count++;
1696 	} else {
1697 		/* we are not reusing the buffer so unmap it */
1698 		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
1699 			       DMA_FROM_DEVICE);
1700 	}
1701 
1702 	/* clear contents of buffer_info */
1703 	rx_buffer->page = NULL;
1704 
1705 	return skb;
1706 }
1707 
1708 /**
1709  * i40e_is_non_eop - process handling of non-EOP buffers
1710  * @rx_ring: Rx ring being processed
1711  * @rx_desc: Rx descriptor for current buffer
1712  * @skb: Current socket buffer containing buffer in progress
1713  *
1714  * This function updates next to clean.  If the buffer is an EOP buffer
1715  * this function exits returning false, otherwise it will place the
1716  * sk_buff in the next buffer to be chained and return true indicating
1717  * that this is in fact a non-EOP buffer.
1718  **/
1719 static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
1720 			    union i40e_rx_desc *rx_desc,
1721 			    struct sk_buff *skb)
1722 {
1723 	u32 ntc = rx_ring->next_to_clean + 1;
1724 
1725 	/* fetch, update, and store next to clean */
1726 	ntc = (ntc < rx_ring->count) ? ntc : 0;
1727 	rx_ring->next_to_clean = ntc;
1728 
1729 	prefetch(I40E_RX_DESC(rx_ring, ntc));
1730 
1731 #define staterrlen rx_desc->wb.qword1.status_error_len
1732 	if (unlikely(i40e_rx_is_programming_status(le64_to_cpu(staterrlen)))) {
1733 		i40e_clean_programming_status(rx_ring, rx_desc);
1734 		rx_ring->rx_bi[ntc].skb = skb;
1735 		return true;
1736 	}
1737 	/* if we are the last buffer then there is nothing else to do */
1738 #define I40E_RXD_EOF BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)
1739 	if (likely(i40e_test_staterr(rx_desc, I40E_RXD_EOF)))
1740 		return false;
1741 
1742 	/* place skb in next buffer to be received */
1743 	rx_ring->rx_bi[ntc].skb = skb;
1744 	rx_ring->rx_stats.non_eop_descs++;
1745 
1746 	return true;
1747 }
1748 
1749 /**
1750  * i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
1751  * @rx_ring: rx descriptor ring to transact packets on
1752  * @budget: Total limit on number of packets to process
1753  *
1754  * This function provides a "bounce buffer" approach to Rx interrupt
1755  * processing.  The advantage to this is that on systems that have
1756  * expensive overhead for IOMMU access this provides a means of avoiding
1757  * it by maintaining the mapping of the page to the system.
1758  *
1759  * Returns amount of work completed
1760  **/
1761 static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
1762 {
1763 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
1764 	u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
1765 	bool failure = false;
1766 
1767 	while (likely(total_rx_packets < budget)) {
1768 		union i40e_rx_desc *rx_desc;
1769 		struct sk_buff *skb;
1770 		u32 rx_status;
1771 		u16 vlan_tag;
1772 		u8 rx_ptype;
1773 		u64 qword;
1774 
1775 		/* return some buffers to hardware, one at a time is too slow */
1776 		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
1777 			failure = failure ||
1778 				  i40e_alloc_rx_buffers(rx_ring, cleaned_count);
1779 			cleaned_count = 0;
1780 		}
1781 
1782 		rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
1783 
1784 		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1785 		rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
1786 			   I40E_RXD_QW1_PTYPE_SHIFT;
1787 		rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
1788 			    I40E_RXD_QW1_STATUS_SHIFT;
1789 
1790 		if (!(rx_status & BIT(I40E_RX_DESC_STATUS_DD_SHIFT)))
1791 			break;
1792 
1793 		/* status_error_len will always be zero for unused descriptors
1794 		 * because it's cleared in cleanup, and overlaps with hdr_addr
1795 		 * which is always zero because packet split isn't used; if the
1796 		 * hardware wrote DD then it will be non-zero
1797 		 */
1798 		if (!rx_desc->wb.qword1.status_error_len)
1799 			break;
1800 
1801 		/* This memory barrier is needed to keep us from reading
1802 		 * any other fields out of the rx_desc until we know the
1803 		 * DD bit is set.
1804 		 */
1805 		dma_rmb();
1806 
1807 		skb = i40e_fetch_rx_buffer(rx_ring, rx_desc);
1808 		if (!skb)
1809 			break;
1810 
1811 		cleaned_count++;
1812 
1813 		if (i40e_is_non_eop(rx_ring, rx_desc, skb))
1814 			continue;
1815 
1816 		/* ERR_MASK will only have valid bits if EOP set, and
1817 		 * what we are doing here is actually checking
1818 		 * I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
1819 		 * the error field
1820 		 */
1821 		if (unlikely(i40e_test_staterr(rx_desc, BIT(I40E_RXD_QW1_ERROR_SHIFT)))) {
1822 			dev_kfree_skb_any(skb);
1823 			continue;
1824 		}
1825 
1826 		if (i40e_cleanup_headers(rx_ring, skb))
1827 			continue;
1828 
1829 		/* probably a little skewed due to removing CRC */
1830 		total_rx_bytes += skb->len;
1831 
1832 		/* populate checksum, VLAN, and protocol */
1833 		i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
1834 
1835 #ifdef I40E_FCOE
1836 		if (unlikely(
1837 		    i40e_rx_is_fcoe(rx_ptype) &&
1838 		    !i40e_fcoe_handle_offload(rx_ring, rx_desc, skb))) {
1839 			dev_kfree_skb_any(skb);
1840 			continue;
1841 		}
1842 #endif
1843 
1844 		vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
1845 			   le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0;
1846 
1847 		i40e_receive_skb(rx_ring, skb, vlan_tag);
1848 
1849 		/* update budget accounting */
1850 		total_rx_packets++;
1851 	}
1852 
1853 	u64_stats_update_begin(&rx_ring->syncp);
1854 	rx_ring->stats.packets += total_rx_packets;
1855 	rx_ring->stats.bytes += total_rx_bytes;
1856 	u64_stats_update_end(&rx_ring->syncp);
1857 	rx_ring->q_vector->rx.total_packets += total_rx_packets;
1858 	rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
1859 
1860 	/* guarantee a trip back through this routine if there was a failure */
1861 	return failure ? budget : total_rx_packets;
1862 }
1863 
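/* Helper that composes a PFINT_DYN_CTLN register value which re-enables the
 * interrupt and optionally loads a new ITR.  As a rough sketch of the intended
 * use (interval units and index encodings are per the XL710 register
 * definitions; treat the exact numbers as illustrative): passing I40E_RX_ITR
 * with an interval of 25 selects the Rx ITR index and programs roughly 50 usec
 * between interrupts, since the hardware counts the interval in 2 usec units,
 * while I40E_ITR_NONE re-enables the interrupt without touching any interval.
 */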
1864 static u32 i40e_buildreg_itr(const int type, const u16 itr)
1865 {
1866 	u32 val;
1867 
1868 	val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
1869 	      /* Don't clear PBA because that can cause lost interrupts that
1870 	       * came in while we were cleaning/polling
1871 	       */
1872 	      (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) |
1873 	      (itr << I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT);
1874 
1875 	return val;
1876 }
1877 
1878 /* a small macro to shorten up some long lines */
1879 #define INTREG I40E_PFINT_DYN_CTLN
1880 static inline int get_rx_itr_enabled(struct i40e_vsi *vsi, int idx)
1881 {
1882 	return !!(vsi->rx_rings[idx]->rx_itr_setting);
1883 }
1884 
1885 static inline int get_tx_itr_enabled(struct i40e_vsi *vsi, int idx)
1886 {
1887 	return !!(vsi->tx_rings[idx]->tx_itr_setting);
1888 }
1889 
1890 /**
1891  * i40e_update_enable_itr - Update itr and re-enable MSIX interrupt
1892  * @vsi: the VSI we care about
1893  * @q_vector: q_vector for which itr is being updated and interrupt enabled
1894  *
1895  **/
1896 static inline void i40e_update_enable_itr(struct i40e_vsi *vsi,
1897 					  struct i40e_q_vector *q_vector)
1898 {
1899 	struct i40e_hw *hw = &vsi->back->hw;
1900 	bool rx = false, tx = false;
1901 	u32 rxval, txval;
1902 	int vector;
1903 	int idx = q_vector->v_idx;
1904 	int rx_itr_setting, tx_itr_setting;
1905 
1906 	vector = (q_vector->v_idx + vsi->base_vector);
1907 
1908 	/* avoid dynamic calculation if in countdown mode OR if
1909 	 * all dynamic is disabled
1910 	 */
1911 	rxval = txval = i40e_buildreg_itr(I40E_ITR_NONE, 0);
1912 
1913 	rx_itr_setting = get_rx_itr_enabled(vsi, idx);
1914 	tx_itr_setting = get_tx_itr_enabled(vsi, idx);
1915 
1916 	if (q_vector->itr_countdown > 0 ||
1917 	    (!ITR_IS_DYNAMIC(rx_itr_setting) &&
1918 	     !ITR_IS_DYNAMIC(tx_itr_setting))) {
1919 		goto enable_int;
1920 	}
1921 
1922 	if (ITR_IS_DYNAMIC(rx_itr_setting)) {
1923 		rx = i40e_set_new_dynamic_itr(&q_vector->rx);
1924 		rxval = i40e_buildreg_itr(I40E_RX_ITR, q_vector->rx.itr);
1925 	}
1926 
1927 	if (ITR_IS_DYNAMIC(tx_itr_setting)) {
1928 		tx = i40e_set_new_dynamic_itr(&q_vector->tx);
1929 		txval = i40e_buildreg_itr(I40E_TX_ITR, q_vector->tx.itr);
1930 	}
1931 
1932 	if (rx || tx) {
1933 		/* get the higher of the two ITR adjustments and
1934 		 * use the same value for both ITR registers
1935 		 * when in adaptive mode (Rx and/or Tx)
1936 		 */
1937 		u16 itr = max(q_vector->tx.itr, q_vector->rx.itr);
1938 
1939 		q_vector->tx.itr = q_vector->rx.itr = itr;
1940 		txval = i40e_buildreg_itr(I40E_TX_ITR, itr);
1941 		tx = true;
1942 		rxval = i40e_buildreg_itr(I40E_RX_ITR, itr);
1943 		rx = true;
1944 	}
1945 
1946 	/* only need to enable the interrupt once, but need
1947 	 * to possibly update both ITR values
1948 	 */
1949 	if (rx) {
1950 		/* set the INTENA_MSK_MASK so that this first write
1951 		 * won't actually enable the interrupt, instead just
1952 		 * updating the ITR (it's bit 31 PF and VF)
1953 		 */
1954 		rxval |= BIT(31);
1955 		/* don't check _DOWN because interrupt isn't being enabled */
1956 		wr32(hw, INTREG(vector - 1), rxval);
1957 	}
1958 
1959 enable_int:
1960 	if (!test_bit(__I40E_DOWN, &vsi->state))
1961 		wr32(hw, INTREG(vector - 1), txval);
1962 
1963 	if (q_vector->itr_countdown)
1964 		q_vector->itr_countdown--;
1965 	else
1966 		q_vector->itr_countdown = ITR_COUNTDOWN_START;
1967 }
1968 
1969 /**
1970  * i40e_napi_poll - NAPI polling Rx/Tx cleanup routine
1971  * @napi: napi struct with our devices info in it
1972  * @budget: amount of work driver is allowed to do this pass, in packets
1973  *
1974  * This function will clean all queues associated with a q_vector.
1975  *
1976  * Returns the amount of work done
1977  **/
1978 int i40e_napi_poll(struct napi_struct *napi, int budget)
1979 {
1980 	struct i40e_q_vector *q_vector =
1981 			       container_of(napi, struct i40e_q_vector, napi);
1982 	struct i40e_vsi *vsi = q_vector->vsi;
1983 	struct i40e_ring *ring;
1984 	bool clean_complete = true;
1985 	bool arm_wb = false;
1986 	int budget_per_ring;
1987 	int work_done = 0;
1988 
1989 	if (test_bit(__I40E_DOWN, &vsi->state)) {
1990 		napi_complete(napi);
1991 		return 0;
1992 	}
1993 
1994 	/* Clear hung_detected bit */
1995 	clear_bit(I40E_Q_VECTOR_HUNG_DETECT, &q_vector->hung_detected);
1996 	/* Since the actual Tx work is minimal, we can give the Tx a larger
1997 	 * budget and be more aggressive about cleaning up the Tx descriptors.
1998 	 */
1999 	i40e_for_each_ring(ring, q_vector->tx) {
2000 		if (!i40e_clean_tx_irq(vsi, ring, budget)) {
2001 			clean_complete = false;
2002 			continue;
2003 		}
2004 		arm_wb |= ring->arm_wb;
2005 		ring->arm_wb = false;
2006 	}
2007 
2008 	/* Handle case where we are called by netpoll with a budget of 0 */
2009 	if (budget <= 0)
2010 		goto tx_only;
2011 
2012 	/* We attempt to distribute budget to each Rx queue fairly, but don't
2013 	 * allow the budget to go below 1 because that would exit polling early.
2014 	 */
2015 	budget_per_ring = max(budget/q_vector->num_ringpairs, 1);
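
	/* e.g. the default NAPI budget of 64 spread across 4 ring pairs gives
	 * each Rx ring a 16 packet budget; the max() keeps a vector with many
	 * rings from rounding its per-ring budget down to zero.
	 */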
2016 
2017 	i40e_for_each_ring(ring, q_vector->rx) {
2018 		int cleaned = i40e_clean_rx_irq(ring, budget_per_ring);
2019 
2020 		work_done += cleaned;
2021 		/* if we clean as many as budgeted, we must not be done */
2022 		if (cleaned >= budget_per_ring)
2023 			clean_complete = false;
2024 	}
2025 
2026 	/* If work not completed, return budget and polling will return */
2027 	if (!clean_complete) {
2028 tx_only:
2029 		if (arm_wb) {
2030 			q_vector->tx.ring[0].tx_stats.tx_force_wb++;
2031 			i40e_enable_wb_on_itr(vsi, q_vector);
2032 		}
2033 		return budget;
2034 	}
2035 
2036 	if (vsi->back->flags & I40E_TXR_FLAGS_WB_ON_ITR)
2037 		q_vector->arm_wb_state = false;
2038 
2039 	/* Work is done so exit the polling mode and re-enable the interrupt */
2040 	napi_complete_done(napi, work_done);
2041 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
2042 		i40e_update_enable_itr(vsi, q_vector);
2043 	} else { /* Legacy mode */
2044 		i40e_irq_dynamic_enable_icr0(vsi->back, false);
2045 	}
2046 	return 0;
2047 }
2048 
2049 /**
2050  * i40e_atr - Add a Flow Director ATR filter
2051  * @tx_ring:  ring to add programming descriptor to
2052  * @skb:      send buffer
2053  * @tx_flags: send tx flags
2054  **/
2055 static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
2056 		     u32 tx_flags)
2057 {
2058 	struct i40e_filter_program_desc *fdir_desc;
2059 	struct i40e_pf *pf = tx_ring->vsi->back;
2060 	union {
2061 		unsigned char *network;
2062 		struct iphdr *ipv4;
2063 		struct ipv6hdr *ipv6;
2064 	} hdr;
2065 	struct tcphdr *th;
2066 	unsigned int hlen;
2067 	u32 flex_ptype, dtype_cmd;
2068 	int l4_proto;
2069 	u16 i;
2070 
2071 	/* make sure ATR is enabled */
2072 	if (!(pf->flags & I40E_FLAG_FD_ATR_ENABLED))
2073 		return;
2074 
2075 	if (pf->auto_disable_flags & I40E_FLAG_FD_ATR_ENABLED)
2076 		return;
2077 
2078 	/* if sampling is disabled do nothing */
2079 	if (!tx_ring->atr_sample_rate)
2080 		return;
2081 
2082 	/* Currently only IPv4/IPv6 with TCP is supported */
2083 	if (!(tx_flags & (I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6)))
2084 		return;
2085 
2086 	/* snag network header to get L4 type and address */
2087 	hdr.network = (tx_flags & I40E_TX_FLAGS_UDP_TUNNEL) ?
2088 		      skb_inner_network_header(skb) : skb_network_header(skb);
2089 
2090 	/* Note: tx_flags gets modified to reflect inner protocols in
2091 	 * tx_enable_csum function if encap is enabled.
2092 	 */
2093 	if (tx_flags & I40E_TX_FLAGS_IPV4) {
2094 		/* access ihl as u8 to avoid unaligned access on ia64 */
2095 		hlen = (hdr.network[0] & 0x0F) << 2;
2096 		l4_proto = hdr.ipv4->protocol;
2097 	} else {
2098 		hlen = hdr.network - skb->data;
2099 		l4_proto = ipv6_find_hdr(skb, &hlen, IPPROTO_TCP, NULL, NULL);
2100 		hlen -= hdr.network - skb->data;
2101 	}
2102 
2103 	if (l4_proto != IPPROTO_TCP)
2104 		return;
2105 
2106 	th = (struct tcphdr *)(hdr.network + hlen);
2107 
2108 	/* Due to lack of space, no more new filters can be programmed */
2109 	if (th->syn && (pf->auto_disable_flags & I40E_FLAG_FD_ATR_ENABLED))
2110 		return;
2111 	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
2112 	    (!(pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE))) {
2113 		/* HW ATR eviction will take care of removing filters on FIN
2114 		 * and RST packets.
2115 		 */
2116 		if (th->fin || th->rst)
2117 			return;
2118 	}
2119 
2120 	tx_ring->atr_count++;
2121 
2122 	/* sample on all syn/fin/rst packets or once every atr sample rate */
2123 	if (!th->fin &&
2124 	    !th->syn &&
2125 	    !th->rst &&
2126 	    (tx_ring->atr_count < tx_ring->atr_sample_rate))
2127 		return;
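
	/* e.g. with an atr_sample_rate of 20, a filter is added or refreshed
	 * on every SYN, removed on every FIN/RST (when hardware eviction is
	 * not handling that), and otherwise refreshed roughly once per 20 TCP
	 * segments sent on this queue.
	 */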
2128 
2129 	tx_ring->atr_count = 0;
2130 
2131 	/* grab the next descriptor */
2132 	i = tx_ring->next_to_use;
2133 	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);
2134 
2135 	i++;
2136 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2137 
2138 	flex_ptype = (tx_ring->queue_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT) &
2139 		      I40E_TXD_FLTR_QW0_QINDEX_MASK;
2140 	flex_ptype |= (tx_flags & I40E_TX_FLAGS_IPV4) ?
2141 		      (I40E_FILTER_PCTYPE_NONF_IPV4_TCP <<
2142 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) :
2143 		      (I40E_FILTER_PCTYPE_NONF_IPV6_TCP <<
2144 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
2145 
2146 	flex_ptype |= tx_ring->vsi->id << I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT;
2147 
2148 	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;
2149 
2150 	dtype_cmd |= (th->fin || th->rst) ?
2151 		     (I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
2152 		      I40E_TXD_FLTR_QW1_PCMD_SHIFT) :
2153 		     (I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
2154 		      I40E_TXD_FLTR_QW1_PCMD_SHIFT);
2155 
2156 	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX <<
2157 		     I40E_TXD_FLTR_QW1_DEST_SHIFT;
2158 
2159 	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID <<
2160 		     I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT;
2161 
2162 	dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
2163 	if (!(tx_flags & I40E_TX_FLAGS_UDP_TUNNEL))
2164 		dtype_cmd |=
2165 			((u32)I40E_FD_ATR_STAT_IDX(pf->hw.pf_id) <<
2166 			I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
2167 			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
2168 	else
2169 		dtype_cmd |=
2170 			((u32)I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id) <<
2171 			I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
2172 			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
2173 
2174 	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
2175 	    (!(pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)))
2176 		dtype_cmd |= I40E_TXD_FLTR_QW1_ATR_MASK;
2177 
2178 	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
2179 	fdir_desc->rsvd = cpu_to_le32(0);
2180 	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
2181 	fdir_desc->fd_id = cpu_to_le32(0);
2182 }
2183 
2184 /**
2185  * i40e_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
2186  * @skb:     send buffer
2187  * @tx_ring: ring to send buffer on
2188  * @flags:   the tx flags to be set
2189  *
2190  * Checks the skb and sets up the corresponding generic transmit flags
2191  * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
2192  *
2193  * Returns an error code to indicate the frame should be dropped on error;
2194  * otherwise returns 0 to indicate the flags have been set properly.
2195  **/
2196 #ifdef I40E_FCOE
2197 inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
2198 				      struct i40e_ring *tx_ring,
2199 				      u32 *flags)
2200 #else
2201 static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
2202 					     struct i40e_ring *tx_ring,
2203 					     u32 *flags)
2204 #endif
2205 {
2206 	__be16 protocol = skb->protocol;
2207 	u32  tx_flags = 0;
2208 
2209 	if (protocol == htons(ETH_P_8021Q) &&
2210 	    !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) {
2211 		/* When HW VLAN acceleration is turned off by the user the
2212 		 * stack sets the protocol to 8021q so that the driver
2213 		 * can take any steps required to support the SW only
2214 		 * VLAN handling.  In our case the driver doesn't need
2215 		 * to take any further steps so just set the protocol
2216 		 * to the encapsulated ethertype.
2217 		 */
2218 		skb->protocol = vlan_get_protocol(skb);
2219 		goto out;
2220 	}
2221 
2222 	/* if we have a HW VLAN tag being added, default to the HW one */
2223 	if (skb_vlan_tag_present(skb)) {
2224 		tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT;
2225 		tx_flags |= I40E_TX_FLAGS_HW_VLAN;
2226 	/* else if it is a SW VLAN, check the next protocol and store the tag */
2227 	} else if (protocol == htons(ETH_P_8021Q)) {
2228 		struct vlan_hdr *vhdr, _vhdr;
2229 
2230 		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(_vhdr), &_vhdr);
2231 		if (!vhdr)
2232 			return -EINVAL;
2233 
2234 		protocol = vhdr->h_vlan_encapsulated_proto;
2235 		tx_flags |= ntohs(vhdr->h_vlan_TCI) << I40E_TX_FLAGS_VLAN_SHIFT;
2236 		tx_flags |= I40E_TX_FLAGS_SW_VLAN;
2237 	}
2238 
2239 	if (!(tx_ring->vsi->back->flags & I40E_FLAG_DCB_ENABLED))
2240 		goto out;
2241 
2242 	/* Insert 802.1p priority into VLAN header */
2243 	if ((tx_flags & (I40E_TX_FLAGS_HW_VLAN | I40E_TX_FLAGS_SW_VLAN)) ||
2244 	    (skb->priority != TC_PRIO_CONTROL)) {
2245 		tx_flags &= ~I40E_TX_FLAGS_VLAN_PRIO_MASK;
2246 		tx_flags |= (skb->priority & 0x7) <<
2247 				I40E_TX_FLAGS_VLAN_PRIO_SHIFT;
2248 		if (tx_flags & I40E_TX_FLAGS_SW_VLAN) {
2249 			struct vlan_ethhdr *vhdr;
2250 			int rc;
2251 
2252 			rc = skb_cow_head(skb, 0);
2253 			if (rc < 0)
2254 				return rc;
2255 			vhdr = (struct vlan_ethhdr *)skb->data;
2256 			vhdr->h_vlan_TCI = htons(tx_flags >>
2257 						 I40E_TX_FLAGS_VLAN_SHIFT);
2258 		} else {
2259 			tx_flags |= I40E_TX_FLAGS_HW_VLAN;
2260 		}
2261 	}
2262 
2263 out:
2264 	*flags = tx_flags;
2265 	return 0;
2266 }
2267 
2268 /**
2269  * i40e_tso - set up the tso context descriptor
2270  * @skb:      ptr to the skb we're sending
2271  * @hdr_len:  ptr to the size of the packet header
2272  * @cd_type_cmd_tso_mss: Quad Word 1
2273  *
2274  * Returns 0 if no TSO can happen, 1 if tso is going, or error
2275  **/
2276 static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, u64 *cd_type_cmd_tso_mss)
2277 {
2278 	u64 cd_cmd, cd_tso_len, cd_mss;
2279 	union {
2280 		struct iphdr *v4;
2281 		struct ipv6hdr *v6;
2282 		unsigned char *hdr;
2283 	} ip;
2284 	union {
2285 		struct tcphdr *tcp;
2286 		struct udphdr *udp;
2287 		unsigned char *hdr;
2288 	} l4;
2289 	u32 paylen, l4_offset;
2290 	int err;
2291 
2292 	if (skb->ip_summed != CHECKSUM_PARTIAL)
2293 		return 0;
2294 
2295 	if (!skb_is_gso(skb))
2296 		return 0;
2297 
2298 	err = skb_cow_head(skb, 0);
2299 	if (err < 0)
2300 		return err;
2301 
2302 	ip.hdr = skb_network_header(skb);
2303 	l4.hdr = skb_transport_header(skb);
2304 
2305 	/* initialize outer IP header fields */
2306 	if (ip.v4->version == 4) {
2307 		ip.v4->tot_len = 0;
2308 		ip.v4->check = 0;
2309 	} else {
2310 		ip.v6->payload_len = 0;
2311 	}
2312 
2313 	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
2314 					 SKB_GSO_GRE_CSUM |
2315 					 SKB_GSO_IPXIP4 |
2316 					 SKB_GSO_IPXIP6 |
2317 					 SKB_GSO_UDP_TUNNEL |
2318 					 SKB_GSO_UDP_TUNNEL_CSUM)) {
2319 		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
2320 		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
2321 			l4.udp->len = 0;
2322 
2323 			/* determine offset of outer transport header */
2324 			l4_offset = l4.hdr - skb->data;
2325 
2326 			/* remove payload length from outer checksum */
2327 			paylen = skb->len - l4_offset;
2328 			csum_replace_by_diff(&l4.udp->check, htonl(paylen));
2329 		}
2330 
2331 		/* reset pointers to inner headers */
2332 		ip.hdr = skb_inner_network_header(skb);
2333 		l4.hdr = skb_inner_transport_header(skb);
2334 
2335 		/* initialize inner IP header fields */
2336 		if (ip.v4->version == 4) {
2337 			ip.v4->tot_len = 0;
2338 			ip.v4->check = 0;
2339 		} else {
2340 			ip.v6->payload_len = 0;
2341 		}
2342 	}
2343 
2344 	/* determine offset of inner transport header */
2345 	l4_offset = l4.hdr - skb->data;
2346 
2347 	/* remove payload length from inner checksum */
2348 	paylen = skb->len - l4_offset;
2349 	csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
2350 
2351 	/* compute length of segmentation header */
2352 	*hdr_len = (l4.tcp->doff * 4) + l4_offset;
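
	/* e.g. a standard 20 byte TCP header (doff of 5) behind 14 bytes of
	 * Ethernet and a 20 byte IPv4 header gives hdr_len = 54, so only the
	 * payload beyond those 54 bytes counts toward cd_tso_len below.
	 */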
2353 
2354 	/* find the field values */
2355 	cd_cmd = I40E_TX_CTX_DESC_TSO;
2356 	cd_tso_len = skb->len - *hdr_len;
2357 	cd_mss = skb_shinfo(skb)->gso_size;
2358 	*cd_type_cmd_tso_mss |= (cd_cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
2359 				(cd_tso_len << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
2360 				(cd_mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
2361 	return 1;
2362 }
2363 
2364 /**
2365  * i40e_tsyn - set up the tsyn context descriptor
2366  * @tx_ring:  ptr to the ring to send
2367  * @skb:      ptr to the skb we're sending
2368  * @tx_flags: the collected send information
2369  * @cd_type_cmd_tso_mss: Quad Word 1
2370  *
2371  * Returns 0 if no Tx timestamp can happen and 1 if the timestamp will happen
2372  **/
2373 static int i40e_tsyn(struct i40e_ring *tx_ring, struct sk_buff *skb,
2374 		     u32 tx_flags, u64 *cd_type_cmd_tso_mss)
2375 {
2376 	struct i40e_pf *pf;
2377 
2378 	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
2379 		return 0;
2380 
2381 	/* Tx timestamps cannot be sampled when doing TSO */
2382 	if (tx_flags & I40E_TX_FLAGS_TSO)
2383 		return 0;
2384 
2385 	/* only timestamp the outbound packet if the user has requested it and
2386 	 * we are not already transmitting a packet to be timestamped
2387 	 */
2388 	pf = i40e_netdev_to_pf(tx_ring->netdev);
2389 	if (!(pf->flags & I40E_FLAG_PTP))
2390 		return 0;
2391 
2392 	if (pf->ptp_tx &&
2393 	    !test_and_set_bit_lock(__I40E_PTP_TX_IN_PROGRESS, &pf->state)) {
2394 		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
2395 		pf->ptp_tx_skb = skb_get(skb);
2396 	} else {
2397 		return 0;
2398 	}
2399 
2400 	*cd_type_cmd_tso_mss |= (u64)I40E_TX_CTX_DESC_TSYN <<
2401 				I40E_TXD_CTX_QW1_CMD_SHIFT;
2402 
2403 	return 1;
2404 }
2405 
2406 /**
2407  * i40e_tx_enable_csum - Enable Tx checksum offloads
2408  * @skb: send buffer
2409  * @tx_flags: pointer to Tx flags currently set
2410  * @td_cmd: Tx descriptor command bits to set
2411  * @td_offset: Tx descriptor header offsets to set
2412  * @tx_ring: Tx descriptor ring
2413  * @cd_tunneling: ptr to context desc bits
2414  **/
2415 static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
2416 			       u32 *td_cmd, u32 *td_offset,
2417 			       struct i40e_ring *tx_ring,
2418 			       u32 *cd_tunneling)
2419 {
2420 	union {
2421 		struct iphdr *v4;
2422 		struct ipv6hdr *v6;
2423 		unsigned char *hdr;
2424 	} ip;
2425 	union {
2426 		struct tcphdr *tcp;
2427 		struct udphdr *udp;
2428 		unsigned char *hdr;
2429 	} l4;
2430 	unsigned char *exthdr;
2431 	u32 offset, cmd = 0;
2432 	__be16 frag_off;
2433 	u8 l4_proto = 0;
2434 
2435 	if (skb->ip_summed != CHECKSUM_PARTIAL)
2436 		return 0;
2437 
2438 	ip.hdr = skb_network_header(skb);
2439 	l4.hdr = skb_transport_header(skb);
2440 
2441 	/* compute outer L2 header size */
2442 	offset = ((ip.hdr - skb->data) / 2) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
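	/* The MACLEN computed above is in 2 byte words, so a plain 14 byte
	 * Ethernet header encodes as 7; the inner L3 length computed further
	 * down is in 4 byte words, so a 20 byte IPv4 header encodes as 5.
	 */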
2443 
2444 	if (skb->encapsulation) {
2445 		u32 tunnel = 0;
2446 		/* define outer network header type */
2447 		if (*tx_flags & I40E_TX_FLAGS_IPV4) {
2448 			tunnel |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
2449 				  I40E_TX_CTX_EXT_IP_IPV4 :
2450 				  I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM;
2451 
2452 			l4_proto = ip.v4->protocol;
2453 		} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
2454 			tunnel |= I40E_TX_CTX_EXT_IP_IPV6;
2455 
2456 			exthdr = ip.hdr + sizeof(*ip.v6);
2457 			l4_proto = ip.v6->nexthdr;
2458 			if (l4.hdr != exthdr)
2459 				ipv6_skip_exthdr(skb, exthdr - skb->data,
2460 						 &l4_proto, &frag_off);
2461 		}
2462 
2463 		/* define outer transport */
2464 		switch (l4_proto) {
2465 		case IPPROTO_UDP:
2466 			tunnel |= I40E_TXD_CTX_UDP_TUNNELING;
2467 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2468 			break;
2469 		case IPPROTO_GRE:
2470 			tunnel |= I40E_TXD_CTX_GRE_TUNNELING;
2471 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2472 			break;
2473 		case IPPROTO_IPIP:
2474 		case IPPROTO_IPV6:
2475 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2476 			l4.hdr = skb_inner_network_header(skb);
2477 			break;
2478 		default:
2479 			if (*tx_flags & I40E_TX_FLAGS_TSO)
2480 				return -1;
2481 
2482 			skb_checksum_help(skb);
2483 			return 0;
2484 		}
2485 
2486 		/* compute outer L3 header size */
2487 		tunnel |= ((l4.hdr - ip.hdr) / 4) <<
2488 			  I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT;
2489 
2490 		/* switch IP header pointer from outer to inner header */
2491 		ip.hdr = skb_inner_network_header(skb);
2492 
2493 		/* compute tunnel header size */
2494 		tunnel |= ((ip.hdr - l4.hdr) / 2) <<
2495 			  I40E_TXD_CTX_QW0_NATLEN_SHIFT;
2496 
2497 		/* indicate if we need to offload outer UDP header */
2498 		if ((*tx_flags & I40E_TX_FLAGS_TSO) &&
2499 		    !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
2500 		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
2501 			tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK;
2502 
2503 		/* record tunnel offload values */
2504 		*cd_tunneling |= tunnel;
2505 
2506 		/* switch L4 header pointer from outer to inner */
2507 		l4.hdr = skb_inner_transport_header(skb);
2508 		l4_proto = 0;
2509 
2510 		/* reset type as we transition from outer to inner headers */
2511 		*tx_flags &= ~(I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6);
2512 		if (ip.v4->version == 4)
2513 			*tx_flags |= I40E_TX_FLAGS_IPV4;
2514 		if (ip.v6->version == 6)
2515 			*tx_flags |= I40E_TX_FLAGS_IPV6;
2516 	}
2517 
2518 	/* Enable IP checksum offloads */
2519 	if (*tx_flags & I40E_TX_FLAGS_IPV4) {
2520 		l4_proto = ip.v4->protocol;
2521 		/* the stack computes the IP header already, the only time we
2522 		 * need the hardware to recompute it is in the case of TSO.
2523 		 */
2524 		cmd |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
2525 		       I40E_TX_DESC_CMD_IIPT_IPV4_CSUM :
2526 		       I40E_TX_DESC_CMD_IIPT_IPV4;
2527 	} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
2528 		cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
2529 
2530 		exthdr = ip.hdr + sizeof(*ip.v6);
2531 		l4_proto = ip.v6->nexthdr;
2532 		if (l4.hdr != exthdr)
2533 			ipv6_skip_exthdr(skb, exthdr - skb->data,
2534 					 &l4_proto, &frag_off);
2535 	}
2536 
2537 	/* compute inner L3 header size */
2538 	offset |= ((l4.hdr - ip.hdr) / 4) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
2539 
2540 	/* Enable L4 checksum offloads */
2541 	switch (l4_proto) {
2542 	case IPPROTO_TCP:
2543 		/* enable checksum offloads */
2544 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
2545 		offset |= l4.tcp->doff << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2546 		break;
2547 	case IPPROTO_SCTP:
2548 		/* enable SCTP checksum offload */
2549 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
2550 		offset |= (sizeof(struct sctphdr) >> 2) <<
2551 			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2552 		break;
2553 	case IPPROTO_UDP:
2554 		/* enable UDP checksum offload */
2555 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
2556 		offset |= (sizeof(struct udphdr) >> 2) <<
2557 			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2558 		break;
2559 	default:
2560 		if (*tx_flags & I40E_TX_FLAGS_TSO)
2561 			return -1;
2562 		skb_checksum_help(skb);
2563 		return 0;
2564 	}
2565 
2566 	*td_cmd |= cmd;
2567 	*td_offset |= offset;
2568 
2569 	return 1;
2570 }
2571 
2572 /**
2573  * i40e_create_tx_ctx - Build the Tx context descriptor
2574  * @tx_ring:  ring to create the descriptor on
2575  * @cd_type_cmd_tso_mss: Quad Word 1
2576  * @cd_tunneling: Quad Word 0 - bits 0-31
2577  * @cd_l2tag2: Quad Word 0 - bits 32-63
2578  **/
2579 static void i40e_create_tx_ctx(struct i40e_ring *tx_ring,
2580 			       const u64 cd_type_cmd_tso_mss,
2581 			       const u32 cd_tunneling, const u32 cd_l2tag2)
2582 {
2583 	struct i40e_tx_context_desc *context_desc;
2584 	int i = tx_ring->next_to_use;
2585 
2586 	if ((cd_type_cmd_tso_mss == I40E_TX_DESC_DTYPE_CONTEXT) &&
2587 	    !cd_tunneling && !cd_l2tag2)
2588 		return;
2589 
2590 	/* grab the next descriptor */
2591 	context_desc = I40E_TX_CTXTDESC(tx_ring, i);
2592 
2593 	i++;
2594 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2595 
2596 	/* cpu_to_le32 and assign to struct fields */
2597 	context_desc->tunneling_params = cpu_to_le32(cd_tunneling);
2598 	context_desc->l2tag2 = cpu_to_le16(cd_l2tag2);
2599 	context_desc->rsvd = cpu_to_le16(0);
2600 	context_desc->type_cmd_tso_mss = cpu_to_le64(cd_type_cmd_tso_mss);
2601 }
2602 
2603 /**
2604  * __i40e_maybe_stop_tx - 2nd level check for tx stop conditions
2605  * @tx_ring: the ring to be checked
2606  * @size:    the size buffer we want to assure is available
2607  *
2608  * Returns -EBUSY if a stop is needed, else 0
2609  **/
2610 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
2611 {
2612 	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
2613 	/* Memory barrier before checking head and tail */
2614 	smp_mb();
2615 
2616 	/* Check again in a case another CPU has just made room available. */
2617 	if (likely(I40E_DESC_UNUSED(tx_ring) < size))
2618 		return -EBUSY;
2619 
2620 	/* A reprieve! - use start_queue because it doesn't call schedule */
2621 	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
2622 	++tx_ring->tx_stats.restart_queue;
2623 	return 0;
2624 }
2625 
2626 /**
2627  * __i40e_chk_linearize - Check if there are more than 8 buffers per packet
2628  * @skb:      send buffer
2629  *
2630  * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire
2631  * and so we need to figure out the cases where we need to linearize the skb.
2632  *
2633  * For TSO we need to count the TSO header and segment payload separately.
2634  * As such we need to check cases where we have 7 fragments or more as we
2635  * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
2636  * the segment payload in the first descriptor, and another 7 for the
2637  * fragments.
2638  **/
2639 bool __i40e_chk_linearize(struct sk_buff *skb)
2640 {
2641 	const struct skb_frag_struct *frag, *stale;
2642 	int nr_frags, sum;
2643 
2644 	/* no need to check if number of frags is less than 7 */
2645 	nr_frags = skb_shinfo(skb)->nr_frags;
2646 	if (nr_frags < (I40E_MAX_BUFFER_TXD - 1))
2647 		return false;
2648 
2649 	/* We need to walk through the list and validate that each group
2650 	 * of 6 fragments totals at least gso_size.
2651 	 */
2652 	nr_frags -= I40E_MAX_BUFFER_TXD - 2;
2653 	frag = &skb_shinfo(skb)->frags[0];
2654 
2655 	/* Initialize size to the negative value of gso_size minus 1.  We
2656 	 * use this as the worst case scenario in which the frag ahead
2657 	 * of us only provides one byte which is why we are limited to 6
2658 	 * descriptors for a single transmit as the header and previous
2659 	 * fragment are already consuming 2 descriptors.
2660 	 */
2661 	sum = 1 - skb_shinfo(skb)->gso_size;
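
	/* Worked example: with gso_size = 6000, six frags of 999 bytes sum to
	 * 5994, so the running total becomes 1 - 6000 + 5994 = -5 and we
	 * report that linearizing is required; six frags of 1000 bytes keep
	 * the total positive and the sliding window simply moves on.
	 */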
2662 
2663 	/* Add size of frags 0 through 4 to create our initial sum */
2664 	sum += skb_frag_size(frag++);
2665 	sum += skb_frag_size(frag++);
2666 	sum += skb_frag_size(frag++);
2667 	sum += skb_frag_size(frag++);
2668 	sum += skb_frag_size(frag++);
2669 
2670 	/* Walk through fragments adding latest fragment, testing it, and
2671 	 * then removing stale fragments from the sum.
2672 	 */
2673 	stale = &skb_shinfo(skb)->frags[0];
2674 	for (;;) {
2675 		sum += skb_frag_size(frag++);
2676 
2677 		/* if sum is negative we failed to make sufficient progress */
2678 		if (sum < 0)
2679 			return true;
2680 
2681 		if (!nr_frags--)
2682 			break;
2683 
2684 		sum -= skb_frag_size(stale++);
2685 	}
2686 
2687 	return false;
2688 }
2689 
2690 /**
2691  * i40e_tx_map - Build the Tx descriptor
2692  * @tx_ring:  ring to send buffer on
2693  * @skb:      send buffer
2694  * @first:    first buffer info buffer to use
2695  * @tx_flags: collected send information
2696  * @hdr_len:  size of the packet header
2697  * @td_cmd:   the command field in the descriptor
2698  * @td_offset: offset for checksum or crc
2699  **/
2700 #ifdef I40E_FCOE
2701 inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
2702 			struct i40e_tx_buffer *first, u32 tx_flags,
2703 			const u8 hdr_len, u32 td_cmd, u32 td_offset)
2704 #else
2705 static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
2706 			       struct i40e_tx_buffer *first, u32 tx_flags,
2707 			       const u8 hdr_len, u32 td_cmd, u32 td_offset)
2708 #endif
2709 {
2710 	unsigned int data_len = skb->data_len;
2711 	unsigned int size = skb_headlen(skb);
2712 	struct skb_frag_struct *frag;
2713 	struct i40e_tx_buffer *tx_bi;
2714 	struct i40e_tx_desc *tx_desc;
2715 	u16 i = tx_ring->next_to_use;
2716 	u32 td_tag = 0;
2717 	dma_addr_t dma;
2718 	u16 gso_segs;
2719 	u16 desc_count = 0;
2720 	bool tail_bump = true;
2721 	bool do_rs = false;
2722 
2723 	if (tx_flags & I40E_TX_FLAGS_HW_VLAN) {
2724 		td_cmd |= I40E_TX_DESC_CMD_IL2TAG1;
2725 		td_tag = (tx_flags & I40E_TX_FLAGS_VLAN_MASK) >>
2726 			 I40E_TX_FLAGS_VLAN_SHIFT;
2727 	}
2728 
2729 	if (tx_flags & (I40E_TX_FLAGS_TSO | I40E_TX_FLAGS_FSO))
2730 		gso_segs = skb_shinfo(skb)->gso_segs;
2731 	else
2732 		gso_segs = 1;
2733 
2734 	/* multiply data chunks by size of headers */
2735 	first->bytecount = skb->len - hdr_len + (gso_segs * hdr_len);
2736 	first->gso_segs = gso_segs;
2737 	first->skb = skb;
2738 	first->tx_flags = tx_flags;
2739 
2740 	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
2741 
2742 	tx_desc = I40E_TX_DESC(tx_ring, i);
2743 	tx_bi = first;
2744 
2745 	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
2746 		unsigned int max_data = I40E_MAX_DATA_PER_TXD_ALIGNED;
2747 
2748 		if (dma_mapping_error(tx_ring->dev, dma))
2749 			goto dma_error;
2750 
2751 		/* record length, and DMA address */
2752 		dma_unmap_len_set(tx_bi, len, size);
2753 		dma_unmap_addr_set(tx_bi, dma, dma);
2754 
2755 		/* align size to end of page */
2756 		max_data += -dma & (I40E_MAX_READ_REQ_SIZE - 1);
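		/* the "-dma & (I40E_MAX_READ_REQ_SIZE - 1)" adjustment above
		 * is the distance from dma up to the next read-request-size
		 * (typically 4K) boundary; e.g. an address ending in 0x010
		 * adds 0xff0, so the first chunk ends aligned and the
		 * following full-size chunks stay aligned.
		 */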
2757 		tx_desc->buffer_addr = cpu_to_le64(dma);
2758 
2759 		while (unlikely(size > I40E_MAX_DATA_PER_TXD)) {
2760 			tx_desc->cmd_type_offset_bsz =
2761 				build_ctob(td_cmd, td_offset,
2762 					   max_data, td_tag);
2763 
2764 			tx_desc++;
2765 			i++;
2766 			desc_count++;
2767 
2768 			if (i == tx_ring->count) {
2769 				tx_desc = I40E_TX_DESC(tx_ring, 0);
2770 				i = 0;
2771 			}
2772 
2773 			dma += max_data;
2774 			size -= max_data;
2775 
2776 			max_data = I40E_MAX_DATA_PER_TXD_ALIGNED;
2777 			tx_desc->buffer_addr = cpu_to_le64(dma);
2778 		}
2779 
2780 		if (likely(!data_len))
2781 			break;
2782 
2783 		tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
2784 							  size, td_tag);
2785 
2786 		tx_desc++;
2787 		i++;
2788 		desc_count++;
2789 
2790 		if (i == tx_ring->count) {
2791 			tx_desc = I40E_TX_DESC(tx_ring, 0);
2792 			i = 0;
2793 		}
2794 
2795 		size = skb_frag_size(frag);
2796 		data_len -= size;
2797 
2798 		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
2799 				       DMA_TO_DEVICE);
2800 
2801 		tx_bi = &tx_ring->tx_bi[i];
2802 	}
2803 
2804 	/* set next_to_watch value indicating a packet is present */
2805 	first->next_to_watch = tx_desc;
2806 
2807 	i++;
2808 	if (i == tx_ring->count)
2809 		i = 0;
2810 
2811 	tx_ring->next_to_use = i;
2812 
2813 	netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);
2814 	i40e_maybe_stop_tx(tx_ring, DESC_NEEDED);
2815 
2816 	/* Algorithm to optimize tail and RS bit setting:
2817 	 * if xmit_more is supported
2818 	 *	if xmit_more is true
2819 	 *		do not update tail and do not mark RS bit.
2820 	 *	if xmit_more is false and last xmit_more was false
2821 	 *		if every packet spanned less than 4 desc
2822 	 *			then set RS bit on 4th packet and update tail
2823 	 *			on every packet
2824 	 *		else
2825 	 *			update tail and set RS bit on every packet.
2826 	 *	if xmit_more is false and last_xmit_more was true
2827 	 *		update tail and set RS bit.
2828 	 *
2829 	 * Optimization: wmb to be issued only in case of tail update.
2830 	 * Also optimize the Descriptor WB path for RS bit with the same
2831 	 * algorithm.
2832 	 *
2833 	 * Note: If there are less than 4 packets
2834 	 * pending and interrupts were disabled the service task will
2835 	 * trigger a force WB.
2836 	 */
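	/* Concretely (a sketch of the cases above): while the stack keeps
	 * setting skb->xmit_more for a burst, descriptors are queued with
	 * neither a tail bump nor an RS bit; the burst's final skb then bumps
	 * the tail once and carries RS so the whole burst is written back
	 * together.  In the non-xmit_more steady state the tail is bumped per
	 * packet but RS is only set roughly every WB_STRIDE packets.
	 */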
2837 	if (skb->xmit_more &&
2838 	    !netif_xmit_stopped(txring_txq(tx_ring))) {
2839 		tx_ring->flags |= I40E_TXR_FLAGS_LAST_XMIT_MORE_SET;
2840 		tail_bump = false;
2841 	} else if (!skb->xmit_more &&
2842 		   !netif_xmit_stopped(txring_txq(tx_ring)) &&
2843 		   (!(tx_ring->flags & I40E_TXR_FLAGS_LAST_XMIT_MORE_SET)) &&
2844 		   (tx_ring->packet_stride < WB_STRIDE) &&
2845 		   (desc_count < WB_STRIDE)) {
2846 		tx_ring->packet_stride++;
2847 	} else {
2848 		tx_ring->packet_stride = 0;
2849 		tx_ring->flags &= ~I40E_TXR_FLAGS_LAST_XMIT_MORE_SET;
2850 		do_rs = true;
2851 	}
2852 	if (do_rs)
2853 		tx_ring->packet_stride = 0;
2854 
2855 	tx_desc->cmd_type_offset_bsz =
2856 			build_ctob(td_cmd, td_offset, size, td_tag) |
2857 			cpu_to_le64((u64)(do_rs ? I40E_TXD_CMD :
2858 						  I40E_TX_DESC_CMD_EOP) <<
2859 						  I40E_TXD_QW1_CMD_SHIFT);
2860 
2861 	/* notify HW of packet */
2862 	if (!tail_bump) {
2863 		prefetchw(tx_desc + 1);
2864 	} else {
2865 		/* Force memory writes to complete before letting h/w
2866 		 * know there are new descriptors to fetch.  (Only
2867 		 * applicable for weak-ordered memory model archs,
2868 		 * such as IA-64).
2869 		 */
2870 		wmb();
2871 		writel(i, tx_ring->tail);
2872 	}
2873 	return;
2874 
2875 dma_error:
2876 	dev_info(tx_ring->dev, "TX DMA map failed\n");
2877 
2878 	/* clear dma mappings for failed tx_bi map */
2879 	for (;;) {
2880 		tx_bi = &tx_ring->tx_bi[i];
2881 		i40e_unmap_and_free_tx_resource(tx_ring, tx_bi);
2882 		if (tx_bi == first)
2883 			break;
2884 		if (i == 0)
2885 			i = tx_ring->count;
2886 		i--;
2887 	}
2888 
2889 	tx_ring->next_to_use = i;
2890 }
2891 
2892 /**
2893  * i40e_xmit_frame_ring - Sends buffer on Tx ring
2894  * @skb:     send buffer
2895  * @tx_ring: ring to send buffer on
2896  *
2897  * Returns NETDEV_TX_OK if sent, else an error code
2898  **/
2899 static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
2900 					struct i40e_ring *tx_ring)
2901 {
2902 	u64 cd_type_cmd_tso_mss = I40E_TX_DESC_DTYPE_CONTEXT;
2903 	u32 cd_tunneling = 0, cd_l2tag2 = 0;
2904 	struct i40e_tx_buffer *first;
2905 	u32 td_offset = 0;
2906 	u32 tx_flags = 0;
2907 	__be16 protocol;
2908 	u32 td_cmd = 0;
2909 	u8 hdr_len = 0;
2910 	int tso, count;
2911 	int tsyn;
2912 
2913 	/* prefetch the data, we'll need it later */
2914 	prefetch(skb->data);
2915 
2916 	count = i40e_xmit_descriptor_count(skb);
2917 	if (i40e_chk_linearize(skb, count)) {
2918 		if (__skb_linearize(skb))
2919 			goto out_drop;
2920 		count = i40e_txd_use_count(skb->len);
2921 		tx_ring->tx_stats.tx_linearize++;
2922 	}
2923 
2924 	/* need: 1 descriptor per page * PAGE_SIZE/I40E_MAX_DATA_PER_TXD,
2925 	 *       + 1 desc for skb_head_len/I40E_MAX_DATA_PER_TXD,
2926 	 *       + 4 desc gap to avoid the cache line where head is,
2927 	 *       + 1 desc for context descriptor,
2928 	 * otherwise try next time
2929 	 */
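	/* e.g. a simple 1500 byte linear skb consumes a single data
	 * descriptor, so roughly six free descriptors (1 + 4 + 1) must be
	 * available before we commit to transmitting it.
	 */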
2930 	if (i40e_maybe_stop_tx(tx_ring, count + 4 + 1)) {
2931 		tx_ring->tx_stats.tx_busy++;
2932 		return NETDEV_TX_BUSY;
2933 	}
2934 
2935 	/* prepare the xmit flags */
2936 	if (i40e_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags))
2937 		goto out_drop;
2938 
2939 	/* obtain protocol of skb */
2940 	protocol = vlan_get_protocol(skb);
2941 
2942 	/* record the location of the first descriptor for this packet */
2943 	first = &tx_ring->tx_bi[tx_ring->next_to_use];
2944 
2945 	/* setup IPv4/IPv6 offloads */
2946 	if (protocol == htons(ETH_P_IP))
2947 		tx_flags |= I40E_TX_FLAGS_IPV4;
2948 	else if (protocol == htons(ETH_P_IPV6))
2949 		tx_flags |= I40E_TX_FLAGS_IPV6;
2950 
2951 	tso = i40e_tso(skb, &hdr_len, &cd_type_cmd_tso_mss);
2952 
2953 	if (tso < 0)
2954 		goto out_drop;
2955 	else if (tso)
2956 		tx_flags |= I40E_TX_FLAGS_TSO;
2957 
2958 	/* Always offload the checksum, since it's in the data descriptor */
2959 	tso = i40e_tx_enable_csum(skb, &tx_flags, &td_cmd, &td_offset,
2960 				  tx_ring, &cd_tunneling);
2961 	if (tso < 0)
2962 		goto out_drop;
2963 
2964 	tsyn = i40e_tsyn(tx_ring, skb, tx_flags, &cd_type_cmd_tso_mss);
2965 
2966 	if (tsyn)
2967 		tx_flags |= I40E_TX_FLAGS_TSYN;
2968 
2969 	skb_tx_timestamp(skb);
2970 
2971 	/* always enable CRC insertion offload */
2972 	td_cmd |= I40E_TX_DESC_CMD_ICRC;
2973 
2974 	i40e_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss,
2975 			   cd_tunneling, cd_l2tag2);
2976 
2977 	/* Add Flow Director ATR if it's enabled.
2978 	 *
2979 	 * NOTE: this must always be directly before the data descriptor.
2980 	 */
2981 	i40e_atr(tx_ring, skb, tx_flags);
2982 
2983 	i40e_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
2984 		    td_cmd, td_offset);
2985 
2986 	return NETDEV_TX_OK;
2987 
2988 out_drop:
2989 	dev_kfree_skb_any(skb);
2990 	return NETDEV_TX_OK;
2991 }
2992 
2993 /**
2994  * i40e_lan_xmit_frame - Selects the correct VSI and Tx queue to send buffer
2995  * @skb:    send buffer
2996  * @netdev: network interface device structure
2997  *
2998  * Returns NETDEV_TX_OK if sent, else an error code
2999  **/
3000 netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
3001 {
3002 	struct i40e_netdev_priv *np = netdev_priv(netdev);
3003 	struct i40e_vsi *vsi = np->vsi;
3004 	struct i40e_ring *tx_ring = vsi->tx_rings[skb->queue_mapping];
3005 
3006 	/* hardware can't handle really short frames, hardware padding works
3007 	 * beyond this point
3008 	 */
3009 	if (skb_put_padto(skb, I40E_MIN_TX_LEN))
3010 		return NETDEV_TX_OK;
3011 
3012 	return i40e_xmit_frame_ring(skb, tx_ring);
3013 }
3014