xref: /openbmc/linux/drivers/net/ethernet/intel/i40e/i40e_txrx.c (revision 4ed91d48259d9ddd378424d008f2e6559f7e78f8)
1 /*******************************************************************************
2  *
3  * Intel Ethernet Controller XL710 Family Linux Driver
4  * Copyright(c) 2013 - 2016 Intel Corporation.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  * The full GNU General Public License is included in this distribution in
19  * the file called "COPYING".
20  *
21  * Contact Information:
22  * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
23  * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
24  *
25  ******************************************************************************/
26 
27 #include <linux/prefetch.h>
28 #include <net/busy_poll.h>
29 #include "i40e.h"
30 #include "i40e_prototype.h"
31 
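/**
 * build_ctob - build the cmd_type_offset_bsz quadword for a data descriptor
 * @td_cmd: Tx descriptor command bits
 * @td_offset: Tx descriptor header offsets
 * @size: size of the Tx buffer in bytes
 * @td_tag: L2TAG1 (VLAN) value
 *
 * Packs the fields into the descriptor layout expected by hardware and
 * returns the quadword in little-endian byte order.
 **/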
32 static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
33 				u32 td_tag)
34 {
35 	return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA |
36 			   ((u64)td_cmd  << I40E_TXD_QW1_CMD_SHIFT) |
37 			   ((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) |
38 			   ((u64)size  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) |
39 			   ((u64)td_tag  << I40E_TXD_QW1_L2TAG1_SHIFT));
40 }
41 
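/* EOP marks the last descriptor of a packet, RS asks hardware to report
 * completion status for the descriptor
 */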
42 #define I40E_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
43 /**
44  * i40e_fdir - Generate a Flow Director descriptor based on fdata
45  * @tx_ring: Tx ring to send buffer on
46  * @fdata: Flow director filter data
47  * @add: Indicate if we are adding a rule or deleting one
48  *
49  **/
50 static void i40e_fdir(struct i40e_ring *tx_ring,
51 		      struct i40e_fdir_filter *fdata, bool add)
52 {
53 	struct i40e_filter_program_desc *fdir_desc;
54 	struct i40e_pf *pf = tx_ring->vsi->back;
55 	u32 flex_ptype, dtype_cmd;
56 	u16 i;
57 
58 	/* grab the next descriptor */
59 	i = tx_ring->next_to_use;
60 	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);
61 
62 	i++;
63 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
64 
65 	flex_ptype = I40E_TXD_FLTR_QW0_QINDEX_MASK &
66 		     (fdata->q_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT);
67 
68 	flex_ptype |= I40E_TXD_FLTR_QW0_FLEXOFF_MASK &
69 		      (fdata->flex_off << I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT);
70 
71 	flex_ptype |= I40E_TXD_FLTR_QW0_PCTYPE_MASK &
72 		      (fdata->pctype << I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
73 
74 	/* Use LAN VSI Id if not programmed by user */
75 	flex_ptype |= I40E_TXD_FLTR_QW0_DEST_VSI_MASK &
76 		      ((u32)(fdata->dest_vsi ? : pf->vsi[pf->lan_vsi]->id) <<
77 		       I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT);
78 
79 	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;
80 
81 	dtype_cmd |= add ?
82 		     I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
83 		     I40E_TXD_FLTR_QW1_PCMD_SHIFT :
84 		     I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
85 		     I40E_TXD_FLTR_QW1_PCMD_SHIFT;
86 
87 	dtype_cmd |= I40E_TXD_FLTR_QW1_DEST_MASK &
88 		     (fdata->dest_ctl << I40E_TXD_FLTR_QW1_DEST_SHIFT);
89 
90 	dtype_cmd |= I40E_TXD_FLTR_QW1_FD_STATUS_MASK &
91 		     (fdata->fd_status << I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT);
92 
93 	if (fdata->cnt_index) {
94 		dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
95 		dtype_cmd |= I40E_TXD_FLTR_QW1_CNTINDEX_MASK &
96 			     ((u32)fdata->cnt_index <<
97 			      I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT);
98 	}
99 
100 	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
101 	fdir_desc->rsvd = cpu_to_le32(0);
102 	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
103 	fdir_desc->fd_id = cpu_to_le32(fdata->fd_id);
104 }
105 
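/* number of 1 msec waits for free descriptors before giving up on a
 * Flow Director programming request
 */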
106 #define I40E_FD_CLEAN_DELAY 10
107 /**
108  * i40e_program_fdir_filter - Program a Flow Director filter
109  * @fdir_data: Packet data that will be filter parameters
110  * @raw_packet: the pre-allocated packet buffer for FDir
111  * @pf: The PF pointer
112  * @add: True for add/update, False for remove
113  **/
114 static int i40e_program_fdir_filter(struct i40e_fdir_filter *fdir_data,
115 				    u8 *raw_packet, struct i40e_pf *pf,
116 				    bool add)
117 {
118 	struct i40e_tx_buffer *tx_buf, *first;
119 	struct i40e_tx_desc *tx_desc;
120 	struct i40e_ring *tx_ring;
121 	struct i40e_vsi *vsi;
122 	struct device *dev;
123 	dma_addr_t dma;
124 	u32 td_cmd = 0;
125 	u16 i;
126 
127 	/* find existing FDIR VSI */
128 	vsi = i40e_find_vsi_by_type(pf, I40E_VSI_FDIR);
129 	if (!vsi)
130 		return -ENOENT;
131 
132 	tx_ring = vsi->tx_rings[0];
133 	dev = tx_ring->dev;
134 
135 	/* we need two descriptors to add/del a filter and we can wait */
136 	for (i = I40E_FD_CLEAN_DELAY; I40E_DESC_UNUSED(tx_ring) < 2; i--) {
137 		if (!i)
138 			return -EAGAIN;
139 		msleep_interruptible(1);
140 	}
141 
142 	dma = dma_map_single(dev, raw_packet,
143 			     I40E_FDIR_MAX_RAW_PACKET_SIZE, DMA_TO_DEVICE);
144 	if (dma_mapping_error(dev, dma))
145 		goto dma_fail;
146 
147 	/* grab the next descriptor */
148 	i = tx_ring->next_to_use;
149 	first = &tx_ring->tx_bi[i];
150 	i40e_fdir(tx_ring, fdir_data, add);
151 
152 	/* Now program a dummy descriptor */
153 	i = tx_ring->next_to_use;
154 	tx_desc = I40E_TX_DESC(tx_ring, i);
155 	tx_buf = &tx_ring->tx_bi[i];
156 
157 	tx_ring->next_to_use = ((i + 1) < tx_ring->count) ? i + 1 : 0;
158 
159 	memset(tx_buf, 0, sizeof(struct i40e_tx_buffer));
160 
161 	/* record length, and DMA address */
162 	dma_unmap_len_set(tx_buf, len, I40E_FDIR_MAX_RAW_PACKET_SIZE);
163 	dma_unmap_addr_set(tx_buf, dma, dma);
164 
165 	tx_desc->buffer_addr = cpu_to_le64(dma);
166 	td_cmd = I40E_TXD_CMD | I40E_TX_DESC_CMD_DUMMY;
167 
168 	tx_buf->tx_flags = I40E_TX_FLAGS_FD_SB;
169 	tx_buf->raw_buf = (void *)raw_packet;
170 
171 	tx_desc->cmd_type_offset_bsz =
172 		build_ctob(td_cmd, 0, I40E_FDIR_MAX_RAW_PACKET_SIZE, 0);
173 
174 	/* Force memory writes to complete before letting h/w
175 	 * know there are new descriptors to fetch.
176 	 */
177 	wmb();
178 
179 	/* Mark the data descriptor to be watched */
180 	first->next_to_watch = tx_desc;
181 
182 	writel(tx_ring->next_to_use, tx_ring->tail);
183 	return 0;
184 
185 dma_fail:
186 	return -1;
187 }
188 
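/* the IP header starts right after the 14-byte Ethernet header in the
 * dummy packets built below
 */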
189 #define IP_HEADER_OFFSET 14
190 #define I40E_UDPIP_DUMMY_PACKET_LEN 42
191 /**
192  * i40e_add_del_fdir_udpv4 - Add/Remove UDPv4 filters
193  * @vsi: pointer to the targeted VSI
194  * @fd_data: the flow director data required for the FDir descriptor
195  * @add: true adds a filter, false removes it
196  *
197  * Returns 0 if the filters were successfully added or removed
198  **/
199 static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi,
200 				   struct i40e_fdir_filter *fd_data,
201 				   bool add)
202 {
203 	struct i40e_pf *pf = vsi->back;
204 	struct udphdr *udp;
205 	struct iphdr *ip;
206 	bool err = false;
207 	u8 *raw_packet;
208 	int ret;
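	/* dummy Ethernet/IPv4/UDP frame (ethertype 0x0800, IP protocol 0x11);
	 * addresses and ports are filled in from the filter data below
	 */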
209 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
210 		0x45, 0, 0, 0x1c, 0, 0, 0x40, 0, 0x40, 0x11, 0, 0, 0, 0, 0, 0,
211 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
212 
213 	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
214 	if (!raw_packet)
215 		return -ENOMEM;
216 	memcpy(raw_packet, packet, I40E_UDPIP_DUMMY_PACKET_LEN);
217 
218 	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
219 	udp = (struct udphdr *)(raw_packet + IP_HEADER_OFFSET
220 	      + sizeof(struct iphdr));
221 
222 	ip->daddr = fd_data->dst_ip[0];
223 	udp->dest = fd_data->dst_port;
224 	ip->saddr = fd_data->src_ip[0];
225 	udp->source = fd_data->src_port;
226 
227 	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
228 	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
229 	if (ret) {
230 		dev_info(&pf->pdev->dev,
231 			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
232 			 fd_data->pctype, fd_data->fd_id, ret);
233 		err = true;
234 	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
235 		if (add)
236 			dev_info(&pf->pdev->dev,
237 				 "Filter OK for PCTYPE %d loc = %d\n",
238 				 fd_data->pctype, fd_data->fd_id);
239 		else
240 			dev_info(&pf->pdev->dev,
241 				 "Filter deleted for PCTYPE %d loc = %d\n",
242 				 fd_data->pctype, fd_data->fd_id);
243 	}
244 	if (err)
245 		kfree(raw_packet);
246 
247 	return err ? -EOPNOTSUPP : 0;
248 }
249 
250 #define I40E_TCPIP_DUMMY_PACKET_LEN 54
251 /**
252  * i40e_add_del_fdir_tcpv4 - Add/Remove TCPv4 filters
253  * @vsi: pointer to the targeted VSI
254  * @fd_data: the flow director data required for the FDir descriptor
255  * @add: true adds a filter, false removes it
256  *
257  * Returns 0 if the filters were successfully added or removed
258  **/
259 static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi,
260 				   struct i40e_fdir_filter *fd_data,
261 				   bool add)
262 {
263 	struct i40e_pf *pf = vsi->back;
264 	struct tcphdr *tcp;
265 	struct iphdr *ip;
266 	bool err = false;
267 	u8 *raw_packet;
268 	int ret;
269 	/* Dummy packet */
270 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
271 		0x45, 0, 0, 0x28, 0, 0, 0x40, 0, 0x40, 0x6, 0, 0, 0, 0, 0, 0,
272 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80, 0x11,
273 		0x0, 0x72, 0, 0, 0, 0};
274 
275 	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
276 	if (!raw_packet)
277 		return -ENOMEM;
278 	memcpy(raw_packet, packet, I40E_TCPIP_DUMMY_PACKET_LEN);
279 
280 	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
281 	tcp = (struct tcphdr *)(raw_packet + IP_HEADER_OFFSET
282 	      + sizeof(struct iphdr));
283 
284 	ip->daddr = fd_data->dst_ip[0];
285 	tcp->dest = fd_data->dst_port;
286 	ip->saddr = fd_data->src_ip[0];
287 	tcp->source = fd_data->src_port;
288 
289 	if (add) {
290 		pf->fd_tcp_rule++;
291 		if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) &&
292 		    I40E_DEBUG_FD & pf->hw.debug_mask)
293 			dev_info(&pf->pdev->dev, "Forcing ATR off, sideband rules for TCP/IPv4 flow being applied\n");
294 		pf->auto_disable_flags |= I40E_FLAG_FD_ATR_ENABLED;
295 	} else {
296 		pf->fd_tcp_rule = (pf->fd_tcp_rule > 0) ?
297 				  (pf->fd_tcp_rule - 1) : 0;
298 		if (pf->fd_tcp_rule == 0) {
299 			if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) &&
300 			    I40E_DEBUG_FD & pf->hw.debug_mask)
301 				dev_info(&pf->pdev->dev, "ATR re-enabled due to no sideband TCP/IPv4 rules\n");
302 			pf->auto_disable_flags &= ~I40E_FLAG_FD_ATR_ENABLED;
303 		}
304 	}
305 
306 	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP;
307 	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
308 
309 	if (ret) {
310 		dev_info(&pf->pdev->dev,
311 			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
312 			 fd_data->pctype, fd_data->fd_id, ret);
313 		err = true;
314 	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
315 		if (add)
316 			dev_info(&pf->pdev->dev, "Filter OK for PCTYPE %d loc = %d\n",
317 				 fd_data->pctype, fd_data->fd_id);
318 		else
319 			dev_info(&pf->pdev->dev,
320 				 "Filter deleted for PCTYPE %d loc = %d\n",
321 				 fd_data->pctype, fd_data->fd_id);
322 	}
323 
324 	if (err)
325 		kfree(raw_packet);
326 
327 	return err ? -EOPNOTSUPP : 0;
328 }
329 
330 #define I40E_IP_DUMMY_PACKET_LEN 34
331 /**
332  * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow Director filters for
333  * a specific flow spec
334  * @vsi: pointer to the targeted VSI
335  * @fd_data: the flow director data required for the FDir descriptor
336  * @add: true adds a filter, false removes it
337  *
338  * Returns 0 if the filters were successfully added or removed
339  **/
340 static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi,
341 				  struct i40e_fdir_filter *fd_data,
342 				  bool add)
343 {
344 	struct i40e_pf *pf = vsi->back;
345 	struct iphdr *ip;
346 	bool err = false;
347 	u8 *raw_packet;
348 	int ret;
349 	int i;
350 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
351 		0x45, 0, 0, 0x14, 0, 0, 0x40, 0, 0x40, 0x10, 0, 0, 0, 0, 0, 0,
352 		0, 0, 0, 0};
353 
354 	for (i = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER;
355 	     i <= I40E_FILTER_PCTYPE_FRAG_IPV4;	i++) {
356 		raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
357 		if (!raw_packet)
358 			return -ENOMEM;
359 		memcpy(raw_packet, packet, I40E_IP_DUMMY_PACKET_LEN);
360 		ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
361 
362 		ip->saddr = fd_data->src_ip[0];
363 		ip->daddr = fd_data->dst_ip[0];
364 		ip->protocol = 0;
365 
366 		fd_data->pctype = i;
367 		ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
368 
369 		if (ret) {
370 			dev_info(&pf->pdev->dev,
371 				 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
372 				 fd_data->pctype, fd_data->fd_id, ret);
373 			err = true;
374 		} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
375 			if (add)
376 				dev_info(&pf->pdev->dev,
377 					 "Filter OK for PCTYPE %d loc = %d\n",
378 					 fd_data->pctype, fd_data->fd_id);
379 			else
380 				dev_info(&pf->pdev->dev,
381 					 "Filter deleted for PCTYPE %d loc = %d\n",
382 					 fd_data->pctype, fd_data->fd_id);
383 		}
384 	}
385 
386 	if (err)
387 		kfree(raw_packet);
388 
389 	return err ? -EOPNOTSUPP : 0;
390 }
391 
392 /**
393  * i40e_add_del_fdir - Build raw packets to add/del fdir filter
394  * @vsi: pointer to the targeted VSI
395  * @input: the flow director filter to add or delete
396  * @add: true adds a filter, false removes it
397  *
398  **/
399 int i40e_add_del_fdir(struct i40e_vsi *vsi,
400 		      struct i40e_fdir_filter *input, bool add)
401 {
402 	struct i40e_pf *pf = vsi->back;
403 	int ret;
404 
405 	switch (input->flow_type & ~FLOW_EXT) {
406 	case TCP_V4_FLOW:
407 		ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
408 		break;
409 	case UDP_V4_FLOW:
410 		ret = i40e_add_del_fdir_udpv4(vsi, input, add);
411 		break;
412 	case IP_USER_FLOW:
413 		switch (input->ip4_proto) {
414 		case IPPROTO_TCP:
415 			ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
416 			break;
417 		case IPPROTO_UDP:
418 			ret = i40e_add_del_fdir_udpv4(vsi, input, add);
419 			break;
420 		case IPPROTO_IP:
421 			ret = i40e_add_del_fdir_ipv4(vsi, input, add);
422 			break;
423 		default:
424 			/* We cannot support masking based on protocol */
425 			goto unsupported_flow;
426 		}
427 		break;
428 	default:
429 unsupported_flow:
430 		dev_info(&pf->pdev->dev, "Could not specify spec type %d\n",
431 			 input->flow_type);
432 		ret = -EINVAL;
433 	}
434 
435 	/* The buffer allocated here will normally be freed by
436 	 * i40e_clean_fdir_tx_irq() as it reclaims resources after transmit
437 	 * completion. In the event of an error adding the buffer to the FDIR
438 	 * ring, it will immediately be freed. It may also be freed by
439 	 * i40e_clean_tx_ring() when closing the VSI.
440 	 */
441 	return ret;
442 }
443 
444 /**
445  * i40e_fd_handle_status - check the Programming Status for FD
446  * @rx_ring: the Rx ring for this descriptor
447  * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor.
448  * @prog_id: the id originally used for programming
449  *
450  * This is used to verify if the FD programming or invalidation
451  * requested by SW to the HW is successful or not and take actions accordingly.
452  **/
453 static void i40e_fd_handle_status(struct i40e_ring *rx_ring,
454 				  union i40e_rx_desc *rx_desc, u8 prog_id)
455 {
456 	struct i40e_pf *pf = rx_ring->vsi->back;
457 	struct pci_dev *pdev = pf->pdev;
458 	u32 fcnt_prog, fcnt_avail;
459 	u32 error;
460 	u64 qw;
461 
462 	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
463 	error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
464 		I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
465 
466 	if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
467 		pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
468 		if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
469 		    (I40E_DEBUG_FD & pf->hw.debug_mask))
470 			dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
471 				 pf->fd_inv);
472 
473 		/* Check if the programming error is for ATR.
474 		 * If so, auto disable ATR and set a state for
475 		 * flush in progress. Next time we come here if flush is in
476 		 * progress do nothing, once flush is complete the state will
477 		 * be cleared.
478 		 */
479 		if (test_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state))
480 			return;
481 
482 		pf->fd_add_err++;
483 		/* store the current atr filter count */
484 		pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);
485 
486 		if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
487 		    (pf->auto_disable_flags & I40E_FLAG_FD_SB_ENABLED)) {
488 			pf->auto_disable_flags |= I40E_FLAG_FD_ATR_ENABLED;
489 			set_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state);
490 		}
491 
492 		/* filter programming failed most likely due to table full */
493 		fcnt_prog = i40e_get_global_fd_count(pf);
494 		fcnt_avail = pf->fdir_pf_filter_count;
495 		/* If ATR is running fcnt_prog can quickly change,
496 		 * if we are very close to full, it makes sense to disable
497 		 * FD ATR/SB and then re-enable it when there is room.
498 		 */
499 		if (fcnt_prog >= (fcnt_avail - I40E_FDIR_BUFFER_FULL_MARGIN)) {
500 			if ((pf->flags & I40E_FLAG_FD_SB_ENABLED) &&
501 			    !(pf->auto_disable_flags &
502 				     I40E_FLAG_FD_SB_ENABLED)) {
503 				if (I40E_DEBUG_FD & pf->hw.debug_mask)
504 					dev_warn(&pdev->dev, "FD filter space full, new ntuple rules will not be added\n");
505 				pf->auto_disable_flags |=
506 							I40E_FLAG_FD_SB_ENABLED;
507 			}
508 		}
509 	} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
510 		if (I40E_DEBUG_FD & pf->hw.debug_mask)
511 			dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
512 				 rx_desc->wb.qword0.hi_dword.fd_id);
513 	}
514 }
515 
516 /**
517  * i40e_unmap_and_free_tx_resource - Release a Tx buffer
518  * @ring:      the ring that owns the buffer
519  * @tx_buffer: the buffer to free
520  **/
521 static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
522 					    struct i40e_tx_buffer *tx_buffer)
523 {
524 	if (tx_buffer->skb) {
525 		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
526 			kfree(tx_buffer->raw_buf);
527 		else
528 			dev_kfree_skb_any(tx_buffer->skb);
529 		if (dma_unmap_len(tx_buffer, len))
530 			dma_unmap_single(ring->dev,
531 					 dma_unmap_addr(tx_buffer, dma),
532 					 dma_unmap_len(tx_buffer, len),
533 					 DMA_TO_DEVICE);
534 	} else if (dma_unmap_len(tx_buffer, len)) {
535 		dma_unmap_page(ring->dev,
536 			       dma_unmap_addr(tx_buffer, dma),
537 			       dma_unmap_len(tx_buffer, len),
538 			       DMA_TO_DEVICE);
539 	}
540 
541 	tx_buffer->next_to_watch = NULL;
542 	tx_buffer->skb = NULL;
543 	dma_unmap_len_set(tx_buffer, len, 0);
544 	/* tx_buffer must be completely set up in the transmit path */
545 }
546 
547 /**
548  * i40e_clean_tx_ring - Free all Tx buffers in the ring
549  * @tx_ring: ring to be cleaned
550  **/
551 void i40e_clean_tx_ring(struct i40e_ring *tx_ring)
552 {
553 	unsigned long bi_size;
554 	u16 i;
555 
556 	/* ring already cleared, nothing to do */
557 	if (!tx_ring->tx_bi)
558 		return;
559 
560 	/* Free all the Tx ring sk_buffs */
561 	for (i = 0; i < tx_ring->count; i++)
562 		i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
563 
564 	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
565 	memset(tx_ring->tx_bi, 0, bi_size);
566 
567 	/* Zero out the descriptor ring */
568 	memset(tx_ring->desc, 0, tx_ring->size);
569 
570 	tx_ring->next_to_use = 0;
571 	tx_ring->next_to_clean = 0;
572 
573 	if (!tx_ring->netdev)
574 		return;
575 
576 	/* cleanup Tx queue statistics */
577 	netdev_tx_reset_queue(txring_txq(tx_ring));
578 }
579 
580 /**
581  * i40e_free_tx_resources - Free Tx resources per queue
582  * @tx_ring: Tx descriptor ring for a specific queue
583  *
584  * Free all transmit software resources
585  **/
586 void i40e_free_tx_resources(struct i40e_ring *tx_ring)
587 {
588 	i40e_clean_tx_ring(tx_ring);
589 	kfree(tx_ring->tx_bi);
590 	tx_ring->tx_bi = NULL;
591 
592 	if (tx_ring->desc) {
593 		dma_free_coherent(tx_ring->dev, tx_ring->size,
594 				  tx_ring->desc, tx_ring->dma);
595 		tx_ring->desc = NULL;
596 	}
597 }
598 
599 /**
600  * i40e_get_tx_pending - how many tx descriptors not processed
601  * @ring: the ring of descriptors
602  * @in_sw: is tx_pending being checked in SW or HW
603  *
604  * Since there is no access to the ring head register
605  * in XL710, we need to use our local copies
606  **/
607 u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw)
608 {
609 	u32 head, tail;
610 
611 	if (!in_sw)
612 		head = i40e_get_head(ring);
613 	else
614 		head = ring->next_to_clean;
615 	tail = readl(ring->tail);
616 
617 	if (head != tail)
618 		return (head < tail) ?
619 			tail - head : (tail + ring->count - head);
620 
621 	return 0;
622 }
623 
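/* force a Tx descriptor write-back if fewer than WB_STRIDE descriptors
 * are still pending after a clean pass (see i40e_clean_tx_irq)
 */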
624 #define WB_STRIDE 4
625 
626 /**
627  * i40e_clean_tx_irq - Reclaim resources after transmit completes
628  * @vsi: the VSI we care about
629  * @tx_ring: Tx ring to clean
630  * @napi_budget: Used to determine if we are in netpoll
631  *
632  * Returns true if there's any budget left (e.g. the clean is finished)
633  **/
634 static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
635 			      struct i40e_ring *tx_ring, int napi_budget)
636 {
637 	u16 i = tx_ring->next_to_clean;
638 	struct i40e_tx_buffer *tx_buf;
639 	struct i40e_tx_desc *tx_head;
640 	struct i40e_tx_desc *tx_desc;
641 	unsigned int total_bytes = 0, total_packets = 0;
642 	unsigned int budget = vsi->work_limit;
643 
644 	tx_buf = &tx_ring->tx_bi[i];
645 	tx_desc = I40E_TX_DESC(tx_ring, i);
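	/* track the index as a negative offset from the end of the ring so
	 * that the wrap checks below reduce to testing !i
	 */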
646 	i -= tx_ring->count;
647 
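	/* the hardware head pointer is written back into the u32 allocated
	 * just past the end of the descriptor ring, see
	 * i40e_setup_tx_descriptors()
	 */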
648 	tx_head = I40E_TX_DESC(tx_ring, i40e_get_head(tx_ring));
649 
650 	do {
651 		struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch;
652 
653 		/* if next_to_watch is not set then there is no work pending */
654 		if (!eop_desc)
655 			break;
656 
657 		/* prevent any other reads prior to eop_desc */
658 		read_barrier_depends();
659 
660 		/* we have caught up to head, no work left to do */
661 		if (tx_head == tx_desc)
662 			break;
663 
664 		/* clear next_to_watch to prevent false hangs */
665 		tx_buf->next_to_watch = NULL;
666 
667 		/* update the statistics for this packet */
668 		total_bytes += tx_buf->bytecount;
669 		total_packets += tx_buf->gso_segs;
670 
671 		/* free the skb */
672 		napi_consume_skb(tx_buf->skb, napi_budget);
673 
674 		/* unmap skb header data */
675 		dma_unmap_single(tx_ring->dev,
676 				 dma_unmap_addr(tx_buf, dma),
677 				 dma_unmap_len(tx_buf, len),
678 				 DMA_TO_DEVICE);
679 
680 		/* clear tx_buffer data */
681 		tx_buf->skb = NULL;
682 		dma_unmap_len_set(tx_buf, len, 0);
683 
684 		/* unmap remaining buffers */
685 		while (tx_desc != eop_desc) {
686 
687 			tx_buf++;
688 			tx_desc++;
689 			i++;
690 			if (unlikely(!i)) {
691 				i -= tx_ring->count;
692 				tx_buf = tx_ring->tx_bi;
693 				tx_desc = I40E_TX_DESC(tx_ring, 0);
694 			}
695 
696 			/* unmap any remaining paged data */
697 			if (dma_unmap_len(tx_buf, len)) {
698 				dma_unmap_page(tx_ring->dev,
699 					       dma_unmap_addr(tx_buf, dma),
700 					       dma_unmap_len(tx_buf, len),
701 					       DMA_TO_DEVICE);
702 				dma_unmap_len_set(tx_buf, len, 0);
703 			}
704 		}
705 
706 		/* move us one more past the eop_desc for start of next pkt */
707 		tx_buf++;
708 		tx_desc++;
709 		i++;
710 		if (unlikely(!i)) {
711 			i -= tx_ring->count;
712 			tx_buf = tx_ring->tx_bi;
713 			tx_desc = I40E_TX_DESC(tx_ring, 0);
714 		}
715 
716 		prefetch(tx_desc);
717 
718 		/* update budget accounting */
719 		budget--;
720 	} while (likely(budget));
721 
722 	i += tx_ring->count;
723 	tx_ring->next_to_clean = i;
724 	u64_stats_update_begin(&tx_ring->syncp);
725 	tx_ring->stats.bytes += total_bytes;
726 	tx_ring->stats.packets += total_packets;
727 	u64_stats_update_end(&tx_ring->syncp);
728 	tx_ring->q_vector->tx.total_bytes += total_bytes;
729 	tx_ring->q_vector->tx.total_packets += total_packets;
730 
731 	if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) {
732 		/* check to see if there are fewer than WB_STRIDE descriptors
733 		 * waiting to be written back; if so, kick the hardware to force
734 		 * them to be written back in case we stay in NAPI.
735 		 * In this mode on X722 we do not enable interrupts.
736 		 */
737 		unsigned int j = i40e_get_tx_pending(tx_ring, false);
738 
739 		if (budget &&
740 		    ((j / WB_STRIDE) == 0) && (j > 0) &&
741 		    !test_bit(__I40E_DOWN, &vsi->state) &&
742 		    (I40E_DESC_UNUSED(tx_ring) != tx_ring->count))
743 			tx_ring->arm_wb = true;
744 	}
745 
746 	/* notify netdev of completed buffers */
747 	netdev_tx_completed_queue(txring_txq(tx_ring),
748 				  total_packets, total_bytes);
749 
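/* wake the queue once roughly two worst-case (maximally fragmented) frames'
 * worth of descriptors are free again
 */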
750 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
751 	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
752 		     (I40E_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
753 		/* Make sure that anybody stopping the queue after this
754 		 * sees the new next_to_clean.
755 		 */
756 		smp_mb();
757 		if (__netif_subqueue_stopped(tx_ring->netdev,
758 					     tx_ring->queue_index) &&
759 		   !test_bit(__I40E_DOWN, &vsi->state)) {
760 			netif_wake_subqueue(tx_ring->netdev,
761 					    tx_ring->queue_index);
762 			++tx_ring->tx_stats.restart_queue;
763 		}
764 	}
765 
766 	return !!budget;
767 }
768 
769 /**
770  * i40e_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled
771  * @vsi: the VSI we care about
772  * @q_vector: the vector on which to enable writeback
773  *
774  **/
775 static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi,
776 				  struct i40e_q_vector *q_vector)
777 {
778 	u16 flags = q_vector->tx.ring[0].flags;
779 	u32 val;
780 
781 	if (!(flags & I40E_TXR_FLAGS_WB_ON_ITR))
782 		return;
783 
784 	if (q_vector->arm_wb_state)
785 		return;
786 
787 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
788 		val = I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK |
789 		      I40E_PFINT_DYN_CTLN_ITR_INDX_MASK; /* set noitr */
790 
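		/* PFINT_DYN_CTLN is indexed from MSI-X vector 1; vector 0 is
		 * the "other" causes interrupt controlled by PFINT_DYN_CTL0,
		 * hence the base_vector - 1 adjustment
		 */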
791 		wr32(&vsi->back->hw,
792 		     I40E_PFINT_DYN_CTLN(q_vector->v_idx + vsi->base_vector - 1),
793 		     val);
794 	} else {
795 		val = I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK |
796 		      I40E_PFINT_DYN_CTL0_ITR_INDX_MASK; /* set noitr */
797 
798 		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
799 	}
800 	q_vector->arm_wb_state = true;
801 }
802 
803 /**
804  * i40e_force_wb - Issue SW Interrupt so HW does a wb
805  * @vsi: the VSI we care about
806  * @q_vector: the vector on which to force writeback
807  *
808  **/
809 void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector)
810 {
811 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
812 		u32 val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
813 			  I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */
814 			  I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK |
815 			  I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK;
816 			  /* allow 00 to be written to the index */
817 
818 		wr32(&vsi->back->hw,
819 		     I40E_PFINT_DYN_CTLN(q_vector->v_idx +
820 					 vsi->base_vector - 1), val);
821 	} else {
822 		u32 val = I40E_PFINT_DYN_CTL0_INTENA_MASK |
823 			  I40E_PFINT_DYN_CTL0_ITR_INDX_MASK | /* set noitr */
824 			  I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK |
825 			  I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK;
826 			/* allow 00 to be written to the index */
827 
828 		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
829 	}
830 }
831 
832 /**
833  * i40e_set_new_dynamic_itr - Find new ITR level
834  * @rc: structure containing ring performance data
835  *
836  * Returns true if ITR changed, false if not
837  *
838  * Stores a new ITR value based on packets and byte counts during
839  * the last interrupt.  The advantage of per interrupt computation
840  * is faster updates and more accurate ITR for the current traffic
841  * pattern.  Constants in this function were computed based on
842  * theoretical maximum wire speed and thresholds were set based on
843  * testing data as well as attempting to minimize response time
844  * while increasing bulk throughput.
845  **/
846 static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
847 {
848 	enum i40e_latency_range new_latency_range = rc->latency_range;
849 	struct i40e_q_vector *qv = rc->ring->q_vector;
850 	u32 new_itr = rc->itr;
851 	int bytes_per_int;
852 	int usecs;
853 
854 	if (rc->total_packets == 0 || !rc->itr)
855 		return false;
856 
857 	/* simple throttlerate management
858 	 *   0-10MB/s   lowest (50000 ints/s)
859 	 *  10-20MB/s   low    (20000 ints/s)
860 	 *  20-1249MB/s bulk   (18000 ints/s)
861 	 *  > 40000 Rx packets per second (8000 ints/s)
862 	 *
863 	 * The math works out because the divisor is in 10^(-6), which
864 	 * turns the bytes/us input value into MB/s values.  Be sure to
865 	 * use usecs, as the register values written are in 2 usec
866 	 * increments in the ITR registers, and to use the smoothed
867 	 * values that the countdown timer gives us.
868 	 */
869 	usecs = (rc->itr << 1) * ITR_COUNTDOWN_START;
870 	bytes_per_int = rc->total_bytes / usecs;
871 
872 	switch (new_latency_range) {
873 	case I40E_LOWEST_LATENCY:
874 		if (bytes_per_int > 10)
875 			new_latency_range = I40E_LOW_LATENCY;
876 		break;
877 	case I40E_LOW_LATENCY:
878 		if (bytes_per_int > 20)
879 			new_latency_range = I40E_BULK_LATENCY;
880 		else if (bytes_per_int <= 10)
881 			new_latency_range = I40E_LOWEST_LATENCY;
882 		break;
883 	case I40E_BULK_LATENCY:
884 	case I40E_ULTRA_LATENCY:
885 	default:
886 		if (bytes_per_int <= 20)
887 			new_latency_range = I40E_LOW_LATENCY;
888 		break;
889 	}
890 
891 	/* this is to adjust RX more aggressively when streaming small
892 	 * packets.  The value of 40000 was picked as it is just beyond
893 	 * what the hardware can receive per second if in low latency
894 	 * mode.
895 	 */
896 #define RX_ULTRA_PACKET_RATE 40000
897 
898 	if ((((rc->total_packets * 1000000) / usecs) > RX_ULTRA_PACKET_RATE) &&
899 	    (&qv->rx == rc))
900 		new_latency_range = I40E_ULTRA_LATENCY;
901 
902 	rc->latency_range = new_latency_range;
903 
904 	switch (new_latency_range) {
905 	case I40E_LOWEST_LATENCY:
906 		new_itr = I40E_ITR_50K;
907 		break;
908 	case I40E_LOW_LATENCY:
909 		new_itr = I40E_ITR_20K;
910 		break;
911 	case I40E_BULK_LATENCY:
912 		new_itr = I40E_ITR_18K;
913 		break;
914 	case I40E_ULTRA_LATENCY:
915 		new_itr = I40E_ITR_8K;
916 		break;
917 	default:
918 		break;
919 	}
920 
921 	rc->total_bytes = 0;
922 	rc->total_packets = 0;
923 
924 	if (new_itr != rc->itr) {
925 		rc->itr = new_itr;
926 		return true;
927 	}
928 
929 	return false;
930 }
931 
932 /**
933  * i40e_clean_programming_status - clean the programming status descriptor
934  * @rx_ring: the rx ring that has this descriptor
935  * @rx_desc: the rx descriptor written back by HW
936  *
937  * Flow director should handle FD_FILTER_STATUS to check its filter programming
938  * status being successful or not and take actions accordingly. FCoE should
939  * handle its context/filter programming/invalidation status and take actions.
940  *
941  **/
942 static void i40e_clean_programming_status(struct i40e_ring *rx_ring,
943 					  union i40e_rx_desc *rx_desc)
944 {
945 	u64 qw;
946 	u8 id;
947 
948 	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
949 	id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
950 		  I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
951 
952 	if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
953 		i40e_fd_handle_status(rx_ring, rx_desc, id);
954 #ifdef I40E_FCOE
955 	else if ((id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_PROG_STATUS) ||
956 		 (id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_INVL_STATUS))
957 		i40e_fcoe_handle_status(rx_ring, rx_desc, id);
958 #endif
959 }
960 
961 /**
962  * i40e_setup_tx_descriptors - Allocate the Tx descriptors
963  * @tx_ring: the tx ring to set up
964  *
965  * Return 0 on success, negative on error
966  **/
967 int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
968 {
969 	struct device *dev = tx_ring->dev;
970 	int bi_size;
971 
972 	if (!dev)
973 		return -ENOMEM;
974 
975 	/* warn if we are about to overwrite the pointer */
976 	WARN_ON(tx_ring->tx_bi);
977 	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
978 	tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
979 	if (!tx_ring->tx_bi)
980 		goto err;
981 
982 	/* round up to nearest 4K */
983 	tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc);
984 	/* add u32 for head writeback; the alignment below then guarantees
985 	 * the ring is at least one cache line in size
986 	 */
987 	tx_ring->size += sizeof(u32);
988 	tx_ring->size = ALIGN(tx_ring->size, 4096);
989 	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
990 					   &tx_ring->dma, GFP_KERNEL);
991 	if (!tx_ring->desc) {
992 		dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
993 			 tx_ring->size);
994 		goto err;
995 	}
996 
997 	tx_ring->next_to_use = 0;
998 	tx_ring->next_to_clean = 0;
999 	return 0;
1000 
1001 err:
1002 	kfree(tx_ring->tx_bi);
1003 	tx_ring->tx_bi = NULL;
1004 	return -ENOMEM;
1005 }
1006 
1007 /**
1008  * i40e_clean_rx_ring - Free Rx buffers
1009  * @rx_ring: ring to be cleaned
1010  **/
1011 void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
1012 {
1013 	struct device *dev = rx_ring->dev;
1014 	unsigned long bi_size;
1015 	u16 i;
1016 
1017 	/* ring already cleared, nothing to do */
1018 	if (!rx_ring->rx_bi)
1019 		return;
1020 
1021 	if (rx_ring->skb) {
1022 		dev_kfree_skb(rx_ring->skb);
1023 		rx_ring->skb = NULL;
1024 	}
1025 
1026 	/* Free all the Rx ring sk_buffs */
1027 	for (i = 0; i < rx_ring->count; i++) {
1028 		struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
1029 
1030 		if (!rx_bi->page)
1031 			continue;
1032 
1033 		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
1034 		__free_pages(rx_bi->page, 0);
1035 
1036 		rx_bi->page = NULL;
1037 		rx_bi->page_offset = 0;
1038 	}
1039 
1040 	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
1041 	memset(rx_ring->rx_bi, 0, bi_size);
1042 
1043 	/* Zero out the descriptor ring */
1044 	memset(rx_ring->desc, 0, rx_ring->size);
1045 
1046 	rx_ring->next_to_alloc = 0;
1047 	rx_ring->next_to_clean = 0;
1048 	rx_ring->next_to_use = 0;
1049 }
1050 
1051 /**
1052  * i40e_free_rx_resources - Free Rx resources
1053  * @rx_ring: ring to clean the resources from
1054  *
1055  * Free all receive software resources
1056  **/
1057 void i40e_free_rx_resources(struct i40e_ring *rx_ring)
1058 {
1059 	i40e_clean_rx_ring(rx_ring);
1060 	kfree(rx_ring->rx_bi);
1061 	rx_ring->rx_bi = NULL;
1062 
1063 	if (rx_ring->desc) {
1064 		dma_free_coherent(rx_ring->dev, rx_ring->size,
1065 				  rx_ring->desc, rx_ring->dma);
1066 		rx_ring->desc = NULL;
1067 	}
1068 }
1069 
1070 /**
1071  * i40e_setup_rx_descriptors - Allocate Rx descriptors
1072  * @rx_ring: Rx descriptor ring (for a specific queue) to setup
1073  *
1074  * Returns 0 on success, negative on failure
1075  **/
1076 int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
1077 {
1078 	struct device *dev = rx_ring->dev;
1079 	int bi_size;
1080 
1081 	/* warn if we are about to overwrite the pointer */
1082 	WARN_ON(rx_ring->rx_bi);
1083 	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
1084 	rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
1085 	if (!rx_ring->rx_bi)
1086 		goto err;
1087 
1088 	u64_stats_init(&rx_ring->syncp);
1089 
1090 	/* Round up to nearest 4K */
1091 	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
1092 	rx_ring->size = ALIGN(rx_ring->size, 4096);
1093 	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
1094 					   &rx_ring->dma, GFP_KERNEL);
1095 
1096 	if (!rx_ring->desc) {
1097 		dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
1098 			 rx_ring->size);
1099 		goto err;
1100 	}
1101 
1102 	rx_ring->next_to_alloc = 0;
1103 	rx_ring->next_to_clean = 0;
1104 	rx_ring->next_to_use = 0;
1105 
1106 	return 0;
1107 err:
1108 	kfree(rx_ring->rx_bi);
1109 	rx_ring->rx_bi = NULL;
1110 	return -ENOMEM;
1111 }
1112 
1113 /**
1114  * i40e_release_rx_desc - Store the new tail and head values
1115  * @rx_ring: ring to bump
1116  * @val: new head index
1117  **/
1118 static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
1119 {
1120 	rx_ring->next_to_use = val;
1121 
1122 	/* update next to alloc since we have filled the ring */
1123 	rx_ring->next_to_alloc = val;
1124 
1125 	/* Force memory writes to complete before letting h/w
1126 	 * know there are new descriptors to fetch.  (Only
1127 	 * applicable for weak-ordered memory model archs,
1128 	 * such as IA-64).
1129 	 */
1130 	wmb();
1131 	writel(val, rx_ring->tail);
1132 }
1133 
1134 /**
1135  * i40e_alloc_mapped_page - recycle or make a new page
1136  * @rx_ring: ring to use
1137  * @bi: rx_buffer struct to modify
1138  *
1139  * Returns true if the page was successfully allocated or
1140  * reused.
1141  **/
1142 static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
1143 				   struct i40e_rx_buffer *bi)
1144 {
1145 	struct page *page = bi->page;
1146 	dma_addr_t dma;
1147 
1148 	/* since we are recycling buffers we should seldom need to alloc */
1149 	if (likely(page)) {
1150 		rx_ring->rx_stats.page_reuse_count++;
1151 		return true;
1152 	}
1153 
1154 	/* alloc new page for storage */
1155 	page = dev_alloc_page();
1156 	if (unlikely(!page)) {
1157 		rx_ring->rx_stats.alloc_page_failed++;
1158 		return false;
1159 	}
1160 
1161 	/* map page for use */
1162 	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
1163 
1164 	/* if mapping failed free memory back to system since
1165 	 * there isn't much point in holding memory we can't use
1166 	 */
1167 	if (dma_mapping_error(rx_ring->dev, dma)) {
1168 		__free_pages(page, 0);
1169 		rx_ring->rx_stats.alloc_page_failed++;
1170 		return false;
1171 	}
1172 
1173 	bi->dma = dma;
1174 	bi->page = page;
1175 	bi->page_offset = 0;
1176 
1177 	return true;
1178 }
1179 
1180 /**
1181  * i40e_receive_skb - Send a completed packet up the stack
1182  * @rx_ring:  rx ring in play
1183  * @skb: packet to send up
1184  * @vlan_tag: vlan tag for packet
1185  **/
1186 static void i40e_receive_skb(struct i40e_ring *rx_ring,
1187 			     struct sk_buff *skb, u16 vlan_tag)
1188 {
1189 	struct i40e_q_vector *q_vector = rx_ring->q_vector;
1190 
1191 	if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
1192 	    (vlan_tag & VLAN_VID_MASK))
1193 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
1194 
1195 	napi_gro_receive(&q_vector->napi, skb);
1196 }
1197 
1198 /**
1199  * i40e_alloc_rx_buffers - Replace used receive buffers
1200  * @rx_ring: ring to place buffers on
1201  * @cleaned_count: number of buffers to replace
1202  *
1203  * Returns false if all allocations were successful, true if any fail
1204  **/
1205 bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
1206 {
1207 	u16 ntu = rx_ring->next_to_use;
1208 	union i40e_rx_desc *rx_desc;
1209 	struct i40e_rx_buffer *bi;
1210 
1211 	/* do nothing if no valid netdev defined */
1212 	if (!rx_ring->netdev || !cleaned_count)
1213 		return false;
1214 
1215 	rx_desc = I40E_RX_DESC(rx_ring, ntu);
1216 	bi = &rx_ring->rx_bi[ntu];
1217 
1218 	do {
1219 		if (!i40e_alloc_mapped_page(rx_ring, bi))
1220 			goto no_buffers;
1221 
1222 		/* Refresh the desc even if buffer_addrs didn't change
1223 		 * because each write-back erases this info.
1224 		 */
1225 		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
1226 
1227 		rx_desc++;
1228 		bi++;
1229 		ntu++;
1230 		if (unlikely(ntu == rx_ring->count)) {
1231 			rx_desc = I40E_RX_DESC(rx_ring, 0);
1232 			bi = rx_ring->rx_bi;
1233 			ntu = 0;
1234 		}
1235 
1236 		/* clear the status bits for the next_to_use descriptor */
1237 		rx_desc->wb.qword1.status_error_len = 0;
1238 
1239 		cleaned_count--;
1240 	} while (cleaned_count);
1241 
1242 	if (rx_ring->next_to_use != ntu)
1243 		i40e_release_rx_desc(rx_ring, ntu);
1244 
1245 	return false;
1246 
1247 no_buffers:
1248 	if (rx_ring->next_to_use != ntu)
1249 		i40e_release_rx_desc(rx_ring, ntu);
1250 
1251 	/* make sure to come back via polling to try again after
1252 	 * allocation failure
1253 	 */
1254 	return true;
1255 }
1256 
1257 /**
1258  * i40e_rx_checksum - Indicate in skb if hw indicated a good cksum
1259  * @vsi: the VSI we care about
1260  * @skb: skb currently being received and modified
1261  * @rx_desc: the receive descriptor
1262  *
1263  * skb->protocol must be set before this function is called
1264  **/
1265 static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
1266 				    struct sk_buff *skb,
1267 				    union i40e_rx_desc *rx_desc)
1268 {
1269 	struct i40e_rx_ptype_decoded decoded;
1270 	u32 rx_error, rx_status;
1271 	bool ipv4, ipv6;
1272 	u8 ptype;
1273 	u64 qword;
1274 
1275 	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1276 	ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT;
1277 	rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >>
1278 		   I40E_RXD_QW1_ERROR_SHIFT;
1279 	rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
1280 		    I40E_RXD_QW1_STATUS_SHIFT;
1281 	decoded = decode_rx_desc_ptype(ptype);
1282 
1283 	skb->ip_summed = CHECKSUM_NONE;
1284 
1285 	skb_checksum_none_assert(skb);
1286 
1287 	/* Rx csum enabled and ip headers found? */
1288 	if (!(vsi->netdev->features & NETIF_F_RXCSUM))
1289 		return;
1290 
1291 	/* did the hardware decode the packet and checksum? */
1292 	if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT)))
1293 		return;
1294 
1295 	/* both known and outer_ip must be set for the below code to work */
1296 	if (!(decoded.known && decoded.outer_ip))
1297 		return;
1298 
1299 	ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
1300 	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4);
1301 	ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
1302 	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6);
1303 
1304 	if (ipv4 &&
1305 	    (rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) |
1306 			 BIT(I40E_RX_DESC_ERROR_EIPE_SHIFT))))
1307 		goto checksum_fail;
1308 
1309 	/* likely incorrect csum if alternate IP extension headers found */
1310 	if (ipv6 &&
1311 	    rx_status & BIT(I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))
1312 		/* don't increment checksum err here, non-fatal err */
1313 		return;
1314 
1315 	/* there was some L4 error, count error and punt packet to the stack */
1316 	if (rx_error & BIT(I40E_RX_DESC_ERROR_L4E_SHIFT))
1317 		goto checksum_fail;
1318 
1319 	/* handle packets that were not able to be checksummed due
1320 	 * to arrival speed, in this case the stack can compute
1321 	 * the csum.
1322 	 */
1323 	if (rx_error & BIT(I40E_RX_DESC_ERROR_PPRS_SHIFT))
1324 		return;
1325 
1326 	/* If there is an outer header present that might contain a checksum
1327 	 * we need to bump the checksum level by 1 to reflect the fact that
1328 	 * we are indicating we validated the inner checksum.
1329 	 */
1330 	if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT)
1331 		skb->csum_level = 1;
1332 
1333 	/* Only report checksum unnecessary for TCP, UDP, or SCTP */
1334 	switch (decoded.inner_prot) {
1335 	case I40E_RX_PTYPE_INNER_PROT_TCP:
1336 	case I40E_RX_PTYPE_INNER_PROT_UDP:
1337 	case I40E_RX_PTYPE_INNER_PROT_SCTP:
1338 		skb->ip_summed = CHECKSUM_UNNECESSARY;
1339 		/* fall through */
1340 	default:
1341 		break;
1342 	}
1343 
1344 	return;
1345 
1346 checksum_fail:
1347 	vsi->back->hw_csum_rx_error++;
1348 }
1349 
1350 /**
1351  * i40e_ptype_to_htype - get a hash type
1352  * @ptype: the ptype value from the descriptor
1353  *
1354  * Returns a hash type to be used by skb_set_hash
1355  **/
1356 static inline int i40e_ptype_to_htype(u8 ptype)
1357 {
1358 	struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);
1359 
1360 	if (!decoded.known)
1361 		return PKT_HASH_TYPE_NONE;
1362 
1363 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1364 	    decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4)
1365 		return PKT_HASH_TYPE_L4;
1366 	else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1367 		 decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3)
1368 		return PKT_HASH_TYPE_L3;
1369 	else
1370 		return PKT_HASH_TYPE_L2;
1371 }
1372 
1373 /**
1374  * i40e_rx_hash - set the hash value in the skb
1375  * @ring: descriptor ring
1376  * @rx_desc: specific descriptor
1377  **/
1378 static inline void i40e_rx_hash(struct i40e_ring *ring,
1379 				union i40e_rx_desc *rx_desc,
1380 				struct sk_buff *skb,
1381 				u8 rx_ptype)
1382 {
1383 	u32 hash;
1384 	const __le64 rss_mask =
1385 		cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH <<
1386 			    I40E_RX_DESC_STATUS_FLTSTAT_SHIFT);
1387 
1388 	if (!(ring->netdev->features & NETIF_F_RXHASH))
1389 		return;
1390 
1391 	if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
1392 		hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
1393 		skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype));
1394 	}
1395 }
1396 
1397 /**
1398  * i40e_process_skb_fields - Populate skb header fields from Rx descriptor
1399  * @rx_ring: rx descriptor ring packet is being transacted on
1400  * @rx_desc: pointer to the EOP Rx descriptor
1401  * @skb: pointer to current skb being populated
1402  * @rx_ptype: the packet type decoded by hardware
1403  *
1404  * This function checks the ring, descriptor, and packet information in
1405  * order to populate the hash, checksum, VLAN, protocol, and
1406  * other fields within the skb.
1407  **/
1408 static inline
1409 void i40e_process_skb_fields(struct i40e_ring *rx_ring,
1410 			     union i40e_rx_desc *rx_desc, struct sk_buff *skb,
1411 			     u8 rx_ptype)
1412 {
1413 	u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1414 	u32 rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
1415 			I40E_RXD_QW1_STATUS_SHIFT;
1416 	u32 tsynvalid = rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK;
1417 	u32 tsyn = (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >>
1418 		   I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT;
1419 
1420 	if (unlikely(tsynvalid))
1421 		i40e_ptp_rx_hwtstamp(rx_ring->vsi->back, skb, tsyn);
1422 
1423 	i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
1424 
1425 	/* modifies the skb - consumes the enet header */
1426 	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
1427 
1428 	i40e_rx_checksum(rx_ring->vsi, skb, rx_desc);
1429 
1430 	skb_record_rx_queue(skb, rx_ring->queue_index);
1431 }
1432 
1433 /**
1434  * i40e_cleanup_headers - Correct empty headers
1435  * @rx_ring: rx descriptor ring packet is being transacted on
1436  * @skb: pointer to current skb being fixed
1437  *
1438  * Also address the case where we are pulling data in on pages only
1439  * and as such no data is present in the skb header.
1440  *
1441  * In addition if skb is not at least 60 bytes we need to pad it so that
1442  * it is large enough to qualify as a valid Ethernet frame.
1443  *
1444  * Returns true if an error was encountered and skb was freed.
1445  **/
1446 static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb)
1447 {
1448 	/* if eth_skb_pad returns an error the skb was freed */
1449 	if (eth_skb_pad(skb))
1450 		return true;
1451 
1452 	return false;
1453 }
1454 
1455 /**
1456  * i40e_reuse_rx_page - page flip buffer and store it back on the ring
1457  * @rx_ring: rx descriptor ring to store buffers on
1458  * @old_buff: donor buffer to have page reused
1459  *
1460  * Synchronizes page for reuse by the adapter
1461  **/
1462 static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
1463 			       struct i40e_rx_buffer *old_buff)
1464 {
1465 	struct i40e_rx_buffer *new_buff;
1466 	u16 nta = rx_ring->next_to_alloc;
1467 
1468 	new_buff = &rx_ring->rx_bi[nta];
1469 
1470 	/* update, and store next to alloc */
1471 	nta++;
1472 	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
1473 
1474 	/* transfer page from old buffer to new buffer */
1475 	*new_buff = *old_buff;
1476 }
1477 
1478 /**
1479  * i40e_page_is_reusable - check if any reuse is possible
1480  * @page: page struct to check
1481  *
1482  * A page is not reusable if it was allocated under low memory
1483  * conditions, or it's not in the same NUMA node as this CPU.
1484  */
1485 static inline bool i40e_page_is_reusable(struct page *page)
1486 {
1487 	return (page_to_nid(page) == numa_mem_id()) &&
1488 		!page_is_pfmemalloc(page);
1489 }
1490 
1491 /**
1492  * i40e_can_reuse_rx_page - Determine if this page can be reused by
1493  * the adapter for another receive
1494  *
1495  * @rx_buffer: buffer containing the page
1496  * @page: page address from rx_buffer
1497  * @truesize: actual size of the buffer in this page
1498  *
1499  * If page is reusable, rx_buffer->page_offset is adjusted to point to
1500  * an unused region in the page.
1501  *
1502  * For small pages, @truesize will be a constant value, half the size
1503  * of the memory at page.  We'll attempt to alternate between high and
1504  * low halves of the page, with one half ready for use by the hardware
1505  * and the other half being consumed by the stack.  We use the page
1506  * ref count to determine whether the stack has finished consuming the
1507  * portion of this page that was passed up with a previous packet.  If
1508  * the page ref count is >1, we'll assume the "other" half page is
1509  * still busy, and this page cannot be reused.
1510  *
1511  * For larger pages, @truesize will be the actual space used by the
1512  * received packet (adjusted upward to an even multiple of the cache
1513  * line size).  This will advance through the page by the amount
1514  * actually consumed by the received packets while there is still
1515  * space for a buffer.  Each region of larger pages will be used at
1516  * most once, after which the page will not be reused.
1517  *
1518  * In either case, if the page is reusable its refcount is increased.
1519  **/
1520 static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer,
1521 				   struct page *page,
1522 				   const unsigned int truesize)
1523 {
1524 #if (PAGE_SIZE >= 8192)
1525 	unsigned int last_offset = PAGE_SIZE - I40E_RXBUFFER_2048;
1526 #endif
1527 
1528 	/* Is any reuse possible? */
1529 	if (unlikely(!i40e_page_is_reusable(page)))
1530 		return false;
1531 
1532 #if (PAGE_SIZE < 8192)
1533 	/* if we are only owner of page we can reuse it */
1534 	if (unlikely(page_count(page) != 1))
1535 		return false;
1536 
1537 	/* flip page offset to other buffer */
1538 	rx_buffer->page_offset ^= truesize;
1539 #else
1540 	/* move offset up to the next cache line */
1541 	rx_buffer->page_offset += truesize;
1542 
1543 	if (rx_buffer->page_offset > last_offset)
1544 		return false;
1545 #endif
1546 
1547 	/* Inc ref count on page before passing it up to the stack */
1548 	get_page(page);
1549 
1550 	return true;
1551 }
1552 
1553 /**
1554  * i40e_add_rx_frag - Add contents of Rx buffer to sk_buff
1555  * @rx_ring: rx descriptor ring to transact packets on
1556  * @rx_buffer: buffer containing page to add
1557  * @size: packet length from rx_desc
1558  * @skb: sk_buff to place the data into
1559  *
1560  * This function will add the data contained in rx_buffer->page to the skb.
1561  * This is done either through a direct copy if the data in the buffer is
1562  * less than the skb header size, otherwise it will just attach the page as
1563  * a frag to the skb.
1564  *
1565  * The function will then update the page offset if necessary and return
1566  * true if the buffer can be reused by the adapter.
1567  **/
1568 static bool i40e_add_rx_frag(struct i40e_ring *rx_ring,
1569 			     struct i40e_rx_buffer *rx_buffer,
1570 			     unsigned int size,
1571 			     struct sk_buff *skb)
1572 {
1573 	struct page *page = rx_buffer->page;
1574 	unsigned char *va = page_address(page) + rx_buffer->page_offset;
1575 #if (PAGE_SIZE < 8192)
1576 	unsigned int truesize = I40E_RXBUFFER_2048;
1577 #else
1578 	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
1579 #endif
1580 	unsigned int pull_len;
1581 
1582 	if (unlikely(skb_is_nonlinear(skb)))
1583 		goto add_tail_frag;
1584 
1585 	/* will the data fit in the skb we allocated? if so, just
1586 	 * copy it as it is pretty small anyway
1587 	 */
1588 	if (size <= I40E_RX_HDR_SIZE) {
1589 		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
1590 
1591 		/* page is reusable, we can reuse buffer as-is */
1592 		if (likely(i40e_page_is_reusable(page)))
1593 			return true;
1594 
1595 		/* this page cannot be reused so discard it */
1596 		__free_pages(page, 0);
1597 		return false;
1598 	}
1599 
1600 	/* we need the header to contain the greater of either
1601 	 * ETH_HLEN or 60 bytes if the skb->len is less than
1602 	 * 60 for skb_pad.
1603 	 */
1604 	pull_len = eth_get_headlen(va, I40E_RX_HDR_SIZE);
1605 
1606 	/* align pull length to size of long to optimize
1607 	 * memcpy performance
1608 	 */
1609 	memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long)));
1610 
1611 	/* update all of the pointers */
1612 	va += pull_len;
1613 	size -= pull_len;
1614 
1615 add_tail_frag:
1616 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
1617 			(unsigned long)va & ~PAGE_MASK, size, truesize);
1618 
1619 	return i40e_can_reuse_rx_page(rx_buffer, page, truesize);
1620 }
1621 
1622 /**
1623  * i40e_fetch_rx_buffer - Allocate skb and populate it
1624  * @rx_ring: rx descriptor ring to transact packets on
1625  * @rx_desc: descriptor containing info written by hardware
1626  *
1627  * This function allocates an skb on the fly, and populates it with the page
1628  * data from the current receive descriptor, taking care to set up the skb
1629  * correctly, as well as handling calling the page recycle function if
1630  * necessary.
1631  */
1632 static inline
1633 struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
1634 				     union i40e_rx_desc *rx_desc,
1635 				     struct sk_buff *skb)
1636 {
1637 	u64 local_status_error_len =
1638 		le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1639 	unsigned int size =
1640 		(local_status_error_len & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1641 		I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1642 	struct i40e_rx_buffer *rx_buffer;
1643 	struct page *page;
1644 
1645 	rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
1646 	page = rx_buffer->page;
1647 	prefetchw(page);
1648 
1649 	if (likely(!skb)) {
1650 		void *page_addr = page_address(page) + rx_buffer->page_offset;
1651 
1652 		/* prefetch first cache line of first page */
1653 		prefetch(page_addr);
1654 #if L1_CACHE_BYTES < 128
1655 		prefetch(page_addr + L1_CACHE_BYTES);
1656 #endif
1657 
1658 		/* allocate a skb to store the frags */
1659 		skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
1660 				       I40E_RX_HDR_SIZE,
1661 				       GFP_ATOMIC | __GFP_NOWARN);
1662 		if (unlikely(!skb)) {
1663 			rx_ring->rx_stats.alloc_buff_failed++;
1664 			return NULL;
1665 		}
1666 
1667 		/* we will be copying header into skb->data in
1668 		 * pskb_may_pull so it is in our interest to prefetch
1669 		 * it now to avoid a possible cache miss
1670 		 */
1671 		prefetchw(skb->data);
1672 	}
1673 
1674 	/* we are reusing so sync this buffer for CPU use */
1675 	dma_sync_single_range_for_cpu(rx_ring->dev,
1676 				      rx_buffer->dma,
1677 				      rx_buffer->page_offset,
1678 				      size,
1679 				      DMA_FROM_DEVICE);
1680 
1681 	/* pull page into skb */
1682 	if (i40e_add_rx_frag(rx_ring, rx_buffer, size, skb)) {
1683 		/* hand second half of page back to the ring */
1684 		i40e_reuse_rx_page(rx_ring, rx_buffer);
1685 		rx_ring->rx_stats.page_reuse_count++;
1686 	} else {
1687 		/* we are not reusing the buffer so unmap it */
1688 		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
1689 			       DMA_FROM_DEVICE);
1690 	}
1691 
1692 	/* clear contents of buffer_info */
1693 	rx_buffer->page = NULL;
1694 
1695 	return skb;
1696 }
1697 
1698 /**
1699  * i40e_is_non_eop - process handling of non-EOP buffers
1700  * @rx_ring: Rx ring being processed
1701  * @rx_desc: Rx descriptor for current buffer
1702  * @skb: Current socket buffer containing buffer in progress
1703  *
1704  * This function updates next to clean.  If the buffer is an EOP buffer
1705  * this function exits returning false, otherwise it will place the
1706  * sk_buff in the next buffer to be chained and return true indicating
1707  * that this is in fact a non-EOP buffer.
1708  **/
1709 static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
1710 			    union i40e_rx_desc *rx_desc,
1711 			    struct sk_buff *skb)
1712 {
1713 	u32 ntc = rx_ring->next_to_clean + 1;
1714 
1715 	/* fetch, update, and store next to clean */
1716 	ntc = (ntc < rx_ring->count) ? ntc : 0;
1717 	rx_ring->next_to_clean = ntc;
1718 
1719 	prefetch(I40E_RX_DESC(rx_ring, ntc));
1720 
1721 #define staterrlen rx_desc->wb.qword1.status_error_len
1722 	if (unlikely(i40e_rx_is_programming_status(le64_to_cpu(staterrlen)))) {
1723 		i40e_clean_programming_status(rx_ring, rx_desc);
1724 		return true;
1725 	}
1726 	/* if we are the last buffer then there is nothing else to do */
1727 #define I40E_RXD_EOF BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)
1728 	if (likely(i40e_test_staterr(rx_desc, I40E_RXD_EOF)))
1729 		return false;
1730 
1731 	rx_ring->rx_stats.non_eop_descs++;
1732 
1733 	return true;
1734 }
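
/*
 * The next_to_clean update above is a branch-based ring advance rather
 * than a modulo.  For example, with a 512-descriptor ring:
 *
 *   next_to_clean = 510 -> ntc = 511 (< count, no wrap)
 *   next_to_clean = 511 -> ntc = 512 -> wraps back to 0
 *
 * The descriptor at the new index is prefetched because the clean loop
 * will read it on its very next iteration.
 */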
1735 
1736 /**
1737  * i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
1738  * @rx_ring: rx descriptor ring to transact packets on
1739  * @budget: Total limit on number of packets to process
1740  *
1741  * This function provides a "bounce buffer" approach to Rx interrupt
1742  * processing.  The advantage to this is that on systems that have
1743  * expensive overhead for IOMMU access this provides a means of avoiding
1744  * it by maintaining the mapping of the page to the system.
1745  *
1746  * Returns amount of work completed
1747  **/
1748 static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
1749 {
1750 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
1751 	struct sk_buff *skb = rx_ring->skb;
1752 	u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
1753 	bool failure = false;
1754 
1755 	while (likely(total_rx_packets < budget)) {
1756 		union i40e_rx_desc *rx_desc;
1757 		u16 vlan_tag;
1758 		u8 rx_ptype;
1759 		u64 qword;
1760 
1761 		/* return some buffers to hardware, one at a time is too slow */
1762 		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
1763 			failure = failure ||
1764 				  i40e_alloc_rx_buffers(rx_ring, cleaned_count);
1765 			cleaned_count = 0;
1766 		}
1767 
1768 		rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
1769 
1770 		/* status_error_len will always be zero for unused descriptors
1771 		 * because it's cleared in cleanup and overlaps with hdr_addr,
1772 		 * which is always zero because packet split isn't used.  If the
1773 		 * hardware wrote DD then it will be non-zero.
1774 		 */
1775 		if (!i40e_test_staterr(rx_desc,
1776 				       BIT(I40E_RX_DESC_STATUS_DD_SHIFT)))
1777 			break;
1778 
1779 		/* This memory barrier is needed to keep us from reading
1780 		 * any other fields out of the rx_desc until we know the
1781 		 * DD bit is set.
1782 		 */
1783 		dma_rmb();
1784 
1785 		skb = i40e_fetch_rx_buffer(rx_ring, rx_desc, skb);
1786 		if (!skb)
1787 			break;
1788 
1789 		cleaned_count++;
1790 
1791 		if (i40e_is_non_eop(rx_ring, rx_desc, skb))
1792 			continue;
1793 
1794 		/* ERR_MASK will only have valid bits if EOP set, and
1795 		 * what we are doing here is actually checking
1796 		 * I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
1797 		 * the error field
1798 		 */
1799 		if (unlikely(i40e_test_staterr(rx_desc, BIT(I40E_RXD_QW1_ERROR_SHIFT)))) {
1800 			dev_kfree_skb_any(skb);
1801 			continue;
1802 		}
1803 
1804 		if (i40e_cleanup_headers(rx_ring, skb)) {
1805 			skb = NULL;
1806 			continue;
1807 		}
1808 
1809 		/* probably a little skewed due to removing CRC */
1810 		total_rx_bytes += skb->len;
1811 
1812 		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1813 		rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
1814 			   I40E_RXD_QW1_PTYPE_SHIFT;
1815 
1816 		/* populate checksum, VLAN, and protocol */
1817 		i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
1818 
1819 #ifdef I40E_FCOE
1820 		if (unlikely(
1821 		    i40e_rx_is_fcoe(rx_ptype) &&
1822 		    !i40e_fcoe_handle_offload(rx_ring, rx_desc, skb))) {
1823 			dev_kfree_skb_any(skb);
1824 			continue;
1825 		}
1826 #endif
1827 
1828 		vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
1829 			   le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0;
1830 
1831 		i40e_receive_skb(rx_ring, skb, vlan_tag);
1832 		skb = NULL;
1833 
1834 		/* update budget accounting */
1835 		total_rx_packets++;
1836 	}
1837 
1838 	rx_ring->skb = skb;
1839 
1840 	u64_stats_update_begin(&rx_ring->syncp);
1841 	rx_ring->stats.packets += total_rx_packets;
1842 	rx_ring->stats.bytes += total_rx_bytes;
1843 	u64_stats_update_end(&rx_ring->syncp);
1844 	rx_ring->q_vector->rx.total_packets += total_rx_packets;
1845 	rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
1846 
1847 	/* guarantee a trip back through this routine if there was a failure */
1848 	return failure ? budget : total_rx_packets;
1849 }
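
/*
 * How the return value above feeds back into NAPI: reporting a count equal
 * to the budget tells the core that more work remains, so the ring stays in
 * polling mode; reporting fewer packets lets i40e_napi_poll() complete the
 * poll and re-enable the interrupt.  Returning "budget" on an allocation
 * failure therefore guarantees another trip through this routine so the
 * buffer refill can be retried.
 */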
1850 
1851 static u32 i40e_buildreg_itr(const int type, const u16 itr)
1852 {
1853 	u32 val;
1854 
1855 	val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
1856 	      /* Don't clear PBA because that can cause lost interrupts that
1857 	       * came in while we were cleaning/polling
1858 	       */
1859 	      (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) |
1860 	      (itr << I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT);
1861 
1862 	return val;
1863 }
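
/*
 * Example of composing a DYN_CTLN value with the helper above:
 *
 *   i40e_buildreg_itr(I40E_RX_ITR, q_vector->rx.itr)
 *     -> INTENA set, ITR_INDX selecting the Rx ITR, INTERVAL taken from
 *        the current Rx ITR value
 *
 *   i40e_buildreg_itr(I40E_ITR_NONE, 0), used below, re-enables the
 *   interrupt without updating either ITR.
 *
 * CLEARPBA is intentionally not part of the value, per the comment above.
 */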
1864 
1865 /* a small macro to shorten up some long lines */
1866 #define INTREG I40E_PFINT_DYN_CTLN
1867 static inline int get_rx_itr(struct i40e_vsi *vsi, int idx)
1868 {
1869 	return vsi->rx_rings[idx]->rx_itr_setting;
1870 }
1871 
1872 static inline int get_tx_itr(struct i40e_vsi *vsi, int idx)
1873 {
1874 	return vsi->tx_rings[idx]->tx_itr_setting;
1875 }
1876 
1877 /**
1878  * i40e_update_enable_itr - Update itr and re-enable MSIX interrupt
1879  * @vsi: the VSI we care about
1880  * @q_vector: q_vector for which itr is being updated and interrupt enabled
1881  *
1882  **/
1883 static inline void i40e_update_enable_itr(struct i40e_vsi *vsi,
1884 					  struct i40e_q_vector *q_vector)
1885 {
1886 	struct i40e_hw *hw = &vsi->back->hw;
1887 	bool rx = false, tx = false;
1888 	u32 rxval, txval;
1889 	int vector;
1890 	int idx = q_vector->v_idx;
1891 	int rx_itr_setting, tx_itr_setting;
1892 
1893 	vector = (q_vector->v_idx + vsi->base_vector);
1894 
1895 	/* avoid dynamic calculation if in countdown mode OR if
1896 	 * dynamic ITR is disabled for both Rx and Tx
1897 	 */
1898 	rxval = txval = i40e_buildreg_itr(I40E_ITR_NONE, 0);
1899 
1900 	rx_itr_setting = get_rx_itr(vsi, idx);
1901 	tx_itr_setting = get_tx_itr(vsi, idx);
1902 
1903 	if (q_vector->itr_countdown > 0 ||
1904 	    (!ITR_IS_DYNAMIC(rx_itr_setting) &&
1905 	     !ITR_IS_DYNAMIC(tx_itr_setting))) {
1906 		goto enable_int;
1907 	}
1908 
1909 	if (ITR_IS_DYNAMIC(rx_itr_setting)) {
1910 		rx = i40e_set_new_dynamic_itr(&q_vector->rx);
1911 		rxval = i40e_buildreg_itr(I40E_RX_ITR, q_vector->rx.itr);
1912 	}
1913 
1914 	if (ITR_IS_DYNAMIC(tx_itr_setting)) {
1915 		tx = i40e_set_new_dynamic_itr(&q_vector->tx);
1916 		txval = i40e_buildreg_itr(I40E_TX_ITR, q_vector->tx.itr);
1917 	}
1918 
1919 	if (rx || tx) {
1920 		/* get the higher of the two ITR adjustments and
1921 		 * use the same value for both ITR registers
1922 		 * when in adaptive mode (Rx and/or Tx)
1923 		 */
1924 		u16 itr = max(q_vector->tx.itr, q_vector->rx.itr);
1925 
1926 		q_vector->tx.itr = q_vector->rx.itr = itr;
1927 		txval = i40e_buildreg_itr(I40E_TX_ITR, itr);
1928 		tx = true;
1929 		rxval = i40e_buildreg_itr(I40E_RX_ITR, itr);
1930 		rx = true;
1931 	}
1932 
1933 	/* only need to enable the interrupt once, but need
1934 	 * to possibly update both ITR values
1935 	 */
1936 	if (rx) {
1937 		/* set the INTENA_MSK_MASK so that this first write
1938 		 * won't actually enable the interrupt, instead just
1939 		 * updating the ITR (it's bit 31 PF and VF)
1940 		 */
1941 		rxval |= BIT(31);
1942 		/* don't check _DOWN because interrupt isn't being enabled */
1943 		wr32(hw, INTREG(vector - 1), rxval);
1944 	}
1945 
1946 enable_int:
1947 	if (!test_bit(__I40E_DOWN, &vsi->state))
1948 		wr32(hw, INTREG(vector - 1), txval);
1949 
1950 	if (q_vector->itr_countdown)
1951 		q_vector->itr_countdown--;
1952 	else
1953 		q_vector->itr_countdown = ITR_COUNTDOWN_START;
1954 }
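
/*
 * The function above issues at most two register writes; when both ITRs
 * moved in adaptive mode the sequence is:
 *
 *   write 1: rxval | BIT(31)  - programs the Rx ITR only; bit 31 is the
 *                               INTENA_MSK bit, so the interrupt stays off
 *   write 2: txval            - programs the Tx ITR and actually re-enables
 *                               the interrupt (skipped if the VSI is DOWN)
 *
 * This way two ITR indexes can be updated while the interrupt is enabled
 * exactly once.
 */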
1955 
1956 /**
1957  * i40e_napi_poll - NAPI polling Rx/Tx cleanup routine
1958  * @napi: napi struct with our devices info in it
1959  * @budget: amount of work driver is allowed to do this pass, in packets
1960  *
1961  * This function will clean all queues associated with a q_vector.
1962  *
1963  * Returns the amount of work done
1964  **/
1965 int i40e_napi_poll(struct napi_struct *napi, int budget)
1966 {
1967 	struct i40e_q_vector *q_vector =
1968 			       container_of(napi, struct i40e_q_vector, napi);
1969 	struct i40e_vsi *vsi = q_vector->vsi;
1970 	struct i40e_ring *ring;
1971 	bool clean_complete = true;
1972 	bool arm_wb = false;
1973 	int budget_per_ring;
1974 	int work_done = 0;
1975 
1976 	if (test_bit(__I40E_DOWN, &vsi->state)) {
1977 		napi_complete(napi);
1978 		return 0;
1979 	}
1980 
1981 	/* Clear hung_detected bit */
1982 	clear_bit(I40E_Q_VECTOR_HUNG_DETECT, &q_vector->hung_detected);
1983 	/* Since the actual Tx work is minimal, we can give the Tx a larger
1984 	 * budget and be more aggressive about cleaning up the Tx descriptors.
1985 	 */
1986 	i40e_for_each_ring(ring, q_vector->tx) {
1987 		if (!i40e_clean_tx_irq(vsi, ring, budget)) {
1988 			clean_complete = false;
1989 			continue;
1990 		}
1991 		arm_wb |= ring->arm_wb;
1992 		ring->arm_wb = false;
1993 	}
1994 
1995 	/* Handle case where we are called by netpoll with a budget of 0 */
1996 	if (budget <= 0)
1997 		goto tx_only;
1998 
1999 	/* We attempt to distribute budget to each Rx queue fairly, but don't
2000 	 * allow the budget to go below 1 because that would exit polling early.
2001 	 */
2002 	budget_per_ring = max(budget/q_vector->num_ringpairs, 1);
2003 
2004 	i40e_for_each_ring(ring, q_vector->rx) {
2005 		int cleaned = i40e_clean_rx_irq(ring, budget_per_ring);
2006 
2007 		work_done += cleaned;
2008 		/* if we clean as many as budgeted, we must not be done */
2009 		if (cleaned >= budget_per_ring)
2010 			clean_complete = false;
2011 	}
2012 
2013 	/* If work not completed, return budget and polling will return */
2014 	if (!clean_complete) {
2015 		const cpumask_t *aff_mask = &q_vector->affinity_mask;
2016 		int cpu_id = smp_processor_id();
2017 
2018 		/* It is possible that the interrupt affinity has changed but,
2019 		 * if the cpu is pegged at 100%, polling will never exit while
2020 		 * traffic continues and the interrupt will be stuck on this
2021 		 * cpu.  We check to make sure affinity is correct before we
2022 		 * continue to poll, otherwise we must stop polling so the
2023 		 * interrupt can move to the correct cpu.
2024 		 */
2025 		if (likely(cpumask_test_cpu(cpu_id, aff_mask) ||
2026 			   !(vsi->back->flags & I40E_FLAG_MSIX_ENABLED))) {
2027 tx_only:
2028 			if (arm_wb) {
2029 				q_vector->tx.ring[0].tx_stats.tx_force_wb++;
2030 				i40e_enable_wb_on_itr(vsi, q_vector);
2031 			}
2032 			return budget;
2033 		}
2034 	}
2035 
2036 	if (vsi->back->flags & I40E_TXR_FLAGS_WB_ON_ITR)
2037 		q_vector->arm_wb_state = false;
2038 
2039 	/* Work is done so exit the polling mode and re-enable the interrupt */
2040 	napi_complete_done(napi, work_done);
2041 
2042 	/* If we're prematurely stopping polling to fix the interrupt
2043 	 * affinity we want to make sure polling starts back up so we
2044 	 * issue a call to i40e_force_wb which triggers a SW interrupt.
2045 	 */
2046 	if (!clean_complete)
2047 		i40e_force_wb(vsi, q_vector);
2048 	else if (!(vsi->back->flags & I40E_FLAG_MSIX_ENABLED))
2049 		i40e_irq_dynamic_enable_icr0(vsi->back, false);
2050 	else
2051 		i40e_update_enable_itr(vsi, q_vector);
2052 
2053 	return min(work_done, budget - 1);
2054 }
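
/*
 * Worked example of the Rx budget split above, assuming the typical NAPI
 * budget of 64 packets:
 *
 *   num_ringpairs = 4 -> budget_per_ring = max(64 / 4, 1) = 16
 *   num_ringpairs = 1 -> budget_per_ring = 64
 *
 * The max(..., 1) guard keeps a small budget spread across many ring pairs
 * from rounding down to 0, which would exit polling early.
 */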
2055 
2056 /**
2057  * i40e_atr - Add a Flow Director ATR filter
2058  * @tx_ring:  ring to add programming descriptor to
2059  * @skb:      send buffer
2060  * @tx_flags: send tx flags
2061  **/
2062 static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
2063 		     u32 tx_flags)
2064 {
2065 	struct i40e_filter_program_desc *fdir_desc;
2066 	struct i40e_pf *pf = tx_ring->vsi->back;
2067 	union {
2068 		unsigned char *network;
2069 		struct iphdr *ipv4;
2070 		struct ipv6hdr *ipv6;
2071 	} hdr;
2072 	struct tcphdr *th;
2073 	unsigned int hlen;
2074 	u32 flex_ptype, dtype_cmd;
2075 	int l4_proto;
2076 	u16 i;
2077 
2078 	/* make sure ATR is enabled */
2079 	if (!(pf->flags & I40E_FLAG_FD_ATR_ENABLED))
2080 		return;
2081 
2082 	if ((pf->auto_disable_flags & I40E_FLAG_FD_ATR_ENABLED))
2083 		return;
2084 
2085 	/* if sampling is disabled do nothing */
2086 	if (!tx_ring->atr_sample_rate)
2087 		return;
2088 
2089 	/* Currently only IPv4/IPv6 with TCP is supported */
2090 	if (!(tx_flags & (I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6)))
2091 		return;
2092 
2093 	/* snag network header to get L4 type and address */
2094 	hdr.network = (tx_flags & I40E_TX_FLAGS_UDP_TUNNEL) ?
2095 		      skb_inner_network_header(skb) : skb_network_header(skb);
2096 
2097 	/* Note: tx_flags gets modified to reflect inner protocols in
2098 	 * i40e_tx_enable_csum() if encapsulation is enabled.
2099 	 */
2100 	if (tx_flags & I40E_TX_FLAGS_IPV4) {
2101 		/* access ihl as u8 to avoid unaligned access on ia64 */
2102 		hlen = (hdr.network[0] & 0x0F) << 2;
2103 		l4_proto = hdr.ipv4->protocol;
2104 	} else {
2105 		hlen = hdr.network - skb->data;
2106 		l4_proto = ipv6_find_hdr(skb, &hlen, IPPROTO_TCP, NULL, NULL);
2107 		hlen -= hdr.network - skb->data;
2108 	}
2109 
2110 	if (l4_proto != IPPROTO_TCP)
2111 		return;
2112 
2113 	th = (struct tcphdr *)(hdr.network + hlen);
2114 
2115 	/* Due to lack of space, no more new filters can be programmed */
2116 	if (th->syn && (pf->auto_disable_flags & I40E_FLAG_FD_ATR_ENABLED))
2117 		return;
2118 	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
2119 	    (!(pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE))) {
2120 		/* HW ATR eviction will take care of removing filters on FIN
2121 		 * and RST packets.
2122 		 */
2123 		if (th->fin || th->rst)
2124 			return;
2125 	}
2126 
2127 	tx_ring->atr_count++;
2128 
2129 	/* sample on all syn/fin/rst packets or once every atr sample rate */
2130 	if (!th->fin &&
2131 	    !th->syn &&
2132 	    !th->rst &&
2133 	    (tx_ring->atr_count < tx_ring->atr_sample_rate))
2134 		return;
2135 
2136 	tx_ring->atr_count = 0;
2137 
2138 	/* grab the next descriptor */
2139 	i = tx_ring->next_to_use;
2140 	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);
2141 
2142 	i++;
2143 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2144 
2145 	flex_ptype = (tx_ring->queue_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT) &
2146 		      I40E_TXD_FLTR_QW0_QINDEX_MASK;
2147 	flex_ptype |= (tx_flags & I40E_TX_FLAGS_IPV4) ?
2148 		      (I40E_FILTER_PCTYPE_NONF_IPV4_TCP <<
2149 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) :
2150 		      (I40E_FILTER_PCTYPE_NONF_IPV6_TCP <<
2151 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
2152 
2153 	flex_ptype |= tx_ring->vsi->id << I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT;
2154 
2155 	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;
2156 
2157 	dtype_cmd |= (th->fin || th->rst) ?
2158 		     (I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
2159 		      I40E_TXD_FLTR_QW1_PCMD_SHIFT) :
2160 		     (I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
2161 		      I40E_TXD_FLTR_QW1_PCMD_SHIFT);
2162 
2163 	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX <<
2164 		     I40E_TXD_FLTR_QW1_DEST_SHIFT;
2165 
2166 	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID <<
2167 		     I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT;
2168 
2169 	dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
2170 	if (!(tx_flags & I40E_TX_FLAGS_UDP_TUNNEL))
2171 		dtype_cmd |=
2172 			((u32)I40E_FD_ATR_STAT_IDX(pf->hw.pf_id) <<
2173 			I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
2174 			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
2175 	else
2176 		dtype_cmd |=
2177 			((u32)I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id) <<
2178 			I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
2179 			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
2180 
2181 	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
2182 	    (!(pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)))
2183 		dtype_cmd |= I40E_TXD_FLTR_QW1_ATR_MASK;
2184 
2185 	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
2186 	fdir_desc->rsvd = cpu_to_le32(0);
2187 	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
2188 	fdir_desc->fd_id = cpu_to_le32(0);
2189 }
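
/*
 * Example of the sampling behaviour above, assuming a queue sample rate of
 * 20 (the value of atr_sample_rate depends on configuration):
 *
 *   SYN/FIN/RST segments -> always eligible to program a filter
 *   plain data segments  -> only the 20th segment since the last filter
 *                           update programs one, then atr_count resets
 *
 * FIN/RST produce a REMOVE programming descriptor, everything else an
 * ADD_UPDATE, unless hardware eviction is handling removal itself.
 */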
2190 
2191 /**
2192  * i40e_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
2193  * @skb:     send buffer
2194  * @tx_ring: ring to send buffer on
2195  * @flags:   the tx flags to be set
2196  *
2197  * Checks the skb and set up correspondingly several generic transmit flags
2198  * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
2199  *
2200  * Returns an error code to indicate the frame should be dropped upon error,
2201  * and otherwise returns 0 to indicate the flags have been set properly.
2202  **/
2203 #ifdef I40E_FCOE
2204 inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
2205 				      struct i40e_ring *tx_ring,
2206 				      u32 *flags)
2207 #else
2208 static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
2209 					     struct i40e_ring *tx_ring,
2210 					     u32 *flags)
2211 #endif
2212 {
2213 	__be16 protocol = skb->protocol;
2214 	u32  tx_flags = 0;
2215 
2216 	if (protocol == htons(ETH_P_8021Q) &&
2217 	    !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) {
2218 		/* When HW VLAN acceleration is turned off by the user the
2219 		 * stack sets the protocol to 8021q so that the driver
2220 		 * can take any steps required to support the SW only
2221 		 * VLAN handling.  In our case the driver doesn't need
2222 		 * to take any further steps so just set the protocol
2223 		 * to the encapsulated ethertype.
2224 		 */
2225 		skb->protocol = vlan_get_protocol(skb);
2226 		goto out;
2227 	}
2228 
2229 	/* if we have a HW VLAN tag being added, default to the HW one */
2230 	if (skb_vlan_tag_present(skb)) {
2231 		tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT;
2232 		tx_flags |= I40E_TX_FLAGS_HW_VLAN;
2233 	/* else if it is a SW VLAN, check the next protocol and store the tag */
2234 	} else if (protocol == htons(ETH_P_8021Q)) {
2235 		struct vlan_hdr *vhdr, _vhdr;
2236 
2237 		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(_vhdr), &_vhdr);
2238 		if (!vhdr)
2239 			return -EINVAL;
2240 
2241 		protocol = vhdr->h_vlan_encapsulated_proto;
2242 		tx_flags |= ntohs(vhdr->h_vlan_TCI) << I40E_TX_FLAGS_VLAN_SHIFT;
2243 		tx_flags |= I40E_TX_FLAGS_SW_VLAN;
2244 	}
2245 
2246 	if (!(tx_ring->vsi->back->flags & I40E_FLAG_DCB_ENABLED))
2247 		goto out;
2248 
2249 	/* Insert 802.1p priority into VLAN header */
2250 	if ((tx_flags & (I40E_TX_FLAGS_HW_VLAN | I40E_TX_FLAGS_SW_VLAN)) ||
2251 	    (skb->priority != TC_PRIO_CONTROL)) {
2252 		tx_flags &= ~I40E_TX_FLAGS_VLAN_PRIO_MASK;
2253 		tx_flags |= (skb->priority & 0x7) <<
2254 				I40E_TX_FLAGS_VLAN_PRIO_SHIFT;
2255 		if (tx_flags & I40E_TX_FLAGS_SW_VLAN) {
2256 			struct vlan_ethhdr *vhdr;
2257 			int rc;
2258 
2259 			rc = skb_cow_head(skb, 0);
2260 			if (rc < 0)
2261 				return rc;
2262 			vhdr = (struct vlan_ethhdr *)skb->data;
2263 			vhdr->h_vlan_TCI = htons(tx_flags >>
2264 						 I40E_TX_FLAGS_VLAN_SHIFT);
2265 		} else {
2266 			tx_flags |= I40E_TX_FLAGS_HW_VLAN;
2267 		}
2268 	}
2269 
2270 out:
2271 	*flags = tx_flags;
2272 	return 0;
2273 }
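
/*
 * Example of the flag packing above for a frame on VLAN 100 with 802.1p
 * priority 5 and hardware tag insertion in use (field positions come from
 * the I40E_TX_FLAGS_VLAN_* definitions in i40e_txrx.h):
 *
 *   TCI = (5 << 13) | 100 = 0xA064
 *   tx_flags |= 0xA064 << I40E_TX_FLAGS_VLAN_SHIFT;
 *   tx_flags |= I40E_TX_FLAGS_HW_VLAN;
 *
 * i40e_tx_map() later moves those upper bits into the descriptor's L2TAG1
 * field whenever I40E_TX_FLAGS_HW_VLAN is set.
 */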
2274 
2275 /**
2276  * i40e_tso - set up the tso context descriptor
2277  * @first:    pointer to first Tx buffer for xmit
2278  * @hdr_len:  ptr to the size of the packet header
2279  * @cd_type_cmd_tso_mss: Quad Word 1
2280  *
2281  * Returns 0 if no TSO is needed, 1 if TSO is set up, or a negative error code
2282  **/
2283 static int i40e_tso(struct i40e_tx_buffer *first, u8 *hdr_len,
2284 		    u64 *cd_type_cmd_tso_mss)
2285 {
2286 	struct sk_buff *skb = first->skb;
2287 	u64 cd_cmd, cd_tso_len, cd_mss;
2288 	union {
2289 		struct iphdr *v4;
2290 		struct ipv6hdr *v6;
2291 		unsigned char *hdr;
2292 	} ip;
2293 	union {
2294 		struct tcphdr *tcp;
2295 		struct udphdr *udp;
2296 		unsigned char *hdr;
2297 	} l4;
2298 	u32 paylen, l4_offset;
2299 	u16 gso_segs, gso_size;
2300 	int err;
2301 
2302 	if (skb->ip_summed != CHECKSUM_PARTIAL)
2303 		return 0;
2304 
2305 	if (!skb_is_gso(skb))
2306 		return 0;
2307 
2308 	err = skb_cow_head(skb, 0);
2309 	if (err < 0)
2310 		return err;
2311 
2312 	ip.hdr = skb_network_header(skb);
2313 	l4.hdr = skb_transport_header(skb);
2314 
2315 	/* initialize outer IP header fields */
2316 	if (ip.v4->version == 4) {
2317 		ip.v4->tot_len = 0;
2318 		ip.v4->check = 0;
2319 	} else {
2320 		ip.v6->payload_len = 0;
2321 	}
2322 
2323 	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
2324 					 SKB_GSO_GRE_CSUM |
2325 					 SKB_GSO_IPXIP4 |
2326 					 SKB_GSO_IPXIP6 |
2327 					 SKB_GSO_UDP_TUNNEL |
2328 					 SKB_GSO_UDP_TUNNEL_CSUM)) {
2329 		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
2330 		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
2331 			l4.udp->len = 0;
2332 
2333 			/* determine offset of outer transport header */
2334 			l4_offset = l4.hdr - skb->data;
2335 
2336 			/* remove payload length from outer checksum */
2337 			paylen = skb->len - l4_offset;
2338 			csum_replace_by_diff(&l4.udp->check,
2339 					     (__force __wsum)htonl(paylen));
2340 		}
2341 
2342 		/* reset pointers to inner headers */
2343 		ip.hdr = skb_inner_network_header(skb);
2344 		l4.hdr = skb_inner_transport_header(skb);
2345 
2346 		/* initialize inner IP header fields */
2347 		if (ip.v4->version == 4) {
2348 			ip.v4->tot_len = 0;
2349 			ip.v4->check = 0;
2350 		} else {
2351 			ip.v6->payload_len = 0;
2352 		}
2353 	}
2354 
2355 	/* determine offset of inner transport header */
2356 	l4_offset = l4.hdr - skb->data;
2357 
2358 	/* remove payload length from inner checksum */
2359 	paylen = skb->len - l4_offset;
2360 	csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen));
2361 
2362 	/* compute length of segmentation header */
2363 	*hdr_len = (l4.tcp->doff * 4) + l4_offset;
2364 
2365 	/* pull values out of skb_shinfo */
2366 	gso_size = skb_shinfo(skb)->gso_size;
2367 	gso_segs = skb_shinfo(skb)->gso_segs;
2368 
2369 	/* update GSO size and bytecount with header size */
2370 	first->gso_segs = gso_segs;
2371 	first->bytecount += (first->gso_segs - 1) * *hdr_len;
2372 
2373 	/* find the field values */
2374 	cd_cmd = I40E_TX_CTX_DESC_TSO;
2375 	cd_tso_len = skb->len - *hdr_len;
2376 	cd_mss = gso_size;
2377 	*cd_type_cmd_tso_mss |= (cd_cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
2378 				(cd_tso_len << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
2379 				(cd_mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
2380 	return 1;
2381 }
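
/*
 * Worked example for the TSO accounting above: a 7240-byte TCP payload with
 * a 54-byte Ethernet/IPv4/TCP header and an MSS of 1448 gives
 *
 *   gso_segs   = 5                      (7240 / 1448)
 *   *hdr_len   = 54                     (l4_offset + doff * 4)
 *   bytecount += (5 - 1) * 54 = 216     -> 7294 + 216 = 7510, i.e. five
 *                                          1502-byte frames on the wire
 *   cd_tso_len = skb->len - 54 = 7240
 *   cd_mss     = 1448
 *
 * The bytecount adjustment keeps the byte statistics and BQL accounting in
 * line with what is actually transmitted, since the header is replicated
 * for every segment.
 */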
2382 
2383 /**
2384  * i40e_tsyn - set up the tsyn context descriptor
2385  * @tx_ring:  ptr to the ring to send
2386  * @skb:      ptr to the skb we're sending
2387  * @tx_flags: the collected send information
2388  * @cd_type_cmd_tso_mss: Quad Word 1
2389  *
2390  * Returns 0 if no Tx timestamp can happen and 1 if the timestamp will happen
2391  **/
2392 static int i40e_tsyn(struct i40e_ring *tx_ring, struct sk_buff *skb,
2393 		     u32 tx_flags, u64 *cd_type_cmd_tso_mss)
2394 {
2395 	struct i40e_pf *pf;
2396 
2397 	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
2398 		return 0;
2399 
2400 	/* Tx timestamps cannot be sampled when doing TSO */
2401 	if (tx_flags & I40E_TX_FLAGS_TSO)
2402 		return 0;
2403 
2404 	/* only timestamp the outbound packet if the user has requested it and
2405 	 * we are not already transmitting a packet to be timestamped
2406 	 */
2407 	pf = i40e_netdev_to_pf(tx_ring->netdev);
2408 	if (!(pf->flags & I40E_FLAG_PTP))
2409 		return 0;
2410 
2411 	if (pf->ptp_tx &&
2412 	    !test_and_set_bit_lock(__I40E_PTP_TX_IN_PROGRESS, &pf->state)) {
2413 		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
2414 		pf->ptp_tx_skb = skb_get(skb);
2415 	} else {
2416 		return 0;
2417 	}
2418 
2419 	*cd_type_cmd_tso_mss |= (u64)I40E_TX_CTX_DESC_TSYN <<
2420 				I40E_TXD_CTX_QW1_CMD_SHIFT;
2421 
2422 	return 1;
2423 }
2424 
2425 /**
2426  * i40e_tx_enable_csum - Enable Tx checksum offloads
2427  * @skb: send buffer
2428  * @tx_flags: pointer to Tx flags currently set
2429  * @td_cmd: Tx descriptor command bits to set
2430  * @td_offset: Tx descriptor header offsets to set
2431  * @tx_ring: Tx descriptor ring
2432  * @cd_tunneling: ptr to context desc bits
2433  **/
2434 static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
2435 			       u32 *td_cmd, u32 *td_offset,
2436 			       struct i40e_ring *tx_ring,
2437 			       u32 *cd_tunneling)
2438 {
2439 	union {
2440 		struct iphdr *v4;
2441 		struct ipv6hdr *v6;
2442 		unsigned char *hdr;
2443 	} ip;
2444 	union {
2445 		struct tcphdr *tcp;
2446 		struct udphdr *udp;
2447 		unsigned char *hdr;
2448 	} l4;
2449 	unsigned char *exthdr;
2450 	u32 offset, cmd = 0;
2451 	__be16 frag_off;
2452 	u8 l4_proto = 0;
2453 
2454 	if (skb->ip_summed != CHECKSUM_PARTIAL)
2455 		return 0;
2456 
2457 	ip.hdr = skb_network_header(skb);
2458 	l4.hdr = skb_transport_header(skb);
2459 
2460 	/* compute outer L2 header size */
2461 	offset = ((ip.hdr - skb->data) / 2) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
2462 
2463 	if (skb->encapsulation) {
2464 		u32 tunnel = 0;
2465 		/* define outer network header type */
2466 		if (*tx_flags & I40E_TX_FLAGS_IPV4) {
2467 			tunnel |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
2468 				  I40E_TX_CTX_EXT_IP_IPV4 :
2469 				  I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM;
2470 
2471 			l4_proto = ip.v4->protocol;
2472 		} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
2473 			tunnel |= I40E_TX_CTX_EXT_IP_IPV6;
2474 
2475 			exthdr = ip.hdr + sizeof(*ip.v6);
2476 			l4_proto = ip.v6->nexthdr;
2477 			if (l4.hdr != exthdr)
2478 				ipv6_skip_exthdr(skb, exthdr - skb->data,
2479 						 &l4_proto, &frag_off);
2480 		}
2481 
2482 		/* define outer transport */
2483 		switch (l4_proto) {
2484 		case IPPROTO_UDP:
2485 			tunnel |= I40E_TXD_CTX_UDP_TUNNELING;
2486 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2487 			break;
2488 		case IPPROTO_GRE:
2489 			tunnel |= I40E_TXD_CTX_GRE_TUNNELING;
2490 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2491 			break;
2492 		case IPPROTO_IPIP:
2493 		case IPPROTO_IPV6:
2494 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2495 			l4.hdr = skb_inner_network_header(skb);
2496 			break;
2497 		default:
2498 			if (*tx_flags & I40E_TX_FLAGS_TSO)
2499 				return -1;
2500 
2501 			skb_checksum_help(skb);
2502 			return 0;
2503 		}
2504 
2505 		/* compute outer L3 header size */
2506 		tunnel |= ((l4.hdr - ip.hdr) / 4) <<
2507 			  I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT;
2508 
2509 		/* switch IP header pointer from outer to inner header */
2510 		ip.hdr = skb_inner_network_header(skb);
2511 
2512 		/* compute tunnel header size */
2513 		tunnel |= ((ip.hdr - l4.hdr) / 2) <<
2514 			  I40E_TXD_CTX_QW0_NATLEN_SHIFT;
2515 
2516 		/* indicate if we need to offload outer UDP header */
2517 		if ((*tx_flags & I40E_TX_FLAGS_TSO) &&
2518 		    !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
2519 		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
2520 			tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK;
2521 
2522 		/* record tunnel offload values */
2523 		*cd_tunneling |= tunnel;
2524 
2525 		/* switch L4 header pointer from outer to inner */
2526 		l4.hdr = skb_inner_transport_header(skb);
2527 		l4_proto = 0;
2528 
2529 		/* reset type as we transition from outer to inner headers */
2530 		*tx_flags &= ~(I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6);
2531 		if (ip.v4->version == 4)
2532 			*tx_flags |= I40E_TX_FLAGS_IPV4;
2533 		if (ip.v6->version == 6)
2534 			*tx_flags |= I40E_TX_FLAGS_IPV6;
2535 	}
2536 
2537 	/* Enable IP checksum offloads */
2538 	if (*tx_flags & I40E_TX_FLAGS_IPV4) {
2539 		l4_proto = ip.v4->protocol;
2540 		/* the stack computes the IP header already, the only time we
2541 		 * need the hardware to recompute it is in the case of TSO.
2542 		 */
2543 		cmd |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
2544 		       I40E_TX_DESC_CMD_IIPT_IPV4_CSUM :
2545 		       I40E_TX_DESC_CMD_IIPT_IPV4;
2546 	} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
2547 		cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
2548 
2549 		exthdr = ip.hdr + sizeof(*ip.v6);
2550 		l4_proto = ip.v6->nexthdr;
2551 		if (l4.hdr != exthdr)
2552 			ipv6_skip_exthdr(skb, exthdr - skb->data,
2553 					 &l4_proto, &frag_off);
2554 	}
2555 
2556 	/* compute inner L3 header size */
2557 	offset |= ((l4.hdr - ip.hdr) / 4) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
2558 
2559 	/* Enable L4 checksum offloads */
2560 	switch (l4_proto) {
2561 	case IPPROTO_TCP:
2562 		/* enable checksum offloads */
2563 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
2564 		offset |= l4.tcp->doff << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2565 		break;
2566 	case IPPROTO_SCTP:
2567 		/* enable SCTP checksum offload */
2568 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
2569 		offset |= (sizeof(struct sctphdr) >> 2) <<
2570 			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2571 		break;
2572 	case IPPROTO_UDP:
2573 		/* enable UDP checksum offload */
2574 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
2575 		offset |= (sizeof(struct udphdr) >> 2) <<
2576 			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2577 		break;
2578 	default:
2579 		if (*tx_flags & I40E_TX_FLAGS_TSO)
2580 			return -1;
2581 		skb_checksum_help(skb);
2582 		return 0;
2583 	}
2584 
2585 	*td_cmd |= cmd;
2586 	*td_offset |= offset;
2587 
2588 	return 1;
2589 }
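
/*
 * Worked example of the td_offset encoding above for a plain (non-tunneled)
 * IPv4/TCP frame with a 14-byte Ethernet header, 20-byte IP header and
 * 20-byte TCP header:
 *
 *   MACLEN = (ip.hdr - skb->data) / 2 = 7   (2-byte words)
 *   IPLEN  = (l4.hdr - ip.hdr) / 4    = 5   (4-byte words)
 *   L4LEN  = l4.tcp->doff             = 5   (4-byte words)
 *
 * Each value is shifted into its own field of the data descriptor's offset
 * word, while td_cmd carries the IIPT/L4T bits that tell the hardware which
 * checksums to insert.
 */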
2590 
2591 /**
2592  * i40e_create_tx_ctx - Build the Tx context descriptor
2593  * @tx_ring:  ring to create the descriptor on
2594  * @cd_type_cmd_tso_mss: Quad Word 1
2595  * @cd_tunneling: Quad Word 0 - bits 0-31
2596  * @cd_l2tag2: Quad Word 0 - bits 32-63
2597  **/
2598 static void i40e_create_tx_ctx(struct i40e_ring *tx_ring,
2599 			       const u64 cd_type_cmd_tso_mss,
2600 			       const u32 cd_tunneling, const u32 cd_l2tag2)
2601 {
2602 	struct i40e_tx_context_desc *context_desc;
2603 	int i = tx_ring->next_to_use;
2604 
2605 	if ((cd_type_cmd_tso_mss == I40E_TX_DESC_DTYPE_CONTEXT) &&
2606 	    !cd_tunneling && !cd_l2tag2)
2607 		return;
2608 
2609 	/* grab the next descriptor */
2610 	context_desc = I40E_TX_CTXTDESC(tx_ring, i);
2611 
2612 	i++;
2613 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2614 
2615 	/* cpu_to_le32 and assign to struct fields */
2616 	context_desc->tunneling_params = cpu_to_le32(cd_tunneling);
2617 	context_desc->l2tag2 = cpu_to_le16(cd_l2tag2);
2618 	context_desc->rsvd = cpu_to_le16(0);
2619 	context_desc->type_cmd_tso_mss = cpu_to_le64(cd_type_cmd_tso_mss);
2620 }
2621 
2622 /**
2623  * __i40e_maybe_stop_tx - 2nd level check for tx stop conditions
2624  * @tx_ring: the ring to be checked
2625  * @size:    the size buffer we want to assure is available
2626  *
2627  * Returns -EBUSY if a stop is needed, else 0
2628  **/
2629 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
2630 {
2631 	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
2632 	/* Memory barrier before checking head and tail */
2633 	smp_mb();
2634 
2635 	/* Check again in a case another CPU has just made room available. */
2636 	if (likely(I40E_DESC_UNUSED(tx_ring) < size))
2637 		return -EBUSY;
2638 
2639 	/* A reprieve! - use start_queue because it doesn't call schedule */
2640 	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
2641 	++tx_ring->tx_stats.restart_queue;
2642 	return 0;
2643 }
2644 
2645 /**
2646  * __i40e_chk_linearize - Check if there are more than 8 buffers per packet
2647  * @skb:      send buffer
2648  *
2649  * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire
2650  * and so we need to figure out the cases where we need to linearize the skb.
2651  *
2652  * For TSO we need to count the TSO header and segment payload separately.
2653  * As such we need to check cases where we have 7 fragments or more as we
2654  * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
2655  * the segment payload in the first descriptor, and another 7 for the
2656  * fragments.
2657  **/
2658 bool __i40e_chk_linearize(struct sk_buff *skb)
2659 {
2660 	const struct skb_frag_struct *frag, *stale;
2661 	int nr_frags, sum;
2662 
2663 	/* no need to check if number of frags is less than 7 */
2664 	nr_frags = skb_shinfo(skb)->nr_frags;
2665 	if (nr_frags < (I40E_MAX_BUFFER_TXD - 1))
2666 		return false;
2667 
2668 	/* We need to walk through the list and validate that each group
2669 	 * of 6 fragments totals at least gso_size.
2670 	 */
2671 	nr_frags -= I40E_MAX_BUFFER_TXD - 2;
2672 	frag = &skb_shinfo(skb)->frags[0];
2673 
2674 	/* Initialize sum to the negative value of gso_size minus 1.  We
2675 	 * use this as the worst case scenario in which the frag ahead
2676 	 * of us only provides one byte which is why we are limited to 6
2677 	 * descriptors for a single transmit as the header and previous
2678 	 * fragment are already consuming 2 descriptors.
2679 	 */
2680 	sum = 1 - skb_shinfo(skb)->gso_size;
2681 
2682 	/* Add size of frags 0 through 4 to create our initial sum */
2683 	sum += skb_frag_size(frag++);
2684 	sum += skb_frag_size(frag++);
2685 	sum += skb_frag_size(frag++);
2686 	sum += skb_frag_size(frag++);
2687 	sum += skb_frag_size(frag++);
2688 
2689 	/* Walk through fragments adding latest fragment, testing it, and
2690 	 * then removing stale fragments from the sum.
2691 	 */
2692 	stale = &skb_shinfo(skb)->frags[0];
2693 	for (;;) {
2694 		sum += skb_frag_size(frag++);
2695 
2696 		/* if sum is negative we failed to make sufficient progress */
2697 		if (sum < 0)
2698 			return true;
2699 
2700 		if (!nr_frags--)
2701 			break;
2702 
2703 		sum -= skb_frag_size(stale++);
2704 	}
2705 
2706 	return false;
2707 }
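
/*
 * Worked example of the sliding-window check above with gso_size = 9000
 * (jumbo-frame TSO) and ten 1000-byte fragments:
 *
 *   sum  = 1 - 9000               = -8999
 *   sum += 5 * 1000 (frags 0..4)  = -3999
 *   first loop pass adds frag 5   = -2999  -> still negative, so no group
 *   of 6 fragments holds a full segment and the skb must be linearized.
 *
 * With a 1448-byte gso_size the same layout keeps the sum positive on every
 * pass and the skb is sent as-is.
 */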
2708 
2709 /**
2710  * i40e_tx_map - Build the Tx descriptor
2711  * @tx_ring:  ring to send buffer on
2712  * @skb:      send buffer
2713  * @first:    first buffer info buffer to use
2714  * @tx_flags: collected send information
2715  * @hdr_len:  size of the packet header
2716  * @td_cmd:   the command field in the descriptor
2717  * @td_offset: offset for checksum or crc
2718  **/
2719 #ifdef I40E_FCOE
2720 inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
2721 			struct i40e_tx_buffer *first, u32 tx_flags,
2722 			const u8 hdr_len, u32 td_cmd, u32 td_offset)
2723 #else
2724 static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
2725 			       struct i40e_tx_buffer *first, u32 tx_flags,
2726 			       const u8 hdr_len, u32 td_cmd, u32 td_offset)
2727 #endif
2728 {
2729 	unsigned int data_len = skb->data_len;
2730 	unsigned int size = skb_headlen(skb);
2731 	struct skb_frag_struct *frag;
2732 	struct i40e_tx_buffer *tx_bi;
2733 	struct i40e_tx_desc *tx_desc;
2734 	u16 i = tx_ring->next_to_use;
2735 	u32 td_tag = 0;
2736 	dma_addr_t dma;
2737 	u16 desc_count = 1;
2738 
2739 	if (tx_flags & I40E_TX_FLAGS_HW_VLAN) {
2740 		td_cmd |= I40E_TX_DESC_CMD_IL2TAG1;
2741 		td_tag = (tx_flags & I40E_TX_FLAGS_VLAN_MASK) >>
2742 			 I40E_TX_FLAGS_VLAN_SHIFT;
2743 	}
2744 
2745 	first->tx_flags = tx_flags;
2746 
2747 	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
2748 
2749 	tx_desc = I40E_TX_DESC(tx_ring, i);
2750 	tx_bi = first;
2751 
2752 	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
2753 		unsigned int max_data = I40E_MAX_DATA_PER_TXD_ALIGNED;
2754 
2755 		if (dma_mapping_error(tx_ring->dev, dma))
2756 			goto dma_error;
2757 
2758 		/* record length, and DMA address */
2759 		dma_unmap_len_set(tx_bi, len, size);
2760 		dma_unmap_addr_set(tx_bi, dma, dma);
2761 
2762 		/* align size to end of page */
2763 		max_data += -dma & (I40E_MAX_READ_REQ_SIZE - 1);
2764 		tx_desc->buffer_addr = cpu_to_le64(dma);
2765 
2766 		while (unlikely(size > I40E_MAX_DATA_PER_TXD)) {
2767 			tx_desc->cmd_type_offset_bsz =
2768 				build_ctob(td_cmd, td_offset,
2769 					   max_data, td_tag);
2770 
2771 			tx_desc++;
2772 			i++;
2773 			desc_count++;
2774 
2775 			if (i == tx_ring->count) {
2776 				tx_desc = I40E_TX_DESC(tx_ring, 0);
2777 				i = 0;
2778 			}
2779 
2780 			dma += max_data;
2781 			size -= max_data;
2782 
2783 			max_data = I40E_MAX_DATA_PER_TXD_ALIGNED;
2784 			tx_desc->buffer_addr = cpu_to_le64(dma);
2785 		}
2786 
2787 		if (likely(!data_len))
2788 			break;
2789 
2790 		tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
2791 							  size, td_tag);
2792 
2793 		tx_desc++;
2794 		i++;
2795 		desc_count++;
2796 
2797 		if (i == tx_ring->count) {
2798 			tx_desc = I40E_TX_DESC(tx_ring, 0);
2799 			i = 0;
2800 		}
2801 
2802 		size = skb_frag_size(frag);
2803 		data_len -= size;
2804 
2805 		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
2806 				       DMA_TO_DEVICE);
2807 
2808 		tx_bi = &tx_ring->tx_bi[i];
2809 	}
2810 
2811 	netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);
2812 
2813 	i++;
2814 	if (i == tx_ring->count)
2815 		i = 0;
2816 
2817 	tx_ring->next_to_use = i;
2818 
2819 	i40e_maybe_stop_tx(tx_ring, DESC_NEEDED);
2820 
2821 	/* write last descriptor with EOP bit */
2822 	td_cmd |= I40E_TX_DESC_CMD_EOP;
2823 
2824 	/* We can OR these values together as they both are checked against
2825 	 * 4 below and at this point desc_count will be used as a boolean value
2826 	 * after this if/else block.
2827 	 */
2828 	desc_count |= ++tx_ring->packet_stride;
2829 
2830 	/* Algorithm to optimize tail and RS bit setting:
2831 	 * if queue is stopped
2832 	 *	mark RS bit
2833 	 *	reset packet counter
2834 	 * else if xmit_more is supported and is true
2835 	 *	advance packet counter to 4
2836 	 *	reset desc_count to 0
2837 	 *
2838 	 * if desc_count >= 4
2839 	 *	mark RS bit
2840 	 *	reset packet counter
2841 	 * if desc_count > 0
2842 	 *	update tail
2843 	 *
2844 	 * Note: If there are fewer than 4 descriptors
2845 	 * pending and interrupts were disabled the service task will
2846 	 * trigger a force WB.
2847 	 */
2848 	if (netif_xmit_stopped(txring_txq(tx_ring))) {
2849 		goto do_rs;
2850 	} else if (skb->xmit_more) {
2851 		/* set stride to arm on next packet and reset desc_count */
2852 		tx_ring->packet_stride = WB_STRIDE;
2853 		desc_count = 0;
2854 	} else if (desc_count >= WB_STRIDE) {
2855 do_rs:
2856 		/* write last descriptor with RS bit set */
2857 		td_cmd |= I40E_TX_DESC_CMD_RS;
2858 		tx_ring->packet_stride = 0;
2859 	}
2860 
2861 	tx_desc->cmd_type_offset_bsz =
2862 			build_ctob(td_cmd, td_offset, size, td_tag);
2863 
2864 	/* Force memory writes to complete before letting h/w know there
2865 	 * are new descriptors to fetch.
2866 	 *
2867 	 * We also use this memory barrier to make certain all of the
2868 	 * status bits have been updated before next_to_watch is written.
2869 	 */
2870 	wmb();
2871 
2872 	/* set next_to_watch value indicating a packet is present */
2873 	first->next_to_watch = tx_desc;
2874 
2875 	/* notify HW of packet */
2876 	if (desc_count) {
2877 		writel(i, tx_ring->tail);
2878 
2879 		/* we need this if more than one processor can write to our tail
2880 		 * at a time, it synchronizes IO on IA64/Altix systems
2881 		 */
2882 		mmiowb();
2883 	}
2884 
2885 	return;
2886 
2887 dma_error:
2888 	dev_info(tx_ring->dev, "TX DMA map failed\n");
2889 
2890 	/* clear dma mappings for failed tx_bi map */
2891 	for (;;) {
2892 		tx_bi = &tx_ring->tx_bi[i];
2893 		i40e_unmap_and_free_tx_resource(tx_ring, tx_bi);
2894 		if (tx_bi == first)
2895 			break;
2896 		if (i == 0)
2897 			i = tx_ring->count;
2898 		i--;
2899 	}
2900 
2901 	tx_ring->next_to_use = i;
2902 }
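
/*
 * Worked example of the oversized-buffer split above, assuming the limits
 * from i40e_txrx.h (I40E_MAX_DATA_PER_TXD = 16K - 1, aligned down to 12K by
 * I40E_MAX_READ_REQ_SIZE = 4K):
 *
 *   a 32K linear buffer mapped at a 4K-aligned DMA address becomes
 *     descriptor 1: 12288 bytes  (max_data, ends on a 4K boundary)
 *     descriptor 2: 12288 bytes
 *     descriptor 3:  8192 bytes  (remainder, below I40E_MAX_DATA_PER_TXD)
 *
 * Aligning each split to the read-request size keeps a descriptor's buffer
 * from spanning an extra PCIe read request.
 */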
2903 
2904 /**
2905  * i40e_xmit_frame_ring - Sends buffer on Tx ring
2906  * @skb:     send buffer
2907  * @tx_ring: ring to send buffer on
2908  *
2909  * Returns NETDEV_TX_OK if sent, else an error code
2910  **/
2911 static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
2912 					struct i40e_ring *tx_ring)
2913 {
2914 	u64 cd_type_cmd_tso_mss = I40E_TX_DESC_DTYPE_CONTEXT;
2915 	u32 cd_tunneling = 0, cd_l2tag2 = 0;
2916 	struct i40e_tx_buffer *first;
2917 	u32 td_offset = 0;
2918 	u32 tx_flags = 0;
2919 	__be16 protocol;
2920 	u32 td_cmd = 0;
2921 	u8 hdr_len = 0;
2922 	int tso, count;
2923 	int tsyn;
2924 
2925 	/* prefetch the data, we'll need it later */
2926 	prefetch(skb->data);
2927 
2928 	count = i40e_xmit_descriptor_count(skb);
2929 	if (i40e_chk_linearize(skb, count)) {
2930 		if (__skb_linearize(skb)) {
2931 			dev_kfree_skb_any(skb);
2932 			return NETDEV_TX_OK;
2933 		}
2934 		count = i40e_txd_use_count(skb->len);
2935 		tx_ring->tx_stats.tx_linearize++;
2936 	}
2937 
2938 	/* need: 1 descriptor per page * PAGE_SIZE/I40E_MAX_DATA_PER_TXD,
2939 	 *       + 1 desc for skb_head_len/I40E_MAX_DATA_PER_TXD,
2940 	 *       + 4 desc gap to avoid the cache line where head is,
2941 	 *       + 1 desc for context descriptor,
2942 	 * otherwise try next time
2943 	 */
2944 	if (i40e_maybe_stop_tx(tx_ring, count + 4 + 1)) {
2945 		tx_ring->tx_stats.tx_busy++;
2946 		return NETDEV_TX_BUSY;
2947 	}
2948 
2949 	/* record the location of the first descriptor for this packet */
2950 	first = &tx_ring->tx_bi[tx_ring->next_to_use];
2951 	first->skb = skb;
2952 	first->bytecount = skb->len;
2953 	first->gso_segs = 1;
2954 
2955 	/* prepare the xmit flags */
2956 	if (i40e_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags))
2957 		goto out_drop;
2958 
2959 	/* obtain protocol of skb */
2960 	protocol = vlan_get_protocol(skb);
2961 
2962 	/* setup IPv4/IPv6 offloads */
2963 	if (protocol == htons(ETH_P_IP))
2964 		tx_flags |= I40E_TX_FLAGS_IPV4;
2965 	else if (protocol == htons(ETH_P_IPV6))
2966 		tx_flags |= I40E_TX_FLAGS_IPV6;
2967 
2968 	tso = i40e_tso(first, &hdr_len, &cd_type_cmd_tso_mss);
2969 
2970 	if (tso < 0)
2971 		goto out_drop;
2972 	else if (tso)
2973 		tx_flags |= I40E_TX_FLAGS_TSO;
2974 
2975 	/* Always offload the checksum, since it's in the data descriptor */
2976 	tso = i40e_tx_enable_csum(skb, &tx_flags, &td_cmd, &td_offset,
2977 				  tx_ring, &cd_tunneling);
2978 	if (tso < 0)
2979 		goto out_drop;
2980 
2981 	tsyn = i40e_tsyn(tx_ring, skb, tx_flags, &cd_type_cmd_tso_mss);
2982 
2983 	if (tsyn)
2984 		tx_flags |= I40E_TX_FLAGS_TSYN;
2985 
2986 	skb_tx_timestamp(skb);
2987 
2988 	/* always enable CRC insertion offload */
2989 	td_cmd |= I40E_TX_DESC_CMD_ICRC;
2990 
2991 	i40e_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss,
2992 			   cd_tunneling, cd_l2tag2);
2993 
2994 	/* Add Flow Director ATR if it's enabled.
2995 	 *
2996 	 * NOTE: this must always be directly before the data descriptor.
2997 	 */
2998 	i40e_atr(tx_ring, skb, tx_flags);
2999 
3000 	i40e_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
3001 		    td_cmd, td_offset);
3002 
3003 	return NETDEV_TX_OK;
3004 
3005 out_drop:
3006 	dev_kfree_skb_any(first->skb);
3007 	first->skb = NULL;
3008 	return NETDEV_TX_OK;
3009 }
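
/*
 * Example of the descriptor reservation above for a small non-TSO frame
 * with a linear head and three page fragments (each small enough for a
 * single descriptor):
 *
 *   count = 4 (head + 3 frags)
 *   i40e_maybe_stop_tx(tx_ring, 4 + 4 + 1)  -> needs 9 free descriptors
 *
 * That is the data descriptors themselves, one possible context descriptor,
 * and the four-descriptor gap that keeps new descriptors off the cache line
 * the hardware head is using.
 */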
3010 
3011 /**
3012  * i40e_lan_xmit_frame - Selects the correct VSI and Tx queue to send buffer
3013  * @skb:    send buffer
3014  * @netdev: network interface device structure
3015  *
3016  * Returns NETDEV_TX_OK if sent, else an error code
3017  **/
3018 netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
3019 {
3020 	struct i40e_netdev_priv *np = netdev_priv(netdev);
3021 	struct i40e_vsi *vsi = np->vsi;
3022 	struct i40e_ring *tx_ring = vsi->tx_rings[skb->queue_mapping];
3023 
3024 	/* hardware can't handle really short frames, hardware padding works
3025 	 * beyond this point
3026 	 */
3027 	if (skb_put_padto(skb, I40E_MIN_TX_LEN))
3028 		return NETDEV_TX_OK;
3029 
3030 	return i40e_xmit_frame_ring(skb, tx_ring);
3031 }
3032