1 /*******************************************************************************
2  *
3  * Intel Ethernet Controller XL710 Family Linux Driver
4  * Copyright(c) 2013 - 2016 Intel Corporation.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  * The full GNU General Public License is included in this distribution in
19  * the file called "COPYING".
20  *
21  * Contact Information:
22  * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
23  * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
24  *
25  ******************************************************************************/
26 
27 #include <linux/prefetch.h>
28 #include <net/busy_poll.h>
29 #include "i40e.h"
30 #include "i40e_prototype.h"
31 
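/**
 * build_ctob - build the Tx data descriptor cmd_type_offset_bsz quadword
 * @td_cmd: Tx descriptor command bits
 * @td_offset: Tx descriptor header offsets
 * @size: size of the Tx buffer in bytes
 * @td_tag: L2 tag 1 (e.g. VLAN tag) value
 *
 * Packs the DATA descriptor type together with the command, offset,
 * buffer size and L2 tag fields into a single little-endian quadword.
 **/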
32 static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
33 				u32 td_tag)
34 {
35 	return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA |
36 			   ((u64)td_cmd  << I40E_TXD_QW1_CMD_SHIFT) |
37 			   ((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) |
38 			   ((u64)size  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) |
39 			   ((u64)td_tag  << I40E_TXD_QW1_L2TAG1_SHIFT));
40 }
41 
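/* default Tx descriptor command bits: End Of Packet and Report Status */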
42 #define I40E_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
43 /**
44  * i40e_fdir - Generate a Flow Director descriptor based on fdata
45  * @tx_ring: Tx ring to send buffer on
46  * @fdata: Flow director filter data
47  * @add: Indicate if we are adding a rule or deleting one
48  *
49  **/
50 static void i40e_fdir(struct i40e_ring *tx_ring,
51 		      struct i40e_fdir_filter *fdata, bool add)
52 {
53 	struct i40e_filter_program_desc *fdir_desc;
54 	struct i40e_pf *pf = tx_ring->vsi->back;
55 	u32 flex_ptype, dtype_cmd;
56 	u16 i;
57 
58 	/* grab the next descriptor */
59 	i = tx_ring->next_to_use;
60 	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);
61 
62 	i++;
63 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
64 
65 	flex_ptype = I40E_TXD_FLTR_QW0_QINDEX_MASK &
66 		     (fdata->q_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT);
67 
68 	flex_ptype |= I40E_TXD_FLTR_QW0_FLEXOFF_MASK &
69 		      (fdata->flex_off << I40E_TXD_FLTR_QW0_FLEXOFF_SHIFT);
70 
71 	flex_ptype |= I40E_TXD_FLTR_QW0_PCTYPE_MASK &
72 		      (fdata->pctype << I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
73 
74 	/* Use LAN VSI Id if not programmed by user */
75 	flex_ptype |= I40E_TXD_FLTR_QW0_DEST_VSI_MASK &
76 		      ((u32)(fdata->dest_vsi ? : pf->vsi[pf->lan_vsi]->id) <<
77 		       I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT);
78 
79 	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;
80 
81 	dtype_cmd |= add ?
82 		     I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
83 		     I40E_TXD_FLTR_QW1_PCMD_SHIFT :
84 		     I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
85 		     I40E_TXD_FLTR_QW1_PCMD_SHIFT;
86 
87 	dtype_cmd |= I40E_TXD_FLTR_QW1_DEST_MASK &
88 		     (fdata->dest_ctl << I40E_TXD_FLTR_QW1_DEST_SHIFT);
89 
90 	dtype_cmd |= I40E_TXD_FLTR_QW1_FD_STATUS_MASK &
91 		     (fdata->fd_status << I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT);
92 
93 	if (fdata->cnt_index) {
94 		dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
95 		dtype_cmd |= I40E_TXD_FLTR_QW1_CNTINDEX_MASK &
96 			     ((u32)fdata->cnt_index <<
97 			      I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT);
98 	}
99 
100 	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
101 	fdir_desc->rsvd = cpu_to_le32(0);
102 	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
103 	fdir_desc->fd_id = cpu_to_le32(fdata->fd_id);
104 }
105 
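/* max number of 1 ms waits for Tx descriptors when programming a filter */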
106 #define I40E_FD_CLEAN_DELAY 10
107 /**
108  * i40e_program_fdir_filter - Program a Flow Director filter
 * @fdir_data: Flow Director filter data describing the filter to program
110  * @raw_packet: the pre-allocated packet buffer for FDir
111  * @pf: The PF pointer
112  * @add: True for add/update, False for remove
113  **/
114 static int i40e_program_fdir_filter(struct i40e_fdir_filter *fdir_data,
115 				    u8 *raw_packet, struct i40e_pf *pf,
116 				    bool add)
117 {
118 	struct i40e_tx_buffer *tx_buf, *first;
119 	struct i40e_tx_desc *tx_desc;
120 	struct i40e_ring *tx_ring;
121 	struct i40e_vsi *vsi;
122 	struct device *dev;
123 	dma_addr_t dma;
124 	u32 td_cmd = 0;
125 	u16 i;
126 
127 	/* find existing FDIR VSI */
128 	vsi = i40e_find_vsi_by_type(pf, I40E_VSI_FDIR);
129 	if (!vsi)
130 		return -ENOENT;
131 
132 	tx_ring = vsi->tx_rings[0];
133 	dev = tx_ring->dev;
134 
135 	/* we need two descriptors to add/del a filter and we can wait */
136 	for (i = I40E_FD_CLEAN_DELAY; I40E_DESC_UNUSED(tx_ring) < 2; i--) {
137 		if (!i)
138 			return -EAGAIN;
139 		msleep_interruptible(1);
140 	}
141 
142 	dma = dma_map_single(dev, raw_packet,
143 			     I40E_FDIR_MAX_RAW_PACKET_SIZE, DMA_TO_DEVICE);
144 	if (dma_mapping_error(dev, dma))
145 		goto dma_fail;
146 
147 	/* grab the next descriptor */
148 	i = tx_ring->next_to_use;
149 	first = &tx_ring->tx_bi[i];
150 	i40e_fdir(tx_ring, fdir_data, add);
151 
152 	/* Now program a dummy descriptor */
153 	i = tx_ring->next_to_use;
154 	tx_desc = I40E_TX_DESC(tx_ring, i);
155 	tx_buf = &tx_ring->tx_bi[i];
156 
157 	tx_ring->next_to_use = ((i + 1) < tx_ring->count) ? i + 1 : 0;
158 
159 	memset(tx_buf, 0, sizeof(struct i40e_tx_buffer));
160 
161 	/* record length, and DMA address */
162 	dma_unmap_len_set(tx_buf, len, I40E_FDIR_MAX_RAW_PACKET_SIZE);
163 	dma_unmap_addr_set(tx_buf, dma, dma);
164 
165 	tx_desc->buffer_addr = cpu_to_le64(dma);
166 	td_cmd = I40E_TXD_CMD | I40E_TX_DESC_CMD_DUMMY;
167 
168 	tx_buf->tx_flags = I40E_TX_FLAGS_FD_SB;
169 	tx_buf->raw_buf = (void *)raw_packet;
170 
171 	tx_desc->cmd_type_offset_bsz =
172 		build_ctob(td_cmd, 0, I40E_FDIR_MAX_RAW_PACKET_SIZE, 0);
173 
174 	/* Force memory writes to complete before letting h/w
175 	 * know there are new descriptors to fetch.
176 	 */
177 	wmb();
178 
179 	/* Mark the data descriptor to be watched */
180 	first->next_to_watch = tx_desc;
181 
182 	writel(tx_ring->next_to_use, tx_ring->tail);
183 	return 0;
184 
185 dma_fail:
186 	return -1;
187 }
188 
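/* the IP header follows the 14-byte Ethernet header in the dummy packets */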
189 #define IP_HEADER_OFFSET 14
190 #define I40E_UDPIP_DUMMY_PACKET_LEN 42
191 /**
192  * i40e_add_del_fdir_udpv4 - Add/Remove UDPv4 filters
193  * @vsi: pointer to the targeted VSI
194  * @fd_data: the flow director data required for the FDir descriptor
195  * @add: true adds a filter, false removes it
196  *
197  * Returns 0 if the filters were successfully added or removed
198  **/
199 static int i40e_add_del_fdir_udpv4(struct i40e_vsi *vsi,
200 				   struct i40e_fdir_filter *fd_data,
201 				   bool add)
202 {
203 	struct i40e_pf *pf = vsi->back;
204 	struct udphdr *udp;
205 	struct iphdr *ip;
206 	bool err = false;
207 	u8 *raw_packet;
208 	int ret;
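	/* dummy Ethernet + IPv4 + UDP header template; addresses and
	 * ports are filled in below
	 */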
209 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
210 		0x45, 0, 0, 0x1c, 0, 0, 0x40, 0, 0x40, 0x11, 0, 0, 0, 0, 0, 0,
211 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
212 
213 	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
214 	if (!raw_packet)
215 		return -ENOMEM;
216 	memcpy(raw_packet, packet, I40E_UDPIP_DUMMY_PACKET_LEN);
217 
218 	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
219 	udp = (struct udphdr *)(raw_packet + IP_HEADER_OFFSET
220 	      + sizeof(struct iphdr));
221 
222 	ip->daddr = fd_data->dst_ip[0];
223 	udp->dest = fd_data->dst_port;
224 	ip->saddr = fd_data->src_ip[0];
225 	udp->source = fd_data->src_port;
226 
227 	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
228 	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
229 	if (ret) {
230 		dev_info(&pf->pdev->dev,
231 			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
232 			 fd_data->pctype, fd_data->fd_id, ret);
233 		err = true;
234 	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
235 		if (add)
236 			dev_info(&pf->pdev->dev,
237 				 "Filter OK for PCTYPE %d loc = %d\n",
238 				 fd_data->pctype, fd_data->fd_id);
239 		else
240 			dev_info(&pf->pdev->dev,
241 				 "Filter deleted for PCTYPE %d loc = %d\n",
242 				 fd_data->pctype, fd_data->fd_id);
243 	}
244 	if (err)
245 		kfree(raw_packet);
246 
247 	return err ? -EOPNOTSUPP : 0;
248 }
249 
250 #define I40E_TCPIP_DUMMY_PACKET_LEN 54
251 /**
252  * i40e_add_del_fdir_tcpv4 - Add/Remove TCPv4 filters
253  * @vsi: pointer to the targeted VSI
254  * @fd_data: the flow director data required for the FDir descriptor
255  * @add: true adds a filter, false removes it
256  *
257  * Returns 0 if the filters were successfully added or removed
258  **/
259 static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi,
260 				   struct i40e_fdir_filter *fd_data,
261 				   bool add)
262 {
263 	struct i40e_pf *pf = vsi->back;
264 	struct tcphdr *tcp;
265 	struct iphdr *ip;
266 	bool err = false;
267 	u8 *raw_packet;
268 	int ret;
269 	/* Dummy packet */
270 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
271 		0x45, 0, 0, 0x28, 0, 0, 0x40, 0, 0x40, 0x6, 0, 0, 0, 0, 0, 0,
272 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80, 0x11,
273 		0x0, 0x72, 0, 0, 0, 0};
274 
275 	raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
276 	if (!raw_packet)
277 		return -ENOMEM;
278 	memcpy(raw_packet, packet, I40E_TCPIP_DUMMY_PACKET_LEN);
279 
280 	ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
281 	tcp = (struct tcphdr *)(raw_packet + IP_HEADER_OFFSET
282 	      + sizeof(struct iphdr));
283 
284 	ip->daddr = fd_data->dst_ip[0];
285 	tcp->dest = fd_data->dst_port;
286 	ip->saddr = fd_data->src_ip[0];
287 	tcp->source = fd_data->src_port;
288 
289 	if (add) {
290 		pf->fd_tcp_rule++;
291 		if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) &&
292 		    I40E_DEBUG_FD & pf->hw.debug_mask)
293 			dev_info(&pf->pdev->dev, "Forcing ATR off, sideband rules for TCP/IPv4 flow being applied\n");
294 		pf->auto_disable_flags |= I40E_FLAG_FD_ATR_ENABLED;
295 	} else {
296 		pf->fd_tcp_rule = (pf->fd_tcp_rule > 0) ?
297 				  (pf->fd_tcp_rule - 1) : 0;
298 		if (pf->fd_tcp_rule == 0) {
299 			if ((pf->flags & I40E_FLAG_FD_ATR_ENABLED) &&
300 			    I40E_DEBUG_FD & pf->hw.debug_mask)
301 				dev_info(&pf->pdev->dev, "ATR re-enabled due to no sideband TCP/IPv4 rules\n");
302 			pf->auto_disable_flags &= ~I40E_FLAG_FD_ATR_ENABLED;
303 		}
304 	}
305 
306 	fd_data->pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP;
307 	ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
308 
309 	if (ret) {
310 		dev_info(&pf->pdev->dev,
311 			 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
312 			 fd_data->pctype, fd_data->fd_id, ret);
313 		err = true;
314 	} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
315 		if (add)
			dev_info(&pf->pdev->dev,
				 "Filter OK for PCTYPE %d loc = %d\n",
317 				 fd_data->pctype, fd_data->fd_id);
318 		else
319 			dev_info(&pf->pdev->dev,
320 				 "Filter deleted for PCTYPE %d loc = %d\n",
321 				 fd_data->pctype, fd_data->fd_id);
322 	}
323 
324 	if (err)
325 		kfree(raw_packet);
326 
327 	return err ? -EOPNOTSUPP : 0;
328 }
329 
330 #define I40E_IP_DUMMY_PACKET_LEN 34
331 /**
332  * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow Director filters for
333  * a specific flow spec
334  * @vsi: pointer to the targeted VSI
335  * @fd_data: the flow director data required for the FDir descriptor
336  * @add: true adds a filter, false removes it
337  *
338  * Returns 0 if the filters were successfully added or removed
339  **/
340 static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi,
341 				  struct i40e_fdir_filter *fd_data,
342 				  bool add)
343 {
344 	struct i40e_pf *pf = vsi->back;
345 	struct iphdr *ip;
346 	bool err = false;
347 	u8 *raw_packet;
348 	int ret;
349 	int i;
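	/* dummy Ethernet + IPv4 header template; addresses are
	 * filled in below
	 */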
350 	static char packet[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0,
351 		0x45, 0, 0, 0x14, 0, 0, 0x40, 0, 0x40, 0x10, 0, 0, 0, 0, 0, 0,
352 		0, 0, 0, 0};
353 
354 	for (i = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER;
355 	     i <= I40E_FILTER_PCTYPE_FRAG_IPV4;	i++) {
356 		raw_packet = kzalloc(I40E_FDIR_MAX_RAW_PACKET_SIZE, GFP_KERNEL);
357 		if (!raw_packet)
358 			return -ENOMEM;
359 		memcpy(raw_packet, packet, I40E_IP_DUMMY_PACKET_LEN);
360 		ip = (struct iphdr *)(raw_packet + IP_HEADER_OFFSET);
361 
362 		ip->saddr = fd_data->src_ip[0];
363 		ip->daddr = fd_data->dst_ip[0];
364 		ip->protocol = 0;
365 
366 		fd_data->pctype = i;
367 		ret = i40e_program_fdir_filter(fd_data, raw_packet, pf, add);
368 
369 		if (ret) {
370 			dev_info(&pf->pdev->dev,
371 				 "PCTYPE:%d, Filter command send failed for fd_id:%d (ret = %d)\n",
372 				 fd_data->pctype, fd_data->fd_id, ret);
373 			err = true;
374 		} else if (I40E_DEBUG_FD & pf->hw.debug_mask) {
375 			if (add)
376 				dev_info(&pf->pdev->dev,
377 					 "Filter OK for PCTYPE %d loc = %d\n",
378 					 fd_data->pctype, fd_data->fd_id);
379 			else
380 				dev_info(&pf->pdev->dev,
381 					 "Filter deleted for PCTYPE %d loc = %d\n",
382 					 fd_data->pctype, fd_data->fd_id);
383 		}
384 	}
385 
386 	if (err)
387 		kfree(raw_packet);
388 
389 	return err ? -EOPNOTSUPP : 0;
390 }
391 
392 /**
393  * i40e_add_del_fdir - Build raw packets to add/del fdir filter
394  * @vsi: pointer to the targeted VSI
 * @input: flow director filter data to add or delete
396  * @add: true adds a filter, false removes it
397  *
398  **/
399 int i40e_add_del_fdir(struct i40e_vsi *vsi,
400 		      struct i40e_fdir_filter *input, bool add)
401 {
402 	struct i40e_pf *pf = vsi->back;
403 	int ret;
404 
405 	switch (input->flow_type & ~FLOW_EXT) {
406 	case TCP_V4_FLOW:
407 		ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
408 		break;
409 	case UDP_V4_FLOW:
410 		ret = i40e_add_del_fdir_udpv4(vsi, input, add);
411 		break;
412 	case IP_USER_FLOW:
413 		switch (input->ip4_proto) {
414 		case IPPROTO_TCP:
415 			ret = i40e_add_del_fdir_tcpv4(vsi, input, add);
416 			break;
417 		case IPPROTO_UDP:
418 			ret = i40e_add_del_fdir_udpv4(vsi, input, add);
419 			break;
420 		case IPPROTO_IP:
421 			ret = i40e_add_del_fdir_ipv4(vsi, input, add);
422 			break;
423 		default:
424 			/* We cannot support masking based on protocol */
425 			goto unsupported_flow;
426 		}
427 		break;
428 	default:
429 unsupported_flow:
430 		dev_info(&pf->pdev->dev, "Could not specify spec type %d\n",
431 			 input->flow_type);
432 		ret = -EINVAL;
433 	}
434 
435 	/* The buffer allocated here is freed by the i40e_clean_tx_ring() */
436 	return ret;
437 }
438 
439 /**
440  * i40e_fd_handle_status - check the Programming Status for FD
441  * @rx_ring: the Rx ring for this descriptor
 * @rx_desc: the Rx descriptor for the programming status, not a packet descriptor
443  * @prog_id: the id originally used for programming
444  *
445  * This is used to verify if the FD programming or invalidation
446  * requested by SW to the HW is successful or not and take actions accordingly.
447  **/
448 static void i40e_fd_handle_status(struct i40e_ring *rx_ring,
449 				  union i40e_rx_desc *rx_desc, u8 prog_id)
450 {
451 	struct i40e_pf *pf = rx_ring->vsi->back;
452 	struct pci_dev *pdev = pf->pdev;
453 	u32 fcnt_prog, fcnt_avail;
454 	u32 error;
455 	u64 qw;
456 
457 	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
458 	error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
459 		I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
460 
461 	if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
462 		pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
463 		if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
464 		    (I40E_DEBUG_FD & pf->hw.debug_mask))
465 			dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
466 				 pf->fd_inv);
467 
468 		/* Check if the programming error is for ATR.
469 		 * If so, auto disable ATR and set a state for
470 		 * flush in progress. Next time we come here if flush is in
471 		 * progress do nothing, once flush is complete the state will
472 		 * be cleared.
473 		 */
474 		if (test_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state))
475 			return;
476 
477 		pf->fd_add_err++;
478 		/* store the current atr filter count */
479 		pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);
480 
481 		if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
482 		    (pf->auto_disable_flags & I40E_FLAG_FD_SB_ENABLED)) {
483 			pf->auto_disable_flags |= I40E_FLAG_FD_ATR_ENABLED;
484 			set_bit(__I40E_FD_FLUSH_REQUESTED, &pf->state);
485 		}
486 
487 		/* filter programming failed most likely due to table full */
488 		fcnt_prog = i40e_get_global_fd_count(pf);
489 		fcnt_avail = pf->fdir_pf_filter_count;
490 		/* If ATR is running fcnt_prog can quickly change,
491 		 * if we are very close to full, it makes sense to disable
492 		 * FD ATR/SB and then re-enable it when there is room.
493 		 */
494 		if (fcnt_prog >= (fcnt_avail - I40E_FDIR_BUFFER_FULL_MARGIN)) {
495 			if ((pf->flags & I40E_FLAG_FD_SB_ENABLED) &&
496 			    !(pf->auto_disable_flags &
497 				     I40E_FLAG_FD_SB_ENABLED)) {
498 				if (I40E_DEBUG_FD & pf->hw.debug_mask)
499 					dev_warn(&pdev->dev, "FD filter space full, new ntuple rules will not be added\n");
500 				pf->auto_disable_flags |=
501 							I40E_FLAG_FD_SB_ENABLED;
502 			}
503 		}
504 	} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
505 		if (I40E_DEBUG_FD & pf->hw.debug_mask)
506 			dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
507 				 rx_desc->wb.qword0.hi_dword.fd_id);
508 	}
509 }
510 
511 /**
512  * i40e_unmap_and_free_tx_resource - Release a Tx buffer
513  * @ring:      the ring that owns the buffer
514  * @tx_buffer: the buffer to free
515  **/
516 static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
517 					    struct i40e_tx_buffer *tx_buffer)
518 {
519 	if (tx_buffer->skb) {
520 		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
521 			kfree(tx_buffer->raw_buf);
522 		else
523 			dev_kfree_skb_any(tx_buffer->skb);
524 		if (dma_unmap_len(tx_buffer, len))
525 			dma_unmap_single(ring->dev,
526 					 dma_unmap_addr(tx_buffer, dma),
527 					 dma_unmap_len(tx_buffer, len),
528 					 DMA_TO_DEVICE);
529 	} else if (dma_unmap_len(tx_buffer, len)) {
530 		dma_unmap_page(ring->dev,
531 			       dma_unmap_addr(tx_buffer, dma),
532 			       dma_unmap_len(tx_buffer, len),
533 			       DMA_TO_DEVICE);
534 	}
535 
536 	tx_buffer->next_to_watch = NULL;
537 	tx_buffer->skb = NULL;
538 	dma_unmap_len_set(tx_buffer, len, 0);
539 	/* tx_buffer must be completely set up in the transmit path */
540 }
541 
542 /**
543  * i40e_clean_tx_ring - Free any empty Tx buffers
544  * @tx_ring: ring to be cleaned
545  **/
546 void i40e_clean_tx_ring(struct i40e_ring *tx_ring)
547 {
548 	unsigned long bi_size;
549 	u16 i;
550 
551 	/* ring already cleared, nothing to do */
552 	if (!tx_ring->tx_bi)
553 		return;
554 
555 	/* Free all the Tx ring sk_buffs */
556 	for (i = 0; i < tx_ring->count; i++)
557 		i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
558 
559 	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
560 	memset(tx_ring->tx_bi, 0, bi_size);
561 
562 	/* Zero out the descriptor ring */
563 	memset(tx_ring->desc, 0, tx_ring->size);
564 
565 	tx_ring->next_to_use = 0;
566 	tx_ring->next_to_clean = 0;
567 
568 	if (!tx_ring->netdev)
569 		return;
570 
571 	/* cleanup Tx queue statistics */
572 	netdev_tx_reset_queue(txring_txq(tx_ring));
573 }
574 
575 /**
576  * i40e_free_tx_resources - Free Tx resources per queue
577  * @tx_ring: Tx descriptor ring for a specific queue
578  *
579  * Free all transmit software resources
580  **/
581 void i40e_free_tx_resources(struct i40e_ring *tx_ring)
582 {
583 	i40e_clean_tx_ring(tx_ring);
584 	kfree(tx_ring->tx_bi);
585 	tx_ring->tx_bi = NULL;
586 
587 	if (tx_ring->desc) {
588 		dma_free_coherent(tx_ring->dev, tx_ring->size,
589 				  tx_ring->desc, tx_ring->dma);
590 		tx_ring->desc = NULL;
591 	}
592 }
593 
594 /**
595  * i40e_get_tx_pending - how many tx descriptors not processed
 * @ring: the ring of descriptors
597  * @in_sw: is tx_pending being checked in SW or HW
598  *
599  * Since there is no access to the ring head register
600  * in XL710, we need to use our local copies
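 *
 * For example, on a 512-descriptor ring with head 500 and tail 10 the
 * pending count wraps around to 10 + 512 - 500 = 22 descriptors.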
601  **/
602 u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw)
603 {
604 	u32 head, tail;
605 
606 	if (!in_sw)
607 		head = i40e_get_head(ring);
608 	else
609 		head = ring->next_to_clean;
610 	tail = readl(ring->tail);
611 
612 	if (head != tail)
613 		return (head < tail) ?
614 			tail - head : (tail + ring->count - head);
615 
616 	return 0;
617 }
618 
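/* arm a writeback when fewer than WB_STRIDE descriptors remain pending */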
619 #define WB_STRIDE 4
620 
621 /**
622  * i40e_clean_tx_irq - Reclaim resources after transmit completes
623  * @vsi: the VSI we care about
624  * @tx_ring: Tx ring to clean
625  * @napi_budget: Used to determine if we are in netpoll
626  *
 * Returns true if there's any budget left (i.e. the clean is finished)
628  **/
629 static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
630 			      struct i40e_ring *tx_ring, int napi_budget)
631 {
632 	u16 i = tx_ring->next_to_clean;
633 	struct i40e_tx_buffer *tx_buf;
634 	struct i40e_tx_desc *tx_head;
635 	struct i40e_tx_desc *tx_desc;
636 	unsigned int total_bytes = 0, total_packets = 0;
637 	unsigned int budget = vsi->work_limit;
638 
639 	tx_buf = &tx_ring->tx_bi[i];
640 	tx_desc = I40E_TX_DESC(tx_ring, i);
641 	i -= tx_ring->count;
642 
643 	tx_head = I40E_TX_DESC(tx_ring, i40e_get_head(tx_ring));
644 
645 	do {
646 		struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch;
647 
648 		/* if next_to_watch is not set then there is no work pending */
649 		if (!eop_desc)
650 			break;
651 
652 		/* prevent any other reads prior to eop_desc */
653 		read_barrier_depends();
654 
655 		/* we have caught up to head, no work left to do */
656 		if (tx_head == tx_desc)
657 			break;
658 
659 		/* clear next_to_watch to prevent false hangs */
660 		tx_buf->next_to_watch = NULL;
661 
662 		/* update the statistics for this packet */
663 		total_bytes += tx_buf->bytecount;
664 		total_packets += tx_buf->gso_segs;
665 
666 		/* free the skb */
667 		napi_consume_skb(tx_buf->skb, napi_budget);
668 
669 		/* unmap skb header data */
670 		dma_unmap_single(tx_ring->dev,
671 				 dma_unmap_addr(tx_buf, dma),
672 				 dma_unmap_len(tx_buf, len),
673 				 DMA_TO_DEVICE);
674 
675 		/* clear tx_buffer data */
676 		tx_buf->skb = NULL;
677 		dma_unmap_len_set(tx_buf, len, 0);
678 
679 		/* unmap remaining buffers */
680 		while (tx_desc != eop_desc) {
681 
682 			tx_buf++;
683 			tx_desc++;
684 			i++;
685 			if (unlikely(!i)) {
686 				i -= tx_ring->count;
687 				tx_buf = tx_ring->tx_bi;
688 				tx_desc = I40E_TX_DESC(tx_ring, 0);
689 			}
690 
691 			/* unmap any remaining paged data */
692 			if (dma_unmap_len(tx_buf, len)) {
693 				dma_unmap_page(tx_ring->dev,
694 					       dma_unmap_addr(tx_buf, dma),
695 					       dma_unmap_len(tx_buf, len),
696 					       DMA_TO_DEVICE);
697 				dma_unmap_len_set(tx_buf, len, 0);
698 			}
699 		}
700 
701 		/* move us one more past the eop_desc for start of next pkt */
702 		tx_buf++;
703 		tx_desc++;
704 		i++;
705 		if (unlikely(!i)) {
706 			i -= tx_ring->count;
707 			tx_buf = tx_ring->tx_bi;
708 			tx_desc = I40E_TX_DESC(tx_ring, 0);
709 		}
710 
711 		prefetch(tx_desc);
712 
713 		/* update budget accounting */
714 		budget--;
715 	} while (likely(budget));
716 
717 	i += tx_ring->count;
718 	tx_ring->next_to_clean = i;
719 	u64_stats_update_begin(&tx_ring->syncp);
720 	tx_ring->stats.bytes += total_bytes;
721 	tx_ring->stats.packets += total_packets;
722 	u64_stats_update_end(&tx_ring->syncp);
723 	tx_ring->q_vector->tx.total_bytes += total_bytes;
724 	tx_ring->q_vector->tx.total_packets += total_packets;
725 
726 	if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) {
		/* check to see if there are < WB_STRIDE descriptors
728 		 * waiting to be written back, then kick the hardware to force
729 		 * them to be written back in case we stay in NAPI.
730 		 * In this mode on X722 we do not enable Interrupt.
731 		 */
732 		unsigned int j = i40e_get_tx_pending(tx_ring, false);
733 
734 		if (budget &&
735 		    ((j / WB_STRIDE) == 0) && (j > 0) &&
736 		    !test_bit(__I40E_DOWN, &vsi->state) &&
737 		    (I40E_DESC_UNUSED(tx_ring) != tx_ring->count))
738 			tx_ring->arm_wb = true;
739 	}
740 
741 	/* notify netdev of completed buffers */
742 	netdev_tx_completed_queue(txring_txq(tx_ring),
743 				  total_packets, total_bytes);
744 
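	/* only wake the queue once 2 * DESC_NEEDED descriptors are free */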
745 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
746 	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
747 		     (I40E_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
748 		/* Make sure that anybody stopping the queue after this
749 		 * sees the new next_to_clean.
750 		 */
751 		smp_mb();
752 		if (__netif_subqueue_stopped(tx_ring->netdev,
753 					     tx_ring->queue_index) &&
754 		   !test_bit(__I40E_DOWN, &vsi->state)) {
755 			netif_wake_subqueue(tx_ring->netdev,
756 					    tx_ring->queue_index);
757 			++tx_ring->tx_stats.restart_queue;
758 		}
759 	}
760 
761 	return !!budget;
762 }
763 
764 /**
765  * i40e_enable_wb_on_itr - Arm hardware to do a wb, interrupts are not enabled
766  * @vsi: the VSI we care about
767  * @q_vector: the vector on which to enable writeback
768  *
769  **/
770 static void i40e_enable_wb_on_itr(struct i40e_vsi *vsi,
771 				  struct i40e_q_vector *q_vector)
772 {
773 	u16 flags = q_vector->tx.ring[0].flags;
774 	u32 val;
775 
776 	if (!(flags & I40E_TXR_FLAGS_WB_ON_ITR))
777 		return;
778 
779 	if (q_vector->arm_wb_state)
780 		return;
781 
782 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
783 		val = I40E_PFINT_DYN_CTLN_WB_ON_ITR_MASK |
784 		      I40E_PFINT_DYN_CTLN_ITR_INDX_MASK; /* set noitr */
785 
786 		wr32(&vsi->back->hw,
787 		     I40E_PFINT_DYN_CTLN(q_vector->v_idx + vsi->base_vector - 1),
788 		     val);
789 	} else {
790 		val = I40E_PFINT_DYN_CTL0_WB_ON_ITR_MASK |
791 		      I40E_PFINT_DYN_CTL0_ITR_INDX_MASK; /* set noitr */
792 
793 		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
794 	}
795 	q_vector->arm_wb_state = true;
796 }
797 
798 /**
799  * i40e_force_wb - Issue SW Interrupt so HW does a wb
800  * @vsi: the VSI we care about
 * @q_vector: the vector on which to force writeback
802  *
803  **/
804 void i40e_force_wb(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector)
805 {
806 	if (vsi->back->flags & I40E_FLAG_MSIX_ENABLED) {
807 		u32 val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
808 			  I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */
809 			  I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK |
810 			  I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK;
811 			  /* allow 00 to be written to the index */
812 
813 		wr32(&vsi->back->hw,
814 		     I40E_PFINT_DYN_CTLN(q_vector->v_idx +
815 					 vsi->base_vector - 1), val);
816 	} else {
817 		u32 val = I40E_PFINT_DYN_CTL0_INTENA_MASK |
818 			  I40E_PFINT_DYN_CTL0_ITR_INDX_MASK | /* set noitr */
819 			  I40E_PFINT_DYN_CTL0_SWINT_TRIG_MASK |
820 			  I40E_PFINT_DYN_CTL0_SW_ITR_INDX_ENA_MASK;
821 			/* allow 00 to be written to the index */
822 
823 		wr32(&vsi->back->hw, I40E_PFINT_DYN_CTL0, val);
824 	}
825 }
826 
827 /**
828  * i40e_set_new_dynamic_itr - Find new ITR level
829  * @rc: structure containing ring performance data
830  *
831  * Returns true if ITR changed, false if not
832  *
833  * Stores a new ITR value based on packets and byte counts during
834  * the last interrupt.  The advantage of per interrupt computation
835  * is faster updates and more accurate ITR for the current traffic
836  * pattern.  Constants in this function were computed based on
837  * theoretical maximum wire speed and thresholds were set based on
838  * testing data as well as attempting to minimize response time
839  * while increasing bulk throughput.
840  **/
841 static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
842 {
843 	enum i40e_latency_range new_latency_range = rc->latency_range;
844 	struct i40e_q_vector *qv = rc->ring->q_vector;
845 	u32 new_itr = rc->itr;
846 	int bytes_per_int;
847 	int usecs;
848 
849 	if (rc->total_packets == 0 || !rc->itr)
850 		return false;
851 
852 	/* simple throttlerate management
853 	 *   0-10MB/s   lowest (50000 ints/s)
854 	 *  10-20MB/s   low    (20000 ints/s)
855 	 *  20-1249MB/s bulk   (18000 ints/s)
856 	 *  > 40000 Rx packets per second (8000 ints/s)
857 	 *
858 	 * The math works out because the divisor is in 10^(-6) which
859 	 * turns the bytes/us input value into MB/s values, but
860 	 * make sure to use usecs, as the register values written
861 	 * are in 2 usec increments in the ITR registers, and make sure
862 	 * to use the smoothed values that the countdown timer gives us.
863 	 */
864 	usecs = (rc->itr << 1) * ITR_COUNTDOWN_START;
865 	bytes_per_int = rc->total_bytes / usecs;
866 
867 	switch (new_latency_range) {
868 	case I40E_LOWEST_LATENCY:
869 		if (bytes_per_int > 10)
870 			new_latency_range = I40E_LOW_LATENCY;
871 		break;
872 	case I40E_LOW_LATENCY:
873 		if (bytes_per_int > 20)
874 			new_latency_range = I40E_BULK_LATENCY;
875 		else if (bytes_per_int <= 10)
876 			new_latency_range = I40E_LOWEST_LATENCY;
877 		break;
878 	case I40E_BULK_LATENCY:
879 	case I40E_ULTRA_LATENCY:
880 	default:
881 		if (bytes_per_int <= 20)
882 			new_latency_range = I40E_LOW_LATENCY;
883 		break;
884 	}
885 
886 	/* this is to adjust RX more aggressively when streaming small
887 	 * packets.  The value of 40000 was picked as it is just beyond
888 	 * what the hardware can receive per second if in low latency
889 	 * mode.
890 	 */
891 #define RX_ULTRA_PACKET_RATE 40000
892 
893 	if ((((rc->total_packets * 1000000) / usecs) > RX_ULTRA_PACKET_RATE) &&
894 	    (&qv->rx == rc))
895 		new_latency_range = I40E_ULTRA_LATENCY;
896 
897 	rc->latency_range = new_latency_range;
898 
899 	switch (new_latency_range) {
900 	case I40E_LOWEST_LATENCY:
901 		new_itr = I40E_ITR_50K;
902 		break;
903 	case I40E_LOW_LATENCY:
904 		new_itr = I40E_ITR_20K;
905 		break;
906 	case I40E_BULK_LATENCY:
907 		new_itr = I40E_ITR_18K;
908 		break;
909 	case I40E_ULTRA_LATENCY:
910 		new_itr = I40E_ITR_8K;
911 		break;
912 	default:
913 		break;
914 	}
915 
916 	rc->total_bytes = 0;
917 	rc->total_packets = 0;
918 
919 	if (new_itr != rc->itr) {
920 		rc->itr = new_itr;
921 		return true;
922 	}
923 
924 	return false;
925 }
926 
927 /**
928  * i40e_clean_programming_status - clean the programming status descriptor
929  * @rx_ring: the rx ring that has this descriptor
930  * @rx_desc: the rx descriptor written back by HW
931  *
932  * Flow director should handle FD_FILTER_STATUS to check its filter programming
933  * status being successful or not and take actions accordingly. FCoE should
934  * handle its context/filter programming/invalidation status and take actions.
935  *
936  **/
937 static void i40e_clean_programming_status(struct i40e_ring *rx_ring,
938 					  union i40e_rx_desc *rx_desc)
939 {
940 	u64 qw;
941 	u8 id;
942 
943 	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
944 	id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
945 		  I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
946 
947 	if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
948 		i40e_fd_handle_status(rx_ring, rx_desc, id);
949 #ifdef I40E_FCOE
950 	else if ((id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_PROG_STATUS) ||
951 		 (id == I40E_RX_PROG_STATUS_DESC_FCOE_CTXT_INVL_STATUS))
952 		i40e_fcoe_handle_status(rx_ring, rx_desc, id);
953 #endif
954 }
955 
956 /**
957  * i40e_setup_tx_descriptors - Allocate the Tx descriptors
958  * @tx_ring: the tx ring to set up
959  *
960  * Return 0 on success, negative on error
961  **/
962 int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
963 {
964 	struct device *dev = tx_ring->dev;
965 	int bi_size;
966 
967 	if (!dev)
968 		return -ENOMEM;
969 
970 	/* warn if we are about to overwrite the pointer */
971 	WARN_ON(tx_ring->tx_bi);
972 	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
973 	tx_ring->tx_bi = kzalloc(bi_size, GFP_KERNEL);
974 	if (!tx_ring->tx_bi)
975 		goto err;
976 
977 	/* round up to nearest 4K */
978 	tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc);
979 	/* add u32 for head writeback, align after this takes care of
980 	 * guaranteeing this is at least one cache line in size
981 	 */
982 	tx_ring->size += sizeof(u32);
983 	tx_ring->size = ALIGN(tx_ring->size, 4096);
984 	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
985 					   &tx_ring->dma, GFP_KERNEL);
986 	if (!tx_ring->desc) {
987 		dev_info(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
988 			 tx_ring->size);
989 		goto err;
990 	}
991 
992 	tx_ring->next_to_use = 0;
993 	tx_ring->next_to_clean = 0;
994 	return 0;
995 
996 err:
997 	kfree(tx_ring->tx_bi);
998 	tx_ring->tx_bi = NULL;
999 	return -ENOMEM;
1000 }
1001 
1002 /**
1003  * i40e_clean_rx_ring - Free Rx buffers
1004  * @rx_ring: ring to be cleaned
1005  **/
1006 void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
1007 {
1008 	struct device *dev = rx_ring->dev;
1009 	unsigned long bi_size;
1010 	u16 i;
1011 
1012 	/* ring already cleared, nothing to do */
1013 	if (!rx_ring->rx_bi)
1014 		return;
1015 
1016 	/* Free all the Rx ring sk_buffs */
1017 	for (i = 0; i < rx_ring->count; i++) {
1018 		struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
1019 
1020 		if (rx_bi->skb) {
1021 			dev_kfree_skb(rx_bi->skb);
1022 			rx_bi->skb = NULL;
1023 		}
1024 		if (!rx_bi->page)
1025 			continue;
1026 
1027 		dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
1028 		__free_pages(rx_bi->page, 0);
1029 
1030 		rx_bi->page = NULL;
1031 		rx_bi->page_offset = 0;
1032 	}
1033 
1034 	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
1035 	memset(rx_ring->rx_bi, 0, bi_size);
1036 
1037 	/* Zero out the descriptor ring */
1038 	memset(rx_ring->desc, 0, rx_ring->size);
1039 
1040 	rx_ring->next_to_alloc = 0;
1041 	rx_ring->next_to_clean = 0;
1042 	rx_ring->next_to_use = 0;
1043 }
1044 
1045 /**
1046  * i40e_free_rx_resources - Free Rx resources
1047  * @rx_ring: ring to clean the resources from
1048  *
1049  * Free all receive software resources
1050  **/
1051 void i40e_free_rx_resources(struct i40e_ring *rx_ring)
1052 {
1053 	i40e_clean_rx_ring(rx_ring);
1054 	kfree(rx_ring->rx_bi);
1055 	rx_ring->rx_bi = NULL;
1056 
1057 	if (rx_ring->desc) {
1058 		dma_free_coherent(rx_ring->dev, rx_ring->size,
1059 				  rx_ring->desc, rx_ring->dma);
1060 		rx_ring->desc = NULL;
1061 	}
1062 }
1063 
1064 /**
1065  * i40e_setup_rx_descriptors - Allocate Rx descriptors
1066  * @rx_ring: Rx descriptor ring (for a specific queue) to setup
1067  *
1068  * Returns 0 on success, negative on failure
1069  **/
1070 int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
1071 {
1072 	struct device *dev = rx_ring->dev;
1073 	int bi_size;
1074 
1075 	/* warn if we are about to overwrite the pointer */
1076 	WARN_ON(rx_ring->rx_bi);
1077 	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
1078 	rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
1079 	if (!rx_ring->rx_bi)
1080 		goto err;
1081 
1082 	u64_stats_init(&rx_ring->syncp);
1083 
1084 	/* Round up to nearest 4K */
1085 	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
1086 	rx_ring->size = ALIGN(rx_ring->size, 4096);
1087 	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
1088 					   &rx_ring->dma, GFP_KERNEL);
1089 
1090 	if (!rx_ring->desc) {
1091 		dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
1092 			 rx_ring->size);
1093 		goto err;
1094 	}
1095 
1096 	rx_ring->next_to_alloc = 0;
1097 	rx_ring->next_to_clean = 0;
1098 	rx_ring->next_to_use = 0;
1099 
1100 	return 0;
1101 err:
1102 	kfree(rx_ring->rx_bi);
1103 	rx_ring->rx_bi = NULL;
1104 	return -ENOMEM;
1105 }
1106 
1107 /**
1108  * i40e_release_rx_desc - Store the new tail and head values
1109  * @rx_ring: ring to bump
 * @val: new next_to_use value, also written to the hardware tail
1111  **/
1112 static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
1113 {
1114 	rx_ring->next_to_use = val;
1115 
1116 	/* update next to alloc since we have filled the ring */
1117 	rx_ring->next_to_alloc = val;
1118 
1119 	/* Force memory writes to complete before letting h/w
1120 	 * know there are new descriptors to fetch.  (Only
1121 	 * applicable for weak-ordered memory model archs,
1122 	 * such as IA-64).
1123 	 */
1124 	wmb();
1125 	writel(val, rx_ring->tail);
1126 }
1127 
1128 /**
1129  * i40e_alloc_mapped_page - recycle or make a new page
1130  * @rx_ring: ring to use
1131  * @bi: rx_buffer struct to modify
1132  *
1133  * Returns true if the page was successfully allocated or
1134  * reused.
1135  **/
1136 static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
1137 				   struct i40e_rx_buffer *bi)
1138 {
1139 	struct page *page = bi->page;
1140 	dma_addr_t dma;
1141 
1142 	/* since we are recycling buffers we should seldom need to alloc */
1143 	if (likely(page)) {
1144 		rx_ring->rx_stats.page_reuse_count++;
1145 		return true;
1146 	}
1147 
1148 	/* alloc new page for storage */
1149 	page = dev_alloc_page();
1150 	if (unlikely(!page)) {
1151 		rx_ring->rx_stats.alloc_page_failed++;
1152 		return false;
1153 	}
1154 
1155 	/* map page for use */
1156 	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
1157 
1158 	/* if mapping failed free memory back to system since
1159 	 * there isn't much point in holding memory we can't use
1160 	 */
1161 	if (dma_mapping_error(rx_ring->dev, dma)) {
1162 		__free_pages(page, 0);
1163 		rx_ring->rx_stats.alloc_page_failed++;
1164 		return false;
1165 	}
1166 
1167 	bi->dma = dma;
1168 	bi->page = page;
1169 	bi->page_offset = 0;
1170 
1171 	return true;
1172 }
1173 
1174 /**
1175  * i40e_receive_skb - Send a completed packet up the stack
1176  * @rx_ring:  rx ring in play
1177  * @skb: packet to send up
1178  * @vlan_tag: vlan tag for packet
1179  **/
1180 static void i40e_receive_skb(struct i40e_ring *rx_ring,
1181 			     struct sk_buff *skb, u16 vlan_tag)
1182 {
1183 	struct i40e_q_vector *q_vector = rx_ring->q_vector;
1184 
1185 	if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
1186 	    (vlan_tag & VLAN_VID_MASK))
1187 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
1188 
1189 	napi_gro_receive(&q_vector->napi, skb);
1190 }
1191 
1192 /**
1193  * i40e_alloc_rx_buffers - Replace used receive buffers
1194  * @rx_ring: ring to place buffers on
1195  * @cleaned_count: number of buffers to replace
1196  *
1197  * Returns false if all allocations were successful, true if any fail
1198  **/
1199 bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
1200 {
1201 	u16 ntu = rx_ring->next_to_use;
1202 	union i40e_rx_desc *rx_desc;
1203 	struct i40e_rx_buffer *bi;
1204 
1205 	/* do nothing if no valid netdev defined */
1206 	if (!rx_ring->netdev || !cleaned_count)
1207 		return false;
1208 
1209 	rx_desc = I40E_RX_DESC(rx_ring, ntu);
1210 	bi = &rx_ring->rx_bi[ntu];
1211 
1212 	do {
1213 		if (!i40e_alloc_mapped_page(rx_ring, bi))
1214 			goto no_buffers;
1215 
1216 		/* Refresh the desc even if buffer_addrs didn't change
1217 		 * because each write-back erases this info.
1218 		 */
1219 		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
1220 
1221 		rx_desc++;
1222 		bi++;
1223 		ntu++;
1224 		if (unlikely(ntu == rx_ring->count)) {
1225 			rx_desc = I40E_RX_DESC(rx_ring, 0);
1226 			bi = rx_ring->rx_bi;
1227 			ntu = 0;
1228 		}
1229 
1230 		/* clear the status bits for the next_to_use descriptor */
1231 		rx_desc->wb.qword1.status_error_len = 0;
1232 
1233 		cleaned_count--;
1234 	} while (cleaned_count);
1235 
1236 	if (rx_ring->next_to_use != ntu)
1237 		i40e_release_rx_desc(rx_ring, ntu);
1238 
1239 	return false;
1240 
1241 no_buffers:
1242 	if (rx_ring->next_to_use != ntu)
1243 		i40e_release_rx_desc(rx_ring, ntu);
1244 
1245 	/* make sure to come back via polling to try again after
1246 	 * allocation failure
1247 	 */
1248 	return true;
1249 }
1250 
1251 /**
1252  * i40e_rx_checksum - Indicate in skb if hw indicated a good cksum
1253  * @vsi: the VSI we care about
1254  * @skb: skb currently being received and modified
1255  * @rx_desc: the receive descriptor
1256  *
1257  * skb->protocol must be set before this function is called
1258  **/
1259 static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
1260 				    struct sk_buff *skb,
1261 				    union i40e_rx_desc *rx_desc)
1262 {
1263 	struct i40e_rx_ptype_decoded decoded;
1264 	u32 rx_error, rx_status;
1265 	bool ipv4, ipv6;
1266 	u8 ptype;
1267 	u64 qword;
1268 
1269 	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1270 	ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT;
1271 	rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >>
1272 		   I40E_RXD_QW1_ERROR_SHIFT;
1273 	rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
1274 		    I40E_RXD_QW1_STATUS_SHIFT;
1275 	decoded = decode_rx_desc_ptype(ptype);
1276 
1277 	skb->ip_summed = CHECKSUM_NONE;
1278 
1279 	skb_checksum_none_assert(skb);
1280 
1281 	/* Rx csum enabled and ip headers found? */
1282 	if (!(vsi->netdev->features & NETIF_F_RXCSUM))
1283 		return;
1284 
1285 	/* did the hardware decode the packet and checksum? */
1286 	if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT)))
1287 		return;
1288 
1289 	/* both known and outer_ip must be set for the below code to work */
1290 	if (!(decoded.known && decoded.outer_ip))
1291 		return;
1292 
1293 	ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
1294 	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4);
1295 	ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
1296 	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6);
1297 
1298 	if (ipv4 &&
1299 	    (rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) |
1300 			 BIT(I40E_RX_DESC_ERROR_EIPE_SHIFT))))
1301 		goto checksum_fail;
1302 
1303 	/* likely incorrect csum if alternate IP extension headers found */
1304 	if (ipv6 &&
1305 	    rx_status & BIT(I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))
1306 		/* don't increment checksum err here, non-fatal err */
1307 		return;
1308 
1309 	/* there was some L4 error, count error and punt packet to the stack */
1310 	if (rx_error & BIT(I40E_RX_DESC_ERROR_L4E_SHIFT))
1311 		goto checksum_fail;
1312 
1313 	/* handle packets that were not able to be checksummed due
1314 	 * to arrival speed, in this case the stack can compute
1315 	 * the csum.
1316 	 */
1317 	if (rx_error & BIT(I40E_RX_DESC_ERROR_PPRS_SHIFT))
1318 		return;
1319 
1320 	/* If there is an outer header present that might contain a checksum
1321 	 * we need to bump the checksum level by 1 to reflect the fact that
1322 	 * we are indicating we validated the inner checksum.
1323 	 */
1324 	if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT)
1325 		skb->csum_level = 1;
1326 
1327 	/* Only report checksum unnecessary for TCP, UDP, or SCTP */
1328 	switch (decoded.inner_prot) {
1329 	case I40E_RX_PTYPE_INNER_PROT_TCP:
1330 	case I40E_RX_PTYPE_INNER_PROT_UDP:
1331 	case I40E_RX_PTYPE_INNER_PROT_SCTP:
1332 		skb->ip_summed = CHECKSUM_UNNECESSARY;
		/* fall through */
1334 	default:
1335 		break;
1336 	}
1337 
1338 	return;
1339 
1340 checksum_fail:
1341 	vsi->back->hw_csum_rx_error++;
1342 }
1343 
1344 /**
1345  * i40e_ptype_to_htype - get a hash type
1346  * @ptype: the ptype value from the descriptor
1347  *
1348  * Returns a hash type to be used by skb_set_hash
1349  **/
1350 static inline int i40e_ptype_to_htype(u8 ptype)
1351 {
1352 	struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);
1353 
1354 	if (!decoded.known)
1355 		return PKT_HASH_TYPE_NONE;
1356 
1357 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1358 	    decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4)
1359 		return PKT_HASH_TYPE_L4;
1360 	else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1361 		 decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3)
1362 		return PKT_HASH_TYPE_L3;
1363 	else
1364 		return PKT_HASH_TYPE_L2;
1365 }
1366 
1367 /**
1368  * i40e_rx_hash - set the hash value in the skb
1369  * @ring: descriptor ring
 * @rx_desc: specific descriptor
 * @skb: skb currently being populated
 * @rx_ptype: the packet type decoded by hardware
1371  **/
1372 static inline void i40e_rx_hash(struct i40e_ring *ring,
1373 				union i40e_rx_desc *rx_desc,
1374 				struct sk_buff *skb,
1375 				u8 rx_ptype)
1376 {
1377 	u32 hash;
1378 	const __le64 rss_mask =
1379 		cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH <<
1380 			    I40E_RX_DESC_STATUS_FLTSTAT_SHIFT);
1381 
1382 	if (!(ring->netdev->features & NETIF_F_RXHASH))
1383 		return;
1384 
1385 	if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
1386 		hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
1387 		skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype));
1388 	}
1389 }
1390 
1391 /**
1392  * i40e_process_skb_fields - Populate skb header fields from Rx descriptor
1393  * @rx_ring: rx descriptor ring packet is being transacted on
1394  * @rx_desc: pointer to the EOP Rx descriptor
1395  * @skb: pointer to current skb being populated
1396  * @rx_ptype: the packet type decoded by hardware
1397  *
1398  * This function checks the ring, descriptor, and packet information in
1399  * order to populate the hash, checksum, VLAN, protocol, and
1400  * other fields within the skb.
1401  **/
1402 static inline
1403 void i40e_process_skb_fields(struct i40e_ring *rx_ring,
1404 			     union i40e_rx_desc *rx_desc, struct sk_buff *skb,
1405 			     u8 rx_ptype)
1406 {
1407 	u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1408 	u32 rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
1409 			I40E_RXD_QW1_STATUS_SHIFT;
1410 	u32 tsynvalid = rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK;
1411 	u32 tsyn = (rx_status & I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >>
1412 		   I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT;
1413 
1414 	if (unlikely(tsynvalid))
1415 		i40e_ptp_rx_hwtstamp(rx_ring->vsi->back, skb, tsyn);
1416 
1417 	i40e_rx_hash(rx_ring, rx_desc, skb, rx_ptype);
1418 
1419 	/* modifies the skb - consumes the enet header */
1420 	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
1421 
1422 	i40e_rx_checksum(rx_ring->vsi, skb, rx_desc);
1423 
1424 	skb_record_rx_queue(skb, rx_ring->queue_index);
1425 }
1426 
1427 /**
1428  * i40e_pull_tail - i40e specific version of skb_pull_tail
1429  * @rx_ring: rx descriptor ring packet is being transacted on
1430  * @skb: pointer to current skb being adjusted
1431  *
1432  * This function is an i40e specific version of __pskb_pull_tail.  The
1433  * main difference between this version and the original function is that
1434  * this function can make several assumptions about the state of things
1435  * that allow for significant optimizations versus the standard function.
1436  * As a result we can do things like drop a frag and maintain an accurate
1437  * truesize for the skb.
1438  */
1439 static void i40e_pull_tail(struct i40e_ring *rx_ring, struct sk_buff *skb)
1440 {
1441 	struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
1442 	unsigned char *va;
1443 	unsigned int pull_len;
1444 
1445 	/* it is valid to use page_address instead of kmap since we are
1446 	 * working with pages allocated out of the lomem pool per
1447 	 * alloc_page(GFP_ATOMIC)
1448 	 */
1449 	va = skb_frag_address(frag);
1450 
1451 	/* we need the header to contain the greater of either ETH_HLEN or
1452 	 * 60 bytes if the skb->len is less than 60 for skb_pad.
1453 	 */
1454 	pull_len = eth_get_headlen(va, I40E_RX_HDR_SIZE);
1455 
1456 	/* align pull length to size of long to optimize memcpy performance */
1457 	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
1458 
1459 	/* update all of the pointers */
1460 	skb_frag_size_sub(frag, pull_len);
1461 	frag->page_offset += pull_len;
1462 	skb->data_len -= pull_len;
1463 	skb->tail += pull_len;
1464 }
1465 
1466 /**
1467  * i40e_cleanup_headers - Correct empty headers
1468  * @rx_ring: rx descriptor ring packet is being transacted on
1469  * @skb: pointer to current skb being fixed
1470  *
1471  * Also address the case where we are pulling data in on pages only
1472  * and as such no data is present in the skb header.
1473  *
1474  * In addition if skb is not at least 60 bytes we need to pad it so that
1475  * it is large enough to qualify as a valid Ethernet frame.
1476  *
1477  * Returns true if an error was encountered and skb was freed.
1478  **/
1479 static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb)
1480 {
1481 	/* place header in linear portion of buffer */
1482 	if (skb_is_nonlinear(skb))
1483 		i40e_pull_tail(rx_ring, skb);
1484 
1485 	/* if eth_skb_pad returns an error the skb was freed */
1486 	if (eth_skb_pad(skb))
1487 		return true;
1488 
1489 	return false;
1490 }
1491 
1492 /**
1493  * i40e_reuse_rx_page - page flip buffer and store it back on the ring
1494  * @rx_ring: rx descriptor ring to store buffers on
1495  * @old_buff: donor buffer to have page reused
1496  *
1497  * Synchronizes page for reuse by the adapter
1498  **/
1499 static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
1500 			       struct i40e_rx_buffer *old_buff)
1501 {
1502 	struct i40e_rx_buffer *new_buff;
1503 	u16 nta = rx_ring->next_to_alloc;
1504 
1505 	new_buff = &rx_ring->rx_bi[nta];
1506 
1507 	/* update, and store next to alloc */
1508 	nta++;
1509 	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
1510 
1511 	/* transfer page from old buffer to new buffer */
1512 	*new_buff = *old_buff;
1513 }
1514 
1515 /**
1516  * i40e_page_is_reserved - check if reuse is possible
1517  * @page: page struct to check
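 *
 * A page cannot be reused if it was allocated on a remote NUMA node
 * or pulled from the pfmemalloc emergency reserves.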
1518  */
1519 static inline bool i40e_page_is_reserved(struct page *page)
1520 {
1521 	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
1522 }
1523 
1524 /**
1525  * i40e_add_rx_frag - Add contents of Rx buffer to sk_buff
1526  * @rx_ring: rx descriptor ring to transact packets on
1527  * @rx_buffer: buffer containing page to add
1528  * @rx_desc: descriptor containing length of buffer written by hardware
1529  * @skb: sk_buff to place the data into
1530  *
1531  * This function will add the data contained in rx_buffer->page to the skb.
1532  * This is done either through a direct copy if the data in the buffer is
1533  * less than the skb header size, otherwise it will just attach the page as
1534  * a frag to the skb.
1535  *
1536  * The function will then update the page offset if necessary and return
1537  * true if the buffer can be reused by the adapter.
1538  **/
1539 static bool i40e_add_rx_frag(struct i40e_ring *rx_ring,
1540 			     struct i40e_rx_buffer *rx_buffer,
1541 			     union i40e_rx_desc *rx_desc,
1542 			     struct sk_buff *skb)
1543 {
1544 	struct page *page = rx_buffer->page;
1545 	u64 qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1546 	unsigned int size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
1547 			    I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1548 #if (PAGE_SIZE < 8192)
1549 	unsigned int truesize = I40E_RXBUFFER_2048;
1550 #else
1551 	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
1552 	unsigned int last_offset = PAGE_SIZE - I40E_RXBUFFER_2048;
1553 #endif
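	/* with pages smaller than 8K the page is split into two 2K halves
	 * and the offset flips between them on reuse; larger pages advance
	 * the offset by the cache-aligned size of each received buffer
	 */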
1554 
1555 	/* will the data fit in the skb we allocated? if so, just
1556 	 * copy it as it is pretty small anyway
1557 	 */
1558 	if ((size <= I40E_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) {
1559 		unsigned char *va = page_address(page) + rx_buffer->page_offset;
1560 
1561 		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
1562 
1563 		/* page is not reserved, we can reuse buffer as-is */
1564 		if (likely(!i40e_page_is_reserved(page)))
1565 			return true;
1566 
1567 		/* this page cannot be reused so discard it */
1568 		__free_pages(page, 0);
1569 		return false;
1570 	}
1571 
1572 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
1573 			rx_buffer->page_offset, size, truesize);
1574 
1575 	/* avoid re-using remote pages */
1576 	if (unlikely(i40e_page_is_reserved(page)))
1577 		return false;
1578 
1579 #if (PAGE_SIZE < 8192)
1580 	/* if we are only owner of page we can reuse it */
1581 	if (unlikely(page_count(page) != 1))
1582 		return false;
1583 
1584 	/* flip page offset to other buffer */
1585 	rx_buffer->page_offset ^= truesize;
1586 #else
1587 	/* move offset up to the next cache line */
1588 	rx_buffer->page_offset += truesize;
1589 
1590 	if (rx_buffer->page_offset > last_offset)
1591 		return false;
1592 #endif
1593 
1594 	/* Even if we own the page, we are not allowed to use atomic_set()
1595 	 * This would break get_page_unless_zero() users.
1596 	 */
1597 	get_page(rx_buffer->page);
1598 
1599 	return true;
1600 }
1601 
1602 /**
1603  * i40e_fetch_rx_buffer - Allocate skb and populate it
1604  * @rx_ring: rx descriptor ring to transact packets on
1605  * @rx_desc: descriptor containing info written by hardware
1606  *
1607  * This function allocates an skb on the fly, and populates it with the page
1608  * data from the current receive descriptor, taking care to set up the skb
1609  * correctly, as well as handling calling the page recycle function if
1610  * necessary.
1611  */
1612 static inline
1613 struct sk_buff *i40e_fetch_rx_buffer(struct i40e_ring *rx_ring,
1614 				     union i40e_rx_desc *rx_desc)
1615 {
1616 	struct i40e_rx_buffer *rx_buffer;
1617 	struct sk_buff *skb;
1618 	struct page *page;
1619 
1620 	rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
1621 	page = rx_buffer->page;
1622 	prefetchw(page);
1623 
1624 	skb = rx_buffer->skb;
1625 
1626 	if (likely(!skb)) {
1627 		void *page_addr = page_address(page) + rx_buffer->page_offset;
1628 
1629 		/* prefetch first cache line of first page */
1630 		prefetch(page_addr);
1631 #if L1_CACHE_BYTES < 128
1632 		prefetch(page_addr + L1_CACHE_BYTES);
1633 #endif
1634 
1635 		/* allocate a skb to store the frags */
1636 		skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
1637 				       I40E_RX_HDR_SIZE,
1638 				       GFP_ATOMIC | __GFP_NOWARN);
1639 		if (unlikely(!skb)) {
1640 			rx_ring->rx_stats.alloc_buff_failed++;
1641 			return NULL;
1642 		}
1643 
1644 		/* we will be copying header into skb->data in
1645 		 * pskb_may_pull so it is in our interest to prefetch
1646 		 * it now to avoid a possible cache miss
1647 		 */
1648 		prefetchw(skb->data);
1649 	} else {
1650 		rx_buffer->skb = NULL;
1651 	}
1652 
1653 	/* we are reusing so sync this buffer for CPU use */
1654 	dma_sync_single_range_for_cpu(rx_ring->dev,
1655 				      rx_buffer->dma,
1656 				      rx_buffer->page_offset,
1657 				      I40E_RXBUFFER_2048,
1658 				      DMA_FROM_DEVICE);
1659 
1660 	/* pull page into skb */
1661 	if (i40e_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
1662 		/* hand second half of page back to the ring */
1663 		i40e_reuse_rx_page(rx_ring, rx_buffer);
1664 		rx_ring->rx_stats.page_reuse_count++;
1665 	} else {
1666 		/* we are not reusing the buffer so unmap it */
1667 		dma_unmap_page(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
1668 			       DMA_FROM_DEVICE);
1669 	}
1670 
1671 	/* clear contents of buffer_info */
1672 	rx_buffer->page = NULL;
1673 
1674 	return skb;
1675 }
1676 
1677 /**
1678  * i40e_is_non_eop - process handling of non-EOP buffers
1679  * @rx_ring: Rx ring being processed
1680  * @rx_desc: Rx descriptor for current buffer
1681  * @skb: Current socket buffer containing buffer in progress
1682  *
1683  * This function updates next to clean.  If the buffer is an EOP buffer
1684  * this function exits returning false, otherwise it will place the
1685  * sk_buff in the next buffer to be chained and return true indicating
1686  * that this is in fact a non-EOP buffer.
1687  **/
1688 static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
1689 			    union i40e_rx_desc *rx_desc,
1690 			    struct sk_buff *skb)
1691 {
1692 	u32 ntc = rx_ring->next_to_clean + 1;
1693 
1694 	/* fetch, update, and store next to clean */
1695 	ntc = (ntc < rx_ring->count) ? ntc : 0;
1696 	rx_ring->next_to_clean = ntc;
1697 
1698 	prefetch(I40E_RX_DESC(rx_ring, ntc));
1699 
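/* shorthand for the descriptor's status/error/length quadword */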
1700 #define staterrlen rx_desc->wb.qword1.status_error_len
1701 	if (unlikely(i40e_rx_is_programming_status(le64_to_cpu(staterrlen)))) {
1702 		i40e_clean_programming_status(rx_ring, rx_desc);
1703 		rx_ring->rx_bi[ntc].skb = skb;
1704 		return true;
1705 	}
1706 	/* if we are the last buffer then there is nothing else to do */
1707 #define I40E_RXD_EOF BIT(I40E_RX_DESC_STATUS_EOF_SHIFT)
1708 	if (likely(i40e_test_staterr(rx_desc, I40E_RXD_EOF)))
1709 		return false;
1710 
1711 	/* place skb in next buffer to be received */
1712 	rx_ring->rx_bi[ntc].skb = skb;
1713 	rx_ring->rx_stats.non_eop_descs++;
1714 
1715 	return true;
1716 }
1717 
1718 /**
1719  * i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
1720  * @rx_ring: rx descriptor ring to transact packets on
1721  * @budget: Total limit on number of packets to process
1722  *
1723  * This function provides a "bounce buffer" approach to Rx interrupt
1724  * processing.  The advantage to this is that on systems that have
1725  * expensive overhead for IOMMU access this provides a means of avoiding
1726  * it by maintaining the mapping of the page to the system.
1727  *
1728  * Returns amount of work completed
1729  **/
1730 static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
1731 {
1732 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
1733 	u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
1734 	bool failure = false;
1735 
1736 	while (likely(total_rx_packets < budget)) {
1737 		union i40e_rx_desc *rx_desc;
1738 		struct sk_buff *skb;
1739 		u16 vlan_tag;
1740 		u8 rx_ptype;
1741 		u64 qword;
1742 
1743 		/* return some buffers to hardware, one at a time is too slow */
1744 		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
1745 			failure = failure ||
1746 				  i40e_alloc_rx_buffers(rx_ring, cleaned_count);
1747 			cleaned_count = 0;
1748 		}
1749 
1750 		rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
1751 
		/* status_error_len will always be zero for unused descriptors
		 * because it's cleared in cleanup and overlaps with hdr_addr,
		 * which is always zero because packet split isn't used.  If
		 * the hardware wrote DD then it will be non-zero.
		 */
1757 		if (!i40e_test_staterr(rx_desc,
1758 				       BIT(I40E_RX_DESC_STATUS_DD_SHIFT)))
1759 			break;
1760 
1761 		/* This memory barrier is needed to keep us from reading
1762 		 * any other fields out of the rx_desc until we know the
1763 		 * DD bit is set.
1764 		 */
1765 		dma_rmb();
1766 
1767 		skb = i40e_fetch_rx_buffer(rx_ring, rx_desc);
1768 		if (!skb)
1769 			break;
1770 
1771 		cleaned_count++;
1772 
1773 		if (i40e_is_non_eop(rx_ring, rx_desc, skb))
1774 			continue;
1775 
1776 		/* ERR_MASK will only have valid bits if EOP set, and
1777 		 * what we are doing here is actually checking
1778 		 * I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in
1779 		 * the error field
1780 		 */
		if (unlikely(i40e_test_staterr(rx_desc,
					       BIT(I40E_RXD_QW1_ERROR_SHIFT)))) {
1782 			dev_kfree_skb_any(skb);
1783 			continue;
1784 		}
1785 
1786 		if (i40e_cleanup_headers(rx_ring, skb))
1787 			continue;
1788 
1789 		/* probably a little skewed due to removing CRC */
1790 		total_rx_bytes += skb->len;
1791 
1792 		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
1793 		rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
1794 			   I40E_RXD_QW1_PTYPE_SHIFT;
1795 
1796 		/* populate checksum, VLAN, and protocol */
1797 		i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
1798 
1799 #ifdef I40E_FCOE
1800 		if (unlikely(
1801 		    i40e_rx_is_fcoe(rx_ptype) &&
1802 		    !i40e_fcoe_handle_offload(rx_ring, rx_desc, skb))) {
1803 			dev_kfree_skb_any(skb);
1804 			continue;
1805 		}
1806 #endif
1807 
1808 		vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
1809 			   le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0;
1810 
1811 		i40e_receive_skb(rx_ring, skb, vlan_tag);
1812 
1813 		/* update budget accounting */
1814 		total_rx_packets++;
1815 	}
1816 
1817 	u64_stats_update_begin(&rx_ring->syncp);
1818 	rx_ring->stats.packets += total_rx_packets;
1819 	rx_ring->stats.bytes += total_rx_bytes;
1820 	u64_stats_update_end(&rx_ring->syncp);
1821 	rx_ring->q_vector->rx.total_packets += total_rx_packets;
1822 	rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
1823 
1824 	/* guarantee a trip back through this routine if there was a failure */
1825 	return failure ? budget : total_rx_packets;
1826 }
1827 
1828 static u32 i40e_buildreg_itr(const int type, const u16 itr)
1829 {
1830 	u32 val;
1831 
1832 	val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
1833 	      /* Don't clear PBA because that can cause lost interrupts that
1834 	       * came in while we were cleaning/polling
1835 	       */
1836 	      (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) |
1837 	      (itr << I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT);
1838 
1839 	return val;
1840 }
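
/* Illustrative example, not part of the driver, assuming the usual
 * PFINT_DYN_CTLN layout (ITR_INDX at bits 3-4, INTERVAL starting at bit 5)
 * and the hardware's 2 usec ITR granularity: i40e_buildreg_itr(I40E_RX_ITR,
 * 0x0A) produces a value that enables the interrupt, selects the Rx ITR
 * index and programs an interval of 0x0A, i.e. a 20 usec minimum gap
 * between interrupts on that vector.
 */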
1841 
1842 /* a small macro to shorten up some long lines */
1843 #define INTREG I40E_PFINT_DYN_CTLN
1844 static inline int get_rx_itr_enabled(struct i40e_vsi *vsi, int idx)
1845 {
	return vsi->rx_rings[idx]->rx_itr_setting;
1847 }
1848 
1849 static inline int get_tx_itr_enabled(struct i40e_vsi *vsi, int idx)
1850 {
	return vsi->tx_rings[idx]->tx_itr_setting;
1852 }
1853 
1854 /**
1855  * i40e_update_enable_itr - Update itr and re-enable MSIX interrupt
1856  * @vsi: the VSI we care about
1857  * @q_vector: q_vector for which itr is being updated and interrupt enabled
1858  *
1859  **/
1860 static inline void i40e_update_enable_itr(struct i40e_vsi *vsi,
1861 					  struct i40e_q_vector *q_vector)
1862 {
1863 	struct i40e_hw *hw = &vsi->back->hw;
1864 	bool rx = false, tx = false;
1865 	u32 rxval, txval;
1866 	int vector;
1867 	int idx = q_vector->v_idx;
1868 	int rx_itr_setting, tx_itr_setting;
1869 
1870 	vector = (q_vector->v_idx + vsi->base_vector);
1871 
	/* avoid dynamic calculation if in countdown mode OR if
	 * dynamic ITR is disabled for both Rx and Tx
	 */
1875 	rxval = txval = i40e_buildreg_itr(I40E_ITR_NONE, 0);
1876 
1877 	rx_itr_setting = get_rx_itr_enabled(vsi, idx);
1878 	tx_itr_setting = get_tx_itr_enabled(vsi, idx);
1879 
1880 	if (q_vector->itr_countdown > 0 ||
1881 	    (!ITR_IS_DYNAMIC(rx_itr_setting) &&
1882 	     !ITR_IS_DYNAMIC(tx_itr_setting))) {
1883 		goto enable_int;
1884 	}
1885 
	if (ITR_IS_DYNAMIC(rx_itr_setting)) {
1887 		rx = i40e_set_new_dynamic_itr(&q_vector->rx);
1888 		rxval = i40e_buildreg_itr(I40E_RX_ITR, q_vector->rx.itr);
1889 	}
1890 
1891 	if (ITR_IS_DYNAMIC(tx_itr_setting)) {
1892 		tx = i40e_set_new_dynamic_itr(&q_vector->tx);
1893 		txval = i40e_buildreg_itr(I40E_TX_ITR, q_vector->tx.itr);
1894 	}
1895 
1896 	if (rx || tx) {
1897 		/* get the higher of the two ITR adjustments and
1898 		 * use the same value for both ITR registers
1899 		 * when in adaptive mode (Rx and/or Tx)
1900 		 */
1901 		u16 itr = max(q_vector->tx.itr, q_vector->rx.itr);
1902 
1903 		q_vector->tx.itr = q_vector->rx.itr = itr;
1904 		txval = i40e_buildreg_itr(I40E_TX_ITR, itr);
1905 		tx = true;
1906 		rxval = i40e_buildreg_itr(I40E_RX_ITR, itr);
1907 		rx = true;
1908 	}
1909 
1910 	/* only need to enable the interrupt once, but need
1911 	 * to possibly update both ITR values
1912 	 */
1913 	if (rx) {
		/* set the INTENA_MSK_MASK so that this first write
		 * won't actually enable the interrupt, instead just
		 * updating the ITR (INTENA_MSK is bit 31 on both PF and VF)
		 */
1918 		rxval |= BIT(31);
1919 		/* don't check _DOWN because interrupt isn't being enabled */
1920 		wr32(hw, INTREG(vector - 1), rxval);
1921 	}
1922 
1923 enable_int:
1924 	if (!test_bit(__I40E_DOWN, &vsi->state))
1925 		wr32(hw, INTREG(vector - 1), txval);
1926 
1927 	if (q_vector->itr_countdown)
1928 		q_vector->itr_countdown--;
1929 	else
1930 		q_vector->itr_countdown = ITR_COUNTDOWN_START;
1931 }
1932 
1933 /**
1934  * i40e_napi_poll - NAPI polling Rx/Tx cleanup routine
1935  * @napi: napi struct with our devices info in it
1936  * @budget: amount of work driver is allowed to do this pass, in packets
1937  *
1938  * This function will clean all queues associated with a q_vector.
1939  *
1940  * Returns the amount of work done
1941  **/
1942 int i40e_napi_poll(struct napi_struct *napi, int budget)
1943 {
1944 	struct i40e_q_vector *q_vector =
1945 			       container_of(napi, struct i40e_q_vector, napi);
1946 	struct i40e_vsi *vsi = q_vector->vsi;
1947 	struct i40e_ring *ring;
1948 	bool clean_complete = true;
1949 	bool arm_wb = false;
1950 	int budget_per_ring;
1951 	int work_done = 0;
1952 
1953 	if (test_bit(__I40E_DOWN, &vsi->state)) {
1954 		napi_complete(napi);
1955 		return 0;
1956 	}
1957 
1958 	/* Clear hung_detected bit */
1959 	clear_bit(I40E_Q_VECTOR_HUNG_DETECT, &q_vector->hung_detected);
1960 	/* Since the actual Tx work is minimal, we can give the Tx a larger
1961 	 * budget and be more aggressive about cleaning up the Tx descriptors.
1962 	 */
1963 	i40e_for_each_ring(ring, q_vector->tx) {
1964 		if (!i40e_clean_tx_irq(vsi, ring, budget)) {
1965 			clean_complete = false;
1966 			continue;
1967 		}
1968 		arm_wb |= ring->arm_wb;
1969 		ring->arm_wb = false;
1970 	}
1971 
1972 	/* Handle case where we are called by netpoll with a budget of 0 */
1973 	if (budget <= 0)
1974 		goto tx_only;
1975 
1976 	/* We attempt to distribute budget to each Rx queue fairly, but don't
1977 	 * allow the budget to go below 1 because that would exit polling early.
1978 	 */
1979 	budget_per_ring = max(budget/q_vector->num_ringpairs, 1);
1980 
1981 	i40e_for_each_ring(ring, q_vector->rx) {
1982 		int cleaned = i40e_clean_rx_irq(ring, budget_per_ring);
1983 
1984 		work_done += cleaned;
1985 		/* if we clean as many as budgeted, we must not be done */
1986 		if (cleaned >= budget_per_ring)
1987 			clean_complete = false;
1988 	}
1989 
1990 	/* If work not completed, return budget and polling will return */
1991 	if (!clean_complete) {
1992 		const cpumask_t *aff_mask = &q_vector->affinity_mask;
1993 		int cpu_id = smp_processor_id();
1994 
1995 		/* It is possible that the interrupt affinity has changed but,
1996 		 * if the cpu is pegged at 100%, polling will never exit while
1997 		 * traffic continues and the interrupt will be stuck on this
1998 		 * cpu.  We check to make sure affinity is correct before we
1999 		 * continue to poll, otherwise we must stop polling so the
2000 		 * interrupt can move to the correct cpu.
2001 		 */
2002 		if (likely(cpumask_test_cpu(cpu_id, aff_mask) ||
2003 			   !(vsi->back->flags & I40E_FLAG_MSIX_ENABLED))) {
2004 tx_only:
2005 			if (arm_wb) {
2006 				q_vector->tx.ring[0].tx_stats.tx_force_wb++;
2007 				i40e_enable_wb_on_itr(vsi, q_vector);
2008 			}
2009 			return budget;
2010 		}
2011 	}
2012 
2013 	if (vsi->back->flags & I40E_TXR_FLAGS_WB_ON_ITR)
2014 		q_vector->arm_wb_state = false;
2015 
2016 	/* Work is done so exit the polling mode and re-enable the interrupt */
2017 	napi_complete_done(napi, work_done);
2018 
	/* If we're prematurely stopping polling to fix the interrupt
	 * affinity, we want to make sure polling starts back up, so we
	 * issue a call to i40e_force_wb which triggers a SW interrupt.
	 */
2023 	if (!clean_complete)
2024 		i40e_force_wb(vsi, q_vector);
2025 	else if (!(vsi->back->flags & I40E_FLAG_MSIX_ENABLED))
2026 		i40e_irq_dynamic_enable_icr0(vsi->back, false);
2027 	else
2028 		i40e_update_enable_itr(vsi, q_vector);
2029 
2030 	return min(work_done, budget - 1);
2031 }
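
/* Illustrative example, not part of the driver: with a NAPI budget of 64 and
 * q_vector->num_ringpairs = 4, budget_per_ring works out to 16.  If any Rx
 * ring cleans all 16 descriptors, clean_complete stays false and the full
 * budget is returned so the kernel keeps polling; only when every ring
 * cleans fewer packets than its per-ring budget does the vector re-enable
 * its interrupt via i40e_update_enable_itr().
 */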
2032 
2033 /**
2034  * i40e_atr - Add a Flow Director ATR filter
2035  * @tx_ring:  ring to add programming descriptor to
2036  * @skb:      send buffer
2037  * @tx_flags: send tx flags
2038  **/
2039 static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
2040 		     u32 tx_flags)
2041 {
2042 	struct i40e_filter_program_desc *fdir_desc;
2043 	struct i40e_pf *pf = tx_ring->vsi->back;
2044 	union {
2045 		unsigned char *network;
2046 		struct iphdr *ipv4;
2047 		struct ipv6hdr *ipv6;
2048 	} hdr;
2049 	struct tcphdr *th;
2050 	unsigned int hlen;
2051 	u32 flex_ptype, dtype_cmd;
2052 	int l4_proto;
2053 	u16 i;
2054 
2055 	/* make sure ATR is enabled */
2056 	if (!(pf->flags & I40E_FLAG_FD_ATR_ENABLED))
2057 		return;
2058 
2059 	if ((pf->auto_disable_flags & I40E_FLAG_FD_ATR_ENABLED))
2060 		return;
2061 
2062 	/* if sampling is disabled do nothing */
2063 	if (!tx_ring->atr_sample_rate)
2064 		return;
2065 
2066 	/* Currently only IPv4/IPv6 with TCP is supported */
2067 	if (!(tx_flags & (I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6)))
2068 		return;
2069 
2070 	/* snag network header to get L4 type and address */
2071 	hdr.network = (tx_flags & I40E_TX_FLAGS_UDP_TUNNEL) ?
2072 		      skb_inner_network_header(skb) : skb_network_header(skb);
2073 
	/* Note: tx_flags gets modified to reflect inner protocols in
	 * the i40e_tx_enable_csum() function if encap is enabled.
	 */
2077 	if (tx_flags & I40E_TX_FLAGS_IPV4) {
2078 		/* access ihl as u8 to avoid unaligned access on ia64 */
2079 		hlen = (hdr.network[0] & 0x0F) << 2;
2080 		l4_proto = hdr.ipv4->protocol;
2081 	} else {
2082 		hlen = hdr.network - skb->data;
2083 		l4_proto = ipv6_find_hdr(skb, &hlen, IPPROTO_TCP, NULL, NULL);
2084 		hlen -= hdr.network - skb->data;
2085 	}
2086 
2087 	if (l4_proto != IPPROTO_TCP)
2088 		return;
2089 
2090 	th = (struct tcphdr *)(hdr.network + hlen);
2091 
2092 	/* Due to lack of space, no more new filters can be programmed */
2093 	if (th->syn && (pf->auto_disable_flags & I40E_FLAG_FD_ATR_ENABLED))
2094 		return;
2095 	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
2096 	    (!(pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE))) {
2097 		/* HW ATR eviction will take care of removing filters on FIN
2098 		 * and RST packets.
2099 		 */
2100 		if (th->fin || th->rst)
2101 			return;
2102 	}
2103 
2104 	tx_ring->atr_count++;
2105 
2106 	/* sample on all syn/fin/rst packets or once every atr sample rate */
2107 	if (!th->fin &&
2108 	    !th->syn &&
2109 	    !th->rst &&
2110 	    (tx_ring->atr_count < tx_ring->atr_sample_rate))
2111 		return;
2112 
2113 	tx_ring->atr_count = 0;
2114 
2115 	/* grab the next descriptor */
2116 	i = tx_ring->next_to_use;
2117 	fdir_desc = I40E_TX_FDIRDESC(tx_ring, i);
2118 
2119 	i++;
2120 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2121 
2122 	flex_ptype = (tx_ring->queue_index << I40E_TXD_FLTR_QW0_QINDEX_SHIFT) &
2123 		      I40E_TXD_FLTR_QW0_QINDEX_MASK;
2124 	flex_ptype |= (tx_flags & I40E_TX_FLAGS_IPV4) ?
2125 		      (I40E_FILTER_PCTYPE_NONF_IPV4_TCP <<
2126 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) :
2127 		      (I40E_FILTER_PCTYPE_NONF_IPV6_TCP <<
2128 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
2129 
2130 	flex_ptype |= tx_ring->vsi->id << I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT;
2131 
2132 	dtype_cmd = I40E_TX_DESC_DTYPE_FILTER_PROG;
2133 
2134 	dtype_cmd |= (th->fin || th->rst) ?
2135 		     (I40E_FILTER_PROGRAM_DESC_PCMD_REMOVE <<
2136 		      I40E_TXD_FLTR_QW1_PCMD_SHIFT) :
2137 		     (I40E_FILTER_PROGRAM_DESC_PCMD_ADD_UPDATE <<
2138 		      I40E_TXD_FLTR_QW1_PCMD_SHIFT);
2139 
2140 	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX <<
2141 		     I40E_TXD_FLTR_QW1_DEST_SHIFT;
2142 
2143 	dtype_cmd |= I40E_FILTER_PROGRAM_DESC_FD_STATUS_FD_ID <<
2144 		     I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT;
2145 
2146 	dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
2147 	if (!(tx_flags & I40E_TX_FLAGS_UDP_TUNNEL))
2148 		dtype_cmd |=
2149 			((u32)I40E_FD_ATR_STAT_IDX(pf->hw.pf_id) <<
2150 			I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
2151 			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
2152 	else
2153 		dtype_cmd |=
2154 			((u32)I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id) <<
2155 			I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
2156 			I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
2157 
2158 	if ((pf->flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE) &&
2159 	    (!(pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE)))
2160 		dtype_cmd |= I40E_TXD_FLTR_QW1_ATR_MASK;
2161 
2162 	fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
2163 	fdir_desc->rsvd = cpu_to_le32(0);
2164 	fdir_desc->dtype_cmd_cntindex = cpu_to_le32(dtype_cmd);
2165 	fdir_desc->fd_id = cpu_to_le32(0);
2166 }
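
/* Illustrative example, not part of the driver: with atr_sample_rate = 20,
 * i40e_atr() emits an ADD/UPDATE filter descriptor for roughly every 20th
 * TCP packet of a flow and for every SYN, while FIN/RST packets either emit
 * a REMOVE or are skipped entirely when hardware ATR eviction is handling
 * filter teardown.
 */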
2167 
2168 /**
2169  * i40e_tx_prepare_vlan_flags - prepare generic TX VLAN tagging flags for HW
2170  * @skb:     send buffer
2171  * @tx_ring: ring to send buffer on
2172  * @flags:   the tx flags to be set
2173  *
 * Checks the skb and sets up the corresponding generic transmit flags
 * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
 *
 * Returns an error code to indicate the frame should be dropped upon error,
 * otherwise returns 0 to indicate the flags have been set properly.
2179  **/
2180 #ifdef I40E_FCOE
2181 inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
2182 				      struct i40e_ring *tx_ring,
2183 				      u32 *flags)
2184 #else
2185 static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
2186 					     struct i40e_ring *tx_ring,
2187 					     u32 *flags)
2188 #endif
2189 {
2190 	__be16 protocol = skb->protocol;
2191 	u32  tx_flags = 0;
2192 
2193 	if (protocol == htons(ETH_P_8021Q) &&
2194 	    !(tx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_TX)) {
2195 		/* When HW VLAN acceleration is turned off by the user the
2196 		 * stack sets the protocol to 8021q so that the driver
2197 		 * can take any steps required to support the SW only
2198 		 * VLAN handling.  In our case the driver doesn't need
2199 		 * to take any further steps so just set the protocol
2200 		 * to the encapsulated ethertype.
2201 		 */
2202 		skb->protocol = vlan_get_protocol(skb);
2203 		goto out;
2204 	}
2205 
2206 	/* if we have a HW VLAN tag being added, default to the HW one */
2207 	if (skb_vlan_tag_present(skb)) {
2208 		tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT;
2209 		tx_flags |= I40E_TX_FLAGS_HW_VLAN;
2210 	/* else if it is a SW VLAN, check the next protocol and store the tag */
2211 	} else if (protocol == htons(ETH_P_8021Q)) {
2212 		struct vlan_hdr *vhdr, _vhdr;
2213 
2214 		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(_vhdr), &_vhdr);
2215 		if (!vhdr)
2216 			return -EINVAL;
2217 
2218 		protocol = vhdr->h_vlan_encapsulated_proto;
2219 		tx_flags |= ntohs(vhdr->h_vlan_TCI) << I40E_TX_FLAGS_VLAN_SHIFT;
2220 		tx_flags |= I40E_TX_FLAGS_SW_VLAN;
2221 	}
2222 
2223 	if (!(tx_ring->vsi->back->flags & I40E_FLAG_DCB_ENABLED))
2224 		goto out;
2225 
2226 	/* Insert 802.1p priority into VLAN header */
2227 	if ((tx_flags & (I40E_TX_FLAGS_HW_VLAN | I40E_TX_FLAGS_SW_VLAN)) ||
2228 	    (skb->priority != TC_PRIO_CONTROL)) {
2229 		tx_flags &= ~I40E_TX_FLAGS_VLAN_PRIO_MASK;
2230 		tx_flags |= (skb->priority & 0x7) <<
2231 				I40E_TX_FLAGS_VLAN_PRIO_SHIFT;
2232 		if (tx_flags & I40E_TX_FLAGS_SW_VLAN) {
2233 			struct vlan_ethhdr *vhdr;
2234 			int rc;
2235 
2236 			rc = skb_cow_head(skb, 0);
2237 			if (rc < 0)
2238 				return rc;
2239 			vhdr = (struct vlan_ethhdr *)skb->data;
2240 			vhdr->h_vlan_TCI = htons(tx_flags >>
2241 						 I40E_TX_FLAGS_VLAN_SHIFT);
2242 		} else {
2243 			tx_flags |= I40E_TX_FLAGS_HW_VLAN;
2244 		}
2245 	}
2246 
2247 out:
2248 	*flags = tx_flags;
2249 	return 0;
2250 }
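
/* Illustrative example, not part of the driver: for a hardware-accelerated
 * tag with VLAN ID 100 and skb->priority 5 on a DCB-enabled VSI, the TCI
 * stored at I40E_TX_FLAGS_VLAN_SHIFT becomes (5 << 13) | 100 = 0xA064 once
 * the 802.1p priority is folded in, and I40E_TX_FLAGS_HW_VLAN tells
 * i40e_tx_map() to hand that tag to the hardware via L2TAG1.
 */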
2251 
2252 /**
2253  * i40e_tso - set up the tso context descriptor
2254  * @skb:      ptr to the skb we're sending
2255  * @hdr_len:  ptr to the size of the packet header
2256  * @cd_type_cmd_tso_mss: Quad Word 1
2257  *
 * Returns 0 if no TSO can happen, 1 if TSO is in progress, or a negative
 * error code
2259  **/
2260 static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, u64 *cd_type_cmd_tso_mss)
2261 {
2262 	u64 cd_cmd, cd_tso_len, cd_mss;
2263 	union {
2264 		struct iphdr *v4;
2265 		struct ipv6hdr *v6;
2266 		unsigned char *hdr;
2267 	} ip;
2268 	union {
2269 		struct tcphdr *tcp;
2270 		struct udphdr *udp;
2271 		unsigned char *hdr;
2272 	} l4;
2273 	u32 paylen, l4_offset;
2274 	int err;
2275 
2276 	if (skb->ip_summed != CHECKSUM_PARTIAL)
2277 		return 0;
2278 
2279 	if (!skb_is_gso(skb))
2280 		return 0;
2281 
2282 	err = skb_cow_head(skb, 0);
2283 	if (err < 0)
2284 		return err;
2285 
2286 	ip.hdr = skb_network_header(skb);
2287 	l4.hdr = skb_transport_header(skb);
2288 
2289 	/* initialize outer IP header fields */
2290 	if (ip.v4->version == 4) {
2291 		ip.v4->tot_len = 0;
2292 		ip.v4->check = 0;
2293 	} else {
2294 		ip.v6->payload_len = 0;
2295 	}
2296 
2297 	if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
2298 					 SKB_GSO_GRE_CSUM |
2299 					 SKB_GSO_IPXIP4 |
2300 					 SKB_GSO_IPXIP6 |
2301 					 SKB_GSO_UDP_TUNNEL |
2302 					 SKB_GSO_UDP_TUNNEL_CSUM)) {
2303 		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
2304 		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
2305 			l4.udp->len = 0;
2306 
2307 			/* determine offset of outer transport header */
2308 			l4_offset = l4.hdr - skb->data;
2309 
2310 			/* remove payload length from outer checksum */
2311 			paylen = skb->len - l4_offset;
2312 			csum_replace_by_diff(&l4.udp->check, htonl(paylen));
2313 		}
2314 
2315 		/* reset pointers to inner headers */
2316 		ip.hdr = skb_inner_network_header(skb);
2317 		l4.hdr = skb_inner_transport_header(skb);
2318 
2319 		/* initialize inner IP header fields */
2320 		if (ip.v4->version == 4) {
2321 			ip.v4->tot_len = 0;
2322 			ip.v4->check = 0;
2323 		} else {
2324 			ip.v6->payload_len = 0;
2325 		}
2326 	}
2327 
2328 	/* determine offset of inner transport header */
2329 	l4_offset = l4.hdr - skb->data;
2330 
2331 	/* remove payload length from inner checksum */
2332 	paylen = skb->len - l4_offset;
2333 	csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
2334 
2335 	/* compute length of segmentation header */
2336 	*hdr_len = (l4.tcp->doff * 4) + l4_offset;
2337 
2338 	/* find the field values */
2339 	cd_cmd = I40E_TX_CTX_DESC_TSO;
2340 	cd_tso_len = skb->len - *hdr_len;
2341 	cd_mss = skb_shinfo(skb)->gso_size;
2342 	*cd_type_cmd_tso_mss |= (cd_cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
2343 				(cd_tso_len << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
2344 				(cd_mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
2345 	return 1;
2346 }
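
/* Illustrative example, not part of the driver: for a 7066-byte TSO skb with
 * 66 bytes of Ethernet/IPv4/TCP headers and gso_size = 1000, i40e_tso()
 * reports *hdr_len = 66 and packs cd_tso_len = 7000 and cd_mss = 1000 into
 * the context descriptor; the hardware then emits seven wire segments, each
 * with a replicated 66-byte header in front of up to 1000 bytes of payload.
 */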
2347 
2348 /**
2349  * i40e_tsyn - set up the tsyn context descriptor
2350  * @tx_ring:  ptr to the ring to send
2351  * @skb:      ptr to the skb we're sending
2352  * @tx_flags: the collected send information
2353  * @cd_type_cmd_tso_mss: Quad Word 1
2354  *
2355  * Returns 0 if no Tx timestamp can happen and 1 if the timestamp will happen
2356  **/
2357 static int i40e_tsyn(struct i40e_ring *tx_ring, struct sk_buff *skb,
2358 		     u32 tx_flags, u64 *cd_type_cmd_tso_mss)
2359 {
2360 	struct i40e_pf *pf;
2361 
2362 	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
2363 		return 0;
2364 
2365 	/* Tx timestamps cannot be sampled when doing TSO */
2366 	if (tx_flags & I40E_TX_FLAGS_TSO)
2367 		return 0;
2368 
2369 	/* only timestamp the outbound packet if the user has requested it and
2370 	 * we are not already transmitting a packet to be timestamped
2371 	 */
2372 	pf = i40e_netdev_to_pf(tx_ring->netdev);
2373 	if (!(pf->flags & I40E_FLAG_PTP))
2374 		return 0;
2375 
2376 	if (pf->ptp_tx &&
2377 	    !test_and_set_bit_lock(__I40E_PTP_TX_IN_PROGRESS, &pf->state)) {
2378 		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
2379 		pf->ptp_tx_skb = skb_get(skb);
2380 	} else {
2381 		return 0;
2382 	}
2383 
2384 	*cd_type_cmd_tso_mss |= (u64)I40E_TX_CTX_DESC_TSYN <<
2385 				I40E_TXD_CTX_QW1_CMD_SHIFT;
2386 
2387 	return 1;
2388 }
2389 
2390 /**
2391  * i40e_tx_enable_csum - Enable Tx checksum offloads
2392  * @skb: send buffer
2393  * @tx_flags: pointer to Tx flags currently set
2394  * @td_cmd: Tx descriptor command bits to set
2395  * @td_offset: Tx descriptor header offsets to set
2396  * @tx_ring: Tx descriptor ring
2397  * @cd_tunneling: ptr to context desc bits
2398  **/
2399 static int i40e_tx_enable_csum(struct sk_buff *skb, u32 *tx_flags,
2400 			       u32 *td_cmd, u32 *td_offset,
2401 			       struct i40e_ring *tx_ring,
2402 			       u32 *cd_tunneling)
2403 {
2404 	union {
2405 		struct iphdr *v4;
2406 		struct ipv6hdr *v6;
2407 		unsigned char *hdr;
2408 	} ip;
2409 	union {
2410 		struct tcphdr *tcp;
2411 		struct udphdr *udp;
2412 		unsigned char *hdr;
2413 	} l4;
2414 	unsigned char *exthdr;
2415 	u32 offset, cmd = 0;
2416 	__be16 frag_off;
2417 	u8 l4_proto = 0;
2418 
2419 	if (skb->ip_summed != CHECKSUM_PARTIAL)
2420 		return 0;
2421 
2422 	ip.hdr = skb_network_header(skb);
2423 	l4.hdr = skb_transport_header(skb);
2424 
2425 	/* compute outer L2 header size */
2426 	offset = ((ip.hdr - skb->data) / 2) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
2427 
2428 	if (skb->encapsulation) {
2429 		u32 tunnel = 0;
2430 		/* define outer network header type */
2431 		if (*tx_flags & I40E_TX_FLAGS_IPV4) {
2432 			tunnel |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
2433 				  I40E_TX_CTX_EXT_IP_IPV4 :
2434 				  I40E_TX_CTX_EXT_IP_IPV4_NO_CSUM;
2435 
2436 			l4_proto = ip.v4->protocol;
2437 		} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
2438 			tunnel |= I40E_TX_CTX_EXT_IP_IPV6;
2439 
2440 			exthdr = ip.hdr + sizeof(*ip.v6);
2441 			l4_proto = ip.v6->nexthdr;
2442 			if (l4.hdr != exthdr)
2443 				ipv6_skip_exthdr(skb, exthdr - skb->data,
2444 						 &l4_proto, &frag_off);
2445 		}
2446 
2447 		/* define outer transport */
2448 		switch (l4_proto) {
2449 		case IPPROTO_UDP:
2450 			tunnel |= I40E_TXD_CTX_UDP_TUNNELING;
2451 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2452 			break;
2453 		case IPPROTO_GRE:
2454 			tunnel |= I40E_TXD_CTX_GRE_TUNNELING;
2455 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2456 			break;
2457 		case IPPROTO_IPIP:
2458 		case IPPROTO_IPV6:
2459 			*tx_flags |= I40E_TX_FLAGS_UDP_TUNNEL;
2460 			l4.hdr = skb_inner_network_header(skb);
2461 			break;
2462 		default:
2463 			if (*tx_flags & I40E_TX_FLAGS_TSO)
2464 				return -1;
2465 
2466 			skb_checksum_help(skb);
2467 			return 0;
2468 		}
2469 
2470 		/* compute outer L3 header size */
2471 		tunnel |= ((l4.hdr - ip.hdr) / 4) <<
2472 			  I40E_TXD_CTX_QW0_EXT_IPLEN_SHIFT;
2473 
2474 		/* switch IP header pointer from outer to inner header */
2475 		ip.hdr = skb_inner_network_header(skb);
2476 
2477 		/* compute tunnel header size */
2478 		tunnel |= ((ip.hdr - l4.hdr) / 2) <<
2479 			  I40E_TXD_CTX_QW0_NATLEN_SHIFT;
2480 
2481 		/* indicate if we need to offload outer UDP header */
2482 		if ((*tx_flags & I40E_TX_FLAGS_TSO) &&
2483 		    !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
2484 		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
2485 			tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK;
2486 
2487 		/* record tunnel offload values */
2488 		*cd_tunneling |= tunnel;
2489 
2490 		/* switch L4 header pointer from outer to inner */
2491 		l4.hdr = skb_inner_transport_header(skb);
2492 		l4_proto = 0;
2493 
2494 		/* reset type as we transition from outer to inner headers */
2495 		*tx_flags &= ~(I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6);
2496 		if (ip.v4->version == 4)
2497 			*tx_flags |= I40E_TX_FLAGS_IPV4;
2498 		if (ip.v6->version == 6)
2499 			*tx_flags |= I40E_TX_FLAGS_IPV6;
2500 	}
2501 
2502 	/* Enable IP checksum offloads */
2503 	if (*tx_flags & I40E_TX_FLAGS_IPV4) {
2504 		l4_proto = ip.v4->protocol;
2505 		/* the stack computes the IP header already, the only time we
2506 		 * need the hardware to recompute it is in the case of TSO.
2507 		 */
2508 		cmd |= (*tx_flags & I40E_TX_FLAGS_TSO) ?
2509 		       I40E_TX_DESC_CMD_IIPT_IPV4_CSUM :
2510 		       I40E_TX_DESC_CMD_IIPT_IPV4;
2511 	} else if (*tx_flags & I40E_TX_FLAGS_IPV6) {
2512 		cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
2513 
2514 		exthdr = ip.hdr + sizeof(*ip.v6);
2515 		l4_proto = ip.v6->nexthdr;
2516 		if (l4.hdr != exthdr)
2517 			ipv6_skip_exthdr(skb, exthdr - skb->data,
2518 					 &l4_proto, &frag_off);
2519 	}
2520 
2521 	/* compute inner L3 header size */
2522 	offset |= ((l4.hdr - ip.hdr) / 4) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
2523 
2524 	/* Enable L4 checksum offloads */
2525 	switch (l4_proto) {
2526 	case IPPROTO_TCP:
2527 		/* enable checksum offloads */
2528 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
2529 		offset |= l4.tcp->doff << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2530 		break;
2531 	case IPPROTO_SCTP:
2532 		/* enable SCTP checksum offload */
2533 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
2534 		offset |= (sizeof(struct sctphdr) >> 2) <<
2535 			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2536 		break;
2537 	case IPPROTO_UDP:
2538 		/* enable UDP checksum offload */
2539 		cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
2540 		offset |= (sizeof(struct udphdr) >> 2) <<
2541 			  I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
2542 		break;
2543 	default:
2544 		if (*tx_flags & I40E_TX_FLAGS_TSO)
2545 			return -1;
2546 		skb_checksum_help(skb);
2547 		return 0;
2548 	}
2549 
2550 	*td_cmd |= cmd;
2551 	*td_offset |= offset;
2552 
2553 	return 1;
2554 }
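
/* Illustrative example, not part of the driver: for a plain (non-tunneled)
 * IPv4/TCP packet with a 14-byte Ethernet header, a 20-byte IP header and a
 * 20-byte TCP header, the code above builds MACLEN = 14 / 2 = 7 (2-byte
 * words), IPLEN = 20 / 4 = 5 and L4LEN = doff = 5 (4-byte words) into
 * td_offset, and sets IIPT_IPV4 plus L4T_EOFT_TCP in td_cmd so the hardware
 * fills in the TCP checksum.
 */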
2555 
2556 /**
 * i40e_create_tx_ctx - Build the Tx context descriptor
2558  * @tx_ring:  ring to create the descriptor on
2559  * @cd_type_cmd_tso_mss: Quad Word 1
2560  * @cd_tunneling: Quad Word 0 - bits 0-31
2561  * @cd_l2tag2: Quad Word 0 - bits 32-63
2562  **/
2563 static void i40e_create_tx_ctx(struct i40e_ring *tx_ring,
2564 			       const u64 cd_type_cmd_tso_mss,
2565 			       const u32 cd_tunneling, const u32 cd_l2tag2)
2566 {
2567 	struct i40e_tx_context_desc *context_desc;
2568 	int i = tx_ring->next_to_use;
2569 
2570 	if ((cd_type_cmd_tso_mss == I40E_TX_DESC_DTYPE_CONTEXT) &&
2571 	    !cd_tunneling && !cd_l2tag2)
2572 		return;
2573 
2574 	/* grab the next descriptor */
2575 	context_desc = I40E_TX_CTXTDESC(tx_ring, i);
2576 
2577 	i++;
2578 	tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2579 
2580 	/* cpu_to_le32 and assign to struct fields */
2581 	context_desc->tunneling_params = cpu_to_le32(cd_tunneling);
2582 	context_desc->l2tag2 = cpu_to_le16(cd_l2tag2);
2583 	context_desc->rsvd = cpu_to_le16(0);
2584 	context_desc->type_cmd_tso_mss = cpu_to_le64(cd_type_cmd_tso_mss);
2585 }
2586 
2587 /**
2588  * __i40e_maybe_stop_tx - 2nd level check for tx stop conditions
2589  * @tx_ring: the ring to be checked
2590  * @size:    the size buffer we want to assure is available
2591  *
2592  * Returns -EBUSY if a stop is needed, else 0
2593  **/
2594 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
2595 {
2596 	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
2597 	/* Memory barrier before checking head and tail */
2598 	smp_mb();
2599 
	/* Check again in case another CPU has just made room available. */
2601 	if (likely(I40E_DESC_UNUSED(tx_ring) < size))
2602 		return -EBUSY;
2603 
2604 	/* A reprieve! - use start_queue because it doesn't call schedule */
2605 	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
2606 	++tx_ring->tx_stats.restart_queue;
2607 	return 0;
2608 }
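
/* Illustrative note, not part of the driver: the stop/re-check/restart dance
 * above closes a race with Tx cleanup.  If another CPU frees a burst of
 * descriptors in i40e_clean_tx_irq() right after the subqueue is stopped
 * here, the smp_mb() guarantees this CPU sees the updated ring state and
 * restarts the subqueue immediately instead of leaving it stalled until the
 * next completion interrupt.
 */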
2609 
2610 /**
2611  * __i40e_chk_linearize - Check if there are more than 8 buffers per packet
2612  * @skb:      send buffer
2613  *
2614  * Note: Our HW can't DMA more than 8 buffers to build a packet on the wire
2615  * and so we need to figure out the cases where we need to linearize the skb.
2616  *
2617  * For TSO we need to count the TSO header and segment payload separately.
2618  * As such we need to check cases where we have 7 fragments or more as we
2619  * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
2620  * the segment payload in the first descriptor, and another 7 for the
2621  * fragments.
2622  **/
2623 bool __i40e_chk_linearize(struct sk_buff *skb)
2624 {
2625 	const struct skb_frag_struct *frag, *stale;
2626 	int nr_frags, sum;
2627 
	/* no need to check if the number of frags is less than 7 */
2629 	nr_frags = skb_shinfo(skb)->nr_frags;
2630 	if (nr_frags < (I40E_MAX_BUFFER_TXD - 1))
2631 		return false;
2632 
2633 	/* We need to walk through the list and validate that each group
2634 	 * of 6 fragments totals at least gso_size.
2635 	 */
2636 	nr_frags -= I40E_MAX_BUFFER_TXD - 2;
2637 	frag = &skb_shinfo(skb)->frags[0];
2638 
	/* Initialize size to the negative value of gso_size minus 1.  We
	 * use this as the worst case scenario in which the frag ahead
	 * of us only provides one byte, which is why we are limited to 6
	 * descriptors for a single transmit as the header and previous
	 * fragment are already consuming 2 descriptors.
	 */
2645 	sum = 1 - skb_shinfo(skb)->gso_size;
2646 
2647 	/* Add size of frags 0 through 4 to create our initial sum */
2648 	sum += skb_frag_size(frag++);
2649 	sum += skb_frag_size(frag++);
2650 	sum += skb_frag_size(frag++);
2651 	sum += skb_frag_size(frag++);
2652 	sum += skb_frag_size(frag++);
2653 
2654 	/* Walk through fragments adding latest fragment, testing it, and
2655 	 * then removing stale fragments from the sum.
2656 	 */
2657 	stale = &skb_shinfo(skb)->frags[0];
2658 	for (;;) {
2659 		sum += skb_frag_size(frag++);
2660 
2661 		/* if sum is negative we failed to make sufficient progress */
2662 		if (sum < 0)
2663 			return true;
2664 
2665 		if (!nr_frags--)
2666 			break;
2667 
2668 		sum -= skb_frag_size(stale++);
2669 	}
2670 
2671 	return false;
2672 }
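
/* Illustrative example, not part of the driver: take a TSO skb with
 * gso_size = 6000 and eight 1000-byte fragments.  sum starts at
 * 1 - 6000 = -5999, fragments 0-4 raise it to -999 and the first loop pass
 * adds fragment 5 for a total of 1, so every window of six consecutive
 * fragments covers a full segment and the skb is left alone.  Shrink the
 * fragments to 900 bytes each and the first check sees -599, so the
 * function returns true and the caller linearizes the skb.
 */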
2673 
2674 /**
2675  * i40e_tx_map - Build the Tx descriptor
2676  * @tx_ring:  ring to send buffer on
2677  * @skb:      send buffer
2678  * @first:    first buffer info buffer to use
2679  * @tx_flags: collected send information
2680  * @hdr_len:  size of the packet header
2681  * @td_cmd:   the command field in the descriptor
2682  * @td_offset: offset for checksum or crc
2683  **/
2684 #ifdef I40E_FCOE
2685 inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
2686 			struct i40e_tx_buffer *first, u32 tx_flags,
2687 			const u8 hdr_len, u32 td_cmd, u32 td_offset)
2688 #else
2689 static inline void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
2690 			       struct i40e_tx_buffer *first, u32 tx_flags,
2691 			       const u8 hdr_len, u32 td_cmd, u32 td_offset)
2692 #endif
2693 {
2694 	unsigned int data_len = skb->data_len;
2695 	unsigned int size = skb_headlen(skb);
2696 	struct skb_frag_struct *frag;
2697 	struct i40e_tx_buffer *tx_bi;
2698 	struct i40e_tx_desc *tx_desc;
2699 	u16 i = tx_ring->next_to_use;
2700 	u32 td_tag = 0;
2701 	dma_addr_t dma;
2702 	u16 gso_segs;
2703 	u16 desc_count = 1;
2704 
2705 	if (tx_flags & I40E_TX_FLAGS_HW_VLAN) {
2706 		td_cmd |= I40E_TX_DESC_CMD_IL2TAG1;
2707 		td_tag = (tx_flags & I40E_TX_FLAGS_VLAN_MASK) >>
2708 			 I40E_TX_FLAGS_VLAN_SHIFT;
2709 	}
2710 
2711 	if (tx_flags & (I40E_TX_FLAGS_TSO | I40E_TX_FLAGS_FSO))
2712 		gso_segs = skb_shinfo(skb)->gso_segs;
2713 	else
2714 		gso_segs = 1;
2715 
2716 	/* multiply data chunks by size of headers */
2717 	first->bytecount = skb->len - hdr_len + (gso_segs * hdr_len);
2718 	first->gso_segs = gso_segs;
2719 	first->skb = skb;
2720 	first->tx_flags = tx_flags;
2721 
2722 	dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
2723 
2724 	tx_desc = I40E_TX_DESC(tx_ring, i);
2725 	tx_bi = first;
2726 
2727 	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
2728 		unsigned int max_data = I40E_MAX_DATA_PER_TXD_ALIGNED;
2729 
2730 		if (dma_mapping_error(tx_ring->dev, dma))
2731 			goto dma_error;
2732 
2733 		/* record length, and DMA address */
2734 		dma_unmap_len_set(tx_bi, len, size);
2735 		dma_unmap_addr_set(tx_bi, dma, dma);
2736 
2737 		/* align size to end of page */
2738 		max_data += -dma & (I40E_MAX_READ_REQ_SIZE - 1);
2739 		tx_desc->buffer_addr = cpu_to_le64(dma);
2740 
2741 		while (unlikely(size > I40E_MAX_DATA_PER_TXD)) {
2742 			tx_desc->cmd_type_offset_bsz =
2743 				build_ctob(td_cmd, td_offset,
2744 					   max_data, td_tag);
2745 
2746 			tx_desc++;
2747 			i++;
2748 			desc_count++;
2749 
2750 			if (i == tx_ring->count) {
2751 				tx_desc = I40E_TX_DESC(tx_ring, 0);
2752 				i = 0;
2753 			}
2754 
2755 			dma += max_data;
2756 			size -= max_data;
2757 
2758 			max_data = I40E_MAX_DATA_PER_TXD_ALIGNED;
2759 			tx_desc->buffer_addr = cpu_to_le64(dma);
2760 		}
2761 
2762 		if (likely(!data_len))
2763 			break;
2764 
2765 		tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
2766 							  size, td_tag);
2767 
2768 		tx_desc++;
2769 		i++;
2770 		desc_count++;
2771 
2772 		if (i == tx_ring->count) {
2773 			tx_desc = I40E_TX_DESC(tx_ring, 0);
2774 			i = 0;
2775 		}
2776 
2777 		size = skb_frag_size(frag);
2778 		data_len -= size;
2779 
2780 		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
2781 				       DMA_TO_DEVICE);
2782 
2783 		tx_bi = &tx_ring->tx_bi[i];
2784 	}
2785 
2786 	netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);
2787 
2788 	i++;
2789 	if (i == tx_ring->count)
2790 		i = 0;
2791 
2792 	tx_ring->next_to_use = i;
2793 
2794 	i40e_maybe_stop_tx(tx_ring, DESC_NEEDED);
2795 
2796 	/* write last descriptor with EOP bit */
2797 	td_cmd |= I40E_TX_DESC_CMD_EOP;
2798 
	/* We can OR these values together as they are both checked against
	 * WB_STRIDE (4) below, and desc_count is only used as a boolean
	 * value after the if/else block.
	 */
2803 	desc_count |= ++tx_ring->packet_stride;
2804 
2805 	/* Algorithm to optimize tail and RS bit setting:
2806 	 * if queue is stopped
2807 	 *	mark RS bit
2808 	 *	reset packet counter
2809 	 * else if xmit_more is supported and is true
2810 	 *	advance packet counter to 4
2811 	 *	reset desc_count to 0
2812 	 *
2813 	 * if desc_count >= 4
2814 	 *	mark RS bit
2815 	 *	reset packet counter
2816 	 * if desc_count > 0
2817 	 *	update tail
2818 	 *
	 * Note: If there are fewer than 4 descriptors
	 * pending and interrupts were disabled, the service task will
	 * trigger a force WB.
2822 	 */
2823 	if (netif_xmit_stopped(txring_txq(tx_ring))) {
2824 		goto do_rs;
2825 	} else if (skb->xmit_more) {
2826 		/* set stride to arm on next packet and reset desc_count */
2827 		tx_ring->packet_stride = WB_STRIDE;
2828 		desc_count = 0;
2829 	} else if (desc_count >= WB_STRIDE) {
2830 do_rs:
2831 		/* write last descriptor with RS bit set */
2832 		td_cmd |= I40E_TX_DESC_CMD_RS;
2833 		tx_ring->packet_stride = 0;
2834 	}
2835 
2836 	tx_desc->cmd_type_offset_bsz =
2837 			build_ctob(td_cmd, td_offset, size, td_tag);
2838 
2839 	/* Force memory writes to complete before letting h/w know there
2840 	 * are new descriptors to fetch.
2841 	 *
2842 	 * We also use this memory barrier to make certain all of the
2843 	 * status bits have been updated before next_to_watch is written.
2844 	 */
2845 	wmb();
2846 
2847 	/* set next_to_watch value indicating a packet is present */
2848 	first->next_to_watch = tx_desc;
2849 
2850 	/* notify HW of packet */
2851 	if (desc_count) {
2852 		writel(i, tx_ring->tail);
2853 
		/* we need this if more than one processor can write to our tail
		 * at a time; it synchronizes IO on IA64/Altix systems
		 */
2857 		mmiowb();
2858 	}
2859 
2860 	return;
2861 
2862 dma_error:
2863 	dev_info(tx_ring->dev, "TX DMA map failed\n");
2864 
2865 	/* clear dma mappings for failed tx_bi map */
2866 	for (;;) {
2867 		tx_bi = &tx_ring->tx_bi[i];
2868 		i40e_unmap_and_free_tx_resource(tx_ring, tx_bi);
2869 		if (tx_bi == first)
2870 			break;
2871 		if (i == 0)
2872 			i = tx_ring->count;
2873 		i--;
2874 	}
2875 
2876 	tx_ring->next_to_use = i;
2877 }
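
/* Illustrative note, not part of the driver: a single buffer larger than
 * I40E_MAX_DATA_PER_TXD is what drives the inner while loop above; it is
 * emitted as several full-size data descriptors, each chunk aligned down to
 * a multiple of I40E_MAX_READ_REQ_SIZE via max_data, followed by one
 * descriptor for the remainder.  All of the chunks share one DMA mapping,
 * whose unmap length was recorded against that buffer's tx_bi up front.
 */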
2878 
2879 /**
2880  * i40e_xmit_frame_ring - Sends buffer on Tx ring
2881  * @skb:     send buffer
2882  * @tx_ring: ring to send buffer on
2883  *
2884  * Returns NETDEV_TX_OK if sent, else an error code
2885  **/
2886 static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
2887 					struct i40e_ring *tx_ring)
2888 {
2889 	u64 cd_type_cmd_tso_mss = I40E_TX_DESC_DTYPE_CONTEXT;
2890 	u32 cd_tunneling = 0, cd_l2tag2 = 0;
2891 	struct i40e_tx_buffer *first;
2892 	u32 td_offset = 0;
2893 	u32 tx_flags = 0;
2894 	__be16 protocol;
2895 	u32 td_cmd = 0;
2896 	u8 hdr_len = 0;
2897 	int tso, count;
2898 	int tsyn;
2899 
2900 	/* prefetch the data, we'll need it later */
2901 	prefetch(skb->data);
2902 
2903 	count = i40e_xmit_descriptor_count(skb);
2904 	if (i40e_chk_linearize(skb, count)) {
2905 		if (__skb_linearize(skb))
2906 			goto out_drop;
2907 		count = i40e_txd_use_count(skb->len);
2908 		tx_ring->tx_stats.tx_linearize++;
2909 	}
2910 
2911 	/* need: 1 descriptor per page * PAGE_SIZE/I40E_MAX_DATA_PER_TXD,
2912 	 *       + 1 desc for skb_head_len/I40E_MAX_DATA_PER_TXD,
2913 	 *       + 4 desc gap to avoid the cache line where head is,
2914 	 *       + 1 desc for context descriptor,
2915 	 * otherwise try next time
2916 	 */
2917 	if (i40e_maybe_stop_tx(tx_ring, count + 4 + 1)) {
2918 		tx_ring->tx_stats.tx_busy++;
2919 		return NETDEV_TX_BUSY;
2920 	}
2921 
2922 	/* prepare the xmit flags */
2923 	if (i40e_tx_prepare_vlan_flags(skb, tx_ring, &tx_flags))
2924 		goto out_drop;
2925 
2926 	/* obtain protocol of skb */
2927 	protocol = vlan_get_protocol(skb);
2928 
2929 	/* record the location of the first descriptor for this packet */
2930 	first = &tx_ring->tx_bi[tx_ring->next_to_use];
2931 
2932 	/* setup IPv4/IPv6 offloads */
2933 	if (protocol == htons(ETH_P_IP))
2934 		tx_flags |= I40E_TX_FLAGS_IPV4;
2935 	else if (protocol == htons(ETH_P_IPV6))
2936 		tx_flags |= I40E_TX_FLAGS_IPV6;
2937 
2938 	tso = i40e_tso(skb, &hdr_len, &cd_type_cmd_tso_mss);
2939 
2940 	if (tso < 0)
2941 		goto out_drop;
2942 	else if (tso)
2943 		tx_flags |= I40E_TX_FLAGS_TSO;
2944 
2945 	/* Always offload the checksum, since it's in the data descriptor */
2946 	tso = i40e_tx_enable_csum(skb, &tx_flags, &td_cmd, &td_offset,
2947 				  tx_ring, &cd_tunneling);
2948 	if (tso < 0)
2949 		goto out_drop;
2950 
2951 	tsyn = i40e_tsyn(tx_ring, skb, tx_flags, &cd_type_cmd_tso_mss);
2952 
2953 	if (tsyn)
2954 		tx_flags |= I40E_TX_FLAGS_TSYN;
2955 
2956 	skb_tx_timestamp(skb);
2957 
2958 	/* always enable CRC insertion offload */
2959 	td_cmd |= I40E_TX_DESC_CMD_ICRC;
2960 
2961 	i40e_create_tx_ctx(tx_ring, cd_type_cmd_tso_mss,
2962 			   cd_tunneling, cd_l2tag2);
2963 
2964 	/* Add Flow Director ATR if it's enabled.
2965 	 *
2966 	 * NOTE: this must always be directly before the data descriptor.
2967 	 */
2968 	i40e_atr(tx_ring, skb, tx_flags);
2969 
2970 	i40e_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
2971 		    td_cmd, td_offset);
2972 
2973 	return NETDEV_TX_OK;
2974 
2975 out_drop:
2976 	dev_kfree_skb_any(skb);
2977 	return NETDEV_TX_OK;
2978 }
2979 
2980 /**
2981  * i40e_lan_xmit_frame - Selects the correct VSI and Tx queue to send buffer
2982  * @skb:    send buffer
2983  * @netdev: network interface device structure
2984  *
2985  * Returns NETDEV_TX_OK if sent, else an error code
2986  **/
2987 netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
2988 {
2989 	struct i40e_netdev_priv *np = netdev_priv(netdev);
2990 	struct i40e_vsi *vsi = np->vsi;
2991 	struct i40e_ring *tx_ring = vsi->tx_rings[skb->queue_mapping];
2992 
	/* hardware can't handle really short frames; hardware padding works
	 * beyond this point
	 */
2996 	if (skb_put_padto(skb, I40E_MIN_TX_LEN))
2997 		return NETDEV_TX_OK;
2998 
2999 	return i40e_xmit_frame_ring(skb, tx_ring);
3000 }
3001