xref: /openbmc/linux/drivers/net/ethernet/chelsio/cxgb3/sge.c (revision eb96b740192b2a09720aaed8a8c132e6a29d5bdb)
1 /*
2  * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 #include <linux/skbuff.h>
33 #include <linux/netdevice.h>
34 #include <linux/etherdevice.h>
35 #include <linux/if_vlan.h>
36 #include <linux/ip.h>
37 #include <linux/tcp.h>
38 #include <linux/dma-mapping.h>
39 #include <linux/slab.h>
40 #include <linux/prefetch.h>
41 #include <net/arp.h>
42 #include "common.h"
43 #include "regs.h"
44 #include "sge_defs.h"
45 #include "t3_cpl.h"
46 #include "firmware_exports.h"
47 #include "cxgb3_offload.h"
48 
/* NOTE(review): not referenced in this part of the file; meaning not evident here. */
#define USE_GTS 0

/*
 * NOTE(review): presumably the size of a "small" Rx buffer; it is not used in
 * this part of the file -- confirm against the free-list setup code.
 */
#define SGE_RX_SM_BUF_SIZE 1536

/*
 * Rx packets of at most SGE_RX_COPY_THRES bytes are copied into a freshly
 * allocated sk_buff so the original Rx buffer can be recycled.  When a larger
 * packet received into page chunks starts a new sk_buff, its first
 * SGE_RX_PULL_LEN bytes are copied into the linear area and the remainder is
 * attached as a page fragment.
 */
#define SGE_RX_COPY_THRES  256
#define SGE_RX_PULL_LEN    128

/*
 * Bytes set aside at the end of each page allocation used for page chunks;
 * the per-page chunk counter (pg_chunk.p_cnt) lives there.
 */
#define SGE_PG_RSVD SMP_CACHE_BYTES
/*
 * Page chunk size for FL0 buffers if FL0 is to be populated with page chunks.
 * It must be a divisor of PAGE_SIZE.  If set to 0 FL0 will use sk_buffs
 * directly.
 */
#define FL0_PG_CHUNK_SIZE  2048
#define FL0_PG_ORDER 0
#define FL0_PG_ALLOC_SIZE (PAGE_SIZE << FL0_PG_ORDER)
/*
 * FL1 counterparts: chunk size, page allocation order and the resulting
 * allocation size, chosen according to PAGE_SIZE.
 */
#define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192)
#define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1)
#define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER)

/*
 * NOTE(review): presumably the drop threshold handed to get_packet() /
 * get_packet_pg() and the period of the Rx reclaim timer; neither is used in
 * this part of the file -- confirm against the callers.
 */
#define SGE_RX_DROP_THRES 16
#define RX_RECLAIM_PERIOD (HZ/4)

/*
 * Max number of Rx buffers we replenish at a time.
 */
#define MAX_RX_REFILL 16U
/*
 * Period of the Tx buffer reclaim timer.  This timer does not need to run
 * frequently as Tx buffers are usually reclaimed by new Tx packets.
 */
#define TX_RECLAIM_PERIOD (HZ / 4)
/*
 * NOTE(review): presumably the maximum number of Tx descriptors reclaimed per
 * timer run and per transmit call respectively (the @chunk argument of
 * reclaim_completed_tx()); the call sites are outside this part of the file.
 */
#define TX_RECLAIM_TIMER_CHUNK 64U
#define TX_RECLAIM_CHUNK 16U

/* WR size in bytes */
#define WR_LEN (WR_FLITS * 8)
86 
/*
 * Types of Tx queues in each queue set.  Order here matters, do not change.
 * The values are used as indices into sge_qset.txq[].
 */
enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };

/* Values for sge_txq.flags (bit flags, may be OR-ed together) */
enum {
	TXQ_RUNNING = 1 << 0,	/* fetch engine is running */
	TXQ_LAST_PKT_DB = 1 << 1,	/* last packet rang the doorbell */
};
97 
/* HW Tx descriptor: TX_DESC_FLITS big-endian 64-bit words ("flits"). */
struct tx_desc {
	__be64 flit[TX_DESC_FLITS];
};
101 
/*
 * HW free-list (Rx buffer) descriptor.  All fields are big endian.  The
 * buffer's DMA address is split across addr_lo/addr_hi; len_gen and gen2
 * carry the generation value (V_FLD_GEN1/V_FLD_GEN2) and are written only
 * after the address words.
 */
struct rx_desc {
	__be32 addr_lo;
	__be32 len_gen;
	__be32 gen2;
	__be32 addr_hi;
};
108 
/*
 * addr_idx, fragidx and sflit record where unmap_skb() must resume when a
 * packet's SGL spans several Tx descriptors.
 */
struct tx_sw_desc {		/* SW state per Tx descriptor */
	struct sk_buff *skb;
	u8 eop;       /* set if last descriptor for packet */
	u8 addr_idx;  /* buffer index of first SGL entry in descriptor */
	u8 fragidx;   /* first page fragment associated with descriptor */
	s8 sflit;     /* start flit of first SGL entry in descriptor */
};
116 
/*
 * The union member in use depends on the owning free list: pg_chunk when the
 * list is populated with page chunks (sge_fl.use_pages), skb otherwise.
 * dma_addr holds the buffer's DMA address for later sync/unmap.
 */
struct rx_sw_desc {                /* SW state per Rx descriptor */
	union {
		struct sk_buff *skb;
		struct fl_pg_chunk pg_chunk;
	};
	DEFINE_DMA_UNMAP_ADDR(dma_addr);
};
124 
/*
 * The "immediate" group (imm_data plus intr_gen) is what get_imm_packet()
 * copies out as packet data; its size must equal IMMED_PKT_SIZE.
 */
struct rsp_desc {		/* response queue descriptor */
	struct rss_header rss_hdr;
	__be32 flags;
	__be32 len_cq;
	struct_group(immediate,
		u8 imm_data[47];
		u8 intr_gen;
	);
};
134 
/*
 * Holds unmapping information for Tx packets that need deferred unmapping.
 * This structure lives at skb->head and must be allocated by callers.
 *
 * @pdev: the PCI device the buffers were mapped for
 * @addr: one slot per possible buffer of a packet, i.e. every page fragment
 *        plus one more (NOTE(review): presumably the linear data -- the code
 *        that fills this array is outside this part of the file)
 */
struct deferred_unmap_info {
	struct pci_dev *pdev;
	dma_addr_t addr[MAX_SKB_FRAGS + 1];
};
143 
144 /*
145  * Maps a number of flits to the number of Tx descriptors that can hold them.
146  * The formula is
147  *
148  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
149  *
150  * HW allows up to 4 descriptors to be combined into a WR.
151  */
152 static u8 flit_desc_map[] = {
153 	0,
154 #if SGE_NUM_GENBITS == 1
155 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
157 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
158 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
159 #elif SGE_NUM_GENBITS == 2
160 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
162 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
163 	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
164 #else
165 # error "SGE_NUM_GENBITS must be 1 or 2"
166 #endif
167 };
168 
169 static inline struct sge_qset *fl_to_qset(const struct sge_fl *q, int qidx)
170 {
171 	return container_of(q, struct sge_qset, fl[qidx]);
172 }
173 
174 static inline struct sge_qset *rspq_to_qset(const struct sge_rspq *q)
175 {
176 	return container_of(q, struct sge_qset, rspq);
177 }
178 
179 static inline struct sge_qset *txq_to_qset(const struct sge_txq *q, int qidx)
180 {
181 	return container_of(q, struct sge_qset, txq[qidx]);
182 }
183 
184 /**
185  *	refill_rspq - replenish an SGE response queue
186  *	@adapter: the adapter
187  *	@q: the response queue to replenish
188  *	@credits: how many new responses to make available
189  *
190  *	Replenishes a response queue by making the supplied number of responses
191  *	available to HW.
192  */
193 static inline void refill_rspq(struct adapter *adapter,
194 			       const struct sge_rspq *q, unsigned int credits)
195 {
196 	rmb();
197 	t3_write_reg(adapter, A_SG_RSPQ_CREDIT_RETURN,
198 		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
199 }
200 
/**
 *	need_skb_unmap - does the platform need unmapping of sk_buffs?
 *
 *	Returns 1 when CONFIG_NEED_DMA_MAP_STATE is defined and 0 otherwise.
 *	The result is a compile-time constant, so the compiler can discard
 *	the unmapping code in callers when this returns 0.
 */
static inline int need_skb_unmap(void)
{
	int needed = 0;

#ifdef CONFIG_NEED_DMA_MAP_STATE
	needed = 1;
#endif
	return needed;
}
215 
/**
 *	unmap_skb - unmap a packet main body and its page fragments
 *	@skb: the packet
 *	@q: the Tx queue containing Tx descriptors for the packet
 *	@cidx: index of Tx descriptor
 *	@pdev: the PCI device
 *
 *	Unmap the main body of an sk_buff and its page fragments, if any.
 *	Because of the fairly complicated structure of our SGLs and the desire
 *	to conserve space for metadata, the information necessary to unmap an
 *	sk_buff is spread across the sk_buff itself (buffer lengths), the HW Tx
 *	descriptors (the physical addresses of the various data buffers), and
 *	the SW descriptor state (assorted indices).  The send functions
 *	initialize the indices for the first packet descriptor so we can unmap
 *	the buffers held in the first Tx descriptor here, and we have enough
 *	information at this point to set the state for the next Tx descriptor.
 *
 *	Note that it is possible to clean up the first descriptor of a packet
 *	before the send routines have written the next descriptors, but this
 *	race does not cause any problem.  We just end up writing the unmapping
 *	info for the descriptor first.
 *
 *	Only the buffers whose SGL entries lie in descriptor @cidx are
 *	unmapped by one call; the function is called once per descriptor of
 *	the packet.
 */
static inline void unmap_skb(struct sk_buff *skb, struct sge_txq *q,
			     unsigned int cidx, struct pci_dev *pdev)
{
	const struct sg_ent *sgp;
	struct tx_sw_desc *d = &q->sdesc[cidx];
	/* j selects which of the two address slots of the current sg_ent is next */
	int nfrags, frag_idx, curflit, j = d->addr_idx;

	/* the SGL part of this descriptor starts at flit d->sflit */
	sgp = (struct sg_ent *)&q->desc[cidx].flit[d->sflit];
	frag_idx = d->fragidx;

	/*
	 * Only the packet's first descriptor (no fragment handled yet) can
	 * carry the linear data, which then occupies address slot 0.
	 */
	if (frag_idx == 0 && skb_headlen(skb)) {
		dma_unmap_single(&pdev->dev, be64_to_cpu(sgp->addr[0]),
				 skb_headlen(skb), DMA_TO_DEVICE);
		j = 1;
	}

	/* flit position used to detect the end of this descriptor */
	curflit = d->sflit + 1 + j;
	nfrags = skb_shinfo(skb)->nr_frags;

	/* unmap fragments until they run out or the descriptor is exhausted */
	while (frag_idx < nfrags && curflit < WR_FLITS) {
		dma_unmap_page(&pdev->dev, be64_to_cpu(sgp->addr[j]),
			       skb_frag_size(&skb_shinfo(skb)->frags[frag_idx]),
			       DMA_TO_DEVICE);
		j ^= 1;
		if (j == 0) {
			/* both slots consumed: move on to the next sg_ent */
			sgp++;
			curflit++;
		}
		curflit++;
		frag_idx++;
	}

	if (frag_idx < nfrags) {   /* SGL continues into next Tx descriptor */
		/* next SW descriptor, wrapping at the end of the ring */
		d = cidx + 1 == q->size ? q->sdesc : d + 1;
		d->fragidx = frag_idx;
		d->addr_idx = j;
		d->sflit = curflit - WR_FLITS - j; /* sflit can be -1 */
	}
}
277 
278 /**
279  *	free_tx_desc - reclaims Tx descriptors and their buffers
280  *	@adapter: the adapter
281  *	@q: the Tx queue to reclaim descriptors from
282  *	@n: the number of descriptors to reclaim
283  *
284  *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
285  *	Tx buffers.  Called with the Tx queue lock held.
286  */
287 static void free_tx_desc(struct adapter *adapter, struct sge_txq *q,
288 			 unsigned int n)
289 {
290 	struct tx_sw_desc *d;
291 	struct pci_dev *pdev = adapter->pdev;
292 	unsigned int cidx = q->cidx;
293 
294 	const int need_unmap = need_skb_unmap() &&
295 			       q->cntxt_id >= FW_TUNNEL_SGEEC_START;
296 
297 	d = &q->sdesc[cidx];
298 	while (n--) {
299 		if (d->skb) {	/* an SGL is present */
300 			if (need_unmap)
301 				unmap_skb(d->skb, q, cidx, pdev);
302 			if (d->eop) {
303 				dev_consume_skb_any(d->skb);
304 				d->skb = NULL;
305 			}
306 		}
307 		++d;
308 		if (++cidx == q->size) {
309 			cidx = 0;
310 			d = q->sdesc;
311 		}
312 	}
313 	q->cidx = cidx;
314 }
315 
316 /**
317  *	reclaim_completed_tx - reclaims completed Tx descriptors
318  *	@adapter: the adapter
319  *	@q: the Tx queue to reclaim completed descriptors from
320  *	@chunk: maximum number of descriptors to reclaim
321  *
322  *	Reclaims Tx descriptors that the SGE has indicated it has processed,
323  *	and frees the associated buffers if possible.  Called with the Tx
324  *	queue's lock held.
325  */
326 static inline unsigned int reclaim_completed_tx(struct adapter *adapter,
327 						struct sge_txq *q,
328 						unsigned int chunk)
329 {
330 	unsigned int reclaim = q->processed - q->cleaned;
331 
332 	reclaim = min(chunk, reclaim);
333 	if (reclaim) {
334 		free_tx_desc(adapter, q, reclaim);
335 		q->cleaned += reclaim;
336 		q->in_use -= reclaim;
337 	}
338 	return q->processed - q->cleaned;
339 }
340 
341 /**
342  *	should_restart_tx - are there enough resources to restart a Tx queue?
343  *	@q: the Tx queue
344  *
345  *	Checks if there are enough descriptors to restart a suspended Tx queue.
346  */
347 static inline int should_restart_tx(const struct sge_txq *q)
348 {
349 	unsigned int r = q->processed - q->cleaned;
350 
351 	return q->in_use - r < (q->size >> 1);
352 }
353 
354 static void clear_rx_desc(struct pci_dev *pdev, const struct sge_fl *q,
355 			  struct rx_sw_desc *d)
356 {
357 	if (q->use_pages && d->pg_chunk.page) {
358 		(*d->pg_chunk.p_cnt)--;
359 		if (!*d->pg_chunk.p_cnt)
360 			dma_unmap_page(&pdev->dev, d->pg_chunk.mapping,
361 				       q->alloc_size, DMA_FROM_DEVICE);
362 
363 		put_page(d->pg_chunk.page);
364 		d->pg_chunk.page = NULL;
365 	} else {
366 		dma_unmap_single(&pdev->dev, dma_unmap_addr(d, dma_addr),
367 				 q->buf_size, DMA_FROM_DEVICE);
368 		kfree_skb(d->skb);
369 		d->skb = NULL;
370 	}
371 }
372 
373 /**
374  *	free_rx_bufs - free the Rx buffers on an SGE free list
375  *	@pdev: the PCI device associated with the adapter
376  *	@q: the SGE free list to clean up
377  *
378  *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
379  *	this queue should be stopped before calling this function.
380  */
381 static void free_rx_bufs(struct pci_dev *pdev, struct sge_fl *q)
382 {
383 	unsigned int cidx = q->cidx;
384 
385 	while (q->credits--) {
386 		struct rx_sw_desc *d = &q->sdesc[cidx];
387 
388 
389 		clear_rx_desc(pdev, q, d);
390 		if (++cidx == q->size)
391 			cidx = 0;
392 	}
393 
394 	if (q->pg_chunk.page) {
395 		__free_pages(q->pg_chunk.page, q->order);
396 		q->pg_chunk.page = NULL;
397 	}
398 }
399 
400 /**
401  *	add_one_rx_buf - add a packet buffer to a free-buffer list
402  *	@va:  buffer start VA
403  *	@len: the buffer length
404  *	@d: the HW Rx descriptor to write
405  *	@sd: the SW Rx descriptor to write
406  *	@gen: the generation bit value
407  *	@pdev: the PCI device associated with the adapter
408  *
409  *	Add a buffer of the given length to the supplied HW and SW Rx
410  *	descriptors.
411  */
412 static inline int add_one_rx_buf(void *va, unsigned int len,
413 				 struct rx_desc *d, struct rx_sw_desc *sd,
414 				 unsigned int gen, struct pci_dev *pdev)
415 {
416 	dma_addr_t mapping;
417 
418 	mapping = dma_map_single(&pdev->dev, va, len, DMA_FROM_DEVICE);
419 	if (unlikely(dma_mapping_error(&pdev->dev, mapping)))
420 		return -ENOMEM;
421 
422 	dma_unmap_addr_set(sd, dma_addr, mapping);
423 
424 	d->addr_lo = cpu_to_be32(mapping);
425 	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
426 	dma_wmb();
427 	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
428 	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
429 	return 0;
430 }
431 
432 static inline int add_one_rx_chunk(dma_addr_t mapping, struct rx_desc *d,
433 				   unsigned int gen)
434 {
435 	d->addr_lo = cpu_to_be32(mapping);
436 	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
437 	dma_wmb();
438 	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
439 	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
440 	return 0;
441 }
442 
/*
 * Hand the next page chunk of free list @q to SW descriptor @sd.
 *
 * If the list has no partially used page cached in q->pg_chunk, a new page
 * allocation of the given @order is obtained with @gfp and DMA-mapped as a
 * whole.  The current chunk description is then copied into @sd and the
 * list's cached chunk is advanced by one buffer.  The per-page chunk counter
 * stored at the end of the page is set to 1 for the first chunk of a page
 * and incremented for every further chunk.
 *
 * Returns 0 on success, -ENOMEM if the page allocation fails, or -EIO if the
 * DMA mapping fails.
 */
static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q,
			  struct rx_sw_desc *sd, gfp_t gfp,
			  unsigned int order)
{
	if (!q->pg_chunk.page) {
		dma_addr_t mapping;

		q->pg_chunk.page = alloc_pages(gfp, order);
		if (unlikely(!q->pg_chunk.page))
			return -ENOMEM;
		q->pg_chunk.va = page_address(q->pg_chunk.page);
		/* the chunk counter lives in the reserved tail of the allocation */
		q->pg_chunk.p_cnt = q->pg_chunk.va + (PAGE_SIZE << order) -
				    SGE_PG_RSVD;
		q->pg_chunk.offset = 0;
		mapping = dma_map_page(&adapter->pdev->dev, q->pg_chunk.page,
				       0, q->alloc_size, DMA_FROM_DEVICE);
		if (unlikely(dma_mapping_error(&adapter->pdev->dev, mapping))) {
			__free_pages(q->pg_chunk.page, order);
			q->pg_chunk.page = NULL;
			return -EIO;
		}
		q->pg_chunk.mapping = mapping;
	}
	/* @sd receives the chunk as it is before the list's copy is advanced */
	sd->pg_chunk = q->pg_chunk;

	prefetch(sd->pg_chunk.p_cnt);

	q->pg_chunk.offset += q->buf_size;
	if (q->pg_chunk.offset == (PAGE_SIZE << order))
		/* page fully handed out; a new one is allocated on the next call */
		q->pg_chunk.page = NULL;
	else {
		q->pg_chunk.va += q->buf_size;
		/* one more page reference for the chunk that remains cached */
		get_page(q->pg_chunk.page);
	}

	if (sd->pg_chunk.offset == 0)
		*sd->pg_chunk.p_cnt = 1;
	else
		*sd->pg_chunk.p_cnt += 1;

	return 0;
}
485 
486 static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
487 {
488 	if (q->pend_cred >= q->credits / 4) {
489 		q->pend_cred = 0;
490 		wmb();
491 		t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
492 	}
493 }
494 
495 /**
496  *	refill_fl - refill an SGE free-buffer list
497  *	@adap: the adapter
498  *	@q: the free-list to refill
499  *	@n: the number of new buffers to allocate
500  *	@gfp: the gfp flags for allocating new buffers
501  *
502  *	(Re)populate an SGE free-buffer list with up to @n new packet buffers,
503  *	allocated with the supplied gfp flags.  The caller must assure that
504  *	@n does not exceed the queue's capacity.
505  */
506 static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp)
507 {
508 	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
509 	struct rx_desc *d = &q->desc[q->pidx];
510 	unsigned int count = 0;
511 
512 	while (n--) {
513 		dma_addr_t mapping;
514 		int err;
515 
516 		if (q->use_pages) {
517 			if (unlikely(alloc_pg_chunk(adap, q, sd, gfp,
518 						    q->order))) {
519 nomem:				q->alloc_failed++;
520 				break;
521 			}
522 			mapping = sd->pg_chunk.mapping + sd->pg_chunk.offset;
523 			dma_unmap_addr_set(sd, dma_addr, mapping);
524 
525 			add_one_rx_chunk(mapping, d, q->gen);
526 			dma_sync_single_for_device(&adap->pdev->dev, mapping,
527 						   q->buf_size - SGE_PG_RSVD,
528 						   DMA_FROM_DEVICE);
529 		} else {
530 			void *buf_start;
531 
532 			struct sk_buff *skb = alloc_skb(q->buf_size, gfp);
533 			if (!skb)
534 				goto nomem;
535 
536 			sd->skb = skb;
537 			buf_start = skb->data;
538 			err = add_one_rx_buf(buf_start, q->buf_size, d, sd,
539 					     q->gen, adap->pdev);
540 			if (unlikely(err)) {
541 				clear_rx_desc(adap->pdev, q, sd);
542 				break;
543 			}
544 		}
545 
546 		d++;
547 		sd++;
548 		if (++q->pidx == q->size) {
549 			q->pidx = 0;
550 			q->gen ^= 1;
551 			sd = q->sdesc;
552 			d = q->desc;
553 		}
554 		count++;
555 	}
556 
557 	q->credits += count;
558 	q->pend_cred += count;
559 	ring_fl_db(adap, q);
560 
561 	return count;
562 }
563 
564 static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
565 {
566 	refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits),
567 		  GFP_ATOMIC | __GFP_COMP);
568 }
569 
570 /**
571  *	recycle_rx_buf - recycle a receive buffer
572  *	@adap: the adapter
573  *	@q: the SGE free list
574  *	@idx: index of buffer to recycle
575  *
576  *	Recycles the specified buffer on the given free list by adding it at
577  *	the next available slot on the list.
578  */
579 static void recycle_rx_buf(struct adapter *adap, struct sge_fl *q,
580 			   unsigned int idx)
581 {
582 	struct rx_desc *from = &q->desc[idx];
583 	struct rx_desc *to = &q->desc[q->pidx];
584 
585 	q->sdesc[q->pidx] = q->sdesc[idx];
586 	to->addr_lo = from->addr_lo;	/* already big endian */
587 	to->addr_hi = from->addr_hi;	/* likewise */
588 	dma_wmb();
589 	to->len_gen = cpu_to_be32(V_FLD_GEN1(q->gen));
590 	to->gen2 = cpu_to_be32(V_FLD_GEN2(q->gen));
591 
592 	if (++q->pidx == q->size) {
593 		q->pidx = 0;
594 		q->gen ^= 1;
595 	}
596 
597 	q->credits++;
598 	q->pend_cred++;
599 	ring_fl_db(adap, q);
600 }
601 
602 /**
603  *	alloc_ring - allocate resources for an SGE descriptor ring
604  *	@pdev: the PCI device
605  *	@nelem: the number of descriptors
606  *	@elem_size: the size of each descriptor
607  *	@sw_size: the size of the SW state associated with each ring element
608  *	@phys: the physical address of the allocated ring
609  *	@metadata: address of the array holding the SW state for the ring
610  *
611  *	Allocates resources for an SGE descriptor ring, such as Tx queues,
612  *	free buffer lists, or response queues.  Each SGE ring requires
613  *	space for its HW descriptors plus, optionally, space for the SW state
614  *	associated with each HW entry (the metadata).  The function returns
615  *	three values: the virtual address for the HW ring (the return value
616  *	of the function), the physical address of the HW ring, and the address
617  *	of the SW ring.
618  */
619 static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size,
620 			size_t sw_size, dma_addr_t * phys, void *metadata)
621 {
622 	size_t len = nelem * elem_size;
623 	void *s = NULL;
624 	void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL);
625 
626 	if (!p)
627 		return NULL;
628 	if (sw_size && metadata) {
629 		s = kcalloc(nelem, sw_size, GFP_KERNEL);
630 
631 		if (!s) {
632 			dma_free_coherent(&pdev->dev, len, p, *phys);
633 			return NULL;
634 		}
635 		*(void **)metadata = s;
636 	}
637 	return p;
638 }
639 
640 /**
641  *	t3_reset_qset - reset a sge qset
642  *	@q: the queue set
643  *
644  *	Reset the qset structure.
645  *	the NAPI structure is preserved in the event of
646  *	the qset's reincarnation, for example during EEH recovery.
647  */
648 static void t3_reset_qset(struct sge_qset *q)
649 {
650 	if (q->adap &&
651 	    !(q->adap->flags & NAPI_INIT)) {
652 		memset(q, 0, sizeof(*q));
653 		return;
654 	}
655 
656 	q->adap = NULL;
657 	memset(&q->rspq, 0, sizeof(q->rspq));
658 	memset(q->fl, 0, sizeof(struct sge_fl) * SGE_RXQ_PER_SET);
659 	memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
660 	q->txq_stopped = 0;
661 	q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
662 	q->rx_reclaim_timer.function = NULL;
663 	q->nomem = 0;
664 	napi_free_frags(&q->napi);
665 }
666 
667 
668 /**
669  *	t3_free_qset - free the resources of an SGE queue set
670  *	@adapter: the adapter owning the queue set
671  *	@q: the queue set
672  *
673  *	Release the HW and SW resources associated with an SGE queue set, such
674  *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
675  *	queue set must be quiesced prior to calling this.
676  */
677 static void t3_free_qset(struct adapter *adapter, struct sge_qset *q)
678 {
679 	int i;
680 	struct pci_dev *pdev = adapter->pdev;
681 
682 	for (i = 0; i < SGE_RXQ_PER_SET; ++i)
683 		if (q->fl[i].desc) {
684 			spin_lock_irq(&adapter->sge.reg_lock);
685 			t3_sge_disable_fl(adapter, q->fl[i].cntxt_id);
686 			spin_unlock_irq(&adapter->sge.reg_lock);
687 			free_rx_bufs(pdev, &q->fl[i]);
688 			kfree(q->fl[i].sdesc);
689 			dma_free_coherent(&pdev->dev,
690 					  q->fl[i].size *
691 					  sizeof(struct rx_desc), q->fl[i].desc,
692 					  q->fl[i].phys_addr);
693 		}
694 
695 	for (i = 0; i < SGE_TXQ_PER_SET; ++i)
696 		if (q->txq[i].desc) {
697 			spin_lock_irq(&adapter->sge.reg_lock);
698 			t3_sge_enable_ecntxt(adapter, q->txq[i].cntxt_id, 0);
699 			spin_unlock_irq(&adapter->sge.reg_lock);
700 			if (q->txq[i].sdesc) {
701 				free_tx_desc(adapter, &q->txq[i],
702 					     q->txq[i].in_use);
703 				kfree(q->txq[i].sdesc);
704 			}
705 			dma_free_coherent(&pdev->dev,
706 					  q->txq[i].size *
707 					  sizeof(struct tx_desc),
708 					  q->txq[i].desc, q->txq[i].phys_addr);
709 			__skb_queue_purge(&q->txq[i].sendq);
710 		}
711 
712 	if (q->rspq.desc) {
713 		spin_lock_irq(&adapter->sge.reg_lock);
714 		t3_sge_disable_rspcntxt(adapter, q->rspq.cntxt_id);
715 		spin_unlock_irq(&adapter->sge.reg_lock);
716 		dma_free_coherent(&pdev->dev,
717 				  q->rspq.size * sizeof(struct rsp_desc),
718 				  q->rspq.desc, q->rspq.phys_addr);
719 	}
720 
721 	t3_reset_qset(q);
722 }
723 
724 /**
725  *	init_qset_cntxt - initialize an SGE queue set context info
726  *	@qs: the queue set
727  *	@id: the queue set id
728  *
729  *	Initializes the TIDs and context ids for the queues of a queue set.
730  */
731 static void init_qset_cntxt(struct sge_qset *qs, unsigned int id)
732 {
733 	qs->rspq.cntxt_id = id;
734 	qs->fl[0].cntxt_id = 2 * id;
735 	qs->fl[1].cntxt_id = 2 * id + 1;
736 	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
737 	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
738 	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
739 	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
740 	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
741 }
742 
/**
 *	sgl_len - calculates the size of an SGL of the given capacity
 *	@n: the number of SGL entries
 *
 *	Returns the number of flits needed for a scatter/gather list with
 *	@n entries: every pair of entries takes 3 flits and a trailing odd
 *	entry takes 2.
 */
static inline unsigned int sgl_len(unsigned int n)
{
	unsigned int flits = (3 * n) / 2;

	/* an odd entry count needs one flit more than the rounded-down value */
	if (n & 1)
		flits++;
	return flits;
}
755 
756 /**
757  *	flits_to_desc - returns the num of Tx descriptors for the given flits
758  *	@n: the number of flits
759  *
760  *	Calculates the number of Tx descriptors needed for the supplied number
761  *	of flits.
762  */
763 static inline unsigned int flits_to_desc(unsigned int n)
764 {
765 	BUG_ON(n >= ARRAY_SIZE(flit_desc_map));
766 	return flit_desc_map[n];
767 }
768 
/**
 *	get_packet - return the next ingress packet buffer from a free list
 *	@adap: the adapter that received the packet
 *	@fl: the SGE free list holding the packet
 *	@len: the packet length including any SGE padding
 *	@drop_thres: # of remaining buffers before we start dropping packets
 *
 *	Get the next packet from a free list and complete setup of the
 *	sk_buff.  If the packet is small we make a copy and recycle the
 *	original buffer, otherwise we use the original buffer itself.  If a
 *	positive drop threshold is supplied packets are dropped and their
 *	buffers recycled if (a) the number of remaining buffers is under the
 *	threshold and the packet is too big to copy, or (b) the packet should
 *	be copied but there is no memory for the copy.
 *
 *	Returns the sk_buff holding the packet, or NULL if the packet was
 *	dropped.
 */
static struct sk_buff *get_packet(struct adapter *adap, struct sge_fl *fl,
				  unsigned int len, unsigned int drop_thres)
{
	struct sk_buff *skb = NULL;
	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];

	prefetch(sd->skb->data);
	/* the buffer at cidx is consumed; recycling below re-adds a credit */
	fl->credits--;

	if (len <= SGE_RX_COPY_THRES) {
		skb = alloc_skb(len, GFP_ATOMIC);
		if (likely(skb != NULL)) {
			__skb_put(skb, len);
			/* the buffer stays mapped: sync for the copy, then give it back */
			dma_sync_single_for_cpu(&adap->pdev->dev,
						dma_unmap_addr(sd, dma_addr),
						len, DMA_FROM_DEVICE);
			memcpy(skb->data, sd->skb->data, len);
			dma_sync_single_for_device(&adap->pdev->dev,
						   dma_unmap_addr(sd, dma_addr),
						   len, DMA_FROM_DEVICE);
		} else if (!drop_thres)
			/* no memory for a copy and dropping not allowed */
			goto use_orig_buf;
recycle:
		/* skb is the copy, or NULL if the packet is being dropped */
		recycle_rx_buf(adap, fl, fl->cidx);
		return skb;
	}

	/*
	 * Running low on buffers: if no new buffer can be added, drop the
	 * packet (skb is still NULL here) and recycle its buffer.
	 */
	if (unlikely(fl->credits < drop_thres) &&
	    refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits - 1),
		      GFP_ATOMIC | __GFP_COMP) == 0)
		goto recycle;

use_orig_buf:
	/* pass the original buffer up the stack and replace it on the list */
	dma_unmap_single(&adap->pdev->dev, dma_unmap_addr(sd, dma_addr),
			 fl->buf_size, DMA_FROM_DEVICE);
	skb = sd->skb;
	skb_put(skb, len);
	__refill_fl(adap, fl);
	return skb;
}
824 
/**
 *	get_packet_pg - return the next ingress packet buffer from a free list
 *	@adap: the adapter that received the packet
 *	@fl: the SGE free list holding the packet
 *	@q: the queue
 *	@len: the packet length including any SGE padding
 *	@drop_thres: # of remaining buffers before we start dropping packets
 *
 *	Get the next packet from a free list populated with page chunks.
 *	If the packet is small we make a copy and recycle the original buffer,
 *	otherwise we attach the original buffer as a page fragment to a fresh
 *	sk_buff.  If a positive drop threshold is supplied packets are dropped
 *	and their buffers recycled if (a) the number of remaining buffers is
 *	under the threshold and the packet is too big to copy, or (b) there's
 *	no system memory.
 *
 *	If @q->pg_skb is set, the chunk is appended to that sk_buff as a
 *	further page fragment instead of starting a new sk_buff.
 *
 *	Returns the sk_buff the data was added to, or NULL if no sk_buff
 *	could be allocated.  When NULL is returned with a zero @drop_thres
 *	the buffer has not been consumed.
 *
 * 	Note: this function is similar to @get_packet but deals with Rx buffers
 * 	that are page chunks rather than sk_buffs.
 */
static struct sk_buff *get_packet_pg(struct adapter *adap, struct sge_fl *fl,
				     struct sge_rspq *q, unsigned int len,
				     unsigned int drop_thres)
{
	struct sk_buff *newskb, *skb;
	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];

	dma_addr_t dma_addr = dma_unmap_addr(sd, dma_addr);

	/* skb is non-NULL when an sk_buff is already being assembled */
	newskb = skb = q->pg_skb;
	if (!skb && (len <= SGE_RX_COPY_THRES)) {
		newskb = alloc_skb(len, GFP_ATOMIC);
		if (likely(newskb != NULL)) {
			__skb_put(newskb, len);
			/* the chunk stays mapped: sync for the copy, then give it back */
			dma_sync_single_for_cpu(&adap->pdev->dev, dma_addr,
						len, DMA_FROM_DEVICE);
			memcpy(newskb->data, sd->pg_chunk.va, len);
			dma_sync_single_for_device(&adap->pdev->dev, dma_addr,
						   len, DMA_FROM_DEVICE);
		} else if (!drop_thres)
			return NULL;
recycle:
		fl->credits--;
		recycle_rx_buf(adap, fl, fl->cidx);
		q->rx_recycle_buf++;
		return newskb;
	}

	/*
	 * While q->rx_recycle_buf is non-zero every buffer is recycled.
	 * NOTE(review): presumably the caller clears it once the packet
	 * ends -- that code is outside this part of the file.
	 */
	if (unlikely(q->rx_recycle_buf || (!skb && fl->credits <= drop_thres)))
		goto recycle;

	prefetch(sd->pg_chunk.p_cnt);

	if (!skb)
		newskb = alloc_skb(SGE_RX_PULL_LEN, GFP_ATOMIC);

	if (unlikely(!newskb)) {
		if (!drop_thres)
			return NULL;
		goto recycle;
	}

	dma_sync_single_for_cpu(&adap->pdev->dev, dma_addr, len,
				DMA_FROM_DEVICE);
	/*
	 * This chunk leaves the free list.  Unmap the page once no chunk of
	 * it remains on the list, unless the list is still carving new
	 * chunks out of it.
	 */
	(*sd->pg_chunk.p_cnt)--;
	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
		dma_unmap_page(&adap->pdev->dev, sd->pg_chunk.mapping,
			       fl->alloc_size, DMA_FROM_DEVICE);
	if (!skb) {
		/* new sk_buff: copy the first bytes, attach the rest as a fragment */
		__skb_put(newskb, SGE_RX_PULL_LEN);
		memcpy(newskb->data, sd->pg_chunk.va, SGE_RX_PULL_LEN);
		skb_fill_page_desc(newskb, 0, sd->pg_chunk.page,
				   sd->pg_chunk.offset + SGE_RX_PULL_LEN,
				   len - SGE_RX_PULL_LEN);
		newskb->len = len;
		newskb->data_len = len - SGE_RX_PULL_LEN;
		newskb->truesize += newskb->data_len;
	} else {
		/* continuation: append the whole chunk as one more fragment */
		skb_fill_page_desc(newskb, skb_shinfo(newskb)->nr_frags,
				   sd->pg_chunk.page,
				   sd->pg_chunk.offset, len);
		newskb->len += len;
		newskb->data_len += len;
		newskb->truesize += len;
	}

	fl->credits--;
	/*
	 * We do not refill FLs here, we let the caller do it to overlap a
	 * prefetch.
	 */
	return newskb;
}
917 
918 /**
919  *	get_imm_packet - return the next ingress packet buffer from a response
920  *	@resp: the response descriptor containing the packet data
921  *
922  *	Return a packet containing the immediate data of the given response.
923  */
924 static inline struct sk_buff *get_imm_packet(const struct rsp_desc *resp)
925 {
926 	struct sk_buff *skb = alloc_skb(IMMED_PKT_SIZE, GFP_ATOMIC);
927 
928 	if (skb) {
929 		__skb_put(skb, IMMED_PKT_SIZE);
930 		BUILD_BUG_ON(IMMED_PKT_SIZE != sizeof(resp->immediate));
931 		skb_copy_to_linear_data(skb, &resp->immediate, IMMED_PKT_SIZE);
932 	}
933 	return skb;
934 }
935 
936 /**
937  *	calc_tx_descs - calculate the number of Tx descriptors for a packet
938  *	@skb: the packet
939  *
940  * 	Returns the number of Tx descriptors needed for the given Ethernet
941  * 	packet.  Ethernet packets require addition of WR and CPL headers.
942  */
943 static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
944 {
945 	unsigned int flits;
946 
947 	if (skb->len <= WR_LEN - sizeof(struct cpl_tx_pkt))
948 		return 1;
949 
950 	flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 2;
951 	if (skb_shinfo(skb)->gso_size)
952 		flits++;
953 	return flits_to_desc(flits);
954 }
955 
956 /*	map_skb - map a packet main body and its page fragments
957  *	@pdev: the PCI device
958  *	@skb: the packet
959  *	@addr: placeholder to save the mapped addresses
960  *
961  *	map the main body of an sk_buff and its page fragments, if any.
962  */
963 static int map_skb(struct pci_dev *pdev, const struct sk_buff *skb,
964 		   dma_addr_t *addr)
965 {
966 	const skb_frag_t *fp, *end;
967 	const struct skb_shared_info *si;
968 
969 	if (skb_headlen(skb)) {
970 		*addr = dma_map_single(&pdev->dev, skb->data,
971 				       skb_headlen(skb), DMA_TO_DEVICE);
972 		if (dma_mapping_error(&pdev->dev, *addr))
973 			goto out_err;
974 		addr++;
975 	}
976 
977 	si = skb_shinfo(skb);
978 	end = &si->frags[si->nr_frags];
979 
980 	for (fp = si->frags; fp < end; fp++) {
981 		*addr = skb_frag_dma_map(&pdev->dev, fp, 0, skb_frag_size(fp),
982 					 DMA_TO_DEVICE);
983 		if (dma_mapping_error(&pdev->dev, *addr))
984 			goto unwind;
985 		addr++;
986 	}
987 	return 0;
988 
989 unwind:
990 	while (fp-- > si->frags)
991 		dma_unmap_page(&pdev->dev, *--addr, skb_frag_size(fp),
992 			       DMA_TO_DEVICE);
993 
994 	dma_unmap_single(&pdev->dev, addr[-1], skb_headlen(skb),
995 			 DMA_TO_DEVICE);
996 out_err:
997 	return -ENOMEM;
998 }
999 
1000 /**
1001  *	write_sgl - populate a scatter/gather list for a packet
1002  *	@skb: the packet
1003  *	@sgp: the SGL to populate
1004  *	@start: start address of skb main body data to include in the SGL
1005  *	@len: length of skb main body data to include in the SGL
1006  *	@addr: the list of the mapped addresses
1007  *
1008  *	Copies the scatter/gather list for the buffers that make up a packet
1009  *	and returns the SGL size in 8-byte words.  The caller must size the SGL
1010  *	appropriately.
1011  */
1012 static inline unsigned int write_sgl(const struct sk_buff *skb,
1013 				     struct sg_ent *sgp, unsigned char *start,
1014 				     unsigned int len, const dma_addr_t *addr)
1015 {
1016 	unsigned int i, j = 0, k = 0, nfrags;
1017 
1018 	if (len) {
1019 		sgp->len[0] = cpu_to_be32(len);
1020 		sgp->addr[j++] = cpu_to_be64(addr[k++]);
1021 	}
1022 
1023 	nfrags = skb_shinfo(skb)->nr_frags;
1024 	for (i = 0; i < nfrags; i++) {
1025 		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1026 
1027 		sgp->len[j] = cpu_to_be32(skb_frag_size(frag));
1028 		sgp->addr[j] = cpu_to_be64(addr[k++]);
1029 		j ^= 1;
1030 		if (j == 0)
1031 			++sgp;
1032 	}
1033 	if (j)
1034 		sgp->len[j] = 0;
1035 	return ((nfrags + (len != 0)) * 3) / 2 + j;
1036 }
1037 
1038 /**
1039  *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
1040  *	@adap: the adapter
1041  *	@q: the Tx queue
1042  *
1043  *	Ring the doorbel if a Tx queue is asleep.  There is a natural race,
1044  *	where the HW is going to sleep just after we checked, however,
1045  *	then the interrupt handler will detect the outstanding TX packet
1046  *	and ring the doorbell for us.
1047  *
1048  *	When GTS is disabled we unconditionally ring the doorbell.
1049  */
1050 static inline void check_ring_tx_db(struct adapter *adap, struct sge_txq *q)
1051 {
1052 #if USE_GTS
1053 	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
1054 	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
1055 		set_bit(TXQ_LAST_PKT_DB, &q->flags);
1056 		t3_write_reg(adap, A_SG_KDOORBELL,
1057 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1058 	}
1059 #else
1060 	wmb();			/* write descriptors before telling HW */
1061 	t3_write_reg(adap, A_SG_KDOORBELL,
1062 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1063 #endif
1064 }
1065 
/*
 * Store the generation value, as a big-endian 64-bit word, in the last flit
 * of a Tx descriptor.  Does nothing unless SGE_NUM_GENBITS is 2.
 */
static inline void wr_gen2(struct tx_desc *d, unsigned int gen)
{
#if SGE_NUM_GENBITS == 2
	const unsigned int last = TX_DESC_FLITS - 1;

	d->flit[last] = cpu_to_be64(gen);
#endif
}
1072 
1073 /**
1074  *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
1075  *	@ndesc: number of Tx descriptors spanned by the SGL
1076  *	@skb: the packet corresponding to the WR
1077  *	@d: first Tx descriptor to be written
1078  *	@pidx: index of above descriptors
1079  *	@q: the SGE Tx queue
1080  *	@sgl: the SGL
1081  *	@flits: number of flits to the start of the SGL in the first descriptor
1082  *	@sgl_flits: the SGL size in flits
1083  *	@gen: the Tx descriptor generation
1084  *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
1085  *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
1086  *
1087  *	Write a work request header and an associated SGL.  If the SGL is
1088  *	small enough to fit into one Tx descriptor it has already been written
1089  *	and we just need to write the WR header.  Otherwise we distribute the
1090  *	SGL across the number of descriptors it spans.
1091  */
1092 static void write_wr_hdr_sgl(unsigned int ndesc, struct sk_buff *skb,
1093 			     struct tx_desc *d, unsigned int pidx,
1094 			     const struct sge_txq *q,
1095 			     const struct sg_ent *sgl,
1096 			     unsigned int flits, unsigned int sgl_flits,
1097 			     unsigned int gen, __be32 wr_hi,
1098 			     __be32 wr_lo)
1099 {
1100 	struct work_request_hdr *wrp = (struct work_request_hdr *)d;
1101 	struct tx_sw_desc *sd = &q->sdesc[pidx];
1102 
1103 	sd->skb = skb;
1104 	if (need_skb_unmap()) {
1105 		sd->fragidx = 0;
1106 		sd->addr_idx = 0;
1107 		sd->sflit = flits;
1108 	}
1109 
1110 	if (likely(ndesc == 1)) {
1111 		sd->eop = 1;
1112 		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
1113 				   V_WR_SGLSFLT(flits)) | wr_hi;
1114 		dma_wmb();
1115 		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
1116 				   V_WR_GEN(gen)) | wr_lo;
1117 		wr_gen2(d, gen);
1118 	} else {
1119 		unsigned int ogen = gen;
1120 		const u64 *fp = (const u64 *)sgl;
1121 		struct work_request_hdr *wp = wrp;
1122 
1123 		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
1124 				   V_WR_SGLSFLT(flits)) | wr_hi;
1125 
1126 		while (sgl_flits) {
1127 			unsigned int avail = WR_FLITS - flits;
1128 
1129 			if (avail > sgl_flits)
1130 				avail = sgl_flits;
1131 			memcpy(&d->flit[flits], fp, avail * sizeof(*fp));
1132 			sgl_flits -= avail;
1133 			ndesc--;
1134 			if (!sgl_flits)
1135 				break;
1136 
1137 			fp += avail;
1138 			d++;
1139 			sd->eop = 0;
1140 			sd++;
1141 			if (++pidx == q->size) {
1142 				pidx = 0;
1143 				gen ^= 1;
1144 				d = q->desc;
1145 				sd = q->sdesc;
1146 			}
1147 
1148 			sd->skb = skb;
1149 			wrp = (struct work_request_hdr *)d;
1150 			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1151 					   V_WR_SGLSFLT(1)) | wr_hi;
1152 			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1153 							sgl_flits + 1)) |
1154 					   V_WR_GEN(gen)) | wr_lo;
1155 			wr_gen2(d, gen);
1156 			flits = 1;
1157 		}
1158 		sd->eop = 1;
1159 		wrp->wr_hi |= htonl(F_WR_EOP);
1160 		dma_wmb();
1161 		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1162 		wr_gen2((struct tx_desc *)wp, ogen);
1163 		WARN_ON(ndesc != 0);
1164 	}
1165 }
1166 
1167 /**
1168  *	write_tx_pkt_wr - write a TX_PKT work request
1169  *	@adap: the adapter
1170  *	@skb: the packet to send
1171  *	@pi: the egress interface
1172  *	@pidx: index of the first Tx descriptor to write
1173  *	@gen: the generation value to use
1174  *	@q: the Tx queue
1175  *	@ndesc: number of descriptors the packet will occupy
1176  *	@compl: the value of the COMPL bit to use
1177  *	@addr: address
1178  *
1179  *	Generate a TX_PKT work request to send the supplied packet.
1180  */
1181 static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
1182 			    const struct port_info *pi,
1183 			    unsigned int pidx, unsigned int gen,
1184 			    struct sge_txq *q, unsigned int ndesc,
1185 			    unsigned int compl, const dma_addr_t *addr)
1186 {
1187 	unsigned int flits, sgl_flits, cntrl, tso_info;
1188 	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
1189 	struct tx_desc *d = &q->desc[pidx];
1190 	struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)d;
1191 
1192 	cpl->len = htonl(skb->len);
1193 	cntrl = V_TXPKT_INTF(pi->port_id);
1194 
1195 	if (skb_vlan_tag_present(skb))
1196 		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(skb_vlan_tag_get(skb));
1197 
1198 	tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size);
1199 	if (tso_info) {
1200 		int eth_type;
1201 		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)cpl;
1202 
1203 		d->flit[2] = 0;
1204 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1205 		hdr->cntrl = htonl(cntrl);
1206 		eth_type = skb_network_offset(skb) == ETH_HLEN ?
1207 		    CPL_ETH_II : CPL_ETH_II_VLAN;
1208 		tso_info |= V_LSO_ETH_TYPE(eth_type) |
1209 		    V_LSO_IPHDR_WORDS(ip_hdr(skb)->ihl) |
1210 		    V_LSO_TCPHDR_WORDS(tcp_hdr(skb)->doff);
1211 		hdr->lso_info = htonl(tso_info);
1212 		flits = 3;
1213 	} else {
1214 		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1215 		cntrl |= F_TXPKT_IPCSUM_DIS;	/* SW calculates IP csum */
1216 		cntrl |= V_TXPKT_L4CSUM_DIS(skb->ip_summed != CHECKSUM_PARTIAL);
1217 		cpl->cntrl = htonl(cntrl);
1218 
1219 		if (skb->len <= WR_LEN - sizeof(*cpl)) {
1220 			q->sdesc[pidx].skb = NULL;
1221 			if (!skb->data_len)
1222 				skb_copy_from_linear_data(skb, &d->flit[2],
1223 							  skb->len);
1224 			else
1225 				skb_copy_bits(skb, 0, &d->flit[2], skb->len);
1226 
1227 			flits = (skb->len + 7) / 8 + 2;
1228 			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(skb->len & 7) |
1229 					      V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT)
1230 					      | F_WR_SOP | F_WR_EOP | compl);
1231 			dma_wmb();
1232 			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(gen) |
1233 					      V_WR_TID(q->token));
1234 			wr_gen2(d, gen);
1235 			dev_consume_skb_any(skb);
1236 			return;
1237 		}
1238 
1239 		flits = 2;
1240 	}
1241 
1242 	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1243 	sgl_flits = write_sgl(skb, sgp, skb->data, skb_headlen(skb), addr);
1244 
1245 	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen,
1246 			 htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl),
1247 			 htonl(V_WR_TID(q->token)));
1248 }
1249 
1250 static inline void t3_stop_tx_queue(struct netdev_queue *txq,
1251 				    struct sge_qset *qs, struct sge_txq *q)
1252 {
1253 	netif_tx_stop_queue(txq);
1254 	set_bit(TXQ_ETH, &qs->txq_stopped);
1255 	q->stops++;
1256 }
1257 
1258 /**
1259  *	t3_eth_xmit - add a packet to the Ethernet Tx queue
1260  *	@skb: the packet
1261  *	@dev: the egress net device
1262  *
1263  *	Add a packet to an SGE Tx queue.  Runs with softirqs disabled.
1264  */
1265 netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
1266 {
1267 	int qidx;
1268 	unsigned int ndesc, pidx, credits, gen, compl;
1269 	const struct port_info *pi = netdev_priv(dev);
1270 	struct adapter *adap = pi->adapter;
1271 	struct netdev_queue *txq;
1272 	struct sge_qset *qs;
1273 	struct sge_txq *q;
1274 	dma_addr_t addr[MAX_SKB_FRAGS + 1];
1275 
1276 	/*
1277 	 * The chip min packet length is 9 octets but play safe and reject
1278 	 * anything shorter than an Ethernet header.
1279 	 */
1280 	if (unlikely(skb->len < ETH_HLEN)) {
1281 		dev_kfree_skb_any(skb);
1282 		return NETDEV_TX_OK;
1283 	}
1284 
1285 	qidx = skb_get_queue_mapping(skb);
1286 	qs = &pi->qs[qidx];
1287 	q = &qs->txq[TXQ_ETH];
1288 	txq = netdev_get_tx_queue(dev, qidx);
1289 
1290 	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
1291 
1292 	credits = q->size - q->in_use;
1293 	ndesc = calc_tx_descs(skb);
1294 
1295 	if (unlikely(credits < ndesc)) {
1296 		t3_stop_tx_queue(txq, qs, q);
1297 		dev_err(&adap->pdev->dev,
1298 			"%s: Tx ring %u full while queue awake!\n",
1299 			dev->name, q->cntxt_id & 7);
1300 		return NETDEV_TX_BUSY;
1301 	}
1302 
1303 	/* Check if ethernet packet can't be sent as immediate data */
1304 	if (skb->len > (WR_LEN - sizeof(struct cpl_tx_pkt))) {
1305 		if (unlikely(map_skb(adap->pdev, skb, addr) < 0)) {
1306 			dev_kfree_skb(skb);
1307 			return NETDEV_TX_OK;
1308 		}
1309 	}
1310 
1311 	q->in_use += ndesc;
1312 	if (unlikely(credits - ndesc < q->stop_thres)) {
1313 		t3_stop_tx_queue(txq, qs, q);
1314 
1315 		if (should_restart_tx(q) &&
1316 		    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1317 			q->restarts++;
1318 			netif_tx_start_queue(txq);
1319 		}
1320 	}
1321 
1322 	gen = q->gen;
1323 	q->unacked += ndesc;
1324 	compl = (q->unacked & 8) << (S_WR_COMPL - 3);
1325 	q->unacked &= 7;
1326 	pidx = q->pidx;
1327 	q->pidx += ndesc;
1328 	if (q->pidx >= q->size) {
1329 		q->pidx -= q->size;
1330 		q->gen ^= 1;
1331 	}
1332 
1333 	/* update port statistics */
1334 	if (skb->ip_summed == CHECKSUM_PARTIAL)
1335 		qs->port_stats[SGE_PSTAT_TX_CSUM]++;
1336 	if (skb_shinfo(skb)->gso_size)
1337 		qs->port_stats[SGE_PSTAT_TSO]++;
1338 	if (skb_vlan_tag_present(skb))
1339 		qs->port_stats[SGE_PSTAT_VLANINS]++;
1340 
1341 	/*
1342 	 * We do not use Tx completion interrupts to free DMAd Tx packets.
1343 	 * This is good for performance but means that we rely on new Tx
1344 	 * packets arriving to run the destructors of completed packets,
1345 	 * which open up space in their sockets' send queues.  Sometimes
1346 	 * we do not get such new packets causing Tx to stall.  A single
1347 	 * UDP transmitter is a good example of this situation.  We have
1348 	 * a clean up timer that periodically reclaims completed packets
1349 	 * but it doesn't run often enough (nor do we want it to) to prevent
1350 	 * lengthy stalls.  A solution to this problem is to run the
1351 	 * destructor early, after the packet is queued but before it's DMAd.
1352 	 * A cons is that we lie to socket memory accounting, but the amount
1353 	 * of extra memory is reasonable (limited by the number of Tx
1354 	 * descriptors), the packets do actually get freed quickly by new
1355 	 * packets almost always, and for protocols like TCP that wait for
1356 	 * acks to really free up the data the extra memory is even less.
1357 	 * On the positive side we run the destructors on the sending CPU
1358 	 * rather than on a potentially different completing CPU, usually a
1359 	 * good thing.  We also run them without holding our Tx queue lock,
1360 	 * unlike what reclaim_completed_tx() would otherwise do.
1361 	 *
1362 	 * Run the destructor before telling the DMA engine about the packet
1363 	 * to make sure it doesn't complete and get freed prematurely.
1364 	 */
1365 	if (likely(!skb_shared(skb)))
1366 		skb_orphan(skb);
1367 
1368 	write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl, addr);
1369 	check_ring_tx_db(adap, q);
1370 	return NETDEV_TX_OK;
1371 }
1372 
1373 /**
1374  *	write_imm - write a packet into a Tx descriptor as immediate data
1375  *	@d: the Tx descriptor to write
1376  *	@skb: the packet
1377  *	@len: the length of packet data to write as immediate data
1378  *	@gen: the generation bit value to write
1379  *
1380  *	Writes a packet as immediate data into a Tx descriptor.  The packet
1381  *	contains a work request at its beginning.  We must write the packet
1382  *	carefully so the SGE doesn't read it accidentally before it's written
1383  *	in its entirety.
1384  */
1385 static inline void write_imm(struct tx_desc *d, struct sk_buff *skb,
1386 			     unsigned int len, unsigned int gen)
1387 {
1388 	struct work_request_hdr *from = (struct work_request_hdr *)skb->data;
1389 	struct work_request_hdr *to = (struct work_request_hdr *)d;
1390 
1391 	if (likely(!skb->data_len))
1392 		memcpy(&to[1], &from[1], len - sizeof(*from));
1393 	else
1394 		skb_copy_bits(skb, sizeof(*from), &to[1], len - sizeof(*from));
1395 
1396 	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1397 					V_WR_BCNTLFLT(len & 7));
1398 	dma_wmb();
1399 	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1400 					V_WR_LEN((len + 7) / 8));
1401 	wr_gen2(d, gen);
1402 	kfree_skb(skb);
1403 }
1404 
1405 /**
1406  *	check_desc_avail - check descriptor availability on a send queue
1407  *	@adap: the adapter
1408  *	@q: the send queue
1409  *	@skb: the packet needing the descriptors
1410  *	@ndesc: the number of Tx descriptors needed
1411  *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1412  *
1413  *	Checks if the requested number of Tx descriptors is available on an
1414  *	SGE send queue.  If the queue is already suspended or not enough
1415  *	descriptors are available the packet is queued for later transmission.
1416  *	Must be called with the Tx queue locked.
1417  *
1418  *	Returns 0 if enough descriptors are available, 1 if there aren't
1419  *	enough descriptors and the packet has been queued, and 2 if the caller
1420  *	needs to retry because there weren't enough descriptors at the
1421  *	beginning of the call but some freed up in the mean time.
1422  */
1423 static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q,
1424 				   struct sk_buff *skb, unsigned int ndesc,
1425 				   unsigned int qid)
1426 {
1427 	if (unlikely(!skb_queue_empty(&q->sendq))) {
1428 	      addq_exit:__skb_queue_tail(&q->sendq, skb);
1429 		return 1;
1430 	}
1431 	if (unlikely(q->size - q->in_use < ndesc)) {
1432 		struct sge_qset *qs = txq_to_qset(q, qid);
1433 
1434 		set_bit(qid, &qs->txq_stopped);
1435 		smp_mb__after_atomic();
1436 
1437 		if (should_restart_tx(q) &&
1438 		    test_and_clear_bit(qid, &qs->txq_stopped))
1439 			return 2;
1440 
1441 		q->stops++;
1442 		goto addq_exit;
1443 	}
1444 	return 0;
1445 }
1446 
1447 /**
1448  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1449  *	@q: the SGE control Tx queue
1450  *
1451  *	This is a variant of reclaim_completed_tx() that is used for Tx queues
1452  *	that send only immediate data (presently just the control queues) and
1453  *	thus do not have any sk_buffs to release.
1454  */
1455 static inline void reclaim_completed_tx_imm(struct sge_txq *q)
1456 {
1457 	unsigned int reclaim = q->processed - q->cleaned;
1458 
1459 	q->in_use -= reclaim;
1460 	q->cleaned += reclaim;
1461 }
1462 
1463 static inline int immediate(const struct sk_buff *skb)
1464 {
1465 	return skb->len <= WR_LEN;
1466 }
1467 
1468 /**
1469  *	ctrl_xmit - send a packet through an SGE control Tx queue
1470  *	@adap: the adapter
1471  *	@q: the control queue
1472  *	@skb: the packet
1473  *
1474  *	Send a packet through an SGE control Tx queue.  Packets sent through
1475  *	a control queue must fit entirely as immediate data in a single Tx
1476  *	descriptor and have no page fragments.
1477  */
1478 static int ctrl_xmit(struct adapter *adap, struct sge_txq *q,
1479 		     struct sk_buff *skb)
1480 {
1481 	int ret;
1482 	struct work_request_hdr *wrp = (struct work_request_hdr *)skb->data;
1483 
1484 	if (unlikely(!immediate(skb))) {
1485 		WARN_ON(1);
1486 		dev_kfree_skb(skb);
1487 		return NET_XMIT_SUCCESS;
1488 	}
1489 
1490 	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1491 	wrp->wr_lo = htonl(V_WR_TID(q->token));
1492 
1493 	spin_lock(&q->lock);
1494       again:reclaim_completed_tx_imm(q);
1495 
1496 	ret = check_desc_avail(adap, q, skb, 1, TXQ_CTRL);
1497 	if (unlikely(ret)) {
1498 		if (ret == 1) {
1499 			spin_unlock(&q->lock);
1500 			return NET_XMIT_CN;
1501 		}
1502 		goto again;
1503 	}
1504 
1505 	write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1506 
1507 	q->in_use++;
1508 	if (++q->pidx >= q->size) {
1509 		q->pidx = 0;
1510 		q->gen ^= 1;
1511 	}
1512 	spin_unlock(&q->lock);
1513 	wmb();
1514 	t3_write_reg(adap, A_SG_KDOORBELL,
1515 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1516 	return NET_XMIT_SUCCESS;
1517 }
1518 
1519 /**
1520  *	restart_ctrlq - restart a suspended control queue
1521  *	@w: pointer to the work associated with this handler
1522  *
1523  *	Resumes transmission on a suspended Tx control queue.
1524  */
1525 static void restart_ctrlq(struct work_struct *w)
1526 {
1527 	struct sk_buff *skb;
1528 	struct sge_qset *qs = container_of(w, struct sge_qset,
1529 					   txq[TXQ_CTRL].qresume_task);
1530 	struct sge_txq *q = &qs->txq[TXQ_CTRL];
1531 
1532 	spin_lock(&q->lock);
1533       again:reclaim_completed_tx_imm(q);
1534 
1535 	while (q->in_use < q->size &&
1536 	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
1537 
1538 		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1539 
1540 		if (++q->pidx >= q->size) {
1541 			q->pidx = 0;
1542 			q->gen ^= 1;
1543 		}
1544 		q->in_use++;
1545 	}
1546 
1547 	if (!skb_queue_empty(&q->sendq)) {
1548 		set_bit(TXQ_CTRL, &qs->txq_stopped);
1549 		smp_mb__after_atomic();
1550 
1551 		if (should_restart_tx(q) &&
1552 		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1553 			goto again;
1554 		q->stops++;
1555 	}
1556 
1557 	spin_unlock(&q->lock);
1558 	wmb();
1559 	t3_write_reg(qs->adap, A_SG_KDOORBELL,
1560 		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1561 }
1562 
1563 /*
1564  * Send a management message through control queue 0
1565  */
1566 int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
1567 {
1568 	int ret;
1569 	local_bh_disable();
1570 	ret = ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], skb);
1571 	local_bh_enable();
1572 
1573 	return ret;
1574 }
1575 
1576 /**
1577  *	deferred_unmap_destructor - unmap a packet when it is freed
1578  *	@skb: the packet
1579  *
1580  *	This is the packet destructor used for Tx packets that need to remain
1581  *	mapped until they are freed rather than until their Tx descriptors are
1582  *	freed.
1583  */
1584 static void deferred_unmap_destructor(struct sk_buff *skb)
1585 {
1586 	int i;
1587 	const dma_addr_t *p;
1588 	const struct skb_shared_info *si;
1589 	const struct deferred_unmap_info *dui;
1590 
1591 	dui = (struct deferred_unmap_info *)skb->head;
1592 	p = dui->addr;
1593 
1594 	if (skb_tail_pointer(skb) - skb_transport_header(skb))
1595 		dma_unmap_single(&dui->pdev->dev, *p++,
1596 				 skb_tail_pointer(skb) - skb_transport_header(skb),
1597 				 DMA_TO_DEVICE);
1598 
1599 	si = skb_shinfo(skb);
1600 	for (i = 0; i < si->nr_frags; i++)
1601 		dma_unmap_page(&dui->pdev->dev, *p++,
1602 			       skb_frag_size(&si->frags[i]), DMA_TO_DEVICE);
1603 }
1604 
1605 static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev,
1606 				     const struct sg_ent *sgl, int sgl_flits)
1607 {
1608 	dma_addr_t *p;
1609 	struct deferred_unmap_info *dui;
1610 
1611 	dui = (struct deferred_unmap_info *)skb->head;
1612 	dui->pdev = pdev;
1613 	for (p = dui->addr; sgl_flits >= 3; sgl++, sgl_flits -= 3) {
1614 		*p++ = be64_to_cpu(sgl->addr[0]);
1615 		*p++ = be64_to_cpu(sgl->addr[1]);
1616 	}
1617 	if (sgl_flits)
1618 		*p = be64_to_cpu(sgl->addr[0]);
1619 }
1620 
1621 /**
1622  *	write_ofld_wr - write an offload work request
1623  *	@adap: the adapter
1624  *	@skb: the packet to send
1625  *	@q: the Tx queue
1626  *	@pidx: index of the first Tx descriptor to write
1627  *	@gen: the generation value to use
1628  *	@ndesc: number of descriptors the packet will occupy
1629  *	@addr: the address
1630  *
1631  *	Write an offload work request to send the supplied packet.  The packet
1632  *	data already carry the work request with most fields populated.
1633  */
1634 static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
1635 			  struct sge_txq *q, unsigned int pidx,
1636 			  unsigned int gen, unsigned int ndesc,
1637 			  const dma_addr_t *addr)
1638 {
1639 	unsigned int sgl_flits, flits;
1640 	struct work_request_hdr *from;
1641 	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
1642 	struct tx_desc *d = &q->desc[pidx];
1643 
1644 	if (immediate(skb)) {
1645 		q->sdesc[pidx].skb = NULL;
1646 		write_imm(d, skb, skb->len, gen);
1647 		return;
1648 	}
1649 
1650 	/* Only TX_DATA builds SGLs */
1651 
1652 	from = (struct work_request_hdr *)skb->data;
1653 	memcpy(&d->flit[1], &from[1],
1654 	       skb_transport_offset(skb) - sizeof(*from));
1655 
1656 	flits = skb_transport_offset(skb) / 8;
1657 	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1658 	sgl_flits = write_sgl(skb, sgp, skb_transport_header(skb),
1659 			      skb_tail_pointer(skb) - skb_transport_header(skb),
1660 			      addr);
1661 	if (need_skb_unmap()) {
1662 		setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
1663 		skb->destructor = deferred_unmap_destructor;
1664 	}
1665 
1666 	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits,
1667 			 gen, from->wr_hi, from->wr_lo);
1668 }
1669 
1670 /**
1671  *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1672  *	@skb: the packet
1673  *
1674  * 	Returns the number of Tx descriptors needed for the given offload
1675  * 	packet.  These packets are already fully constructed.
1676  */
1677 static inline unsigned int calc_tx_descs_ofld(const struct sk_buff *skb)
1678 {
1679 	unsigned int flits, cnt;
1680 
1681 	if (skb->len <= WR_LEN)
1682 		return 1;	/* packet fits as immediate data */
1683 
1684 	flits = skb_transport_offset(skb) / 8;	/* headers */
1685 	cnt = skb_shinfo(skb)->nr_frags;
1686 	if (skb_tail_pointer(skb) != skb_transport_header(skb))
1687 		cnt++;
1688 	return flits_to_desc(flits + sgl_len(cnt));
1689 }
1690 
1691 /**
1692  *	ofld_xmit - send a packet through an offload queue
1693  *	@adap: the adapter
1694  *	@q: the Tx offload queue
1695  *	@skb: the packet
1696  *
1697  *	Send an offload packet through an SGE offload queue.
1698  */
1699 static int ofld_xmit(struct adapter *adap, struct sge_txq *q,
1700 		     struct sk_buff *skb)
1701 {
1702 	int ret;
1703 	unsigned int ndesc = calc_tx_descs_ofld(skb), pidx, gen;
1704 
1705 	spin_lock(&q->lock);
1706 again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
1707 
1708 	ret = check_desc_avail(adap, q, skb, ndesc, TXQ_OFLD);
1709 	if (unlikely(ret)) {
1710 		if (ret == 1) {
1711 			skb->priority = ndesc;	/* save for restart */
1712 			spin_unlock(&q->lock);
1713 			return NET_XMIT_CN;
1714 		}
1715 		goto again;
1716 	}
1717 
1718 	if (!immediate(skb) &&
1719 	    map_skb(adap->pdev, skb, (dma_addr_t *)skb->head)) {
1720 		spin_unlock(&q->lock);
1721 		return NET_XMIT_SUCCESS;
1722 	}
1723 
1724 	gen = q->gen;
1725 	q->in_use += ndesc;
1726 	pidx = q->pidx;
1727 	q->pidx += ndesc;
1728 	if (q->pidx >= q->size) {
1729 		q->pidx -= q->size;
1730 		q->gen ^= 1;
1731 	}
1732 	spin_unlock(&q->lock);
1733 
1734 	write_ofld_wr(adap, skb, q, pidx, gen, ndesc, (dma_addr_t *)skb->head);
1735 	check_ring_tx_db(adap, q);
1736 	return NET_XMIT_SUCCESS;
1737 }
1738 
1739 /**
1740  *	restart_offloadq - restart a suspended offload queue
1741  *	@w: pointer to the work associated with this handler
1742  *
1743  *	Resumes transmission on a suspended Tx offload queue.
1744  */
1745 static void restart_offloadq(struct work_struct *w)
1746 {
1747 	struct sk_buff *skb;
1748 	struct sge_qset *qs = container_of(w, struct sge_qset,
1749 					   txq[TXQ_OFLD].qresume_task);
1750 	struct sge_txq *q = &qs->txq[TXQ_OFLD];
1751 	const struct port_info *pi = netdev_priv(qs->netdev);
1752 	struct adapter *adap = pi->adapter;
1753 	unsigned int written = 0;
1754 
1755 	spin_lock(&q->lock);
1756 again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
1757 
1758 	while ((skb = skb_peek(&q->sendq)) != NULL) {
1759 		unsigned int gen, pidx;
1760 		unsigned int ndesc = skb->priority;
1761 
1762 		if (unlikely(q->size - q->in_use < ndesc)) {
1763 			set_bit(TXQ_OFLD, &qs->txq_stopped);
1764 			smp_mb__after_atomic();
1765 
1766 			if (should_restart_tx(q) &&
1767 			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
1768 				goto again;
1769 			q->stops++;
1770 			break;
1771 		}
1772 
1773 		if (!immediate(skb) &&
1774 		    map_skb(adap->pdev, skb, (dma_addr_t *)skb->head))
1775 			break;
1776 
1777 		gen = q->gen;
1778 		q->in_use += ndesc;
1779 		pidx = q->pidx;
1780 		q->pidx += ndesc;
1781 		written += ndesc;
1782 		if (q->pidx >= q->size) {
1783 			q->pidx -= q->size;
1784 			q->gen ^= 1;
1785 		}
1786 		__skb_unlink(skb, &q->sendq);
1787 		spin_unlock(&q->lock);
1788 
1789 		write_ofld_wr(adap, skb, q, pidx, gen, ndesc,
1790 			      (dma_addr_t *)skb->head);
1791 		spin_lock(&q->lock);
1792 	}
1793 	spin_unlock(&q->lock);
1794 
1795 #if USE_GTS
1796 	set_bit(TXQ_RUNNING, &q->flags);
1797 	set_bit(TXQ_LAST_PKT_DB, &q->flags);
1798 #endif
1799 	wmb();
1800 	if (likely(written))
1801 		t3_write_reg(adap, A_SG_KDOORBELL,
1802 			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1803 }
1804 
1805 /**
1806  *	queue_set - return the queue set a packet should use
1807  *	@skb: the packet
1808  *
1809  *	Maps a packet to the SGE queue set it should use.  The desired queue
1810  *	set is carried in bits 1-3 in the packet's priority.
1811  */
1812 static inline int queue_set(const struct sk_buff *skb)
1813 {
1814 	return skb->priority >> 1;
1815 }
1816 
1817 /**
1818  *	is_ctrl_pkt - return whether an offload packet is a control packet
1819  *	@skb: the packet
1820  *
1821  *	Determines whether an offload packet should use an OFLD or a CTRL
1822  *	Tx queue.  This is indicated by bit 0 in the packet's priority.
1823  */
1824 static inline int is_ctrl_pkt(const struct sk_buff *skb)
1825 {
1826 	return skb->priority & 1;
1827 }
1828 
1829 /**
1830  *	t3_offload_tx - send an offload packet
1831  *	@tdev: the offload device to send to
1832  *	@skb: the packet
1833  *
1834  *	Sends an offload packet.  We use the packet priority to select the
1835  *	appropriate Tx queue as follows: bit 0 indicates whether the packet
1836  *	should be sent as regular or control, bits 1-3 select the queue set.
1837  */
1838 int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb)
1839 {
1840 	struct adapter *adap = tdev2adap(tdev);
1841 	struct sge_qset *qs = &adap->sge.qs[queue_set(skb)];
1842 
1843 	if (unlikely(is_ctrl_pkt(skb)))
1844 		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], skb);
1845 
1846 	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], skb);
1847 }
1848 
1849 /**
1850  *	offload_enqueue - add an offload packet to an SGE offload receive queue
1851  *	@q: the SGE response queue
1852  *	@skb: the packet
1853  *
1854  *	Add a new offload packet to an SGE response queue's offload packet
1855  *	queue.  If the packet is the first on the queue it schedules the RX
1856  *	softirq to process the queue.
1857  */
1858 static inline void offload_enqueue(struct sge_rspq *q, struct sk_buff *skb)
1859 {
1860 	int was_empty = skb_queue_empty(&q->rx_queue);
1861 
1862 	__skb_queue_tail(&q->rx_queue, skb);
1863 
1864 	if (was_empty) {
1865 		struct sge_qset *qs = rspq_to_qset(q);
1866 
1867 		napi_schedule(&qs->napi);
1868 	}
1869 }
1870 
1871 /**
1872  *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
1873  *	@tdev: the offload device that will be receiving the packets
1874  *	@q: the SGE response queue that assembled the bundle
1875  *	@skbs: the partial bundle
1876  *	@n: the number of packets in the bundle
1877  *
1878  *	Delivers a (partial) bundle of Rx offload packets to an offload device.
1879  */
1880 static inline void deliver_partial_bundle(struct t3cdev *tdev,
1881 					  struct sge_rspq *q,
1882 					  struct sk_buff *skbs[], int n)
1883 {
1884 	if (n) {
1885 		q->offload_bundles++;
1886 		tdev->recv(tdev, skbs, n);
1887 	}
1888 }
1889 
1890 /**
1891  *	ofld_poll - NAPI handler for offload packets in interrupt mode
1892  *	@napi: the network device doing the polling
1893  *	@budget: polling budget
1894  *
1895  *	The NAPI handler for offload packets when a response queue is serviced
1896  *	by the hard interrupt handler, i.e., when it's operating in non-polling
1897  *	mode.  Creates small packet batches and sends them through the offload
1898  *	receive handler.  Batches need to be of modest size as we do prefetches
1899  *	on the packets in each.
1900  */
1901 static int ofld_poll(struct napi_struct *napi, int budget)
1902 {
1903 	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
1904 	struct sge_rspq *q = &qs->rspq;
1905 	struct adapter *adapter = qs->adap;
1906 	int work_done = 0;
1907 
1908 	while (work_done < budget) {
1909 		struct sk_buff *skb, *tmp, *skbs[RX_BUNDLE_SIZE];
1910 		struct sk_buff_head queue;
1911 		int ngathered;
1912 
1913 		spin_lock_irq(&q->lock);
1914 		__skb_queue_head_init(&queue);
1915 		skb_queue_splice_init(&q->rx_queue, &queue);
1916 		if (skb_queue_empty(&queue)) {
1917 			napi_complete_done(napi, work_done);
1918 			spin_unlock_irq(&q->lock);
1919 			return work_done;
1920 		}
1921 		spin_unlock_irq(&q->lock);
1922 
1923 		ngathered = 0;
1924 		skb_queue_walk_safe(&queue, skb, tmp) {
1925 			if (work_done >= budget)
1926 				break;
1927 			work_done++;
1928 
1929 			__skb_unlink(skb, &queue);
1930 			prefetch(skb->data);
1931 			skbs[ngathered] = skb;
1932 			if (++ngathered == RX_BUNDLE_SIZE) {
1933 				q->offload_bundles++;
1934 				adapter->tdev.recv(&adapter->tdev, skbs,
1935 						   ngathered);
1936 				ngathered = 0;
1937 			}
1938 		}
1939 		if (!skb_queue_empty(&queue)) {
1940 			/* splice remaining packets back onto Rx queue */
1941 			spin_lock_irq(&q->lock);
1942 			skb_queue_splice(&queue, &q->rx_queue);
1943 			spin_unlock_irq(&q->lock);
1944 		}
1945 		deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered);
1946 	}
1947 
1948 	return work_done;
1949 }
1950 
1951 /**
1952  *	rx_offload - process a received offload packet
1953  *	@tdev: the offload device receiving the packet
1954  *	@rq: the response queue that received the packet
1955  *	@skb: the packet
1956  *	@rx_gather: a gather list of packets if we are building a bundle
1957  *	@gather_idx: index of the next available slot in the bundle
1958  *
1959  *	Process an ingress offload packet and add it to the offload ingress
1960  *	queue. 	Returns the index of the next available slot in the bundle.
1961  */
1962 static inline int rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1963 			     struct sk_buff *skb, struct sk_buff *rx_gather[],
1964 			     unsigned int gather_idx)
1965 {
1966 	skb_reset_mac_header(skb);
1967 	skb_reset_network_header(skb);
1968 	skb_reset_transport_header(skb);
1969 
1970 	if (rq->polling) {
1971 		rx_gather[gather_idx++] = skb;
1972 		if (gather_idx == RX_BUNDLE_SIZE) {
1973 			tdev->recv(tdev, rx_gather, RX_BUNDLE_SIZE);
1974 			gather_idx = 0;
1975 			rq->offload_bundles++;
1976 		}
1977 	} else
1978 		offload_enqueue(rq, skb);
1979 
1980 	return gather_idx;
1981 }
1982 
1983 /**
1984  *	restart_tx - check whether to restart suspended Tx queues
1985  *	@qs: the queue set to resume
1986  *
1987  *	Restarts suspended Tx queues of an SGE queue set if they have enough
1988  *	free resources to resume operation.
1989  */
1990 static void restart_tx(struct sge_qset *qs)
1991 {
1992 	if (test_bit(TXQ_ETH, &qs->txq_stopped) &&
1993 	    should_restart_tx(&qs->txq[TXQ_ETH]) &&
1994 	    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1995 		qs->txq[TXQ_ETH].restarts++;
1996 		if (netif_running(qs->netdev))
1997 			netif_tx_wake_queue(qs->tx_q);
1998 	}
1999 
2000 	if (test_bit(TXQ_OFLD, &qs->txq_stopped) &&
2001 	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
2002 	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
2003 		qs->txq[TXQ_OFLD].restarts++;
2004 
2005 		/* The work can be quite lengthy so we use driver's own queue */
2006 		queue_work(cxgb3_wq, &qs->txq[TXQ_OFLD].qresume_task);
2007 	}
2008 	if (test_bit(TXQ_CTRL, &qs->txq_stopped) &&
2009 	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
2010 	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
2011 		qs->txq[TXQ_CTRL].restarts++;
2012 
2013 		/* The work can be quite lengthy so we use driver's own queue */
2014 		queue_work(cxgb3_wq, &qs->txq[TXQ_CTRL].qresume_task);
2015 	}
2016 }
2017 
2018 /**
2019  *	cxgb3_arp_process - process an ARP request probing a private IP address
2020  *	@pi: the port info
2021  *	@skb: the skbuff containing the ARP request
2022  *
2023  *	Check if the ARP request is probing the private IP address
2024  *	dedicated to iSCSI, generate an ARP reply if so.
2025  */
2026 static void cxgb3_arp_process(struct port_info *pi, struct sk_buff *skb)
2027 {
2028 	struct net_device *dev = skb->dev;
2029 	struct arphdr *arp;
2030 	unsigned char *arp_ptr;
2031 	unsigned char *sha;
2032 	__be32 sip, tip;
2033 
2034 	if (!dev)
2035 		return;
2036 
2037 	skb_reset_network_header(skb);
2038 	arp = arp_hdr(skb);
2039 
2040 	if (arp->ar_op != htons(ARPOP_REQUEST))
2041 		return;
2042 
2043 	arp_ptr = (unsigned char *)(arp + 1);
2044 	sha = arp_ptr;
2045 	arp_ptr += dev->addr_len;
2046 	memcpy(&sip, arp_ptr, sizeof(sip));
2047 	arp_ptr += sizeof(sip);
2048 	arp_ptr += dev->addr_len;
2049 	memcpy(&tip, arp_ptr, sizeof(tip));
2050 
2051 	if (tip != pi->iscsi_ipv4addr)
2052 		return;
2053 
2054 	arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
2055 		 pi->iscsic.mac_addr, sha);
2056 
2057 }
2058 
2059 static inline int is_arp(struct sk_buff *skb)
2060 {
2061 	return skb->protocol == htons(ETH_P_ARP);
2062 }
2063 
2064 static void cxgb3_process_iscsi_prov_pack(struct port_info *pi,
2065 					struct sk_buff *skb)
2066 {
2067 	if (is_arp(skb)) {
2068 		cxgb3_arp_process(pi, skb);
2069 		return;
2070 	}
2071 
2072 	if (pi->iscsic.recv)
2073 		pi->iscsic.recv(pi, skb);
2074 
2075 }
2076 
2077 /**
2078  *	rx_eth - process an ingress ethernet packet
2079  *	@adap: the adapter
2080  *	@rq: the response queue that received the packet
2081  *	@skb: the packet
2082  *	@pad: padding
2083  *	@lro: large receive offload
2084  *
2085  *	Process an ingress ethernet packet and deliver it to the stack.
2086  *	The padding is 2 if the packet was delivered in an Rx buffer and 0
2087  *	if it was immediate data in a response.
2088  */
2089 static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
2090 		   struct sk_buff *skb, int pad, int lro)
2091 {
2092 	struct cpl_rx_pkt *p = (struct cpl_rx_pkt *)(skb->data + pad);
2093 	struct sge_qset *qs = rspq_to_qset(rq);
2094 	struct port_info *pi;
2095 
2096 	skb_pull(skb, sizeof(*p) + pad);
2097 	skb->protocol = eth_type_trans(skb, adap->port[p->iff]);
2098 	pi = netdev_priv(skb->dev);
2099 	if ((skb->dev->features & NETIF_F_RXCSUM) && p->csum_valid &&
2100 	    p->csum == htons(0xffff) && !p->fragment) {
2101 		qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2102 		skb->ip_summed = CHECKSUM_UNNECESSARY;
2103 	} else
2104 		skb_checksum_none_assert(skb);
2105 	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
2106 
2107 	if (p->vlan_valid) {
2108 		qs->port_stats[SGE_PSTAT_VLANEX]++;
2109 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(p->vlan));
2110 	}
2111 	if (rq->polling) {
2112 		if (lro)
2113 			napi_gro_receive(&qs->napi, skb);
2114 		else {
2115 			if (unlikely(pi->iscsic.flags))
2116 				cxgb3_process_iscsi_prov_pack(pi, skb);
2117 			netif_receive_skb(skb);
2118 		}
2119 	} else
2120 		netif_rx(skb);
2121 }
2122 
2123 static inline int is_eth_tcp(u32 rss)
2124 {
2125 	return G_HASHTYPE(ntohl(rss)) == RSS_HASH_4_TUPLE;
2126 }
2127 
2128 /**
2129  *	lro_add_page - add a page chunk to an LRO session
2130  *	@adap: the adapter
2131  *	@qs: the associated queue set
2132  *	@fl: the free list containing the page chunk to add
2133  *	@len: packet length
2134  *	@complete: Indicates the last fragment of a frame
2135  *
2136  *	Add a received packet contained in a page chunk to an existing LRO
2137  *	session.
2138  */
2139 static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
2140 			 struct sge_fl *fl, int len, int complete)
2141 {
2142 	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2143 	struct port_info *pi = netdev_priv(qs->netdev);
2144 	struct sk_buff *skb = NULL;
2145 	struct cpl_rx_pkt *cpl;
2146 	skb_frag_t *rx_frag;
2147 	int nr_frags;
2148 	int offset = 0;
2149 
2150 	if (!qs->nomem) {
2151 		skb = napi_get_frags(&qs->napi);
2152 		qs->nomem = !skb;
2153 	}
2154 
2155 	fl->credits--;
2156 
2157 	dma_sync_single_for_cpu(&adap->pdev->dev,
2158 				dma_unmap_addr(sd, dma_addr),
2159 				fl->buf_size - SGE_PG_RSVD, DMA_FROM_DEVICE);
2160 
2161 	(*sd->pg_chunk.p_cnt)--;
2162 	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
2163 		dma_unmap_page(&adap->pdev->dev, sd->pg_chunk.mapping,
2164 			       fl->alloc_size, DMA_FROM_DEVICE);
2165 
2166 	if (!skb) {
2167 		put_page(sd->pg_chunk.page);
2168 		if (complete)
2169 			qs->nomem = 0;
2170 		return;
2171 	}
2172 
2173 	rx_frag = skb_shinfo(skb)->frags;
2174 	nr_frags = skb_shinfo(skb)->nr_frags;
2175 
2176 	if (!nr_frags) {
2177 		offset = 2 + sizeof(struct cpl_rx_pkt);
2178 		cpl = qs->lro_va = sd->pg_chunk.va + 2;
2179 
2180 		if ((qs->netdev->features & NETIF_F_RXCSUM) &&
2181 		     cpl->csum_valid && cpl->csum == htons(0xffff)) {
2182 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2183 			qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
2184 		} else
2185 			skb->ip_summed = CHECKSUM_NONE;
2186 	} else
2187 		cpl = qs->lro_va;
2188 
2189 	len -= offset;
2190 
2191 	rx_frag += nr_frags;
2192 	__skb_frag_set_page(rx_frag, sd->pg_chunk.page);
2193 	skb_frag_off_set(rx_frag, sd->pg_chunk.offset + offset);
2194 	skb_frag_size_set(rx_frag, len);
2195 
2196 	skb->len += len;
2197 	skb->data_len += len;
2198 	skb->truesize += len;
2199 	skb_shinfo(skb)->nr_frags++;
2200 
2201 	if (!complete)
2202 		return;
2203 
2204 	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
2205 
2206 	if (cpl->vlan_valid) {
2207 		qs->port_stats[SGE_PSTAT_VLANEX]++;
2208 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(cpl->vlan));
2209 	}
2210 	napi_gro_frags(&qs->napi);
2211 }
2212 
2213 /**
2214  *	handle_rsp_cntrl_info - handles control information in a response
2215  *	@qs: the queue set corresponding to the response
2216  *	@flags: the response control flags
2217  *
2218  *	Handles the control information of an SGE response, such as GTS
2219  *	indications and completion credits for the queue set's Tx queues.
2220  *	HW coalesces credits, we don't do any extra SW coalescing.
2221  */
2222 static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags)
2223 {
2224 	unsigned int credits;
2225 
2226 #if USE_GTS
2227 	if (flags & F_RSPD_TXQ0_GTS)
2228 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2229 #endif
2230 
2231 	credits = G_RSPD_TXQ0_CR(flags);
2232 	if (credits)
2233 		qs->txq[TXQ_ETH].processed += credits;
2234 
2235 	credits = G_RSPD_TXQ2_CR(flags);
2236 	if (credits)
2237 		qs->txq[TXQ_CTRL].processed += credits;
2238 
2239 # if USE_GTS
2240 	if (flags & F_RSPD_TXQ1_GTS)
2241 		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2242 # endif
2243 	credits = G_RSPD_TXQ1_CR(flags);
2244 	if (credits)
2245 		qs->txq[TXQ_OFLD].processed += credits;
2246 }
2247 
2248 /**
2249  *	check_ring_db - check if we need to ring any doorbells
2250  *	@adap: the adapter
2251  *	@qs: the queue set whose Tx queues are to be examined
2252  *	@sleeping: indicates which Tx queue sent GTS
2253  *
2254  *	Checks if some of a queue set's Tx queues need to ring their doorbells
2255  *	to resume transmission after idling while they still have unprocessed
2256  *	descriptors.
2257  */
2258 static void check_ring_db(struct adapter *adap, struct sge_qset *qs,
2259 			  unsigned int sleeping)
2260 {
2261 	if (sleeping & F_RSPD_TXQ0_GTS) {
2262 		struct sge_txq *txq = &qs->txq[TXQ_ETH];
2263 
2264 		if (txq->cleaned + txq->in_use != txq->processed &&
2265 		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
2266 			set_bit(TXQ_RUNNING, &txq->flags);
2267 			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
2268 				     V_EGRCNTX(txq->cntxt_id));
2269 		}
2270 	}
2271 
2272 	if (sleeping & F_RSPD_TXQ1_GTS) {
2273 		struct sge_txq *txq = &qs->txq[TXQ_OFLD];
2274 
2275 		if (txq->cleaned + txq->in_use != txq->processed &&
2276 		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
2277 			set_bit(TXQ_RUNNING, &txq->flags);
2278 			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
2279 				     V_EGRCNTX(txq->cntxt_id));
2280 		}
2281 	}
2282 }
2283 
2284 /**
2285  *	is_new_response - check if a response is newly written
2286  *	@r: the response descriptor
2287  *	@q: the response queue
2288  *
2289  *	Returns true if a response descriptor contains a yet unprocessed
2290  *	response.
2291  */
2292 static inline int is_new_response(const struct rsp_desc *r,
2293 				  const struct sge_rspq *q)
2294 {
2295 	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2296 }
2297 
2298 static inline void clear_rspq_bufstate(struct sge_rspq * const q)
2299 {
2300 	q->pg_skb = NULL;
2301 	q->rx_recycle_buf = 0;
2302 }
2303 
/* Response flag bits through which the Tx queues 0 and 1 signal GTS. */
#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
/*
 * All Tx-related control information a response can carry: the GTS bits
 * plus the full credit fields of Tx queues 0, 1 and 2.  A response with
 * none of these bits set needs no handle_rsp_cntrl_info() call.
 */
#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))

/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
#define NOMEM_INTR_DELAY 2500
2312 
/**
 *	process_responses - process responses from an SGE response queue
 *	@adap: the adapter
 *	@qs: the queue set to which the response queue belongs
 *	@budget: how many responses can be processed in this round
 *
 *	Process responses from an SGE response queue up to the supplied budget.
 *	Responses include received packets as well as credits and other events
 *	for the queues that belong to the response queue's queue set.
 *	A negative budget is effectively unlimited.
 *
 *	Additionally choose the interrupt holdoff time for the next interrupt
 *	on this queue.  If the system is under memory shortage use a fairly
 *	long delay to help recovery.
 *
 *	Returns the number of budget units consumed, i.e. @budget minus what
 *	was left when the loop stopped.
 */
static int process_responses(struct adapter *adap, struct sge_qset *qs,
			     int budget)
{
	struct sge_rspq *q = &qs->rspq;
	struct rsp_desc *r = &q->desc[q->cidx];
	int budget_left = budget;
	unsigned int sleeping = 0;	/* accumulated GTS bits seen this round */
	struct sk_buff *offload_skbs[RX_BUNDLE_SIZE];
	int ngathered = 0;		/* packets currently in offload_skbs[] */

	/* Default holdoff; overridden below on memory shortage. */
	q->next_holdoff = q->holdoff_tmr;

	while (likely(budget_left && is_new_response(r, q))) {
		int packet_complete, eth, ethpad = 2;
		int lro = !!(qs->netdev->features & NETIF_F_GRO);
		struct sk_buff *skb = NULL;
		u32 len, flags;
		__be32 rss_hi, rss_lo;

		/* Read the rest of the descriptor only after the gen check. */
		dma_rmb();
		eth = r->rss_hdr.opcode == CPL_RX_PKT;
		rss_hi = *(const __be32 *)r;
		rss_lo = r->rss_hdr.rss_hash_val;
		flags = ntohl(r->flags);

		if (unlikely(flags & F_RSPD_ASYNC_NOTIF)) {
			/* Async notification: copy it into a fresh skb. */
			skb = alloc_skb(AN_PKT_SIZE, GFP_ATOMIC);
			if (!skb)
				goto no_mem;

			__skb_put_data(skb, r, AN_PKT_SIZE);
			skb->data[0] = CPL_ASYNC_NOTIF;
			rss_hi = htonl(CPL_ASYNC_NOTIF << 24);
			q->async_notif++;
		} else if (flags & F_RSPD_IMM_DATA_VALID) {
			/* Packet data is carried inside the response itself. */
			skb = get_imm_packet(r);
			if (unlikely(!skb)) {
				/*
				 * Shared out-of-memory exit, also reached by
				 * goto from the other allocation sites.  The
				 * descriptor is not consumed (cidx is left
				 * alone), so it is seen again next round.
				 */
no_mem:
				q->next_holdoff = NOMEM_INTR_DELAY;
				q->nomem++;
				/* consume one credit since we tried */
				budget_left--;
				break;
			}
			q->imm_data++;
			ethpad = 0;
		} else if ((len = ntohl(r->len_cq)) != 0) {
			/* Packet data lives in a free-list buffer. */
			struct sge_fl *fl;

			/* LRO only for Ethernet packets with a 4-tuple hash */
			lro &= eth && is_eth_tcp(rss_hi);

			fl = (len & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
			if (fl->use_pages) {
				void *addr = fl->sdesc[fl->cidx].pg_chunk.va;

				net_prefetch(addr);
				__refill_fl(adap, fl);
				if (lro > 0) {
					/* No skb here; lro_add_page delivers */
					lro_add_page(adap, qs, fl,
						     G_RSPD_LEN(len),
						     flags & F_RSPD_EOP);
					goto next_fl;
				}

				skb = get_packet_pg(adap, fl, q,
						    G_RSPD_LEN(len),
						    eth ?
						    SGE_RX_DROP_THRES : 0);
				q->pg_skb = skb;
			} else
				skb = get_packet(adap, fl, G_RSPD_LEN(len),
						 eth ? SGE_RX_DROP_THRES : 0);
			if (unlikely(!skb)) {
				/* Ethernet packets are dropped and counted;
				 * anything else takes the no-memory exit. */
				if (!eth)
					goto no_mem;
				q->rx_drops++;
			} else if (unlikely(r->rss_hdr.opcode == CPL_TRACE_PKT))
				__skb_pull(skb, 2);
next_fl:
			/* Advance the free list's consumer index, with wrap. */
			if (++fl->cidx == fl->size)
				fl->cidx = 0;
		} else
			q->pure_rsps++;

		if (flags & RSPD_CTRL_MASK) {
			sleeping |= flags & RSPD_GTS_MASK;
			handle_rsp_cntrl_info(qs, flags);
		}

		/* Advance to the next descriptor; flip gen on wrap-around. */
		r++;
		if (unlikely(++q->cidx == q->size)) {
			q->cidx = 0;
			q->gen ^= 1;
			r = q->desc;
		}
		prefetch(r);

		/* Return credits in batches of a quarter of the queue. */
		if (++q->credits >= (q->size / 4)) {
			refill_rspq(adap, q, q->credits);
			q->credits = 0;
		}

		packet_complete = flags &
				  (F_RSPD_EOP | F_RSPD_IMM_DATA_VALID |
				   F_RSPD_ASYNC_NOTIF);

		/* Deliver only once the whole packet has been received. */
		if (skb != NULL && packet_complete) {
			if (eth)
				rx_eth(adap, q, skb, ethpad, lro);
			else {
				q->offload_pkts++;
				/* Preserve the RSS info in csum & priority */
				skb->csum = rss_hi;
				skb->priority = rss_lo;
				ngathered = rx_offload(&adap->tdev, q, skb,
						       offload_skbs,
						       ngathered);
			}

			if (flags & F_RSPD_EOP)
				clear_rspq_bufstate(q);
		}
		--budget_left;
	}

	/* Flush whatever offload packets are still waiting in the bundle. */
	deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);

	if (sleeping)
		check_ring_db(adap, qs, sleeping);

	smp_mb();		/* commit Tx queue .processed updates */
	if (unlikely(qs->txq_stopped != 0))
		restart_tx(qs);

	budget -= budget_left;
	return budget;
}
2465 
2466 static inline int is_pure_response(const struct rsp_desc *r)
2467 {
2468 	__be32 n = r->flags & htonl(F_RSPD_ASYNC_NOTIF | F_RSPD_IMM_DATA_VALID);
2469 
2470 	return (n | r->len_cq) == 0;
2471 }
2472 
2473 /**
2474  *	napi_rx_handler - the NAPI handler for Rx processing
2475  *	@napi: the napi instance
2476  *	@budget: how many packets we can process in this round
2477  *
2478  *	Handler for new data events when using NAPI.
2479  */
2480 static int napi_rx_handler(struct napi_struct *napi, int budget)
2481 {
2482 	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
2483 	struct adapter *adap = qs->adap;
2484 	int work_done = process_responses(adap, qs, budget);
2485 
2486 	if (likely(work_done < budget)) {
2487 		napi_complete_done(napi, work_done);
2488 
2489 		/*
2490 		 * Because we don't atomically flush the following
2491 		 * write it is possible that in very rare cases it can
2492 		 * reach the device in a way that races with a new
2493 		 * response being written plus an error interrupt
2494 		 * causing the NAPI interrupt handler below to return
2495 		 * unhandled status to the OS.  To protect against
2496 		 * this would require flushing the write and doing
2497 		 * both the write and the flush with interrupts off.
2498 		 * Way too expensive and unjustifiable given the
2499 		 * rarity of the race.
2500 		 *
2501 		 * The race cannot happen at all with MSI-X.
2502 		 */
2503 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2504 			     V_NEWTIMER(qs->rspq.next_holdoff) |
2505 			     V_NEWINDEX(qs->rspq.cidx));
2506 	}
2507 	return work_done;
2508 }
2509 
2510 /*
2511  * Returns true if the device is already scheduled for polling.
2512  */
2513 static inline int napi_is_scheduled(struct napi_struct *napi)
2514 {
2515 	return test_bit(NAPI_STATE_SCHED, &napi->state);
2516 }
2517 
2518 /**
2519  *	process_pure_responses - process pure responses from a response queue
2520  *	@adap: the adapter
2521  *	@qs: the queue set owning the response queue
2522  *	@r: the first pure response to process
2523  *
2524  *	A simpler version of process_responses() that handles only pure (i.e.,
2525  *	non data-carrying) responses.  Such respones are too light-weight to
2526  *	justify calling a softirq under NAPI, so we handle them specially in
2527  *	the interrupt handler.  The function is called with a pointer to a
2528  *	response, which the caller must ensure is a valid pure response.
2529  *
2530  *	Returns 1 if it encounters a valid data-carrying response, 0 otherwise.
2531  */
2532 static int process_pure_responses(struct adapter *adap, struct sge_qset *qs,
2533 				  struct rsp_desc *r)
2534 {
2535 	struct sge_rspq *q = &qs->rspq;
2536 	unsigned int sleeping = 0;
2537 
2538 	do {
2539 		u32 flags = ntohl(r->flags);
2540 
2541 		r++;
2542 		if (unlikely(++q->cidx == q->size)) {
2543 			q->cidx = 0;
2544 			q->gen ^= 1;
2545 			r = q->desc;
2546 		}
2547 		prefetch(r);
2548 
2549 		if (flags & RSPD_CTRL_MASK) {
2550 			sleeping |= flags & RSPD_GTS_MASK;
2551 			handle_rsp_cntrl_info(qs, flags);
2552 		}
2553 
2554 		q->pure_rsps++;
2555 		if (++q->credits >= (q->size / 4)) {
2556 			refill_rspq(adap, q, q->credits);
2557 			q->credits = 0;
2558 		}
2559 		if (!is_new_response(r, q))
2560 			break;
2561 		dma_rmb();
2562 	} while (is_pure_response(r));
2563 
2564 	if (sleeping)
2565 		check_ring_db(adap, qs, sleeping);
2566 
2567 	smp_mb();		/* commit Tx queue .processed updates */
2568 	if (unlikely(qs->txq_stopped != 0))
2569 		restart_tx(qs);
2570 
2571 	return is_new_response(r, q);
2572 }
2573 
2574 /**
2575  *	handle_responses - decide what to do with new responses in NAPI mode
2576  *	@adap: the adapter
2577  *	@q: the response queue
2578  *
2579  *	This is used by the NAPI interrupt handlers to decide what to do with
2580  *	new SGE responses.  If there are no new responses it returns -1.  If
2581  *	there are new responses and they are pure (i.e., non-data carrying)
2582  *	it handles them straight in hard interrupt context as they are very
2583  *	cheap and don't deliver any packets.  Finally, if there are any data
2584  *	signaling responses it schedules the NAPI handler.  Returns 1 if it
2585  *	schedules NAPI, 0 if all new responses were pure.
2586  *
2587  *	The caller must ascertain NAPI is not already running.
2588  */
2589 static inline int handle_responses(struct adapter *adap, struct sge_rspq *q)
2590 {
2591 	struct sge_qset *qs = rspq_to_qset(q);
2592 	struct rsp_desc *r = &q->desc[q->cidx];
2593 
2594 	if (!is_new_response(r, q))
2595 		return -1;
2596 	dma_rmb();
2597 	if (is_pure_response(r) && process_pure_responses(adap, qs, r) == 0) {
2598 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2599 			     V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx));
2600 		return 0;
2601 	}
2602 	napi_schedule(&qs->napi);
2603 	return 1;
2604 }
2605 
2606 /*
2607  * The MSI-X interrupt handler for an SGE response queue for the non-NAPI case
2608  * (i.e., response queue serviced in hard interrupt).
2609  */
2610 static irqreturn_t t3_sge_intr_msix(int irq, void *cookie)
2611 {
2612 	struct sge_qset *qs = cookie;
2613 	struct adapter *adap = qs->adap;
2614 	struct sge_rspq *q = &qs->rspq;
2615 
2616 	spin_lock(&q->lock);
2617 	if (process_responses(adap, qs, -1) == 0)
2618 		q->unhandled_irqs++;
2619 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2620 		     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2621 	spin_unlock(&q->lock);
2622 	return IRQ_HANDLED;
2623 }
2624 
2625 /*
2626  * The MSI-X interrupt handler for an SGE response queue for the NAPI case
2627  * (i.e., response queue serviced by NAPI polling).
2628  */
2629 static irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie)
2630 {
2631 	struct sge_qset *qs = cookie;
2632 	struct sge_rspq *q = &qs->rspq;
2633 
2634 	spin_lock(&q->lock);
2635 
2636 	if (handle_responses(qs->adap, q) < 0)
2637 		q->unhandled_irqs++;
2638 	spin_unlock(&q->lock);
2639 	return IRQ_HANDLED;
2640 }
2641 
2642 /*
2643  * The non-NAPI MSI interrupt handler.  This needs to handle data events from
2644  * SGE response queues as well as error and other async events as they all use
2645  * the same MSI vector.  We use one SGE response queue per port in this mode
2646  * and protect all response queues with queue 0's lock.
2647  */
2648 static irqreturn_t t3_intr_msi(int irq, void *cookie)
2649 {
2650 	int new_packets = 0;
2651 	struct adapter *adap = cookie;
2652 	struct sge_rspq *q = &adap->sge.qs[0].rspq;
2653 
2654 	spin_lock(&q->lock);
2655 
2656 	if (process_responses(adap, &adap->sge.qs[0], -1)) {
2657 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2658 			     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2659 		new_packets = 1;
2660 	}
2661 
2662 	if (adap->params.nports == 2 &&
2663 	    process_responses(adap, &adap->sge.qs[1], -1)) {
2664 		struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2665 
2666 		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q1->cntxt_id) |
2667 			     V_NEWTIMER(q1->next_holdoff) |
2668 			     V_NEWINDEX(q1->cidx));
2669 		new_packets = 1;
2670 	}
2671 
2672 	if (!new_packets && t3_slow_intr_handler(adap) == 0)
2673 		q->unhandled_irqs++;
2674 
2675 	spin_unlock(&q->lock);
2676 	return IRQ_HANDLED;
2677 }
2678 
2679 static int rspq_check_napi(struct sge_qset *qs)
2680 {
2681 	struct sge_rspq *q = &qs->rspq;
2682 
2683 	if (!napi_is_scheduled(&qs->napi) &&
2684 	    is_new_response(&q->desc[q->cidx], q)) {
2685 		napi_schedule(&qs->napi);
2686 		return 1;
2687 	}
2688 	return 0;
2689 }
2690 
2691 /*
2692  * The MSI interrupt handler for the NAPI case (i.e., response queues serviced
2693  * by NAPI polling).  Handles data events from SGE response queues as well as
2694  * error and other async events as they all use the same MSI vector.  We use
2695  * one SGE response queue per port in this mode and protect all response
2696  * queues with queue 0's lock.
2697  */
2698 static irqreturn_t t3_intr_msi_napi(int irq, void *cookie)
2699 {
2700 	int new_packets;
2701 	struct adapter *adap = cookie;
2702 	struct sge_rspq *q = &adap->sge.qs[0].rspq;
2703 
2704 	spin_lock(&q->lock);
2705 
2706 	new_packets = rspq_check_napi(&adap->sge.qs[0]);
2707 	if (adap->params.nports == 2)
2708 		new_packets += rspq_check_napi(&adap->sge.qs[1]);
2709 	if (!new_packets && t3_slow_intr_handler(adap) == 0)
2710 		q->unhandled_irqs++;
2711 
2712 	spin_unlock(&q->lock);
2713 	return IRQ_HANDLED;
2714 }
2715 
2716 /*
2717  * A helper function that processes responses and issues GTS.
2718  */
2719 static inline int process_responses_gts(struct adapter *adap,
2720 					struct sge_rspq *rq)
2721 {
2722 	int work;
2723 
2724 	work = process_responses(adap, rspq_to_qset(rq), -1);
2725 	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2726 		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2727 	return work;
2728 }
2729 
2730 /*
2731  * The legacy INTx interrupt handler.  This needs to handle data events from
2732  * SGE response queues as well as error and other async events as they all use
2733  * the same interrupt pin.  We use one SGE response queue per port in this mode
2734  * and protect all response queues with queue 0's lock.
2735  */
2736 static irqreturn_t t3_intr(int irq, void *cookie)
2737 {
2738 	int work_done, w0, w1;
2739 	struct adapter *adap = cookie;
2740 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2741 	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2742 
2743 	spin_lock(&q0->lock);
2744 
2745 	w0 = is_new_response(&q0->desc[q0->cidx], q0);
2746 	w1 = adap->params.nports == 2 &&
2747 	    is_new_response(&q1->desc[q1->cidx], q1);
2748 
2749 	if (likely(w0 | w1)) {
2750 		t3_write_reg(adap, A_PL_CLI, 0);
2751 		t3_read_reg(adap, A_PL_CLI);	/* flush */
2752 
2753 		if (likely(w0))
2754 			process_responses_gts(adap, q0);
2755 
2756 		if (w1)
2757 			process_responses_gts(adap, q1);
2758 
2759 		work_done = w0 | w1;
2760 	} else
2761 		work_done = t3_slow_intr_handler(adap);
2762 
2763 	spin_unlock(&q0->lock);
2764 	return IRQ_RETVAL(work_done != 0);
2765 }
2766 
2767 /*
2768  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2769  * Handles data events from SGE response queues as well as error and other
2770  * async events as they all use the same interrupt pin.  We use one SGE
2771  * response queue per port in this mode and protect all response queues with
2772  * queue 0's lock.
2773  */
2774 static irqreturn_t t3b_intr(int irq, void *cookie)
2775 {
2776 	u32 map;
2777 	struct adapter *adap = cookie;
2778 	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2779 
2780 	t3_write_reg(adap, A_PL_CLI, 0);
2781 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2782 
2783 	if (unlikely(!map))	/* shared interrupt, most likely */
2784 		return IRQ_NONE;
2785 
2786 	spin_lock(&q0->lock);
2787 
2788 	if (unlikely(map & F_ERRINTR))
2789 		t3_slow_intr_handler(adap);
2790 
2791 	if (likely(map & 1))
2792 		process_responses_gts(adap, q0);
2793 
2794 	if (map & 2)
2795 		process_responses_gts(adap, &adap->sge.qs[1].rspq);
2796 
2797 	spin_unlock(&q0->lock);
2798 	return IRQ_HANDLED;
2799 }
2800 
2801 /*
2802  * NAPI interrupt handler for legacy INTx interrupts for T3B-based cards.
2803  * Handles data events from SGE response queues as well as error and other
2804  * async events as they all use the same interrupt pin.  We use one SGE
2805  * response queue per port in this mode and protect all response queues with
2806  * queue 0's lock.
2807  */
2808 static irqreturn_t t3b_intr_napi(int irq, void *cookie)
2809 {
2810 	u32 map;
2811 	struct adapter *adap = cookie;
2812 	struct sge_qset *qs0 = &adap->sge.qs[0];
2813 	struct sge_rspq *q0 = &qs0->rspq;
2814 
2815 	t3_write_reg(adap, A_PL_CLI, 0);
2816 	map = t3_read_reg(adap, A_SG_DATA_INTR);
2817 
2818 	if (unlikely(!map))	/* shared interrupt, most likely */
2819 		return IRQ_NONE;
2820 
2821 	spin_lock(&q0->lock);
2822 
2823 	if (unlikely(map & F_ERRINTR))
2824 		t3_slow_intr_handler(adap);
2825 
2826 	if (likely(map & 1))
2827 		napi_schedule(&qs0->napi);
2828 
2829 	if (map & 2)
2830 		napi_schedule(&adap->sge.qs[1].napi);
2831 
2832 	spin_unlock(&q0->lock);
2833 	return IRQ_HANDLED;
2834 }
2835 
2836 /**
2837  *	t3_intr_handler - select the top-level interrupt handler
2838  *	@adap: the adapter
2839  *	@polling: whether using NAPI to service response queues
2840  *
2841  *	Selects the top-level interrupt handler based on the type of interrupts
2842  *	(MSI-X, MSI, or legacy) and whether NAPI will be used to service the
2843  *	response queues.
2844  */
2845 irq_handler_t t3_intr_handler(struct adapter *adap, int polling)
2846 {
2847 	if (adap->flags & USING_MSIX)
2848 		return polling ? t3_sge_intr_msix_napi : t3_sge_intr_msix;
2849 	if (adap->flags & USING_MSI)
2850 		return polling ? t3_intr_msi_napi : t3_intr_msi;
2851 	if (adap->params.rev > 0)
2852 		return polling ? t3b_intr_napi : t3b_intr;
2853 	return t3_intr;
2854 }
2855 
/* All parity error bits of the SGE interrupt cause register. */
#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
		    F_HIRCQPARITYERROR)
/* Request framing error bits. */
#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
/*
 * Conditions for which t3_sge_err_intr_handler() calls t3_fatal_err():
 * parity and framing errors, response queue credit overflow, and delivery
 * to a disabled response queue.
 */
#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
		      F_RSPQDISABLED)
2864 
2865 /**
2866  *	t3_sge_err_intr_handler - SGE async event interrupt handler
2867  *	@adapter: the adapter
2868  *
2869  *	Interrupt handler for SGE asynchronous (non-data) events.
2870  */
2871 void t3_sge_err_intr_handler(struct adapter *adapter)
2872 {
2873 	unsigned int v, status = t3_read_reg(adapter, A_SG_INT_CAUSE) &
2874 				 ~F_FLEMPTY;
2875 
2876 	if (status & SGE_PARERR)
2877 		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
2878 			 status & SGE_PARERR);
2879 	if (status & SGE_FRAMINGERR)
2880 		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
2881 			 status & SGE_FRAMINGERR);
2882 
2883 	if (status & F_RSPQCREDITOVERFOW)
2884 		CH_ALERT(adapter, "SGE response queue credit overflow\n");
2885 
2886 	if (status & F_RSPQDISABLED) {
2887 		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
2888 
2889 		CH_ALERT(adapter,
2890 			 "packet delivered to disabled response queue "
2891 			 "(0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff);
2892 	}
2893 
2894 	if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR))
2895 		queue_work(cxgb3_wq, &adapter->db_drop_task);
2896 
2897 	if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBFULL))
2898 		queue_work(cxgb3_wq, &adapter->db_full_task);
2899 
2900 	if (status & (F_HIPRIORITYDBEMPTY | F_LOPRIORITYDBEMPTY))
2901 		queue_work(cxgb3_wq, &adapter->db_empty_task);
2902 
2903 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
2904 	if (status &  SGE_FATALERR)
2905 		t3_fatal_err(adapter);
2906 }
2907 
2908 /**
2909  *	sge_timer_tx - perform periodic maintenance of an SGE qset
2910  *	@t: a timer list containing the SGE queue set to maintain
2911  *
2912  *	Runs periodically from a timer to perform maintenance of an SGE queue
2913  *	set.  It performs two tasks:
2914  *
2915  *	Cleans up any completed Tx descriptors that may still be pending.
2916  *	Normal descriptor cleanup happens when new packets are added to a Tx
2917  *	queue so this timer is relatively infrequent and does any cleanup only
2918  *	if the Tx queue has not seen any new packets in a while.  We make a
2919  *	best effort attempt to reclaim descriptors, in that we don't wait
2920  *	around if we cannot get a queue's lock (which most likely is because
2921  *	someone else is queueing new packets and so will also handle the clean
2922  *	up).  Since control queues use immediate data exclusively we don't
2923  *	bother cleaning them up here.
2924  *
2925  */
2926 static void sge_timer_tx(struct timer_list *t)
2927 {
2928 	struct sge_qset *qs = from_timer(qs, t, tx_reclaim_timer);
2929 	struct port_info *pi = netdev_priv(qs->netdev);
2930 	struct adapter *adap = pi->adapter;
2931 	unsigned int tbd[SGE_TXQ_PER_SET] = {0, 0};
2932 	unsigned long next_period;
2933 
2934 	if (__netif_tx_trylock(qs->tx_q)) {
2935                 tbd[TXQ_ETH] = reclaim_completed_tx(adap, &qs->txq[TXQ_ETH],
2936                                                      TX_RECLAIM_TIMER_CHUNK);
2937 		__netif_tx_unlock(qs->tx_q);
2938 	}
2939 
2940 	if (spin_trylock(&qs->txq[TXQ_OFLD].lock)) {
2941 		tbd[TXQ_OFLD] = reclaim_completed_tx(adap, &qs->txq[TXQ_OFLD],
2942 						     TX_RECLAIM_TIMER_CHUNK);
2943 		spin_unlock(&qs->txq[TXQ_OFLD].lock);
2944 	}
2945 
2946 	next_period = TX_RECLAIM_PERIOD >>
2947                       (max(tbd[TXQ_ETH], tbd[TXQ_OFLD]) /
2948                       TX_RECLAIM_TIMER_CHUNK);
2949 	mod_timer(&qs->tx_reclaim_timer, jiffies + next_period);
2950 }
2951 
2952 /**
2953  *	sge_timer_rx - perform periodic maintenance of an SGE qset
2954  *	@t: the timer list containing the SGE queue set to maintain
2955  *
2956  *	a) Replenishes Rx queues that have run out due to memory shortage.
2957  *	Normally new Rx buffers are added when existing ones are consumed but
2958  *	when out of memory a queue can become empty.  We try to add only a few
2959  *	buffers here, the queue will be replenished fully as these new buffers
2960  *	are used up if memory shortage has subsided.
2961  *
2962  *	b) Return coalesced response queue credits in case a response queue is
2963  *	starved.
2964  *
2965  */
2966 static void sge_timer_rx(struct timer_list *t)
2967 {
2968 	spinlock_t *lock;
2969 	struct sge_qset *qs = from_timer(qs, t, rx_reclaim_timer);
2970 	struct port_info *pi = netdev_priv(qs->netdev);
2971 	struct adapter *adap = pi->adapter;
2972 	u32 status;
2973 
2974 	lock = adap->params.rev > 0 ?
2975 	       &qs->rspq.lock : &adap->sge.qs[0].rspq.lock;
2976 
2977 	if (!spin_trylock_irq(lock))
2978 		goto out;
2979 
2980 	if (napi_is_scheduled(&qs->napi))
2981 		goto unlock;
2982 
2983 	if (adap->params.rev < 4) {
2984 		status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS);
2985 
2986 		if (status & (1 << qs->rspq.cntxt_id)) {
2987 			qs->rspq.starved++;
2988 			if (qs->rspq.credits) {
2989 				qs->rspq.credits--;
2990 				refill_rspq(adap, &qs->rspq, 1);
2991 				qs->rspq.restarted++;
2992 				t3_write_reg(adap, A_SG_RSPQ_FL_STATUS,
2993 					     1 << qs->rspq.cntxt_id);
2994 			}
2995 		}
2996 	}
2997 
2998 	if (qs->fl[0].credits < qs->fl[0].size)
2999 		__refill_fl(adap, &qs->fl[0]);
3000 	if (qs->fl[1].credits < qs->fl[1].size)
3001 		__refill_fl(adap, &qs->fl[1]);
3002 
3003 unlock:
3004 	spin_unlock_irq(lock);
3005 out:
3006 	mod_timer(&qs->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD);
3007 }
3008 
3009 /**
3010  *	t3_update_qset_coalesce - update coalescing settings for a queue set
3011  *	@qs: the SGE queue set
3012  *	@p: new queue set parameters
3013  *
3014  *	Update the coalescing settings for an SGE queue set.  Nothing is done
3015  *	if the queue set is not initialized yet.
3016  */
3017 void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
3018 {
3019 	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);/* can't be 0 */
3020 	qs->rspq.polling = p->polling;
3021 	qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll;
3022 }
3023 
3024 /**
3025  *	t3_sge_alloc_qset - initialize an SGE queue set
3026  *	@adapter: the adapter
3027  *	@id: the queue set id
3028  *	@nports: how many Ethernet ports will be using this queue set
3029  *	@irq_vec_idx: the IRQ vector index for response queue interrupts
3030  *	@p: configuration parameters for this queue set
3031  *	@ntxq: number of Tx queues for the queue set
3032  *	@dev: net device associated with this queue set
3033  *	@netdevq: net device TX queue associated with this queue set
3034  *
3035  *	Allocate resources and initialize an SGE queue set.  A queue set
3036  *	comprises a response queue, two Rx free-buffer queues, and up to 3
3037  *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
3038  *	queue, offload queue, and control queue.
3039  */
3040 int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
3041 		      int irq_vec_idx, const struct qset_params *p,
3042 		      int ntxq, struct net_device *dev,
3043 		      struct netdev_queue *netdevq)
3044 {
3045 	int i, avail, ret = -ENOMEM;
3046 	struct sge_qset *q = &adapter->sge.qs[id];
3047 
3048 	init_qset_cntxt(q, id);
3049 	timer_setup(&q->tx_reclaim_timer, sge_timer_tx, 0);
3050 	timer_setup(&q->rx_reclaim_timer, sge_timer_rx, 0);
3051 
3052 	q->fl[0].desc = alloc_ring(adapter->pdev, p->fl_size,
3053 				   sizeof(struct rx_desc),
3054 				   sizeof(struct rx_sw_desc),
3055 				   &q->fl[0].phys_addr, &q->fl[0].sdesc);
3056 	if (!q->fl[0].desc)
3057 		goto err;
3058 
3059 	q->fl[1].desc = alloc_ring(adapter->pdev, p->jumbo_size,
3060 				   sizeof(struct rx_desc),
3061 				   sizeof(struct rx_sw_desc),
3062 				   &q->fl[1].phys_addr, &q->fl[1].sdesc);
3063 	if (!q->fl[1].desc)
3064 		goto err;
3065 
3066 	q->rspq.desc = alloc_ring(adapter->pdev, p->rspq_size,
3067 				  sizeof(struct rsp_desc), 0,
3068 				  &q->rspq.phys_addr, NULL);
3069 	if (!q->rspq.desc)
3070 		goto err;
3071 
3072 	for (i = 0; i < ntxq; ++i) {
3073 		/*
3074 		 * The control queue always uses immediate data so does not
3075 		 * need to keep track of any sk_buffs.
3076 		 */
3077 		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
3078 
3079 		q->txq[i].desc = alloc_ring(adapter->pdev, p->txq_size[i],
3080 					    sizeof(struct tx_desc), sz,
3081 					    &q->txq[i].phys_addr,
3082 					    &q->txq[i].sdesc);
3083 		if (!q->txq[i].desc)
3084 			goto err;
3085 
3086 		q->txq[i].gen = 1;
3087 		q->txq[i].size = p->txq_size[i];
3088 		spin_lock_init(&q->txq[i].lock);
3089 		skb_queue_head_init(&q->txq[i].sendq);
3090 	}
3091 
3092 	INIT_WORK(&q->txq[TXQ_OFLD].qresume_task, restart_offloadq);
3093 	INIT_WORK(&q->txq[TXQ_CTRL].qresume_task, restart_ctrlq);
3094 
3095 	q->fl[0].gen = q->fl[1].gen = 1;
3096 	q->fl[0].size = p->fl_size;
3097 	q->fl[1].size = p->jumbo_size;
3098 
3099 	q->rspq.gen = 1;
3100 	q->rspq.size = p->rspq_size;
3101 	spin_lock_init(&q->rspq.lock);
3102 	skb_queue_head_init(&q->rspq.rx_queue);
3103 
3104 	q->txq[TXQ_ETH].stop_thres = nports *
3105 	    flits_to_desc(sgl_len(MAX_SKB_FRAGS + 1) + 3);
3106 
3107 #if FL0_PG_CHUNK_SIZE > 0
3108 	q->fl[0].buf_size = FL0_PG_CHUNK_SIZE;
3109 #else
3110 	q->fl[0].buf_size = SGE_RX_SM_BUF_SIZE + sizeof(struct cpl_rx_data);
3111 #endif
3112 #if FL1_PG_CHUNK_SIZE > 0
3113 	q->fl[1].buf_size = FL1_PG_CHUNK_SIZE;
3114 #else
3115 	q->fl[1].buf_size = is_offload(adapter) ?
3116 		(16 * 1024) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
3117 		MAX_FRAME_SIZE + 2 + sizeof(struct cpl_rx_pkt);
3118 #endif
3119 
3120 	q->fl[0].use_pages = FL0_PG_CHUNK_SIZE > 0;
3121 	q->fl[1].use_pages = FL1_PG_CHUNK_SIZE > 0;
3122 	q->fl[0].order = FL0_PG_ORDER;
3123 	q->fl[1].order = FL1_PG_ORDER;
3124 	q->fl[0].alloc_size = FL0_PG_ALLOC_SIZE;
3125 	q->fl[1].alloc_size = FL1_PG_ALLOC_SIZE;
3126 
3127 	spin_lock_irq(&adapter->sge.reg_lock);
3128 
3129 	/* FL threshold comparison uses < */
3130 	ret = t3_sge_init_rspcntxt(adapter, q->rspq.cntxt_id, irq_vec_idx,
3131 				   q->rspq.phys_addr, q->rspq.size,
3132 				   q->fl[0].buf_size - SGE_PG_RSVD, 1, 0);
3133 	if (ret)
3134 		goto err_unlock;
3135 
3136 	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
3137 		ret = t3_sge_init_flcntxt(adapter, q->fl[i].cntxt_id, 0,
3138 					  q->fl[i].phys_addr, q->fl[i].size,
3139 					  q->fl[i].buf_size - SGE_PG_RSVD,
3140 					  p->cong_thres, 1, 0);
3141 		if (ret)
3142 			goto err_unlock;
3143 	}
3144 
3145 	ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
3146 				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
3147 				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
3148 				 1, 0);
3149 	if (ret)
3150 		goto err_unlock;
3151 
3152 	if (ntxq > 1) {
3153 		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_OFLD].cntxt_id,
3154 					 USE_GTS, SGE_CNTXT_OFLD, id,
3155 					 q->txq[TXQ_OFLD].phys_addr,
3156 					 q->txq[TXQ_OFLD].size, 0, 1, 0);
3157 		if (ret)
3158 			goto err_unlock;
3159 	}
3160 
3161 	if (ntxq > 2) {
3162 		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_CTRL].cntxt_id, 0,
3163 					 SGE_CNTXT_CTRL, id,
3164 					 q->txq[TXQ_CTRL].phys_addr,
3165 					 q->txq[TXQ_CTRL].size,
3166 					 q->txq[TXQ_CTRL].token, 1, 0);
3167 		if (ret)
3168 			goto err_unlock;
3169 	}
3170 
3171 	spin_unlock_irq(&adapter->sge.reg_lock);
3172 
3173 	q->adap = adapter;
3174 	q->netdev = dev;
3175 	q->tx_q = netdevq;
3176 	t3_update_qset_coalesce(q, p);
3177 
3178 	avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
3179 			  GFP_KERNEL | __GFP_COMP);
3180 	if (!avail) {
3181 		CH_ALERT(adapter, "free list queue 0 initialization failed\n");
3182 		ret = -ENOMEM;
3183 		goto err;
3184 	}
3185 	if (avail < q->fl[0].size)
3186 		CH_WARN(adapter, "free list queue 0 enabled with %d credits\n",
3187 			avail);
3188 
3189 	avail = refill_fl(adapter, &q->fl[1], q->fl[1].size,
3190 			  GFP_KERNEL | __GFP_COMP);
3191 	if (avail < q->fl[1].size)
3192 		CH_WARN(adapter, "free list queue 1 enabled with %d credits\n",
3193 			avail);
3194 	refill_rspq(adapter, &q->rspq, q->rspq.size - 1);
3195 
3196 	t3_write_reg(adapter, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
3197 		     V_NEWTIMER(q->rspq.holdoff_tmr));
3198 
3199 	return 0;
3200 
3201 err_unlock:
3202 	spin_unlock_irq(&adapter->sge.reg_lock);
3203 err:
3204 	t3_free_qset(adapter, q);
3205 	return ret;
3206 }
3207 
3208 /**
3209  *      t3_start_sge_timers - start SGE timer call backs
3210  *      @adap: the adapter
3211  *
3212  *      Starts each SGE queue set's timer call back
3213  */
3214 void t3_start_sge_timers(struct adapter *adap)
3215 {
3216 	int i;
3217 
3218 	for (i = 0; i < SGE_QSETS; ++i) {
3219 		struct sge_qset *q = &adap->sge.qs[i];
3220 
3221 		if (q->tx_reclaim_timer.function)
3222 			mod_timer(&q->tx_reclaim_timer,
3223 				  jiffies + TX_RECLAIM_PERIOD);
3224 
3225 		if (q->rx_reclaim_timer.function)
3226 			mod_timer(&q->rx_reclaim_timer,
3227 				  jiffies + RX_RECLAIM_PERIOD);
3228 	}
3229 }
3230 
3231 /**
3232  *	t3_stop_sge_timers - stop SGE timer call backs
3233  *	@adap: the adapter
3234  *
3235  *	Stops each SGE queue set's timer call back
3236  */
3237 void t3_stop_sge_timers(struct adapter *adap)
3238 {
3239 	int i;
3240 
3241 	for (i = 0; i < SGE_QSETS; ++i) {
3242 		struct sge_qset *q = &adap->sge.qs[i];
3243 
3244 		if (q->tx_reclaim_timer.function)
3245 			del_timer_sync(&q->tx_reclaim_timer);
3246 		if (q->rx_reclaim_timer.function)
3247 			del_timer_sync(&q->rx_reclaim_timer);
3248 	}
3249 }
3250 
3251 /**
3252  *	t3_free_sge_resources - free SGE resources
3253  *	@adap: the adapter
3254  *
3255  *	Frees resources used by the SGE queue sets.
3256  */
3257 void t3_free_sge_resources(struct adapter *adap)
3258 {
3259 	int i;
3260 
3261 	for (i = 0; i < SGE_QSETS; ++i)
3262 		t3_free_qset(adap, &adap->sge.qs[i]);
3263 }
3264 
3265 /**
3266  *	t3_sge_start - enable SGE
3267  *	@adap: the adapter
3268  *
3269  *	Enables the SGE for DMAs.  This is the last step in starting packet
3270  *	transfers.
3271  */
3272 void t3_sge_start(struct adapter *adap)
3273 {
3274 	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
3275 }
3276 
3277 /**
3278  *	t3_sge_stop_dma - Disable SGE DMA engine operation
3279  *	@adap: the adapter
3280  *
3281  *	Can be invoked from interrupt context e.g.  error handler.
3282  *
3283  *	Note that this function cannot disable the restart of works as
3284  *	it cannot wait if called from interrupt context, however the
3285  *	works will have no effect since the doorbells are disabled. The
3286  *	driver will call tg3_sge_stop() later from process context, at
3287  *	which time the works will be stopped if they are still running.
3288  */
3289 void t3_sge_stop_dma(struct adapter *adap)
3290 {
3291 	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, 0);
3292 }
3293 
3294 /**
3295  *	t3_sge_stop - disable SGE operation completly
3296  *	@adap: the adapter
3297  *
3298  *	Called from process context. Disables the DMA engine and any
3299  *	pending queue restart works.
3300  */
3301 void t3_sge_stop(struct adapter *adap)
3302 {
3303 	int i;
3304 
3305 	t3_sge_stop_dma(adap);
3306 
3307 	/* workqueues aren't initialized otherwise */
3308 	if (!(adap->flags & FULL_INIT_DONE))
3309 		return;
3310 	for (i = 0; i < SGE_QSETS; ++i) {
3311 		struct sge_qset *qs = &adap->sge.qs[i];
3312 
3313 		cancel_work_sync(&qs->txq[TXQ_OFLD].qresume_task);
3314 		cancel_work_sync(&qs->txq[TXQ_CTRL].qresume_task);
3315 	}
3316 }
3317 
3318 /**
3319  *	t3_sge_init - initialize SGE
3320  *	@adap: the adapter
3321  *	@p: the SGE parameters
3322  *
3323  *	Performs SGE initialization needed every time after a chip reset.
3324  *	We do not initialize any of the queue sets here, instead the driver
3325  *	top-level must request those individually.  We also do not enable DMA
3326  *	here, that should be done after the queues have been set up.
3327  */
3328 void t3_sge_init(struct adapter *adap, struct sge_params *p)
3329 {
3330 	unsigned int ctrl, ups = ffs(pci_resource_len(adap->pdev, 2) >> 12);
3331 
3332 	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
3333 	    F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
3334 	    V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
3335 	    V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
3336 #if SGE_NUM_GENBITS == 1
3337 	ctrl |= F_EGRGENCTRL;
3338 #endif
3339 	if (adap->params.rev > 0) {
3340 		if (!(adap->flags & (USING_MSIX | USING_MSI)))
3341 			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
3342 	}
3343 	t3_write_reg(adap, A_SG_CONTROL, ctrl);
3344 	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
3345 		     V_LORCQDRBTHRSH(512));
3346 	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
3347 	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
3348 		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
3349 	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
3350 		     adap->params.rev < T3_REV_C ? 1000 : 500);
3351 	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
3352 	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
3353 	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
3354 	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
3355 	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
3356 }
3357 
3358 /**
3359  *	t3_sge_prep - one-time SGE initialization
3360  *	@adap: the associated adapter
3361  *	@p: SGE parameters
3362  *
3363  *	Performs one-time initialization of SGE SW state.  Includes determining
3364  *	defaults for the assorted SGE parameters, which admins can change until
3365  *	they are used to initialize the SGE.
3366  */
3367 void t3_sge_prep(struct adapter *adap, struct sge_params *p)
3368 {
3369 	int i;
3370 
3371 	p->max_pkt_size = (16 * 1024) - sizeof(struct cpl_rx_data) -
3372 	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
3373 
3374 	for (i = 0; i < SGE_QSETS; ++i) {
3375 		struct qset_params *q = p->qset + i;
3376 
3377 		q->polling = adap->params.rev > 0;
3378 		q->coalesce_usecs = 5;
3379 		q->rspq_size = 1024;
3380 		q->fl_size = 1024;
3381 		q->jumbo_size = 512;
3382 		q->txq_size[TXQ_ETH] = 1024;
3383 		q->txq_size[TXQ_OFLD] = 1024;
3384 		q->txq_size[TXQ_CTRL] = 256;
3385 		q->cong_thres = 0;
3386 	}
3387 
3388 	spin_lock_init(&adap->sge.reg_lock);
3389 }
3390