xref: /openbmc/linux/net/sched/sch_sfq.c (revision 99fee508)
1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/pkt_cls.h>
27 #include <net/red.h>
28 
29 
30 /*	Stochastic Fairness Queuing algorithm.
31 	=======================================
32 
33 	Source:
34 	Paul E. McKenney "Stochastic Fairness Queuing",
35 	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.
36 
37 	Paul E. McKenney "Stochastic Fairness Queuing",
38 	"Internetworking: Research and Experience", v.2, 1991, pp. 113-131.
39 
40 
41 	See also:
42 	M. Shreedhar and George Varghese "Efficient Fair
43 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
44 
45 
46 	This is not the thing that is usually called (W)FQ nowadays.
47 	It does not use any timestamp mechanism, but instead
48 	processes queues in round-robin order.
49 
50 	ADVANTAGE:
51 
52 	- It is very cheap. Both CPU and memory requirements are minimal.
53 
54 	DRAWBACKS:
55 
56 	- "Stochastic" -> It is not 100% fair.
57 	When hash collisions occur, several flows are considered as one.
58 
59 	- "Round-robin" -> It introduces larger delays than virtual clock
60 	based schemes, and should not be used for isolating interactive
61 	traffic from non-interactive traffic. This means that this scheduler
62 	should be used as a leaf of CBQ or P3, which put interactive traffic
63 	into a higher priority band.
64 
65 	We still need true WFQ for top level CSZ, but using WFQ
66 	for the best effort traffic is absolutely pointless:
67 	SFQ is superior for this purpose.
68 
69 	IMPLEMENTATION:
70 	This implementation limits:
71 	- maximal queue length per flow to 127 packets;
72 	- max MTU to 2^18 - 1;
73 	- max number of flows to 65408;
74 	- number of hash buckets to 65536.
75 
76 	It is easy to increase these values, but not in flight.  */
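/* Example usage (illustrative; the device name eth0 is hypothetical):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * attaches SFQ with the defaults below and re-seeds the flow hash every
 * 10 seconds via the perturbation timer (see sfq_perturbation()).
 */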
77 
78 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
79 #define SFQ_DEFAULT_FLOWS	128
80 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
81 #define SFQ_EMPTY_SLOT		0xffff
82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 
84 /* We use 16 bits to store allot, and want to handle packets up to 64K.
85  * Scale allot by 8 (1<<3) so that no overflow occurs.
86  */
87 #define SFQ_ALLOT_SHIFT		3
88 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
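/* Illustrative arithmetic: with a quantum of 1514 bytes (a typical Ethernet
 * frame, cf. psched_mtu() in sfq_init()), SFQ_ALLOT_SIZE(1514) =
 * DIV_ROUND_UP(1514, 8) = 190, and even a maximal 64K packet scales to
 * DIV_ROUND_UP(65535, 8) = 8192, which fits easily in the signed 16-bit
 * 'allot' field of struct sfq_slot.
 */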
89 
90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
91 typedef u16 sfq_index;
92 
93 /*
94  * We don't use pointers to save space.
95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
97  * are 'pointers' to dep[] array
98  */
99 struct sfq_head {
100 	sfq_index	next;
101 	sfq_index	prev;
102 };
103 
104 struct sfq_slot {
105 	struct sk_buff	*skblist_next;
106 	struct sk_buff	*skblist_prev;
107 	sfq_index	qlen; /* number of skbs in skblist */
108 	sfq_index	next; /* next slot in sfq RR chain */
109 	struct sfq_head dep; /* anchor in dep[] chains */
110 	unsigned short	hash; /* hash value (index in ht[]) */
111 	short		allot; /* credit for this slot */
112 
113 	unsigned int    backlog;
114 	struct red_vars vars;
115 };
116 
117 struct sfq_sched_data {
118 /* frequently used fields */
119 	int		limit;		/* limit of total number of packets in this qdisc */
120 	unsigned int	divisor;	/* number of slots in hash table */
121 	u8		headdrop;
122 	u8		maxdepth;	/* limit of packets per flow */
123 
124 	u32		perturbation;
125 	u8		cur_depth;	/* depth of longest slot */
126 	u8		flags;
127 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128 	struct tcf_proto __rcu *filter_list;
129 	struct tcf_block *block;
130 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
131 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
132 
133 	struct red_parms *red_parms;
134 	struct tc_sfqred_stats stats;
135 	struct sfq_slot *tail;		/* current slot in round */
136 
137 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
138 					/* Linked lists of slots, indexed by depth
139 					 * dep[0] : list of unused flows
140 					 * dep[1] : list of flows with 1 packet
141 					 * dep[X] : list of flows with X packets
142 					 */
143 
144 	unsigned int	maxflows;	/* number of flows in flows array */
145 	int		perturb_period;
146 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
147 	struct timer_list perturb_timer;
148 };
149 
150 /*
151  * sfq_head are either in a sfq_slot or in dep[] array
152  */
153 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
154 {
155 	if (val < SFQ_MAX_FLOWS)
156 		return &q->slots[val].dep;
157 	return &q->dep[val - SFQ_MAX_FLOWS];
158 }
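/* For example, sfq_dep_head(q, 3) resolves to &q->slots[3].dep, while
 * sfq_dep_head(q, SFQ_MAX_FLOWS + 2) resolves to &q->dep[2], the anchor of
 * the list of flows that currently hold exactly two packets.
 */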
159 
160 static unsigned int sfq_hash(const struct sfq_sched_data *q,
161 			     const struct sk_buff *skb)
162 {
163 	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
164 }
165 
166 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
167 				 int *qerr)
168 {
169 	struct sfq_sched_data *q = qdisc_priv(sch);
170 	struct tcf_result res;
171 	struct tcf_proto *fl;
172 	int result;
173 
174 	if (TC_H_MAJ(skb->priority) == sch->handle &&
175 	    TC_H_MIN(skb->priority) > 0 &&
176 	    TC_H_MIN(skb->priority) <= q->divisor)
177 		return TC_H_MIN(skb->priority);
178 
179 	fl = rcu_dereference_bh(q->filter_list);
180 	if (!fl)
181 		return sfq_hash(q, skb) + 1;
182 
183 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
184 	result = tcf_classify(skb, fl, &res, false);
185 	if (result >= 0) {
186 #ifdef CONFIG_NET_CLS_ACT
187 		switch (result) {
188 		case TC_ACT_STOLEN:
189 		case TC_ACT_QUEUED:
190 		case TC_ACT_TRAP:
191 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
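			/* fall through */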
192 		case TC_ACT_SHOT:
193 			return 0;
194 		}
195 #endif
196 		if (TC_H_MIN(res.classid) <= q->divisor)
197 			return TC_H_MIN(res.classid);
198 	}
199 	return 0;
200 }
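/* Note: sfq_classify() returns a 1-based bucket number; 0 means "drop this
 * packet" (with *qerr set accordingly), and 1..divisor index the hash table
 * once the caller subtracts one, as sfq_enqueue() does with 'hash--'.
 */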
201 
202 /*
203  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
204  */
205 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
206 {
207 	sfq_index p, n;
208 	struct sfq_slot *slot = &q->slots[x];
209 	int qlen = slot->qlen;
210 
211 	p = qlen + SFQ_MAX_FLOWS;
212 	n = q->dep[qlen].next;
213 
214 	slot->dep.next = n;
215 	slot->dep.prev = p;
216 
217 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
218 	sfq_dep_head(q, n)->prev = x;
219 }
220 
221 #define sfq_unlink(q, x, n, p)			\
222 	do {					\
223 		n = q->slots[x].dep.next;	\
224 		p = q->slots[x].dep.prev;	\
225 		sfq_dep_head(q, p)->next = n;	\
226 		sfq_dep_head(q, n)->prev = p;	\
227 	} while (0)
228 
229 
230 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
231 {
232 	sfq_index p, n;
233 	int d;
234 
235 	sfq_unlink(q, x, n, p);
236 
237 	d = q->slots[x].qlen--;
238 	if (n == p && q->cur_depth == d)
239 		q->cur_depth--;
240 	sfq_link(q, x);
241 }
242 
243 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
244 {
245 	sfq_index p, n;
246 	int d;
247 
248 	sfq_unlink(q, x, n, p);
249 
250 	d = ++q->slots[x].qlen;
251 	if (q->cur_depth < d)
252 		q->cur_depth = d;
253 	sfq_link(q, x);
254 }
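/* Example: when a slot grows from two to three queued packets, sfq_inc()
 * unlinks it from the dep[2] list and re-links it at the head of dep[3],
 * raising q->cur_depth to 3 if it was smaller; sfq_drop() can then locate
 * a longest flow in O(1) via q->dep[q->cur_depth].next.
 */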
255 
256 /* helper functions : might be changed when/if skb use a standard list_head */
257 
258 /* remove one skb from tail of slot queue */
259 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
260 {
261 	struct sk_buff *skb = slot->skblist_prev;
262 
263 	slot->skblist_prev = skb->prev;
264 	skb->prev->next = (struct sk_buff *)slot;
265 	skb->next = skb->prev = NULL;
266 	return skb;
267 }
268 
269 /* remove one skb from head of slot queue */
270 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
271 {
272 	struct sk_buff *skb = slot->skblist_next;
273 
274 	slot->skblist_next = skb->next;
275 	skb->next->prev = (struct sk_buff *)slot;
276 	skb->next = skb->prev = NULL;
277 	return skb;
278 }
279 
280 static inline void slot_queue_init(struct sfq_slot *slot)
281 {
282 	memset(slot, 0, sizeof(*slot));
283 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
284 }
285 
286 /* add skb to slot queue (tail add) */
287 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
288 {
289 	skb->prev = slot->skblist_prev;
290 	skb->next = (struct sk_buff *)slot;
291 	slot->skblist_prev->next = skb;
292 	slot->skblist_prev = skb;
293 }
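/* Note: these helpers rely on skblist_next/skblist_prev being the first two
 * fields of struct sfq_slot, mirroring the next/prev pointers at the start
 * of struct sk_buff, so the slot itself (cast to struct sk_buff *) acts as
 * the sentinel of its circular skb list; an empty slot points to itself.
 */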
294 
295 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
296 {
297 	struct sfq_sched_data *q = qdisc_priv(sch);
298 	sfq_index x, d = q->cur_depth;
299 	struct sk_buff *skb;
300 	unsigned int len;
301 	struct sfq_slot *slot;
302 
303 	/* Queue is full! Find the longest slot and drop a packet from it:
	 * the tail packet by default, or the head packet if q->headdrop is set.
	 */
304 	if (d > 1) {
305 		x = q->dep[d].next;
306 		slot = &q->slots[x];
307 drop:
308 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
309 		len = qdisc_pkt_len(skb);
310 		slot->backlog -= len;
311 		sfq_dec(q, x);
312 		sch->q.qlen--;
313 		qdisc_qstats_backlog_dec(sch, skb);
314 		qdisc_drop(skb, sch, to_free);
315 		return len;
316 	}
317 
318 	if (d == 1) {
319 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
320 		x = q->tail->next;
321 		slot = &q->slots[x];
322 		q->tail->next = slot->next;
323 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
324 		goto drop;
325 	}
326 
327 	return 0;
328 }
329 
330 /* Is the ECN parameter configured? */
331 static int sfq_prob_mark(const struct sfq_sched_data *q)
332 {
333 	return q->flags & TC_RED_ECN;
334 }
335 
336 /* Should packets over the max threshold just be marked? */
337 static int sfq_hard_mark(const struct sfq_sched_data *q)
338 {
339 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
340 }
341 
342 static int sfq_headdrop(const struct sfq_sched_data *q)
343 {
344 	return q->headdrop;
345 }
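/* Resulting policy, derived from the two helpers above: TC_RED_ECN alone
 * marks in both the probabilistic and the hard region; TC_RED_ECN plus
 * TC_RED_HARDDROP marks probabilistically but drops at the hard threshold;
 * with neither flag set, both regions drop.
 */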
346 
347 static int
348 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
349 {
350 	struct sfq_sched_data *q = qdisc_priv(sch);
351 	unsigned int hash, dropped;
352 	sfq_index x, qlen;
353 	struct sfq_slot *slot;
354 	int uninitialized_var(ret);
355 	struct sk_buff *head;
356 	int delta;
357 
358 	hash = sfq_classify(skb, sch, &ret);
359 	if (hash == 0) {
360 		if (ret & __NET_XMIT_BYPASS)
361 			qdisc_qstats_drop(sch);
362 		__qdisc_drop(skb, to_free);
363 		return ret;
364 	}
365 	hash--;
366 
367 	x = q->ht[hash];
368 	slot = &q->slots[x];
369 	if (x == SFQ_EMPTY_SLOT) {
370 		x = q->dep[0].next; /* get a free slot */
371 		if (x >= SFQ_MAX_FLOWS)
372 			return qdisc_drop(skb, sch, to_free);
373 		q->ht[hash] = x;
374 		slot = &q->slots[x];
375 		slot->hash = hash;
376 		slot->backlog = 0; /* should already be 0 anyway... */
377 		red_set_vars(&slot->vars);
378 		goto enqueue;
379 	}
380 	if (q->red_parms) {
381 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
382 							&slot->vars,
383 							slot->backlog);
384 		switch (red_action(q->red_parms,
385 				   &slot->vars,
386 				   slot->vars.qavg)) {
387 		case RED_DONT_MARK:
388 			break;
389 
390 		case RED_PROB_MARK:
391 			qdisc_qstats_overlimit(sch);
392 			if (sfq_prob_mark(q)) {
393 				/* We know we have at least one packet in queue */
394 				if (sfq_headdrop(q) &&
395 				    INET_ECN_set_ce(slot->skblist_next)) {
396 					q->stats.prob_mark_head++;
397 					break;
398 				}
399 				if (INET_ECN_set_ce(skb)) {
400 					q->stats.prob_mark++;
401 					break;
402 				}
403 			}
404 			q->stats.prob_drop++;
405 			goto congestion_drop;
406 
407 		case RED_HARD_MARK:
408 			qdisc_qstats_overlimit(sch);
409 			if (sfq_hard_mark(q)) {
410 				/* We know we have at least one packet in queue */
411 				if (sfq_headdrop(q) &&
412 				    INET_ECN_set_ce(slot->skblist_next)) {
413 					q->stats.forced_mark_head++;
414 					break;
415 				}
416 				if (INET_ECN_set_ce(skb)) {
417 					q->stats.forced_mark++;
418 					break;
419 				}
420 			}
421 			q->stats.forced_drop++;
422 			goto congestion_drop;
423 		}
424 	}
425 
426 	if (slot->qlen >= q->maxdepth) {
427 congestion_drop:
428 		if (!sfq_headdrop(q))
429 			return qdisc_drop(skb, sch, to_free);
430 
431 		/* We know we have at least one packet in queue */
432 		head = slot_dequeue_head(slot);
433 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
434 		sch->qstats.backlog -= delta;
435 		slot->backlog -= delta;
436 		qdisc_drop(head, sch, to_free);
437 
438 		slot_queue_add(slot, skb);
439 		qdisc_tree_reduce_backlog(sch, 0, delta);
440 		return NET_XMIT_CN;
441 	}
442 
443 enqueue:
444 	qdisc_qstats_backlog_inc(sch, skb);
445 	slot->backlog += qdisc_pkt_len(skb);
446 	slot_queue_add(slot, skb);
447 	sfq_inc(q, x);
448 	if (slot->qlen == 1) {		/* The flow is new */
449 		if (q->tail == NULL) {	/* It is the first flow */
450 			slot->next = x;
451 		} else {
452 			slot->next = q->tail->next;
453 			q->tail->next = x;
454 		}
455 		/* We put this flow at the end of our flow list.
456 		 * This might sound unfair for a new flow to wait after old ones,
457 		 * but otherwise we could end up servicing only new flows and freezing old ones.
458 		 */
459 		q->tail = slot;
460 		/* We could use a bigger initial quantum for new flows */
461 		slot->allot = q->scaled_quantum;
462 	}
463 	if (++sch->q.qlen <= q->limit)
464 		return NET_XMIT_SUCCESS;
465 
466 	qlen = slot->qlen;
467 	dropped = sfq_drop(sch, to_free);
468 	/* Return Congestion Notification only if we dropped a packet
469 	 * from this flow.
470 	 */
471 	if (qlen != slot->qlen) {
472 		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
473 		return NET_XMIT_CN;
474 	}
475 
476 	/* As we dropped a packet, better let upper stack know this */
477 	qdisc_tree_reduce_backlog(sch, 1, dropped);
478 	return NET_XMIT_SUCCESS;
479 }
480 
481 static struct sk_buff *
482 sfq_dequeue(struct Qdisc *sch)
483 {
484 	struct sfq_sched_data *q = qdisc_priv(sch);
485 	struct sk_buff *skb;
486 	sfq_index a, next_a;
487 	struct sfq_slot *slot;
488 
489 	/* No active slots */
490 	if (q->tail == NULL)
491 		return NULL;
492 
493 next_slot:
494 	a = q->tail->next;
495 	slot = &q->slots[a];
496 	if (slot->allot <= 0) {
497 		q->tail = slot;
498 		slot->allot += q->scaled_quantum;
499 		goto next_slot;
500 	}
501 	skb = slot_dequeue_head(slot);
502 	sfq_dec(q, a);
503 	qdisc_bstats_update(sch, skb);
504 	sch->q.qlen--;
505 	qdisc_qstats_backlog_dec(sch, skb);
506 	slot->backlog -= qdisc_pkt_len(skb);
507 	/* Is the slot empty? */
508 	if (slot->qlen == 0) {
509 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
510 		next_a = slot->next;
511 		if (a == next_a) {
512 			q->tail = NULL; /* no more active slots */
513 			return skb;
514 		}
515 		q->tail->next = next_a;
516 	} else {
517 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
518 	}
519 	return skb;
520 }
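/* Illustrative round, assuming quantum = 1514 (scaled_quantum = 190):
 * dequeuing a 1514-byte packet costs the slot SFQ_ALLOT_SIZE(1514) = 190
 * credits; once allot falls to zero or below, the next visit skips the slot
 * after recharging it with scaled_quantum, so each active flow sends about
 * one quantum of bytes per round.
 */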
521 
522 static void
523 sfq_reset(struct Qdisc *sch)
524 {
525 	struct sk_buff *skb;
526 
527 	while ((skb = sfq_dequeue(sch)) != NULL)
528 		rtnl_kfree_skbs(skb, skb);
529 }
530 
531 /*
532  * When q->perturbation is changed, we rehash all queued skbs
533  * to avoid OOO (Out Of Order) effects.
534  * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
535  * counters.
536  */
537 static void sfq_rehash(struct Qdisc *sch)
538 {
539 	struct sfq_sched_data *q = qdisc_priv(sch);
540 	struct sk_buff *skb;
541 	int i;
542 	struct sfq_slot *slot;
543 	struct sk_buff_head list;
544 	int dropped = 0;
545 	unsigned int drop_len = 0;
546 
547 	__skb_queue_head_init(&list);
548 
549 	for (i = 0; i < q->maxflows; i++) {
550 		slot = &q->slots[i];
551 		if (!slot->qlen)
552 			continue;
553 		while (slot->qlen) {
554 			skb = slot_dequeue_head(slot);
555 			sfq_dec(q, i);
556 			__skb_queue_tail(&list, skb);
557 		}
558 		slot->backlog = 0;
559 		red_set_vars(&slot->vars);
560 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
561 	}
562 	q->tail = NULL;
563 
564 	while ((skb = __skb_dequeue(&list)) != NULL) {
565 		unsigned int hash = sfq_hash(q, skb);
566 		sfq_index x = q->ht[hash];
567 
568 		slot = &q->slots[x];
569 		if (x == SFQ_EMPTY_SLOT) {
570 			x = q->dep[0].next; /* get a free slot */
571 			if (x >= SFQ_MAX_FLOWS) {
572 drop:
573 				qdisc_qstats_backlog_dec(sch, skb);
574 				drop_len += qdisc_pkt_len(skb);
575 				kfree_skb(skb);
576 				dropped++;
577 				continue;
578 			}
579 			q->ht[hash] = x;
580 			slot = &q->slots[x];
581 			slot->hash = hash;
582 		}
583 		if (slot->qlen >= q->maxdepth)
584 			goto drop;
585 		slot_queue_add(slot, skb);
586 		if (q->red_parms)
587 			slot->vars.qavg = red_calc_qavg(q->red_parms,
588 							&slot->vars,
589 							slot->backlog);
590 		slot->backlog += qdisc_pkt_len(skb);
591 		sfq_inc(q, x);
592 		if (slot->qlen == 1) {		/* The flow is new */
593 			if (q->tail == NULL) {	/* It is the first flow */
594 				slot->next = x;
595 			} else {
596 				slot->next = q->tail->next;
597 				q->tail->next = x;
598 			}
599 			q->tail = slot;
600 			slot->allot = q->scaled_quantum;
601 		}
602 	}
603 	sch->q.qlen -= dropped;
604 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
605 }
606 
607 static void sfq_perturbation(unsigned long arg)
608 {
609 	struct Qdisc *sch = (struct Qdisc *)arg;
610 	struct sfq_sched_data *q = qdisc_priv(sch);
611 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
612 
613 	spin_lock(root_lock);
614 	q->perturbation = prandom_u32();
615 	if (!q->filter_list && q->tail)
616 		sfq_rehash(sch);
617 	spin_unlock(root_lock);
618 
619 	if (q->perturb_period)
620 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
621 }
622 
623 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
624 {
625 	struct sfq_sched_data *q = qdisc_priv(sch);
626 	struct tc_sfq_qopt *ctl = nla_data(opt);
627 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
628 	unsigned int qlen, dropped = 0;
629 	struct red_parms *p = NULL;
630 	struct sk_buff *to_free = NULL;
631 	struct sk_buff *tail = NULL;
632 
633 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
634 		return -EINVAL;
635 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
636 		ctl_v1 = nla_data(opt);
637 	if (ctl->divisor &&
638 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
639 		return -EINVAL;
640 	if (ctl_v1 && ctl_v1->qth_min) {
641 		p = kmalloc(sizeof(*p), GFP_KERNEL);
642 		if (!p)
643 			return -ENOMEM;
644 	}
645 	sch_tree_lock(sch);
646 	if (ctl->quantum) {
647 		q->quantum = ctl->quantum;
648 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
649 	}
650 	q->perturb_period = ctl->perturb_period * HZ;
651 	if (ctl->flows)
652 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
653 	if (ctl->divisor) {
654 		q->divisor = ctl->divisor;
655 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
656 	}
657 	if (ctl_v1) {
658 		if (ctl_v1->depth)
659 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
660 		if (p) {
661 			swap(q->red_parms, p);
662 			red_set_parms(q->red_parms,
663 				      ctl_v1->qth_min, ctl_v1->qth_max,
664 				      ctl_v1->Wlog,
665 				      ctl_v1->Plog, ctl_v1->Scell_log,
666 				      NULL,
667 				      ctl_v1->max_P);
668 		}
669 		q->flags = ctl_v1->flags;
670 		q->headdrop = ctl_v1->headdrop;
671 	}
672 	if (ctl->limit) {
673 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
674 		q->maxflows = min_t(u32, q->maxflows, q->limit);
675 	}
676 
677 	qlen = sch->q.qlen;
678 	while (sch->q.qlen > q->limit) {
679 		dropped += sfq_drop(sch, &to_free);
680 		if (!tail)
681 			tail = to_free;
682 	}
683 
684 	rtnl_kfree_skbs(to_free, tail);
685 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
686 
687 	del_timer(&q->perturb_timer);
688 	if (q->perturb_period) {
689 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
690 		q->perturbation = prandom_u32();
691 	}
692 	sch_tree_unlock(sch);
693 	kfree(p);
694 	return 0;
695 }
696 
697 static void *sfq_alloc(size_t sz)
698 {
699 	return kvmalloc(sz, GFP_KERNEL);
700 }
701 
702 static void sfq_free(void *addr)
703 {
704 	kvfree(addr);
705 }
706 
707 static void sfq_destroy(struct Qdisc *sch)
708 {
709 	struct sfq_sched_data *q = qdisc_priv(sch);
710 
711 	tcf_block_put(q->block);
712 	q->perturb_period = 0;
713 	del_timer_sync(&q->perturb_timer);
714 	sfq_free(q->ht);
715 	sfq_free(q->slots);
716 	kfree(q->red_parms);
717 }
718 
719 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
720 {
721 	struct sfq_sched_data *q = qdisc_priv(sch);
722 	int i;
723 	int err;
724 
725 	setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
726 			       (unsigned long)sch);
727 
728 	err = tcf_block_get(&q->block, &q->filter_list);
729 	if (err)
730 		return err;
731 
732 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
733 		q->dep[i].next = i + SFQ_MAX_FLOWS;
734 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
735 	}
736 
737 	q->limit = SFQ_MAX_DEPTH;
738 	q->maxdepth = SFQ_MAX_DEPTH;
739 	q->cur_depth = 0;
740 	q->tail = NULL;
741 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
742 	q->maxflows = SFQ_DEFAULT_FLOWS;
743 	q->quantum = psched_mtu(qdisc_dev(sch));
744 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
745 	q->perturb_period = 0;
746 	q->perturbation = prandom_u32();
747 
748 	if (opt) {
749 		err = sfq_change(sch, opt);
750 		if (err)
751 			return err;
752 	}
753 
754 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
755 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
756 	if (!q->ht || !q->slots) {
757 		/* Note: sfq_destroy() will be called by our caller */
758 		return -ENOMEM;
759 	}
760 
761 	for (i = 0; i < q->divisor; i++)
762 		q->ht[i] = SFQ_EMPTY_SLOT;
763 
764 	for (i = 0; i < q->maxflows; i++) {
765 		slot_queue_init(&q->slots[i]);
766 		sfq_link(q, i);
767 	}
768 	if (q->limit >= 1)
769 		sch->flags |= TCQ_F_CAN_BYPASS;
770 	else
771 		sch->flags &= ~TCQ_F_CAN_BYPASS;
772 	return 0;
773 }
774 
775 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
776 {
777 	struct sfq_sched_data *q = qdisc_priv(sch);
778 	unsigned char *b = skb_tail_pointer(skb);
779 	struct tc_sfq_qopt_v1 opt;
780 	struct red_parms *p = q->red_parms;
781 
782 	memset(&opt, 0, sizeof(opt));
783 	opt.v0.quantum	= q->quantum;
784 	opt.v0.perturb_period = q->perturb_period / HZ;
785 	opt.v0.limit	= q->limit;
786 	opt.v0.divisor	= q->divisor;
787 	opt.v0.flows	= q->maxflows;
788 	opt.depth	= q->maxdepth;
789 	opt.headdrop	= q->headdrop;
790 
791 	if (p) {
792 		opt.qth_min	= p->qth_min >> p->Wlog;
793 		opt.qth_max	= p->qth_max >> p->Wlog;
794 		opt.Wlog	= p->Wlog;
795 		opt.Plog	= p->Plog;
796 		opt.Scell_log	= p->Scell_log;
797 		opt.max_P	= p->max_P;
798 	}
799 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
800 	opt.flags	= q->flags;
801 
802 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
803 		goto nla_put_failure;
804 
805 	return skb->len;
806 
807 nla_put_failure:
808 	nlmsg_trim(skb, b);
809 	return -1;
810 }
811 
812 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
813 {
814 	return NULL;
815 }
816 
817 static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
818 {
819 	return 0;
820 }
821 
822 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
823 			      u32 classid)
824 {
825 	/* we cannot bypass queue discipline anymore */
826 	sch->flags &= ~TCQ_F_CAN_BYPASS;
827 	return 0;
828 }
829 
830 static void sfq_unbind(struct Qdisc *q, unsigned long cl)
831 {
832 }
833 
834 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
835 {
836 	struct sfq_sched_data *q = qdisc_priv(sch);
837 
838 	if (cl)
839 		return NULL;
840 	return q->block;
841 }
842 
843 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
844 			  struct sk_buff *skb, struct tcmsg *tcm)
845 {
846 	tcm->tcm_handle |= TC_H_MIN(cl);
847 	return 0;
848 }
849 
850 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
851 				struct gnet_dump *d)
852 {
853 	struct sfq_sched_data *q = qdisc_priv(sch);
854 	sfq_index idx = q->ht[cl - 1];
855 	struct gnet_stats_queue qs = { 0 };
856 	struct tc_sfq_xstats xstats = { 0 };
857 
858 	if (idx != SFQ_EMPTY_SLOT) {
859 		const struct sfq_slot *slot = &q->slots[idx];
860 
861 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
862 		qs.qlen = slot->qlen;
863 		qs.backlog = slot->backlog;
864 	}
865 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
866 		return -1;
867 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
868 }
869 
870 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
871 {
872 	struct sfq_sched_data *q = qdisc_priv(sch);
873 	unsigned int i;
874 
875 	if (arg->stop)
876 		return;
877 
878 	for (i = 0; i < q->divisor; i++) {
879 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
880 		    arg->count < arg->skip) {
881 			arg->count++;
882 			continue;
883 		}
884 		if (arg->fn(sch, i + 1, arg) < 0) {
885 			arg->stop = 1;
886 			break;
887 		}
888 		arg->count++;
889 	}
890 }
891 
892 static const struct Qdisc_class_ops sfq_class_ops = {
893 	.leaf		=	sfq_leaf,
894 	.find		=	sfq_find,
895 	.tcf_block	=	sfq_tcf_block,
896 	.bind_tcf	=	sfq_bind,
897 	.unbind_tcf	=	sfq_unbind,
898 	.dump		=	sfq_dump_class,
899 	.dump_stats	=	sfq_dump_class_stats,
900 	.walk		=	sfq_walk,
901 };
902 
903 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
904 	.cl_ops		=	&sfq_class_ops,
905 	.id		=	"sfq",
906 	.priv_size	=	sizeof(struct sfq_sched_data),
907 	.enqueue	=	sfq_enqueue,
908 	.dequeue	=	sfq_dequeue,
909 	.peek		=	qdisc_peek_dequeued,
910 	.init		=	sfq_init,
911 	.reset		=	sfq_reset,
912 	.destroy	=	sfq_destroy,
913 	.change		=	NULL,
914 	.dump		=	sfq_dump,
915 	.owner		=	THIS_MODULE,
916 };
917 
918 static int __init sfq_module_init(void)
919 {
920 	return register_qdisc(&sfq_qdisc_ops);
921 }
922 static void __exit sfq_module_exit(void)
923 {
924 	unregister_qdisc(&sfq_qdisc_ops);
925 }
926 module_init(sfq_module_init)
927 module_exit(sfq_module_exit)
928 MODULE_LICENSE("GPL");
929