1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/pkt_cls.h>
27 #include <net/red.h>
28 
29 
30 /*	Stochastic Fairness Queuing algorithm.
31 	=======================================
32 
33 	Source:
34 	Paul E. McKenney "Stochastic Fairness Queuing",
35 	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.
36 
37 	Paul E. McKenney "Stochastic Fairness Queuing",
38 	"Internetworking: Research and Experience", v.2, 1991, p.113-131.
39 
40 
41 	See also:
42 	M. Shreedhar and George Varghese "Efficient Fair
43 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
44 
45 
46 	This is not the thing that is usually called (W)FQ nowadays.
47 	It does not use any timestamp mechanism, but instead
48 	processes queues in round-robin order.
49 
50 	ADVANTAGE:
51 
52 	- It is very cheap. Both CPU and memory requirements are minimal.
53 
54 	DRAWBACKS:
55 
56 	- "Stochastic" -> It is not 100% fair.
57 	When hash collisions occur, several flows are considered as one.
58 
59 	- "Round-robin" -> It introduces larger delays than virtual clock
60 	based schemes, and should not be used for isolating interactive
61 	traffic from non-interactive traffic. This means that this scheduler
62 	should be used as a leaf of CBQ or P3, which puts interactive traffic
63 	into a higher priority band.
64 
65 	We still need true WFQ for the top-level CSZ, but using WFQ
66 	for best-effort traffic is absolutely pointless:
67 	SFQ is superior for this purpose.
68 
69 	IMPLEMENTATION:
70 	This implementation limits:
71 	- maximal queue length per flow to 127 packets;
72 	- max MTU to 2^18 - 1;
73 	- max 65408 flows;
74 	- number of hash buckets to 65536.
75 
76 	It is easy to increase these values, but only at compile time, not at run time.  */
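/* Typical user-space setup, for illustration only ("eth0" and the numeric
 * values below are placeholders; see tc-sfq(8) for the full option list):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *	tc qdisc add dev eth0 root sfq limit 64 quantum 1514 divisor 1024 headdrop
 */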
77 
78 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
79 #define SFQ_DEFAULT_FLOWS	128
80 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
81 #define SFQ_EMPTY_SLOT		0xffff
82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 
84 /* We use 16 bits to store allot, and want to handle packets up to 64K.
85  * Scale allot by 8 (1<<3) so that no overflow occurs.
86  */
87 #define SFQ_ALLOT_SHIFT		3
88 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
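/* For example, a 1514 byte Ethernet frame is charged
 * SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8) = 190 scaled units, and a
 * maximal 64K packet scales to DIV_ROUND_UP(65535, 8) = 8192, which fits
 * comfortably in the 16 bit allot field.
 */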
89 
90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
91 typedef u16 sfq_index;
92 
93 /*
94  * We don't use pointers, in order to save space.
95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to the slots[] array,
96  * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
97  * are 'pointers' to the dep[] array.
98  */
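/* For example, with SFQ_MAX_FLOWS == 65408, an sfq_index of 5 denotes
 * q->slots[5].dep, while SFQ_MAX_FLOWS + 3 (== 65411) denotes q->dep[3],
 * the list head for flows currently holding exactly 3 packets
 * (see sfq_dep_head() below).
 */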
99 struct sfq_head {
100 	sfq_index	next;
101 	sfq_index	prev;
102 };
103 
104 struct sfq_slot {
105 	struct sk_buff	*skblist_next;
106 	struct sk_buff	*skblist_prev;
107 	sfq_index	qlen; /* number of skbs in skblist */
108 	sfq_index	next; /* next slot in sfq RR chain */
109 	struct sfq_head dep; /* anchor in dep[] chains */
110 	unsigned short	hash; /* hash value (index in ht[]) */
111 	short		allot; /* credit for this slot */
112 
113 	unsigned int    backlog;
114 	struct red_vars vars;
115 };
116 
117 struct sfq_sched_data {
118 /* frequently used fields */
119 	int		limit;		/* limit of total number of packets in this qdisc */
120 	unsigned int	divisor;	/* number of slots in hash table */
121 	u8		headdrop;
122 	u8		maxdepth;	/* limit of packets per flow */
123 
124 	u32		perturbation;
125 	u8		cur_depth;	/* depth of longest slot */
126 	u8		flags;
127 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128 	struct tcf_proto __rcu *filter_list;
129 	struct tcf_block *block;
130 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
131 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
132 
133 	struct red_parms *red_parms;
134 	struct tc_sfqred_stats stats;
135 	struct sfq_slot *tail;		/* current slot in round */
136 
137 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
138 					/* Linked lists of slots, indexed by depth
139 					 * dep[0] : list of unused flows
140 					 * dep[1] : list of flows with 1 packet
141 					 * dep[X] : list of flows with X packets
142 					 */
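					/* dep[0] doubles as the free list:
					 * sfq_enqueue() grabs new slots from
					 * dep[0].next, and sfq_drop() picks
					 * its victim from dep[cur_depth].next.
					 */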
143 
144 	unsigned int	maxflows;	/* number of flows in flows array */
145 	int		perturb_period;
146 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
147 	struct timer_list perturb_timer;
148 };
149 
150 /*
151  * sfq_head entries are either embedded in a sfq_slot or in the dep[] array
152  */
153 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
154 {
155 	if (val < SFQ_MAX_FLOWS)
156 		return &q->slots[val].dep;
157 	return &q->dep[val - SFQ_MAX_FLOWS];
158 }
159 
160 static unsigned int sfq_hash(const struct sfq_sched_data *q,
161 			     const struct sk_buff *skb)
162 {
163 	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
164 }
165 
166 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
167 				 int *qerr)
168 {
169 	struct sfq_sched_data *q = qdisc_priv(sch);
170 	struct tcf_result res;
171 	struct tcf_proto *fl;
172 	int result;
173 
174 	if (TC_H_MAJ(skb->priority) == sch->handle &&
175 	    TC_H_MIN(skb->priority) > 0 &&
176 	    TC_H_MIN(skb->priority) <= q->divisor)
177 		return TC_H_MIN(skb->priority);
178 
179 	fl = rcu_dereference_bh(q->filter_list);
180 	if (!fl)
181 		return sfq_hash(q, skb) + 1;
182 
183 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
184 	result = tcf_classify(skb, fl, &res, false);
185 	if (result >= 0) {
186 #ifdef CONFIG_NET_CLS_ACT
187 		switch (result) {
188 		case TC_ACT_STOLEN:
189 		case TC_ACT_QUEUED:
190 		case TC_ACT_TRAP:
191 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
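			/* fall through */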
192 		case TC_ACT_SHOT:
193 			return 0;
194 		}
195 #endif
196 		if (TC_H_MIN(res.classid) <= q->divisor)
197 			return TC_H_MIN(res.classid);
198 	}
199 	return 0;
200 }
201 
202 /*
203  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
204  */
205 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
206 {
207 	sfq_index p, n;
208 	struct sfq_slot *slot = &q->slots[x];
209 	int qlen = slot->qlen;
210 
211 	p = qlen + SFQ_MAX_FLOWS;
212 	n = q->dep[qlen].next;
213 
214 	slot->dep.next = n;
215 	slot->dep.prev = p;
216 
217 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
218 	sfq_dep_head(q, n)->prev = x;
219 }
220 
221 #define sfq_unlink(q, x, n, p)			\
222 	do {					\
223 		n = q->slots[x].dep.next;	\
224 		p = q->slots[x].dep.prev;	\
225 		sfq_dep_head(q, p)->next = n;	\
226 		sfq_dep_head(q, n)->prev = p;	\
227 	} while (0)
228 
229 
230 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
231 {
232 	sfq_index p, n;
233 	int d;
234 
235 	sfq_unlink(q, x, n, p);
236 
237 	d = q->slots[x].qlen--;
238 	if (n == p && q->cur_depth == d)
239 		q->cur_depth--;
240 	sfq_link(q, x);
241 }
242 
243 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
244 {
245 	sfq_index p, n;
246 	int d;
247 
248 	sfq_unlink(q, x, n, p);
249 
250 	d = ++q->slots[x].qlen;
251 	if (q->cur_depth < d)
252 		q->cur_depth = d;
253 	sfq_link(q, x);
254 }
255 
256 /* helper functions: might be changed when/if skb uses a standard list_head */
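/* A slot is used as the list head sentinel of its own packet queue: the
 * casts to (struct sk_buff *) below rely on skblist_next/skblist_prev
 * being laid out like the next/prev pointers at the start of
 * struct sk_buff.
 */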
257 
258 /* remove one skb from tail of slot queue */
259 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
260 {
261 	struct sk_buff *skb = slot->skblist_prev;
262 
263 	slot->skblist_prev = skb->prev;
264 	skb->prev->next = (struct sk_buff *)slot;
265 	skb->next = skb->prev = NULL;
266 	return skb;
267 }
268 
269 /* remove one skb from head of slot queue */
270 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
271 {
272 	struct sk_buff *skb = slot->skblist_next;
273 
274 	slot->skblist_next = skb->next;
275 	skb->next->prev = (struct sk_buff *)slot;
276 	skb->next = skb->prev = NULL;
277 	return skb;
278 }
279 
280 static inline void slot_queue_init(struct sfq_slot *slot)
281 {
282 	memset(slot, 0, sizeof(*slot));
283 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
284 }
285 
286 /* add skb to slot queue (tail add) */
287 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
288 {
289 	skb->prev = slot->skblist_prev;
290 	skb->next = (struct sk_buff *)slot;
291 	slot->skblist_prev->next = skb;
292 	slot->skblist_prev = skb;
293 }
294 
295 static unsigned int sfq_drop(struct Qdisc *sch)
296 {
297 	struct sfq_sched_data *q = qdisc_priv(sch);
298 	sfq_index x, d = q->cur_depth;
299 	struct sk_buff *skb;
300 	unsigned int len;
301 	struct sfq_slot *slot;
302 
303 	/* Queue is full! Find the longest slot and drop a packet from it (tail, or head if headdrop is set) */
304 	if (d > 1) {
305 		x = q->dep[d].next;
306 		slot = &q->slots[x];
307 drop:
308 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
309 		len = qdisc_pkt_len(skb);
310 		slot->backlog -= len;
311 		sfq_dec(q, x);
312 		sch->q.qlen--;
313 		qdisc_qstats_drop(sch);
314 		qdisc_qstats_backlog_dec(sch, skb);
315 		kfree_skb(skb);
316 		return len;
317 	}
318 
319 	if (d == 1) {
320 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
321 		x = q->tail->next;
322 		slot = &q->slots[x];
323 		q->tail->next = slot->next;
324 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
325 		goto drop;
326 	}
327 
328 	return 0;
329 }
330 
331 /* Is ECN parameter configured */
332 static int sfq_prob_mark(const struct sfq_sched_data *q)
333 {
334 	return q->flags & TC_RED_ECN;
335 }
336 
337 /* Should packets over max threshold just be marked */
338 static int sfq_hard_mark(const struct sfq_sched_data *q)
339 {
340 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
341 }
342 
343 static int sfq_headdrop(const struct sfq_sched_data *q)
344 {
345 	return q->headdrop;
346 }
347 
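/* Enqueue: map the skb to a slot (via filters or the perturbed hash),
 * optionally run per-flow RED on that slot, enforce the per-flow depth
 * limit (tail-drop, or head-drop when configured), then append the skb.
 * A flow that just became active is linked at the tail of the
 * round-robin ring and given a fresh scaled quantum of credit.
 */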
348 static int
349 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
350 {
351 	struct sfq_sched_data *q = qdisc_priv(sch);
352 	unsigned int hash, dropped;
353 	sfq_index x, qlen;
354 	struct sfq_slot *slot;
355 	int uninitialized_var(ret);
356 	struct sk_buff *head;
357 	int delta;
358 
359 	hash = sfq_classify(skb, sch, &ret);
360 	if (hash == 0) {
361 		if (ret & __NET_XMIT_BYPASS)
362 			qdisc_qstats_drop(sch);
363 		kfree_skb(skb);
364 		return ret;
365 	}
366 	hash--;
367 
368 	x = q->ht[hash];
369 	slot = &q->slots[x];
370 	if (x == SFQ_EMPTY_SLOT) {
371 		x = q->dep[0].next; /* get a free slot */
372 		if (x >= SFQ_MAX_FLOWS)
373 			return qdisc_drop(skb, sch, to_free);
374 		q->ht[hash] = x;
375 		slot = &q->slots[x];
376 		slot->hash = hash;
377 		slot->backlog = 0; /* should already be 0 anyway... */
378 		red_set_vars(&slot->vars);
379 		goto enqueue;
380 	}
381 	if (q->red_parms) {
382 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
383 							&slot->vars,
384 							slot->backlog);
385 		switch (red_action(q->red_parms,
386 				   &slot->vars,
387 				   slot->vars.qavg)) {
388 		case RED_DONT_MARK:
389 			break;
390 
391 		case RED_PROB_MARK:
392 			qdisc_qstats_overlimit(sch);
393 			if (sfq_prob_mark(q)) {
394 				/* We know we have at least one packet in queue */
395 				if (sfq_headdrop(q) &&
396 				    INET_ECN_set_ce(slot->skblist_next)) {
397 					q->stats.prob_mark_head++;
398 					break;
399 				}
400 				if (INET_ECN_set_ce(skb)) {
401 					q->stats.prob_mark++;
402 					break;
403 				}
404 			}
405 			q->stats.prob_drop++;
406 			goto congestion_drop;
407 
408 		case RED_HARD_MARK:
409 			qdisc_qstats_overlimit(sch);
410 			if (sfq_hard_mark(q)) {
411 				/* We know we have at least one packet in queue */
412 				if (sfq_headdrop(q) &&
413 				    INET_ECN_set_ce(slot->skblist_next)) {
414 					q->stats.forced_mark_head++;
415 					break;
416 				}
417 				if (INET_ECN_set_ce(skb)) {
418 					q->stats.forced_mark++;
419 					break;
420 				}
421 			}
422 			q->stats.forced_drop++;
423 			goto congestion_drop;
424 		}
425 	}
426 
427 	if (slot->qlen >= q->maxdepth) {
428 congestion_drop:
429 		if (!sfq_headdrop(q))
430 			return qdisc_drop(skb, sch, to_free);
431 
432 		/* We know we have at least one packet in queue */
433 		head = slot_dequeue_head(slot);
434 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
435 		sch->qstats.backlog -= delta;
436 		slot->backlog -= delta;
437 		qdisc_drop(head, sch, to_free);
438 
439 		slot_queue_add(slot, skb);
440 		return NET_XMIT_CN;
441 	}
442 
443 enqueue:
444 	qdisc_qstats_backlog_inc(sch, skb);
445 	slot->backlog += qdisc_pkt_len(skb);
446 	slot_queue_add(slot, skb);
447 	sfq_inc(q, x);
448 	if (slot->qlen == 1) {		/* The flow is new */
449 		if (q->tail == NULL) {	/* It is the first flow */
450 			slot->next = x;
451 		} else {
452 			slot->next = q->tail->next;
453 			q->tail->next = x;
454 		}
455 		/* We put this flow at the end of our flow list.
456 		 * It might sound unfair for a new flow to have to wait behind old ones,
457 		 * but otherwise we could end up servicing only new flows and starving the old ones.
458 		 */
459 		q->tail = slot;
460 		/* We could use a bigger initial quantum for new flows */
461 		slot->allot = q->scaled_quantum;
462 	}
463 	if (++sch->q.qlen <= q->limit)
464 		return NET_XMIT_SUCCESS;
465 
466 	qlen = slot->qlen;
467 	dropped = sfq_drop(sch);
468 	/* Return Congestion Notification only if we dropped a packet
469 	 * from this flow.
470 	 */
471 	if (qlen != slot->qlen)
472 		return NET_XMIT_CN;
473 
474 	/* As we dropped a packet, let the upper stack know about it */
475 	qdisc_tree_reduce_backlog(sch, 1, dropped);
476 	return NET_XMIT_SUCCESS;
477 }
478 
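/* Dequeue: deficit round robin over the active slots.  q->tail->next is
 * the current slot; if its allot (credit in SFQ_ALLOT_SIZE() units) is
 * exhausted, the slot is recharged with scaled_quantum and the scan
 * advances, otherwise one packet is taken from the slot head and its
 * scaled size is charged against the allot.
 */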
479 static struct sk_buff *
480 sfq_dequeue(struct Qdisc *sch)
481 {
482 	struct sfq_sched_data *q = qdisc_priv(sch);
483 	struct sk_buff *skb;
484 	sfq_index a, next_a;
485 	struct sfq_slot *slot;
486 
487 	/* No active slots */
488 	if (q->tail == NULL)
489 		return NULL;
490 
491 next_slot:
492 	a = q->tail->next;
493 	slot = &q->slots[a];
494 	if (slot->allot <= 0) {
495 		q->tail = slot;
496 		slot->allot += q->scaled_quantum;
497 		goto next_slot;
498 	}
499 	skb = slot_dequeue_head(slot);
500 	sfq_dec(q, a);
501 	qdisc_bstats_update(sch, skb);
502 	sch->q.qlen--;
503 	qdisc_qstats_backlog_dec(sch, skb);
504 	slot->backlog -= qdisc_pkt_len(skb);
505 	/* Is the slot empty? */
506 	if (slot->qlen == 0) {
507 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
508 		next_a = slot->next;
509 		if (a == next_a) {
510 			q->tail = NULL; /* no more active slots */
511 			return skb;
512 		}
513 		q->tail->next = next_a;
514 	} else {
515 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
516 	}
517 	return skb;
518 }
519 
520 static void
521 sfq_reset(struct Qdisc *sch)
522 {
523 	struct sk_buff *skb;
524 
525 	while ((skb = sfq_dequeue(sch)) != NULL)
526 		rtnl_kfree_skbs(skb, skb);
527 }
528 
529 /*
530  * When q->perturbation is changed, we rehash all queued skbs
531  * to avoid OOO (Out Of Order) effects.
532  * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
533  * counters.
534  */
535 static void sfq_rehash(struct Qdisc *sch)
536 {
537 	struct sfq_sched_data *q = qdisc_priv(sch);
538 	struct sk_buff *skb;
539 	int i;
540 	struct sfq_slot *slot;
541 	struct sk_buff_head list;
542 	int dropped = 0;
543 	unsigned int drop_len = 0;
544 
545 	__skb_queue_head_init(&list);
546 
547 	for (i = 0; i < q->maxflows; i++) {
548 		slot = &q->slots[i];
549 		if (!slot->qlen)
550 			continue;
551 		while (slot->qlen) {
552 			skb = slot_dequeue_head(slot);
553 			sfq_dec(q, i);
554 			__skb_queue_tail(&list, skb);
555 		}
556 		slot->backlog = 0;
557 		red_set_vars(&slot->vars);
558 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
559 	}
560 	q->tail = NULL;
561 
562 	while ((skb = __skb_dequeue(&list)) != NULL) {
563 		unsigned int hash = sfq_hash(q, skb);
564 		sfq_index x = q->ht[hash];
565 
566 		slot = &q->slots[x];
567 		if (x == SFQ_EMPTY_SLOT) {
568 			x = q->dep[0].next; /* get a free slot */
569 			if (x >= SFQ_MAX_FLOWS) {
570 drop:
571 				qdisc_qstats_backlog_dec(sch, skb);
572 				drop_len += qdisc_pkt_len(skb);
573 				kfree_skb(skb);
574 				dropped++;
575 				continue;
576 			}
577 			q->ht[hash] = x;
578 			slot = &q->slots[x];
579 			slot->hash = hash;
580 		}
581 		if (slot->qlen >= q->maxdepth)
582 			goto drop;
583 		slot_queue_add(slot, skb);
584 		if (q->red_parms)
585 			slot->vars.qavg = red_calc_qavg(q->red_parms,
586 							&slot->vars,
587 							slot->backlog);
588 		slot->backlog += qdisc_pkt_len(skb);
589 		sfq_inc(q, x);
590 		if (slot->qlen == 1) {		/* The flow is new */
591 			if (q->tail == NULL) {	/* It is the first flow */
592 				slot->next = x;
593 			} else {
594 				slot->next = q->tail->next;
595 				q->tail->next = x;
596 			}
597 			q->tail = slot;
598 			slot->allot = q->scaled_quantum;
599 		}
600 	}
601 	sch->q.qlen -= dropped;
602 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
603 }
604 
605 static void sfq_perturbation(unsigned long arg)
606 {
607 	struct Qdisc *sch = (struct Qdisc *)arg;
608 	struct sfq_sched_data *q = qdisc_priv(sch);
609 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
610 
611 	spin_lock(root_lock);
612 	q->perturbation = prandom_u32();
613 	if (!q->filter_list && q->tail)
614 		sfq_rehash(sch);
615 	spin_unlock(root_lock);
616 
617 	if (q->perturb_period)
618 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
619 }
620 
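/* Parse a TCA_OPTIONS blob (a legacy struct tc_sfq_qopt, or the larger
 * struct tc_sfq_qopt_v1 carrying RED/headdrop parameters) and apply it
 * under the qdisc tree lock; packets in excess of the new limit are
 * dropped and the backlog change is reported upstream.
 */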
621 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
622 {
623 	struct sfq_sched_data *q = qdisc_priv(sch);
624 	struct tc_sfq_qopt *ctl = nla_data(opt);
625 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
626 	unsigned int qlen, dropped = 0;
627 	struct red_parms *p = NULL;
628 
629 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
630 		return -EINVAL;
631 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
632 		ctl_v1 = nla_data(opt);
633 	if (ctl->divisor &&
634 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
635 		return -EINVAL;
636 	if (ctl_v1 && ctl_v1->qth_min) {
637 		p = kmalloc(sizeof(*p), GFP_KERNEL);
638 		if (!p)
639 			return -ENOMEM;
640 	}
641 	sch_tree_lock(sch);
642 	if (ctl->quantum) {
643 		q->quantum = ctl->quantum;
644 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
645 	}
646 	q->perturb_period = ctl->perturb_period * HZ;
647 	if (ctl->flows)
648 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
649 	if (ctl->divisor) {
650 		q->divisor = ctl->divisor;
651 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
652 	}
653 	if (ctl_v1) {
654 		if (ctl_v1->depth)
655 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
656 		if (p) {
657 			swap(q->red_parms, p);
658 			red_set_parms(q->red_parms,
659 				      ctl_v1->qth_min, ctl_v1->qth_max,
660 				      ctl_v1->Wlog,
661 				      ctl_v1->Plog, ctl_v1->Scell_log,
662 				      NULL,
663 				      ctl_v1->max_P);
664 		}
665 		q->flags = ctl_v1->flags;
666 		q->headdrop = ctl_v1->headdrop;
667 	}
668 	if (ctl->limit) {
669 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
670 		q->maxflows = min_t(u32, q->maxflows, q->limit);
671 	}
672 
673 	qlen = sch->q.qlen;
674 	while (sch->q.qlen > q->limit)
675 		dropped += sfq_drop(sch);
676 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
677 
678 	del_timer(&q->perturb_timer);
679 	if (q->perturb_period) {
680 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
681 		q->perturbation = prandom_u32();
682 	}
683 	sch_tree_unlock(sch);
684 	kfree(p);
685 	return 0;
686 }
687 
688 static void *sfq_alloc(size_t sz)
689 {
690 	return  kvmalloc(sz, GFP_KERNEL);
691 }
692 
693 static void sfq_free(void *addr)
694 {
695 	kvfree(addr);
696 }
697 
698 static void sfq_destroy(struct Qdisc *sch)
699 {
700 	struct sfq_sched_data *q = qdisc_priv(sch);
701 
702 	tcf_block_put(q->block);
703 	q->perturb_period = 0;
704 	del_timer_sync(&q->perturb_timer);
705 	sfq_free(q->ht);
706 	sfq_free(q->slots);
707 	kfree(q->red_parms);
708 }
709 
710 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
711 {
712 	struct sfq_sched_data *q = qdisc_priv(sch);
713 	int i;
714 	int err;
715 
716 	err = tcf_block_get(&q->block, &q->filter_list);
717 	if (err)
718 		return err;
719 
720 	setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
721 			       (unsigned long)sch);
722 
723 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
724 		q->dep[i].next = i + SFQ_MAX_FLOWS;
725 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
726 	}
727 
728 	q->limit = SFQ_MAX_DEPTH;
729 	q->maxdepth = SFQ_MAX_DEPTH;
730 	q->cur_depth = 0;
731 	q->tail = NULL;
732 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
733 	q->maxflows = SFQ_DEFAULT_FLOWS;
734 	q->quantum = psched_mtu(qdisc_dev(sch));
735 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
736 	q->perturb_period = 0;
737 	q->perturbation = prandom_u32();
738 
739 	if (opt) {
740 		int err = sfq_change(sch, opt);
741 		if (err)
742 			return err;
743 	}
744 
745 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
746 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
747 	if (!q->ht || !q->slots) {
748 		/* Note: sfq_destroy() will be called by our caller */
749 		return -ENOMEM;
750 	}
751 
752 	for (i = 0; i < q->divisor; i++)
753 		q->ht[i] = SFQ_EMPTY_SLOT;
754 
755 	for (i = 0; i < q->maxflows; i++) {
756 		slot_queue_init(&q->slots[i]);
757 		sfq_link(q, i);
758 	}
759 	if (q->limit >= 1)
760 		sch->flags |= TCQ_F_CAN_BYPASS;
761 	else
762 		sch->flags &= ~TCQ_F_CAN_BYPASS;
763 	return 0;
764 }
765 
766 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
767 {
768 	struct sfq_sched_data *q = qdisc_priv(sch);
769 	unsigned char *b = skb_tail_pointer(skb);
770 	struct tc_sfq_qopt_v1 opt;
771 	struct red_parms *p = q->red_parms;
772 
773 	memset(&opt, 0, sizeof(opt));
774 	opt.v0.quantum	= q->quantum;
775 	opt.v0.perturb_period = q->perturb_period / HZ;
776 	opt.v0.limit	= q->limit;
777 	opt.v0.divisor	= q->divisor;
778 	opt.v0.flows	= q->maxflows;
779 	opt.depth	= q->maxdepth;
780 	opt.headdrop	= q->headdrop;
781 
782 	if (p) {
783 		opt.qth_min	= p->qth_min >> p->Wlog;
784 		opt.qth_max	= p->qth_max >> p->Wlog;
785 		opt.Wlog	= p->Wlog;
786 		opt.Plog	= p->Plog;
787 		opt.Scell_log	= p->Scell_log;
788 		opt.max_P	= p->max_P;
789 	}
790 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
791 	opt.flags	= q->flags;
792 
793 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
794 		goto nla_put_failure;
795 
796 	return skb->len;
797 
798 nla_put_failure:
799 	nlmsg_trim(skb, b);
800 	return -1;
801 }
802 
803 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
804 {
805 	return NULL;
806 }
807 
808 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
809 {
810 	return 0;
811 }
812 
813 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
814 			      u32 classid)
815 {
816 	/* we cannot bypass the queue discipline anymore */
817 	sch->flags &= ~TCQ_F_CAN_BYPASS;
818 	return 0;
819 }
820 
821 static void sfq_put(struct Qdisc *q, unsigned long cl)
822 {
823 }
824 
825 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
826 {
827 	struct sfq_sched_data *q = qdisc_priv(sch);
828 
829 	if (cl)
830 		return NULL;
831 	return q->block;
832 }
833 
834 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
835 			  struct sk_buff *skb, struct tcmsg *tcm)
836 {
837 	tcm->tcm_handle |= TC_H_MIN(cl);
838 	return 0;
839 }
840 
841 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
842 				struct gnet_dump *d)
843 {
844 	struct sfq_sched_data *q = qdisc_priv(sch);
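	/* Class ids handed out by sfq_classify()/sfq_walk() are the hash
	 * bucket index plus one, hence the "cl - 1" below.
	 */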
845 	sfq_index idx = q->ht[cl - 1];
846 	struct gnet_stats_queue qs = { 0 };
847 	struct tc_sfq_xstats xstats = { 0 };
848 
849 	if (idx != SFQ_EMPTY_SLOT) {
850 		const struct sfq_slot *slot = &q->slots[idx];
851 
852 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
853 		qs.qlen = slot->qlen;
854 		qs.backlog = slot->backlog;
855 	}
856 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
857 		return -1;
858 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
859 }
860 
861 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
862 {
863 	struct sfq_sched_data *q = qdisc_priv(sch);
864 	unsigned int i;
865 
866 	if (arg->stop)
867 		return;
868 
869 	for (i = 0; i < q->divisor; i++) {
870 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
871 		    arg->count < arg->skip) {
872 			arg->count++;
873 			continue;
874 		}
875 		if (arg->fn(sch, i + 1, arg) < 0) {
876 			arg->stop = 1;
877 			break;
878 		}
879 		arg->count++;
880 	}
881 }
882 
883 static const struct Qdisc_class_ops sfq_class_ops = {
884 	.leaf		=	sfq_leaf,
885 	.get		=	sfq_get,
886 	.put		=	sfq_put,
887 	.tcf_block	=	sfq_tcf_block,
888 	.bind_tcf	=	sfq_bind,
889 	.unbind_tcf	=	sfq_put,
890 	.dump		=	sfq_dump_class,
891 	.dump_stats	=	sfq_dump_class_stats,
892 	.walk		=	sfq_walk,
893 };
894 
895 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
896 	.cl_ops		=	&sfq_class_ops,
897 	.id		=	"sfq",
898 	.priv_size	=	sizeof(struct sfq_sched_data),
899 	.enqueue	=	sfq_enqueue,
900 	.dequeue	=	sfq_dequeue,
901 	.peek		=	qdisc_peek_dequeued,
902 	.init		=	sfq_init,
903 	.reset		=	sfq_reset,
904 	.destroy	=	sfq_destroy,
905 	.change		=	NULL,
906 	.dump		=	sfq_dump,
907 	.owner		=	THIS_MODULE,
908 };
909 
910 static int __init sfq_module_init(void)
911 {
912 	return register_qdisc(&sfq_qdisc_ops);
913 }
914 static void __exit sfq_module_exit(void)
915 {
916 	unregister_qdisc(&sfq_qdisc_ops);
917 }
918 module_init(sfq_module_init)
919 module_exit(sfq_module_exit)
920 MODULE_LICENSE("GPL");
921