xref: /openbmc/linux/net/sched/sch_sfq.c (revision 23c2b932)
1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/red.h>
27 
28 
29 /*	Stochastic Fairness Queuing algorithm.
30 	=======================================
31 
32 	Source:
33 	Paul E. McKenney "Stochastic Fairness Queuing",
34 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
35 
36 	Paul E. McKenney "Stochastic Fairness Queuing",
37 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
38 
39 
40 	See also:
41 	M. Shreedhar and George Varghese "Efficient Fair
42 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
43 
44 
45 	This is not the thing that is usually called (W)FQ nowadays.
46 	It does not use any timestamp mechanism, but instead
47 	processes queues in round-robin order.
48 
49 	ADVANTAGE:
50 
51 	- It is very cheap. Both CPU and memory requirements are minimal.
52 
53 	DRAWBACKS:
54 
55 	- "Stochastic" -> It is not 100% fair.
56 	When hash collisions occur, several flows are considered as one.
57 
58 	- "Round-robin" -> It introduces larger delays than virtual clock
59 	based schemes, and should not be used for isolating interactive
60 	traffic	from non-interactive. It means, that this scheduler
61 	should be used as leaf of CBQ or P3, which put interactive traffic
62 	to higher priority band.
63 
64 	We still need true WFQ for top level CSZ, but using WFQ
65 	for the best effort traffic is absolutely pointless:
66 	SFQ is superior for this purpose.
67 
68 	IMPLEMENTATION:
69 	This implementation limits :
70 	- maximal queue length per flow to 127 packets.
71 	- max mtu to 2^18-1;
72 	- max 65408 flows,
73 	- number of hash buckets to 65536.
74 
75 	It is easy to increase these values, but not in flight.  */
76 
77 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
78 #define SFQ_DEFAULT_FLOWS	128
79 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
80 #define SFQ_EMPTY_SLOT		0xffff
81 #define SFQ_DEFAULT_HASH_DIVISOR 1024
82 
83 /* We use 16 bits to store allot, and want to handle packets up to 64K
84  * Scale allot by 8 (1<<3) so that no overflow occurs.
85  */
86 #define SFQ_ALLOT_SHIFT		3
87 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
88 
89 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
90 typedef u16 sfq_index;
91 
92 /*
93  * We dont use pointers to save space.
94  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
95  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
96  * are 'pointers' to dep[] array
97  */
98 struct sfq_head {
99 	sfq_index	next;
100 	sfq_index	prev;
101 };
102 
103 struct sfq_slot {
104 	struct sk_buff	*skblist_next;
105 	struct sk_buff	*skblist_prev;
106 	sfq_index	qlen; /* number of skbs in skblist */
107 	sfq_index	next; /* next slot in sfq RR chain */
108 	struct sfq_head dep; /* anchor in dep[] chains */
109 	unsigned short	hash; /* hash value (index in ht[]) */
110 	short		allot; /* credit for this slot */
111 
112 	unsigned int    backlog;
113 	struct red_vars vars;
114 };
115 
116 struct sfq_sched_data {
117 /* frequently used fields */
118 	int		limit;		/* limit of total number of packets in this qdisc */
119 	unsigned int	divisor;	/* number of slots in hash table */
120 	u8		headdrop;
121 	u8		maxdepth;	/* limit of packets per flow */
122 
123 	u32		perturbation;
124 	u8		cur_depth;	/* depth of longest slot */
125 	u8		flags;
126 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
127 	struct tcf_proto __rcu *filter_list;
128 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
129 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
130 
131 	struct red_parms *red_parms;
132 	struct tc_sfqred_stats stats;
133 	struct sfq_slot *tail;		/* current slot in round */
134 
135 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
136 					/* Linked lists of slots, indexed by depth
137 					 * dep[0] : list of unused flows
138 					 * dep[1] : list of flows with 1 packet
139 					 * dep[X] : list of flows with X packets
140 					 */
141 
142 	unsigned int	maxflows;	/* number of flows in flows array */
143 	int		perturb_period;
144 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
145 	struct timer_list perturb_timer;
146 };
147 
148 /*
149  * sfq_head are either in a sfq_slot or in dep[] array
150  */
151 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
152 {
153 	if (val < SFQ_MAX_FLOWS)
154 		return &q->slots[val].dep;
155 	return &q->dep[val - SFQ_MAX_FLOWS];
156 }
157 
158 static unsigned int sfq_hash(const struct sfq_sched_data *q,
159 			     const struct sk_buff *skb)
160 {
161 	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
162 }
163 
164 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
165 				 int *qerr)
166 {
167 	struct sfq_sched_data *q = qdisc_priv(sch);
168 	struct tcf_result res;
169 	struct tcf_proto *fl;
170 	int result;
171 
172 	if (TC_H_MAJ(skb->priority) == sch->handle &&
173 	    TC_H_MIN(skb->priority) > 0 &&
174 	    TC_H_MIN(skb->priority) <= q->divisor)
175 		return TC_H_MIN(skb->priority);
176 
177 	fl = rcu_dereference_bh(q->filter_list);
178 	if (!fl)
179 		return sfq_hash(q, skb) + 1;
180 
181 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
182 	result = tc_classify(skb, fl, &res, false);
183 	if (result >= 0) {
184 #ifdef CONFIG_NET_CLS_ACT
185 		switch (result) {
186 		case TC_ACT_STOLEN:
187 		case TC_ACT_QUEUED:
188 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
189 		case TC_ACT_SHOT:
190 			return 0;
191 		}
192 #endif
193 		if (TC_H_MIN(res.classid) <= q->divisor)
194 			return TC_H_MIN(res.classid);
195 	}
196 	return 0;
197 }
198 
199 /*
200  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
201  */
202 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
203 {
204 	sfq_index p, n;
205 	struct sfq_slot *slot = &q->slots[x];
206 	int qlen = slot->qlen;
207 
208 	p = qlen + SFQ_MAX_FLOWS;
209 	n = q->dep[qlen].next;
210 
211 	slot->dep.next = n;
212 	slot->dep.prev = p;
213 
214 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
215 	sfq_dep_head(q, n)->prev = x;
216 }
217 
218 #define sfq_unlink(q, x, n, p)			\
219 	do {					\
220 		n = q->slots[x].dep.next;	\
221 		p = q->slots[x].dep.prev;	\
222 		sfq_dep_head(q, p)->next = n;	\
223 		sfq_dep_head(q, n)->prev = p;	\
224 	} while (0)
225 
226 
227 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
228 {
229 	sfq_index p, n;
230 	int d;
231 
232 	sfq_unlink(q, x, n, p);
233 
234 	d = q->slots[x].qlen--;
235 	if (n == p && q->cur_depth == d)
236 		q->cur_depth--;
237 	sfq_link(q, x);
238 }
239 
240 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
241 {
242 	sfq_index p, n;
243 	int d;
244 
245 	sfq_unlink(q, x, n, p);
246 
247 	d = ++q->slots[x].qlen;
248 	if (q->cur_depth < d)
249 		q->cur_depth = d;
250 	sfq_link(q, x);
251 }
252 
253 /* helper functions : might be changed when/if skb use a standard list_head */
254 
255 /* remove one skb from tail of slot queue */
256 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
257 {
258 	struct sk_buff *skb = slot->skblist_prev;
259 
260 	slot->skblist_prev = skb->prev;
261 	skb->prev->next = (struct sk_buff *)slot;
262 	skb->next = skb->prev = NULL;
263 	return skb;
264 }
265 
266 /* remove one skb from head of slot queue */
267 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
268 {
269 	struct sk_buff *skb = slot->skblist_next;
270 
271 	slot->skblist_next = skb->next;
272 	skb->next->prev = (struct sk_buff *)slot;
273 	skb->next = skb->prev = NULL;
274 	return skb;
275 }
276 
277 static inline void slot_queue_init(struct sfq_slot *slot)
278 {
279 	memset(slot, 0, sizeof(*slot));
280 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
281 }
282 
283 /* add skb to slot queue (tail add) */
284 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
285 {
286 	skb->prev = slot->skblist_prev;
287 	skb->next = (struct sk_buff *)slot;
288 	slot->skblist_prev->next = skb;
289 	slot->skblist_prev = skb;
290 }
291 
292 static unsigned int sfq_drop(struct Qdisc *sch)
293 {
294 	struct sfq_sched_data *q = qdisc_priv(sch);
295 	sfq_index x, d = q->cur_depth;
296 	struct sk_buff *skb;
297 	unsigned int len;
298 	struct sfq_slot *slot;
299 
300 	/* Queue is full! Find the longest slot and drop tail packet from it */
301 	if (d > 1) {
302 		x = q->dep[d].next;
303 		slot = &q->slots[x];
304 drop:
305 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
306 		len = qdisc_pkt_len(skb);
307 		slot->backlog -= len;
308 		sfq_dec(q, x);
309 		sch->q.qlen--;
310 		qdisc_qstats_drop(sch);
311 		qdisc_qstats_backlog_dec(sch, skb);
312 		kfree_skb(skb);
313 		return len;
314 	}
315 
316 	if (d == 1) {
317 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
318 		x = q->tail->next;
319 		slot = &q->slots[x];
320 		q->tail->next = slot->next;
321 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
322 		goto drop;
323 	}
324 
325 	return 0;
326 }
327 
328 /* Is ECN parameter configured */
329 static int sfq_prob_mark(const struct sfq_sched_data *q)
330 {
331 	return q->flags & TC_RED_ECN;
332 }
333 
334 /* Should packets over max threshold just be marked */
335 static int sfq_hard_mark(const struct sfq_sched_data *q)
336 {
337 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
338 }
339 
340 static int sfq_headdrop(const struct sfq_sched_data *q)
341 {
342 	return q->headdrop;
343 }
344 
345 static int
346 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
347 {
348 	struct sfq_sched_data *q = qdisc_priv(sch);
349 	unsigned int hash, dropped;
350 	sfq_index x, qlen;
351 	struct sfq_slot *slot;
352 	int uninitialized_var(ret);
353 	struct sk_buff *head;
354 	int delta;
355 
356 	hash = sfq_classify(skb, sch, &ret);
357 	if (hash == 0) {
358 		if (ret & __NET_XMIT_BYPASS)
359 			qdisc_qstats_drop(sch);
360 		kfree_skb(skb);
361 		return ret;
362 	}
363 	hash--;
364 
365 	x = q->ht[hash];
366 	slot = &q->slots[x];
367 	if (x == SFQ_EMPTY_SLOT) {
368 		x = q->dep[0].next; /* get a free slot */
369 		if (x >= SFQ_MAX_FLOWS)
370 			return qdisc_drop(skb, sch);
371 		q->ht[hash] = x;
372 		slot = &q->slots[x];
373 		slot->hash = hash;
374 		slot->backlog = 0; /* should already be 0 anyway... */
375 		red_set_vars(&slot->vars);
376 		goto enqueue;
377 	}
378 	if (q->red_parms) {
379 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
380 							&slot->vars,
381 							slot->backlog);
382 		switch (red_action(q->red_parms,
383 				   &slot->vars,
384 				   slot->vars.qavg)) {
385 		case RED_DONT_MARK:
386 			break;
387 
388 		case RED_PROB_MARK:
389 			qdisc_qstats_overlimit(sch);
390 			if (sfq_prob_mark(q)) {
391 				/* We know we have at least one packet in queue */
392 				if (sfq_headdrop(q) &&
393 				    INET_ECN_set_ce(slot->skblist_next)) {
394 					q->stats.prob_mark_head++;
395 					break;
396 				}
397 				if (INET_ECN_set_ce(skb)) {
398 					q->stats.prob_mark++;
399 					break;
400 				}
401 			}
402 			q->stats.prob_drop++;
403 			goto congestion_drop;
404 
405 		case RED_HARD_MARK:
406 			qdisc_qstats_overlimit(sch);
407 			if (sfq_hard_mark(q)) {
408 				/* We know we have at least one packet in queue */
409 				if (sfq_headdrop(q) &&
410 				    INET_ECN_set_ce(slot->skblist_next)) {
411 					q->stats.forced_mark_head++;
412 					break;
413 				}
414 				if (INET_ECN_set_ce(skb)) {
415 					q->stats.forced_mark++;
416 					break;
417 				}
418 			}
419 			q->stats.forced_drop++;
420 			goto congestion_drop;
421 		}
422 	}
423 
424 	if (slot->qlen >= q->maxdepth) {
425 congestion_drop:
426 		if (!sfq_headdrop(q))
427 			return qdisc_drop(skb, sch);
428 
429 		/* We know we have at least one packet in queue */
430 		head = slot_dequeue_head(slot);
431 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
432 		sch->qstats.backlog -= delta;
433 		slot->backlog -= delta;
434 		qdisc_drop(head, sch);
435 
436 		slot_queue_add(slot, skb);
437 		return NET_XMIT_CN;
438 	}
439 
440 enqueue:
441 	qdisc_qstats_backlog_inc(sch, skb);
442 	slot->backlog += qdisc_pkt_len(skb);
443 	slot_queue_add(slot, skb);
444 	sfq_inc(q, x);
445 	if (slot->qlen == 1) {		/* The flow is new */
446 		if (q->tail == NULL) {	/* It is the first flow */
447 			slot->next = x;
448 		} else {
449 			slot->next = q->tail->next;
450 			q->tail->next = x;
451 		}
452 		/* We put this flow at the end of our flow list.
453 		 * This might sound unfair for a new flow to wait after old ones,
454 		 * but we could endup servicing new flows only, and freeze old ones.
455 		 */
456 		q->tail = slot;
457 		/* We could use a bigger initial quantum for new flows */
458 		slot->allot = q->scaled_quantum;
459 	}
460 	if (++sch->q.qlen <= q->limit)
461 		return NET_XMIT_SUCCESS;
462 
463 	qlen = slot->qlen;
464 	dropped = sfq_drop(sch);
465 	/* Return Congestion Notification only if we dropped a packet
466 	 * from this flow.
467 	 */
468 	if (qlen != slot->qlen)
469 		return NET_XMIT_CN;
470 
471 	/* As we dropped a packet, better let upper stack know this */
472 	qdisc_tree_reduce_backlog(sch, 1, dropped);
473 	return NET_XMIT_SUCCESS;
474 }
475 
476 static struct sk_buff *
477 sfq_dequeue(struct Qdisc *sch)
478 {
479 	struct sfq_sched_data *q = qdisc_priv(sch);
480 	struct sk_buff *skb;
481 	sfq_index a, next_a;
482 	struct sfq_slot *slot;
483 
484 	/* No active slots */
485 	if (q->tail == NULL)
486 		return NULL;
487 
488 next_slot:
489 	a = q->tail->next;
490 	slot = &q->slots[a];
491 	if (slot->allot <= 0) {
492 		q->tail = slot;
493 		slot->allot += q->scaled_quantum;
494 		goto next_slot;
495 	}
496 	skb = slot_dequeue_head(slot);
497 	sfq_dec(q, a);
498 	qdisc_bstats_update(sch, skb);
499 	sch->q.qlen--;
500 	qdisc_qstats_backlog_dec(sch, skb);
501 	slot->backlog -= qdisc_pkt_len(skb);
502 	/* Is the slot empty? */
503 	if (slot->qlen == 0) {
504 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
505 		next_a = slot->next;
506 		if (a == next_a) {
507 			q->tail = NULL; /* no more active slots */
508 			return skb;
509 		}
510 		q->tail->next = next_a;
511 	} else {
512 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
513 	}
514 	return skb;
515 }
516 
517 static void
518 sfq_reset(struct Qdisc *sch)
519 {
520 	struct sk_buff *skb;
521 
522 	while ((skb = sfq_dequeue(sch)) != NULL)
523 		kfree_skb(skb);
524 }
525 
526 /*
527  * When q->perturbation is changed, we rehash all queued skbs
528  * to avoid OOO (Out Of Order) effects.
529  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
530  * counters.
531  */
532 static void sfq_rehash(struct Qdisc *sch)
533 {
534 	struct sfq_sched_data *q = qdisc_priv(sch);
535 	struct sk_buff *skb;
536 	int i;
537 	struct sfq_slot *slot;
538 	struct sk_buff_head list;
539 	int dropped = 0;
540 	unsigned int drop_len = 0;
541 
542 	__skb_queue_head_init(&list);
543 
544 	for (i = 0; i < q->maxflows; i++) {
545 		slot = &q->slots[i];
546 		if (!slot->qlen)
547 			continue;
548 		while (slot->qlen) {
549 			skb = slot_dequeue_head(slot);
550 			sfq_dec(q, i);
551 			__skb_queue_tail(&list, skb);
552 		}
553 		slot->backlog = 0;
554 		red_set_vars(&slot->vars);
555 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
556 	}
557 	q->tail = NULL;
558 
559 	while ((skb = __skb_dequeue(&list)) != NULL) {
560 		unsigned int hash = sfq_hash(q, skb);
561 		sfq_index x = q->ht[hash];
562 
563 		slot = &q->slots[x];
564 		if (x == SFQ_EMPTY_SLOT) {
565 			x = q->dep[0].next; /* get a free slot */
566 			if (x >= SFQ_MAX_FLOWS) {
567 drop:
568 				qdisc_qstats_backlog_dec(sch, skb);
569 				drop_len += qdisc_pkt_len(skb);
570 				kfree_skb(skb);
571 				dropped++;
572 				continue;
573 			}
574 			q->ht[hash] = x;
575 			slot = &q->slots[x];
576 			slot->hash = hash;
577 		}
578 		if (slot->qlen >= q->maxdepth)
579 			goto drop;
580 		slot_queue_add(slot, skb);
581 		if (q->red_parms)
582 			slot->vars.qavg = red_calc_qavg(q->red_parms,
583 							&slot->vars,
584 							slot->backlog);
585 		slot->backlog += qdisc_pkt_len(skb);
586 		sfq_inc(q, x);
587 		if (slot->qlen == 1) {		/* The flow is new */
588 			if (q->tail == NULL) {	/* It is the first flow */
589 				slot->next = x;
590 			} else {
591 				slot->next = q->tail->next;
592 				q->tail->next = x;
593 			}
594 			q->tail = slot;
595 			slot->allot = q->scaled_quantum;
596 		}
597 	}
598 	sch->q.qlen -= dropped;
599 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
600 }
601 
602 static void sfq_perturbation(unsigned long arg)
603 {
604 	struct Qdisc *sch = (struct Qdisc *)arg;
605 	struct sfq_sched_data *q = qdisc_priv(sch);
606 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
607 
608 	spin_lock(root_lock);
609 	q->perturbation = prandom_u32();
610 	if (!q->filter_list && q->tail)
611 		sfq_rehash(sch);
612 	spin_unlock(root_lock);
613 
614 	if (q->perturb_period)
615 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
616 }
617 
618 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
619 {
620 	struct sfq_sched_data *q = qdisc_priv(sch);
621 	struct tc_sfq_qopt *ctl = nla_data(opt);
622 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
623 	unsigned int qlen, dropped = 0;
624 	struct red_parms *p = NULL;
625 
626 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
627 		return -EINVAL;
628 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
629 		ctl_v1 = nla_data(opt);
630 	if (ctl->divisor &&
631 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
632 		return -EINVAL;
633 	if (ctl_v1 && ctl_v1->qth_min) {
634 		p = kmalloc(sizeof(*p), GFP_KERNEL);
635 		if (!p)
636 			return -ENOMEM;
637 	}
638 	sch_tree_lock(sch);
639 	if (ctl->quantum) {
640 		q->quantum = ctl->quantum;
641 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
642 	}
643 	q->perturb_period = ctl->perturb_period * HZ;
644 	if (ctl->flows)
645 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
646 	if (ctl->divisor) {
647 		q->divisor = ctl->divisor;
648 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
649 	}
650 	if (ctl_v1) {
651 		if (ctl_v1->depth)
652 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
653 		if (p) {
654 			swap(q->red_parms, p);
655 			red_set_parms(q->red_parms,
656 				      ctl_v1->qth_min, ctl_v1->qth_max,
657 				      ctl_v1->Wlog,
658 				      ctl_v1->Plog, ctl_v1->Scell_log,
659 				      NULL,
660 				      ctl_v1->max_P);
661 		}
662 		q->flags = ctl_v1->flags;
663 		q->headdrop = ctl_v1->headdrop;
664 	}
665 	if (ctl->limit) {
666 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
667 		q->maxflows = min_t(u32, q->maxflows, q->limit);
668 	}
669 
670 	qlen = sch->q.qlen;
671 	while (sch->q.qlen > q->limit)
672 		dropped += sfq_drop(sch);
673 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
674 
675 	del_timer(&q->perturb_timer);
676 	if (q->perturb_period) {
677 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
678 		q->perturbation = prandom_u32();
679 	}
680 	sch_tree_unlock(sch);
681 	kfree(p);
682 	return 0;
683 }
684 
685 static void *sfq_alloc(size_t sz)
686 {
687 	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
688 
689 	if (!ptr)
690 		ptr = vmalloc(sz);
691 	return ptr;
692 }
693 
694 static void sfq_free(void *addr)
695 {
696 	kvfree(addr);
697 }
698 
699 static void sfq_destroy(struct Qdisc *sch)
700 {
701 	struct sfq_sched_data *q = qdisc_priv(sch);
702 
703 	tcf_destroy_chain(&q->filter_list);
704 	q->perturb_period = 0;
705 	del_timer_sync(&q->perturb_timer);
706 	sfq_free(q->ht);
707 	sfq_free(q->slots);
708 	kfree(q->red_parms);
709 }
710 
711 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
712 {
713 	struct sfq_sched_data *q = qdisc_priv(sch);
714 	int i;
715 
716 	q->perturb_timer.function = sfq_perturbation;
717 	q->perturb_timer.data = (unsigned long)sch;
718 	init_timer_deferrable(&q->perturb_timer);
719 
720 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
721 		q->dep[i].next = i + SFQ_MAX_FLOWS;
722 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
723 	}
724 
725 	q->limit = SFQ_MAX_DEPTH;
726 	q->maxdepth = SFQ_MAX_DEPTH;
727 	q->cur_depth = 0;
728 	q->tail = NULL;
729 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
730 	q->maxflows = SFQ_DEFAULT_FLOWS;
731 	q->quantum = psched_mtu(qdisc_dev(sch));
732 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
733 	q->perturb_period = 0;
734 	q->perturbation = prandom_u32();
735 
736 	if (opt) {
737 		int err = sfq_change(sch, opt);
738 		if (err)
739 			return err;
740 	}
741 
742 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
743 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
744 	if (!q->ht || !q->slots) {
745 		sfq_destroy(sch);
746 		return -ENOMEM;
747 	}
748 	for (i = 0; i < q->divisor; i++)
749 		q->ht[i] = SFQ_EMPTY_SLOT;
750 
751 	for (i = 0; i < q->maxflows; i++) {
752 		slot_queue_init(&q->slots[i]);
753 		sfq_link(q, i);
754 	}
755 	if (q->limit >= 1)
756 		sch->flags |= TCQ_F_CAN_BYPASS;
757 	else
758 		sch->flags &= ~TCQ_F_CAN_BYPASS;
759 	return 0;
760 }
761 
762 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
763 {
764 	struct sfq_sched_data *q = qdisc_priv(sch);
765 	unsigned char *b = skb_tail_pointer(skb);
766 	struct tc_sfq_qopt_v1 opt;
767 	struct red_parms *p = q->red_parms;
768 
769 	memset(&opt, 0, sizeof(opt));
770 	opt.v0.quantum	= q->quantum;
771 	opt.v0.perturb_period = q->perturb_period / HZ;
772 	opt.v0.limit	= q->limit;
773 	opt.v0.divisor	= q->divisor;
774 	opt.v0.flows	= q->maxflows;
775 	opt.depth	= q->maxdepth;
776 	opt.headdrop	= q->headdrop;
777 
778 	if (p) {
779 		opt.qth_min	= p->qth_min >> p->Wlog;
780 		opt.qth_max	= p->qth_max >> p->Wlog;
781 		opt.Wlog	= p->Wlog;
782 		opt.Plog	= p->Plog;
783 		opt.Scell_log	= p->Scell_log;
784 		opt.max_P	= p->max_P;
785 	}
786 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
787 	opt.flags	= q->flags;
788 
789 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
790 		goto nla_put_failure;
791 
792 	return skb->len;
793 
794 nla_put_failure:
795 	nlmsg_trim(skb, b);
796 	return -1;
797 }
798 
799 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
800 {
801 	return NULL;
802 }
803 
804 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
805 {
806 	return 0;
807 }
808 
809 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
810 			      u32 classid)
811 {
812 	/* we cannot bypass queue discipline anymore */
813 	sch->flags &= ~TCQ_F_CAN_BYPASS;
814 	return 0;
815 }
816 
817 static void sfq_put(struct Qdisc *q, unsigned long cl)
818 {
819 }
820 
821 static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
822 					     unsigned long cl)
823 {
824 	struct sfq_sched_data *q = qdisc_priv(sch);
825 
826 	if (cl)
827 		return NULL;
828 	return &q->filter_list;
829 }
830 
831 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
832 			  struct sk_buff *skb, struct tcmsg *tcm)
833 {
834 	tcm->tcm_handle |= TC_H_MIN(cl);
835 	return 0;
836 }
837 
838 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
839 				struct gnet_dump *d)
840 {
841 	struct sfq_sched_data *q = qdisc_priv(sch);
842 	sfq_index idx = q->ht[cl - 1];
843 	struct gnet_stats_queue qs = { 0 };
844 	struct tc_sfq_xstats xstats = { 0 };
845 
846 	if (idx != SFQ_EMPTY_SLOT) {
847 		const struct sfq_slot *slot = &q->slots[idx];
848 
849 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
850 		qs.qlen = slot->qlen;
851 		qs.backlog = slot->backlog;
852 	}
853 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
854 		return -1;
855 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
856 }
857 
858 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
859 {
860 	struct sfq_sched_data *q = qdisc_priv(sch);
861 	unsigned int i;
862 
863 	if (arg->stop)
864 		return;
865 
866 	for (i = 0; i < q->divisor; i++) {
867 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
868 		    arg->count < arg->skip) {
869 			arg->count++;
870 			continue;
871 		}
872 		if (arg->fn(sch, i + 1, arg) < 0) {
873 			arg->stop = 1;
874 			break;
875 		}
876 		arg->count++;
877 	}
878 }
879 
880 static const struct Qdisc_class_ops sfq_class_ops = {
881 	.leaf		=	sfq_leaf,
882 	.get		=	sfq_get,
883 	.put		=	sfq_put,
884 	.tcf_chain	=	sfq_find_tcf,
885 	.bind_tcf	=	sfq_bind,
886 	.unbind_tcf	=	sfq_put,
887 	.dump		=	sfq_dump_class,
888 	.dump_stats	=	sfq_dump_class_stats,
889 	.walk		=	sfq_walk,
890 };
891 
892 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
893 	.cl_ops		=	&sfq_class_ops,
894 	.id		=	"sfq",
895 	.priv_size	=	sizeof(struct sfq_sched_data),
896 	.enqueue	=	sfq_enqueue,
897 	.dequeue	=	sfq_dequeue,
898 	.peek		=	qdisc_peek_dequeued,
899 	.drop		=	sfq_drop,
900 	.init		=	sfq_init,
901 	.reset		=	sfq_reset,
902 	.destroy	=	sfq_destroy,
903 	.change		=	NULL,
904 	.dump		=	sfq_dump,
905 	.owner		=	THIS_MODULE,
906 };
907 
908 static int __init sfq_module_init(void)
909 {
910 	return register_qdisc(&sfq_qdisc_ops);
911 }
912 static void __exit sfq_module_exit(void)
913 {
914 	unregister_qdisc(&sfq_qdisc_ops);
915 }
916 module_init(sfq_module_init)
917 module_exit(sfq_module_exit)
918 MODULE_LICENSE("GPL");
919