1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/pkt_cls.h>
27 #include <net/red.h>
28 
29 
30 /*	Stochastic Fairness Queuing algorithm.
31 	=======================================
32 
33 	Source:
34 	Paul E. McKenney "Stochastic Fairness Queuing",
35 	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.
36 
37 	Paul E. McKenney "Stochastic Fairness Queuing",
38 	"Internetworking: Research and Experience", v.2, 1991, pp. 113-131.
39 
40 
41 	See also:
42 	M. Shreedhar and George Varghese "Efficient Fair
43 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
44 
45 
46 	This is not the thing that is usually called (W)FQ nowadays.
47 	It does not use any timestamp mechanism, but instead
48 	processes queues in round-robin order.
49 
50 	ADVANTAGE:
51 
52 	- It is very cheap. Both CPU and memory requirements are minimal.
53 
54 	DRAWBACKS:
55 
56 	- "Stochastic" -> It is not 100% fair.
57 	When hash collisions occur, several flows are treated as one.
58 
59 	- "Round-robin" -> It introduces larger delays than virtual clock
60 	based schemes, and should not be used for isolating interactive
61 	traffic from non-interactive traffic. This means that this scheduler
62 	should be used as a leaf of CBQ or P3, which put interactive traffic
63 	into a higher priority band.
64 
65 	We still need true WFQ for the top level CSZ, but using WFQ
66 	for best-effort traffic is absolutely pointless:
67 	SFQ is superior for this purpose.
68 
69 	IMPLEMENTATION:
70 	This implementation limits:
71 	- the maximum queue length per flow to 127 packets;
72 	- the maximum MTU to 2^18 - 1;
73 	- the number of flows to 65408;
74 	- the number of hash buckets to 65536.
75 
76 	It is easy to increase these values, but not while the qdisc is in use.  */
77 
78 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
79 #define SFQ_DEFAULT_FLOWS	128
80 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
81 #define SFQ_EMPTY_SLOT		0xffff
82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 
84 /* We use 16 bits to store allot, and want to handle packets up to 64K.
85  * Scale allot down by 8 (1 << 3) so that no overflow occurs.
86  */
87 #define SFQ_ALLOT_SHIFT		3
88 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
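
/* Worked example (illustrative, assuming a typical 1514 byte Ethernet frame):
 *   SFQ_ALLOT_SIZE(1514)  = DIV_ROUND_UP(1514, 8)  = 190
 *   SFQ_ALLOT_SIZE(65535) = DIV_ROUND_UP(65535, 8) = 8192
 * so even a 64K packet consumes only 8192 scaled units, which fits comfortably
 * in the signed 16-bit 'allot' field of struct sfq_slot below.
 */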
89 
90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
91 typedef u16 sfq_index;
92 
93 /*
94  * We don't use pointers, in order to save space.
95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' into the slots[] array,
96  * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
97  * are 'pointers' into the dep[] array.
98  */
99 struct sfq_head {
100 	sfq_index	next;
101 	sfq_index	prev;
102 };
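
/* Illustration of the index encoding (the values are examples, not special constants):
 *   sfq_index 5                  -> &q->slots[5].dep   (a flow slot)
 *   sfq_index SFQ_MAX_FLOWS + 3  -> &q->dep[3]         (head of the list of flows
 *                                                       currently holding 3 packets)
 * sfq_dep_head() below performs exactly this translation.
 */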
103 
104 struct sfq_slot {
105 	struct sk_buff	*skblist_next;
106 	struct sk_buff	*skblist_prev;
107 	sfq_index	qlen; /* number of skbs in skblist */
108 	sfq_index	next; /* next slot in sfq RR chain */
109 	struct sfq_head dep; /* anchor in dep[] chains */
110 	unsigned short	hash; /* hash value (index in ht[]) */
111 	short		allot; /* credit for this slot */
112 
113 	unsigned int    backlog;
114 	struct red_vars vars;
115 };
116 
117 struct sfq_sched_data {
118 /* frequently used fields */
119 	int		limit;		/* limit of total number of packets in this qdisc */
120 	unsigned int	divisor;	/* number of slots in hash table */
121 	u8		headdrop;
122 	u8		maxdepth;	/* limit of packets per flow */
123 
124 	u32		perturbation;
125 	u8		cur_depth;	/* depth of longest slot */
126 	u8		flags;
127 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128 	struct tcf_proto __rcu *filter_list;
129 	struct tcf_block *block;
130 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
131 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
132 
133 	struct red_parms *red_parms;
134 	struct tc_sfqred_stats stats;
135 	struct sfq_slot *tail;		/* current slot in round */
136 
137 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
138 					/* Linked lists of slots, indexed by depth
139 					 * dep[0] : list of unused flows
140 					 * dep[1] : list of flows with 1 packet
141 					 * dep[X] : list of flows with X packets
142 					 */
143 
144 	unsigned int	maxflows;	/* number of flows in flows array */
145 	int		perturb_period;
146 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
147 	struct timer_list perturb_timer;
148 };
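
/* How the dep[] bookkeeping is used (summary added for clarity): every active
 * slot is linked, through its embedded sfq_head, into the circular list rooted
 * at dep[qlen]. cur_depth tracks the largest qlen with a non-empty list, so
 * sfq_drop() can find one of the longest flows in O(1) by reading
 * dep[cur_depth].next instead of scanning all slots.
 */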
149 
150 /*
151  * sfq_head structures are embedded either in an sfq_slot or in the dep[] array
152  */
153 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
154 {
155 	if (val < SFQ_MAX_FLOWS)
156 		return &q->slots[val].dep;
157 	return &q->dep[val - SFQ_MAX_FLOWS];
158 }
159 
160 static unsigned int sfq_hash(const struct sfq_sched_data *q,
161 			     const struct sk_buff *skb)
162 {
163 	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
164 }
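
/* Example (divisor is validated elsewhere to be a power of two, default 1024):
 * with q->divisor == 1024, the '& (q->divisor - 1)' above keeps the low 10 bits
 * of the perturbed flow hash, i.e. it selects one of 1024 hash buckets.
 */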
165 
166 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
167 				 int *qerr)
168 {
169 	struct sfq_sched_data *q = qdisc_priv(sch);
170 	struct tcf_result res;
171 	struct tcf_proto *fl;
172 	int result;
173 
174 	if (TC_H_MAJ(skb->priority) == sch->handle &&
175 	    TC_H_MIN(skb->priority) > 0 &&
176 	    TC_H_MIN(skb->priority) <= q->divisor)
177 		return TC_H_MIN(skb->priority);
178 
179 	fl = rcu_dereference_bh(q->filter_list);
180 	if (!fl)
181 		return sfq_hash(q, skb) + 1;
182 
183 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
184 	result = tcf_classify(skb, fl, &res, false);
185 	if (result >= 0) {
186 #ifdef CONFIG_NET_CLS_ACT
187 		switch (result) {
188 		case TC_ACT_STOLEN:
189 		case TC_ACT_QUEUED:
190 		case TC_ACT_TRAP:
191 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
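			/* fall through */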
192 		case TC_ACT_SHOT:
193 			return 0;
194 		}
195 #endif
196 		if (TC_H_MIN(res.classid) <= q->divisor)
197 			return TC_H_MIN(res.classid);
198 	}
199 	return 0;
200 }
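
/* Usage sketch (the handle value is only an example): if this qdisc has handle
 * 8001: and a packet arrives with skb->priority set to 8001:5, sfq_classify()
 * returns 5 and the caller uses bucket ht[4]. Otherwise a tc filter result or
 * the perturbed hash (plus one) picks the bucket, and 0 means "drop".
 */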
201 
202 /*
203  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
204  */
205 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
206 {
207 	sfq_index p, n;
208 	struct sfq_slot *slot = &q->slots[x];
209 	int qlen = slot->qlen;
210 
211 	p = qlen + SFQ_MAX_FLOWS;
212 	n = q->dep[qlen].next;
213 
214 	slot->dep.next = n;
215 	slot->dep.prev = p;
216 
217 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
218 	sfq_dep_head(q, n)->prev = x;
219 }
220 
221 #define sfq_unlink(q, x, n, p)			\
222 	do {					\
223 		n = q->slots[x].dep.next;	\
224 		p = q->slots[x].dep.prev;	\
225 		sfq_dep_head(q, p)->next = n;	\
226 		sfq_dep_head(q, n)->prev = p;	\
227 	} while (0)
228 
229 
230 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
231 {
232 	sfq_index p, n;
233 	int d;
234 
235 	sfq_unlink(q, x, n, p);
236 
237 	d = q->slots[x].qlen--;
238 	if (n == p && q->cur_depth == d)
239 		q->cur_depth--;
240 	sfq_link(q, x);
241 }
242 
243 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
244 {
245 	sfq_index p, n;
246 	int d;
247 
248 	sfq_unlink(q, x, n, p);
249 
250 	d = ++q->slots[x].qlen;
251 	if (q->cur_depth < d)
252 		q->cur_depth = d;
253 	sfq_link(q, x);
254 }
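
/* Example of the depth bookkeeping above: when a packet is queued to a flow
 * whose qlen goes from 2 to 3, sfq_inc() unlinks the slot from the dep[2] ring
 * and relinks it at the head of dep[3], raising cur_depth to 3 if it was lower.
 * sfq_dec() performs the mirror operation on dequeue or drop.
 */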
255 
256 /* helper functions: might be changed when/if skb uses a standard list_head */
257 
258 /* remove one skb from tail of slot queue */
259 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
260 {
261 	struct sk_buff *skb = slot->skblist_prev;
262 
263 	slot->skblist_prev = skb->prev;
264 	skb->prev->next = (struct sk_buff *)slot;
265 	skb->next = skb->prev = NULL;
266 	return skb;
267 }
268 
269 /* remove one skb from head of slot queue */
270 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
271 {
272 	struct sk_buff *skb = slot->skblist_next;
273 
274 	slot->skblist_next = skb->next;
275 	skb->next->prev = (struct sk_buff *)slot;
276 	skb->next = skb->prev = NULL;
277 	return skb;
278 }
279 
280 static inline void slot_queue_init(struct sfq_slot *slot)
281 {
282 	memset(slot, 0, sizeof(*slot));
283 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
284 }
285 
286 /* add skb to slot queue (tail add) */
287 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
288 {
289 	skb->prev = slot->skblist_prev;
290 	skb->next = (struct sk_buff *)slot;
291 	slot->skblist_prev->next = skb;
292 	slot->skblist_prev = skb;
293 }
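
/* Note on the helpers above: the slot itself is cast to (struct sk_buff *) and
 * used as the list head, so an empty slot satisfies
 *   slot->skblist_next == slot->skblist_prev == (struct sk_buff *)slot
 * exactly like a hand-rolled circular list_head.
 */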
294 
295 static unsigned int sfq_drop(struct Qdisc *sch)
296 {
297 	struct sfq_sched_data *q = qdisc_priv(sch);
298 	sfq_index x, d = q->cur_depth;
299 	struct sk_buff *skb;
300 	unsigned int len;
301 	struct sfq_slot *slot;
302 
303 	/* Queue is full! Find the longest slot and drop a packet from it (tail, or head if headdrop is set) */
304 	if (d > 1) {
305 		x = q->dep[d].next;
306 		slot = &q->slots[x];
307 drop:
308 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
309 		len = qdisc_pkt_len(skb);
310 		slot->backlog -= len;
311 		sfq_dec(q, x);
312 		sch->q.qlen--;
313 		qdisc_qstats_drop(sch);
314 		qdisc_qstats_backlog_dec(sch, skb);
315 		kfree_skb(skb);
316 		return len;
317 	}
318 
319 	if (d == 1) {
320 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
321 		x = q->tail->next;
322 		slot = &q->slots[x];
323 		q->tail->next = slot->next;
324 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
325 		goto drop;
326 	}
327 
328 	return 0;
329 }
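
/* Example: with cur_depth == 5, dep[5].next names one of the flows currently
 * holding five packets, and one packet is dropped from it (tail, or head when
 * headdrop is enabled). When every active flow holds exactly one packet
 * (d == 1), the flow following q->tail is emptied and unhashed instead.
 */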
330 
331 /* Is ECN parameter configured */
332 static int sfq_prob_mark(const struct sfq_sched_data *q)
333 {
334 	return q->flags & TC_RED_ECN;
335 }
336 
337 /* Should packets over max threshold just be marked */
338 static int sfq_hard_mark(const struct sfq_sched_data *q)
339 {
340 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
341 }
342 
343 static int sfq_headdrop(const struct sfq_sched_data *q)
344 {
345 	return q->headdrop;
346 }
347 
348 static int
349 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
350 {
351 	struct sfq_sched_data *q = qdisc_priv(sch);
352 	unsigned int hash, dropped;
353 	sfq_index x, qlen;
354 	struct sfq_slot *slot;
355 	int uninitialized_var(ret);
356 	struct sk_buff *head;
357 	int delta;
358 
359 	hash = sfq_classify(skb, sch, &ret);
360 	if (hash == 0) {
361 		if (ret & __NET_XMIT_BYPASS)
362 			qdisc_qstats_drop(sch);
363 		kfree_skb(skb);
364 		return ret;
365 	}
366 	hash--;
367 
368 	x = q->ht[hash];
369 	slot = &q->slots[x];
370 	if (x == SFQ_EMPTY_SLOT) {
371 		x = q->dep[0].next; /* get a free slot */
372 		if (x >= SFQ_MAX_FLOWS)
373 			return qdisc_drop(skb, sch, to_free);
374 		q->ht[hash] = x;
375 		slot = &q->slots[x];
376 		slot->hash = hash;
377 		slot->backlog = 0; /* should already be 0 anyway... */
378 		red_set_vars(&slot->vars);
379 		goto enqueue;
380 	}
381 	if (q->red_parms) {
382 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
383 							&slot->vars,
384 							slot->backlog);
385 		switch (red_action(q->red_parms,
386 				   &slot->vars,
387 				   slot->vars.qavg)) {
388 		case RED_DONT_MARK:
389 			break;
390 
391 		case RED_PROB_MARK:
392 			qdisc_qstats_overlimit(sch);
393 			if (sfq_prob_mark(q)) {
394 				/* We know we have at least one packet in queue */
395 				if (sfq_headdrop(q) &&
396 				    INET_ECN_set_ce(slot->skblist_next)) {
397 					q->stats.prob_mark_head++;
398 					break;
399 				}
400 				if (INET_ECN_set_ce(skb)) {
401 					q->stats.prob_mark++;
402 					break;
403 				}
404 			}
405 			q->stats.prob_drop++;
406 			goto congestion_drop;
407 
408 		case RED_HARD_MARK:
409 			qdisc_qstats_overlimit(sch);
410 			if (sfq_hard_mark(q)) {
411 				/* We know we have at least one packet in queue */
412 				if (sfq_headdrop(q) &&
413 				    INET_ECN_set_ce(slot->skblist_next)) {
414 					q->stats.forced_mark_head++;
415 					break;
416 				}
417 				if (INET_ECN_set_ce(skb)) {
418 					q->stats.forced_mark++;
419 					break;
420 				}
421 			}
422 			q->stats.forced_drop++;
423 			goto congestion_drop;
424 		}
425 	}
426 
427 	if (slot->qlen >= q->maxdepth) {
428 congestion_drop:
429 		if (!sfq_headdrop(q))
430 			return qdisc_drop(skb, sch, to_free);
431 
432 		/* We know we have at least one packet in queue */
433 		head = slot_dequeue_head(slot);
434 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
435 		sch->qstats.backlog -= delta;
436 		slot->backlog -= delta;
437 		qdisc_drop(head, sch, to_free);
438 
439 		slot_queue_add(slot, skb);
440 		qdisc_tree_reduce_backlog(sch, 0, delta);
441 		return NET_XMIT_CN;
442 	}
443 
444 enqueue:
445 	qdisc_qstats_backlog_inc(sch, skb);
446 	slot->backlog += qdisc_pkt_len(skb);
447 	slot_queue_add(slot, skb);
448 	sfq_inc(q, x);
449 	if (slot->qlen == 1) {		/* The flow is new */
450 		if (q->tail == NULL) {	/* It is the first flow */
451 			slot->next = x;
452 		} else {
453 			slot->next = q->tail->next;
454 			q->tail->next = x;
455 		}
456 		/* We put this flow at the end of our flow list.
457 		 * It might sound unfair that a new flow has to wait behind old ones,
458 		 * but otherwise we could end up servicing only new flows and freezing old ones.
459 		 */
460 		q->tail = slot;
461 		/* We could use a bigger initial quantum for new flows */
462 		slot->allot = q->scaled_quantum;
463 	}
464 	if (++sch->q.qlen <= q->limit)
465 		return NET_XMIT_SUCCESS;
466 
467 	qlen = slot->qlen;
468 	dropped = sfq_drop(sch);
469 	/* Return Congestion Notification only if we dropped a packet
470 	 * from this flow.
471 	 */
472 	if (qlen != slot->qlen) {
473 		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
474 		return NET_XMIT_CN;
475 	}
476 
477 	/* As we dropped a packet, better let the upper stack know about it */
478 	qdisc_tree_reduce_backlog(sch, 1, dropped);
479 	return NET_XMIT_SUCCESS;
480 }
481 
482 static struct sk_buff *
483 sfq_dequeue(struct Qdisc *sch)
484 {
485 	struct sfq_sched_data *q = qdisc_priv(sch);
486 	struct sk_buff *skb;
487 	sfq_index a, next_a;
488 	struct sfq_slot *slot;
489 
490 	/* No active slots */
491 	if (q->tail == NULL)
492 		return NULL;
493 
494 next_slot:
495 	a = q->tail->next;
496 	slot = &q->slots[a];
497 	if (slot->allot <= 0) {
498 		q->tail = slot;
499 		slot->allot += q->scaled_quantum;
500 		goto next_slot;
501 	}
502 	skb = slot_dequeue_head(slot);
503 	sfq_dec(q, a);
504 	qdisc_bstats_update(sch, skb);
505 	sch->q.qlen--;
506 	qdisc_qstats_backlog_dec(sch, skb);
507 	slot->backlog -= qdisc_pkt_len(skb);
508 	/* Is the slot empty? */
509 	if (slot->qlen == 0) {
510 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
511 		next_a = slot->next;
512 		if (a == next_a) {
513 			q->tail = NULL; /* no more active slots */
514 			return skb;
515 		}
516 		q->tail->next = next_a;
517 	} else {
518 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
519 	}
520 	return skb;
521 }
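
/* Deficit round robin example (the quantum value is illustrative): with
 * quantum == 1514, scaled_quantum == SFQ_ALLOT_SIZE(1514) == 190. A flow that
 * becomes active starts with allot == 190; each dequeued packet subtracts
 * SFQ_ALLOT_SIZE(len), and once allot drops to zero or below the flow is made
 * the new tail and recharged with another scaled_quantum before the round
 * moves on.
 */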
522 
523 static void
524 sfq_reset(struct Qdisc *sch)
525 {
526 	struct sk_buff *skb;
527 
528 	while ((skb = sfq_dequeue(sch)) != NULL)
529 		rtnl_kfree_skbs(skb, skb);
530 }
531 
532 /*
533  * When q->perturbation is changed, we rehash all queued skbs
534  * to avoid OOO (Out Of Order) effects.
535  * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
536  * counters.
537  */
538 static void sfq_rehash(struct Qdisc *sch)
539 {
540 	struct sfq_sched_data *q = qdisc_priv(sch);
541 	struct sk_buff *skb;
542 	int i;
543 	struct sfq_slot *slot;
544 	struct sk_buff_head list;
545 	int dropped = 0;
546 	unsigned int drop_len = 0;
547 
548 	__skb_queue_head_init(&list);
549 
550 	for (i = 0; i < q->maxflows; i++) {
551 		slot = &q->slots[i];
552 		if (!slot->qlen)
553 			continue;
554 		while (slot->qlen) {
555 			skb = slot_dequeue_head(slot);
556 			sfq_dec(q, i);
557 			__skb_queue_tail(&list, skb);
558 		}
559 		slot->backlog = 0;
560 		red_set_vars(&slot->vars);
561 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
562 	}
563 	q->tail = NULL;
564 
565 	while ((skb = __skb_dequeue(&list)) != NULL) {
566 		unsigned int hash = sfq_hash(q, skb);
567 		sfq_index x = q->ht[hash];
568 
569 		slot = &q->slots[x];
570 		if (x == SFQ_EMPTY_SLOT) {
571 			x = q->dep[0].next; /* get a free slot */
572 			if (x >= SFQ_MAX_FLOWS) {
573 drop:
574 				qdisc_qstats_backlog_dec(sch, skb);
575 				drop_len += qdisc_pkt_len(skb);
576 				kfree_skb(skb);
577 				dropped++;
578 				continue;
579 			}
580 			q->ht[hash] = x;
581 			slot = &q->slots[x];
582 			slot->hash = hash;
583 		}
584 		if (slot->qlen >= q->maxdepth)
585 			goto drop;
586 		slot_queue_add(slot, skb);
587 		if (q->red_parms)
588 			slot->vars.qavg = red_calc_qavg(q->red_parms,
589 							&slot->vars,
590 							slot->backlog);
591 		slot->backlog += qdisc_pkt_len(skb);
592 		sfq_inc(q, x);
593 		if (slot->qlen == 1) {		/* The flow is new */
594 			if (q->tail == NULL) {	/* It is the first flow */
595 				slot->next = x;
596 			} else {
597 				slot->next = q->tail->next;
598 				q->tail->next = x;
599 			}
600 			q->tail = slot;
601 			slot->allot = q->scaled_quantum;
602 		}
603 	}
604 	sch->q.qlen -= dropped;
605 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
606 }
607 
608 static void sfq_perturbation(unsigned long arg)
609 {
610 	struct Qdisc *sch = (struct Qdisc *)arg;
611 	struct sfq_sched_data *q = qdisc_priv(sch);
612 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
613 
614 	spin_lock(root_lock);
615 	q->perturbation = prandom_u32();
616 	if (!q->filter_list && q->tail)
617 		sfq_rehash(sch);
618 	spin_unlock(root_lock);
619 
620 	if (q->perturb_period)
621 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
622 }
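
/* Example: "tc qdisc add dev eth0 root sfq perturb 10" (the device name is
 * illustrative) sets perturb_period to 10 * HZ, so this timer fires every ten
 * seconds, picks a fresh perturbation key and, when no tc filters are
 * attached, rehashes the queued packets; two flows that happened to collide
 * in the same bucket are then unlikely to keep colliding.
 */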
623 
624 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
625 {
626 	struct sfq_sched_data *q = qdisc_priv(sch);
627 	struct tc_sfq_qopt *ctl = nla_data(opt);
628 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
629 	unsigned int qlen, dropped = 0;
630 	struct red_parms *p = NULL;
631 
632 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
633 		return -EINVAL;
634 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
635 		ctl_v1 = nla_data(opt);
636 	if (ctl->divisor &&
637 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
638 		return -EINVAL;
639 	if (ctl_v1 && ctl_v1->qth_min) {
640 		p = kmalloc(sizeof(*p), GFP_KERNEL);
641 		if (!p)
642 			return -ENOMEM;
643 	}
644 	sch_tree_lock(sch);
645 	if (ctl->quantum) {
646 		q->quantum = ctl->quantum;
647 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
648 	}
649 	q->perturb_period = ctl->perturb_period * HZ;
650 	if (ctl->flows)
651 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
652 	if (ctl->divisor) {
653 		q->divisor = ctl->divisor;
654 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
655 	}
656 	if (ctl_v1) {
657 		if (ctl_v1->depth)
658 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
659 		if (p) {
660 			swap(q->red_parms, p);
661 			red_set_parms(q->red_parms,
662 				      ctl_v1->qth_min, ctl_v1->qth_max,
663 				      ctl_v1->Wlog,
664 				      ctl_v1->Plog, ctl_v1->Scell_log,
665 				      NULL,
666 				      ctl_v1->max_P);
667 		}
668 		q->flags = ctl_v1->flags;
669 		q->headdrop = ctl_v1->headdrop;
670 	}
671 	if (ctl->limit) {
672 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
673 		q->maxflows = min_t(u32, q->maxflows, q->limit);
674 	}
675 
676 	qlen = sch->q.qlen;
677 	while (sch->q.qlen > q->limit)
678 		dropped += sfq_drop(sch);
679 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
680 
681 	del_timer(&q->perturb_timer);
682 	if (q->perturb_period) {
683 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
684 		q->perturbation = prandom_u32();
685 	}
686 	sch_tree_unlock(sch);
687 	kfree(p);
688 	return 0;
689 }
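
/* Parameter interaction example (values are illustrative): changing an sfq
 * qdisc with flows 512, divisor 2048, depth 100 and limit 3000 leaves
 * maxflows = min(512, 2048) = 512, maxdepth = 100 and
 * limit = min(3000, 100 * 512) = 3000; any packets beyond the new limit are
 * dropped immediately under the qdisc tree lock.
 */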
690 
691 static void *sfq_alloc(size_t sz)
692 {
693 	return kvmalloc(sz, GFP_KERNEL);
694 }
695 
696 static void sfq_free(void *addr)
697 {
698 	kvfree(addr);
699 }
700 
701 static void sfq_destroy(struct Qdisc *sch)
702 {
703 	struct sfq_sched_data *q = qdisc_priv(sch);
704 
705 	tcf_block_put(q->block);
706 	q->perturb_period = 0;
707 	del_timer_sync(&q->perturb_timer);
708 	sfq_free(q->ht);
709 	sfq_free(q->slots);
710 	kfree(q->red_parms);
711 }
712 
713 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
714 {
715 	struct sfq_sched_data *q = qdisc_priv(sch);
716 	int i;
717 	int err;
718 
719 	err = tcf_block_get(&q->block, &q->filter_list);
720 	if (err)
721 		return err;
722 
723 	setup_deferrable_timer(&q->perturb_timer, sfq_perturbation,
724 			       (unsigned long)sch);
725 
726 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
727 		q->dep[i].next = i + SFQ_MAX_FLOWS;
728 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
729 	}
730 
731 	q->limit = SFQ_MAX_DEPTH;
732 	q->maxdepth = SFQ_MAX_DEPTH;
733 	q->cur_depth = 0;
734 	q->tail = NULL;
735 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
736 	q->maxflows = SFQ_DEFAULT_FLOWS;
737 	q->quantum = psched_mtu(qdisc_dev(sch));
738 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
739 	q->perturb_period = 0;
740 	q->perturbation = prandom_u32();
741 
742 	if (opt) {
743 		int err = sfq_change(sch, opt);
744 		if (err)
745 			return err;
746 	}
747 
748 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
749 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
750 	if (!q->ht || !q->slots) {
751 		/* Note: sfq_destroy() will be called by our caller */
752 		return -ENOMEM;
753 	}
754 
755 	for (i = 0; i < q->divisor; i++)
756 		q->ht[i] = SFQ_EMPTY_SLOT;
757 
758 	for (i = 0; i < q->maxflows; i++) {
759 		slot_queue_init(&q->slots[i]);
760 		sfq_link(q, i);
761 	}
762 	if (q->limit >= 1)
763 		sch->flags |= TCQ_F_CAN_BYPASS;
764 	else
765 		sch->flags &= ~TCQ_F_CAN_BYPASS;
766 	return 0;
767 }
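
/* Default configuration example: on a device with a 1500 byte MTU (psched_mtu
 * adds the hard header length, giving roughly 1514), an sfq qdisc created
 * without options ends up with quantum ~1514, scaled_quantum ~190,
 * divisor 1024, maxflows 128, maxdepth 127 and limit 127, with perturbation
 * disabled.
 */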
768 
769 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
770 {
771 	struct sfq_sched_data *q = qdisc_priv(sch);
772 	unsigned char *b = skb_tail_pointer(skb);
773 	struct tc_sfq_qopt_v1 opt;
774 	struct red_parms *p = q->red_parms;
775 
776 	memset(&opt, 0, sizeof(opt));
777 	opt.v0.quantum	= q->quantum;
778 	opt.v0.perturb_period = q->perturb_period / HZ;
779 	opt.v0.limit	= q->limit;
780 	opt.v0.divisor	= q->divisor;
781 	opt.v0.flows	= q->maxflows;
782 	opt.depth	= q->maxdepth;
783 	opt.headdrop	= q->headdrop;
784 
785 	if (p) {
786 		opt.qth_min	= p->qth_min >> p->Wlog;
787 		opt.qth_max	= p->qth_max >> p->Wlog;
788 		opt.Wlog	= p->Wlog;
789 		opt.Plog	= p->Plog;
790 		opt.Scell_log	= p->Scell_log;
791 		opt.max_P	= p->max_P;
792 	}
793 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
794 	opt.flags	= q->flags;
795 
796 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
797 		goto nla_put_failure;
798 
799 	return skb->len;
800 
801 nla_put_failure:
802 	nlmsg_trim(skb, b);
803 	return -1;
804 }
805 
806 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
807 {
808 	return NULL;
809 }
810 
811 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
812 {
813 	return 0;
814 }
815 
816 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
817 			      u32 classid)
818 {
819 	/* we cannot bypass queue discipline anymore */
820 	sch->flags &= ~TCQ_F_CAN_BYPASS;
821 	return 0;
822 }
823 
824 static void sfq_put(struct Qdisc *q, unsigned long cl)
825 {
826 }
827 
828 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
829 {
830 	struct sfq_sched_data *q = qdisc_priv(sch);
831 
832 	if (cl)
833 		return NULL;
834 	return q->block;
835 }
836 
837 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
838 			  struct sk_buff *skb, struct tcmsg *tcm)
839 {
840 	tcm->tcm_handle |= TC_H_MIN(cl);
841 	return 0;
842 }
843 
844 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
845 				struct gnet_dump *d)
846 {
847 	struct sfq_sched_data *q = qdisc_priv(sch);
848 	sfq_index idx = q->ht[cl - 1];
849 	struct gnet_stats_queue qs = { 0 };
850 	struct tc_sfq_xstats xstats = { 0 };
851 
852 	if (idx != SFQ_EMPTY_SLOT) {
853 		const struct sfq_slot *slot = &q->slots[idx];
854 
855 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
856 		qs.qlen = slot->qlen;
857 		qs.backlog = slot->backlog;
858 	}
859 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
860 		return -1;
861 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
862 }
863 
864 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
865 {
866 	struct sfq_sched_data *q = qdisc_priv(sch);
867 	unsigned int i;
868 
869 	if (arg->stop)
870 		return;
871 
872 	for (i = 0; i < q->divisor; i++) {
873 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
874 		    arg->count < arg->skip) {
875 			arg->count++;
876 			continue;
877 		}
878 		if (arg->fn(sch, i + 1, arg) < 0) {
879 			arg->stop = 1;
880 			break;
881 		}
882 		arg->count++;
883 	}
884 }
885 
886 static const struct Qdisc_class_ops sfq_class_ops = {
887 	.leaf		=	sfq_leaf,
888 	.get		=	sfq_get,
889 	.put		=	sfq_put,
890 	.tcf_block	=	sfq_tcf_block,
891 	.bind_tcf	=	sfq_bind,
892 	.unbind_tcf	=	sfq_put,
893 	.dump		=	sfq_dump_class,
894 	.dump_stats	=	sfq_dump_class_stats,
895 	.walk		=	sfq_walk,
896 };
897 
898 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
899 	.cl_ops		=	&sfq_class_ops,
900 	.id		=	"sfq",
901 	.priv_size	=	sizeof(struct sfq_sched_data),
902 	.enqueue	=	sfq_enqueue,
903 	.dequeue	=	sfq_dequeue,
904 	.peek		=	qdisc_peek_dequeued,
905 	.init		=	sfq_init,
906 	.reset		=	sfq_reset,
907 	.destroy	=	sfq_destroy,
908 	.change		=	NULL,
909 	.dump		=	sfq_dump,
910 	.owner		=	THIS_MODULE,
911 };
912 
913 static int __init sfq_module_init(void)
914 {
915 	return register_qdisc(&sfq_qdisc_ops);
916 }
917 static void __exit sfq_module_exit(void)
918 {
919 	unregister_qdisc(&sfq_qdisc_ops);
920 }
921 module_init(sfq_module_init)
922 module_exit(sfq_module_exit)
923 MODULE_LICENSE("GPL");
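
/* Typical usage from userspace (illustrative; the interface name and numbers
 * are examples only):
 *
 *   # plain SFQ with hash perturbation every 10 seconds
 *   tc qdisc add dev eth0 root sfq perturb 10
 *
 *   # SFQ with per-flow RED/ECN marking and head-drop
 *   tc qdisc add dev eth0 root sfq limit 3000 depth 100 divisor 1024 \
 *        redflowlimit 100000 min 8000 max 60000 probability 0.2 ecn headdrop
 */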
924