xref: /openbmc/linux/net/sched/sch_sfq.c (revision ce932d0c5589e9766e089c22c66890dfc48fbd94)
1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/flow_keys.h>
27 #include <net/red.h>
28 
29 
30 /*	Stochastic Fairness Queuing algorithm.
31 	=======================================
32 
33 	Source:
34 	Paul E. McKenney "Stochastic Fairness Queuing",
35 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
36 
37 	Paul E. McKenney "Stochastic Fairness Queuing",
38 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
39 
40 
41 	See also:
42 	M. Shreedhar and George Varghese "Efficient Fair
43 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
44 
45 
46 	This is not the thing that is usually called (W)FQ nowadays.
47 	It does not use any timestamp mechanism, but instead
48 	processes queues in round-robin order.
49 
50 	ADVANTAGE:
51 
52 	- It is very cheap. Both CPU and memory requirements are minimal.
53 
54 	DRAWBACKS:
55 
56 	- "Stochastic" -> It is not 100% fair.
57 	When hash collisions occur, several flows are considered as one.
58 
59 	- "Round-robin" -> It introduces larger delays than virtual clock
60 	based schemes, and should not be used for isolating interactive
61 	traffic	from non-interactive. It means, that this scheduler
62 	should be used as leaf of CBQ or P3, which put interactive traffic
63 	to higher priority band.
64 
65 	We still need true WFQ for top level CSZ, but using WFQ
66 	for the best effort traffic is absolutely pointless:
67 	SFQ is superior for this purpose.
68 
69 	IMPLEMENTATION:
70 	This implementation limits :
71 	- maximal queue length per flow to 127 packets.
72 	- max mtu to 2^18-1;
73 	- max 65408 flows,
74 	- number of hash buckets to 65536.
75 
76 	It is easy to increase these values, but not in flight.  */
77 
78 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
79 #define SFQ_DEFAULT_FLOWS	128
80 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
81 #define SFQ_EMPTY_SLOT		0xffff
82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 
84 /* We use 16 bits to store allot, and want to handle packets up to 64K
85  * Scale allot by 8 (1<<3) so that no overflow occurs.
86  */
87 #define SFQ_ALLOT_SHIFT		3
88 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
89 
90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
91 typedef u16 sfq_index;
92 
93 /*
94  * We dont use pointers to save space.
95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
97  * are 'pointers' to dep[] array
98  */
99 struct sfq_head {
100 	sfq_index	next;
101 	sfq_index	prev;
102 };
103 
104 struct sfq_slot {
105 	struct sk_buff	*skblist_next;
106 	struct sk_buff	*skblist_prev;
107 	sfq_index	qlen; /* number of skbs in skblist */
108 	sfq_index	next; /* next slot in sfq RR chain */
109 	struct sfq_head dep; /* anchor in dep[] chains */
110 	unsigned short	hash; /* hash value (index in ht[]) */
111 	short		allot; /* credit for this slot */
112 
113 	unsigned int    backlog;
114 	struct red_vars vars;
115 };
116 
117 struct sfq_sched_data {
118 /* frequently used fields */
119 	int		limit;		/* limit of total number of packets in this qdisc */
120 	unsigned int	divisor;	/* number of slots in hash table */
121 	u8		headdrop;
122 	u8		maxdepth;	/* limit of packets per flow */
123 
124 	u32		perturbation;
125 	u8		cur_depth;	/* depth of longest slot */
126 	u8		flags;
127 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128 	struct tcf_proto *filter_list;
129 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
130 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
131 
132 	struct red_parms *red_parms;
133 	struct tc_sfqred_stats stats;
134 	struct sfq_slot *tail;		/* current slot in round */
135 
136 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
137 					/* Linked lists of slots, indexed by depth
138 					 * dep[0] : list of unused flows
139 					 * dep[1] : list of flows with 1 packet
140 					 * dep[X] : list of flows with X packets
141 					 */
142 
143 	unsigned int	maxflows;	/* number of flows in flows array */
144 	int		perturb_period;
145 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
146 	struct timer_list perturb_timer;
147 };
148 
149 /*
150  * sfq_head are either in a sfq_slot or in dep[] array
151  */
152 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
153 {
154 	if (val < SFQ_MAX_FLOWS)
155 		return &q->slots[val].dep;
156 	return &q->dep[val - SFQ_MAX_FLOWS];
157 }
158 
159 /*
160  * In order to be able to quickly rehash our queue when timer changes
161  * q->perturbation, we store flow_keys in skb->cb[]
162  */
163 struct sfq_skb_cb {
164        struct flow_keys        keys;
165 };
166 
167 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
168 {
169 	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
170 	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
171 }
172 
173 static unsigned int sfq_hash(const struct sfq_sched_data *q,
174 			     const struct sk_buff *skb)
175 {
176 	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
177 	unsigned int hash;
178 
179 	hash = jhash_3words((__force u32)keys->dst,
180 			    (__force u32)keys->src ^ keys->ip_proto,
181 			    (__force u32)keys->ports, q->perturbation);
182 	return hash & (q->divisor - 1);
183 }
184 
185 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
186 				 int *qerr)
187 {
188 	struct sfq_sched_data *q = qdisc_priv(sch);
189 	struct tcf_result res;
190 	int result;
191 
192 	if (TC_H_MAJ(skb->priority) == sch->handle &&
193 	    TC_H_MIN(skb->priority) > 0 &&
194 	    TC_H_MIN(skb->priority) <= q->divisor)
195 		return TC_H_MIN(skb->priority);
196 
197 	if (!q->filter_list) {
198 		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
199 		return sfq_hash(q, skb) + 1;
200 	}
201 
202 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
203 	result = tc_classify(skb, q->filter_list, &res);
204 	if (result >= 0) {
205 #ifdef CONFIG_NET_CLS_ACT
206 		switch (result) {
207 		case TC_ACT_STOLEN:
208 		case TC_ACT_QUEUED:
209 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
210 		case TC_ACT_SHOT:
211 			return 0;
212 		}
213 #endif
214 		if (TC_H_MIN(res.classid) <= q->divisor)
215 			return TC_H_MIN(res.classid);
216 	}
217 	return 0;
218 }
219 
220 /*
221  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
222  */
223 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
224 {
225 	sfq_index p, n;
226 	struct sfq_slot *slot = &q->slots[x];
227 	int qlen = slot->qlen;
228 
229 	p = qlen + SFQ_MAX_FLOWS;
230 	n = q->dep[qlen].next;
231 
232 	slot->dep.next = n;
233 	slot->dep.prev = p;
234 
235 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
236 	sfq_dep_head(q, n)->prev = x;
237 }
238 
239 #define sfq_unlink(q, x, n, p)			\
240 	n = q->slots[x].dep.next;		\
241 	p = q->slots[x].dep.prev;		\
242 	sfq_dep_head(q, p)->next = n;		\
243 	sfq_dep_head(q, n)->prev = p
244 
245 
246 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
247 {
248 	sfq_index p, n;
249 	int d;
250 
251 	sfq_unlink(q, x, n, p);
252 
253 	d = q->slots[x].qlen--;
254 	if (n == p && q->cur_depth == d)
255 		q->cur_depth--;
256 	sfq_link(q, x);
257 }
258 
259 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
260 {
261 	sfq_index p, n;
262 	int d;
263 
264 	sfq_unlink(q, x, n, p);
265 
266 	d = ++q->slots[x].qlen;
267 	if (q->cur_depth < d)
268 		q->cur_depth = d;
269 	sfq_link(q, x);
270 }
271 
272 /* helper functions : might be changed when/if skb use a standard list_head */
273 
274 /* remove one skb from tail of slot queue */
275 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
276 {
277 	struct sk_buff *skb = slot->skblist_prev;
278 
279 	slot->skblist_prev = skb->prev;
280 	skb->prev->next = (struct sk_buff *)slot;
281 	skb->next = skb->prev = NULL;
282 	return skb;
283 }
284 
285 /* remove one skb from head of slot queue */
286 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
287 {
288 	struct sk_buff *skb = slot->skblist_next;
289 
290 	slot->skblist_next = skb->next;
291 	skb->next->prev = (struct sk_buff *)slot;
292 	skb->next = skb->prev = NULL;
293 	return skb;
294 }
295 
296 static inline void slot_queue_init(struct sfq_slot *slot)
297 {
298 	memset(slot, 0, sizeof(*slot));
299 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
300 }
301 
302 /* add skb to slot queue (tail add) */
303 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
304 {
305 	skb->prev = slot->skblist_prev;
306 	skb->next = (struct sk_buff *)slot;
307 	slot->skblist_prev->next = skb;
308 	slot->skblist_prev = skb;
309 }
310 
311 #define	slot_queue_walk(slot, skb)		\
312 	for (skb = slot->skblist_next;		\
313 	     skb != (struct sk_buff *)slot;	\
314 	     skb = skb->next)
315 
316 static unsigned int sfq_drop(struct Qdisc *sch)
317 {
318 	struct sfq_sched_data *q = qdisc_priv(sch);
319 	sfq_index x, d = q->cur_depth;
320 	struct sk_buff *skb;
321 	unsigned int len;
322 	struct sfq_slot *slot;
323 
324 	/* Queue is full! Find the longest slot and drop tail packet from it */
325 	if (d > 1) {
326 		x = q->dep[d].next;
327 		slot = &q->slots[x];
328 drop:
329 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
330 		len = qdisc_pkt_len(skb);
331 		slot->backlog -= len;
332 		sfq_dec(q, x);
333 		kfree_skb(skb);
334 		sch->q.qlen--;
335 		sch->qstats.drops++;
336 		sch->qstats.backlog -= len;
337 		return len;
338 	}
339 
340 	if (d == 1) {
341 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
342 		x = q->tail->next;
343 		slot = &q->slots[x];
344 		q->tail->next = slot->next;
345 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
346 		goto drop;
347 	}
348 
349 	return 0;
350 }
351 
352 /* Is ECN parameter configured */
353 static int sfq_prob_mark(const struct sfq_sched_data *q)
354 {
355 	return q->flags & TC_RED_ECN;
356 }
357 
358 /* Should packets over max threshold just be marked */
359 static int sfq_hard_mark(const struct sfq_sched_data *q)
360 {
361 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
362 }
363 
364 static int sfq_headdrop(const struct sfq_sched_data *q)
365 {
366 	return q->headdrop;
367 }
368 
369 static int
370 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
371 {
372 	struct sfq_sched_data *q = qdisc_priv(sch);
373 	unsigned int hash;
374 	sfq_index x, qlen;
375 	struct sfq_slot *slot;
376 	int uninitialized_var(ret);
377 	struct sk_buff *head;
378 	int delta;
379 
380 	hash = sfq_classify(skb, sch, &ret);
381 	if (hash == 0) {
382 		if (ret & __NET_XMIT_BYPASS)
383 			sch->qstats.drops++;
384 		kfree_skb(skb);
385 		return ret;
386 	}
387 	hash--;
388 
389 	x = q->ht[hash];
390 	slot = &q->slots[x];
391 	if (x == SFQ_EMPTY_SLOT) {
392 		x = q->dep[0].next; /* get a free slot */
393 		if (x >= SFQ_MAX_FLOWS)
394 			return qdisc_drop(skb, sch);
395 		q->ht[hash] = x;
396 		slot = &q->slots[x];
397 		slot->hash = hash;
398 		slot->backlog = 0; /* should already be 0 anyway... */
399 		red_set_vars(&slot->vars);
400 		goto enqueue;
401 	}
402 	if (q->red_parms) {
403 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
404 							&slot->vars,
405 							slot->backlog);
406 		switch (red_action(q->red_parms,
407 				   &slot->vars,
408 				   slot->vars.qavg)) {
409 		case RED_DONT_MARK:
410 			break;
411 
412 		case RED_PROB_MARK:
413 			sch->qstats.overlimits++;
414 			if (sfq_prob_mark(q)) {
415 				/* We know we have at least one packet in queue */
416 				if (sfq_headdrop(q) &&
417 				    INET_ECN_set_ce(slot->skblist_next)) {
418 					q->stats.prob_mark_head++;
419 					break;
420 				}
421 				if (INET_ECN_set_ce(skb)) {
422 					q->stats.prob_mark++;
423 					break;
424 				}
425 			}
426 			q->stats.prob_drop++;
427 			goto congestion_drop;
428 
429 		case RED_HARD_MARK:
430 			sch->qstats.overlimits++;
431 			if (sfq_hard_mark(q)) {
432 				/* We know we have at least one packet in queue */
433 				if (sfq_headdrop(q) &&
434 				    INET_ECN_set_ce(slot->skblist_next)) {
435 					q->stats.forced_mark_head++;
436 					break;
437 				}
438 				if (INET_ECN_set_ce(skb)) {
439 					q->stats.forced_mark++;
440 					break;
441 				}
442 			}
443 			q->stats.forced_drop++;
444 			goto congestion_drop;
445 		}
446 	}
447 
448 	if (slot->qlen >= q->maxdepth) {
449 congestion_drop:
450 		if (!sfq_headdrop(q))
451 			return qdisc_drop(skb, sch);
452 
453 		/* We know we have at least one packet in queue */
454 		head = slot_dequeue_head(slot);
455 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
456 		sch->qstats.backlog -= delta;
457 		slot->backlog -= delta;
458 		qdisc_drop(head, sch);
459 
460 		slot_queue_add(slot, skb);
461 		return NET_XMIT_CN;
462 	}
463 
464 enqueue:
465 	sch->qstats.backlog += qdisc_pkt_len(skb);
466 	slot->backlog += qdisc_pkt_len(skb);
467 	slot_queue_add(slot, skb);
468 	sfq_inc(q, x);
469 	if (slot->qlen == 1) {		/* The flow is new */
470 		if (q->tail == NULL) {	/* It is the first flow */
471 			slot->next = x;
472 		} else {
473 			slot->next = q->tail->next;
474 			q->tail->next = x;
475 		}
476 		/* We put this flow at the end of our flow list.
477 		 * This might sound unfair for a new flow to wait after old ones,
478 		 * but we could endup servicing new flows only, and freeze old ones.
479 		 */
480 		q->tail = slot;
481 		/* We could use a bigger initial quantum for new flows */
482 		slot->allot = q->scaled_quantum;
483 	}
484 	if (++sch->q.qlen <= q->limit)
485 		return NET_XMIT_SUCCESS;
486 
487 	qlen = slot->qlen;
488 	sfq_drop(sch);
489 	/* Return Congestion Notification only if we dropped a packet
490 	 * from this flow.
491 	 */
492 	if (qlen != slot->qlen)
493 		return NET_XMIT_CN;
494 
495 	/* As we dropped a packet, better let upper stack know this */
496 	qdisc_tree_decrease_qlen(sch, 1);
497 	return NET_XMIT_SUCCESS;
498 }
499 
500 static struct sk_buff *
501 sfq_dequeue(struct Qdisc *sch)
502 {
503 	struct sfq_sched_data *q = qdisc_priv(sch);
504 	struct sk_buff *skb;
505 	sfq_index a, next_a;
506 	struct sfq_slot *slot;
507 
508 	/* No active slots */
509 	if (q->tail == NULL)
510 		return NULL;
511 
512 next_slot:
513 	a = q->tail->next;
514 	slot = &q->slots[a];
515 	if (slot->allot <= 0) {
516 		q->tail = slot;
517 		slot->allot += q->scaled_quantum;
518 		goto next_slot;
519 	}
520 	skb = slot_dequeue_head(slot);
521 	sfq_dec(q, a);
522 	qdisc_bstats_update(sch, skb);
523 	sch->q.qlen--;
524 	sch->qstats.backlog -= qdisc_pkt_len(skb);
525 	slot->backlog -= qdisc_pkt_len(skb);
526 	/* Is the slot empty? */
527 	if (slot->qlen == 0) {
528 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
529 		next_a = slot->next;
530 		if (a == next_a) {
531 			q->tail = NULL; /* no more active slots */
532 			return skb;
533 		}
534 		q->tail->next = next_a;
535 	} else {
536 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
537 	}
538 	return skb;
539 }
540 
541 static void
542 sfq_reset(struct Qdisc *sch)
543 {
544 	struct sk_buff *skb;
545 
546 	while ((skb = sfq_dequeue(sch)) != NULL)
547 		kfree_skb(skb);
548 }
549 
550 /*
551  * When q->perturbation is changed, we rehash all queued skbs
552  * to avoid OOO (Out Of Order) effects.
553  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
554  * counters.
555  */
556 static void sfq_rehash(struct Qdisc *sch)
557 {
558 	struct sfq_sched_data *q = qdisc_priv(sch);
559 	struct sk_buff *skb;
560 	int i;
561 	struct sfq_slot *slot;
562 	struct sk_buff_head list;
563 	int dropped = 0;
564 
565 	__skb_queue_head_init(&list);
566 
567 	for (i = 0; i < q->maxflows; i++) {
568 		slot = &q->slots[i];
569 		if (!slot->qlen)
570 			continue;
571 		while (slot->qlen) {
572 			skb = slot_dequeue_head(slot);
573 			sfq_dec(q, i);
574 			__skb_queue_tail(&list, skb);
575 		}
576 		slot->backlog = 0;
577 		red_set_vars(&slot->vars);
578 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
579 	}
580 	q->tail = NULL;
581 
582 	while ((skb = __skb_dequeue(&list)) != NULL) {
583 		unsigned int hash = sfq_hash(q, skb);
584 		sfq_index x = q->ht[hash];
585 
586 		slot = &q->slots[x];
587 		if (x == SFQ_EMPTY_SLOT) {
588 			x = q->dep[0].next; /* get a free slot */
589 			if (x >= SFQ_MAX_FLOWS) {
590 drop:				sch->qstats.backlog -= qdisc_pkt_len(skb);
591 				kfree_skb(skb);
592 				dropped++;
593 				continue;
594 			}
595 			q->ht[hash] = x;
596 			slot = &q->slots[x];
597 			slot->hash = hash;
598 		}
599 		if (slot->qlen >= q->maxdepth)
600 			goto drop;
601 		slot_queue_add(slot, skb);
602 		if (q->red_parms)
603 			slot->vars.qavg = red_calc_qavg(q->red_parms,
604 							&slot->vars,
605 							slot->backlog);
606 		slot->backlog += qdisc_pkt_len(skb);
607 		sfq_inc(q, x);
608 		if (slot->qlen == 1) {		/* The flow is new */
609 			if (q->tail == NULL) {	/* It is the first flow */
610 				slot->next = x;
611 			} else {
612 				slot->next = q->tail->next;
613 				q->tail->next = x;
614 			}
615 			q->tail = slot;
616 			slot->allot = q->scaled_quantum;
617 		}
618 	}
619 	sch->q.qlen -= dropped;
620 	qdisc_tree_decrease_qlen(sch, dropped);
621 }
622 
623 static void sfq_perturbation(unsigned long arg)
624 {
625 	struct Qdisc *sch = (struct Qdisc *)arg;
626 	struct sfq_sched_data *q = qdisc_priv(sch);
627 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
628 
629 	spin_lock(root_lock);
630 	q->perturbation = net_random();
631 	if (!q->filter_list && q->tail)
632 		sfq_rehash(sch);
633 	spin_unlock(root_lock);
634 
635 	if (q->perturb_period)
636 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
637 }
638 
639 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
640 {
641 	struct sfq_sched_data *q = qdisc_priv(sch);
642 	struct tc_sfq_qopt *ctl = nla_data(opt);
643 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
644 	unsigned int qlen;
645 	struct red_parms *p = NULL;
646 
647 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
648 		return -EINVAL;
649 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
650 		ctl_v1 = nla_data(opt);
651 	if (ctl->divisor &&
652 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
653 		return -EINVAL;
654 	if (ctl_v1 && ctl_v1->qth_min) {
655 		p = kmalloc(sizeof(*p), GFP_KERNEL);
656 		if (!p)
657 			return -ENOMEM;
658 	}
659 	sch_tree_lock(sch);
660 	if (ctl->quantum) {
661 		q->quantum = ctl->quantum;
662 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
663 	}
664 	q->perturb_period = ctl->perturb_period * HZ;
665 	if (ctl->flows)
666 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
667 	if (ctl->divisor) {
668 		q->divisor = ctl->divisor;
669 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
670 	}
671 	if (ctl_v1) {
672 		if (ctl_v1->depth)
673 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
674 		if (p) {
675 			swap(q->red_parms, p);
676 			red_set_parms(q->red_parms,
677 				      ctl_v1->qth_min, ctl_v1->qth_max,
678 				      ctl_v1->Wlog,
679 				      ctl_v1->Plog, ctl_v1->Scell_log,
680 				      NULL,
681 				      ctl_v1->max_P);
682 		}
683 		q->flags = ctl_v1->flags;
684 		q->headdrop = ctl_v1->headdrop;
685 	}
686 	if (ctl->limit) {
687 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
688 		q->maxflows = min_t(u32, q->maxflows, q->limit);
689 	}
690 
691 	qlen = sch->q.qlen;
692 	while (sch->q.qlen > q->limit)
693 		sfq_drop(sch);
694 	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
695 
696 	del_timer(&q->perturb_timer);
697 	if (q->perturb_period) {
698 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
699 		q->perturbation = net_random();
700 	}
701 	sch_tree_unlock(sch);
702 	kfree(p);
703 	return 0;
704 }
705 
706 static void *sfq_alloc(size_t sz)
707 {
708 	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
709 
710 	if (!ptr)
711 		ptr = vmalloc(sz);
712 	return ptr;
713 }
714 
715 static void sfq_free(void *addr)
716 {
717 	if (addr) {
718 		if (is_vmalloc_addr(addr))
719 			vfree(addr);
720 		else
721 			kfree(addr);
722 	}
723 }
724 
725 static void sfq_destroy(struct Qdisc *sch)
726 {
727 	struct sfq_sched_data *q = qdisc_priv(sch);
728 
729 	tcf_destroy_chain(&q->filter_list);
730 	q->perturb_period = 0;
731 	del_timer_sync(&q->perturb_timer);
732 	sfq_free(q->ht);
733 	sfq_free(q->slots);
734 	kfree(q->red_parms);
735 }
736 
737 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
738 {
739 	struct sfq_sched_data *q = qdisc_priv(sch);
740 	int i;
741 
742 	q->perturb_timer.function = sfq_perturbation;
743 	q->perturb_timer.data = (unsigned long)sch;
744 	init_timer_deferrable(&q->perturb_timer);
745 
746 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
747 		q->dep[i].next = i + SFQ_MAX_FLOWS;
748 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
749 	}
750 
751 	q->limit = SFQ_MAX_DEPTH;
752 	q->maxdepth = SFQ_MAX_DEPTH;
753 	q->cur_depth = 0;
754 	q->tail = NULL;
755 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
756 	q->maxflows = SFQ_DEFAULT_FLOWS;
757 	q->quantum = psched_mtu(qdisc_dev(sch));
758 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
759 	q->perturb_period = 0;
760 	q->perturbation = net_random();
761 
762 	if (opt) {
763 		int err = sfq_change(sch, opt);
764 		if (err)
765 			return err;
766 	}
767 
768 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
769 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
770 	if (!q->ht || !q->slots) {
771 		sfq_destroy(sch);
772 		return -ENOMEM;
773 	}
774 	for (i = 0; i < q->divisor; i++)
775 		q->ht[i] = SFQ_EMPTY_SLOT;
776 
777 	for (i = 0; i < q->maxflows; i++) {
778 		slot_queue_init(&q->slots[i]);
779 		sfq_link(q, i);
780 	}
781 	if (q->limit >= 1)
782 		sch->flags |= TCQ_F_CAN_BYPASS;
783 	else
784 		sch->flags &= ~TCQ_F_CAN_BYPASS;
785 	return 0;
786 }
787 
788 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
789 {
790 	struct sfq_sched_data *q = qdisc_priv(sch);
791 	unsigned char *b = skb_tail_pointer(skb);
792 	struct tc_sfq_qopt_v1 opt;
793 	struct red_parms *p = q->red_parms;
794 
795 	memset(&opt, 0, sizeof(opt));
796 	opt.v0.quantum	= q->quantum;
797 	opt.v0.perturb_period = q->perturb_period / HZ;
798 	opt.v0.limit	= q->limit;
799 	opt.v0.divisor	= q->divisor;
800 	opt.v0.flows	= q->maxflows;
801 	opt.depth	= q->maxdepth;
802 	opt.headdrop	= q->headdrop;
803 
804 	if (p) {
805 		opt.qth_min	= p->qth_min >> p->Wlog;
806 		opt.qth_max	= p->qth_max >> p->Wlog;
807 		opt.Wlog	= p->Wlog;
808 		opt.Plog	= p->Plog;
809 		opt.Scell_log	= p->Scell_log;
810 		opt.max_P	= p->max_P;
811 	}
812 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
813 	opt.flags	= q->flags;
814 
815 	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
816 
817 	return skb->len;
818 
819 nla_put_failure:
820 	nlmsg_trim(skb, b);
821 	return -1;
822 }
823 
824 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
825 {
826 	return NULL;
827 }
828 
829 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
830 {
831 	return 0;
832 }
833 
834 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
835 			      u32 classid)
836 {
837 	/* we cannot bypass queue discipline anymore */
838 	sch->flags &= ~TCQ_F_CAN_BYPASS;
839 	return 0;
840 }
841 
842 static void sfq_put(struct Qdisc *q, unsigned long cl)
843 {
844 }
845 
846 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
847 {
848 	struct sfq_sched_data *q = qdisc_priv(sch);
849 
850 	if (cl)
851 		return NULL;
852 	return &q->filter_list;
853 }
854 
855 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
856 			  struct sk_buff *skb, struct tcmsg *tcm)
857 {
858 	tcm->tcm_handle |= TC_H_MIN(cl);
859 	return 0;
860 }
861 
862 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
863 				struct gnet_dump *d)
864 {
865 	struct sfq_sched_data *q = qdisc_priv(sch);
866 	sfq_index idx = q->ht[cl - 1];
867 	struct gnet_stats_queue qs = { 0 };
868 	struct tc_sfq_xstats xstats = { 0 };
869 
870 	if (idx != SFQ_EMPTY_SLOT) {
871 		const struct sfq_slot *slot = &q->slots[idx];
872 
873 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
874 		qs.qlen = slot->qlen;
875 		qs.backlog = slot->backlog;
876 	}
877 	if (gnet_stats_copy_queue(d, &qs) < 0)
878 		return -1;
879 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
880 }
881 
882 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
883 {
884 	struct sfq_sched_data *q = qdisc_priv(sch);
885 	unsigned int i;
886 
887 	if (arg->stop)
888 		return;
889 
890 	for (i = 0; i < q->divisor; i++) {
891 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
892 		    arg->count < arg->skip) {
893 			arg->count++;
894 			continue;
895 		}
896 		if (arg->fn(sch, i + 1, arg) < 0) {
897 			arg->stop = 1;
898 			break;
899 		}
900 		arg->count++;
901 	}
902 }
903 
904 static const struct Qdisc_class_ops sfq_class_ops = {
905 	.leaf		=	sfq_leaf,
906 	.get		=	sfq_get,
907 	.put		=	sfq_put,
908 	.tcf_chain	=	sfq_find_tcf,
909 	.bind_tcf	=	sfq_bind,
910 	.unbind_tcf	=	sfq_put,
911 	.dump		=	sfq_dump_class,
912 	.dump_stats	=	sfq_dump_class_stats,
913 	.walk		=	sfq_walk,
914 };
915 
916 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
917 	.cl_ops		=	&sfq_class_ops,
918 	.id		=	"sfq",
919 	.priv_size	=	sizeof(struct sfq_sched_data),
920 	.enqueue	=	sfq_enqueue,
921 	.dequeue	=	sfq_dequeue,
922 	.peek		=	qdisc_peek_dequeued,
923 	.drop		=	sfq_drop,
924 	.init		=	sfq_init,
925 	.reset		=	sfq_reset,
926 	.destroy	=	sfq_destroy,
927 	.change		=	NULL,
928 	.dump		=	sfq_dump,
929 	.owner		=	THIS_MODULE,
930 };
931 
932 static int __init sfq_module_init(void)
933 {
934 	return register_qdisc(&sfq_qdisc_ops);
935 }
936 static void __exit sfq_module_exit(void)
937 {
938 	unregister_qdisc(&sfq_qdisc_ops);
939 }
940 module_init(sfq_module_init)
941 module_exit(sfq_module_exit)
942 MODULE_LICENSE("GPL");
943