xref: /openbmc/linux/net/sched/sch_sfq.c (revision b34e08d5)
1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/flow_keys.h>
27 #include <net/red.h>
28 
29 
30 /*	Stochastic Fairness Queuing algorithm.
31 	=======================================
32 
33 	Source:
34 	Paul E. McKenney "Stochastic Fairness Queuing",
35 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
36 
37 	Paul E. McKenney "Stochastic Fairness Queuing",
38 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
39 
40 
41 	See also:
42 	M. Shreedhar and George Varghese "Efficient Fair
43 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
44 
45 
46 	This is not the thing that is usually called (W)FQ nowadays.
47 	It does not use any timestamp mechanism, but instead
48 	processes queues in round-robin order.
49 
50 	ADVANTAGE:
51 
52 	- It is very cheap. Both CPU and memory requirements are minimal.
53 
54 	DRAWBACKS:
55 
56 	- "Stochastic" -> It is not 100% fair.
57 	When hash collisions occur, several flows are considered as one.
58 
59 	- "Round-robin" -> It introduces larger delays than virtual clock
60 	based schemes, and should not be used for isolating interactive
61 	traffic	from non-interactive. It means, that this scheduler
62 	should be used as leaf of CBQ or P3, which put interactive traffic
63 	to higher priority band.
64 
65 	We still need true WFQ for top level CSZ, but using WFQ
66 	for the best effort traffic is absolutely pointless:
67 	SFQ is superior for this purpose.
68 
69 	IMPLEMENTATION:
70 	This implementation limits :
71 	- maximal queue length per flow to 127 packets.
72 	- max mtu to 2^18-1;
73 	- max 65408 flows,
74 	- number of hash buckets to 65536.
75 
76 	It is easy to increase these values, but not in flight.  */
77 
78 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
79 #define SFQ_DEFAULT_FLOWS	128
80 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
81 #define SFQ_EMPTY_SLOT		0xffff
82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 
84 /* We use 16 bits to store allot, and want to handle packets up to 64K
85  * Scale allot by 8 (1<<3) so that no overflow occurs.
86  */
87 #define SFQ_ALLOT_SHIFT		3
88 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
89 
90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
91 typedef u16 sfq_index;
92 
93 /*
94  * We dont use pointers to save space.
95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
97  * are 'pointers' to dep[] array
98  */
99 struct sfq_head {
100 	sfq_index	next;
101 	sfq_index	prev;
102 };
103 
104 struct sfq_slot {
105 	struct sk_buff	*skblist_next;
106 	struct sk_buff	*skblist_prev;
107 	sfq_index	qlen; /* number of skbs in skblist */
108 	sfq_index	next; /* next slot in sfq RR chain */
109 	struct sfq_head dep; /* anchor in dep[] chains */
110 	unsigned short	hash; /* hash value (index in ht[]) */
111 	short		allot; /* credit for this slot */
112 
113 	unsigned int    backlog;
114 	struct red_vars vars;
115 };
116 
117 struct sfq_sched_data {
118 /* frequently used fields */
119 	int		limit;		/* limit of total number of packets in this qdisc */
120 	unsigned int	divisor;	/* number of slots in hash table */
121 	u8		headdrop;
122 	u8		maxdepth;	/* limit of packets per flow */
123 
124 	u32		perturbation;
125 	u8		cur_depth;	/* depth of longest slot */
126 	u8		flags;
127 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128 	struct tcf_proto *filter_list;
129 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
130 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
131 
132 	struct red_parms *red_parms;
133 	struct tc_sfqred_stats stats;
134 	struct sfq_slot *tail;		/* current slot in round */
135 
136 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
137 					/* Linked lists of slots, indexed by depth
138 					 * dep[0] : list of unused flows
139 					 * dep[1] : list of flows with 1 packet
140 					 * dep[X] : list of flows with X packets
141 					 */
142 
143 	unsigned int	maxflows;	/* number of flows in flows array */
144 	int		perturb_period;
145 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
146 	struct timer_list perturb_timer;
147 };
148 
149 /*
150  * sfq_head are either in a sfq_slot or in dep[] array
151  */
152 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
153 {
154 	if (val < SFQ_MAX_FLOWS)
155 		return &q->slots[val].dep;
156 	return &q->dep[val - SFQ_MAX_FLOWS];
157 }
158 
159 /*
160  * In order to be able to quickly rehash our queue when timer changes
161  * q->perturbation, we store flow_keys in skb->cb[]
162  */
163 struct sfq_skb_cb {
164        struct flow_keys        keys;
165 };
166 
167 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
168 {
169 	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
170 	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
171 }
172 
173 static unsigned int sfq_hash(const struct sfq_sched_data *q,
174 			     const struct sk_buff *skb)
175 {
176 	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
177 	unsigned int hash;
178 
179 	hash = jhash_3words((__force u32)keys->dst,
180 			    (__force u32)keys->src ^ keys->ip_proto,
181 			    (__force u32)keys->ports, q->perturbation);
182 	return hash & (q->divisor - 1);
183 }
184 
185 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
186 				 int *qerr)
187 {
188 	struct sfq_sched_data *q = qdisc_priv(sch);
189 	struct tcf_result res;
190 	int result;
191 
192 	if (TC_H_MAJ(skb->priority) == sch->handle &&
193 	    TC_H_MIN(skb->priority) > 0 &&
194 	    TC_H_MIN(skb->priority) <= q->divisor)
195 		return TC_H_MIN(skb->priority);
196 
197 	if (!q->filter_list) {
198 		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
199 		return sfq_hash(q, skb) + 1;
200 	}
201 
202 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
203 	result = tc_classify(skb, q->filter_list, &res);
204 	if (result >= 0) {
205 #ifdef CONFIG_NET_CLS_ACT
206 		switch (result) {
207 		case TC_ACT_STOLEN:
208 		case TC_ACT_QUEUED:
209 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
210 		case TC_ACT_SHOT:
211 			return 0;
212 		}
213 #endif
214 		if (TC_H_MIN(res.classid) <= q->divisor)
215 			return TC_H_MIN(res.classid);
216 	}
217 	return 0;
218 }
219 
220 /*
221  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
222  */
223 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
224 {
225 	sfq_index p, n;
226 	struct sfq_slot *slot = &q->slots[x];
227 	int qlen = slot->qlen;
228 
229 	p = qlen + SFQ_MAX_FLOWS;
230 	n = q->dep[qlen].next;
231 
232 	slot->dep.next = n;
233 	slot->dep.prev = p;
234 
235 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
236 	sfq_dep_head(q, n)->prev = x;
237 }
238 
239 #define sfq_unlink(q, x, n, p)			\
240 	do {					\
241 		n = q->slots[x].dep.next;	\
242 		p = q->slots[x].dep.prev;	\
243 		sfq_dep_head(q, p)->next = n;	\
244 		sfq_dep_head(q, n)->prev = p;	\
245 	} while (0)
246 
247 
248 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
249 {
250 	sfq_index p, n;
251 	int d;
252 
253 	sfq_unlink(q, x, n, p);
254 
255 	d = q->slots[x].qlen--;
256 	if (n == p && q->cur_depth == d)
257 		q->cur_depth--;
258 	sfq_link(q, x);
259 }
260 
261 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
262 {
263 	sfq_index p, n;
264 	int d;
265 
266 	sfq_unlink(q, x, n, p);
267 
268 	d = ++q->slots[x].qlen;
269 	if (q->cur_depth < d)
270 		q->cur_depth = d;
271 	sfq_link(q, x);
272 }
273 
274 /* helper functions : might be changed when/if skb use a standard list_head */
275 
276 /* remove one skb from tail of slot queue */
277 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
278 {
279 	struct sk_buff *skb = slot->skblist_prev;
280 
281 	slot->skblist_prev = skb->prev;
282 	skb->prev->next = (struct sk_buff *)slot;
283 	skb->next = skb->prev = NULL;
284 	return skb;
285 }
286 
287 /* remove one skb from head of slot queue */
288 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
289 {
290 	struct sk_buff *skb = slot->skblist_next;
291 
292 	slot->skblist_next = skb->next;
293 	skb->next->prev = (struct sk_buff *)slot;
294 	skb->next = skb->prev = NULL;
295 	return skb;
296 }
297 
298 static inline void slot_queue_init(struct sfq_slot *slot)
299 {
300 	memset(slot, 0, sizeof(*slot));
301 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
302 }
303 
304 /* add skb to slot queue (tail add) */
305 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
306 {
307 	skb->prev = slot->skblist_prev;
308 	skb->next = (struct sk_buff *)slot;
309 	slot->skblist_prev->next = skb;
310 	slot->skblist_prev = skb;
311 }
312 
313 #define	slot_queue_walk(slot, skb)		\
314 	for (skb = slot->skblist_next;		\
315 	     skb != (struct sk_buff *)slot;	\
316 	     skb = skb->next)
317 
318 static unsigned int sfq_drop(struct Qdisc *sch)
319 {
320 	struct sfq_sched_data *q = qdisc_priv(sch);
321 	sfq_index x, d = q->cur_depth;
322 	struct sk_buff *skb;
323 	unsigned int len;
324 	struct sfq_slot *slot;
325 
326 	/* Queue is full! Find the longest slot and drop tail packet from it */
327 	if (d > 1) {
328 		x = q->dep[d].next;
329 		slot = &q->slots[x];
330 drop:
331 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
332 		len = qdisc_pkt_len(skb);
333 		slot->backlog -= len;
334 		sfq_dec(q, x);
335 		kfree_skb(skb);
336 		sch->q.qlen--;
337 		sch->qstats.drops++;
338 		sch->qstats.backlog -= len;
339 		return len;
340 	}
341 
342 	if (d == 1) {
343 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
344 		x = q->tail->next;
345 		slot = &q->slots[x];
346 		q->tail->next = slot->next;
347 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
348 		goto drop;
349 	}
350 
351 	return 0;
352 }
353 
354 /* Is ECN parameter configured */
355 static int sfq_prob_mark(const struct sfq_sched_data *q)
356 {
357 	return q->flags & TC_RED_ECN;
358 }
359 
360 /* Should packets over max threshold just be marked */
361 static int sfq_hard_mark(const struct sfq_sched_data *q)
362 {
363 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
364 }
365 
366 static int sfq_headdrop(const struct sfq_sched_data *q)
367 {
368 	return q->headdrop;
369 }
370 
371 static int
372 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
373 {
374 	struct sfq_sched_data *q = qdisc_priv(sch);
375 	unsigned int hash;
376 	sfq_index x, qlen;
377 	struct sfq_slot *slot;
378 	int uninitialized_var(ret);
379 	struct sk_buff *head;
380 	int delta;
381 
382 	hash = sfq_classify(skb, sch, &ret);
383 	if (hash == 0) {
384 		if (ret & __NET_XMIT_BYPASS)
385 			sch->qstats.drops++;
386 		kfree_skb(skb);
387 		return ret;
388 	}
389 	hash--;
390 
391 	x = q->ht[hash];
392 	slot = &q->slots[x];
393 	if (x == SFQ_EMPTY_SLOT) {
394 		x = q->dep[0].next; /* get a free slot */
395 		if (x >= SFQ_MAX_FLOWS)
396 			return qdisc_drop(skb, sch);
397 		q->ht[hash] = x;
398 		slot = &q->slots[x];
399 		slot->hash = hash;
400 		slot->backlog = 0; /* should already be 0 anyway... */
401 		red_set_vars(&slot->vars);
402 		goto enqueue;
403 	}
404 	if (q->red_parms) {
405 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
406 							&slot->vars,
407 							slot->backlog);
408 		switch (red_action(q->red_parms,
409 				   &slot->vars,
410 				   slot->vars.qavg)) {
411 		case RED_DONT_MARK:
412 			break;
413 
414 		case RED_PROB_MARK:
415 			sch->qstats.overlimits++;
416 			if (sfq_prob_mark(q)) {
417 				/* We know we have at least one packet in queue */
418 				if (sfq_headdrop(q) &&
419 				    INET_ECN_set_ce(slot->skblist_next)) {
420 					q->stats.prob_mark_head++;
421 					break;
422 				}
423 				if (INET_ECN_set_ce(skb)) {
424 					q->stats.prob_mark++;
425 					break;
426 				}
427 			}
428 			q->stats.prob_drop++;
429 			goto congestion_drop;
430 
431 		case RED_HARD_MARK:
432 			sch->qstats.overlimits++;
433 			if (sfq_hard_mark(q)) {
434 				/* We know we have at least one packet in queue */
435 				if (sfq_headdrop(q) &&
436 				    INET_ECN_set_ce(slot->skblist_next)) {
437 					q->stats.forced_mark_head++;
438 					break;
439 				}
440 				if (INET_ECN_set_ce(skb)) {
441 					q->stats.forced_mark++;
442 					break;
443 				}
444 			}
445 			q->stats.forced_drop++;
446 			goto congestion_drop;
447 		}
448 	}
449 
450 	if (slot->qlen >= q->maxdepth) {
451 congestion_drop:
452 		if (!sfq_headdrop(q))
453 			return qdisc_drop(skb, sch);
454 
455 		/* We know we have at least one packet in queue */
456 		head = slot_dequeue_head(slot);
457 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
458 		sch->qstats.backlog -= delta;
459 		slot->backlog -= delta;
460 		qdisc_drop(head, sch);
461 
462 		slot_queue_add(slot, skb);
463 		return NET_XMIT_CN;
464 	}
465 
466 enqueue:
467 	sch->qstats.backlog += qdisc_pkt_len(skb);
468 	slot->backlog += qdisc_pkt_len(skb);
469 	slot_queue_add(slot, skb);
470 	sfq_inc(q, x);
471 	if (slot->qlen == 1) {		/* The flow is new */
472 		if (q->tail == NULL) {	/* It is the first flow */
473 			slot->next = x;
474 		} else {
475 			slot->next = q->tail->next;
476 			q->tail->next = x;
477 		}
478 		/* We put this flow at the end of our flow list.
479 		 * This might sound unfair for a new flow to wait after old ones,
480 		 * but we could endup servicing new flows only, and freeze old ones.
481 		 */
482 		q->tail = slot;
483 		/* We could use a bigger initial quantum for new flows */
484 		slot->allot = q->scaled_quantum;
485 	}
486 	if (++sch->q.qlen <= q->limit)
487 		return NET_XMIT_SUCCESS;
488 
489 	qlen = slot->qlen;
490 	sfq_drop(sch);
491 	/* Return Congestion Notification only if we dropped a packet
492 	 * from this flow.
493 	 */
494 	if (qlen != slot->qlen)
495 		return NET_XMIT_CN;
496 
497 	/* As we dropped a packet, better let upper stack know this */
498 	qdisc_tree_decrease_qlen(sch, 1);
499 	return NET_XMIT_SUCCESS;
500 }
501 
502 static struct sk_buff *
503 sfq_dequeue(struct Qdisc *sch)
504 {
505 	struct sfq_sched_data *q = qdisc_priv(sch);
506 	struct sk_buff *skb;
507 	sfq_index a, next_a;
508 	struct sfq_slot *slot;
509 
510 	/* No active slots */
511 	if (q->tail == NULL)
512 		return NULL;
513 
514 next_slot:
515 	a = q->tail->next;
516 	slot = &q->slots[a];
517 	if (slot->allot <= 0) {
518 		q->tail = slot;
519 		slot->allot += q->scaled_quantum;
520 		goto next_slot;
521 	}
522 	skb = slot_dequeue_head(slot);
523 	sfq_dec(q, a);
524 	qdisc_bstats_update(sch, skb);
525 	sch->q.qlen--;
526 	sch->qstats.backlog -= qdisc_pkt_len(skb);
527 	slot->backlog -= qdisc_pkt_len(skb);
528 	/* Is the slot empty? */
529 	if (slot->qlen == 0) {
530 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
531 		next_a = slot->next;
532 		if (a == next_a) {
533 			q->tail = NULL; /* no more active slots */
534 			return skb;
535 		}
536 		q->tail->next = next_a;
537 	} else {
538 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
539 	}
540 	return skb;
541 }
542 
543 static void
544 sfq_reset(struct Qdisc *sch)
545 {
546 	struct sk_buff *skb;
547 
548 	while ((skb = sfq_dequeue(sch)) != NULL)
549 		kfree_skb(skb);
550 }
551 
552 /*
553  * When q->perturbation is changed, we rehash all queued skbs
554  * to avoid OOO (Out Of Order) effects.
555  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
556  * counters.
557  */
558 static void sfq_rehash(struct Qdisc *sch)
559 {
560 	struct sfq_sched_data *q = qdisc_priv(sch);
561 	struct sk_buff *skb;
562 	int i;
563 	struct sfq_slot *slot;
564 	struct sk_buff_head list;
565 	int dropped = 0;
566 
567 	__skb_queue_head_init(&list);
568 
569 	for (i = 0; i < q->maxflows; i++) {
570 		slot = &q->slots[i];
571 		if (!slot->qlen)
572 			continue;
573 		while (slot->qlen) {
574 			skb = slot_dequeue_head(slot);
575 			sfq_dec(q, i);
576 			__skb_queue_tail(&list, skb);
577 		}
578 		slot->backlog = 0;
579 		red_set_vars(&slot->vars);
580 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
581 	}
582 	q->tail = NULL;
583 
584 	while ((skb = __skb_dequeue(&list)) != NULL) {
585 		unsigned int hash = sfq_hash(q, skb);
586 		sfq_index x = q->ht[hash];
587 
588 		slot = &q->slots[x];
589 		if (x == SFQ_EMPTY_SLOT) {
590 			x = q->dep[0].next; /* get a free slot */
591 			if (x >= SFQ_MAX_FLOWS) {
592 drop:				sch->qstats.backlog -= qdisc_pkt_len(skb);
593 				kfree_skb(skb);
594 				dropped++;
595 				continue;
596 			}
597 			q->ht[hash] = x;
598 			slot = &q->slots[x];
599 			slot->hash = hash;
600 		}
601 		if (slot->qlen >= q->maxdepth)
602 			goto drop;
603 		slot_queue_add(slot, skb);
604 		if (q->red_parms)
605 			slot->vars.qavg = red_calc_qavg(q->red_parms,
606 							&slot->vars,
607 							slot->backlog);
608 		slot->backlog += qdisc_pkt_len(skb);
609 		sfq_inc(q, x);
610 		if (slot->qlen == 1) {		/* The flow is new */
611 			if (q->tail == NULL) {	/* It is the first flow */
612 				slot->next = x;
613 			} else {
614 				slot->next = q->tail->next;
615 				q->tail->next = x;
616 			}
617 			q->tail = slot;
618 			slot->allot = q->scaled_quantum;
619 		}
620 	}
621 	sch->q.qlen -= dropped;
622 	qdisc_tree_decrease_qlen(sch, dropped);
623 }
624 
625 static void sfq_perturbation(unsigned long arg)
626 {
627 	struct Qdisc *sch = (struct Qdisc *)arg;
628 	struct sfq_sched_data *q = qdisc_priv(sch);
629 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
630 
631 	spin_lock(root_lock);
632 	q->perturbation = prandom_u32();
633 	if (!q->filter_list && q->tail)
634 		sfq_rehash(sch);
635 	spin_unlock(root_lock);
636 
637 	if (q->perturb_period)
638 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
639 }
640 
641 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
642 {
643 	struct sfq_sched_data *q = qdisc_priv(sch);
644 	struct tc_sfq_qopt *ctl = nla_data(opt);
645 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
646 	unsigned int qlen;
647 	struct red_parms *p = NULL;
648 
649 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
650 		return -EINVAL;
651 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
652 		ctl_v1 = nla_data(opt);
653 	if (ctl->divisor &&
654 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
655 		return -EINVAL;
656 	if (ctl_v1 && ctl_v1->qth_min) {
657 		p = kmalloc(sizeof(*p), GFP_KERNEL);
658 		if (!p)
659 			return -ENOMEM;
660 	}
661 	sch_tree_lock(sch);
662 	if (ctl->quantum) {
663 		q->quantum = ctl->quantum;
664 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
665 	}
666 	q->perturb_period = ctl->perturb_period * HZ;
667 	if (ctl->flows)
668 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
669 	if (ctl->divisor) {
670 		q->divisor = ctl->divisor;
671 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
672 	}
673 	if (ctl_v1) {
674 		if (ctl_v1->depth)
675 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
676 		if (p) {
677 			swap(q->red_parms, p);
678 			red_set_parms(q->red_parms,
679 				      ctl_v1->qth_min, ctl_v1->qth_max,
680 				      ctl_v1->Wlog,
681 				      ctl_v1->Plog, ctl_v1->Scell_log,
682 				      NULL,
683 				      ctl_v1->max_P);
684 		}
685 		q->flags = ctl_v1->flags;
686 		q->headdrop = ctl_v1->headdrop;
687 	}
688 	if (ctl->limit) {
689 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
690 		q->maxflows = min_t(u32, q->maxflows, q->limit);
691 	}
692 
693 	qlen = sch->q.qlen;
694 	while (sch->q.qlen > q->limit)
695 		sfq_drop(sch);
696 	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
697 
698 	del_timer(&q->perturb_timer);
699 	if (q->perturb_period) {
700 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
701 		q->perturbation = prandom_u32();
702 	}
703 	sch_tree_unlock(sch);
704 	kfree(p);
705 	return 0;
706 }
707 
708 static void *sfq_alloc(size_t sz)
709 {
710 	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
711 
712 	if (!ptr)
713 		ptr = vmalloc(sz);
714 	return ptr;
715 }
716 
717 static void sfq_free(void *addr)
718 {
719 	if (addr) {
720 		if (is_vmalloc_addr(addr))
721 			vfree(addr);
722 		else
723 			kfree(addr);
724 	}
725 }
726 
727 static void sfq_destroy(struct Qdisc *sch)
728 {
729 	struct sfq_sched_data *q = qdisc_priv(sch);
730 
731 	tcf_destroy_chain(&q->filter_list);
732 	q->perturb_period = 0;
733 	del_timer_sync(&q->perturb_timer);
734 	sfq_free(q->ht);
735 	sfq_free(q->slots);
736 	kfree(q->red_parms);
737 }
738 
739 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
740 {
741 	struct sfq_sched_data *q = qdisc_priv(sch);
742 	int i;
743 
744 	q->perturb_timer.function = sfq_perturbation;
745 	q->perturb_timer.data = (unsigned long)sch;
746 	init_timer_deferrable(&q->perturb_timer);
747 
748 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
749 		q->dep[i].next = i + SFQ_MAX_FLOWS;
750 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
751 	}
752 
753 	q->limit = SFQ_MAX_DEPTH;
754 	q->maxdepth = SFQ_MAX_DEPTH;
755 	q->cur_depth = 0;
756 	q->tail = NULL;
757 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
758 	q->maxflows = SFQ_DEFAULT_FLOWS;
759 	q->quantum = psched_mtu(qdisc_dev(sch));
760 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
761 	q->perturb_period = 0;
762 	q->perturbation = prandom_u32();
763 
764 	if (opt) {
765 		int err = sfq_change(sch, opt);
766 		if (err)
767 			return err;
768 	}
769 
770 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
771 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
772 	if (!q->ht || !q->slots) {
773 		sfq_destroy(sch);
774 		return -ENOMEM;
775 	}
776 	for (i = 0; i < q->divisor; i++)
777 		q->ht[i] = SFQ_EMPTY_SLOT;
778 
779 	for (i = 0; i < q->maxflows; i++) {
780 		slot_queue_init(&q->slots[i]);
781 		sfq_link(q, i);
782 	}
783 	if (q->limit >= 1)
784 		sch->flags |= TCQ_F_CAN_BYPASS;
785 	else
786 		sch->flags &= ~TCQ_F_CAN_BYPASS;
787 	return 0;
788 }
789 
790 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
791 {
792 	struct sfq_sched_data *q = qdisc_priv(sch);
793 	unsigned char *b = skb_tail_pointer(skb);
794 	struct tc_sfq_qopt_v1 opt;
795 	struct red_parms *p = q->red_parms;
796 
797 	memset(&opt, 0, sizeof(opt));
798 	opt.v0.quantum	= q->quantum;
799 	opt.v0.perturb_period = q->perturb_period / HZ;
800 	opt.v0.limit	= q->limit;
801 	opt.v0.divisor	= q->divisor;
802 	opt.v0.flows	= q->maxflows;
803 	opt.depth	= q->maxdepth;
804 	opt.headdrop	= q->headdrop;
805 
806 	if (p) {
807 		opt.qth_min	= p->qth_min >> p->Wlog;
808 		opt.qth_max	= p->qth_max >> p->Wlog;
809 		opt.Wlog	= p->Wlog;
810 		opt.Plog	= p->Plog;
811 		opt.Scell_log	= p->Scell_log;
812 		opt.max_P	= p->max_P;
813 	}
814 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
815 	opt.flags	= q->flags;
816 
817 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
818 		goto nla_put_failure;
819 
820 	return skb->len;
821 
822 nla_put_failure:
823 	nlmsg_trim(skb, b);
824 	return -1;
825 }
826 
827 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
828 {
829 	return NULL;
830 }
831 
832 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
833 {
834 	return 0;
835 }
836 
837 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
838 			      u32 classid)
839 {
840 	/* we cannot bypass queue discipline anymore */
841 	sch->flags &= ~TCQ_F_CAN_BYPASS;
842 	return 0;
843 }
844 
845 static void sfq_put(struct Qdisc *q, unsigned long cl)
846 {
847 }
848 
849 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
850 {
851 	struct sfq_sched_data *q = qdisc_priv(sch);
852 
853 	if (cl)
854 		return NULL;
855 	return &q->filter_list;
856 }
857 
858 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
859 			  struct sk_buff *skb, struct tcmsg *tcm)
860 {
861 	tcm->tcm_handle |= TC_H_MIN(cl);
862 	return 0;
863 }
864 
865 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
866 				struct gnet_dump *d)
867 {
868 	struct sfq_sched_data *q = qdisc_priv(sch);
869 	sfq_index idx = q->ht[cl - 1];
870 	struct gnet_stats_queue qs = { 0 };
871 	struct tc_sfq_xstats xstats = { 0 };
872 
873 	if (idx != SFQ_EMPTY_SLOT) {
874 		const struct sfq_slot *slot = &q->slots[idx];
875 
876 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
877 		qs.qlen = slot->qlen;
878 		qs.backlog = slot->backlog;
879 	}
880 	if (gnet_stats_copy_queue(d, &qs) < 0)
881 		return -1;
882 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
883 }
884 
885 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
886 {
887 	struct sfq_sched_data *q = qdisc_priv(sch);
888 	unsigned int i;
889 
890 	if (arg->stop)
891 		return;
892 
893 	for (i = 0; i < q->divisor; i++) {
894 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
895 		    arg->count < arg->skip) {
896 			arg->count++;
897 			continue;
898 		}
899 		if (arg->fn(sch, i + 1, arg) < 0) {
900 			arg->stop = 1;
901 			break;
902 		}
903 		arg->count++;
904 	}
905 }
906 
907 static const struct Qdisc_class_ops sfq_class_ops = {
908 	.leaf		=	sfq_leaf,
909 	.get		=	sfq_get,
910 	.put		=	sfq_put,
911 	.tcf_chain	=	sfq_find_tcf,
912 	.bind_tcf	=	sfq_bind,
913 	.unbind_tcf	=	sfq_put,
914 	.dump		=	sfq_dump_class,
915 	.dump_stats	=	sfq_dump_class_stats,
916 	.walk		=	sfq_walk,
917 };
918 
919 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
920 	.cl_ops		=	&sfq_class_ops,
921 	.id		=	"sfq",
922 	.priv_size	=	sizeof(struct sfq_sched_data),
923 	.enqueue	=	sfq_enqueue,
924 	.dequeue	=	sfq_dequeue,
925 	.peek		=	qdisc_peek_dequeued,
926 	.drop		=	sfq_drop,
927 	.init		=	sfq_init,
928 	.reset		=	sfq_reset,
929 	.destroy	=	sfq_destroy,
930 	.change		=	NULL,
931 	.dump		=	sfq_dump,
932 	.owner		=	THIS_MODULE,
933 };
934 
935 static int __init sfq_module_init(void)
936 {
937 	return register_qdisc(&sfq_qdisc_ops);
938 }
939 static void __exit sfq_module_exit(void)
940 {
941 	unregister_qdisc(&sfq_qdisc_ops);
942 }
943 module_init(sfq_module_init)
944 module_exit(sfq_module_exit)
945 MODULE_LICENSE("GPL");
946