xref: /openbmc/linux/net/sched/sch_sfq.c (revision 060f35a317ef09101b128f399dce7ed13d019461)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/jiffies.h>
12 #include <linux/string.h>
13 #include <linux/in.h>
14 #include <linux/errno.h>
15 #include <linux/init.h>
16 #include <linux/skbuff.h>
17 #include <linux/siphash.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
23 #include <net/red.h>
24 
25 
26 /*	Stochastic Fairness Queuing algorithm.
27 	=======================================
28 
29 	Source:
30 	Paul E. McKenney "Stochastic Fairness Queuing",
31 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
32 
33 	Paul E. McKenney "Stochastic Fairness Queuing",
34 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
35 
36 
37 	See also:
38 	M. Shreedhar and George Varghese "Efficient Fair
39 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
40 
41 
42 	This is not the thing that is usually called (W)FQ nowadays.
43 	It does not use any timestamp mechanism, but instead
44 	processes queues in round-robin order.
45 
46 	ADVANTAGE:
47 
48 	- It is very cheap. Both CPU and memory requirements are minimal.
49 
50 	DRAWBACKS:
51 
52 	- "Stochastic" -> It is not 100% fair.
53 	When hash collisions occur, several flows are considered as one.
54 
55 	- "Round-robin" -> It introduces larger delays than virtual clock
56 	based schemes, and should not be used for isolating interactive
57 	traffic from non-interactive. This means that this scheduler
58 	should be used as a leaf of CBQ or P3, which puts interactive traffic
59 	into a higher priority band.
60 
61 	We still need true WFQ for top level CSZ, but using WFQ
62 	for the best effort traffic is absolutely pointless:
63 	SFQ is superior for this purpose.
64 
65 	IMPLEMENTATION:
66 	This implementation limits:
67 	- maximal queue length per flow to 127 packets,
68 	- max mtu to 2^18-1,
69 	- max 65408 flows,
70 	- number of hash buckets to 65536.
71 
72 	It is easy to increase these values, but not in flight.  */
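/* Illustrative configuration only (device name and parameter values are
 * examples, not defined in this file); the standard iproute2 front end is:
 *
 *	tc qdisc add dev eth0 root sfq perturb 10 quantum 1514 limit 64
 *
 * which attaches SFQ as the root qdisc, rekeys the flow hash every
 * 10 seconds and caps the total backlog at 64 packets.
 */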
73 
74 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
75 #define SFQ_DEFAULT_FLOWS	128
76 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
77 #define SFQ_EMPTY_SLOT		0xffff
78 #define SFQ_DEFAULT_HASH_DIVISOR 1024
79 
80 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
81 typedef u16 sfq_index;
82 
83 /*
84  * We don't use pointers, in order to save space.
85  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
86  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
87  * are 'pointers' to dep[] array
88  */
89 struct sfq_head {
90 	sfq_index	next;
91 	sfq_index	prev;
92 };
93 
94 struct sfq_slot {
95 	struct sk_buff	*skblist_next;
96 	struct sk_buff	*skblist_prev;
97 	sfq_index	qlen; /* number of skbs in skblist */
98 	sfq_index	next; /* next slot in sfq RR chain */
99 	struct sfq_head dep; /* anchor in dep[] chains */
100 	unsigned short	hash; /* hash value (index in ht[]) */
101 	int		allot; /* credit for this slot */
102 
103 	unsigned int    backlog;
104 	struct red_vars vars;
105 };
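/* Note: skblist_next/skblist_prev are deliberately the first two fields so
 * that a slot can be cast to (struct sk_buff *) and used as the list
 * sentinel by the slot_queue_*() helpers below.  'allot' is signed because
 * the credit can go negative when a packet larger than the remaining
 * allotment is sent (deficit round robin).
 */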
106 
107 struct sfq_sched_data {
108 /* frequently used fields */
109 	int		limit;		/* limit of total number of packets in this qdisc */
110 	unsigned int	divisor;	/* number of slots in hash table */
111 	u8		headdrop;
112 	u8		maxdepth;	/* limit of packets per flow */
113 
114 	siphash_key_t 	perturbation;
115 	u8		cur_depth;	/* depth of longest slot */
116 	u8		flags;
117 	struct tcf_proto __rcu *filter_list;
118 	struct tcf_block *block;
119 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
120 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
121 
122 	struct red_parms *red_parms;
123 	struct tc_sfqred_stats stats;
124 	struct sfq_slot *tail;		/* current slot in round */
125 
126 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
127 					/* Linked lists of slots, indexed by depth
128 					 * dep[0] : list of unused flows
129 					 * dep[1] : list of flows with 1 packet
130 					 * dep[X] : list of flows with X packets
131 					 */
132 
133 	unsigned int	maxflows;	/* number of flows in flows array */
134 	int		perturb_period;
135 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
136 	struct timer_list perturb_timer;
137 	struct Qdisc	*sch;
138 };
139 
140 /*
141  * sfq_head are either in a sfq_slot or in dep[] array
142  */
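/* Example: with SFQ_MAX_FLOWS = 65408, val = 65410 does not name a slot but
 * the list head q->dep[2], i.e. the chain of flows currently holding
 * exactly two packets.
 */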
143 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
144 {
145 	if (val < SFQ_MAX_FLOWS)
146 		return &q->slots[val].dep;
147 	return &q->dep[val - SFQ_MAX_FLOWS];
148 }
149 
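/* sfq_change() only accepts power-of-two divisors, so masking with
 * (divisor - 1) below is a cheap modulo.  When a perturb period is
 * configured, sfq_perturbation() replaces the siphash key periodically so
 * that flows which collide in one period are unlikely to keep colliding.
 */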
150 static unsigned int sfq_hash(const struct sfq_sched_data *q,
151 			     const struct sk_buff *skb)
152 {
153 	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
154 }
155 
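/* Returns a 1-based hash bucket, so that 0 can mean "drop".  A packet whose
 * skb->priority carries this qdisc's major handle selects its bucket
 * directly; otherwise the bucket comes from the attached classifier if one
 * is configured, or from the perturbed flow hash if not.
 */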
156 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
157 				 int *qerr)
158 {
159 	struct sfq_sched_data *q = qdisc_priv(sch);
160 	struct tcf_result res;
161 	struct tcf_proto *fl;
162 	int result;
163 
164 	if (TC_H_MAJ(skb->priority) == sch->handle &&
165 	    TC_H_MIN(skb->priority) > 0 &&
166 	    TC_H_MIN(skb->priority) <= q->divisor)
167 		return TC_H_MIN(skb->priority);
168 
169 	fl = rcu_dereference_bh(q->filter_list);
170 	if (!fl)
171 		return sfq_hash(q, skb) + 1;
172 
173 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
174 	result = tcf_classify(skb, NULL, fl, &res, false);
175 	if (result >= 0) {
176 #ifdef CONFIG_NET_CLS_ACT
177 		switch (result) {
178 		case TC_ACT_STOLEN:
179 		case TC_ACT_QUEUED:
180 		case TC_ACT_TRAP:
181 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
182 			fallthrough;
183 		case TC_ACT_SHOT:
184 			return 0;
185 		}
186 #endif
187 		if (TC_H_MIN(res.classid) <= q->divisor)
188 			return TC_H_MIN(res.classid);
189 	}
190 	return 0;
191 }
192 
193 /*
194  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
195  */
196 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
197 {
198 	sfq_index p, n;
199 	struct sfq_slot *slot = &q->slots[x];
200 	int qlen = slot->qlen;
201 
202 	p = qlen + SFQ_MAX_FLOWS;
203 	n = q->dep[qlen].next;
204 
205 	slot->dep.next = n;
206 	slot->dep.prev = p;
207 
208 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
209 	sfq_dep_head(q, n)->prev = x;
210 }
211 
212 #define sfq_unlink(q, x, n, p)			\
213 	do {					\
214 		n = q->slots[x].dep.next;	\
215 		p = q->slots[x].dep.prev;	\
216 		sfq_dep_head(q, p)->next = n;	\
217 		sfq_dep_head(q, n)->prev = p;	\
218 	} while (0)
219 
220 
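/* sfq_inc()/sfq_dec() keep each slot on the dep[] list that matches its
 * current queue length and track the largest length in q->cur_depth, so
 * sfq_drop() can pick a longest flow in O(1) via q->dep[q->cur_depth].next.
 */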
221 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
222 {
223 	sfq_index p, n;
224 	int d;
225 
226 	sfq_unlink(q, x, n, p);
227 
228 	d = q->slots[x].qlen--;
229 	if (n == p && q->cur_depth == d)
230 		q->cur_depth--;
231 	sfq_link(q, x);
232 }
233 
234 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
235 {
236 	sfq_index p, n;
237 	int d;
238 
239 	sfq_unlink(q, x, n, p);
240 
241 	d = ++q->slots[x].qlen;
242 	if (q->cur_depth < d)
243 		q->cur_depth = d;
244 	sfq_link(q, x);
245 }
246 
247 /* helper functions: might be changed when/if skb uses a standard list_head */
248 
249 /* remove one skb from tail of slot queue */
250 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
251 {
252 	struct sk_buff *skb = slot->skblist_prev;
253 
254 	slot->skblist_prev = skb->prev;
255 	skb->prev->next = (struct sk_buff *)slot;
256 	skb->next = skb->prev = NULL;
257 	return skb;
258 }
259 
260 /* remove one skb from head of slot queue */
261 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
262 {
263 	struct sk_buff *skb = slot->skblist_next;
264 
265 	slot->skblist_next = skb->next;
266 	skb->next->prev = (struct sk_buff *)slot;
267 	skb->next = skb->prev = NULL;
268 	return skb;
269 }
270 
271 static inline void slot_queue_init(struct sfq_slot *slot)
272 {
273 	memset(slot, 0, sizeof(*slot));
274 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
275 }
276 
277 /* add skb to slot queue (tail add) */
278 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
279 {
280 	skb->prev = slot->skblist_prev;
281 	skb->next = (struct sk_buff *)slot;
282 	slot->skblist_prev->next = skb;
283 	slot->skblist_prev = skb;
284 }
285 
286 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
287 {
288 	struct sfq_sched_data *q = qdisc_priv(sch);
289 	sfq_index x, d = q->cur_depth;
290 	struct sk_buff *skb;
291 	unsigned int len;
292 	struct sfq_slot *slot;
293 
294 	/* Queue is full! Find the longest slot and drop tail packet from it */
295 	if (d > 1) {
296 		x = q->dep[d].next;
297 		slot = &q->slots[x];
298 drop:
299 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
300 		len = qdisc_pkt_len(skb);
301 		slot->backlog -= len;
302 		sfq_dec(q, x);
303 		sch->q.qlen--;
304 		qdisc_qstats_backlog_dec(sch, skb);
305 		qdisc_drop(skb, sch, to_free);
306 		return len;
307 	}
308 
309 	if (d == 1) {
310 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
311 		x = q->tail->next;
312 		slot = &q->slots[x];
313 		q->tail->next = slot->next;
314 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
315 		goto drop;
316 	}
317 
318 	return 0;
319 }
320 
321 /* Is ECN parameter configured */
322 static int sfq_prob_mark(const struct sfq_sched_data *q)
323 {
324 	return q->flags & TC_RED_ECN;
325 }
326 
327 /* Should packets over max threshold just be marked */
328 static int sfq_hard_mark(const struct sfq_sched_data *q)
329 {
330 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
331 }
332 
333 static int sfq_headdrop(const struct sfq_sched_data *q)
334 {
335 	return q->headdrop;
336 }
337 
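/* Enqueue path: classify the packet to a bucket, find or allocate the flow
 * slot, run the optional per-flow RED/ECN logic, then either tail-drop or
 * (with 'headdrop') drop the oldest packet of an over-long flow.  New flows
 * are linked at the tail of the round-robin ring with a fresh quantum.  If
 * the qdisc limit is exceeded, sfq_drop() removes a packet from a longest
 * flow, which may or may not be the flow just served.
 */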
338 static int
339 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
340 {
341 	struct sfq_sched_data *q = qdisc_priv(sch);
342 	unsigned int hash, dropped;
343 	sfq_index x, qlen;
344 	struct sfq_slot *slot;
345 	int ret;
346 	struct sk_buff *head;
347 	int delta;
348 
349 	hash = sfq_classify(skb, sch, &ret);
350 	if (hash == 0) {
351 		if (ret & __NET_XMIT_BYPASS)
352 			qdisc_qstats_drop(sch);
353 		__qdisc_drop(skb, to_free);
354 		return ret;
355 	}
356 	hash--;
357 
358 	x = q->ht[hash];
359 	slot = &q->slots[x];
360 	if (x == SFQ_EMPTY_SLOT) {
361 		x = q->dep[0].next; /* get a free slot */
362 		if (x >= SFQ_MAX_FLOWS)
363 			return qdisc_drop(skb, sch, to_free);
364 		q->ht[hash] = x;
365 		slot = &q->slots[x];
366 		slot->hash = hash;
367 		slot->backlog = 0; /* should already be 0 anyway... */
368 		red_set_vars(&slot->vars);
369 		goto enqueue;
370 	}
371 	if (q->red_parms) {
372 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
373 							&slot->vars,
374 							slot->backlog);
375 		switch (red_action(q->red_parms,
376 				   &slot->vars,
377 				   slot->vars.qavg)) {
378 		case RED_DONT_MARK:
379 			break;
380 
381 		case RED_PROB_MARK:
382 			qdisc_qstats_overlimit(sch);
383 			if (sfq_prob_mark(q)) {
384 				/* We know we have at least one packet in queue */
385 				if (sfq_headdrop(q) &&
386 				    INET_ECN_set_ce(slot->skblist_next)) {
387 					q->stats.prob_mark_head++;
388 					break;
389 				}
390 				if (INET_ECN_set_ce(skb)) {
391 					q->stats.prob_mark++;
392 					break;
393 				}
394 			}
395 			q->stats.prob_drop++;
396 			goto congestion_drop;
397 
398 		case RED_HARD_MARK:
399 			qdisc_qstats_overlimit(sch);
400 			if (sfq_hard_mark(q)) {
401 				/* We know we have at least one packet in queue */
402 				if (sfq_headdrop(q) &&
403 				    INET_ECN_set_ce(slot->skblist_next)) {
404 					q->stats.forced_mark_head++;
405 					break;
406 				}
407 				if (INET_ECN_set_ce(skb)) {
408 					q->stats.forced_mark++;
409 					break;
410 				}
411 			}
412 			q->stats.forced_drop++;
413 			goto congestion_drop;
414 		}
415 	}
416 
417 	if (slot->qlen >= q->maxdepth) {
418 congestion_drop:
419 		if (!sfq_headdrop(q))
420 			return qdisc_drop(skb, sch, to_free);
421 
422 		/* We know we have at least one packet in queue */
423 		head = slot_dequeue_head(slot);
424 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
425 		sch->qstats.backlog -= delta;
426 		slot->backlog -= delta;
427 		qdisc_drop(head, sch, to_free);
428 
429 		slot_queue_add(slot, skb);
430 		qdisc_tree_reduce_backlog(sch, 0, delta);
431 		return NET_XMIT_CN;
432 	}
433 
434 enqueue:
435 	qdisc_qstats_backlog_inc(sch, skb);
436 	slot->backlog += qdisc_pkt_len(skb);
437 	slot_queue_add(slot, skb);
438 	sfq_inc(q, x);
439 	if (slot->qlen == 1) {		/* The flow is new */
440 		if (q->tail == NULL) {	/* It is the first flow */
441 			slot->next = x;
442 		} else {
443 			slot->next = q->tail->next;
444 			q->tail->next = x;
445 		}
446 		/* We put this flow at the end of our flow list.
447 		 * It might sound unfair for a new flow to wait behind old ones,
448 		 * but otherwise we could end up servicing only new flows, starving old ones.
449 		 */
450 		q->tail = slot;
451 		/* We could use a bigger initial quantum for new flows */
452 		slot->allot = q->quantum;
453 	}
454 	if (++sch->q.qlen <= q->limit)
455 		return NET_XMIT_SUCCESS;
456 
457 	qlen = slot->qlen;
458 	dropped = sfq_drop(sch, to_free);
459 	/* Return Congestion Notification only if we dropped a packet
460 	 * from this flow.
461 	 */
462 	if (qlen != slot->qlen) {
463 		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
464 		return NET_XMIT_CN;
465 	}
466 
467 	/* As we dropped a packet, better let upper stack know this */
468 	qdisc_tree_reduce_backlog(sch, 1, dropped);
469 	return NET_XMIT_SUCCESS;
470 }
471 
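/* Dequeue path: classic deficit round robin.  The slot after q->tail is
 * served while its allot is positive; once the credit is exhausted the slot
 * becomes the new tail, its allot is topped up by one quantum and the next
 * slot is tried.  Emptied slots are unlinked from the ring and their hash
 * bucket is cleared.
 */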
472 static struct sk_buff *
473 sfq_dequeue(struct Qdisc *sch)
474 {
475 	struct sfq_sched_data *q = qdisc_priv(sch);
476 	struct sk_buff *skb;
477 	sfq_index a, next_a;
478 	struct sfq_slot *slot;
479 
480 	/* No active slots */
481 	if (q->tail == NULL)
482 		return NULL;
483 
484 next_slot:
485 	a = q->tail->next;
486 	slot = &q->slots[a];
487 	if (slot->allot <= 0) {
488 		q->tail = slot;
489 		slot->allot += q->quantum;
490 		goto next_slot;
491 	}
492 	skb = slot_dequeue_head(slot);
493 	sfq_dec(q, a);
494 	qdisc_bstats_update(sch, skb);
495 	sch->q.qlen--;
496 	qdisc_qstats_backlog_dec(sch, skb);
497 	slot->backlog -= qdisc_pkt_len(skb);
498 	/* Is the slot empty? */
499 	if (slot->qlen == 0) {
500 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
501 		next_a = slot->next;
502 		if (a == next_a) {
503 			q->tail = NULL; /* no more active slots */
504 			return skb;
505 		}
506 		q->tail->next = next_a;
507 	} else {
508 		slot->allot -= qdisc_pkt_len(skb);
509 	}
510 	return skb;
511 }
512 
513 static void
514 sfq_reset(struct Qdisc *sch)
515 {
516 	struct sk_buff *skb;
517 
518 	while ((skb = sfq_dequeue(sch)) != NULL)
519 		rtnl_kfree_skbs(skb, skb);
520 }
521 
522 /*
523  * When q->perturbation is changed, we rehash all queued skbs
524  * to avoid OOO (Out Of Order) effects.
525  * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
526  * counters.
527  */
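/* Note: after rekeying, a flow may hash into a bucket whose slot is already
 * at maxdepth, or no free slot may remain; such packets are freed here and
 * the loss is reported upwards via qdisc_tree_reduce_backlog().
 */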
528 static void sfq_rehash(struct Qdisc *sch)
529 {
530 	struct sfq_sched_data *q = qdisc_priv(sch);
531 	struct sk_buff *skb;
532 	int i;
533 	struct sfq_slot *slot;
534 	struct sk_buff_head list;
535 	int dropped = 0;
536 	unsigned int drop_len = 0;
537 
538 	__skb_queue_head_init(&list);
539 
540 	for (i = 0; i < q->maxflows; i++) {
541 		slot = &q->slots[i];
542 		if (!slot->qlen)
543 			continue;
544 		while (slot->qlen) {
545 			skb = slot_dequeue_head(slot);
546 			sfq_dec(q, i);
547 			__skb_queue_tail(&list, skb);
548 		}
549 		slot->backlog = 0;
550 		red_set_vars(&slot->vars);
551 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
552 	}
553 	q->tail = NULL;
554 
555 	while ((skb = __skb_dequeue(&list)) != NULL) {
556 		unsigned int hash = sfq_hash(q, skb);
557 		sfq_index x = q->ht[hash];
558 
559 		slot = &q->slots[x];
560 		if (x == SFQ_EMPTY_SLOT) {
561 			x = q->dep[0].next; /* get a free slot */
562 			if (x >= SFQ_MAX_FLOWS) {
563 drop:
564 				qdisc_qstats_backlog_dec(sch, skb);
565 				drop_len += qdisc_pkt_len(skb);
566 				kfree_skb(skb);
567 				dropped++;
568 				continue;
569 			}
570 			q->ht[hash] = x;
571 			slot = &q->slots[x];
572 			slot->hash = hash;
573 		}
574 		if (slot->qlen >= q->maxdepth)
575 			goto drop;
576 		slot_queue_add(slot, skb);
577 		if (q->red_parms)
578 			slot->vars.qavg = red_calc_qavg(q->red_parms,
579 							&slot->vars,
580 							slot->backlog);
581 		slot->backlog += qdisc_pkt_len(skb);
582 		sfq_inc(q, x);
583 		if (slot->qlen == 1) {		/* The flow is new */
584 			if (q->tail == NULL) {	/* It is the first flow */
585 				slot->next = x;
586 			} else {
587 				slot->next = q->tail->next;
588 				q->tail->next = x;
589 			}
590 			q->tail = slot;
591 			slot->allot = q->quantum;
592 		}
593 	}
594 	sch->q.qlen -= dropped;
595 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
596 }
597 
598 static void sfq_perturbation(struct timer_list *t)
599 {
600 	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
601 	struct Qdisc *sch = q->sch;
602 	spinlock_t *root_lock;
603 	siphash_key_t nkey;
604 	int period;
605 
606 	get_random_bytes(&nkey, sizeof(nkey));
607 	rcu_read_lock();
608 	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
609 	spin_lock(root_lock);
610 	q->perturbation = nkey;
611 	if (!q->filter_list && q->tail)
612 		sfq_rehash(sch);
613 	spin_unlock(root_lock);
614 
615 	/* q->perturb_period can change under us from
616 	 * sfq_change() and sfq_destroy().
617 	 */
618 	period = READ_ONCE(q->perturb_period);
619 	if (period)
620 		mod_timer(&q->perturb_timer, jiffies + period);
621 	rcu_read_unlock();
622 }
623 
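/* Validates the netlink request, then applies it under sch_tree_lock() and
 * trims the queue with sfq_drop() until it fits the new limit.  In this file
 * it is only reached from sfq_init(); the Qdisc_ops .change hook below is
 * left NULL.
 */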
624 static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
625 		      struct netlink_ext_ack *extack)
626 {
627 	struct sfq_sched_data *q = qdisc_priv(sch);
628 	struct tc_sfq_qopt *ctl = nla_data(opt);
629 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
630 	unsigned int qlen, dropped = 0;
631 	struct red_parms *p = NULL;
632 	struct sk_buff *to_free = NULL;
633 	struct sk_buff *tail = NULL;
634 
635 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
636 		return -EINVAL;
637 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
638 		ctl_v1 = nla_data(opt);
639 	if (ctl->divisor &&
640 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
641 		return -EINVAL;
642 
643 	if ((int)ctl->quantum < 0) {
644 		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
645 		return -EINVAL;
646 	}
647 	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
648 					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
649 		return -EINVAL;
650 	if (ctl_v1 && ctl_v1->qth_min) {
651 		p = kmalloc(sizeof(*p), GFP_KERNEL);
652 		if (!p)
653 			return -ENOMEM;
654 	}
655 	if (ctl->limit == 1) {
656 		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
657 		return -EINVAL;
658 	}
659 	sch_tree_lock(sch);
660 	if (ctl->quantum)
661 		q->quantum = ctl->quantum;
662 	WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ);
663 	if (ctl->flows)
664 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
665 	if (ctl->divisor) {
666 		q->divisor = ctl->divisor;
667 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
668 	}
669 	if (ctl_v1) {
670 		if (ctl_v1->depth)
671 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
672 		if (p) {
673 			swap(q->red_parms, p);
674 			red_set_parms(q->red_parms,
675 				      ctl_v1->qth_min, ctl_v1->qth_max,
676 				      ctl_v1->Wlog,
677 				      ctl_v1->Plog, ctl_v1->Scell_log,
678 				      NULL,
679 				      ctl_v1->max_P);
680 		}
681 		q->flags = ctl_v1->flags;
682 		q->headdrop = ctl_v1->headdrop;
683 	}
684 	if (ctl->limit) {
685 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
686 		q->maxflows = min_t(u32, q->maxflows, q->limit);
687 	}
688 
689 	qlen = sch->q.qlen;
690 	while (sch->q.qlen > q->limit) {
691 		dropped += sfq_drop(sch, &to_free);
692 		if (!tail)
693 			tail = to_free;
694 	}
695 
696 	rtnl_kfree_skbs(to_free, tail);
697 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
698 
699 	del_timer(&q->perturb_timer);
700 	if (q->perturb_period) {
701 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
702 		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
703 	}
704 	sch_tree_unlock(sch);
705 	kfree(p);
706 	return 0;
707 }
708 
709 static void *sfq_alloc(size_t sz)
710 {
711 	return  kvmalloc(sz, GFP_KERNEL);
712 }
713 
714 static void sfq_free(void *addr)
715 {
716 	kvfree(addr);
717 }
718 
719 static void sfq_destroy(struct Qdisc *sch)
720 {
721 	struct sfq_sched_data *q = qdisc_priv(sch);
722 
723 	tcf_block_put(q->block);
724 	WRITE_ONCE(q->perturb_period, 0);
725 	del_timer_sync(&q->perturb_timer);
726 	sfq_free(q->ht);
727 	sfq_free(q->slots);
728 	kfree(q->red_parms);
729 }
730 
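/* Initialisation order matters: defaults are set first, user-supplied
 * options are then applied via sfq_change(), and only afterwards are the
 * hash table and slot array allocated, sized by the (possibly updated)
 * divisor and maxflows.
 */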
731 static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
732 		    struct netlink_ext_ack *extack)
733 {
734 	struct sfq_sched_data *q = qdisc_priv(sch);
735 	int i;
736 	int err;
737 
738 	q->sch = sch;
739 	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
740 
741 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
742 	if (err)
743 		return err;
744 
745 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
746 		q->dep[i].next = i + SFQ_MAX_FLOWS;
747 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
748 	}
749 
750 	q->limit = SFQ_MAX_DEPTH;
751 	q->maxdepth = SFQ_MAX_DEPTH;
752 	q->cur_depth = 0;
753 	q->tail = NULL;
754 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
755 	q->maxflows = SFQ_DEFAULT_FLOWS;
756 	q->quantum = psched_mtu(qdisc_dev(sch));
757 	q->perturb_period = 0;
758 	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
759 
760 	if (opt) {
761 		int err = sfq_change(sch, opt, extack);
762 		if (err)
763 			return err;
764 	}
765 
766 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
767 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
768 	if (!q->ht || !q->slots) {
769 		/* Note: sfq_destroy() will be called by our caller */
770 		return -ENOMEM;
771 	}
772 
773 	for (i = 0; i < q->divisor; i++)
774 		q->ht[i] = SFQ_EMPTY_SLOT;
775 
776 	for (i = 0; i < q->maxflows; i++) {
777 		slot_queue_init(&q->slots[i]);
778 		sfq_link(q, i);
779 	}
780 	if (q->limit >= 1)
781 		sch->flags |= TCQ_F_CAN_BYPASS;
782 	else
783 		sch->flags &= ~TCQ_F_CAN_BYPASS;
784 	return 0;
785 }
786 
787 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
788 {
789 	struct sfq_sched_data *q = qdisc_priv(sch);
790 	unsigned char *b = skb_tail_pointer(skb);
791 	struct tc_sfq_qopt_v1 opt;
792 	struct red_parms *p = q->red_parms;
793 
794 	memset(&opt, 0, sizeof(opt));
795 	opt.v0.quantum	= q->quantum;
796 	opt.v0.perturb_period = q->perturb_period / HZ;
797 	opt.v0.limit	= q->limit;
798 	opt.v0.divisor	= q->divisor;
799 	opt.v0.flows	= q->maxflows;
800 	opt.depth	= q->maxdepth;
801 	opt.headdrop	= q->headdrop;
802 
803 	if (p) {
804 		opt.qth_min	= p->qth_min >> p->Wlog;
805 		opt.qth_max	= p->qth_max >> p->Wlog;
806 		opt.Wlog	= p->Wlog;
807 		opt.Plog	= p->Plog;
808 		opt.Scell_log	= p->Scell_log;
809 		opt.max_P	= p->max_P;
810 	}
811 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
812 	opt.flags	= q->flags;
813 
814 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
815 		goto nla_put_failure;
816 
817 	return skb->len;
818 
819 nla_put_failure:
820 	nlmsg_trim(skb, b);
821 	return -1;
822 }
823 
824 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
825 {
826 	return NULL;
827 }
828 
829 static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
830 {
831 	return 0;
832 }
833 
834 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
835 			      u32 classid)
836 {
837 	return 0;
838 }
839 
840 static void sfq_unbind(struct Qdisc *q, unsigned long cl)
841 {
842 }
843 
844 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
845 				       struct netlink_ext_ack *extack)
846 {
847 	struct sfq_sched_data *q = qdisc_priv(sch);
848 
849 	if (cl)
850 		return NULL;
851 	return q->block;
852 }
853 
854 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
855 			  struct sk_buff *skb, struct tcmsg *tcm)
856 {
857 	tcm->tcm_handle |= TC_H_MIN(cl);
858 	return 0;
859 }
860 
861 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
862 				struct gnet_dump *d)
863 {
864 	struct sfq_sched_data *q = qdisc_priv(sch);
865 	sfq_index idx = q->ht[cl - 1];
866 	struct gnet_stats_queue qs = { 0 };
867 	struct tc_sfq_xstats xstats = { 0 };
868 
869 	if (idx != SFQ_EMPTY_SLOT) {
870 		const struct sfq_slot *slot = &q->slots[idx];
871 
872 		xstats.allot = slot->allot;
873 		qs.qlen = slot->qlen;
874 		qs.backlog = slot->backlog;
875 	}
876 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
877 		return -1;
878 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
879 }
880 
881 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
882 {
883 	struct sfq_sched_data *q = qdisc_priv(sch);
884 	unsigned int i;
885 
886 	if (arg->stop)
887 		return;
888 
889 	for (i = 0; i < q->divisor; i++) {
890 		if (q->ht[i] == SFQ_EMPTY_SLOT) {
891 			arg->count++;
892 			continue;
893 		}
894 		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
895 			break;
896 	}
897 }
898 
899 static const struct Qdisc_class_ops sfq_class_ops = {
900 	.leaf		=	sfq_leaf,
901 	.find		=	sfq_find,
902 	.tcf_block	=	sfq_tcf_block,
903 	.bind_tcf	=	sfq_bind,
904 	.unbind_tcf	=	sfq_unbind,
905 	.dump		=	sfq_dump_class,
906 	.dump_stats	=	sfq_dump_class_stats,
907 	.walk		=	sfq_walk,
908 };
909 
910 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
911 	.cl_ops		=	&sfq_class_ops,
912 	.id		=	"sfq",
913 	.priv_size	=	sizeof(struct sfq_sched_data),
914 	.enqueue	=	sfq_enqueue,
915 	.dequeue	=	sfq_dequeue,
916 	.peek		=	qdisc_peek_dequeued,
917 	.init		=	sfq_init,
918 	.reset		=	sfq_reset,
919 	.destroy	=	sfq_destroy,
920 	.change		=	NULL,
921 	.dump		=	sfq_dump,
922 	.owner		=	THIS_MODULE,
923 };
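/* .change is deliberately left NULL, so the core is expected to reject
 * "tc qdisc change" for sfq; reconfiguring means replacing the qdisc, and
 * sfq_change() is only invoked through sfq_init().
 */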
924 
925 static int __init sfq_module_init(void)
926 {
927 	return register_qdisc(&sfq_qdisc_ops);
928 }
929 static void __exit sfq_module_exit(void)
930 {
931 	unregister_qdisc(&sfq_qdisc_ops);
932 }
933 module_init(sfq_module_init)
934 module_exit(sfq_module_exit)
935 MODULE_LICENSE("GPL");
936