xref: /openbmc/linux/net/sched/cls_u32.c (revision 5a244f48)
1 /*
2  * net/sched/cls_u32.c	Ugly (or Universal) 32bit key Packet Classifier.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  *	The filters are packed to hash tables of key nodes
12  *	with a set of 32bit key/mask pairs at every node.
13  *	Nodes reference next level hash tables etc.
14  *
15  *	This scheme is the best universal classifier I managed to
16  *	invent; it is not super-fast, but it is not slow (provided you
17  *	program it correctly), and general enough.  And its relative
18  *	speed grows as the number of rules becomes larger.
19  *
20  *	It seems to represent the best middle ground between
21  *	speed and manageability, both by human and by machine.
22  *
23  *	It is especially useful for link sharing combined with QoS;
24  *	pure RSVP doesn't need such a general approach and can use
25  *	much simpler (and faster) schemes, such as cls_rsvp.c.
26  *
27  *	JHS: We should remove the CONFIG_NET_CLS_IND from here
28  *	eventually when the meta match extension is made available
29  *
30  *	nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
31  */
32 
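/* For illustration, a typical u32 filter as installed from user space with
 * iproute2's tc (the device name and handles below are only example values):
 *
 *	tc filter add dev eth0 parent 1: protocol ip prio 1 u32 \
 *		match ip dst 10.0.0.0/8 flowid 1:10
 *
 * The "match" is turned into a 32bit key/mask pair (struct tc_u32_key) inside
 * a tc_u32_sel, and the "flowid" ends up as the classid in the node's
 * tcf_result.
 */
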
33 #include <linux/module.h>
34 #include <linux/slab.h>
35 #include <linux/types.h>
36 #include <linux/kernel.h>
37 #include <linux/string.h>
38 #include <linux/errno.h>
39 #include <linux/percpu.h>
40 #include <linux/rtnetlink.h>
41 #include <linux/skbuff.h>
42 #include <linux/bitmap.h>
43 #include <linux/netdevice.h>
44 #include <linux/hash.h>
45 #include <net/netlink.h>
46 #include <net/act_api.h>
47 #include <net/pkt_cls.h>
49 
50 struct tc_u_knode {
51 	struct tc_u_knode __rcu	*next;
52 	u32			handle;
53 	struct tc_u_hnode __rcu	*ht_up;
54 	struct tcf_exts		exts;
55 #ifdef CONFIG_NET_CLS_IND
56 	int			ifindex;
57 #endif
58 	u8			fshift;
59 	struct tcf_result	res;
60 	struct tc_u_hnode __rcu	*ht_down;
61 #ifdef CONFIG_CLS_U32_PERF
62 	struct tc_u32_pcnt __percpu *pf;
63 #endif
64 	u32			flags;
65 #ifdef CONFIG_CLS_U32_MARK
66 	u32			val;
67 	u32			mask;
68 	u32 __percpu		*pcpu_success;
69 #endif
70 	struct tcf_proto	*tp;
71 	struct rcu_head		rcu;
72 	/* The 'sel' field MUST be the last field in the structure to allow
73 	 * for the tc_u32_key entries to be allocated at the end of the structure.
74 	 */
75 	struct tc_u32_sel	sel;
76 };
77 
78 struct tc_u_hnode {
79 	struct tc_u_hnode __rcu	*next;
80 	u32			handle;
81 	u32			prio;
82 	struct tc_u_common	*tp_c;
83 	int			refcnt;
84 	unsigned int		divisor;
85 	struct rcu_head		rcu;
86 	/* The 'ht' field MUST be the last field in the structure to allow for
87 	 * more entries to be allocated at the end of the structure.
88 	 */
89 	struct tc_u_knode __rcu	*ht[1];
90 };
91 
92 struct tc_u_common {
93 	struct tc_u_hnode __rcu	*hlist;
94 	struct Qdisc		*q;
95 	int			refcnt;
96 	u32			hgenerator;
97 	struct hlist_node	hnode;
98 	struct rcu_head		rcu;
99 };
100 
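/* Fold a 32bit key into a hash bucket index: keep the bits selected by
 * sel->hmask and shift them down by fshift, which u32_change() precomputes as
 * ffs(hmask) - 1.  For example (values chosen only to illustrate), a host-order
 * hmask of 0x00ff0000 gives fshift 16, so a field value of 0x12345678 folds to
 * 0x34; the caller then ANDs the result with the table divisor.
 */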
101 static inline unsigned int u32_hash_fold(__be32 key,
102 					 const struct tc_u32_sel *sel,
103 					 u8 fshift)
104 {
105 	unsigned int h = ntohl(key & sel->hmask) >> fshift;
106 
107 	return h;
108 }
109 
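/* Main classification walk: starting from the root hash table, compare every
 * key of the current node against the packet, descend into linked hash tables
 * (ht_down) and remember the return path on a small stack so matching can
 * resume in the parent table, up to TC_U32_MAXDEPTH levels.  A node with
 * TC_U32_TERMINAL set delivers its tcf_result and runs the attached actions.
 */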
110 static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
111 			struct tcf_result *res)
112 {
113 	struct {
114 		struct tc_u_knode *knode;
115 		unsigned int	  off;
116 	} stack[TC_U32_MAXDEPTH];
117 
118 	struct tc_u_hnode *ht = rcu_dereference_bh(tp->root);
119 	unsigned int off = skb_network_offset(skb);
120 	struct tc_u_knode *n;
121 	int sdepth = 0;
122 	int off2 = 0;
123 	int sel = 0;
124 #ifdef CONFIG_CLS_U32_PERF
125 	int j;
126 #endif
127 	int i, r;
128 
129 next_ht:
130 	n = rcu_dereference_bh(ht->ht[sel]);
131 
132 next_knode:
133 	if (n) {
134 		struct tc_u32_key *key = n->sel.keys;
135 
136 #ifdef CONFIG_CLS_U32_PERF
137 		__this_cpu_inc(n->pf->rcnt);
138 		j = 0;
139 #endif
140 
141 		if (tc_skip_sw(n->flags)) {
142 			n = rcu_dereference_bh(n->next);
143 			goto next_knode;
144 		}
145 
146 #ifdef CONFIG_CLS_U32_MARK
147 		if ((skb->mark & n->mask) != n->val) {
148 			n = rcu_dereference_bh(n->next);
149 			goto next_knode;
150 		} else {
151 			__this_cpu_inc(*n->pcpu_success);
152 		}
153 #endif
154 
155 		for (i = n->sel.nkeys; i > 0; i--, key++) {
156 			int toff = off + key->off + (off2 & key->offmask);
157 			__be32 *data, hdata;
158 
159 			if (skb_headroom(skb) + toff > INT_MAX)
160 				goto out;
161 
162 			data = skb_header_pointer(skb, toff, 4, &hdata);
163 			if (!data)
164 				goto out;
165 			if ((*data ^ key->val) & key->mask) {
166 				n = rcu_dereference_bh(n->next);
167 				goto next_knode;
168 			}
169 #ifdef CONFIG_CLS_U32_PERF
170 			__this_cpu_inc(n->pf->kcnts[j]);
171 			j++;
172 #endif
173 		}
174 
175 		ht = rcu_dereference_bh(n->ht_down);
176 		if (!ht) {
177 check_terminal:
178 			if (n->sel.flags & TC_U32_TERMINAL) {
179 
180 				*res = n->res;
181 #ifdef CONFIG_NET_CLS_IND
182 				if (!tcf_match_indev(skb, n->ifindex)) {
183 					n = rcu_dereference_bh(n->next);
184 					goto next_knode;
185 				}
186 #endif
187 #ifdef CONFIG_CLS_U32_PERF
188 				__this_cpu_inc(n->pf->rhit);
189 #endif
190 				r = tcf_exts_exec(skb, &n->exts, res);
191 				if (r < 0) {
192 					n = rcu_dereference_bh(n->next);
193 					goto next_knode;
194 				}
195 
196 				return r;
197 			}
198 			n = rcu_dereference_bh(n->next);
199 			goto next_knode;
200 		}
201 
202 		/* PUSH */
203 		if (sdepth >= TC_U32_MAXDEPTH)
204 			goto deadloop;
205 		stack[sdepth].knode = n;
206 		stack[sdepth].off = off;
207 		sdepth++;
208 
209 		ht = rcu_dereference_bh(n->ht_down);
210 		sel = 0;
211 		if (ht->divisor) {
212 			__be32 *data, hdata;
213 
214 			data = skb_header_pointer(skb, off + n->sel.hoff, 4,
215 						  &hdata);
216 			if (!data)
217 				goto out;
218 			sel = ht->divisor & u32_hash_fold(*data, &n->sel,
219 							  n->fshift);
220 		}
221 		if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
222 			goto next_ht;
223 
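		/* Work out where the next protocol header starts: a fixed part
		 * (sel.off) plus, for TC_U32_VAROFFSET, a length field read
		 * from the packet (e.g. the IPv4 header length for "nexthdr+"
		 * rules), rounded up to a 4 byte boundary.  TC_U32_EAT folds
		 * the result into the base offset used by following tables.
		 */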
224 		if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
225 			off2 = n->sel.off + 3;
226 			if (n->sel.flags & TC_U32_VAROFFSET) {
227 				__be16 *data, hdata;
228 
229 				data = skb_header_pointer(skb,
230 							  off + n->sel.offoff,
231 							  2, &hdata);
232 				if (!data)
233 					goto out;
234 				off2 += ntohs(n->sel.offmask & *data) >>
235 					n->sel.offshift;
236 			}
237 			off2 &= ~3;
238 		}
239 		if (n->sel.flags & TC_U32_EAT) {
240 			off += off2;
241 			off2 = 0;
242 		}
243 
244 		if (off < skb->len)
245 			goto next_ht;
246 	}
247 
248 	/* POP */
249 	if (sdepth--) {
250 		n = stack[sdepth].knode;
251 		ht = rcu_dereference_bh(n->ht_up);
252 		off = stack[sdepth].off;
253 		goto check_terminal;
254 	}
255 out:
256 	return -1;
257 
258 deadloop:
259 	net_warn_ratelimited("cls_u32: dead loop\n");
260 	return -1;
261 }
262 
263 static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
264 {
265 	struct tc_u_hnode *ht;
266 
267 	for (ht = rtnl_dereference(tp_c->hlist);
268 	     ht;
269 	     ht = rtnl_dereference(ht->next))
270 		if (ht->handle == handle)
271 			break;
272 
273 	return ht;
274 }
275 
276 static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
277 {
278 	unsigned int sel;
279 	struct tc_u_knode *n = NULL;
280 
281 	sel = TC_U32_HASH(handle);
282 	if (sel > ht->divisor)
283 		goto out;
284 
285 	for (n = rtnl_dereference(ht->ht[sel]);
286 	     n;
287 	     n = rtnl_dereference(n->next))
288 		if (n->handle == handle)
289 			break;
290 out:
291 	return n;
292 }
293 
294 
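/* u32 handles are structured: the top 12 bits select a hash table
 * (TC_U32_HTID), the next 8 bits a bucket within it (TC_U32_HASH) and the low
 * 12 bits the node itself (TC_U32_NODE); a zero key part refers to the table.
 */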
295 static void *u32_get(struct tcf_proto *tp, u32 handle)
296 {
297 	struct tc_u_hnode *ht;
298 	struct tc_u_common *tp_c = tp->data;
299 
300 	if (TC_U32_HTID(handle) == TC_U32_ROOT)
301 		ht = rtnl_dereference(tp->root);
302 	else
303 		ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
304 
305 	if (!ht)
306 		return NULL;
307 
308 	if (TC_U32_KEY(handle) == 0)
309 		return ht;
310 
311 	return u32_lookup_key(ht, handle);
312 }
313 
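/* Allocate an unused hash table id; the id occupies the top 12 bits of a u32
 * handle (TC_U32_HTID) and always has the 0x800 bit set.  Returns 0 when no
 * free id is left.
 */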
314 static u32 gen_new_htid(struct tc_u_common *tp_c)
315 {
316 	int i = 0x800;
317 
318 	/* hgenerator is only used under the rtnl lock, so it is safe to
319 	 * increment it without read-copy-update semantics.
320 	 */
321 	do {
322 		if (++tp_c->hgenerator == 0x7FF)
323 			tp_c->hgenerator = 1;
324 	} while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
325 
326 	return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
327 }
328 
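/* All u32 classifiers attached to the same qdisc share one tc_u_common and
 * therefore one list of hash tables.  Shared instances are found through this
 * small global hash table, keyed on the device ifindex and the qdisc handle.
 */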
329 static struct hlist_head *tc_u_common_hash;
330 
331 #define U32_HASH_SHIFT 10
332 #define U32_HASH_SIZE (1 << U32_HASH_SHIFT)
333 
334 static unsigned int tc_u_hash(const struct tcf_proto *tp)
335 {
336 	struct net_device *dev = tp->q->dev_queue->dev;
337 	u32 qhandle = tp->q->handle;
338 	int ifindex = dev->ifindex;
339 
340 	return hash_64((u64)ifindex << 32 | qhandle, U32_HASH_SHIFT);
341 }
342 
343 static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
344 {
345 	struct tc_u_common *tc;
346 	unsigned int h;
347 
348 	h = tc_u_hash(tp);
349 	hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) {
350 		if (tc->q == tp->q)
351 			return tc;
352 	}
353 	return NULL;
354 }
355 
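/* Called when a new u32 tcf_proto instance is created: allocate its root hash
 * table and hook it up to the per-qdisc tc_u_common, creating the latter on
 * first use.
 */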
356 static int u32_init(struct tcf_proto *tp)
357 {
358 	struct tc_u_hnode *root_ht;
359 	struct tc_u_common *tp_c;
360 	unsigned int h;
361 
362 	tp_c = tc_u_common_find(tp);
363 
364 	root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
365 	if (root_ht == NULL)
366 		return -ENOBUFS;
367 
368 	root_ht->refcnt++;
369 	root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
370 	root_ht->prio = tp->prio;
371 
372 	if (tp_c == NULL) {
373 		tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
374 		if (tp_c == NULL) {
375 			kfree(root_ht);
376 			return -ENOBUFS;
377 		}
378 		tp_c->q = tp->q;
379 		INIT_HLIST_NODE(&tp_c->hnode);
380 
381 		h = tc_u_hash(tp);
382 		hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
383 	}
384 
385 	tp_c->refcnt++;
386 	RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
387 	rcu_assign_pointer(tp_c->hlist, root_ht);
388 	root_ht->tp_c = tp_c;
389 
390 	rcu_assign_pointer(tp->root, root_ht);
391 	tp->data = tp_c;
392 	return 0;
393 }
394 
395 static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
396 			   bool free_pf)
397 {
398 	tcf_exts_destroy(&n->exts);
399 	if (n->ht_down)
400 		n->ht_down->refcnt--;
401 #ifdef CONFIG_CLS_U32_PERF
402 	if (free_pf)
403 		free_percpu(n->pf);
404 #endif
405 #ifdef CONFIG_CLS_U32_MARK
406 	if (free_pf)
407 		free_percpu(n->pcpu_success);
408 #endif
409 	kfree(n);
410 	return 0;
411 }
412 
413 /* u32_delete_key_rcu should be called when freeing a copied
414  * version of a tc_u_knode obtained from u32_init_knode(). When
415  * copies are obtained from u32_init_knode() the statistics are
416  * shared between the old and new copies to allow readers to
417  * continue to update the statistics during the copy. To support
418  * this the u32_delete_key_rcu variant does not free the percpu
419  * statistics.
420  */
421 static void u32_delete_key_rcu(struct rcu_head *rcu)
422 {
423 	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
424 
425 	u32_destroy_key(key->tp, key, false);
426 }
427 
428 /* u32_delete_key_freepf_rcu is the rcu callback variant
429  * that frees the entire structure including the statistics
430  * percpu variables. Only use this if the key is not a copy
431  * returned by u32_init_knode(). See u32_delete_key_rcu()
432  * for the variant that should be used with keys returned from
433  * u32_init_knode().
434  */
435 static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
436 {
437 	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
438 
439 	u32_destroy_key(key->tp, key, true);
440 }
441 
442 static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
443 {
444 	struct tc_u_knode __rcu **kp;
445 	struct tc_u_knode *pkp;
446 	struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
447 
448 	if (ht) {
449 		kp = &ht->ht[TC_U32_HASH(key->handle)];
450 		for (pkp = rtnl_dereference(*kp); pkp;
451 		     kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
452 			if (pkp == key) {
453 				RCU_INIT_POINTER(*kp, key->next);
454 
455 				tcf_unbind_filter(tp, &key->res);
456 				call_rcu(&key->rcu, u32_delete_key_freepf_rcu);
457 				return 0;
458 			}
459 		}
460 	}
461 	WARN_ON(1);
462 	return 0;
463 }
464 
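/* Hardware offload helpers: mirror hnode/knode creation and removal to the
 * device through ndo_setup_tc(TC_SETUP_CLSU32).  With TCA_CLS_FLAGS_SKIP_SW a
 * driver failure is fatal; otherwise the software path remains as a fallback
 * and driver errors are ignored.
 */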
465 static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
466 {
467 	struct net_device *dev = tp->q->dev_queue->dev;
468 	struct tc_cls_u32_offload cls_u32 = {};
469 
470 	if (!tc_should_offload(dev, 0))
471 		return;
472 
473 	tc_cls_common_offload_init(&cls_u32.common, tp);
474 	cls_u32.command = TC_CLSU32_DELETE_KNODE;
475 	cls_u32.knode.handle = handle;
476 
477 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32);
478 }
479 
480 static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
481 				u32 flags)
482 {
483 	struct net_device *dev = tp->q->dev_queue->dev;
484 	struct tc_cls_u32_offload cls_u32 = {};
485 	int err;
486 
487 	if (!tc_should_offload(dev, flags))
488 		return tc_skip_sw(flags) ? -EINVAL : 0;
489 
490 	tc_cls_common_offload_init(&cls_u32.common, tp);
491 	cls_u32.command = TC_CLSU32_NEW_HNODE;
492 	cls_u32.hnode.divisor = h->divisor;
493 	cls_u32.hnode.handle = h->handle;
494 	cls_u32.hnode.prio = h->prio;
495 
496 	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32);
497 	if (tc_skip_sw(flags))
498 		return err;
499 
500 	return 0;
501 }
502 
503 static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
504 {
505 	struct net_device *dev = tp->q->dev_queue->dev;
506 	struct tc_cls_u32_offload cls_u32 = {};
507 
508 	if (!tc_should_offload(dev, 0))
509 		return;
510 
511 	tc_cls_common_offload_init(&cls_u32.common, tp);
512 	cls_u32.command = TC_CLSU32_DELETE_HNODE;
513 	cls_u32.hnode.divisor = h->divisor;
514 	cls_u32.hnode.handle = h->handle;
515 	cls_u32.hnode.prio = h->prio;
516 
517 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32);
518 }
519 
520 static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
521 				u32 flags)
522 {
523 	struct net_device *dev = tp->q->dev_queue->dev;
524 	struct tc_cls_u32_offload cls_u32 = {};
525 	int err;
526 
527 	if (!tc_should_offload(dev, flags))
528 		return tc_skip_sw(flags) ? -EINVAL : 0;
529 
530 	tc_cls_common_offload_init(&cls_u32.common, tp);
531 	cls_u32.command = TC_CLSU32_REPLACE_KNODE;
532 	cls_u32.knode.handle = n->handle;
533 	cls_u32.knode.fshift = n->fshift;
534 #ifdef CONFIG_CLS_U32_MARK
535 	cls_u32.knode.val = n->val;
536 	cls_u32.knode.mask = n->mask;
537 #else
538 	cls_u32.knode.val = 0;
539 	cls_u32.knode.mask = 0;
540 #endif
541 	cls_u32.knode.sel = &n->sel;
542 	cls_u32.knode.exts = &n->exts;
543 	if (n->ht_down)
544 		cls_u32.knode.link_handle = n->ht_down->handle;
545 
546 	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32);
547 
548 	if (!err)
549 		n->flags |= TCA_CLS_FLAGS_IN_HW;
550 
551 	if (tc_skip_sw(flags))
552 		return err;
553 
554 	return 0;
555 }
556 
557 static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
558 {
559 	struct tc_u_knode *n;
560 	unsigned int h;
561 
562 	for (h = 0; h <= ht->divisor; h++) {
563 		while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
564 			RCU_INIT_POINTER(ht->ht[h],
565 					 rtnl_dereference(n->next));
566 			tcf_unbind_filter(tp, &n->res);
567 			u32_remove_hw_knode(tp, n->handle);
568 			call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
569 		}
570 	}
571 }
572 
573 static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
574 {
575 	struct tc_u_common *tp_c = tp->data;
576 	struct tc_u_hnode __rcu **hn;
577 	struct tc_u_hnode *phn;
578 
579 	WARN_ON(ht->refcnt);
580 
581 	u32_clear_hnode(tp, ht);
582 
583 	hn = &tp_c->hlist;
584 	for (phn = rtnl_dereference(*hn);
585 	     phn;
586 	     hn = &phn->next, phn = rtnl_dereference(*hn)) {
587 		if (phn == ht) {
588 			u32_clear_hw_hnode(tp, ht);
589 			RCU_INIT_POINTER(*hn, ht->next);
590 			kfree_rcu(ht, rcu);
591 			return 0;
592 		}
593 	}
594 
595 	return -ENOENT;
596 }
597 
598 static bool ht_empty(struct tc_u_hnode *ht)
599 {
600 	unsigned int h;
601 
602 	for (h = 0; h <= ht->divisor; h++)
603 		if (rcu_access_pointer(ht->ht[h]))
604 			return false;
605 
606 	return true;
607 }
608 
609 static void u32_destroy(struct tcf_proto *tp)
610 {
611 	struct tc_u_common *tp_c = tp->data;
612 	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
613 
614 	WARN_ON(root_ht == NULL);
615 
616 	if (root_ht && --root_ht->refcnt == 0)
617 		u32_destroy_hnode(tp, root_ht);
618 
619 	if (--tp_c->refcnt == 0) {
620 		struct tc_u_hnode *ht;
621 
622 		hlist_del(&tp_c->hnode);
623 
624 		for (ht = rtnl_dereference(tp_c->hlist);
625 		     ht;
626 		     ht = rtnl_dereference(ht->next)) {
627 			ht->refcnt--;
628 			u32_clear_hnode(tp, ht);
629 		}
630 
631 		while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
632 			RCU_INIT_POINTER(tp_c->hlist, ht->next);
633 			kfree_rcu(ht, rcu);
634 		}
635 
636 		kfree(tp_c);
637 	}
638 
639 	tp->data = NULL;
640 }
641 
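/* Delete either a single key node or a whole hash table.  '*last' tells the
 * caller whether this was the last filter of the tcf_proto, so that the whole
 * classifier instance can be torn down.
 */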
642 static int u32_delete(struct tcf_proto *tp, void *arg, bool *last)
643 {
644 	struct tc_u_hnode *ht = arg;
645 	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
646 	struct tc_u_common *tp_c = tp->data;
647 	int ret = 0;
648 
649 	if (ht == NULL)
650 		goto out;
651 
652 	if (TC_U32_KEY(ht->handle)) {
653 		u32_remove_hw_knode(tp, ht->handle);
654 		ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
655 		goto out;
656 	}
657 
658 	if (root_ht == ht)
659 		return -EINVAL;
660 
661 	if (ht->refcnt == 1) {
662 		ht->refcnt--;
663 		u32_destroy_hnode(tp, ht);
664 	} else {
665 		return -EBUSY;
666 	}
667 
668 out:
669 	*last = true;
670 	if (root_ht) {
671 		if (root_ht->refcnt > 1) {
672 			*last = false;
673 			goto ret;
674 		}
675 		if (root_ht->refcnt == 1) {
676 			if (!ht_empty(root_ht)) {
677 				*last = false;
678 				goto ret;
679 			}
680 		}
681 	}
682 
683 	if (tp_c->refcnt > 1) {
684 		*last = false;
685 		goto ret;
686 	}
687 
688 	if (tp_c->refcnt == 1) {
689 		struct tc_u_hnode *ht;
690 
691 		for (ht = rtnl_dereference(tp_c->hlist);
692 		     ht;
693 		     ht = rtnl_dereference(ht->next))
694 			if (!ht_empty(ht)) {
695 				*last = false;
696 				break;
697 			}
698 	}
699 
700 ret:
701 	return ret;
702 }
703 
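/* gen_new_kid() picks a free 12bit node id (TC_U32_NODE) within the
 * destination bucket, preferring ids at or above 0x800, and combines it with
 * the hash table id already present in 'handle'.
 */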
704 #define NR_U32_NODE (1<<12)
705 static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
706 {
707 	struct tc_u_knode *n;
708 	unsigned long i;
709 	unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
710 					GFP_KERNEL);
711 	if (!bitmap)
712 		return handle | 0xFFF;
713 
714 	for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]);
715 	     n;
716 	     n = rtnl_dereference(n->next))
717 		set_bit(TC_U32_NODE(n->handle), bitmap);
718 
719 	i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);
720 	if (i >= NR_U32_NODE)
721 		i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);
722 
723 	kfree(bitmap);
724 	return handle | (i >= NR_U32_NODE ? 0xFFF : i);
725 }
726 
727 static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
728 	[TCA_U32_CLASSID]	= { .type = NLA_U32 },
729 	[TCA_U32_HASH]		= { .type = NLA_U32 },
730 	[TCA_U32_LINK]		= { .type = NLA_U32 },
731 	[TCA_U32_DIVISOR]	= { .type = NLA_U32 },
732 	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
733 	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
734 	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
735 	[TCA_U32_FLAGS]		= { .type = NLA_U32 },
736 };
737 
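/* Apply the netlink attributes shared by create and replace: validate and
 * attach actions, link the node to a lower hash table (TCA_U32_LINK), bind the
 * classid (TCA_U32_CLASSID) and, optionally, record the input device match.
 */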
738 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
739 			 unsigned long base, struct tc_u_hnode *ht,
740 			 struct tc_u_knode *n, struct nlattr **tb,
741 			 struct nlattr *est, bool ovr)
742 {
743 	int err;
744 
745 	err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr);
746 	if (err < 0)
747 		return err;
748 
749 	if (tb[TCA_U32_LINK]) {
750 		u32 handle = nla_get_u32(tb[TCA_U32_LINK]);
751 		struct tc_u_hnode *ht_down = NULL, *ht_old;
752 
753 		if (TC_U32_KEY(handle))
754 			return -EINVAL;
755 
756 		if (handle) {
757 			ht_down = u32_lookup_ht(ht->tp_c, handle);
758 
759 			if (ht_down == NULL)
760 				return -EINVAL;
761 			ht_down->refcnt++;
762 		}
763 
764 		ht_old = rtnl_dereference(n->ht_down);
765 		rcu_assign_pointer(n->ht_down, ht_down);
766 
767 		if (ht_old)
768 			ht_old->refcnt--;
769 	}
770 	if (tb[TCA_U32_CLASSID]) {
771 		n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
772 		tcf_bind_filter(tp, &n->res, base);
773 	}
774 
775 #ifdef CONFIG_NET_CLS_IND
776 	if (tb[TCA_U32_INDEV]) {
777 		int ret;
778 		ret = tcf_change_indev(net, tb[TCA_U32_INDEV]);
779 		if (ret < 0)
780 			return -EINVAL;
781 		n->ifindex = ret;
782 	}
783 #endif
784 	return 0;
785 }
786 
787 static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
788 			      struct tc_u_knode *n)
789 {
790 	struct tc_u_knode __rcu **ins;
791 	struct tc_u_knode *pins;
792 	struct tc_u_hnode *ht;
793 
794 	if (TC_U32_HTID(n->handle) == TC_U32_ROOT)
795 		ht = rtnl_dereference(tp->root);
796 	else
797 		ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle));
798 
799 	ins = &ht->ht[TC_U32_HASH(n->handle)];
800 
801 	/* The node must always exist for it to be replaced; if this is not
802 	 * the case then something went very wrong elsewhere.
803 	 */
804 	for (pins = rtnl_dereference(*ins); ;
805 	     ins = &pins->next, pins = rtnl_dereference(*ins))
806 		if (pins->handle == n->handle)
807 			break;
808 
809 	RCU_INIT_POINTER(n->next, pins->next);
810 	rcu_assign_pointer(*ins, n);
811 }
812 
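/* Duplicate an existing knode for a replace operation.  The per-cpu statistics
 * are shared with the old node so readers keep updating them while the
 * replacement is in flight; the old node must therefore be released with
 * u32_delete_key_rcu(), which leaves those statistics alone.
 */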
813 static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
814 					 struct tc_u_knode *n)
815 {
816 	struct tc_u_knode *new;
817 	struct tc_u32_sel *s = &n->sel;
818 
819 	new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key),
820 		      GFP_KERNEL);
821 
822 	if (!new)
823 		return NULL;
824 
825 	RCU_INIT_POINTER(new->next, n->next);
826 	new->handle = n->handle;
827 	RCU_INIT_POINTER(new->ht_up, n->ht_up);
828 
829 #ifdef CONFIG_NET_CLS_IND
830 	new->ifindex = n->ifindex;
831 #endif
832 	new->fshift = n->fshift;
833 	new->res = n->res;
834 	new->flags = n->flags;
835 	RCU_INIT_POINTER(new->ht_down, n->ht_down);
836 
837 	/* bump reference count as long as we hold pointer to structure */
838 	if (new->ht_down)
839 		new->ht_down->refcnt++;
840 
841 #ifdef CONFIG_CLS_U32_PERF
842 	/* Statistics may be incremented by readers during update
843 	 * so we must keep them intact. When the node is later destroyed
844 	 * a special destroy call must be made so that the pf memory is not freed.
845 	 */
846 	new->pf = n->pf;
847 #endif
848 
849 #ifdef CONFIG_CLS_U32_MARK
850 	new->val = n->val;
851 	new->mask = n->mask;
852 	/* Similarly success statistics must be moved as pointers */
853 	new->pcpu_success = n->pcpu_success;
854 #endif
855 	new->tp = tp;
856 	memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
857 
858 	if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
859 		kfree(new);
860 		return NULL;
861 	}
862 
863 	return new;
864 }
865 
866 static int u32_change(struct net *net, struct sk_buff *in_skb,
867 		      struct tcf_proto *tp, unsigned long base, u32 handle,
868 		      struct nlattr **tca, void **arg, bool ovr)
869 {
870 	struct tc_u_common *tp_c = tp->data;
871 	struct tc_u_hnode *ht;
872 	struct tc_u_knode *n;
873 	struct tc_u32_sel *s;
874 	struct nlattr *opt = tca[TCA_OPTIONS];
875 	struct nlattr *tb[TCA_U32_MAX + 1];
876 	u32 htid, flags = 0;
877 	int err;
878 #ifdef CONFIG_CLS_U32_PERF
879 	size_t size;
880 #endif
881 
882 	if (opt == NULL)
883 		return handle ? -EINVAL : 0;
884 
885 	err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, NULL);
886 	if (err < 0)
887 		return err;
888 
889 	if (tb[TCA_U32_FLAGS]) {
890 		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
891 		if (!tc_flags_valid(flags))
892 			return -EINVAL;
893 	}
894 
895 	n = *arg;
896 	if (n) {
897 		struct tc_u_knode *new;
898 
899 		if (TC_U32_KEY(n->handle) == 0)
900 			return -EINVAL;
901 
902 		if (n->flags != flags)
903 			return -EINVAL;
904 
905 		new = u32_init_knode(tp, n);
906 		if (!new)
907 			return -ENOMEM;
908 
909 		err = u32_set_parms(net, tp, base,
910 				    rtnl_dereference(n->ht_up), new, tb,
911 				    tca[TCA_RATE], ovr);
912 
913 		if (err) {
914 			u32_destroy_key(tp, new, false);
915 			return err;
916 		}
917 
918 		err = u32_replace_hw_knode(tp, new, flags);
919 		if (err) {
920 			u32_destroy_key(tp, new, false);
921 			return err;
922 		}
923 
924 		if (!tc_in_hw(new->flags))
925 			new->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
926 
927 		u32_replace_knode(tp, tp_c, new);
928 		tcf_unbind_filter(tp, &n->res);
929 		call_rcu(&n->rcu, u32_delete_key_rcu);
930 		return 0;
931 	}
932 
933 	if (tb[TCA_U32_DIVISOR]) {
934 		unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
935 
936 		if (--divisor > 0x100)
937 			return -EINVAL;
938 		if (TC_U32_KEY(handle))
939 			return -EINVAL;
940 		if (handle == 0) {
941 			handle = gen_new_htid(tp->data);
942 			if (handle == 0)
943 				return -ENOMEM;
944 		}
945 		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
946 		if (ht == NULL)
947 			return -ENOBUFS;
948 		ht->tp_c = tp_c;
949 		ht->refcnt = 1;
950 		ht->divisor = divisor;
951 		ht->handle = handle;
952 		ht->prio = tp->prio;
953 
954 		err = u32_replace_hw_hnode(tp, ht, flags);
955 		if (err) {
956 			kfree(ht);
957 			return err;
958 		}
959 
960 		RCU_INIT_POINTER(ht->next, tp_c->hlist);
961 		rcu_assign_pointer(tp_c->hlist, ht);
962 		*arg = ht;
963 
964 		return 0;
965 	}
966 
967 	if (tb[TCA_U32_HASH]) {
968 		htid = nla_get_u32(tb[TCA_U32_HASH]);
969 		if (TC_U32_HTID(htid) == TC_U32_ROOT) {
970 			ht = rtnl_dereference(tp->root);
971 			htid = ht->handle;
972 		} else {
973 			ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
974 			if (ht == NULL)
975 				return -EINVAL;
976 		}
977 	} else {
978 		ht = rtnl_dereference(tp->root);
979 		htid = ht->handle;
980 	}
981 
982 	if (ht->divisor < TC_U32_HASH(htid))
983 		return -EINVAL;
984 
985 	if (handle) {
986 		if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
987 			return -EINVAL;
988 		handle = htid | TC_U32_NODE(handle);
989 	} else
990 		handle = gen_new_kid(ht, htid);
991 
992 	if (tb[TCA_U32_SEL] == NULL)
993 		return -EINVAL;
994 
995 	s = nla_data(tb[TCA_U32_SEL]);
996 
997 	n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
998 	if (n == NULL)
999 		return -ENOBUFS;
1000 
1001 #ifdef CONFIG_CLS_U32_PERF
1002 	size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
1003 	n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
1004 	if (!n->pf) {
1005 		kfree(n);
1006 		return -ENOBUFS;
1007 	}
1008 #endif
1009 
1010 	memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
1011 	RCU_INIT_POINTER(n->ht_up, ht);
1012 	n->handle = handle;
1013 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
1014 	n->flags = flags;
1015 	n->tp = tp;
1016 
1017 	err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
1018 	if (err < 0)
1019 		goto errout;
1020 
1021 #ifdef CONFIG_CLS_U32_MARK
1022 	n->pcpu_success = alloc_percpu(u32);
1023 	if (!n->pcpu_success) {
1024 		err = -ENOMEM;
1025 		goto errout;
1026 	}
1027 
1028 	if (tb[TCA_U32_MARK]) {
1029 		struct tc_u32_mark *mark;
1030 
1031 		mark = nla_data(tb[TCA_U32_MARK]);
1032 		n->val = mark->val;
1033 		n->mask = mark->mask;
1034 	}
1035 #endif
1036 
1037 	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);
1038 	if (err == 0) {
1039 		struct tc_u_knode __rcu **ins;
1040 		struct tc_u_knode *pins;
1041 
1042 		err = u32_replace_hw_knode(tp, n, flags);
1043 		if (err)
1044 			goto errhw;
1045 
1046 		if (!tc_in_hw(n->flags))
1047 			n->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
1048 
1049 		ins = &ht->ht[TC_U32_HASH(handle)];
1050 		for (pins = rtnl_dereference(*ins); pins;
1051 		     ins = &pins->next, pins = rtnl_dereference(*ins))
1052 			if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle))
1053 				break;
1054 
1055 		RCU_INIT_POINTER(n->next, pins);
1056 		rcu_assign_pointer(*ins, n);
1057 		*arg = n;
1058 		return 0;
1059 	}
1060 
1061 errhw:
1062 #ifdef CONFIG_CLS_U32_MARK
1063 	free_percpu(n->pcpu_success);
1064 #endif
1065 
1066 errout:
1067 	tcf_exts_destroy(&n->exts);
1068 #ifdef CONFIG_CLS_U32_PERF
1069 	free_percpu(n->pf);
1070 #endif
1071 	kfree(n);
1072 	return err;
1073 }
1074 
1075 static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
1076 {
1077 	struct tc_u_common *tp_c = tp->data;
1078 	struct tc_u_hnode *ht;
1079 	struct tc_u_knode *n;
1080 	unsigned int h;
1081 
1082 	if (arg->stop)
1083 		return;
1084 
1085 	for (ht = rtnl_dereference(tp_c->hlist);
1086 	     ht;
1087 	     ht = rtnl_dereference(ht->next)) {
1088 		if (ht->prio != tp->prio)
1089 			continue;
1090 		if (arg->count >= arg->skip) {
1091 			if (arg->fn(tp, ht, arg) < 0) {
1092 				arg->stop = 1;
1093 				return;
1094 			}
1095 		}
1096 		arg->count++;
1097 		for (h = 0; h <= ht->divisor; h++) {
1098 			for (n = rtnl_dereference(ht->ht[h]);
1099 			     n;
1100 			     n = rtnl_dereference(n->next)) {
1101 				if (arg->count < arg->skip) {
1102 					arg->count++;
1103 					continue;
1104 				}
1105 				if (arg->fn(tp, n, arg) < 0) {
1106 					arg->stop = 1;
1107 					return;
1108 				}
1109 				arg->count++;
1110 			}
1111 		}
1112 	}
1113 }
1114 
1115 static void u32_bind_class(void *fh, u32 classid, unsigned long cl)
1116 {
1117 	struct tc_u_knode *n = fh;
1118 
1119 	if (n && n->res.classid == classid)
1120 		n->res.class = cl;
1121 }
1122 
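/* Dump a hash table or key node back to user space; for key nodes, when
 * CONFIG_CLS_U32_PERF is enabled, the per-cpu hit and match counters are also
 * aggregated into a single tc_u32_pcnt record.
 */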
1123 static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
1124 		    struct sk_buff *skb, struct tcmsg *t)
1125 {
1126 	struct tc_u_knode *n = fh;
1127 	struct tc_u_hnode *ht_up, *ht_down;
1128 	struct nlattr *nest;
1129 
1130 	if (n == NULL)
1131 		return skb->len;
1132 
1133 	t->tcm_handle = n->handle;
1134 
1135 	nest = nla_nest_start(skb, TCA_OPTIONS);
1136 	if (nest == NULL)
1137 		goto nla_put_failure;
1138 
1139 	if (TC_U32_KEY(n->handle) == 0) {
1140 		struct tc_u_hnode *ht = fh;
1141 		u32 divisor = ht->divisor + 1;
1142 
1143 		if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
1144 			goto nla_put_failure;
1145 	} else {
1146 #ifdef CONFIG_CLS_U32_PERF
1147 		struct tc_u32_pcnt *gpf;
1148 		int cpu;
1149 #endif
1150 
1151 		if (nla_put(skb, TCA_U32_SEL,
1152 			    sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
1153 			    &n->sel))
1154 			goto nla_put_failure;
1155 
1156 		ht_up = rtnl_dereference(n->ht_up);
1157 		if (ht_up) {
1158 			u32 htid = n->handle & 0xFFFFF000;
1159 			if (nla_put_u32(skb, TCA_U32_HASH, htid))
1160 				goto nla_put_failure;
1161 		}
1162 		if (n->res.classid &&
1163 		    nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
1164 			goto nla_put_failure;
1165 
1166 		ht_down = rtnl_dereference(n->ht_down);
1167 		if (ht_down &&
1168 		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
1169 			goto nla_put_failure;
1170 
1171 		if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
1172 			goto nla_put_failure;
1173 
1174 #ifdef CONFIG_CLS_U32_MARK
1175 		if ((n->val || n->mask)) {
1176 			struct tc_u32_mark mark = {.val = n->val,
1177 						   .mask = n->mask,
1178 						   .success = 0};
1179 			int cpum;
1180 
1181 			for_each_possible_cpu(cpum) {
1182 				__u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum);
1183 
1184 				mark.success += cnt;
1185 			}
1186 
1187 			if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark))
1188 				goto nla_put_failure;
1189 		}
1190 #endif
1191 
1192 		if (tcf_exts_dump(skb, &n->exts) < 0)
1193 			goto nla_put_failure;
1194 
1195 #ifdef CONFIG_NET_CLS_IND
1196 		if (n->ifindex) {
1197 			struct net_device *dev;
1198 			dev = __dev_get_by_index(net, n->ifindex);
1199 			if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
1200 				goto nla_put_failure;
1201 		}
1202 #endif
1203 #ifdef CONFIG_CLS_U32_PERF
1204 		gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
1205 			      n->sel.nkeys * sizeof(u64),
1206 			      GFP_KERNEL);
1207 		if (!gpf)
1208 			goto nla_put_failure;
1209 
1210 		for_each_possible_cpu(cpu) {
1211 			int i;
1212 			struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu);
1213 
1214 			gpf->rcnt += pf->rcnt;
1215 			gpf->rhit += pf->rhit;
1216 			for (i = 0; i < n->sel.nkeys; i++)
1217 				gpf->kcnts[i] += pf->kcnts[i];
1218 		}
1219 
1220 		if (nla_put_64bit(skb, TCA_U32_PCNT,
1221 				  sizeof(struct tc_u32_pcnt) +
1222 				  n->sel.nkeys * sizeof(u64),
1223 				  gpf, TCA_U32_PAD)) {
1224 			kfree(gpf);
1225 			goto nla_put_failure;
1226 		}
1227 		kfree(gpf);
1228 #endif
1229 	}
1230 
1231 	nla_nest_end(skb, nest);
1232 
1233 	if (TC_U32_KEY(n->handle))
1234 		if (tcf_exts_dump_stats(skb, &n->exts) < 0)
1235 			goto nla_put_failure;
1236 	return skb->len;
1237 
1238 nla_put_failure:
1239 	nla_nest_cancel(skb, nest);
1240 	return -1;
1241 }
1242 
1243 static struct tcf_proto_ops cls_u32_ops __read_mostly = {
1244 	.kind		=	"u32",
1245 	.classify	=	u32_classify,
1246 	.init		=	u32_init,
1247 	.destroy	=	u32_destroy,
1248 	.get		=	u32_get,
1249 	.change		=	u32_change,
1250 	.delete		=	u32_delete,
1251 	.walk		=	u32_walk,
1252 	.dump		=	u32_dump,
1253 	.bind_class	=	u32_bind_class,
1254 	.owner		=	THIS_MODULE,
1255 };
1256 
1257 static int __init init_u32(void)
1258 {
1259 	int i, ret;
1260 
1261 	pr_info("u32 classifier\n");
1262 #ifdef CONFIG_CLS_U32_PERF
1263 	pr_info("    Performance counters on\n");
1264 #endif
1265 #ifdef CONFIG_NET_CLS_IND
1266 	pr_info("    input device check on\n");
1267 #endif
1268 #ifdef CONFIG_NET_CLS_ACT
1269 	pr_info("    Actions configured\n");
1270 #endif
1271 	tc_u_common_hash = kvmalloc_array(U32_HASH_SIZE,
1272 					  sizeof(struct hlist_head),
1273 					  GFP_KERNEL);
1274 	if (!tc_u_common_hash)
1275 		return -ENOMEM;
1276 
1277 	for (i = 0; i < U32_HASH_SIZE; i++)
1278 		INIT_HLIST_HEAD(&tc_u_common_hash[i]);
1279 
1280 	ret = register_tcf_proto_ops(&cls_u32_ops);
1281 	if (ret)
1282 		kvfree(tc_u_common_hash);
1283 	return ret;
1284 }
1285 
1286 static void __exit exit_u32(void)
1287 {
1288 	unregister_tcf_proto_ops(&cls_u32_ops);
1289 	kvfree(tc_u_common_hash);
1290 }
1291 
1292 module_init(init_u32)
1293 module_exit(exit_u32)
1294 MODULE_LICENSE("GPL");
1295