xref: /openbmc/linux/net/openvswitch/flow_table.c (revision 23dabf88abb48a866fdb19ee08ebcf1ddd9b1840)
/*
 * Copyright (c) 2007-2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#include "flow.h"
#include "datapath.h"
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/llc_pdu.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/llc.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>

#define TBL_MIN_BUCKETS		1024
#define REHASH_INTERVAL		(10 * 60 * HZ)

static struct kmem_cache *flow_cache;

static u16 range_n_bytes(const struct sw_flow_key_range *range)
{
	return range->end - range->start;
}

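/* Copy 'src' into 'dst' under 'mask': only the bytes covered by
 * mask->range are written, and each long-sized word is ANDed with the
 * corresponding word of the mask key.
 */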
void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
		       const struct sw_flow_mask *mask)
{
	const long *m = (const long *)((const u8 *)&mask->key +
				mask->range.start);
	const long *s = (const long *)((const u8 *)src +
				mask->range.start);
	long *d = (long *)((u8 *)dst + mask->range.start);
	int i;

	/* The memory outside of 'mask->range' is not set since
	 * further operations on 'dst' only use contents within
	 * 'mask->range'.
	 */
	for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
		*d++ = *s++ & *m++;
}

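/* Allocate a new flow from the flow cache and initialize its per-CPU
 * statistics locks. Returns the flow or ERR_PTR(-ENOMEM) on failure.
 */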
struct sw_flow *ovs_flow_alloc(void)
{
	struct sw_flow *flow;
	int cpu;

	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
	if (!flow)
		return ERR_PTR(-ENOMEM);

	flow->sf_acts = NULL;
	flow->mask = NULL;

	flow->stats = alloc_percpu(struct flow_stats);
	if (!flow->stats)
		goto err;

	for_each_possible_cpu(cpu) {
		struct flow_stats *cpu_stats;

		cpu_stats = per_cpu_ptr(flow->stats, cpu);
		spin_lock_init(&cpu_stats->lock);
	}
	return flow;
err:
	kmem_cache_free(flow_cache, flow);
	return ERR_PTR(-ENOMEM);
}

int ovs_flow_tbl_count(struct flow_table *table)
{
	return table->count;
}

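/* Allocate a flex_array of 'n_buckets' hlist heads and initialize each
 * bucket to an empty list. Returns NULL on allocation failure.
 */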
static struct flex_array *alloc_buckets(unsigned int n_buckets)
{
	struct flex_array *buckets;
	int i, err;

	buckets = flex_array_alloc(sizeof(struct hlist_head),
				   n_buckets, GFP_KERNEL);
	if (!buckets)
		return NULL;

	err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
	if (err) {
		flex_array_free(buckets);
		return NULL;
	}

	for (i = 0; i < n_buckets; i++)
		INIT_HLIST_HEAD((struct hlist_head *)
					flex_array_get(buckets, i));

	return buckets;
}

static void flow_free(struct sw_flow *flow)
{
	kfree((struct sw_flow_actions __force *)flow->sf_acts);
	free_percpu(flow->stats);
	kmem_cache_free(flow_cache, flow);
}

static void rcu_free_flow_callback(struct rcu_head *rcu)
{
	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);

	flow_free(flow);
}

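/* Release a flow and drop the reference on its mask. The mask is freed
 * once its ref_count reaches zero. With 'deferred' set, the actual
 * freeing is postponed until after an RCU grace period.
 */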
void ovs_flow_free(struct sw_flow *flow, bool deferred)
{
	if (!flow)
		return;

	if (flow->mask) {
		struct sw_flow_mask *mask = flow->mask;

		/* The ovs lock is required to protect the mask ref_count
		 * and the mask list.
		 */
		ASSERT_OVSL();
		BUG_ON(!mask->ref_count);
		mask->ref_count--;

		if (!mask->ref_count) {
			list_del_rcu(&mask->list);
			if (deferred)
				kfree_rcu(mask, rcu);
			else
				kfree(mask);
		}
	}

	if (deferred)
		call_rcu(&flow->rcu, rcu_free_flow_callback);
	else
		flow_free(flow);
}

static void free_buckets(struct flex_array *buckets)
{
	flex_array_free(buckets);
}


static void __table_instance_destroy(struct table_instance *ti)
{
	free_buckets(ti->buckets);
	kfree(ti);
}

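/* Allocate a table instance with 'new_size' buckets and a fresh random
 * hash seed. Returns NULL on allocation failure.
 */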
static struct table_instance *table_instance_alloc(int new_size)
{
	struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);

	if (!ti)
		return NULL;

	ti->buckets = alloc_buckets(new_size);

	if (!ti->buckets) {
		kfree(ti);
		return NULL;
	}
	ti->n_buckets = new_size;
	ti->node_ver = 0;
	ti->keep_flows = false;
	get_random_bytes(&ti->hash_seed, sizeof(u32));

	return ti;
}

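/* Initialize a flow table with an empty mask list and a table instance
 * of TBL_MIN_BUCKETS buckets. Returns zero or -ENOMEM.
 */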
int ovs_flow_tbl_init(struct flow_table *table)
{
	struct table_instance *ti;

	ti = table_instance_alloc(TBL_MIN_BUCKETS);

	if (!ti)
		return -ENOMEM;

	rcu_assign_pointer(table->ti, ti);
	INIT_LIST_HEAD(&table->mask_list);
	table->last_rehash = jiffies;
	table->count = 0;
	return 0;
}

static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
{
	struct table_instance *ti = container_of(rcu, struct table_instance, rcu);

	__table_instance_destroy(ti);
}

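/* Tear down a table instance. Unless 'keep_flows' is set (the flows were
 * re-linked into a new instance during rehash), every flow is unlinked
 * and freed. With 'deferred' set, the instance itself is freed via
 * call_rcu().
 */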
static void table_instance_destroy(struct table_instance *ti, bool deferred)
{
	int i;

	if (!ti)
		return;

	if (ti->keep_flows)
		goto skip_flows;

	for (i = 0; i < ti->n_buckets; i++) {
		struct sw_flow *flow;
		struct hlist_head *head = flex_array_get(ti->buckets, i);
		struct hlist_node *n;
		int ver = ti->node_ver;

		hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
			hlist_del_rcu(&flow->hash_node[ver]);
			ovs_flow_free(flow, deferred);
		}
	}

skip_flows:
	if (deferred)
		call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
	else
		__table_instance_destroy(ti);
}

void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
{
	struct table_instance *ti = ovsl_dereference(table->ti);

	table_instance_destroy(ti, deferred);
}

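/* Iterate over the flows in 'ti' for a dump operation. '*bucket' and
 * '*last' record the current position and are updated so that the next
 * call continues where this one left off. Returns NULL when done.
 */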
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
				       u32 *bucket, u32 *last)
{
	struct sw_flow *flow;
	struct hlist_head *head;
	int ver;
	int i;

	ver = ti->node_ver;
	while (*bucket < ti->n_buckets) {
		i = 0;
		head = flex_array_get(ti->buckets, *bucket);
		hlist_for_each_entry_rcu(flow, head, hash_node[ver]) {
			if (i < *last) {
				i++;
				continue;
			}
			*last = i + 1;
			return flow;
		}
		(*bucket)++;
		*last = 0;
	}

	return NULL;
}

static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
{
	hash = jhash_1word(hash, ti->hash_seed);
	return flex_array_get(ti->buckets,
				(hash & (ti->n_buckets - 1)));
}

static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow)
{
	struct hlist_head *head;

	head = find_bucket(ti, flow->hash);
	hlist_add_head_rcu(&flow->hash_node[ti->node_ver], head);
}

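/* Re-link every flow from 'old' into 'new' using the other hash_node
 * slot, then mark 'old' with keep_flows so destroying it does not free
 * the flows that now live in 'new'.
 */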
static void flow_table_copy_flows(struct table_instance *old,
				  struct table_instance *new)
{
	int old_ver;
	int i;

	old_ver = old->node_ver;
	new->node_ver = !old_ver;

	/* Insert in new table. */
	for (i = 0; i < old->n_buckets; i++) {
		struct sw_flow *flow;
		struct hlist_head *head;

		head = flex_array_get(old->buckets, i);

		hlist_for_each_entry(flow, head, hash_node[old_ver])
			table_instance_insert(new, flow);
	}

	old->keep_flows = true;
}

static struct table_instance *table_instance_rehash(struct table_instance *ti,
					    int n_buckets)
{
	struct table_instance *new_ti;

	new_ti = table_instance_alloc(n_buckets);
	if (!new_ti)
		return NULL;

	flow_table_copy_flows(ti, new_ti);

	return new_ti;
}

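/* Drop all flows by swapping in a fresh, minimally sized table instance
 * and destroying the old one (and its flows) after an RCU grace period.
 */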
int ovs_flow_tbl_flush(struct flow_table *flow_table)
{
	struct table_instance *old_ti;
	struct table_instance *new_ti;

	old_ti = ovsl_dereference(flow_table->ti);
	new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
	if (!new_ti)
		return -ENOMEM;

	rcu_assign_pointer(flow_table->ti, new_ti);
	flow_table->last_rehash = jiffies;
	flow_table->count = 0;

	table_instance_destroy(old_ti, true);
	return 0;
}

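/* Hash the bytes of 'key' between 'key_start' and 'key_end' (byte
 * offsets into the key; the length is expected to be a multiple of
 * sizeof(u32)).
 */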
static u32 flow_hash(const struct sw_flow_key *key, int key_start,
		     int key_end)
{
	const u32 *hash_key = (const u32 *)((const u8 *)key + key_start);
	int hash_u32s = (key_end - key_start) >> 2;

	/* Make sure the number of hash bytes is a multiple of sizeof(u32). */
	BUILD_BUG_ON(sizeof(long) % sizeof(u32));

	return arch_fast_hash2(hash_key, hash_u32s, 0);
}

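/* Return the byte offset at which key comparison should start: zero for
 * tunnelled flows (so the tunnel key is included), otherwise the start
 * of the 'phy' member, rounded down to a long boundary.
 */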
static int flow_key_start(const struct sw_flow_key *key)
{
	if (key->tun_key.ipv4_dst)
		return 0;
	else
		return rounddown(offsetof(struct sw_flow_key, phy),
					  sizeof(long));
}

static bool cmp_key(const struct sw_flow_key *key1,
		    const struct sw_flow_key *key2,
		    int key_start, int key_end)
{
	const long *cp1 = (const long *)((const u8 *)key1 + key_start);
	const long *cp2 = (const long *)((const u8 *)key2 + key_start);
	long diffs = 0;
	int i;

	for (i = key_start; i < key_end;  i += sizeof(long))
		diffs |= *cp1++ ^ *cp2++;

	return diffs == 0;
}

static bool flow_cmp_masked_key(const struct sw_flow *flow,
				const struct sw_flow_key *key,
				int key_start, int key_end)
{
	return cmp_key(&flow->key, key, key_start, key_end);
}

bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
			       struct sw_flow_match *match)
{
	struct sw_flow_key *key = match->key;
	int key_start = flow_key_start(key);
	int key_end = match->range.end;

	return cmp_key(&flow->unmasked_key, key, key_start, key_end);
}

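/* Look up 'unmasked' in 'ti' under a single mask: apply 'mask' to the
 * key, hash the masked range, and scan the matching bucket for a flow
 * with the same mask, hash and masked key.
 */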
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
					  const struct sw_flow_key *unmasked,
					  struct sw_flow_mask *mask)
{
	struct sw_flow *flow;
	struct hlist_head *head;
	int key_start = mask->range.start;
	int key_end = mask->range.end;
	u32 hash;
	struct sw_flow_key masked_key;

	ovs_flow_mask_key(&masked_key, unmasked, mask);
	hash = flow_hash(&masked_key, key_start, key_end);
	head = find_bucket(ti, hash);
	hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) {
		if (flow->mask == mask && flow->hash == hash &&
		    flow_cmp_masked_key(flow, &masked_key,
					  key_start, key_end))
			return flow;
	}
	return NULL;
}

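/* Look up 'key' by trying every mask in the table's mask list in turn.
 * '*n_mask_hit' is set to the number of masks tried before a match (or
 * all of them on a miss).
 */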
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
				    const struct sw_flow_key *key,
				    u32 *n_mask_hit)
{
	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
	struct sw_flow_mask *mask;
	struct sw_flow *flow;

	*n_mask_hit = 0;
	list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
		(*n_mask_hit)++;
		flow = masked_flow_lookup(ti, key, mask);
		if (flow)  /* Found */
			return flow;
	}
	return NULL;
}

struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
				    const struct sw_flow_key *key)
{
	u32 __always_unused n_mask_hit;

	return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
}

int ovs_flow_tbl_num_masks(const struct flow_table *table)
{
	struct sw_flow_mask *mask;
	int num = 0;

	list_for_each_entry(mask, &table->mask_list, list)
		num++;

	return num;
}

static struct table_instance *table_instance_expand(struct table_instance *ti)
{
	return table_instance_rehash(ti, ti->n_buckets * 2);
}

void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
{
	struct table_instance *ti = ovsl_dereference(table->ti);

	BUG_ON(table->count == 0);
	hlist_del_rcu(&flow->hash_node[ti->node_ver]);
	table->count--;
}

static struct sw_flow_mask *mask_alloc(void)
{
	struct sw_flow_mask *mask;

	mask = kmalloc(sizeof(*mask), GFP_KERNEL);
	if (mask)
		mask->ref_count = 1;

	return mask;
}

static bool mask_equal(const struct sw_flow_mask *a,
		       const struct sw_flow_mask *b)
{
	const u8 *a_ = (const u8 *)&a->key + a->range.start;
	const u8 *b_ = (const u8 *)&b->key + b->range.start;

	return  (a->range.end == b->range.end)
		&& (a->range.start == b->range.start)
		&& (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
}

static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
					   const struct sw_flow_mask *mask)
{
	struct list_head *ml;

	list_for_each(ml, &tbl->mask_list) {
		struct sw_flow_mask *m;
		m = container_of(ml, struct sw_flow_mask, list);
		if (mask_equal(mask, m))
			return m;
	}

	return NULL;
}

/* Add 'mask' into the mask list, if it is not already there. */
static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
			    struct sw_flow_mask *new)
{
	struct sw_flow_mask *mask;
	mask = flow_mask_find(tbl, new);
	if (!mask) {
		/* Allocate a new mask if none exists. */
		mask = mask_alloc();
		if (!mask)
			return -ENOMEM;
		mask->key = new->key;
		mask->range = new->range;
		list_add_rcu(&mask->list, &tbl->mask_list);
	} else {
		BUG_ON(!mask->ref_count);
		mask->ref_count++;
	}

	flow->mask = mask;
	return 0;
}

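/* Insert 'flow' into the table with the given mask: take a reference on
 * the (possibly shared) mask, hash the masked key and link the flow in.
 * The table instance is expanded when it holds more flows than buckets,
 * or rehashed with a new seed after REHASH_INTERVAL.
 */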
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
			struct sw_flow_mask *mask)
{
	struct table_instance *new_ti = NULL;
	struct table_instance *ti;
	int err;

	err = flow_mask_insert(table, flow, mask);
	if (err)
		return err;

	flow->hash = flow_hash(&flow->key, flow->mask->range.start,
			flow->mask->range.end);
	ti = ovsl_dereference(table->ti);
	table_instance_insert(ti, flow);
	table->count++;

	/* Expand table, if necessary, to make room. */
	if (table->count > ti->n_buckets)
		new_ti = table_instance_expand(ti);
	else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
		new_ti = table_instance_rehash(ti, ti->n_buckets);

	if (new_ti) {
		rcu_assign_pointer(table->ti, new_ti);
		table_instance_destroy(ti, true);
		table->last_rehash = jiffies;
	}
	return 0;
}

/* Initializes the flow module.
 * Returns zero if successful or a negative error code. */
int ovs_flow_init(void)
{
	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));

	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
					0, NULL);
	if (flow_cache == NULL)
		return -ENOMEM;

	return 0;
}

/* Uninitializes the flow module. */
void ovs_flow_exit(void)
{
	kmem_cache_destroy(flow_cache);
}
602