xref: /openbmc/linux/net/openvswitch/datapath.c (revision f7777dcc)
1 /*
2  * Copyright (c) 2007-2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/init.h>
22 #include <linux/module.h>
23 #include <linux/if_arp.h>
24 #include <linux/if_vlan.h>
25 #include <linux/in.h>
26 #include <linux/ip.h>
27 #include <linux/jhash.h>
28 #include <linux/delay.h>
29 #include <linux/time.h>
30 #include <linux/etherdevice.h>
31 #include <linux/genetlink.h>
32 #include <linux/kernel.h>
33 #include <linux/kthread.h>
34 #include <linux/mutex.h>
35 #include <linux/percpu.h>
36 #include <linux/rcupdate.h>
37 #include <linux/tcp.h>
38 #include <linux/udp.h>
39 #include <linux/ethtool.h>
40 #include <linux/wait.h>
41 #include <asm/div64.h>
42 #include <linux/highmem.h>
43 #include <linux/netfilter_bridge.h>
44 #include <linux/netfilter_ipv4.h>
45 #include <linux/inetdevice.h>
46 #include <linux/list.h>
47 #include <linux/lockdep.h>
48 #include <linux/openvswitch.h>
49 #include <linux/rculist.h>
50 #include <linux/dmi.h>
51 #include <linux/workqueue.h>
52 #include <net/genetlink.h>
53 #include <net/net_namespace.h>
54 #include <net/netns/generic.h>
55 
56 #include "datapath.h"
57 #include "flow.h"
58 #include "vport-internal_dev.h"
59 #include "vport-netdev.h"
60 
61 
62 #define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
63 static void rehash_flow_table(struct work_struct *work);
64 static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);
65 
66 int ovs_net_id __read_mostly;
67 
68 static void ovs_notify(struct sk_buff *skb, struct genl_info *info,
69 		       struct genl_multicast_group *grp)
70 {
71 	genl_notify(skb, genl_info_net(info), info->snd_portid,
72 		    grp->id, info->nlhdr, GFP_KERNEL);
73 }
74 
75 /**
76  * DOC: Locking:
77  *
78  * All writes, e.g. writes to device state (add/remove datapath, port, set
79  * operations on vports, etc.) and writes to other state (flow table
80  * modifications, setting miscellaneous datapath parameters, etc.), are
81  * protected by ovs_mutex.
82  *
83  * Reads are protected by RCU.
84  *
85  * There are a few special cases (mostly stats) that have their own
86  * synchronization but they nest under all of the above and don't interact with
87  * each other.
88  *
89  * The RTNL lock nests inside ovs_mutex.
90  */
91 
92 static DEFINE_MUTEX(ovs_mutex);
93 
94 void ovs_lock(void)
95 {
96 	mutex_lock(&ovs_mutex);
97 }
98 
99 void ovs_unlock(void)
100 {
101 	mutex_unlock(&ovs_mutex);
102 }
103 
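/* Report whether ovs_mutex is held, for lockdep-based assertions.  Once lock
 * debugging has been disabled at runtime (debug_locks == 0) this claims the
 * lock is held, so checks like ASSERT_OVSL() do not warn spuriously.
 */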
104 #ifdef CONFIG_LOCKDEP
105 int lockdep_ovsl_is_held(void)
106 {
107 	if (debug_locks)
108 		return lockdep_is_held(&ovs_mutex);
109 	else
110 		return 1;
111 }
112 #endif
113 
114 static struct vport *new_vport(const struct vport_parms *);
115 static int queue_gso_packets(struct net *, int dp_ifindex, struct sk_buff *,
116 			     const struct dp_upcall_info *);
117 static int queue_userspace_packet(struct net *, int dp_ifindex,
118 				  struct sk_buff *,
119 				  const struct dp_upcall_info *);
120 
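/* Map 'dp_ifindex' to a datapath: if the ifindex names an OVS internal
 * device, return the datapath that device belongs to, otherwise NULL.
 */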
121 /* Must be called with rcu_read_lock or ovs_mutex. */
122 static struct datapath *get_dp(struct net *net, int dp_ifindex)
123 {
124 	struct datapath *dp = NULL;
125 	struct net_device *dev;
126 
127 	rcu_read_lock();
128 	dev = dev_get_by_index_rcu(net, dp_ifindex);
129 	if (dev) {
130 		struct vport *vport = ovs_internal_dev_get_vport(dev);
131 		if (vport)
132 			dp = vport->dp;
133 	}
134 	rcu_read_unlock();
135 
136 	return dp;
137 }
138 
139 /* Must be called with rcu_read_lock or ovs_mutex. */
140 const char *ovs_dp_name(const struct datapath *dp)
141 {
142 	struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
143 	return vport->ops->get_name(vport);
144 }
145 
146 static int get_dpifindex(struct datapath *dp)
147 {
148 	struct vport *local;
149 	int ifindex;
150 
151 	rcu_read_lock();
152 
153 	local = ovs_vport_rcu(dp, OVSP_LOCAL);
154 	if (local)
155 		ifindex = netdev_vport_priv(local)->dev->ifindex;
156 	else
157 		ifindex = 0;
158 
159 	rcu_read_unlock();
160 
161 	return ifindex;
162 }
163 
164 static void destroy_dp_rcu(struct rcu_head *rcu)
165 {
166 	struct datapath *dp = container_of(rcu, struct datapath, rcu);
167 
168 	ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false);
169 	free_percpu(dp->stats_percpu);
170 	release_net(ovs_dp_get_net(dp));
171 	kfree(dp->ports);
172 	kfree(dp);
173 }
174 
175 static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
176 					    u16 port_no)
177 {
178 	return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
179 }
180 
181 struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
182 {
183 	struct vport *vport;
184 	struct hlist_head *head;
185 
186 	head = vport_hash_bucket(dp, port_no);
187 	hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
188 		if (vport->port_no == port_no)
189 			return vport;
190 	}
191 	return NULL;
192 }
193 
194 /* Called with ovs_mutex. */
195 static struct vport *new_vport(const struct vport_parms *parms)
196 {
197 	struct vport *vport;
198 
199 	vport = ovs_vport_add(parms);
200 	if (!IS_ERR(vport)) {
201 		struct datapath *dp = parms->dp;
202 		struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);
203 
204 		hlist_add_head_rcu(&vport->dp_hash_node, head);
205 	}
206 	return vport;
207 }
208 
209 void ovs_dp_detach_port(struct vport *p)
210 {
211 	ASSERT_OVSL();
212 
213 	/* First drop references to device. */
214 	hlist_del_rcu(&p->dp_hash_node);
215 
216 	/* Then destroy it. */
217 	ovs_vport_del(p);
218 }
219 
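/* Main receive path.  Extracts the flow key from 'skb', looks it up in the
 * datapath's flow table and executes the matching flow's actions; on a miss
 * the packet is sent to userspace as an OVS_PACKET_CMD_MISS upcall instead.
 * Per-CPU hit/miss counters are updated in either case.
 */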
220 /* Must be called with rcu_read_lock. */
221 void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
222 {
223 	struct datapath *dp = p->dp;
224 	struct sw_flow *flow;
225 	struct dp_stats_percpu *stats;
226 	struct sw_flow_key key;
227 	u64 *stats_counter;
228 	int error;
229 
230 	stats = this_cpu_ptr(dp->stats_percpu);
231 
232 	/* Extract flow from 'skb' into 'key'. */
233 	error = ovs_flow_extract(skb, p->port_no, &key);
234 	if (unlikely(error)) {
235 		kfree_skb(skb);
236 		return;
237 	}
238 
239 	/* Look up flow. */
240 	flow = ovs_flow_lookup(rcu_dereference(dp->table), &key);
241 	if (unlikely(!flow)) {
242 		struct dp_upcall_info upcall;
243 
244 		upcall.cmd = OVS_PACKET_CMD_MISS;
245 		upcall.key = &key;
246 		upcall.userdata = NULL;
247 		upcall.portid = p->upcall_portid;
248 		ovs_dp_upcall(dp, skb, &upcall);
249 		consume_skb(skb);
250 		stats_counter = &stats->n_missed;
251 		goto out;
252 	}
253 
254 	OVS_CB(skb)->flow = flow;
255 	OVS_CB(skb)->pkt_key = &key;
256 
257 	stats_counter = &stats->n_hit;
258 	ovs_flow_used(OVS_CB(skb)->flow, skb);
259 	ovs_execute_actions(dp, skb);
260 
261 out:
262 	/* Update datapath statistics. */
263 	u64_stats_update_begin(&stats->sync);
264 	(*stats_counter)++;
265 	u64_stats_update_end(&stats->sync);
266 }
267 
268 static struct genl_family dp_packet_genl_family = {
269 	.id = GENL_ID_GENERATE,
270 	.hdrsize = sizeof(struct ovs_header),
271 	.name = OVS_PACKET_FAMILY,
272 	.version = OVS_PACKET_VERSION,
273 	.maxattr = OVS_PACKET_ATTR_MAX,
274 	.netnsok = true,
275 	.parallel_ops = true,
276 };
277 
278 int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
279 		  const struct dp_upcall_info *upcall_info)
280 {
281 	struct dp_stats_percpu *stats;
282 	int dp_ifindex;
283 	int err;
284 
285 	if (upcall_info->portid == 0) {
286 		err = -ENOTCONN;
287 		goto err;
288 	}
289 
290 	dp_ifindex = get_dpifindex(dp);
291 	if (!dp_ifindex) {
292 		err = -ENODEV;
293 		goto err;
294 	}
295 
296 	if (!skb_is_gso(skb))
297 		err = queue_userspace_packet(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
298 	else
299 		err = queue_gso_packets(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info);
300 	if (err)
301 		goto err;
302 
303 	return 0;
304 
305 err:
306 	stats = this_cpu_ptr(dp->stats_percpu);
307 
308 	u64_stats_update_begin(&stats->sync);
309 	stats->n_lost++;
310 	u64_stats_update_end(&stats->sync);
311 
312 	return err;
313 }
314 
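/* Segment a GSO skb in software and queue every resulting segment to
 * userspace as its own upcall.  For UDP fragmentation offload the first
 * segment's key describes a first fragment, so later segments are reported
 * with their ip.frag field rewritten to OVS_FRAG_TYPE_LATER.
 */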
315 static int queue_gso_packets(struct net *net, int dp_ifindex,
316 			     struct sk_buff *skb,
317 			     const struct dp_upcall_info *upcall_info)
318 {
319 	unsigned short gso_type = skb_shinfo(skb)->gso_type;
320 	struct dp_upcall_info later_info;
321 	struct sw_flow_key later_key;
322 	struct sk_buff *segs, *nskb;
323 	int err;
324 
325 	segs = __skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM, false);
326 	if (IS_ERR(segs))
327 		return PTR_ERR(segs);
328 
329 	/* Queue all of the segments. */
330 	skb = segs;
331 	do {
332 		err = queue_userspace_packet(net, dp_ifindex, skb, upcall_info);
333 		if (err)
334 			break;
335 
336 		if (skb == segs && gso_type & SKB_GSO_UDP) {
337 			/* The initial flow key extracted by ovs_flow_extract()
338 			 * in this case is for a first fragment, so we need to
339 			 * properly mark later fragments.
340 			 */
341 			later_key = *upcall_info->key;
342 			later_key.ip.frag = OVS_FRAG_TYPE_LATER;
343 
344 			later_info = *upcall_info;
345 			later_info.key = &later_key;
346 			upcall_info = &later_info;
347 		}
348 	} while ((skb = skb->next));
349 
350 	/* Free all of the segments. */
351 	skb = segs;
352 	do {
353 		nskb = skb->next;
354 		if (err)
355 			kfree_skb(skb);
356 		else
357 			consume_skb(skb);
358 	} while ((skb = nskb));
359 	return err;
360 }
361 
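/* Worst-case netlink-encoded size of a flow key.  Used to size upcall and
 * flow dump messages before the key attributes are actually serialized.
 */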
362 static size_t key_attr_size(void)
363 {
364 	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
365 		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
366 		  + nla_total_size(8)   /* OVS_TUNNEL_KEY_ATTR_ID */
367 		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */
368 		  + nla_total_size(4)   /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */
369 		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TOS */
370 		  + nla_total_size(1)   /* OVS_TUNNEL_KEY_ATTR_TTL */
371 		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
372 		  + nla_total_size(0)   /* OVS_TUNNEL_KEY_ATTR_CSUM */
373 		+ nla_total_size(4)   /* OVS_KEY_ATTR_IN_PORT */
374 		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */
375 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
376 		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
377 		+ nla_total_size(4)   /* OVS_KEY_ATTR_8021Q */
378 		+ nla_total_size(0)   /* OVS_KEY_ATTR_ENCAP */
379 		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
380 		+ nla_total_size(40)  /* OVS_KEY_ATTR_IPV6 */
381 		+ nla_total_size(2)   /* OVS_KEY_ATTR_ICMPV6 */
382 		+ nla_total_size(28); /* OVS_KEY_ATTR_ND */
383 }
384 
385 static size_t upcall_msg_size(const struct sk_buff *skb,
386 			      const struct nlattr *userdata)
387 {
388 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
389 		+ nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */
390 		+ nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */
391 
392 	/* OVS_PACKET_ATTR_USERDATA */
393 	if (userdata)
394 		size += NLA_ALIGN(userdata->nla_len);
395 
396 	return size;
397 }
398 
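/* Build a single upcall message (flow key, optional userdata and the packet
 * itself) and unicast it to the netlink port given in 'upcall_info'.  If the
 * skb carries an offloaded VLAN tag, the tag is first pushed back into the
 * packet data on a clone so userspace sees the frame as it appeared on the
 * wire.
 */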
399 static int queue_userspace_packet(struct net *net, int dp_ifindex,
400 				  struct sk_buff *skb,
401 				  const struct dp_upcall_info *upcall_info)
402 {
403 	struct ovs_header *upcall;
404 	struct sk_buff *nskb = NULL;
405 	struct sk_buff *user_skb; /* to be queued to userspace */
406 	struct nlattr *nla;
407 	int err;
408 
409 	if (vlan_tx_tag_present(skb)) {
410 		nskb = skb_clone(skb, GFP_ATOMIC);
411 		if (!nskb)
412 			return -ENOMEM;
413 
414 		nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb));
415 		if (!nskb)
416 			return -ENOMEM;
417 
418 		nskb->vlan_tci = 0;
419 		skb = nskb;
420 	}
421 
422 	if (nla_attr_size(skb->len) > USHRT_MAX) {
423 		err = -EFBIG;
424 		goto out;
425 	}
426 
427 	user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC);
428 	if (!user_skb) {
429 		err = -ENOMEM;
430 		goto out;
431 	}
432 
433 	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
434 			     0, upcall_info->cmd);
435 	upcall->dp_ifindex = dp_ifindex;
436 
437 	nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
438 	ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb);
439 	nla_nest_end(user_skb, nla);
440 
441 	if (upcall_info->userdata)
442 		__nla_put(user_skb, OVS_PACKET_ATTR_USERDATA,
443 			  nla_len(upcall_info->userdata),
444 			  nla_data(upcall_info->userdata));
445 
446 	nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
447 
448 	skb_copy_and_csum_dev(skb, nla_data(nla));
449 
450 	genlmsg_end(user_skb, upcall);
451 	err = genlmsg_unicast(net, user_skb, upcall_info->portid);
452 
453 out:
454 	kfree_skb(nskb);
455 	return err;
456 }
457 
458 /* Called with ovs_mutex. */
459 static int flush_flows(struct datapath *dp)
460 {
461 	struct flow_table *old_table;
462 	struct flow_table *new_table;
463 
464 	old_table = ovsl_dereference(dp->table);
465 	new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
466 	if (!new_table)
467 		return -ENOMEM;
468 
469 	rcu_assign_pointer(dp->table, new_table);
470 
471 	ovs_flow_tbl_destroy(old_table, true);
472 	return 0;
473 }
474 
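/* Reserve room for 'attr_len' bytes of action data in '*sfa'.  If the current
 * allocation is too small the buffer is roughly doubled (capped at
 * MAX_ACTIONS_BUFSIZE) and the existing actions are copied across.  Returns a
 * pointer to the newly reserved space, or an ERR_PTR() on failure.
 */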
475 static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, int attr_len)
476 {
477 
478 	struct sw_flow_actions *acts;
479 	int new_acts_size;
480 	int req_size = NLA_ALIGN(attr_len);
481 	int next_offset = offsetof(struct sw_flow_actions, actions) +
482 					(*sfa)->actions_len;
483 
484 	if (req_size <= (ksize(*sfa) - next_offset))
485 		goto out;
486 
487 	new_acts_size = ksize(*sfa) * 2;
488 
489 	if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
490 		if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
491 			return ERR_PTR(-EMSGSIZE);
492 		new_acts_size = MAX_ACTIONS_BUFSIZE;
493 	}
494 
495 	acts = ovs_flow_actions_alloc(new_acts_size);
496 	if (IS_ERR(acts))
497 		return (void *)acts;
498 
499 	memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
500 	acts->actions_len = (*sfa)->actions_len;
501 	kfree(*sfa);
502 	*sfa = acts;
503 
504 out:
505 	(*sfa)->actions_len += req_size;
506 	return  (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
507 }
508 
509 static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
510 {
511 	struct nlattr *a;
512 
513 	a = reserve_sfa_size(sfa, nla_attr_size(len));
514 	if (IS_ERR(a))
515 		return PTR_ERR(a);
516 
517 	a->nla_type = attrtype;
518 	a->nla_len = nla_attr_size(len);
519 
520 	if (data)
521 		memcpy(nla_data(a), data, len);
522 	memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
523 
524 	return 0;
525 }
526 
527 static inline int add_nested_action_start(struct sw_flow_actions **sfa, int attrtype)
528 {
529 	int used = (*sfa)->actions_len;
530 	int err;
531 
532 	err = add_action(sfa, attrtype, NULL, 0);
533 	if (err)
534 		return err;
535 
536 	return used;
537 }
538 
539 static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_offset)
540 {
541 	struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + st_offset);
542 
543 	a->nla_len = sfa->actions_len - st_offset;
544 }
545 
546 static int validate_and_copy_actions(const struct nlattr *attr,
547 				     const struct sw_flow_key *key, int depth,
548 				     struct sw_flow_actions **sfa);
549 
550 static int validate_and_copy_sample(const struct nlattr *attr,
551 				    const struct sw_flow_key *key, int depth,
552 				    struct sw_flow_actions **sfa)
553 {
554 	const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
555 	const struct nlattr *probability, *actions;
556 	const struct nlattr *a;
557 	int rem, start, err, st_acts;
558 
559 	memset(attrs, 0, sizeof(attrs));
560 	nla_for_each_nested(a, attr, rem) {
561 		int type = nla_type(a);
562 		if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
563 			return -EINVAL;
564 		attrs[type] = a;
565 	}
566 	if (rem)
567 		return -EINVAL;
568 
569 	probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
570 	if (!probability || nla_len(probability) != sizeof(u32))
571 		return -EINVAL;
572 
573 	actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
574 	if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
575 		return -EINVAL;
576 
577 	/* Validation done, copy the sample action. */
578 	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE);
579 	if (start < 0)
580 		return start;
581 	err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, nla_data(probability), sizeof(u32));
582 	if (err)
583 		return err;
584 	st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS);
585 	if (st_acts < 0)
586 		return st_acts;
587 
588 	err = validate_and_copy_actions(actions, key, depth + 1, sfa);
589 	if (err)
590 		return err;
591 
592 	add_nested_action_end(*sfa, st_acts);
593 	add_nested_action_end(*sfa, start);
594 
595 	return 0;
596 }
597 
598 static int validate_tp_port(const struct sw_flow_key *flow_key)
599 {
600 	if (flow_key->eth.type == htons(ETH_P_IP)) {
601 		if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst)
602 			return 0;
603 	} else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
604 		if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst)
605 			return 0;
606 	}
607 
608 	return -EINVAL;
609 }
610 
611 static int validate_and_copy_set_tun(const struct nlattr *attr,
612 				     struct sw_flow_actions **sfa)
613 {
614 	struct sw_flow_match match;
615 	struct sw_flow_key key;
616 	int err, start;
617 
618 	ovs_match_init(&match, &key, NULL);
619 	err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false);
620 	if (err)
621 		return err;
622 
623 	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
624 	if (start < 0)
625 		return start;
626 
627 	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
628 			sizeof(match.key->tun_key));
629 	add_nested_action_end(*sfa, start);
630 
631 	return err;
632 }
633 
634 static int validate_set(const struct nlattr *a,
635 			const struct sw_flow_key *flow_key,
636 			struct sw_flow_actions **sfa,
637 			bool *set_tun)
638 {
639 	const struct nlattr *ovs_key = nla_data(a);
640 	int key_type = nla_type(ovs_key);
641 
642 	/* There can be only one key in an action. */
643 	if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
644 		return -EINVAL;
645 
646 	if (key_type > OVS_KEY_ATTR_MAX ||
647 	   (ovs_key_lens[key_type] != nla_len(ovs_key) &&
648 	    ovs_key_lens[key_type] != -1))
649 		return -EINVAL;
650 
651 	switch (key_type) {
652 	const struct ovs_key_ipv4 *ipv4_key;
653 	const struct ovs_key_ipv6 *ipv6_key;
654 	int err;
655 
656 	case OVS_KEY_ATTR_PRIORITY:
657 	case OVS_KEY_ATTR_SKB_MARK:
658 	case OVS_KEY_ATTR_ETHERNET:
659 		break;
660 
661 	case OVS_KEY_ATTR_TUNNEL:
662 		*set_tun = true;
663 		err = validate_and_copy_set_tun(a, sfa);
664 		if (err)
665 			return err;
666 		break;
667 
668 	case OVS_KEY_ATTR_IPV4:
669 		if (flow_key->eth.type != htons(ETH_P_IP))
670 			return -EINVAL;
671 
672 		if (!flow_key->ip.proto)
673 			return -EINVAL;
674 
675 		ipv4_key = nla_data(ovs_key);
676 		if (ipv4_key->ipv4_proto != flow_key->ip.proto)
677 			return -EINVAL;
678 
679 		if (ipv4_key->ipv4_frag != flow_key->ip.frag)
680 			return -EINVAL;
681 
682 		break;
683 
684 	case OVS_KEY_ATTR_IPV6:
685 		if (flow_key->eth.type != htons(ETH_P_IPV6))
686 			return -EINVAL;
687 
688 		if (!flow_key->ip.proto)
689 			return -EINVAL;
690 
691 		ipv6_key = nla_data(ovs_key);
692 		if (ipv6_key->ipv6_proto != flow_key->ip.proto)
693 			return -EINVAL;
694 
695 		if (ipv6_key->ipv6_frag != flow_key->ip.frag)
696 			return -EINVAL;
697 
698 		if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
699 			return -EINVAL;
700 
701 		break;
702 
703 	case OVS_KEY_ATTR_TCP:
704 		if (flow_key->ip.proto != IPPROTO_TCP)
705 			return -EINVAL;
706 
707 		return validate_tp_port(flow_key);
708 
709 	case OVS_KEY_ATTR_UDP:
710 		if (flow_key->ip.proto != IPPROTO_UDP)
711 			return -EINVAL;
712 
713 		return validate_tp_port(flow_key);
714 
715 	case OVS_KEY_ATTR_SCTP:
716 		if (flow_key->ip.proto != IPPROTO_SCTP)
717 			return -EINVAL;
718 
719 		return validate_tp_port(flow_key);
720 
721 	default:
722 		return -EINVAL;
723 	}
724 
725 	return 0;
726 }
727 
728 static int validate_userspace(const struct nlattr *attr)
729 {
730 	static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] =	{
731 		[OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
732 		[OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC },
733 	};
734 	struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
735 	int error;
736 
737 	error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
738 				 attr, userspace_policy);
739 	if (error)
740 		return error;
741 
742 	if (!a[OVS_USERSPACE_ATTR_PID] ||
743 	    !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
744 		return -EINVAL;
745 
746 	return 0;
747 }
748 
749 static int copy_action(const struct nlattr *from,
750 		       struct sw_flow_actions **sfa)
751 {
752 	int totlen = NLA_ALIGN(from->nla_len);
753 	struct nlattr *to;
754 
755 	to = reserve_sfa_size(sfa, from->nla_len);
756 	if (IS_ERR(to))
757 		return PTR_ERR(to);
758 
759 	memcpy(to, from, totlen);
760 	return 0;
761 }
762 
763 static int validate_and_copy_actions(const struct nlattr *attr,
764 				     const struct sw_flow_key *key,
765 				     int depth,
766 				     struct sw_flow_actions **sfa)
767 {
768 	const struct nlattr *a;
769 	int rem, err;
770 
771 	if (depth >= SAMPLE_ACTION_DEPTH)
772 		return -EOVERFLOW;
773 
774 	nla_for_each_nested(a, attr, rem) {
775 		/* Expected argument lengths, (u32)-1 for variable length. */
776 		static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
777 			[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
778 			[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
779 			[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
780 			[OVS_ACTION_ATTR_POP_VLAN] = 0,
781 			[OVS_ACTION_ATTR_SET] = (u32)-1,
782 			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1
783 		};
784 		const struct ovs_action_push_vlan *vlan;
785 		int type = nla_type(a);
786 		bool skip_copy;
787 
788 		if (type > OVS_ACTION_ATTR_MAX ||
789 		    (action_lens[type] != nla_len(a) &&
790 		     action_lens[type] != (u32)-1))
791 			return -EINVAL;
792 
793 		skip_copy = false;
794 		switch (type) {
795 		case OVS_ACTION_ATTR_UNSPEC:
796 			return -EINVAL;
797 
798 		case OVS_ACTION_ATTR_USERSPACE:
799 			err = validate_userspace(a);
800 			if (err)
801 				return err;
802 			break;
803 
804 		case OVS_ACTION_ATTR_OUTPUT:
805 			if (nla_get_u32(a) >= DP_MAX_PORTS)
806 				return -EINVAL;
807 			break;
808 
810 		case OVS_ACTION_ATTR_POP_VLAN:
811 			break;
812 
813 		case OVS_ACTION_ATTR_PUSH_VLAN:
814 			vlan = nla_data(a);
815 			if (vlan->vlan_tpid != htons(ETH_P_8021Q))
816 				return -EINVAL;
817 			if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
818 				return -EINVAL;
819 			break;
820 
821 		case OVS_ACTION_ATTR_SET:
822 			err = validate_set(a, key, sfa, &skip_copy);
823 			if (err)
824 				return err;
825 			break;
826 
827 		case OVS_ACTION_ATTR_SAMPLE:
828 			err = validate_and_copy_sample(a, key, depth, sfa);
829 			if (err)
830 				return err;
831 			skip_copy = true;
832 			break;
833 
834 		default:
835 			return -EINVAL;
836 		}
837 		if (!skip_copy) {
838 			err = copy_action(a, sfa);
839 			if (err)
840 				return err;
841 		}
842 	}
843 
844 	if (rem > 0)
845 		return -EINVAL;
846 
847 	return 0;
848 }
849 
850 static void clear_stats(struct sw_flow *flow)
851 {
852 	flow->used = 0;
853 	flow->tcp_flags = 0;
854 	flow->packet_count = 0;
855 	flow->byte_count = 0;
856 }
857 
858 static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
859 {
860 	struct ovs_header *ovs_header = info->userhdr;
861 	struct nlattr **a = info->attrs;
862 	struct sw_flow_actions *acts;
863 	struct sk_buff *packet;
864 	struct sw_flow *flow;
865 	struct datapath *dp;
866 	struct ethhdr *eth;
867 	int len;
868 	int err;
869 
870 	err = -EINVAL;
871 	if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
872 	    !a[OVS_PACKET_ATTR_ACTIONS])
873 		goto err;
874 
875 	len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
876 	packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
877 	err = -ENOMEM;
878 	if (!packet)
879 		goto err;
880 	skb_reserve(packet, NET_IP_ALIGN);
881 
882 	nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);
883 
884 	skb_reset_mac_header(packet);
885 	eth = eth_hdr(packet);
886 
887 	/* Normally, setting the skb 'protocol' field would be handled by a
888 	 * call to eth_type_trans(), but it assumes there's a sending
889 	 * device, which we may not have. */
890 	if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN)
891 		packet->protocol = eth->h_proto;
892 	else
893 		packet->protocol = htons(ETH_P_802_2);
894 
895 	/* Build an sw_flow for sending this packet. */
896 	flow = ovs_flow_alloc();
897 	err = PTR_ERR(flow);
898 	if (IS_ERR(flow))
899 		goto err_kfree_skb;
900 
901 	err = ovs_flow_extract(packet, -1, &flow->key);
902 	if (err)
903 		goto err_flow_free;
904 
905 	err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]);
906 	if (err)
907 		goto err_flow_free;
908 	acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
909 	err = PTR_ERR(acts);
910 	if (IS_ERR(acts))
911 		goto err_flow_free;
912 
913 	err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts);
914 	rcu_assign_pointer(flow->sf_acts, acts);
915 	if (err)
916 		goto err_flow_free;
917 
918 	OVS_CB(packet)->flow = flow;
919 	OVS_CB(packet)->pkt_key = &flow->key;
920 	packet->priority = flow->key.phy.priority;
921 	packet->mark = flow->key.phy.skb_mark;
922 
923 	rcu_read_lock();
924 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
925 	err = -ENODEV;
926 	if (!dp)
927 		goto err_unlock;
928 
929 	local_bh_disable();
930 	err = ovs_execute_actions(dp, packet);
931 	local_bh_enable();
932 	rcu_read_unlock();
933 
934 	ovs_flow_free(flow, false);
935 	return err;
936 
937 err_unlock:
938 	rcu_read_unlock();
939 err_flow_free:
940 	ovs_flow_free(flow, false);
941 err_kfree_skb:
942 	kfree_skb(packet);
943 err:
944 	return err;
945 }
946 
947 static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
948 	[OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN },
949 	[OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
950 	[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
951 };
952 
953 static struct genl_ops dp_packet_genl_ops[] = {
954 	{ .cmd = OVS_PACKET_CMD_EXECUTE,
955 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
956 	  .policy = packet_policy,
957 	  .doit = ovs_packet_cmd_execute
958 	}
959 };
960 
961 static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
962 {
963 	struct flow_table *table;
964 	int i;
965 
966 	table = rcu_dereference_check(dp->table, lockdep_ovsl_is_held());
967 	stats->n_flows = ovs_flow_tbl_count(table);
968 
969 	stats->n_hit = stats->n_missed = stats->n_lost = 0;
970 	for_each_possible_cpu(i) {
971 		const struct dp_stats_percpu *percpu_stats;
972 		struct dp_stats_percpu local_stats;
973 		unsigned int start;
974 
975 		percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
976 
977 		do {
978 			start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
979 			local_stats = *percpu_stats;
980 		} while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
981 
982 		stats->n_hit += local_stats.n_hit;
983 		stats->n_missed += local_stats.n_missed;
984 		stats->n_lost += local_stats.n_lost;
985 	}
986 }
987 
988 static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
989 	[OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
990 	[OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
991 	[OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
992 };
993 
994 static struct genl_family dp_flow_genl_family = {
995 	.id = GENL_ID_GENERATE,
996 	.hdrsize = sizeof(struct ovs_header),
997 	.name = OVS_FLOW_FAMILY,
998 	.version = OVS_FLOW_VERSION,
999 	.maxattr = OVS_FLOW_ATTR_MAX,
1000 	.netnsok = true,
1001 	.parallel_ops = true,
1002 };
1003 
1004 static struct genl_multicast_group ovs_dp_flow_multicast_group = {
1005 	.name = OVS_FLOW_MCGROUP
1006 };
1007 
1008 static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb);
1009 static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
1010 {
1011 	const struct nlattr *a;
1012 	struct nlattr *start;
1013 	int err = 0, rem;
1014 
1015 	start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
1016 	if (!start)
1017 		return -EMSGSIZE;
1018 
1019 	nla_for_each_nested(a, attr, rem) {
1020 		int type = nla_type(a);
1021 		struct nlattr *st_sample;
1022 
1023 		switch (type) {
1024 		case OVS_SAMPLE_ATTR_PROBABILITY:
1025 			if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, sizeof(u32), nla_data(a)))
1026 				return -EMSGSIZE;
1027 			break;
1028 		case OVS_SAMPLE_ATTR_ACTIONS:
1029 			st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
1030 			if (!st_sample)
1031 				return -EMSGSIZE;
1032 			err = actions_to_attr(nla_data(a), nla_len(a), skb);
1033 			if (err)
1034 				return err;
1035 			nla_nest_end(skb, st_sample);
1036 			break;
1037 		}
1038 	}
1039 
1040 	nla_nest_end(skb, start);
1041 	return err;
1042 }
1043 
1044 static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
1045 {
1046 	const struct nlattr *ovs_key = nla_data(a);
1047 	int key_type = nla_type(ovs_key);
1048 	struct nlattr *start;
1049 	int err;
1050 
1051 	switch (key_type) {
1052 	case OVS_KEY_ATTR_IPV4_TUNNEL:
1053 		start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
1054 		if (!start)
1055 			return -EMSGSIZE;
1056 
1057 		err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
1058 					     nla_data(ovs_key));
1059 		if (err)
1060 			return err;
1061 		nla_nest_end(skb, start);
1062 		break;
1063 	default:
1064 		if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
1065 			return -EMSGSIZE;
1066 		break;
1067 	}
1068 
1069 	return 0;
1070 }
1071 
1072 static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb)
1073 {
1074 	const struct nlattr *a;
1075 	int rem, err;
1076 
1077 	nla_for_each_attr(a, attr, len, rem) {
1078 		int type = nla_type(a);
1079 
1080 		switch (type) {
1081 		case OVS_ACTION_ATTR_SET:
1082 			err = set_action_to_attr(a, skb);
1083 			if (err)
1084 				return err;
1085 			break;
1086 
1087 		case OVS_ACTION_ATTR_SAMPLE:
1088 			err = sample_action_to_attr(a, skb);
1089 			if (err)
1090 				return err;
1091 			break;
1092 		default:
1093 			if (nla_put(skb, type, nla_len(a), nla_data(a)))
1094 				return -EMSGSIZE;
1095 			break;
1096 		}
1097 	}
1098 
1099 	return 0;
1100 }
1101 
1102 static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
1103 {
1104 	return NLMSG_ALIGN(sizeof(struct ovs_header))
1105 		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */
1106 		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */
1107 		+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
1108 		+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
1109 		+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
1110 		+ nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */
1111 }
1112 
1113 /* Called with ovs_mutex. */
1114 static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
1115 				  struct sk_buff *skb, u32 portid,
1116 				  u32 seq, u32 flags, u8 cmd)
1117 {
1118 	const int skb_orig_len = skb->len;
1119 	struct nlattr *start;
1120 	struct ovs_flow_stats stats;
1121 	struct ovs_header *ovs_header;
1122 	struct nlattr *nla;
1123 	unsigned long used;
1124 	u8 tcp_flags;
1125 	int err;
1126 
1127 	ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd);
1128 	if (!ovs_header)
1129 		return -EMSGSIZE;
1130 
1131 	ovs_header->dp_ifindex = get_dpifindex(dp);
1132 
1133 	/* Fill flow key. */
1134 	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
1135 	if (!nla)
1136 		goto nla_put_failure;
1137 
1138 	err = ovs_flow_to_nlattrs(&flow->unmasked_key,
1139 			&flow->unmasked_key, skb);
1140 	if (err)
1141 		goto error;
1142 	nla_nest_end(skb, nla);
1143 
1144 	nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK);
1145 	if (!nla)
1146 		goto nla_put_failure;
1147 
1148 	err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb);
1149 	if (err)
1150 		goto error;
1151 
1152 	nla_nest_end(skb, nla);
1153 
1154 	spin_lock_bh(&flow->lock);
1155 	used = flow->used;
1156 	stats.n_packets = flow->packet_count;
1157 	stats.n_bytes = flow->byte_count;
1158 	tcp_flags = flow->tcp_flags;
1159 	spin_unlock_bh(&flow->lock);
1160 
1161 	if (used &&
1162 	    nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used)))
1163 		goto nla_put_failure;
1164 
1165 	if (stats.n_packets &&
1166 	    nla_put(skb, OVS_FLOW_ATTR_STATS,
1167 		    sizeof(struct ovs_flow_stats), &stats))
1168 		goto nla_put_failure;
1169 
1170 	if (tcp_flags &&
1171 	    nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags))
1172 		goto nla_put_failure;
1173 
1174 	/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
1175 	 * this is the first flow to be dumped into 'skb'.  This is unusual for
1176 	 * Netlink but individual action lists can be longer than
1177 	 * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
1178 	 * The userspace caller can always fetch the actions separately if it
1179 	 * really wants them.  (Most userspace callers in fact don't care.)
1180 	 *
1181 	 * This can only fail for dump operations because the skb is always
1182 	 * properly sized for single flows.
1183 	 */
1184 	start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
1185 	if (start) {
1186 		const struct sw_flow_actions *sf_acts;
1187 
1188 		sf_acts = rcu_dereference_check(flow->sf_acts,
1189 						lockdep_ovsl_is_held());
1190 
1191 		err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb);
1192 		if (!err)
1193 			nla_nest_end(skb, start);
1194 		else {
1195 			if (skb_orig_len)
1196 				goto error;
1197 
1198 			nla_nest_cancel(skb, start);
1199 		}
1200 	} else if (skb_orig_len)
1201 		goto nla_put_failure;
1202 
1203 	return genlmsg_end(skb, ovs_header);
1204 
1205 nla_put_failure:
1206 	err = -EMSGSIZE;
1207 error:
1208 	genlmsg_cancel(skb, ovs_header);
1209 	return err;
1210 }
1211 
1212 static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow)
1213 {
1214 	const struct sw_flow_actions *sf_acts;
1215 
1216 	sf_acts = ovsl_dereference(flow->sf_acts);
1217 
1218 	return genlmsg_new(ovs_flow_cmd_msg_size(sf_acts), GFP_KERNEL);
1219 }
1220 
1221 static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
1222 					       struct datapath *dp,
1223 					       u32 portid, u32 seq, u8 cmd)
1224 {
1225 	struct sk_buff *skb;
1226 	int retval;
1227 
1228 	skb = ovs_flow_cmd_alloc_info(flow);
1229 	if (!skb)
1230 		return ERR_PTR(-ENOMEM);
1231 
1232 	retval = ovs_flow_cmd_fill_info(flow, dp, skb, portid, seq, 0, cmd);
1233 	BUG_ON(retval < 0);
1234 	return skb;
1235 }
1236 
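/* Handler for both OVS_FLOW_CMD_NEW and OVS_FLOW_CMD_SET.  The key, optional
 * mask and actions are validated first; then, under ovs_mutex, either a new
 * masked flow is inserted into the flow table (expanding the table if
 * needed) or the existing flow's action list is swapped in, and the
 * resulting flow is broadcast on the flow multicast group.
 */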
1237 static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
1238 {
1239 	struct nlattr **a = info->attrs;
1240 	struct ovs_header *ovs_header = info->userhdr;
1241 	struct sw_flow_key key, masked_key;
1242 	struct sw_flow *flow = NULL;
1243 	struct sw_flow_mask mask;
1244 	struct sk_buff *reply;
1245 	struct datapath *dp;
1246 	struct flow_table *table;
1247 	struct sw_flow_actions *acts = NULL;
1248 	struct sw_flow_match match;
1249 	int error;
1250 
1251 	/* Extract key. */
1252 	error = -EINVAL;
1253 	if (!a[OVS_FLOW_ATTR_KEY])
1254 		goto error;
1255 
1256 	ovs_match_init(&match, &key, &mask);
1257 	error = ovs_match_from_nlattrs(&match,
1258 			a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
1259 	if (error)
1260 		goto error;
1261 
1262 	/* Validate actions. */
1263 	if (a[OVS_FLOW_ATTR_ACTIONS]) {
1264 		acts = ovs_flow_actions_alloc(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
1265 		error = PTR_ERR(acts);
1266 		if (IS_ERR(acts))
1267 			goto error;
1268 
1269 		ovs_flow_key_mask(&masked_key, &key, &mask);
1270 		error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
1271 						  &masked_key, 0, &acts);
1272 		if (error) {
1273 			OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
1274 			goto err_kfree;
1275 		}
1276 	} else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
1277 		error = -EINVAL;
1278 		goto error;
1279 	}
1280 
1281 	ovs_lock();
1282 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
1283 	error = -ENODEV;
1284 	if (!dp)
1285 		goto err_unlock_ovs;
1286 
1287 	table = ovsl_dereference(dp->table);
1288 
1289 	/* Check if this is a duplicate flow */
1290 	flow = ovs_flow_lookup(table, &key);
1291 	if (!flow) {
1292 		struct sw_flow_mask *mask_p;
1293 		/* Bail out if we're not allowed to create a new flow. */
1294 		error = -ENOENT;
1295 		if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
1296 			goto err_unlock_ovs;
1297 
1298 		/* Expand table, if necessary, to make room. */
1299 		if (ovs_flow_tbl_need_to_expand(table)) {
1300 			struct flow_table *new_table;
1301 
1302 			new_table = ovs_flow_tbl_expand(table);
1303 			if (!IS_ERR(new_table)) {
1304 				rcu_assign_pointer(dp->table, new_table);
1305 				ovs_flow_tbl_destroy(table, true);
1306 				table = ovsl_dereference(dp->table);
1307 			}
1308 		}
1309 
1310 		/* Allocate flow. */
1311 		flow = ovs_flow_alloc();
1312 		if (IS_ERR(flow)) {
1313 			error = PTR_ERR(flow);
1314 			goto err_unlock_ovs;
1315 		}
1316 		clear_stats(flow);
1317 
1318 		flow->key = masked_key;
1319 		flow->unmasked_key = key;
1320 
1321 		/* Make sure mask is unique in the system */
1322 		mask_p = ovs_sw_flow_mask_find(table, &mask);
1323 		if (!mask_p) {
1324 			/* Allocate a new mask if none exists. */
1325 			mask_p = ovs_sw_flow_mask_alloc();
1326 			if (!mask_p)
1327 				goto err_flow_free;
1328 			mask_p->key = mask.key;
1329 			mask_p->range = mask.range;
1330 			ovs_sw_flow_mask_insert(table, mask_p);
1331 		}
1332 
1333 		ovs_sw_flow_mask_add_ref(mask_p);
1334 		flow->mask = mask_p;
1335 		rcu_assign_pointer(flow->sf_acts, acts);
1336 
1337 		/* Put flow in bucket. */
1338 		ovs_flow_insert(table, flow);
1339 
1340 		reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
1341 						info->snd_seq, OVS_FLOW_CMD_NEW);
1342 	} else {
1343 		/* We found a matching flow. */
1344 		struct sw_flow_actions *old_acts;
1345 
1346 		/* Bail out if we're not allowed to modify an existing flow.
1347 		 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
1348 		 * because Generic Netlink treats the latter as a dump
1349 		 * request.  We also accept NLM_F_EXCL in case that bug ever
1350 		 * gets fixed.
1351 		 */
1352 		error = -EEXIST;
1353 		if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW &&
1354 		    info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
1355 			goto err_unlock_ovs;
1356 
1357 		/* The unmasked key has to be the same for flow updates. */
1358 		error = -EINVAL;
1359 		if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) {
1360 			OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n");
1361 			goto err_unlock_ovs;
1362 		}
1363 
1364 		/* Update actions. */
1365 		old_acts = ovsl_dereference(flow->sf_acts);
1366 		rcu_assign_pointer(flow->sf_acts, acts);
1367 		ovs_flow_deferred_free_acts(old_acts);
1368 
1369 		reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
1370 					       info->snd_seq, OVS_FLOW_CMD_NEW);
1371 
1372 		/* Clear stats. */
1373 		if (a[OVS_FLOW_ATTR_CLEAR]) {
1374 			spin_lock_bh(&flow->lock);
1375 			clear_stats(flow);
1376 			spin_unlock_bh(&flow->lock);
1377 		}
1378 	}
1379 	ovs_unlock();
1380 
1381 	if (!IS_ERR(reply))
1382 		ovs_notify(reply, info, &ovs_dp_flow_multicast_group);
1383 	else
1384 		netlink_set_err(sock_net(skb->sk)->genl_sock, 0,
1385 				ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
1386 	return 0;
1387 
1388 err_flow_free:
1389 	ovs_flow_free(flow, false);
1390 err_unlock_ovs:
1391 	ovs_unlock();
1392 err_kfree:
1393 	kfree(acts);
1394 error:
1395 	return error;
1396 }
1397 
1398 static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
1399 {
1400 	struct nlattr **a = info->attrs;
1401 	struct ovs_header *ovs_header = info->userhdr;
1402 	struct sw_flow_key key;
1403 	struct sk_buff *reply;
1404 	struct sw_flow *flow;
1405 	struct datapath *dp;
1406 	struct flow_table *table;
1407 	struct sw_flow_match match;
1408 	int err;
1409 
1410 	if (!a[OVS_FLOW_ATTR_KEY]) {
1411 		OVS_NLERR("Flow get message rejected, Key attribute missing.\n");
1412 		return -EINVAL;
1413 	}
1414 
1415 	ovs_match_init(&match, &key, NULL);
1416 	err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
1417 	if (err)
1418 		return err;
1419 
1420 	ovs_lock();
1421 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
1422 	if (!dp) {
1423 		err = -ENODEV;
1424 		goto unlock;
1425 	}
1426 
1427 	table = ovsl_dereference(dp->table);
1428 	flow = ovs_flow_lookup_unmasked_key(table, &match);
1429 	if (!flow) {
1430 		err = -ENOENT;
1431 		goto unlock;
1432 	}
1433 
1434 	reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
1435 					info->snd_seq, OVS_FLOW_CMD_NEW);
1436 	if (IS_ERR(reply)) {
1437 		err = PTR_ERR(reply);
1438 		goto unlock;
1439 	}
1440 
1441 	ovs_unlock();
1442 	return genlmsg_reply(reply, info);
1443 unlock:
1444 	ovs_unlock();
1445 	return err;
1446 }
1447 
1448 static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
1449 {
1450 	struct nlattr **a = info->attrs;
1451 	struct ovs_header *ovs_header = info->userhdr;
1452 	struct sw_flow_key key;
1453 	struct sk_buff *reply;
1454 	struct sw_flow *flow;
1455 	struct datapath *dp;
1456 	struct flow_table *table;
1457 	struct sw_flow_match match;
1458 	int err;
1459 
1460 	ovs_lock();
1461 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
1462 	if (!dp) {
1463 		err = -ENODEV;
1464 		goto unlock;
1465 	}
1466 
1467 	if (!a[OVS_FLOW_ATTR_KEY]) {
1468 		err = flush_flows(dp);
1469 		goto unlock;
1470 	}
1471 
1472 	ovs_match_init(&match, &key, NULL);
1473 	err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
1474 	if (err)
1475 		goto unlock;
1476 
1477 	table = ovsl_dereference(dp->table);
1478 	flow = ovs_flow_lookup_unmasked_key(table, &match);
1479 	if (!flow) {
1480 		err = -ENOENT;
1481 		goto unlock;
1482 	}
1483 
1484 	reply = ovs_flow_cmd_alloc_info(flow);
1485 	if (!reply) {
1486 		err = -ENOMEM;
1487 		goto unlock;
1488 	}
1489 
1490 	ovs_flow_remove(table, flow);
1491 
1492 	err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid,
1493 				     info->snd_seq, 0, OVS_FLOW_CMD_DEL);
1494 	BUG_ON(err < 0);
1495 
1496 	ovs_flow_free(flow, true);
1497 	ovs_unlock();
1498 
1499 	ovs_notify(reply, info, &ovs_dp_flow_multicast_group);
1500 	return 0;
1501 unlock:
1502 	ovs_unlock();
1503 	return err;
1504 }
1505 
1506 static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1507 {
1508 	struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
1509 	struct datapath *dp;
1510 	struct flow_table *table;
1511 
1512 	rcu_read_lock();
1513 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
1514 	if (!dp) {
1515 		rcu_read_unlock();
1516 		return -ENODEV;
1517 	}
1518 
1519 	table = rcu_dereference(dp->table);
1520 	for (;;) {
1521 		struct sw_flow *flow;
1522 		u32 bucket, obj;
1523 
1524 		bucket = cb->args[0];
1525 		obj = cb->args[1];
1526 		flow = ovs_flow_dump_next(table, &bucket, &obj);
1527 		if (!flow)
1528 			break;
1529 
1530 		if (ovs_flow_cmd_fill_info(flow, dp, skb,
1531 					   NETLINK_CB(cb->skb).portid,
1532 					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1533 					   OVS_FLOW_CMD_NEW) < 0)
1534 			break;
1535 
1536 		cb->args[0] = bucket;
1537 		cb->args[1] = obj;
1538 	}
1539 	rcu_read_unlock();
1540 	return skb->len;
1541 }
1542 
1543 static struct genl_ops dp_flow_genl_ops[] = {
1544 	{ .cmd = OVS_FLOW_CMD_NEW,
1545 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1546 	  .policy = flow_policy,
1547 	  .doit = ovs_flow_cmd_new_or_set
1548 	},
1549 	{ .cmd = OVS_FLOW_CMD_DEL,
1550 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1551 	  .policy = flow_policy,
1552 	  .doit = ovs_flow_cmd_del
1553 	},
1554 	{ .cmd = OVS_FLOW_CMD_GET,
1555 	  .flags = 0,		    /* OK for unprivileged users. */
1556 	  .policy = flow_policy,
1557 	  .doit = ovs_flow_cmd_get,
1558 	  .dumpit = ovs_flow_cmd_dump
1559 	},
1560 	{ .cmd = OVS_FLOW_CMD_SET,
1561 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1562 	  .policy = flow_policy,
1563 	  .doit = ovs_flow_cmd_new_or_set,
1564 	},
1565 };
1566 
1567 static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
1568 	[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
1569 	[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
1570 };
1571 
1572 static struct genl_family dp_datapath_genl_family = {
1573 	.id = GENL_ID_GENERATE,
1574 	.hdrsize = sizeof(struct ovs_header),
1575 	.name = OVS_DATAPATH_FAMILY,
1576 	.version = OVS_DATAPATH_VERSION,
1577 	.maxattr = OVS_DP_ATTR_MAX,
1578 	.netnsok = true,
1579 	.parallel_ops = true,
1580 };
1581 
1582 static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
1583 	.name = OVS_DATAPATH_MCGROUP
1584 };
1585 
1586 static size_t ovs_dp_cmd_msg_size(void)
1587 {
1588 	size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header));
1589 
1590 	msgsize += nla_total_size(IFNAMSIZ);
1591 	msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
1592 
1593 	return msgsize;
1594 }
1595 
1596 static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
1597 				u32 portid, u32 seq, u32 flags, u8 cmd)
1598 {
1599 	struct ovs_header *ovs_header;
1600 	struct ovs_dp_stats dp_stats;
1601 	int err;
1602 
1603 	ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
1604 				   flags, cmd);
1605 	if (!ovs_header)
1606 		goto error;
1607 
1608 	ovs_header->dp_ifindex = get_dpifindex(dp);
1609 
1610 	rcu_read_lock();
1611 	err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
1612 	rcu_read_unlock();
1613 	if (err)
1614 		goto nla_put_failure;
1615 
1616 	get_dp_stats(dp, &dp_stats);
1617 	if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats))
1618 		goto nla_put_failure;
1619 
1620 	return genlmsg_end(skb, ovs_header);
1621 
1622 nla_put_failure:
1623 	genlmsg_cancel(skb, ovs_header);
1624 error:
1625 	return -EMSGSIZE;
1626 }
1627 
1628 static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 portid,
1629 					     u32 seq, u8 cmd)
1630 {
1631 	struct sk_buff *skb;
1632 	int retval;
1633 
1634 	skb = genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL);
1635 	if (!skb)
1636 		return ERR_PTR(-ENOMEM);
1637 
1638 	retval = ovs_dp_cmd_fill_info(dp, skb, portid, seq, 0, cmd);
1639 	if (retval < 0) {
1640 		kfree_skb(skb);
1641 		return ERR_PTR(retval);
1642 	}
1643 	return skb;
1644 }
1645 
1646 /* Called with ovs_mutex. */
1647 static struct datapath *lookup_datapath(struct net *net,
1648 					struct ovs_header *ovs_header,
1649 					struct nlattr *a[OVS_DP_ATTR_MAX + 1])
1650 {
1651 	struct datapath *dp;
1652 
1653 	if (!a[OVS_DP_ATTR_NAME])
1654 		dp = get_dp(net, ovs_header->dp_ifindex);
1655 	else {
1656 		struct vport *vport;
1657 
1658 		rcu_read_lock();
1659 		vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME]));
1660 		dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
1661 		rcu_read_unlock();
1662 	}
1663 	return dp ? dp : ERR_PTR(-ENODEV);
1664 }
1665 
1666 static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
1667 {
1668 	struct nlattr **a = info->attrs;
1669 	struct vport_parms parms;
1670 	struct sk_buff *reply;
1671 	struct datapath *dp;
1672 	struct vport *vport;
1673 	struct ovs_net *ovs_net;
1674 	int err, i;
1675 
1676 	err = -EINVAL;
1677 	if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
1678 		goto err;
1679 
1680 	ovs_lock();
1681 
1682 	err = -ENOMEM;
1683 	dp = kzalloc(sizeof(*dp), GFP_KERNEL);
1684 	if (dp == NULL)
1685 		goto err_unlock_ovs;
1686 
1687 	ovs_dp_set_net(dp, hold_net(sock_net(skb->sk)));
1688 
1689 	/* Allocate table. */
1690 	err = -ENOMEM;
1691 	rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS));
1692 	if (!dp->table)
1693 		goto err_free_dp;
1694 
1695 	dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
1696 	if (!dp->stats_percpu) {
1697 		err = -ENOMEM;
1698 		goto err_destroy_table;
1699 	}
1700 
1701 	dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
1702 			GFP_KERNEL);
1703 	if (!dp->ports) {
1704 		err = -ENOMEM;
1705 		goto err_destroy_percpu;
1706 	}
1707 
1708 	for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
1709 		INIT_HLIST_HEAD(&dp->ports[i]);
1710 
1711 	/* Set up our datapath device. */
1712 	parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
1713 	parms.type = OVS_VPORT_TYPE_INTERNAL;
1714 	parms.options = NULL;
1715 	parms.dp = dp;
1716 	parms.port_no = OVSP_LOCAL;
1717 	parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);
1718 
1719 	vport = new_vport(&parms);
1720 	if (IS_ERR(vport)) {
1721 		err = PTR_ERR(vport);
1722 		if (err == -EBUSY)
1723 			err = -EEXIST;
1724 
1725 		goto err_destroy_ports_array;
1726 	}
1727 
1728 	reply = ovs_dp_cmd_build_info(dp, info->snd_portid,
1729 				      info->snd_seq, OVS_DP_CMD_NEW);
1730 	err = PTR_ERR(reply);
1731 	if (IS_ERR(reply))
1732 		goto err_destroy_local_port;
1733 
1734 	ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
1735 	list_add_tail_rcu(&dp->list_node, &ovs_net->dps);
1736 
1737 	ovs_unlock();
1738 
1739 	ovs_notify(reply, info, &ovs_dp_datapath_multicast_group);
1740 	return 0;
1741 
1742 err_destroy_local_port:
1743 	ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
1744 err_destroy_ports_array:
1745 	kfree(dp->ports);
1746 err_destroy_percpu:
1747 	free_percpu(dp->stats_percpu);
1748 err_destroy_table:
1749 	ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false);
1750 err_free_dp:
1751 	release_net(ovs_dp_get_net(dp));
1752 	kfree(dp);
1753 err_unlock_ovs:
1754 	ovs_unlock();
1755 err:
1756 	return err;
1757 }
1758 
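/* Tear down a datapath: detach every vport (the OVSP_LOCAL internal port
 * last), unlink the datapath from its per-namespace list and defer the final
 * free until after an RCU grace period.
 */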
1759 /* Called with ovs_mutex. */
1760 static void __dp_destroy(struct datapath *dp)
1761 {
1762 	int i;
1763 
1764 	for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
1765 		struct vport *vport;
1766 		struct hlist_node *n;
1767 
1768 		hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node)
1769 			if (vport->port_no != OVSP_LOCAL)
1770 				ovs_dp_detach_port(vport);
1771 	}
1772 
1773 	list_del_rcu(&dp->list_node);
1774 
1775 	/* OVSP_LOCAL is the datapath's internal port.  Make sure every port in
1776 	 * the datapath has been destroyed before the datapath itself is freed.
1777 	 */
1778 	ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
1779 
1780 	call_rcu(&dp->rcu, destroy_dp_rcu);
1781 }
1782 
1783 static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
1784 {
1785 	struct sk_buff *reply;
1786 	struct datapath *dp;
1787 	int err;
1788 
1789 	ovs_lock();
1790 	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1791 	err = PTR_ERR(dp);
1792 	if (IS_ERR(dp))
1793 		goto unlock;
1794 
1795 	reply = ovs_dp_cmd_build_info(dp, info->snd_portid,
1796 				      info->snd_seq, OVS_DP_CMD_DEL);
1797 	err = PTR_ERR(reply);
1798 	if (IS_ERR(reply))
1799 		goto unlock;
1800 
1801 	__dp_destroy(dp);
1802 	ovs_unlock();
1803 
1804 	ovs_notify(reply, info, &ovs_dp_datapath_multicast_group);
1805 
1806 	return 0;
1807 unlock:
1808 	ovs_unlock();
1809 	return err;
1810 }
1811 
1812 static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
1813 {
1814 	struct sk_buff *reply;
1815 	struct datapath *dp;
1816 	int err;
1817 
1818 	ovs_lock();
1819 	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1820 	err = PTR_ERR(dp);
1821 	if (IS_ERR(dp))
1822 		goto unlock;
1823 
1824 	reply = ovs_dp_cmd_build_info(dp, info->snd_portid,
1825 				      info->snd_seq, OVS_DP_CMD_NEW);
1826 	if (IS_ERR(reply)) {
1827 		err = PTR_ERR(reply);
1828 		netlink_set_err(sock_net(skb->sk)->genl_sock, 0,
1829 				ovs_dp_datapath_multicast_group.id, err);
1830 		err = 0;
1831 		goto unlock;
1832 	}
1833 
1834 	ovs_unlock();
1835 	ovs_notify(reply, info, &ovs_dp_datapath_multicast_group);
1836 
1837 	return 0;
1838 unlock:
1839 	ovs_unlock();
1840 	return err;
1841 }
1842 
1843 static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
1844 {
1845 	struct sk_buff *reply;
1846 	struct datapath *dp;
1847 	int err;
1848 
1849 	ovs_lock();
1850 	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
1851 	if (IS_ERR(dp)) {
1852 		err = PTR_ERR(dp);
1853 		goto unlock;
1854 	}
1855 
1856 	reply = ovs_dp_cmd_build_info(dp, info->snd_portid,
1857 				      info->snd_seq, OVS_DP_CMD_NEW);
1858 	if (IS_ERR(reply)) {
1859 		err = PTR_ERR(reply);
1860 		goto unlock;
1861 	}
1862 
1863 	ovs_unlock();
1864 	return genlmsg_reply(reply, info);
1865 
1866 unlock:
1867 	ovs_unlock();
1868 	return err;
1869 }
1870 
1871 static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1872 {
1873 	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
1874 	struct datapath *dp;
1875 	int skip = cb->args[0];
1876 	int i = 0;
1877 
1878 	rcu_read_lock();
1879 	list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) {
1880 		if (i >= skip &&
1881 		    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
1882 					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1883 					 OVS_DP_CMD_NEW) < 0)
1884 			break;
1885 		i++;
1886 	}
1887 	rcu_read_unlock();
1888 
1889 	cb->args[0] = i;
1890 
1891 	return skb->len;
1892 }
1893 
1894 static struct genl_ops dp_datapath_genl_ops[] = {
1895 	{ .cmd = OVS_DP_CMD_NEW,
1896 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1897 	  .policy = datapath_policy,
1898 	  .doit = ovs_dp_cmd_new
1899 	},
1900 	{ .cmd = OVS_DP_CMD_DEL,
1901 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1902 	  .policy = datapath_policy,
1903 	  .doit = ovs_dp_cmd_del
1904 	},
1905 	{ .cmd = OVS_DP_CMD_GET,
1906 	  .flags = 0,		    /* OK for unprivileged users. */
1907 	  .policy = datapath_policy,
1908 	  .doit = ovs_dp_cmd_get,
1909 	  .dumpit = ovs_dp_cmd_dump
1910 	},
1911 	{ .cmd = OVS_DP_CMD_SET,
1912 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1913 	  .policy = datapath_policy,
1914 	  .doit = ovs_dp_cmd_set,
1915 	},
1916 };
1917 
1918 static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
1919 	[OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
1920 	[OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
1921 	[OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
1922 	[OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
1923 	[OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
1924 	[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
1925 };
1926 
1927 static struct genl_family dp_vport_genl_family = {
1928 	.id = GENL_ID_GENERATE,
1929 	.hdrsize = sizeof(struct ovs_header),
1930 	.name = OVS_VPORT_FAMILY,
1931 	.version = OVS_VPORT_VERSION,
1932 	.maxattr = OVS_VPORT_ATTR_MAX,
1933 	.netnsok = true,
1934 	.parallel_ops = true,
1935 };
1936 
1937 struct genl_multicast_group ovs_dp_vport_multicast_group = {
1938 	.name = OVS_VPORT_MCGROUP
1939 };
1940 
1941 /* Called with ovs_mutex or RCU read lock. */
1942 static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
1943 				   u32 portid, u32 seq, u32 flags, u8 cmd)
1944 {
1945 	struct ovs_header *ovs_header;
1946 	struct ovs_vport_stats vport_stats;
1947 	int err;
1948 
1949 	ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family,
1950 				 flags, cmd);
1951 	if (!ovs_header)
1952 		return -EMSGSIZE;
1953 
1954 	ovs_header->dp_ifindex = get_dpifindex(vport->dp);
1955 
1956 	if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
1957 	    nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
1958 	    nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)) ||
1959 	    nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_portid))
1960 		goto nla_put_failure;
1961 
1962 	ovs_vport_get_stats(vport, &vport_stats);
1963 	if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
1964 		    &vport_stats))
1965 		goto nla_put_failure;
1966 
1967 	err = ovs_vport_get_options(vport, skb);
1968 	if (err == -EMSGSIZE)
1969 		goto error;
1970 
1971 	return genlmsg_end(skb, ovs_header);
1972 
1973 nla_put_failure:
1974 	err = -EMSGSIZE;
1975 error:
1976 	genlmsg_cancel(skb, ovs_header);
1977 	return err;
1978 }
1979 
1980 /* Called with ovs_mutex or RCU read lock. */
1981 struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid,
1982 					 u32 seq, u8 cmd)
1983 {
1984 	struct sk_buff *skb;
1985 	int retval;
1986 
1987 	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
1988 	if (!skb)
1989 		return ERR_PTR(-ENOMEM);
1990 
1991 	retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd);
1992 	BUG_ON(retval < 0);
1993 
1994 	return skb;
1995 }
1996 
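/* Find a vport either by name (OVS_VPORT_ATTR_NAME) or by datapath
 * ifindex plus port number (OVS_VPORT_ATTR_PORT_NO).
 */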
1997 /* Called with ovs_mutex or RCU read lock. */
1998 static struct vport *lookup_vport(struct net *net,
1999 				  struct ovs_header *ovs_header,
2000 				  struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
2001 {
2002 	struct datapath *dp;
2003 	struct vport *vport;
2004 
2005 	if (a[OVS_VPORT_ATTR_NAME]) {
2006 		vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME]));
2007 		if (!vport)
2008 			return ERR_PTR(-ENODEV);
2009 		if (ovs_header->dp_ifindex &&
2010 		    ovs_header->dp_ifindex != get_dpifindex(vport->dp))
2011 			return ERR_PTR(-ENODEV);
2012 		return vport;
2013 	} else if (a[OVS_VPORT_ATTR_PORT_NO]) {
2014 		u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
2015 
2016 		if (port_no >= DP_MAX_PORTS)
2017 			return ERR_PTR(-EFBIG);
2018 
2019 		dp = get_dp(net, ovs_header->dp_ifindex);
2020 		if (!dp)
2021 			return ERR_PTR(-ENODEV);
2022 
2023 		vport = ovs_vport_ovsl_rcu(dp, port_no);
2024 		if (!vport)
2025 			return ERR_PTR(-ENODEV);
2026 		return vport;
2027 	} else
2028 		return ERR_PTR(-EINVAL);
2029 }
2030 
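/* Create a new vport on an existing datapath.  If OVS_VPORT_ATTR_PORT_NO
 * is omitted, the lowest free port number (starting from 1) is chosen.
 */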
2031 static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
2032 {
2033 	struct nlattr **a = info->attrs;
2034 	struct ovs_header *ovs_header = info->userhdr;
2035 	struct vport_parms parms;
2036 	struct sk_buff *reply;
2037 	struct vport *vport;
2038 	struct datapath *dp;
2039 	u32 port_no;
2040 	int err;
2041 
2042 	err = -EINVAL;
2043 	if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
2044 	    !a[OVS_VPORT_ATTR_UPCALL_PID])
2045 		goto exit;
2046 
2047 	ovs_lock();
2048 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
2049 	err = -ENODEV;
2050 	if (!dp)
2051 		goto exit_unlock;
2052 
2053 	if (a[OVS_VPORT_ATTR_PORT_NO]) {
2054 		port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
2055 
2056 		err = -EFBIG;
2057 		if (port_no >= DP_MAX_PORTS)
2058 			goto exit_unlock;
2059 
2060 		vport = ovs_vport_ovsl(dp, port_no);
2061 		err = -EBUSY;
2062 		if (vport)
2063 			goto exit_unlock;
2064 	} else {
2065 		for (port_no = 1; ; port_no++) {
2066 			if (port_no >= DP_MAX_PORTS) {
2067 				err = -EFBIG;
2068 				goto exit_unlock;
2069 			}
2070 			vport = ovs_vport_ovsl(dp, port_no);
2071 			if (!vport)
2072 				break;
2073 		}
2074 	}
2075 
2076 	parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
2077 	parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
2078 	parms.options = a[OVS_VPORT_ATTR_OPTIONS];
2079 	parms.dp = dp;
2080 	parms.port_no = port_no;
2081 	parms.upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
2082 
2083 	vport = new_vport(&parms);
2084 	err = PTR_ERR(vport);
2085 	if (IS_ERR(vport))
2086 		goto exit_unlock;
2087 
2088 	err = 0;
2089 	reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq,
2090 					 OVS_VPORT_CMD_NEW);
2091 	if (IS_ERR(reply)) {
2092 		err = PTR_ERR(reply);
2093 		ovs_dp_detach_port(vport);
2094 		goto exit_unlock;
2095 	}
2096 
2097 	ovs_notify(reply, info, &ovs_dp_vport_multicast_group);
2098 
2099 exit_unlock:
2100 	ovs_unlock();
2101 exit:
2102 	return err;
2103 }
2104 
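/* Update an existing vport's type-specific options and/or upcall portid.
 * Changing the vport type itself is rejected with -EINVAL.
 */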
2105 static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
2106 {
2107 	struct nlattr **a = info->attrs;
2108 	struct sk_buff *reply;
2109 	struct vport *vport;
2110 	int err;
2111 
2112 	ovs_lock();
2113 	vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
2114 	err = PTR_ERR(vport);
2115 	if (IS_ERR(vport))
2116 		goto exit_unlock;
2117 
2118 	if (a[OVS_VPORT_ATTR_TYPE] &&
2119 	    nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) {
2120 		err = -EINVAL;
2121 		goto exit_unlock;
2122 	}
2123 
2124 	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
2125 	if (!reply) {
2126 		err = -ENOMEM;
2127 		goto exit_unlock;
2128 	}
2129 
2130 	if (a[OVS_VPORT_ATTR_OPTIONS]) {
2131 		err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
2132 		if (err)
2133 			goto exit_free;
2134 	}
2135 
2136 	if (a[OVS_VPORT_ATTR_UPCALL_PID])
2137 		vport->upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
2138 
2139 	err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2140 				      info->snd_seq, 0, OVS_VPORT_CMD_NEW);
2141 	BUG_ON(err < 0);
2142 
2143 	ovs_unlock();
2144 	ovs_notify(reply, info, &ovs_dp_vport_multicast_group);
2145 	return 0;
2146 
2147 exit_free:
2148 	kfree_skb(reply);
2149 exit_unlock:
2150 	ovs_unlock();
2151 	return err;
2152 }
2153 
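/* Detach and destroy a vport.  Removing the local port (OVSP_LOCAL) is
 * rejected with -EINVAL.
 */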
2154 static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
2155 {
2156 	struct nlattr **a = info->attrs;
2157 	struct sk_buff *reply;
2158 	struct vport *vport;
2159 	int err;
2160 
2161 	ovs_lock();
2162 	vport = lookup_vport(sock_net(skb->sk), info->userhdr, a);
2163 	err = PTR_ERR(vport);
2164 	if (IS_ERR(vport))
2165 		goto exit_unlock;
2166 
2167 	if (vport->port_no == OVSP_LOCAL) {
2168 		err = -EINVAL;
2169 		goto exit_unlock;
2170 	}
2171 
2172 	reply = ovs_vport_cmd_build_info(vport, info->snd_portid,
2173 					 info->snd_seq, OVS_VPORT_CMD_DEL);
2174 	err = PTR_ERR(reply);
2175 	if (IS_ERR(reply))
2176 		goto exit_unlock;
2177 
2178 	err = 0;
2179 	ovs_dp_detach_port(vport);
2180 
2181 	ovs_notify(reply, info, &ovs_dp_vport_multicast_group);
2182 
2183 exit_unlock:
2184 	ovs_unlock();
2185 	return err;
2186 }
2187 
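/* Reply with the configuration and statistics of a single vport, looked
 * up by name or port number.  Runs entirely under the RCU read lock; no
 * ovs_lock() is taken.
 */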
2188 static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
2189 {
2190 	struct nlattr **a = info->attrs;
2191 	struct ovs_header *ovs_header = info->userhdr;
2192 	struct sk_buff *reply;
2193 	struct vport *vport;
2194 	int err;
2195 
2196 	rcu_read_lock();
2197 	vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
2198 	err = PTR_ERR(vport);
2199 	if (IS_ERR(vport))
2200 		goto exit_unlock;
2201 
2202 	reply = ovs_vport_cmd_build_info(vport, info->snd_portid,
2203 					 info->snd_seq, OVS_VPORT_CMD_NEW);
2204 	err = PTR_ERR(reply);
2205 	if (IS_ERR(reply))
2206 		goto exit_unlock;
2207 
2208 	rcu_read_unlock();
2209 
2210 	return genlmsg_reply(reply, info);
2211 
2212 exit_unlock:
2213 	rcu_read_unlock();
2214 	return err;
2215 }
2216 
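/* Dump all vports of one datapath.  cb->args[0] holds the current hash
 * bucket and cb->args[1] the offset within it, so the dump can resume
 * across multiple netlink messages.
 */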
2217 static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
2218 {
2219 	struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
2220 	struct datapath *dp;
2221 	int bucket = cb->args[0], skip = cb->args[1];
2222 	int i, j = 0;
2223 
2224 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
2225 	if (!dp)
2226 		return -ENODEV;
2227 
2228 	rcu_read_lock();
2229 	for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) {
2230 		struct vport *vport;
2231 
2232 		j = 0;
2233 		hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
2234 			if (j >= skip &&
2235 			    ovs_vport_cmd_fill_info(vport, skb,
2236 						    NETLINK_CB(cb->skb).portid,
2237 						    cb->nlh->nlmsg_seq,
2238 						    NLM_F_MULTI,
2239 						    OVS_VPORT_CMD_NEW) < 0)
2240 				goto out;
2241 
2242 			j++;
2243 		}
2244 		skip = 0;
2245 	}
2246 out:
2247 	rcu_read_unlock();
2248 
2249 	cb->args[0] = i;
2250 	cb->args[1] = j;
2251 
2252 	return skb->len;
2253 }
2254 
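/* Genetlink operations for the vport family; only OVS_VPORT_CMD_GET is
 * available to unprivileged users.
 */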
2255 static struct genl_ops dp_vport_genl_ops[] = {
2256 	{ .cmd = OVS_VPORT_CMD_NEW,
2257 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2258 	  .policy = vport_policy,
2259 	  .doit = ovs_vport_cmd_new
2260 	},
2261 	{ .cmd = OVS_VPORT_CMD_DEL,
2262 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2263 	  .policy = vport_policy,
2264 	  .doit = ovs_vport_cmd_del
2265 	},
2266 	{ .cmd = OVS_VPORT_CMD_GET,
2267 	  .flags = 0,		    /* OK for unprivileged users. */
2268 	  .policy = vport_policy,
2269 	  .doit = ovs_vport_cmd_get,
2270 	  .dumpit = ovs_vport_cmd_dump
2271 	},
2272 	{ .cmd = OVS_VPORT_CMD_SET,
2273 	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2274 	  .policy = vport_policy,
2275 	  .doit = ovs_vport_cmd_set,
2276 	},
2277 };
2278 
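/* Bundles a genetlink family with its operations and optional multicast
 * group so that all of them can be registered and torn down together.
 */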
2279 struct genl_family_and_ops {
2280 	struct genl_family *family;
2281 	struct genl_ops *ops;
2282 	int n_ops;
2283 	struct genl_multicast_group *group;
2284 };
2285 
2286 static const struct genl_family_and_ops dp_genl_families[] = {
2287 	{ &dp_datapath_genl_family,
2288 	  dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops),
2289 	  &ovs_dp_datapath_multicast_group },
2290 	{ &dp_vport_genl_family,
2291 	  dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops),
2292 	  &ovs_dp_vport_multicast_group },
2293 	{ &dp_flow_genl_family,
2294 	  dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops),
2295 	  &ovs_dp_flow_multicast_group },
2296 	{ &dp_packet_genl_family,
2297 	  dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops),
2298 	  NULL },
2299 };
2300 
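/* Unregister the first n_families entries of dp_genl_families; used on
 * module exit and to unwind a partially failed registration.
 */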
2301 static void dp_unregister_genl(int n_families)
2302 {
2303 	int i;
2304 
2305 	for (i = 0; i < n_families; i++)
2306 		genl_unregister_family(dp_genl_families[i].family);
2307 }
2308 
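/* Register every genetlink family in dp_genl_families together with its
 * multicast group, unwinding the families registered so far on failure.
 */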
2309 static int dp_register_genl(void)
2310 {
2311 	int n_registered;
2312 	int err;
2313 	int i;
2314 
2315 	n_registered = 0;
2316 	for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
2317 		const struct genl_family_and_ops *f = &dp_genl_families[i];
2318 
2319 		err = genl_register_family_with_ops(f->family, f->ops,
2320 						    f->n_ops);
2321 		if (err)
2322 			goto error;
2323 		n_registered++;
2324 
2325 		if (f->group) {
2326 			err = genl_register_mc_group(f->family, f->group);
2327 			if (err)
2328 				goto error;
2329 		}
2330 	}
2331 
2332 	return 0;
2333 
2334 error:
2335 	dp_unregister_genl(n_registered);
2336 	return err;
2337 }
2338 
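/* Delayed work that periodically rehashes the flow table of every
 * datapath in every network namespace.  The old table is destroyed only
 * after the new one has been published with rcu_assign_pointer(), and
 * the work reschedules itself every REHASH_FLOW_INTERVAL.
 */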
2339 static void rehash_flow_table(struct work_struct *work)
2340 {
2341 	struct datapath *dp;
2342 	struct net *net;
2343 
2344 	ovs_lock();
2345 	rtnl_lock();
2346 	for_each_net(net) {
2347 		struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
2348 
2349 		list_for_each_entry(dp, &ovs_net->dps, list_node) {
2350 			struct flow_table *old_table = ovsl_dereference(dp->table);
2351 			struct flow_table *new_table;
2352 
2353 			new_table = ovs_flow_tbl_rehash(old_table);
2354 			if (!IS_ERR(new_table)) {
2355 				rcu_assign_pointer(dp->table, new_table);
2356 				ovs_flow_tbl_destroy(old_table, true);
2357 			}
2358 		}
2359 	}
2360 	rtnl_unlock();
2361 	ovs_unlock();
2362 	schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
2363 }
2364 
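/* Per-net-namespace init: start with an empty datapath list and prepare
 * the dp_notify work item.
 */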
2365 static int __net_init ovs_init_net(struct net *net)
2366 {
2367 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
2368 
2369 	INIT_LIST_HEAD(&ovs_net->dps);
2370 	INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
2371 	return 0;
2372 }
2373 
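/* Per-net-namespace cleanup: destroy every datapath in the namespace
 * under ovs_lock, then wait for pending dp_notify work to finish.
 */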
2374 static void __net_exit ovs_exit_net(struct net *net)
2375 {
2376 	struct datapath *dp, *dp_next;
2377 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
2378 
2379 	ovs_lock();
2380 	list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
2381 		__dp_destroy(dp);
2382 	ovs_unlock();
2383 
2384 	cancel_work_sync(&ovs_net->dp_notify_work);
2385 }
2386 
2387 static struct pernet_operations ovs_net_ops = {
2388 	.init = ovs_init_net,
2389 	.exit = ovs_exit_net,
2390 	.id   = &ovs_net_id,
2391 	.size = sizeof(struct ovs_net),
2392 };
2393 
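/* Module init: bring up the flow and vport subsystems, register pernet
 * ops, the netdev notifier and the genetlink families, then kick off the
 * periodic flow-table rehash.  Earlier steps are unwound if a later one
 * fails.
 */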
2394 static int __init dp_init(void)
2395 {
2396 	int err;
2397 
2398 	BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
2399 
2400 	pr_info("Open vSwitch switching datapath\n");
2401 
2402 	err = ovs_flow_init();
2403 	if (err)
2404 		goto error;
2405 
2406 	err = ovs_vport_init();
2407 	if (err)
2408 		goto error_flow_exit;
2409 
2410 	err = register_pernet_device(&ovs_net_ops);
2411 	if (err)
2412 		goto error_vport_exit;
2413 
2414 	err = register_netdevice_notifier(&ovs_dp_device_notifier);
2415 	if (err)
2416 		goto error_netns_exit;
2417 
2418 	err = dp_register_genl();
2419 	if (err < 0)
2420 		goto error_unreg_notifier;
2421 
2422 	schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
2423 
2424 	return 0;
2425 
2426 error_unreg_notifier:
2427 	unregister_netdevice_notifier(&ovs_dp_device_notifier);
2428 error_netns_exit:
2429 	unregister_pernet_device(&ovs_net_ops);
2430 error_vport_exit:
2431 	ovs_vport_exit();
2432 error_flow_exit:
2433 	ovs_flow_exit();
2434 error:
2435 	return err;
2436 }
2437 
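/* Module exit: undo dp_init() in reverse order, waiting for outstanding
 * RCU callbacks before the vport and flow subsystems are torn down.
 */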
2438 static void dp_cleanup(void)
2439 {
2440 	cancel_delayed_work_sync(&rehash_flow_wq);
2441 	dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
2442 	unregister_netdevice_notifier(&ovs_dp_device_notifier);
2443 	unregister_pernet_device(&ovs_net_ops);
2444 	rcu_barrier();
2445 	ovs_vport_exit();
2446 	ovs_flow_exit();
2447 }
2448 
2449 module_init(dp_init);
2450 module_exit(dp_cleanup);
2451 
2452 MODULE_DESCRIPTION("Open vSwitch switching datapath");
2453 MODULE_LICENSE("GPL");
2454