xref: /openbmc/linux/net/openvswitch/flow.c (revision 23dabf88abb48a866fdb19ee08ebcf1ddd9b1840)
1 /*
2  * Copyright (c) 2007-2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #include "flow.h"
20 #include "datapath.h"
21 #include <linux/uaccess.h>
22 #include <linux/netdevice.h>
23 #include <linux/etherdevice.h>
24 #include <linux/if_ether.h>
25 #include <linux/if_vlan.h>
26 #include <net/llc_pdu.h>
27 #include <linux/kernel.h>
28 #include <linux/jhash.h>
29 #include <linux/jiffies.h>
30 #include <linux/llc.h>
31 #include <linux/module.h>
32 #include <linux/in.h>
33 #include <linux/rcupdate.h>
34 #include <linux/if_arp.h>
35 #include <linux/ip.h>
36 #include <linux/ipv6.h>
37 #include <linux/sctp.h>
38 #include <linux/smp.h>
39 #include <linux/tcp.h>
40 #include <linux/udp.h>
41 #include <linux/icmp.h>
42 #include <linux/icmpv6.h>
43 #include <linux/rculist.h>
44 #include <net/ip.h>
45 #include <net/ip_tunnels.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 
49 u64 ovs_flow_used_time(unsigned long flow_jiffies)
50 {
51 	struct timespec cur_ts;
52 	u64 cur_ms, idle_ms;
53 
54 	ktime_get_ts(&cur_ts);
55 	idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
56 	cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
57 		 cur_ts.tv_nsec / NSEC_PER_MSEC;
58 
59 	return cur_ms - idle_ms;
60 }
61 
62 #define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))
63 
64 void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
65 {
66 	struct flow_stats *stats;
67 	__be16 tcp_flags = 0;
68 
69 	stats = this_cpu_ptr(flow->stats);
70 
71 	if ((flow->key.eth.type == htons(ETH_P_IP) ||
72 	     flow->key.eth.type == htons(ETH_P_IPV6)) &&
73 	    flow->key.ip.frag != OVS_FRAG_TYPE_LATER &&
74 	    flow->key.ip.proto == IPPROTO_TCP &&
75 	    likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
76 		tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
77 	}
78 
79 	spin_lock(&stats->lock);
80 	stats->used = jiffies;
81 	stats->packet_count++;
82 	stats->byte_count += skb->len;
83 	stats->tcp_flags |= tcp_flags;
84 	spin_unlock(&stats->lock);
85 }
86 
87 static void stats_read(struct flow_stats *stats,
88 		       struct ovs_flow_stats *ovs_stats,
89 		       unsigned long *used, __be16 *tcp_flags)
90 {
91 	spin_lock(&stats->lock);
92 	if (!*used || time_after(stats->used, *used))
93 		*used = stats->used;
94 	*tcp_flags |= stats->tcp_flags;
95 	ovs_stats->n_packets += stats->packet_count;
96 	ovs_stats->n_bytes += stats->byte_count;
97 	spin_unlock(&stats->lock);
98 }
99 
100 void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
101 			unsigned long *used, __be16 *tcp_flags)
102 {
103 	int cpu;
104 
105 	*used = 0;
106 	*tcp_flags = 0;
107 	memset(ovs_stats, 0, sizeof(*ovs_stats));
108 
109 	local_bh_disable();
110 
111 	for_each_possible_cpu(cpu) {
112 		struct flow_stats *stats;
113 
114 		stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
115 		stats_read(stats, ovs_stats, used, tcp_flags);
116 	}
117 
118 	local_bh_enable();
119 }
120 
121 static void stats_reset(struct flow_stats *stats)
122 {
123 	spin_lock(&stats->lock);
124 	stats->used = 0;
125 	stats->packet_count = 0;
126 	stats->byte_count = 0;
127 	stats->tcp_flags = 0;
128 	spin_unlock(&stats->lock);
129 }
130 
131 void ovs_flow_stats_clear(struct sw_flow *flow)
132 {
133 	int cpu;
134 
135 	local_bh_disable();
136 
137 	for_each_possible_cpu(cpu)
138 		stats_reset(per_cpu_ptr(flow->stats, cpu));
139 
140 	local_bh_enable();
141 }
142 
143 static int check_header(struct sk_buff *skb, int len)
144 {
145 	if (unlikely(skb->len < len))
146 		return -EINVAL;
147 	if (unlikely(!pskb_may_pull(skb, len)))
148 		return -ENOMEM;
149 	return 0;
150 }
151 
152 static bool arphdr_ok(struct sk_buff *skb)
153 {
154 	return pskb_may_pull(skb, skb_network_offset(skb) +
155 				  sizeof(struct arp_eth_header));
156 }
157 
158 static int check_iphdr(struct sk_buff *skb)
159 {
160 	unsigned int nh_ofs = skb_network_offset(skb);
161 	unsigned int ip_len;
162 	int err;
163 
164 	err = check_header(skb, nh_ofs + sizeof(struct iphdr));
165 	if (unlikely(err))
166 		return err;
167 
168 	ip_len = ip_hdrlen(skb);
169 	if (unlikely(ip_len < sizeof(struct iphdr) ||
170 		     skb->len < nh_ofs + ip_len))
171 		return -EINVAL;
172 
173 	skb_set_transport_header(skb, nh_ofs + ip_len);
174 	return 0;
175 }
176 
177 static bool tcphdr_ok(struct sk_buff *skb)
178 {
179 	int th_ofs = skb_transport_offset(skb);
180 	int tcp_len;
181 
182 	if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))))
183 		return false;
184 
185 	tcp_len = tcp_hdrlen(skb);
186 	if (unlikely(tcp_len < sizeof(struct tcphdr) ||
187 		     skb->len < th_ofs + tcp_len))
188 		return false;
189 
190 	return true;
191 }
192 
193 static bool udphdr_ok(struct sk_buff *skb)
194 {
195 	return pskb_may_pull(skb, skb_transport_offset(skb) +
196 				  sizeof(struct udphdr));
197 }
198 
199 static bool sctphdr_ok(struct sk_buff *skb)
200 {
201 	return pskb_may_pull(skb, skb_transport_offset(skb) +
202 				  sizeof(struct sctphdr));
203 }
204 
205 static bool icmphdr_ok(struct sk_buff *skb)
206 {
207 	return pskb_may_pull(skb, skb_transport_offset(skb) +
208 				  sizeof(struct icmphdr));
209 }
210 
211 static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
212 {
213 	unsigned int nh_ofs = skb_network_offset(skb);
214 	unsigned int nh_len;
215 	int payload_ofs;
216 	struct ipv6hdr *nh;
217 	uint8_t nexthdr;
218 	__be16 frag_off;
219 	int err;
220 
221 	err = check_header(skb, nh_ofs + sizeof(*nh));
222 	if (unlikely(err))
223 		return err;
224 
225 	nh = ipv6_hdr(skb);
226 	nexthdr = nh->nexthdr;
227 	payload_ofs = (u8 *)(nh + 1) - skb->data;
228 
229 	key->ip.proto = NEXTHDR_NONE;
230 	key->ip.tos = ipv6_get_dsfield(nh);
231 	key->ip.ttl = nh->hop_limit;
232 	key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
233 	key->ipv6.addr.src = nh->saddr;
234 	key->ipv6.addr.dst = nh->daddr;
235 
236 	payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
237 	if (unlikely(payload_ofs < 0))
238 		return -EINVAL;
239 
240 	if (frag_off) {
241 		if (frag_off & htons(~0x7))
242 			key->ip.frag = OVS_FRAG_TYPE_LATER;
243 		else
244 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
245 	}
246 
247 	nh_len = payload_ofs - nh_ofs;
248 	skb_set_transport_header(skb, nh_ofs + nh_len);
249 	key->ip.proto = nexthdr;
250 	return nh_len;
251 }
252 
253 static bool icmp6hdr_ok(struct sk_buff *skb)
254 {
255 	return pskb_may_pull(skb, skb_transport_offset(skb) +
256 				  sizeof(struct icmp6hdr));
257 }
258 
259 static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
260 {
261 	struct qtag_prefix {
262 		__be16 eth_type; /* ETH_P_8021Q */
263 		__be16 tci;
264 	};
265 	struct qtag_prefix *qp;
266 
267 	if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)))
268 		return 0;
269 
270 	if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
271 					 sizeof(__be16))))
272 		return -ENOMEM;
273 
274 	qp = (struct qtag_prefix *) skb->data;
275 	key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT);
276 	__skb_pull(skb, sizeof(struct qtag_prefix));
277 
278 	return 0;
279 }
280 
281 static __be16 parse_ethertype(struct sk_buff *skb)
282 {
283 	struct llc_snap_hdr {
284 		u8  dsap;  /* Always 0xAA */
285 		u8  ssap;  /* Always 0xAA */
286 		u8  ctrl;
287 		u8  oui[3];
288 		__be16 ethertype;
289 	};
290 	struct llc_snap_hdr *llc;
291 	__be16 proto;
292 
293 	proto = *(__be16 *) skb->data;
294 	__skb_pull(skb, sizeof(__be16));
295 
296 	if (ntohs(proto) >= ETH_P_802_3_MIN)
297 		return proto;
298 
299 	if (skb->len < sizeof(struct llc_snap_hdr))
300 		return htons(ETH_P_802_2);
301 
302 	if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr))))
303 		return htons(0);
304 
305 	llc = (struct llc_snap_hdr *) skb->data;
306 	if (llc->dsap != LLC_SAP_SNAP ||
307 	    llc->ssap != LLC_SAP_SNAP ||
308 	    (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
309 		return htons(ETH_P_802_2);
310 
311 	__skb_pull(skb, sizeof(struct llc_snap_hdr));
312 
313 	if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN)
314 		return llc->ethertype;
315 
316 	return htons(ETH_P_802_2);
317 }
318 
319 static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
320 			int nh_len)
321 {
322 	struct icmp6hdr *icmp = icmp6_hdr(skb);
323 
324 	/* The ICMPv6 type and code fields use the 16-bit transport port
325 	 * fields, so we need to store them in 16-bit network byte order.
326 	 */
327 	key->ipv6.tp.src = htons(icmp->icmp6_type);
328 	key->ipv6.tp.dst = htons(icmp->icmp6_code);
329 
330 	if (icmp->icmp6_code == 0 &&
331 	    (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
332 	     icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) {
333 		int icmp_len = skb->len - skb_transport_offset(skb);
334 		struct nd_msg *nd;
335 		int offset;
336 
337 		/* In order to process neighbor discovery options, we need the
338 		 * entire packet.
339 		 */
340 		if (unlikely(icmp_len < sizeof(*nd)))
341 			return 0;
342 
343 		if (unlikely(skb_linearize(skb)))
344 			return -ENOMEM;
345 
346 		nd = (struct nd_msg *)skb_transport_header(skb);
347 		key->ipv6.nd.target = nd->target;
348 
349 		icmp_len -= sizeof(*nd);
350 		offset = 0;
351 		while (icmp_len >= 8) {
352 			struct nd_opt_hdr *nd_opt =
353 				 (struct nd_opt_hdr *)(nd->opt + offset);
354 			int opt_len = nd_opt->nd_opt_len * 8;
355 
356 			if (unlikely(!opt_len || opt_len > icmp_len))
357 				return 0;
358 
359 			/* Store the link layer address if the appropriate
360 			 * option is provided.  It is considered an error if
361 			 * the same link layer option is specified twice.
362 			 */
363 			if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
364 			    && opt_len == 8) {
365 				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
366 					goto invalid;
367 				ether_addr_copy(key->ipv6.nd.sll,
368 						&nd->opt[offset+sizeof(*nd_opt)]);
369 			} else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
370 				   && opt_len == 8) {
371 				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
372 					goto invalid;
373 				ether_addr_copy(key->ipv6.nd.tll,
374 						&nd->opt[offset+sizeof(*nd_opt)]);
375 			}
376 
377 			icmp_len -= opt_len;
378 			offset += opt_len;
379 		}
380 	}
381 
382 	return 0;
383 
384 invalid:
385 	memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
386 	memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
387 	memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
388 
389 	return 0;
390 }
391 
392 /**
393  * ovs_flow_extract - extracts a flow key from an Ethernet frame.
394  * @skb: sk_buff that contains the frame, with skb->data pointing to the
395  * Ethernet header
396  * @in_port: port number on which @skb was received.
397  * @key: output flow key
398  *
399  * The caller must ensure that skb->len >= ETH_HLEN.
400  *
401  * Returns 0 if successful, otherwise a negative errno value.
402  *
403  * Initializes @skb header pointers as follows:
404  *
405  *    - skb->mac_header: the Ethernet header.
406  *
407  *    - skb->network_header: just past the Ethernet header, or just past the
408  *      VLAN header, to the first byte of the Ethernet payload.
409  *
410  *    - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
411  *      on output, then just past the IP header, if one is present and
412  *      of a correct length, otherwise the same as skb->network_header.
413  *      For other key->eth.type values it is left untouched.
414  */
415 int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
416 {
417 	int error;
418 	struct ethhdr *eth;
419 
420 	memset(key, 0, sizeof(*key));
421 
422 	key->phy.priority = skb->priority;
423 	if (OVS_CB(skb)->tun_key)
424 		memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key));
425 	key->phy.in_port = in_port;
426 	key->phy.skb_mark = skb->mark;
427 
428 	skb_reset_mac_header(skb);
429 
430 	/* Link layer.  We are guaranteed to have at least the 14 byte Ethernet
431 	 * header in the linear data area.
432 	 */
433 	eth = eth_hdr(skb);
434 	ether_addr_copy(key->eth.src, eth->h_source);
435 	ether_addr_copy(key->eth.dst, eth->h_dest);
436 
437 	__skb_pull(skb, 2 * ETH_ALEN);
438 	/* We are going to push all headers that we pull, so no need to
439 	 * update skb->csum here.
440 	 */
441 
442 	if (vlan_tx_tag_present(skb))
443 		key->eth.tci = htons(skb->vlan_tci);
444 	else if (eth->h_proto == htons(ETH_P_8021Q))
445 		if (unlikely(parse_vlan(skb, key)))
446 			return -ENOMEM;
447 
448 	key->eth.type = parse_ethertype(skb);
449 	if (unlikely(key->eth.type == htons(0)))
450 		return -ENOMEM;
451 
452 	skb_reset_network_header(skb);
453 	__skb_push(skb, skb->data - skb_mac_header(skb));
454 
455 	/* Network layer. */
456 	if (key->eth.type == htons(ETH_P_IP)) {
457 		struct iphdr *nh;
458 		__be16 offset;
459 
460 		error = check_iphdr(skb);
461 		if (unlikely(error)) {
462 			if (error == -EINVAL) {
463 				skb->transport_header = skb->network_header;
464 				error = 0;
465 			}
466 			return error;
467 		}
468 
469 		nh = ip_hdr(skb);
470 		key->ipv4.addr.src = nh->saddr;
471 		key->ipv4.addr.dst = nh->daddr;
472 
473 		key->ip.proto = nh->protocol;
474 		key->ip.tos = nh->tos;
475 		key->ip.ttl = nh->ttl;
476 
477 		offset = nh->frag_off & htons(IP_OFFSET);
478 		if (offset) {
479 			key->ip.frag = OVS_FRAG_TYPE_LATER;
480 			return 0;
481 		}
482 		if (nh->frag_off & htons(IP_MF) ||
483 			 skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
484 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
485 
486 		/* Transport layer. */
487 		if (key->ip.proto == IPPROTO_TCP) {
488 			if (tcphdr_ok(skb)) {
489 				struct tcphdr *tcp = tcp_hdr(skb);
490 				key->ipv4.tp.src = tcp->source;
491 				key->ipv4.tp.dst = tcp->dest;
492 				key->ipv4.tp.flags = TCP_FLAGS_BE16(tcp);
493 			}
494 		} else if (key->ip.proto == IPPROTO_UDP) {
495 			if (udphdr_ok(skb)) {
496 				struct udphdr *udp = udp_hdr(skb);
497 				key->ipv4.tp.src = udp->source;
498 				key->ipv4.tp.dst = udp->dest;
499 			}
500 		} else if (key->ip.proto == IPPROTO_SCTP) {
501 			if (sctphdr_ok(skb)) {
502 				struct sctphdr *sctp = sctp_hdr(skb);
503 				key->ipv4.tp.src = sctp->source;
504 				key->ipv4.tp.dst = sctp->dest;
505 			}
506 		} else if (key->ip.proto == IPPROTO_ICMP) {
507 			if (icmphdr_ok(skb)) {
508 				struct icmphdr *icmp = icmp_hdr(skb);
509 				/* The ICMP type and code fields use the 16-bit
510 				 * transport port fields, so we need to store
511 				 * them in 16-bit network byte order. */
512 				key->ipv4.tp.src = htons(icmp->type);
513 				key->ipv4.tp.dst = htons(icmp->code);
514 			}
515 		}
516 
517 	} else if ((key->eth.type == htons(ETH_P_ARP) ||
518 		   key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) {
519 		struct arp_eth_header *arp;
520 
521 		arp = (struct arp_eth_header *)skb_network_header(skb);
522 
523 		if (arp->ar_hrd == htons(ARPHRD_ETHER)
524 				&& arp->ar_pro == htons(ETH_P_IP)
525 				&& arp->ar_hln == ETH_ALEN
526 				&& arp->ar_pln == 4) {
527 
528 			/* We only match on the lower 8 bits of the opcode. */
529 			if (ntohs(arp->ar_op) <= 0xff)
530 				key->ip.proto = ntohs(arp->ar_op);
531 			memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
532 			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
533 			ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha);
534 			ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha);
535 		}
536 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
537 		int nh_len;             /* IPv6 Header + Extensions */
538 
539 		nh_len = parse_ipv6hdr(skb, key);
540 		if (unlikely(nh_len < 0)) {
541 			if (nh_len == -EINVAL) {
542 				skb->transport_header = skb->network_header;
543 				error = 0;
544 			} else {
545 				error = nh_len;
546 			}
547 			return error;
548 		}
549 
550 		if (key->ip.frag == OVS_FRAG_TYPE_LATER)
551 			return 0;
552 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
553 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
554 
555 		/* Transport layer. */
556 		if (key->ip.proto == NEXTHDR_TCP) {
557 			if (tcphdr_ok(skb)) {
558 				struct tcphdr *tcp = tcp_hdr(skb);
559 				key->ipv6.tp.src = tcp->source;
560 				key->ipv6.tp.dst = tcp->dest;
561 				key->ipv6.tp.flags = TCP_FLAGS_BE16(tcp);
562 			}
563 		} else if (key->ip.proto == NEXTHDR_UDP) {
564 			if (udphdr_ok(skb)) {
565 				struct udphdr *udp = udp_hdr(skb);
566 				key->ipv6.tp.src = udp->source;
567 				key->ipv6.tp.dst = udp->dest;
568 			}
569 		} else if (key->ip.proto == NEXTHDR_SCTP) {
570 			if (sctphdr_ok(skb)) {
571 				struct sctphdr *sctp = sctp_hdr(skb);
572 				key->ipv6.tp.src = sctp->source;
573 				key->ipv6.tp.dst = sctp->dest;
574 			}
575 		} else if (key->ip.proto == NEXTHDR_ICMP) {
576 			if (icmp6hdr_ok(skb)) {
577 				error = parse_icmpv6(skb, key, nh_len);
578 				if (error)
579 					return error;
580 			}
581 		}
582 	}
583 
584 	return 0;
585 }
586