xref: /openbmc/linux/net/openvswitch/flow.c (revision 5a86bf34)
1 /*
2  * Copyright (c) 2007-2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #include "flow.h"
20 #include "datapath.h"
21 #include <linux/uaccess.h>
22 #include <linux/netdevice.h>
23 #include <linux/etherdevice.h>
24 #include <linux/if_ether.h>
25 #include <linux/if_vlan.h>
26 #include <net/llc_pdu.h>
27 #include <linux/kernel.h>
28 #include <linux/jhash.h>
29 #include <linux/jiffies.h>
30 #include <linux/llc.h>
31 #include <linux/module.h>
32 #include <linux/in.h>
33 #include <linux/rcupdate.h>
34 #include <linux/if_arp.h>
35 #include <linux/ip.h>
36 #include <linux/ipv6.h>
37 #include <linux/sctp.h>
38 #include <linux/smp.h>
39 #include <linux/tcp.h>
40 #include <linux/udp.h>
41 #include <linux/icmp.h>
42 #include <linux/icmpv6.h>
43 #include <linux/rculist.h>
44 #include <net/ip.h>
45 #include <net/ip_tunnels.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 
49 u64 ovs_flow_used_time(unsigned long flow_jiffies)
50 {
51 	struct timespec cur_ts;
52 	u64 cur_ms, idle_ms;
53 
54 	ktime_get_ts(&cur_ts);
55 	idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
56 	cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
57 		 cur_ts.tv_nsec / NSEC_PER_MSEC;
58 
59 	return cur_ms - idle_ms;
60 }
61 
62 #define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))
63 
64 void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb)
65 {
66 	struct flow_stats *stats;
67 	__be16 tcp_flags = 0;
68 
69 	if (!flow->stats.is_percpu)
70 		stats = flow->stats.stat;
71 	else
72 		stats = this_cpu_ptr(flow->stats.cpu_stats);
73 
74 	if ((flow->key.eth.type == htons(ETH_P_IP) ||
75 	     flow->key.eth.type == htons(ETH_P_IPV6)) &&
76 	    flow->key.ip.proto == IPPROTO_TCP &&
77 	    likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) {
78 		tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb));
79 	}
80 
81 	spin_lock(&stats->lock);
82 	stats->used = jiffies;
83 	stats->packet_count++;
84 	stats->byte_count += skb->len;
85 	stats->tcp_flags |= tcp_flags;
86 	spin_unlock(&stats->lock);
87 }
88 
89 static void stats_read(struct flow_stats *stats,
90 		       struct ovs_flow_stats *ovs_stats,
91 		       unsigned long *used, __be16 *tcp_flags)
92 {
93 	spin_lock(&stats->lock);
94 	if (time_after(stats->used, *used))
95 		*used = stats->used;
96 	*tcp_flags |= stats->tcp_flags;
97 	ovs_stats->n_packets += stats->packet_count;
98 	ovs_stats->n_bytes += stats->byte_count;
99 	spin_unlock(&stats->lock);
100 }
101 
102 void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats,
103 			unsigned long *used, __be16 *tcp_flags)
104 {
105 	int cpu, cur_cpu;
106 
107 	*used = 0;
108 	*tcp_flags = 0;
109 	memset(ovs_stats, 0, sizeof(*ovs_stats));
110 
111 	if (!flow->stats.is_percpu) {
112 		stats_read(flow->stats.stat, ovs_stats, used, tcp_flags);
113 	} else {
114 		cur_cpu = get_cpu();
115 		for_each_possible_cpu(cpu) {
116 			struct flow_stats *stats;
117 
118 			if (cpu == cur_cpu)
119 				local_bh_disable();
120 
121 			stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
122 			stats_read(stats, ovs_stats, used, tcp_flags);
123 
124 			if (cpu == cur_cpu)
125 				local_bh_enable();
126 		}
127 		put_cpu();
128 	}
129 }
130 
131 static void stats_reset(struct flow_stats *stats)
132 {
133 	spin_lock(&stats->lock);
134 	stats->used = 0;
135 	stats->packet_count = 0;
136 	stats->byte_count = 0;
137 	stats->tcp_flags = 0;
138 	spin_unlock(&stats->lock);
139 }
140 
141 void ovs_flow_stats_clear(struct sw_flow *flow)
142 {
143 	int cpu, cur_cpu;
144 
145 	if (!flow->stats.is_percpu) {
146 		stats_reset(flow->stats.stat);
147 	} else {
148 		cur_cpu = get_cpu();
149 
150 		for_each_possible_cpu(cpu) {
151 
152 			if (cpu == cur_cpu)
153 				local_bh_disable();
154 
155 			stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu));
156 
157 			if (cpu == cur_cpu)
158 				local_bh_enable();
159 		}
160 		put_cpu();
161 	}
162 }
163 
164 static int check_header(struct sk_buff *skb, int len)
165 {
166 	if (unlikely(skb->len < len))
167 		return -EINVAL;
168 	if (unlikely(!pskb_may_pull(skb, len)))
169 		return -ENOMEM;
170 	return 0;
171 }
172 
173 static bool arphdr_ok(struct sk_buff *skb)
174 {
175 	return pskb_may_pull(skb, skb_network_offset(skb) +
176 				  sizeof(struct arp_eth_header));
177 }
178 
179 static int check_iphdr(struct sk_buff *skb)
180 {
181 	unsigned int nh_ofs = skb_network_offset(skb);
182 	unsigned int ip_len;
183 	int err;
184 
185 	err = check_header(skb, nh_ofs + sizeof(struct iphdr));
186 	if (unlikely(err))
187 		return err;
188 
189 	ip_len = ip_hdrlen(skb);
190 	if (unlikely(ip_len < sizeof(struct iphdr) ||
191 		     skb->len < nh_ofs + ip_len))
192 		return -EINVAL;
193 
194 	skb_set_transport_header(skb, nh_ofs + ip_len);
195 	return 0;
196 }
197 
198 static bool tcphdr_ok(struct sk_buff *skb)
199 {
200 	int th_ofs = skb_transport_offset(skb);
201 	int tcp_len;
202 
203 	if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))))
204 		return false;
205 
206 	tcp_len = tcp_hdrlen(skb);
207 	if (unlikely(tcp_len < sizeof(struct tcphdr) ||
208 		     skb->len < th_ofs + tcp_len))
209 		return false;
210 
211 	return true;
212 }
213 
214 static bool udphdr_ok(struct sk_buff *skb)
215 {
216 	return pskb_may_pull(skb, skb_transport_offset(skb) +
217 				  sizeof(struct udphdr));
218 }
219 
220 static bool sctphdr_ok(struct sk_buff *skb)
221 {
222 	return pskb_may_pull(skb, skb_transport_offset(skb) +
223 				  sizeof(struct sctphdr));
224 }
225 
226 static bool icmphdr_ok(struct sk_buff *skb)
227 {
228 	return pskb_may_pull(skb, skb_transport_offset(skb) +
229 				  sizeof(struct icmphdr));
230 }
231 
232 static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
233 {
234 	unsigned int nh_ofs = skb_network_offset(skb);
235 	unsigned int nh_len;
236 	int payload_ofs;
237 	struct ipv6hdr *nh;
238 	uint8_t nexthdr;
239 	__be16 frag_off;
240 	int err;
241 
242 	err = check_header(skb, nh_ofs + sizeof(*nh));
243 	if (unlikely(err))
244 		return err;
245 
246 	nh = ipv6_hdr(skb);
247 	nexthdr = nh->nexthdr;
248 	payload_ofs = (u8 *)(nh + 1) - skb->data;
249 
250 	key->ip.proto = NEXTHDR_NONE;
251 	key->ip.tos = ipv6_get_dsfield(nh);
252 	key->ip.ttl = nh->hop_limit;
253 	key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
254 	key->ipv6.addr.src = nh->saddr;
255 	key->ipv6.addr.dst = nh->daddr;
256 
257 	payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
258 	if (unlikely(payload_ofs < 0))
259 		return -EINVAL;
260 
261 	if (frag_off) {
262 		if (frag_off & htons(~0x7))
263 			key->ip.frag = OVS_FRAG_TYPE_LATER;
264 		else
265 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
266 	}
267 
268 	nh_len = payload_ofs - nh_ofs;
269 	skb_set_transport_header(skb, nh_ofs + nh_len);
270 	key->ip.proto = nexthdr;
271 	return nh_len;
272 }
273 
274 static bool icmp6hdr_ok(struct sk_buff *skb)
275 {
276 	return pskb_may_pull(skb, skb_transport_offset(skb) +
277 				  sizeof(struct icmp6hdr));
278 }
279 
280 static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
281 {
282 	struct qtag_prefix {
283 		__be16 eth_type; /* ETH_P_8021Q */
284 		__be16 tci;
285 	};
286 	struct qtag_prefix *qp;
287 
288 	if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)))
289 		return 0;
290 
291 	if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
292 					 sizeof(__be16))))
293 		return -ENOMEM;
294 
295 	qp = (struct qtag_prefix *) skb->data;
296 	key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT);
297 	__skb_pull(skb, sizeof(struct qtag_prefix));
298 
299 	return 0;
300 }
301 
302 static __be16 parse_ethertype(struct sk_buff *skb)
303 {
304 	struct llc_snap_hdr {
305 		u8  dsap;  /* Always 0xAA */
306 		u8  ssap;  /* Always 0xAA */
307 		u8  ctrl;
308 		u8  oui[3];
309 		__be16 ethertype;
310 	};
311 	struct llc_snap_hdr *llc;
312 	__be16 proto;
313 
314 	proto = *(__be16 *) skb->data;
315 	__skb_pull(skb, sizeof(__be16));
316 
317 	if (ntohs(proto) >= ETH_P_802_3_MIN)
318 		return proto;
319 
320 	if (skb->len < sizeof(struct llc_snap_hdr))
321 		return htons(ETH_P_802_2);
322 
323 	if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr))))
324 		return htons(0);
325 
326 	llc = (struct llc_snap_hdr *) skb->data;
327 	if (llc->dsap != LLC_SAP_SNAP ||
328 	    llc->ssap != LLC_SAP_SNAP ||
329 	    (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
330 		return htons(ETH_P_802_2);
331 
332 	__skb_pull(skb, sizeof(struct llc_snap_hdr));
333 
334 	if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN)
335 		return llc->ethertype;
336 
337 	return htons(ETH_P_802_2);
338 }
339 
340 static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
341 			int nh_len)
342 {
343 	struct icmp6hdr *icmp = icmp6_hdr(skb);
344 
345 	/* The ICMPv6 type and code fields use the 16-bit transport port
346 	 * fields, so we need to store them in 16-bit network byte order.
347 	 */
348 	key->ipv6.tp.src = htons(icmp->icmp6_type);
349 	key->ipv6.tp.dst = htons(icmp->icmp6_code);
350 
351 	if (icmp->icmp6_code == 0 &&
352 	    (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
353 	     icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) {
354 		int icmp_len = skb->len - skb_transport_offset(skb);
355 		struct nd_msg *nd;
356 		int offset;
357 
358 		/* In order to process neighbor discovery options, we need the
359 		 * entire packet.
360 		 */
361 		if (unlikely(icmp_len < sizeof(*nd)))
362 			return 0;
363 
364 		if (unlikely(skb_linearize(skb)))
365 			return -ENOMEM;
366 
367 		nd = (struct nd_msg *)skb_transport_header(skb);
368 		key->ipv6.nd.target = nd->target;
369 
370 		icmp_len -= sizeof(*nd);
371 		offset = 0;
372 		while (icmp_len >= 8) {
373 			struct nd_opt_hdr *nd_opt =
374 				 (struct nd_opt_hdr *)(nd->opt + offset);
375 			int opt_len = nd_opt->nd_opt_len * 8;
376 
377 			if (unlikely(!opt_len || opt_len > icmp_len))
378 				return 0;
379 
380 			/* Store the link layer address if the appropriate
381 			 * option is provided.  It is considered an error if
382 			 * the same link layer option is specified twice.
383 			 */
384 			if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
385 			    && opt_len == 8) {
386 				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
387 					goto invalid;
388 				memcpy(key->ipv6.nd.sll,
389 				    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
390 			} else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
391 				   && opt_len == 8) {
392 				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
393 					goto invalid;
394 				memcpy(key->ipv6.nd.tll,
395 				    &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
396 			}
397 
398 			icmp_len -= opt_len;
399 			offset += opt_len;
400 		}
401 	}
402 
403 	return 0;
404 
405 invalid:
406 	memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
407 	memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
408 	memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
409 
410 	return 0;
411 }
412 
413 /**
414  * ovs_flow_extract - extracts a flow key from an Ethernet frame.
415  * @skb: sk_buff that contains the frame, with skb->data pointing to the
416  * Ethernet header
417  * @in_port: port number on which @skb was received.
418  * @key: output flow key
419  *
420  * The caller must ensure that skb->len >= ETH_HLEN.
421  *
422  * Returns 0 if successful, otherwise a negative errno value.
423  *
424  * Initializes @skb header pointers as follows:
425  *
426  *    - skb->mac_header: the Ethernet header.
427  *
428  *    - skb->network_header: just past the Ethernet header, or just past the
429  *      VLAN header, to the first byte of the Ethernet payload.
430  *
431  *    - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
432  *      on output, then just past the IP header, if one is present and
433  *      of a correct length, otherwise the same as skb->network_header.
434  *      For other key->eth.type values it is left untouched.
435  */
436 int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
437 {
438 	int error;
439 	struct ethhdr *eth;
440 
441 	memset(key, 0, sizeof(*key));
442 
443 	key->phy.priority = skb->priority;
444 	if (OVS_CB(skb)->tun_key)
445 		memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key));
446 	key->phy.in_port = in_port;
447 	key->phy.skb_mark = skb->mark;
448 
449 	skb_reset_mac_header(skb);
450 
451 	/* Link layer.  We are guaranteed to have at least the 14 byte Ethernet
452 	 * header in the linear data area.
453 	 */
454 	eth = eth_hdr(skb);
455 	memcpy(key->eth.src, eth->h_source, ETH_ALEN);
456 	memcpy(key->eth.dst, eth->h_dest, ETH_ALEN);
457 
458 	__skb_pull(skb, 2 * ETH_ALEN);
459 	/* We are going to push all headers that we pull, so no need to
460 	 * update skb->csum here.
461 	 */
462 
463 	if (vlan_tx_tag_present(skb))
464 		key->eth.tci = htons(skb->vlan_tci);
465 	else if (eth->h_proto == htons(ETH_P_8021Q))
466 		if (unlikely(parse_vlan(skb, key)))
467 			return -ENOMEM;
468 
469 	key->eth.type = parse_ethertype(skb);
470 	if (unlikely(key->eth.type == htons(0)))
471 		return -ENOMEM;
472 
473 	skb_reset_network_header(skb);
474 	__skb_push(skb, skb->data - skb_mac_header(skb));
475 
476 	/* Network layer. */
477 	if (key->eth.type == htons(ETH_P_IP)) {
478 		struct iphdr *nh;
479 		__be16 offset;
480 
481 		error = check_iphdr(skb);
482 		if (unlikely(error)) {
483 			if (error == -EINVAL) {
484 				skb->transport_header = skb->network_header;
485 				error = 0;
486 			}
487 			return error;
488 		}
489 
490 		nh = ip_hdr(skb);
491 		key->ipv4.addr.src = nh->saddr;
492 		key->ipv4.addr.dst = nh->daddr;
493 
494 		key->ip.proto = nh->protocol;
495 		key->ip.tos = nh->tos;
496 		key->ip.ttl = nh->ttl;
497 
498 		offset = nh->frag_off & htons(IP_OFFSET);
499 		if (offset) {
500 			key->ip.frag = OVS_FRAG_TYPE_LATER;
501 			return 0;
502 		}
503 		if (nh->frag_off & htons(IP_MF) ||
504 			 skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
505 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
506 
507 		/* Transport layer. */
508 		if (key->ip.proto == IPPROTO_TCP) {
509 			if (tcphdr_ok(skb)) {
510 				struct tcphdr *tcp = tcp_hdr(skb);
511 				key->ipv4.tp.src = tcp->source;
512 				key->ipv4.tp.dst = tcp->dest;
513 				key->ipv4.tp.flags = TCP_FLAGS_BE16(tcp);
514 			}
515 		} else if (key->ip.proto == IPPROTO_UDP) {
516 			if (udphdr_ok(skb)) {
517 				struct udphdr *udp = udp_hdr(skb);
518 				key->ipv4.tp.src = udp->source;
519 				key->ipv4.tp.dst = udp->dest;
520 			}
521 		} else if (key->ip.proto == IPPROTO_SCTP) {
522 			if (sctphdr_ok(skb)) {
523 				struct sctphdr *sctp = sctp_hdr(skb);
524 				key->ipv4.tp.src = sctp->source;
525 				key->ipv4.tp.dst = sctp->dest;
526 			}
527 		} else if (key->ip.proto == IPPROTO_ICMP) {
528 			if (icmphdr_ok(skb)) {
529 				struct icmphdr *icmp = icmp_hdr(skb);
530 				/* The ICMP type and code fields use the 16-bit
531 				 * transport port fields, so we need to store
532 				 * them in 16-bit network byte order. */
533 				key->ipv4.tp.src = htons(icmp->type);
534 				key->ipv4.tp.dst = htons(icmp->code);
535 			}
536 		}
537 
538 	} else if ((key->eth.type == htons(ETH_P_ARP) ||
539 		   key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) {
540 		struct arp_eth_header *arp;
541 
542 		arp = (struct arp_eth_header *)skb_network_header(skb);
543 
544 		if (arp->ar_hrd == htons(ARPHRD_ETHER)
545 				&& arp->ar_pro == htons(ETH_P_IP)
546 				&& arp->ar_hln == ETH_ALEN
547 				&& arp->ar_pln == 4) {
548 
549 			/* We only match on the lower 8 bits of the opcode. */
550 			if (ntohs(arp->ar_op) <= 0xff)
551 				key->ip.proto = ntohs(arp->ar_op);
552 			memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
553 			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
554 			memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
555 			memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
556 		}
557 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
558 		int nh_len;             /* IPv6 Header + Extensions */
559 
560 		nh_len = parse_ipv6hdr(skb, key);
561 		if (unlikely(nh_len < 0)) {
562 			if (nh_len == -EINVAL) {
563 				skb->transport_header = skb->network_header;
564 				error = 0;
565 			} else {
566 				error = nh_len;
567 			}
568 			return error;
569 		}
570 
571 		if (key->ip.frag == OVS_FRAG_TYPE_LATER)
572 			return 0;
573 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
574 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
575 
576 		/* Transport layer. */
577 		if (key->ip.proto == NEXTHDR_TCP) {
578 			if (tcphdr_ok(skb)) {
579 				struct tcphdr *tcp = tcp_hdr(skb);
580 				key->ipv6.tp.src = tcp->source;
581 				key->ipv6.tp.dst = tcp->dest;
582 				key->ipv6.tp.flags = TCP_FLAGS_BE16(tcp);
583 			}
584 		} else if (key->ip.proto == NEXTHDR_UDP) {
585 			if (udphdr_ok(skb)) {
586 				struct udphdr *udp = udp_hdr(skb);
587 				key->ipv6.tp.src = udp->source;
588 				key->ipv6.tp.dst = udp->dest;
589 			}
590 		} else if (key->ip.proto == NEXTHDR_SCTP) {
591 			if (sctphdr_ok(skb)) {
592 				struct sctphdr *sctp = sctp_hdr(skb);
593 				key->ipv6.tp.src = sctp->source;
594 				key->ipv6.tp.dst = sctp->dest;
595 			}
596 		} else if (key->ip.proto == NEXTHDR_ICMP) {
597 			if (icmp6hdr_ok(skb)) {
598 				error = parse_icmpv6(skb, key, nh_len);
599 				if (error)
600 					return error;
601 			}
602 		}
603 	}
604 
605 	return 0;
606 }
607