xref: /openbmc/linux/net/core/flow_dissector.c (revision bc000245)
1 #include <linux/skbuff.h>
2 #include <linux/export.h>
3 #include <linux/ip.h>
4 #include <linux/ipv6.h>
5 #include <linux/if_vlan.h>
6 #include <net/ip.h>
7 #include <net/ipv6.h>
8 #include <linux/igmp.h>
9 #include <linux/icmp.h>
10 #include <linux/sctp.h>
11 #include <linux/dccp.h>
12 #include <linux/if_tunnel.h>
13 #include <linux/if_pppox.h>
14 #include <linux/ppp_defs.h>
15 #include <net/flow_keys.h>
16 
17 /* copy saddr & daddr, possibly using 64bit load/store
18  * Equivalent to :	flow->src = iph->saddr;
19  *			flow->dst = iph->daddr;
20  */
21 static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
22 {
23 	BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
24 		     offsetof(typeof(*flow), src) + sizeof(flow->src));
25 	memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
26 }
27 
28 /**
29  * skb_flow_get_ports - extract the upper layer ports and return them
30  * @skb: buffer to extract the ports from
31  * @thoff: transport header offset
32  * @ip_proto: protocol for which to get port offset
33  *
34  * The function will try to retrieve the ports at offset thoff + poff where poff
35  * is the protocol port offset returned from proto_ports_offset
36  */
37 __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto)
38 {
39 	int poff = proto_ports_offset(ip_proto);
40 
41 	if (poff >= 0) {
42 		__be32 *ports, _ports;
43 
44 		ports = skb_header_pointer(skb, thoff + poff,
45 					   sizeof(_ports), &_ports);
46 		if (ports)
47 			return *ports;
48 	}
49 
50 	return 0;
51 }
52 EXPORT_SYMBOL(skb_flow_get_ports);
53 
54 bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
55 {
56 	int nhoff = skb_network_offset(skb);
57 	u8 ip_proto;
58 	__be16 proto = skb->protocol;
59 
60 	memset(flow, 0, sizeof(*flow));
61 
62 again:
63 	switch (proto) {
64 	case __constant_htons(ETH_P_IP): {
65 		const struct iphdr *iph;
66 		struct iphdr _iph;
67 ip:
68 		iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
69 		if (!iph || iph->ihl < 5)
70 			return false;
71 		nhoff += iph->ihl * 4;
72 
73 		ip_proto = iph->protocol;
74 		if (ip_is_fragment(iph))
75 			ip_proto = 0;
76 
77 		iph_to_flow_copy_addrs(flow, iph);
78 		break;
79 	}
80 	case __constant_htons(ETH_P_IPV6): {
81 		const struct ipv6hdr *iph;
82 		struct ipv6hdr _iph;
83 ipv6:
84 		iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
85 		if (!iph)
86 			return false;
87 
88 		ip_proto = iph->nexthdr;
89 		flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
90 		flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
91 		nhoff += sizeof(struct ipv6hdr);
92 		break;
93 	}
94 	case __constant_htons(ETH_P_8021AD):
95 	case __constant_htons(ETH_P_8021Q): {
96 		const struct vlan_hdr *vlan;
97 		struct vlan_hdr _vlan;
98 
99 		vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
100 		if (!vlan)
101 			return false;
102 
103 		proto = vlan->h_vlan_encapsulated_proto;
104 		nhoff += sizeof(*vlan);
105 		goto again;
106 	}
107 	case __constant_htons(ETH_P_PPP_SES): {
108 		struct {
109 			struct pppoe_hdr hdr;
110 			__be16 proto;
111 		} *hdr, _hdr;
112 		hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
113 		if (!hdr)
114 			return false;
115 		proto = hdr->proto;
116 		nhoff += PPPOE_SES_HLEN;
117 		switch (proto) {
118 		case __constant_htons(PPP_IP):
119 			goto ip;
120 		case __constant_htons(PPP_IPV6):
121 			goto ipv6;
122 		default:
123 			return false;
124 		}
125 	}
126 	default:
127 		return false;
128 	}
129 
130 	switch (ip_proto) {
131 	case IPPROTO_GRE: {
132 		struct gre_hdr {
133 			__be16 flags;
134 			__be16 proto;
135 		} *hdr, _hdr;
136 
137 		hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
138 		if (!hdr)
139 			return false;
140 		/*
141 		 * Only look inside GRE if version zero and no
142 		 * routing
143 		 */
144 		if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
145 			proto = hdr->proto;
146 			nhoff += 4;
147 			if (hdr->flags & GRE_CSUM)
148 				nhoff += 4;
149 			if (hdr->flags & GRE_KEY)
150 				nhoff += 4;
151 			if (hdr->flags & GRE_SEQ)
152 				nhoff += 4;
153 			if (proto == htons(ETH_P_TEB)) {
154 				const struct ethhdr *eth;
155 				struct ethhdr _eth;
156 
157 				eth = skb_header_pointer(skb, nhoff,
158 							 sizeof(_eth), &_eth);
159 				if (!eth)
160 					return false;
161 				proto = eth->h_proto;
162 				nhoff += sizeof(*eth);
163 			}
164 			goto again;
165 		}
166 		break;
167 	}
168 	case IPPROTO_IPIP:
169 		proto = htons(ETH_P_IP);
170 		goto ip;
171 	case IPPROTO_IPV6:
172 		proto = htons(ETH_P_IPV6);
173 		goto ipv6;
174 	default:
175 		break;
176 	}
177 
178 	flow->ip_proto = ip_proto;
179 	flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
180 	flow->thoff = (u16) nhoff;
181 
182 	return true;
183 }
184 EXPORT_SYMBOL(skb_flow_dissect);
185 
186 static u32 hashrnd __read_mostly;
187 static __always_inline void __flow_hash_secret_init(void)
188 {
189 	net_get_random_once(&hashrnd, sizeof(hashrnd));
190 }
191 
192 static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
193 {
194 	__flow_hash_secret_init();
195 	return jhash_3words(a, b, c, hashrnd);
196 }
197 
198 static __always_inline u32 __flow_hash_1word(u32 a)
199 {
200 	__flow_hash_secret_init();
201 	return jhash_1word(a, hashrnd);
202 }
203 
204 /*
205  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
206  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
207  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
208  * if hash is a canonical 4-tuple hash over transport ports.
209  */
210 void __skb_get_rxhash(struct sk_buff *skb)
211 {
212 	struct flow_keys keys;
213 	u32 hash;
214 
215 	if (!skb_flow_dissect(skb, &keys))
216 		return;
217 
218 	if (keys.ports)
219 		skb->l4_rxhash = 1;
220 
221 	/* get a consistent hash (same value on both flow directions) */
222 	if (((__force u32)keys.dst < (__force u32)keys.src) ||
223 	    (((__force u32)keys.dst == (__force u32)keys.src) &&
224 	     ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
225 		swap(keys.dst, keys.src);
226 		swap(keys.port16[0], keys.port16[1]);
227 	}
228 
229 	hash = __flow_hash_3words((__force u32)keys.dst,
230 				  (__force u32)keys.src,
231 				  (__force u32)keys.ports);
232 	if (!hash)
233 		hash = 1;
234 
235 	skb->rxhash = hash;
236 }
237 EXPORT_SYMBOL(__skb_get_rxhash);
238 
239 /*
240  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
241  * to be used as a distribution range.
242  */
243 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
244 		  unsigned int num_tx_queues)
245 {
246 	u32 hash;
247 	u16 qoffset = 0;
248 	u16 qcount = num_tx_queues;
249 
250 	if (skb_rx_queue_recorded(skb)) {
251 		hash = skb_get_rx_queue(skb);
252 		while (unlikely(hash >= num_tx_queues))
253 			hash -= num_tx_queues;
254 		return hash;
255 	}
256 
257 	if (dev->num_tc) {
258 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
259 		qoffset = dev->tc_to_txq[tc].offset;
260 		qcount = dev->tc_to_txq[tc].count;
261 	}
262 
263 	if (skb->sk && skb->sk->sk_hash)
264 		hash = skb->sk->sk_hash;
265 	else
266 		hash = (__force u16) skb->protocol;
267 	hash = __flow_hash_1word(hash);
268 
269 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
270 }
271 EXPORT_SYMBOL(__skb_tx_hash);
272 
273 /* __skb_get_poff() returns the offset to the payload as far as it could
274  * be dissected. The main user is currently BPF, so that we can dynamically
275  * truncate packets without needing to push actual payload to the user
276  * space and can analyze headers only, instead.
277  */
278 u32 __skb_get_poff(const struct sk_buff *skb)
279 {
280 	struct flow_keys keys;
281 	u32 poff = 0;
282 
283 	if (!skb_flow_dissect(skb, &keys))
284 		return 0;
285 
286 	poff += keys.thoff;
287 	switch (keys.ip_proto) {
288 	case IPPROTO_TCP: {
289 		const struct tcphdr *tcph;
290 		struct tcphdr _tcph;
291 
292 		tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph);
293 		if (!tcph)
294 			return poff;
295 
296 		poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4);
297 		break;
298 	}
299 	case IPPROTO_UDP:
300 	case IPPROTO_UDPLITE:
301 		poff += sizeof(struct udphdr);
302 		break;
303 	/* For the rest, we do not really care about header
304 	 * extensions at this point for now.
305 	 */
306 	case IPPROTO_ICMP:
307 		poff += sizeof(struct icmphdr);
308 		break;
309 	case IPPROTO_ICMPV6:
310 		poff += sizeof(struct icmp6hdr);
311 		break;
312 	case IPPROTO_IGMP:
313 		poff += sizeof(struct igmphdr);
314 		break;
315 	case IPPROTO_DCCP:
316 		poff += sizeof(struct dccp_hdr);
317 		break;
318 	case IPPROTO_SCTP:
319 		poff += sizeof(struct sctphdr);
320 		break;
321 	}
322 
323 	return poff;
324 }
325 
326 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
327 {
328 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
329 		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
330 				     dev->name, queue_index,
331 				     dev->real_num_tx_queues);
332 		return 0;
333 	}
334 	return queue_index;
335 }
336 
/*
 * Look up a Tx queue for @skb in the device's XPS map for the current CPU.
 *
 * Returns the queue index, or -1 when CONFIG_XPS is off, no map exists
 * for this CPU, or the mapped queue is not below real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int txq = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		txq = cpu_map->queues[0];
	} else {
		/* several candidates: spread flows across them by hash */
		u32 seed;

		if (skb->sk && skb->sk->sk_hash)
			seed = skb->sk->sk_hash;
		else
			seed = (__force u16) skb->protocol ^
			    skb->rxhash;
		seed = __flow_hash_1word(seed);
		txq = cpu_map->queues[((u64)seed * cpu_map->len) >> 32];
	}

	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;

unlock:
	rcu_read_unlock();

	return txq;
#else
	return -1;
#endif
}
374 
375 u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
376 {
377 	struct sock *sk = skb->sk;
378 	int queue_index = sk_tx_queue_get(sk);
379 
380 	if (queue_index < 0 || skb->ooo_okay ||
381 	    queue_index >= dev->real_num_tx_queues) {
382 		int new_index = get_xps_queue(dev, skb);
383 		if (new_index < 0)
384 			new_index = skb_tx_hash(dev, skb);
385 
386 		if (queue_index != new_index && sk &&
387 		    rcu_access_pointer(sk->sk_dst_cache))
388 			sk_tx_queue_set(sk, new_index);
389 
390 		queue_index = new_index;
391 	}
392 
393 	return queue_index;
394 }
395 EXPORT_SYMBOL(__netdev_pick_tx);
396 
397 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
398 				    struct sk_buff *skb)
399 {
400 	int queue_index = 0;
401 
402 	if (dev->real_num_tx_queues != 1) {
403 		const struct net_device_ops *ops = dev->netdev_ops;
404 		if (ops->ndo_select_queue)
405 			queue_index = ops->ndo_select_queue(dev, skb);
406 		else
407 			queue_index = __netdev_pick_tx(dev, skb);
408 		queue_index = dev_cap_txqueue(dev, queue_index);
409 	}
410 
411 	skb_set_queue_mapping(skb, queue_index);
412 	return netdev_get_tx_queue(dev, queue_index);
413 }
414