xref: /openbmc/linux/net/mctp/route.c (revision 68635970)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Management Component Transport Protocol (MCTP) - routing
4  * implementation.
5  *
6  * This is currently based on a simple routing table, with no dst cache. The
7  * number of routes should stay fairly small, so the lookup cost is small.
8  *
9  * Copyright (c) 2021 Code Construct
10  * Copyright (c) 2021 Google
11  */
12 
13 #include <linux/idr.h>
14 #include <linux/mctp.h>
15 #include <linux/netdevice.h>
16 #include <linux/rtnetlink.h>
17 #include <linux/skbuff.h>
18 
19 #include <uapi/linux/if_arp.h>
20 
21 #include <net/mctp.h>
22 #include <net/mctpdevice.h>
23 #include <net/netlink.h>
24 #include <net/sock.h>
25 
/* Upper bound, in bytes, on the length of a message under reassembly;
 * fragments that would take a message past this are rejected.
 */
static const unsigned int mctp_message_maxlen = 64 * 1024;
27 
28 /* route output callbacks */
/* Output callback that drops the packet: frees @skb and reports success.
 * This is the callback installed on freshly-allocated routes.
 */
static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
34 
35 static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
36 {
37 	struct mctp_skb_cb *cb = mctp_cb(skb);
38 	struct mctp_hdr *mh;
39 	struct sock *sk;
40 	u8 type;
41 
42 	WARN_ON(!rcu_read_lock_held());
43 
44 	/* TODO: look up in skb->cb? */
45 	mh = mctp_hdr(skb);
46 
47 	if (!skb_headlen(skb))
48 		return NULL;
49 
50 	type = (*(u8 *)skb->data) & 0x7f;
51 
52 	sk_for_each_rcu(sk, &net->mctp.binds) {
53 		struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
54 
55 		if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net)
56 			continue;
57 
58 		if (msk->bind_type != type)
59 			continue;
60 
61 		if (msk->bind_addr != MCTP_ADDR_ANY &&
62 		    msk->bind_addr != mh->dest)
63 			continue;
64 
65 		return msk;
66 	}
67 
68 	return NULL;
69 }
70 
71 static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local,
72 			   mctp_eid_t peer, u8 tag)
73 {
74 	if (key->local_addr != local)
75 		return false;
76 
77 	if (key->peer_addr != peer)
78 		return false;
79 
80 	if (key->tag != tag)
81 		return false;
82 
83 	return true;
84 }
85 
86 /* returns a key (with key->lock held, and refcounted), or NULL if no such
87  * key exists.
88  */
89 static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb,
90 					   mctp_eid_t peer,
91 					   unsigned long *irqflags)
92 	__acquires(&key->lock)
93 {
94 	struct mctp_sk_key *key, *ret;
95 	unsigned long flags;
96 	struct mctp_hdr *mh;
97 	u8 tag;
98 
99 	mh = mctp_hdr(skb);
100 	tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);
101 
102 	ret = NULL;
103 	spin_lock_irqsave(&net->mctp.keys_lock, flags);
104 
105 	hlist_for_each_entry(key, &net->mctp.keys, hlist) {
106 		if (!mctp_key_match(key, mh->dest, peer, tag))
107 			continue;
108 
109 		spin_lock(&key->lock);
110 		if (key->valid) {
111 			refcount_inc(&key->refs);
112 			ret = key;
113 			break;
114 		}
115 		spin_unlock(&key->lock);
116 	}
117 
118 	if (ret) {
119 		spin_unlock(&net->mctp.keys_lock);
120 		*irqflags = flags;
121 	} else {
122 		spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
123 	}
124 
125 	return ret;
126 }
127 
128 static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk,
129 					  mctp_eid_t local, mctp_eid_t peer,
130 					  u8 tag, gfp_t gfp)
131 {
132 	struct mctp_sk_key *key;
133 
134 	key = kzalloc(sizeof(*key), gfp);
135 	if (!key)
136 		return NULL;
137 
138 	key->peer_addr = peer;
139 	key->local_addr = local;
140 	key->tag = tag;
141 	key->sk = &msk->sk;
142 	key->valid = true;
143 	spin_lock_init(&key->lock);
144 	refcount_set(&key->refs, 1);
145 
146 	return key;
147 }
148 
149 void mctp_key_unref(struct mctp_sk_key *key)
150 {
151 	if (refcount_dec_and_test(&key->refs))
152 		kfree(key);
153 }
154 
155 static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk)
156 {
157 	struct net *net = sock_net(&msk->sk);
158 	struct mctp_sk_key *tmp;
159 	unsigned long flags;
160 	int rc = 0;
161 
162 	spin_lock_irqsave(&net->mctp.keys_lock, flags);
163 
164 	hlist_for_each_entry(tmp, &net->mctp.keys, hlist) {
165 		if (mctp_key_match(tmp, key->local_addr, key->peer_addr,
166 				   key->tag)) {
167 			spin_lock(&tmp->lock);
168 			if (tmp->valid)
169 				rc = -EEXIST;
170 			spin_unlock(&tmp->lock);
171 			if (rc)
172 				break;
173 		}
174 	}
175 
176 	if (!rc) {
177 		refcount_inc(&key->refs);
178 		hlist_add_head(&key->hlist, &net->mctp.keys);
179 		hlist_add_head(&key->sklist, &msk->keys);
180 	}
181 
182 	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
183 
184 	return rc;
185 }
186 
187 /* We're done with the key; unset valid and remove from lists. There may still
188  * be outstanding refs on the key though...
189  */
190 static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net,
191 				   unsigned long flags)
192 	__releases(&key->lock)
193 {
194 	struct sk_buff *skb;
195 
196 	skb = key->reasm_head;
197 	key->reasm_head = NULL;
198 	key->reasm_dead = true;
199 	key->valid = false;
200 	spin_unlock_irqrestore(&key->lock, flags);
201 
202 	spin_lock_irqsave(&net->mctp.keys_lock, flags);
203 	hlist_del(&key->hlist);
204 	hlist_del(&key->sklist);
205 	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
206 
207 	/* one unref for the lists */
208 	mctp_key_unref(key);
209 
210 	/* and one for the local reference */
211 	mctp_key_unref(key);
212 
213 	if (skb)
214 		kfree_skb(skb);
215 
216 }
217 
218 static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
219 {
220 	struct mctp_hdr *hdr = mctp_hdr(skb);
221 	u8 exp_seq, this_seq;
222 
223 	this_seq = (hdr->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT)
224 		& MCTP_HDR_SEQ_MASK;
225 
226 	if (!key->reasm_head) {
227 		key->reasm_head = skb;
228 		key->reasm_tailp = &(skb_shinfo(skb)->frag_list);
229 		key->last_seq = this_seq;
230 		return 0;
231 	}
232 
233 	exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK;
234 
235 	if (this_seq != exp_seq)
236 		return -EINVAL;
237 
238 	if (key->reasm_head->len + skb->len > mctp_message_maxlen)
239 		return -EINVAL;
240 
241 	skb->next = NULL;
242 	skb->sk = NULL;
243 	*key->reasm_tailp = skb;
244 	key->reasm_tailp = &skb->next;
245 
246 	key->last_seq = this_seq;
247 
248 	key->reasm_head->data_len += skb->len;
249 	key->reasm_head->len += skb->len;
250 	key->reasm_head->truesize += skb->truesize;
251 
252 	return 0;
253 }
254 
255 static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
256 {
257 	struct net *net = dev_net(skb->dev);
258 	struct mctp_sk_key *key;
259 	struct mctp_sock *msk;
260 	struct mctp_hdr *mh;
261 	unsigned long f;
262 	u8 tag, flags;
263 	int rc;
264 
265 	msk = NULL;
266 	rc = -EINVAL;
267 
268 	/* we may be receiving a locally-routed packet; drop source sk
269 	 * accounting
270 	 */
271 	skb_orphan(skb);
272 
273 	/* ensure we have enough data for a header and a type */
274 	if (skb->len < sizeof(struct mctp_hdr) + 1)
275 		goto out;
276 
277 	/* grab header, advance data ptr */
278 	mh = mctp_hdr(skb);
279 	skb_pull(skb, sizeof(struct mctp_hdr));
280 
281 	if (mh->ver != 1)
282 		goto out;
283 
284 	flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM);
285 	tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);
286 
287 	rcu_read_lock();
288 
289 	/* lookup socket / reasm context, exactly matching (src,dest,tag).
290 	 * we hold a ref on the key, and key->lock held.
291 	 */
292 	key = mctp_lookup_key(net, skb, mh->src, &f);
293 
294 	if (flags & MCTP_HDR_FLAG_SOM) {
295 		if (key) {
296 			msk = container_of(key->sk, struct mctp_sock, sk);
297 		} else {
298 			/* first response to a broadcast? do a more general
299 			 * key lookup to find the socket, but don't use this
300 			 * key for reassembly - we'll create a more specific
301 			 * one for future packets if required (ie, !EOM).
302 			 */
303 			key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f);
304 			if (key) {
305 				msk = container_of(key->sk,
306 						   struct mctp_sock, sk);
307 				spin_unlock_irqrestore(&key->lock, f);
308 				mctp_key_unref(key);
309 				key = NULL;
310 			}
311 		}
312 
313 		if (!key && !msk && (tag & MCTP_HDR_FLAG_TO))
314 			msk = mctp_lookup_bind(net, skb);
315 
316 		if (!msk) {
317 			rc = -ENOENT;
318 			goto out_unlock;
319 		}
320 
321 		/* single-packet message? deliver to socket, clean up any
322 		 * pending key.
323 		 */
324 		if (flags & MCTP_HDR_FLAG_EOM) {
325 			sock_queue_rcv_skb(&msk->sk, skb);
326 			if (key) {
327 				/* we've hit a pending reassembly; not much we
328 				 * can do but drop it
329 				 */
330 				__mctp_key_unlock_drop(key, net, f);
331 				key = NULL;
332 			}
333 			rc = 0;
334 			goto out_unlock;
335 		}
336 
337 		/* broadcast response or a bind() - create a key for further
338 		 * packets for this message
339 		 */
340 		if (!key) {
341 			key = mctp_key_alloc(msk, mh->dest, mh->src,
342 					     tag, GFP_ATOMIC);
343 			if (!key) {
344 				rc = -ENOMEM;
345 				goto out_unlock;
346 			}
347 
348 			/* we can queue without the key lock here, as the
349 			 * key isn't observable yet
350 			 */
351 			mctp_frag_queue(key, skb);
352 
353 			/* if the key_add fails, we've raced with another
354 			 * SOM packet with the same src, dest and tag. There's
355 			 * no way to distinguish future packets, so all we
356 			 * can do is drop; we'll free the skb on exit from
357 			 * this function.
358 			 */
359 			rc = mctp_key_add(key, msk);
360 			if (rc)
361 				kfree(key);
362 
363 			/* we don't need to release key->lock on exit */
364 			key = NULL;
365 
366 		} else {
367 			if (key->reasm_head || key->reasm_dead) {
368 				/* duplicate start? drop everything */
369 				__mctp_key_unlock_drop(key, net, f);
370 				rc = -EEXIST;
371 				key = NULL;
372 			} else {
373 				rc = mctp_frag_queue(key, skb);
374 			}
375 		}
376 
377 	} else if (key) {
378 		/* this packet continues a previous message; reassemble
379 		 * using the message-specific key
380 		 */
381 
382 		/* we need to be continuing an existing reassembly... */
383 		if (!key->reasm_head)
384 			rc = -EINVAL;
385 		else
386 			rc = mctp_frag_queue(key, skb);
387 
388 		/* end of message? deliver to socket, and we're done with
389 		 * the reassembly/response key
390 		 */
391 		if (!rc && flags & MCTP_HDR_FLAG_EOM) {
392 			sock_queue_rcv_skb(key->sk, key->reasm_head);
393 			key->reasm_head = NULL;
394 			__mctp_key_unlock_drop(key, net, f);
395 			key = NULL;
396 		}
397 
398 	} else {
399 		/* not a start, no matching key */
400 		rc = -ENOENT;
401 	}
402 
403 out_unlock:
404 	rcu_read_unlock();
405 	if (key) {
406 		spin_unlock_irqrestore(&key->lock, f);
407 		mctp_key_unref(key);
408 	}
409 out:
410 	if (rc)
411 		kfree_skb(skb);
412 	return rc;
413 }
414 
415 static unsigned int mctp_route_mtu(struct mctp_route *rt)
416 {
417 	return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu);
418 }
419 
420 static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
421 {
422 	struct mctp_hdr *hdr = mctp_hdr(skb);
423 	char daddr_buf[MAX_ADDR_LEN];
424 	char *daddr = NULL;
425 	unsigned int mtu;
426 	int rc;
427 
428 	skb->protocol = htons(ETH_P_MCTP);
429 
430 	mtu = READ_ONCE(skb->dev->mtu);
431 	if (skb->len > mtu) {
432 		kfree_skb(skb);
433 		return -EMSGSIZE;
434 	}
435 
436 	/* If lookup fails let the device handle daddr==NULL */
437 	if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0)
438 		daddr = daddr_buf;
439 
440 	rc = dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
441 			     daddr, skb->dev->dev_addr, skb->len);
442 	if (rc) {
443 		kfree_skb(skb);
444 		return -EHOSTUNREACH;
445 	}
446 
447 	rc = dev_queue_xmit(skb);
448 	if (rc)
449 		rc = net_xmit_errno(rc);
450 
451 	return rc;
452 }
453 
454 /* route alloc/release */
455 static void mctp_route_release(struct mctp_route *rt)
456 {
457 	if (refcount_dec_and_test(&rt->refs)) {
458 		mctp_dev_put(rt->dev);
459 		kfree_rcu(rt, rcu);
460 	}
461 }
462 
463 /* returns a route with the refcount at 1 */
464 static struct mctp_route *mctp_route_alloc(void)
465 {
466 	struct mctp_route *rt;
467 
468 	rt = kzalloc(sizeof(*rt), GFP_KERNEL);
469 	if (!rt)
470 		return NULL;
471 
472 	INIT_LIST_HEAD(&rt->list);
473 	refcount_set(&rt->refs, 1);
474 	rt->output = mctp_route_discard;
475 
476 	return rt;
477 }
478 
479 unsigned int mctp_default_net(struct net *net)
480 {
481 	return READ_ONCE(net->mctp.default_net);
482 }
483 
484 int mctp_default_net_set(struct net *net, unsigned int index)
485 {
486 	if (index == 0)
487 		return -EINVAL;
488 	WRITE_ONCE(net->mctp.default_net, index);
489 	return 0;
490 }
491 
492 /* tag management */
493 static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
494 			     struct mctp_sock *msk)
495 {
496 	struct netns_mctp *mns = &net->mctp;
497 
498 	lockdep_assert_held(&mns->keys_lock);
499 
500 	/* we hold the net->key_lock here, allowing updates to both
501 	 * then net and sk
502 	 */
503 	hlist_add_head_rcu(&key->hlist, &mns->keys);
504 	hlist_add_head_rcu(&key->sklist, &msk->keys);
505 	refcount_inc(&key->refs);
506 }
507 
508 /* Allocate a locally-owned tag value for (saddr, daddr), and reserve
509  * it for the socket msk
510  */
511 static int mctp_alloc_local_tag(struct mctp_sock *msk,
512 				mctp_eid_t saddr, mctp_eid_t daddr, u8 *tagp)
513 {
514 	struct net *net = sock_net(&msk->sk);
515 	struct netns_mctp *mns = &net->mctp;
516 	struct mctp_sk_key *key, *tmp;
517 	unsigned long flags;
518 	int rc = -EAGAIN;
519 	u8 tagbits;
520 
521 	/* for NULL destination EIDs, we may get a response from any peer */
522 	if (daddr == MCTP_ADDR_NULL)
523 		daddr = MCTP_ADDR_ANY;
524 
525 	/* be optimistic, alloc now */
526 	key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL);
527 	if (!key)
528 		return -ENOMEM;
529 
530 	/* 8 possible tag values */
531 	tagbits = 0xff;
532 
533 	spin_lock_irqsave(&mns->keys_lock, flags);
534 
535 	/* Walk through the existing keys, looking for potential conflicting
536 	 * tags. If we find a conflict, clear that bit from tagbits
537 	 */
538 	hlist_for_each_entry(tmp, &mns->keys, hlist) {
539 		/* We can check the lookup fields (*_addr, tag) without the
540 		 * lock held, they don't change over the lifetime of the key.
541 		 */
542 
543 		/* if we don't own the tag, it can't conflict */
544 		if (tmp->tag & MCTP_HDR_FLAG_TO)
545 			continue;
546 
547 		if (!((tmp->peer_addr == daddr ||
548 		       tmp->peer_addr == MCTP_ADDR_ANY) &&
549 		       tmp->local_addr == saddr))
550 			continue;
551 
552 		spin_lock(&tmp->lock);
553 		/* key must still be valid. If we find a match, clear the
554 		 * potential tag value
555 		 */
556 		if (tmp->valid)
557 			tagbits &= ~(1 << tmp->tag);
558 		spin_unlock(&tmp->lock);
559 
560 		if (!tagbits)
561 			break;
562 	}
563 
564 	if (tagbits) {
565 		key->tag = __ffs(tagbits);
566 		mctp_reserve_tag(net, key, msk);
567 		*tagp = key->tag;
568 		rc = 0;
569 	}
570 
571 	spin_unlock_irqrestore(&mns->keys_lock, flags);
572 
573 	if (!tagbits)
574 		kfree(key);
575 
576 	return rc;
577 }
578 
579 /* routing lookups */
580 static bool mctp_rt_match_eid(struct mctp_route *rt,
581 			      unsigned int net, mctp_eid_t eid)
582 {
583 	return READ_ONCE(rt->dev->net) == net &&
584 		rt->min <= eid && rt->max >= eid;
585 }
586 
587 /* compares match, used for duplicate prevention */
588 static bool mctp_rt_compare_exact(struct mctp_route *rt1,
589 				  struct mctp_route *rt2)
590 {
591 	ASSERT_RTNL();
592 	return rt1->dev->net == rt2->dev->net &&
593 		rt1->min == rt2->min &&
594 		rt1->max == rt2->max;
595 }
596 
597 struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet,
598 				     mctp_eid_t daddr)
599 {
600 	struct mctp_route *tmp, *rt = NULL;
601 
602 	list_for_each_entry_rcu(tmp, &net->mctp.routes, list) {
603 		/* TODO: add metrics */
604 		if (mctp_rt_match_eid(tmp, dnet, daddr)) {
605 			if (refcount_inc_not_zero(&tmp->refs)) {
606 				rt = tmp;
607 				break;
608 			}
609 		}
610 	}
611 
612 	return rt;
613 }
614 
615 static struct mctp_route *mctp_route_lookup_null(struct net *net,
616 						 struct net_device *dev)
617 {
618 	struct mctp_route *rt;
619 
620 	list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
621 		if (rt->dev->dev == dev && rt->type == RTN_LOCAL &&
622 		    refcount_inc_not_zero(&rt->refs))
623 			return rt;
624 	}
625 
626 	return NULL;
627 }
628 
629 /* sends a skb to rt and releases the route. */
630 int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb)
631 {
632 	int rc;
633 
634 	rc = rt->output(rt, skb);
635 	mctp_route_release(rt);
636 	return rc;
637 }
638 
639 static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
640 				  unsigned int mtu, u8 tag)
641 {
642 	const unsigned int hlen = sizeof(struct mctp_hdr);
643 	struct mctp_hdr *hdr, *hdr2;
644 	unsigned int pos, size;
645 	struct sk_buff *skb2;
646 	int rc;
647 	u8 seq;
648 
649 	hdr = mctp_hdr(skb);
650 	seq = 0;
651 	rc = 0;
652 
653 	if (mtu < hlen + 1) {
654 		kfree_skb(skb);
655 		return -EMSGSIZE;
656 	}
657 
658 	/* we've got the header */
659 	skb_pull(skb, hlen);
660 
661 	for (pos = 0; pos < skb->len;) {
662 		/* size of message payload */
663 		size = min(mtu - hlen, skb->len - pos);
664 
665 		skb2 = alloc_skb(MCTP_HEADER_MAXLEN + hlen + size, GFP_KERNEL);
666 		if (!skb2) {
667 			rc = -ENOMEM;
668 			break;
669 		}
670 
671 		/* generic skb copy */
672 		skb2->protocol = skb->protocol;
673 		skb2->priority = skb->priority;
674 		skb2->dev = skb->dev;
675 		memcpy(skb2->cb, skb->cb, sizeof(skb2->cb));
676 
677 		if (skb->sk)
678 			skb_set_owner_w(skb2, skb->sk);
679 
680 		/* establish packet */
681 		skb_reserve(skb2, MCTP_HEADER_MAXLEN);
682 		skb_reset_network_header(skb2);
683 		skb_put(skb2, hlen + size);
684 		skb2->transport_header = skb2->network_header + hlen;
685 
686 		/* copy header fields, calculate SOM/EOM flags & seq */
687 		hdr2 = mctp_hdr(skb2);
688 		hdr2->ver = hdr->ver;
689 		hdr2->dest = hdr->dest;
690 		hdr2->src = hdr->src;
691 		hdr2->flags_seq_tag = tag &
692 			(MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);
693 
694 		if (pos == 0)
695 			hdr2->flags_seq_tag |= MCTP_HDR_FLAG_SOM;
696 
697 		if (pos + size == skb->len)
698 			hdr2->flags_seq_tag |= MCTP_HDR_FLAG_EOM;
699 
700 		hdr2->flags_seq_tag |= seq << MCTP_HDR_SEQ_SHIFT;
701 
702 		/* copy message payload */
703 		skb_copy_bits(skb, pos, skb_transport_header(skb2), size);
704 
705 		/* do route, but don't drop the rt reference */
706 		rc = rt->output(rt, skb2);
707 		if (rc)
708 			break;
709 
710 		seq = (seq + 1) & MCTP_HDR_SEQ_MASK;
711 		pos += size;
712 	}
713 
714 	mctp_route_release(rt);
715 	consume_skb(skb);
716 	return rc;
717 }
718 
719 int mctp_local_output(struct sock *sk, struct mctp_route *rt,
720 		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
721 {
722 	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
723 	struct mctp_skb_cb *cb = mctp_cb(skb);
724 	struct mctp_hdr *hdr;
725 	unsigned long flags;
726 	unsigned int mtu;
727 	mctp_eid_t saddr;
728 	int rc;
729 	u8 tag;
730 
731 	if (WARN_ON(!rt->dev))
732 		return -EINVAL;
733 
734 	spin_lock_irqsave(&rt->dev->addrs_lock, flags);
735 	if (rt->dev->num_addrs == 0) {
736 		rc = -EHOSTUNREACH;
737 	} else {
738 		/* use the outbound interface's first address as our source */
739 		saddr = rt->dev->addrs[0];
740 		rc = 0;
741 	}
742 	spin_unlock_irqrestore(&rt->dev->addrs_lock, flags);
743 
744 	if (rc)
745 		return rc;
746 
747 	if (req_tag & MCTP_HDR_FLAG_TO) {
748 		rc = mctp_alloc_local_tag(msk, saddr, daddr, &tag);
749 		if (rc)
750 			return rc;
751 		tag |= MCTP_HDR_FLAG_TO;
752 	} else {
753 		tag = req_tag;
754 	}
755 
756 
757 	skb->protocol = htons(ETH_P_MCTP);
758 	skb->priority = 0;
759 	skb_reset_transport_header(skb);
760 	skb_push(skb, sizeof(struct mctp_hdr));
761 	skb_reset_network_header(skb);
762 	skb->dev = rt->dev->dev;
763 
764 	/* cb->net will have been set on initial ingress */
765 	cb->src = saddr;
766 
767 	/* set up common header fields */
768 	hdr = mctp_hdr(skb);
769 	hdr->ver = 1;
770 	hdr->dest = daddr;
771 	hdr->src = saddr;
772 
773 	mtu = mctp_route_mtu(rt);
774 
775 	if (skb->len + sizeof(struct mctp_hdr) <= mtu) {
776 		hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM |
777 			tag;
778 		return mctp_do_route(rt, skb);
779 	} else {
780 		return mctp_do_fragment_route(rt, skb, mtu, tag);
781 	}
782 }
783 
784 /* route management */
785 static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
786 			  unsigned int daddr_extent, unsigned int mtu,
787 			  unsigned char type)
788 {
789 	int (*rtfn)(struct mctp_route *rt, struct sk_buff *skb);
790 	struct net *net = dev_net(mdev->dev);
791 	struct mctp_route *rt, *ert;
792 
793 	if (!mctp_address_ok(daddr_start))
794 		return -EINVAL;
795 
796 	if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255)
797 		return -EINVAL;
798 
799 	switch (type) {
800 	case RTN_LOCAL:
801 		rtfn = mctp_route_input;
802 		break;
803 	case RTN_UNICAST:
804 		rtfn = mctp_route_output;
805 		break;
806 	default:
807 		return -EINVAL;
808 	}
809 
810 	rt = mctp_route_alloc();
811 	if (!rt)
812 		return -ENOMEM;
813 
814 	rt->min = daddr_start;
815 	rt->max = daddr_start + daddr_extent;
816 	rt->mtu = mtu;
817 	rt->dev = mdev;
818 	mctp_dev_hold(rt->dev);
819 	rt->type = type;
820 	rt->output = rtfn;
821 
822 	ASSERT_RTNL();
823 	/* Prevent duplicate identical routes. */
824 	list_for_each_entry(ert, &net->mctp.routes, list) {
825 		if (mctp_rt_compare_exact(rt, ert)) {
826 			mctp_route_release(rt);
827 			return -EEXIST;
828 		}
829 	}
830 
831 	list_add_rcu(&rt->list, &net->mctp.routes);
832 
833 	return 0;
834 }
835 
836 static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start,
837 			     unsigned int daddr_extent, unsigned char type)
838 {
839 	struct net *net = dev_net(mdev->dev);
840 	struct mctp_route *rt, *tmp;
841 	mctp_eid_t daddr_end;
842 	bool dropped;
843 
844 	if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255)
845 		return -EINVAL;
846 
847 	daddr_end = daddr_start + daddr_extent;
848 	dropped = false;
849 
850 	ASSERT_RTNL();
851 
852 	list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
853 		if (rt->dev == mdev &&
854 		    rt->min == daddr_start && rt->max == daddr_end &&
855 		    rt->type == type) {
856 			list_del_rcu(&rt->list);
857 			/* TODO: immediate RTM_DELROUTE */
858 			mctp_route_release(rt);
859 			dropped = true;
860 		}
861 	}
862 
863 	return dropped ? 0 : -ENOENT;
864 }
865 
866 int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr)
867 {
868 	return mctp_route_add(mdev, addr, 0, 0, RTN_LOCAL);
869 }
870 
871 int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr)
872 {
873 	return mctp_route_remove(mdev, addr, 0, RTN_LOCAL);
874 }
875 
876 /* removes all entries for a given device */
877 void mctp_route_remove_dev(struct mctp_dev *mdev)
878 {
879 	struct net *net = dev_net(mdev->dev);
880 	struct mctp_route *rt, *tmp;
881 
882 	ASSERT_RTNL();
883 	list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
884 		if (rt->dev == mdev) {
885 			list_del_rcu(&rt->list);
886 			/* TODO: immediate RTM_DELROUTE */
887 			mctp_route_release(rt);
888 		}
889 	}
890 }
891 
892 /* Incoming packet-handling */
893 
894 static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev,
895 				struct packet_type *pt,
896 				struct net_device *orig_dev)
897 {
898 	struct net *net = dev_net(dev);
899 	struct mctp_dev *mdev;
900 	struct mctp_skb_cb *cb;
901 	struct mctp_route *rt;
902 	struct mctp_hdr *mh;
903 
904 	rcu_read_lock();
905 	mdev = __mctp_dev_get(dev);
906 	rcu_read_unlock();
907 	if (!mdev) {
908 		/* basic non-data sanity checks */
909 		goto err_drop;
910 	}
911 
912 	if (!pskb_may_pull(skb, sizeof(struct mctp_hdr)))
913 		goto err_drop;
914 
915 	skb_reset_transport_header(skb);
916 	skb_reset_network_header(skb);
917 
918 	/* We have enough for a header; decode and route */
919 	mh = mctp_hdr(skb);
920 	if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX)
921 		goto err_drop;
922 
923 	cb = __mctp_cb(skb);
924 	cb->net = READ_ONCE(mdev->net);
925 
926 	rt = mctp_route_lookup(net, cb->net, mh->dest);
927 
928 	/* NULL EID, but addressed to our physical address */
929 	if (!rt && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST)
930 		rt = mctp_route_lookup_null(net, dev);
931 
932 	if (!rt)
933 		goto err_drop;
934 
935 	mctp_do_route(rt, skb);
936 
937 	return NET_RX_SUCCESS;
938 
939 err_drop:
940 	kfree_skb(skb);
941 	return NET_RX_DROP;
942 }
943 
944 static struct packet_type mctp_packet_type = {
945 	.type = cpu_to_be16(ETH_P_MCTP),
946 	.func = mctp_pkttype_receive,
947 };
948 
949 /* netlink interface */
950 
951 static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = {
952 	[RTA_DST]		= { .type = NLA_U8 },
953 	[RTA_METRICS]		= { .type = NLA_NESTED },
954 	[RTA_OIF]		= { .type = NLA_U32 },
955 };
956 
957 /* Common part for RTM_NEWROUTE and RTM_DELROUTE parsing.
958  * tb must hold RTA_MAX+1 elements.
959  */
960 static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh,
961 			      struct netlink_ext_ack *extack,
962 			      struct nlattr **tb, struct rtmsg **rtm,
963 			      struct mctp_dev **mdev, mctp_eid_t *daddr_start)
964 {
965 	struct net *net = sock_net(skb->sk);
966 	struct net_device *dev;
967 	unsigned int ifindex;
968 	int rc;
969 
970 	rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX,
971 			 rta_mctp_policy, extack);
972 	if (rc < 0) {
973 		NL_SET_ERR_MSG(extack, "incorrect format");
974 		return rc;
975 	}
976 
977 	if (!tb[RTA_DST]) {
978 		NL_SET_ERR_MSG(extack, "dst EID missing");
979 		return -EINVAL;
980 	}
981 	*daddr_start = nla_get_u8(tb[RTA_DST]);
982 
983 	if (!tb[RTA_OIF]) {
984 		NL_SET_ERR_MSG(extack, "ifindex missing");
985 		return -EINVAL;
986 	}
987 	ifindex = nla_get_u32(tb[RTA_OIF]);
988 
989 	*rtm = nlmsg_data(nlh);
990 	if ((*rtm)->rtm_family != AF_MCTP) {
991 		NL_SET_ERR_MSG(extack, "route family must be AF_MCTP");
992 		return -EINVAL;
993 	}
994 
995 	dev = __dev_get_by_index(net, ifindex);
996 	if (!dev) {
997 		NL_SET_ERR_MSG(extack, "bad ifindex");
998 		return -ENODEV;
999 	}
1000 	*mdev = mctp_dev_get_rtnl(dev);
1001 	if (!*mdev)
1002 		return -ENODEV;
1003 
1004 	if (dev->flags & IFF_LOOPBACK) {
1005 		NL_SET_ERR_MSG(extack, "no routes to loopback");
1006 		return -EINVAL;
1007 	}
1008 
1009 	return 0;
1010 }
1011 
1012 static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
1013 			 struct netlink_ext_ack *extack)
1014 {
1015 	struct nlattr *tb[RTA_MAX + 1];
1016 	mctp_eid_t daddr_start;
1017 	struct mctp_dev *mdev;
1018 	struct rtmsg *rtm;
1019 	unsigned int mtu;
1020 	int rc;
1021 
1022 	rc = mctp_route_nlparse(skb, nlh, extack, tb,
1023 				&rtm, &mdev, &daddr_start);
1024 	if (rc < 0)
1025 		return rc;
1026 
1027 	if (rtm->rtm_type != RTN_UNICAST) {
1028 		NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST");
1029 		return -EINVAL;
1030 	}
1031 
1032 	/* TODO: parse mtu from nlparse */
1033 	mtu = 0;
1034 
1035 	if (rtm->rtm_type != RTN_UNICAST)
1036 		return -EINVAL;
1037 
1038 	rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu,
1039 			    rtm->rtm_type);
1040 	return rc;
1041 }
1042 
1043 static int mctp_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
1044 			 struct netlink_ext_ack *extack)
1045 {
1046 	struct nlattr *tb[RTA_MAX + 1];
1047 	mctp_eid_t daddr_start;
1048 	struct mctp_dev *mdev;
1049 	struct rtmsg *rtm;
1050 	int rc;
1051 
1052 	rc = mctp_route_nlparse(skb, nlh, extack, tb,
1053 				&rtm, &mdev, &daddr_start);
1054 	if (rc < 0)
1055 		return rc;
1056 
1057 	/* we only have unicast routes */
1058 	if (rtm->rtm_type != RTN_UNICAST)
1059 		return -EINVAL;
1060 
1061 	rc = mctp_route_remove(mdev, daddr_start, rtm->rtm_dst_len, RTN_UNICAST);
1062 	return rc;
1063 }
1064 
1065 static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt,
1066 			    u32 portid, u32 seq, int event, unsigned int flags)
1067 {
1068 	struct nlmsghdr *nlh;
1069 	struct rtmsg *hdr;
1070 	void *metrics;
1071 
1072 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
1073 	if (!nlh)
1074 		return -EMSGSIZE;
1075 
1076 	hdr = nlmsg_data(nlh);
1077 	hdr->rtm_family = AF_MCTP;
1078 
1079 	/* we use the _len fields as a number of EIDs, rather than
1080 	 * a number of bits in the address
1081 	 */
1082 	hdr->rtm_dst_len = rt->max - rt->min;
1083 	hdr->rtm_src_len = 0;
1084 	hdr->rtm_tos = 0;
1085 	hdr->rtm_table = RT_TABLE_DEFAULT;
1086 	hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */
1087 	hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */
1088 	hdr->rtm_type = rt->type;
1089 
1090 	if (nla_put_u8(skb, RTA_DST, rt->min))
1091 		goto cancel;
1092 
1093 	metrics = nla_nest_start_noflag(skb, RTA_METRICS);
1094 	if (!metrics)
1095 		goto cancel;
1096 
1097 	if (rt->mtu) {
1098 		if (nla_put_u32(skb, RTAX_MTU, rt->mtu))
1099 			goto cancel;
1100 	}
1101 
1102 	nla_nest_end(skb, metrics);
1103 
1104 	if (rt->dev) {
1105 		if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex))
1106 			goto cancel;
1107 	}
1108 
1109 	/* TODO: conditional neighbour physaddr? */
1110 
1111 	nlmsg_end(skb, nlh);
1112 
1113 	return 0;
1114 
1115 cancel:
1116 	nlmsg_cancel(skb, nlh);
1117 	return -EMSGSIZE;
1118 }
1119 
1120 static int mctp_dump_rtinfo(struct sk_buff *skb, struct netlink_callback *cb)
1121 {
1122 	struct net *net = sock_net(skb->sk);
1123 	struct mctp_route *rt;
1124 	int s_idx, idx;
1125 
1126 	/* TODO: allow filtering on route data, possibly under
1127 	 * cb->strict_check
1128 	 */
1129 
1130 	/* TODO: change to struct overlay */
1131 	s_idx = cb->args[0];
1132 	idx = 0;
1133 
1134 	rcu_read_lock();
1135 	list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
1136 		if (idx++ < s_idx)
1137 			continue;
1138 		if (mctp_fill_rtinfo(skb, rt,
1139 				     NETLINK_CB(cb->skb).portid,
1140 				     cb->nlh->nlmsg_seq,
1141 				     RTM_NEWROUTE, NLM_F_MULTI) < 0)
1142 			break;
1143 	}
1144 
1145 	rcu_read_unlock();
1146 	cb->args[0] = idx;
1147 
1148 	return skb->len;
1149 }
1150 
1151 /* net namespace implementation */
1152 static int __net_init mctp_routes_net_init(struct net *net)
1153 {
1154 	struct netns_mctp *ns = &net->mctp;
1155 
1156 	INIT_LIST_HEAD(&ns->routes);
1157 	INIT_HLIST_HEAD(&ns->binds);
1158 	mutex_init(&ns->bind_lock);
1159 	INIT_HLIST_HEAD(&ns->keys);
1160 	spin_lock_init(&ns->keys_lock);
1161 	WARN_ON(mctp_default_net_set(net, MCTP_INITIAL_DEFAULT_NET));
1162 	return 0;
1163 }
1164 
1165 static void __net_exit mctp_routes_net_exit(struct net *net)
1166 {
1167 	struct mctp_route *rt;
1168 
1169 	rcu_read_lock();
1170 	list_for_each_entry_rcu(rt, &net->mctp.routes, list)
1171 		mctp_route_release(rt);
1172 	rcu_read_unlock();
1173 }
1174 
1175 static struct pernet_operations mctp_net_ops = {
1176 	.init = mctp_routes_net_init,
1177 	.exit = mctp_routes_net_exit,
1178 };
1179 
1180 int __init mctp_routes_init(void)
1181 {
1182 	dev_add_pack(&mctp_packet_type);
1183 
1184 	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETROUTE,
1185 			     NULL, mctp_dump_rtinfo, 0);
1186 	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWROUTE,
1187 			     mctp_newroute, NULL, 0);
1188 	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELROUTE,
1189 			     mctp_delroute, NULL, 0);
1190 
1191 	return register_pernet_subsys(&mctp_net_ops);
1192 }
1193 
1194 void __exit mctp_routes_exit(void)
1195 {
1196 	unregister_pernet_subsys(&mctp_net_ops);
1197 	rtnl_unregister(PF_MCTP, RTM_DELROUTE);
1198 	rtnl_unregister(PF_MCTP, RTM_NEWROUTE);
1199 	rtnl_unregister(PF_MCTP, RTM_GETROUTE);
1200 	dev_remove_pack(&mctp_packet_type);
1201 }
1202