xref: /openbmc/linux/net/mptcp/pm.c (revision 7c2435ef)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2019, Intel Corporation.
5  */
6 #define pr_fmt(fmt) "MPTCP: " fmt
7 
8 #include <linux/kernel.h>
9 #include <net/tcp.h>
10 #include <net/mptcp.h>
11 #include "protocol.h"
12 
13 #include "mib.h"
14 
15 /* path manager command handlers */
16 
17 int mptcp_pm_announce_addr(struct mptcp_sock *msk,
18 			   const struct mptcp_addr_info *addr,
19 			   bool echo)
20 {
21 	u8 add_addr = READ_ONCE(msk->pm.addr_signal);
22 
23 	pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo);
24 
25 	lockdep_assert_held(&msk->pm.lock);
26 
27 	if (add_addr &
28 	    (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) {
29 		MPTCP_INC_STATS(sock_net((struct sock *)msk),
30 				echo ? MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP);
31 		return -EINVAL;
32 	}
33 
34 	if (echo) {
35 		msk->pm.remote = *addr;
36 		add_addr |= BIT(MPTCP_ADD_ADDR_ECHO);
37 	} else {
38 		msk->pm.local = *addr;
39 		add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL);
40 	}
41 	WRITE_ONCE(msk->pm.addr_signal, add_addr);
42 	return 0;
43 }
44 
45 int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
46 {
47 	u8 rm_addr = READ_ONCE(msk->pm.addr_signal);
48 
49 	pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
50 
51 	if (rm_addr) {
52 		MPTCP_ADD_STATS(sock_net((struct sock *)msk),
53 				MPTCP_MIB_RMADDRTXDROP, rm_list->nr);
54 		return -EINVAL;
55 	}
56 
57 	msk->pm.rm_list_tx = *rm_list;
58 	rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL);
59 	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
60 	mptcp_pm_nl_addr_send_ack(msk);
61 	return 0;
62 }
63 
64 int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
65 {
66 	pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
67 
68 	spin_lock_bh(&msk->pm.lock);
69 	mptcp_pm_nl_rm_subflow_received(msk, rm_list);
70 	spin_unlock_bh(&msk->pm.lock);
71 	return 0;
72 }
73 
74 /* path manager event handlers */
75 
76 void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side)
77 {
78 	struct mptcp_pm_data *pm = &msk->pm;
79 
80 	pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);
81 
82 	WRITE_ONCE(pm->server_side, server_side);
83 	mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
84 }
85 
86 bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
87 {
88 	struct mptcp_pm_data *pm = &msk->pm;
89 	unsigned int subflows_max;
90 	int ret = 0;
91 
92 	if (mptcp_pm_is_userspace(msk))
93 		return mptcp_userspace_pm_active(msk);
94 
95 	subflows_max = mptcp_pm_get_subflows_max(msk);
96 
97 	pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
98 		 subflows_max, READ_ONCE(pm->accept_subflow));
99 
100 	/* try to avoid acquiring the lock below */
101 	if (!READ_ONCE(pm->accept_subflow))
102 		return false;
103 
104 	spin_lock_bh(&pm->lock);
105 	if (READ_ONCE(pm->accept_subflow)) {
106 		ret = pm->subflows < subflows_max;
107 		if (ret && ++pm->subflows == subflows_max)
108 			WRITE_ONCE(pm->accept_subflow, false);
109 	}
110 	spin_unlock_bh(&pm->lock);
111 
112 	return ret;
113 }
114 
115 /* return true if the new status bit is currently cleared, that is, this event
116  * can be server, eventually by an already scheduled work
117  */
118 static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
119 				   enum mptcp_pm_status new_status)
120 {
121 	pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
122 		 BIT(new_status));
123 	if (msk->pm.status & BIT(new_status))
124 		return false;
125 
126 	msk->pm.status |= BIT(new_status);
127 	mptcp_schedule_work((struct sock *)msk);
128 	return true;
129 }
130 
131 void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
132 {
133 	struct mptcp_pm_data *pm = &msk->pm;
134 	bool announce = false;
135 
136 	pr_debug("msk=%p", msk);
137 
138 	spin_lock_bh(&pm->lock);
139 
140 	/* mptcp_pm_fully_established() can be invoked by multiple
141 	 * racing paths - accept() and check_fully_established()
142 	 * be sure to serve this event only once.
143 	 */
144 	if (READ_ONCE(pm->work_pending) &&
145 	    !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
146 		mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
147 
148 	if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
149 		announce = true;
150 
151 	msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
152 	spin_unlock_bh(&pm->lock);
153 
154 	if (announce)
155 		mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC);
156 }
157 
/* Connection-closed hook: currently only emits a debug message. */
void mptcp_pm_connection_closed(struct mptcp_sock *msk)
{
	pr_debug("msk=%p\n", msk);
}
162 
163 void mptcp_pm_subflow_established(struct mptcp_sock *msk)
164 {
165 	struct mptcp_pm_data *pm = &msk->pm;
166 
167 	pr_debug("msk=%p", msk);
168 
169 	if (!READ_ONCE(pm->work_pending))
170 		return;
171 
172 	spin_lock_bh(&pm->lock);
173 
174 	if (READ_ONCE(pm->work_pending))
175 		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
176 
177 	spin_unlock_bh(&pm->lock);
178 }
179 
180 void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
181 				 const struct mptcp_subflow_context *subflow)
182 {
183 	struct mptcp_pm_data *pm = &msk->pm;
184 	bool update_subflows;
185 
186 	update_subflows = (subflow->request_join || subflow->mp_join) &&
187 			  mptcp_pm_is_kernel(msk);
188 	if (!READ_ONCE(pm->work_pending) && !update_subflows)
189 		return;
190 
191 	spin_lock_bh(&pm->lock);
192 	if (update_subflows)
193 		__mptcp_pm_close_subflow(msk);
194 
195 	/* Even if this subflow is not really established, tell the PM to try
196 	 * to pick the next ones, if possible.
197 	 */
198 	if (mptcp_pm_nl_check_work_pending(msk))
199 		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
200 
201 	spin_unlock_bh(&pm->lock);
202 }
203 
204 void mptcp_pm_add_addr_received(const struct sock *ssk,
205 				const struct mptcp_addr_info *addr)
206 {
207 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
208 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
209 	struct mptcp_pm_data *pm = &msk->pm;
210 
211 	pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
212 		 READ_ONCE(pm->accept_addr));
213 
214 	mptcp_event_addr_announced(ssk, addr);
215 
216 	spin_lock_bh(&pm->lock);
217 
218 	if (mptcp_pm_is_userspace(msk)) {
219 		if (mptcp_userspace_pm_active(msk)) {
220 			mptcp_pm_announce_addr(msk, addr, true);
221 			mptcp_pm_add_addr_send_ack(msk);
222 		} else {
223 			__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
224 		}
225 	} else if (!READ_ONCE(pm->accept_addr)) {
226 		mptcp_pm_announce_addr(msk, addr, true);
227 		mptcp_pm_add_addr_send_ack(msk);
228 	} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
229 		pm->remote = *addr;
230 	} else {
231 		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
232 	}
233 
234 	spin_unlock_bh(&pm->lock);
235 }
236 
237 void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
238 			      const struct mptcp_addr_info *addr)
239 {
240 	struct mptcp_pm_data *pm = &msk->pm;
241 
242 	pr_debug("msk=%p", msk);
243 
244 	spin_lock_bh(&pm->lock);
245 
246 	if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending))
247 		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
248 
249 	spin_unlock_bh(&pm->lock);
250 }
251 
252 void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
253 {
254 	if (!mptcp_pm_should_add_signal(msk))
255 		return;
256 
257 	mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
258 }
259 
260 void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
261 			       const struct mptcp_rm_list *rm_list)
262 {
263 	struct mptcp_pm_data *pm = &msk->pm;
264 	u8 i;
265 
266 	pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr);
267 
268 	for (i = 0; i < rm_list->nr; i++)
269 		mptcp_event_addr_removed(msk, rm_list->ids[i]);
270 
271 	spin_lock_bh(&pm->lock);
272 	if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED))
273 		pm->rm_list_rx = *rm_list;
274 	else
275 		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP);
276 	spin_unlock_bh(&pm->lock);
277 }
278 
279 void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
280 {
281 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
282 	struct sock *sk = subflow->conn;
283 	struct mptcp_sock *msk;
284 
285 	pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
286 	msk = mptcp_sk(sk);
287 	if (subflow->backup != bkup) {
288 		subflow->backup = bkup;
289 		mptcp_data_lock(sk);
290 		if (!sock_owned_by_user(sk))
291 			msk->last_snd = NULL;
292 		else
293 			__set_bit(MPTCP_RESET_SCHEDULER,  &msk->cb_flags);
294 		mptcp_data_unlock(sk);
295 	}
296 
297 	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
298 }
299 
300 void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
301 {
302 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
303 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
304 
305 	pr_debug("fail_seq=%llu", fail_seq);
306 
307 	if (!READ_ONCE(msk->allow_infinite_fallback))
308 		return;
309 
310 	if (!subflow->fail_tout) {
311 		pr_debug("send MP_FAIL response and infinite map");
312 
313 		subflow->send_mp_fail = 1;
314 		subflow->send_infinite_map = 1;
315 		tcp_send_ack(sk);
316 	} else {
317 		pr_debug("MP_FAIL response received");
318 		WRITE_ONCE(subflow->fail_tout, 0);
319 	}
320 }
321 
322 /* path manager helpers */
323 
324 bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
325 			      unsigned int opt_size, unsigned int remaining,
326 			      struct mptcp_addr_info *addr, bool *echo,
327 			      bool *drop_other_suboptions)
328 {
329 	int ret = false;
330 	u8 add_addr;
331 	u8 family;
332 	bool port;
333 
334 	spin_lock_bh(&msk->pm.lock);
335 
336 	/* double check after the lock is acquired */
337 	if (!mptcp_pm_should_add_signal(msk))
338 		goto out_unlock;
339 
340 	/* always drop every other options for pure ack ADD_ADDR; this is a
341 	 * plain dup-ack from TCP perspective. The other MPTCP-relevant info,
342 	 * if any, will be carried by the 'original' TCP ack
343 	 */
344 	if (skb && skb_is_tcp_pure_ack(skb)) {
345 		remaining += opt_size;
346 		*drop_other_suboptions = true;
347 	}
348 
349 	*echo = mptcp_pm_should_add_signal_echo(msk);
350 	port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);
351 
352 	family = *echo ? msk->pm.remote.family : msk->pm.local.family;
353 	if (remaining < mptcp_add_addr_len(family, *echo, port))
354 		goto out_unlock;
355 
356 	if (*echo) {
357 		*addr = msk->pm.remote;
358 		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO);
359 	} else {
360 		*addr = msk->pm.local;
361 		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL);
362 	}
363 	WRITE_ONCE(msk->pm.addr_signal, add_addr);
364 	ret = true;
365 
366 out_unlock:
367 	spin_unlock_bh(&msk->pm.lock);
368 	return ret;
369 }
370 
371 bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
372 			     struct mptcp_rm_list *rm_list)
373 {
374 	int ret = false, len;
375 	u8 rm_addr;
376 
377 	spin_lock_bh(&msk->pm.lock);
378 
379 	/* double check after the lock is acquired */
380 	if (!mptcp_pm_should_rm_signal(msk))
381 		goto out_unlock;
382 
383 	rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL);
384 	len = mptcp_rm_addr_len(&msk->pm.rm_list_tx);
385 	if (len < 0) {
386 		WRITE_ONCE(msk->pm.addr_signal, rm_addr);
387 		goto out_unlock;
388 	}
389 	if (remaining < len)
390 		goto out_unlock;
391 
392 	*rm_list = msk->pm.rm_list_tx;
393 	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
394 	ret = true;
395 
396 out_unlock:
397 	spin_unlock_bh(&msk->pm.lock);
398 	return ret;
399 }
400 
/* Thin wrapper: forwards to mptcp_pm_nl_get_local_id() and returns its
 * result unchanged.
 */
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
	int local_id = mptcp_pm_nl_get_local_id(msk, skc);

	return local_id;
}
405 
406 void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
407 {
408 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
409 	u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp);
410 
411 	/* keep track of rtx periods with no progress */
412 	if (!subflow->stale_count) {
413 		subflow->stale_rcv_tstamp = rcv_tstamp;
414 		subflow->stale_count++;
415 	} else if (subflow->stale_rcv_tstamp == rcv_tstamp) {
416 		if (subflow->stale_count < U8_MAX)
417 			subflow->stale_count++;
418 		mptcp_pm_nl_subflow_chk_stale(msk, ssk);
419 	} else {
420 		subflow->stale_count = 0;
421 		mptcp_subflow_set_active(subflow);
422 	}
423 }
424 
425 /* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses,
426  * otherwise allow any matching local/remote pair
427  */
428 bool mptcp_pm_addr_families_match(const struct sock *sk,
429 				  const struct mptcp_addr_info *loc,
430 				  const struct mptcp_addr_info *rem)
431 {
432 	bool mptcp_is_v4 = sk->sk_family == AF_INET;
433 
434 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
435 	bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6);
436 	bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6);
437 
438 	if (mptcp_is_v4)
439 		return loc_is_v4 && rem_is_v4;
440 
441 	if (ipv6_only_sock(sk))
442 		return !loc_is_v4 && !rem_is_v4;
443 
444 	return loc_is_v4 == rem_is_v4;
445 #else
446 	return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET;
447 #endif
448 }
449 
450 void mptcp_pm_data_reset(struct mptcp_sock *msk)
451 {
452 	u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
453 	struct mptcp_pm_data *pm = &msk->pm;
454 
455 	pm->add_addr_signaled = 0;
456 	pm->add_addr_accepted = 0;
457 	pm->local_addr_used = 0;
458 	pm->subflows = 0;
459 	pm->rm_list_tx.nr = 0;
460 	pm->rm_list_rx.nr = 0;
461 	WRITE_ONCE(pm->pm_type, pm_type);
462 
463 	if (pm_type == MPTCP_PM_TYPE_KERNEL) {
464 		bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);
465 
466 		/* pm->work_pending must be only be set to 'true' when
467 		 * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
468 		 */
469 		WRITE_ONCE(pm->work_pending,
470 			   (!!mptcp_pm_get_local_addr_max(msk) &&
471 			    subflows_allowed) ||
472 			   !!mptcp_pm_get_add_addr_signal_max(msk));
473 		WRITE_ONCE(pm->accept_addr,
474 			   !!mptcp_pm_get_add_addr_accept_max(msk) &&
475 			   subflows_allowed);
476 		WRITE_ONCE(pm->accept_subflow, subflows_allowed);
477 	} else {
478 		WRITE_ONCE(pm->work_pending, 0);
479 		WRITE_ONCE(pm->accept_addr, 0);
480 		WRITE_ONCE(pm->accept_subflow, 0);
481 	}
482 
483 	WRITE_ONCE(pm->addr_signal, 0);
484 	WRITE_ONCE(pm->remote_deny_join_id0, false);
485 	pm->status = 0;
486 	bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
487 }
488 
489 void mptcp_pm_data_init(struct mptcp_sock *msk)
490 {
491 	spin_lock_init(&msk->pm.lock);
492 	INIT_LIST_HEAD(&msk->pm.anno_list);
493 	INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list);
494 	mptcp_pm_data_reset(msk);
495 }
496 
497 void __init mptcp_pm_init(void)
498 {
499 	mptcp_pm_nl_init();
500 }
501