xref: /openbmc/linux/net/rds/tcp_recv.c (revision 9a87ffc99ec8eb8d35eed7c4f816d75f5cc9662e)
170041088SAndy Grover /*
2eee2fa6aSKa-Cheong Poon  * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
370041088SAndy Grover  *
470041088SAndy Grover  * This software is available to you under a choice of one of two
570041088SAndy Grover  * licenses.  You may choose to be licensed under the terms of the GNU
670041088SAndy Grover  * General Public License (GPL) Version 2, available from the file
770041088SAndy Grover  * COPYING in the main directory of this source tree, or the
870041088SAndy Grover  * OpenIB.org BSD license below:
970041088SAndy Grover  *
1070041088SAndy Grover  *     Redistribution and use in source and binary forms, with or
1170041088SAndy Grover  *     without modification, are permitted provided that the following
1270041088SAndy Grover  *     conditions are met:
1370041088SAndy Grover  *
1470041088SAndy Grover  *      - Redistributions of source code must retain the above
1570041088SAndy Grover  *        copyright notice, this list of conditions and the following
1670041088SAndy Grover  *        disclaimer.
1770041088SAndy Grover  *
1870041088SAndy Grover  *      - Redistributions in binary form must reproduce the above
1970041088SAndy Grover  *        copyright notice, this list of conditions and the following
2070041088SAndy Grover  *        disclaimer in the documentation and/or other materials
2170041088SAndy Grover  *        provided with the distribution.
2270041088SAndy Grover  *
2370041088SAndy Grover  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
2470041088SAndy Grover  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
2570041088SAndy Grover  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
2670041088SAndy Grover  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
2770041088SAndy Grover  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
2870041088SAndy Grover  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
2970041088SAndy Grover  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3070041088SAndy Grover  * SOFTWARE.
3170041088SAndy Grover  *
3270041088SAndy Grover  */
3370041088SAndy Grover #include <linux/kernel.h>
345a0e3ad6STejun Heo #include <linux/slab.h>
3570041088SAndy Grover #include <net/tcp.h>
36*40e0b090SPeilin Ye #include <trace/events/sock.h>
3770041088SAndy Grover 
3870041088SAndy Grover #include "rds.h"
3970041088SAndy Grover #include "tcp.h"
4070041088SAndy Grover 
/*
 * Slab cache from which every struct rds_tcp_incoming in this file is
 * allocated and to which it is returned.  Created by rds_tcp_recv_init()
 * and destroyed by rds_tcp_recv_exit().
 */
4170041088SAndy Grover static struct kmem_cache *rds_tcp_incoming_slab;
4270041088SAndy Grover 
rds_tcp_inc_purge(struct rds_incoming * inc)43809fa148SAndy Grover static void rds_tcp_inc_purge(struct rds_incoming *inc)
4470041088SAndy Grover {
4570041088SAndy Grover 	struct rds_tcp_incoming *tinc;
4670041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
4770041088SAndy Grover 	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
4870041088SAndy Grover 	skb_queue_purge(&tinc->ti_skb_list);
4970041088SAndy Grover }
5070041088SAndy Grover 
rds_tcp_inc_free(struct rds_incoming * inc)5170041088SAndy Grover void rds_tcp_inc_free(struct rds_incoming *inc)
5270041088SAndy Grover {
5370041088SAndy Grover 	struct rds_tcp_incoming *tinc;
5470041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
5570041088SAndy Grover 	rds_tcp_inc_purge(inc);
5670041088SAndy Grover 	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
5770041088SAndy Grover 	kmem_cache_free(rds_tcp_incoming_slab, tinc);
5870041088SAndy Grover }
5970041088SAndy Grover 
6070041088SAndy Grover /*
6170041088SAndy Grover  * this is pretty lame, but, whatever.
6270041088SAndy Grover  */
rds_tcp_inc_copy_to_user(struct rds_incoming * inc,struct iov_iter * to)63c310e72cSAl Viro int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
6470041088SAndy Grover {
6570041088SAndy Grover 	struct rds_tcp_incoming *tinc;
6670041088SAndy Grover 	struct sk_buff *skb;
6770041088SAndy Grover 	int ret = 0;
6870041088SAndy Grover 
69c310e72cSAl Viro 	if (!iov_iter_count(to))
7070041088SAndy Grover 		goto out;
7170041088SAndy Grover 
7270041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
7370041088SAndy Grover 
7470041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
75c310e72cSAl Viro 		unsigned long to_copy, skb_off;
76c310e72cSAl Viro 		for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) {
77c310e72cSAl Viro 			to_copy = iov_iter_count(to);
7870041088SAndy Grover 			to_copy = min(to_copy, skb->len - skb_off);
7970041088SAndy Grover 
80c310e72cSAl Viro 			if (skb_copy_datagram_iter(skb, skb_off, to, to_copy))
81c310e72cSAl Viro 				return -EFAULT;
8270041088SAndy Grover 
83b075cfdbSAndy Grover 			rds_stats_add(s_copy_to_user, to_copy);
8470041088SAndy Grover 			ret += to_copy;
85c310e72cSAl Viro 
86c310e72cSAl Viro 			if (!iov_iter_count(to))
8770041088SAndy Grover 				goto out;
8870041088SAndy Grover 		}
8970041088SAndy Grover 	}
9070041088SAndy Grover out:
9170041088SAndy Grover 	return ret;
9270041088SAndy Grover }
9370041088SAndy Grover 
9470041088SAndy Grover /*
9570041088SAndy Grover  * We have a series of skbs that have fragmented pieces of the congestion
9670041088SAndy Grover  * bitmap.  They must add up to the exact size of the congestion bitmap.  We
9770041088SAndy Grover  * use the skb helpers to copy those into the pages that make up the in-memory
9870041088SAndy Grover  * congestion bitmap for the remote address of this connection.  We then tell
9970041088SAndy Grover  * the congestion core that the bitmap has been changed so that it can wake up
10070041088SAndy Grover  * sleepers.
10170041088SAndy Grover  *
10270041088SAndy Grover  * This is racing with sending paths which are using test_bit to see if the
10370041088SAndy Grover  * bitmap indicates that their recipient is congested.
10470041088SAndy Grover  */
10570041088SAndy Grover 
rds_tcp_cong_recv(struct rds_connection * conn,struct rds_tcp_incoming * tinc)10670041088SAndy Grover static void rds_tcp_cong_recv(struct rds_connection *conn,
10770041088SAndy Grover 			      struct rds_tcp_incoming *tinc)
10870041088SAndy Grover {
10970041088SAndy Grover 	struct sk_buff *skb;
11070041088SAndy Grover 	unsigned int to_copy, skb_off;
11170041088SAndy Grover 	unsigned int map_off;
11270041088SAndy Grover 	unsigned int map_page;
11370041088SAndy Grover 	struct rds_cong_map *map;
11470041088SAndy Grover 	int ret;
11570041088SAndy Grover 
11670041088SAndy Grover 	/* catch completely corrupt packets */
11770041088SAndy Grover 	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
11870041088SAndy Grover 		return;
11970041088SAndy Grover 
12070041088SAndy Grover 	map_page = 0;
12170041088SAndy Grover 	map_off = 0;
12270041088SAndy Grover 	map = conn->c_fcong;
12370041088SAndy Grover 
12470041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
12570041088SAndy Grover 		skb_off = 0;
12670041088SAndy Grover 		while (skb_off < skb->len) {
12770041088SAndy Grover 			to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
12870041088SAndy Grover 					skb->len - skb_off);
12970041088SAndy Grover 
13070041088SAndy Grover 			BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
13170041088SAndy Grover 
13270041088SAndy Grover 			/* only returns 0 or -error */
13370041088SAndy Grover 			ret = skb_copy_bits(skb, skb_off,
13470041088SAndy Grover 				(void *)map->m_page_addrs[map_page] + map_off,
13570041088SAndy Grover 				to_copy);
13670041088SAndy Grover 			BUG_ON(ret != 0);
13770041088SAndy Grover 
13870041088SAndy Grover 			skb_off += to_copy;
13970041088SAndy Grover 			map_off += to_copy;
14070041088SAndy Grover 			if (map_off == PAGE_SIZE) {
14170041088SAndy Grover 				map_off = 0;
14270041088SAndy Grover 				map_page++;
14370041088SAndy Grover 			}
14470041088SAndy Grover 		}
14570041088SAndy Grover 	}
14670041088SAndy Grover 
14770041088SAndy Grover 	rds_cong_map_updated(map, ~(u64) 0);
14870041088SAndy Grover }
14970041088SAndy Grover 
/*
 * Argument bundle that rds_tcp_read_sock() hangs off read_descriptor_t's
 * arg.data so that rds_tcp_data_recv() can recover its context.
 */
15070041088SAndy Grover struct rds_tcp_desc_arg {
	/* connection path whose socket is being read */
1512da43c4aSSowmini Varadhan 	struct rds_conn_path *conn_path;
	/* allocation flags to use for anything allocated while receiving */
15270041088SAndy Grover 	gfp_t gfp;
15370041088SAndy Grover };
15470041088SAndy Grover 
/*
 * Actor passed to tcp_read_sock(): consume @len bytes of @skb starting at
 * @offset and reassemble them into RDS messages.
 *
 * Reassembly state lives in the rds_tcp_connection (t_tinc, t_tinc_hdr_rem,
 * t_tinc_data_rem), so a message may span several invocations.  For each
 * message the header bytes are copied into the incoming's i_hdr, the payload
 * is attached as skbs extracted from @skb, and once both remainders reach
 * zero the message is either applied as a congestion bitmap or passed to
 * rds_recv_incoming().
 *
 * Returns the number of bytes consumed.  If an allocation fails,
 * desc->error is set to -ENOMEM and the bytes consumed so far are returned.
 */
static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
			     unsigned int offset, size_t len)
{
	struct rds_tcp_desc_arg *arg = desc->arg.data;
	struct rds_conn_path *cp = arg->conn_path;
	struct rds_tcp_connection *tc = cp->cp_transport_data;
	struct rds_tcp_incoming *tinc = tc->t_tinc;
	size_t remaining = len;

	rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
		 len);

	/*
	 * tcp_read_sock() interprets partial progress as an indication to stop
	 * processing.
	 */
	while (remaining) {
		struct rds_connection *conn;
		struct sk_buff *frag;
		size_t chunk;

		/* no message in flight: start a fresh incoming */
		if (!tinc) {
			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
						arg->gfp);
			if (!tinc) {
				desc->error = -ENOMEM;
				goto out;
			}
			tc->t_tinc = tinc;
			rdsdebug("allocated tinc %p\n", tinc);
			rds_inc_path_init(&tinc->ti_inc, cp,
					  &cp->cp_conn->c_faddr);
			tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
					local_clock();

			/*
			 * XXX * we might be able to use the __ variants when
			 * we've already serialized at a higher level.
			 */
			skb_queue_head_init(&tinc->ti_skb_list);
		}

		/* header bytes still outstanding for this message */
		if (tc->t_tinc_hdr_rem && remaining) {
			/* resume right after the header bytes already copied */
			char *hdr_dst = (char *)&tinc->ti_inc.i_hdr +
					sizeof(struct rds_header) -
					tc->t_tinc_hdr_rem;

			chunk = min(tc->t_tinc_hdr_rem, remaining);
			rdsdebug("copying %zu header from skb %p\n", chunk,
				 skb);
			skb_copy_bits(skb, offset, hdr_dst, chunk);
			tc->t_tinc_hdr_rem -= chunk;
			remaining -= chunk;
			offset += chunk;

			if (!tc->t_tinc_hdr_rem) {
				/* could be 0 for a 0 len message */
				tc->t_tinc_data_rem =
					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
				tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
					local_clock();
			}
		}

		/* payload bytes still outstanding for this message */
		if (tc->t_tinc_data_rem && remaining) {
			chunk = min(tc->t_tinc_data_rem, remaining);

			frag = pskb_extract(skb, offset, chunk, arg->gfp);
			if (!frag) {
				desc->error = -ENOMEM;
				goto out;
			}

			skb_queue_tail(&tinc->ti_skb_list, frag);

			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
				 "clone %p data %p len %d\n",
				 skb, skb->data, skb->len, offset, chunk,
				 frag, frag->data, frag->len);

			tc->t_tinc_data_rem -= chunk;
			remaining -= chunk;
			offset += chunk;
		}

		/* message still incomplete: go around for more input */
		if (tc->t_tinc_hdr_rem || tc->t_tinc_data_rem)
			continue;

		/* header and payload are both complete: deliver the message */
		conn = cp->cp_conn;

		if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
			rds_tcp_cong_recv(conn, tinc);
		else
			rds_recv_incoming(conn, &conn->c_faddr,
					  &conn->c_laddr,
					  &tinc->ti_inc,
					  arg->gfp);

		/* reset the reassembly state and drop our reference */
		tc->t_tinc_hdr_rem = sizeof(struct rds_header);
		tc->t_tinc_data_rem = 0;
		tc->t_tinc = NULL;
		rds_inc_put(&tinc->ti_inc);
		tinc = NULL;
	}
out:
	rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
		 len, remaining, skb->len,
		 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
	return len - remaining;
}
26170041088SAndy Grover 
26270041088SAndy Grover /* the caller has to hold the sock lock */
rds_tcp_read_sock(struct rds_conn_path * cp,gfp_t gfp)2632da43c4aSSowmini Varadhan static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp)
26470041088SAndy Grover {
2652da43c4aSSowmini Varadhan 	struct rds_tcp_connection *tc = cp->cp_transport_data;
26670041088SAndy Grover 	struct socket *sock = tc->t_sock;
26770041088SAndy Grover 	read_descriptor_t desc;
26870041088SAndy Grover 	struct rds_tcp_desc_arg arg;
26970041088SAndy Grover 
27070041088SAndy Grover 	/* It's like glib in the kernel! */
2712da43c4aSSowmini Varadhan 	arg.conn_path = cp;
27270041088SAndy Grover 	arg.gfp = gfp;
27370041088SAndy Grover 	desc.arg.data = &arg;
27470041088SAndy Grover 	desc.error = 0;
27570041088SAndy Grover 	desc.count = 1; /* give more than one skb per call */
27670041088SAndy Grover 
27770041088SAndy Grover 	tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
27870041088SAndy Grover 	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
27970041088SAndy Grover 		 desc.error);
28070041088SAndy Grover 
28170041088SAndy Grover 	return desc.error;
28270041088SAndy Grover }
28370041088SAndy Grover 
28470041088SAndy Grover /*
28570041088SAndy Grover  * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
28670041088SAndy Grover  * data_ready.
28770041088SAndy Grover  *
28870041088SAndy Grover  * if we fail to allocate we're in trouble.. blindly wait some time before
28970041088SAndy Grover  * trying again to see if the VM can free up something for us.
29070041088SAndy Grover  */
rds_tcp_recv_path(struct rds_conn_path * cp)2912da43c4aSSowmini Varadhan int rds_tcp_recv_path(struct rds_conn_path *cp)
29270041088SAndy Grover {
2932da43c4aSSowmini Varadhan 	struct rds_tcp_connection *tc = cp->cp_transport_data;
29470041088SAndy Grover 	struct socket *sock = tc->t_sock;
29570041088SAndy Grover 	int ret = 0;
29670041088SAndy Grover 
2972da43c4aSSowmini Varadhan 	rdsdebug("recv worker path [%d] tc %p sock %p\n",
2982da43c4aSSowmini Varadhan 		 cp->cp_index, tc, sock);
29970041088SAndy Grover 
30070041088SAndy Grover 	lock_sock(sock->sk);
3012da43c4aSSowmini Varadhan 	ret = rds_tcp_read_sock(cp, GFP_KERNEL);
30270041088SAndy Grover 	release_sock(sock->sk);
30370041088SAndy Grover 
30470041088SAndy Grover 	return ret;
30570041088SAndy Grover }
30670041088SAndy Grover 
rds_tcp_data_ready(struct sock * sk)307676d2369SDavid S. Miller void rds_tcp_data_ready(struct sock *sk)
30870041088SAndy Grover {
309676d2369SDavid S. Miller 	void (*ready)(struct sock *sk);
310ea3b1ea5SSowmini Varadhan 	struct rds_conn_path *cp;
31170041088SAndy Grover 	struct rds_tcp_connection *tc;
31270041088SAndy Grover 
313*40e0b090SPeilin Ye 	trace_sk_data_ready(sk);
314676d2369SDavid S. Miller 	rdsdebug("data ready sk %p\n", sk);
31570041088SAndy Grover 
31638036629SEric Dumazet 	read_lock_bh(&sk->sk_callback_lock);
317ea3b1ea5SSowmini Varadhan 	cp = sk->sk_user_data;
318ea3b1ea5SSowmini Varadhan 	if (!cp) { /* check for teardown race */
31970041088SAndy Grover 		ready = sk->sk_data_ready;
32070041088SAndy Grover 		goto out;
32170041088SAndy Grover 	}
32270041088SAndy Grover 
323ea3b1ea5SSowmini Varadhan 	tc = cp->cp_transport_data;
32470041088SAndy Grover 	ready = tc->t_orig_data_ready;
32570041088SAndy Grover 	rds_tcp_stats_inc(s_tcp_data_ready_calls);
32670041088SAndy Grover 
3273db6e0d1SSowmini Varadhan 	if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
3283db6e0d1SSowmini Varadhan 		rcu_read_lock();
329ebeeb1adSSowmini Varadhan 		if (!rds_destroy_pending(cp->cp_conn))
330ea3b1ea5SSowmini Varadhan 			queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
3313db6e0d1SSowmini Varadhan 		rcu_read_unlock();
3323db6e0d1SSowmini Varadhan 	}
33370041088SAndy Grover out:
33438036629SEric Dumazet 	read_unlock_bh(&sk->sk_callback_lock);
335676d2369SDavid S. Miller 	ready(sk);
33670041088SAndy Grover }
33770041088SAndy Grover 
rds_tcp_recv_init(void)338ef87b7eaSZach Brown int rds_tcp_recv_init(void)
33970041088SAndy Grover {
34070041088SAndy Grover 	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
34170041088SAndy Grover 					sizeof(struct rds_tcp_incoming),
34270041088SAndy Grover 					0, 0, NULL);
3438690bfa1SAndy Grover 	if (!rds_tcp_incoming_slab)
34470041088SAndy Grover 		return -ENOMEM;
34570041088SAndy Grover 	return 0;
34670041088SAndy Grover }
34770041088SAndy Grover 
rds_tcp_recv_exit(void)34870041088SAndy Grover void rds_tcp_recv_exit(void)
34970041088SAndy Grover {
35070041088SAndy Grover 	kmem_cache_destroy(rds_tcp_incoming_slab);
35170041088SAndy Grover }
352