xref: /openbmc/linux/net/rds/tcp_recv.c (revision ebeeb1ad9b8adcc37c2ec21a96f39e9d35199b46)
170041088SAndy Grover /*
270041088SAndy Grover  * Copyright (c) 2006 Oracle.  All rights reserved.
370041088SAndy Grover  *
470041088SAndy Grover  * This software is available to you under a choice of one of two
570041088SAndy Grover  * licenses.  You may choose to be licensed under the terms of the GNU
670041088SAndy Grover  * General Public License (GPL) Version 2, available from the file
770041088SAndy Grover  * COPYING in the main directory of this source tree, or the
870041088SAndy Grover  * OpenIB.org BSD license below:
970041088SAndy Grover  *
1070041088SAndy Grover  *     Redistribution and use in source and binary forms, with or
1170041088SAndy Grover  *     without modification, are permitted provided that the following
1270041088SAndy Grover  *     conditions are met:
1370041088SAndy Grover  *
1470041088SAndy Grover  *      - Redistributions of source code must retain the above
1570041088SAndy Grover  *        copyright notice, this list of conditions and the following
1670041088SAndy Grover  *        disclaimer.
1770041088SAndy Grover  *
1870041088SAndy Grover  *      - Redistributions in binary form must reproduce the above
1970041088SAndy Grover  *        copyright notice, this list of conditions and the following
2070041088SAndy Grover  *        disclaimer in the documentation and/or other materials
2170041088SAndy Grover  *        provided with the distribution.
2270041088SAndy Grover  *
2370041088SAndy Grover  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
2470041088SAndy Grover  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
2570041088SAndy Grover  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
2670041088SAndy Grover  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
2770041088SAndy Grover  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
2870041088SAndy Grover  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
2970041088SAndy Grover  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3070041088SAndy Grover  * SOFTWARE.
3170041088SAndy Grover  *
3270041088SAndy Grover  */
3370041088SAndy Grover #include <linux/kernel.h>
345a0e3ad6STejun Heo #include <linux/slab.h>
3570041088SAndy Grover #include <net/tcp.h>
3670041088SAndy Grover 
3770041088SAndy Grover #include "rds.h"
3870041088SAndy Grover #include "tcp.h"
3970041088SAndy Grover 
/* Slab cache backing every struct rds_tcp_incoming allocated in this file. */
static struct kmem_cache *rds_tcp_incoming_slab;
4170041088SAndy Grover 
42809fa148SAndy Grover static void rds_tcp_inc_purge(struct rds_incoming *inc)
4370041088SAndy Grover {
4470041088SAndy Grover 	struct rds_tcp_incoming *tinc;
4570041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
4670041088SAndy Grover 	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
4770041088SAndy Grover 	skb_queue_purge(&tinc->ti_skb_list);
4870041088SAndy Grover }
4970041088SAndy Grover 
5070041088SAndy Grover void rds_tcp_inc_free(struct rds_incoming *inc)
5170041088SAndy Grover {
5270041088SAndy Grover 	struct rds_tcp_incoming *tinc;
5370041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
5470041088SAndy Grover 	rds_tcp_inc_purge(inc);
5570041088SAndy Grover 	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
5670041088SAndy Grover 	kmem_cache_free(rds_tcp_incoming_slab, tinc);
5770041088SAndy Grover }
5870041088SAndy Grover 
5970041088SAndy Grover /*
6070041088SAndy Grover  * this is pretty lame, but, whatever.
6170041088SAndy Grover  */
62c310e72cSAl Viro int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
6370041088SAndy Grover {
6470041088SAndy Grover 	struct rds_tcp_incoming *tinc;
6570041088SAndy Grover 	struct sk_buff *skb;
6670041088SAndy Grover 	int ret = 0;
6770041088SAndy Grover 
68c310e72cSAl Viro 	if (!iov_iter_count(to))
6970041088SAndy Grover 		goto out;
7070041088SAndy Grover 
7170041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
7270041088SAndy Grover 
7370041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
74c310e72cSAl Viro 		unsigned long to_copy, skb_off;
75c310e72cSAl Viro 		for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) {
76c310e72cSAl Viro 			to_copy = iov_iter_count(to);
7770041088SAndy Grover 			to_copy = min(to_copy, skb->len - skb_off);
7870041088SAndy Grover 
79c310e72cSAl Viro 			if (skb_copy_datagram_iter(skb, skb_off, to, to_copy))
80c310e72cSAl Viro 				return -EFAULT;
8170041088SAndy Grover 
82b075cfdbSAndy Grover 			rds_stats_add(s_copy_to_user, to_copy);
8370041088SAndy Grover 			ret += to_copy;
84c310e72cSAl Viro 
85c310e72cSAl Viro 			if (!iov_iter_count(to))
8670041088SAndy Grover 				goto out;
8770041088SAndy Grover 		}
8870041088SAndy Grover 	}
8970041088SAndy Grover out:
9070041088SAndy Grover 	return ret;
9170041088SAndy Grover }
9270041088SAndy Grover 
9370041088SAndy Grover /*
9470041088SAndy Grover  * We have a series of skbs that have fragmented pieces of the congestion
9570041088SAndy Grover  * bitmap.  They must add up to the exact size of the congestion bitmap.  We
9670041088SAndy Grover  * use the skb helpers to copy those into the pages that make up the in-memory
9770041088SAndy Grover  * congestion bitmap for the remote address of this connection.  We then tell
9870041088SAndy Grover  * the congestion core that the bitmap has been changed so that it can wake up
9970041088SAndy Grover  * sleepers.
10070041088SAndy Grover  *
10170041088SAndy Grover  * This is racing with sending paths which are using test_bit to see if the
10270041088SAndy Grover  * bitmap indicates that their recipient is congested.
10370041088SAndy Grover  */
10470041088SAndy Grover 
10570041088SAndy Grover static void rds_tcp_cong_recv(struct rds_connection *conn,
10670041088SAndy Grover 			      struct rds_tcp_incoming *tinc)
10770041088SAndy Grover {
10870041088SAndy Grover 	struct sk_buff *skb;
10970041088SAndy Grover 	unsigned int to_copy, skb_off;
11070041088SAndy Grover 	unsigned int map_off;
11170041088SAndy Grover 	unsigned int map_page;
11270041088SAndy Grover 	struct rds_cong_map *map;
11370041088SAndy Grover 	int ret;
11470041088SAndy Grover 
11570041088SAndy Grover 	/* catch completely corrupt packets */
11670041088SAndy Grover 	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
11770041088SAndy Grover 		return;
11870041088SAndy Grover 
11970041088SAndy Grover 	map_page = 0;
12070041088SAndy Grover 	map_off = 0;
12170041088SAndy Grover 	map = conn->c_fcong;
12270041088SAndy Grover 
12370041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
12470041088SAndy Grover 		skb_off = 0;
12570041088SAndy Grover 		while (skb_off < skb->len) {
12670041088SAndy Grover 			to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
12770041088SAndy Grover 					skb->len - skb_off);
12870041088SAndy Grover 
12970041088SAndy Grover 			BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
13070041088SAndy Grover 
13170041088SAndy Grover 			/* only returns 0 or -error */
13270041088SAndy Grover 			ret = skb_copy_bits(skb, skb_off,
13370041088SAndy Grover 				(void *)map->m_page_addrs[map_page] + map_off,
13470041088SAndy Grover 				to_copy);
13570041088SAndy Grover 			BUG_ON(ret != 0);
13670041088SAndy Grover 
13770041088SAndy Grover 			skb_off += to_copy;
13870041088SAndy Grover 			map_off += to_copy;
13970041088SAndy Grover 			if (map_off == PAGE_SIZE) {
14070041088SAndy Grover 				map_off = 0;
14170041088SAndy Grover 				map_page++;
14270041088SAndy Grover 			}
14370041088SAndy Grover 		}
14470041088SAndy Grover 	}
14570041088SAndy Grover 
14670041088SAndy Grover 	rds_cong_map_updated(map, ~(u64) 0);
14770041088SAndy Grover }
14870041088SAndy Grover 
14970041088SAndy Grover struct rds_tcp_desc_arg {
1502da43c4aSSowmini Varadhan 	struct rds_conn_path *conn_path;
15170041088SAndy Grover 	gfp_t gfp;
15270041088SAndy Grover };
15370041088SAndy Grover 
15470041088SAndy Grover static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
15570041088SAndy Grover 			     unsigned int offset, size_t len)
15670041088SAndy Grover {
15770041088SAndy Grover 	struct rds_tcp_desc_arg *arg = desc->arg.data;
1582da43c4aSSowmini Varadhan 	struct rds_conn_path *cp = arg->conn_path;
1592da43c4aSSowmini Varadhan 	struct rds_tcp_connection *tc = cp->cp_transport_data;
16070041088SAndy Grover 	struct rds_tcp_incoming *tinc = tc->t_tinc;
16170041088SAndy Grover 	struct sk_buff *clone;
16270041088SAndy Grover 	size_t left = len, to_copy;
16370041088SAndy Grover 
16470041088SAndy Grover 	rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
16570041088SAndy Grover 		 len);
16670041088SAndy Grover 
16770041088SAndy Grover 	/*
16870041088SAndy Grover 	 * tcp_read_sock() interprets partial progress as an indication to stop
16970041088SAndy Grover 	 * processing.
17070041088SAndy Grover 	 */
17170041088SAndy Grover 	while (left) {
1728690bfa1SAndy Grover 		if (!tinc) {
17370041088SAndy Grover 			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
17470041088SAndy Grover 						arg->gfp);
1758690bfa1SAndy Grover 			if (!tinc) {
17670041088SAndy Grover 				desc->error = -ENOMEM;
17770041088SAndy Grover 				goto out;
17870041088SAndy Grover 			}
17970041088SAndy Grover 			tc->t_tinc = tinc;
18070041088SAndy Grover 			rdsdebug("alloced tinc %p\n", tinc);
1812da43c4aSSowmini Varadhan 			rds_inc_path_init(&tinc->ti_inc, cp,
1822da43c4aSSowmini Varadhan 					  cp->cp_conn->c_faddr);
1833289025aSSantosh Shilimkar 			tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
1843289025aSSantosh Shilimkar 					local_clock();
1853289025aSSantosh Shilimkar 
18670041088SAndy Grover 			/*
18770041088SAndy Grover 			 * XXX * we might be able to use the __ variants when
18870041088SAndy Grover 			 * we've already serialized at a higher level.
18970041088SAndy Grover 			 */
19070041088SAndy Grover 			skb_queue_head_init(&tinc->ti_skb_list);
19170041088SAndy Grover 		}
19270041088SAndy Grover 
19370041088SAndy Grover 		if (left && tc->t_tinc_hdr_rem) {
19470041088SAndy Grover 			to_copy = min(tc->t_tinc_hdr_rem, left);
19570041088SAndy Grover 			rdsdebug("copying %zu header from skb %p\n", to_copy,
19670041088SAndy Grover 				 skb);
19770041088SAndy Grover 			skb_copy_bits(skb, offset,
19870041088SAndy Grover 				      (char *)&tinc->ti_inc.i_hdr +
19970041088SAndy Grover 						sizeof(struct rds_header) -
20070041088SAndy Grover 						tc->t_tinc_hdr_rem,
20170041088SAndy Grover 				      to_copy);
20270041088SAndy Grover 			tc->t_tinc_hdr_rem -= to_copy;
20370041088SAndy Grover 			left -= to_copy;
20470041088SAndy Grover 			offset += to_copy;
20570041088SAndy Grover 
20670041088SAndy Grover 			if (tc->t_tinc_hdr_rem == 0) {
20770041088SAndy Grover 				/* could be 0 for a 0 len message */
20870041088SAndy Grover 				tc->t_tinc_data_rem =
20970041088SAndy Grover 					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
2103289025aSSantosh Shilimkar 				tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
2113289025aSSantosh Shilimkar 					local_clock();
21270041088SAndy Grover 			}
21370041088SAndy Grover 		}
21470041088SAndy Grover 
21570041088SAndy Grover 		if (left && tc->t_tinc_data_rem) {
216947d2756SSowmini Varadhan 			to_copy = min(tc->t_tinc_data_rem, left);
217947d2756SSowmini Varadhan 
218947d2756SSowmini Varadhan 			clone = pskb_extract(skb, offset, to_copy, arg->gfp);
2198690bfa1SAndy Grover 			if (!clone) {
22070041088SAndy Grover 				desc->error = -ENOMEM;
22170041088SAndy Grover 				goto out;
22270041088SAndy Grover 			}
22370041088SAndy Grover 
22470041088SAndy Grover 			skb_queue_tail(&tinc->ti_skb_list, clone);
22570041088SAndy Grover 
22670041088SAndy Grover 			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
22770041088SAndy Grover 				 "clone %p data %p len %d\n",
22870041088SAndy Grover 				 skb, skb->data, skb->len, offset, to_copy,
22970041088SAndy Grover 				 clone, clone->data, clone->len);
23070041088SAndy Grover 
23170041088SAndy Grover 			tc->t_tinc_data_rem -= to_copy;
23270041088SAndy Grover 			left -= to_copy;
23370041088SAndy Grover 			offset += to_copy;
23470041088SAndy Grover 		}
23570041088SAndy Grover 
23670041088SAndy Grover 		if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
2372da43c4aSSowmini Varadhan 			struct rds_connection *conn = cp->cp_conn;
2382da43c4aSSowmini Varadhan 
23970041088SAndy Grover 			if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
24070041088SAndy Grover 				rds_tcp_cong_recv(conn, tinc);
24170041088SAndy Grover 			else
24270041088SAndy Grover 				rds_recv_incoming(conn, conn->c_faddr,
24370041088SAndy Grover 						  conn->c_laddr, &tinc->ti_inc,
2446114eab5SCong Wang 						  arg->gfp);
24570041088SAndy Grover 
24670041088SAndy Grover 			tc->t_tinc_hdr_rem = sizeof(struct rds_header);
24770041088SAndy Grover 			tc->t_tinc_data_rem = 0;
24870041088SAndy Grover 			tc->t_tinc = NULL;
24970041088SAndy Grover 			rds_inc_put(&tinc->ti_inc);
25070041088SAndy Grover 			tinc = NULL;
25170041088SAndy Grover 		}
25270041088SAndy Grover 	}
25370041088SAndy Grover out:
25470041088SAndy Grover 	rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
25570041088SAndy Grover 		 len, left, skb->len,
25670041088SAndy Grover 		 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
25770041088SAndy Grover 	return len - left;
25870041088SAndy Grover }
25970041088SAndy Grover 
26070041088SAndy Grover /* the caller has to hold the sock lock */
2612da43c4aSSowmini Varadhan static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp)
26270041088SAndy Grover {
2632da43c4aSSowmini Varadhan 	struct rds_tcp_connection *tc = cp->cp_transport_data;
26470041088SAndy Grover 	struct socket *sock = tc->t_sock;
26570041088SAndy Grover 	read_descriptor_t desc;
26670041088SAndy Grover 	struct rds_tcp_desc_arg arg;
26770041088SAndy Grover 
26870041088SAndy Grover 	/* It's like glib in the kernel! */
2692da43c4aSSowmini Varadhan 	arg.conn_path = cp;
27070041088SAndy Grover 	arg.gfp = gfp;
27170041088SAndy Grover 	desc.arg.data = &arg;
27270041088SAndy Grover 	desc.error = 0;
27370041088SAndy Grover 	desc.count = 1; /* give more than one skb per call */
27470041088SAndy Grover 
27570041088SAndy Grover 	tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
27670041088SAndy Grover 	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
27770041088SAndy Grover 		 desc.error);
27870041088SAndy Grover 
27970041088SAndy Grover 	return desc.error;
28070041088SAndy Grover }
28170041088SAndy Grover 
28270041088SAndy Grover /*
28370041088SAndy Grover  * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
28470041088SAndy Grover  * data_ready.
28570041088SAndy Grover  *
28670041088SAndy Grover  * if we fail to allocate we're in trouble.. blindly wait some time before
28770041088SAndy Grover  * trying again to see if the VM can free up something for us.
28870041088SAndy Grover  */
2892da43c4aSSowmini Varadhan int rds_tcp_recv_path(struct rds_conn_path *cp)
29070041088SAndy Grover {
2912da43c4aSSowmini Varadhan 	struct rds_tcp_connection *tc = cp->cp_transport_data;
29270041088SAndy Grover 	struct socket *sock = tc->t_sock;
29370041088SAndy Grover 	int ret = 0;
29470041088SAndy Grover 
2952da43c4aSSowmini Varadhan 	rdsdebug("recv worker path [%d] tc %p sock %p\n",
2962da43c4aSSowmini Varadhan 		 cp->cp_index, tc, sock);
29770041088SAndy Grover 
29870041088SAndy Grover 	lock_sock(sock->sk);
2992da43c4aSSowmini Varadhan 	ret = rds_tcp_read_sock(cp, GFP_KERNEL);
30070041088SAndy Grover 	release_sock(sock->sk);
30170041088SAndy Grover 
30270041088SAndy Grover 	return ret;
30370041088SAndy Grover }
30470041088SAndy Grover 
305676d2369SDavid S. Miller void rds_tcp_data_ready(struct sock *sk)
30670041088SAndy Grover {
307676d2369SDavid S. Miller 	void (*ready)(struct sock *sk);
308ea3b1ea5SSowmini Varadhan 	struct rds_conn_path *cp;
30970041088SAndy Grover 	struct rds_tcp_connection *tc;
31070041088SAndy Grover 
311676d2369SDavid S. Miller 	rdsdebug("data ready sk %p\n", sk);
31270041088SAndy Grover 
31338036629SEric Dumazet 	read_lock_bh(&sk->sk_callback_lock);
314ea3b1ea5SSowmini Varadhan 	cp = sk->sk_user_data;
315ea3b1ea5SSowmini Varadhan 	if (!cp) { /* check for teardown race */
31670041088SAndy Grover 		ready = sk->sk_data_ready;
31770041088SAndy Grover 		goto out;
31870041088SAndy Grover 	}
31970041088SAndy Grover 
320ea3b1ea5SSowmini Varadhan 	tc = cp->cp_transport_data;
32170041088SAndy Grover 	ready = tc->t_orig_data_ready;
32270041088SAndy Grover 	rds_tcp_stats_inc(s_tcp_data_ready_calls);
32370041088SAndy Grover 
3243db6e0d1SSowmini Varadhan 	if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
3253db6e0d1SSowmini Varadhan 		rcu_read_lock();
326*ebeeb1adSSowmini Varadhan 		if (!rds_destroy_pending(cp->cp_conn))
327ea3b1ea5SSowmini Varadhan 			queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
3283db6e0d1SSowmini Varadhan 		rcu_read_unlock();
3293db6e0d1SSowmini Varadhan 	}
33070041088SAndy Grover out:
33138036629SEric Dumazet 	read_unlock_bh(&sk->sk_callback_lock);
332676d2369SDavid S. Miller 	ready(sk);
33370041088SAndy Grover }
33470041088SAndy Grover 
335ef87b7eaSZach Brown int rds_tcp_recv_init(void)
33670041088SAndy Grover {
33770041088SAndy Grover 	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
33870041088SAndy Grover 					sizeof(struct rds_tcp_incoming),
33970041088SAndy Grover 					0, 0, NULL);
3408690bfa1SAndy Grover 	if (!rds_tcp_incoming_slab)
34170041088SAndy Grover 		return -ENOMEM;
34270041088SAndy Grover 	return 0;
34370041088SAndy Grover }
34470041088SAndy Grover 
34570041088SAndy Grover void rds_tcp_recv_exit(void)
34670041088SAndy Grover {
34770041088SAndy Grover 	kmem_cache_destroy(rds_tcp_incoming_slab);
34870041088SAndy Grover }
349