xref: /openbmc/linux/net/rds/tcp_recv.c (revision 38036629cded6b96a9f9689758a88d067c4d4d44)
170041088SAndy Grover /*
270041088SAndy Grover  * Copyright (c) 2006 Oracle.  All rights reserved.
370041088SAndy Grover  *
470041088SAndy Grover  * This software is available to you under a choice of one of two
570041088SAndy Grover  * licenses.  You may choose to be licensed under the terms of the GNU
670041088SAndy Grover  * General Public License (GPL) Version 2, available from the file
770041088SAndy Grover  * COPYING in the main directory of this source tree, or the
870041088SAndy Grover  * OpenIB.org BSD license below:
970041088SAndy Grover  *
1070041088SAndy Grover  *     Redistribution and use in source and binary forms, with or
1170041088SAndy Grover  *     without modification, are permitted provided that the following
1270041088SAndy Grover  *     conditions are met:
1370041088SAndy Grover  *
1470041088SAndy Grover  *      - Redistributions of source code must retain the above
1570041088SAndy Grover  *        copyright notice, this list of conditions and the following
1670041088SAndy Grover  *        disclaimer.
1770041088SAndy Grover  *
1870041088SAndy Grover  *      - Redistributions in binary form must reproduce the above
1970041088SAndy Grover  *        copyright notice, this list of conditions and the following
2070041088SAndy Grover  *        disclaimer in the documentation and/or other materials
2170041088SAndy Grover  *        provided with the distribution.
2270041088SAndy Grover  *
2370041088SAndy Grover  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
2470041088SAndy Grover  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
2570041088SAndy Grover  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
2670041088SAndy Grover  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
2770041088SAndy Grover  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
2870041088SAndy Grover  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
2970041088SAndy Grover  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3070041088SAndy Grover  * SOFTWARE.
3170041088SAndy Grover  *
3270041088SAndy Grover  */
3370041088SAndy Grover #include <linux/kernel.h>
345a0e3ad6STejun Heo #include <linux/slab.h>
3570041088SAndy Grover #include <net/tcp.h>
3670041088SAndy Grover 
3770041088SAndy Grover #include "rds.h"
3870041088SAndy Grover #include "tcp.h"
3970041088SAndy Grover 
/*
 * Slab cache that backs every struct rds_tcp_incoming: created in
 * rds_tcp_recv_init(), destroyed in rds_tcp_recv_exit().
 */
static struct kmem_cache *rds_tcp_incoming_slab;
4170041088SAndy Grover 
42809fa148SAndy Grover static void rds_tcp_inc_purge(struct rds_incoming *inc)
4370041088SAndy Grover {
4470041088SAndy Grover 	struct rds_tcp_incoming *tinc;
4570041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
4670041088SAndy Grover 	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
4770041088SAndy Grover 	skb_queue_purge(&tinc->ti_skb_list);
4870041088SAndy Grover }
4970041088SAndy Grover 
5070041088SAndy Grover void rds_tcp_inc_free(struct rds_incoming *inc)
5170041088SAndy Grover {
5270041088SAndy Grover 	struct rds_tcp_incoming *tinc;
5370041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
5470041088SAndy Grover 	rds_tcp_inc_purge(inc);
5570041088SAndy Grover 	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
5670041088SAndy Grover 	kmem_cache_free(rds_tcp_incoming_slab, tinc);
5770041088SAndy Grover }
5870041088SAndy Grover 
5970041088SAndy Grover /*
6070041088SAndy Grover  * this is pretty lame, but, whatever.
6170041088SAndy Grover  */
62c310e72cSAl Viro int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
6370041088SAndy Grover {
6470041088SAndy Grover 	struct rds_tcp_incoming *tinc;
6570041088SAndy Grover 	struct sk_buff *skb;
6670041088SAndy Grover 	int ret = 0;
6770041088SAndy Grover 
68c310e72cSAl Viro 	if (!iov_iter_count(to))
6970041088SAndy Grover 		goto out;
7070041088SAndy Grover 
7170041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
7270041088SAndy Grover 
7370041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
74c310e72cSAl Viro 		unsigned long to_copy, skb_off;
75c310e72cSAl Viro 		for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) {
76c310e72cSAl Viro 			to_copy = iov_iter_count(to);
7770041088SAndy Grover 			to_copy = min(to_copy, skb->len - skb_off);
7870041088SAndy Grover 
79c310e72cSAl Viro 			if (skb_copy_datagram_iter(skb, skb_off, to, to_copy))
80c310e72cSAl Viro 				return -EFAULT;
8170041088SAndy Grover 
82b075cfdbSAndy Grover 			rds_stats_add(s_copy_to_user, to_copy);
8370041088SAndy Grover 			ret += to_copy;
84c310e72cSAl Viro 
85c310e72cSAl Viro 			if (!iov_iter_count(to))
8670041088SAndy Grover 				goto out;
8770041088SAndy Grover 		}
8870041088SAndy Grover 	}
8970041088SAndy Grover out:
9070041088SAndy Grover 	return ret;
9170041088SAndy Grover }
9270041088SAndy Grover 
9370041088SAndy Grover /*
9470041088SAndy Grover  * We have a series of skbs that have fragmented pieces of the congestion
9570041088SAndy Grover  * bitmap.  They must add up to the exact size of the congestion bitmap.  We
9670041088SAndy Grover  * use the skb helpers to copy those into the pages that make up the in-memory
9770041088SAndy Grover  * congestion bitmap for the remote address of this connection.  We then tell
9870041088SAndy Grover  * the congestion core that the bitmap has been changed so that it can wake up
9970041088SAndy Grover  * sleepers.
10070041088SAndy Grover  *
10170041088SAndy Grover  * This is racing with sending paths which are using test_bit to see if the
10270041088SAndy Grover  * bitmap indicates that their recipient is congested.
10370041088SAndy Grover  */
10470041088SAndy Grover 
10570041088SAndy Grover static void rds_tcp_cong_recv(struct rds_connection *conn,
10670041088SAndy Grover 			      struct rds_tcp_incoming *tinc)
10770041088SAndy Grover {
10870041088SAndy Grover 	struct sk_buff *skb;
10970041088SAndy Grover 	unsigned int to_copy, skb_off;
11070041088SAndy Grover 	unsigned int map_off;
11170041088SAndy Grover 	unsigned int map_page;
11270041088SAndy Grover 	struct rds_cong_map *map;
11370041088SAndy Grover 	int ret;
11470041088SAndy Grover 
11570041088SAndy Grover 	/* catch completely corrupt packets */
11670041088SAndy Grover 	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
11770041088SAndy Grover 		return;
11870041088SAndy Grover 
11970041088SAndy Grover 	map_page = 0;
12070041088SAndy Grover 	map_off = 0;
12170041088SAndy Grover 	map = conn->c_fcong;
12270041088SAndy Grover 
12370041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
12470041088SAndy Grover 		skb_off = 0;
12570041088SAndy Grover 		while (skb_off < skb->len) {
12670041088SAndy Grover 			to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
12770041088SAndy Grover 					skb->len - skb_off);
12870041088SAndy Grover 
12970041088SAndy Grover 			BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
13070041088SAndy Grover 
13170041088SAndy Grover 			/* only returns 0 or -error */
13270041088SAndy Grover 			ret = skb_copy_bits(skb, skb_off,
13370041088SAndy Grover 				(void *)map->m_page_addrs[map_page] + map_off,
13470041088SAndy Grover 				to_copy);
13570041088SAndy Grover 			BUG_ON(ret != 0);
13670041088SAndy Grover 
13770041088SAndy Grover 			skb_off += to_copy;
13870041088SAndy Grover 			map_off += to_copy;
13970041088SAndy Grover 			if (map_off == PAGE_SIZE) {
14070041088SAndy Grover 				map_off = 0;
14170041088SAndy Grover 				map_page++;
14270041088SAndy Grover 			}
14370041088SAndy Grover 		}
14470041088SAndy Grover 	}
14570041088SAndy Grover 
14670041088SAndy Grover 	rds_cong_map_updated(map, ~(u64) 0);
14770041088SAndy Grover }
14870041088SAndy Grover 
14970041088SAndy Grover struct rds_tcp_desc_arg {
15070041088SAndy Grover 	struct rds_connection *conn;
15170041088SAndy Grover 	gfp_t gfp;
15270041088SAndy Grover };
15370041088SAndy Grover 
15470041088SAndy Grover static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
15570041088SAndy Grover 			     unsigned int offset, size_t len)
15670041088SAndy Grover {
15770041088SAndy Grover 	struct rds_tcp_desc_arg *arg = desc->arg.data;
15870041088SAndy Grover 	struct rds_connection *conn = arg->conn;
15970041088SAndy Grover 	struct rds_tcp_connection *tc = conn->c_transport_data;
16070041088SAndy Grover 	struct rds_tcp_incoming *tinc = tc->t_tinc;
16170041088SAndy Grover 	struct sk_buff *clone;
16270041088SAndy Grover 	size_t left = len, to_copy;
16370041088SAndy Grover 
16470041088SAndy Grover 	rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
16570041088SAndy Grover 		 len);
16670041088SAndy Grover 
16770041088SAndy Grover 	/*
16870041088SAndy Grover 	 * tcp_read_sock() interprets partial progress as an indication to stop
16970041088SAndy Grover 	 * processing.
17070041088SAndy Grover 	 */
17170041088SAndy Grover 	while (left) {
1728690bfa1SAndy Grover 		if (!tinc) {
17370041088SAndy Grover 			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
17470041088SAndy Grover 					        arg->gfp);
1758690bfa1SAndy Grover 			if (!tinc) {
17670041088SAndy Grover 				desc->error = -ENOMEM;
17770041088SAndy Grover 				goto out;
17870041088SAndy Grover 			}
17970041088SAndy Grover 			tc->t_tinc = tinc;
18070041088SAndy Grover 			rdsdebug("alloced tinc %p\n", tinc);
18170041088SAndy Grover 			rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
18270041088SAndy Grover 			/*
18370041088SAndy Grover 			 * XXX * we might be able to use the __ variants when
18470041088SAndy Grover 			 * we've already serialized at a higher level.
18570041088SAndy Grover 			 */
18670041088SAndy Grover 			skb_queue_head_init(&tinc->ti_skb_list);
18770041088SAndy Grover 		}
18870041088SAndy Grover 
18970041088SAndy Grover 		if (left && tc->t_tinc_hdr_rem) {
19070041088SAndy Grover 			to_copy = min(tc->t_tinc_hdr_rem, left);
19170041088SAndy Grover 			rdsdebug("copying %zu header from skb %p\n", to_copy,
19270041088SAndy Grover 				 skb);
19370041088SAndy Grover 			skb_copy_bits(skb, offset,
19470041088SAndy Grover 				      (char *)&tinc->ti_inc.i_hdr +
19570041088SAndy Grover 						sizeof(struct rds_header) -
19670041088SAndy Grover 						tc->t_tinc_hdr_rem,
19770041088SAndy Grover 				      to_copy);
19870041088SAndy Grover 			tc->t_tinc_hdr_rem -= to_copy;
19970041088SAndy Grover 			left -= to_copy;
20070041088SAndy Grover 			offset += to_copy;
20170041088SAndy Grover 
20270041088SAndy Grover 			if (tc->t_tinc_hdr_rem == 0) {
20370041088SAndy Grover 				/* could be 0 for a 0 len message */
20470041088SAndy Grover 				tc->t_tinc_data_rem =
20570041088SAndy Grover 					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
20670041088SAndy Grover 			}
20770041088SAndy Grover 		}
20870041088SAndy Grover 
20970041088SAndy Grover 		if (left && tc->t_tinc_data_rem) {
210947d2756SSowmini Varadhan 			to_copy = min(tc->t_tinc_data_rem, left);
211947d2756SSowmini Varadhan 
212947d2756SSowmini Varadhan 			clone = pskb_extract(skb, offset, to_copy, arg->gfp);
2138690bfa1SAndy Grover 			if (!clone) {
21470041088SAndy Grover 				desc->error = -ENOMEM;
21570041088SAndy Grover 				goto out;
21670041088SAndy Grover 			}
21770041088SAndy Grover 
21870041088SAndy Grover 			skb_queue_tail(&tinc->ti_skb_list, clone);
21970041088SAndy Grover 
22070041088SAndy Grover 			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
22170041088SAndy Grover 				 "clone %p data %p len %d\n",
22270041088SAndy Grover 				 skb, skb->data, skb->len, offset, to_copy,
22370041088SAndy Grover 				 clone, clone->data, clone->len);
22470041088SAndy Grover 
22570041088SAndy Grover 			tc->t_tinc_data_rem -= to_copy;
22670041088SAndy Grover 			left -= to_copy;
22770041088SAndy Grover 			offset += to_copy;
22870041088SAndy Grover 		}
22970041088SAndy Grover 
23070041088SAndy Grover 		if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
23170041088SAndy Grover 			if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
23270041088SAndy Grover 				rds_tcp_cong_recv(conn, tinc);
23370041088SAndy Grover 			else
23470041088SAndy Grover 				rds_recv_incoming(conn, conn->c_faddr,
23570041088SAndy Grover 						  conn->c_laddr, &tinc->ti_inc,
2366114eab5SCong Wang 						  arg->gfp);
23770041088SAndy Grover 
23870041088SAndy Grover 			tc->t_tinc_hdr_rem = sizeof(struct rds_header);
23970041088SAndy Grover 			tc->t_tinc_data_rem = 0;
24070041088SAndy Grover 			tc->t_tinc = NULL;
24170041088SAndy Grover 			rds_inc_put(&tinc->ti_inc);
24270041088SAndy Grover 			tinc = NULL;
24370041088SAndy Grover 		}
24470041088SAndy Grover 	}
24570041088SAndy Grover out:
24670041088SAndy Grover 	rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
24770041088SAndy Grover 		 len, left, skb->len,
24870041088SAndy Grover 		 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
24970041088SAndy Grover 	return len - left;
25070041088SAndy Grover }
25170041088SAndy Grover 
25270041088SAndy Grover /* the caller has to hold the sock lock */
2536114eab5SCong Wang static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
25470041088SAndy Grover {
25570041088SAndy Grover 	struct rds_tcp_connection *tc = conn->c_transport_data;
25670041088SAndy Grover 	struct socket *sock = tc->t_sock;
25770041088SAndy Grover 	read_descriptor_t desc;
25870041088SAndy Grover 	struct rds_tcp_desc_arg arg;
25970041088SAndy Grover 
26070041088SAndy Grover 	/* It's like glib in the kernel! */
26170041088SAndy Grover 	arg.conn = conn;
26270041088SAndy Grover 	arg.gfp = gfp;
26370041088SAndy Grover 	desc.arg.data = &arg;
26470041088SAndy Grover 	desc.error = 0;
26570041088SAndy Grover 	desc.count = 1; /* give more than one skb per call */
26670041088SAndy Grover 
26770041088SAndy Grover 	tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
26870041088SAndy Grover 	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
26970041088SAndy Grover 		 desc.error);
27070041088SAndy Grover 
27170041088SAndy Grover 	return desc.error;
27270041088SAndy Grover }
27370041088SAndy Grover 
27470041088SAndy Grover /*
27570041088SAndy Grover  * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
27670041088SAndy Grover  * data_ready.
27770041088SAndy Grover  *
27870041088SAndy Grover  * if we fail to allocate we're in trouble.. blindly wait some time before
27970041088SAndy Grover  * trying again to see if the VM can free up something for us.
28070041088SAndy Grover  */
28170041088SAndy Grover int rds_tcp_recv(struct rds_connection *conn)
28270041088SAndy Grover {
28370041088SAndy Grover 	struct rds_tcp_connection *tc = conn->c_transport_data;
28470041088SAndy Grover 	struct socket *sock = tc->t_sock;
28570041088SAndy Grover 	int ret = 0;
28670041088SAndy Grover 
28770041088SAndy Grover 	rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
28870041088SAndy Grover 
28970041088SAndy Grover 	lock_sock(sock->sk);
2906114eab5SCong Wang 	ret = rds_tcp_read_sock(conn, GFP_KERNEL);
29170041088SAndy Grover 	release_sock(sock->sk);
29270041088SAndy Grover 
29370041088SAndy Grover 	return ret;
29470041088SAndy Grover }
29570041088SAndy Grover 
296676d2369SDavid S. Miller void rds_tcp_data_ready(struct sock *sk)
29770041088SAndy Grover {
298676d2369SDavid S. Miller 	void (*ready)(struct sock *sk);
29970041088SAndy Grover 	struct rds_connection *conn;
30070041088SAndy Grover 	struct rds_tcp_connection *tc;
30170041088SAndy Grover 
302676d2369SDavid S. Miller 	rdsdebug("data ready sk %p\n", sk);
30370041088SAndy Grover 
304*38036629SEric Dumazet 	read_lock_bh(&sk->sk_callback_lock);
30570041088SAndy Grover 	conn = sk->sk_user_data;
3068690bfa1SAndy Grover 	if (!conn) { /* check for teardown race */
30770041088SAndy Grover 		ready = sk->sk_data_ready;
30870041088SAndy Grover 		goto out;
30970041088SAndy Grover 	}
31070041088SAndy Grover 
31170041088SAndy Grover 	tc = conn->c_transport_data;
31270041088SAndy Grover 	ready = tc->t_orig_data_ready;
31370041088SAndy Grover 	rds_tcp_stats_inc(s_tcp_data_ready_calls);
31470041088SAndy Grover 
3156114eab5SCong Wang 	if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
31670041088SAndy Grover 		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
31770041088SAndy Grover out:
318*38036629SEric Dumazet 	read_unlock_bh(&sk->sk_callback_lock);
319676d2369SDavid S. Miller 	ready(sk);
32070041088SAndy Grover }
32170041088SAndy Grover 
322ef87b7eaSZach Brown int rds_tcp_recv_init(void)
32370041088SAndy Grover {
32470041088SAndy Grover 	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
32570041088SAndy Grover 					sizeof(struct rds_tcp_incoming),
32670041088SAndy Grover 					0, 0, NULL);
3278690bfa1SAndy Grover 	if (!rds_tcp_incoming_slab)
32870041088SAndy Grover 		return -ENOMEM;
32970041088SAndy Grover 	return 0;
33070041088SAndy Grover }
33170041088SAndy Grover 
33270041088SAndy Grover void rds_tcp_recv_exit(void)
33370041088SAndy Grover {
33470041088SAndy Grover 	kmem_cache_destroy(rds_tcp_incoming_slab);
33570041088SAndy Grover }
336