xref: /openbmc/linux/net/rds/tcp_recv.c (revision 5a0e3ad6af8660be21ca98a971cd00f331318c05)
/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
3370041088SAndy Grover #include <linux/kernel.h>
34*5a0e3ad6STejun Heo #include <linux/slab.h>
3570041088SAndy Grover #include <net/tcp.h>
3670041088SAndy Grover 
3770041088SAndy Grover #include "rds.h"
3870041088SAndy Grover #include "tcp.h"
3970041088SAndy Grover 
/*
 * Slab cache backing every struct rds_tcp_incoming in this file: created in
 * rds_tcp_recv_init(), allocated from in rds_tcp_data_recv(), returned to in
 * rds_tcp_inc_free() and torn down in rds_tcp_recv_exit().
 */
static struct kmem_cache *rds_tcp_incoming_slab;
4170041088SAndy Grover 
4270041088SAndy Grover void rds_tcp_inc_purge(struct rds_incoming *inc)
4370041088SAndy Grover {
4470041088SAndy Grover 	struct rds_tcp_incoming *tinc;
4570041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
4670041088SAndy Grover 	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
4770041088SAndy Grover 	skb_queue_purge(&tinc->ti_skb_list);
4870041088SAndy Grover }
4970041088SAndy Grover 
5070041088SAndy Grover void rds_tcp_inc_free(struct rds_incoming *inc)
5170041088SAndy Grover {
5270041088SAndy Grover 	struct rds_tcp_incoming *tinc;
5370041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
5470041088SAndy Grover 	rds_tcp_inc_purge(inc);
5570041088SAndy Grover 	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
5670041088SAndy Grover 	kmem_cache_free(rds_tcp_incoming_slab, tinc);
5770041088SAndy Grover }
5870041088SAndy Grover 
5970041088SAndy Grover /*
6070041088SAndy Grover  * this is pretty lame, but, whatever.
6170041088SAndy Grover  */
6270041088SAndy Grover int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
6370041088SAndy Grover 			     size_t size)
6470041088SAndy Grover {
6570041088SAndy Grover 	struct rds_tcp_incoming *tinc;
6670041088SAndy Grover 	struct iovec *iov, tmp;
6770041088SAndy Grover 	struct sk_buff *skb;
6870041088SAndy Grover 	unsigned long to_copy, skb_off;
6970041088SAndy Grover 	int ret = 0;
7070041088SAndy Grover 
7170041088SAndy Grover 	if (size == 0)
7270041088SAndy Grover 		goto out;
7370041088SAndy Grover 
7470041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
7570041088SAndy Grover 	iov = first_iov;
7670041088SAndy Grover 	tmp = *iov;
7770041088SAndy Grover 
7870041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
7970041088SAndy Grover 		skb_off = 0;
8070041088SAndy Grover 		while (skb_off < skb->len) {
8170041088SAndy Grover 			while (tmp.iov_len == 0) {
8270041088SAndy Grover 				iov++;
8370041088SAndy Grover 				tmp = *iov;
8470041088SAndy Grover 			}
8570041088SAndy Grover 
8670041088SAndy Grover 			to_copy = min(tmp.iov_len, size);
8770041088SAndy Grover 			to_copy = min(to_copy, skb->len - skb_off);
8870041088SAndy Grover 
8970041088SAndy Grover 			rdsdebug("ret %d size %zu skb %p skb_off %lu "
9070041088SAndy Grover 				 "skblen %d iov_base %p iov_len %zu cpy %lu\n",
9170041088SAndy Grover 				 ret, size, skb, skb_off, skb->len,
9270041088SAndy Grover 				 tmp.iov_base, tmp.iov_len, to_copy);
9370041088SAndy Grover 
9470041088SAndy Grover 			/* modifies tmp as it copies */
9570041088SAndy Grover 			if (skb_copy_datagram_iovec(skb, skb_off, &tmp,
9670041088SAndy Grover 						    to_copy)) {
9770041088SAndy Grover 				ret = -EFAULT;
9870041088SAndy Grover 				goto out;
9970041088SAndy Grover 			}
10070041088SAndy Grover 
10170041088SAndy Grover 			size -= to_copy;
10270041088SAndy Grover 			ret += to_copy;
10370041088SAndy Grover 			skb_off += to_copy;
10470041088SAndy Grover 			if (size == 0)
10570041088SAndy Grover 				goto out;
10670041088SAndy Grover 		}
10770041088SAndy Grover 	}
10870041088SAndy Grover out:
10970041088SAndy Grover 	return ret;
11070041088SAndy Grover }
11170041088SAndy Grover 
11270041088SAndy Grover /*
11370041088SAndy Grover  * We have a series of skbs that have fragmented pieces of the congestion
11470041088SAndy Grover  * bitmap.  They must add up to the exact size of the congestion bitmap.  We
11570041088SAndy Grover  * use the skb helpers to copy those into the pages that make up the in-memory
11670041088SAndy Grover  * congestion bitmap for the remote address of this connection.  We then tell
11770041088SAndy Grover  * the congestion core that the bitmap has been changed so that it can wake up
11870041088SAndy Grover  * sleepers.
11970041088SAndy Grover  *
12070041088SAndy Grover  * This is racing with sending paths which are using test_bit to see if the
12170041088SAndy Grover  * bitmap indicates that their recipient is congested.
12270041088SAndy Grover  */
12370041088SAndy Grover 
12470041088SAndy Grover static void rds_tcp_cong_recv(struct rds_connection *conn,
12570041088SAndy Grover 			      struct rds_tcp_incoming *tinc)
12670041088SAndy Grover {
12770041088SAndy Grover 	struct sk_buff *skb;
12870041088SAndy Grover 	unsigned int to_copy, skb_off;
12970041088SAndy Grover 	unsigned int map_off;
13070041088SAndy Grover 	unsigned int map_page;
13170041088SAndy Grover 	struct rds_cong_map *map;
13270041088SAndy Grover 	int ret;
13370041088SAndy Grover 
13470041088SAndy Grover 	/* catch completely corrupt packets */
13570041088SAndy Grover 	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
13670041088SAndy Grover 		return;
13770041088SAndy Grover 
13870041088SAndy Grover 	map_page = 0;
13970041088SAndy Grover 	map_off = 0;
14070041088SAndy Grover 	map = conn->c_fcong;
14170041088SAndy Grover 
14270041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
14370041088SAndy Grover 		skb_off = 0;
14470041088SAndy Grover 		while (skb_off < skb->len) {
14570041088SAndy Grover 			to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
14670041088SAndy Grover 					skb->len - skb_off);
14770041088SAndy Grover 
14870041088SAndy Grover 			BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
14970041088SAndy Grover 
15070041088SAndy Grover 			/* only returns 0 or -error */
15170041088SAndy Grover 			ret = skb_copy_bits(skb, skb_off,
15270041088SAndy Grover 				(void *)map->m_page_addrs[map_page] + map_off,
15370041088SAndy Grover 				to_copy);
15470041088SAndy Grover 			BUG_ON(ret != 0);
15570041088SAndy Grover 
15670041088SAndy Grover 			skb_off += to_copy;
15770041088SAndy Grover 			map_off += to_copy;
15870041088SAndy Grover 			if (map_off == PAGE_SIZE) {
15970041088SAndy Grover 				map_off = 0;
16070041088SAndy Grover 				map_page++;
16170041088SAndy Grover 			}
16270041088SAndy Grover 		}
16370041088SAndy Grover 	}
16470041088SAndy Grover 
16570041088SAndy Grover 	rds_cong_map_updated(map, ~(u64) 0);
16670041088SAndy Grover }
16770041088SAndy Grover 
16870041088SAndy Grover struct rds_tcp_desc_arg {
16970041088SAndy Grover 	struct rds_connection *conn;
17070041088SAndy Grover 	gfp_t gfp;
17170041088SAndy Grover 	enum km_type km;
17270041088SAndy Grover };
17370041088SAndy Grover 
17470041088SAndy Grover static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
17570041088SAndy Grover 			     unsigned int offset, size_t len)
17670041088SAndy Grover {
17770041088SAndy Grover 	struct rds_tcp_desc_arg *arg = desc->arg.data;
17870041088SAndy Grover 	struct rds_connection *conn = arg->conn;
17970041088SAndy Grover 	struct rds_tcp_connection *tc = conn->c_transport_data;
18070041088SAndy Grover 	struct rds_tcp_incoming *tinc = tc->t_tinc;
18170041088SAndy Grover 	struct sk_buff *clone;
18270041088SAndy Grover 	size_t left = len, to_copy;
18370041088SAndy Grover 
18470041088SAndy Grover 	rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
18570041088SAndy Grover 		 len);
18670041088SAndy Grover 
18770041088SAndy Grover 	/*
18870041088SAndy Grover 	 * tcp_read_sock() interprets partial progress as an indication to stop
18970041088SAndy Grover 	 * processing.
19070041088SAndy Grover 	 */
19170041088SAndy Grover 	while (left) {
19270041088SAndy Grover 		if (tinc == NULL) {
19370041088SAndy Grover 			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
19470041088SAndy Grover 					        arg->gfp);
19570041088SAndy Grover 			if (tinc == NULL) {
19670041088SAndy Grover 				desc->error = -ENOMEM;
19770041088SAndy Grover 				goto out;
19870041088SAndy Grover 			}
19970041088SAndy Grover 			tc->t_tinc = tinc;
20070041088SAndy Grover 			rdsdebug("alloced tinc %p\n", tinc);
20170041088SAndy Grover 			rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
20270041088SAndy Grover 			/*
20370041088SAndy Grover 			 * XXX * we might be able to use the __ variants when
20470041088SAndy Grover 			 * we've already serialized at a higher level.
20570041088SAndy Grover 			 */
20670041088SAndy Grover 			skb_queue_head_init(&tinc->ti_skb_list);
20770041088SAndy Grover 		}
20870041088SAndy Grover 
20970041088SAndy Grover 		if (left && tc->t_tinc_hdr_rem) {
21070041088SAndy Grover 			to_copy = min(tc->t_tinc_hdr_rem, left);
21170041088SAndy Grover 			rdsdebug("copying %zu header from skb %p\n", to_copy,
21270041088SAndy Grover 				 skb);
21370041088SAndy Grover 			skb_copy_bits(skb, offset,
21470041088SAndy Grover 				      (char *)&tinc->ti_inc.i_hdr +
21570041088SAndy Grover 						sizeof(struct rds_header) -
21670041088SAndy Grover 						tc->t_tinc_hdr_rem,
21770041088SAndy Grover 				      to_copy);
21870041088SAndy Grover 			tc->t_tinc_hdr_rem -= to_copy;
21970041088SAndy Grover 			left -= to_copy;
22070041088SAndy Grover 			offset += to_copy;
22170041088SAndy Grover 
22270041088SAndy Grover 			if (tc->t_tinc_hdr_rem == 0) {
22370041088SAndy Grover 				/* could be 0 for a 0 len message */
22470041088SAndy Grover 				tc->t_tinc_data_rem =
22570041088SAndy Grover 					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
22670041088SAndy Grover 			}
22770041088SAndy Grover 		}
22870041088SAndy Grover 
22970041088SAndy Grover 		if (left && tc->t_tinc_data_rem) {
23070041088SAndy Grover 			clone = skb_clone(skb, arg->gfp);
23170041088SAndy Grover 			if (clone == NULL) {
23270041088SAndy Grover 				desc->error = -ENOMEM;
23370041088SAndy Grover 				goto out;
23470041088SAndy Grover 			}
23570041088SAndy Grover 
23670041088SAndy Grover 			to_copy = min(tc->t_tinc_data_rem, left);
23770041088SAndy Grover 			pskb_pull(clone, offset);
23870041088SAndy Grover 			pskb_trim(clone, to_copy);
23970041088SAndy Grover 			skb_queue_tail(&tinc->ti_skb_list, clone);
24070041088SAndy Grover 
24170041088SAndy Grover 			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
24270041088SAndy Grover 				 "clone %p data %p len %d\n",
24370041088SAndy Grover 				 skb, skb->data, skb->len, offset, to_copy,
24470041088SAndy Grover 				 clone, clone->data, clone->len);
24570041088SAndy Grover 
24670041088SAndy Grover 			tc->t_tinc_data_rem -= to_copy;
24770041088SAndy Grover 			left -= to_copy;
24870041088SAndy Grover 			offset += to_copy;
24970041088SAndy Grover 		}
25070041088SAndy Grover 
25170041088SAndy Grover 		if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
25270041088SAndy Grover 			if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
25370041088SAndy Grover 				rds_tcp_cong_recv(conn, tinc);
25470041088SAndy Grover 			else
25570041088SAndy Grover 				rds_recv_incoming(conn, conn->c_faddr,
25670041088SAndy Grover 						  conn->c_laddr, &tinc->ti_inc,
25770041088SAndy Grover 						  arg->gfp, arg->km);
25870041088SAndy Grover 
25970041088SAndy Grover 			tc->t_tinc_hdr_rem = sizeof(struct rds_header);
26070041088SAndy Grover 			tc->t_tinc_data_rem = 0;
26170041088SAndy Grover 			tc->t_tinc = NULL;
26270041088SAndy Grover 			rds_inc_put(&tinc->ti_inc);
26370041088SAndy Grover 			tinc = NULL;
26470041088SAndy Grover 		}
26570041088SAndy Grover 	}
26670041088SAndy Grover out:
26770041088SAndy Grover 	rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
26870041088SAndy Grover 		 len, left, skb->len,
26970041088SAndy Grover 		 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
27070041088SAndy Grover 	return len - left;
27170041088SAndy Grover }
27270041088SAndy Grover 
27370041088SAndy Grover /* the caller has to hold the sock lock */
27470041088SAndy Grover int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km)
27570041088SAndy Grover {
27670041088SAndy Grover 	struct rds_tcp_connection *tc = conn->c_transport_data;
27770041088SAndy Grover 	struct socket *sock = tc->t_sock;
27870041088SAndy Grover 	read_descriptor_t desc;
27970041088SAndy Grover 	struct rds_tcp_desc_arg arg;
28070041088SAndy Grover 
28170041088SAndy Grover 	/* It's like glib in the kernel! */
28270041088SAndy Grover 	arg.conn = conn;
28370041088SAndy Grover 	arg.gfp = gfp;
28470041088SAndy Grover 	arg.km = km;
28570041088SAndy Grover 	desc.arg.data = &arg;
28670041088SAndy Grover 	desc.error = 0;
28770041088SAndy Grover 	desc.count = 1; /* give more than one skb per call */
28870041088SAndy Grover 
28970041088SAndy Grover 	tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
29070041088SAndy Grover 	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
29170041088SAndy Grover 		 desc.error);
29270041088SAndy Grover 
29370041088SAndy Grover 	return desc.error;
29470041088SAndy Grover }
29570041088SAndy Grover 
29670041088SAndy Grover /*
29770041088SAndy Grover  * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
29870041088SAndy Grover  * data_ready.
29970041088SAndy Grover  *
30070041088SAndy Grover  * if we fail to allocate we're in trouble.. blindly wait some time before
30170041088SAndy Grover  * trying again to see if the VM can free up something for us.
30270041088SAndy Grover  */
30370041088SAndy Grover int rds_tcp_recv(struct rds_connection *conn)
30470041088SAndy Grover {
30570041088SAndy Grover 	struct rds_tcp_connection *tc = conn->c_transport_data;
30670041088SAndy Grover 	struct socket *sock = tc->t_sock;
30770041088SAndy Grover 	int ret = 0;
30870041088SAndy Grover 
30970041088SAndy Grover 	rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
31070041088SAndy Grover 
31170041088SAndy Grover 	lock_sock(sock->sk);
31270041088SAndy Grover 	ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0);
31370041088SAndy Grover 	release_sock(sock->sk);
31470041088SAndy Grover 
31570041088SAndy Grover 	return ret;
31670041088SAndy Grover }
31770041088SAndy Grover 
31870041088SAndy Grover void rds_tcp_data_ready(struct sock *sk, int bytes)
31970041088SAndy Grover {
32070041088SAndy Grover 	void (*ready)(struct sock *sk, int bytes);
32170041088SAndy Grover 	struct rds_connection *conn;
32270041088SAndy Grover 	struct rds_tcp_connection *tc;
32370041088SAndy Grover 
32470041088SAndy Grover 	rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
32570041088SAndy Grover 
32670041088SAndy Grover 	read_lock(&sk->sk_callback_lock);
32770041088SAndy Grover 	conn = sk->sk_user_data;
32870041088SAndy Grover 	if (conn == NULL) { /* check for teardown race */
32970041088SAndy Grover 		ready = sk->sk_data_ready;
33070041088SAndy Grover 		goto out;
33170041088SAndy Grover 	}
33270041088SAndy Grover 
33370041088SAndy Grover 	tc = conn->c_transport_data;
33470041088SAndy Grover 	ready = tc->t_orig_data_ready;
33570041088SAndy Grover 	rds_tcp_stats_inc(s_tcp_data_ready_calls);
33670041088SAndy Grover 
33770041088SAndy Grover 	if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM)
33870041088SAndy Grover 		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
33970041088SAndy Grover out:
34070041088SAndy Grover 	read_unlock(&sk->sk_callback_lock);
34170041088SAndy Grover 	ready(sk, bytes);
34270041088SAndy Grover }
34370041088SAndy Grover 
34470041088SAndy Grover int __init rds_tcp_recv_init(void)
34570041088SAndy Grover {
34670041088SAndy Grover 	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
34770041088SAndy Grover 					sizeof(struct rds_tcp_incoming),
34870041088SAndy Grover 					0, 0, NULL);
34970041088SAndy Grover 	if (rds_tcp_incoming_slab == NULL)
35070041088SAndy Grover 		return -ENOMEM;
35170041088SAndy Grover 	return 0;
35270041088SAndy Grover }
35370041088SAndy Grover 
35470041088SAndy Grover void rds_tcp_recv_exit(void)
35570041088SAndy Grover {
35670041088SAndy Grover 	kmem_cache_destroy(rds_tcp_incoming_slab);
35770041088SAndy Grover }
358