xref: /openbmc/linux/net/rds/tcp_recv.c (revision 70041088e3b976627ba9a183b812f39ef8a9ba0e)
1*70041088SAndy Grover /*
2*70041088SAndy Grover  * Copyright (c) 2006 Oracle.  All rights reserved.
3*70041088SAndy Grover  *
4*70041088SAndy Grover  * This software is available to you under a choice of one of two
5*70041088SAndy Grover  * licenses.  You may choose to be licensed under the terms of the GNU
6*70041088SAndy Grover  * General Public License (GPL) Version 2, available from the file
7*70041088SAndy Grover  * COPYING in the main directory of this source tree, or the
8*70041088SAndy Grover  * OpenIB.org BSD license below:
9*70041088SAndy Grover  *
10*70041088SAndy Grover  *     Redistribution and use in source and binary forms, with or
11*70041088SAndy Grover  *     without modification, are permitted provided that the following
12*70041088SAndy Grover  *     conditions are met:
13*70041088SAndy Grover  *
14*70041088SAndy Grover  *      - Redistributions of source code must retain the above
15*70041088SAndy Grover  *        copyright notice, this list of conditions and the following
16*70041088SAndy Grover  *        disclaimer.
17*70041088SAndy Grover  *
18*70041088SAndy Grover  *      - Redistributions in binary form must reproduce the above
19*70041088SAndy Grover  *        copyright notice, this list of conditions and the following
20*70041088SAndy Grover  *        disclaimer in the documentation and/or other materials
21*70041088SAndy Grover  *        provided with the distribution.
22*70041088SAndy Grover  *
23*70041088SAndy Grover  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24*70041088SAndy Grover  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25*70041088SAndy Grover  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26*70041088SAndy Grover  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27*70041088SAndy Grover  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28*70041088SAndy Grover  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29*70041088SAndy Grover  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30*70041088SAndy Grover  * SOFTWARE.
31*70041088SAndy Grover  *
32*70041088SAndy Grover  */
33*70041088SAndy Grover #include <linux/kernel.h>
34*70041088SAndy Grover #include <net/tcp.h>
35*70041088SAndy Grover 
36*70041088SAndy Grover #include "rds.h"
37*70041088SAndy Grover #include "tcp.h"
38*70041088SAndy Grover 
/*
 * Slab cache that backs every struct rds_tcp_incoming used in this file:
 * created in rds_tcp_recv_init(), allocated from in rds_tcp_data_recv(),
 * returned to in rds_tcp_inc_free() and destroyed in rds_tcp_recv_exit().
 */
static struct kmem_cache *rds_tcp_incoming_slab;
40*70041088SAndy Grover 
41*70041088SAndy Grover void rds_tcp_inc_purge(struct rds_incoming *inc)
42*70041088SAndy Grover {
43*70041088SAndy Grover 	struct rds_tcp_incoming *tinc;
44*70041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
45*70041088SAndy Grover 	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
46*70041088SAndy Grover 	skb_queue_purge(&tinc->ti_skb_list);
47*70041088SAndy Grover }
48*70041088SAndy Grover 
49*70041088SAndy Grover void rds_tcp_inc_free(struct rds_incoming *inc)
50*70041088SAndy Grover {
51*70041088SAndy Grover 	struct rds_tcp_incoming *tinc;
52*70041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
53*70041088SAndy Grover 	rds_tcp_inc_purge(inc);
54*70041088SAndy Grover 	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
55*70041088SAndy Grover 	kmem_cache_free(rds_tcp_incoming_slab, tinc);
56*70041088SAndy Grover }
57*70041088SAndy Grover 
58*70041088SAndy Grover /*
59*70041088SAndy Grover  * this is pretty lame, but, whatever.
60*70041088SAndy Grover  */
61*70041088SAndy Grover int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
62*70041088SAndy Grover 			     size_t size)
63*70041088SAndy Grover {
64*70041088SAndy Grover 	struct rds_tcp_incoming *tinc;
65*70041088SAndy Grover 	struct iovec *iov, tmp;
66*70041088SAndy Grover 	struct sk_buff *skb;
67*70041088SAndy Grover 	unsigned long to_copy, skb_off;
68*70041088SAndy Grover 	int ret = 0;
69*70041088SAndy Grover 
70*70041088SAndy Grover 	if (size == 0)
71*70041088SAndy Grover 		goto out;
72*70041088SAndy Grover 
73*70041088SAndy Grover 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
74*70041088SAndy Grover 	iov = first_iov;
75*70041088SAndy Grover 	tmp = *iov;
76*70041088SAndy Grover 
77*70041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
78*70041088SAndy Grover 		skb_off = 0;
79*70041088SAndy Grover 		while (skb_off < skb->len) {
80*70041088SAndy Grover 			while (tmp.iov_len == 0) {
81*70041088SAndy Grover 				iov++;
82*70041088SAndy Grover 				tmp = *iov;
83*70041088SAndy Grover 			}
84*70041088SAndy Grover 
85*70041088SAndy Grover 			to_copy = min(tmp.iov_len, size);
86*70041088SAndy Grover 			to_copy = min(to_copy, skb->len - skb_off);
87*70041088SAndy Grover 
88*70041088SAndy Grover 			rdsdebug("ret %d size %zu skb %p skb_off %lu "
89*70041088SAndy Grover 				 "skblen %d iov_base %p iov_len %zu cpy %lu\n",
90*70041088SAndy Grover 				 ret, size, skb, skb_off, skb->len,
91*70041088SAndy Grover 				 tmp.iov_base, tmp.iov_len, to_copy);
92*70041088SAndy Grover 
93*70041088SAndy Grover 			/* modifies tmp as it copies */
94*70041088SAndy Grover 			if (skb_copy_datagram_iovec(skb, skb_off, &tmp,
95*70041088SAndy Grover 						    to_copy)) {
96*70041088SAndy Grover 				ret = -EFAULT;
97*70041088SAndy Grover 				goto out;
98*70041088SAndy Grover 			}
99*70041088SAndy Grover 
100*70041088SAndy Grover 			size -= to_copy;
101*70041088SAndy Grover 			ret += to_copy;
102*70041088SAndy Grover 			skb_off += to_copy;
103*70041088SAndy Grover 			if (size == 0)
104*70041088SAndy Grover 				goto out;
105*70041088SAndy Grover 		}
106*70041088SAndy Grover 	}
107*70041088SAndy Grover out:
108*70041088SAndy Grover 	return ret;
109*70041088SAndy Grover }
110*70041088SAndy Grover 
111*70041088SAndy Grover /*
112*70041088SAndy Grover  * We have a series of skbs that have fragmented pieces of the congestion
113*70041088SAndy Grover  * bitmap.  They must add up to the exact size of the congestion bitmap.  We
114*70041088SAndy Grover  * use the skb helpers to copy those into the pages that make up the in-memory
115*70041088SAndy Grover  * congestion bitmap for the remote address of this connection.  We then tell
116*70041088SAndy Grover  * the congestion core that the bitmap has been changed so that it can wake up
117*70041088SAndy Grover  * sleepers.
118*70041088SAndy Grover  *
119*70041088SAndy Grover  * This is racing with sending paths which are using test_bit to see if the
120*70041088SAndy Grover  * bitmap indicates that their recipient is congested.
121*70041088SAndy Grover  */
122*70041088SAndy Grover 
123*70041088SAndy Grover static void rds_tcp_cong_recv(struct rds_connection *conn,
124*70041088SAndy Grover 			      struct rds_tcp_incoming *tinc)
125*70041088SAndy Grover {
126*70041088SAndy Grover 	struct sk_buff *skb;
127*70041088SAndy Grover 	unsigned int to_copy, skb_off;
128*70041088SAndy Grover 	unsigned int map_off;
129*70041088SAndy Grover 	unsigned int map_page;
130*70041088SAndy Grover 	struct rds_cong_map *map;
131*70041088SAndy Grover 	int ret;
132*70041088SAndy Grover 
133*70041088SAndy Grover 	/* catch completely corrupt packets */
134*70041088SAndy Grover 	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
135*70041088SAndy Grover 		return;
136*70041088SAndy Grover 
137*70041088SAndy Grover 	map_page = 0;
138*70041088SAndy Grover 	map_off = 0;
139*70041088SAndy Grover 	map = conn->c_fcong;
140*70041088SAndy Grover 
141*70041088SAndy Grover 	skb_queue_walk(&tinc->ti_skb_list, skb) {
142*70041088SAndy Grover 		skb_off = 0;
143*70041088SAndy Grover 		while (skb_off < skb->len) {
144*70041088SAndy Grover 			to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
145*70041088SAndy Grover 					skb->len - skb_off);
146*70041088SAndy Grover 
147*70041088SAndy Grover 			BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
148*70041088SAndy Grover 
149*70041088SAndy Grover 			/* only returns 0 or -error */
150*70041088SAndy Grover 			ret = skb_copy_bits(skb, skb_off,
151*70041088SAndy Grover 				(void *)map->m_page_addrs[map_page] + map_off,
152*70041088SAndy Grover 				to_copy);
153*70041088SAndy Grover 			BUG_ON(ret != 0);
154*70041088SAndy Grover 
155*70041088SAndy Grover 			skb_off += to_copy;
156*70041088SAndy Grover 			map_off += to_copy;
157*70041088SAndy Grover 			if (map_off == PAGE_SIZE) {
158*70041088SAndy Grover 				map_off = 0;
159*70041088SAndy Grover 				map_page++;
160*70041088SAndy Grover 			}
161*70041088SAndy Grover 		}
162*70041088SAndy Grover 	}
163*70041088SAndy Grover 
164*70041088SAndy Grover 	rds_cong_map_updated(map, ~(u64) 0);
165*70041088SAndy Grover }
166*70041088SAndy Grover 
167*70041088SAndy Grover struct rds_tcp_desc_arg {
168*70041088SAndy Grover 	struct rds_connection *conn;
169*70041088SAndy Grover 	gfp_t gfp;
170*70041088SAndy Grover 	enum km_type km;
171*70041088SAndy Grover };
172*70041088SAndy Grover 
173*70041088SAndy Grover static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
174*70041088SAndy Grover 			     unsigned int offset, size_t len)
175*70041088SAndy Grover {
176*70041088SAndy Grover 	struct rds_tcp_desc_arg *arg = desc->arg.data;
177*70041088SAndy Grover 	struct rds_connection *conn = arg->conn;
178*70041088SAndy Grover 	struct rds_tcp_connection *tc = conn->c_transport_data;
179*70041088SAndy Grover 	struct rds_tcp_incoming *tinc = tc->t_tinc;
180*70041088SAndy Grover 	struct sk_buff *clone;
181*70041088SAndy Grover 	size_t left = len, to_copy;
182*70041088SAndy Grover 
183*70041088SAndy Grover 	rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
184*70041088SAndy Grover 		 len);
185*70041088SAndy Grover 
186*70041088SAndy Grover 	/*
187*70041088SAndy Grover 	 * tcp_read_sock() interprets partial progress as an indication to stop
188*70041088SAndy Grover 	 * processing.
189*70041088SAndy Grover 	 */
190*70041088SAndy Grover 	while (left) {
191*70041088SAndy Grover 		if (tinc == NULL) {
192*70041088SAndy Grover 			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
193*70041088SAndy Grover 					        arg->gfp);
194*70041088SAndy Grover 			if (tinc == NULL) {
195*70041088SAndy Grover 				desc->error = -ENOMEM;
196*70041088SAndy Grover 				goto out;
197*70041088SAndy Grover 			}
198*70041088SAndy Grover 			tc->t_tinc = tinc;
199*70041088SAndy Grover 			rdsdebug("alloced tinc %p\n", tinc);
200*70041088SAndy Grover 			rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
201*70041088SAndy Grover 			/*
202*70041088SAndy Grover 			 * XXX * we might be able to use the __ variants when
203*70041088SAndy Grover 			 * we've already serialized at a higher level.
204*70041088SAndy Grover 			 */
205*70041088SAndy Grover 			skb_queue_head_init(&tinc->ti_skb_list);
206*70041088SAndy Grover 		}
207*70041088SAndy Grover 
208*70041088SAndy Grover 		if (left && tc->t_tinc_hdr_rem) {
209*70041088SAndy Grover 			to_copy = min(tc->t_tinc_hdr_rem, left);
210*70041088SAndy Grover 			rdsdebug("copying %zu header from skb %p\n", to_copy,
211*70041088SAndy Grover 				 skb);
212*70041088SAndy Grover 			skb_copy_bits(skb, offset,
213*70041088SAndy Grover 				      (char *)&tinc->ti_inc.i_hdr +
214*70041088SAndy Grover 						sizeof(struct rds_header) -
215*70041088SAndy Grover 						tc->t_tinc_hdr_rem,
216*70041088SAndy Grover 				      to_copy);
217*70041088SAndy Grover 			tc->t_tinc_hdr_rem -= to_copy;
218*70041088SAndy Grover 			left -= to_copy;
219*70041088SAndy Grover 			offset += to_copy;
220*70041088SAndy Grover 
221*70041088SAndy Grover 			if (tc->t_tinc_hdr_rem == 0) {
222*70041088SAndy Grover 				/* could be 0 for a 0 len message */
223*70041088SAndy Grover 				tc->t_tinc_data_rem =
224*70041088SAndy Grover 					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
225*70041088SAndy Grover 			}
226*70041088SAndy Grover 		}
227*70041088SAndy Grover 
228*70041088SAndy Grover 		if (left && tc->t_tinc_data_rem) {
229*70041088SAndy Grover 			clone = skb_clone(skb, arg->gfp);
230*70041088SAndy Grover 			if (clone == NULL) {
231*70041088SAndy Grover 				desc->error = -ENOMEM;
232*70041088SAndy Grover 				goto out;
233*70041088SAndy Grover 			}
234*70041088SAndy Grover 
235*70041088SAndy Grover 			to_copy = min(tc->t_tinc_data_rem, left);
236*70041088SAndy Grover 			pskb_pull(clone, offset);
237*70041088SAndy Grover 			pskb_trim(clone, to_copy);
238*70041088SAndy Grover 			skb_queue_tail(&tinc->ti_skb_list, clone);
239*70041088SAndy Grover 
240*70041088SAndy Grover 			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
241*70041088SAndy Grover 				 "clone %p data %p len %d\n",
242*70041088SAndy Grover 				 skb, skb->data, skb->len, offset, to_copy,
243*70041088SAndy Grover 				 clone, clone->data, clone->len);
244*70041088SAndy Grover 
245*70041088SAndy Grover 			tc->t_tinc_data_rem -= to_copy;
246*70041088SAndy Grover 			left -= to_copy;
247*70041088SAndy Grover 			offset += to_copy;
248*70041088SAndy Grover 		}
249*70041088SAndy Grover 
250*70041088SAndy Grover 		if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
251*70041088SAndy Grover 			if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
252*70041088SAndy Grover 				rds_tcp_cong_recv(conn, tinc);
253*70041088SAndy Grover 			else
254*70041088SAndy Grover 				rds_recv_incoming(conn, conn->c_faddr,
255*70041088SAndy Grover 						  conn->c_laddr, &tinc->ti_inc,
256*70041088SAndy Grover 						  arg->gfp, arg->km);
257*70041088SAndy Grover 
258*70041088SAndy Grover 			tc->t_tinc_hdr_rem = sizeof(struct rds_header);
259*70041088SAndy Grover 			tc->t_tinc_data_rem = 0;
260*70041088SAndy Grover 			tc->t_tinc = NULL;
261*70041088SAndy Grover 			rds_inc_put(&tinc->ti_inc);
262*70041088SAndy Grover 			tinc = NULL;
263*70041088SAndy Grover 		}
264*70041088SAndy Grover 	}
265*70041088SAndy Grover out:
266*70041088SAndy Grover 	rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
267*70041088SAndy Grover 		 len, left, skb->len,
268*70041088SAndy Grover 		 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
269*70041088SAndy Grover 	return len - left;
270*70041088SAndy Grover }
271*70041088SAndy Grover 
272*70041088SAndy Grover /* the caller has to hold the sock lock */
273*70041088SAndy Grover int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km)
274*70041088SAndy Grover {
275*70041088SAndy Grover 	struct rds_tcp_connection *tc = conn->c_transport_data;
276*70041088SAndy Grover 	struct socket *sock = tc->t_sock;
277*70041088SAndy Grover 	read_descriptor_t desc;
278*70041088SAndy Grover 	struct rds_tcp_desc_arg arg;
279*70041088SAndy Grover 
280*70041088SAndy Grover 	/* It's like glib in the kernel! */
281*70041088SAndy Grover 	arg.conn = conn;
282*70041088SAndy Grover 	arg.gfp = gfp;
283*70041088SAndy Grover 	arg.km = km;
284*70041088SAndy Grover 	desc.arg.data = &arg;
285*70041088SAndy Grover 	desc.error = 0;
286*70041088SAndy Grover 	desc.count = 1; /* give more than one skb per call */
287*70041088SAndy Grover 
288*70041088SAndy Grover 	tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
289*70041088SAndy Grover 	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
290*70041088SAndy Grover 		 desc.error);
291*70041088SAndy Grover 
292*70041088SAndy Grover 	return desc.error;
293*70041088SAndy Grover }
294*70041088SAndy Grover 
295*70041088SAndy Grover /*
296*70041088SAndy Grover  * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
297*70041088SAndy Grover  * data_ready.
298*70041088SAndy Grover  *
299*70041088SAndy Grover  * if we fail to allocate we're in trouble.. blindly wait some time before
300*70041088SAndy Grover  * trying again to see if the VM can free up something for us.
301*70041088SAndy Grover  */
302*70041088SAndy Grover int rds_tcp_recv(struct rds_connection *conn)
303*70041088SAndy Grover {
304*70041088SAndy Grover 	struct rds_tcp_connection *tc = conn->c_transport_data;
305*70041088SAndy Grover 	struct socket *sock = tc->t_sock;
306*70041088SAndy Grover 	int ret = 0;
307*70041088SAndy Grover 
308*70041088SAndy Grover 	rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
309*70041088SAndy Grover 
310*70041088SAndy Grover 	lock_sock(sock->sk);
311*70041088SAndy Grover 	ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0);
312*70041088SAndy Grover 	release_sock(sock->sk);
313*70041088SAndy Grover 
314*70041088SAndy Grover 	return ret;
315*70041088SAndy Grover }
316*70041088SAndy Grover 
317*70041088SAndy Grover void rds_tcp_data_ready(struct sock *sk, int bytes)
318*70041088SAndy Grover {
319*70041088SAndy Grover 	void (*ready)(struct sock *sk, int bytes);
320*70041088SAndy Grover 	struct rds_connection *conn;
321*70041088SAndy Grover 	struct rds_tcp_connection *tc;
322*70041088SAndy Grover 
323*70041088SAndy Grover 	rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
324*70041088SAndy Grover 
325*70041088SAndy Grover 	read_lock(&sk->sk_callback_lock);
326*70041088SAndy Grover 	conn = sk->sk_user_data;
327*70041088SAndy Grover 	if (conn == NULL) { /* check for teardown race */
328*70041088SAndy Grover 		ready = sk->sk_data_ready;
329*70041088SAndy Grover 		goto out;
330*70041088SAndy Grover 	}
331*70041088SAndy Grover 
332*70041088SAndy Grover 	tc = conn->c_transport_data;
333*70041088SAndy Grover 	ready = tc->t_orig_data_ready;
334*70041088SAndy Grover 	rds_tcp_stats_inc(s_tcp_data_ready_calls);
335*70041088SAndy Grover 
336*70041088SAndy Grover 	if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM)
337*70041088SAndy Grover 		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
338*70041088SAndy Grover out:
339*70041088SAndy Grover 	read_unlock(&sk->sk_callback_lock);
340*70041088SAndy Grover 	ready(sk, bytes);
341*70041088SAndy Grover }
342*70041088SAndy Grover 
343*70041088SAndy Grover int __init rds_tcp_recv_init(void)
344*70041088SAndy Grover {
345*70041088SAndy Grover 	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
346*70041088SAndy Grover 					sizeof(struct rds_tcp_incoming),
347*70041088SAndy Grover 					0, 0, NULL);
348*70041088SAndy Grover 	if (rds_tcp_incoming_slab == NULL)
349*70041088SAndy Grover 		return -ENOMEM;
350*70041088SAndy Grover 	return 0;
351*70041088SAndy Grover }
352*70041088SAndy Grover 
353*70041088SAndy Grover void rds_tcp_recv_exit(void)
354*70041088SAndy Grover {
355*70041088SAndy Grover 	kmem_cache_destroy(rds_tcp_incoming_slab);
356*70041088SAndy Grover }
357