/*
 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/tcp.h>
#include <trace/events/sock.h>

#include "rds.h"
#include "tcp.h"

static struct kmem_cache *rds_tcp_incoming_slab;

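/* Drop any skb fragments still queued on this incoming message. */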
static void rds_tcp_inc_purge(struct rds_incoming *inc)
{
	struct rds_tcp_incoming *tinc;
	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
	skb_queue_purge(&tinc->ti_skb_list);
}

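/* Free an incoming message: purge any skbs still queued on it and return
 * the rds_tcp_incoming container to the slab cache.
 */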
void rds_tcp_inc_free(struct rds_incoming *inc)
{
	struct rds_tcp_incoming *tinc;
	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
	rds_tcp_inc_purge(inc);
	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
	kmem_cache_free(rds_tcp_incoming_slab, tinc);
}

/*
 * Copy the skb fragments queued on this incoming message into the caller's
 * iov_iter, stopping when either the iterator or the message is exhausted.
 */
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
{
	struct rds_tcp_incoming *tinc;
	struct sk_buff *skb;
	int ret = 0;

	if (!iov_iter_count(to))
		goto out;

	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);

	skb_queue_walk(&tinc->ti_skb_list, skb) {
		unsigned long to_copy, skb_off;
		for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) {
			to_copy = iov_iter_count(to);
			to_copy = min(to_copy, skb->len - skb_off);

			if (skb_copy_datagram_iter(skb, skb_off, to, to_copy))
				return -EFAULT;

			rds_stats_add(s_copy_to_user, to_copy);
			ret += to_copy;

			if (!iov_iter_count(to))
				goto out;
		}
	}
out:
	return ret;
}

/*
 * We have a series of skbs that have fragmented pieces of the congestion
 * bitmap.  They must add up to the exact size of the congestion bitmap.  We
 * use the skb helpers to copy those into the pages that make up the in-memory
 * congestion bitmap for the remote address of this connection.  We then tell
 * the congestion core that the bitmap has been changed so that it can wake up
 * sleepers.
 *
 * This is racing with sending paths which are using test_bit to see if the
 * bitmap indicates that their recipient is congested.
 */

static void rds_tcp_cong_recv(struct rds_connection *conn,
			      struct rds_tcp_incoming *tinc)
{
	struct sk_buff *skb;
	unsigned int to_copy, skb_off;
	unsigned int map_off;
	unsigned int map_page;
	struct rds_cong_map *map;
	int ret;

	/* catch completely corrupt packets */
	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
		return;

	map_page = 0;
	map_off = 0;
	map = conn->c_fcong;

	skb_queue_walk(&tinc->ti_skb_list, skb) {
		skb_off = 0;
		while (skb_off < skb->len) {
			to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
					skb->len - skb_off);

			BUG_ON(map_page >= RDS_CONG_MAP_PAGES);

			/* only returns 0 or -error */
			ret = skb_copy_bits(skb, skb_off,
				(void *)map->m_page_addrs[map_page] + map_off,
				to_copy);
			BUG_ON(ret != 0);

			skb_off += to_copy;
			map_off += to_copy;
			if (map_off == PAGE_SIZE) {
				map_off = 0;
				map_page++;
			}
		}
	}

	rds_cong_map_updated(map, ~(u64) 0);
}

struct rds_tcp_desc_arg {
	struct rds_conn_path *conn_path;
	gfp_t gfp;
};

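/* tcp_read_sock() actor: reassemble RDS messages from the TCP byte stream.
 *
 * tc->t_tinc_hdr_rem and tc->t_tinc_data_rem track how much of the current
 * message's header and payload are still outstanding, so a single message
 * can be rebuilt across many skbs and many calls.  The return value is the
 * number of bytes consumed; returning less than len tells tcp_read_sock()
 * to stop feeding us data for now.
 */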
static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
			     unsigned int offset, size_t len)
{
	struct rds_tcp_desc_arg *arg = desc->arg.data;
	struct rds_conn_path *cp = arg->conn_path;
	struct rds_tcp_connection *tc = cp->cp_transport_data;
	struct rds_tcp_incoming *tinc = tc->t_tinc;
	struct sk_buff *clone;
	size_t left = len, to_copy;

	rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
		 len);

	/*
	 * tcp_read_sock() interprets partial progress as an indication to stop
	 * processing.
	 */
	while (left) {
		if (!tinc) {
			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
						arg->gfp);
			if (!tinc) {
				desc->error = -ENOMEM;
				goto out;
			}
			tc->t_tinc = tinc;
			rdsdebug("allocated tinc %p\n", tinc);
			rds_inc_path_init(&tinc->ti_inc, cp,
					  &cp->cp_conn->c_faddr);
			tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
					local_clock();

			/*
			 * XXX * we might be able to use the __ variants when
			 * we've already serialized at a higher level.
			 */
			skb_queue_head_init(&tinc->ti_skb_list);
		}

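		/* Copy whatever part of the RDS header is available in this
		 * skb; the header itself may arrive split across calls.
		 */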
		if (left && tc->t_tinc_hdr_rem) {
			to_copy = min(tc->t_tinc_hdr_rem, left);
			rdsdebug("copying %zu header from skb %p\n", to_copy,
				 skb);
			skb_copy_bits(skb, offset,
				      (char *)&tinc->ti_inc.i_hdr +
						sizeof(struct rds_header) -
						tc->t_tinc_hdr_rem,
				      to_copy);
			tc->t_tinc_hdr_rem -= to_copy;
			left -= to_copy;
			offset += to_copy;

			if (tc->t_tinc_hdr_rem == 0) {
				/* could be 0 for a 0 len message */
				tc->t_tinc_data_rem =
					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
				tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
					local_clock();
			}
		}

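		/* Extract the payload portion of this skb into a clone and
		 * queue it on the incoming message; userspace copies it out
		 * later via rds_tcp_inc_copy_to_user().
		 */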
		if (left && tc->t_tinc_data_rem) {
			to_copy = min(tc->t_tinc_data_rem, left);

			clone = pskb_extract(skb, offset, to_copy, arg->gfp);
			if (!clone) {
				desc->error = -ENOMEM;
				goto out;
			}

			skb_queue_tail(&tinc->ti_skb_list, clone);

			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
				 "clone %p data %p len %d\n",
				 skb, skb->data, skb->len, offset, to_copy,
				 clone, clone->data, clone->len);

			tc->t_tinc_data_rem -= to_copy;
			left -= to_copy;
			offset += to_copy;
		}

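		/* The header and all of the payload have arrived: hand the
		 * completed message to the congestion code or the RDS recv
		 * path and reset the reassembly state for the next message.
		 */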
		if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
			struct rds_connection *conn = cp->cp_conn;

			if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
				rds_tcp_cong_recv(conn, tinc);
			else
				rds_recv_incoming(conn, &conn->c_faddr,
						  &conn->c_laddr,
						  &tinc->ti_inc,
						  arg->gfp);

			tc->t_tinc_hdr_rem = sizeof(struct rds_header);
			tc->t_tinc_data_rem = 0;
			tc->t_tinc = NULL;
			rds_inc_put(&tinc->ti_inc);
			tinc = NULL;
		}
	}
out:
	rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
		 len, left, skb->len,
		 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
	return len - left;
}

/* the caller has to hold the sock lock */
static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp)
{
	struct rds_tcp_connection *tc = cp->cp_transport_data;
	struct socket *sock = tc->t_sock;
	read_descriptor_t desc;
	struct rds_tcp_desc_arg arg;

	/* pass the connection path and gfp mask to the actor via the
	 * read descriptor's argument
	 */
	arg.conn_path = cp;
	arg.gfp = gfp;
	desc.arg.data = &arg;
	desc.error = 0;
	desc.count = 1; /* give more than one skb per call */

	tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
		 desc.error);

	return desc.error;
}

/*
 * We hold the sock lock to serialize our rds_tcp_recv_path->tcp_read_sock
 * from data_ready.
 *
 * If we fail to allocate we're in trouble: blindly wait some time before
 * trying again to see if the VM can free up something for us.
 */
int rds_tcp_recv_path(struct rds_conn_path *cp)
{
	struct rds_tcp_connection *tc = cp->cp_transport_data;
	struct socket *sock = tc->t_sock;
	int ret = 0;

	rdsdebug("recv worker path [%d] tc %p sock %p\n",
		 cp->cp_index, tc, sock);

	lock_sock(sock->sk);
	ret = rds_tcp_read_sock(cp, GFP_KERNEL);
	release_sock(sock->sk);

	return ret;
}

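/* sk->sk_data_ready callback for RDS TCP sockets.  This may run in softirq
 * context, so the socket is drained with GFP_ATOMIC; if that fails with
 * -ENOMEM, the read is deferred to the recv worker, which runs in process
 * context and can sleep, unless the connection is being torn down.
 */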
void rds_tcp_data_ready(struct sock *sk)
{
	void (*ready)(struct sock *sk);
	struct rds_conn_path *cp;
	struct rds_tcp_connection *tc;

	trace_sk_data_ready(sk);
	rdsdebug("data ready sk %p\n", sk);

	read_lock_bh(&sk->sk_callback_lock);
	cp = sk->sk_user_data;
	if (!cp) { /* check for teardown race */
		ready = sk->sk_data_ready;
		goto out;
	}

	tc = cp->cp_transport_data;
	ready = tc->t_orig_data_ready;
	rds_tcp_stats_inc(s_tcp_data_ready_calls);

	if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
		rcu_read_lock();
		if (!rds_destroy_pending(cp->cp_conn))
			queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
		rcu_read_unlock();
	}
out:
	read_unlock_bh(&sk->sk_callback_lock);
	ready(sk);
}

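/* Create the slab cache used for struct rds_tcp_incoming allocations. */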
int rds_tcp_recv_init(void)
{
	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
					sizeof(struct rds_tcp_incoming),
					0, 0, NULL);
	if (!rds_tcp_incoming_slab)
		return -ENOMEM;
	return 0;
}

void rds_tcp_recv_exit(void)
{
	kmem_cache_destroy(rds_tcp_incoming_slab);
}