/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/tcp.h>

#include "rds_single_path.h"
#include "rds.h"
#include "tcp.h"

static struct kmem_cache *rds_tcp_incoming_slab;

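/*
 * A struct rds_tcp_incoming wraps the generic struct rds_incoming and carries
 * the list of cloned skbs (ti_skb_list) that hold the message payload.  It is
 * allocated from rds_tcp_incoming_slab in rds_tcp_data_recv() below and comes
 * back to rds_tcp_inc_free(), presumably via the transport's inc_free hook,
 * once rds_inc_put() drops the last reference.
 */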
static void rds_tcp_inc_purge(struct rds_incoming *inc)
{
	struct rds_tcp_incoming *tinc;
	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
	skb_queue_purge(&tinc->ti_skb_list);
}

void rds_tcp_inc_free(struct rds_incoming *inc)
{
	struct rds_tcp_incoming *tinc;
	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
	rds_tcp_inc_purge(inc);
	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
	kmem_cache_free(rds_tcp_incoming_slab, tinc);
}

/*
 * Copy the queued skb fragments of this incoming message into the caller's
 * iov_iter.  This is pretty lame, but, whatever.
 */
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
{
	struct rds_tcp_incoming *tinc;
	struct sk_buff *skb;
	int ret = 0;

	if (!iov_iter_count(to))
		goto out;

	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);

	skb_queue_walk(&tinc->ti_skb_list, skb) {
		unsigned long to_copy, skb_off;
		for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) {
			to_copy = iov_iter_count(to);
			to_copy = min(to_copy, skb->len - skb_off);

			if (skb_copy_datagram_iter(skb, skb_off, to, to_copy))
				return -EFAULT;

			rds_stats_add(s_copy_to_user, to_copy);
			ret += to_copy;

			if (!iov_iter_count(to))
				goto out;
		}
	}
out:
	return ret;
}

/*
 * We have a series of skbs that have fragmented pieces of the congestion
 * bitmap.  They must add up to the exact size of the congestion bitmap.  We
 * use the skb helpers to copy those into the pages that make up the in-memory
 * congestion bitmap for the remote address of this connection.  We then tell
 * the congestion core that the bitmap has been changed so that it can wake up
 * sleepers.
 *
 * This is racing with sending paths which are using test_bit to see if the
 * bitmap indicates that their recipient is congested.
 */

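/*
 * The in-memory map is RDS_CONG_MAP_BYTES long and is stored as an array of
 * RDS_CONG_MAP_PAGES pages (map->m_page_addrs[]); map_page and map_off below
 * walk that array a page at a time as the skb fragments are copied in.
 */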
static void rds_tcp_cong_recv(struct rds_connection *conn,
			      struct rds_tcp_incoming *tinc)
{
	struct sk_buff *skb;
	unsigned int to_copy, skb_off;
	unsigned int map_off;
	unsigned int map_page;
	struct rds_cong_map *map;
	int ret;

	/* catch completely corrupt packets */
	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
		return;

	map_page = 0;
	map_off = 0;
	map = conn->c_fcong;

	skb_queue_walk(&tinc->ti_skb_list, skb) {
		skb_off = 0;
		while (skb_off < skb->len) {
			to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
					skb->len - skb_off);

			BUG_ON(map_page >= RDS_CONG_MAP_PAGES);

			/* only returns 0 or -error */
			ret = skb_copy_bits(skb, skb_off,
				(void *)map->m_page_addrs[map_page] + map_off,
				to_copy);
			BUG_ON(ret != 0);

			skb_off += to_copy;
			map_off += to_copy;
			if (map_off == PAGE_SIZE) {
				map_off = 0;
				map_page++;
			}
		}
	}

	rds_cong_map_updated(map, ~(u64) 0);
}

struct rds_tcp_desc_arg {
	struct rds_connection *conn;
	gfp_t gfp;
};

static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
			     unsigned int offset, size_t len)
{
	struct rds_tcp_desc_arg *arg = desc->arg.data;
	struct rds_connection *conn = arg->conn;
	struct rds_tcp_connection *tc = conn->c_transport_data;
	struct rds_tcp_incoming *tinc = tc->t_tinc;
	struct sk_buff *clone;
	size_t left = len, to_copy;

	rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
		 len);

	/*
	 * tcp_read_sock() interprets partial progress as an indication to stop
	 * processing.
	 */
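	/*
	 * Each message on the stream is a struct rds_header followed by
	 * h_len bytes of payload.  t_tinc_hdr_rem and t_tinc_data_rem track
	 * how much of the current header and payload are still outstanding,
	 * so one message may be reassembled from many skbs and a single skb
	 * may carry pieces of several messages.
	 */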
	while (left) {
		if (!tinc) {
			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
						arg->gfp);
			if (!tinc) {
				desc->error = -ENOMEM;
				goto out;
			}
			tc->t_tinc = tinc;
			rdsdebug("alloced tinc %p\n", tinc);
			rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
			/*
			 * XXX * we might be able to use the __ variants when
			 * we've already serialized at a higher level.
			 */
			skb_queue_head_init(&tinc->ti_skb_list);
		}

		if (left && tc->t_tinc_hdr_rem) {
			to_copy = min(tc->t_tinc_hdr_rem, left);
			rdsdebug("copying %zu header from skb %p\n", to_copy,
				 skb);
			skb_copy_bits(skb, offset,
				      (char *)&tinc->ti_inc.i_hdr +
						sizeof(struct rds_header) -
						tc->t_tinc_hdr_rem,
				      to_copy);
			tc->t_tinc_hdr_rem -= to_copy;
			left -= to_copy;
			offset += to_copy;

			if (tc->t_tinc_hdr_rem == 0) {
				/* could be 0 for a 0 len message */
				tc->t_tinc_data_rem =
					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
			}
		}

		if (left && tc->t_tinc_data_rem) {
			to_copy = min(tc->t_tinc_data_rem, left);

			clone = pskb_extract(skb, offset, to_copy, arg->gfp);
			if (!clone) {
				desc->error = -ENOMEM;
				goto out;
			}

			skb_queue_tail(&tinc->ti_skb_list, clone);

			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
				 "clone %p data %p len %d\n",
				 skb, skb->data, skb->len, offset, to_copy,
				 clone, clone->data, clone->len);

			tc->t_tinc_data_rem -= to_copy;
			left -= to_copy;
			offset += to_copy;
		}

		if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
			if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
				rds_tcp_cong_recv(conn, tinc);
			else
				rds_recv_incoming(conn, conn->c_faddr,
						  conn->c_laddr, &tinc->ti_inc,
						  arg->gfp);

			tc->t_tinc_hdr_rem = sizeof(struct rds_header);
			tc->t_tinc_data_rem = 0;
			tc->t_tinc = NULL;
			rds_inc_put(&tinc->ti_inc);
			tinc = NULL;
		}
	}
out:
	rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
		 len, left, skb->len,
		 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
	return len - left;
}

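/*
 * Feed everything queued on the socket to rds_tcp_data_recv().  Any error the
 * actor records in desc.error (-ENOMEM above) is returned to the caller, and
 * a short byte count from the actor makes tcp_read_sock() stop early, per the
 * comment in rds_tcp_data_recv().
 */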
/* the caller has to hold the sock lock */
static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
{
	struct rds_tcp_connection *tc = conn->c_transport_data;
	struct socket *sock = tc->t_sock;
	read_descriptor_t desc;
	struct rds_tcp_desc_arg arg;

	/* It's like glib in the kernel! */
	arg.conn = conn;
	arg.gfp = gfp;
	desc.arg.data = &arg;
	desc.error = 0;
	desc.count = 1; /* give more than one skb per call */

	tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
		 desc.error);

	return desc.error;
}

/*
 * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
 * data_ready.
 *
 * if we fail to allocate we're in trouble.. blindly wait some time before
 * trying again to see if the VM can free up something for us.
 */
int rds_tcp_recv(struct rds_connection *conn)
{
	struct rds_tcp_connection *tc = conn->c_transport_data;
	struct socket *sock = tc->t_sock;
	int ret = 0;

	rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);

	lock_sock(sock->sk);
	ret = rds_tcp_read_sock(conn, GFP_KERNEL);
	release_sock(sock->sk);

	return ret;
}

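/*
 * Stands in for the socket's original sk_data_ready callback, which is saved
 * in t_orig_data_ready and chained to at the end.  This typically runs from
 * bottom-half context, hence read_lock_bh() and GFP_ATOMIC; if the atomic
 * allocation fails, the cp_recv_w worker is queued to retry, presumably
 * ending up back in rds_tcp_read_sock() with GFP_KERNEL via rds_tcp_recv().
 */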
void rds_tcp_data_ready(struct sock *sk)
{
	void (*ready)(struct sock *sk);
	struct rds_conn_path *cp;
	struct rds_tcp_connection *tc;

	rdsdebug("data ready sk %p\n", sk);

	read_lock_bh(&sk->sk_callback_lock);
	cp = sk->sk_user_data;
	if (!cp) { /* check for teardown race */
		ready = sk->sk_data_ready;
		goto out;
	}

	tc = cp->cp_transport_data;
	ready = tc->t_orig_data_ready;
	rds_tcp_stats_inc(s_tcp_data_ready_calls);

	if (rds_tcp_read_sock(cp->cp_conn, GFP_ATOMIC) == -ENOMEM)
		queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
out:
	read_unlock_bh(&sk->sk_callback_lock);
	ready(sk);
}

int rds_tcp_recv_init(void)
{
	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
					sizeof(struct rds_tcp_incoming),
					0, 0, NULL);
	if (!rds_tcp_incoming_slab)
		return -ENOMEM;
	return 0;
}

void rds_tcp_recv_exit(void)
{
	kmem_cache_destroy(rds_tcp_incoming_slab);
}