170041088SAndy Grover /* 270041088SAndy Grover * Copyright (c) 2006 Oracle. All rights reserved. 370041088SAndy Grover * 470041088SAndy Grover * This software is available to you under a choice of one of two 570041088SAndy Grover * licenses. You may choose to be licensed under the terms of the GNU 670041088SAndy Grover * General Public License (GPL) Version 2, available from the file 770041088SAndy Grover * COPYING in the main directory of this source tree, or the 870041088SAndy Grover * OpenIB.org BSD license below: 970041088SAndy Grover * 1070041088SAndy Grover * Redistribution and use in source and binary forms, with or 1170041088SAndy Grover * without modification, are permitted provided that the following 1270041088SAndy Grover * conditions are met: 1370041088SAndy Grover * 1470041088SAndy Grover * - Redistributions of source code must retain the above 1570041088SAndy Grover * copyright notice, this list of conditions and the following 1670041088SAndy Grover * disclaimer. 1770041088SAndy Grover * 1870041088SAndy Grover * - Redistributions in binary form must reproduce the above 1970041088SAndy Grover * copyright notice, this list of conditions and the following 2070041088SAndy Grover * disclaimer in the documentation and/or other materials 2170041088SAndy Grover * provided with the distribution. 2270041088SAndy Grover * 2370041088SAndy Grover * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 2470041088SAndy Grover * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2570041088SAndy Grover * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 2670041088SAndy Grover * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 2770041088SAndy Grover * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 2870041088SAndy Grover * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 2970041088SAndy Grover * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 3070041088SAndy Grover * SOFTWARE. 3170041088SAndy Grover * 3270041088SAndy Grover */ 3370041088SAndy Grover #include <linux/kernel.h> 345a0e3ad6STejun Heo #include <linux/slab.h> 3570041088SAndy Grover #include <net/tcp.h> 3670041088SAndy Grover 3770041088SAndy Grover #include "rds.h" 3870041088SAndy Grover #include "tcp.h" 3970041088SAndy Grover 4070041088SAndy Grover static struct kmem_cache *rds_tcp_incoming_slab; 4170041088SAndy Grover 42809fa148SAndy Grover static void rds_tcp_inc_purge(struct rds_incoming *inc) 4370041088SAndy Grover { 4470041088SAndy Grover struct rds_tcp_incoming *tinc; 4570041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 4670041088SAndy Grover rdsdebug("purging tinc %p inc %p\n", tinc, inc); 4770041088SAndy Grover skb_queue_purge(&tinc->ti_skb_list); 4870041088SAndy Grover } 4970041088SAndy Grover 5070041088SAndy Grover void rds_tcp_inc_free(struct rds_incoming *inc) 5170041088SAndy Grover { 5270041088SAndy Grover struct rds_tcp_incoming *tinc; 5370041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 5470041088SAndy Grover rds_tcp_inc_purge(inc); 5570041088SAndy Grover rdsdebug("freeing tinc %p inc %p\n", tinc, inc); 5670041088SAndy Grover kmem_cache_free(rds_tcp_incoming_slab, tinc); 5770041088SAndy Grover } 5870041088SAndy Grover 5970041088SAndy Grover /* 6070041088SAndy Grover * this is pretty lame, but, whatever. 6170041088SAndy Grover */ 62c310e72cSAl Viro int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) 6370041088SAndy Grover { 6470041088SAndy Grover struct rds_tcp_incoming *tinc; 6570041088SAndy Grover struct sk_buff *skb; 6670041088SAndy Grover int ret = 0; 6770041088SAndy Grover 68c310e72cSAl Viro if (!iov_iter_count(to)) 6970041088SAndy Grover goto out; 7070041088SAndy Grover 7170041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 7270041088SAndy Grover 7370041088SAndy Grover skb_queue_walk(&tinc->ti_skb_list, skb) { 74c310e72cSAl Viro unsigned long to_copy, skb_off; 75c310e72cSAl Viro for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) { 76c310e72cSAl Viro to_copy = iov_iter_count(to); 7770041088SAndy Grover to_copy = min(to_copy, skb->len - skb_off); 7870041088SAndy Grover 79c310e72cSAl Viro if (skb_copy_datagram_iter(skb, skb_off, to, to_copy)) 80c310e72cSAl Viro return -EFAULT; 8170041088SAndy Grover 82b075cfdbSAndy Grover rds_stats_add(s_copy_to_user, to_copy); 8370041088SAndy Grover ret += to_copy; 84c310e72cSAl Viro 85c310e72cSAl Viro if (!iov_iter_count(to)) 8670041088SAndy Grover goto out; 8770041088SAndy Grover } 8870041088SAndy Grover } 8970041088SAndy Grover out: 9070041088SAndy Grover return ret; 9170041088SAndy Grover } 9270041088SAndy Grover 9370041088SAndy Grover /* 9470041088SAndy Grover * We have a series of skbs that have fragmented pieces of the congestion 9570041088SAndy Grover * bitmap. They must add up to the exact size of the congestion bitmap. We 9670041088SAndy Grover * use the skb helpers to copy those into the pages that make up the in-memory 9770041088SAndy Grover * congestion bitmap for the remote address of this connection. We then tell 9870041088SAndy Grover * the congestion core that the bitmap has been changed so that it can wake up 9970041088SAndy Grover * sleepers. 10070041088SAndy Grover * 10170041088SAndy Grover * This is racing with sending paths which are using test_bit to see if the 10270041088SAndy Grover * bitmap indicates that their recipient is congested. 10370041088SAndy Grover */ 10470041088SAndy Grover 10570041088SAndy Grover static void rds_tcp_cong_recv(struct rds_connection *conn, 10670041088SAndy Grover struct rds_tcp_incoming *tinc) 10770041088SAndy Grover { 10870041088SAndy Grover struct sk_buff *skb; 10970041088SAndy Grover unsigned int to_copy, skb_off; 11070041088SAndy Grover unsigned int map_off; 11170041088SAndy Grover unsigned int map_page; 11270041088SAndy Grover struct rds_cong_map *map; 11370041088SAndy Grover int ret; 11470041088SAndy Grover 11570041088SAndy Grover /* catch completely corrupt packets */ 11670041088SAndy Grover if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) 11770041088SAndy Grover return; 11870041088SAndy Grover 11970041088SAndy Grover map_page = 0; 12070041088SAndy Grover map_off = 0; 12170041088SAndy Grover map = conn->c_fcong; 12270041088SAndy Grover 12370041088SAndy Grover skb_queue_walk(&tinc->ti_skb_list, skb) { 12470041088SAndy Grover skb_off = 0; 12570041088SAndy Grover while (skb_off < skb->len) { 12670041088SAndy Grover to_copy = min_t(unsigned int, PAGE_SIZE - map_off, 12770041088SAndy Grover skb->len - skb_off); 12870041088SAndy Grover 12970041088SAndy Grover BUG_ON(map_page >= RDS_CONG_MAP_PAGES); 13070041088SAndy Grover 13170041088SAndy Grover /* only returns 0 or -error */ 13270041088SAndy Grover ret = skb_copy_bits(skb, skb_off, 13370041088SAndy Grover (void *)map->m_page_addrs[map_page] + map_off, 13470041088SAndy Grover to_copy); 13570041088SAndy Grover BUG_ON(ret != 0); 13670041088SAndy Grover 13770041088SAndy Grover skb_off += to_copy; 13870041088SAndy Grover map_off += to_copy; 13970041088SAndy Grover if (map_off == PAGE_SIZE) { 14070041088SAndy Grover map_off = 0; 14170041088SAndy Grover map_page++; 14270041088SAndy Grover } 14370041088SAndy Grover } 14470041088SAndy Grover } 14570041088SAndy Grover 14670041088SAndy Grover rds_cong_map_updated(map, ~(u64) 0); 14770041088SAndy Grover } 14870041088SAndy Grover 14970041088SAndy Grover struct rds_tcp_desc_arg { 1502da43c4aSSowmini Varadhan struct rds_conn_path *conn_path; 15170041088SAndy Grover gfp_t gfp; 15270041088SAndy Grover }; 15370041088SAndy Grover 15470041088SAndy Grover static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, 15570041088SAndy Grover unsigned int offset, size_t len) 15670041088SAndy Grover { 15770041088SAndy Grover struct rds_tcp_desc_arg *arg = desc->arg.data; 1582da43c4aSSowmini Varadhan struct rds_conn_path *cp = arg->conn_path; 1592da43c4aSSowmini Varadhan struct rds_tcp_connection *tc = cp->cp_transport_data; 16070041088SAndy Grover struct rds_tcp_incoming *tinc = tc->t_tinc; 16170041088SAndy Grover struct sk_buff *clone; 16270041088SAndy Grover size_t left = len, to_copy; 16370041088SAndy Grover 16470041088SAndy Grover rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, 16570041088SAndy Grover len); 16670041088SAndy Grover 16770041088SAndy Grover /* 16870041088SAndy Grover * tcp_read_sock() interprets partial progress as an indication to stop 16970041088SAndy Grover * processing. 17070041088SAndy Grover */ 17170041088SAndy Grover while (left) { 1728690bfa1SAndy Grover if (!tinc) { 17370041088SAndy Grover tinc = kmem_cache_alloc(rds_tcp_incoming_slab, 17470041088SAndy Grover arg->gfp); 1758690bfa1SAndy Grover if (!tinc) { 17670041088SAndy Grover desc->error = -ENOMEM; 17770041088SAndy Grover goto out; 17870041088SAndy Grover } 17970041088SAndy Grover tc->t_tinc = tinc; 18070041088SAndy Grover rdsdebug("alloced tinc %p\n", tinc); 1812da43c4aSSowmini Varadhan rds_inc_path_init(&tinc->ti_inc, cp, 1822da43c4aSSowmini Varadhan cp->cp_conn->c_faddr); 1833289025aSSantosh Shilimkar tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = 1843289025aSSantosh Shilimkar local_clock(); 1853289025aSSantosh Shilimkar 18670041088SAndy Grover /* 18770041088SAndy Grover * XXX * we might be able to use the __ variants when 18870041088SAndy Grover * we've already serialized at a higher level. 18970041088SAndy Grover */ 19070041088SAndy Grover skb_queue_head_init(&tinc->ti_skb_list); 19170041088SAndy Grover } 19270041088SAndy Grover 19370041088SAndy Grover if (left && tc->t_tinc_hdr_rem) { 19470041088SAndy Grover to_copy = min(tc->t_tinc_hdr_rem, left); 19570041088SAndy Grover rdsdebug("copying %zu header from skb %p\n", to_copy, 19670041088SAndy Grover skb); 19770041088SAndy Grover skb_copy_bits(skb, offset, 19870041088SAndy Grover (char *)&tinc->ti_inc.i_hdr + 19970041088SAndy Grover sizeof(struct rds_header) - 20070041088SAndy Grover tc->t_tinc_hdr_rem, 20170041088SAndy Grover to_copy); 20270041088SAndy Grover tc->t_tinc_hdr_rem -= to_copy; 20370041088SAndy Grover left -= to_copy; 20470041088SAndy Grover offset += to_copy; 20570041088SAndy Grover 20670041088SAndy Grover if (tc->t_tinc_hdr_rem == 0) { 20770041088SAndy Grover /* could be 0 for a 0 len message */ 20870041088SAndy Grover tc->t_tinc_data_rem = 20970041088SAndy Grover be32_to_cpu(tinc->ti_inc.i_hdr.h_len); 2103289025aSSantosh Shilimkar tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] = 2113289025aSSantosh Shilimkar local_clock(); 21270041088SAndy Grover } 21370041088SAndy Grover } 21470041088SAndy Grover 21570041088SAndy Grover if (left && tc->t_tinc_data_rem) { 216947d2756SSowmini Varadhan to_copy = min(tc->t_tinc_data_rem, left); 217947d2756SSowmini Varadhan 218947d2756SSowmini Varadhan clone = pskb_extract(skb, offset, to_copy, arg->gfp); 2198690bfa1SAndy Grover if (!clone) { 22070041088SAndy Grover desc->error = -ENOMEM; 22170041088SAndy Grover goto out; 22270041088SAndy Grover } 22370041088SAndy Grover 22470041088SAndy Grover skb_queue_tail(&tinc->ti_skb_list, clone); 22570041088SAndy Grover 22670041088SAndy Grover rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " 22770041088SAndy Grover "clone %p data %p len %d\n", 22870041088SAndy Grover skb, skb->data, skb->len, offset, to_copy, 22970041088SAndy Grover clone, clone->data, clone->len); 23070041088SAndy Grover 23170041088SAndy Grover tc->t_tinc_data_rem -= to_copy; 23270041088SAndy Grover left -= to_copy; 23370041088SAndy Grover offset += to_copy; 23470041088SAndy Grover } 23570041088SAndy Grover 23670041088SAndy Grover if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { 2372da43c4aSSowmini Varadhan struct rds_connection *conn = cp->cp_conn; 2382da43c4aSSowmini Varadhan 23970041088SAndy Grover if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 24070041088SAndy Grover rds_tcp_cong_recv(conn, tinc); 24170041088SAndy Grover else 24270041088SAndy Grover rds_recv_incoming(conn, conn->c_faddr, 24370041088SAndy Grover conn->c_laddr, &tinc->ti_inc, 2446114eab5SCong Wang arg->gfp); 24570041088SAndy Grover 24670041088SAndy Grover tc->t_tinc_hdr_rem = sizeof(struct rds_header); 24770041088SAndy Grover tc->t_tinc_data_rem = 0; 24870041088SAndy Grover tc->t_tinc = NULL; 24970041088SAndy Grover rds_inc_put(&tinc->ti_inc); 25070041088SAndy Grover tinc = NULL; 25170041088SAndy Grover } 25270041088SAndy Grover } 25370041088SAndy Grover out: 25470041088SAndy Grover rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", 25570041088SAndy Grover len, left, skb->len, 25670041088SAndy Grover skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); 25770041088SAndy Grover return len - left; 25870041088SAndy Grover } 25970041088SAndy Grover 26070041088SAndy Grover /* the caller has to hold the sock lock */ 2612da43c4aSSowmini Varadhan static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp) 26270041088SAndy Grover { 2632da43c4aSSowmini Varadhan struct rds_tcp_connection *tc = cp->cp_transport_data; 26470041088SAndy Grover struct socket *sock = tc->t_sock; 26570041088SAndy Grover read_descriptor_t desc; 26670041088SAndy Grover struct rds_tcp_desc_arg arg; 26770041088SAndy Grover 26870041088SAndy Grover /* It's like glib in the kernel! */ 2692da43c4aSSowmini Varadhan arg.conn_path = cp; 27070041088SAndy Grover arg.gfp = gfp; 27170041088SAndy Grover desc.arg.data = &arg; 27270041088SAndy Grover desc.error = 0; 27370041088SAndy Grover desc.count = 1; /* give more than one skb per call */ 27470041088SAndy Grover 27570041088SAndy Grover tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); 27670041088SAndy Grover rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, 27770041088SAndy Grover desc.error); 27870041088SAndy Grover 27970041088SAndy Grover return desc.error; 28070041088SAndy Grover } 28170041088SAndy Grover 28270041088SAndy Grover /* 28370041088SAndy Grover * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from 28470041088SAndy Grover * data_ready. 28570041088SAndy Grover * 28670041088SAndy Grover * if we fail to allocate we're in trouble.. blindly wait some time before 28770041088SAndy Grover * trying again to see if the VM can free up something for us. 28870041088SAndy Grover */ 2892da43c4aSSowmini Varadhan int rds_tcp_recv_path(struct rds_conn_path *cp) 29070041088SAndy Grover { 2912da43c4aSSowmini Varadhan struct rds_tcp_connection *tc = cp->cp_transport_data; 29270041088SAndy Grover struct socket *sock = tc->t_sock; 29370041088SAndy Grover int ret = 0; 29470041088SAndy Grover 2952da43c4aSSowmini Varadhan rdsdebug("recv worker path [%d] tc %p sock %p\n", 2962da43c4aSSowmini Varadhan cp->cp_index, tc, sock); 29770041088SAndy Grover 29870041088SAndy Grover lock_sock(sock->sk); 2992da43c4aSSowmini Varadhan ret = rds_tcp_read_sock(cp, GFP_KERNEL); 30070041088SAndy Grover release_sock(sock->sk); 30170041088SAndy Grover 30270041088SAndy Grover return ret; 30370041088SAndy Grover } 30470041088SAndy Grover 305676d2369SDavid S. Miller void rds_tcp_data_ready(struct sock *sk) 30670041088SAndy Grover { 307676d2369SDavid S. Miller void (*ready)(struct sock *sk); 308ea3b1ea5SSowmini Varadhan struct rds_conn_path *cp; 30970041088SAndy Grover struct rds_tcp_connection *tc; 31070041088SAndy Grover 311676d2369SDavid S. Miller rdsdebug("data ready sk %p\n", sk); 31270041088SAndy Grover 31338036629SEric Dumazet read_lock_bh(&sk->sk_callback_lock); 314ea3b1ea5SSowmini Varadhan cp = sk->sk_user_data; 315ea3b1ea5SSowmini Varadhan if (!cp) { /* check for teardown race */ 31670041088SAndy Grover ready = sk->sk_data_ready; 31770041088SAndy Grover goto out; 31870041088SAndy Grover } 31970041088SAndy Grover 320ea3b1ea5SSowmini Varadhan tc = cp->cp_transport_data; 32170041088SAndy Grover ready = tc->t_orig_data_ready; 32270041088SAndy Grover rds_tcp_stats_inc(s_tcp_data_ready_calls); 32370041088SAndy Grover 3243db6e0d1SSowmini Varadhan if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) { 3253db6e0d1SSowmini Varadhan rcu_read_lock(); 326*ebeeb1adSSowmini Varadhan if (!rds_destroy_pending(cp->cp_conn)) 327ea3b1ea5SSowmini Varadhan queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); 3283db6e0d1SSowmini Varadhan rcu_read_unlock(); 3293db6e0d1SSowmini Varadhan } 33070041088SAndy Grover out: 33138036629SEric Dumazet read_unlock_bh(&sk->sk_callback_lock); 332676d2369SDavid S. Miller ready(sk); 33370041088SAndy Grover } 33470041088SAndy Grover 335ef87b7eaSZach Brown int rds_tcp_recv_init(void) 33670041088SAndy Grover { 33770041088SAndy Grover rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", 33870041088SAndy Grover sizeof(struct rds_tcp_incoming), 33970041088SAndy Grover 0, 0, NULL); 3408690bfa1SAndy Grover if (!rds_tcp_incoming_slab) 34170041088SAndy Grover return -ENOMEM; 34270041088SAndy Grover return 0; 34370041088SAndy Grover } 34470041088SAndy Grover 34570041088SAndy Grover void rds_tcp_recv_exit(void) 34670041088SAndy Grover { 34770041088SAndy Grover kmem_cache_destroy(rds_tcp_incoming_slab); 34870041088SAndy Grover } 349