170041088SAndy Grover /* 270041088SAndy Grover * Copyright (c) 2006 Oracle. All rights reserved. 370041088SAndy Grover * 470041088SAndy Grover * This software is available to you under a choice of one of two 570041088SAndy Grover * licenses. You may choose to be licensed under the terms of the GNU 670041088SAndy Grover * General Public License (GPL) Version 2, available from the file 770041088SAndy Grover * COPYING in the main directory of this source tree, or the 870041088SAndy Grover * OpenIB.org BSD license below: 970041088SAndy Grover * 1070041088SAndy Grover * Redistribution and use in source and binary forms, with or 1170041088SAndy Grover * without modification, are permitted provided that the following 1270041088SAndy Grover * conditions are met: 1370041088SAndy Grover * 1470041088SAndy Grover * - Redistributions of source code must retain the above 1570041088SAndy Grover * copyright notice, this list of conditions and the following 1670041088SAndy Grover * disclaimer. 1770041088SAndy Grover * 1870041088SAndy Grover * - Redistributions in binary form must reproduce the above 1970041088SAndy Grover * copyright notice, this list of conditions and the following 2070041088SAndy Grover * disclaimer in the documentation and/or other materials 2170041088SAndy Grover * provided with the distribution. 2270041088SAndy Grover * 2370041088SAndy Grover * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 2470041088SAndy Grover * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2570041088SAndy Grover * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 2670041088SAndy Grover * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 2770041088SAndy Grover * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 2870041088SAndy Grover * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 2970041088SAndy Grover * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 3070041088SAndy Grover * SOFTWARE. 3170041088SAndy Grover * 3270041088SAndy Grover */ 3370041088SAndy Grover #include <linux/kernel.h> 345a0e3ad6STejun Heo #include <linux/slab.h> 3570041088SAndy Grover #include <net/tcp.h> 3670041088SAndy Grover 3770041088SAndy Grover #include "rds.h" 3870041088SAndy Grover #include "tcp.h" 3970041088SAndy Grover 4070041088SAndy Grover static struct kmem_cache *rds_tcp_incoming_slab; 4170041088SAndy Grover 42809fa148SAndy Grover static void rds_tcp_inc_purge(struct rds_incoming *inc) 4370041088SAndy Grover { 4470041088SAndy Grover struct rds_tcp_incoming *tinc; 4570041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 4670041088SAndy Grover rdsdebug("purging tinc %p inc %p\n", tinc, inc); 4770041088SAndy Grover skb_queue_purge(&tinc->ti_skb_list); 4870041088SAndy Grover } 4970041088SAndy Grover 5070041088SAndy Grover void rds_tcp_inc_free(struct rds_incoming *inc) 5170041088SAndy Grover { 5270041088SAndy Grover struct rds_tcp_incoming *tinc; 5370041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 5470041088SAndy Grover rds_tcp_inc_purge(inc); 5570041088SAndy Grover rdsdebug("freeing tinc %p inc %p\n", tinc, inc); 5670041088SAndy Grover kmem_cache_free(rds_tcp_incoming_slab, tinc); 5770041088SAndy Grover } 5870041088SAndy Grover 5970041088SAndy Grover /* 6070041088SAndy Grover * this is pretty lame, but, whatever. 6170041088SAndy Grover */ 62c310e72cSAl Viro int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) 6370041088SAndy Grover { 6470041088SAndy Grover struct rds_tcp_incoming *tinc; 6570041088SAndy Grover struct sk_buff *skb; 6670041088SAndy Grover int ret = 0; 6770041088SAndy Grover 68c310e72cSAl Viro if (!iov_iter_count(to)) 6970041088SAndy Grover goto out; 7070041088SAndy Grover 7170041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 7270041088SAndy Grover 7370041088SAndy Grover skb_queue_walk(&tinc->ti_skb_list, skb) { 74c310e72cSAl Viro unsigned long to_copy, skb_off; 75c310e72cSAl Viro for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) { 76c310e72cSAl Viro to_copy = iov_iter_count(to); 7770041088SAndy Grover to_copy = min(to_copy, skb->len - skb_off); 7870041088SAndy Grover 79c310e72cSAl Viro if (skb_copy_datagram_iter(skb, skb_off, to, to_copy)) 80c310e72cSAl Viro return -EFAULT; 8170041088SAndy Grover 82b075cfdbSAndy Grover rds_stats_add(s_copy_to_user, to_copy); 8370041088SAndy Grover ret += to_copy; 84c310e72cSAl Viro 85c310e72cSAl Viro if (!iov_iter_count(to)) 8670041088SAndy Grover goto out; 8770041088SAndy Grover } 8870041088SAndy Grover } 8970041088SAndy Grover out: 9070041088SAndy Grover return ret; 9170041088SAndy Grover } 9270041088SAndy Grover 9370041088SAndy Grover /* 9470041088SAndy Grover * We have a series of skbs that have fragmented pieces of the congestion 9570041088SAndy Grover * bitmap. They must add up to the exact size of the congestion bitmap. We 9670041088SAndy Grover * use the skb helpers to copy those into the pages that make up the in-memory 9770041088SAndy Grover * congestion bitmap for the remote address of this connection. We then tell 9870041088SAndy Grover * the congestion core that the bitmap has been changed so that it can wake up 9970041088SAndy Grover * sleepers. 10070041088SAndy Grover * 10170041088SAndy Grover * This is racing with sending paths which are using test_bit to see if the 10270041088SAndy Grover * bitmap indicates that their recipient is congested. 10370041088SAndy Grover */ 10470041088SAndy Grover 10570041088SAndy Grover static void rds_tcp_cong_recv(struct rds_connection *conn, 10670041088SAndy Grover struct rds_tcp_incoming *tinc) 10770041088SAndy Grover { 10870041088SAndy Grover struct sk_buff *skb; 10970041088SAndy Grover unsigned int to_copy, skb_off; 11070041088SAndy Grover unsigned int map_off; 11170041088SAndy Grover unsigned int map_page; 11270041088SAndy Grover struct rds_cong_map *map; 11370041088SAndy Grover int ret; 11470041088SAndy Grover 11570041088SAndy Grover /* catch completely corrupt packets */ 11670041088SAndy Grover if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) 11770041088SAndy Grover return; 11870041088SAndy Grover 11970041088SAndy Grover map_page = 0; 12070041088SAndy Grover map_off = 0; 12170041088SAndy Grover map = conn->c_fcong; 12270041088SAndy Grover 12370041088SAndy Grover skb_queue_walk(&tinc->ti_skb_list, skb) { 12470041088SAndy Grover skb_off = 0; 12570041088SAndy Grover while (skb_off < skb->len) { 12670041088SAndy Grover to_copy = min_t(unsigned int, PAGE_SIZE - map_off, 12770041088SAndy Grover skb->len - skb_off); 12870041088SAndy Grover 12970041088SAndy Grover BUG_ON(map_page >= RDS_CONG_MAP_PAGES); 13070041088SAndy Grover 13170041088SAndy Grover /* only returns 0 or -error */ 13270041088SAndy Grover ret = skb_copy_bits(skb, skb_off, 13370041088SAndy Grover (void *)map->m_page_addrs[map_page] + map_off, 13470041088SAndy Grover to_copy); 13570041088SAndy Grover BUG_ON(ret != 0); 13670041088SAndy Grover 13770041088SAndy Grover skb_off += to_copy; 13870041088SAndy Grover map_off += to_copy; 13970041088SAndy Grover if (map_off == PAGE_SIZE) { 14070041088SAndy Grover map_off = 0; 14170041088SAndy Grover map_page++; 14270041088SAndy Grover } 14370041088SAndy Grover } 14470041088SAndy Grover } 14570041088SAndy Grover 14670041088SAndy Grover rds_cong_map_updated(map, ~(u64) 0); 14770041088SAndy Grover } 14870041088SAndy Grover 14970041088SAndy Grover struct rds_tcp_desc_arg { 15070041088SAndy Grover struct rds_connection *conn; 15170041088SAndy Grover gfp_t gfp; 15270041088SAndy Grover }; 15370041088SAndy Grover 15470041088SAndy Grover static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, 15570041088SAndy Grover unsigned int offset, size_t len) 15670041088SAndy Grover { 15770041088SAndy Grover struct rds_tcp_desc_arg *arg = desc->arg.data; 15870041088SAndy Grover struct rds_connection *conn = arg->conn; 15970041088SAndy Grover struct rds_tcp_connection *tc = conn->c_transport_data; 16070041088SAndy Grover struct rds_tcp_incoming *tinc = tc->t_tinc; 16170041088SAndy Grover struct sk_buff *clone; 16270041088SAndy Grover size_t left = len, to_copy; 16370041088SAndy Grover 16470041088SAndy Grover rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, 16570041088SAndy Grover len); 16670041088SAndy Grover 16770041088SAndy Grover /* 16870041088SAndy Grover * tcp_read_sock() interprets partial progress as an indication to stop 16970041088SAndy Grover * processing. 17070041088SAndy Grover */ 17170041088SAndy Grover while (left) { 1728690bfa1SAndy Grover if (!tinc) { 17370041088SAndy Grover tinc = kmem_cache_alloc(rds_tcp_incoming_slab, 17470041088SAndy Grover arg->gfp); 1758690bfa1SAndy Grover if (!tinc) { 17670041088SAndy Grover desc->error = -ENOMEM; 17770041088SAndy Grover goto out; 17870041088SAndy Grover } 17970041088SAndy Grover tc->t_tinc = tinc; 18070041088SAndy Grover rdsdebug("alloced tinc %p\n", tinc); 18170041088SAndy Grover rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); 18270041088SAndy Grover /* 18370041088SAndy Grover * XXX * we might be able to use the __ variants when 18470041088SAndy Grover * we've already serialized at a higher level. 18570041088SAndy Grover */ 18670041088SAndy Grover skb_queue_head_init(&tinc->ti_skb_list); 18770041088SAndy Grover } 18870041088SAndy Grover 18970041088SAndy Grover if (left && tc->t_tinc_hdr_rem) { 19070041088SAndy Grover to_copy = min(tc->t_tinc_hdr_rem, left); 19170041088SAndy Grover rdsdebug("copying %zu header from skb %p\n", to_copy, 19270041088SAndy Grover skb); 19370041088SAndy Grover skb_copy_bits(skb, offset, 19470041088SAndy Grover (char *)&tinc->ti_inc.i_hdr + 19570041088SAndy Grover sizeof(struct rds_header) - 19670041088SAndy Grover tc->t_tinc_hdr_rem, 19770041088SAndy Grover to_copy); 19870041088SAndy Grover tc->t_tinc_hdr_rem -= to_copy; 19970041088SAndy Grover left -= to_copy; 20070041088SAndy Grover offset += to_copy; 20170041088SAndy Grover 20270041088SAndy Grover if (tc->t_tinc_hdr_rem == 0) { 20370041088SAndy Grover /* could be 0 for a 0 len message */ 20470041088SAndy Grover tc->t_tinc_data_rem = 20570041088SAndy Grover be32_to_cpu(tinc->ti_inc.i_hdr.h_len); 20670041088SAndy Grover } 20770041088SAndy Grover } 20870041088SAndy Grover 20970041088SAndy Grover if (left && tc->t_tinc_data_rem) { 210947d2756SSowmini Varadhan to_copy = min(tc->t_tinc_data_rem, left); 211947d2756SSowmini Varadhan 212947d2756SSowmini Varadhan clone = pskb_extract(skb, offset, to_copy, arg->gfp); 2138690bfa1SAndy Grover if (!clone) { 21470041088SAndy Grover desc->error = -ENOMEM; 21570041088SAndy Grover goto out; 21670041088SAndy Grover } 21770041088SAndy Grover 21870041088SAndy Grover skb_queue_tail(&tinc->ti_skb_list, clone); 21970041088SAndy Grover 22070041088SAndy Grover rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " 22170041088SAndy Grover "clone %p data %p len %d\n", 22270041088SAndy Grover skb, skb->data, skb->len, offset, to_copy, 22370041088SAndy Grover clone, clone->data, clone->len); 22470041088SAndy Grover 22570041088SAndy Grover tc->t_tinc_data_rem -= to_copy; 22670041088SAndy Grover left -= to_copy; 22770041088SAndy Grover offset += to_copy; 22870041088SAndy Grover } 22970041088SAndy Grover 23070041088SAndy Grover if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { 23170041088SAndy Grover if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 23270041088SAndy Grover rds_tcp_cong_recv(conn, tinc); 23370041088SAndy Grover else 23470041088SAndy Grover rds_recv_incoming(conn, conn->c_faddr, 23570041088SAndy Grover conn->c_laddr, &tinc->ti_inc, 2366114eab5SCong Wang arg->gfp); 23770041088SAndy Grover 23870041088SAndy Grover tc->t_tinc_hdr_rem = sizeof(struct rds_header); 23970041088SAndy Grover tc->t_tinc_data_rem = 0; 24070041088SAndy Grover tc->t_tinc = NULL; 24170041088SAndy Grover rds_inc_put(&tinc->ti_inc); 24270041088SAndy Grover tinc = NULL; 24370041088SAndy Grover } 24470041088SAndy Grover } 24570041088SAndy Grover out: 24670041088SAndy Grover rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", 24770041088SAndy Grover len, left, skb->len, 24870041088SAndy Grover skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); 24970041088SAndy Grover return len - left; 25070041088SAndy Grover } 25170041088SAndy Grover 25270041088SAndy Grover /* the caller has to hold the sock lock */ 2536114eab5SCong Wang static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp) 25470041088SAndy Grover { 25570041088SAndy Grover struct rds_tcp_connection *tc = conn->c_transport_data; 25670041088SAndy Grover struct socket *sock = tc->t_sock; 25770041088SAndy Grover read_descriptor_t desc; 25870041088SAndy Grover struct rds_tcp_desc_arg arg; 25970041088SAndy Grover 26070041088SAndy Grover /* It's like glib in the kernel! */ 26170041088SAndy Grover arg.conn = conn; 26270041088SAndy Grover arg.gfp = gfp; 26370041088SAndy Grover desc.arg.data = &arg; 26470041088SAndy Grover desc.error = 0; 26570041088SAndy Grover desc.count = 1; /* give more than one skb per call */ 26670041088SAndy Grover 26770041088SAndy Grover tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); 26870041088SAndy Grover rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, 26970041088SAndy Grover desc.error); 27070041088SAndy Grover 27170041088SAndy Grover return desc.error; 27270041088SAndy Grover } 27370041088SAndy Grover 27470041088SAndy Grover /* 27570041088SAndy Grover * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from 27670041088SAndy Grover * data_ready. 27770041088SAndy Grover * 27870041088SAndy Grover * if we fail to allocate we're in trouble.. blindly wait some time before 27970041088SAndy Grover * trying again to see if the VM can free up something for us. 28070041088SAndy Grover */ 28170041088SAndy Grover int rds_tcp_recv(struct rds_connection *conn) 28270041088SAndy Grover { 28370041088SAndy Grover struct rds_tcp_connection *tc = conn->c_transport_data; 28470041088SAndy Grover struct socket *sock = tc->t_sock; 28570041088SAndy Grover int ret = 0; 28670041088SAndy Grover 28770041088SAndy Grover rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); 28870041088SAndy Grover 28970041088SAndy Grover lock_sock(sock->sk); 2906114eab5SCong Wang ret = rds_tcp_read_sock(conn, GFP_KERNEL); 29170041088SAndy Grover release_sock(sock->sk); 29270041088SAndy Grover 29370041088SAndy Grover return ret; 29470041088SAndy Grover } 29570041088SAndy Grover 296676d2369SDavid S. Miller void rds_tcp_data_ready(struct sock *sk) 29770041088SAndy Grover { 298676d2369SDavid S. Miller void (*ready)(struct sock *sk); 29970041088SAndy Grover struct rds_connection *conn; 30070041088SAndy Grover struct rds_tcp_connection *tc; 30170041088SAndy Grover 302676d2369SDavid S. Miller rdsdebug("data ready sk %p\n", sk); 30370041088SAndy Grover 304*38036629SEric Dumazet read_lock_bh(&sk->sk_callback_lock); 30570041088SAndy Grover conn = sk->sk_user_data; 3068690bfa1SAndy Grover if (!conn) { /* check for teardown race */ 30770041088SAndy Grover ready = sk->sk_data_ready; 30870041088SAndy Grover goto out; 30970041088SAndy Grover } 31070041088SAndy Grover 31170041088SAndy Grover tc = conn->c_transport_data; 31270041088SAndy Grover ready = tc->t_orig_data_ready; 31370041088SAndy Grover rds_tcp_stats_inc(s_tcp_data_ready_calls); 31470041088SAndy Grover 3156114eab5SCong Wang if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM) 31670041088SAndy Grover queue_delayed_work(rds_wq, &conn->c_recv_w, 0); 31770041088SAndy Grover out: 318*38036629SEric Dumazet read_unlock_bh(&sk->sk_callback_lock); 319676d2369SDavid S. Miller ready(sk); 32070041088SAndy Grover } 32170041088SAndy Grover 322ef87b7eaSZach Brown int rds_tcp_recv_init(void) 32370041088SAndy Grover { 32470041088SAndy Grover rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", 32570041088SAndy Grover sizeof(struct rds_tcp_incoming), 32670041088SAndy Grover 0, 0, NULL); 3278690bfa1SAndy Grover if (!rds_tcp_incoming_slab) 32870041088SAndy Grover return -ENOMEM; 32970041088SAndy Grover return 0; 33070041088SAndy Grover } 33170041088SAndy Grover 33270041088SAndy Grover void rds_tcp_recv_exit(void) 33370041088SAndy Grover { 33470041088SAndy Grover kmem_cache_destroy(rds_tcp_incoming_slab); 33570041088SAndy Grover } 336