170041088SAndy Grover /* 270041088SAndy Grover * Copyright (c) 2006 Oracle. All rights reserved. 370041088SAndy Grover * 470041088SAndy Grover * This software is available to you under a choice of one of two 570041088SAndy Grover * licenses. You may choose to be licensed under the terms of the GNU 670041088SAndy Grover * General Public License (GPL) Version 2, available from the file 770041088SAndy Grover * COPYING in the main directory of this source tree, or the 870041088SAndy Grover * OpenIB.org BSD license below: 970041088SAndy Grover * 1070041088SAndy Grover * Redistribution and use in source and binary forms, with or 1170041088SAndy Grover * without modification, are permitted provided that the following 1270041088SAndy Grover * conditions are met: 1370041088SAndy Grover * 1470041088SAndy Grover * - Redistributions of source code must retain the above 1570041088SAndy Grover * copyright notice, this list of conditions and the following 1670041088SAndy Grover * disclaimer. 1770041088SAndy Grover * 1870041088SAndy Grover * - Redistributions in binary form must reproduce the above 1970041088SAndy Grover * copyright notice, this list of conditions and the following 2070041088SAndy Grover * disclaimer in the documentation and/or other materials 2170041088SAndy Grover * provided with the distribution. 2270041088SAndy Grover * 2370041088SAndy Grover * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 2470041088SAndy Grover * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2570041088SAndy Grover * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 2670041088SAndy Grover * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 2770041088SAndy Grover * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 2870041088SAndy Grover * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 2970041088SAndy Grover * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 3070041088SAndy Grover * SOFTWARE. 3170041088SAndy Grover * 3270041088SAndy Grover */ 3370041088SAndy Grover #include <linux/kernel.h> 34*5a0e3ad6STejun Heo #include <linux/slab.h> 3570041088SAndy Grover #include <net/tcp.h> 3670041088SAndy Grover 3770041088SAndy Grover #include "rds.h" 3870041088SAndy Grover #include "tcp.h" 3970041088SAndy Grover 4070041088SAndy Grover static struct kmem_cache *rds_tcp_incoming_slab; 4170041088SAndy Grover 4270041088SAndy Grover void rds_tcp_inc_purge(struct rds_incoming *inc) 4370041088SAndy Grover { 4470041088SAndy Grover struct rds_tcp_incoming *tinc; 4570041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 4670041088SAndy Grover rdsdebug("purging tinc %p inc %p\n", tinc, inc); 4770041088SAndy Grover skb_queue_purge(&tinc->ti_skb_list); 4870041088SAndy Grover } 4970041088SAndy Grover 5070041088SAndy Grover void rds_tcp_inc_free(struct rds_incoming *inc) 5170041088SAndy Grover { 5270041088SAndy Grover struct rds_tcp_incoming *tinc; 5370041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 5470041088SAndy Grover rds_tcp_inc_purge(inc); 5570041088SAndy Grover rdsdebug("freeing tinc %p inc %p\n", tinc, inc); 5670041088SAndy Grover kmem_cache_free(rds_tcp_incoming_slab, tinc); 5770041088SAndy Grover } 5870041088SAndy Grover 5970041088SAndy Grover /* 6070041088SAndy Grover * this is pretty lame, but, whatever. 6170041088SAndy Grover */ 6270041088SAndy Grover int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, 6370041088SAndy Grover size_t size) 6470041088SAndy Grover { 6570041088SAndy Grover struct rds_tcp_incoming *tinc; 6670041088SAndy Grover struct iovec *iov, tmp; 6770041088SAndy Grover struct sk_buff *skb; 6870041088SAndy Grover unsigned long to_copy, skb_off; 6970041088SAndy Grover int ret = 0; 7070041088SAndy Grover 7170041088SAndy Grover if (size == 0) 7270041088SAndy Grover goto out; 7370041088SAndy Grover 7470041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 7570041088SAndy Grover iov = first_iov; 7670041088SAndy Grover tmp = *iov; 7770041088SAndy Grover 7870041088SAndy Grover skb_queue_walk(&tinc->ti_skb_list, skb) { 7970041088SAndy Grover skb_off = 0; 8070041088SAndy Grover while (skb_off < skb->len) { 8170041088SAndy Grover while (tmp.iov_len == 0) { 8270041088SAndy Grover iov++; 8370041088SAndy Grover tmp = *iov; 8470041088SAndy Grover } 8570041088SAndy Grover 8670041088SAndy Grover to_copy = min(tmp.iov_len, size); 8770041088SAndy Grover to_copy = min(to_copy, skb->len - skb_off); 8870041088SAndy Grover 8970041088SAndy Grover rdsdebug("ret %d size %zu skb %p skb_off %lu " 9070041088SAndy Grover "skblen %d iov_base %p iov_len %zu cpy %lu\n", 9170041088SAndy Grover ret, size, skb, skb_off, skb->len, 9270041088SAndy Grover tmp.iov_base, tmp.iov_len, to_copy); 9370041088SAndy Grover 9470041088SAndy Grover /* modifies tmp as it copies */ 9570041088SAndy Grover if (skb_copy_datagram_iovec(skb, skb_off, &tmp, 9670041088SAndy Grover to_copy)) { 9770041088SAndy Grover ret = -EFAULT; 9870041088SAndy Grover goto out; 9970041088SAndy Grover } 10070041088SAndy Grover 10170041088SAndy Grover size -= to_copy; 10270041088SAndy Grover ret += to_copy; 10370041088SAndy Grover skb_off += to_copy; 10470041088SAndy Grover if (size == 0) 10570041088SAndy Grover goto out; 10670041088SAndy Grover } 10770041088SAndy Grover } 10870041088SAndy Grover out: 10970041088SAndy Grover return ret; 11070041088SAndy Grover } 11170041088SAndy Grover 11270041088SAndy Grover /* 11370041088SAndy Grover * We have a series of skbs that have fragmented pieces of the congestion 11470041088SAndy Grover * bitmap. They must add up to the exact size of the congestion bitmap. We 11570041088SAndy Grover * use the skb helpers to copy those into the pages that make up the in-memory 11670041088SAndy Grover * congestion bitmap for the remote address of this connection. We then tell 11770041088SAndy Grover * the congestion core that the bitmap has been changed so that it can wake up 11870041088SAndy Grover * sleepers. 11970041088SAndy Grover * 12070041088SAndy Grover * This is racing with sending paths which are using test_bit to see if the 12170041088SAndy Grover * bitmap indicates that their recipient is congested. 12270041088SAndy Grover */ 12370041088SAndy Grover 12470041088SAndy Grover static void rds_tcp_cong_recv(struct rds_connection *conn, 12570041088SAndy Grover struct rds_tcp_incoming *tinc) 12670041088SAndy Grover { 12770041088SAndy Grover struct sk_buff *skb; 12870041088SAndy Grover unsigned int to_copy, skb_off; 12970041088SAndy Grover unsigned int map_off; 13070041088SAndy Grover unsigned int map_page; 13170041088SAndy Grover struct rds_cong_map *map; 13270041088SAndy Grover int ret; 13370041088SAndy Grover 13470041088SAndy Grover /* catch completely corrupt packets */ 13570041088SAndy Grover if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) 13670041088SAndy Grover return; 13770041088SAndy Grover 13870041088SAndy Grover map_page = 0; 13970041088SAndy Grover map_off = 0; 14070041088SAndy Grover map = conn->c_fcong; 14170041088SAndy Grover 14270041088SAndy Grover skb_queue_walk(&tinc->ti_skb_list, skb) { 14370041088SAndy Grover skb_off = 0; 14470041088SAndy Grover while (skb_off < skb->len) { 14570041088SAndy Grover to_copy = min_t(unsigned int, PAGE_SIZE - map_off, 14670041088SAndy Grover skb->len - skb_off); 14770041088SAndy Grover 14870041088SAndy Grover BUG_ON(map_page >= RDS_CONG_MAP_PAGES); 14970041088SAndy Grover 15070041088SAndy Grover /* only returns 0 or -error */ 15170041088SAndy Grover ret = skb_copy_bits(skb, skb_off, 15270041088SAndy Grover (void *)map->m_page_addrs[map_page] + map_off, 15370041088SAndy Grover to_copy); 15470041088SAndy Grover BUG_ON(ret != 0); 15570041088SAndy Grover 15670041088SAndy Grover skb_off += to_copy; 15770041088SAndy Grover map_off += to_copy; 15870041088SAndy Grover if (map_off == PAGE_SIZE) { 15970041088SAndy Grover map_off = 0; 16070041088SAndy Grover map_page++; 16170041088SAndy Grover } 16270041088SAndy Grover } 16370041088SAndy Grover } 16470041088SAndy Grover 16570041088SAndy Grover rds_cong_map_updated(map, ~(u64) 0); 16670041088SAndy Grover } 16770041088SAndy Grover 16870041088SAndy Grover struct rds_tcp_desc_arg { 16970041088SAndy Grover struct rds_connection *conn; 17070041088SAndy Grover gfp_t gfp; 17170041088SAndy Grover enum km_type km; 17270041088SAndy Grover }; 17370041088SAndy Grover 17470041088SAndy Grover static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, 17570041088SAndy Grover unsigned int offset, size_t len) 17670041088SAndy Grover { 17770041088SAndy Grover struct rds_tcp_desc_arg *arg = desc->arg.data; 17870041088SAndy Grover struct rds_connection *conn = arg->conn; 17970041088SAndy Grover struct rds_tcp_connection *tc = conn->c_transport_data; 18070041088SAndy Grover struct rds_tcp_incoming *tinc = tc->t_tinc; 18170041088SAndy Grover struct sk_buff *clone; 18270041088SAndy Grover size_t left = len, to_copy; 18370041088SAndy Grover 18470041088SAndy Grover rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, 18570041088SAndy Grover len); 18670041088SAndy Grover 18770041088SAndy Grover /* 18870041088SAndy Grover * tcp_read_sock() interprets partial progress as an indication to stop 18970041088SAndy Grover * processing. 19070041088SAndy Grover */ 19170041088SAndy Grover while (left) { 19270041088SAndy Grover if (tinc == NULL) { 19370041088SAndy Grover tinc = kmem_cache_alloc(rds_tcp_incoming_slab, 19470041088SAndy Grover arg->gfp); 19570041088SAndy Grover if (tinc == NULL) { 19670041088SAndy Grover desc->error = -ENOMEM; 19770041088SAndy Grover goto out; 19870041088SAndy Grover } 19970041088SAndy Grover tc->t_tinc = tinc; 20070041088SAndy Grover rdsdebug("alloced tinc %p\n", tinc); 20170041088SAndy Grover rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); 20270041088SAndy Grover /* 20370041088SAndy Grover * XXX * we might be able to use the __ variants when 20470041088SAndy Grover * we've already serialized at a higher level. 20570041088SAndy Grover */ 20670041088SAndy Grover skb_queue_head_init(&tinc->ti_skb_list); 20770041088SAndy Grover } 20870041088SAndy Grover 20970041088SAndy Grover if (left && tc->t_tinc_hdr_rem) { 21070041088SAndy Grover to_copy = min(tc->t_tinc_hdr_rem, left); 21170041088SAndy Grover rdsdebug("copying %zu header from skb %p\n", to_copy, 21270041088SAndy Grover skb); 21370041088SAndy Grover skb_copy_bits(skb, offset, 21470041088SAndy Grover (char *)&tinc->ti_inc.i_hdr + 21570041088SAndy Grover sizeof(struct rds_header) - 21670041088SAndy Grover tc->t_tinc_hdr_rem, 21770041088SAndy Grover to_copy); 21870041088SAndy Grover tc->t_tinc_hdr_rem -= to_copy; 21970041088SAndy Grover left -= to_copy; 22070041088SAndy Grover offset += to_copy; 22170041088SAndy Grover 22270041088SAndy Grover if (tc->t_tinc_hdr_rem == 0) { 22370041088SAndy Grover /* could be 0 for a 0 len message */ 22470041088SAndy Grover tc->t_tinc_data_rem = 22570041088SAndy Grover be32_to_cpu(tinc->ti_inc.i_hdr.h_len); 22670041088SAndy Grover } 22770041088SAndy Grover } 22870041088SAndy Grover 22970041088SAndy Grover if (left && tc->t_tinc_data_rem) { 23070041088SAndy Grover clone = skb_clone(skb, arg->gfp); 23170041088SAndy Grover if (clone == NULL) { 23270041088SAndy Grover desc->error = -ENOMEM; 23370041088SAndy Grover goto out; 23470041088SAndy Grover } 23570041088SAndy Grover 23670041088SAndy Grover to_copy = min(tc->t_tinc_data_rem, left); 23770041088SAndy Grover pskb_pull(clone, offset); 23870041088SAndy Grover pskb_trim(clone, to_copy); 23970041088SAndy Grover skb_queue_tail(&tinc->ti_skb_list, clone); 24070041088SAndy Grover 24170041088SAndy Grover rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " 24270041088SAndy Grover "clone %p data %p len %d\n", 24370041088SAndy Grover skb, skb->data, skb->len, offset, to_copy, 24470041088SAndy Grover clone, clone->data, clone->len); 24570041088SAndy Grover 24670041088SAndy Grover tc->t_tinc_data_rem -= to_copy; 24770041088SAndy Grover left -= to_copy; 24870041088SAndy Grover offset += to_copy; 24970041088SAndy Grover } 25070041088SAndy Grover 25170041088SAndy Grover if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { 25270041088SAndy Grover if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 25370041088SAndy Grover rds_tcp_cong_recv(conn, tinc); 25470041088SAndy Grover else 25570041088SAndy Grover rds_recv_incoming(conn, conn->c_faddr, 25670041088SAndy Grover conn->c_laddr, &tinc->ti_inc, 25770041088SAndy Grover arg->gfp, arg->km); 25870041088SAndy Grover 25970041088SAndy Grover tc->t_tinc_hdr_rem = sizeof(struct rds_header); 26070041088SAndy Grover tc->t_tinc_data_rem = 0; 26170041088SAndy Grover tc->t_tinc = NULL; 26270041088SAndy Grover rds_inc_put(&tinc->ti_inc); 26370041088SAndy Grover tinc = NULL; 26470041088SAndy Grover } 26570041088SAndy Grover } 26670041088SAndy Grover out: 26770041088SAndy Grover rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", 26870041088SAndy Grover len, left, skb->len, 26970041088SAndy Grover skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); 27070041088SAndy Grover return len - left; 27170041088SAndy Grover } 27270041088SAndy Grover 27370041088SAndy Grover /* the caller has to hold the sock lock */ 27470041088SAndy Grover int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) 27570041088SAndy Grover { 27670041088SAndy Grover struct rds_tcp_connection *tc = conn->c_transport_data; 27770041088SAndy Grover struct socket *sock = tc->t_sock; 27870041088SAndy Grover read_descriptor_t desc; 27970041088SAndy Grover struct rds_tcp_desc_arg arg; 28070041088SAndy Grover 28170041088SAndy Grover /* It's like glib in the kernel! */ 28270041088SAndy Grover arg.conn = conn; 28370041088SAndy Grover arg.gfp = gfp; 28470041088SAndy Grover arg.km = km; 28570041088SAndy Grover desc.arg.data = &arg; 28670041088SAndy Grover desc.error = 0; 28770041088SAndy Grover desc.count = 1; /* give more than one skb per call */ 28870041088SAndy Grover 28970041088SAndy Grover tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); 29070041088SAndy Grover rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, 29170041088SAndy Grover desc.error); 29270041088SAndy Grover 29370041088SAndy Grover return desc.error; 29470041088SAndy Grover } 29570041088SAndy Grover 29670041088SAndy Grover /* 29770041088SAndy Grover * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from 29870041088SAndy Grover * data_ready. 29970041088SAndy Grover * 30070041088SAndy Grover * if we fail to allocate we're in trouble.. blindly wait some time before 30170041088SAndy Grover * trying again to see if the VM can free up something for us. 30270041088SAndy Grover */ 30370041088SAndy Grover int rds_tcp_recv(struct rds_connection *conn) 30470041088SAndy Grover { 30570041088SAndy Grover struct rds_tcp_connection *tc = conn->c_transport_data; 30670041088SAndy Grover struct socket *sock = tc->t_sock; 30770041088SAndy Grover int ret = 0; 30870041088SAndy Grover 30970041088SAndy Grover rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); 31070041088SAndy Grover 31170041088SAndy Grover lock_sock(sock->sk); 31270041088SAndy Grover ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0); 31370041088SAndy Grover release_sock(sock->sk); 31470041088SAndy Grover 31570041088SAndy Grover return ret; 31670041088SAndy Grover } 31770041088SAndy Grover 31870041088SAndy Grover void rds_tcp_data_ready(struct sock *sk, int bytes) 31970041088SAndy Grover { 32070041088SAndy Grover void (*ready)(struct sock *sk, int bytes); 32170041088SAndy Grover struct rds_connection *conn; 32270041088SAndy Grover struct rds_tcp_connection *tc; 32370041088SAndy Grover 32470041088SAndy Grover rdsdebug("data ready sk %p bytes %d\n", sk, bytes); 32570041088SAndy Grover 32670041088SAndy Grover read_lock(&sk->sk_callback_lock); 32770041088SAndy Grover conn = sk->sk_user_data; 32870041088SAndy Grover if (conn == NULL) { /* check for teardown race */ 32970041088SAndy Grover ready = sk->sk_data_ready; 33070041088SAndy Grover goto out; 33170041088SAndy Grover } 33270041088SAndy Grover 33370041088SAndy Grover tc = conn->c_transport_data; 33470041088SAndy Grover ready = tc->t_orig_data_ready; 33570041088SAndy Grover rds_tcp_stats_inc(s_tcp_data_ready_calls); 33670041088SAndy Grover 33770041088SAndy Grover if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) 33870041088SAndy Grover queue_delayed_work(rds_wq, &conn->c_recv_w, 0); 33970041088SAndy Grover out: 34070041088SAndy Grover read_unlock(&sk->sk_callback_lock); 34170041088SAndy Grover ready(sk, bytes); 34270041088SAndy Grover } 34370041088SAndy Grover 34470041088SAndy Grover int __init rds_tcp_recv_init(void) 34570041088SAndy Grover { 34670041088SAndy Grover rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", 34770041088SAndy Grover sizeof(struct rds_tcp_incoming), 34870041088SAndy Grover 0, 0, NULL); 34970041088SAndy Grover if (rds_tcp_incoming_slab == NULL) 35070041088SAndy Grover return -ENOMEM; 35170041088SAndy Grover return 0; 35270041088SAndy Grover } 35370041088SAndy Grover 35470041088SAndy Grover void rds_tcp_recv_exit(void) 35570041088SAndy Grover { 35670041088SAndy Grover kmem_cache_destroy(rds_tcp_incoming_slab); 35770041088SAndy Grover } 358