/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27*70041088SAndy Grover * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28*70041088SAndy Grover * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29*70041088SAndy Grover * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30*70041088SAndy Grover * SOFTWARE. 31*70041088SAndy Grover * 32*70041088SAndy Grover */ 33*70041088SAndy Grover #include <linux/kernel.h> 34*70041088SAndy Grover #include <net/tcp.h> 35*70041088SAndy Grover 36*70041088SAndy Grover #include "rds.h" 37*70041088SAndy Grover #include "tcp.h" 38*70041088SAndy Grover 39*70041088SAndy Grover static struct kmem_cache *rds_tcp_incoming_slab; 40*70041088SAndy Grover 41*70041088SAndy Grover void rds_tcp_inc_purge(struct rds_incoming *inc) 42*70041088SAndy Grover { 43*70041088SAndy Grover struct rds_tcp_incoming *tinc; 44*70041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 45*70041088SAndy Grover rdsdebug("purging tinc %p inc %p\n", tinc, inc); 46*70041088SAndy Grover skb_queue_purge(&tinc->ti_skb_list); 47*70041088SAndy Grover } 48*70041088SAndy Grover 49*70041088SAndy Grover void rds_tcp_inc_free(struct rds_incoming *inc) 50*70041088SAndy Grover { 51*70041088SAndy Grover struct rds_tcp_incoming *tinc; 52*70041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 53*70041088SAndy Grover rds_tcp_inc_purge(inc); 54*70041088SAndy Grover rdsdebug("freeing tinc %p inc %p\n", tinc, inc); 55*70041088SAndy Grover kmem_cache_free(rds_tcp_incoming_slab, tinc); 56*70041088SAndy Grover } 57*70041088SAndy Grover 58*70041088SAndy Grover /* 59*70041088SAndy Grover * this is pretty lame, but, whatever. 
60*70041088SAndy Grover */ 61*70041088SAndy Grover int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, 62*70041088SAndy Grover size_t size) 63*70041088SAndy Grover { 64*70041088SAndy Grover struct rds_tcp_incoming *tinc; 65*70041088SAndy Grover struct iovec *iov, tmp; 66*70041088SAndy Grover struct sk_buff *skb; 67*70041088SAndy Grover unsigned long to_copy, skb_off; 68*70041088SAndy Grover int ret = 0; 69*70041088SAndy Grover 70*70041088SAndy Grover if (size == 0) 71*70041088SAndy Grover goto out; 72*70041088SAndy Grover 73*70041088SAndy Grover tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 74*70041088SAndy Grover iov = first_iov; 75*70041088SAndy Grover tmp = *iov; 76*70041088SAndy Grover 77*70041088SAndy Grover skb_queue_walk(&tinc->ti_skb_list, skb) { 78*70041088SAndy Grover skb_off = 0; 79*70041088SAndy Grover while (skb_off < skb->len) { 80*70041088SAndy Grover while (tmp.iov_len == 0) { 81*70041088SAndy Grover iov++; 82*70041088SAndy Grover tmp = *iov; 83*70041088SAndy Grover } 84*70041088SAndy Grover 85*70041088SAndy Grover to_copy = min(tmp.iov_len, size); 86*70041088SAndy Grover to_copy = min(to_copy, skb->len - skb_off); 87*70041088SAndy Grover 88*70041088SAndy Grover rdsdebug("ret %d size %zu skb %p skb_off %lu " 89*70041088SAndy Grover "skblen %d iov_base %p iov_len %zu cpy %lu\n", 90*70041088SAndy Grover ret, size, skb, skb_off, skb->len, 91*70041088SAndy Grover tmp.iov_base, tmp.iov_len, to_copy); 92*70041088SAndy Grover 93*70041088SAndy Grover /* modifies tmp as it copies */ 94*70041088SAndy Grover if (skb_copy_datagram_iovec(skb, skb_off, &tmp, 95*70041088SAndy Grover to_copy)) { 96*70041088SAndy Grover ret = -EFAULT; 97*70041088SAndy Grover goto out; 98*70041088SAndy Grover } 99*70041088SAndy Grover 100*70041088SAndy Grover size -= to_copy; 101*70041088SAndy Grover ret += to_copy; 102*70041088SAndy Grover skb_off += to_copy; 103*70041088SAndy Grover if (size == 0) 104*70041088SAndy Grover goto out; 
105*70041088SAndy Grover } 106*70041088SAndy Grover } 107*70041088SAndy Grover out: 108*70041088SAndy Grover return ret; 109*70041088SAndy Grover } 110*70041088SAndy Grover 111*70041088SAndy Grover /* 112*70041088SAndy Grover * We have a series of skbs that have fragmented pieces of the congestion 113*70041088SAndy Grover * bitmap. They must add up to the exact size of the congestion bitmap. We 114*70041088SAndy Grover * use the skb helpers to copy those into the pages that make up the in-memory 115*70041088SAndy Grover * congestion bitmap for the remote address of this connection. We then tell 116*70041088SAndy Grover * the congestion core that the bitmap has been changed so that it can wake up 117*70041088SAndy Grover * sleepers. 118*70041088SAndy Grover * 119*70041088SAndy Grover * This is racing with sending paths which are using test_bit to see if the 120*70041088SAndy Grover * bitmap indicates that their recipient is congested. 121*70041088SAndy Grover */ 122*70041088SAndy Grover 123*70041088SAndy Grover static void rds_tcp_cong_recv(struct rds_connection *conn, 124*70041088SAndy Grover struct rds_tcp_incoming *tinc) 125*70041088SAndy Grover { 126*70041088SAndy Grover struct sk_buff *skb; 127*70041088SAndy Grover unsigned int to_copy, skb_off; 128*70041088SAndy Grover unsigned int map_off; 129*70041088SAndy Grover unsigned int map_page; 130*70041088SAndy Grover struct rds_cong_map *map; 131*70041088SAndy Grover int ret; 132*70041088SAndy Grover 133*70041088SAndy Grover /* catch completely corrupt packets */ 134*70041088SAndy Grover if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES) 135*70041088SAndy Grover return; 136*70041088SAndy Grover 137*70041088SAndy Grover map_page = 0; 138*70041088SAndy Grover map_off = 0; 139*70041088SAndy Grover map = conn->c_fcong; 140*70041088SAndy Grover 141*70041088SAndy Grover skb_queue_walk(&tinc->ti_skb_list, skb) { 142*70041088SAndy Grover skb_off = 0; 143*70041088SAndy Grover while (skb_off < skb->len) { 
144*70041088SAndy Grover to_copy = min_t(unsigned int, PAGE_SIZE - map_off, 145*70041088SAndy Grover skb->len - skb_off); 146*70041088SAndy Grover 147*70041088SAndy Grover BUG_ON(map_page >= RDS_CONG_MAP_PAGES); 148*70041088SAndy Grover 149*70041088SAndy Grover /* only returns 0 or -error */ 150*70041088SAndy Grover ret = skb_copy_bits(skb, skb_off, 151*70041088SAndy Grover (void *)map->m_page_addrs[map_page] + map_off, 152*70041088SAndy Grover to_copy); 153*70041088SAndy Grover BUG_ON(ret != 0); 154*70041088SAndy Grover 155*70041088SAndy Grover skb_off += to_copy; 156*70041088SAndy Grover map_off += to_copy; 157*70041088SAndy Grover if (map_off == PAGE_SIZE) { 158*70041088SAndy Grover map_off = 0; 159*70041088SAndy Grover map_page++; 160*70041088SAndy Grover } 161*70041088SAndy Grover } 162*70041088SAndy Grover } 163*70041088SAndy Grover 164*70041088SAndy Grover rds_cong_map_updated(map, ~(u64) 0); 165*70041088SAndy Grover } 166*70041088SAndy Grover 167*70041088SAndy Grover struct rds_tcp_desc_arg { 168*70041088SAndy Grover struct rds_connection *conn; 169*70041088SAndy Grover gfp_t gfp; 170*70041088SAndy Grover enum km_type km; 171*70041088SAndy Grover }; 172*70041088SAndy Grover 173*70041088SAndy Grover static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, 174*70041088SAndy Grover unsigned int offset, size_t len) 175*70041088SAndy Grover { 176*70041088SAndy Grover struct rds_tcp_desc_arg *arg = desc->arg.data; 177*70041088SAndy Grover struct rds_connection *conn = arg->conn; 178*70041088SAndy Grover struct rds_tcp_connection *tc = conn->c_transport_data; 179*70041088SAndy Grover struct rds_tcp_incoming *tinc = tc->t_tinc; 180*70041088SAndy Grover struct sk_buff *clone; 181*70041088SAndy Grover size_t left = len, to_copy; 182*70041088SAndy Grover 183*70041088SAndy Grover rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset, 184*70041088SAndy Grover len); 185*70041088SAndy Grover 186*70041088SAndy Grover /* 187*70041088SAndy 
Grover * tcp_read_sock() interprets partial progress as an indication to stop 188*70041088SAndy Grover * processing. 189*70041088SAndy Grover */ 190*70041088SAndy Grover while (left) { 191*70041088SAndy Grover if (tinc == NULL) { 192*70041088SAndy Grover tinc = kmem_cache_alloc(rds_tcp_incoming_slab, 193*70041088SAndy Grover arg->gfp); 194*70041088SAndy Grover if (tinc == NULL) { 195*70041088SAndy Grover desc->error = -ENOMEM; 196*70041088SAndy Grover goto out; 197*70041088SAndy Grover } 198*70041088SAndy Grover tc->t_tinc = tinc; 199*70041088SAndy Grover rdsdebug("alloced tinc %p\n", tinc); 200*70041088SAndy Grover rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr); 201*70041088SAndy Grover /* 202*70041088SAndy Grover * XXX * we might be able to use the __ variants when 203*70041088SAndy Grover * we've already serialized at a higher level. 204*70041088SAndy Grover */ 205*70041088SAndy Grover skb_queue_head_init(&tinc->ti_skb_list); 206*70041088SAndy Grover } 207*70041088SAndy Grover 208*70041088SAndy Grover if (left && tc->t_tinc_hdr_rem) { 209*70041088SAndy Grover to_copy = min(tc->t_tinc_hdr_rem, left); 210*70041088SAndy Grover rdsdebug("copying %zu header from skb %p\n", to_copy, 211*70041088SAndy Grover skb); 212*70041088SAndy Grover skb_copy_bits(skb, offset, 213*70041088SAndy Grover (char *)&tinc->ti_inc.i_hdr + 214*70041088SAndy Grover sizeof(struct rds_header) - 215*70041088SAndy Grover tc->t_tinc_hdr_rem, 216*70041088SAndy Grover to_copy); 217*70041088SAndy Grover tc->t_tinc_hdr_rem -= to_copy; 218*70041088SAndy Grover left -= to_copy; 219*70041088SAndy Grover offset += to_copy; 220*70041088SAndy Grover 221*70041088SAndy Grover if (tc->t_tinc_hdr_rem == 0) { 222*70041088SAndy Grover /* could be 0 for a 0 len message */ 223*70041088SAndy Grover tc->t_tinc_data_rem = 224*70041088SAndy Grover be32_to_cpu(tinc->ti_inc.i_hdr.h_len); 225*70041088SAndy Grover } 226*70041088SAndy Grover } 227*70041088SAndy Grover 228*70041088SAndy Grover if (left && 
tc->t_tinc_data_rem) { 229*70041088SAndy Grover clone = skb_clone(skb, arg->gfp); 230*70041088SAndy Grover if (clone == NULL) { 231*70041088SAndy Grover desc->error = -ENOMEM; 232*70041088SAndy Grover goto out; 233*70041088SAndy Grover } 234*70041088SAndy Grover 235*70041088SAndy Grover to_copy = min(tc->t_tinc_data_rem, left); 236*70041088SAndy Grover pskb_pull(clone, offset); 237*70041088SAndy Grover pskb_trim(clone, to_copy); 238*70041088SAndy Grover skb_queue_tail(&tinc->ti_skb_list, clone); 239*70041088SAndy Grover 240*70041088SAndy Grover rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " 241*70041088SAndy Grover "clone %p data %p len %d\n", 242*70041088SAndy Grover skb, skb->data, skb->len, offset, to_copy, 243*70041088SAndy Grover clone, clone->data, clone->len); 244*70041088SAndy Grover 245*70041088SAndy Grover tc->t_tinc_data_rem -= to_copy; 246*70041088SAndy Grover left -= to_copy; 247*70041088SAndy Grover offset += to_copy; 248*70041088SAndy Grover } 249*70041088SAndy Grover 250*70041088SAndy Grover if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) { 251*70041088SAndy Grover if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 252*70041088SAndy Grover rds_tcp_cong_recv(conn, tinc); 253*70041088SAndy Grover else 254*70041088SAndy Grover rds_recv_incoming(conn, conn->c_faddr, 255*70041088SAndy Grover conn->c_laddr, &tinc->ti_inc, 256*70041088SAndy Grover arg->gfp, arg->km); 257*70041088SAndy Grover 258*70041088SAndy Grover tc->t_tinc_hdr_rem = sizeof(struct rds_header); 259*70041088SAndy Grover tc->t_tinc_data_rem = 0; 260*70041088SAndy Grover tc->t_tinc = NULL; 261*70041088SAndy Grover rds_inc_put(&tinc->ti_inc); 262*70041088SAndy Grover tinc = NULL; 263*70041088SAndy Grover } 264*70041088SAndy Grover } 265*70041088SAndy Grover out: 266*70041088SAndy Grover rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n", 267*70041088SAndy Grover len, left, skb->len, 268*70041088SAndy Grover 
skb_queue_len(&tc->t_sock->sk->sk_receive_queue)); 269*70041088SAndy Grover return len - left; 270*70041088SAndy Grover } 271*70041088SAndy Grover 272*70041088SAndy Grover /* the caller has to hold the sock lock */ 273*70041088SAndy Grover int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) 274*70041088SAndy Grover { 275*70041088SAndy Grover struct rds_tcp_connection *tc = conn->c_transport_data; 276*70041088SAndy Grover struct socket *sock = tc->t_sock; 277*70041088SAndy Grover read_descriptor_t desc; 278*70041088SAndy Grover struct rds_tcp_desc_arg arg; 279*70041088SAndy Grover 280*70041088SAndy Grover /* It's like glib in the kernel! */ 281*70041088SAndy Grover arg.conn = conn; 282*70041088SAndy Grover arg.gfp = gfp; 283*70041088SAndy Grover arg.km = km; 284*70041088SAndy Grover desc.arg.data = &arg; 285*70041088SAndy Grover desc.error = 0; 286*70041088SAndy Grover desc.count = 1; /* give more than one skb per call */ 287*70041088SAndy Grover 288*70041088SAndy Grover tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv); 289*70041088SAndy Grover rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, 290*70041088SAndy Grover desc.error); 291*70041088SAndy Grover 292*70041088SAndy Grover return desc.error; 293*70041088SAndy Grover } 294*70041088SAndy Grover 295*70041088SAndy Grover /* 296*70041088SAndy Grover * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from 297*70041088SAndy Grover * data_ready. 298*70041088SAndy Grover * 299*70041088SAndy Grover * if we fail to allocate we're in trouble.. blindly wait some time before 300*70041088SAndy Grover * trying again to see if the VM can free up something for us. 
301*70041088SAndy Grover */ 302*70041088SAndy Grover int rds_tcp_recv(struct rds_connection *conn) 303*70041088SAndy Grover { 304*70041088SAndy Grover struct rds_tcp_connection *tc = conn->c_transport_data; 305*70041088SAndy Grover struct socket *sock = tc->t_sock; 306*70041088SAndy Grover int ret = 0; 307*70041088SAndy Grover 308*70041088SAndy Grover rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock); 309*70041088SAndy Grover 310*70041088SAndy Grover lock_sock(sock->sk); 311*70041088SAndy Grover ret = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0); 312*70041088SAndy Grover release_sock(sock->sk); 313*70041088SAndy Grover 314*70041088SAndy Grover return ret; 315*70041088SAndy Grover } 316*70041088SAndy Grover 317*70041088SAndy Grover void rds_tcp_data_ready(struct sock *sk, int bytes) 318*70041088SAndy Grover { 319*70041088SAndy Grover void (*ready)(struct sock *sk, int bytes); 320*70041088SAndy Grover struct rds_connection *conn; 321*70041088SAndy Grover struct rds_tcp_connection *tc; 322*70041088SAndy Grover 323*70041088SAndy Grover rdsdebug("data ready sk %p bytes %d\n", sk, bytes); 324*70041088SAndy Grover 325*70041088SAndy Grover read_lock(&sk->sk_callback_lock); 326*70041088SAndy Grover conn = sk->sk_user_data; 327*70041088SAndy Grover if (conn == NULL) { /* check for teardown race */ 328*70041088SAndy Grover ready = sk->sk_data_ready; 329*70041088SAndy Grover goto out; 330*70041088SAndy Grover } 331*70041088SAndy Grover 332*70041088SAndy Grover tc = conn->c_transport_data; 333*70041088SAndy Grover ready = tc->t_orig_data_ready; 334*70041088SAndy Grover rds_tcp_stats_inc(s_tcp_data_ready_calls); 335*70041088SAndy Grover 336*70041088SAndy Grover if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) 337*70041088SAndy Grover queue_delayed_work(rds_wq, &conn->c_recv_w, 0); 338*70041088SAndy Grover out: 339*70041088SAndy Grover read_unlock(&sk->sk_callback_lock); 340*70041088SAndy Grover ready(sk, bytes); 341*70041088SAndy Grover } 
342*70041088SAndy Grover 343*70041088SAndy Grover int __init rds_tcp_recv_init(void) 344*70041088SAndy Grover { 345*70041088SAndy Grover rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", 346*70041088SAndy Grover sizeof(struct rds_tcp_incoming), 347*70041088SAndy Grover 0, 0, NULL); 348*70041088SAndy Grover if (rds_tcp_incoming_slab == NULL) 349*70041088SAndy Grover return -ENOMEM; 350*70041088SAndy Grover return 0; 351*70041088SAndy Grover } 352*70041088SAndy Grover 353*70041088SAndy Grover void rds_tcp_recv_exit(void) 354*70041088SAndy Grover { 355*70041088SAndy Grover kmem_cache_destroy(rds_tcp_incoming_slab); 356*70041088SAndy Grover } 357