1 /* 2 * Copyright (c) 2006 Oracle. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 * 32 */ 33 #include <linux/module.h> 34 #include <linux/errno.h> 35 #include <linux/kernel.h> 36 #include <linux/gfp.h> 37 #include <linux/in.h> 38 #include <linux/poll.h> 39 #include <net/sock.h> 40 41 #include "rds.h" 42 43 char *rds_str_array(char **array, size_t elements, size_t index) 44 { 45 if ((index < elements) && array[index]) 46 return array[index]; 47 else 48 return "unknown"; 49 } 50 EXPORT_SYMBOL(rds_str_array); 51 52 /* this is just used for stats gathering :/ */ 53 static DEFINE_SPINLOCK(rds_sock_lock); 54 static unsigned long rds_sock_count; 55 static LIST_HEAD(rds_sock_list); 56 DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); 57 58 /* 59 * This is called as the final descriptor referencing this socket is closed. 60 * We have to unbind the socket so that another socket can be bound to the 61 * address it was using. 62 * 63 * We have to be careful about racing with the incoming path. sock_orphan() 64 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 65 * messages shouldn't be queued. 66 */ 67 static int rds_release(struct socket *sock) 68 { 69 struct sock *sk = sock->sk; 70 struct rds_sock *rs; 71 unsigned long flags; 72 73 if (!sk) 74 goto out; 75 76 rs = rds_sk_to_rs(sk); 77 78 sock_orphan(sk); 79 /* Note - rds_clear_recv_queue grabs rs_recv_lock, so 80 * that ensures the recv path has completed messing 81 * with the socket. */ 82 rds_clear_recv_queue(rs); 83 rds_cong_remove_socket(rs); 84 85 /* 86 * the binding lookup hash uses rcu, we need to 87 * make sure we sychronize_rcu before we free our 88 * entry 89 */ 90 rds_remove_bound(rs); 91 synchronize_rcu(); 92 93 rds_send_drop_to(rs, NULL); 94 rds_rdma_drop_keys(rs); 95 rds_notify_queue_get(rs, NULL); 96 97 spin_lock_irqsave(&rds_sock_lock, flags); 98 list_del_init(&rs->rs_item); 99 rds_sock_count--; 100 spin_unlock_irqrestore(&rds_sock_lock, flags); 101 102 rds_trans_put(rs->rs_transport); 103 104 sock->sk = NULL; 105 sock_put(sk); 106 out: 107 return 0; 108 } 109 110 /* 111 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. 112 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 113 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 114 * this seems more conservative. 115 * NB - normally, one would use sk_callback_lock for this, but we can 116 * get here from interrupts, whereas the network code grabs sk_callback_lock 117 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 118 */ 119 void rds_wake_sk_sleep(struct rds_sock *rs) 120 { 121 unsigned long flags; 122 123 read_lock_irqsave(&rs->rs_recv_lock, flags); 124 __rds_wake_sk_sleep(rds_rs_to_sk(rs)); 125 read_unlock_irqrestore(&rs->rs_recv_lock, flags); 126 } 127 128 static int rds_getname(struct socket *sock, struct sockaddr *uaddr, 129 int *uaddr_len, int peer) 130 { 131 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 132 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 133 134 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 135 136 /* racey, don't care */ 137 if (peer) { 138 if (!rs->rs_conn_addr) 139 return -ENOTCONN; 140 141 sin->sin_port = rs->rs_conn_port; 142 sin->sin_addr.s_addr = rs->rs_conn_addr; 143 } else { 144 sin->sin_port = rs->rs_bound_port; 145 sin->sin_addr.s_addr = rs->rs_bound_addr; 146 } 147 148 sin->sin_family = AF_INET; 149 150 *uaddr_len = sizeof(*sin); 151 return 0; 152 } 153 154 /* 155 * RDS' poll is without a doubt the least intuitive part of the interface, 156 * as POLLIN and POLLOUT do not behave entirely as you would expect from 157 * a network protocol. 158 * 159 * POLLIN is asserted if 160 * - there is data on the receive queue. 161 * - to signal that a previously congested destination may have become 162 * uncongested 163 * - A notification has been queued to the socket (this can be a congestion 164 * update, or a RDMA completion). 165 * 166 * POLLOUT is asserted if there is room on the send queue. This does not mean 167 * however, that the next sendmsg() call will succeed. If the application tries 168 * to send to a congested destination, the system call may still fail (and 169 * return ENOBUFS). 170 */ 171 static unsigned int rds_poll(struct file *file, struct socket *sock, 172 poll_table *wait) 173 { 174 struct sock *sk = sock->sk; 175 struct rds_sock *rs = rds_sk_to_rs(sk); 176 unsigned int mask = 0; 177 unsigned long flags; 178 179 poll_wait(file, sk_sleep(sk), wait); 180 181 if (rs->rs_seen_congestion) 182 poll_wait(file, &rds_poll_waitq, wait); 183 184 read_lock_irqsave(&rs->rs_recv_lock, flags); 185 if (!rs->rs_cong_monitor) { 186 /* When a congestion map was updated, we signal POLLIN for 187 * "historical" reasons. Applications can also poll for 188 * WRBAND instead. */ 189 if (rds_cong_updated_since(&rs->rs_cong_track)) 190 mask |= (POLLIN | POLLRDNORM | POLLWRBAND); 191 } else { 192 spin_lock(&rs->rs_lock); 193 if (rs->rs_cong_notify) 194 mask |= (POLLIN | POLLRDNORM); 195 spin_unlock(&rs->rs_lock); 196 } 197 if (!list_empty(&rs->rs_recv_queue) || 198 !list_empty(&rs->rs_notify_queue)) 199 mask |= (POLLIN | POLLRDNORM); 200 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) 201 mask |= (POLLOUT | POLLWRNORM); 202 read_unlock_irqrestore(&rs->rs_recv_lock, flags); 203 204 /* clear state any time we wake a seen-congested socket */ 205 if (mask) 206 rs->rs_seen_congestion = 0; 207 208 return mask; 209 } 210 211 static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 212 { 213 return -ENOIOCTLCMD; 214 } 215 216 static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, 217 int len) 218 { 219 struct sockaddr_in sin; 220 int ret = 0; 221 222 /* racing with another thread binding seems ok here */ 223 if (rs->rs_bound_addr == 0) { 224 ret = -ENOTCONN; /* XXX not a great errno */ 225 goto out; 226 } 227 228 if (len < sizeof(struct sockaddr_in)) { 229 ret = -EINVAL; 230 goto out; 231 } 232 233 if (copy_from_user(&sin, optval, sizeof(sin))) { 234 ret = -EFAULT; 235 goto out; 236 } 237 238 rds_send_drop_to(rs, &sin); 239 out: 240 return ret; 241 } 242 243 static int rds_set_bool_option(unsigned char *optvar, char __user *optval, 244 int optlen) 245 { 246 int value; 247 248 if (optlen < sizeof(int)) 249 return -EINVAL; 250 if (get_user(value, (int __user *) optval)) 251 return -EFAULT; 252 *optvar = !!value; 253 return 0; 254 } 255 256 static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, 257 int optlen) 258 { 259 int ret; 260 261 ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 262 if (ret == 0) { 263 if (rs->rs_cong_monitor) { 264 rds_cong_add_socket(rs); 265 } else { 266 rds_cong_remove_socket(rs); 267 rs->rs_cong_mask = 0; 268 rs->rs_cong_notify = 0; 269 } 270 } 271 return ret; 272 } 273 274 static int rds_setsockopt(struct socket *sock, int level, int optname, 275 char __user *optval, unsigned int optlen) 276 { 277 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 278 int ret; 279 280 if (level != SOL_RDS) { 281 ret = -ENOPROTOOPT; 282 goto out; 283 } 284 285 switch (optname) { 286 case RDS_CANCEL_SENT_TO: 287 ret = rds_cancel_sent_to(rs, optval, optlen); 288 break; 289 case RDS_GET_MR: 290 ret = rds_get_mr(rs, optval, optlen); 291 break; 292 case RDS_GET_MR_FOR_DEST: 293 ret = rds_get_mr_for_dest(rs, optval, optlen); 294 break; 295 case RDS_FREE_MR: 296 ret = rds_free_mr(rs, optval, optlen); 297 break; 298 case RDS_RECVERR: 299 ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); 300 break; 301 case RDS_CONG_MONITOR: 302 ret = rds_cong_monitor(rs, optval, optlen); 303 break; 304 default: 305 ret = -ENOPROTOOPT; 306 } 307 out: 308 return ret; 309 } 310 311 static int rds_getsockopt(struct socket *sock, int level, int optname, 312 char __user *optval, int __user *optlen) 313 { 314 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 315 int ret = -ENOPROTOOPT, len; 316 317 if (level != SOL_RDS) 318 goto out; 319 320 if (get_user(len, optlen)) { 321 ret = -EFAULT; 322 goto out; 323 } 324 325 switch (optname) { 326 case RDS_INFO_FIRST ... RDS_INFO_LAST: 327 ret = rds_info_getsockopt(sock, optname, optval, 328 optlen); 329 break; 330 331 case RDS_RECVERR: 332 if (len < sizeof(int)) 333 ret = -EINVAL; 334 else 335 if (put_user(rs->rs_recverr, (int __user *) optval) || 336 put_user(sizeof(int), optlen)) 337 ret = -EFAULT; 338 else 339 ret = 0; 340 break; 341 default: 342 break; 343 } 344 345 out: 346 return ret; 347 348 } 349 350 static int rds_connect(struct socket *sock, struct sockaddr *uaddr, 351 int addr_len, int flags) 352 { 353 struct sock *sk = sock->sk; 354 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 355 struct rds_sock *rs = rds_sk_to_rs(sk); 356 int ret = 0; 357 358 lock_sock(sk); 359 360 if (addr_len != sizeof(struct sockaddr_in)) { 361 ret = -EINVAL; 362 goto out; 363 } 364 365 if (sin->sin_family != AF_INET) { 366 ret = -EAFNOSUPPORT; 367 goto out; 368 } 369 370 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 371 ret = -EDESTADDRREQ; 372 goto out; 373 } 374 375 rs->rs_conn_addr = sin->sin_addr.s_addr; 376 rs->rs_conn_port = sin->sin_port; 377 378 out: 379 release_sock(sk); 380 return ret; 381 } 382 383 static struct proto rds_proto = { 384 .name = "RDS", 385 .owner = THIS_MODULE, 386 .obj_size = sizeof(struct rds_sock), 387 }; 388 389 static const struct proto_ops rds_proto_ops = { 390 .family = AF_RDS, 391 .owner = THIS_MODULE, 392 .release = rds_release, 393 .bind = rds_bind, 394 .connect = rds_connect, 395 .socketpair = sock_no_socketpair, 396 .accept = sock_no_accept, 397 .getname = rds_getname, 398 .poll = rds_poll, 399 .ioctl = rds_ioctl, 400 .listen = sock_no_listen, 401 .shutdown = sock_no_shutdown, 402 .setsockopt = rds_setsockopt, 403 .getsockopt = rds_getsockopt, 404 .sendmsg = rds_sendmsg, 405 .recvmsg = rds_recvmsg, 406 .mmap = sock_no_mmap, 407 .sendpage = sock_no_sendpage, 408 }; 409 410 static int __rds_create(struct socket *sock, struct sock *sk, int protocol) 411 { 412 unsigned long flags; 413 struct rds_sock *rs; 414 415 sock_init_data(sock, sk); 416 sock->ops = &rds_proto_ops; 417 sk->sk_protocol = protocol; 418 419 rs = rds_sk_to_rs(sk); 420 spin_lock_init(&rs->rs_lock); 421 rwlock_init(&rs->rs_recv_lock); 422 INIT_LIST_HEAD(&rs->rs_send_queue); 423 INIT_LIST_HEAD(&rs->rs_recv_queue); 424 INIT_LIST_HEAD(&rs->rs_notify_queue); 425 INIT_LIST_HEAD(&rs->rs_cong_list); 426 spin_lock_init(&rs->rs_rdma_lock); 427 rs->rs_rdma_keys = RB_ROOT; 428 429 spin_lock_irqsave(&rds_sock_lock, flags); 430 list_add_tail(&rs->rs_item, &rds_sock_list); 431 rds_sock_count++; 432 spin_unlock_irqrestore(&rds_sock_lock, flags); 433 434 return 0; 435 } 436 437 static int rds_create(struct net *net, struct socket *sock, int protocol, 438 int kern) 439 { 440 struct sock *sk; 441 442 if (sock->type != SOCK_SEQPACKET || protocol) 443 return -ESOCKTNOSUPPORT; 444 445 sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); 446 if (!sk) 447 return -ENOMEM; 448 449 return __rds_create(sock, sk, protocol); 450 } 451 452 void rds_sock_addref(struct rds_sock *rs) 453 { 454 sock_hold(rds_rs_to_sk(rs)); 455 } 456 457 void rds_sock_put(struct rds_sock *rs) 458 { 459 sock_put(rds_rs_to_sk(rs)); 460 } 461 462 static const struct net_proto_family rds_family_ops = { 463 .family = AF_RDS, 464 .create = rds_create, 465 .owner = THIS_MODULE, 466 }; 467 468 static void rds_sock_inc_info(struct socket *sock, unsigned int len, 469 struct rds_info_iterator *iter, 470 struct rds_info_lengths *lens) 471 { 472 struct rds_sock *rs; 473 struct rds_incoming *inc; 474 unsigned long flags; 475 unsigned int total = 0; 476 477 len /= sizeof(struct rds_info_message); 478 479 spin_lock_irqsave(&rds_sock_lock, flags); 480 481 list_for_each_entry(rs, &rds_sock_list, rs_item) { 482 read_lock(&rs->rs_recv_lock); 483 484 /* XXX too lazy to maintain counts.. */ 485 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 486 total++; 487 if (total <= len) 488 rds_inc_info_copy(inc, iter, inc->i_saddr, 489 rs->rs_bound_addr, 1); 490 } 491 492 read_unlock(&rs->rs_recv_lock); 493 } 494 495 spin_unlock_irqrestore(&rds_sock_lock, flags); 496 497 lens->nr = total; 498 lens->each = sizeof(struct rds_info_message); 499 } 500 501 static void rds_sock_info(struct socket *sock, unsigned int len, 502 struct rds_info_iterator *iter, 503 struct rds_info_lengths *lens) 504 { 505 struct rds_info_socket sinfo; 506 struct rds_sock *rs; 507 unsigned long flags; 508 509 len /= sizeof(struct rds_info_socket); 510 511 spin_lock_irqsave(&rds_sock_lock, flags); 512 513 if (len < rds_sock_count) 514 goto out; 515 516 list_for_each_entry(rs, &rds_sock_list, rs_item) { 517 sinfo.sndbuf = rds_sk_sndbuf(rs); 518 sinfo.rcvbuf = rds_sk_rcvbuf(rs); 519 sinfo.bound_addr = rs->rs_bound_addr; 520 sinfo.connected_addr = rs->rs_conn_addr; 521 sinfo.bound_port = rs->rs_bound_port; 522 sinfo.connected_port = rs->rs_conn_port; 523 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); 524 525 rds_info_copy(iter, &sinfo, sizeof(sinfo)); 526 } 527 528 out: 529 lens->nr = rds_sock_count; 530 lens->each = sizeof(struct rds_info_socket); 531 532 spin_unlock_irqrestore(&rds_sock_lock, flags); 533 } 534 535 static void rds_exit(void) 536 { 537 sock_unregister(rds_family_ops.family); 538 proto_unregister(&rds_proto); 539 rds_conn_exit(); 540 rds_cong_exit(); 541 rds_sysctl_exit(); 542 rds_threads_exit(); 543 rds_stats_exit(); 544 rds_page_exit(); 545 rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); 546 rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); 547 } 548 module_exit(rds_exit); 549 550 static int rds_init(void) 551 { 552 int ret; 553 554 ret = rds_conn_init(); 555 if (ret) 556 goto out; 557 ret = rds_threads_init(); 558 if (ret) 559 goto out_conn; 560 ret = rds_sysctl_init(); 561 if (ret) 562 goto out_threads; 563 ret = rds_stats_init(); 564 if (ret) 565 goto out_sysctl; 566 ret = proto_register(&rds_proto, 1); 567 if (ret) 568 goto out_stats; 569 ret = sock_register(&rds_family_ops); 570 if (ret) 571 goto out_proto; 572 573 rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); 574 rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); 575 576 goto out; 577 578 out_proto: 579 proto_unregister(&rds_proto); 580 out_stats: 581 rds_stats_exit(); 582 out_sysctl: 583 rds_sysctl_exit(); 584 out_threads: 585 rds_threads_exit(); 586 out_conn: 587 rds_conn_exit(); 588 rds_cong_exit(); 589 rds_page_exit(); 590 out: 591 return ret; 592 } 593 module_init(rds_init); 594 595 #define DRV_VERSION "4.0" 596 #define DRV_RELDATE "Feb 12, 2009" 597 598 MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); 599 MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" 600 " v" DRV_VERSION " (" DRV_RELDATE ")"); 601 MODULE_VERSION(DRV_VERSION); 602 MODULE_LICENSE("Dual BSD/GPL"); 603 MODULE_ALIAS_NETPROTO(PF_RDS); 604