/*
 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
#include <linux/ipv6.h>
#include <linux/poll.h>
#include <net/sock.h>

#include "rds.h"

/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
static unsigned long rds_sock_count;
static LIST_HEAD(rds_sock_list);
DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
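
/*
 * For orientation, a minimal userspace sketch of the socket lifecycle this
 * file implements (hypothetical example; PF_RDS and SOCK_SEQPACKET come from
 * <linux/socket.h>/<linux/rds.h>, the address and port are arbitrary):
 *
 *	int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
 *		.sin_port = htons(4000),
 *	};
 *	bind(fd, (struct sockaddr *)&sin, sizeof(sin));	-> rds_bind()
 *	... sendmsg() / recvmsg() datagrams ...
 *	close(fd);					-> rds_release()
 */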

/*
 * This is called as the final descriptor referencing this socket is closed.
 * We have to unbind the socket so that another socket can be bound to the
 * address it was using.
 *
 * We have to be careful about racing with the incoming path.  sock_orphan()
 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
 * messages shouldn't be queued.
 */
static int rds_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs;

	if (!sk)
		goto out;

	rs = rds_sk_to_rs(sk);

	sock_orphan(sk);
	/* Note - rds_clear_recv_queue grabs rs_recv_lock, so
	 * that ensures the recv path has completed messing
	 * with the socket. */
	rds_clear_recv_queue(rs);
	rds_cong_remove_socket(rs);

	rds_remove_bound(rs);

	rds_send_drop_to(rs, NULL);
	rds_rdma_drop_keys(rs);
	rds_notify_queue_get(rs, NULL);
	rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue);

	spin_lock_bh(&rds_sock_lock);
	list_del_init(&rs->rs_item);
	rds_sock_count--;
	spin_unlock_bh(&rds_sock_lock);

	rds_trans_put(rs->rs_transport);

	sock->sk = NULL;
	sock_put(sk);
out:
	return 0;
}

/*
 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
 * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
 * this seems more conservative.
 * NB - normally, one would use sk_callback_lock for this, but we can
 * get here from interrupts, whereas the network code grabs sk_callback_lock
 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
 */
void rds_wake_sk_sleep(struct rds_sock *rs)
{
	unsigned long flags;

	read_lock_irqsave(&rs->rs_recv_lock, flags);
	__rds_wake_sk_sleep(rds_rs_to_sk(rs));
	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
}

static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
		       int peer)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	struct sockaddr_in6 *sin6;
	struct sockaddr_in *sin;
	int uaddr_len;

	/* racy, don't care */
	if (peer) {
		if (ipv6_addr_any(&rs->rs_conn_addr))
			return -ENOTCONN;

		if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
			sin = (struct sockaddr_in *)uaddr;
			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
			sin->sin_family = AF_INET;
			sin->sin_port = rs->rs_conn_port;
			sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
			uaddr_len = sizeof(*sin);
		} else {
			sin6 = (struct sockaddr_in6 *)uaddr;
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = rs->rs_conn_port;
			sin6->sin6_addr = rs->rs_conn_addr;
			sin6->sin6_flowinfo = 0;
			/* scope_id is the same as in the bound address. */
			sin6->sin6_scope_id = rs->rs_bound_scope_id;
			uaddr_len = sizeof(*sin6);
		}
	} else {
		/* If socket is not yet bound and the socket is connected,
		 * set the return address family to be the same as the
		 * connected address, but with 0 address value.  If it is not
		 * connected, set the family to be AF_UNSPEC (value 0) and
		 * the address size to be that of an IPv4 address.
		 */
		if (ipv6_addr_any(&rs->rs_bound_addr)) {
			if (ipv6_addr_any(&rs->rs_conn_addr)) {
				sin = (struct sockaddr_in *)uaddr;
				memset(sin, 0, sizeof(*sin));
				sin->sin_family = AF_UNSPEC;
				return sizeof(*sin);
			}

#if IS_ENABLED(CONFIG_IPV6)
			if (!(ipv6_addr_type(&rs->rs_conn_addr) &
			      IPV6_ADDR_MAPPED)) {
				sin6 = (struct sockaddr_in6 *)uaddr;
				memset(sin6, 0, sizeof(*sin6));
				sin6->sin6_family = AF_INET6;
				return sizeof(*sin6);
			}
#endif

			sin = (struct sockaddr_in *)uaddr;
			memset(sin, 0, sizeof(*sin));
			sin->sin_family = AF_INET;
			return sizeof(*sin);
		}
		if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
			sin = (struct sockaddr_in *)uaddr;
			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
			sin->sin_family = AF_INET;
			sin->sin_port = rs->rs_bound_port;
			sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
			uaddr_len = sizeof(*sin);
		} else {
			sin6 = (struct sockaddr_in6 *)uaddr;
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = rs->rs_bound_port;
			sin6->sin6_addr = rs->rs_bound_addr;
			sin6->sin6_flowinfo = 0;
			sin6->sin6_scope_id = rs->rs_bound_scope_id;
			uaddr_len = sizeof(*sin6);
		}
	}

	return uaddr_len;
}

/*
 * RDS' poll is without a doubt the least intuitive part of the interface,
 * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from
 * a network protocol.
 *
 * EPOLLIN is asserted if
 *  -	there is data on the receive queue,
 *  -	a previously congested destination may have become uncongested,
 *  -	a notification has been queued to the socket (this can be a congestion
 *	update, an RDMA completion, or a MSG_ZEROCOPY completion).
 *
 * EPOLLOUT is asserted if there is room on the send queue. This does not
 * mean, however, that the next sendmsg() call will succeed. If the
 * application tries to send to a congested destination, the system call
 * may still fail (and return ENOBUFS).
 */
static __poll_t rds_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	__poll_t mask = 0;
	unsigned long flags;

	poll_wait(file, sk_sleep(sk), wait);

	if (rs->rs_seen_congestion)
		poll_wait(file, &rds_poll_waitq, wait);

	read_lock_irqsave(&rs->rs_recv_lock, flags);
	if (!rs->rs_cong_monitor) {
		/* When a congestion map was updated, we signal EPOLLIN for
		 * "historical" reasons. Applications can also poll for
		 * WRBAND instead. */
		if (rds_cong_updated_since(&rs->rs_cong_track))
			mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND);
	} else {
		spin_lock(&rs->rs_lock);
		if (rs->rs_cong_notify)
			mask |= (EPOLLIN | EPOLLRDNORM);
		spin_unlock(&rs->rs_lock);
	}
	if (!list_empty(&rs->rs_recv_queue) ||
	    !list_empty(&rs->rs_notify_queue) ||
	    !list_empty(&rs->rs_zcookie_queue.zcookie_head))
		mask |= (EPOLLIN | EPOLLRDNORM);
	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
		mask |= (EPOLLOUT | EPOLLWRNORM);
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;
	read_unlock_irqrestore(&rs->rs_recv_lock, flags);

	/* clear state any time we wake a seen-congested socket */
	if (mask)
		rs->rs_seen_congestion = 0;

	return mask;
}
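
/*
 * A minimal, hypothetical userspace sketch of how the semantics above are
 * typically consumed: EPOLLIN may mean data, a notification, or a
 * congestion-map update, so a recvmsg() after poll() may legitimately find
 * nothing to return, and a sendmsg() after EPOLLOUT may still fail with
 * ENOBUFS if the destination is congested:
 *
 *	struct pollfd pfd = { .fd = rds_fd, .events = POLLIN | POLLOUT };
 *
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLIN)
 *			rc = recvmsg(rds_fd, &msg, MSG_DONTWAIT);
 *		if (pfd.revents & POLLOUT)
 *			rc = sendmsg(rds_fd, &msg, MSG_DONTWAIT);
 *			(rc == -1 with errno == ENOBUFS is still possible)
 *	}
 */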

static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	rds_tos_t utos, tos = 0;

	switch (cmd) {
	case SIOCRDSSETTOS:
		if (get_user(utos, (rds_tos_t __user *)arg))
			return -EFAULT;

		if (rs->rs_transport &&
		    rs->rs_transport->get_tos_map)
			tos = rs->rs_transport->get_tos_map(utos);
		else
			return -ENOIOCTLCMD;

		spin_lock_bh(&rds_sock_lock);
		if (rs->rs_tos || rs->rs_conn) {
			spin_unlock_bh(&rds_sock_lock);
			return -EINVAL;
		}
		rs->rs_tos = tos;
		spin_unlock_bh(&rds_sock_lock);
		break;
	case SIOCRDSGETTOS:
		spin_lock_bh(&rds_sock_lock);
		tos = rs->rs_tos;
		spin_unlock_bh(&rds_sock_lock);
		if (put_user(tos, (rds_tos_t __user *)arg))
			return -EFAULT;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return 0;
}
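
/*
 * Hypothetical userspace sketch for the ioctls above. SIOCRDSSETTOS (from
 * <linux/rds.h>) is only honoured while the socket has neither a TOS nor a
 * connection, per the rs_tos/rs_conn check in rds_ioctl():
 *
 *	rds_tos_t tos = 1;	(example value; the transport maps it)
 *	if (ioctl(rds_fd, SIOCRDSSETTOS, &tos) < 0)
 *		perror("SIOCRDSSETTOS");
 */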

static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
			      int len)
{
	struct sockaddr_in6 sin6;
	struct sockaddr_in sin;
	int ret = 0;

	/* racing with another thread binding seems ok here */
	if (ipv6_addr_any(&rs->rs_bound_addr)) {
		ret = -ENOTCONN; /* XXX not a great errno */
		goto out;
	}

	if (len < sizeof(struct sockaddr_in)) {
		ret = -EINVAL;
		goto out;
	} else if (len < sizeof(struct sockaddr_in6)) {
		/* Assume IPv4 */
		if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
			ret = -EFAULT;
			goto out;
		}
		ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
		sin6.sin6_port = sin.sin_port;
	} else {
		if (copy_from_user(&sin6, optval,
				   sizeof(struct sockaddr_in6))) {
			ret = -EFAULT;
			goto out;
		}
	}

	rds_send_drop_to(rs, &sin6);
out:
	return ret;
}

static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
			       int optlen)
{
	int value;

	if (optlen < sizeof(int))
		return -EINVAL;
	if (get_user(value, (int __user *)optval))
		return -EFAULT;
	*optvar = !!value;
	return 0;
}

static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
			    int optlen)
{
	int ret;

	ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
	if (ret == 0) {
		if (rs->rs_cong_monitor) {
			rds_cong_add_socket(rs);
		} else {
			rds_cong_remove_socket(rs);
			rs->rs_cong_mask = 0;
			rs->rs_cong_notify = 0;
		}
	}
	return ret;
}

static int rds_set_transport(struct rds_sock *rs, char __user *optval,
			     int optlen)
{
	int t_type;

	if (rs->rs_transport)
		return -EOPNOTSUPP; /* previously attached to transport */

	if (optlen != sizeof(int))
		return -EINVAL;

	if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type)))
		return -EFAULT;

	if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
		return -EINVAL;

	rs->rs_transport = rds_trans_get(t_type);

	return rs->rs_transport ? 0 : -ENOPROTOOPT;
}

static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
				 int optlen, int optname)
{
	int val, valbool;

	if (optlen != sizeof(int))
		return -EFAULT;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	if (optname == SO_TIMESTAMP_NEW)
		sock_set_flag(sk, SOCK_TSTAMP_NEW);

	if (valbool)
		sock_set_flag(sk, SOCK_RCVTSTAMP);
	else
		sock_reset_flag(sk, SOCK_RCVTSTAMP);

	return 0;
}

static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
				  int optlen)
{
	struct rds_rx_trace_so trace;
	int i;

	if (optlen != sizeof(struct rds_rx_trace_so))
		return -EFAULT;

	if (copy_from_user(&trace, optval, sizeof(trace)))
		return -EFAULT;

	if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
		return -EFAULT;

	rs->rs_rx_traces = trace.rx_traces;
	for (i = 0; i < rs->rs_rx_traces; i++) {
		if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
			rs->rs_rx_traces = 0;
			return -EFAULT;
		}
		rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
	}

	return 0;
}

static int rds_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	int ret;

	if (level != SOL_RDS) {
		ret = -ENOPROTOOPT;
		goto out;
	}

	switch (optname) {
	case RDS_CANCEL_SENT_TO:
		ret = rds_cancel_sent_to(rs, optval, optlen);
		break;
	case RDS_GET_MR:
		ret = rds_get_mr(rs, optval, optlen);
		break;
	case RDS_GET_MR_FOR_DEST:
		ret = rds_get_mr_for_dest(rs, optval, optlen);
		break;
	case RDS_FREE_MR:
		ret = rds_free_mr(rs, optval, optlen);
		break;
	case RDS_RECVERR:
		ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
		break;
	case RDS_CONG_MONITOR:
		ret = rds_cong_monitor(rs, optval, optlen);
		break;
	case SO_RDS_TRANSPORT:
		lock_sock(sock->sk);
		ret = rds_set_transport(rs, optval, optlen);
		release_sock(sock->sk);
		break;
	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
		lock_sock(sock->sk);
		ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname);
		release_sock(sock->sk);
		break;
	case SO_RDS_MSG_RXPATH_LATENCY:
		ret = rds_recv_track_latency(rs, optval, optlen);
		break;
	default:
		ret = -ENOPROTOOPT;
	}

out:
	return ret;
}
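
/*
 * Hypothetical userspace sketch for SO_RDS_TRANSPORT (a SOL_RDS option,
 * constants from <linux/rds.h>). It can only be set once, before a transport
 * is otherwise attached; a second attempt fails with EOPNOTSUPP in
 * rds_set_transport():
 *
 *	int t = RDS_TRANS_TCP;
 *	if (setsockopt(rds_fd, SOL_RDS, SO_RDS_TRANSPORT, &t, sizeof(t)) < 0)
 *		perror("SO_RDS_TRANSPORT");
 */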

static int rds_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	int ret = -ENOPROTOOPT, len;
	int trans;

	if (level != SOL_RDS)
		goto out;

	if (get_user(len, optlen)) {
		ret = -EFAULT;
		goto out;
	}

	switch (optname) {
	case RDS_INFO_FIRST ... RDS_INFO_LAST:
		ret = rds_info_getsockopt(sock, optname, optval, optlen);
		break;

	case RDS_RECVERR:
		if (len < sizeof(int))
			ret = -EINVAL;
		else if (put_user(rs->rs_recverr, (int __user *)optval) ||
			 put_user(sizeof(int), optlen))
			ret = -EFAULT;
		else
			ret = 0;
		break;
	case SO_RDS_TRANSPORT:
		if (len < sizeof(int)) {
			ret = -EINVAL;
			break;
		}
		trans = (rs->rs_transport ? rs->rs_transport->t_type :
			 RDS_TRANS_NONE); /* unbound */
		if (put_user(trans, (int __user *)optval) ||
		    put_user(sizeof(int), optlen))
			ret = -EFAULT;
		else
			ret = 0;
		break;
	default:
		break;
	}

out:
	return ret;
}

static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
		       int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	struct sockaddr_in *sin;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	int ret = 0;

	if (addr_len < offsetofend(struct sockaddr, sa_family))
		return -EINVAL;

	lock_sock(sk);

	switch (uaddr->sa_family) {
	case AF_INET:
		sin = (struct sockaddr_in *)uaddr;
		if (addr_len < sizeof(struct sockaddr_in)) {
			ret = -EINVAL;
			break;
		}
		if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
			ret = -EDESTADDRREQ;
			break;
		}
		if (ipv4_is_multicast(sin->sin_addr.s_addr) ||
		    sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
			ret = -EINVAL;
			break;
		}
		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
		rs->rs_conn_port = sin->sin_port;
		break;

#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6: {
		struct sockaddr_in6 *sin6;
		int addr_type;

		sin6 = (struct sockaddr_in6 *)uaddr;
		if (addr_len < sizeof(struct sockaddr_in6)) {
			ret = -EINVAL;
			break;
		}
		addr_type = ipv6_addr_type(&sin6->sin6_addr);
		if (!(addr_type & IPV6_ADDR_UNICAST)) {
			__be32 addr4;

			if (!(addr_type & IPV6_ADDR_MAPPED)) {
				ret = -EPROTOTYPE;
				break;
			}

			/* It is a mapped address.  Need to do some sanity
			 * checks.
			 */
			addr4 = sin6->sin6_addr.s6_addr32[3];
			if (addr4 == htonl(INADDR_ANY) ||
			    addr4 == htonl(INADDR_BROADCAST) ||
			    ipv4_is_multicast(addr4)) {
				ret = -EPROTOTYPE;
				break;
			}
		}

		if (addr_type & IPV6_ADDR_LINKLOCAL) {
			/* If socket is already bound to a link local address,
			 * the peer address must be on the same link.
			 */
			if (sin6->sin6_scope_id == 0 ||
			    (!ipv6_addr_any(&rs->rs_bound_addr) &&
			     rs->rs_bound_scope_id &&
			     sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
				ret = -EINVAL;
				break;
			}
			/* Remember the connected address scope ID.  It will
			 * be checked against the binding local address when
			 * the socket is bound.
			 */
			rs->rs_bound_scope_id = sin6->sin6_scope_id;
		}
		rs->rs_conn_addr = sin6->sin6_addr;
		rs->rs_conn_port = sin6->sin6_port;
		break;
	}
#endif

	default:
		ret = -EAFNOSUPPORT;
		break;
	}

	release_sock(sk);
	return ret;
}
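
/*
 * Hypothetical userspace sketch of the connect path above. An AF_INET peer
 * address is stored internally as an IPv4-mapped IPv6 address, so the v4 and
 * v6 cases converge on rs_conn_addr (address and port are example values):
 *
 *	struct sockaddr_in peer = {
 *		.sin_family = AF_INET,
 *		.sin_addr.s_addr = inet_addr("192.0.2.1"),
 *		.sin_port = htons(4000),
 *	};
 *	connect(rds_fd, (struct sockaddr *)&peer, sizeof(peer));
 */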

static struct proto rds_proto = {
	.name	  = "RDS",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct rds_sock),
};

static const struct proto_ops rds_proto_ops = {
	.family =	AF_RDS,
	.owner =	THIS_MODULE,
	.release =	rds_release,
	.bind =		rds_bind,
	.connect =	rds_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	rds_getname,
	.poll =		rds_poll,
	.ioctl =	rds_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	rds_setsockopt,
	.getsockopt =	rds_getsockopt,
	.sendmsg =	rds_sendmsg,
	.recvmsg =	rds_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static void rds_sock_destruct(struct sock *sk)
{
	struct rds_sock *rs = rds_sk_to_rs(sk);

	WARN_ON((&rs->rs_item != rs->rs_item.next ||
		 &rs->rs_item != rs->rs_item.prev));
}

static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
	struct rds_sock *rs;

	sock_init_data(sock, sk);
	sock->ops = &rds_proto_ops;
	sk->sk_protocol = protocol;
	sk->sk_destruct = rds_sock_destruct;

	rs = rds_sk_to_rs(sk);
	spin_lock_init(&rs->rs_lock);
	rwlock_init(&rs->rs_recv_lock);
	INIT_LIST_HEAD(&rs->rs_send_queue);
	INIT_LIST_HEAD(&rs->rs_recv_queue);
	INIT_LIST_HEAD(&rs->rs_notify_queue);
	INIT_LIST_HEAD(&rs->rs_cong_list);
	rds_message_zcopy_queue_init(&rs->rs_zcookie_queue);
	spin_lock_init(&rs->rs_rdma_lock);
	rs->rs_rdma_keys = RB_ROOT;
	rs->rs_rx_traces = 0;
	rs->rs_tos = 0;
	rs->rs_conn = NULL;

	spin_lock_bh(&rds_sock_lock);
	list_add_tail(&rs->rs_item, &rds_sock_list);
	rds_sock_count++;
	spin_unlock_bh(&rds_sock_lock);

	return 0;
}

static int rds_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;

	if (sock->type != SOCK_SEQPACKET || protocol)
		return -ESOCKTNOSUPPORT;

	sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern);
	if (!sk)
		return -ENOMEM;

	return __rds_create(sock, sk, protocol);
}

void rds_sock_addref(struct rds_sock *rs)
{
	sock_hold(rds_rs_to_sk(rs));
}

void rds_sock_put(struct rds_sock *rs)
{
	sock_put(rds_rs_to_sk(rs));
}

static const struct net_proto_family rds_family_ops = {
	.family =	AF_RDS,
	.create =	rds_create,
	.owner	=	THIS_MODULE,
};
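
/*
 * The functions below back the RDS_INFO_* and RDS6_INFO_* introspection
 * getsockopts (used, for example, by the rds-info tool from rds-tools).
 * They walk rds_sock_list under rds_sock_lock and copy fixed-size records
 * into the caller's buffer through the rds_info iterator.
 */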

static void rds_sock_inc_info(struct socket *sock, unsigned int len,
			      struct rds_info_iterator *iter,
			      struct rds_info_lengths *lens)
{
	struct rds_sock *rs;
	struct rds_incoming *inc;
	unsigned int total = 0;

	len /= sizeof(struct rds_info_message);

	spin_lock_bh(&rds_sock_lock);

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		/* This option only supports IPv4 sockets. */
		if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
			continue;

		read_lock(&rs->rs_recv_lock);

		/* XXX too lazy to maintain counts.. */
		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
			total++;
			if (total <= len)
				rds_inc_info_copy(inc, iter,
						  inc->i_saddr.s6_addr32[3],
						  rs->rs_bound_addr_v4,
						  1);
		}

		read_unlock(&rs->rs_recv_lock);
	}

	spin_unlock_bh(&rds_sock_lock);

	lens->nr = total;
	lens->each = sizeof(struct rds_info_message);
}

#if IS_ENABLED(CONFIG_IPV6)
static void rds6_sock_inc_info(struct socket *sock, unsigned int len,
			       struct rds_info_iterator *iter,
			       struct rds_info_lengths *lens)
{
	struct rds_incoming *inc;
	unsigned int total = 0;
	struct rds_sock *rs;

	len /= sizeof(struct rds6_info_message);

	spin_lock_bh(&rds_sock_lock);

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		read_lock(&rs->rs_recv_lock);

		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
			total++;
			if (total <= len)
				rds6_inc_info_copy(inc, iter, &inc->i_saddr,
						   &rs->rs_bound_addr, 1);
		}

		read_unlock(&rs->rs_recv_lock);
	}

	spin_unlock_bh(&rds_sock_lock);

	lens->nr = total;
	lens->each = sizeof(struct rds6_info_message);
}
#endif

static void rds_sock_info(struct socket *sock, unsigned int len,
			  struct rds_info_iterator *iter,
			  struct rds_info_lengths *lens)
{
	struct rds_info_socket sinfo;
	unsigned int cnt = 0;
	struct rds_sock *rs;

	len /= sizeof(struct rds_info_socket);

	spin_lock_bh(&rds_sock_lock);

	if (len < rds_sock_count) {
		cnt = rds_sock_count;
		goto out;
	}

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		/* This option only supports IPv4 sockets. */
		if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
			continue;
		sinfo.sndbuf = rds_sk_sndbuf(rs);
		sinfo.rcvbuf = rds_sk_rcvbuf(rs);
		sinfo.bound_addr = rs->rs_bound_addr_v4;
		sinfo.connected_addr = rs->rs_conn_addr_v4;
		sinfo.bound_port = rs->rs_bound_port;
		sinfo.connected_port = rs->rs_conn_port;
		sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));

		rds_info_copy(iter, &sinfo, sizeof(sinfo));
		cnt++;
	}

out:
	lens->nr = cnt;
	lens->each = sizeof(struct rds_info_socket);

	spin_unlock_bh(&rds_sock_lock);
}

#if IS_ENABLED(CONFIG_IPV6)
static void rds6_sock_info(struct socket *sock, unsigned int len,
			   struct rds_info_iterator *iter,
			   struct rds_info_lengths *lens)
{
	struct rds6_info_socket sinfo6;
	struct rds_sock *rs;

	len /= sizeof(struct rds6_info_socket);

	spin_lock_bh(&rds_sock_lock);

	if (len < rds_sock_count)
		goto out;

	list_for_each_entry(rs, &rds_sock_list, rs_item) {
		sinfo6.sndbuf = rds_sk_sndbuf(rs);
		sinfo6.rcvbuf = rds_sk_rcvbuf(rs);
		sinfo6.bound_addr = rs->rs_bound_addr;
		sinfo6.connected_addr = rs->rs_conn_addr;
		sinfo6.bound_port = rs->rs_bound_port;
		sinfo6.connected_port = rs->rs_conn_port;
		sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs));

		rds_info_copy(iter, &sinfo6, sizeof(sinfo6));
	}

out:
	lens->nr = rds_sock_count;
	lens->each = sizeof(struct rds6_info_socket);

	spin_unlock_bh(&rds_sock_lock);
}
#endif
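
/*
 * Module teardown: roughly the reverse of rds_init() below, with the socket
 * family unregistered first so that no new RDS sockets can be created while
 * the remaining state is being torn down.
 */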

static void rds_exit(void)
{
	sock_unregister(rds_family_ops.family);
	proto_unregister(&rds_proto);
	rds_conn_exit();
	rds_cong_exit();
	rds_sysctl_exit();
	rds_threads_exit();
	rds_stats_exit();
	rds_page_exit();
	rds_bind_lock_destroy();
	rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
	rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
#if IS_ENABLED(CONFIG_IPV6)
	rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info);
	rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
#endif
}
module_exit(rds_exit);

u32 rds_gen_num;

static int rds_init(void)
{
	int ret;

	net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));

	ret = rds_bind_lock_init();
	if (ret)
		goto out;

	ret = rds_conn_init();
	if (ret)
		goto out_bind;

	ret = rds_threads_init();
	if (ret)
		goto out_conn;
	ret = rds_sysctl_init();
	if (ret)
		goto out_threads;
	ret = rds_stats_init();
	if (ret)
		goto out_sysctl;
	ret = proto_register(&rds_proto, 1);
	if (ret)
		goto out_stats;
	ret = sock_register(&rds_family_ops);
	if (ret)
		goto out_proto;

	rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
	rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
#if IS_ENABLED(CONFIG_IPV6)
	rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info);
	rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
#endif

	goto out;

out_proto:
	proto_unregister(&rds_proto);
out_stats:
	rds_stats_exit();
out_sysctl:
	rds_sysctl_exit();
out_threads:
	rds_threads_exit();
out_conn:
	rds_conn_exit();
	rds_cong_exit();
	rds_page_exit();
out_bind:
	rds_bind_lock_destroy();
out:
	return ret;
}
module_init(rds_init);

#define DRV_VERSION	"4.0"
#define DRV_RELDATE	"Feb 12, 2009"

MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
		   " v" DRV_VERSION " (" DRV_RELDATE ")");
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NETPROTO(PF_RDS);