/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - non-blocking connect postponed
 *   - IPv6 support postponed
 *   - support for alternate links postponed
 *   - partial support for non-blocking sockets only
 *   - support for urgent data postponed
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *             based on prototype from Frank Blaschka
 */
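
/*
 * Illustrative userspace usage (not part of this module): an AF_SMC socket
 * is created and driven exactly like a TCP socket; only the address family
 * passed to socket() differs, since AF_SMC keeps the AF_INET address
 * format. A minimal sketch, assuming AF_SMC (protocol family 43) is known
 * to the userspace headers; the address and port are made up:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, 0);
 *	struct sockaddr_in sa = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(7777),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
 *	connect(fd, (struct sockaddr *)&sa, sizeof(sa));
 *
 * If either side or the configuration cannot do SMC-R, the connection
 * transparently falls back to plain TCP (see use_fallback below).
 */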

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/inetdevice.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	sock_hold(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (smc->use_fallback) {
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	} else {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (smc->use_fallback) {
		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
	} else if (sk->sk_state == SMC_CLOSED) {
		smc_conn_free(&smc->conn);
		schedule_delayed_work(&smc->sock_put_work,
				      SMC_CLOSE_SOCK_PUT_DELAY);
	}
	release_sock(sk);

	sock_put(sk);
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
{
	struct smc_sock *smc;
	struct sock *sk;

	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = SMCPROTO_SMC;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if ((addr->sin_family != AF_INET) &&
	    ((addr->sin_family != AF_UNSPEC) ||
	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control of via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* determine subnet and mask of internal TCP socket */
int smc_netinfo_by_tcpsk(struct socket *clcsock,
			 __be32 *subnet, u8 *prefix_len)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct in_device *in_dev;
	struct sockaddr_in addr;
	int rc = -ENOENT;
	int len;

	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	/* get address to which the internal TCP socket is bound */
	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
	/* analyze IPv4 specific data of net_device belonging to TCP socket */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dst->dev);
	for_ifa(in_dev) {
		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
			continue;
		*prefix_len = inet_mask_len(ifa->ifa_mask);
		*subnet = ifa->ifa_address & ifa->ifa_mask;
		rc = 0;
		break;
	} endfor_ifa(in_dev);
	rcu_read_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}
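
/*
 * Worked example for smc_netinfo_by_tcpsk() (the addresses are made up):
 * if the internal TCP socket is bound to 192.168.1.5 and the matching ifa
 * on the route's device carries netmask 255.255.255.0, the outputs are
 *
 *	*prefix_len = inet_mask_len(255.255.255.0)  = 24
 *	*subnet     = 192.168.1.5 & 255.255.255.0   = 192.168.1.0
 *
 * The listen side compares this pair against the values carried in the
 * client's CLC proposal and declines SMC on mismatch (smc_listen_work()).
 */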

static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return rc;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
					ibport, &aclc.lcl, srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(
			smc, &smcibdev->gid[ibport - 1]);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < sizeof(struct smc_clc_msg_decline))
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	return rc;
}
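
/*
 * Sketch of the client-side handshake implemented by smc_connect_rdma()
 * above (message formats live in smc_clc.c and smc_llc.c):
 *
 *	client                                   server
 *	   |------ TCP SYN + syn_smc option ------>|
 *	   |<----- TCP SYN/ACK + syn_smc ----------|
 *	   |------ CLC PROPOSAL ------------------>|
 *	   |<----- CLC ACCEPT ---------------------|
 *	   |------ CLC CONFIRM ------------------->|
 *	   |<----- LLC CONFIRM LINK (first contact)|
 *	   |------ LLC CONFIRM LINK response ----->|
 *
 * A failure at any step sends a CLC DECLINE (unless the peer declined
 * first) and falls back to TCP via out_connected.
 */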

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET)
		goto out_err;
	smc->addr = addr;	/* needed for nonblocking connect */

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}

static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct sock *sk = &lsmc->sk;
	struct socket *new_clcsock;
	struct sock *new_sk;
	int rc;

	release_sock(&lsmc->sk);
	new_sk = smc_sock_alloc(sock_net(sk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsmc->sk.sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(&lsmc->sk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(&lsmc->sk);
	if (rc < 0) {
		lsmc->sk.sk_err = -rc;
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sk->sk_prot->unhash(new_sk);
		sock_put(new_sk);
		*new_smc = NULL;
		goto out;
	}
	if (lsmc->sk.sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sk->sk_prot->unhash(new_sk);
		sock_put(new_sk);
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk);
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk);
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk);
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	sock_hold(sk);
	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (smc->use_fallback) {
		sk->sk_state = SMC_CLOSED;
	} else {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
	} else if (sk->sk_state == SMC_CLOSED) {
		smc_conn_free(&smc->conn);
		schedule_delayed_work(&smc->sock_put_work,
				      SMC_CLOSE_SOCK_PUT_DELAY);
	}
	release_sock(sk);
	sock_put(sk);
}

static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
	}

	return rc;
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal pclc;
	struct smc_ib_device *smcibdev;
	struct sockaddr_in peeraddr;
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0, len;
	__be32 subnet;
	u8 prefix_len;
	u8 ibport;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* determine subnet and mask from internal TCP socket */
	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}
	if ((pclc.outgoing_subnet != subnet) ||
	    (pclc.prefix_len != prefix_len)) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* get address of the peer connected to the internal TCP socket */
	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
					smcibdev, ibport, &pclc.lcl, 0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0) {
			/* peer is not aware of a problem */
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(new_smc, reason_code);
		if (rc < sizeof(struct smc_clc_msg_decline))
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}
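
/*
 * Sketch of the server-side flow of smc_listen_work() above, run once per
 * incoming connection:
 *
 *	1. check the peer's syn_smc flag; fall back to TCP if unset
 *	2. wait for the CLC PROPOSAL; verify IPsec, PNET table entry and
 *	   the proposed subnet/prefix against smc_netinfo_by_tcpsk()
 *	3. create or reuse a link group; allocate send buffer and RMB
 *	4. send CLC ACCEPT and wait for the CLC CONFIRM
 *	5. on first contact: bring the QP to ready state and exchange
 *	   LLC CONFIRM LINK over the RoCE fabric
 *	6. enqueue the new sock on the listen sock's accept queue
 *
 * Decline paths still deliver a sock (fallback or closed, with sk_err set)
 * to the accept queue, so user space always sees the accept complete.
 */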

static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(&lsmc->sk);
	while (lsmc->sk.sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first */
		sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		schedule_work(&new_smc->smc_listen_work);
	}

out:
	release_sock(&lsmc->sk);
	lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	schedule_work(&smc->tcp_listen_work);

out:
	release_sock(sk);
	return rc;
}

static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);

out:
	release_sock(sk);
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int *len, int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
}

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;
	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	else
		rc = smc_rx_recvmsg(smc, msg, len, flags);

out:
	release_sock(sk);
	return rc;
}

static unsigned int smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk;
	struct sock *sk;

	lock_sock(parent);
	list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
		sk = (struct sock *)isk;

		if (sk->sk_state == SMC_ACTIVE) {
			release_sock(parent);
			return POLLIN | POLLRDNORM;
		}
	}
	release_sock(parent);

	return 0;
}

static unsigned int smc_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask = 0;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sock->sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		/* if non-blocking connect finished ... */
		lock_sock(sk);
		if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
			sk->sk_err = smc->clcsock->sk->sk_err;
			if (sk->sk_err) {
				mask |= POLLERR;
			} else {
				rc = smc_connect_rdma(smc);
				if (rc < 0)
					mask |= POLLERR;
				else
					/* success cases including fallback */
					mask |= POLLOUT | POLLWRNORM;
			}
		}
		release_sock(sk);
	} else {
		sock_poll_wait(file, sk_sleep(sk), wait);
		if (sk->sk_state == SMC_LISTEN)
			/* woken up by sk_data_ready in smc_listen_work() */
			mask |= smc_accept_poll(sk);
		if (sk->sk_err)
			mask |= POLLERR;
		if (atomic_read(&smc->conn.sndbuf_space) ||
		    (sk->sk_shutdown & SEND_SHUTDOWN)) {
			mask |= POLLOUT | POLLWRNORM;
		} else {
			sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		}
		if (atomic_read(&smc->conn.bytes_to_rcv))
			mask |= POLLIN | POLLRDNORM;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= POLLHUP;
		if (sk->sk_shutdown & RCV_SHUTDOWN)
			mask |= POLLIN | POLLRDNORM | POLLRDHUP;
		if (sk->sk_state == SMC_APPCLOSEWAIT1)
			mask |= POLLIN;
	}

	return mask;
}

static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		if (sk->sk_state == SMC_LISTEN)
			rc = smc_close_active(smc);
		else
			rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}
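
/*
 * The "how + 1" mapping in smc_shutdown() above relies on the numeric
 * values of the shutdown constants:
 *
 *	SHUT_RD   (0) + 1 == RCV_SHUTDOWN  (1)
 *	SHUT_WR   (1) + 1 == SEND_SHUTDOWN (2)
 *	SHUT_RDWR (2) + 1 == SHUTDOWN_MASK (3)
 *
 * sk_shutdown is a bit mask of RCV_SHUTDOWN and SEND_SHUTDOWN, so ORing
 * in (how + 1) sets exactly the direction(s) being shut down.
 */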

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback)
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	else
		return sock_no_ioctl(sock, cmd, arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE)
		goto out;
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	release_sock(sk);
	return rc;
}

static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
		goto out;
	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		rc = -EOPNOTSUPP;
	}
out:
	release_sock(sk);
	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};

static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
			      IPPROTO_TCP, &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;	/* do not touch the released smc sock */
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}
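
/*
 * Design note on smc_create() above: every AF_SMC socket is paired with an
 * internal kernel TCP socket (smc->clcsock) that carries the CLC handshake
 * and, in fallback mode, all payload traffic. bind(), listen(), connect()
 * and the socket options are forwarded to it through the kernel_*()
 * helpers, so the pairing stays invisible to user space. The
 * sk_sndbuf/sk_rcvbuf values start from the TCP defaults but are raised to
 * at least SMC_BUF_MIN_SIZE so that later RMB and send buffer sizing
 * starts from a sane minimum.
 */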

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_lgr_free(lgr); /* free link group */
	}
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);