/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - non-blocking connect postponed
 *   - IPv6 support postponed
 *   - support for alternate links postponed
 *   - partial support for non-blocking sockets only
 *   - support for urgent data postponed
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *             based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/inetdevice.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}
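
/* Socket reference counting convention in this file (editorial summary of
 * the call sites above and below): every SMC socket holds one extra
 * reference for as long as the peer may still close it passively; that
 * reference is dropped with a sock_put() marked "passive closing".  The
 * last reference is dropped by the "final sock_put" in smc_release() or
 * smc_close_non_accepted().
 */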

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
{
	struct smc_sock *smc;
	struct sock *sk;

	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = SMCPROTO_SMC;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if ((addr->sin_family != AF_INET) &&
	    ((addr->sin_family != AF_UNSPEC) ||
	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control over via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk,
			       SK_FLAGS_SMC_TO_CLC);
}
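
/* Illustration (not part of the build): a setsockopt(SO_KEEPALIVE) issued
 * on the SMC socket sets SOCK_KEEPOPEN on &smc->sk only; the flag reaches
 * the internal TCP socket when smc_connect()/smc_listen() call
 * smc_copy_sock_settings_to_clc() just before the clcsock is used.
 */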

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* determine subnet and mask of internal TCP socket */
int smc_netinfo_by_tcpsk(struct socket *clcsock,
			 __be32 *subnet, u8 *prefix_len)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct in_device *in_dev;
	struct sockaddr_in addr;
	int rc = -ENOENT;
	int len;

	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	/* get address to which the internal TCP socket is bound */
	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
	/* analyze IPv4 specific data of net_device belonging to TCP socket */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dst->dev);
	for_ifa(in_dev) {
		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
			continue;
		*prefix_len = inet_mask_len(ifa->ifa_mask);
		*subnet = ifa->ifa_address & ifa->ifa_mask;
		rc = 0;
		break;
	} endfor_ifa(in_dev);
	rcu_read_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return rc;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

static void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}
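
/* Client-side connection setup, as implemented below (overview):
 *
 *   1. kernel_connect() on the internal clcsock performs the TCP
 *      three-way handshake, carrying the SMC experimental option.
 *   2. A CLC PROPOSAL is sent and a CLC ACCEPT awaited over that TCP
 *      connection (the "inband token exchange").
 *   3. A connection in a new or reused link group is created, buffers
 *      are registered, and a CLC CONFIRM is sent.
 *   4. For a first contact only, CONFIRM LINK is exchanged as LLC
 *      messages over the RoCE fabric (smc_clnt_conf_first_link()).
 *
 * Any failure before step 4 completes declines to TCP fallback where
 * possible (see the decline_rdma labels below).
 */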
/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table lookup: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
					ibport, &aclc.lcl, srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
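	/* At this point local_contact is either SMC_FIRST_CONTACT (a new
	 * link group was created and its single link still needs the
	 * CONFIRM LINK exchange) or SMC_REUSE_CONTACT (an established
	 * link group to this peer is reused and only the new connection
	 * has to be set up).  The smc_create_lgr_pending mutex stays held
	 * until that decision has been fully acted upon.
	 */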
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(
			smc, &smcibdev->gid[ibport - 1]);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET)
		goto out_err;
	smc->addr = addr; /* needed for nonblocking connect */

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}
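
/* SMC capability is negotiated inside the TCP handshake: setting
 * tcp_sk(...)->syn_smc = 1 (above, and in smc_listen()) makes TCP carry
 * the SMC experimental option in SYN/SYN-ACK while the tcp_have_smc
 * static branch is enabled (see smc_init()).  If the peer did not echo
 * the option, syn_smc stays 0 and both sides silently fall back to plain
 * TCP on the clcsock.
 */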
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}
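
/* Server-side overview of the functions above and below: smc_listen()
 * starts smc_tcp_listen_work(), which loops in kernel_accept() on the
 * clcsock.  Each accepted TCP connection gets its own smc_listen_work()
 * instance that runs the CLC handshake; on completion (or fallback) the
 * child is queued via smc_accept_enqueue() and handed to user space
 * through smc_accept()/smc_accept_dequeue().
 */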
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
	}

	return rc;
}
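
/* Server-side handshake, as implemented in smc_listen_work() below
 * (mirror image of the client side): wait for the CLC PROPOSAL, verify
 * that the peer's subnet matches that of the clcsock, create or reuse a
 * link group, send CLC ACCEPT, wait for CLC CONFIRM, and for a first
 * contact drive the CONFIRM LINK exchange from the request side
 * (smc_serv_conf_first_link() above).
 */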
/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	struct sockaddr_in peeraddr;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0, len;
	__be32 subnet;
	u8 prefix_len;
	u8 ibport;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table lookup: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* determine subnet and mask from internal TCP socket */
	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (pclc_prfx->outgoing_subnet != subnet ||
	    pclc_prfx->prefix_len != prefix_len) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* get address of the peer connected to the internal TCP socket */
	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
					smcibdev, ibport, &pclc->lcl, 0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}
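
/* Note on the work split: smc_tcp_listen_work() below may sleep for a
 * long time in kernel_accept(), so the CLC handshake for every accepted
 * connection runs in its own smc_listen_work() instance above; a slow or
 * unresponsive peer thus delays only its own socket, not the listener.
 */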
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first */
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	if (lsmc->clcsock) {
		sock_release(lsmc->clcsock);
		lsmc->clcsock = NULL;
	}
	release_sock(lsk);
	/* no more listening, wake up smc_close_wait_listen_clcsock and
	 * accept
	 */
	lsk->sk_state_change(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}

static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);

out:
	release_sock(sk);
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int *len, int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
}
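
/* The data-path entry points below share one pattern: when
 * smc->use_fallback is set, the call is delegated verbatim to the
 * internal TCP socket via smc->clcsock->ops; otherwise the SMC-specific
 * implementation (smc_tx_sendmsg()/smc_rx_recvmsg()) moves the data
 * through the RMBs over RDMA.
 */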
static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;
	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	else
		rc = smc_rx_recvmsg(smc, msg, len, flags);

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}
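
/* Besides reporting readiness, smc_poll() below is also where a postponed
 * non-blocking connect is finished: while the socket is in SMC_INIT
 * state, polling is delegated to the clcsock, and once that reports
 * EPOLLOUT (TCP connect done), smc_connect_rdma() is invoked using the
 * address saved in smc->addr by smc_connect().
 */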
static __poll_t smc_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;
	int rc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	sock_hold(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		release_sock(sk);
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		/* if non-blocking connect finished ... */
		lock_sock(sk);
		if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
			sk->sk_err = smc->clcsock->sk->sk_err;
			if (sk->sk_err) {
				mask |= EPOLLERR;
			} else {
				rc = smc_connect_rdma(smc);
				if (rc < 0)
					mask |= EPOLLERR;
				/* success cases including fallback */
				mask |= EPOLLOUT | EPOLLWRNORM;
			}
		}
	} else {
		if (sk->sk_state != SMC_CLOSED) {
			release_sock(sk);
			sock_poll_wait(file, sk_sleep(sk), wait);
			lock_sock(sk);
		}
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}
	}
	release_sock(sk);
	sock_put(sk);

	return mask;
}

static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		if (sk->sk_state == SMC_LISTEN)
			rc = smc_close_active(smc);
		else
			rc = 0;
			/* nothing more to do because peer is not involved */
		break;
	}
	rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
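	/* The "+ 1" works because the SHUT_* commands and the sk_shutdown
	 * bits line up: SHUT_RD(0) + 1 == RCV_SHUTDOWN(1),
	 * SHUT_WR(1) + 1 == SEND_SHUTDOWN(2), and
	 * SHUT_RDWR(2) + 1 == 3 == SHUTDOWN_MASK.
	 */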
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback)
		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
	else
		return sock_no_ioctl(sock, cmd, arg);
}

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE)
		goto out;
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	release_sock(sk);
	return rc;
}

static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
		goto out;
	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		rc = -EOPNOTSUPP;
	}
out:
	release_sock(sk);
	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
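
/* From user space an SMC socket is created like a TCP socket; only the
 * address family in socket() differs (illustrative sketch, not part of
 * the build; AF_SMC may need to be defined numerically in user space):
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, 0);
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin)); // AF_INET addr
 *
 * smc_create() below handles the socket() call and also creates the
 * internal TCP socket used for the CLC handshake and fallback.
 */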
static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
			      IPPROTO_TCP, &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;	/* do not touch the released sock */
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_lgr_free(lgr); /* free link group */
	}
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);