/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Initial restrictions:
 *   - support for alternate links postponed
 *
 * Copyright IBM Corp. 2016, 2018
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *             based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "smc_netns.h"

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_ism.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
						 * creation on server
						 */
static DEFINE_MUTEX(smc_client_lgr_pending);	/* serialize link group
						 * creation on client
						 */

static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);
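
/*
 * Illustrative userspace usage (not part of this file): an AF_SMC socket
 * is created like a TCP socket; the protocol constant selects whether the
 * internal CLC/TCP socket is AF_INET or AF_INET6 (see smc_create() below):
 *
 *	fd  = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);	// IPv4 clcsock
 *	fd6 = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC6);	// IPv6 clcsock
 *
 * All subsequent bind()/connect() calls use plain AF_INET(6) sockaddrs,
 * which this file forwards to the internal TCP socket.
 */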

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);

	/* cleanup for a dangling non-blocking connect */
	if (smc->connect_info && sk->sk_state == SMC_INIT)
		tcp_abort(smc->clcsock->sk, ECONNABORTED);
	flush_work(&smc->connect_work);
	kfree(smc->connect_info);
	smc->connect_info = NULL;

	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	} else {
		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
			sock_put(sk); /* passive closing */
		if (sk->sk_state == SMC_LISTEN) {
			/* wake up clcsock accept */
			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
		}
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	sk->sk_prot->unhash(sk);

	if (sk->sk_state == SMC_CLOSED) {
		if (smc->clcsock) {
			release_sock(sk);
			smc_clcsock_release(smc);
			lock_sock(sk);
		}
		if (!smc->use_fallback)
			smc_conn_free(&smc->conn);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	release_sock(sk);

	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_WORK(&smc->connect_work, smc_connect_work);
	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	spin_lock_init(&smc->conn.send_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);
	mutex_init(&smc->clcsock_release_lock);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control of via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED) | \
			     (1UL << SOCK_TSTAMP_NEW))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* register a new rmb, send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
		       bool conf_rkey)
{
	if (!rmb_desc->wr_reg) {
		/* register memory region for new rmb */
		if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
			rmb_desc->regerr = 1;
			return -EFAULT;
		}
		rmb_desc->wr_reg = 1;
	}
	if (!conf_rkey)
		return 0;
	/* exchange confirm_rkey msg with peer */
	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	return 0;
}

static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_ERR_RDYLNK;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_ERR_REGRMB;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_CL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   link->gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_AL;

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}
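
/*
 * Summary of the first-contact LLC exchange implemented by
 * smc_clnt_conf_first_link() above and smc_serv_conf_first_link() below,
 * as sketched from the code paths in this file:
 *
 *	server				client
 *	CONFIRM LINK request	---->
 *				<----	CONFIRM LINK response
 *	ADD LINK request	---->
 *				<----	ADD LINK response (a reject, since
 *					only a single link is supported)
 *
 * Each wait is bounded by SMC_LLC_WAIT_FIRST_TIME/SMC_LLC_WAIT_TIME, and a
 * timeout maps to the SMC_CLC_DECL_TIMEOUT_CL/_AL decline reasons.
 */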

static void smcr_conn_save_peer_info(struct smc_sock *smc,
				     struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);

	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = bufsize;
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}

static void smcd_conn_save_peer_info(struct smc_sock *smc,
				     struct smc_clc_msg_accept_confirm *clc)
{
	int bufsize = smc_uncompress_bufsize(clc->dmbe_size);

	smc->conn.peer_rmbe_idx = clc->dmbe_idx;
	smc->conn.peer_token = clc->token;
	/* msg header takes up space in the buffer */
	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	if (smc->conn.lgr->is_smcd)
		smcd_conn_save_peer_info(smc, clc);
	else
		smcr_conn_save_peer_info(smc, clc);
}
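
/*
 * Worked example for the tx_off computation above (illustrative numbers):
 * with a decompressed buffer size of 64KB, an SMC-R peer announcing
 * rmbe_idx 3 yields tx_off = 64KB * (3 - 1) = 128KB into the peer's RMB
 * area, consistent with a 1-based index in the SMC-R case; SMC-D uses the
 * 0-based dmbe_idx directly (tx_off = bufsize * idx) and additionally
 * reserves sizeof(struct smcd_cdc_msg) of the buffer for the msg header.
 */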

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

static void smc_switch_to_fallback(struct smc_sock *smc)
{
	smc->use_fallback = true;
	if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
		smc->clcsock->file = smc->sk.sk_socket->file;
		smc->clcsock->file->private_data = smc->clcsock;
	}
}

/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
	smc_switch_to_fallback(smc);
	smc->fallback_rsn = reason_code;
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;
	return 0;
}

/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
	int rc;

	if (reason_code < 0) { /* error, fallback is not possible */
		if (smc->sk.sk_state == SMC_INIT)
			sock_put(&smc->sk); /* passive closing */
		return reason_code;
	}
	if (reason_code != SMC_CLC_DECL_PEERDECL) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0) {
			if (smc->sk.sk_state == SMC_INIT)
				sock_put(&smc->sk); /* passive closing */
			return rc;
		}
	}
	return smc_connect_fallback(smc, reason_code);
}

/* abort connecting */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
			     int local_contact)
{
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	if (smc->conn.lgr->is_smcd)
		/* there is only one lgr role for SMC-D; use server lock */
		mutex_unlock(&smc_server_lgr_pending);
	else
		mutex_unlock(&smc_client_lgr_pending);

	smc_conn_free(&smc->conn);
	return reason_code;
}

/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
			  u8 *ibport, unsigned short vlan_id, u8 gid[])
{
	int reason_code = 0;

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
				    gid);
	if (!(*ibdev))
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */

	return reason_code;
}

/* check if there is an ISM device available for this connection. */
/* called for connect and listen */
static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
{
	/* Find ISM device with same PNETID as connecting interface */
	smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
	if (!(*ismdev))
		return SMC_CLC_DECL_CNFERR; /* configuration error */
	return 0;
}

/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
				      struct smcd_dev *ismdev,
				      unsigned short vlan_id)
{
	if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
		return SMC_CLC_DECL_CNFERR;
	return 0;
}

/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
 * used, the VLAN ID will be registered again during the connection setup.
 */
static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
					struct smcd_dev *ismdev,
					unsigned short vlan_id)
{
	if (!is_smcd)
		return 0;
	if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
		return SMC_CLC_DECL_CNFERR;
	return 0;
}
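
/*
 * Overview of the CLC handshake carried in-band over the TCP connection,
 * as driven by smc_connect_clc() below and smc_listen_work() further down:
 *
 *	client				server
 *	SMC PROPOSAL	---->
 *			<----		SMC ACCEPT
 *	SMC CONFIRM	---->
 *
 * Either side may answer with an SMC DECLINE instead; a decline (or a CLC
 * wait timeout) makes the connection fall back to plain TCP.
 */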

/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc, int smc_type,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smc_ib_device *ibdev, u8 ibport,
			   u8 gid[], struct smcd_dev *ismdev)
{
	int rc = 0;

	/* do inband token exchange */
	rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
	if (rc)
		return rc;
	/* receive SMC Accept CLC message */
	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
				CLC_WAIT_TIME);
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
			    struct smc_clc_msg_accept_confirm *aclc,
			    struct smc_ib_device *ibdev, u8 ibport)
{
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link *link;
	int reason_code = 0;

	mutex_lock(&smc_client_lgr_pending);
	local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
					ibport, ntoh24(aclc->qpn), &aclc->lcl,
					NULL, 0);
	if (local_contact < 0) {
		if (local_contact == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (local_contact == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		else
			reason_code = SMC_CLC_DECL_INTERR; /* other error */
		mutex_unlock(&smc_client_lgr_pending);
		return reason_code;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, aclc);

	/* create send buffer and rmb */
	if (smc_buf_create(smc, false))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, aclc);

	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
		return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
					 local_contact);

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link))
			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
						 local_contact);
	} else {
		if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
						 local_contact);
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	reason_code = smc_clc_send_confirm(smc);
	if (reason_code)
		return smc_connect_abort(smc, reason_code, local_contact);

	smc_tx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code)
			return smc_connect_abort(smc, reason_code,
						 local_contact);
	}
	mutex_unlock(&smc_client_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}

/* setup for ISM connection of client */
static int smc_connect_ism(struct smc_sock *smc,
			   struct smc_clc_msg_accept_confirm *aclc,
			   struct smcd_dev *ismdev)
{
	int local_contact = SMC_FIRST_CONTACT;
	int rc = 0;

	/* there is only one lgr role for SMC-D; use server lock */
	mutex_lock(&smc_server_lgr_pending);
	local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0,
					NULL, ismdev, aclc->gid);
	if (local_contact < 0) {
		mutex_unlock(&smc_server_lgr_pending);
		return SMC_CLC_DECL_MEM;
	}

	/* Create send and receive buffers */
	if (smc_buf_create(smc, true))
		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

	smc_conn_save_peer_info(smc, aclc);
	smc_close_init(smc);
	smc_rx_init(smc);
	smc_tx_init(smc);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		return smc_connect_abort(smc, rc, local_contact);
	mutex_unlock(&smc_server_lgr_pending);

	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return 0;
}

/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
	bool ism_supported = false, rdma_supported = false;
	struct smc_clc_msg_accept_confirm aclc;
	struct smc_ib_device *ibdev;
	struct smcd_dev *ismdev;
	u8 gid[SMC_GID_SIZE];
	unsigned short vlan;
	int smc_type;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (smc->use_fallback)
		return smc_connect_fallback(smc, smc->fallback_rsn);

	/* if peer has not signalled SMC-capability, fall back */
	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);

	/* check for VLAN ID */
	if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);

	/* check if there is an ism device available */
	if (!smc_check_ism(smc, &ismdev) &&
	    !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
		/* ISM is supported for this connection */
		ism_supported = true;
		smc_type = SMC_TYPE_D;
	}

	/* check if there is a rdma device available */
	if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
		/* RDMA is supported for this connection */
		rdma_supported = true;
		if (ism_supported)
			smc_type = SMC_TYPE_B; /* both */
		else
			smc_type = SMC_TYPE_R; /* only RDMA */
	}

	/* if neither ISM nor RDMA are supported, fallback */
	if (!rdma_supported && !ism_supported)
		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);

	/* perform CLC handshake */
	rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
	if (rc) {
		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
		return smc_connect_decline_fallback(smc, rc);
	}

	/* depending on previous steps, connect using rdma or ism */
	if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
		rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
		rc = smc_connect_ism(smc, &aclc, ismdev);
	else
		rc = SMC_CLC_DECL_MODEUNSUPP;
	if (rc) {
		smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
		return smc_connect_decline_fallback(smc, rc);
	}

	smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
	return 0;
}
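
/*
 * Decision tree of __smc_connect() at a glance: any of the early checks
 * (peer not SMC-capable, IPSec in use, no usable VLAN, neither an ISM nor
 * an RDMA device found) routes the socket to the TCP fallback; otherwise
 * the CLC proposal advertises SMC_TYPE_D, SMC_TYPE_R or SMC_TYPE_B (both),
 * and the server's accept message (aclc.hdr.path) picks the variant that
 * smc_connect_ism() or smc_connect_rdma() then completes.
 */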

static void smc_connect_work(struct work_struct *work)
{
	struct smc_sock *smc = container_of(work, struct smc_sock,
					    connect_work);
	int rc;

	lock_sock(&smc->sk);
	rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
			    smc->connect_info->alen, smc->connect_info->flags);
	if (smc->clcsock->sk->sk_err) {
		smc->sk.sk_err = smc->clcsock->sk->sk_err;
		goto out;
	}
	if (rc < 0) {
		smc->sk.sk_err = -rc;
		goto out;
	}

	rc = __smc_connect(smc);
	if (rc < 0)
		smc->sk.sk_err = -rc;

out:
	if (!sock_flag(&smc->sk, SOCK_DEAD)) {
		if (smc->sk.sk_err) {
			smc->sk.sk_state_change(&smc->sk);
		} else { /* allow polling before and after fallback decision */
			smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
			smc->sk.sk_write_space(&smc->sk);
		}
	}
	kfree(smc->connect_info);
	smc->connect_info = NULL;
	release_sock(&smc->sk);
}

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	if (flags & O_NONBLOCK) {
		if (smc->connect_info) {
			rc = -EALREADY;
			goto out;
		}
		smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
		if (!smc->connect_info) {
			rc = -ENOMEM;
			goto out;
		}
		smc->connect_info->alen = alen;
		smc->connect_info->flags = flags ^ O_NONBLOCK;
		memcpy(&smc->connect_info->addr, addr, alen);
		schedule_work(&smc->connect_work);
		rc = -EINPROGRESS;
	} else {
		rc = kernel_connect(smc->clcsock, addr, alen, flags);
		if (rc)
			goto out;

		rc = __smc_connect(smc);
		if (rc < 0)
			goto out;
		else
			rc = 0; /* success cases including fallback */
	}

out:
	release_sock(sk);
out_err:
	return rc;
}
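
/*
 * Illustrative userspace view of the non-blocking path above: connect()
 * on an O_NONBLOCK AF_SMC socket returns -EINPROGRESS while
 * smc_connect_work() drives the TCP connect plus the CLC/LLC handshake in
 * the background; completion or failure is then observable via poll() as
 * EPOLLOUT/EPOLLERR, mirroring non-blocking TCP connect semantics.
 */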

static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc = -EINVAL;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	mutex_lock(&lsmc->clcsock_release_lock);
	if (lsmc->clcsock)
		rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	mutex_unlock(&lsmc->clcsock_release_lock);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		new_sk->sk_prot->unhash(new_sk);
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			new_sk->sk_prot->unhash(new_sk);
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock) {
			sock_graft(new_sk, new_sock);
			if (isk->use_fallback) {
				smc_sk(new_sk)->clcsock->file = new_sock->file;
				isk->clcsock->file->private_data = isk->clcsock;
			}
		}
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	sk->sk_prot->unhash(sk);
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sock_put(sk); /* final sock_put */
}

static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
		return SMC_CLC_DECL_ERR_REGRMB;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_CL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   link->gid, SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TIMEOUT_AL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
	}

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

/* listen worker: finish */
static void smc_listen_out(struct smc_sock *new_smc)
{
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct sock *newsmcsk = &new_smc->sk;

	if (lsmc->sk.sk_state == SMC_LISTEN) {
		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
		release_sock(&lsmc->sk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}

/* listen worker: finish in state connected */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;

	smc_listen_out(new_smc);
}

/* listen worker: finish in error state */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
	struct sock *newsmcsk = &new_smc->sk;

	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);

	smc_listen_out(new_smc);
}

/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
			       int local_contact)
{
	/* RDMA setup failed, switch back to TCP */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	if (reason_code < 0) { /* error, no fallback possible */
		smc_listen_out_err(new_smc);
		return;
	}
	smc_conn_free(&new_smc->conn);
	smc_switch_to_fallback(new_smc);
	new_smc->fallback_rsn = reason_code;
	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
			smc_listen_out_err(new_smc);
			return;
		}
	}
	smc_listen_out_connected(new_smc);
}

/* listen worker: check prefixes */
static int smc_listen_rdma_check(struct smc_sock *new_smc,
				 struct smc_clc_msg_proposal *pclc)
{
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;

	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
		return SMC_CLC_DECL_CNFERR;

	return 0;
}

/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
				struct smc_clc_msg_proposal *pclc,
				struct smc_ib_device *ibdev, u8 ibport,
				int *local_contact)
{
	/* allocate connection / link group */
	*local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0,
					 &pclc->lcl, NULL, 0);
	if (*local_contact < 0) {
		if (*local_contact == -ENOMEM)
			return SMC_CLC_DECL_MEM;/* insufficient memory*/
		return SMC_CLC_DECL_INTERR; /* other error */
	}

	/* create send buffer and rmb */
	if (smc_buf_create(new_smc, false))
		return SMC_CLC_DECL_MEM;

	return 0;
}

/* listen worker: initialize connection and buffers for SMC-D */
static int smc_listen_ism_init(struct smc_sock *new_smc,
			       struct smc_clc_msg_proposal *pclc,
			       struct smcd_dev *ismdev,
			       int *local_contact)
{
	struct smc_clc_msg_smcd *pclc_smcd;

	pclc_smcd = smc_get_clc_msg_smcd(pclc);
	*local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL,
					 ismdev, pclc_smcd->gid);
	if (*local_contact < 0) {
		if (*local_contact == -ENOMEM)
			return SMC_CLC_DECL_MEM;/* insufficient memory*/
		return SMC_CLC_DECL_INTERR; /* other error */
	}

	/* Check if peer can be reached via ISM device */
	if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
			    new_smc->conn.lgr->vlan_id,
			    new_smc->conn.lgr->smcd)) {
		if (*local_contact == SMC_FIRST_CONTACT)
			smc_lgr_forget(new_smc->conn.lgr);
		smc_conn_free(&new_smc->conn);
		return SMC_CLC_DECL_CNFERR;
	}

	/* Create send and receive buffers */
	if (smc_buf_create(new_smc, true)) {
		if (*local_contact == SMC_FIRST_CONTACT)
			smc_lgr_forget(new_smc->conn.lgr);
		smc_conn_free(&new_smc->conn);
		return SMC_CLC_DECL_MEM;
	}

	return 0;
}

/* listen worker: register buffers */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	if (local_contact != SMC_FIRST_CONTACT) {
		if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
			return SMC_CLC_DECL_ERR_REGRMB;
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	return 0;
}

/* listen worker: finish RDMA setup */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
				  struct smc_clc_msg_accept_confirm *cclc,
				  int local_contact)
{
	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
	int reason_code = 0;

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, cclc);

	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
		reason_code = SMC_CLC_DECL_ERR_RTOK;
		goto decline;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		if (smc_ib_ready_link(link)) {
			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
			goto decline;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code)
			goto decline;
	}
	return 0;

decline:
	smc_listen_decline(new_smc, reason_code, local_contact);
	return reason_code;
}

/* listen worker: set up an SMC-R or SMC-D connection on the server side */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_clc_msg_accept_confirm cclc;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *ibdev;
	bool ism_supported = false;
	struct smcd_dev *ismdev;
	u8 buf[SMC_CLC_MAX_LEN];
	int local_contact = 0;
	unsigned short vlan;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
		return smc_listen_out_err(new_smc);

	if (new_smc->use_fallback) {
		smc_listen_out_connected(new_smc);
		return;
	}

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		smc_switch_to_fallback(new_smc);
		new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
		smc_listen_out_connected(new_smc);
		return;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
				       SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
	if (reason_code) {
		smc_listen_decline(new_smc, reason_code, 0);
		return;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
		return;
	}

	mutex_lock(&smc_server_lgr_pending);
	smc_close_init(new_smc);
	smc_rx_init(new_smc);
	smc_tx_init(new_smc);

	/* check if ISM is available */
	if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
	    !smc_check_ism(new_smc, &ismdev) &&
	    !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
		ism_supported = true;
	}

	/* check if RDMA is available */
	if (!ism_supported &&
	    ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
	     smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
	     smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
	     smc_listen_rdma_check(new_smc, pclc) ||
	     smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
				  &local_contact) ||
	     smc_listen_rdma_reg(new_smc, local_contact))) {
		/* SMC not supported, decline */
		mutex_unlock(&smc_server_lgr_pending);
		smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
				   local_contact);
		return;
	}

	/* send SMC Accept CLC message */
	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc) {
		mutex_unlock(&smc_server_lgr_pending);
		smc_listen_decline(new_smc, rc, local_contact);
		return;
	}

	/* SMC-D does not need this lock any more */
	if (ism_supported)
		mutex_unlock(&smc_server_lgr_pending);

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM, CLC_WAIT_TIME);
	if (reason_code) {
		if (!ism_supported)
			mutex_unlock(&smc_server_lgr_pending);
		smc_listen_decline(new_smc, reason_code, local_contact);
		return;
	}

	/* finish worker */
	if (!ism_supported) {
		rc = smc_listen_rdma_finish(new_smc, &cclc, local_contact);
		mutex_unlock(&smc_server_lgr_pending);
		if (rc)
			return;
	}
	smc_conn_save_peer_info(new_smc, &cclc);
	smc_listen_out_connected(new_smc);
}
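
/*
 * smc_listen_work() at a glance: the worker waits for the CLC proposal,
 * picks SMC-D (preferred when pclc->hdr.path allows it and a matching ISM
 * device exists) or SMC-R, sends the CLC accept, waits for the CLC
 * confirm, and finally enqueues the child socket on the listener's accept
 * queue via smc_listen_out_connected(). Any failure funnels through
 * smc_listen_decline(), which either falls back to TCP or, on hard errors
 * (negative reason codes), closes the child socket.
 */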

static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		new_smc->fallback_rsn = lsmc->fallback_rsn;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we cannot apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	if (!smc->use_fallback)
		tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	sock_hold(sk); /* sock_put in smc_tcp_listen_work() */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}

static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
					 MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}

static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}

static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;

	if (msg->msg_flags & MSG_FASTOPEN) {
		if (sk->sk_state == SMC_INIT) {
			smc_switch_to_fallback(smc);
			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}

	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}

static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		       int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* socket was connected before, no more data to read */
		rc = 0;
		goto out;
	}
	if ((sk->sk_state == SMC_INIT) ||
	    (sk->sk_state == SMC_LISTEN) ||
	    (sk->sk_state == SMC_CLOSED))
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
	} else {
		msg->msg_namelen = 0;
		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
	}

out:
	release_sock(sk);
	return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
	struct smc_sock *isk = smc_sk(parent);
	__poll_t mask = 0;

	spin_lock(&isk->accept_q_lock);
	if (!list_empty(&isk->accept_q))
		mask = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&isk->accept_q_lock);

	return mask;
}

static __poll_t smc_poll(struct file *file, struct socket *sock,
			 poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback) {
		/* delegate to CLC child sock */
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		sk->sk_err = smc->clcsock->sk->sk_err;
		if (sk->sk_err)
			mask |= EPOLLERR;
	} else {
		if (sk->sk_state != SMC_CLOSED)
			sock_poll_wait(file, sock, wait);
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
			if (smc->conn.urg_state == SMC_URG_VALID)
				mask |= EPOLLPRI;
		}
	}

	return mask;
}

static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		rc = 0;
		/* nothing more to do because peer is not involved */
		break;
	}
	if (smc->clcsock)
		rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}

static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	if (optlen < sizeof(int))
		return -EINVAL;
	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT) {
			smc_switch_to_fallback(smc);
			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
		} else {
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}
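
/*
 * Note on the setsockopt() handling above: the TCP fast open family of
 * options (and TCP_ULP) cannot be combined with SMC, so requesting one of
 * them while the socket is still in SMC_INIT quietly turns the socket into
 * a TCP fallback socket. Illustrative userspace sequence:
 *
 *	int on = 1;
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *	setsockopt(fd, SOL_TCP, TCP_FASTOPEN_CONNECT, &on, sizeof(on));
 *	// connect() now proceeds over plain TCP; fallback_rsn is
 *	// SMC_CLC_DECL_OPTUNSUPP
 */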

static int smc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct smc_sock *smc;

	smc = smc_sk(sock->sk);
	/* socket options apply to the CLC socket */
	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
					     optval, optlen);
}

static int smc_ioctl(struct socket *sock, unsigned int cmd,
		     unsigned long arg)
{
	union smc_host_cursor cons, urg;
	struct smc_connection *conn;
	struct smc_sock *smc;
	int answ;

	smc = smc_sk(sock->sk);
	conn = &smc->conn;
	lock_sock(&smc->sk);
	if (smc->use_fallback) {
		if (!smc->clcsock) {
			release_sock(&smc->sk);
			return -EBADF;
		}
		answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
		release_sock(&smc->sk);
		return answ;
	}
	switch (cmd) {
	case SIOCINQ: /* same as FIONREAD */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = atomic_read(&smc->conn.bytes_to_rcv);
		break;
	case SIOCOUTQ:
		/* output queue size (not sent + not acked) */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = smc->conn.sndbuf_desc->len -
					atomic_read(&smc->conn.sndbuf_space);
		break;
	case SIOCOUTQNSD:
		/* output queue size (not sent only) */
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED)
			answ = 0;
		else
			answ = smc_tx_prepared_sends(&smc->conn);
		break;
	case SIOCATMARK:
		if (smc->sk.sk_state == SMC_LISTEN) {
			release_sock(&smc->sk);
			return -EINVAL;
		}
		if (smc->sk.sk_state == SMC_INIT ||
		    smc->sk.sk_state == SMC_CLOSED) {
			answ = 0;
		} else {
			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
			smc_curs_copy(&urg, &conn->urg_curs, conn);
			answ = smc_curs_diff(conn->rmb_desc->len,
					     &cons, &urg) == 1;
		}
		break;
	default:
		release_sock(&smc->sk);
		return -ENOIOCTLCMD;
	}
	release_sock(&smc->sk);

	return put_user(answ, (int __user *)arg);
}
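
/*
 * Reading the SIOCATMARK computation above: smc_curs_diff() yields the
 * distance from the consumer cursor to the urgent-data cursor within the
 * RMB, and the socket reports "at mark" exactly when that distance is 1,
 * i.e. the urgent byte is the next byte in the receive stream.
 */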

static ssize_t smc_sendpage(struct socket *sock, struct page *page,
			    int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state != SMC_ACTIVE) {
		release_sock(sk);
		goto out;
	}
	release_sock(sk);
	if (smc->use_fallback)
		rc = kernel_sendpage(smc->clcsock, page, offset,
				     size, flags);
	else
		rc = sock_no_sendpage(sock, page, offset, size, flags);

out:
	return rc;
}

/* Map the affected portions of the rmbe into an spd, note the number of bytes
 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
 * updates till whenever a respective page has been fully processed.
 * Note that subsequent recv() calls have to wait till all splice() processing
 * completed.
 */
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
			       struct pipe_inode_info *pipe, size_t len,
			       unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -ENOTCONN;

	smc = smc_sk(sk);
	lock_sock(sk);
	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
		/* socket was connected before, no more data to read */
		rc = 0;
		goto out;
	}
	if (sk->sk_state == SMC_INIT ||
	    sk->sk_state == SMC_LISTEN ||
	    sk->sk_state == SMC_CLOSED)
		goto out;

	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
		rc = 0;
		goto out;
	}

	if (smc->use_fallback) {
		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
						    pipe, len, flags);
	} else {
		if (*ppos) {
			rc = -ESPIPE;
			goto out;
		}
		if (flags & SPLICE_F_NONBLOCK)
			flags = MSG_DONTWAIT;
		else
			flags = 0;
		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
	}
out:
	release_sock(sk);

	return rc;
}

/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};

static int smc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
	struct smc_sock *smc;
	struct sock *sk;
	int rc;

	rc = -ESOCKTNOSUPPORT;
	if (sock->type != SOCK_STREAM)
		goto out;

	rc = -EPROTONOSUPPORT;
	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
		goto out;

	rc = -ENOBUFS;
	sock->ops = &smc_sock_ops;
	sk = smc_sock_alloc(net, sock, protocol);
	if (!sk)
		goto out;

	/* create internal TCP socket for CLC handshake and fallback */
	smc = smc_sk(sk);
	smc->use_fallback = false; /* assume rdma capability first */
	smc->fallback_rsn = 0;
	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
			      &smc->clcsock);
	if (rc) {
		sk_common_release(sk);
		goto out;
	}
	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);

out:
	return rc;
}

static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};

unsigned int smc_net_id;

static __net_init int smc_net_init(struct net *net)
{
	return smc_pnet_net_init(net);
}

static void __net_exit smc_net_exit(struct net *net)
{
	smc_pnet_net_exit(net);
}

static struct pernet_operations smc_net_ops = {
	.init = smc_net_init,
	.exit = smc_net_exit,
	.id   = &smc_net_id,
	.size = sizeof(struct smc_net),
};

static int __init smc_init(void)
{
	int rc;

	rc = register_pernet_subsys(&smc_net_ops);
	if (rc)
		return rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}

static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
	unregister_pernet_subsys(&smc_net_ops);
}

module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);