1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Shared Memory Communications over RDMA (SMC-R) and RoCE 4 * 5 * CLC (connection layer control) handshake over initial TCP socket to 6 * prepare for RDMA traffic 7 * 8 * Copyright IBM Corp. 2016, 2018 9 * 10 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> 11 */ 12 13 #include <linux/in.h> 14 #include <linux/inetdevice.h> 15 #include <linux/if_ether.h> 16 #include <linux/sched/signal.h> 17 18 #include <net/addrconf.h> 19 #include <net/sock.h> 20 #include <net/tcp.h> 21 22 #include "smc.h" 23 #include "smc_core.h" 24 #include "smc_clc.h" 25 #include "smc_ib.h" 26 27 /* eye catcher "SMCR" EBCDIC for CLC messages */ 28 static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; 29 30 /* check if received message has a correct header length and contains valid 31 * heading and trailing eyecatchers 32 */ 33 static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) 34 { 35 struct smc_clc_msg_proposal_prefix *pclc_prfx; 36 struct smc_clc_msg_accept_confirm *clc; 37 struct smc_clc_msg_proposal *pclc; 38 struct smc_clc_msg_decline *dclc; 39 struct smc_clc_msg_trail *trl; 40 41 if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) 42 return false; 43 switch (clcm->type) { 44 case SMC_CLC_PROPOSAL: 45 pclc = (struct smc_clc_msg_proposal *)clcm; 46 pclc_prfx = smc_clc_proposal_get_prefix(pclc); 47 if (ntohs(pclc->hdr.length) != 48 sizeof(*pclc) + ntohs(pclc->iparea_offset) + 49 sizeof(*pclc_prfx) + 50 pclc_prfx->ipv6_prefixes_cnt * 51 sizeof(struct smc_clc_ipv6_prefix) + 52 sizeof(*trl)) 53 return false; 54 trl = (struct smc_clc_msg_trail *) 55 ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl)); 56 break; 57 case SMC_CLC_ACCEPT: 58 case SMC_CLC_CONFIRM: 59 clc = (struct smc_clc_msg_accept_confirm *)clcm; 60 if (ntohs(clc->hdr.length) != sizeof(*clc)) 61 return false; 62 trl = &clc->trl; 63 break; 64 case SMC_CLC_DECLINE: 65 dclc = (struct smc_clc_msg_decline *)clcm; 66 if (ntohs(dclc->hdr.length) != sizeof(*dclc)) 67 return false; 68 trl = &dclc->trl; 69 break; 70 default: 71 return false; 72 } 73 if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) 74 return false; 75 return true; 76 } 77 78 /* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ 79 static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, 80 struct smc_clc_msg_proposal_prefix *prop) 81 { 82 struct in_device *in_dev = __in_dev_get_rcu(dst->dev); 83 84 if (!in_dev) 85 return -ENODEV; 86 for_ifa(in_dev) { 87 if (!inet_ifa_match(ipv4, ifa)) 88 continue; 89 prop->prefix_len = inet_mask_len(ifa->ifa_mask); 90 prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; 91 /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ 92 return 0; 93 } endfor_ifa(in_dev); 94 return -ENOENT; 95 } 96 97 /* fill CLC proposal msg with ipv6 prefixes from device */ 98 static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, 99 struct smc_clc_msg_proposal_prefix *prop, 100 struct smc_clc_ipv6_prefix *ipv6_prfx) 101 { 102 #if IS_ENABLED(CONFIG_IPV6) 103 struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); 104 struct inet6_ifaddr *ifa; 105 int cnt = 0; 106 107 if (!in6_dev) 108 return -ENODEV; 109 /* use a maximum of 8 IPv6 prefixes from device */ 110 list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { 111 if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) 112 continue; 113 ipv6_addr_prefix(&ipv6_prfx[cnt].prefix, 114 &ifa->addr, ifa->prefix_len); 115 ipv6_prfx[cnt].prefix_len = ifa->prefix_len; 116 cnt++; 117 if (cnt == SMC_CLC_MAX_V6_PREFIX) 118 break; 119 } 120 prop->ipv6_prefixes_cnt = cnt; 121 if (cnt) 122 return 0; 123 #endif 124 return -ENOENT; 125 } 126 127 /* retrieve and set prefixes in CLC proposal msg */ 128 static int smc_clc_prfx_set(struct socket *clcsock, 129 struct smc_clc_msg_proposal_prefix *prop, 130 struct smc_clc_ipv6_prefix *ipv6_prfx) 131 { 132 struct dst_entry *dst = sk_dst_get(clcsock->sk); 133 struct sockaddr_storage addrs; 134 struct sockaddr_in6 *addr6; 135 struct sockaddr_in *addr; 136 int rc = -ENOENT; 137 138 memset(prop, 0, sizeof(*prop)); 139 if (!dst) { 140 rc = -ENOTCONN; 141 goto out; 142 } 143 if (!dst->dev) { 144 rc = -ENODEV; 145 goto out_rel; 146 } 147 /* get address to which the internal TCP socket is bound */ 148 kernel_getsockname(clcsock, (struct sockaddr *)&addrs); 149 /* analyze IP specific data of net_device belonging to TCP socket */ 150 addr6 = (struct sockaddr_in6 *)&addrs; 151 rcu_read_lock(); 152 if (addrs.ss_family == PF_INET) { 153 /* IPv4 */ 154 addr = (struct sockaddr_in *)&addrs; 155 rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); 156 } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { 157 /* mapped IPv4 address - peer is IPv4 only */ 158 rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], 159 prop); 160 } else { 161 /* IPv6 */ 162 rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); 163 } 164 rcu_read_unlock(); 165 out_rel: 166 dst_release(dst); 167 out: 168 return rc; 169 } 170 171 /* match ipv4 addrs of dev against addr in CLC proposal */ 172 static int smc_clc_prfx_match4_rcu(struct net_device *dev, 173 struct smc_clc_msg_proposal_prefix *prop) 174 { 175 struct in_device *in_dev = __in_dev_get_rcu(dev); 176 177 if (!in_dev) 178 return -ENODEV; 179 for_ifa(in_dev) { 180 if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && 181 inet_ifa_match(prop->outgoing_subnet, ifa)) 182 return 0; 183 } endfor_ifa(in_dev); 184 185 return -ENOENT; 186 } 187 188 /* match ipv6 addrs of dev against addrs in CLC proposal */ 189 static int smc_clc_prfx_match6_rcu(struct net_device *dev, 190 struct smc_clc_msg_proposal_prefix *prop) 191 { 192 #if IS_ENABLED(CONFIG_IPV6) 193 struct inet6_dev *in6_dev = __in6_dev_get(dev); 194 struct smc_clc_ipv6_prefix *ipv6_prfx; 195 struct inet6_ifaddr *ifa; 196 int i, max; 197 198 if (!in6_dev) 199 return -ENODEV; 200 /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */ 201 ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop)); 202 max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX); 203 list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { 204 if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) 205 continue; 206 for (i = 0; i < max; i++) { 207 if (ifa->prefix_len == ipv6_prfx[i].prefix_len && 208 ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix, 209 ifa->prefix_len)) 210 return 0; 211 } 212 } 213 #endif 214 return -ENOENT; 215 } 216 217 /* check if proposed prefixes match one of our device prefixes */ 218 int smc_clc_prfx_match(struct socket *clcsock, 219 struct smc_clc_msg_proposal_prefix *prop) 220 { 221 struct dst_entry *dst = sk_dst_get(clcsock->sk); 222 int rc; 223 224 if (!dst) { 225 rc = -ENOTCONN; 226 goto out; 227 } 228 if (!dst->dev) { 229 rc = -ENODEV; 230 goto out_rel; 231 } 232 rcu_read_lock(); 233 if (!prop->ipv6_prefixes_cnt) 234 rc = smc_clc_prfx_match4_rcu(dst->dev, prop); 235 else 236 rc = smc_clc_prfx_match6_rcu(dst->dev, prop); 237 rcu_read_unlock(); 238 out_rel: 239 dst_release(dst); 240 out: 241 return rc; 242 } 243 244 /* Wait for data on the tcp-socket, analyze received data 245 * Returns: 246 * 0 if success and it was not a decline that we received. 247 * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send. 248 * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. 249 */ 250 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, 251 u8 expected_type) 252 { 253 long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; 254 struct sock *clc_sk = smc->clcsock->sk; 255 struct smc_clc_msg_hdr *clcm = buf; 256 struct msghdr msg = {NULL, 0}; 257 int reason_code = 0; 258 struct kvec vec = {buf, buflen}; 259 int len, datlen; 260 int krflags; 261 262 /* peek the first few bytes to determine length of data to receive 263 * so we don't consume any subsequent CLC message or payload data 264 * in the TCP byte stream 265 */ 266 /* 267 * Caller must make sure that buflen is no less than 268 * sizeof(struct smc_clc_msg_hdr) 269 */ 270 krflags = MSG_PEEK | MSG_WAITALL; 271 smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; 272 iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, 273 sizeof(struct smc_clc_msg_hdr)); 274 len = sock_recvmsg(smc->clcsock, &msg, krflags); 275 if (signal_pending(current)) { 276 reason_code = -EINTR; 277 clc_sk->sk_err = EINTR; 278 smc->sk.sk_err = EINTR; 279 goto out; 280 } 281 if (clc_sk->sk_err) { 282 reason_code = -clc_sk->sk_err; 283 smc->sk.sk_err = clc_sk->sk_err; 284 goto out; 285 } 286 if (!len) { /* peer has performed orderly shutdown */ 287 smc->sk.sk_err = ECONNRESET; 288 reason_code = -ECONNRESET; 289 goto out; 290 } 291 if (len < 0) { 292 smc->sk.sk_err = -len; 293 reason_code = len; 294 goto out; 295 } 296 datlen = ntohs(clcm->length); 297 if ((len < sizeof(struct smc_clc_msg_hdr)) || 298 (datlen > buflen) || 299 ((clcm->type != SMC_CLC_DECLINE) && 300 (clcm->type != expected_type))) { 301 smc->sk.sk_err = EPROTO; 302 reason_code = -EPROTO; 303 goto out; 304 } 305 306 /* receive the complete CLC message */ 307 memset(&msg, 0, sizeof(struct msghdr)); 308 iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen); 309 krflags = MSG_WAITALL; 310 len = sock_recvmsg(smc->clcsock, &msg, krflags); 311 if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { 312 smc->sk.sk_err = EPROTO; 313 reason_code = -EPROTO; 314 goto out; 315 } 316 if (clcm->type == SMC_CLC_DECLINE) { 317 reason_code = SMC_CLC_DECL_REPLY; 318 if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { 319 smc->conn.lgr->sync_err = 1; 320 smc_lgr_terminate(smc->conn.lgr); 321 } 322 } 323 324 out: 325 smc->clcsock->sk->sk_rcvtimeo = rcvtimeo; 326 return reason_code; 327 } 328 329 /* send CLC DECLINE message across internal TCP socket */ 330 int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) 331 { 332 struct smc_clc_msg_decline dclc; 333 struct msghdr msg; 334 struct kvec vec; 335 int len; 336 337 memset(&dclc, 0, sizeof(dclc)); 338 memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); 339 dclc.hdr.type = SMC_CLC_DECLINE; 340 dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); 341 dclc.hdr.version = SMC_CLC_V1; 342 dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; 343 memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); 344 dclc.peer_diagnosis = htonl(peer_diag_info); 345 memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); 346 347 memset(&msg, 0, sizeof(msg)); 348 vec.iov_base = &dclc; 349 vec.iov_len = sizeof(struct smc_clc_msg_decline); 350 len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, 351 sizeof(struct smc_clc_msg_decline)); 352 if (len < sizeof(struct smc_clc_msg_decline)) 353 smc->sk.sk_err = EPROTO; 354 if (len < 0) 355 smc->sk.sk_err = -len; 356 return sock_error(&smc->sk); 357 } 358 359 /* send CLC PROPOSAL message across internal TCP socket */ 360 int smc_clc_send_proposal(struct smc_sock *smc, 361 struct smc_ib_device *smcibdev, 362 u8 ibport) 363 { 364 struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; 365 struct smc_clc_msg_proposal_prefix pclc_prfx; 366 struct smc_clc_msg_proposal pclc; 367 struct smc_clc_msg_trail trl; 368 int len, i, plen, rc; 369 int reason_code = 0; 370 struct kvec vec[4]; 371 struct msghdr msg; 372 373 /* retrieve ip prefixes for CLC proposal msg */ 374 rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx); 375 if (rc) 376 return SMC_CLC_DECL_CNFERR; /* configuration error */ 377 378 /* send SMC Proposal CLC message */ 379 plen = sizeof(pclc) + sizeof(pclc_prfx) + 380 (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) + 381 sizeof(trl); 382 memset(&pclc, 0, sizeof(pclc)); 383 memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); 384 pclc.hdr.type = SMC_CLC_PROPOSAL; 385 pclc.hdr.length = htons(plen); 386 pclc.hdr.version = SMC_CLC_V1; /* SMC version */ 387 memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); 388 memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); 389 memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); 390 pclc.iparea_offset = htons(0); 391 392 memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); 393 memset(&msg, 0, sizeof(msg)); 394 i = 0; 395 vec[i].iov_base = &pclc; 396 vec[i++].iov_len = sizeof(pclc); 397 vec[i].iov_base = &pclc_prfx; 398 vec[i++].iov_len = sizeof(pclc_prfx); 399 if (pclc_prfx.ipv6_prefixes_cnt > 0) { 400 vec[i].iov_base = &ipv6_prfx[0]; 401 vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt * 402 sizeof(ipv6_prfx[0]); 403 } 404 vec[i].iov_base = &trl; 405 vec[i++].iov_len = sizeof(trl); 406 /* due to the few bytes needed for clc-handshake this cannot block */ 407 len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); 408 if (len < sizeof(pclc)) { 409 if (len >= 0) { 410 reason_code = -ENETUNREACH; 411 smc->sk.sk_err = -reason_code; 412 } else { 413 smc->sk.sk_err = smc->clcsock->sk->sk_err; 414 reason_code = -smc->sk.sk_err; 415 } 416 } 417 418 return reason_code; 419 } 420 421 /* send CLC CONFIRM message across internal TCP socket */ 422 int smc_clc_send_confirm(struct smc_sock *smc) 423 { 424 struct smc_connection *conn = &smc->conn; 425 struct smc_clc_msg_accept_confirm cclc; 426 struct smc_link *link; 427 int reason_code = 0; 428 struct msghdr msg; 429 struct kvec vec; 430 int len; 431 432 link = &conn->lgr->lnk[SMC_SINGLE_LINK]; 433 /* send SMC Confirm CLC msg */ 434 memset(&cclc, 0, sizeof(cclc)); 435 memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); 436 cclc.hdr.type = SMC_CLC_CONFIRM; 437 cclc.hdr.length = htons(sizeof(cclc)); 438 cclc.hdr.version = SMC_CLC_V1; /* SMC version */ 439 memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); 440 memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], 441 SMC_GID_SIZE); 442 memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); 443 hton24(cclc.qpn, link->roce_qp->qp_num); 444 cclc.rmb_rkey = 445 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); 446 cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ 447 cclc.rmbe_alert_token = htonl(conn->alert_token_local); 448 cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); 449 cclc.rmbe_size = conn->rmbe_size_short; 450 cclc.rmb_dma_addr = cpu_to_be64( 451 (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); 452 hton24(cclc.psn, link->psn_initial); 453 454 memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); 455 456 memset(&msg, 0, sizeof(msg)); 457 vec.iov_base = &cclc; 458 vec.iov_len = sizeof(cclc); 459 len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc)); 460 if (len < sizeof(cclc)) { 461 if (len >= 0) { 462 reason_code = -ENETUNREACH; 463 smc->sk.sk_err = -reason_code; 464 } else { 465 smc->sk.sk_err = smc->clcsock->sk->sk_err; 466 reason_code = -smc->sk.sk_err; 467 } 468 } 469 return reason_code; 470 } 471 472 /* send CLC ACCEPT message across internal TCP socket */ 473 int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) 474 { 475 struct smc_connection *conn = &new_smc->conn; 476 struct smc_clc_msg_accept_confirm aclc; 477 struct smc_link *link; 478 struct msghdr msg; 479 struct kvec vec; 480 int rc = 0; 481 int len; 482 483 link = &conn->lgr->lnk[SMC_SINGLE_LINK]; 484 memset(&aclc, 0, sizeof(aclc)); 485 memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); 486 aclc.hdr.type = SMC_CLC_ACCEPT; 487 aclc.hdr.length = htons(sizeof(aclc)); 488 aclc.hdr.version = SMC_CLC_V1; /* SMC version */ 489 if (srv_first_contact) 490 aclc.hdr.flag = 1; 491 memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); 492 memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], 493 SMC_GID_SIZE); 494 memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); 495 hton24(aclc.qpn, link->roce_qp->qp_num); 496 aclc.rmb_rkey = 497 htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); 498 aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ 499 aclc.rmbe_alert_token = htonl(conn->alert_token_local); 500 aclc.qp_mtu = link->path_mtu; 501 aclc.rmbe_size = conn->rmbe_size_short, 502 aclc.rmb_dma_addr = cpu_to_be64( 503 (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); 504 hton24(aclc.psn, link->psn_initial); 505 memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); 506 507 memset(&msg, 0, sizeof(msg)); 508 vec.iov_base = &aclc; 509 vec.iov_len = sizeof(aclc); 510 len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc)); 511 if (len < sizeof(aclc)) { 512 if (len >= 0) 513 new_smc->sk.sk_err = EPROTO; 514 else 515 new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err; 516 rc = sock_error(&new_smc->sk); 517 } 518 519 return rc; 520 } 521