// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Basic Transport Functions exploiting Infiniband API
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"

#define SMC_LGR_NUM_INCR	256
#define SMC_LGR_FREE_DELAY_SERV	(600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT	(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
#define SMC_LGR_FREE_DELAY_FAST	(8 * HZ)

static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
	.num = 0,
};

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc);

static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
	/* client link group creation always follows the server link group
	 * creation. For client use a somewhat higher removal delay time,
	 * otherwise there is a risk of out-of-sync link groups.
	 */
	mod_delayed_work(system_wq, &lgr->free_work,
			 (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
			 SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
}

void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
{
	mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST);
}

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn	connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}

/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);
	conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	conn->lgr = NULL;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection from lgr
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local)
		__smc_lgr_unregister_conn(conn);
	write_unlock_bh(&lgr->conns_lock);
}

/* Send delete link, either as client to request the initiation
 * of the DELETE LINK sequence from server; or as server to
 * initiate the delete processing. See smc_llc_rx_delete_link().
 */
static int smc_link_send_delete(struct smc_link *lnk)
{
	if (lnk->state == SMC_LNK_ACTIVE &&
	    !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
		smc_llc_link_deleting(lnk);
		return 0;
	}
	return -ENOTCONN;
}

static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	bool conns;

	spin_lock_bh(&smc_lgr_list.lock);
	if (list_empty(&lgr->list))
		goto free;
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(&smc_lgr_list.lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */
free:
	spin_unlock_bh(&smc_lgr_list.lock);

	if (!lgr->is_smcd && !lgr->terminating) {
		/* try to send del link msg, on error free lgr immediately */
		if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) {
			/* reschedule in case we never receive a response */
			smc_lgr_schedule_free_work(lgr);
			return;
		}
	}

	if (!delayed_work_pending(&lgr->free_work)) {
		struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

		if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
			smc_llc_link_inactive(lnk);
		if (lgr->is_smcd)
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr);
	}
}

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
			  struct smc_ib_device *smcibdev, u8 ibport,
			  char *peer_systemid, unsigned short vlan_id,
			  struct smcd_dev *smcismdev, u64 peer_gid)
{
	struct smc_link_group *lgr;
	struct smc_link *lnk;
	u8 rndvec[3];
	int rc = 0;
	int i;

	if (is_smcd && vlan_id) {
		rc = smc_ism_get_vlan(smcismdev, vlan_id);
		if (rc)
			goto out;
	}

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = -ENOMEM;
		goto out;
	}
	lgr->is_smcd = is_smcd;
	lgr->sync_err = 0;
	lgr->vlan_id = vlan_id;
	rwlock_init(&lgr->sndbufs_lock);
	rwlock_init(&lgr->rmbs_lock);
	rwlock_init(&lgr->conns_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	smc_lgr_list.num += SMC_LGR_NUM_INCR;
	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	lgr->conns_all = RB_ROOT;
	if (is_smcd) {
		/* SMC-D specific settings */
		lgr->peer_gid = peer_gid;
		lgr->smcd = smcismdev;
	} else {
		/* SMC-R specific settings */
		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
		memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);

		lnk = &lgr->lnk[SMC_SINGLE_LINK];
		/* initialize link */
		lnk->state = SMC_LNK_ACTIVATING;
		lnk->link_id = SMC_SINGLE_LINK;
		lnk->smcibdev = smcibdev;
		lnk->ibport = ibport;
		lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
		if (!smcibdev->initialized)
			smc_ib_setup_per_ibdev(smcibdev);
		get_random_bytes(rndvec, sizeof(rndvec));
		lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
				   (rndvec[2] << 16);
		rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
					  vlan_id, lnk->gid, &lnk->sgid_index);
		if (rc)
			goto free_lgr;
		rc = smc_llc_link_init(lnk);
		if (rc)
			goto free_lgr;
		rc = smc_wr_alloc_link_mem(lnk);
		if (rc)
			goto clear_llc_lnk;
		rc = smc_ib_create_protection_domain(lnk);
		if (rc)
			goto free_link_mem;
		rc = smc_ib_create_queue_pair(lnk);
		if (rc)
			goto dealloc_pd;
		rc = smc_wr_create_link(lnk);
		if (rc)
			goto destroy_qp;
	}
	smc->conn.lgr = lgr;
	spin_lock_bh(&smc_lgr_list.lock);
	list_add(&lgr->list, &smc_lgr_list.list);
	spin_unlock_bh(&smc_lgr_list.lock);
	return 0;

destroy_qp:
	smc_ib_destroy_queue_pair(lnk);
dealloc_pd:
	smc_ib_dealloc_protection_domain(lnk);
free_link_mem:
	smc_wr_free_link_mem(lnk);
clear_llc_lnk:
	smc_llc_link_clear(lnk);
free_lgr:
	kfree(lgr);
out:
	return rc;
}

static void smc_buf_unuse(struct smc_connection *conn,
			  struct smc_link_group *lgr)
{
	if (conn->sndbuf_desc)
		conn->sndbuf_desc->used = 0;
	if (conn->rmb_desc) {
		if (!conn->rmb_desc->regerr) {
			conn->rmb_desc->reused = 1;
			conn->rmb_desc->used = 0;
		} else {
			/* buf registration failed, reuse not possible */
			write_lock_bh(&lgr->rmbs_lock);
			list_del(&conn->rmb_desc->list);
			write_unlock_bh(&lgr->rmbs_lock);

			smc_buf_free(lgr, true, conn->rmb_desc);
		}
	}
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	if (lgr->is_smcd) {
		smc_ism_unset_conn(conn);
		tasklet_kill(&conn->rx_tsklet);
	} else {
		smc_cdc_tx_dismiss_slots(conn);
	}
	smc_lgr_unregister_conn(conn);	/* unsets conn->lgr */
	smc_buf_unuse(conn, lgr);	/* allow buffer reuse */

	if (!lgr->conns_num)
		smc_lgr_schedule_free_work(lgr);
}
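
/* tear down the single link of a link group: clear the LLC link state,
 * reset and destroy the queue pair, and release the protection domain
 * and the work request memory
 */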
static void smc_link_clear(struct smc_link *lnk)
{
	lnk->peer_qpn = 0;
	smc_llc_link_clear(lnk);
	smc_ib_modify_qp_reset(lnk);
	smc_wr_free_link(lnk);
	smc_ib_destroy_queue_pair(lnk);
	smc_ib_dealloc_protection_domain(lnk);
	smc_wr_free_link_mem(lnk);
}

static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
			  struct smc_buf_desc *buf_desc)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

	if (is_rmb) {
		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
			smc_ib_put_memory_region(
					buf_desc->mr_rx[SMC_SINGLE_LINK]);
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_FROM_DEVICE);
	} else {
		smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
				    DMA_TO_DEVICE);
	}
	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
	if (buf_desc->pages)
		__free_pages(buf_desc->pages, buf_desc->order);
	kfree(buf_desc);
}

static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
			  struct smc_buf_desc *buf_desc)
{
	if (is_dmb) {
		/* restore original buf len */
		buf_desc->len += sizeof(struct smcd_cdc_msg);
		smc_ism_unregister_dmb(lgr->smcd, buf_desc);
	} else {
		kfree(buf_desc->cpu_addr);
	}
	kfree(buf_desc);
}

static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
			 struct smc_buf_desc *buf_desc)
{
	if (lgr->is_smcd)
		smcd_buf_free(lgr, is_rmb, buf_desc);
	else
		smcr_buf_free(lgr, is_rmb, buf_desc);
}

static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf_desc;
	struct list_head *buf_list;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		if (is_rmb)
			buf_list = &lgr->rmbs[i];
		else
			buf_list = &lgr->sndbufs[i];
		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
					 list) {
			list_del(&buf_desc->list);
			smc_buf_free(lgr, is_rmb, buf_desc);
		}
	}
}

static void smc_lgr_free_bufs(struct smc_link_group *lgr)
{
	/* free send buffers */
	__smc_lgr_free_bufs(lgr, false);
	/* free rmbs */
	__smc_lgr_free_bufs(lgr, true);
}

/* remove a link group */
void smc_lgr_free(struct smc_link_group *lgr)
{
	smc_lgr_free_bufs(lgr);
	if (lgr->is_smcd)
		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
	else
		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
	kfree(lgr);
}

void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* terminate link group abnormally */
static void __smc_lgr_terminate(struct smc_link_group *lgr)
{
	struct smc_connection *conn;
	struct smc_sock *smc;
	struct rb_node *node;

	if (lgr->terminating)
		return;	/* lgr already terminating */
	lgr->terminating = 1;
	if (!list_empty(&lgr->list)) /* forget lgr */
		list_del_init(&lgr->list);
	if (!lgr->is_smcd)
		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);

	write_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		conn = rb_entry(node, struct smc_connection, alert_node);
		smc = container_of(conn, struct smc_sock, conn);
		sock_hold(&smc->sk); /* sock_put in close work */
		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
		__smc_lgr_unregister_conn(conn);
		write_unlock_bh(&lgr->conns_lock);
		if (!schedule_work(&conn->close_work))
			sock_put(&smc->sk);
		write_lock_bh(&lgr->conns_lock);
		node = rb_first(&lgr->conns_all);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (!lgr->is_smcd)
		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
	smc_lgr_schedule_free_work(lgr);
}

void smc_lgr_terminate(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	__smc_lgr_terminate(lgr);
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr, *l;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (!lgr->is_smcd &&
		    lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
			__smc_lgr_terminate(lgr);
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

/* Called when SMC-D device is terminated or peer is lost */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
{
	struct smc_link_group *lgr, *l;
	LIST_HEAD(lgr_free_list);

	/* run common cleanup function and build free list */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
		if (lgr->is_smcd && lgr->smcd == dev &&
		    (!peer_gid || lgr->peer_gid == peer_gid) &&
		    (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
			__smc_lgr_terminate(lgr);
			list_move(&lgr->list, &lgr_free_list);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	/* cancel the regular free workers and actually free lgrs */
	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
		list_del_init(&lgr->list);
		cancel_delayed_work_sync(&lgr->free_work);
		if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr);
	}
}

/* Determine vlan of internal TCP socket.
 * @vlan_id: address to store the determined vlan id into
 */
int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct net_device *ndev;
	int i, nest_lvl, rc = 0;

	*vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	ndev = dst->dev;
	if (is_vlan_dev(ndev)) {
		*vlan_id = vlan_dev_vlan_id(ndev);
		goto out_rel;
	}

	rtnl_lock();
	nest_lvl = dev_get_nest_level(ndev);
	for (i = 0; i < nest_lvl; i++) {
		struct list_head *lower = &ndev->adj_list.lower;

		if (list_empty(lower))
			break;
		lower = lower->next;
		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
		if (is_vlan_dev(ndev)) {
			*vlan_id = vlan_dev_vlan_id(ndev);
			break;
		}
	}
	rtnl_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}

static bool smcr_lgr_match(struct smc_link_group *lgr,
			   struct smc_clc_msg_local *lcl,
			   enum smc_lgr_role role, u32 clcqpn)
{
	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
		       SMC_SYSTEMID_LEN) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
			SMC_GID_SIZE) &&
		!memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
			sizeof(lcl->mac)) &&
		lgr->role == role &&
		(lgr->role == SMC_SERV ||
		 lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn);
}

static bool smcd_lgr_match(struct smc_link_group *lgr,
			   struct smcd_dev *smcismdev, u64 peer_gid)
{
	return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
		    struct smc_ib_device *smcibdev, u8 ibport, u32 clcqpn,
		    struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
		    u64 peer_gid)
{
	struct smc_connection *conn = &smc->conn;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_link_group *lgr;
	unsigned short vlan_id;
	enum smc_lgr_role role;
	int rc = 0;

	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
	if (rc)
		return rc;

	if ((role == SMC_CLNT) && srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		write_lock_bh(&lgr->conns_lock);
		if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) :
		     smcr_lgr_match(lgr, lcl, role, clcqpn)) &&
		    !lgr->sync_err &&
		    lgr->vlan_id == vlan_id &&
		    (role == SMC_CLNT ||
		     lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
			/* link group found */
			local_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			smc_lgr_register_conn(conn); /* add smc conn to lgr */
			write_unlock_bh(&lgr->conns_lock);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	if (role == SMC_CLNT && !srv_first_contact &&
	    (local_contact == SMC_FIRST_CONTACT)) {
		/* Server reuses a link group, but Client wants to start
		 * a new one
		 * send out_of_sync decline, reason synchr. error
		 */
		return -ENOLINK;
	}

create:
	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport,
				    lcl->id_for_peer, vlan_id, smcd, peer_gid);
		if (rc)
			goto out;
		smc_lgr_register_conn(conn); /* add smc conn to lgr */
	}
	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
	conn->urg_state = SMC_URG_READ;
	if (is_smcd) {
		conn->rx_off = sizeof(struct smcd_cdc_msg);
		smcd_cdc_rx_init(conn); /* init tasklet for this conn */
	}
#ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&conn->acurs_lock);
#endif

out:
	return rc ? rc : local_contact;
}

/* convert the RMB size into the compressed notation - minimum 16K.
 * In contrast to plain ilog2, this rounds towards the next power of 2,
 * so the socket application gets at least its desired sndbuf / rcvbuf size.
 */
static u8 smc_compress_bufsize(int size)
{
	u8 compressed;

	if (size <= SMC_BUF_MIN_SIZE)
		return 0;

	size = (size - 1) >> 14;
	compressed = ilog2(size) + 1;
	if (compressed >= SMC_RMBE_SIZES)
		compressed = SMC_RMBE_SIZES - 1;
	return compressed;
}

/* convert the RMB size from compressed notation into integer */
int smc_uncompress_bufsize(u8 compressed)
{
	u32 size;

	size = 0x00000001 << (((int)compressed) + 14);
	return (int)size;
}
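
/* Worked example for the two helpers above: a requested size of 70000 bytes
 * compresses to 3, since (70000 - 1) >> 14 == 4 and ilog2(4) + 1 == 3;
 * uncompressing 3 yields 1 << (3 + 14) == 131072, so the application is
 * granted at least the size it asked for.
 */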

/* try to reuse a sndbuf or rmb description slot for a certain
 * buffer size; if not available, return NULL
 */
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
					     rwlock_t *lock,
					     struct list_head *buf_list)
{
	struct smc_buf_desc *buf_slot;

	read_lock_bh(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			read_unlock_bh(lock);
			return buf_slot;
		}
	}
	read_unlock_bh(lock);
	return NULL;
}

/* one of the conditions for announcing a receiver's current window size is
 * that it "results in a minimum increase in the window size of 10% of the
 * receive buffer space" [RFC7609]
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
	return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
						bool is_rmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	struct smc_link *lnk;
	int rc;

	/* try to alloc a new buffer */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);

	buf_desc->order = get_order(bufsize);
	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
				      __GFP_NOMEMALLOC | __GFP_COMP |
				      __GFP_NORETRY | __GFP_ZERO,
				      buf_desc->order);
	if (!buf_desc->pages) {
		kfree(buf_desc);
		return ERR_PTR(-EAGAIN);
	}
	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);

	/* build the sg table from the pages */
	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
			    GFP_KERNEL);
	if (rc) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(rc);
	}
	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
		   buf_desc->cpu_addr, bufsize);

	/* map sg table to DMA address */
	rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
	/* SMC protocol depends on mapping to one DMA address only */
	if (rc != 1) {
		smc_buf_free(lgr, is_rmb, buf_desc);
		return ERR_PTR(-EAGAIN);
	}

	/* create a new memory region for the RMB */
	if (is_rmb) {
		rc = smc_ib_get_memory_region(lnk->roce_pd,
					      IB_ACCESS_REMOTE_WRITE |
					      IB_ACCESS_LOCAL_WRITE,
					      buf_desc);
		if (rc) {
			smc_buf_free(lgr, is_rmb, buf_desc);
			return ERR_PTR(rc);
		}
	}

	buf_desc->len = bufsize;
	return buf_desc;
}

#define SMCD_DMBE_SIZES	7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */

static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
						bool is_dmb, int bufsize)
{
	struct smc_buf_desc *buf_desc;
	int rc;

	if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
		return ERR_PTR(-EAGAIN);

	/* try to alloc a new DMB */
	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
	if (!buf_desc)
		return ERR_PTR(-ENOMEM);
	if (is_dmb) {
		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
		if (rc) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
		/* CDC header stored in buf. So, pretend it was smaller */
		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
	} else {
		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
					     __GFP_NOWARN | __GFP_NORETRY |
					     __GFP_NOMEMALLOC);
		if (!buf_desc->cpu_addr) {
			kfree(buf_desc);
			return ERR_PTR(-EAGAIN);
		}
		buf_desc->len = bufsize;
	}
	return buf_desc;
}
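
/* allocate (or reuse) a send buffer or RMB for a connection; starting from
 * half the corresponding socket buffer size, fall back to smaller compressed
 * sizes when a buffer of the desired size cannot be allocated
 */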
static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	struct list_head *buf_list;
	int bufsize, bufsize_short;
	int sk_buf_size;
	rwlock_t *lock;

	if (is_rmb)
		/* use socket recv buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_rcvbuf / 2;
	else
		/* use socket send buffer size (w/o overhead) as start value */
		sk_buf_size = smc->sk.sk_sndbuf / 2;

	for (bufsize_short = smc_compress_bufsize(sk_buf_size);
	     bufsize_short >= 0; bufsize_short--) {

		if (is_rmb) {
			lock = &lgr->rmbs_lock;
			buf_list = &lgr->rmbs[bufsize_short];
		} else {
			lock = &lgr->sndbufs_lock;
			buf_list = &lgr->sndbufs[bufsize_short];
		}
		bufsize = smc_uncompress_bufsize(bufsize_short);
		if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
			continue;

		/* check for reusable slot in the link group */
		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
		if (buf_desc) {
			memset(buf_desc->cpu_addr, 0, bufsize);
			break; /* found reusable slot */
		}

		if (is_smcd)
			buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
		else
			buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);

		if (PTR_ERR(buf_desc) == -ENOMEM)
			break;
		if (IS_ERR(buf_desc))
			continue;

		buf_desc->used = 1;
		write_lock_bh(lock);
		list_add(&buf_desc->list, buf_list);
		write_unlock_bh(lock);
		break; /* found */
	}

	if (IS_ERR(buf_desc))
		return -ENOMEM;

	if (is_rmb) {
		conn->rmb_desc = buf_desc;
		conn->rmbe_size_short = bufsize_short;
		smc->sk.sk_rcvbuf = bufsize * 2;
		atomic_set(&conn->bytes_to_rcv, 0);
		conn->rmbe_update_limit =
			smc_rmb_wnd_update_limit(buf_desc->len);
		if (is_smcd)
			smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
	} else {
		conn->sndbuf_desc = buf_desc;
		smc->sk.sk_sndbuf = bufsize * 2;
		atomic_set(&conn->sndbuf_space, bufsize);
	}
	return 0;
}

void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->sndbuf_desc, DMA_TO_DEVICE);
}

void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
			       conn->rmb_desc, DMA_FROM_DEVICE);
}

void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!conn->lgr || conn->lgr->is_smcd)
		return;
	smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				  conn->rmb_desc, DMA_FROM_DEVICE);
}

/* create the send and receive buffer for an SMC socket;
 * receive buffers are called RMBs;
 * (even though the SMC protocol allows more than one RMB-element per RMB,
 * the Linux implementation uses just one RMB-element per RMB, i.e. uses an
 * extra RMB for every connection in a link group)
 */
int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
	int rc;

	/* create send buffer */
	rc = __smc_buf_create(smc, is_smcd, false);
	if (rc)
		return rc;
	/* create rmb */
	rc = __smc_buf_create(smc, is_smcd, true);
	if (rc)
		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
	return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
	int i;

	for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
		if (!test_and_set_bit(i, lgr->rtokens_used_mask))
			return i;
	}
	return -ENOSPC;
}

/* add a new rtoken from peer */
int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
{
	u64 dma_addr = be64_to_cpu(nw_vaddr);
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			/* already in list */
			return i;
		}
	}
	i = smc_rmb_reserve_rtoken_idx(lgr);
	if (i < 0)
		return i;
	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
	return i;
}

/* delete an rtoken */
int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
{
	u32 rkey = ntohl(nw_rkey);
	int i;

	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
		    test_bit(i, lgr->rtokens_used_mask)) {
			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;

			clear_bit(i, lgr->rtokens_used_mask);
			return 0;
		}
	}
	return -ENOENT;
}

/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
			    struct smc_clc_msg_accept_confirm *clc)
{
	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
					  clc->rmb_rkey);
	if (conn->rtoken_idx < 0)
		return conn->rtoken_idx;
	return 0;
}

/* Called (from smc_exit) when module is removed */
void smc_core_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		if (!lgr->is_smcd) {
			struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];

			if (lnk->state == SMC_LNK_ACTIVE)
				smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
							 false);
			smc_llc_link_inactive(lnk);
		}
		cancel_delayed_work_sync(&lgr->free_work);
		if (lgr->is_smcd)
			smc_ism_signal_shutdown(lgr);
		smc_lgr_free(lgr);	/* free link group */
	}
}