/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * IB infrastructure:
 * Establish SMC-R as an Infiniband Client to be notified about added and
 * removed IB devices of type RDMA.
 * Determine device and port characteristics for these IB devices.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/random.h>
#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"

#define SMC_QP_MIN_RNR_TIMER		5
#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT		7 /* 7: infinite */
#define SMC_QP_RNR_RETRY		7 /* 7: infinite */

struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
	.lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
	.list = LIST_HEAD_INIT(smc_ib_devices.list),
};

#define SMC_LOCAL_SYSTEMID_RESET	"%%%%%%%"

u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET;	/* unique system
								 * identifier
								 */

int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
			     struct ib_mr **mr)
{
	int rc;

	if (*mr)
		return 0; /* already done */

	/* obtain unique key -
	 * next invocation of get_dma_mr returns a different key!
	 */
	*mr = pd->device->get_dma_mr(pd, access_flags);
	rc = PTR_ERR_OR_ZERO(*mr);
	if (IS_ERR(*mr))
		*mr = NULL;
	return rc;
}

static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.pkey_index = 0;
	qp_attr.port_num = lnk->ibport;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
				| IB_ACCESS_REMOTE_WRITE;
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_PKEY_INDEX |
			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
	enum ib_qp_attr_mask qp_attr_mask =
		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTR;
	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
	qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
	rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, 0, 1, 0);
	rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
	memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
	       sizeof(lnk->peer_mac));
	qp_attr.dest_qp_num = lnk->peer_qpn;
	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
					 * requests
					 */
	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}

int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTS;
	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
					 * atomic ops allowed
					 */
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
			    IB_QP_MAX_QP_RD_ATOMIC);
}

int smc_ib_modify_qp_reset(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RESET;
	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

int smc_ib_ready_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr =
		container_of(lnk, struct smc_link_group, lnk[0]);
	int rc = 0;

	rc = smc_ib_modify_qp_init(lnk);
	if (rc)
		goto out;

	rc = smc_ib_modify_qp_rtr(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);
	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
			      IB_CQ_SOLICITED_MASK);
	if (rc)
		goto out;
	rc = smc_wr_rx_post_init(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);

	if (lgr->role == SMC_SERV) {
		rc = smc_ib_modify_qp_rts(lnk);
		if (rc)
			goto out;
		smc_wr_remember_qp_attr(lnk);
	}
out:
	return rc;
}

/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
	struct smc_ib_device *smcibdev = container_of(
		work, struct smc_ib_device, port_event_work);
	u8 port_idx;

	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
		clear_bit(port_idx, &smcibdev->port_event_mask);
	}
}

/* can be called in IRQ context */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
					struct ib_event *ibevent)
{
	struct smc_ib_device *smcibdev;
	u8 port_idx;

	smcibdev = container_of(handler, struct smc_ib_device, event_handler);

	switch (ibevent->event) {
	case IB_EVENT_PORT_ERR:
		port_idx = ibevent->element.port_num - 1;
		set_bit(port_idx, &smcibdev->port_event_mask);
		schedule_work(&smcibdev->port_event_work);
		/* fall through */
	case IB_EVENT_DEVICE_FATAL:
		/* tbd in follow-on patch:
		 * abnormal close of corresponding connections
		 */
		break;
	case IB_EVENT_PORT_ACTIVE:
		port_idx = ibevent->element.port_num - 1;
		set_bit(port_idx, &smcibdev->port_event_mask);
		schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
	ib_dealloc_pd(lnk->roce_pd);
	lnk->roce_pd = NULL;
}

int smc_ib_create_protection_domain(struct smc_link *lnk)
{
	int rc;

	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
	if (IS_ERR(lnk->roce_pd))
		lnk->roce_pd = NULL;
	return rc;
}

static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
	switch (ibevent->event) {
	case IB_EVENT_DEVICE_FATAL:
	case IB_EVENT_GID_CHANGE:
	case IB_EVENT_PORT_ERR:
	case IB_EVENT_QP_ACCESS_ERR:
		/* tbd in follow-on patch:
		 * abnormal close of corresponding connections
		 */
		break;
	default:
		break;
	}
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
	ib_destroy_qp(lnk->roce_qp);
	lnk->roce_qp = NULL;
}

/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
lnk, 248 .send_cq = lnk->smcibdev->roce_cq_send, 249 .recv_cq = lnk->smcibdev->roce_cq_recv, 250 .srq = NULL, 251 .cap = { 252 .max_send_wr = SMC_WR_BUF_CNT, 253 /* include unsolicited rdma_writes as well, 254 * there are max. 2 RDMA_WRITE per 1 WR_SEND 255 */ 256 .max_recv_wr = SMC_WR_BUF_CNT * 3, 257 .max_send_sge = SMC_IB_MAX_SEND_SGE, 258 .max_recv_sge = 1, 259 }, 260 .sq_sig_type = IB_SIGNAL_REQ_WR, 261 .qp_type = IB_QPT_RC, 262 }; 263 int rc; 264 265 lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); 266 rc = PTR_ERR_OR_ZERO(lnk->roce_qp); 267 if (IS_ERR(lnk->roce_qp)) 268 lnk->roce_qp = NULL; 269 else 270 smc_wr_remember_qp_attr(lnk); 271 return rc; 272 } 273 274 /* map a new TX or RX buffer to DMA */ 275 int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, 276 struct smc_buf_desc *buf_slot, 277 enum dma_data_direction data_direction) 278 { 279 int rc = 0; 280 281 if (buf_slot->dma_addr[SMC_SINGLE_LINK]) 282 return rc; /* already mapped */ 283 buf_slot->dma_addr[SMC_SINGLE_LINK] = 284 ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr, 285 buf_size, data_direction); 286 if (ib_dma_mapping_error(smcibdev->ibdev, 287 buf_slot->dma_addr[SMC_SINGLE_LINK])) 288 rc = -EIO; 289 return rc; 290 } 291 292 void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size, 293 struct smc_buf_desc *buf_slot, 294 enum dma_data_direction data_direction) 295 { 296 if (!buf_slot->dma_addr[SMC_SINGLE_LINK]) 297 return; /* already unmapped */ 298 ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size, 299 data_direction); 300 buf_slot->dma_addr[SMC_SINGLE_LINK] = 0; 301 } 302 303 static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) 304 { 305 struct net_device *ndev; 306 int rc; 307 308 rc = ib_query_gid(smcibdev->ibdev, ibport, 0, 309 &smcibdev->gid[ibport - 1], NULL); 310 /* the SMC protocol requires specification of the roce MAC address; 311 * if net_device cannot be determined, it can be derived from gid 0 312 */ 313 ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); 314 if (ndev) { 315 memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); 316 } else if (!rc) { 317 memcpy(&smcibdev->mac[ibport - 1][0], 318 &smcibdev->gid[ibport - 1].raw[8], 3); 319 memcpy(&smcibdev->mac[ibport - 1][3], 320 &smcibdev->gid[ibport - 1].raw[13], 3); 321 smcibdev->mac[ibport - 1][0] &= ~0x02; 322 } 323 return rc; 324 } 325 326 /* Create an identifier unique for this instance of SMC-R. 327 * The MAC-address of the first active registered IB device 328 * plus a random 2-byte number is used to create this identifier. 329 * This name is delivered to the peer during connection initialization. 
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
						u8 ibport)
{
	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
	       sizeof(smcibdev->mac[ibport - 1]));
	get_random_bytes(&local_systemid[0], 2);
}

bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
	int rc;

	memset(&smcibdev->pattr[ibport - 1], 0,
	       sizeof(smcibdev->pattr[ibport - 1]));
	rc = ib_query_port(smcibdev->ibdev, ibport,
			   &smcibdev->pattr[ibport - 1]);
	if (rc)
		goto out;
	rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
	if (rc)
		goto out;
	if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
		     sizeof(local_systemid)) &&
	    smc_ib_port_active(smcibdev, ibport))
		/* create unique system identifier */
		smc_ib_define_local_systemid(smcibdev, ibport);
out:
	return rc;
}

long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
	struct ib_cq_init_attr cqattr = {
		.cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
	long rc;

	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
					      smc_wr_tx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
	if (IS_ERR(smcibdev->roce_cq_send)) {
		smcibdev->roce_cq_send = NULL;
		return rc;
	}
	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
					      smc_wr_rx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
	if (IS_ERR(smcibdev->roce_cq_recv)) {
		smcibdev->roce_cq_recv = NULL;
		goto err;
	}
	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
			      smc_ib_global_event_handler);
	ib_register_event_handler(&smcibdev->event_handler);
	smc_wr_add_dev(smcibdev);
	smcibdev->initialized = 1;
	return rc;

err:
	ib_destroy_cq(smcibdev->roce_cq_send);
	return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
	if (!smcibdev->initialized)
		return;
	smc_wr_remove_dev(smcibdev);
	ib_unregister_event_handler(&smcibdev->event_handler);
	ib_destroy_cq(smcibdev->roce_cq_recv);
	ib_destroy_cq(smcibdev->roce_cq_send);
}

static struct ib_client smc_ib_client;

/* callback function for ib_register_client() */
static void smc_ib_add_dev(struct ib_device *ibdev)
{
	struct smc_ib_device *smcibdev;

	if (ibdev->node_type != RDMA_NODE_IB_CA)
		return;

	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
	if (!smcibdev)
		return;

	smcibdev->ibdev = ibdev;
	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);

	spin_lock(&smc_ib_devices.lock);
	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
	spin_unlock(&smc_ib_devices.lock);
	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
}

/* callback function for ib_register_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
	struct smc_ib_device *smcibdev;

	smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
	ib_set_client_data(ibdev, &smc_ib_client, NULL);
	spin_lock(&smc_ib_devices.lock);
	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
	spin_unlock(&smc_ib_devices.lock);
	smc_pnet_remove_by_ibdev(smcibdev);
	smc_ib_cleanup_per_ibdev(smcibdev);
	kfree(smcibdev);
}

static struct ib_client smc_ib_client = {
	.name	= "smc_ib",
	.add	= smc_ib_add_dev,
	.remove = smc_ib_remove_dev,
};

int __init smc_ib_register_client(void)
{
	return ib_register_client(&smc_ib_client);
}

void smc_ib_unregister_client(void)
{
	ib_unregister_client(&smc_ib_client);
}