/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * IB infrastructure:
 * Establish SMC-R as an Infiniband Client to be notified about added and
 * removed IB devices of type RDMA.
 * Determine device and port characteristics for these IB devices.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/random.h>
#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"

#define SMC_QP_MIN_RNR_TIMER            5
#define SMC_QP_TIMEOUT                  15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT                7 /* 7: infinite */
#define SMC_QP_RNR_RETRY                7 /* 7: infinite */

struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
        .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
        .list = LIST_HEAD_INIT(smc_ib_devices.list),
};

#define SMC_LOCAL_SYSTEMID_RESET        "%%%%%%%"

u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
                                                                 * identifier
                                                                 */

static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
        struct ib_qp_attr qp_attr;

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_state = IB_QPS_INIT;
        qp_attr.pkey_index = 0;
        qp_attr.port_num = lnk->ibport;
        qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
                                | IB_ACCESS_REMOTE_WRITE;
        return ib_modify_qp(lnk->roce_qp, &qp_attr,
                            IB_QP_STATE | IB_QP_PKEY_INDEX |
                            IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
        enum ib_qp_attr_mask qp_attr_mask =
                IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
                IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
        struct ib_qp_attr qp_attr;

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_state = IB_QPS_RTR;
        qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
        qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
        rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
        rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, 0, 1, 0);
        rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
        memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
               sizeof(lnk->peer_mac));
        qp_attr.dest_qp_num = lnk->peer_qpn;
        qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
        qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
                                         * requests
                                         */
        qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

        return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}

int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
        struct ib_qp_attr qp_attr;

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_state = IB_QPS_RTS;
        qp_attr.timeout = SMC_QP_TIMEOUT;       /* local ack timeout */
        qp_attr.retry_cnt = SMC_QP_RETRY_CNT;   /* retry count */
        qp_attr.rnr_retry = SMC_QP_RNR_RETRY;   /* RNR retries, 7=infinite */
        qp_attr.sq_psn = lnk->psn_initial;      /* starting send packet seq # */
        qp_attr.max_rd_atomic = 1;      /* # of outstanding RDMA reads and
                                         * atomic ops allowed
                                         */
        return ib_modify_qp(lnk->roce_qp, &qp_attr,
                            IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
                            IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
                            IB_QP_MAX_QP_RD_ATOMIC);
}

int smc_ib_modify_qp_reset(struct smc_link *lnk)
{
        struct ib_qp_attr qp_attr;

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_state = IB_QPS_RESET;
        return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

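/* Bring the link's RC queue pair into a usable state. The calls in
 * smc_ib_ready_link() below follow the usual verbs state sequence
 * RESET -> INIT -> RTR -> RTS:
 *   smc_ib_modify_qp_init() - pkey index, port and access flags
 *   smc_ib_modify_qp_rtr()  - peer GID/MAC/QPN/PSN; receive buffers are
 *                             posted while in RTR (smc_wr_rx_post_init())
 *   smc_ib_modify_qp_rts()  - done here only for the server role; the
 *                             function is exported, presumably so the
 *                             client role can be moved to RTS later in
 *                             the handshake
 */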
int smc_ib_ready_link(struct smc_link *lnk)
{
        struct smc_link_group *lgr =
                container_of(lnk, struct smc_link_group, lnk[0]);
        int rc = 0;

        rc = smc_ib_modify_qp_init(lnk);
        if (rc)
                goto out;

        rc = smc_ib_modify_qp_rtr(lnk);
        if (rc)
                goto out;
        smc_wr_remember_qp_attr(lnk);
        rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
                              IB_CQ_SOLICITED_MASK);
        if (rc)
                goto out;
        rc = smc_wr_rx_post_init(lnk);
        if (rc)
                goto out;
        smc_wr_remember_qp_attr(lnk);

        if (lgr->role == SMC_SERV) {
                rc = smc_ib_modify_qp_rts(lnk);
                if (rc)
                        goto out;
                smc_wr_remember_qp_attr(lnk);
        }
out:
        return rc;
}

/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
        struct smc_ib_device *smcibdev = container_of(
                work, struct smc_ib_device, port_event_work);
        u8 port_idx;

        for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
                smc_ib_remember_port_attr(smcibdev, port_idx + 1);
                clear_bit(port_idx, &smcibdev->port_event_mask);
        }
}

/* can be called in IRQ context */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
                                        struct ib_event *ibevent)
{
        struct smc_ib_device *smcibdev;
        u8 port_idx;

        smcibdev = container_of(handler, struct smc_ib_device, event_handler);

        switch (ibevent->event) {
        case IB_EVENT_PORT_ERR:
                port_idx = ibevent->element.port_num - 1;
                set_bit(port_idx, &smcibdev->port_event_mask);
                schedule_work(&smcibdev->port_event_work);
                /* fall through */
        case IB_EVENT_DEVICE_FATAL:
                /* tbd in follow-on patch:
                 * abnormal close of corresponding connections
                 */
                break;
        case IB_EVENT_PORT_ACTIVE:
                port_idx = ibevent->element.port_num - 1;
                set_bit(port_idx, &smcibdev->port_event_mask);
                schedule_work(&smcibdev->port_event_work);
                break;
        default:
                break;
        }
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
        ib_dealloc_pd(lnk->roce_pd);
        lnk->roce_pd = NULL;
}

int smc_ib_create_protection_domain(struct smc_link *lnk)
{
        int rc;

        lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev,
                                   IB_PD_UNSAFE_GLOBAL_RKEY);
        rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
        if (IS_ERR(lnk->roce_pd))
                lnk->roce_pd = NULL;
        return rc;
}

static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
        switch (ibevent->event) {
        case IB_EVENT_DEVICE_FATAL:
        case IB_EVENT_GID_CHANGE:
        case IB_EVENT_PORT_ERR:
        case IB_EVENT_QP_ACCESS_ERR:
                /* tbd in follow-on patch:
                 * abnormal close of corresponding connections
                 */
                break;
        default:
                break;
        }
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
        ib_destroy_qp(lnk->roce_qp);
        lnk->roce_qp = NULL;
}

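/* Completion handling note: the queue pair created below attaches to the
 * two per-device completion queues set up in smc_ib_setup_per_ibdev(),
 * so all links using one IB device share a single send CQ and a single
 * receive CQ. IB_SIGNAL_REQ_WR means only send work requests explicitly
 * flagged IB_SEND_SIGNALED generate send completions.
 */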
/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
        struct ib_qp_init_attr qp_attr = {
                .event_handler = smc_ib_qp_event_handler,
                .qp_context = lnk,
                .send_cq = lnk->smcibdev->roce_cq_send,
                .recv_cq = lnk->smcibdev->roce_cq_recv,
                .srq = NULL,
                .cap = {
                        .max_send_wr = SMC_WR_BUF_CNT,
                                /* include unsolicited rdma_writes as well,
                                 * there are max. 2 RDMA_WRITE per 1 WR_SEND
                                 */
                        .max_recv_wr = SMC_WR_BUF_CNT * 3,
                        .max_send_sge = SMC_IB_MAX_SEND_SGE,
                        .max_recv_sge = 1,
                },
                .sq_sig_type = IB_SIGNAL_REQ_WR,
                .qp_type = IB_QPT_RC,
        };
        int rc;

        lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
        rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
        if (IS_ERR(lnk->roce_qp))
                lnk->roce_qp = NULL;
        else
                smc_wr_remember_qp_attr(lnk);
        return rc;
}

/* map a new TX or RX buffer to DMA */
int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
                   struct smc_buf_desc *buf_slot,
                   enum dma_data_direction data_direction)
{
        int rc = 0;

        if (buf_slot->dma_addr[SMC_SINGLE_LINK])
                return rc; /* already mapped */
        buf_slot->dma_addr[SMC_SINGLE_LINK] =
                ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
                                  buf_size, data_direction);
        if (ib_dma_mapping_error(smcibdev->ibdev,
                                 buf_slot->dma_addr[SMC_SINGLE_LINK]))
                rc = -EIO;
        return rc;
}

void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
                      struct smc_buf_desc *buf_slot,
                      enum dma_data_direction data_direction)
{
        if (!buf_slot->dma_addr[SMC_SINGLE_LINK])
                return; /* already unmapped */
        ib_dma_unmap_single(smcibdev->ibdev,
                            buf_slot->dma_addr[SMC_SINGLE_LINK],
                            buf_size, data_direction);
        buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
}

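/* The MAC fallback below relies on the common RoCE convention that GID
 * index 0 is the EUI-64 mapping of the port MAC address: the MAC bytes
 * are found in gid.raw[8..10] and gid.raw[13..15], with 0xff/0xfe in
 * between and the universal/local bit (0x02) of the first byte flipped.
 * Clearing that bit again recovers the original address for universally
 * administered MACs.
 */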
static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
        struct net_device *ndev;
        int rc;

        rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
                          &smcibdev->gid[ibport - 1], NULL);
        /* the SMC protocol requires specification of the RoCE MAC address;
         * if net_device cannot be determined, it can be derived from gid 0
         */
        ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
        if (ndev) {
                memcpy(smcibdev->mac[ibport - 1], ndev->dev_addr, ETH_ALEN);
                dev_put(ndev);  /* release reference held by get_netdev() */
        } else if (!rc) {
                memcpy(&smcibdev->mac[ibport - 1][0],
                       &smcibdev->gid[ibport - 1].raw[8], 3);
                memcpy(&smcibdev->mac[ibport - 1][3],
                       &smcibdev->gid[ibport - 1].raw[13], 3);
                smcibdev->mac[ibport - 1][0] &= ~0x02;
        }
        return rc;
}

/* Create an identifier unique for this instance of SMC-R.
 * The MAC-address of the first active registered IB device
 * plus a random 2-byte number is used to create this identifier.
 * This name is delivered to the peer during connection initialization.
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
                                                u8 ibport)
{
        memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
               sizeof(smcibdev->mac[ibport - 1]));
        get_random_bytes(&local_systemid[0], 2);
}

bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
        return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
        int rc;

        memset(&smcibdev->pattr[ibport - 1], 0,
               sizeof(smcibdev->pattr[ibport - 1]));
        rc = ib_query_port(smcibdev->ibdev, ibport,
                           &smcibdev->pattr[ibport - 1]);
        if (rc)
                goto out;
        rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
        if (rc)
                goto out;
        if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
                     sizeof(local_systemid)) &&
            smc_ib_port_active(smcibdev, ibport))
                /* create unique system identifier */
                smc_ib_define_local_systemid(smcibdev, ibport);
out:
        return rc;
}

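/* Per-device initialization: create one send and one receive completion
 * queue shared by all links using this IB device, register the global
 * event handler and hook up the work request machinery via
 * smc_wr_add_dev(). The initialized flag lets smc_ib_cleanup_per_ibdev()
 * run only after a successful setup.
 */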
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
        struct ib_cq_init_attr cqattr = {
                .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
        long rc;

        smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
                                              smc_wr_tx_cq_handler, NULL,
                                              smcibdev, &cqattr);
        rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
        if (IS_ERR(smcibdev->roce_cq_send)) {
                smcibdev->roce_cq_send = NULL;
                return rc;
        }
        smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
                                              smc_wr_rx_cq_handler, NULL,
                                              smcibdev, &cqattr);
        rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
        if (IS_ERR(smcibdev->roce_cq_recv)) {
                smcibdev->roce_cq_recv = NULL;
                goto err;
        }
        INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
                              smc_ib_global_event_handler);
        ib_register_event_handler(&smcibdev->event_handler);
        smc_wr_add_dev(smcibdev);
        smcibdev->initialized = 1;
        return rc;

err:
        ib_destroy_cq(smcibdev->roce_cq_send);
        return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
        if (!smcibdev->initialized)
                return;
        smc_wr_remove_dev(smcibdev);
        ib_unregister_event_handler(&smcibdev->event_handler);
        ib_destroy_cq(smcibdev->roce_cq_recv);
        ib_destroy_cq(smcibdev->roce_cq_send);
}

static struct ib_client smc_ib_client;

/* callback function for ib_register_client() */
static void smc_ib_add_dev(struct ib_device *ibdev)
{
        struct smc_ib_device *smcibdev;

        if (ibdev->node_type != RDMA_NODE_IB_CA)
                return;

        smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
        if (!smcibdev)
                return;

        smcibdev->ibdev = ibdev;
        INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);

        spin_lock(&smc_ib_devices.lock);
        list_add_tail(&smcibdev->list, &smc_ib_devices.list);
        spin_unlock(&smc_ib_devices.lock);
        ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
}

/* callback function for ib_register_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
        struct smc_ib_device *smcibdev;

        smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
        ib_set_client_data(ibdev, &smc_ib_client, NULL);
        spin_lock(&smc_ib_devices.lock);
        list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
        spin_unlock(&smc_ib_devices.lock);
        smc_pnet_remove_by_ibdev(smcibdev);
        smc_ib_cleanup_per_ibdev(smcibdev);
        kfree(smcibdev);
}

static struct ib_client smc_ib_client = {
        .name   = "smc_ib",
        .add    = smc_ib_add_dev,
        .remove = smc_ib_remove_dev,
};

int __init smc_ib_register_client(void)
{
        return ib_register_client(&smc_ib_client);
}

void smc_ib_unregister_client(void)
{
        ib_unregister_client(&smc_ib_client);
}
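/* Registration notes: ib_register_client() invokes smc_ib_add_dev() for
 * every RDMA device already known to the IB core as well as for devices
 * added later; ib_unregister_client() triggers smc_ib_remove_dev() for
 * the remaining devices, so per-device cleanup also happens on module
 * unload.
 */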