1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2015 HGST, a Western Digital Company. 4 */ 5 #include <linux/module.h> 6 #include <linux/err.h> 7 #include <linux/slab.h> 8 #include <rdma/ib_verbs.h> 9 10 #include "core_priv.h" 11 12 #include <trace/events/rdma_core.h> 13 /* Max size for shared CQ, may require tuning */ 14 #define IB_MAX_SHARED_CQ_SZ 4096U 15 16 /* # of WCs to poll for with a single call to ib_poll_cq */ 17 #define IB_POLL_BATCH 16 18 #define IB_POLL_BATCH_DIRECT 8 19 20 /* # of WCs to iterate over before yielding */ 21 #define IB_POLL_BUDGET_IRQ 256 22 #define IB_POLL_BUDGET_WORKQUEUE 65536 23 24 #define IB_POLL_FLAGS \ 25 (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) 26 27 static const struct dim_cq_moder 28 rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { 29 {1, 0, 1, 0}, 30 {1, 0, 4, 0}, 31 {2, 0, 4, 0}, 32 {2, 0, 8, 0}, 33 {4, 0, 8, 0}, 34 {16, 0, 8, 0}, 35 {16, 0, 16, 0}, 36 {32, 0, 16, 0}, 37 {32, 0, 32, 0}, 38 }; 39 40 static void ib_cq_rdma_dim_work(struct work_struct *w) 41 { 42 struct dim *dim = container_of(w, struct dim, work); 43 struct ib_cq *cq = dim->priv; 44 45 u16 usec = rdma_dim_prof[dim->profile_ix].usec; 46 u16 comps = rdma_dim_prof[dim->profile_ix].comps; 47 48 dim->state = DIM_START_MEASURE; 49 50 trace_cq_modify(cq, comps, usec); 51 cq->device->ops.modify_cq(cq, comps, usec); 52 } 53 54 static void rdma_dim_init(struct ib_cq *cq) 55 { 56 struct dim *dim; 57 58 if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim || 59 cq->poll_ctx == IB_POLL_DIRECT) 60 return; 61 62 dim = kzalloc(sizeof(struct dim), GFP_KERNEL); 63 if (!dim) 64 return; 65 66 dim->state = DIM_START_MEASURE; 67 dim->tune_state = DIM_GOING_RIGHT; 68 dim->profile_ix = RDMA_DIM_START_PROFILE; 69 dim->priv = cq; 70 cq->dim = dim; 71 72 INIT_WORK(&dim->work, ib_cq_rdma_dim_work); 73 } 74 75 static void rdma_dim_destroy(struct ib_cq *cq) 76 { 77 if (!cq->dim) 78 return; 79 80 cancel_work_sync(&cq->dim->work); 81 kfree(cq->dim); 82 } 83 84 static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) 85 { 86 int rc; 87 88 rc = ib_poll_cq(cq, num_entries, wc); 89 trace_cq_poll(cq, num_entries, rc); 90 return rc; 91 } 92 93 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, 94 int batch) 95 { 96 int i, n, completed = 0; 97 98 trace_cq_process(cq); 99 100 /* 101 * budget might be (-1) if the caller does not 102 * want to bound this call, thus we need unsigned 103 * minimum here. 104 */ 105 while ((n = __poll_cq(cq, min_t(u32, batch, 106 budget - completed), wcs)) > 0) { 107 for (i = 0; i < n; i++) { 108 struct ib_wc *wc = &wcs[i]; 109 110 if (wc->wr_cqe) 111 wc->wr_cqe->done(cq, wc); 112 else 113 WARN_ON_ONCE(wc->status == IB_WC_SUCCESS); 114 } 115 116 completed += n; 117 118 if (n != batch || (budget != -1 && completed >= budget)) 119 break; 120 } 121 122 return completed; 123 } 124 125 /** 126 * ib_process_direct_cq - process a CQ in caller context 127 * @cq: CQ to process 128 * @budget: number of CQEs to poll for 129 * 130 * This function is used to process all outstanding CQ entries. 131 * It does not offload CQ processing to a different context and does 132 * not ask for completion interrupts from the HCA. 133 * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger 134 * concurrent processing. 135 * 136 * Note: do not pass -1 as %budget unless it is guaranteed that the number 137 * of completions that will be processed is small. 138 */ 139 int ib_process_cq_direct(struct ib_cq *cq, int budget) 140 { 141 struct ib_wc wcs[IB_POLL_BATCH_DIRECT]; 142 143 return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT); 144 } 145 EXPORT_SYMBOL(ib_process_cq_direct); 146 147 static void ib_cq_completion_direct(struct ib_cq *cq, void *private) 148 { 149 WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq); 150 } 151 152 static int ib_poll_handler(struct irq_poll *iop, int budget) 153 { 154 struct ib_cq *cq = container_of(iop, struct ib_cq, iop); 155 struct dim *dim = cq->dim; 156 int completed; 157 158 completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); 159 if (completed < budget) { 160 irq_poll_complete(&cq->iop); 161 if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { 162 trace_cq_reschedule(cq); 163 irq_poll_sched(&cq->iop); 164 } 165 } 166 167 if (dim) 168 rdma_dim(dim, completed); 169 170 return completed; 171 } 172 173 static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) 174 { 175 trace_cq_schedule(cq); 176 irq_poll_sched(&cq->iop); 177 } 178 179 static void ib_cq_poll_work(struct work_struct *work) 180 { 181 struct ib_cq *cq = container_of(work, struct ib_cq, work); 182 int completed; 183 184 completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc, 185 IB_POLL_BATCH); 186 if (completed >= IB_POLL_BUDGET_WORKQUEUE || 187 ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) 188 queue_work(cq->comp_wq, &cq->work); 189 else if (cq->dim) 190 rdma_dim(cq->dim, completed); 191 } 192 193 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) 194 { 195 trace_cq_schedule(cq); 196 queue_work(cq->comp_wq, &cq->work); 197 } 198 199 /** 200 * __ib_alloc_cq_user - allocate a completion queue 201 * @dev: device to allocate the CQ for 202 * @private: driver private data, accessible from cq->cq_context 203 * @nr_cqe: number of CQEs to allocate 204 * @comp_vector: HCA completion vectors for this CQ 205 * @poll_ctx: context to poll the CQ from. 206 * @caller: module owner name. 207 * @udata: Valid user data or NULL for kernel object 208 * 209 * This is the proper interface to allocate a CQ for in-kernel users. A 210 * CQ allocated with this interface will automatically be polled from the 211 * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id 212 * to use this CQ abstraction. 213 */ 214 struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, 215 int nr_cqe, int comp_vector, 216 enum ib_poll_context poll_ctx, 217 const char *caller, struct ib_udata *udata) 218 { 219 struct ib_cq_init_attr cq_attr = { 220 .cqe = nr_cqe, 221 .comp_vector = comp_vector, 222 }; 223 struct ib_cq *cq; 224 int ret = -ENOMEM; 225 226 cq = rdma_zalloc_drv_obj(dev, ib_cq); 227 if (!cq) 228 return ERR_PTR(ret); 229 230 cq->device = dev; 231 cq->cq_context = private; 232 cq->poll_ctx = poll_ctx; 233 atomic_set(&cq->usecnt, 0); 234 cq->comp_vector = comp_vector; 235 236 cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); 237 if (!cq->wc) 238 goto out_free_cq; 239 240 cq->res.type = RDMA_RESTRACK_CQ; 241 rdma_restrack_set_task(&cq->res, caller); 242 243 ret = dev->ops.create_cq(cq, &cq_attr, NULL); 244 if (ret) 245 goto out_free_wc; 246 247 rdma_restrack_kadd(&cq->res); 248 249 rdma_dim_init(cq); 250 251 switch (cq->poll_ctx) { 252 case IB_POLL_DIRECT: 253 cq->comp_handler = ib_cq_completion_direct; 254 break; 255 case IB_POLL_SOFTIRQ: 256 cq->comp_handler = ib_cq_completion_softirq; 257 258 irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); 259 ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); 260 break; 261 case IB_POLL_WORKQUEUE: 262 case IB_POLL_UNBOUND_WORKQUEUE: 263 cq->comp_handler = ib_cq_completion_workqueue; 264 INIT_WORK(&cq->work, ib_cq_poll_work); 265 ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); 266 cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ? 267 ib_comp_wq : ib_comp_unbound_wq; 268 break; 269 default: 270 ret = -EINVAL; 271 goto out_destroy_cq; 272 } 273 274 trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); 275 return cq; 276 277 out_destroy_cq: 278 rdma_dim_destroy(cq); 279 rdma_restrack_del(&cq->res); 280 cq->device->ops.destroy_cq(cq, udata); 281 out_free_wc: 282 kfree(cq->wc); 283 out_free_cq: 284 kfree(cq); 285 trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); 286 return ERR_PTR(ret); 287 } 288 EXPORT_SYMBOL(__ib_alloc_cq_user); 289 290 /** 291 * __ib_alloc_cq_any - allocate a completion queue 292 * @dev: device to allocate the CQ for 293 * @private: driver private data, accessible from cq->cq_context 294 * @nr_cqe: number of CQEs to allocate 295 * @poll_ctx: context to poll the CQ from 296 * @caller: module owner name 297 * 298 * Attempt to spread ULP Completion Queues over each device's interrupt 299 * vectors. A simple best-effort mechanism is used. 300 */ 301 struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, 302 int nr_cqe, enum ib_poll_context poll_ctx, 303 const char *caller) 304 { 305 static atomic_t counter; 306 int comp_vector = 0; 307 308 if (dev->num_comp_vectors > 1) 309 comp_vector = 310 atomic_inc_return(&counter) % 311 min_t(int, dev->num_comp_vectors, num_online_cpus()); 312 313 return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx, 314 caller, NULL); 315 } 316 EXPORT_SYMBOL(__ib_alloc_cq_any); 317 318 /** 319 * ib_free_cq_user - free a completion queue 320 * @cq: completion queue to free. 321 * @udata: User data or NULL for kernel object 322 */ 323 void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) 324 { 325 if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) 326 return; 327 if (WARN_ON_ONCE(cq->cqe_used)) 328 return; 329 330 switch (cq->poll_ctx) { 331 case IB_POLL_DIRECT: 332 break; 333 case IB_POLL_SOFTIRQ: 334 irq_poll_disable(&cq->iop); 335 break; 336 case IB_POLL_WORKQUEUE: 337 case IB_POLL_UNBOUND_WORKQUEUE: 338 cancel_work_sync(&cq->work); 339 break; 340 default: 341 WARN_ON_ONCE(1); 342 } 343 344 rdma_dim_destroy(cq); 345 trace_cq_free(cq); 346 rdma_restrack_del(&cq->res); 347 cq->device->ops.destroy_cq(cq, udata); 348 kfree(cq->wc); 349 kfree(cq); 350 } 351 EXPORT_SYMBOL(ib_free_cq_user); 352 353 void ib_cq_pool_init(struct ib_device *dev) 354 { 355 unsigned int i; 356 357 spin_lock_init(&dev->cq_pools_lock); 358 for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) 359 INIT_LIST_HEAD(&dev->cq_pools[i]); 360 } 361 362 void ib_cq_pool_destroy(struct ib_device *dev) 363 { 364 struct ib_cq *cq, *n; 365 unsigned int i; 366 367 for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) { 368 list_for_each_entry_safe(cq, n, &dev->cq_pools[i], 369 pool_entry) { 370 WARN_ON(cq->cqe_used); 371 cq->shared = false; 372 ib_free_cq(cq); 373 } 374 } 375 } 376 377 static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes, 378 enum ib_poll_context poll_ctx) 379 { 380 LIST_HEAD(tmp_list); 381 unsigned int nr_cqs, i; 382 struct ib_cq *cq, *n; 383 int ret; 384 385 if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { 386 WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); 387 return -EINVAL; 388 } 389 390 /* 391 * Allocate at least as many CQEs as requested, and otherwise 392 * a reasonable batch size so that we can share CQs between 393 * multiple users instead of allocating a larger number of CQs. 394 */ 395 nr_cqes = min_t(unsigned int, dev->attrs.max_cqe, 396 max(nr_cqes, IB_MAX_SHARED_CQ_SZ)); 397 nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); 398 for (i = 0; i < nr_cqs; i++) { 399 cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx); 400 if (IS_ERR(cq)) { 401 ret = PTR_ERR(cq); 402 goto out_free_cqs; 403 } 404 cq->shared = true; 405 list_add_tail(&cq->pool_entry, &tmp_list); 406 } 407 408 spin_lock_irq(&dev->cq_pools_lock); 409 list_splice(&tmp_list, &dev->cq_pools[poll_ctx]); 410 spin_unlock_irq(&dev->cq_pools_lock); 411 412 return 0; 413 414 out_free_cqs: 415 list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) { 416 cq->shared = false; 417 ib_free_cq(cq); 418 } 419 return ret; 420 } 421 422 /** 423 * ib_cq_pool_get() - Find the least used completion queue that matches 424 * a given cpu hint (or least used for wild card affinity) and fits 425 * nr_cqe. 426 * @dev: rdma device 427 * @nr_cqe: number of needed cqe entries 428 * @comp_vector_hint: completion vector hint (-1) for the driver to assign 429 * a comp vector based on internal counter 430 * @poll_ctx: cq polling context 431 * 432 * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and 433 * claim entries in it for us. In case there is no available cq, allocate 434 * a new cq with the requirements and add it to the device pool. 435 * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value 436 * for @poll_ctx. 437 */ 438 struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, 439 int comp_vector_hint, 440 enum ib_poll_context poll_ctx) 441 { 442 static unsigned int default_comp_vector; 443 unsigned int vector, num_comp_vectors; 444 struct ib_cq *cq, *found = NULL; 445 int ret; 446 447 if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { 448 WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); 449 return ERR_PTR(-EINVAL); 450 } 451 452 num_comp_vectors = 453 min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); 454 /* Project the affinty to the device completion vector range */ 455 if (comp_vector_hint < 0) { 456 comp_vector_hint = 457 (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors; 458 WRITE_ONCE(default_comp_vector, comp_vector_hint); 459 } 460 vector = comp_vector_hint % num_comp_vectors; 461 462 /* 463 * Find the least used CQ with correct affinity and 464 * enough free CQ entries 465 */ 466 while (!found) { 467 spin_lock_irq(&dev->cq_pools_lock); 468 list_for_each_entry(cq, &dev->cq_pools[poll_ctx], 469 pool_entry) { 470 /* 471 * Check to see if we have found a CQ with the 472 * correct completion vector 473 */ 474 if (vector != cq->comp_vector) 475 continue; 476 if (cq->cqe_used + nr_cqe > cq->cqe) 477 continue; 478 found = cq; 479 break; 480 } 481 482 if (found) { 483 found->cqe_used += nr_cqe; 484 spin_unlock_irq(&dev->cq_pools_lock); 485 486 return found; 487 } 488 spin_unlock_irq(&dev->cq_pools_lock); 489 490 /* 491 * Didn't find a match or ran out of CQs in the device 492 * pool, allocate a new array of CQs. 493 */ 494 ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx); 495 if (ret) 496 return ERR_PTR(ret); 497 } 498 499 return found; 500 } 501 EXPORT_SYMBOL(ib_cq_pool_get); 502 503 /** 504 * ib_cq_pool_put - Return a CQ taken from a shared pool. 505 * @cq: The CQ to return. 506 * @nr_cqe: The max number of cqes that the user had requested. 507 */ 508 void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe) 509 { 510 if (WARN_ON_ONCE(nr_cqe > cq->cqe_used)) 511 return; 512 513 spin_lock_irq(&cq->device->cq_pools_lock); 514 cq->cqe_used -= nr_cqe; 515 spin_unlock_irq(&cq->device->cq_pools_lock); 516 } 517 EXPORT_SYMBOL(ib_cq_pool_put); 518