1 /* 2 * RDMA resource limiting controller for cgroups. 3 * 4 * Used to allow a cgroup hierarchy to stop processes from consuming 5 * additional RDMA resources after a certain limit is reached. 6 * 7 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> 8 * 9 * This file is subject to the terms and conditions of version 2 of the GNU 10 * General Public License. See the file COPYING in the main directory of the 11 * Linux distribution for more details. 12 */ 13 14 #include <linux/bitops.h> 15 #include <linux/slab.h> 16 #include <linux/seq_file.h> 17 #include <linux/cgroup.h> 18 #include <linux/parser.h> 19 #include <linux/cgroup_rdma.h> 20 21 #define RDMACG_MAX_STR "max" 22 23 /* 24 * Protects list of resource pools maintained on per cgroup basis 25 * and rdma device list. 26 */ 27 static DEFINE_MUTEX(rdmacg_mutex); 28 static LIST_HEAD(rdmacg_devices); 29 30 enum rdmacg_file_type { 31 RDMACG_RESOURCE_TYPE_MAX, 32 RDMACG_RESOURCE_TYPE_STAT, 33 }; 34 35 /* 36 * resource table definition as to be seen by the user. 37 * Need to add entries to it when more resources are 38 * added/defined at IB verb/core layer. 39 */ 40 static char const *rdmacg_resource_names[] = { 41 [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", 42 [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", 43 }; 44 45 /* resource tracker for each resource of rdma cgroup */ 46 struct rdmacg_resource { 47 int max; 48 int usage; 49 }; 50 51 /* 52 * resource pool object which represents per cgroup, per device 53 * resources. There are multiple instances of this object per cgroup, 54 * therefore it cannot be embedded within rdma_cgroup structure. It 55 * is maintained as list. 56 */ 57 struct rdmacg_resource_pool { 58 struct rdmacg_device *device; 59 struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; 60 61 struct list_head cg_node; 62 struct list_head dev_node; 63 64 /* count active user tasks of this pool */ 65 u64 usage_sum; 66 /* total number counts which are set to max */ 67 int num_max_cnt; 68 }; 69 70 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) 71 { 72 return container_of(css, struct rdma_cgroup, css); 73 } 74 75 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) 76 { 77 return css_rdmacg(cg->css.parent); 78 } 79 80 static inline struct rdma_cgroup *get_current_rdmacg(void) 81 { 82 return css_rdmacg(task_get_css(current, rdma_cgrp_id)); 83 } 84 85 static void set_resource_limit(struct rdmacg_resource_pool *rpool, 86 int index, int new_max) 87 { 88 if (new_max == S32_MAX) { 89 if (rpool->resources[index].max != S32_MAX) 90 rpool->num_max_cnt++; 91 } else { 92 if (rpool->resources[index].max == S32_MAX) 93 rpool->num_max_cnt--; 94 } 95 rpool->resources[index].max = new_max; 96 } 97 98 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) 99 { 100 int i; 101 102 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) 103 set_resource_limit(rpool, i, S32_MAX); 104 } 105 106 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) 107 { 108 lockdep_assert_held(&rdmacg_mutex); 109 110 list_del(&rpool->cg_node); 111 list_del(&rpool->dev_node); 112 kfree(rpool); 113 } 114 115 static struct rdmacg_resource_pool * 116 find_cg_rpool_locked(struct rdma_cgroup *cg, 117 struct rdmacg_device *device) 118 119 { 120 struct rdmacg_resource_pool *pool; 121 122 lockdep_assert_held(&rdmacg_mutex); 123 124 list_for_each_entry(pool, &cg->rpools, cg_node) 125 if (pool->device == device) 126 return pool; 127 128 return NULL; 129 } 130 131 static struct rdmacg_resource_pool * 132 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) 133 { 134 struct rdmacg_resource_pool *rpool; 135 136 rpool = find_cg_rpool_locked(cg, device); 137 if (rpool) 138 return rpool; 139 140 rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); 141 if (!rpool) 142 return ERR_PTR(-ENOMEM); 143 144 rpool->device = device; 145 set_all_resource_max_limit(rpool); 146 147 INIT_LIST_HEAD(&rpool->cg_node); 148 INIT_LIST_HEAD(&rpool->dev_node); 149 list_add_tail(&rpool->cg_node, &cg->rpools); 150 list_add_tail(&rpool->dev_node, &device->rpools); 151 return rpool; 152 } 153 154 /** 155 * uncharge_cg_locked - uncharge resource for rdma cgroup 156 * @cg: pointer to cg to uncharge and all parents in hierarchy 157 * @device: pointer to rdmacg device 158 * @index: index of the resource to uncharge in cg (resource pool) 159 * 160 * It also frees the resource pool which was created as part of 161 * charging operation when there are no resources attached to 162 * resource pool. 163 */ 164 static void 165 uncharge_cg_locked(struct rdma_cgroup *cg, 166 struct rdmacg_device *device, 167 enum rdmacg_resource_type index) 168 { 169 struct rdmacg_resource_pool *rpool; 170 171 rpool = find_cg_rpool_locked(cg, device); 172 173 /* 174 * rpool cannot be null at this stage. Let kernel operate in case 175 * if there a bug in IB stack or rdma controller, instead of crashing 176 * the system. 177 */ 178 if (unlikely(!rpool)) { 179 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); 180 return; 181 } 182 183 rpool->resources[index].usage--; 184 185 /* 186 * A negative count (or overflow) is invalid, 187 * it indicates a bug in the rdma controller. 188 */ 189 WARN_ON_ONCE(rpool->resources[index].usage < 0); 190 rpool->usage_sum--; 191 if (rpool->usage_sum == 0 && 192 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 193 /* 194 * No user of the rpool and all entries are set to max, so 195 * safe to delete this rpool. 196 */ 197 free_cg_rpool_locked(rpool); 198 } 199 } 200 201 /** 202 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count 203 * @device: pointer to rdmacg device 204 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup 205 * stop uncharging 206 * @index: index of the resource to uncharge in cg in given resource pool 207 */ 208 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, 209 struct rdmacg_device *device, 210 struct rdma_cgroup *stop_cg, 211 enum rdmacg_resource_type index) 212 { 213 struct rdma_cgroup *p; 214 215 mutex_lock(&rdmacg_mutex); 216 217 for (p = cg; p != stop_cg; p = parent_rdmacg(p)) 218 uncharge_cg_locked(p, device, index); 219 220 mutex_unlock(&rdmacg_mutex); 221 222 css_put(&cg->css); 223 } 224 225 /** 226 * rdmacg_uncharge - hierarchically uncharge rdma resource count 227 * @device: pointer to rdmacg device 228 * @index: index of the resource to uncharge in cgroup in given resource pool 229 */ 230 void rdmacg_uncharge(struct rdma_cgroup *cg, 231 struct rdmacg_device *device, 232 enum rdmacg_resource_type index) 233 { 234 if (index >= RDMACG_RESOURCE_MAX) 235 return; 236 237 rdmacg_uncharge_hierarchy(cg, device, NULL, index); 238 } 239 EXPORT_SYMBOL(rdmacg_uncharge); 240 241 /** 242 * rdmacg_try_charge - hierarchically try to charge the rdma resource 243 * @rdmacg: pointer to rdma cgroup which will own this resource 244 * @device: pointer to rdmacg device 245 * @index: index of the resource to charge in cgroup (resource pool) 246 * 247 * This function follows charging resource in hierarchical way. 248 * It will fail if the charge would cause the new value to exceed the 249 * hierarchical limit. 250 * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. 251 * Returns pointer to rdmacg for this resource when charging is successful. 252 * 253 * Charger needs to account resources on two criteria. 254 * (a) per cgroup & (b) per device resource usage. 255 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross 256 * the configured limits. Per device provides granular configuration 257 * in multi device usage. It allocates resource pool in the hierarchy 258 * for each parent it come across for first resource. Later on resource 259 * pool will be available. Therefore it will be much faster thereon 260 * to charge/uncharge. 261 */ 262 int rdmacg_try_charge(struct rdma_cgroup **rdmacg, 263 struct rdmacg_device *device, 264 enum rdmacg_resource_type index) 265 { 266 struct rdma_cgroup *cg, *p; 267 struct rdmacg_resource_pool *rpool; 268 s64 new; 269 int ret = 0; 270 271 if (index >= RDMACG_RESOURCE_MAX) 272 return -EINVAL; 273 274 /* 275 * hold on to css, as cgroup can be removed but resource 276 * accounting happens on css. 277 */ 278 cg = get_current_rdmacg(); 279 280 mutex_lock(&rdmacg_mutex); 281 for (p = cg; p; p = parent_rdmacg(p)) { 282 rpool = get_cg_rpool_locked(p, device); 283 if (IS_ERR(rpool)) { 284 ret = PTR_ERR(rpool); 285 goto err; 286 } else { 287 new = rpool->resources[index].usage + 1; 288 if (new > rpool->resources[index].max) { 289 ret = -EAGAIN; 290 goto err; 291 } else { 292 rpool->resources[index].usage = new; 293 rpool->usage_sum++; 294 } 295 } 296 } 297 mutex_unlock(&rdmacg_mutex); 298 299 *rdmacg = cg; 300 return 0; 301 302 err: 303 mutex_unlock(&rdmacg_mutex); 304 rdmacg_uncharge_hierarchy(cg, device, p, index); 305 return ret; 306 } 307 EXPORT_SYMBOL(rdmacg_try_charge); 308 309 /** 310 * rdmacg_register_device - register rdmacg device to rdma controller. 311 * @device: pointer to rdmacg device whose resources need to be accounted. 312 * 313 * If IB stack wish a device to participate in rdma cgroup resource 314 * tracking, it must invoke this API to register with rdma cgroup before 315 * any user space application can start using the RDMA resources. 316 * Returns 0 on success or EINVAL when table length given is beyond 317 * supported size. 318 */ 319 int rdmacg_register_device(struct rdmacg_device *device) 320 { 321 INIT_LIST_HEAD(&device->dev_node); 322 INIT_LIST_HEAD(&device->rpools); 323 324 mutex_lock(&rdmacg_mutex); 325 list_add_tail(&device->dev_node, &rdmacg_devices); 326 mutex_unlock(&rdmacg_mutex); 327 return 0; 328 } 329 EXPORT_SYMBOL(rdmacg_register_device); 330 331 /** 332 * rdmacg_unregister_device - unregister rdmacg device from rdma controller. 333 * @device: pointer to rdmacg device which was previously registered with rdma 334 * controller using rdmacg_register_device(). 335 * 336 * IB stack must invoke this after all the resources of the IB device 337 * are destroyed and after ensuring that no more resources will be created 338 * when this API is invoked. 339 */ 340 void rdmacg_unregister_device(struct rdmacg_device *device) 341 { 342 struct rdmacg_resource_pool *rpool, *tmp; 343 344 /* 345 * Synchronize with any active resource settings, 346 * usage query happening via configfs. 347 */ 348 mutex_lock(&rdmacg_mutex); 349 list_del_init(&device->dev_node); 350 351 /* 352 * Now that this device is off the cgroup list, its safe to free 353 * all the rpool resources. 354 */ 355 list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) 356 free_cg_rpool_locked(rpool); 357 358 mutex_unlock(&rdmacg_mutex); 359 } 360 EXPORT_SYMBOL(rdmacg_unregister_device); 361 362 static int parse_resource(char *c, int *intval) 363 { 364 substring_t argstr; 365 char *name, *value = c; 366 size_t len; 367 int ret, i; 368 369 name = strsep(&value, "="); 370 if (!name || !value) 371 return -EINVAL; 372 373 i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); 374 if (i < 0) 375 return i; 376 377 len = strlen(value); 378 379 argstr.from = value; 380 argstr.to = value + len; 381 382 ret = match_int(&argstr, intval); 383 if (ret >= 0) { 384 if (*intval < 0) 385 return -EINVAL; 386 return i; 387 } 388 if (strncmp(value, RDMACG_MAX_STR, len) == 0) { 389 *intval = S32_MAX; 390 return i; 391 } 392 return -EINVAL; 393 } 394 395 static int rdmacg_parse_limits(char *options, 396 int *new_limits, unsigned long *enables) 397 { 398 char *c; 399 int err = -EINVAL; 400 401 /* parse resource options */ 402 while ((c = strsep(&options, " ")) != NULL) { 403 int index, intval; 404 405 index = parse_resource(c, &intval); 406 if (index < 0) 407 goto err; 408 409 new_limits[index] = intval; 410 *enables |= BIT(index); 411 } 412 return 0; 413 414 err: 415 return err; 416 } 417 418 static struct rdmacg_device *rdmacg_get_device_locked(const char *name) 419 { 420 struct rdmacg_device *device; 421 422 lockdep_assert_held(&rdmacg_mutex); 423 424 list_for_each_entry(device, &rdmacg_devices, dev_node) 425 if (!strcmp(name, device->name)) 426 return device; 427 428 return NULL; 429 } 430 431 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, 432 char *buf, size_t nbytes, loff_t off) 433 { 434 struct rdma_cgroup *cg = css_rdmacg(of_css(of)); 435 const char *dev_name; 436 struct rdmacg_resource_pool *rpool; 437 struct rdmacg_device *device; 438 char *options = strstrip(buf); 439 int *new_limits; 440 unsigned long enables = 0; 441 int i = 0, ret = 0; 442 443 /* extract the device name first */ 444 dev_name = strsep(&options, " "); 445 if (!dev_name) { 446 ret = -EINVAL; 447 goto err; 448 } 449 450 new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); 451 if (!new_limits) { 452 ret = -ENOMEM; 453 goto err; 454 } 455 456 ret = rdmacg_parse_limits(options, new_limits, &enables); 457 if (ret) 458 goto parse_err; 459 460 /* acquire lock to synchronize with hot plug devices */ 461 mutex_lock(&rdmacg_mutex); 462 463 device = rdmacg_get_device_locked(dev_name); 464 if (!device) { 465 ret = -ENODEV; 466 goto dev_err; 467 } 468 469 rpool = get_cg_rpool_locked(cg, device); 470 if (IS_ERR(rpool)) { 471 ret = PTR_ERR(rpool); 472 goto dev_err; 473 } 474 475 /* now set the new limits of the rpool */ 476 for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) 477 set_resource_limit(rpool, i, new_limits[i]); 478 479 if (rpool->usage_sum == 0 && 480 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 481 /* 482 * No user of the rpool and all entries are set to max, so 483 * safe to delete this rpool. 484 */ 485 free_cg_rpool_locked(rpool); 486 } 487 488 dev_err: 489 mutex_unlock(&rdmacg_mutex); 490 491 parse_err: 492 kfree(new_limits); 493 494 err: 495 return ret ?: nbytes; 496 } 497 498 static void print_rpool_values(struct seq_file *sf, 499 struct rdmacg_resource_pool *rpool) 500 { 501 enum rdmacg_file_type sf_type; 502 int i; 503 u32 value; 504 505 sf_type = seq_cft(sf)->private; 506 507 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { 508 seq_puts(sf, rdmacg_resource_names[i]); 509 seq_putc(sf, '='); 510 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { 511 if (rpool) 512 value = rpool->resources[i].max; 513 else 514 value = S32_MAX; 515 } else { 516 if (rpool) 517 value = rpool->resources[i].usage; 518 else 519 value = 0; 520 } 521 522 if (value == S32_MAX) 523 seq_puts(sf, RDMACG_MAX_STR); 524 else 525 seq_printf(sf, "%d", value); 526 seq_putc(sf, ' '); 527 } 528 } 529 530 static int rdmacg_resource_read(struct seq_file *sf, void *v) 531 { 532 struct rdmacg_device *device; 533 struct rdmacg_resource_pool *rpool; 534 struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); 535 536 mutex_lock(&rdmacg_mutex); 537 538 list_for_each_entry(device, &rdmacg_devices, dev_node) { 539 seq_printf(sf, "%s ", device->name); 540 541 rpool = find_cg_rpool_locked(cg, device); 542 print_rpool_values(sf, rpool); 543 544 seq_putc(sf, '\n'); 545 } 546 547 mutex_unlock(&rdmacg_mutex); 548 return 0; 549 } 550 551 static struct cftype rdmacg_files[] = { 552 { 553 .name = "max", 554 .write = rdmacg_resource_set_max, 555 .seq_show = rdmacg_resource_read, 556 .private = RDMACG_RESOURCE_TYPE_MAX, 557 .flags = CFTYPE_NOT_ON_ROOT, 558 }, 559 { 560 .name = "current", 561 .seq_show = rdmacg_resource_read, 562 .private = RDMACG_RESOURCE_TYPE_STAT, 563 .flags = CFTYPE_NOT_ON_ROOT, 564 }, 565 { } /* terminate */ 566 }; 567 568 static struct cgroup_subsys_state * 569 rdmacg_css_alloc(struct cgroup_subsys_state *parent) 570 { 571 struct rdma_cgroup *cg; 572 573 cg = kzalloc(sizeof(*cg), GFP_KERNEL); 574 if (!cg) 575 return ERR_PTR(-ENOMEM); 576 577 INIT_LIST_HEAD(&cg->rpools); 578 return &cg->css; 579 } 580 581 static void rdmacg_css_free(struct cgroup_subsys_state *css) 582 { 583 struct rdma_cgroup *cg = css_rdmacg(css); 584 585 kfree(cg); 586 } 587 588 /** 589 * rdmacg_css_offline - cgroup css_offline callback 590 * @css: css of interest 591 * 592 * This function is called when @css is about to go away and responsible 593 * for shooting down all rdmacg associated with @css. As part of that it 594 * marks all the resource pool entries to max value, so that when resources are 595 * uncharged, associated resource pool can be freed as well. 596 */ 597 static void rdmacg_css_offline(struct cgroup_subsys_state *css) 598 { 599 struct rdma_cgroup *cg = css_rdmacg(css); 600 struct rdmacg_resource_pool *rpool; 601 602 mutex_lock(&rdmacg_mutex); 603 604 list_for_each_entry(rpool, &cg->rpools, cg_node) 605 set_all_resource_max_limit(rpool); 606 607 mutex_unlock(&rdmacg_mutex); 608 } 609 610 struct cgroup_subsys rdma_cgrp_subsys = { 611 .css_alloc = rdmacg_css_alloc, 612 .css_free = rdmacg_css_free, 613 .css_offline = rdmacg_css_offline, 614 .legacy_cftypes = rdmacg_files, 615 .dfl_cftypes = rdmacg_files, 616 }; 617