1 /* 2 * RDMA resource limiting controller for cgroups. 3 * 4 * Used to allow a cgroup hierarchy to stop processes from consuming 5 * additional RDMA resources after a certain limit is reached. 6 * 7 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> 8 * 9 * This file is subject to the terms and conditions of version 2 of the GNU 10 * General Public License. See the file COPYING in the main directory of the 11 * Linux distribution for more details. 12 */ 13 14 #include <linux/bitops.h> 15 #include <linux/slab.h> 16 #include <linux/seq_file.h> 17 #include <linux/cgroup.h> 18 #include <linux/parser.h> 19 #include <linux/cgroup_rdma.h> 20 21 #define RDMACG_MAX_STR "max" 22 23 /* 24 * Protects list of resource pools maintained on per cgroup basis 25 * and rdma device list. 26 */ 27 static DEFINE_MUTEX(rdmacg_mutex); 28 static LIST_HEAD(rdmacg_devices); 29 30 enum rdmacg_file_type { 31 RDMACG_RESOURCE_TYPE_MAX, 32 RDMACG_RESOURCE_TYPE_STAT, 33 }; 34 35 /* 36 * resource table definition as to be seen by the user. 37 * Need to add entries to it when more resources are 38 * added/defined at IB verb/core layer. 39 */ 40 static char const *rdmacg_resource_names[] = { 41 [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", 42 [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", 43 }; 44 45 /* resource tracker for each resource of rdma cgroup */ 46 struct rdmacg_resource { 47 int max; 48 int usage; 49 }; 50 51 /* 52 * resource pool object which represents per cgroup, per device 53 * resources. There are multiple instances of this object per cgroup, 54 * therefore it cannot be embedded within rdma_cgroup structure. It 55 * is maintained as list. 56 */ 57 struct rdmacg_resource_pool { 58 struct rdmacg_device *device; 59 struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; 60 61 struct list_head cg_node; 62 struct list_head dev_node; 63 64 /* count active user tasks of this pool */ 65 u64 usage_sum; 66 /* total number counts which are set to max */ 67 int num_max_cnt; 68 }; 69 70 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) 71 { 72 return container_of(css, struct rdma_cgroup, css); 73 } 74 75 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) 76 { 77 return css_rdmacg(cg->css.parent); 78 } 79 80 static inline struct rdma_cgroup *get_current_rdmacg(void) 81 { 82 return css_rdmacg(task_get_css(current, rdma_cgrp_id)); 83 } 84 85 static void set_resource_limit(struct rdmacg_resource_pool *rpool, 86 int index, int new_max) 87 { 88 if (new_max == S32_MAX) { 89 if (rpool->resources[index].max != S32_MAX) 90 rpool->num_max_cnt++; 91 } else { 92 if (rpool->resources[index].max == S32_MAX) 93 rpool->num_max_cnt--; 94 } 95 rpool->resources[index].max = new_max; 96 } 97 98 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) 99 { 100 int i; 101 102 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) 103 set_resource_limit(rpool, i, S32_MAX); 104 } 105 106 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) 107 { 108 lockdep_assert_held(&rdmacg_mutex); 109 110 list_del(&rpool->cg_node); 111 list_del(&rpool->dev_node); 112 kfree(rpool); 113 } 114 115 static struct rdmacg_resource_pool * 116 find_cg_rpool_locked(struct rdma_cgroup *cg, 117 struct rdmacg_device *device) 118 119 { 120 struct rdmacg_resource_pool *pool; 121 122 lockdep_assert_held(&rdmacg_mutex); 123 124 list_for_each_entry(pool, &cg->rpools, cg_node) 125 if (pool->device == device) 126 return pool; 127 128 return NULL; 129 } 130 131 static struct rdmacg_resource_pool * 132 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) 133 { 134 struct rdmacg_resource_pool *rpool; 135 136 rpool = find_cg_rpool_locked(cg, device); 137 if (rpool) 138 return rpool; 139 140 rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); 141 if (!rpool) 142 return ERR_PTR(-ENOMEM); 143 144 rpool->device = device; 145 set_all_resource_max_limit(rpool); 146 147 INIT_LIST_HEAD(&rpool->cg_node); 148 INIT_LIST_HEAD(&rpool->dev_node); 149 list_add_tail(&rpool->cg_node, &cg->rpools); 150 list_add_tail(&rpool->dev_node, &device->rpools); 151 return rpool; 152 } 153 154 /** 155 * uncharge_cg_locked - uncharge resource for rdma cgroup 156 * @cg: pointer to cg to uncharge and all parents in hierarchy 157 * @device: pointer to rdmacg device 158 * @index: index of the resource to uncharge in cg (resource pool) 159 * 160 * It also frees the resource pool which was created as part of 161 * charging operation when there are no resources attached to 162 * resource pool. 163 */ 164 static void 165 uncharge_cg_locked(struct rdma_cgroup *cg, 166 struct rdmacg_device *device, 167 enum rdmacg_resource_type index) 168 { 169 struct rdmacg_resource_pool *rpool; 170 171 rpool = find_cg_rpool_locked(cg, device); 172 173 /* 174 * rpool cannot be null at this stage. Let kernel operate in case 175 * if there a bug in IB stack or rdma controller, instead of crashing 176 * the system. 177 */ 178 if (unlikely(!rpool)) { 179 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); 180 return; 181 } 182 183 rpool->resources[index].usage--; 184 185 /* 186 * A negative count (or overflow) is invalid, 187 * it indicates a bug in the rdma controller. 188 */ 189 WARN_ON_ONCE(rpool->resources[index].usage < 0); 190 rpool->usage_sum--; 191 if (rpool->usage_sum == 0 && 192 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 193 /* 194 * No user of the rpool and all entries are set to max, so 195 * safe to delete this rpool. 196 */ 197 free_cg_rpool_locked(rpool); 198 } 199 } 200 201 /** 202 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count 203 * @device: pointer to rdmacg device 204 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup 205 * stop uncharging 206 * @index: index of the resource to uncharge in cg in given resource pool 207 */ 208 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, 209 struct rdmacg_device *device, 210 struct rdma_cgroup *stop_cg, 211 enum rdmacg_resource_type index) 212 { 213 struct rdma_cgroup *p; 214 215 mutex_lock(&rdmacg_mutex); 216 217 for (p = cg; p != stop_cg; p = parent_rdmacg(p)) 218 uncharge_cg_locked(p, device, index); 219 220 mutex_unlock(&rdmacg_mutex); 221 222 css_put(&cg->css); 223 } 224 225 /** 226 * rdmacg_uncharge - hierarchically uncharge rdma resource count 227 * @device: pointer to rdmacg device 228 * @index: index of the resource to uncharge in cgroup in given resource pool 229 */ 230 void rdmacg_uncharge(struct rdma_cgroup *cg, 231 struct rdmacg_device *device, 232 enum rdmacg_resource_type index) 233 { 234 if (index >= RDMACG_RESOURCE_MAX) 235 return; 236 237 rdmacg_uncharge_hierarchy(cg, device, NULL, index); 238 } 239 EXPORT_SYMBOL(rdmacg_uncharge); 240 241 /** 242 * rdmacg_try_charge - hierarchically try to charge the rdma resource 243 * @rdmacg: pointer to rdma cgroup which will own this resource 244 * @device: pointer to rdmacg device 245 * @index: index of the resource to charge in cgroup (resource pool) 246 * 247 * This function follows charging resource in hierarchical way. 248 * It will fail if the charge would cause the new value to exceed the 249 * hierarchical limit. 250 * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. 251 * Returns pointer to rdmacg for this resource when charging is successful. 252 * 253 * Charger needs to account resources on two criteria. 254 * (a) per cgroup & (b) per device resource usage. 255 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross 256 * the configured limits. Per device provides granular configuration 257 * in multi device usage. It allocates resource pool in the hierarchy 258 * for each parent it come across for first resource. Later on resource 259 * pool will be available. Therefore it will be much faster thereon 260 * to charge/uncharge. 261 */ 262 int rdmacg_try_charge(struct rdma_cgroup **rdmacg, 263 struct rdmacg_device *device, 264 enum rdmacg_resource_type index) 265 { 266 struct rdma_cgroup *cg, *p; 267 struct rdmacg_resource_pool *rpool; 268 s64 new; 269 int ret = 0; 270 271 if (index >= RDMACG_RESOURCE_MAX) 272 return -EINVAL; 273 274 /* 275 * hold on to css, as cgroup can be removed but resource 276 * accounting happens on css. 277 */ 278 cg = get_current_rdmacg(); 279 280 mutex_lock(&rdmacg_mutex); 281 for (p = cg; p; p = parent_rdmacg(p)) { 282 rpool = get_cg_rpool_locked(p, device); 283 if (IS_ERR(rpool)) { 284 ret = PTR_ERR(rpool); 285 goto err; 286 } else { 287 new = rpool->resources[index].usage + 1; 288 if (new > rpool->resources[index].max) { 289 ret = -EAGAIN; 290 goto err; 291 } else { 292 rpool->resources[index].usage = new; 293 rpool->usage_sum++; 294 } 295 } 296 } 297 mutex_unlock(&rdmacg_mutex); 298 299 *rdmacg = cg; 300 return 0; 301 302 err: 303 mutex_unlock(&rdmacg_mutex); 304 rdmacg_uncharge_hierarchy(cg, device, p, index); 305 return ret; 306 } 307 EXPORT_SYMBOL(rdmacg_try_charge); 308 309 /** 310 * rdmacg_register_device - register rdmacg device to rdma controller. 311 * @device: pointer to rdmacg device whose resources need to be accounted. 312 * 313 * If IB stack wish a device to participate in rdma cgroup resource 314 * tracking, it must invoke this API to register with rdma cgroup before 315 * any user space application can start using the RDMA resources. 316 * Returns 0 on success or EINVAL when table length given is beyond 317 * supported size. 318 */ 319 int rdmacg_register_device(struct rdmacg_device *device) 320 { 321 INIT_LIST_HEAD(&device->dev_node); 322 INIT_LIST_HEAD(&device->rpools); 323 324 mutex_lock(&rdmacg_mutex); 325 list_add_tail(&device->dev_node, &rdmacg_devices); 326 mutex_unlock(&rdmacg_mutex); 327 return 0; 328 } 329 EXPORT_SYMBOL(rdmacg_register_device); 330 331 /** 332 * rdmacg_unregister_device - unregister rdmacg device from rdma controller. 333 * @device: pointer to rdmacg device which was previously registered with rdma 334 * controller using rdmacg_register_device(). 335 * 336 * IB stack must invoke this after all the resources of the IB device 337 * are destroyed and after ensuring that no more resources will be created 338 * when this API is invoked. 339 */ 340 void rdmacg_unregister_device(struct rdmacg_device *device) 341 { 342 struct rdmacg_resource_pool *rpool, *tmp; 343 344 /* 345 * Synchronize with any active resource settings, 346 * usage query happening via configfs. 347 */ 348 mutex_lock(&rdmacg_mutex); 349 list_del_init(&device->dev_node); 350 351 /* 352 * Now that this device is off the cgroup list, its safe to free 353 * all the rpool resources. 354 */ 355 list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) 356 free_cg_rpool_locked(rpool); 357 358 mutex_unlock(&rdmacg_mutex); 359 } 360 EXPORT_SYMBOL(rdmacg_unregister_device); 361 362 static int parse_resource(char *c, int *intval) 363 { 364 substring_t argstr; 365 const char **table = &rdmacg_resource_names[0]; 366 char *name, *value = c; 367 size_t len; 368 int ret, i = 0; 369 370 name = strsep(&value, "="); 371 if (!name || !value) 372 return -EINVAL; 373 374 len = strlen(value); 375 376 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { 377 if (strcmp(table[i], name)) 378 continue; 379 380 argstr.from = value; 381 argstr.to = value + len; 382 383 ret = match_int(&argstr, intval); 384 if (ret >= 0) { 385 if (*intval < 0) 386 break; 387 return i; 388 } 389 if (strncmp(value, RDMACG_MAX_STR, len) == 0) { 390 *intval = S32_MAX; 391 return i; 392 } 393 break; 394 } 395 return -EINVAL; 396 } 397 398 static int rdmacg_parse_limits(char *options, 399 int *new_limits, unsigned long *enables) 400 { 401 char *c; 402 int err = -EINVAL; 403 404 /* parse resource options */ 405 while ((c = strsep(&options, " ")) != NULL) { 406 int index, intval; 407 408 index = parse_resource(c, &intval); 409 if (index < 0) 410 goto err; 411 412 new_limits[index] = intval; 413 *enables |= BIT(index); 414 } 415 return 0; 416 417 err: 418 return err; 419 } 420 421 static struct rdmacg_device *rdmacg_get_device_locked(const char *name) 422 { 423 struct rdmacg_device *device; 424 425 lockdep_assert_held(&rdmacg_mutex); 426 427 list_for_each_entry(device, &rdmacg_devices, dev_node) 428 if (!strcmp(name, device->name)) 429 return device; 430 431 return NULL; 432 } 433 434 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, 435 char *buf, size_t nbytes, loff_t off) 436 { 437 struct rdma_cgroup *cg = css_rdmacg(of_css(of)); 438 const char *dev_name; 439 struct rdmacg_resource_pool *rpool; 440 struct rdmacg_device *device; 441 char *options = strstrip(buf); 442 int *new_limits; 443 unsigned long enables = 0; 444 int i = 0, ret = 0; 445 446 /* extract the device name first */ 447 dev_name = strsep(&options, " "); 448 if (!dev_name) { 449 ret = -EINVAL; 450 goto err; 451 } 452 453 new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); 454 if (!new_limits) { 455 ret = -ENOMEM; 456 goto err; 457 } 458 459 ret = rdmacg_parse_limits(options, new_limits, &enables); 460 if (ret) 461 goto parse_err; 462 463 /* acquire lock to synchronize with hot plug devices */ 464 mutex_lock(&rdmacg_mutex); 465 466 device = rdmacg_get_device_locked(dev_name); 467 if (!device) { 468 ret = -ENODEV; 469 goto dev_err; 470 } 471 472 rpool = get_cg_rpool_locked(cg, device); 473 if (IS_ERR(rpool)) { 474 ret = PTR_ERR(rpool); 475 goto dev_err; 476 } 477 478 /* now set the new limits of the rpool */ 479 for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) 480 set_resource_limit(rpool, i, new_limits[i]); 481 482 if (rpool->usage_sum == 0 && 483 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 484 /* 485 * No user of the rpool and all entries are set to max, so 486 * safe to delete this rpool. 487 */ 488 free_cg_rpool_locked(rpool); 489 } 490 491 dev_err: 492 mutex_unlock(&rdmacg_mutex); 493 494 parse_err: 495 kfree(new_limits); 496 497 err: 498 return ret ?: nbytes; 499 } 500 501 static void print_rpool_values(struct seq_file *sf, 502 struct rdmacg_resource_pool *rpool) 503 { 504 enum rdmacg_file_type sf_type; 505 int i; 506 u32 value; 507 508 sf_type = seq_cft(sf)->private; 509 510 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { 511 seq_puts(sf, rdmacg_resource_names[i]); 512 seq_putc(sf, '='); 513 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { 514 if (rpool) 515 value = rpool->resources[i].max; 516 else 517 value = S32_MAX; 518 } else { 519 if (rpool) 520 value = rpool->resources[i].usage; 521 else 522 value = 0; 523 } 524 525 if (value == S32_MAX) 526 seq_puts(sf, RDMACG_MAX_STR); 527 else 528 seq_printf(sf, "%d", value); 529 seq_putc(sf, ' '); 530 } 531 } 532 533 static int rdmacg_resource_read(struct seq_file *sf, void *v) 534 { 535 struct rdmacg_device *device; 536 struct rdmacg_resource_pool *rpool; 537 struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); 538 539 mutex_lock(&rdmacg_mutex); 540 541 list_for_each_entry(device, &rdmacg_devices, dev_node) { 542 seq_printf(sf, "%s ", device->name); 543 544 rpool = find_cg_rpool_locked(cg, device); 545 print_rpool_values(sf, rpool); 546 547 seq_putc(sf, '\n'); 548 } 549 550 mutex_unlock(&rdmacg_mutex); 551 return 0; 552 } 553 554 static struct cftype rdmacg_files[] = { 555 { 556 .name = "max", 557 .write = rdmacg_resource_set_max, 558 .seq_show = rdmacg_resource_read, 559 .private = RDMACG_RESOURCE_TYPE_MAX, 560 .flags = CFTYPE_NOT_ON_ROOT, 561 }, 562 { 563 .name = "current", 564 .seq_show = rdmacg_resource_read, 565 .private = RDMACG_RESOURCE_TYPE_STAT, 566 .flags = CFTYPE_NOT_ON_ROOT, 567 }, 568 { } /* terminate */ 569 }; 570 571 static struct cgroup_subsys_state * 572 rdmacg_css_alloc(struct cgroup_subsys_state *parent) 573 { 574 struct rdma_cgroup *cg; 575 576 cg = kzalloc(sizeof(*cg), GFP_KERNEL); 577 if (!cg) 578 return ERR_PTR(-ENOMEM); 579 580 INIT_LIST_HEAD(&cg->rpools); 581 return &cg->css; 582 } 583 584 static void rdmacg_css_free(struct cgroup_subsys_state *css) 585 { 586 struct rdma_cgroup *cg = css_rdmacg(css); 587 588 kfree(cg); 589 } 590 591 /** 592 * rdmacg_css_offline - cgroup css_offline callback 593 * @css: css of interest 594 * 595 * This function is called when @css is about to go away and responsible 596 * for shooting down all rdmacg associated with @css. As part of that it 597 * marks all the resource pool entries to max value, so that when resources are 598 * uncharged, associated resource pool can be freed as well. 599 */ 600 static void rdmacg_css_offline(struct cgroup_subsys_state *css) 601 { 602 struct rdma_cgroup *cg = css_rdmacg(css); 603 struct rdmacg_resource_pool *rpool; 604 605 mutex_lock(&rdmacg_mutex); 606 607 list_for_each_entry(rpool, &cg->rpools, cg_node) 608 set_all_resource_max_limit(rpool); 609 610 mutex_unlock(&rdmacg_mutex); 611 } 612 613 struct cgroup_subsys rdma_cgrp_subsys = { 614 .css_alloc = rdmacg_css_alloc, 615 .css_free = rdmacg_css_free, 616 .css_offline = rdmacg_css_offline, 617 .legacy_cftypes = rdmacg_files, 618 .dfl_cftypes = rdmacg_files, 619 }; 620