1 /* 2 * RDMA resource limiting controller for cgroups. 3 * 4 * Used to allow a cgroup hierarchy to stop processes from consuming 5 * additional RDMA resources after a certain limit is reached. 6 * 7 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> 8 * 9 * This file is subject to the terms and conditions of version 2 of the GNU 10 * General Public License. See the file COPYING in the main directory of the 11 * Linux distribution for more details. 12 */ 13 14 #include <linux/bitops.h> 15 #include <linux/slab.h> 16 #include <linux/seq_file.h> 17 #include <linux/cgroup.h> 18 #include <linux/parser.h> 19 #include <linux/cgroup_rdma.h> 20 21 #define RDMACG_MAX_STR "max" 22 23 /* 24 * Protects list of resource pools maintained on per cgroup basis 25 * and rdma device list. 26 */ 27 static DEFINE_MUTEX(rdmacg_mutex); 28 static LIST_HEAD(rdmacg_devices); 29 30 enum rdmacg_file_type { 31 RDMACG_RESOURCE_TYPE_MAX, 32 RDMACG_RESOURCE_TYPE_STAT, 33 }; 34 35 /* 36 * resource table definition as to be seen by the user. 37 * Need to add entries to it when more resources are 38 * added/defined at IB verb/core layer. 39 */ 40 static char const *rdmacg_resource_names[] = { 41 [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", 42 [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", 43 }; 44 45 /* resource tracker for each resource of rdma cgroup */ 46 struct rdmacg_resource { 47 int max; 48 int usage; 49 }; 50 51 /* 52 * resource pool object which represents per cgroup, per device 53 * resources. There are multiple instances of this object per cgroup, 54 * therefore it cannot be embedded within rdma_cgroup structure. It 55 * is maintained as list. 56 */ 57 struct rdmacg_resource_pool { 58 struct rdmacg_device *device; 59 struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; 60 61 struct list_head cg_node; 62 struct list_head dev_node; 63 64 /* count active user tasks of this pool */ 65 u64 usage_sum; 66 /* total number counts which are set to max */ 67 int num_max_cnt; 68 }; 69 70 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) 71 { 72 return container_of(css, struct rdma_cgroup, css); 73 } 74 75 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) 76 { 77 return css_rdmacg(cg->css.parent); 78 } 79 80 static inline struct rdma_cgroup *get_current_rdmacg(void) 81 { 82 return css_rdmacg(task_get_css(current, rdma_cgrp_id)); 83 } 84 85 static void set_resource_limit(struct rdmacg_resource_pool *rpool, 86 int index, int new_max) 87 { 88 if (new_max == S32_MAX) { 89 if (rpool->resources[index].max != S32_MAX) 90 rpool->num_max_cnt++; 91 } else { 92 if (rpool->resources[index].max == S32_MAX) 93 rpool->num_max_cnt--; 94 } 95 rpool->resources[index].max = new_max; 96 } 97 98 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) 99 { 100 int i; 101 102 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) 103 set_resource_limit(rpool, i, S32_MAX); 104 } 105 106 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) 107 { 108 lockdep_assert_held(&rdmacg_mutex); 109 110 list_del(&rpool->cg_node); 111 list_del(&rpool->dev_node); 112 kfree(rpool); 113 } 114 115 static struct rdmacg_resource_pool * 116 find_cg_rpool_locked(struct rdma_cgroup *cg, 117 struct rdmacg_device *device) 118 119 { 120 struct rdmacg_resource_pool *pool; 121 122 lockdep_assert_held(&rdmacg_mutex); 123 124 list_for_each_entry(pool, &cg->rpools, cg_node) 125 if (pool->device == device) 126 return pool; 127 128 return NULL; 129 } 130 131 static struct rdmacg_resource_pool * 132 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) 133 { 134 struct rdmacg_resource_pool *rpool; 135 136 rpool = find_cg_rpool_locked(cg, device); 137 if (rpool) 138 return rpool; 139 140 rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); 141 if (!rpool) 142 return ERR_PTR(-ENOMEM); 143 144 rpool->device = device; 145 set_all_resource_max_limit(rpool); 146 147 INIT_LIST_HEAD(&rpool->cg_node); 148 INIT_LIST_HEAD(&rpool->dev_node); 149 list_add_tail(&rpool->cg_node, &cg->rpools); 150 list_add_tail(&rpool->dev_node, &device->rpools); 151 return rpool; 152 } 153 154 /** 155 * uncharge_cg_locked - uncharge resource for rdma cgroup 156 * @cg: pointer to cg to uncharge and all parents in hierarchy 157 * @device: pointer to rdmacg device 158 * @index: index of the resource to uncharge in cg (resource pool) 159 * 160 * It also frees the resource pool which was created as part of 161 * charging operation when there are no resources attached to 162 * resource pool. 163 */ 164 static void 165 uncharge_cg_locked(struct rdma_cgroup *cg, 166 struct rdmacg_device *device, 167 enum rdmacg_resource_type index) 168 { 169 struct rdmacg_resource_pool *rpool; 170 171 rpool = find_cg_rpool_locked(cg, device); 172 173 /* 174 * rpool cannot be null at this stage. Let kernel operate in case 175 * if there a bug in IB stack or rdma controller, instead of crashing 176 * the system. 177 */ 178 if (unlikely(!rpool)) { 179 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); 180 return; 181 } 182 183 rpool->resources[index].usage--; 184 185 /* 186 * A negative count (or overflow) is invalid, 187 * it indicates a bug in the rdma controller. 188 */ 189 WARN_ON_ONCE(rpool->resources[index].usage < 0); 190 rpool->usage_sum--; 191 if (rpool->usage_sum == 0 && 192 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 193 /* 194 * No user of the rpool and all entries are set to max, so 195 * safe to delete this rpool. 196 */ 197 free_cg_rpool_locked(rpool); 198 } 199 } 200 201 /** 202 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count 203 * @device: pointer to rdmacg device 204 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup 205 * stop uncharging 206 * @index: index of the resource to uncharge in cg in given resource pool 207 */ 208 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, 209 struct rdmacg_device *device, 210 struct rdma_cgroup *stop_cg, 211 enum rdmacg_resource_type index) 212 { 213 struct rdma_cgroup *p; 214 215 mutex_lock(&rdmacg_mutex); 216 217 for (p = cg; p != stop_cg; p = parent_rdmacg(p)) 218 uncharge_cg_locked(p, device, index); 219 220 mutex_unlock(&rdmacg_mutex); 221 222 css_put(&cg->css); 223 } 224 225 /** 226 * rdmacg_uncharge - hierarchically uncharge rdma resource count 227 * @device: pointer to rdmacg device 228 * @index: index of the resource to uncharge in cgroup in given resource pool 229 */ 230 void rdmacg_uncharge(struct rdma_cgroup *cg, 231 struct rdmacg_device *device, 232 enum rdmacg_resource_type index) 233 { 234 if (index >= RDMACG_RESOURCE_MAX) 235 return; 236 237 rdmacg_uncharge_hierarchy(cg, device, NULL, index); 238 } 239 EXPORT_SYMBOL(rdmacg_uncharge); 240 241 /** 242 * rdmacg_try_charge - hierarchically try to charge the rdma resource 243 * @rdmacg: pointer to rdma cgroup which will own this resource 244 * @device: pointer to rdmacg device 245 * @index: index of the resource to charge in cgroup (resource pool) 246 * 247 * This function follows charging resource in hierarchical way. 248 * It will fail if the charge would cause the new value to exceed the 249 * hierarchical limit. 250 * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. 251 * Returns pointer to rdmacg for this resource when charging is successful. 252 * 253 * Charger needs to account resources on two criteria. 254 * (a) per cgroup & (b) per device resource usage. 255 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross 256 * the configured limits. Per device provides granular configuration 257 * in multi device usage. It allocates resource pool in the hierarchy 258 * for each parent it come across for first resource. Later on resource 259 * pool will be available. Therefore it will be much faster thereon 260 * to charge/uncharge. 261 */ 262 int rdmacg_try_charge(struct rdma_cgroup **rdmacg, 263 struct rdmacg_device *device, 264 enum rdmacg_resource_type index) 265 { 266 struct rdma_cgroup *cg, *p; 267 struct rdmacg_resource_pool *rpool; 268 s64 new; 269 int ret = 0; 270 271 if (index >= RDMACG_RESOURCE_MAX) 272 return -EINVAL; 273 274 /* 275 * hold on to css, as cgroup can be removed but resource 276 * accounting happens on css. 277 */ 278 cg = get_current_rdmacg(); 279 280 mutex_lock(&rdmacg_mutex); 281 for (p = cg; p; p = parent_rdmacg(p)) { 282 rpool = get_cg_rpool_locked(p, device); 283 if (IS_ERR(rpool)) { 284 ret = PTR_ERR(rpool); 285 goto err; 286 } else { 287 new = rpool->resources[index].usage + 1; 288 if (new > rpool->resources[index].max) { 289 ret = -EAGAIN; 290 goto err; 291 } else { 292 rpool->resources[index].usage = new; 293 rpool->usage_sum++; 294 } 295 } 296 } 297 mutex_unlock(&rdmacg_mutex); 298 299 *rdmacg = cg; 300 return 0; 301 302 err: 303 mutex_unlock(&rdmacg_mutex); 304 rdmacg_uncharge_hierarchy(cg, device, p, index); 305 return ret; 306 } 307 EXPORT_SYMBOL(rdmacg_try_charge); 308 309 /** 310 * rdmacg_register_device - register rdmacg device to rdma controller. 311 * @device: pointer to rdmacg device whose resources need to be accounted. 312 * 313 * If IB stack wish a device to participate in rdma cgroup resource 314 * tracking, it must invoke this API to register with rdma cgroup before 315 * any user space application can start using the RDMA resources. 316 */ 317 void rdmacg_register_device(struct rdmacg_device *device) 318 { 319 INIT_LIST_HEAD(&device->dev_node); 320 INIT_LIST_HEAD(&device->rpools); 321 322 mutex_lock(&rdmacg_mutex); 323 list_add_tail(&device->dev_node, &rdmacg_devices); 324 mutex_unlock(&rdmacg_mutex); 325 } 326 EXPORT_SYMBOL(rdmacg_register_device); 327 328 /** 329 * rdmacg_unregister_device - unregister rdmacg device from rdma controller. 330 * @device: pointer to rdmacg device which was previously registered with rdma 331 * controller using rdmacg_register_device(). 332 * 333 * IB stack must invoke this after all the resources of the IB device 334 * are destroyed and after ensuring that no more resources will be created 335 * when this API is invoked. 336 */ 337 void rdmacg_unregister_device(struct rdmacg_device *device) 338 { 339 struct rdmacg_resource_pool *rpool, *tmp; 340 341 /* 342 * Synchronize with any active resource settings, 343 * usage query happening via configfs. 344 */ 345 mutex_lock(&rdmacg_mutex); 346 list_del_init(&device->dev_node); 347 348 /* 349 * Now that this device is off the cgroup list, its safe to free 350 * all the rpool resources. 351 */ 352 list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) 353 free_cg_rpool_locked(rpool); 354 355 mutex_unlock(&rdmacg_mutex); 356 } 357 EXPORT_SYMBOL(rdmacg_unregister_device); 358 359 static int parse_resource(char *c, int *intval) 360 { 361 substring_t argstr; 362 char *name, *value = c; 363 size_t len; 364 int ret, i; 365 366 name = strsep(&value, "="); 367 if (!name || !value) 368 return -EINVAL; 369 370 i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); 371 if (i < 0) 372 return i; 373 374 len = strlen(value); 375 376 argstr.from = value; 377 argstr.to = value + len; 378 379 ret = match_int(&argstr, intval); 380 if (ret >= 0) { 381 if (*intval < 0) 382 return -EINVAL; 383 return i; 384 } 385 if (strncmp(value, RDMACG_MAX_STR, len) == 0) { 386 *intval = S32_MAX; 387 return i; 388 } 389 return -EINVAL; 390 } 391 392 static int rdmacg_parse_limits(char *options, 393 int *new_limits, unsigned long *enables) 394 { 395 char *c; 396 int err = -EINVAL; 397 398 /* parse resource options */ 399 while ((c = strsep(&options, " ")) != NULL) { 400 int index, intval; 401 402 index = parse_resource(c, &intval); 403 if (index < 0) 404 goto err; 405 406 new_limits[index] = intval; 407 *enables |= BIT(index); 408 } 409 return 0; 410 411 err: 412 return err; 413 } 414 415 static struct rdmacg_device *rdmacg_get_device_locked(const char *name) 416 { 417 struct rdmacg_device *device; 418 419 lockdep_assert_held(&rdmacg_mutex); 420 421 list_for_each_entry(device, &rdmacg_devices, dev_node) 422 if (!strcmp(name, device->name)) 423 return device; 424 425 return NULL; 426 } 427 428 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, 429 char *buf, size_t nbytes, loff_t off) 430 { 431 struct rdma_cgroup *cg = css_rdmacg(of_css(of)); 432 const char *dev_name; 433 struct rdmacg_resource_pool *rpool; 434 struct rdmacg_device *device; 435 char *options = strstrip(buf); 436 int *new_limits; 437 unsigned long enables = 0; 438 int i = 0, ret = 0; 439 440 /* extract the device name first */ 441 dev_name = strsep(&options, " "); 442 if (!dev_name) { 443 ret = -EINVAL; 444 goto err; 445 } 446 447 new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); 448 if (!new_limits) { 449 ret = -ENOMEM; 450 goto err; 451 } 452 453 ret = rdmacg_parse_limits(options, new_limits, &enables); 454 if (ret) 455 goto parse_err; 456 457 /* acquire lock to synchronize with hot plug devices */ 458 mutex_lock(&rdmacg_mutex); 459 460 device = rdmacg_get_device_locked(dev_name); 461 if (!device) { 462 ret = -ENODEV; 463 goto dev_err; 464 } 465 466 rpool = get_cg_rpool_locked(cg, device); 467 if (IS_ERR(rpool)) { 468 ret = PTR_ERR(rpool); 469 goto dev_err; 470 } 471 472 /* now set the new limits of the rpool */ 473 for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) 474 set_resource_limit(rpool, i, new_limits[i]); 475 476 if (rpool->usage_sum == 0 && 477 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 478 /* 479 * No user of the rpool and all entries are set to max, so 480 * safe to delete this rpool. 481 */ 482 free_cg_rpool_locked(rpool); 483 } 484 485 dev_err: 486 mutex_unlock(&rdmacg_mutex); 487 488 parse_err: 489 kfree(new_limits); 490 491 err: 492 return ret ?: nbytes; 493 } 494 495 static void print_rpool_values(struct seq_file *sf, 496 struct rdmacg_resource_pool *rpool) 497 { 498 enum rdmacg_file_type sf_type; 499 int i; 500 u32 value; 501 502 sf_type = seq_cft(sf)->private; 503 504 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { 505 seq_puts(sf, rdmacg_resource_names[i]); 506 seq_putc(sf, '='); 507 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { 508 if (rpool) 509 value = rpool->resources[i].max; 510 else 511 value = S32_MAX; 512 } else { 513 if (rpool) 514 value = rpool->resources[i].usage; 515 else 516 value = 0; 517 } 518 519 if (value == S32_MAX) 520 seq_puts(sf, RDMACG_MAX_STR); 521 else 522 seq_printf(sf, "%d", value); 523 seq_putc(sf, ' '); 524 } 525 } 526 527 static int rdmacg_resource_read(struct seq_file *sf, void *v) 528 { 529 struct rdmacg_device *device; 530 struct rdmacg_resource_pool *rpool; 531 struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); 532 533 mutex_lock(&rdmacg_mutex); 534 535 list_for_each_entry(device, &rdmacg_devices, dev_node) { 536 seq_printf(sf, "%s ", device->name); 537 538 rpool = find_cg_rpool_locked(cg, device); 539 print_rpool_values(sf, rpool); 540 541 seq_putc(sf, '\n'); 542 } 543 544 mutex_unlock(&rdmacg_mutex); 545 return 0; 546 } 547 548 static struct cftype rdmacg_files[] = { 549 { 550 .name = "max", 551 .write = rdmacg_resource_set_max, 552 .seq_show = rdmacg_resource_read, 553 .private = RDMACG_RESOURCE_TYPE_MAX, 554 .flags = CFTYPE_NOT_ON_ROOT, 555 }, 556 { 557 .name = "current", 558 .seq_show = rdmacg_resource_read, 559 .private = RDMACG_RESOURCE_TYPE_STAT, 560 .flags = CFTYPE_NOT_ON_ROOT, 561 }, 562 { } /* terminate */ 563 }; 564 565 static struct cgroup_subsys_state * 566 rdmacg_css_alloc(struct cgroup_subsys_state *parent) 567 { 568 struct rdma_cgroup *cg; 569 570 cg = kzalloc(sizeof(*cg), GFP_KERNEL); 571 if (!cg) 572 return ERR_PTR(-ENOMEM); 573 574 INIT_LIST_HEAD(&cg->rpools); 575 return &cg->css; 576 } 577 578 static void rdmacg_css_free(struct cgroup_subsys_state *css) 579 { 580 struct rdma_cgroup *cg = css_rdmacg(css); 581 582 kfree(cg); 583 } 584 585 /** 586 * rdmacg_css_offline - cgroup css_offline callback 587 * @css: css of interest 588 * 589 * This function is called when @css is about to go away and responsible 590 * for shooting down all rdmacg associated with @css. As part of that it 591 * marks all the resource pool entries to max value, so that when resources are 592 * uncharged, associated resource pool can be freed as well. 593 */ 594 static void rdmacg_css_offline(struct cgroup_subsys_state *css) 595 { 596 struct rdma_cgroup *cg = css_rdmacg(css); 597 struct rdmacg_resource_pool *rpool; 598 599 mutex_lock(&rdmacg_mutex); 600 601 list_for_each_entry(rpool, &cg->rpools, cg_node) 602 set_all_resource_max_limit(rpool); 603 604 mutex_unlock(&rdmacg_mutex); 605 } 606 607 struct cgroup_subsys rdma_cgrp_subsys = { 608 .css_alloc = rdmacg_css_alloc, 609 .css_free = rdmacg_css_free, 610 .css_offline = rdmacg_css_offline, 611 .legacy_cftypes = rdmacg_files, 612 .dfl_cftypes = rdmacg_files, 613 }; 614