/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * Cgroup v2
 * Copyright (C) 2019 Red Hat, Inc.
 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline struct page_counter *
__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
				     bool rsvd)
{
	if (rsvd)
		return &h_cg->rsvd_hugepage[idx];
	return &h_cg->hugepage[idx];
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
}

static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
	return hugetlb_cgroup_from_css(h_cg->css.parent);
}

static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (page_counter_read(
			    hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
			return true;
	}
	return false;
}
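/*
 * Each cgroup keeps two hierarchical page_counters per hstate: hugepage[]
 * accounts huge pages actually faulted in, and rsvd_hugepage[] accounts huge
 * page reservations. Both are initialised against the parent's counter so
 * that charges propagate up the hierarchy. The maximum is rounded down to a
 * whole number of huge pages; e.g. for a 2 MB hstate with 4 KB base pages,
 * PAGE_COUNTER_MAX is rounded down to a multiple of 512.
 */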
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
				struct hugetlb_cgroup *parent_h_cgroup)
{
	int idx;

	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
		struct page_counter *fault_parent = NULL;
		struct page_counter *rsvd_parent = NULL;
		unsigned long limit;
		int ret;

		if (parent_h_cgroup) {
			fault_parent = hugetlb_cgroup_counter_from_cgroup(
				parent_h_cgroup, idx);
			rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
				parent_h_cgroup, idx);
		}
		page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
								     idx),
				  fault_parent);
		page_counter_init(
			hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
			rsvd_parent);

		limit = round_down(PAGE_COUNTER_MAX,
				   pages_per_huge_page(&hstates[idx]));

		ret = page_counter_set_max(
			hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
			limit);
		VM_BUG_ON(ret);
		ret = page_counter_set_max(
			hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
			limit);
		VM_BUG_ON(ret);
	}
}

static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
{
	int node;

	for_each_node(node)
		kfree(h_cgroup->nodeinfo[node]);
	kfree(h_cgroup);
}

static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;
	int node;

	h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
			   GFP_KERNEL);

	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	if (!parent_h_cgroup)
		root_h_cgroup = h_cgroup;

	/*
	 * TODO: this routine can waste much memory for nodes which will
	 * never be onlined. It's better to use memory hotplug callback
	 * function.
	 */
	for_each_node(node) {
		/* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
		int node_to_alloc =
			node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
		h_cgroup->nodeinfo[node] =
			kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
				     GFP_KERNEL, node_to_alloc);
		if (!h_cgroup->nodeinfo[node])
			goto fail_alloc_nodeinfo;
	}

	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
	return &h_cgroup->css;

fail_alloc_nodeinfo:
	hugetlb_cgroup_free(h_cgroup);
	return ERR_PTR(-ENOMEM);
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}
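/*
 * Reparenting note: because each child page_counter is initialised with its
 * parent, the parent's usage already includes any page charged to the child.
 * hugetlb_cgroup_move_parent() therefore only cancels the charge on the
 * child's own counter (page_counter_cancel() does not walk up the tree) and
 * reassigns the page to the parent; the explicit charge of the root counter
 * in the parentless case is safe because the root has no limit.
 */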
/*
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot get moved off the
 * active list or uncharged from the cgroup, so there is no need to take
 * a page reference and test for page activity here. This function
 * cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	unsigned int nr_pages;
	struct page_counter *counter;
	struct hugetlb_cgroup *page_hcg;
	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

	page_hcg = hugetlb_cgroup_from_page(page);
	/*
	 * We can have pages on the active list without any cgroup,
	 * i.e., hugepages with fewer than 3 pages. We can safely
	 * ignore those pages.
	 */
	if (!page_hcg || page_hcg != h_cg)
		goto out;

	nr_pages = compound_nr(page);
	if (!parent) {
		parent = root_h_cgroup;
		/* root has no limit */
		page_counter_charge(&parent->hugepage[idx], nr_pages);
	}
	counter = &h_cg->hugepage[idx];
	/* Take the pages off the local counter */
	page_counter_cancel(counter, nr_pages);

	set_hugetlb_cgroup(page, parent);
out:
	return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct page *page;

	do {
		for_each_hstate(h) {
			spin_lock_irq(&hugetlb_lock);
			list_for_each_entry(page, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);

			spin_unlock_irq(&hugetlb_lock);
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}

static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
				 enum hugetlb_memory_event event)
{
	atomic_long_inc(&hugetlb->events_local[idx][event]);
	cgroup_file_notify(&hugetlb->events_local_file[idx]);

	do {
		atomic_long_inc(&hugetlb->events[idx][event]);
		cgroup_file_notify(&hugetlb->events_file[idx]);
	} while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
		 !hugetlb_cgroup_is_root(hugetlb));
}

static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
					  struct hugetlb_cgroup **ptr,
					  bool rsvd)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
	/*
	 * We don't charge any cgroup if the compound page has fewer
	 * than 3 pages.
	 */
	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		goto done;
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	if (!css_tryget(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	if (!page_counter_try_charge(
		    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
		    nr_pages, &counter)) {
		ret = -ENOMEM;
		hugetlb_event(h_cg, idx, HUGETLB_MAX);
		css_put(&h_cg->css);
		goto done;
	}
	/*
	 * Reservations take a reference to the css because they do not get
	 * reparented.
	 */
	if (!rsvd)
		css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}

int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
}

int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
				      struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
}
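/*
 * Charging is done in two steps: hugetlb_cgroup_charge_cgroup() is called
 * before a huge page is allocated and try-charges the current task's cgroup,
 * returning it through *ptr; once the page exists, the caller commits the
 * charge to that page under hugetlb_lock, or uncharges the cgroup again if
 * the allocation fails. A simplified sketch of the caller side (see
 * mm/hugetlb.c for the real sequence):
 *
 *	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
 *	if (ret)
 *		return ERR_PTR(-ENOSPC);
 *	// ... allocate the huge page; on failure call
 *	//     hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
 *	spin_lock_irq(&hugetlb_lock);
 *	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
 *	spin_unlock_irq(&hugetlb_lock);
 */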
/* Should be called with hugetlb_lock held */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
					   struct hugetlb_cgroup *h_cg,
					   struct page *page, bool rsvd)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	__set_hugetlb_cgroup(page, h_cg, rsvd);
	if (!rsvd) {
		unsigned long usage =
			h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
		/*
		 * This write is not atomic due to fetching usage and writing
		 * to it, but that's fine because we call this with
		 * hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
			   usage + nr_pages);
	}
}

void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false);
}

void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
				       struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true);
}

/*
 * Should be called with hugetlb_lock held
 */
static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
					   struct page *page, bool rsvd)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = __hugetlb_cgroup_from_page(page, rsvd);
	if (unlikely(!h_cg))
		return;
	__set_hugetlb_cgroup(page, NULL, rsvd);

	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
								   rsvd),
			      nr_pages);

	if (rsvd)
		css_put(&h_cg->css);
	else {
		unsigned long usage =
			h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
		/*
		 * This write is not atomic due to fetching usage and writing
		 * to it, but that's fine because we call this with
		 * hugetlb_lock held anyway.
		 */
		WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
			   usage - nr_pages);
	}
}

void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
				  struct page *page)
{
	__hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false);
}

void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages,
				       struct page *page)
{
	__hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true);
}

static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
					     struct hugetlb_cgroup *h_cg,
					     bool rsvd)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		return;

	page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
								   rsvd),
			      nr_pages);

	if (rsvd)
		css_put(&h_cg->css);
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
}

void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
					 struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
}

void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
				     unsigned long end)
{
	if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
	    !resv->css)
		return;

	page_counter_uncharge(resv->reservation_counter,
			      (end - start) * resv->pages_per_hpage);
	css_put(resv->css);
}

void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
					 struct file_region *rg,
					 unsigned long nr_pages,
					 bool region_del)
{
	if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
		return;

	if (rg->reservation_counter && resv->pages_per_hpage &&
	    !resv->reservation_counter) {
		page_counter_uncharge(rg->reservation_counter,
				      nr_pages * resv->pages_per_hpage);
		/*
		 * Only do css_put(rg->css) when we delete the entire region
		 * because one file_region must hold exactly one css reference.
		 */
		if (region_del)
			css_put(rg->css);
	}
}
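/*
 * The RES_* attributes below are packed into cftype->private together with
 * the hstate index via MEMFILE_PRIVATE(): the index goes in the upper 16 bits
 * and the attribute in the lower 16. For example, MEMFILE_PRIVATE(1, RES_LIMIT)
 * describes the limit file of hstate 1; readers recover the two halves with
 * MEMFILE_IDX() and MEMFILE_ATTR().
 */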
enum {
	RES_USAGE,
	RES_RSVD_USAGE,
	RES_LIMIT,
	RES_RSVD_LIMIT,
	RES_MAX_USAGE,
	RES_RSVD_MAX_USAGE,
	RES_FAILCNT,
	RES_RSVD_FAILCNT,
};

static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
	int nid;
	struct cftype *cft = seq_cft(seq);
	int idx = MEMFILE_IDX(cft->private);
	bool legacy = MEMFILE_ATTR(cft->private);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
	struct cgroup_subsys_state *css;
	unsigned long usage;

	if (legacy) {
		/* Add up usage across all nodes for the non-hierarchical total. */
		usage = 0;
		for_each_node_state(nid, N_MEMORY)
			usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
		seq_printf(seq, "total=%lu", usage * PAGE_SIZE);

		/* Print the per-node (non-hierarchical) usage. */
		for_each_node_state(nid, N_MEMORY)
			seq_printf(seq, " N%d=%lu", nid,
				   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
					   PAGE_SIZE);
		seq_putc(seq, '\n');
	}

	/*
	 * The hierarchical total is pretty much the value recorded by the
	 * counter, so use that.
	 */
	seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
		   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);

	/*
	 * For each node, traverse the css tree to obtain the hierarchical
	 * node usage.
	 */
	for_each_node_state(nid, N_MEMORY) {
		usage = 0;
		rcu_read_lock();
		css_for_each_descendant_pre(css, &h_cg->css) {
			usage += READ_ONCE(hugetlb_cgroup_from_css(css)
						   ->nodeinfo[nid]
						   ->usage[idx]);
		}
		rcu_read_unlock();
		seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
	}

	seq_putc(seq, '\n');

	return 0;
}

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct page_counter *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_RSVD_USAGE:
		return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_RSVD_LIMIT:
		return (u64)rsvd_counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_RSVD_MAX_USAGE:
		return (u64)rsvd_counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_RSVD_FAILCNT:
		return rsvd_counter->failcnt;
	default:
		BUG();
	}
}
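/*
 * hugetlb_cgroup_read_u64() above backs the cgroup v1 (legacy) files, which
 * report raw byte values and failcnt. hugetlb_cgroup_read_u64_max() below
 * backs the cgroup v2 files and prints "max" when the limit is still at the
 * rounded-down PAGE_COUNTER_MAX default, i.e. effectively unlimited.
 */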
static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
	int idx;
	u64 val;
	struct cftype *cft = seq_cft(seq);
	unsigned long limit;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);
	counter = &h_cg->hugepage[idx];

	limit = round_down(PAGE_COUNTER_MAX,
			   pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_RSVD_USAGE:
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_USAGE:
		val = (u64)page_counter_read(counter);
		seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	case RES_RSVD_LIMIT:
		counter = &h_cg->rsvd_hugepage[idx];
		fallthrough;
	case RES_LIMIT:
		val = (u64)counter->max;
		if (val == limit)
			seq_puts(seq, "max\n");
		else
			seq_printf(seq, "%llu\n", val * PAGE_SIZE);
		break;
	default:
		BUG();
	}

	return 0;
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off,
				    const char *max)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
	bool rsvd = false;

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, max, &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);
	nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_RSVD_LIMIT:
		rsvd = true;
		fallthrough;
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_set_max(
			__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
			nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}

static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
					char *buf, size_t nbytes, loff_t off)
{
	return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}

static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter, *rsvd_counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_RSVD_MAX_USAGE:
		page_counter_reset_watermark(rsvd_counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	case RES_RSVD_FAILCNT:
		rsvd_counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= SZ_1G)
		snprintf(buf, size, "%luGB", hsize / SZ_1G);
	else if (hsize >= SZ_1M)
		snprintf(buf, size, "%luMB", hsize / SZ_1M);
	else
		snprintf(buf, size, "%luKB", hsize / SZ_1K);
	return buf;
}
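/*
 * Two event files are exposed per hstate: "events" counts limit hits in this
 * cgroup and all of its descendants (hugetlb_event() propagates the increment
 * up to, but not including, the root), while "events.local" counts only hits
 * that happened in this cgroup itself.
 */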
static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
	int idx;
	long max;
	struct cftype *cft = seq_cft(seq);
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

	idx = MEMFILE_IDX(cft->private);

	if (local)
		max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
	else
		max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

	seq_printf(seq, "max %lu\n", max);

	return 0;
}

static int hugetlb_events_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, false);
}

static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, true);
}

static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, sizeof(buf), huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files_dfl[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->write = hugetlb_cgroup_write_dfl;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the reservation limit file */
	cft = &h->cgroup_files_dfl[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->write = hugetlb_cgroup_write_dfl;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the current usage file */
	cft = &h->cgroup_files_dfl[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the current reservation usage file */
	cft = &h->cgroup_files_dfl[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
	cft->seq_show = hugetlb_cgroup_read_u64_max;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the events file */
	cft = &h->cgroup_files_dfl[4];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_events_show;
	cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]);
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the events.local file */
	cft = &h->cgroup_files_dfl[5];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_events_local_show;
	cft->file_offset = offsetof(struct hugetlb_cgroup,
				    events_local_file[idx]);
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* Add the numa stat file */
	cft = &h->cgroup_files_dfl[6];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
	cft->private = MEMFILE_PRIVATE(idx, 0);
	cft->seq_show = hugetlb_cgroup_read_numa_stat;
	cft->flags = CFTYPE_NOT_ON_ROOT;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files_dfl[7];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
				       h->cgroup_files_dfl));
}
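/*
 * File names are built from the human-readable huge page size, and the cgroup
 * core prefixes them with the subsystem name. On cgroup v2 a 2 MB hstate thus
 * gets hugetlb.2MB.max, hugetlb.2MB.current, hugetlb.2MB.events and so on,
 * while the legacy (v1) files below keep the memcg-style names such as
 * hugetlb.2MB.limit_in_bytes and hugetlb.2MB.usage_in_bytes.
 */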
static void __init __hugetlb_cgroup_file_legacy_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, sizeof(buf), huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files_legacy[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write_legacy;

	/* Add the reservation limit file */
	cft = &h->cgroup_files_legacy[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write_legacy;

	/* Add the usage file */
	cft = &h->cgroup_files_legacy[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the reservation usage file */
	cft = &h->cgroup_files_legacy[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX usage file */
	cft = &h->cgroup_files_legacy[4];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX reservation usage file */
	cft = &h->cgroup_files_legacy[5];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the failcnt file */
	cft = &h->cgroup_files_legacy[6];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the reservation failcnt file */
	cft = &h->cgroup_files_legacy[7];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the numa stat file */
	cft = &h->cgroup_files_legacy[8];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
	cft->private = MEMFILE_PRIVATE(idx, 1);
	cft->seq_show = hugetlb_cgroup_read_numa_stat;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files_legacy[9];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
					  h->cgroup_files_legacy));
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
	__hugetlb_cgroup_file_dfl_init(idx);
	__hugetlb_cgroup_file_legacy_init(idx);
}

void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/*
		 * Add cgroup control files only if the huge page consists
		 * of more than two normal pages. This is because we use
		 * page[2].private for storing cgroup details.
		 */
		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
			__hugetlb_cgroup_file_init(hstate_index(h));
	}
}
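/*
 * Note on migration: hugetlb_cgroup_migrate() below only transfers the cgroup
 * pointers from the old page to the new one. No counters are charged or
 * uncharged, because the accounting is tied to the cgroup and hstate, not to
 * the particular physical page backing the allocation.
 */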
/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
	struct hugetlb_cgroup *h_cg;
	struct hugetlb_cgroup *h_cg_rsvd;
	struct hstate *h = page_hstate(oldhpage);

	if (hugetlb_cgroup_disabled())
		return;

	spin_lock_irq(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(oldhpage);
	h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
	set_hugetlb_cgroup(oldhpage, NULL);
	set_hugetlb_cgroup_rsvd(oldhpage, NULL);

	/* move the h_cg details to the new page */
	set_hugetlb_cgroup(newhpage, h_cg);
	set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
	list_move(&newhpage->lru, &h->hugepage_activelist);
	spin_unlock_irq(&hugetlb_lock);
	return;
}

static struct cftype hugetlb_files[] = {
	{} /* terminate */
};

struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc = hugetlb_cgroup_css_alloc,
	.css_offline = hugetlb_cgroup_css_offline,
	.css_free = hugetlb_cgroup_css_free,
	.dfl_cftypes = hugetlb_files,
	.legacy_cftypes = hugetlb_files,
};