/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

struct hugetlb_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for hugepages from hugetlb.
	 */
	struct page_counter hugepage[HUGE_MAX_HSTATE];
};

/*
 * A cftype's ->private packs the hstate index into the upper 16 bits
 * and the resource attribute (RES_*) into the lower 16 bits.
 */
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
	return hugetlb_cgroup_from_css(h_cg->css.parent);
}

static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
	int idx;

	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
		if (page_counter_read(&h_cg->hugepage[idx]))
			return true;
	}
	return false;
}

static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
				struct hugetlb_cgroup *parent_h_cgroup)
{
	int idx;

	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
		struct page_counter *counter = &h_cgroup->hugepage[idx];
		struct page_counter *parent = NULL;
		unsigned long limit;
		int ret;

		if (parent_h_cgroup)
			parent = &parent_h_cgroup->hugepage[idx];
		page_counter_init(counter, parent);

		limit = round_down(PAGE_COUNTER_MAX,
				   1 << huge_page_order(&hstates[idx]));
		ret = page_counter_set_max(counter, limit);
		VM_BUG_ON(ret);
	}
}

static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;

	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	if (!parent_h_cgroup)
		root_h_cgroup = h_cgroup;

	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
	return &h_cgroup->css;
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cgroup;

	h_cgroup = hugetlb_cgroup_from_css(css);
	kfree(h_cgroup);
}

/*
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot be moved off the
 * active list or uncharged from the cgroup, so there is no need to
 * take a page reference or test whether the page is active here.
 * This function cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	unsigned int nr_pages;
	struct page_counter *counter;
	struct hugetlb_cgroup *page_hcg;
	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

	page_hcg = hugetlb_cgroup_from_page(page);
	/*
	 * We can have pages on the active list without any cgroup,
	 * i.e., hugepages of fewer than 3 pages. We can safely
	 * ignore those pages.
	 */
	if (!page_hcg || page_hcg != h_cg)
		goto out;

	nr_pages = compound_nr(page);
	if (!parent) {
		parent = root_h_cgroup;
		/* root has no limit */
		page_counter_charge(&parent->hugepage[idx], nr_pages);
	}
	counter = &h_cg->hugepage[idx];
	/* Take the pages off the local counter */
	page_counter_cancel(counter, nr_pages);

	set_hugetlb_cgroup(page, parent);
out:
	return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct page *page;
	int idx = 0;

	do {
		for_each_hstate(h) {
			spin_lock(&hugetlb_lock);
			list_for_each_entry(page, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(idx, h_cg, page);

			spin_unlock(&hugetlb_lock);
			idx++;
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}

int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
	/*
	 * We don't charge any cgroup if the compound page has fewer
	 * than 3 pages.
	 */
	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		goto done;
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	if (!css_tryget_online(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
		ret = -ENOMEM;
	css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}

/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	set_hugetlb_cgroup(page, h_cg);
}

/*
 * Should be called with hugetlb_lock held
 */
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
				  struct page *page)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(page);
	if (unlikely(!h_cg))
		return;
	set_hugetlb_cgroup(page, NULL);
	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		return;

	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
}
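/*
 * Usage sketch (illustrative, not part of this file): the three entry
 * points above are expected to be paired roughly as below, modelled on
 * how alloc_huge_page() in mm/hugetlb.c drives them. Exact error
 * handling and the dequeue step are the caller's business; the shape of
 * the caller here is an assumption for the sketch.
 *
 *	idx = hstate_index(h);
 *	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
 *	if (ret)
 *		return ERR_PTR(-ENOSPC);
 *
 *	spin_lock(&hugetlb_lock);
 *	page = ...;		// dequeue or allocate the huge page
 *	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
 *	spin_unlock(&hugetlb_lock);
 *
 *	// If no page can be obtained after a successful charge, undo it:
 *	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
 */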
enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
};

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->max * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	default:
		BUG();
	}
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);
	nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= (1UL << 30))
		snprintf(buf, size, "%luGB", hsize >> 30);
	else if (hsize >= (1UL << 20))
		snprintf(buf, size, "%luMB", hsize >> 20);
	else
		snprintf(buf, size, "%luKB", hsize >> 10);
	return buf;
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, 32, huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write;

	/* Add the usage file */
	cft = &h->cgroup_files[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX usage file */
	cft = &h->cgroup_files[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the failcnt file */
	cft = &h->cgroup_files[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files[4];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
					  h->cgroup_files));
}

void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/*
		 * Add cgroup control files only if the huge page consists
		 * of more than two normal pages. This is because we use
		 * page[2].private for storing cgroup details.
		 */
		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
			__hugetlb_cgroup_file_init(hstate_index(h));
	}
}
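/*
 * Example (illustrative): with 2MB huge pages configured, the cftypes set
 * up above surface in each cgroup directory as:
 *
 *	hugetlb.2MB.limit_in_bytes	(read/write)
 *	hugetlb.2MB.usage_in_bytes	(read-only)
 *	hugetlb.2MB.max_usage_in_bytes	(read; write resets the watermark)
 *	hugetlb.2MB.failcnt		(read; write resets the counter)
 *
 * The "hugetlb." prefix is added by the cgroup core from the subsystem
 * name; only the size portion of the name comes from mem_fmt() above.
 */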
/*
 * hugetlb_lock makes sure a parallel cgroup rmdir won't happen
 * while we migrate hugepages.
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
	struct hugetlb_cgroup *h_cg;
	struct hstate *h = page_hstate(oldhpage);

	if (hugetlb_cgroup_disabled())
		return;

	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
	spin_lock(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(oldhpage);
	set_hugetlb_cgroup(oldhpage, NULL);

	/* move the h_cg details over to the new page */
	set_hugetlb_cgroup(newhpage, h_cg);
	list_move(&newhpage->lru, &h->hugepage_activelist);
	spin_unlock(&hugetlb_lock);
}

struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc	= hugetlb_cgroup_css_alloc,
	.css_offline	= hugetlb_cgroup_css_offline,
	.css_free	= hugetlb_cgroup_css_free,
};