/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

struct hugetlb_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for hugepages from hugetlb.
	 */
	struct page_counter hugepage[HUGE_MAX_HSTATE];
};

#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

static struct hugetlb_cgroup *root_h_cgroup __read_mostly;

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
}

static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
	return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return (h_cg == root_h_cgroup);
}

static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
	return hugetlb_cgroup_from_css(h_cg->css.parent);
}

static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
	int idx;

	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
		if (page_counter_read(&h_cg->hugepage[idx]))
			return true;
	}
	return false;
}

static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
	struct hugetlb_cgroup *h_cgroup;
	int idx;

	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
	if (!h_cgroup)
		return ERR_PTR(-ENOMEM);

	if (parent_h_cgroup) {
		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
			page_counter_init(&h_cgroup->hugepage[idx],
					  &parent_h_cgroup->hugepage[idx]);
	} else {
		root_h_cgroup = h_cgroup;
		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
			page_counter_init(&h_cgroup->hugepage[idx], NULL);
	}
	return &h_cgroup->css;
}

static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cgroup;

	h_cgroup = hugetlb_cgroup_from_css(css);
	kfree(h_cgroup);
}

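/*
 * Note on the hierarchy (illustrative remark, not part of the original
 * file): hugetlb_cgroup_css_alloc() initialises every non-root counter
 * with a pointer to the parent cgroup's counter, and the page_counter
 * charge/uncharge helpers walk that parent chain, so hugepages charged
 * to a child cgroup are expected to also be accounted in all of its
 * ancestors up to root_h_cgroup.
 */
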
/*
 * Should be called with hugetlb_lock held.
 * Since we are holding hugetlb_lock, pages cannot be moved off the
 * active list or uncharged from the cgroup, so there is no need to take
 * a page reference or test whether the page is active here. This
 * function cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
				       struct page *page)
{
	unsigned int nr_pages;
	struct page_counter *counter;
	struct hugetlb_cgroup *page_hcg;
	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);

	page_hcg = hugetlb_cgroup_from_page(page);
	/*
	 * We can have pages on the active list without any cgroup,
	 * i.e., a hugepage with fewer than 3 pages. We can safely
	 * ignore those pages.
	 */
	if (!page_hcg || page_hcg != h_cg)
		goto out;

	nr_pages = 1 << compound_order(page);
	if (!parent) {
		parent = root_h_cgroup;
		/* root has no limit */
		page_counter_charge(&parent->hugepage[idx], nr_pages);
	}
	counter = &h_cg->hugepage[idx];
	/* Take the pages off the local counter */
	page_counter_cancel(counter, nr_pages);

	set_hugetlb_cgroup(page, parent);
out:
	return;
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
	struct hstate *h;
	struct page *page;
	int idx;

	do {
		/* reset the hstate index on every pass */
		idx = 0;
		for_each_hstate(h) {
			spin_lock(&hugetlb_lock);
			list_for_each_entry(page, &h->hugepage_activelist, lru)
				hugetlb_cgroup_move_parent(idx, h_cg, page);

			spin_unlock(&hugetlb_lock);
			idx++;
		}
		cond_resched();
	} while (hugetlb_cgroup_have_usage(h_cg));
}

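/*
 * Illustrative usage sketch (an assumption of this note, not a contract
 * spelled out in this file): allocation paths are expected to pair the
 * charge and commit below, reserving against the counters first and
 * then, once a huge page has been obtained, recording the cgroup on the
 * page under hugetlb_lock:
 *
 *	ret = hugetlb_cgroup_charge_cgroup(hstate_index(h),
 *					   pages_per_huge_page(h), &h_cg);
 *	if (ret)
 *		goto out_fail;
 *	...
 *	spin_lock(&hugetlb_lock);
 *	hugetlb_cgroup_commit_charge(hstate_index(h),
 *				     pages_per_huge_page(h), h_cg, page);
 *	spin_unlock(&hugetlb_lock);
 */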
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = NULL;

	if (hugetlb_cgroup_disabled())
		goto done;
	/*
	 * We don't charge any cgroup if the compound page has fewer
	 * than 3 pages.
	 */
	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		goto done;
again:
	rcu_read_lock();
	h_cg = hugetlb_cgroup_from_task(current);
	if (!css_tryget_online(&h_cg->css)) {
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();

	if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
		ret = -ENOMEM;
	css_put(&h_cg->css);
done:
	*ptr = h_cg;
	return ret;
}

/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	set_hugetlb_cgroup(page, h_cg);
	return;
}

/*
 * Should be called with hugetlb_lock held
 */
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
				  struct page *page)
{
	struct hugetlb_cgroup *h_cg;

	if (hugetlb_cgroup_disabled())
		return;
	lockdep_assert_held(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(page);
	if (unlikely(!h_cg))
		return;
	set_hugetlb_cgroup(page, NULL);
	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
	return;
}

void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	if (hugetlb_cgroup_disabled() || !h_cg)
		return;

	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
		return;

	page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
	return;
}

enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
};

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->limit * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	default:
		BUG();
	}
}

static DEFINE_MUTEX(hugetlb_limit_mutex);

static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret, idx;
	unsigned long nr_pages;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
		return -EINVAL;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	idx = MEMFILE_IDX(of_cft(of)->private);

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		mutex_lock(&hugetlb_limit_mutex);
		ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
		mutex_unlock(&hugetlb_limit_mutex);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

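/*
 * Worked example for the write path above (illustrative; assumes a 4KB
 * PAGE_SIZE and a 2MB hstate): writing "1G" to the limit file is parsed
 * by page_counter_memparse() into 262144 small pages, i.e. room for 512
 * huge pages, while writing "-1" removes the limit.
 */
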
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	int ret = 0;
	struct page_counter *counter;
	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));

	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}
	return ret ?: nbytes;
}

static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= (1UL << 30))
		snprintf(buf, size, "%luGB", hsize >> 30);
	else if (hsize >= (1UL << 20))
		snprintf(buf, size, "%luMB", hsize >> 20);
	else
		snprintf(buf, size, "%luKB", hsize >> 10);
	return buf;
}

static void __init __hugetlb_cgroup_file_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, 32, huge_page_size(h));

	/* Add the limit file */
	cft = &h->cgroup_files[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write = hugetlb_cgroup_write;

	/* Add the usage file */
	cft = &h->cgroup_files[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX usage file */
	cft = &h->cgroup_files[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the failcnt file */
	cft = &h->cgroup_files[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
	cft->write = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files[4];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
					  h->cgroup_files));
}

void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/*
		 * Add cgroup control files only if the huge page consists
		 * of more than two normal pages. This is because we use
		 * page[2].private for storing cgroup details.
		 */
		if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
			__hugetlb_cgroup_file_init(hstate_index(h));
	}
}

/*
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages
 */
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
	struct hugetlb_cgroup *h_cg;
	struct hstate *h = page_hstate(oldhpage);

	if (hugetlb_cgroup_disabled())
		return;

	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
	spin_lock(&hugetlb_lock);
	h_cg = hugetlb_cgroup_from_page(oldhpage);
	set_hugetlb_cgroup(oldhpage, NULL);

	/* move the h_cg details to new cgroup */
	set_hugetlb_cgroup(newhpage, h_cg);
	list_move(&newhpage->lru, &h->hugepage_activelist);
	spin_unlock(&hugetlb_lock);
	return;
}

struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc	= hugetlb_cgroup_css_alloc,
	.css_offline	= hugetlb_cgroup_css_offline,
	.css_free	= hugetlb_cgroup_css_free,
};
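
/*
 * Resulting control files (illustrative; exact names depend on the huge
 * page sizes the architecture provides): for a 2MB hstate,
 * __hugetlb_cgroup_file_init() registers limit_in_bytes, usage_in_bytes,
 * max_usage_in_bytes and failcnt entries, which the cgroup core is
 * expected to expose as hugetlb.2MB.limit_in_bytes,
 * hugetlb.2MB.usage_in_bytes, hugetlb.2MB.max_usage_in_bytes and
 * hugetlb.2MB.failcnt in each cgroup directory.
 */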