// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

static void propagate_protected_usage(struct page_counter *c,
				      unsigned long usage)
{
	unsigned long protected, old_protected;
	unsigned long low, min;
	long delta;

	if (!c->parent)
		return;

	min = READ_ONCE(c->min);
	if (min || atomic_long_read(&c->min_usage)) {
		protected = min(usage, min);
		old_protected = atomic_long_xchg(&c->min_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_min_usage);
	}

	low = READ_ONCE(c->low);
	if (low || atomic_long_read(&c->low_usage)) {
		protected = min(usage, low);
		old_protected = atomic_long_xchg(&c->low_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_low_usage);
	}
}

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
	long new;

	new = atomic_long_sub_return(nr_pages, &counter->usage);
	/* More uncharges than charges? */
	if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
		      new, nr_pages)) {
		new = 0;
		atomic_long_set(&counter->usage, new);
	}
	propagate_protected_usage(counter, new);
}

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		long new;

		new = atomic_long_add_return(nr_pages, &c->usage);
		propagate_protected_usage(c, new);
		/*
		 * This is indeed racy, but we can live with some
		 * inaccuracy in the watermark.
		 */
		if (new > READ_ONCE(c->watermark))
			WRITE_ONCE(c->watermark, new);
	}
}
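
/*
 * Illustrative sketch (not part of the page_counter API): how a
 * hypothetical caller might wire up a two-level hierarchy and apply an
 * unconditional, limit-ignoring charge.  The function name below is made
 * up; page_counter_init(), page_counter_charge() and page_counter_read()
 * are the real helpers, assuming the two-argument page_counter_init()
 * from <linux/page_counter.h> that matches this version of the counter.
 * Marked static inline so an unused copy compiles away cleanly.
 */
static inline void page_counter_charge_example(void)
{
	static struct page_counter parent, child;

	page_counter_init(&parent, NULL);	/* NULL parent: hierarchy root */
	page_counter_init(&child, &parent);	/* charges propagate upward */

	/* Charge the child and every ancestor, regardless of c->max. */
	page_counter_charge(&child, 4);

	/* Both levels now report a usage of 4 pages. */
	WARN_ON(page_counter_read(&child) != 4);
	WARN_ON(page_counter_read(&parent) != 4);
}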
/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: points to the first counter to hit its limit, if any
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.
 */
bool page_counter_try_charge(struct page_counter *counter,
			     unsigned long nr_pages,
			     struct page_counter **fail)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		long new;
		/*
		 * Charge speculatively to avoid an expensive CAS.  If
		 * a bigger charge fails, it might falsely lock out a
		 * racing smaller charge and send it into reclaim
		 * early, but the error is limited to the difference
		 * between the two sizes, which is less than 2M/4M in
		 * case of a THP locking out a regular page charge.
		 *
		 * The atomic_long_add_return() implies a full memory
		 * barrier between incrementing the count and reading
		 * the limit.  When racing with page_counter_set_max(),
		 * we either see the new limit or the setter sees the
		 * counter has changed and retries.
		 */
		new = atomic_long_add_return(nr_pages, &c->usage);
		if (new > c->max) {
			atomic_long_sub(nr_pages, &c->usage);
			/*
			 * This is racy, but we can live with some
			 * inaccuracy in the failcnt which is only used
			 * to report stats.
			 */
			data_race(c->failcnt++);
			*fail = c;
			goto failed;
		}
		propagate_protected_usage(c, new);
		/*
		 * Just like with failcnt, we can live with some
		 * inaccuracy in the watermark.
		 */
		if (new > READ_ONCE(c->watermark))
			WRITE_ONCE(c->watermark, new);
	}
	return true;

failed:
	for (c = counter; c != *fail; c = c->parent)
		page_counter_cancel(c, nr_pages);

	return false;
}

/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent)
		page_counter_cancel(c, nr_pages);
}

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
	for (;;) {
		unsigned long old;
		long usage;

		/*
		 * Update the limit while making sure that it's not
		 * below the concurrently-changing counter value.
		 *
		 * The xchg implies two full memory barriers before
		 * and after, so the read-swap-read is ordered and
		 * ensures coherency with page_counter_try_charge():
		 * that function modifies the count before checking
		 * the limit, so if it sees the old limit, we see the
		 * modified counter and retry.
		 */
		usage = page_counter_read(counter);

		if (usage > nr_pages)
			return -EBUSY;

		old = xchg(&counter->max, nr_pages);

		if (page_counter_read(counter) <= usage)
			return 0;

		counter->max = old;
		cond_resched();
	}
}

/**
 * page_counter_set_min - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	WRITE_ONCE(counter->min, nr_pages);

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
 * page_counter_set_low - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	WRITE_ONCE(counter->low, nr_pages);

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}
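
/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * usual limit-enforcing pattern built on page_counter_set_max() and
 * page_counter_try_charge().  The function name and the 1024-page limit
 * are made up; the helpers it calls are the real ones defined above, and
 * the caller is assumed to serialize page_counter_set_max() as required.
 */
static inline int page_counter_try_charge_example(struct page_counter *counter,
						  unsigned long nr_pages)
{
	struct page_counter *fail;
	int ret;

	/* Lower the limit; fails with -EBUSY if usage already exceeds it. */
	ret = page_counter_set_max(counter, 1024);
	if (ret)
		return ret;

	if (!page_counter_try_charge(counter, nr_pages, &fail)) {
		/*
		 * @fail points at the first counter in the hierarchy
		 * that hit its limit; a real caller (e.g. memcg) would
		 * reclaim against it and retry instead of bailing out.
		 */
		return -ENOMEM;
	}

	/* Success: the charge is accounted in @counter and all ancestors. */
	return 0;
}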
/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
 * limited to %PAGE_COUNTER_MAX.
 */
int page_counter_memparse(const char *buf, const char *max,
			  unsigned long *nr_pages)
{
	char *end;
	u64 bytes;

	if (!strcmp(buf, max)) {
		*nr_pages = PAGE_COUNTER_MAX;
		return 0;
	}

	bytes = memparse(buf, &end);
	if (*end != '\0')
		return -EINVAL;

	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);

	return 0;
}
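
/*
 * Illustrative sketch (hypothetical caller, not part of this file): how a
 * cgroup-style interface file might combine page_counter_memparse() and
 * page_counter_set_max() to apply a limit written by userspace.  The
 * function name is made up; "max" mirrors the cgroup2 convention of a
 * literal string meaning "no limit" (PAGE_COUNTER_MAX).
 */
static inline int page_counter_limit_write_example(struct page_counter *counter,
						   const char *buf)
{
	unsigned long nr_pages;
	int err;

	/* "100M" becomes 100M / PAGE_SIZE pages; "max" becomes PAGE_COUNTER_MAX. */
	err = page_counter_memparse(buf, "max", &nr_pages);
	if (err)
		return err;

	/* Returns -EBUSY if current usage already exceeds the requested limit. */
	return page_counter_set_max(counter, nr_pages);
}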