#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>

#define BITMAP_CHUNK_SIZE	sizeof(u64)
#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)

/*
 * Idle page tracking only considers user memory pages; for other types of
 * pages the idle flag is always unset and an attempt to set it is silently
 * ignored.
 *
 * We treat a page as a user memory page if it is on an LRU list, because it is
 * always safe to pass such a page to rmap_walk(), which is essential for idle
 * page tracking. With such an indicator of user pages we can skip isolated
 * pages, but since there are not usually many of them, it will hardly affect
 * the overall result.
 *
 * This function tries to get a user memory page by pfn as described above.
 */
static struct page *page_idle_get_page(unsigned long pfn)
{
	struct page *page;
	struct zone *zone;

	if (!pfn_valid(pfn))
		return NULL;

	page = pfn_to_page(pfn);
	if (!page || !PageLRU(page) ||
	    !get_page_unless_zero(page))
		return NULL;

	zone = page_zone(page);
	spin_lock_irq(zone_lru_lock(zone));
	if (unlikely(!PageLRU(page))) {
		put_page(page);
		page = NULL;
	}
	spin_unlock_irq(zone_lru_lock(zone));
	return page;
}

static int page_idle_clear_pte_refs_one(struct page *page,
					struct vm_area_struct *vma,
					unsigned long addr, void *arg)
{
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = addr,
	};
	bool referenced = false;

	while (page_vma_mapped_walk(&pvmw)) {
		addr = pvmw.address;
		if (pvmw.pte) {
			referenced = ptep_clear_young_notify(vma, addr,
						pvmw.pte);
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			referenced = pmdp_clear_young_notify(vma, addr,
						pvmw.pmd);
		} else {
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
		}
	}

	if (referenced) {
		clear_page_idle(page);
		/*
		 * We cleared the referenced bit in a mapping to this page. To
		 * avoid interference with page reclaim, mark it young so that
		 * page_referenced() will return > 0.
		 */
		set_page_young(page);
	}
	return SWAP_AGAIN;
}

static void page_idle_clear_pte_refs(struct page *page)
{
	/*
	 * Since rwc.arg is unused, rwc is effectively immutable, so we
	 * can make it static const to save some cycles and stack.
	 */
	static const struct rmap_walk_control rwc = {
		.rmap_one = page_idle_clear_pte_refs_one,
		.anon_lock = page_lock_anon_vma_read,
	};
	bool need_lock;

	if (!page_mapped(page) ||
	    !page_rmapping(page))
		return;

	need_lock = !PageAnon(page) || PageKsm(page);
	if (need_lock && !trylock_page(page))
		return;

	rmap_walk(page, (struct rmap_walk_control *)&rwc);

	if (need_lock)
		unlock_page(page);
}
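
/*
 * The "bitmap" file handled below exposes one bit per pfn, packed into u64
 * chunks: a file offset of @pos bytes corresponds to pfn @pos * BITS_PER_BYTE,
 * and bit pfn % BITMAP_CHUNK_BITS within a chunk corresponds to the page.
 * Both the offset and the length of a read or write must therefore be
 * multiples of BITMAP_CHUNK_SIZE.
 */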

static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
				     struct bin_attribute *attr, char *buf,
				     loff_t pos, size_t count)
{
	u64 *out = (u64 *)buf;
	struct page *page;
	unsigned long pfn, end_pfn;
	int bit;

	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
		return -EINVAL;

	pfn = pos * BITS_PER_BYTE;
	if (pfn >= max_pfn)
		return 0;

	end_pfn = pfn + count * BITS_PER_BYTE;
	if (end_pfn > max_pfn)
		end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);

	for (; pfn < end_pfn; pfn++) {
		bit = pfn % BITMAP_CHUNK_BITS;
		if (!bit)
			*out = 0ULL;
		page = page_idle_get_page(pfn);
		if (page) {
			if (page_is_idle(page)) {
				/*
				 * The page might have been referenced via a
				 * pte, in which case it is not idle. Clear
				 * refs and recheck.
				 */
				page_idle_clear_pte_refs(page);
				if (page_is_idle(page))
					*out |= 1ULL << bit;
			}
			put_page(page);
		}
		if (bit == BITMAP_CHUNK_BITS - 1)
			out++;
		cond_resched();
	}
	return (char *)out - buf;
}

static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
				      struct bin_attribute *attr, char *buf,
				      loff_t pos, size_t count)
{
	const u64 *in = (u64 *)buf;
	struct page *page;
	unsigned long pfn, end_pfn;
	int bit;

	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
		return -EINVAL;

	pfn = pos * BITS_PER_BYTE;
	if (pfn >= max_pfn)
		return -ENXIO;

	end_pfn = pfn + count * BITS_PER_BYTE;
	if (end_pfn > max_pfn)
		end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);

	for (; pfn < end_pfn; pfn++) {
		bit = pfn % BITMAP_CHUNK_BITS;
		if ((*in >> bit) & 1) {
			page = page_idle_get_page(pfn);
			if (page) {
				page_idle_clear_pte_refs(page);
				set_page_idle(page);
				put_page(page);
			}
		}
		if (bit == BITMAP_CHUNK_BITS - 1)
			in++;
		cond_resched();
	}
	return (char *)in - buf;
}

static struct bin_attribute page_idle_bitmap_attr =
		__BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
			   page_idle_bitmap_read, page_idle_bitmap_write, 0);

static struct bin_attribute *page_idle_bin_attrs[] = {
	&page_idle_bitmap_attr,
	NULL,
};

static struct attribute_group page_idle_attr_group = {
	.bin_attrs = page_idle_bin_attrs,
	.name = "page_idle",
};

#ifndef CONFIG_64BIT
static bool need_page_idle(void)
{
	return true;
}
struct page_ext_operations page_idle_ops = {
	.need = need_page_idle,
};
#endif

static int __init page_idle_init(void)
{
	int err;

	err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
	if (err) {
		pr_err("page_idle: register sysfs failed\n");
		return err;
	}
	return 0;
}
subsys_initcall(page_idle_init);
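
/*
 * Illustrative userspace sketch (not part of the kernel build). Assumptions:
 * root privileges and a valid pfn for the workload of interest, e.g. obtained
 * from /proc/<pid>/pagemap; error handling is omitted. It marks the 64-page
 * chunk containing @pfn idle and later checks whether the page is still idle,
 * i.e. was not referenced in between:
 *
 *	#include <stdint.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
 *	uint64_t chunk = ~0ULL;
 *	off_t off = pfn / 64 * sizeof(uint64_t);
 *
 *	pwrite(fd, &chunk, sizeof(chunk), off);	// mark the chunk idle
 *	// ... let the workload run ...
 *	pread(fd, &chunk, sizeof(chunk), off);	// re-read the idle bits
 *	if (chunk & (1ULL << (pfn % 64)))
 *		;	// still idle: the page was not accessed in between
 */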