// SPDX-License-Identifier: GPL-2.0-only
/*
 * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 *
 * Basic principles:
 *
 * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and
 * the kernel to set one of a handful of 'caching type' attributes for physical
 * memory ranges: uncached, write-combining, write-through, write-protected,
 * and the most commonly used and default attribute: write-back caching.
 *
 * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
 * a hardware interface to enumerate a limited number of physical memory ranges
 * and set their caching attributes explicitly, programmed into the CPU via MSRs.
 * Even modern CPUs have MTRRs enabled - but these are typically not touched
 * by the kernel or by user-space (such as the X server); we rely on PAT for any
 * additional cache attribute logic.
 *
 * PAT doesn't work via explicit memory ranges, but uses page table entries to add
 * cache attribute information to the mapped memory range: there are 3 bits used
 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
 * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
 *
 * ( There's a metric ton of finer details, such as compatibility with CPU quirks
 *   that only support 4 types of PAT entries, and interaction with MTRRs, see
 *   below for details. )
 */
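
/*
 * Illustrative sketch of the lookup (not code used below): the three PTE bits
 * select one of eight PAT slots, and MSR_IA32_CR_PAT supplies one byte of
 * memory-type information per slot:
 *
 *	slot = (pat_bit << 2) | (pcd_bit << 1) | pwt_bit;	(0..7)
 *	type = (MSR_IA32_CR_PAT >> (slot * 8)) & 0x07;		(a PAT_* value)
 *
 * Which memory type ends up in which slot is decided in pat_bp_init() below.
 */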

#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/debugfs.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/cacheinfo.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/fcntl.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/memtype.h>
#include <asm/io.h>

#include "memtype.h"
#include "../mm_internal.h"

#undef pr_fmt
#define pr_fmt(fmt) "" fmt

static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
static u64 __ro_after_init pat_msr_val;

/*
 * PAT support is enabled by default, but can be disabled for
 * various user-requested or hardware-forced reasons:
 */
static void __init pat_disable(const char *msg_reason)
{
	if (pat_disabled)
		return;

	pat_disabled = true;
	pr_info("x86/PAT: %s\n", msg_reason);

	memory_caching_control &= ~CACHE_PAT;
}

static int __init nopat(char *str)
{
	pat_disable("PAT support disabled via boot option.");
	return 0;
}
early_param("nopat", nopat);

bool pat_enabled(void)
{
	return !pat_disabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);

int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{
	pat_debug_enable = 1;
	return 1;
}
__setup("debugpat", pat_debug_setup);

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses page flags arch_1 and uncached together to keep track of
 * the memory type of pages that have a backing struct page.
 *
 * X86 PAT supports 4 different memory types:
 *  - _PAGE_CACHE_MODE_WB
 *  - _PAGE_CACHE_MODE_WC
 *  - _PAGE_CACHE_MODE_UC_MINUS
 *  - _PAGE_CACHE_MODE_WT
 *
 * _PAGE_CACHE_MODE_WB is the default type.
 */

#define _PGMT_WB		0
#define _PGMT_WC		(1UL << PG_arch_1)
#define _PGMT_UC_MINUS		(1UL << PG_uncached)
#define _PGMT_WT		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)

static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	unsigned long pg_flags = pg->flags & _PGMT_MASK;

	if (pg_flags == _PGMT_WB)
		return _PAGE_CACHE_MODE_WB;
	else if (pg_flags == _PGMT_WC)
		return _PAGE_CACHE_MODE_WC;
	else if (pg_flags == _PGMT_UC_MINUS)
		return _PAGE_CACHE_MODE_UC_MINUS;
	else
		return _PAGE_CACHE_MODE_WT;
}

static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
	unsigned long memtype_flags;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_MODE_WC:
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_MODE_UC_MINUS:
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_MODE_WT:
		memtype_flags = _PGMT_WT;
		break;
	case _PAGE_CACHE_MODE_WB:
	default:
		memtype_flags = _PGMT_WB;
		break;
	}

	old_flags = READ_ONCE(pg->flags);
	do {
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (!try_cmpxchg(&pg->flags, &old_flags, new_flags));
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	return -1;
}
static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
}
#endif

enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};

#define CM(c) (_PAGE_CACHE_MODE_ ## c)

static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
						      char *msg)
{
	enum page_cache_mode cache;
	char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	memcpy(msg, cache_mode, 4);

	return cache;
}

#undef CM
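
/*
 * For reference, init_cache_modes() below emits one four-character entry per
 * PAT slot. With the full Linux PAT layout chosen in pat_bp_init() the
 * resulting boot message looks like:
 *
 *	x86/PAT: Configuration [0-7]: WB  WC  UC- UC  WB  WP  UC- WT
 */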

/*
 * Update the cache mode to pgprot translation tables according to PAT
 * configuration.
 * Using lower indices is preferred, so we start with the highest index.
 */
static void __init init_cache_modes(u64 pat)
{
	enum page_cache_mode cache;
	char pat_msg[33];
	int i;

	pat_msg[32] = 0;
	for (i = 7; i >= 0; i--) {
		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
					   pat_msg + 4 * i);
		update_cache_mode_entry(i, cache);
	}
	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
}

void pat_cpu_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		/*
		 * If this happens we are on a secondary CPU, but switched to
		 * PAT on the boot CPU. We have no way to undo PAT.
		 */
		panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
	}

	wrmsrl(MSR_IA32_CR_PAT, pat_msr_val);
}

/**
 * pat_bp_init - Initialize the PAT MSR value and PAT table
 *
 * This function initializes the PAT MSR value and the PAT table with an
 * OS-defined value to enable additional cache attributes, WC, WT and WP.
 *
 * This function prepares the calls of pat_cpu_init() via cache_cpu_init()
 * on all CPUs.
 */
void __init pat_bp_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
#define PAT(p0, p1, p2, p3, p4, p5, p6, p7)			\
	(((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) |		\
	((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) |	\
	((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) |	\
	((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56))

	if (!IS_ENABLED(CONFIG_X86_PAT))
		pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");

	if (!cpu_feature_enabled(X86_FEATURE_PAT))
		pat_disable("PAT not supported by the CPU.");
	else
		rdmsrl(MSR_IA32_CR_PAT, pat_msr_val);

	if (!pat_msr_val) {
		pat_disable("PAT support disabled by the firmware.");

		/*
		 * No PAT. Emulate the PAT table that corresponds to the two
		 * cache bits, PWT (Write Through) and PCD (Cache Disable).
		 * This setup is also the same as the BIOS default setup.
		 *
		 * PTE encoding:
		 *
		 *       PCD
		 *       |PWT  PAT
		 *       ||    slot
		 *       00    0    WB : _PAGE_CACHE_MODE_WB
		 *       01    1    WT : _PAGE_CACHE_MODE_WT
		 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *       11    3    UC : _PAGE_CACHE_MODE_UC
		 *
		 * NOTE: When WC or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
	}

	/*
	 * Xen PV doesn't allow setting the PAT MSR, but all cache modes are
	 * supported.
	 * When running as a TDX guest, setting the PAT MSR won't work either
	 * due to the requirement to set CR0.CD when doing so. Rely on
	 * firmware to have set the PAT MSR correctly.
	 */
	if (pat_disabled ||
	    cpu_feature_enabled(X86_FEATURE_XENPV) ||
	    cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
		init_cache_modes(pat_msr_val);
		return;
	}

	if ((c->x86_vendor == X86_VENDOR_INTEL) &&
	    (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
	     ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
		/*
		 * PAT support with the lower four entries. Intel Pentium 2,
		 * 3, M, and 4 are affected by PAT errata, which makes the
		 * upper four entries unusable. To be on the safe side, we
		 * don't use those.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      PAT bit unused
		 *
		 * NOTE: When WT or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
	} else {
		/*
		 * Full PAT support. We put WT in slot 7 to improve
		 * robustness in the presence of errata that might cause
		 * the high PAT bit to be ignored. This way, a buggy slot 7
		 * access will hit slot 3, and slot 3 is UC, so at worst
		 * we lose performance without causing a correctness issue.
		 * Pentium 4 erratum N46 is an example of such an erratum,
		 * although we try not to use PAT at all on affected CPUs.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      100    4    WB : Reserved
		 *      101    5    WP : _PAGE_CACHE_MODE_WP
		 *      110    6    UC-: Reserved
		 *      111    7    WT : _PAGE_CACHE_MODE_WT
		 *
		 * The reserved slots are unused, but mapped to their
		 * corresponding types in the presence of PAT errata.
		 */
		pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
	}

	memory_caching_control |= CACHE_PAT;

	init_cache_modes(pat_msr_val);
#undef PAT
}

static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Does the intersection of the PAT memory type and the MTRR memory type and
 * returns the resulting memory type as PAT understands it.
 * (The type values in PAT and MTRR do not use the same encoding.)
 * The intersection is based on the "Effective Memory Type" tables in the
 * IA-32 SDM vol 3a; e.g. a WB request for a range that an MTRR marks as
 * anything other than write-back comes back as UC-.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
				     enum page_cache_mode req_type)
{
	/*
	 * Look for the MTRR hint to get the effective type in case where
	 * the PAT request is for WB.
	 */
	if (req_type == _PAGE_CACHE_MODE_WB) {
		u8 mtrr_type, uniform;

		mtrr_type = mtrr_type_lookup(start, end, &uniform);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_MODE_UC_MINUS;

		return _PAGE_CACHE_MODE_WB;
	}

	return req_type;
}
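
/*
 * pat_pagerange_is_ram() below classifies a physical range by walking the
 * "System RAM" resources that intersect it: the callback records whether the
 * walk saw RAM pages, holes before them, or both. The result is 1 if only
 * RAM was found, 0 if no RAM was found, and -1 when both RAM and a hole were
 * found, which the callers treat as an error.
 */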

struct pagerange_state {
	unsigned long	cur_pfn;
	int		ram;
	int		not_ram;
};

static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
	struct pagerange_state *state = arg;

	state->not_ram	|= initial_pfn > state->cur_pfn;
	state->ram	|= total_nr_pages > 0;
	state->cur_pfn	 = initial_pfn + total_nr_pages;

	return state->ram && state->not_ram;
}

static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
	int ret = 0;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct pagerange_state state = {start_pfn, 0, 0};

	/*
	 * For legacy reasons, physical address range in the legacy ISA
	 * region is tracked as non-RAM. This will allow users of
	 * /dev/mem to map portions of legacy ISA region, even when
	 * some of those portions are listed (or not even listed) with
	 * different e820 types (RAM/reserved/..)
	 */
	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;

	if (start_pfn < end_pfn) {
		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
					    &state, pagerange_is_ram_callback);
	}

	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}

/*
 * For RAM pages, we use page flags to mark the pages with appropriate type.
 * The page flags are limited to four types, WB (default), WC, WT and UC-.
 * WP request fails with -EINVAL, and UC gets redirected to UC-.  Setting
 * a new memory type is only allowed for a page mapped with the default WB
 * type.
 *
 * Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
				  enum page_cache_mode req_type,
				  enum page_cache_mode *new_type)
{
	struct page *page;
	u64 pfn;

	if (req_type == _PAGE_CACHE_MODE_WP) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_UC_MINUS;
		return -EINVAL;
	}

	if (req_type == _PAGE_CACHE_MODE_UC) {
		/* We do not support strong UC */
		WARN_ON_ONCE(1);
		req_type = _PAGE_CACHE_MODE_UC_MINUS;
	}

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		enum page_cache_mode type;

		page = pfn_to_page(pfn);
		type = get_page_memtype(page);
		if (type != _PAGE_CACHE_MODE_WB) {
			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
				start, end - 1, type, req_type);
			if (new_type)
				*new_type = type;

			return -EBUSY;
		}
	}

	if (new_type)
		*new_type = req_type;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, req_type);
	}
	return 0;
}

static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, _PAGE_CACHE_MODE_WB);
	}
	return 0;
}

static u64 sanitize_phys(u64 address)
{
	/*
	 * When changing the memtype for pages containing poison allow
	 * for a "decoy" virtual address (bit 63 clear) passed to
	 * set_memory_X(). __pa() on a "decoy" address results in a
	 * physical address with bit 63 set.
	 *
	 * Decoy addresses are not present for 32-bit builds, see
	 * set_mce_nospec().
	 */
	if (IS_ENABLED(CONFIG_X86_64))
		return address & __PHYSICAL_MASK;
	return address;
}
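
/*
 * A minimal usage sketch of the reserve/free pair (illustrative only; real
 * callers include __ioremap_caller() and reserve_pfn_range() below):
 *
 *	enum page_cache_mode new_pcm;
 *
 *	if (!memtype_reserve(paddr, paddr + size, _PAGE_CACHE_MODE_WC, &new_pcm)) {
 *		... map the range with new_pcm, which may differ from the
 *		... requested type (e.g. downgraded to UC-), then later:
 *		memtype_free(paddr, paddr + size);
 *	}
 */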

/*
 * req_type typically has one of:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 * - _PAGE_CACHE_MODE_WT
 *
 * If new_type is NULL, the function will return an error if it cannot
 * reserve the region with req_type. If new_type is non-NULL, the function
 * will return the available type in new_type in case of no error. In case
 * of any error it will return a negative return value.
 */
int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
		    enum page_cache_mode *new_type)
{
	struct memtype *entry_new;
	enum page_cache_mode actual_type;
	int is_range_ram;
	int err = 0;

	start = sanitize_phys(start);

	/*
	 * The end address passed into this function is exclusive, but
	 * sanitize_phys() expects an inclusive address.
	 */
	end = sanitize_phys(end - 1) + 1;
	if (start >= end) {
		WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
		     start, end - 1, cattr_name(req_type));
		return -EINVAL;
	}

	if (!pat_enabled()) {
		/* This is identical to page table setting without PAT */
		if (new_type)
			*new_type = req_type;
		return 0;
	}

	/* Low ISA region is always mapped WB in page table. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end)) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_WB;
		return 0;
	}

	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type);

	if (new_type)
		*new_type = actual_type;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = reserve_ram_pages_type(start, end, req_type, new_type);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!entry_new)
		return -ENOMEM;

	entry_new->start = start;
	entry_new->end	 = end;
	entry_new->type	 = actual_type;

	spin_lock(&memtype_lock);

	err = memtype_check_insert(entry_new, new_type);
	if (err) {
		pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
			start, end - 1,
			cattr_name(entry_new->type), cattr_name(req_type));
		kfree(entry_new);
		spin_unlock(&memtype_lock);

		return err;
	}

	spin_unlock(&memtype_lock);

	dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
		start, end - 1, cattr_name(entry_new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}

int memtype_free(u64 start, u64 end)
{
	int is_range_ram;
	struct memtype *entry_old;

	if (!pat_enabled())
		return 0;

	start = sanitize_phys(start);
	end = sanitize_phys(end);

	/* Low ISA region is always mapped WB. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1)
		return free_ram_pages_type(start, end);
	if (is_range_ram < 0)
		return -EINVAL;

	spin_lock(&memtype_lock);
	entry_old = memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (IS_ERR(entry_old)) {
		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid, start, end - 1);
		return -EINVAL;
	}

	kfree(entry_old);

	dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1);

	return 0;
}

/**
 * lookup_memtype - Looks up the memory type for a physical address
 * @paddr: physical address of which memory type needs to be looked up
 *
 * Only to be called when PAT is enabled
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_WT.
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{
	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
	struct memtype *entry;

	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
		return rettype;

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;

		page = pfn_to_page(paddr >> PAGE_SHIFT);
		return get_page_memtype(page);
	}

	spin_lock(&memtype_lock);

	entry = memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
		rettype = _PAGE_CACHE_MODE_UC_MINUS;

	spin_unlock(&memtype_lock);

	return rettype;
}

/**
 * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
 * of @pfn cannot be overridden by UC MTRR memory type.
 * @pfn: the page frame number to check
 *
 * Only to be called when PAT is enabled.
 *
 * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
 * Returns false in other cases.
 */
bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
{
	enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));

	return cm == _PAGE_CACHE_MODE_UC ||
	       cm == _PAGE_CACHE_MODE_UC_MINUS ||
	       cm == _PAGE_CACHE_MODE_WC;
}
EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
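
/*
 * Illustrative driver-side pattern (a sketch, not lifted from a specific
 * driver): graphics and framebuffer drivers typically reserve a
 * write-combining memtype for an MMIO aperture before mapping it:
 *
 *	arch_io_reserve_memtype_wc(bar_start, bar_size);
 *	fb = ioremap_wc(bar_start, bar_size);
 *	...
 *	iounmap(fb);
 *	arch_io_free_memtype_wc(bar_start, bar_size);
 *
 * memtype_reserve_io() below is the common helper behind such requests.
 */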

/**
 * memtype_reserve_io - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype, with requested type. On success, requested
 * or any other compatible type that was available for the region is returned
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int memtype_reserve_io(resource_size_t start, resource_size_t end,
			enum page_cache_mode *type)
{
	resource_size_t size = end - start;
	enum page_cache_mode req_type = *type;
	enum page_cache_mode new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, size));

	ret = memtype_reserve(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(start, size, req_type, new_type))
		goto out_free;

	if (memtype_kernel_map_sync(start, size, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	memtype_free(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}

/**
 * memtype_free_io - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void memtype_free_io(resource_size_t start, resource_size_t end)
{
	memtype_free(start, end);
}

#ifdef CONFIG_X86_PAT
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
	enum page_cache_mode type = _PAGE_CACHE_MODE_WC;

	return memtype_reserve_io(start, start + size, &type);
}
EXPORT_SYMBOL(arch_io_reserve_memtype_wc);

void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
{
	memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);
#endif

pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
			      unsigned long size, pgprot_t vma_prot)
{
	if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
		vma_prot = pgprot_decrypted(vma_prot);

	return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled())
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn))
			return 0;
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				 unsigned long size, pgprot_t *vma_prot)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_DSYNC)
		pcm = _PAGE_CACHE_MODE_UC_MINUS;

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     cachemode2protval(pcm));
	return 1;
}

/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space if that range is a part of identity map.
 */
int memtype_kernel_map_sync(u64 base, unsigned long size,
			    enum page_cache_mode pcm)
{
	unsigned long id_sz;

	if (base > __pa(high_memory-1))
		return 0;

	/*
	 * Some areas in the middle of the kernel identity range
	 * are not mapped, for example the PCI space.
	 */
	if (!page_is_ram(base >> PAGE_SHIFT))
		return 0;

	id_sz = (__pa(high_memory-1) <= base + size) ?
				__pa(high_memory) - base : size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid,
			cattr_name(pcm),
			base, (unsigned long long)(base + size-1));
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only; after a successful memtype_reserve(), this
 * function also keeps the identity mapping (if any) in sync with the new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
			     int strict_prot)
{
	int is_ram = 0;
	int ret;
	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
	enum page_cache_mode pcm = want_pcm;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
	 * track of the number of mappings of RAM pages. We can assert that
	 * the type requested matches the type of the first page in the range.
	 */
	if (is_ram) {
		if (!pat_enabled())
			return 0;

		pcm = lookup_memtype(paddr);
		if (want_pcm != pcm) {
			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
					      (~_PAGE_CACHE_MASK)) |
					     cachemode2protval(pcm));
		}
		return 0;
	}

	ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm);
	if (ret)
		return ret;

	if (pcm != want_pcm) {
		if (strict_prot ||
		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
			memtype_free(paddr, paddr + size);
			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
			       current->comm, current->pid,
			       cattr_name(want_pcm),
			       (unsigned long long)paddr,
			       (unsigned long long)(paddr + size - 1),
			       cattr_name(pcm));
			return -EINVAL;
		}
		/*
		 * We allow returning a different type than the one requested
		 * in the non-strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
				     cachemode2protval(pcm));
	}

	if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
		memtype_free(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to free a range of physical memory.
 * Frees non-RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		memtype_free(paddr, paddr + size);
}
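
/*
 * The track_pfn_*()/untrack_pfn*() helpers below tie the reservations above
 * to the lifetime of pfnmap VMAs: remap_pfn_range() calls track_pfn_remap()
 * (which sets VM_PAT once a whole-VMA range has been reserved), fork()
 * re-reserves the range via track_pfn_copy(), and unmapping releases it via
 * untrack_pfn(). Drivers normally never call these directly; a sketch of the
 * usual entry point (hypothetical my_mmap() file operation):
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 *		return remap_pfn_range(vma, vma->vm_start, pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 */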

/*
 * track_pfn_copy is called when a vma that is covering the pfnmap gets
 * copied through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from the pte and reserve the entire vma range with a single
 * reserve_pfn_range() call.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long prot;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	if (vma->vm_flags & VM_PAT) {
		/*
		 * reserve the whole chunk covered by vma. We need the
		 * starting address and protection from pte.
		 */
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		pgprot = __pgprot(prot);
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}

/*
 * prot is passed in as a parameter for the new mapping. If the vma has
 * a linear pfn mapping for the entire range, or no vma is provided,
 * reserve the entire pfn + size range with a single reserve_pfn_range()
 * call.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{
	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
	enum page_cache_mode pcm;

	/* reserve the whole chunk starting from paddr */
	if (!vma || (addr == vma->vm_start
		     && size == (vma->vm_end - vma->vm_start))) {
		int ret;

		ret = reserve_pfn_range(paddr, size, prot, 0);
		if (ret == 0 && vma)
			vm_flags_set(vma, VM_PAT);
		return ret;
	}

	if (!pat_enabled())
		return 0;

	/*
	 * For anything smaller than the vma size we set prot based on the
	 * lookup.
	 */
	pcm = lookup_memtype(paddr);

	/* Check memtype for the remaining pages */
	while (size > PAGE_SIZE) {
		size -= PAGE_SIZE;
		paddr += PAGE_SIZE;
		if (pcm != lookup_memtype(paddr))
			return -EINVAL;
	}

	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}

void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
{
	enum page_cache_mode pcm;

	if (!pat_enabled())
		return;

	/* Set prot based on lookup */
	pcm = lookup_memtype(pfn_t_to_phys(pfn));
	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
		 unsigned long size, bool mm_wr_locked)
{
	resource_size_t paddr;
	unsigned long prot;

	if (vma && !(vma->vm_flags & VM_PAT))
		return;

	/* free the chunk starting from pfn or the whole chunk */
	paddr = (resource_size_t)pfn << PAGE_SHIFT;
	if (!paddr && !size) {
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return;
		}

		size = vma->vm_end - vma->vm_start;
	}
	free_pfn_range(paddr, size);
	if (vma) {
		if (mm_wr_locked)
			vm_flags_clear(vma, VM_PAT);
		else
			__vm_flags_mod(vma, 0, VM_PAT);
	}
}

/*
 * untrack_pfn_moved is called while mremapping a pfnmap for a new region,
 * with the old vma after its pfnmap page table has been removed.  The new
 * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
 */
void untrack_pfn_moved(struct vm_area_struct *vma)
{
	vm_flags_clear(vma, VM_PAT);
}
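
/*
 * pgprot helpers for drivers: these only fold the requested cache mode into
 * a pgprot_t; the conflict checking still happens via the track_pfn_*() and
 * memtype_reserve() paths above. Note that with PAT disabled a WC request
 * typically degrades to UC- (see the table emulated in pat_bp_init()).
 */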

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
			cachemode2protval(_PAGE_CACHE_MODE_WC));
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

pgprot_t pgprot_writethrough(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
			cachemode2protval(_PAGE_CACHE_MODE_WT));
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/*
 * We are allocating a temporary printout-entry to be passed
 * between seq_start()/next() and seq_show():
 */
static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *entry_print;
	int ret;

	entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!entry_print)
		return NULL;

	spin_lock(&memtype_lock);
	ret = memtype_copy_nth_element(entry_print, pos);
	spin_unlock(&memtype_lock);

	/* Free it on error: */
	if (ret) {
		kfree(entry_print);
		return NULL;
	}

	return entry_print;
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_puts(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	kfree(v);
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
	kfree(v);
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *entry_print = (struct memtype *)v;

	seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n",
		   entry_print->start,
		   entry_print->end,
		   cattr_name(entry_print->type));

	return 0;
}

static const struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
	if (pat_enabled()) {
		debugfs_create_file("pat_memtype_list", S_IRUSR,
				    arch_debugfs_dir, NULL, &memtype_fops);
	}
	return 0;
}
late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */