1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Page Attribute Table (PAT) support: handle memory caching attributes in page tables. 4 * 5 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> 6 * Suresh B Siddha <suresh.b.siddha@intel.com> 7 * 8 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 9 * 10 * Basic principles: 11 * 12 * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and 13 * the kernel to set one of a handful of 'caching type' attributes for physical 14 * memory ranges: uncached, write-combining, write-through, write-protected, 15 * and the most commonly used and default attribute: write-back caching. 16 * 17 * PAT support supercedes and augments MTRR support in a compatible fashion: MTRR is 18 * a hardware interface to enumerate a limited number of physical memory ranges 19 * and set their caching attributes explicitly, programmed into the CPU via MSRs. 20 * Even modern CPUs have MTRRs enabled - but these are typically not touched 21 * by the kernel or by user-space (such as the X server), we rely on PAT for any 22 * additional cache attribute logic. 23 * 24 * PAT doesn't work via explicit memory ranges, but uses page table entries to add 25 * cache attribute information to the mapped memory range: there's 3 bits used, 26 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the 27 * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT). 28 * 29 * ( There's a metric ton of finer details, such as compatibility with CPU quirks 30 * that only support 4 types of PAT entries, and interaction with MTRRs, see 31 * below for details. ) 32 */ 33 34 #include <linux/seq_file.h> 35 #include <linux/memblock.h> 36 #include <linux/debugfs.h> 37 #include <linux/ioport.h> 38 #include <linux/kernel.h> 39 #include <linux/pfn_t.h> 40 #include <linux/slab.h> 41 #include <linux/mm.h> 42 #include <linux/fs.h> 43 #include <linux/rbtree.h> 44 45 #include <asm/cacheflush.h> 46 #include <asm/processor.h> 47 #include <asm/tlbflush.h> 48 #include <asm/x86_init.h> 49 #include <asm/fcntl.h> 50 #include <asm/e820/api.h> 51 #include <asm/mtrr.h> 52 #include <asm/page.h> 53 #include <asm/msr.h> 54 #include <asm/memtype.h> 55 #include <asm/io.h> 56 57 #include "memtype.h" 58 #include "../mm_internal.h" 59 60 #undef pr_fmt 61 #define pr_fmt(fmt) "" fmt 62 63 static bool __read_mostly pat_bp_initialized; 64 static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); 65 static bool __read_mostly pat_bp_enabled; 66 static bool __read_mostly pat_cm_initialized; 67 68 /* 69 * PAT support is enabled by default, but can be disabled for 70 * various user-requested or hardware-forced reasons: 71 */ 72 void pat_disable(const char *msg_reason) 73 { 74 if (pat_disabled) 75 return; 76 77 if (pat_bp_initialized) { 78 WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); 79 return; 80 } 81 82 pat_disabled = true; 83 pr_info("x86/PAT: %s\n", msg_reason); 84 } 85 86 static int __init nopat(char *str) 87 { 88 pat_disable("PAT support disabled via boot option."); 89 return 0; 90 } 91 early_param("nopat", nopat); 92 93 bool pat_enabled(void) 94 { 95 return pat_bp_enabled; 96 } 97 EXPORT_SYMBOL_GPL(pat_enabled); 98 99 int pat_debug_enable; 100 101 static int __init pat_debug_setup(char *str) 102 { 103 pat_debug_enable = 1; 104 return 0; 105 } 106 __setup("debugpat", pat_debug_setup); 107 108 #ifdef CONFIG_X86_PAT 109 /* 110 * X86 PAT uses page flags arch_1 and uncached together to keep track of 111 * memory type of pages that have backing page struct. 112 * 113 * X86 PAT supports 4 different memory types: 114 * - _PAGE_CACHE_MODE_WB 115 * - _PAGE_CACHE_MODE_WC 116 * - _PAGE_CACHE_MODE_UC_MINUS 117 * - _PAGE_CACHE_MODE_WT 118 * 119 * _PAGE_CACHE_MODE_WB is the default type. 120 */ 121 122 #define _PGMT_WB 0 123 #define _PGMT_WC (1UL << PG_arch_1) 124 #define _PGMT_UC_MINUS (1UL << PG_uncached) 125 #define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) 126 #define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) 127 #define _PGMT_CLEAR_MASK (~_PGMT_MASK) 128 129 static inline enum page_cache_mode get_page_memtype(struct page *pg) 130 { 131 unsigned long pg_flags = pg->flags & _PGMT_MASK; 132 133 if (pg_flags == _PGMT_WB) 134 return _PAGE_CACHE_MODE_WB; 135 else if (pg_flags == _PGMT_WC) 136 return _PAGE_CACHE_MODE_WC; 137 else if (pg_flags == _PGMT_UC_MINUS) 138 return _PAGE_CACHE_MODE_UC_MINUS; 139 else 140 return _PAGE_CACHE_MODE_WT; 141 } 142 143 static inline void set_page_memtype(struct page *pg, 144 enum page_cache_mode memtype) 145 { 146 unsigned long memtype_flags; 147 unsigned long old_flags; 148 unsigned long new_flags; 149 150 switch (memtype) { 151 case _PAGE_CACHE_MODE_WC: 152 memtype_flags = _PGMT_WC; 153 break; 154 case _PAGE_CACHE_MODE_UC_MINUS: 155 memtype_flags = _PGMT_UC_MINUS; 156 break; 157 case _PAGE_CACHE_MODE_WT: 158 memtype_flags = _PGMT_WT; 159 break; 160 case _PAGE_CACHE_MODE_WB: 161 default: 162 memtype_flags = _PGMT_WB; 163 break; 164 } 165 166 do { 167 old_flags = pg->flags; 168 new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; 169 } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); 170 } 171 #else 172 static inline enum page_cache_mode get_page_memtype(struct page *pg) 173 { 174 return -1; 175 } 176 static inline void set_page_memtype(struct page *pg, 177 enum page_cache_mode memtype) 178 { 179 } 180 #endif 181 182 enum { 183 PAT_UC = 0, /* uncached */ 184 PAT_WC = 1, /* Write combining */ 185 PAT_WT = 4, /* Write Through */ 186 PAT_WP = 5, /* Write Protected */ 187 PAT_WB = 6, /* Write Back (default) */ 188 PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ 189 }; 190 191 #define CM(c) (_PAGE_CACHE_MODE_ ## c) 192 193 static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) 194 { 195 enum page_cache_mode cache; 196 char *cache_mode; 197 198 switch (pat_val) { 199 case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; 200 case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; 201 case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; 202 case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; 203 case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; 204 case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; 205 default: cache = CM(WB); cache_mode = "WB "; break; 206 } 207 208 memcpy(msg, cache_mode, 4); 209 210 return cache; 211 } 212 213 #undef CM 214 215 /* 216 * Update the cache mode to pgprot translation tables according to PAT 217 * configuration. 218 * Using lower indices is preferred, so we start with highest index. 219 */ 220 static void __init_cache_modes(u64 pat) 221 { 222 enum page_cache_mode cache; 223 char pat_msg[33]; 224 int i; 225 226 WARN_ON_ONCE(pat_cm_initialized); 227 228 pat_msg[32] = 0; 229 for (i = 7; i >= 0; i--) { 230 cache = pat_get_cache_mode((pat >> (i * 8)) & 7, 231 pat_msg + 4 * i); 232 update_cache_mode_entry(i, cache); 233 } 234 pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); 235 236 pat_cm_initialized = true; 237 } 238 239 #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) 240 241 static void pat_bp_init(u64 pat) 242 { 243 u64 tmp_pat; 244 245 if (!boot_cpu_has(X86_FEATURE_PAT)) { 246 pat_disable("PAT not supported by the CPU."); 247 return; 248 } 249 250 rdmsrl(MSR_IA32_CR_PAT, tmp_pat); 251 if (!tmp_pat) { 252 pat_disable("PAT support disabled by the firmware."); 253 return; 254 } 255 256 wrmsrl(MSR_IA32_CR_PAT, pat); 257 pat_bp_enabled = true; 258 259 __init_cache_modes(pat); 260 } 261 262 static void pat_ap_init(u64 pat) 263 { 264 if (!boot_cpu_has(X86_FEATURE_PAT)) { 265 /* 266 * If this happens we are on a secondary CPU, but switched to 267 * PAT on the boot CPU. We have no way to undo PAT. 268 */ 269 panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); 270 } 271 272 wrmsrl(MSR_IA32_CR_PAT, pat); 273 } 274 275 void init_cache_modes(void) 276 { 277 u64 pat = 0; 278 279 if (pat_cm_initialized) 280 return; 281 282 if (boot_cpu_has(X86_FEATURE_PAT)) { 283 /* 284 * CPU supports PAT. Set PAT table to be consistent with 285 * PAT MSR. This case supports "nopat" boot option, and 286 * virtual machine environments which support PAT without 287 * MTRRs. In specific, Xen has unique setup to PAT MSR. 288 * 289 * If PAT MSR returns 0, it is considered invalid and emulates 290 * as No PAT. 291 */ 292 rdmsrl(MSR_IA32_CR_PAT, pat); 293 } 294 295 if (!pat) { 296 /* 297 * No PAT. Emulate the PAT table that corresponds to the two 298 * cache bits, PWT (Write Through) and PCD (Cache Disable). 299 * This setup is also the same as the BIOS default setup. 300 * 301 * PTE encoding: 302 * 303 * PCD 304 * |PWT PAT 305 * || slot 306 * 00 0 WB : _PAGE_CACHE_MODE_WB 307 * 01 1 WT : _PAGE_CACHE_MODE_WT 308 * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS 309 * 11 3 UC : _PAGE_CACHE_MODE_UC 310 * 311 * NOTE: When WC or WP is used, it is redirected to UC- per 312 * the default setup in __cachemode2pte_tbl[]. 313 */ 314 pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | 315 PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); 316 } 317 318 __init_cache_modes(pat); 319 } 320 321 /** 322 * pat_init - Initialize the PAT MSR and PAT table on the current CPU 323 * 324 * This function initializes PAT MSR and PAT table with an OS-defined value 325 * to enable additional cache attributes, WC, WT and WP. 326 * 327 * This function must be called on all CPUs using the specific sequence of 328 * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this 329 * procedure for PAT. 330 */ 331 void pat_init(void) 332 { 333 u64 pat; 334 struct cpuinfo_x86 *c = &boot_cpu_data; 335 336 #ifndef CONFIG_X86_PAT 337 pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); 338 #endif 339 340 if (pat_disabled) 341 return; 342 343 if ((c->x86_vendor == X86_VENDOR_INTEL) && 344 (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || 345 ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { 346 /* 347 * PAT support with the lower four entries. Intel Pentium 2, 348 * 3, M, and 4 are affected by PAT errata, which makes the 349 * upper four entries unusable. To be on the safe side, we don't 350 * use those. 351 * 352 * PTE encoding: 353 * PAT 354 * |PCD 355 * ||PWT PAT 356 * ||| slot 357 * 000 0 WB : _PAGE_CACHE_MODE_WB 358 * 001 1 WC : _PAGE_CACHE_MODE_WC 359 * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS 360 * 011 3 UC : _PAGE_CACHE_MODE_UC 361 * PAT bit unused 362 * 363 * NOTE: When WT or WP is used, it is redirected to UC- per 364 * the default setup in __cachemode2pte_tbl[]. 365 */ 366 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | 367 PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); 368 } else { 369 /* 370 * Full PAT support. We put WT in slot 7 to improve 371 * robustness in the presence of errata that might cause 372 * the high PAT bit to be ignored. This way, a buggy slot 7 373 * access will hit slot 3, and slot 3 is UC, so at worst 374 * we lose performance without causing a correctness issue. 375 * Pentium 4 erratum N46 is an example for such an erratum, 376 * although we try not to use PAT at all on affected CPUs. 377 * 378 * PTE encoding: 379 * PAT 380 * |PCD 381 * ||PWT PAT 382 * ||| slot 383 * 000 0 WB : _PAGE_CACHE_MODE_WB 384 * 001 1 WC : _PAGE_CACHE_MODE_WC 385 * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS 386 * 011 3 UC : _PAGE_CACHE_MODE_UC 387 * 100 4 WB : Reserved 388 * 101 5 WP : _PAGE_CACHE_MODE_WP 389 * 110 6 UC-: Reserved 390 * 111 7 WT : _PAGE_CACHE_MODE_WT 391 * 392 * The reserved slots are unused, but mapped to their 393 * corresponding types in the presence of PAT errata. 394 */ 395 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | 396 PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); 397 } 398 399 if (!pat_bp_initialized) { 400 pat_bp_init(pat); 401 pat_bp_initialized = true; 402 } else { 403 pat_ap_init(pat); 404 } 405 } 406 407 #undef PAT 408 409 static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ 410 411 /* 412 * Does intersection of PAT memory type and MTRR memory type and returns 413 * the resulting memory type as PAT understands it. 414 * (Type in pat and mtrr will not have same value) 415 * The intersection is based on "Effective Memory Type" tables in IA-32 416 * SDM vol 3a 417 */ 418 static unsigned long pat_x_mtrr_type(u64 start, u64 end, 419 enum page_cache_mode req_type) 420 { 421 /* 422 * Look for MTRR hint to get the effective type in case where PAT 423 * request is for WB. 424 */ 425 if (req_type == _PAGE_CACHE_MODE_WB) { 426 u8 mtrr_type, uniform; 427 428 mtrr_type = mtrr_type_lookup(start, end, &uniform); 429 if (mtrr_type != MTRR_TYPE_WRBACK) 430 return _PAGE_CACHE_MODE_UC_MINUS; 431 432 return _PAGE_CACHE_MODE_WB; 433 } 434 435 return req_type; 436 } 437 438 struct pagerange_state { 439 unsigned long cur_pfn; 440 int ram; 441 int not_ram; 442 }; 443 444 static int 445 pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) 446 { 447 struct pagerange_state *state = arg; 448 449 state->not_ram |= initial_pfn > state->cur_pfn; 450 state->ram |= total_nr_pages > 0; 451 state->cur_pfn = initial_pfn + total_nr_pages; 452 453 return state->ram && state->not_ram; 454 } 455 456 static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) 457 { 458 int ret = 0; 459 unsigned long start_pfn = start >> PAGE_SHIFT; 460 unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; 461 struct pagerange_state state = {start_pfn, 0, 0}; 462 463 /* 464 * For legacy reasons, physical address range in the legacy ISA 465 * region is tracked as non-RAM. This will allow users of 466 * /dev/mem to map portions of legacy ISA region, even when 467 * some of those portions are listed(or not even listed) with 468 * different e820 types(RAM/reserved/..) 469 */ 470 if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) 471 start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; 472 473 if (start_pfn < end_pfn) { 474 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, 475 &state, pagerange_is_ram_callback); 476 } 477 478 return (ret > 0) ? -1 : (state.ram ? 1 : 0); 479 } 480 481 /* 482 * For RAM pages, we use page flags to mark the pages with appropriate type. 483 * The page flags are limited to four types, WB (default), WC, WT and UC-. 484 * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting 485 * a new memory type is only allowed for a page mapped with the default WB 486 * type. 487 * 488 * Here we do two passes: 489 * - Find the memtype of all the pages in the range, look for any conflicts. 490 * - In case of no conflicts, set the new memtype for pages in the range. 491 */ 492 static int reserve_ram_pages_type(u64 start, u64 end, 493 enum page_cache_mode req_type, 494 enum page_cache_mode *new_type) 495 { 496 struct page *page; 497 u64 pfn; 498 499 if (req_type == _PAGE_CACHE_MODE_WP) { 500 if (new_type) 501 *new_type = _PAGE_CACHE_MODE_UC_MINUS; 502 return -EINVAL; 503 } 504 505 if (req_type == _PAGE_CACHE_MODE_UC) { 506 /* We do not support strong UC */ 507 WARN_ON_ONCE(1); 508 req_type = _PAGE_CACHE_MODE_UC_MINUS; 509 } 510 511 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 512 enum page_cache_mode type; 513 514 page = pfn_to_page(pfn); 515 type = get_page_memtype(page); 516 if (type != _PAGE_CACHE_MODE_WB) { 517 pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", 518 start, end - 1, type, req_type); 519 if (new_type) 520 *new_type = type; 521 522 return -EBUSY; 523 } 524 } 525 526 if (new_type) 527 *new_type = req_type; 528 529 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 530 page = pfn_to_page(pfn); 531 set_page_memtype(page, req_type); 532 } 533 return 0; 534 } 535 536 static int free_ram_pages_type(u64 start, u64 end) 537 { 538 struct page *page; 539 u64 pfn; 540 541 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { 542 page = pfn_to_page(pfn); 543 set_page_memtype(page, _PAGE_CACHE_MODE_WB); 544 } 545 return 0; 546 } 547 548 static u64 sanitize_phys(u64 address) 549 { 550 /* 551 * When changing the memtype for pages containing poison allow 552 * for a "decoy" virtual address (bit 63 clear) passed to 553 * set_memory_X(). __pa() on a "decoy" address results in a 554 * physical address with bit 63 set. 555 * 556 * Decoy addresses are not present for 32-bit builds, see 557 * set_mce_nospec(). 558 */ 559 if (IS_ENABLED(CONFIG_X86_64)) 560 return address & __PHYSICAL_MASK; 561 return address; 562 } 563 564 /* 565 * req_type typically has one of the: 566 * - _PAGE_CACHE_MODE_WB 567 * - _PAGE_CACHE_MODE_WC 568 * - _PAGE_CACHE_MODE_UC_MINUS 569 * - _PAGE_CACHE_MODE_UC 570 * - _PAGE_CACHE_MODE_WT 571 * 572 * If new_type is NULL, function will return an error if it cannot reserve the 573 * region with req_type. If new_type is non-NULL, function will return 574 * available type in new_type in case of no error. In case of any error 575 * it will return a negative return value. 576 */ 577 int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type, 578 enum page_cache_mode *new_type) 579 { 580 struct memtype *entry_new; 581 enum page_cache_mode actual_type; 582 int is_range_ram; 583 int err = 0; 584 585 start = sanitize_phys(start); 586 end = sanitize_phys(end); 587 if (start >= end) { 588 WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, 589 start, end - 1, cattr_name(req_type)); 590 return -EINVAL; 591 } 592 593 if (!pat_enabled()) { 594 /* This is identical to page table setting without PAT */ 595 if (new_type) 596 *new_type = req_type; 597 return 0; 598 } 599 600 /* Low ISA region is always mapped WB in page table. No need to track */ 601 if (x86_platform.is_untracked_pat_range(start, end)) { 602 if (new_type) 603 *new_type = _PAGE_CACHE_MODE_WB; 604 return 0; 605 } 606 607 /* 608 * Call mtrr_lookup to get the type hint. This is an 609 * optimization for /dev/mem mmap'ers into WB memory (BIOS 610 * tools and ACPI tools). Use WB request for WB memory and use 611 * UC_MINUS otherwise. 612 */ 613 actual_type = pat_x_mtrr_type(start, end, req_type); 614 615 if (new_type) 616 *new_type = actual_type; 617 618 is_range_ram = pat_pagerange_is_ram(start, end); 619 if (is_range_ram == 1) { 620 621 err = reserve_ram_pages_type(start, end, req_type, new_type); 622 623 return err; 624 } else if (is_range_ram < 0) { 625 return -EINVAL; 626 } 627 628 entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL); 629 if (!entry_new) 630 return -ENOMEM; 631 632 entry_new->start = start; 633 entry_new->end = end; 634 entry_new->type = actual_type; 635 636 spin_lock(&memtype_lock); 637 638 err = memtype_check_insert(entry_new, new_type); 639 if (err) { 640 pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n", 641 start, end - 1, 642 cattr_name(entry_new->type), cattr_name(req_type)); 643 kfree(entry_new); 644 spin_unlock(&memtype_lock); 645 646 return err; 647 } 648 649 spin_unlock(&memtype_lock); 650 651 dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", 652 start, end - 1, cattr_name(entry_new->type), cattr_name(req_type), 653 new_type ? cattr_name(*new_type) : "-"); 654 655 return err; 656 } 657 658 int memtype_free(u64 start, u64 end) 659 { 660 int is_range_ram; 661 struct memtype *entry_old; 662 663 if (!pat_enabled()) 664 return 0; 665 666 start = sanitize_phys(start); 667 end = sanitize_phys(end); 668 669 /* Low ISA region is always mapped WB. No need to track */ 670 if (x86_platform.is_untracked_pat_range(start, end)) 671 return 0; 672 673 is_range_ram = pat_pagerange_is_ram(start, end); 674 if (is_range_ram == 1) 675 return free_ram_pages_type(start, end); 676 if (is_range_ram < 0) 677 return -EINVAL; 678 679 spin_lock(&memtype_lock); 680 entry_old = memtype_erase(start, end); 681 spin_unlock(&memtype_lock); 682 683 if (IS_ERR(entry_old)) { 684 pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", 685 current->comm, current->pid, start, end - 1); 686 return -EINVAL; 687 } 688 689 kfree(entry_old); 690 691 dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1); 692 693 return 0; 694 } 695 696 697 /** 698 * lookup_memtype - Looks up the memory type for a physical address 699 * @paddr: physical address of which memory type needs to be looked up 700 * 701 * Only to be called when PAT is enabled 702 * 703 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS 704 * or _PAGE_CACHE_MODE_WT. 705 */ 706 static enum page_cache_mode lookup_memtype(u64 paddr) 707 { 708 enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB; 709 struct memtype *entry; 710 711 if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) 712 return rettype; 713 714 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { 715 struct page *page; 716 717 page = pfn_to_page(paddr >> PAGE_SHIFT); 718 return get_page_memtype(page); 719 } 720 721 spin_lock(&memtype_lock); 722 723 entry = memtype_lookup(paddr); 724 if (entry != NULL) 725 rettype = entry->type; 726 else 727 rettype = _PAGE_CACHE_MODE_UC_MINUS; 728 729 spin_unlock(&memtype_lock); 730 731 return rettype; 732 } 733 734 /** 735 * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type 736 * of @pfn cannot be overridden by UC MTRR memory type. 737 * 738 * Only to be called when PAT is enabled. 739 * 740 * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC. 741 * Returns false in other cases. 742 */ 743 bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) 744 { 745 enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn)); 746 747 return cm == _PAGE_CACHE_MODE_UC || 748 cm == _PAGE_CACHE_MODE_UC_MINUS || 749 cm == _PAGE_CACHE_MODE_WC; 750 } 751 EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); 752 753 /** 754 * memtype_reserve_io - Request a memory type mapping for a region of memory 755 * @start: start (physical address) of the region 756 * @end: end (physical address) of the region 757 * @type: A pointer to memtype, with requested type. On success, requested 758 * or any other compatible type that was available for the region is returned 759 * 760 * On success, returns 0 761 * On failure, returns non-zero 762 */ 763 int memtype_reserve_io(resource_size_t start, resource_size_t end, 764 enum page_cache_mode *type) 765 { 766 resource_size_t size = end - start; 767 enum page_cache_mode req_type = *type; 768 enum page_cache_mode new_type; 769 int ret; 770 771 WARN_ON_ONCE(iomem_map_sanity_check(start, size)); 772 773 ret = memtype_reserve(start, end, req_type, &new_type); 774 if (ret) 775 goto out_err; 776 777 if (!is_new_memtype_allowed(start, size, req_type, new_type)) 778 goto out_free; 779 780 if (memtype_kernel_map_sync(start, size, new_type) < 0) 781 goto out_free; 782 783 *type = new_type; 784 return 0; 785 786 out_free: 787 memtype_free(start, end); 788 ret = -EBUSY; 789 out_err: 790 return ret; 791 } 792 793 /** 794 * memtype_free_io - Release a memory type mapping for a region of memory 795 * @start: start (physical address) of the region 796 * @end: end (physical address) of the region 797 */ 798 void memtype_free_io(resource_size_t start, resource_size_t end) 799 { 800 memtype_free(start, end); 801 } 802 803 #ifdef CONFIG_X86_PAT 804 int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) 805 { 806 enum page_cache_mode type = _PAGE_CACHE_MODE_WC; 807 808 return memtype_reserve_io(start, start + size, &type); 809 } 810 EXPORT_SYMBOL(arch_io_reserve_memtype_wc); 811 812 void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) 813 { 814 memtype_free_io(start, start + size); 815 } 816 EXPORT_SYMBOL(arch_io_free_memtype_wc); 817 #endif 818 819 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 820 unsigned long size, pgprot_t vma_prot) 821 { 822 if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) 823 vma_prot = pgprot_decrypted(vma_prot); 824 825 return vma_prot; 826 } 827 828 #ifdef CONFIG_STRICT_DEVMEM 829 /* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ 830 static inline int range_is_allowed(unsigned long pfn, unsigned long size) 831 { 832 return 1; 833 } 834 #else 835 /* This check is needed to avoid cache aliasing when PAT is enabled */ 836 static inline int range_is_allowed(unsigned long pfn, unsigned long size) 837 { 838 u64 from = ((u64)pfn) << PAGE_SHIFT; 839 u64 to = from + size; 840 u64 cursor = from; 841 842 if (!pat_enabled()) 843 return 1; 844 845 while (cursor < to) { 846 if (!devmem_is_allowed(pfn)) 847 return 0; 848 cursor += PAGE_SIZE; 849 pfn++; 850 } 851 return 1; 852 } 853 #endif /* CONFIG_STRICT_DEVMEM */ 854 855 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 856 unsigned long size, pgprot_t *vma_prot) 857 { 858 enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; 859 860 if (!range_is_allowed(pfn, size)) 861 return 0; 862 863 if (file->f_flags & O_DSYNC) 864 pcm = _PAGE_CACHE_MODE_UC_MINUS; 865 866 *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | 867 cachemode2protval(pcm)); 868 return 1; 869 } 870 871 /* 872 * Change the memory type for the physical address range in kernel identity 873 * mapping space if that range is a part of identity map. 874 */ 875 int memtype_kernel_map_sync(u64 base, unsigned long size, 876 enum page_cache_mode pcm) 877 { 878 unsigned long id_sz; 879 880 if (base > __pa(high_memory-1)) 881 return 0; 882 883 /* 884 * Some areas in the middle of the kernel identity range 885 * are not mapped, for example the PCI space. 886 */ 887 if (!page_is_ram(base >> PAGE_SHIFT)) 888 return 0; 889 890 id_sz = (__pa(high_memory-1) <= base + size) ? 891 __pa(high_memory) - base : size; 892 893 if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { 894 pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", 895 current->comm, current->pid, 896 cattr_name(pcm), 897 base, (unsigned long long)(base + size-1)); 898 return -EINVAL; 899 } 900 return 0; 901 } 902 903 /* 904 * Internal interface to reserve a range of physical memory with prot. 905 * Reserved non RAM regions only and after successful memtype_reserve, 906 * this func also keeps identity mapping (if any) in sync with this new prot. 907 */ 908 static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, 909 int strict_prot) 910 { 911 int is_ram = 0; 912 int ret; 913 enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot); 914 enum page_cache_mode pcm = want_pcm; 915 916 is_ram = pat_pagerange_is_ram(paddr, paddr + size); 917 918 /* 919 * reserve_pfn_range() for RAM pages. We do not refcount to keep 920 * track of number of mappings of RAM pages. We can assert that 921 * the type requested matches the type of first page in the range. 922 */ 923 if (is_ram) { 924 if (!pat_enabled()) 925 return 0; 926 927 pcm = lookup_memtype(paddr); 928 if (want_pcm != pcm) { 929 pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", 930 current->comm, current->pid, 931 cattr_name(want_pcm), 932 (unsigned long long)paddr, 933 (unsigned long long)(paddr + size - 1), 934 cattr_name(pcm)); 935 *vma_prot = __pgprot((pgprot_val(*vma_prot) & 936 (~_PAGE_CACHE_MASK)) | 937 cachemode2protval(pcm)); 938 } 939 return 0; 940 } 941 942 ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm); 943 if (ret) 944 return ret; 945 946 if (pcm != want_pcm) { 947 if (strict_prot || 948 !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { 949 memtype_free(paddr, paddr + size); 950 pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", 951 current->comm, current->pid, 952 cattr_name(want_pcm), 953 (unsigned long long)paddr, 954 (unsigned long long)(paddr + size - 1), 955 cattr_name(pcm)); 956 return -EINVAL; 957 } 958 /* 959 * We allow returning different type than the one requested in 960 * non strict case. 961 */ 962 *vma_prot = __pgprot((pgprot_val(*vma_prot) & 963 (~_PAGE_CACHE_MASK)) | 964 cachemode2protval(pcm)); 965 } 966 967 if (memtype_kernel_map_sync(paddr, size, pcm) < 0) { 968 memtype_free(paddr, paddr + size); 969 return -EINVAL; 970 } 971 return 0; 972 } 973 974 /* 975 * Internal interface to free a range of physical memory. 976 * Frees non RAM regions only. 977 */ 978 static void free_pfn_range(u64 paddr, unsigned long size) 979 { 980 int is_ram; 981 982 is_ram = pat_pagerange_is_ram(paddr, paddr + size); 983 if (is_ram == 0) 984 memtype_free(paddr, paddr + size); 985 } 986 987 /* 988 * track_pfn_copy is called when vma that is covering the pfnmap gets 989 * copied through copy_page_range(). 990 * 991 * If the vma has a linear pfn mapping for the entire range, we get the prot 992 * from pte and reserve the entire vma range with single reserve_pfn_range call. 993 */ 994 int track_pfn_copy(struct vm_area_struct *vma) 995 { 996 resource_size_t paddr; 997 unsigned long prot; 998 unsigned long vma_size = vma->vm_end - vma->vm_start; 999 pgprot_t pgprot; 1000 1001 if (vma->vm_flags & VM_PAT) { 1002 /* 1003 * reserve the whole chunk covered by vma. We need the 1004 * starting address and protection from pte. 1005 */ 1006 if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { 1007 WARN_ON_ONCE(1); 1008 return -EINVAL; 1009 } 1010 pgprot = __pgprot(prot); 1011 return reserve_pfn_range(paddr, vma_size, &pgprot, 1); 1012 } 1013 1014 return 0; 1015 } 1016 1017 /* 1018 * prot is passed in as a parameter for the new mapping. If the vma has 1019 * a linear pfn mapping for the entire range, or no vma is provided, 1020 * reserve the entire pfn + size range with single reserve_pfn_range 1021 * call. 1022 */ 1023 int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, 1024 unsigned long pfn, unsigned long addr, unsigned long size) 1025 { 1026 resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; 1027 enum page_cache_mode pcm; 1028 1029 /* reserve the whole chunk starting from paddr */ 1030 if (!vma || (addr == vma->vm_start 1031 && size == (vma->vm_end - vma->vm_start))) { 1032 int ret; 1033 1034 ret = reserve_pfn_range(paddr, size, prot, 0); 1035 if (ret == 0 && vma) 1036 vma->vm_flags |= VM_PAT; 1037 return ret; 1038 } 1039 1040 if (!pat_enabled()) 1041 return 0; 1042 1043 /* 1044 * For anything smaller than the vma size we set prot based on the 1045 * lookup. 1046 */ 1047 pcm = lookup_memtype(paddr); 1048 1049 /* Check memtype for the remaining pages */ 1050 while (size > PAGE_SIZE) { 1051 size -= PAGE_SIZE; 1052 paddr += PAGE_SIZE; 1053 if (pcm != lookup_memtype(paddr)) 1054 return -EINVAL; 1055 } 1056 1057 *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | 1058 cachemode2protval(pcm)); 1059 1060 return 0; 1061 } 1062 1063 void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) 1064 { 1065 enum page_cache_mode pcm; 1066 1067 if (!pat_enabled()) 1068 return; 1069 1070 /* Set prot based on lookup */ 1071 pcm = lookup_memtype(pfn_t_to_phys(pfn)); 1072 *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | 1073 cachemode2protval(pcm)); 1074 } 1075 1076 /* 1077 * untrack_pfn is called while unmapping a pfnmap for a region. 1078 * untrack can be called for a specific region indicated by pfn and size or 1079 * can be for the entire vma (in which case pfn, size are zero). 1080 */ 1081 void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, 1082 unsigned long size) 1083 { 1084 resource_size_t paddr; 1085 unsigned long prot; 1086 1087 if (vma && !(vma->vm_flags & VM_PAT)) 1088 return; 1089 1090 /* free the chunk starting from pfn or the whole chunk */ 1091 paddr = (resource_size_t)pfn << PAGE_SHIFT; 1092 if (!paddr && !size) { 1093 if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { 1094 WARN_ON_ONCE(1); 1095 return; 1096 } 1097 1098 size = vma->vm_end - vma->vm_start; 1099 } 1100 free_pfn_range(paddr, size); 1101 if (vma) 1102 vma->vm_flags &= ~VM_PAT; 1103 } 1104 1105 /* 1106 * untrack_pfn_moved is called, while mremapping a pfnmap for a new region, 1107 * with the old vma after its pfnmap page table has been removed. The new 1108 * vma has a new pfnmap to the same pfn & cache type with VM_PAT set. 1109 */ 1110 void untrack_pfn_moved(struct vm_area_struct *vma) 1111 { 1112 vma->vm_flags &= ~VM_PAT; 1113 } 1114 1115 pgprot_t pgprot_writecombine(pgprot_t prot) 1116 { 1117 return __pgprot(pgprot_val(prot) | 1118 cachemode2protval(_PAGE_CACHE_MODE_WC)); 1119 } 1120 EXPORT_SYMBOL_GPL(pgprot_writecombine); 1121 1122 pgprot_t pgprot_writethrough(pgprot_t prot) 1123 { 1124 return __pgprot(pgprot_val(prot) | 1125 cachemode2protval(_PAGE_CACHE_MODE_WT)); 1126 } 1127 EXPORT_SYMBOL_GPL(pgprot_writethrough); 1128 1129 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) 1130 1131 /* 1132 * We are allocating a temporary printout-entry to be passed 1133 * between seq_start()/next() and seq_show(): 1134 */ 1135 static struct memtype *memtype_get_idx(loff_t pos) 1136 { 1137 struct memtype *entry_print; 1138 int ret; 1139 1140 entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL); 1141 if (!entry_print) 1142 return NULL; 1143 1144 spin_lock(&memtype_lock); 1145 ret = memtype_copy_nth_element(entry_print, pos); 1146 spin_unlock(&memtype_lock); 1147 1148 /* Free it on error: */ 1149 if (ret) { 1150 kfree(entry_print); 1151 return NULL; 1152 } 1153 1154 return entry_print; 1155 } 1156 1157 static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) 1158 { 1159 if (*pos == 0) { 1160 ++*pos; 1161 seq_puts(seq, "PAT memtype list:\n"); 1162 } 1163 1164 return memtype_get_idx(*pos); 1165 } 1166 1167 static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1168 { 1169 kfree(v); 1170 ++*pos; 1171 return memtype_get_idx(*pos); 1172 } 1173 1174 static void memtype_seq_stop(struct seq_file *seq, void *v) 1175 { 1176 kfree(v); 1177 } 1178 1179 static int memtype_seq_show(struct seq_file *seq, void *v) 1180 { 1181 struct memtype *entry_print = (struct memtype *)v; 1182 1183 seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n", 1184 entry_print->start, 1185 entry_print->end, 1186 cattr_name(entry_print->type)); 1187 1188 return 0; 1189 } 1190 1191 static const struct seq_operations memtype_seq_ops = { 1192 .start = memtype_seq_start, 1193 .next = memtype_seq_next, 1194 .stop = memtype_seq_stop, 1195 .show = memtype_seq_show, 1196 }; 1197 1198 static int memtype_seq_open(struct inode *inode, struct file *file) 1199 { 1200 return seq_open(file, &memtype_seq_ops); 1201 } 1202 1203 static const struct file_operations memtype_fops = { 1204 .open = memtype_seq_open, 1205 .read = seq_read, 1206 .llseek = seq_lseek, 1207 .release = seq_release, 1208 }; 1209 1210 static int __init pat_memtype_list_init(void) 1211 { 1212 if (pat_enabled()) { 1213 debugfs_create_file("pat_memtype_list", S_IRUSR, 1214 arch_debugfs_dir, NULL, &memtype_fops); 1215 } 1216 return 0; 1217 } 1218 late_initcall(pat_memtype_list_init); 1219 1220 #endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ 1221