// SPDX-License-Identifier: GPL-2.0-only
/*
 * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 *
 * Basic principles:
 *
 * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and
 * the kernel to set one of a handful of 'caching type' attributes for physical
 * memory ranges: uncached, write-combining, write-through, write-protected,
 * and the most commonly used and default attribute: write-back caching.
 *
 * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
 * a hardware interface to enumerate a limited number of physical memory ranges
 * and set their caching attributes explicitly, programmed into the CPU via MSRs.
 * Even modern CPUs have MTRRs enabled - but these are typically not touched
 * by the kernel or by user-space (such as the X server); we rely on PAT for any
 * additional cache attribute logic.
 *
 * PAT doesn't work via explicit memory ranges, but uses page table entries to add
 * cache attribute information to the mapped memory range: there are 3 bits used
 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
 * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
 *
 * ( There's a metric ton of finer details, such as compatibility with CPU quirks
 *   that only support 4 types of PAT entries, and interaction with MTRRs, see
 *   below for details. )
 */

#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/debugfs.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/memtype.h>
#include <asm/io.h>

#include "memtype.h"
#include "../mm_internal.h"

#undef pr_fmt
#define pr_fmt(fmt) "" fmt

static bool __read_mostly pat_bp_initialized;
static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
static bool __read_mostly pat_bp_enabled;
static bool __read_mostly pat_cm_initialized;

/*
 * PAT support is enabled by default, but can be disabled for
 * various user-requested or hardware-forced reasons:
 */
void pat_disable(const char *msg_reason)
{
	if (pat_disabled)
		return;

	if (pat_bp_initialized) {
		WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
		return;
	}

	pat_disabled = true;
	pr_info("x86/PAT: %s\n", msg_reason);
}

static int __init nopat(char *str)
{
	pat_disable("PAT support disabled via boot option.");
	return 0;
}
early_param("nopat", nopat);

bool pat_enabled(void)
{
	return pat_bp_enabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);

int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{
	pat_debug_enable = 1;
	return 0;
}
__setup("debugpat", pat_debug_setup);
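
/*
 * Illustrative sketch only (not code used by this file): the PAT slot selected
 * by a PTE is the 3-bit index formed from the cache-attribute bits, and the
 * byte at that index in MSR_IA32_CR_PAT supplies the memory type:
 *
 *	slot = (!!(pte_val & _PAGE_PAT) << 2) |
 *	       (!!(pte_val & _PAGE_PCD) << 1) |
 *	        !!(pte_val & _PAGE_PWT);
 *	type = (pat_msr_val >> (slot * 8)) & 0x7;
 *
 * 'pte_val' and 'pat_msr_val' are hypothetical locals here; the kernel does
 * this translation through the cachemode <-> pgprot tables that
 * __init_cache_modes() below keeps in sync with the MSR.
 */
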
#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses the page flags arch_1 and uncached together to keep track of
 * the memory type of pages that have a backing page struct.
 *
 * X86 PAT supports 4 different memory types:
 *  - _PAGE_CACHE_MODE_WB
 *  - _PAGE_CACHE_MODE_WC
 *  - _PAGE_CACHE_MODE_UC_MINUS
 *  - _PAGE_CACHE_MODE_WT
 *
 * _PAGE_CACHE_MODE_WB is the default type.
 */

#define _PGMT_WB		0
#define _PGMT_WC		(1UL << PG_arch_1)
#define _PGMT_UC_MINUS		(1UL << PG_uncached)
#define _PGMT_WT		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)

static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	unsigned long pg_flags = pg->flags & _PGMT_MASK;

	if (pg_flags == _PGMT_WB)
		return _PAGE_CACHE_MODE_WB;
	else if (pg_flags == _PGMT_WC)
		return _PAGE_CACHE_MODE_WC;
	else if (pg_flags == _PGMT_UC_MINUS)
		return _PAGE_CACHE_MODE_UC_MINUS;
	else
		return _PAGE_CACHE_MODE_WT;
}

static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
	unsigned long memtype_flags;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_MODE_WC:
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_MODE_UC_MINUS:
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_MODE_WT:
		memtype_flags = _PGMT_WT;
		break;
	case _PAGE_CACHE_MODE_WB:
	default:
		memtype_flags = _PGMT_WB;
		break;
	}

	do {
		old_flags = pg->flags;
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	return -1;
}
static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
}
#endif

enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};

#define CM(c) (_PAGE_CACHE_MODE_ ## c)

static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
{
	enum page_cache_mode cache;
	char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	memcpy(msg, cache_mode, 4);

	return cache;
}

#undef CM
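
/*
 * Worked example (illustrative, derived from the tables above): the
 * BIOS-compatible MSR value 0x0007040600070406 decodes, byte by byte from
 * slot 0 upward, to 06 04 07 00 06 04 07 00, so the boot log line printed by
 * __init_cache_modes() below would read:
 *
 *	x86/PAT: Configuration [0-7]: WB  WT  UC- UC  WB  WT  UC- UC
 */
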
/*
 * Update the cache mode to pgprot translation tables according to the PAT
 * configuration.
 * Using lower indices is preferred, so we start with the highest index.
 */
static void __init_cache_modes(u64 pat)
{
	enum page_cache_mode cache;
	char pat_msg[33];
	int i;

	WARN_ON_ONCE(pat_cm_initialized);

	pat_msg[32] = 0;
	for (i = 7; i >= 0; i--) {
		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
					   pat_msg + 4 * i);
		update_cache_mode_entry(i, cache);
	}
	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);

	pat_cm_initialized = true;
}

#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))

static void pat_bp_init(u64 pat)
{
	u64 tmp_pat;

	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		pat_disable("PAT not supported by the CPU.");
		return;
	}

	rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
	if (!tmp_pat) {
		pat_disable("PAT support disabled by the firmware.");
		return;
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);
	pat_bp_enabled = true;

	__init_cache_modes(pat);
}

static void pat_ap_init(u64 pat)
{
	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		/*
		 * If this happens we are on a secondary CPU, but switched to
		 * PAT on the boot CPU. We have no way to undo PAT.
		 */
		panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);
}

void init_cache_modes(void)
{
	u64 pat = 0;

	if (pat_cm_initialized)
		return;

	if (boot_cpu_has(X86_FEATURE_PAT)) {
		/*
		 * The CPU supports PAT. Set the PAT table to be consistent
		 * with the PAT MSR. This case supports the "nopat" boot
		 * option, and virtual machine environments which support PAT
		 * without MTRRs. In particular, Xen has a unique setup for
		 * the PAT MSR.
		 *
		 * If the PAT MSR returns 0, it is considered invalid and we
		 * emulate "no PAT".
		 */
		rdmsrl(MSR_IA32_CR_PAT, pat);
	}

	if (!pat) {
		/*
		 * No PAT. Emulate the PAT table that corresponds to the two
		 * cache bits, PWT (Write Through) and PCD (Cache Disable).
		 * This setup is also the same as the BIOS default setup.
		 *
		 * PTE encoding:
		 *
		 *       PCD
		 *       |PWT  PAT
		 *       ||    slot
		 *       00    0    WB : _PAGE_CACHE_MODE_WB
		 *       01    1    WT : _PAGE_CACHE_MODE_WT
		 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *       11    3    UC : _PAGE_CACHE_MODE_UC
		 *
		 * NOTE: When WC or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
	}

	__init_cache_modes(pat);
}
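
/*
 * For reference, a worked example of the PAT() helper above (a sketch, not
 * code used here): PAT(7, WT) expands to (u64)4 << 56, so the full-featured
 * table programmed by pat_init() below on non-errata CPUs,
 *
 *	PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
 *	PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT),
 *
 * assembles to the MSR value 0x0407050600070106, while the no-PAT emulation
 * table above assembles to 0x0007040600070406.
 */
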
/**
 * pat_init - Initialize the PAT MSR and PAT table on the current CPU
 *
 * This function initializes the PAT MSR and PAT table with an OS-defined value
 * to enable additional cache attributes, WC, WT and WP.
 *
 * This function must be called on all CPUs using the specific sequence of
 * operations defined in the Intel SDM. mtrr_rendezvous_handler() provides this
 * procedure for PAT.
 */
void pat_init(void)
{
	u64 pat;
	struct cpuinfo_x86 *c = &boot_cpu_data;

#ifndef CONFIG_X86_PAT
	pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
#endif

	if (pat_disabled)
		return;

	if ((c->x86_vendor == X86_VENDOR_INTEL) &&
	    (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
	     ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
		/*
		 * PAT support with the lower four entries. Intel Pentium 2,
		 * 3, M, and 4 are affected by PAT errata, which makes the
		 * upper four entries unusable. To be on the safe side, we
		 * don't use those.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      PAT bit unused
		 *
		 * NOTE: When WT or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
	} else {
		/*
		 * Full PAT support. We put WT in slot 7 to improve
		 * robustness in the presence of errata that might cause
		 * the high PAT bit to be ignored. This way, a buggy slot 7
		 * access will hit slot 3, and slot 3 is UC, so at worst
		 * we lose performance without causing a correctness issue.
		 * Pentium 4 erratum N46 is an example of such an erratum,
		 * although we try not to use PAT at all on affected CPUs.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      100    4    WB : Reserved
		 *      101    5    WP : _PAGE_CACHE_MODE_WP
		 *      110    6    UC-: Reserved
		 *      111    7    WT : _PAGE_CACHE_MODE_WT
		 *
		 * The reserved slots are unused, but mapped to their
		 * corresponding types in the presence of PAT errata.
		 */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
	}

	if (!pat_bp_initialized) {
		pat_bp_init(pat);
		pat_bp_initialized = true;
	} else {
		pat_ap_init(pat);
	}
}

#undef PAT

static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Intersect the PAT memory type with the MTRR memory type and return
 * the resulting memory type as PAT understands it.
 * (PAT and MTRR do not use the same type encoding.)
 * The intersection is based on the "Effective Memory Type" tables in the
 * IA-32 SDM vol 3a.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
				     enum page_cache_mode req_type)
{
	/*
	 * Look for an MTRR hint to get the effective type in case the PAT
	 * request is for WB.
	 */
	if (req_type == _PAGE_CACHE_MODE_WB) {
		u8 mtrr_type, uniform;

		mtrr_type = mtrr_type_lookup(start, end, &uniform);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_MODE_UC_MINUS;

		return _PAGE_CACHE_MODE_WB;
	}

	return req_type;
}
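
/*
 * Example (illustrative only): a WB request over a range that an MTRR marks
 * as anything other than WB comes back from pat_x_mtrr_type() as UC-, so a
 * WC MTRR set up by firmware or the X server can still take effect; only a
 * WB request over a WB MTRR range stays WB. Non-WB requests pass through
 * unchanged.
 */
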
struct pagerange_state {
	unsigned long		cur_pfn;
	int			ram;
	int			not_ram;
};

static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
	struct pagerange_state *state = arg;

	state->not_ram	|= initial_pfn > state->cur_pfn;
	state->ram	|= total_nr_pages > 0;
	state->cur_pfn	 = initial_pfn + total_nr_pages;

	return state->ram && state->not_ram;
}

static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
	int ret = 0;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct pagerange_state state = {start_pfn, 0, 0};

	/*
	 * For legacy reasons, physical address ranges in the legacy ISA
	 * region are tracked as non-RAM. This allows users of /dev/mem
	 * to map portions of the legacy ISA region, even when some of
	 * those portions are listed (or not even listed) with different
	 * e820 types (RAM/reserved/..).
	 */
	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;

	if (start_pfn < end_pfn) {
		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
				&state, pagerange_is_ram_callback);
	}

	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}
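
/*
 * To spell out the convention relied on by the callers below (a summary of
 * the code above): pat_pagerange_is_ram() returns 1 when the range is tracked
 * as RAM, 0 when it is tracked as non-RAM (including the always-untracked low
 * ISA region), and -1 when it straddles both, which the reserve/free paths
 * below reject with -EINVAL.
 */
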
/*
 * For RAM pages, we use page flags to mark the pages with the appropriate
 * type. The page flags are limited to four types, WB (default), WC, WT and
 * UC-. A WP request fails with -EINVAL, and UC gets redirected to UC-.
 * Setting a new memory type is only allowed for a page mapped with the
 * default WB type.
 *
 * Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
				  enum page_cache_mode req_type,
				  enum page_cache_mode *new_type)
{
	struct page *page;
	u64 pfn;

	if (req_type == _PAGE_CACHE_MODE_WP) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_UC_MINUS;
		return -EINVAL;
	}

	if (req_type == _PAGE_CACHE_MODE_UC) {
		/* We do not support strong UC */
		WARN_ON_ONCE(1);
		req_type = _PAGE_CACHE_MODE_UC_MINUS;
	}

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		enum page_cache_mode type;

		page = pfn_to_page(pfn);
		type = get_page_memtype(page);
		if (type != _PAGE_CACHE_MODE_WB) {
			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
				start, end - 1, type, req_type);
			if (new_type)
				*new_type = type;

			return -EBUSY;
		}
	}

	if (new_type)
		*new_type = req_type;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, req_type);
	}
	return 0;
}

static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, _PAGE_CACHE_MODE_WB);
	}
	return 0;
}

static u64 sanitize_phys(u64 address)
{
	/*
	 * When changing the memtype for pages containing poison, allow
	 * for a "decoy" virtual address (bit 63 clear) passed to
	 * set_memory_X(). __pa() on a "decoy" address results in a
	 * physical address with bit 63 set.
	 *
	 * Decoy addresses are not present for 32-bit builds, see
	 * set_mce_nospec().
	 */
	if (IS_ENABLED(CONFIG_X86_64))
		return address & __PHYSICAL_MASK;
	return address;
}

/*
 * req_type typically has one of the following types:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 * - _PAGE_CACHE_MODE_WT
 *
 * If new_type is NULL, the function will return an error if it cannot reserve
 * the region with req_type. If new_type is non-NULL, the function will return
 * the available type in new_type in case of no error. In case of any error
 * it will return a negative return value.
 */
int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
		    enum page_cache_mode *new_type)
{
	struct memtype *entry_new;
	enum page_cache_mode actual_type;
	int is_range_ram;
	int err = 0;

	start = sanitize_phys(start);
	end = sanitize_phys(end);
	if (start >= end) {
		WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
		     start, end - 1, cattr_name(req_type));
		return -EINVAL;
	}

	if (!pat_enabled()) {
		/* This is identical to page table setting without PAT */
		if (new_type)
			*new_type = req_type;
		return 0;
	}

	/* Low ISA region is always mapped WB in the page table. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end)) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_WB;
		return 0;
	}

	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type);

	if (new_type)
		*new_type = actual_type;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = reserve_ram_pages_type(start, end, req_type, new_type);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!entry_new)
		return -ENOMEM;

	entry_new->start = start;
	entry_new->end	 = end;
	entry_new->type	 = actual_type;

	spin_lock(&memtype_lock);

	err = memtype_check_insert(entry_new, new_type);
	if (err) {
		pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
			start, end - 1,
			cattr_name(entry_new->type), cattr_name(req_type));
		kfree(entry_new);
		spin_unlock(&memtype_lock);

		return err;
	}

	spin_unlock(&memtype_lock);

	dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
		start, end - 1, cattr_name(entry_new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}
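
/*
 * Minimal usage sketch (illustrative only; 'phys' and 'size' are hypothetical
 * values and error handling is trimmed): a caller that wants a
 * write-combining view of a non-RAM region reserves the type first and frees
 * it again when done, much like the ioremap_*() paths do:
 *
 *	enum page_cache_mode new_pcm;
 *
 *	if (!memtype_reserve(phys, phys + size, _PAGE_CACHE_MODE_WC, &new_pcm)) {
 *		// map using 'new_pcm'; it may differ from the requested WC
 *		...
 *		memtype_free(phys, phys + size);
 *	}
 */
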
int memtype_free(u64 start, u64 end)
{
	int is_range_ram;
	struct memtype *entry_old;

	if (!pat_enabled())
		return 0;

	start = sanitize_phys(start);
	end = sanitize_phys(end);

	/* Low ISA region is always mapped WB. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1)
		return free_ram_pages_type(start, end);
	if (is_range_ram < 0)
		return -EINVAL;

	spin_lock(&memtype_lock);
	entry_old = memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (IS_ERR(entry_old)) {
		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid, start, end - 1);
		return -EINVAL;
	}

	kfree(entry_old);

	dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1);

	return 0;
}


/**
 * lookup_memtype - Looks up the memory type for a physical address
 * @paddr: physical address whose memory type needs to be looked up
 *
 * Only to be called when PAT is enabled.
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_WT.
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{
	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
	struct memtype *entry;

	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
		return rettype;

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;

		page = pfn_to_page(paddr >> PAGE_SHIFT);
		return get_page_memtype(page);
	}

	spin_lock(&memtype_lock);

	entry = memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
		rettype = _PAGE_CACHE_MODE_UC_MINUS;

	spin_unlock(&memtype_lock);

	return rettype;
}

/**
 * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
 * of @pfn cannot be overridden by a UC MTRR memory type.
 *
 * Only to be called when PAT is enabled.
 *
 * Returns true, if the PAT memory type of @pfn is UC, UC- or WC.
 * Returns false in other cases.
 */
bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
{
	enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));

	return cm == _PAGE_CACHE_MODE_UC ||
	       cm == _PAGE_CACHE_MODE_UC_MINUS ||
	       cm == _PAGE_CACHE_MODE_WC;
}
EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);

/**
 * memtype_reserve_io - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype, with the requested type. On success, the
 * requested type or any other compatible type that was available for the
 * region is returned.
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int memtype_reserve_io(resource_size_t start, resource_size_t end,
			enum page_cache_mode *type)
{
	resource_size_t size = end - start;
	enum page_cache_mode req_type = *type;
	enum page_cache_mode new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, size));

	ret = memtype_reserve(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(start, size, req_type, new_type))
		goto out_free;

	if (memtype_kernel_map_sync(start, size, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	memtype_free(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}

/**
 * memtype_free_io - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void memtype_free_io(resource_size_t start, resource_size_t end)
{
	memtype_free(start, end);
}

int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
	enum page_cache_mode type = _PAGE_CACHE_MODE_WC;

	return memtype_reserve_io(start, start + size, &type);
}
EXPORT_SYMBOL(arch_io_reserve_memtype_wc);

void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
{
	memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);
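
/*
 * Illustrative driver-side sketch (not code used here; 'bar_start' and
 * 'bar_len' are hypothetical): graphics and similar drivers typically pair
 * the arch_io_*_memtype_wc() helpers above with a write-combining ioremap of
 * a prefetchable PCI BAR:
 *
 *	if (!arch_io_reserve_memtype_wc(bar_start, bar_len))
 *		fb = ioremap_wc(bar_start, bar_len);
 *	...
 *	iounmap(fb);
 *	arch_io_free_memtype_wc(bar_start, bar_len);
 */
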
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t vma_prot)
{
	if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
		vma_prot = pgprot_decrypted(vma_prot);

	return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled())
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn))
			return 0;
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t *vma_prot)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_DSYNC)
		pcm = _PAGE_CACHE_MODE_UC_MINUS;

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     cachemode2protval(pcm));
	return 1;
}
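
/*
 * Usage note (a summary of the behaviour above from the caller's point of
 * view): a user-space mapping of /dev/mem established through a descriptor
 * opened with O_DSYNC, e.g.
 *
 *	fd = open("/dev/mem", O_RDWR | O_DSYNC);
 *	p  = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, phys);
 *
 * ends up with an uncached-minus (UC-) mapping, while a plain open() keeps
 * the default write-back prot. ('fd', 'p', 'len' and 'phys' are hypothetical.)
 */
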
/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space, if that range is a part of the identity map.
 */
int memtype_kernel_map_sync(u64 base, unsigned long size,
			    enum page_cache_mode pcm)
{
	unsigned long id_sz;

	if (base > __pa(high_memory-1))
		return 0;

	/*
	 * Some areas in the middle of the kernel identity range
	 * are not mapped, for example the PCI space.
	 */
	if (!page_is_ram(base >> PAGE_SHIFT))
		return 0;

	id_sz = (__pa(high_memory-1) <= base + size) ?
				__pa(high_memory) - base : size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid,
			cattr_name(pcm),
			base, (unsigned long long)(base + size-1));
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only. After a successful memtype_reserve(), this
 * function also keeps the identity mapping (if any) in sync with the new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
				int strict_prot)
{
	int is_ram = 0;
	int ret;
	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
	enum page_cache_mode pcm = want_pcm;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
	 * track of the number of mappings of RAM pages. We can assert that
	 * the type requested matches the type of the first page in the range.
	 */
	if (is_ram) {
		if (!pat_enabled())
			return 0;

		pcm = lookup_memtype(paddr);
		if (want_pcm != pcm) {
			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
					     (~_PAGE_CACHE_MASK)) |
					     cachemode2protval(pcm));
		}
		return 0;
	}

	ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm);
	if (ret)
		return ret;

	if (pcm != want_pcm) {
		if (strict_prot ||
		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
			memtype_free(paddr, paddr + size);
			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
			       current->comm, current->pid,
			       cattr_name(want_pcm),
			       (unsigned long long)paddr,
			       (unsigned long long)(paddr + size - 1),
			       cattr_name(pcm));
			return -EINVAL;
		}
		/*
		 * We allow returning a different type than the one requested
		 * in the non-strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
				     cachemode2protval(pcm));
	}

	if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
		memtype_free(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to free a range of physical memory.
 * Frees non-RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		memtype_free(paddr, paddr + size);
}

/*
 * track_pfn_copy is called when a vma covering the pfnmap gets
 * copied through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from the pte and reserve the entire vma range with a single
 * reserve_pfn_range() call.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long prot;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	if (vma->vm_flags & VM_PAT) {
		/*
		 * reserve the whole chunk covered by vma. We need the
		 * starting address and protection from the pte.
		 */
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		pgprot = __pgprot(prot);
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}
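
/*
 * Context note (a sketch of how the hooks below are reached; not code used
 * here): track_pfn_remap() and untrack_pfn() are the PAT back-ends of the
 * generic pfnmap helpers, so a driver doing
 *
 *	remap_pfn_range(vma, vma->vm_start, pfn, size, vma->vm_page_prot);
 *
 * in its ->mmap() handler goes through track_pfn_remap() to reserve (or
 * validate) the memory type, and the reservation is dropped again via
 * untrack_pfn() when the mapping is torn down.
 */
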
/*
 * prot is passed in as a parameter for the new mapping. If the vma has
 * a linear pfn mapping for the entire range, or no vma is provided,
 * reserve the entire pfn + size range with a single reserve_pfn_range()
 * call.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{
	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
	enum page_cache_mode pcm;

	/* reserve the whole chunk starting from paddr */
	if (!vma || (addr == vma->vm_start
				&& size == (vma->vm_end - vma->vm_start))) {
		int ret;

		ret = reserve_pfn_range(paddr, size, prot, 0);
		if (ret == 0 && vma)
			vma->vm_flags |= VM_PAT;
		return ret;
	}

	if (!pat_enabled())
		return 0;

	/*
	 * For anything smaller than the vma size we set prot based on the
	 * lookup.
	 */
	pcm = lookup_memtype(paddr);

	/* Check memtype for the remaining pages */
	while (size > PAGE_SIZE) {
		size -= PAGE_SIZE;
		paddr += PAGE_SIZE;
		if (pcm != lookup_memtype(paddr))
			return -EINVAL;
	}

	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}

void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
{
	enum page_cache_mode pcm;

	if (!pat_enabled())
		return;

	/* Set prot based on lookup */
	pcm = lookup_memtype(pfn_t_to_phys(pfn));
	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region. untrack can
 * be called for a specific region indicated by pfn and size, or for the
 * entire vma (in which case pfn and size are zero).
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
		 unsigned long size)
{
	resource_size_t paddr;
	unsigned long prot;

	if (vma && !(vma->vm_flags & VM_PAT))
		return;

	/* free the chunk starting from pfn or the whole chunk */
	paddr = (resource_size_t)pfn << PAGE_SHIFT;
	if (!paddr && !size) {
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return;
		}

		size = vma->vm_end - vma->vm_start;
	}
	free_pfn_range(paddr, size);
	if (vma)
		vma->vm_flags &= ~VM_PAT;
}

/*
 * untrack_pfn_moved is called while mremapping a pfnmap to a new region,
 * with the old vma after its pfnmap page table entries have been removed.
 * The new vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
 */
void untrack_pfn_moved(struct vm_area_struct *vma)
{
	vma->vm_flags &= ~VM_PAT;
}

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
			cachemode2protval(_PAGE_CACHE_MODE_WC));
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

pgprot_t pgprot_writethrough(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
			cachemode2protval(_PAGE_CACHE_MODE_WT));
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);
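
/*
 * Illustrative sketch of how the helpers above are typically consumed by a
 * driver's ->mmap() handler (not code used here; 'pfn' and the handler itself
 * are hypothetical):
 *
 *	static int example_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 *		return remap_pfn_range(vma, vma->vm_start, pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 *
 * The WC request is then reconciled with any existing reservation by
 * track_pfn_remap()/reserve_pfn_range() above.
 */
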
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/*
 * We are allocating a temporary printout-entry to be passed
 * between seq_start()/next() and seq_show():
 */
static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *entry_print;
	int ret;

	entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!entry_print)
		return NULL;

	spin_lock(&memtype_lock);
	ret = memtype_copy_nth_element(entry_print, pos);
	spin_unlock(&memtype_lock);

	/* Free it on error: */
	if (ret) {
		kfree(entry_print);
		return NULL;
	}

	return entry_print;
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_puts(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *entry_print = (struct memtype *)v;

	seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n",
			entry_print->start,
			entry_print->end,
			cattr_name(entry_print->type));

	kfree(entry_print);

	return 0;
}

static const struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
	if (pat_enabled()) {
		debugfs_create_file("pat_memtype_list", S_IRUSR,
				    arch_debugfs_dir, NULL, &memtype_fops);
	}
	return 0;
}
late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */