// SPDX-License-Identifier: GPL-2.0-only
/*
 * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 *
 * Basic principles:
 *
 * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and
 * the kernel to set one of a handful of 'caching type' attributes for physical
 * memory ranges: uncached, write-combining, write-through, write-protected,
 * and the most commonly used and default attribute: write-back caching.
 *
 * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
 * a hardware interface to enumerate a limited number of physical memory ranges
 * and set their caching attributes explicitly, programmed into the CPU via MSRs.
 * Even modern CPUs have MTRRs enabled - but these are typically not touched
 * by the kernel or by user-space (such as the X server); we rely on PAT for any
 * additional cache attribute logic.
 *
 * PAT doesn't work via explicit memory ranges, but uses page table entries to add
 * cache attribute information to the mapped memory range: there are 3 bits used
 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
 * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
 *
 * ( There's a metric ton of finer details, such as compatibility quirks with CPUs
 *   that only support 4 types of PAT entries, and interaction with MTRRs, see
 *   below for details. )
 */

#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/debugfs.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/fcntl.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/memtype.h>
#include <asm/io.h>

#include "memtype.h"
#include "../mm_internal.h"

#undef pr_fmt
#define pr_fmt(fmt) "" fmt

static bool __read_mostly pat_bp_initialized;
static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
static bool __initdata pat_force_disabled = !IS_ENABLED(CONFIG_X86_PAT);
static bool __read_mostly pat_bp_enabled;
static bool __read_mostly pat_cm_initialized;

/*
 * PAT support is enabled by default, but can be disabled for
 * various user-requested or hardware-forced reasons:
 */
void pat_disable(const char *msg_reason)
{
	if (pat_disabled)
		return;

	if (pat_bp_initialized) {
		WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
		return;
	}

	pat_disabled = true;
	pr_info("x86/PAT: %s\n", msg_reason);
}

static int __init nopat(char *str)
{
	pat_disable("PAT support disabled via boot option.");
	pat_force_disabled = true;
	return 0;
}
early_param("nopat", nopat);

bool pat_enabled(void)
{
	return pat_bp_enabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);

int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{
	pat_debug_enable = 1;
	return 1;
}
__setup("debugpat", pat_debug_setup);
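
/*
 * For illustration only (not used by this code): the 3-bit PAT slot index
 * that the CPU derives from a 4K PTE is
 *
 *	index = (!!(pte & _PAGE_PAT) << 2) |
 *		(!!(pte & _PAGE_PCD) << 1) |
 *		 !!(pte & _PAGE_PWT);
 *
 * and the effective cache attribute is byte 'index' of MSR_IA32_CR_PAT.
 * The cachemode <-> pgprot translation tables maintained below keep the
 * kernel's enum page_cache_mode consistent with that hardware view.
 */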

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses the arch_1 and uncached page flags together to keep track
 * of the memory type of pages that have a backing struct page.
 *
 * X86 PAT supports 4 different memory types:
 *  - _PAGE_CACHE_MODE_WB
 *  - _PAGE_CACHE_MODE_WC
 *  - _PAGE_CACHE_MODE_UC_MINUS
 *  - _PAGE_CACHE_MODE_WT
 *
 * _PAGE_CACHE_MODE_WB is the default type.
 */

#define _PGMT_WB		0
#define _PGMT_WC		(1UL << PG_arch_1)
#define _PGMT_UC_MINUS		(1UL << PG_uncached)
#define _PGMT_WT		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)

static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	unsigned long pg_flags = pg->flags & _PGMT_MASK;

	if (pg_flags == _PGMT_WB)
		return _PAGE_CACHE_MODE_WB;
	else if (pg_flags == _PGMT_WC)
		return _PAGE_CACHE_MODE_WC;
	else if (pg_flags == _PGMT_UC_MINUS)
		return _PAGE_CACHE_MODE_UC_MINUS;
	else
		return _PAGE_CACHE_MODE_WT;
}

static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
	unsigned long memtype_flags;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_MODE_WC:
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_MODE_UC_MINUS:
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_MODE_WT:
		memtype_flags = _PGMT_WT;
		break;
	case _PAGE_CACHE_MODE_WB:
	default:
		memtype_flags = _PGMT_WB;
		break;
	}

	do {
		old_flags = pg->flags;
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	return -1;
}
static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
}
#endif

enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};

#define CM(c) (_PAGE_CACHE_MODE_ ## c)

static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
{
	enum page_cache_mode cache;
	char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	memcpy(msg, cache_mode, 4);

	return cache;
}

#undef CM
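
/*
 * Worked example (for illustration only): the power-on default value of
 * MSR_IA32_CR_PAT documented in the Intel SDM is 0x0007040600070406.
 * Decoding it byte by byte, as pat_get_cache_mode() above does, yields:
 *
 *	entry:	0	1	2	3	4	5	6	7
 *	byte:	0x06	0x04	0x07	0x00	0x06	0x04	0x07	0x00
 *	mode:	WB	WT	UC-	UC	WB	WT	UC-	UC
 *
 * which __init_cache_modes() below would report in the boot log as:
 *
 *	x86/PAT: Configuration [0-7]: WB  WT  UC- UC  WB  WT  UC- UC
 */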

/*
 * Update the cache mode to pgprot translation tables according to PAT
 * configuration.
 * Using lower indices is preferred, so we start with highest index.
 */
static void __init_cache_modes(u64 pat)
{
	enum page_cache_mode cache;
	char pat_msg[33];
	int i;

	WARN_ON_ONCE(pat_cm_initialized);

	pat_msg[32] = 0;
	for (i = 7; i >= 0; i--) {
		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
					   pat_msg + 4 * i);
		update_cache_mode_entry(i, cache);
	}
	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);

	pat_cm_initialized = true;
}

#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))

static void pat_bp_init(u64 pat)
{
	u64 tmp_pat;

	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		pat_disable("PAT not supported by the CPU.");
		return;
	}

	rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
	if (!tmp_pat) {
		pat_disable("PAT support disabled by the firmware.");
		return;
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);
	pat_bp_enabled = true;

	__init_cache_modes(pat);
}

static void pat_ap_init(u64 pat)
{
	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		/*
		 * If this happens we are on a secondary CPU, but switched to
		 * PAT on the boot CPU. We have no way to undo PAT.
		 */
		panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);
}

void __init init_cache_modes(void)
{
	u64 pat = 0;

	if (pat_cm_initialized)
		return;

	if (boot_cpu_has(X86_FEATURE_PAT)) {
		/*
		 * CPU supports PAT. Set the PAT table to be consistent with
		 * the PAT MSR. This case supports the "nopat" boot option,
		 * and virtual machine environments which support PAT without
		 * MTRRs. In particular, Xen has a unique setup of the PAT MSR.
		 *
		 * If the PAT MSR returns 0, it is considered invalid and
		 * emulated as no PAT.
		 */
		rdmsrl(MSR_IA32_CR_PAT, pat);
	}

	if (!pat) {
		/*
		 * No PAT. Emulate the PAT table that corresponds to the two
		 * cache bits, PWT (Write Through) and PCD (Cache Disable).
		 * This setup is also the same as the BIOS default setup.
		 *
		 * PTE encoding:
		 *
		 *       PCD
		 *       |PWT  PAT
		 *       ||    slot
		 *       00    0    WB : _PAGE_CACHE_MODE_WB
		 *       01    1    WT : _PAGE_CACHE_MODE_WT
		 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *       11    3    UC : _PAGE_CACHE_MODE_UC
		 *
		 * NOTE: When WC or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
	} else if (!pat_force_disabled && cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) {
		/*
		 * Clearly PAT is enabled underneath. Allow pat_enabled() to
		 * reflect this.
		 */
		pat_bp_enabled = true;
	}

	__init_cache_modes(pat);
}

/**
 * pat_init - Initialize the PAT MSR and PAT table on the current CPU
 *
 * This function initializes PAT MSR and PAT table with an OS-defined value
 * to enable additional cache attributes, WC, WT and WP.
 *
 * This function must be called on all CPUs using the specific sequence of
 * operations defined in the Intel SDM. mtrr_rendezvous_handler() provides
 * this procedure for PAT.
 */
void pat_init(void)
{
	u64 pat;
	struct cpuinfo_x86 *c = &boot_cpu_data;

#ifndef CONFIG_X86_PAT
	pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
#endif

	if (pat_disabled)
		return;

	if ((c->x86_vendor == X86_VENDOR_INTEL) &&
	    (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
	     ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
		/*
		 * PAT support with the lower four entries. Intel Pentium 2,
		 * 3, M, and 4 are affected by PAT errata, which makes the
		 * upper four entries unusable. To be on the safe side, we
		 * don't use those.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      PAT bit unused
		 *
		 * NOTE: When WT or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
	} else {
		/*
		 * Full PAT support. We put WT in slot 7 to improve
		 * robustness in the presence of errata that might cause
		 * the high PAT bit to be ignored. This way, a buggy slot 7
		 * access will hit slot 3, and slot 3 is UC, so at worst
		 * we lose performance without causing a correctness issue.
		 * Pentium 4 erratum N46 is an example of such an erratum,
		 * although we try not to use PAT at all on affected CPUs.
		 *
		 * PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      100    4    WB : Reserved
		 *      101    5    WP : _PAGE_CACHE_MODE_WP
		 *      110    6    UC-: Reserved
		 *      111    7    WT : _PAGE_CACHE_MODE_WT
		 *
		 * The reserved slots are unused, but mapped to their
		 * corresponding types in the presence of PAT errata.
		 */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
	}

	if (!pat_bp_initialized) {
		pat_bp_init(pat);
		pat_bp_initialized = true;
	} else {
		pat_ap_init(pat);
	}
}

#undef PAT
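
/*
 * For reference (illustration only): evaluating the two PAT() expressions
 * above gives the MSR values actually written by pat_init():
 *
 *	errata-safe table (lower four entries only): 0x0007010600070106
 *	full table (WC/WP/WT enabled):               0x0407050600070106
 *
 * The latter is reported at boot as:
 *
 *	x86/PAT: Configuration [0-7]: WB  WC  UC- UC  WB  WP  UC- WT
 */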

static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Compute the intersection of the PAT memory type and the MTRR memory type
 * and return the resulting memory type as PAT understands it.
 * (The type values used by PAT and MTRR are not the same.)
 * The intersection is based on the "Effective Memory Type" tables in the
 * IA-32 SDM vol 3a.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
				     enum page_cache_mode req_type)
{
	/*
	 * Look for an MTRR hint to get the effective type in case where the
	 * PAT request is for WB.
	 */
	if (req_type == _PAGE_CACHE_MODE_WB) {
		u8 mtrr_type, uniform;

		mtrr_type = mtrr_type_lookup(start, end, &uniform);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_MODE_UC_MINUS;

		return _PAGE_CACHE_MODE_WB;
	}

	return req_type;
}

struct pagerange_state {
	unsigned long	cur_pfn;
	int		ram;
	int		not_ram;
};

static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
	struct pagerange_state *state = arg;

	state->not_ram	|= initial_pfn > state->cur_pfn;
	state->ram	|= total_nr_pages > 0;
	state->cur_pfn	 = initial_pfn + total_nr_pages;

	return state->ram && state->not_ram;
}

static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
	int ret = 0;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct pagerange_state state = {start_pfn, 0, 0};

	/*
	 * For legacy reasons, physical address range in the legacy ISA
	 * region is tracked as non-RAM. This will allow users of
	 * /dev/mem to map portions of the legacy ISA region, even when
	 * some of those portions are listed (or not even listed) with
	 * different e820 types (RAM/reserved/..)
	 */
	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;

	if (start_pfn < end_pfn) {
		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
				&state, pagerange_is_ram_callback);
	}

	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}

/*
 * For RAM pages, we use page flags to mark the pages with the appropriate
 * type. The page flags are limited to four types, WB (default), WC, WT and
 * UC-. A WP request fails with -EINVAL, and UC gets redirected to UC-.
 * Setting a new memory type is only allowed for a page mapped with the
 * default WB type.
 *
 * Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
				  enum page_cache_mode req_type,
				  enum page_cache_mode *new_type)
{
	struct page *page;
	u64 pfn;

	if (req_type == _PAGE_CACHE_MODE_WP) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_UC_MINUS;
		return -EINVAL;
	}

	if (req_type == _PAGE_CACHE_MODE_UC) {
		/* We do not support strong UC */
		WARN_ON_ONCE(1);
		req_type = _PAGE_CACHE_MODE_UC_MINUS;
	}

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		enum page_cache_mode type;

		page = pfn_to_page(pfn);
		type = get_page_memtype(page);
		if (type != _PAGE_CACHE_MODE_WB) {
			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
				start, end - 1, type, req_type);
			if (new_type)
				*new_type = type;

			return -EBUSY;
		}
	}

	if (new_type)
		*new_type = req_type;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, req_type);
	}
	return 0;
}

static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, _PAGE_CACHE_MODE_WB);
	}
	return 0;
}

static u64 sanitize_phys(u64 address)
{
	/*
	 * When changing the memtype for pages containing poison allow
	 * for a "decoy" virtual address (bit 63 clear) passed to
	 * set_memory_X(). __pa() on a "decoy" address results in a
	 * physical address with bit 63 set.
	 *
	 * Decoy addresses are not present for 32-bit builds, see
	 * set_mce_nospec().
	 */
	if (IS_ENABLED(CONFIG_X86_64))
		return address & __PHYSICAL_MASK;
	return address;
}

/*
 * req_type typically has one of:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 * - _PAGE_CACHE_MODE_WT
 *
 * If new_type is NULL, the function will return an error if it cannot
 * reserve the region with req_type. If new_type is non-NULL, the function
 * will return the available type in new_type in case of no error. In case
 * of any error it will return a negative return value.
 */
int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
		    enum page_cache_mode *new_type)
{
	struct memtype *entry_new;
	enum page_cache_mode actual_type;
	int is_range_ram;
	int err = 0;

	start = sanitize_phys(start);

	/*
	 * The end address passed into this function is exclusive, but
	 * sanitize_phys() expects an inclusive address.
	 */
	end = sanitize_phys(end - 1) + 1;
	if (start >= end) {
		WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
		     start, end - 1, cattr_name(req_type));
		return -EINVAL;
	}

	if (!pat_enabled()) {
		/* This is identical to page table setting without PAT */
		if (new_type)
			*new_type = req_type;
		return 0;
	}

	/* Low ISA region is always mapped WB in page table. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end)) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_WB;
		return 0;
	}

	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type);

	if (new_type)
		*new_type = actual_type;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = reserve_ram_pages_type(start, end, req_type, new_type);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!entry_new)
		return -ENOMEM;

	entry_new->start = start;
	entry_new->end	 = end;
	entry_new->type	 = actual_type;

	spin_lock(&memtype_lock);

	err = memtype_check_insert(entry_new, new_type);
	if (err) {
		pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
			start, end - 1,
			cattr_name(entry_new->type), cattr_name(req_type));
		kfree(entry_new);
		spin_unlock(&memtype_lock);

		return err;
	}

	spin_unlock(&memtype_lock);

	dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
		start, end - 1, cattr_name(entry_new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}

int memtype_free(u64 start, u64 end)
{
	int is_range_ram;
	struct memtype *entry_old;

	if (!pat_enabled())
		return 0;

	start = sanitize_phys(start);
	end = sanitize_phys(end);

	/* Low ISA region is always mapped WB. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1)
		return free_ram_pages_type(start, end);
	if (is_range_ram < 0)
		return -EINVAL;

	spin_lock(&memtype_lock);
	entry_old = memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (IS_ERR(entry_old)) {
		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid, start, end - 1);
		return -EINVAL;
	}

	kfree(entry_old);

	dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1);

	return 0;
}


/**
 * lookup_memtype - Looks up the memory type for a physical address
 * @paddr: physical address of which memory type needs to be looked up
 *
 * Only to be called when PAT is enabled
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_WT.
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{
	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
	struct memtype *entry;

	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
		return rettype;

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;

		page = pfn_to_page(paddr >> PAGE_SHIFT);
		return get_page_memtype(page);
	}

	spin_lock(&memtype_lock);

	entry = memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
		rettype = _PAGE_CACHE_MODE_UC_MINUS;

	spin_unlock(&memtype_lock);

	return rettype;
}

/**
 * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
 * of @pfn cannot be overridden by UC MTRR memory type.
 *
 * Only to be called when PAT is enabled.
 *
 * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
 * Returns false in other cases.
 */
bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
{
	enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));

	return cm == _PAGE_CACHE_MODE_UC ||
	       cm == _PAGE_CACHE_MODE_UC_MINUS ||
	       cm == _PAGE_CACHE_MODE_WC;
}
EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);

/**
 * memtype_reserve_io - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype, with requested type. On success, requested
 * or any other compatible type that was available for the region is returned
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int memtype_reserve_io(resource_size_t start, resource_size_t end,
			enum page_cache_mode *type)
{
	resource_size_t size = end - start;
	enum page_cache_mode req_type = *type;
	enum page_cache_mode new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, size));

	ret = memtype_reserve(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(start, size, req_type, new_type))
		goto out_free;

	if (memtype_kernel_map_sync(start, size, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	memtype_free(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}

/**
 * memtype_free_io - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void memtype_free_io(resource_size_t start, resource_size_t end)
{
	memtype_free(start, end);
}

#ifdef CONFIG_X86_PAT
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
	enum page_cache_mode type = _PAGE_CACHE_MODE_WC;

	return memtype_reserve_io(start, start + size, &type);
}
EXPORT_SYMBOL(arch_io_reserve_memtype_wc);

void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
{
	memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);
#endif
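
/*
 * Illustrative sketch (not part of this file): a typical driver-side user
 * of the two exports above pairs them with ioremap_wc() when mapping
 * something like a framebuffer BAR write-combined. The function and
 * variable names below are hypothetical.
 *
 *	static void __iomem *example_map_vram_wc(resource_size_t base,
 *						 resource_size_t size)
 *	{
 *		void __iomem *vram;
 *
 *		// Reserve the memtype first so PAT tracks the region as WC.
 *		if (arch_io_reserve_memtype_wc(base, size))
 *			return NULL;
 *
 *		vram = ioremap_wc(base, size);
 *		if (!vram)
 *			arch_io_free_memtype_wc(base, size);
 *
 *		return vram;
 *	}
 *
 * Teardown is the mirror image: iounmap() followed by
 * arch_io_free_memtype_wc(base, size).
 */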

pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
			      unsigned long size, pgprot_t vma_prot)
{
	if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
		vma_prot = pgprot_decrypted(vma_prot);

	return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled())
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn))
			return 0;
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				 unsigned long size, pgprot_t *vma_prot)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_DSYNC)
		pcm = _PAGE_CACHE_MODE_UC_MINUS;

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     cachemode2protval(pcm));
	return 1;
}

/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space if that range is a part of identity map.
 */
int memtype_kernel_map_sync(u64 base, unsigned long size,
			    enum page_cache_mode pcm)
{
	unsigned long id_sz;

	if (base > __pa(high_memory-1))
		return 0;

	/*
	 * Some areas in the middle of the kernel identity range
	 * are not mapped, for example the PCI space.
	 */
	if (!page_is_ram(base >> PAGE_SHIFT))
		return 0;

	id_sz = (__pa(high_memory-1) <= base + size) ?
				__pa(high_memory) - base : size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid,
			cattr_name(pcm),
			base, (unsigned long long)(base + size-1));
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only. After a successful memtype_reserve(), this
 * function also keeps the identity mapping (if any) in sync with the new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
			     int strict_prot)
{
	int is_ram = 0;
	int ret;
	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
	enum page_cache_mode pcm = want_pcm;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
	 * track of the number of mappings of RAM pages. We can assert that
	 * the type requested matches the type of the first page in the range.
	 */
	if (is_ram) {
		if (!pat_enabled())
			return 0;

		pcm = lookup_memtype(paddr);
		if (want_pcm != pcm) {
			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
					     (~_PAGE_CACHE_MASK)) |
					     cachemode2protval(pcm));
		}
		return 0;
	}

	ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm);
	if (ret)
		return ret;

	if (pcm != want_pcm) {
		if (strict_prot ||
		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
			memtype_free(paddr, paddr + size);
			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
			       current->comm, current->pid,
			       cattr_name(want_pcm),
			       (unsigned long long)paddr,
			       (unsigned long long)(paddr + size - 1),
			       cattr_name(pcm));
			return -EINVAL;
		}
		/*
		 * We allow returning a different type than the one requested
		 * in the non-strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				     (~_PAGE_CACHE_MASK)) |
				     cachemode2protval(pcm));
	}

	if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
		memtype_free(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to free a range of physical memory.
 * Frees non-RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		memtype_free(paddr, paddr + size);
}

/*
 * track_pfn_copy is called when a vma that covers a pfnmap gets copied
 * through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from the pte and reserve the entire vma range with a single
 * reserve_pfn_range() call.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long prot;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	if (vma->vm_flags & VM_PAT) {
		/*
		 * reserve the whole chunk covered by vma. We need the
		 * starting address and protection from pte.
		 */
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		pgprot = __pgprot(prot);
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}

/*
 * prot is passed in as a parameter for the new mapping. If the vma has
 * a linear pfn mapping for the entire range, or no vma is provided,
 * reserve the entire pfn + size range with a single reserve_pfn_range()
 * call.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{
	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
	enum page_cache_mode pcm;

	/* reserve the whole chunk starting from paddr */
	if (!vma || (addr == vma->vm_start
				&& size == (vma->vm_end - vma->vm_start))) {
		int ret;

		ret = reserve_pfn_range(paddr, size, prot, 0);
		if (ret == 0 && vma)
			vma->vm_flags |= VM_PAT;
		return ret;
	}

	if (!pat_enabled())
		return 0;

	/*
	 * For anything smaller than the vma size we set prot based on the
	 * lookup.
	 */
	pcm = lookup_memtype(paddr);

	/* Check memtype for the remaining pages */
	while (size > PAGE_SIZE) {
		size -= PAGE_SIZE;
		paddr += PAGE_SIZE;
		if (pcm != lookup_memtype(paddr))
			return -EINVAL;
	}

	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}

void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
{
	enum page_cache_mode pcm;

	if (!pat_enabled())
		return;

	/* Set prot based on lookup */
	pcm = lookup_memtype(pfn_t_to_phys(pfn));
	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));
}
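
/*
 * Illustrative sketch (hypothetical driver code, not used here): the usual
 * way the track_pfn_remap() hook above gets exercised is a driver mmap
 * handler that remaps a whole VMA with a write-combining protection built
 * by pgprot_writecombine() (defined further down in this file):
 *
 *	static int example_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long pfn = example_dev_phys_base >> PAGE_SHIFT;
 *
 *		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 *
 *		// remap_pfn_range() calls track_pfn_remap() internally; since
 *		// the request covers the whole VMA, the range is reserved via
 *		// reserve_pfn_range() and VM_PAT is set on the VMA.
 *		return remap_pfn_range(vma, vma->vm_start, pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 *
 * 'example_mmap' and 'example_dev_phys_base' are made-up names.
 */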

/*
 * untrack_pfn is called while unmapping a pfnmap for a region. untrack can
 * be called for a specific region indicated by pfn and size or can be for
 * the entire vma (in which case pfn and size are zero).
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
		 unsigned long size)
{
	resource_size_t paddr;
	unsigned long prot;

	if (vma && !(vma->vm_flags & VM_PAT))
		return;

	/* free the chunk starting from pfn or the whole chunk */
	paddr = (resource_size_t)pfn << PAGE_SHIFT;
	if (!paddr && !size) {
		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
			WARN_ON_ONCE(1);
			return;
		}

		size = vma->vm_end - vma->vm_start;
	}
	free_pfn_range(paddr, size);
	if (vma)
		vma->vm_flags &= ~VM_PAT;
}

/*
 * untrack_pfn_moved is called while mremapping a pfnmap for a new region,
 * with the old vma after its pfnmap page table has been removed. The new
 * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
 */
void untrack_pfn_moved(struct vm_area_struct *vma)
{
	vma->vm_flags &= ~VM_PAT;
}

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
				cachemode2protval(_PAGE_CACHE_MODE_WC));
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

pgprot_t pgprot_writethrough(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
				cachemode2protval(_PAGE_CACHE_MODE_WT));
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/*
 * We are allocating a temporary printout-entry to be passed
 * between seq_start()/next() and seq_show():
 */
static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *entry_print;
	int ret;

	entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!entry_print)
		return NULL;

	spin_lock(&memtype_lock);
	ret = memtype_copy_nth_element(entry_print, pos);
	spin_unlock(&memtype_lock);

	/* Free it on error: */
	if (ret) {
		kfree(entry_print);
		return NULL;
	}

	return entry_print;
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_puts(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	kfree(v);
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
	kfree(v);
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *entry_print = (struct memtype *)v;

	seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n",
			entry_print->start,
			entry_print->end,
			cattr_name(entry_print->type));

	return 0;
}

static const struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
	if (pat_enabled()) {
		debugfs_create_file("pat_memtype_list", S_IRUSR,
				    arch_debugfs_dir, NULL, &memtype_fops);
	}
	return 0;
}
late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */