/*
 *  include/asm-s390/pgtable.h
 *
 *  S390 version
 *    Copyright (C) 1999,2000 IBM Deutschland Entwicklung GmbH, IBM Corporation
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (weigand@de.ibm.com)
 *               Martin Schwidefsky (schwidefsky@de.ibm.com)
 *
 *  Derived from "include/asm-i386/pgtable.h"
 */

#ifndef _ASM_S390_PGTABLE_H
#define _ASM_S390_PGTABLE_H

/*
 * The Linux memory management assumes a three-level page table setup. For
 * s390 31 bit we "fold" the mid level into the top-level page table, so
 * that we physically have the same two-level page table as the s390 mmu
 * expects in 31 bit mode. For s390 64 bit we use three of the five levels
 * the hardware provides (region first and region second tables are not
 * used).
 *
 * The "pgd_xxx()" functions are trivial for a folded two-level
 * setup: the pgd is never bad, and a pmd always exists (as it's folded
 * into the pgd entry).
 *
 * This file contains the functions and defines necessary to modify and use
 * the S390 page table tree.
 */
#ifndef __ASSEMBLY__
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <asm/bitops.h>
#include <asm/bug.h>
#include <asm/processor.h>

extern pgd_t swapper_pg_dir[] __attribute__ ((aligned (4096)));
extern void paging_init(void);
extern void vmem_map_init(void);

/*
 * The S390 doesn't have any external MMU info: the kernel page
 * tables contain all the necessary information.
 */
#define update_mmu_cache(vma, address, pte)     do { } while (0)

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc.
 */
extern char empty_zero_page[PAGE_SIZE];
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
#endif /* !__ASSEMBLY__ */

/*
 * PMD_SHIFT determines the size of the area a second-level page
 * table can map.
 * PGDIR_SHIFT determines what a third-level page table entry can map.
 */
#ifndef __s390x__
# define PMD_SHIFT      20
# define PUD_SHIFT      20
# define PGDIR_SHIFT    20
#else /* __s390x__ */
# define PMD_SHIFT      20
# define PUD_SHIFT      31
# define PGDIR_SHIFT    42
#endif /* __s390x__ */

#define PMD_SIZE        (1UL << PMD_SHIFT)
#define PMD_MASK        (~(PMD_SIZE-1))
#define PUD_SIZE        (1UL << PUD_SHIFT)
#define PUD_MASK        (~(PUD_SIZE-1))
#define PGDIR_SIZE      (1UL << PGDIR_SHIFT)
#define PGDIR_MASK      (~(PGDIR_SIZE-1))

/*
 * entries per page directory level: the S390 is two-level, so
 * we don't really have any PMD directory physically.
 * On S390 segment-table entries are combined into one PGD entry,
 * which leads to 1024 PTEs per pgd.
 */
#define PTRS_PER_PTE    256
#ifndef __s390x__
#define PTRS_PER_PMD    1
#define PTRS_PER_PUD    1
#else /* __s390x__ */
#define PTRS_PER_PMD    2048
#define PTRS_PER_PUD    2048
#endif /* __s390x__ */
#define PTRS_PER_PGD    2048

#define FIRST_USER_ADDRESS  0
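/*
 * For reference, the geometry implied by the shifts above: a page table
 * maps 1 MB (one segment, 256 ptes of 4 KB). On 64 bit a segment table
 * maps 2 GB (PUD_SHIFT 31) and a region-third table maps 4 TB
 * (PGDIR_SHIFT 42), so the 2048-entry top-level table spans 8 PB.
 * On 31 bit the pud and pmd levels are folded into the pgd.
 */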
#define pte_ERROR(e) \
        printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e))
#define pmd_ERROR(e) \
        printk("%s:%d: bad pmd %p.\n", __FILE__, __LINE__, (void *) pmd_val(e))
#define pud_ERROR(e) \
        printk("%s:%d: bad pud %p.\n", __FILE__, __LINE__, (void *) pud_val(e))
#define pgd_ERROR(e) \
        printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e))

#ifndef __ASSEMBLY__
/*
 * The vmalloc area will always be on the topmost area of the kernel
 * mapping. We reserve 96MB (31bit) / 1GB (64bit) for vmalloc,
 * which should be enough for any sane case.
 * By putting vmalloc at the top, we maximise the gap between physical
 * memory and vmalloc to catch misplaced memory accesses. As a side
 * effect, this also makes sure that 64 bit module code cannot be used
 * as a system call address.
 */

extern unsigned long VMALLOC_START;

#ifndef __s390x__
#define VMALLOC_SIZE    (96UL << 20)
#define VMALLOC_END     0x7e000000UL
#define VMEM_MAP_END    0x80000000UL
#else /* __s390x__ */
#define VMALLOC_SIZE    (1UL << 30)
#define VMALLOC_END     0x3e040000000UL
#define VMEM_MAP_END    0x40000000000UL
#endif /* __s390x__ */

/*
 * VMEM_MAX_PHYS is the highest physical address that can be added to the 1:1
 * mapping. This needs to be calculated at compile time since the size of the
 * VMEM_MAP is static but the size of struct page can change.
 */
#define VMEM_MAX_PAGES  ((VMEM_MAP_END - VMALLOC_END) / sizeof(struct page))
#define VMEM_MAX_PFN    min(VMALLOC_START >> PAGE_SHIFT, VMEM_MAX_PAGES)
#define VMEM_MAX_PHYS   ((VMEM_MAX_PFN << PAGE_SHIFT) & ~((16 << 20) - 1))
#define vmemmap         ((struct page *) VMALLOC_END)
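/*
 * The resulting layout of the upper kernel address space is therefore:
 * the 1:1 mapping of physical memory below VMALLOC_START, the vmalloc
 * area from VMALLOC_START to VMALLOC_END, and the virtual memmap
 * (vmemmap) from VMALLOC_END up to VMEM_MAP_END.
 */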
/*
 * A 31 bit pagetable entry of S390 has following format:
 *  |   PFRA          |    |  OS  |
 * 0                   0IP0
 * 00000000001111111111222222222233
 * 01234567890123456789012345678901
 *
 * I Page-Invalid Bit:    Page is not available for address-translation
 * P Page-Protection Bit: Store access not possible for page
 *
 * A 31 bit segmenttable entry of S390 has following format:
 *  |   P-table origin      |  |PTL
 * 0                         IC
 * 00000000001111111111222222222233
 * 01234567890123456789012345678901
 *
 * I Segment-Invalid Bit:    Segment is not available for address-translation
 * C Common-Segment Bit:     Segment is not private (PoP 3-30)
 * PTL Page-Table-Length:    Page-table length ((PTL+1)*16 entries -> up to 256)
 *
 * The 31 bit segmenttable origin of S390 has following format:
 *
 *  |S-table origin   |     | STL |
 * X                   **GPS
 * 00000000001111111111222222222233
 * 01234567890123456789012345678901
 *
 * X Space-Switch event:
 * G Segment-Invalid Bit:     *
 * P Private-Space Bit:       Segment is not private (PoP 3-30)
 * S Storage-Alteration:
 * STL Segment-Table-Length:  Segment-table length ((STL+1)*16 entries -> up to 2048)
 *
 * A 64 bit pagetable entry of S390 has following format:
 * |                     PFRA                         |0IPC|  OS  |
 * 0000000000111111111122222222223333333333444444444455555555556666
 * 0123456789012345678901234567890123456789012345678901234567890123
 *
 * I Page-Invalid Bit:    Page is not available for address-translation
 * P Page-Protection Bit: Store access not possible for page
 * C Change-bit override: HW is not required to set change bit
 *
 * A 64 bit segmenttable entry of S390 has following format:
 * |        P-table origin                              |      TT
 * 0000000000111111111122222222223333333333444444444455555555556666
 * 0123456789012345678901234567890123456789012345678901234567890123
 *
 * I Segment-Invalid Bit:    Segment is not available for address-translation
 * C Common-Segment Bit:     Segment is not private (PoP 3-30)
 * P Page-Protection Bit:    Store access not possible for page
 * TT Type 00
 *
 * A 64 bit region table entry of S390 has following format:
 * |        S-table origin                             |   TF  TTTL
 * 0000000000111111111122222222223333333333444444444455555555556666
 * 0123456789012345678901234567890123456789012345678901234567890123
 *
 * I Segment-Invalid Bit:    Segment is not available for address-translation
 * TT Type 01
 * TF
 * TL Table length
 *
 * The 64 bit regiontable origin of S390 has following format:
 * |      region table origin                          |       DTTL
 * 0000000000111111111122222222223333333333444444444455555555556666
 * 0123456789012345678901234567890123456789012345678901234567890123
 *
 * X Space-Switch event:
 * G Segment-Invalid Bit:
 * P Private-Space Bit:
 * S Storage-Alteration:
 * R Real space
 * TL Table-Length:
 *
 * A storage key has the following format:
 * | ACC |F|R|C|0|
 *  0   3 4 5 6 7
 * ACC: access key
 * F  : fetch protection bit
 * R  : referenced bit
 * C  : changed bit
 */

/* Hardware bits in the page table entry */
#define _PAGE_CO        0x100           /* HW Change-bit override */
#define _PAGE_RO        0x200           /* HW read-only bit       */
#define _PAGE_INVALID   0x400           /* HW invalid bit         */

/* Software bits in the page table entry */
#define _PAGE_SWT       0x001           /* SW pte type bit t */
#define _PAGE_SWX       0x002           /* SW pte type bit x */
#define _PAGE_SPECIAL   0x004           /* SW associated with special page */
#define __HAVE_ARCH_PTE_SPECIAL

/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK  (PAGE_MASK | _PAGE_SPECIAL)

/* Six different types of pages. */
#define _PAGE_TYPE_EMPTY        0x400
#define _PAGE_TYPE_NONE         0x401
#define _PAGE_TYPE_SWAP         0x403
#define _PAGE_TYPE_FILE         0x601   /* bit 0x002 is used for offset !! */
#define _PAGE_TYPE_RO           0x200
#define _PAGE_TYPE_RW           0x000
#define _PAGE_TYPE_EX_RO        0x202
#define _PAGE_TYPE_EX_RW        0x002

/*
 * Only four types for huge pages, using the invalid bit and protection bit
 * of a segment table entry.
 */
#define _HPAGE_TYPE_EMPTY       0x020   /* _SEGMENT_ENTRY_INV */
#define _HPAGE_TYPE_NONE        0x220
#define _HPAGE_TYPE_RO          0x200   /* _SEGMENT_ENTRY_RO  */
#define _HPAGE_TYPE_RW          0x000

/*
 * PTE type bits are rather complicated. handle_pte_fault uses pte_present,
 * pte_none and pte_file to find out the pte type WITHOUT holding the page
 * table lock. ptep_clear_flush on the other hand uses ptep_invalidate to
 * invalidate a given pte. ipte sets the hw invalid bit and clears all tlbs
 * for the page. The page table entry is set to _PAGE_TYPE_EMPTY afterwards.
 * This change is done while holding the lock, but the intermediate step
 * of a previously valid pte with the hw invalid bit set can be observed by
 * handle_pte_fault. That makes it necessary that all valid pte types with
 * the hw invalid bit set must be distinguishable from the four pte types
 * empty, none, swap and file.
 *
 *                      irxt  ipte  irxt
 * _PAGE_TYPE_EMPTY     1000   ->   1000
 * _PAGE_TYPE_NONE      1001   ->   1001
 * _PAGE_TYPE_SWAP      1011   ->   1011
 * _PAGE_TYPE_FILE      11?1   ->   11?1
 * _PAGE_TYPE_RO        0100   ->   1100
 * _PAGE_TYPE_RW        0000   ->   1000
 * _PAGE_TYPE_EX_RO     0110   ->   1110
 * _PAGE_TYPE_EX_RW     0010   ->   1010
 *
 * pte_none is true for the bit combinations 1000, 1010, 1100, 1110
 * pte_present is true for the bit combinations 0000, 0010, 0100, 0110, 1001
 * pte_file is true for the bit combinations 1101, 1111
 * swap pte is 1011 and 0001, 0011, 0101, 0111 are invalid.
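 *
 * In the table above i is the hw invalid bit (_PAGE_INVALID, 0x400),
 * r is the hw read-only bit (_PAGE_RO, 0x200), x is the software type
 * bit _PAGE_SWX (0x002) and t is the software type bit _PAGE_SWT (0x001).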
 */

/* Page status table bits for virtualization */
#define RCP_PCL_BIT     55
#define RCP_HR_BIT      54
#define RCP_HC_BIT      53
#define RCP_GR_BIT      50
#define RCP_GC_BIT      49

/* User dirty bit for KVM's migration feature */
#define KVM_UD_BIT      47

#ifndef __s390x__

/* Bits in the segment table address-space-control-element */
#define _ASCE_SPACE_SWITCH      0x80000000UL    /* space switch event       */
#define _ASCE_ORIGIN_MASK       0x7ffff000UL    /* segment table origin     */
#define _ASCE_PRIVATE_SPACE     0x100   /* private space control            */
#define _ASCE_ALT_EVENT         0x80    /* storage alteration event control */
#define _ASCE_TABLE_LENGTH      0x7f    /* 128 x 64 entries = 8k            */

/* Bits in the segment table entry */
#define _SEGMENT_ENTRY_ORIGIN   0x7fffffc0UL    /* page table origin        */
#define _SEGMENT_ENTRY_INV      0x20    /* invalid segment table entry      */
#define _SEGMENT_ENTRY_COMMON   0x10    /* common segment bit               */
#define _SEGMENT_ENTRY_PTL      0x0f    /* page table length                */

#define _SEGMENT_ENTRY          (_SEGMENT_ENTRY_PTL)
#define _SEGMENT_ENTRY_EMPTY    (_SEGMENT_ENTRY_INV)

#else /* __s390x__ */

/* Bits in the segment/region table address-space-control-element */
#define _ASCE_ORIGIN            ~0xfffUL/* segment table origin             */
#define _ASCE_PRIVATE_SPACE     0x100   /* private space control            */
#define _ASCE_ALT_EVENT         0x80    /* storage alteration event control */
#define _ASCE_SPACE_SWITCH      0x40    /* space switch event               */
#define _ASCE_REAL_SPACE        0x20    /* real space control               */
#define _ASCE_TYPE_MASK         0x0c    /* asce table type mask             */
#define _ASCE_TYPE_REGION1      0x0c    /* region first table type          */
#define _ASCE_TYPE_REGION2      0x08    /* region second table type         */
#define _ASCE_TYPE_REGION3      0x04    /* region third table type          */
#define _ASCE_TYPE_SEGMENT      0x00    /* segment table type               */
#define _ASCE_TABLE_LENGTH      0x03    /* region table length              */

/* Bits in the region table entry */
#define _REGION_ENTRY_ORIGIN    ~0xfffUL/* region/segment table origin      */
#define _REGION_ENTRY_INV       0x20    /* invalid region table entry       */
#define _REGION_ENTRY_TYPE_MASK 0x0c    /* region/segment table type mask   */
#define _REGION_ENTRY_TYPE_R1   0x0c    /* region first table type          */
#define _REGION_ENTRY_TYPE_R2   0x08    /* region second table type         */
#define _REGION_ENTRY_TYPE_R3   0x04    /* region third table type          */
#define _REGION_ENTRY_LENGTH    0x03    /* region third length              */

#define _REGION1_ENTRY          (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_LENGTH)
#define _REGION1_ENTRY_EMPTY    (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INV)
#define _REGION2_ENTRY          (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_LENGTH)
#define _REGION2_ENTRY_EMPTY    (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INV)
#define _REGION3_ENTRY          (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH)
#define _REGION3_ENTRY_EMPTY    (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INV)

/* Bits in the segment table entry */
#define _SEGMENT_ENTRY_ORIGIN   ~0x7ffUL/* segment table origin             */
#define _SEGMENT_ENTRY_RO       0x200   /* page protection bit              */
#define _SEGMENT_ENTRY_INV      0x20    /* invalid segment table entry      */

#define _SEGMENT_ENTRY          (0)
#define _SEGMENT_ENTRY_EMPTY    (_SEGMENT_ENTRY_INV)

#define _SEGMENT_ENTRY_LARGE    0x400   /* STE-format control, large page   */
#define _SEGMENT_ENTRY_CO       0x100   /* change-recording override        */

#endif /* __s390x__ */
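/*
 * Note: a valid (non-large) segment table entry consists of the page
 * table origin combined with _SEGMENT_ENTRY, i.e. the page-table-length
 * bits on 31 bit and no additional bits on 64 bit (see pmd_bad below).
 */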
/*
 * A user page table pointer has the space-switch-event bit, the
 * private-space-control bit and the storage-alteration-event-control
 * bit set. A kernel page table pointer doesn't need them.
 */
#define _ASCE_USER_BITS         (_ASCE_SPACE_SWITCH | _ASCE_PRIVATE_SPACE | \
                                 _ASCE_ALT_EVENT)

/* Bits in the storage key */
#define _PAGE_CHANGED    0x02           /* HW changed bit    */
#define _PAGE_REFERENCED 0x04           /* HW referenced bit */

/*
 * Page protection definitions.
 */
#define PAGE_NONE       __pgprot(_PAGE_TYPE_NONE)
#define PAGE_RO         __pgprot(_PAGE_TYPE_RO)
#define PAGE_RW         __pgprot(_PAGE_TYPE_RW)
#define PAGE_EX_RO      __pgprot(_PAGE_TYPE_EX_RO)
#define PAGE_EX_RW      __pgprot(_PAGE_TYPE_EX_RW)

#define PAGE_KERNEL     PAGE_RW
#define PAGE_COPY       PAGE_RO

/*
 * Dependent on the EXEC_PROTECT option s390 can do execute protection.
 * Write permission always implies read permission. In theory with a
 * primary/secondary page table execute only can be implemented but
 * it would cost an additional bit in the pte to distinguish all the
 * different pte types. To avoid that execute permission currently
 * implies read permission as well.
 */
         /*xwr*/
#define __P000  PAGE_NONE
#define __P001  PAGE_RO
#define __P010  PAGE_RO
#define __P011  PAGE_RO
#define __P100  PAGE_EX_RO
#define __P101  PAGE_EX_RO
#define __P110  PAGE_EX_RO
#define __P111  PAGE_EX_RO

#define __S000  PAGE_NONE
#define __S001  PAGE_RO
#define __S010  PAGE_RW
#define __S011  PAGE_RW
#define __S100  PAGE_EX_RO
#define __S101  PAGE_EX_RO
#define __S110  PAGE_EX_RW
#define __S111  PAGE_EX_RW

#ifndef __s390x__
# define PxD_SHADOW_SHIFT       1
#else /* __s390x__ */
# define PxD_SHADOW_SHIFT       2
#endif /* __s390x__ */

static inline void *get_shadow_table(void *table)
{
        unsigned long addr, offset;
        struct page *page;

        addr = (unsigned long) table;
        offset = addr & ((PAGE_SIZE << PxD_SHADOW_SHIFT) - 1);
        page = virt_to_page((void *)(addr ^ offset));
        return (void *)(addr_t)(page->index ? (page->index | offset) : 0UL);
}

/*
 * Certain architectures need to do special things when PTEs
 * within a page table are directly modified. Thus, the following
 * hook is made available.
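 *
 * On s390 this is also where the "shadow" page table used for execute
 * protection is kept up to date: with mm->context.noexec set, the copy
 * of the pte at ptep + PTRS_PER_PTE is made read-only for executable
 * mappings (_PAGE_SWX set) and emptied for all others.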
 */
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep, pte_t entry)
{
        *ptep = entry;
        if (mm->context.noexec) {
                if (!(pte_val(entry) & _PAGE_INVALID) &&
                    (pte_val(entry) & _PAGE_SWX))
                        pte_val(entry) |= _PAGE_RO;
                else
                        pte_val(entry) = _PAGE_TYPE_EMPTY;
                ptep[PTRS_PER_PTE] = entry;
        }
}

/*
 * pgd/pmd/pte query functions
 */
#ifndef __s390x__

static inline int pgd_present(pgd_t pgd) { return 1; }
static inline int pgd_none(pgd_t pgd) { return 0; }
static inline int pgd_bad(pgd_t pgd) { return 0; }

static inline int pud_present(pud_t pud) { return 1; }
static inline int pud_none(pud_t pud) { return 0; }
static inline int pud_bad(pud_t pud) { return 0; }

#else /* __s390x__ */

static inline int pgd_present(pgd_t pgd)
{
        if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2)
                return 1;
        return (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) != 0UL;
}

static inline int pgd_none(pgd_t pgd)
{
        if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2)
                return 0;
        return (pgd_val(pgd) & _REGION_ENTRY_INV) != 0UL;
}

static inline int pgd_bad(pgd_t pgd)
{
        /*
         * With dynamic page table levels the pgd can be a region table
         * entry or a segment table entry. Check for the bits that are
         * invalid for either table entry.
         */
        unsigned long mask =
                ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INV &
                ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
        return (pgd_val(pgd) & mask) != 0;
}

static inline int pud_present(pud_t pud)
{
        if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
                return 1;
        return (pud_val(pud) & _REGION_ENTRY_ORIGIN) != 0UL;
}

static inline int pud_none(pud_t pud)
{
        if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
                return 0;
        return (pud_val(pud) & _REGION_ENTRY_INV) != 0UL;
}

static inline int pud_bad(pud_t pud)
{
        /*
         * With dynamic page table levels the pud can be a region table
         * entry or a segment table entry. Check for the bits that are
         * invalid for either table entry.
         */
        unsigned long mask =
                ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INV &
                ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
        return (pud_val(pud) & mask) != 0;
}

#endif /* __s390x__ */

static inline int pmd_present(pmd_t pmd)
{
        return (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN) != 0UL;
}

static inline int pmd_none(pmd_t pmd)
{
        return (pmd_val(pmd) & _SEGMENT_ENTRY_INV) != 0UL;
}

static inline int pmd_bad(pmd_t pmd)
{
        unsigned long mask = ~_SEGMENT_ENTRY_ORIGIN & ~_SEGMENT_ENTRY_INV;
        return (pmd_val(pmd) & mask) != _SEGMENT_ENTRY;
}

static inline int pte_none(pte_t pte)
{
        return (pte_val(pte) & _PAGE_INVALID) && !(pte_val(pte) & _PAGE_SWT);
}

static inline int pte_present(pte_t pte)
{
        unsigned long mask = _PAGE_RO | _PAGE_INVALID | _PAGE_SWT | _PAGE_SWX;
        return (pte_val(pte) & mask) == _PAGE_TYPE_NONE ||
                (!(pte_val(pte) & _PAGE_INVALID) &&
                 !(pte_val(pte) & _PAGE_SWT));
}

static inline int pte_file(pte_t pte)
{
        unsigned long mask = _PAGE_RO | _PAGE_INVALID | _PAGE_SWT;
        return (pte_val(pte) & mask) == _PAGE_TYPE_FILE;
}

static inline int pte_special(pte_t pte)
{
        return (pte_val(pte) & _PAGE_SPECIAL);
}

#define __HAVE_ARCH_PTE_SAME
#define pte_same(a,b)  (pte_val(a) == pte_val(b))

static inline void rcp_lock(pte_t *ptep)
{
#ifdef CONFIG_PGSTE
        unsigned long *pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
        preempt_disable();
        while (test_and_set_bit(RCP_PCL_BIT, pgste))
                ;
#endif
}

static inline void rcp_unlock(pte_t *ptep)
{
#ifdef CONFIG_PGSTE
        unsigned long *pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
        clear_bit(RCP_PCL_BIT, pgste);
        preempt_enable();
#endif
}

/* forward declaration for SetPageUptodate in page-flags.h */
static inline void page_clear_dirty(struct page *page);
#include <linux/page-flags.h>

static inline void ptep_rcp_copy(pte_t *ptep)
{
#ifdef CONFIG_PGSTE
        struct page *page = virt_to_page(pte_val(*ptep));
        unsigned int skey;
        unsigned long *pgste = (unsigned long *) (ptep + PTRS_PER_PTE);

        skey = page_get_storage_key(page_to_phys(page));
        if (skey & _PAGE_CHANGED) {
                set_bit_simple(RCP_GC_BIT, pgste);
                set_bit_simple(KVM_UD_BIT, pgste);
        }
        if (skey & _PAGE_REFERENCED)
                set_bit_simple(RCP_GR_BIT, pgste);
        if (test_and_clear_bit_simple(RCP_HC_BIT, pgste)) {
                SetPageDirty(page);
                set_bit_simple(KVM_UD_BIT, pgste);
        }
        if (test_and_clear_bit_simple(RCP_HR_BIT, pgste))
                SetPageReferenced(page);
#endif
}

/*
 * query functions pte_write/pte_dirty/pte_young only work if
 * pte_present() is true. Undefined behaviour if not..
 */
static inline int pte_write(pte_t pte)
{
        return (pte_val(pte) & _PAGE_RO) == 0;
}

static inline int pte_dirty(pte_t pte)
{
        /* A pte is neither clean nor dirty on s/390. The dirty bit
         * is in the storage key. See page_test_dirty for details.
         */
        return 0;
}

static inline int pte_young(pte_t pte)
{
        /* A pte is neither young nor old on s/390. The young bit
         * is in the storage key. See page_test_and_clear_young for
         * details.
         */
        return 0;
}

/*
 * pgd/pmd/pte modification functions
 */

#ifndef __s390x__

#define pgd_clear(pgd)          do { } while (0)
#define pud_clear(pud)          do { } while (0)

#else /* __s390x__ */

static inline void pgd_clear_kernel(pgd_t *pgd)
{
        if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
                pgd_val(*pgd) = _REGION2_ENTRY_EMPTY;
}

static inline void pgd_clear(pgd_t *pgd)
{
        pgd_t *shadow = get_shadow_table(pgd);

        pgd_clear_kernel(pgd);
        if (shadow)
                pgd_clear_kernel(shadow);
}

static inline void pud_clear_kernel(pud_t *pud)
{
        if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
                pud_val(*pud) = _REGION3_ENTRY_EMPTY;
}

static inline void pud_clear(pud_t *pud)
{
        pud_t *shadow = get_shadow_table(pud);

        pud_clear_kernel(pud);
        if (shadow)
                pud_clear_kernel(shadow);
}

#endif /* __s390x__ */

static inline void pmd_clear_kernel(pmd_t *pmdp)
{
        pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
}

static inline void pmd_clear(pmd_t *pmd)
{
        pmd_t *shadow = get_shadow_table(pmd);

        pmd_clear_kernel(pmd);
        if (shadow)
                pmd_clear_kernel(shadow);
}

static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        pte_val(*ptep) = _PAGE_TYPE_EMPTY;
        if (mm->context.noexec)
                pte_val(ptep[PTRS_PER_PTE]) = _PAGE_TYPE_EMPTY;
}

/*
 * The following pte modification functions only work if
 * pte_present() is true. Undefined behaviour if not..
 */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
        pte_val(pte) &= _PAGE_CHG_MASK;
        pte_val(pte) |= pgprot_val(newprot);
        return pte;
}

static inline pte_t pte_wrprotect(pte_t pte)
{
        /* Do not clobber _PAGE_TYPE_NONE pages! */
        if (!(pte_val(pte) & _PAGE_INVALID))
                pte_val(pte) |= _PAGE_RO;
        return pte;
}

static inline pte_t pte_mkwrite(pte_t pte)
{
        pte_val(pte) &= ~_PAGE_RO;
        return pte;
}

static inline pte_t pte_mkclean(pte_t pte)
{
        /* The only user of pte_mkclean is the fork() code.
         * We must *not* clear the *physical* page dirty bit
         * just because fork() wants to clear the dirty bit in
         * *one* of the page's mappings. So we just do nothing.
         */
        return pte;
}

static inline pte_t pte_mkdirty(pte_t pte)
{
        /* We do not explicitly set the dirty bit because the
         * sske instruction is slow. It is faster to let the
         * next instruction set the dirty bit.
         */
        return pte;
}

static inline pte_t pte_mkold(pte_t pte)
{
        /* S/390 doesn't keep its dirty/referenced bit in the pte.
         * There is no point in clearing the real referenced bit.
         */
        return pte;
}

static inline pte_t pte_mkyoung(pte_t pte)
{
        /* S/390 doesn't keep its dirty/referenced bit in the pte.
         * There is no point in setting the real referenced bit.
         */
        return pte;
}

static inline pte_t pte_mkspecial(pte_t pte)
{
        pte_val(pte) |= _PAGE_SPECIAL;
        return pte;
}

#ifdef CONFIG_PGSTE
/*
 * Get (and clear) the user dirty bit for a PTE.
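 *
 * The bit is accumulated in the page status table entry (pgste) whenever
 * the storage key change bit or the host change-tracking bit is seen, so
 * this reports whether the page has been modified since the bit was last
 * cleared.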
 */
static inline int kvm_s390_test_and_clear_page_dirty(struct mm_struct *mm,
                                                     pte_t *ptep)
{
        int dirty;
        unsigned long *pgste;
        struct page *page;
        unsigned int skey;

        if (!mm->context.has_pgste)
                return -EINVAL;
        rcp_lock(ptep);
        pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
        page = virt_to_page(pte_val(*ptep));
        skey = page_get_storage_key(page_to_phys(page));
        if (skey & _PAGE_CHANGED) {
                set_bit_simple(RCP_GC_BIT, pgste);
                set_bit_simple(KVM_UD_BIT, pgste);
        }
        if (test_and_clear_bit_simple(RCP_HC_BIT, pgste)) {
                SetPageDirty(page);
                set_bit_simple(KVM_UD_BIT, pgste);
        }
        dirty = test_and_clear_bit_simple(KVM_UD_BIT, pgste);
        if (skey & _PAGE_CHANGED)
                page_clear_dirty(page);
        rcp_unlock(ptep);
        return dirty;
}
#endif

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long addr, pte_t *ptep)
{
#ifdef CONFIG_PGSTE
        unsigned long physpage;
        int young;
        unsigned long *pgste;

        if (!vma->vm_mm->context.has_pgste)
                return 0;
        physpage = pte_val(*ptep) & PAGE_MASK;
        pgste = (unsigned long *) (ptep + PTRS_PER_PTE);

        young = ((page_get_storage_key(physpage) & _PAGE_REFERENCED) != 0);
        rcp_lock(ptep);
        if (young)
                set_bit_simple(RCP_GR_BIT, pgste);
        young |= test_and_clear_bit_simple(RCP_HR_BIT, pgste);
        rcp_unlock(ptep);
        return young;
#endif
        return 0;
}

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pte_t *ptep)
{
        /* No need to flush the TLB;
         * on s390 the reference bits are in the storage key and never
         * in the TLB. With virtualization we handle the reference bit,
         * without it we can simply return. */
#ifdef CONFIG_PGSTE
        return ptep_test_and_clear_young(vma, address, ptep);
#endif
        return 0;
}

static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
{
        if (!(pte_val(*ptep) & _PAGE_INVALID)) {
#ifndef __s390x__
                /* pto must point to the start of the segment table */
                pte_t *pto = (pte_t *) (((unsigned long) ptep) & 0x7ffffc00);
#else
                /* ipte in zarch mode can do the math */
                pte_t *pto = ptep;
#endif
                asm volatile(
                        "       ipte    %2,%3"
                        : "=m" (*ptep) : "m" (*ptep),
                          "a" (pto), "a" (address));
        }
}

static inline void ptep_invalidate(struct mm_struct *mm,
                                   unsigned long address, pte_t *ptep)
{
        if (mm->context.has_pgste) {
                rcp_lock(ptep);
                __ptep_ipte(address, ptep);
                ptep_rcp_copy(ptep);
                pte_val(*ptep) = _PAGE_TYPE_EMPTY;
                rcp_unlock(ptep);
                return;
        }
        __ptep_ipte(address, ptep);
        pte_val(*ptep) = _PAGE_TYPE_EMPTY;
        if (mm->context.noexec) {
                __ptep_ipte(address, ptep + PTRS_PER_PTE);
                pte_val(*(ptep + PTRS_PER_PTE)) = _PAGE_TYPE_EMPTY;
        }
}

/*
 * This is hard to understand. ptep_get_and_clear and ptep_clear_flush
 * both clear the TLB for the unmapped pte. The reason is that
 * ptep_get_and_clear is used in common code (e.g. change_pte_range)
 * to modify an active pte. The sequence is
 *   1) ptep_get_and_clear
 *   2) set_pte_at
 *   3) flush_tlb_range
 * On s390 the tlb needs to get flushed with the modification of the pte
 * if the pte is active. The only way this can be implemented is to
 * have ptep_get_and_clear do the tlb flush.
 * In exchange flush_tlb_range is a nop.
 */
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define ptep_get_and_clear(__mm, __address, __ptep)                     \
({                                                                      \
        pte_t __pte = *(__ptep);                                        \
        if (atomic_read(&(__mm)->mm_users) > 1 ||                       \
            (__mm) != current->active_mm)                               \
                ptep_invalidate(__mm, __address, __ptep);               \
        else                                                            \
                pte_clear((__mm), (__address), (__ptep));               \
        __pte;                                                          \
})

#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
                                     unsigned long address, pte_t *ptep)
{
        pte_t pte = *ptep;
        ptep_invalidate(vma->vm_mm, address, ptep);
        return pte;
}

/*
 * The batched pte unmap code uses ptep_get_and_clear_full to clear the
 * ptes. Here an optimization is possible. tlb_gather_mmu flushes all
 * tlbs of an mm if it can guarantee that the ptes of the mm_struct
 * cannot be accessed while the batched unmap is running. In this case
 * full==1 and a simple pte_clear is enough. See tlb.h.
 */
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long addr,
                                            pte_t *ptep, int full)
{
        pte_t pte = *ptep;

        if (full)
                pte_clear(mm, addr, ptep);
        else
                ptep_invalidate(mm, addr, ptep);
        return pte;
}

#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define ptep_set_wrprotect(__mm, __addr, __ptep)                        \
({                                                                      \
        pte_t __pte = *(__ptep);                                        \
        if (pte_write(__pte)) {                                         \
                if (atomic_read(&(__mm)->mm_users) > 1 ||               \
                    (__mm) != current->active_mm)                       \
                        ptep_invalidate(__mm, __addr, __ptep);          \
                set_pte_at(__mm, __addr, __ptep, pte_wrprotect(__pte)); \
        }                                                               \
})

#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags(__vma, __addr, __ptep, __entry, __dirty) \
({                                                                      \
        int __changed = !pte_same(*(__ptep), __entry);                  \
        if (__changed) {                                                \
                ptep_invalidate((__vma)->vm_mm, __addr, __ptep);        \
                set_pte_at((__vma)->vm_mm, __addr, __ptep, __entry);    \
        }                                                               \
        __changed;                                                      \
})

/*
 * Test and clear dirty bit in storage key.
 * We can't clear the changed bit atomically. This is a potential
 * race against modification of the referenced bit. This function
 * should therefore only be called if the page is not mapped in any
 * address space.
 */
#define __HAVE_ARCH_PAGE_TEST_DIRTY
static inline int page_test_dirty(struct page *page)
{
        return (page_get_storage_key(page_to_phys(page)) & _PAGE_CHANGED) != 0;
}

#define __HAVE_ARCH_PAGE_CLEAR_DIRTY
static inline void page_clear_dirty(struct page *page)
{
        page_set_storage_key(page_to_phys(page), PAGE_DEFAULT_KEY);
}

/*
 * Test and clear referenced bit in storage key.
 */
#define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG
static inline int page_test_and_clear_young(struct page *page)
{
        unsigned long physpage = page_to_phys(page);
        int ccode;

        asm volatile(
                "       rrbe    0,%1\n"
                "       ipm     %0\n"
                "       srl     %0,28\n"
                : "=d" (ccode) : "a" (physpage) : "cc" );
        return ccode & 2;
}

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
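 *
 * On s390 a pte is simply the physical frame address combined with the
 * protection bits, so mk_pte_phys below only has to add the pgprot
 * value to the physical address.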
 */
static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
{
        pte_t __pte;
        pte_val(__pte) = physpage + pgprot_val(pgprot);
        return __pte;
}

static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
{
        unsigned long physpage = page_to_phys(page);

        return mk_pte_phys(physpage, pgprot);
}

#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1))

#define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address))
#define pgd_offset_k(address) pgd_offset(&init_mm, address)

#ifndef __s390x__

#define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN)
#define pud_deref(pmd) ({ BUG(); 0UL; })
#define pgd_deref(pmd) ({ BUG(); 0UL; })

#define pud_offset(pgd, address) ((pud_t *) pgd)
#define pmd_offset(pud, address) ((pmd_t *) pud + pmd_index(address))

#else /* __s390x__ */

#define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN)
#define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN)
#define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN)

static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
{
        pud_t *pud = (pud_t *) pgd;
        if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2)
                pud = (pud_t *) pgd_deref(*pgd);
        return pud + pud_index(address);
}

static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
        pmd_t *pmd = (pmd_t *) pud;
        if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
                pmd = (pmd_t *) pud_deref(*pud);
        return pmd + pmd_index(address);
}

#endif /* __s390x__ */

#define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot))
#define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT)
#define pte_page(x) pfn_to_page(pte_pfn(x))

#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)

/* Find an entry in the lowest level page table.. */
#define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr))
#define pte_offset_kernel(pmd, address) pte_offset(pmd,address)
#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
#define pte_offset_map_nested(pmd, address) pte_offset_kernel(pmd, address)
#define pte_unmap(pte) do { } while (0)
#define pte_unmap_nested(pte) do { } while (0)
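/*
 * Putting the offset macros together, a minimal walk of the table
 * hierarchy for a user address looks the same as in the generic code
 * (error checking and locking omitted for brevity):
 *
 *      pgd_t *pgd = pgd_offset(mm, addr);
 *      pud_t *pud = pud_offset(pgd, addr);
 *      pmd_t *pmd = pmd_offset(pud, addr);
 *      pte_t *pte = pte_offset_map(pmd, addr);
 *
 * On 31 bit the pud and pmd steps are folded and only cast the pointer;
 * on 64 bit they follow the region table origin if the entry really is
 * a higher-level table entry.
 */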
/*
 * 31 bit swap entry format:
 * A page-table entry has some bits we have to treat in a special way.
 * Bits 0, 20 and 23 have to be zero, otherwise a specification
 * exception will occur instead of a page translation exception. The
 * specification exception has the bad habit of not storing the
 * necessary information in the lowcore.
 * Bit 21 and bit 22 are the page invalid bit and the page protection
 * bit. We set both to indicate a swapped page.
 * Bits 30 and 31 are used to distinguish the different page types. For
 * a swapped page these bits need to be zero.
 * This leaves the bits 1-19 and bits 24-29 to store type and offset.
 * We use the 5 bits from 25-29 for the type and the 20 bits from 1-19
 * plus 24 for the offset.
 * 0|     offset        |0110|o|type |00|
 * 0 0000000001111111111 2222 2 22222 33
 * 0 1234567890123456789 0123 4 56789 01
 *
 * 64 bit swap entry format:
 * A page-table entry has some bits we have to treat in a special way.
 * Bits 52 and 55 have to be zero, otherwise a specification
 * exception will occur instead of a page translation exception. The
 * specification exception has the bad habit of not storing the
 * necessary information in the lowcore.
 * Bit 53 and bit 54 are the page invalid bit and the page protection
 * bit. We set both to indicate a swapped page.
 * Bits 62 and 63 are used to distinguish the different page types. For
 * a swapped page these bits need to be zero.
 * This leaves the bits 0-51 and bits 56-61 to store type and offset.
 * We use the 5 bits from 57-61 for the type and the 53 bits from 0-51
 * plus 56 for the offset.
 * |                      offset                        |0110|o|type |00|
 * 0000000000111111111122222222223333333333444444444455 5555 5 55566 66
 * 0123456789012345678901234567890123456789012345678901 2345 6 78901 23
 */
#ifndef __s390x__
#define __SWP_OFFSET_MASK (~0UL >> 12)
#else
#define __SWP_OFFSET_MASK (~0UL >> 11)
#endif
static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
{
        pte_t pte;
        offset &= __SWP_OFFSET_MASK;
        pte_val(pte) = _PAGE_TYPE_SWAP | ((type & 0x1f) << 2) |
                ((offset & 1UL) << 7) | ((offset & ~1UL) << 11);
        return pte;
}

#define __swp_type(entry)       (((entry).val >> 2) & 0x1f)
#define __swp_offset(entry)     (((entry).val >> 11) | (((entry).val >> 7) & 1))
#define __swp_entry(type,offset) ((swp_entry_t) { pte_val(mk_swap_pte((type),(offset))) })

#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x)   ((pte_t) { (x).val })

#ifndef __s390x__
# define PTE_FILE_MAX_BITS      26
#else /* __s390x__ */
# define PTE_FILE_MAX_BITS      59
#endif /* __s390x__ */

#define pte_to_pgoff(__pte) \
        ((((__pte).pte >> 12) << 7) + (((__pte).pte >> 1) & 0x7f))

#define pgoff_to_pte(__off) \
        ((pte_t) { ((((__off) & 0x7f) << 1) + (((__off) >> 7) << 12)) \
                   | _PAGE_TYPE_FILE })

#endif /* !__ASSEMBLY__ */

#define kern_addr_valid(addr)   (1)

extern int vmem_add_mapping(unsigned long start, unsigned long size);
extern int vmem_remove_mapping(unsigned long start, unsigned long size);
extern int s390_enable_sie(void);

/*
 * No page table caches to initialise
 */
#define pgtable_cache_init()    do { } while (0)

#include <asm-generic/pgtable.h>

#endif /* _ASM_S390_PGTABLE_H */