1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2. 4 * No bombay mix was harmed in the writing of this file. 5 * 6 * Copyright (C) 2020 Google LLC 7 * Author: Will Deacon <will@kernel.org> 8 */ 9 10 #include <linux/bitfield.h> 11 #include <asm/kvm_pgtable.h> 12 #include <asm/stage2_pgtable.h> 13 14 15 #define KVM_PTE_TYPE BIT(1) 16 #define KVM_PTE_TYPE_BLOCK 0 17 #define KVM_PTE_TYPE_PAGE 1 18 #define KVM_PTE_TYPE_TABLE 1 19 20 #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) 21 22 #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) 23 #define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) 24 #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ 25 ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) 26 #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ 27 ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) 28 #define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) 29 #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 30 #define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) 31 32 #define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) 33 #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) 34 #define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) 35 #define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) 36 #define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 37 #define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) 38 39 #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) 40 41 #define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) 42 43 #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) 44 45 #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) 46 47 #define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) 48 49 #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ 50 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ 51 KVM_PTE_LEAF_ATTR_HI_S2_XN) 52 53 #define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) 54 #define KVM_MAX_OWNER_ID 1 55 56 /* 57 * Used to indicate a pte for which a 'break-before-make' sequence is in 58 * progress. 
59 */ 60 #define KVM_INVALID_PTE_LOCKED BIT(10) 61 62 struct kvm_pgtable_walk_data { 63 struct kvm_pgtable_walker *walker; 64 65 const u64 start; 66 u64 addr; 67 const u64 end; 68 }; 69 70 static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) 71 { 72 return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); 73 } 74 75 static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) 76 { 77 return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); 78 } 79 80 static bool kvm_phys_is_valid(u64 phys) 81 { 82 return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); 83 } 84 85 static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) 86 { 87 u64 granule = kvm_granule_size(ctx->level); 88 89 if (!kvm_level_supports_block_mapping(ctx->level)) 90 return false; 91 92 if (granule > (ctx->end - ctx->addr)) 93 return false; 94 95 if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) 96 return false; 97 98 return IS_ALIGNED(ctx->addr, granule); 99 } 100 101 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) 102 { 103 u64 shift = kvm_granule_shift(level); 104 u64 mask = BIT(PAGE_SHIFT - 3) - 1; 105 106 return (data->addr >> shift) & mask; 107 } 108 109 static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) 110 { 111 u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ 112 u64 mask = BIT(pgt->ia_bits) - 1; 113 114 return (addr & mask) >> shift; 115 } 116 117 static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) 118 { 119 struct kvm_pgtable pgt = { 120 .ia_bits = ia_bits, 121 .start_level = start_level, 122 }; 123 124 return kvm_pgd_page_idx(&pgt, -1ULL) + 1; 125 } 126 127 static bool kvm_pte_table(kvm_pte_t pte, u32 level) 128 { 129 if (level == KVM_PGTABLE_MAX_LEVELS - 1) 130 return false; 131 132 if (!kvm_pte_valid(pte)) 133 return false; 134 135 return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; 136 } 137 
/*
 * Convert the output address held in @pte into a pointer, using the
 * phys_to_virt() callback supplied in @mm_ops.
 */
static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
{
	return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
}

/* Overwrite the entry at @ptep with zero using a single WRITE_ONCE() store. */
static void kvm_clear_pte(kvm_pte_t *ptep)
{
	WRITE_ONCE(*ptep, 0);
}

/*
 * Build a valid table descriptor whose output address is the physical
 * address of @childp, obtained through mm_ops->virt_to_phys().
 */
static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
{
	kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));

	pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
	pte |= KVM_PTE_VALID;
	return pte;
}

/*
 * Build a valid leaf descriptor for @pa. Only the lower and upper attribute
 * fields of @attr are taken; the type is PAGE at the last level and BLOCK
 * at any other level.
 */
static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
{
	kvm_pte_t pte = kvm_phys_to_pte(pa);
	u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
							   KVM_PTE_TYPE_BLOCK;

	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
	pte |= KVM_PTE_VALID;

	return pte;
}

/*
 * Build an entry that carries only @owner_id in the
 * KVM_INVALID_PTE_OWNER_MASK field; no other bits are set here.
 */
static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
{
	return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
}

/*
 * Invoke the walker callback for @ctx with the given @visit type and return
 * its result, after checking the locking expectation for shared walks.
 */
static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
				  const struct kvm_pgtable_visit_ctx *ctx,
				  enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_walker *walker = data->walker;

	/* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
	WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
	return walker->cb(ctx, visit);
}

/*
 * Decide whether the walk should carry on after a callback returned @r:
 * zero always continues, -EAGAIN continues unless the walker has
 * KVM_PGTABLE_WALK_HANDLE_FAULT set, any other value stops the walk.
 */
static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
				      int r)
{
	/*
	 * Visitor callbacks return EAGAIN when the conditions that led to a
	 * fault are no longer reflected in the page tables due to a race to
	 * update a PTE. In the context of a fault handler this is interpreted
	 * as a signal to retry guest execution.
	 *
	 * Ignore the return code altogether for walkers outside a fault handler
	 * (e.g. write protecting a range of memory) and chug along with the
	 * page table walk.
	 */
	if (r == -EAGAIN)
		return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);

	return !r;
}

/* Forward declaration: __kvm_pgtable_visit() recurses into child tables. */
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);

/*
 * Visit a single entry at @level.
 *
 * The entry is snapshotted with READ_ONCE() into a visit context. The
 * walker callback is invoked with TABLE_PRE for table entries and with LEAF
 * for non-table entries, when the walker asked for those visits. After
 * either callback the entry is re-read. A non-table entry then advances
 * data->addr to the next granule boundary; a table entry is descended into
 * and, if requested, followed by a TABLE_POST callback.
 *
 * Returns 0 when kvm_pgtable_walk_continue() accepts the final result,
 * otherwise that result.
 */
static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
				      struct kvm_pgtable_mm_ops *mm_ops,
				      kvm_pteref_t pteref, u32 level)
{
	enum kvm_pgtable_walk_flags flags = data->walker->flags;
	kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
	struct kvm_pgtable_visit_ctx ctx = {
		.ptep	= ptep,
		.old	= READ_ONCE(*ptep),
		.arg	= data->walker->arg,
		.mm_ops	= mm_ops,
		.start	= data->start,
		.addr	= data->addr,
		.end	= data->end,
		.level	= level,
		.flags	= flags,
	};
	int ret = 0;
	bool reload = false;
	kvm_pteref_t childp;
	bool table = kvm_pte_table(ctx.old, level);

	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
		reload = true;
	}

	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
		reload = true;
	}

	/*
	 * Reload the page table after invoking the walker callback for leaf
	 * entries or after pre-order traversal, to allow the walker to descend
	 * into a newly installed or replaced table.
	 */
	if (reload) {
		ctx.old = READ_ONCE(*ptep);
		table = kvm_pte_table(ctx.old, level);
	}

	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (!table) {
		/* Nothing below this entry: skip to the next granule. */
		data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
		data->addr += kvm_granule_size(level);
		goto out;
	}

	childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
	ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);

out:
	/* Results the walker is allowed to ignore are reported as success. */
	if (kvm_pgtable_walk_continue(data->walker, ret))
		return 0;

	return ret;
}

/*
 * Walk the entries of one table page at @level, starting at the index that
 * covers data->addr and stopping at the end of the table, when data->addr
 * reaches data->end, or on the first non-zero visit result.
 *
 * Returns -EINVAL (with a one-time warning) if @level is out of range.
 */
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level)
{
	u32 idx;
	int ret = 0;

	if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
		return -EINVAL;

	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
		kvm_pteref_t pteref = &pgtable[idx];

		if (data->addr >= data->end)
			break;

		ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
		if (ret)
			break;
	}

	return ret;
}

/*
 * Walk [data->addr, data->end) starting from the PGD, one PGD page at a
 * time.
 *
 * Returns -ERANGE if either bound exceeds BIT(pgt->ia_bits), -EINVAL if the
 * page-table has no PGD, otherwise the result of the per-page walk.
 */
static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
{
	u32 idx;
	int ret = 0;
	u64 limit = BIT(pgt->ia_bits);

	if (data->addr > limit || data->end > limit)
		return -ERANGE;

	if (!pgt->pgd)
		return -EINVAL;

	for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
		kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];

		ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
		if (ret)
			break;
	}

	return ret;
}

/*
 * Walk the range covering [@addr, @addr + @size) with @walker. The start is
 * rounded down and the end rounded up to PAGE_SIZE. The walk is bracketed
 * by kvm_pgtable_walk_begin() and kvm_pgtable_walk_end().
 *
 * Returns the error from kvm_pgtable_walk_begin() if it fails, otherwise
 * the result of the walk.
 */
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
		     struct kvm_pgtable_walker *walker)
{
	struct kvm_pgtable_walk_data walk_data = {
		.start	= ALIGN_DOWN(addr, PAGE_SIZE),
		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
		/* Reads the .addr member initialised just above. */
		.end	= PAGE_ALIGN(walk_data.addr + size),
		.walker	= walker,
	};
	int r;

	r = kvm_pgtable_walk_begin(walker);
	if (r)
		return r;

	r = _kvm_pgtable_walk(pgt, &walk_data);
	kvm_pgtable_walk_end(walker);

	return r;
}

/* Result slot for leaf_walker(): the entry seen and the level it was at. */
struct leaf_walk_data {
	kvm_pte_t	pte;
	u32		level;
};

/* Record the visited entry and its level in the leaf_walk_data argument. */
static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
		       enum kvm_pgtable_walk_flags visit)
{
	struct leaf_walk_data *data = ctx->arg;

	data->pte   = ctx->old;
	data->level = ctx->level;

	return 0;
}

/*
 * Look up the leaf entry covering @addr by walking a single page with a
 * LEAF-only walker. On success the entry and its level are stored through
 * @ptep and @level; either pointer may be NULL.
 *
 * Returns the result of kvm_pgtable_walk().
 */
int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
			 kvm_pte_t *ptep, u32 *level)
{
	struct leaf_walk_data data;
	struct kvm_pgtable_walker walker = {
		.cb	= leaf_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &data,
	};
	int ret;

	ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
			       PAGE_SIZE, &walker);
	if (!ret) {
		if (ptep)
			*ptep  = data.pte;
		if (level)
			*level = data.level;
	}

	return ret;
}

/* Arguments for the hyp stage-1 map walker: base PA and leaf attributes. */
struct hyp_map_data {
	const u64			phys;
	kvm_pte_t			attr;
};

/*
 * Translate @prot into stage-1 leaf attributes and store them in *@ptep.
 *
 * Returns -EINVAL if @prot lacks read permission, or requests execute
 * together with write or with device memory; 0 otherwise.
 */
static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
{
	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
	u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
	kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
	u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
					       KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;

	if (!(prot & KVM_PGTABLE_PROT_R))
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_X) {
		if (prot & KVM_PGTABLE_PROT_W)
			return -EINVAL;

		if (device)
			return -EINVAL;

		/* Executable mappings get the GP bit when BTI is in use. */
		if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
			attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
	} else {
		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
	}

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
	/* Software bits of @prot are carried over as they are. */
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
	*ptep = attr;

	return 0;
}

/*
 * Recover the protection encoded in a hyp stage-1 @pte. The software bits
 * are always returned; for an invalid pte nothing else is.
 */
enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
	u32 ap;

	if (!kvm_pte_valid(pte))
		return prot;

	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
		prot |= KVM_PGTABLE_PROT_X;

	ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
	if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
		prot |= KVM_PGTABLE_PROT_R;
	else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
		prot |= KVM_PGTABLE_PROT_RW;

	return prot;
}

/*
 * Try to install a leaf for the current walk position.
 *
 * Returns true if the entry already holds the wanted leaf or the leaf was
 * written. Returns false if a leaf is not possible at this level, or if a
 * valid entry is already present and differs from the new one outside the
 * software bits (this case also warns).
 */
static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				    struct hyp_map_data *data)
{
	u64 phys = data->phys + (ctx->addr - ctx->start);
	kvm_pte_t new;

	if (!kvm_block_mapping_supported(ctx, phys))
		return false;

	new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	if (ctx->old == new)
		return true;
	/* A reference is taken only when an invalid entry becomes valid. */
	if (!kvm_pte_valid(ctx->old))
		ctx->mm_ops->get_page(ctx->ptep);
	else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
		return false;

	smp_store_release(ctx->ptep, new);
	return true;
}

/*
 * LEAF visitor for hyp mappings: install a leaf if possible, otherwise
 * allocate a zeroed page and install it as a table at this entry.
 *
 * Returns 0 on success, -EINVAL (with a warning) if a leaf could not be
 * installed at the last level, or -ENOMEM if the allocation failed.
 */
static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
			  enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp, new;
	struct hyp_map_data *data = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (hyp_map_walker_try_leaf(ctx, data))
		return 0;

	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
	if (!childp)
		return -ENOMEM;

	new = kvm_init_table_pte(childp, mm_ops);
	mm_ops->get_page(ctx->ptep);
	smp_store_release(ctx->ptep, new);

	return 0;
}

/*
 * Map [@addr, @addr + @size) to @phys (rounded down to PAGE_SIZE) in the
 * hyp stage-1 page-table with protection @prot.
 *
 * Returns the error from hyp_set_prot_attr() if @prot is rejected,
 * otherwise the result of the walk. dsb(ishst) and isb() are issued after
 * the walk whatever its result.
 */
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
			enum kvm_pgtable_prot prot)
{
	int ret;
	struct hyp_map_data map_data = {
		.phys	= ALIGN_DOWN(phys, PAGE_SIZE),
	};
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_map_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &map_data,
	};

	ret = hyp_set_prot_attr(prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	isb();
	return ret;
}

/*
 * LEAF / TABLE_POST visitor for hyp unmap.
 *
 * A table entry is cleared only when page_count() of the child table is 1;
 * otherwise it is left alone. A leaf entry is cleared only if a full
 * granule lies within the remaining range, and its size is added to the
 * running total pointed to by ctx->arg. After the TLB maintenance the
 * reference on the containing table page is dropped, and for a removed
 * table the child page is released as well.
 *
 * Returns -EINVAL for an invalid entry or for a leaf larger than the
 * remaining range, 0 otherwise.
 */
static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			    enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp = NULL;
	u64 granule = kvm_granule_size(ctx->level);
	u64 *unmapped = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EINVAL;

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		if (mm_ops->page_count(childp) != 1)
			return 0;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
	} else {
		if (ctx->end - ctx->addr < granule)
			return -EINVAL;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
		*unmapped += granule;
	}

	dsb(ish);
	isb();
	mm_ops->put_page(ctx->ptep);

	if (childp)
		mm_ops->put_page(childp);

	return 0;
}

/*
 * Unmap [@addr, @addr + @size) from the hyp stage-1 page-table.
 *
 * Returns the number of bytes accumulated by the unmap walker; the result
 * of the walk itself is not reported. Returns 0 without walking when the
 * mm_ops provide no page_count() callback.
 */
u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 unmapped = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_unmap_walker,
		.arg	= &unmapped,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	if (!pgt->mm_ops->page_count)
		return 0;

	kvm_pgtable_walk(pgt, addr, size, &walker);
	return unmapped;
}

/*
 * Initialise @pgt as a hyp stage-1 page-table covering @va_bits of virtual
 * address space: allocate a zeroed PGD page and derive the start level from
 * the number of levels needed for @va_bits.
 *
 * Returns 0 on success or -ENOMEM if the PGD page could not be allocated.
 */
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
			 struct kvm_pgtable_mm_ops *mm_ops)
{
	u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);

	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= va_bits;
	pgt->start_level	= KVM_PGTABLE_MAX_LEVELS - levels;
	pgt->mm_ops		= mm_ops;
	pgt->mmu		= NULL;
	pgt->force_pte_cb	= NULL;

	return 0;
}

/*
 * LEAF / TABLE_POST visitor for teardown: for every valid entry, drop the
 * reference on the containing table page and, for a table entry, also
 * release the child table page. Invalid entries are ignored.
 */
static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
			   enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return 0;

	mm_ops->put_page(ctx->ptep);

	if (kvm_pte_table(ctx->old, ctx->level))
		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));

	return 0;
}

/*
 * Tear down a hyp stage-1 page-table: walk the whole input range with
 * hyp_free_walker() (warning if the walk fails), release the PGD page and
 * clear pgt->pgd.
 */
void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
{
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
	pgt->pgd = NULL;
}

/*
 * Arguments shared by the stage-2 map and set-owner walkers. @phys is
 * either the base PA of the mapping or an invalid PA, in which case
 * @owner_id is written as an annotation instead of a mapping.
 */
struct stage2_map_data {
	const u64			phys;
	kvm_pte_t			attr;
	u8				owner_id;

	kvm_pte_t			*anchor;
	kvm_pte_t			*childp;

	struct kvm_s2_mmu		*mmu;
	void				*memcache;

	/* Force mappings to page granularity */
	bool				force_pte;
};

/*
 * Compute a VTCR_EL2 value from the ID register values @mmfr0 and @mmfr1
 * and the IPA size @phys_shift: PS, T0SZ, the starting level, optionally HA
 * and the VMID size.
 */
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
{
	u64 vtcr = VTCR_EL2_FLAGS;
	u8 lvls;

	vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
	vtcr |= VTCR_EL2_T0SZ(phys_shift);
	/*
	 * Use a minimum 2 level page table to prevent splitting
	 * host PMD huge pages at stage2.
	 */
	lvls = stage2_pgtable_levels(phys_shift);
	if (lvls < 2)
		lvls = 2;
	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);

#ifdef CONFIG_ARM64_HW_AFDBM
	/*
	 * Enable the Hardware Access Flag management, unconditionally
	 * on all CPUs. In systems that have asymmetric support for the feature
	 * this allows KVM to leverage hardware support on the subset of cores
	 * that implement the feature.
	 *
	 * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
	 * hardware) on implementations that do not advertise support for the
	 * feature. As such, setting HA unconditionally is safe, unless you
	 * happen to be running on a design that has unadvertised support for
	 * HAFDBS. Here be dragons.
	 */
	if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
		vtcr |= VTCR_EL2_HA;
#endif /* CONFIG_ARM64_HW_AFDBM */

	/* Set the vmid bits */
	vtcr |= (get_vmid_bits(mmfr1) == 16) ?
		VTCR_EL2_VS_16BIT :
		VTCR_EL2_VS_8BIT;

	return vtcr;
}

/*
 * True when the CPUs have the ARM64_HAS_STAGE2_FWB capability and @pgt was
 * not created with KVM_PGTABLE_S2_NOFWB.
 */
static bool stage2_has_fwb(struct kvm_pgtable *pgt)
{
	if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
		return false;

	return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
}

/*
 * Invalidate stage-2 TLB entries for [@addr, @addr + @size) of @mmu.
 *
 * Without range-TLBI support the whole VMID is flushed instead. Otherwise
 * the range is invalidated in chunks of at most MAX_TLBI_RANGE_PAGES pages;
 * @size is converted to pages by shifting, so a sub-page remainder is
 * dropped.
 */
void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
				phys_addr_t addr, size_t size)
{
	unsigned long pages, inval_pages;

	if (!system_supports_tlb_range()) {
		kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
		return;
	}

	pages = size >> PAGE_SHIFT;
	while (pages > 0) {
		inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
		kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages);

		addr += inval_pages << PAGE_SHIFT;
		pages -= inval_pages;
	}
}

/* Stage-2 MemAttr encoding for @attr, chosen according to FWB use on @pgt. */
#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))

/*
 * Translate @prot into stage-2 leaf attributes for @pgt and store them in
 * *@ptep.
 *
 * Returns -EINVAL if @prot requests an executable device mapping, 0
 * otherwise.
 */
static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
				kvm_pte_t *ptep)
{
	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
	kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
			    KVM_S2_MEMATTR(pgt, NORMAL);
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;

	if (!(prot & KVM_PGTABLE_PROT_X))
		attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
	else if (device)
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_R)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
	/* Software bits of @prot are carried over as they are. */
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
	*ptep = attr;

	return 0;
}

/*
 * Recover the protection encoded in a stage-2 @pte. The software bits are
 * always returned; for an invalid pte nothing else is.
 */
enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;

	if (!kvm_pte_valid(pte))
		return prot;

	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
		prot |= KVM_PGTABLE_PROT_R;
	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
		prot |= KVM_PGTABLE_PROT_W;
	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
		prot |= KVM_PGTABLE_PROT_X;

	return prot;
}

/*
 * True if replacing @old with @new is more than a permission change:
 * either entry is invalid, or they differ in a bit outside
 * KVM_PTE_LEAF_ATTR_S2_PERMS.
 */
static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
{
	if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
		return true;

	return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
}

/* True if @pte holds a reference on its table page, i.e. it is non-zero. */
static bool stage2_pte_is_counted(kvm_pte_t pte)
{
	/*
	 * The refcount tracks valid entries as well as invalid entries if they
	 * encode ownership of a page to another entity than the page-table
	 * owner, whose id is 0.
	 */
	return !!pte;
}

/* True if @pte is invalid and has KVM_INVALID_PTE_LOCKED set. */
static bool stage2_pte_is_locked(kvm_pte_t pte)
{
	return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
}

/*
 * Write @new to the visited entry. A non-shared walk stores it directly and
 * always succeeds; a shared walk uses cmpxchg() against ctx->old and
 * succeeds only if the entry had not changed.
 */
static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	if (!kvm_pgtable_walk_shared(ctx)) {
		WRITE_ONCE(*ctx->ptep, new);
		return true;
	}

	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
}

/**
 * stage2_try_break_pte() - Invalidates a pte according to the
 *			    'break-before-make' requirements of the
 *			    architecture.
 *
 * @ctx: context of the visited pte.
 * @mmu: stage-2 mmu
 *
 * Returns: true if the pte was successfully broken.
 *
 * If the removed pte was valid, performs the necessary serialization and TLB
 * invalidation for the old value. For counted ptes, drops the reference count
 * on the containing table page.
 */
static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
				 struct kvm_s2_mmu *mmu)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (stage2_pte_is_locked(ctx->old)) {
		/*
		 * Should never occur if this walker has exclusive access to the
		 * page tables.
		 */
		WARN_ON(!kvm_pgtable_walk_shared(ctx));
		return false;
	}

	/* Replace the entry with the locked marker. */
	if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
		return false;

	if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
		/*
		 * Perform the appropriate TLB invalidation based on the
		 * evicted pte value (if any).
		 */
		if (kvm_pte_table(ctx->old, ctx->level))
			kvm_tlb_flush_vmid_range(mmu, ctx->addr,
						 kvm_granule_size(ctx->level));
		else if (kvm_pte_valid(ctx->old))
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
				     ctx->addr, ctx->level);
	}

	if (stage2_pte_is_counted(ctx->old))
		mm_ops->put_page(ctx->ptep);

	return true;
}

/*
 * Install @new into an entry that is expected to be locked (a warning is
 * raised if it is not). A reference on the containing table page is taken
 * first when @new is counted; the store uses release semantics.
 */
static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	WARN_ON(!stage2_pte_is_locked(*ctx->ptep));

	if (stage2_pte_is_counted(new))
		mm_ops->get_page(ctx->ptep);

	smp_store_release(ctx->ptep, new);
}

/* True when unmap may postpone TLB invalidation to the end of the walk. */
static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
{
	/*
	 * If FEAT_TLBIRANGE is implemented, defer the individual
	 * TLB invalidations until the entire walk is finished, and
	 * then use the range-based TLBI instructions to do the
	 * invalidations. Condition deferred TLB invalidation on the
	 * system supporting FWB as the optimization is entirely
	 * pointless when the unmap walker needs to perform CMOs.
	 */
	return system_supports_tlb_range() && stage2_has_fwb(pgt);
}

/*
 * Remove the visited entry on behalf of the unmap walker and drop its
 * reference on the containing table page. ctx->arg is expected to be the
 * struct kvm_pgtable being unmapped.
 */
static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
				struct kvm_s2_mmu *mmu,
				struct kvm_pgtable_mm_ops *mm_ops)
{
	struct kvm_pgtable *pgt = ctx->arg;

	/*
	 * Clear the existing PTE, and perform break-before-make if it was
	 * valid. Depending on the system support, defer the TLB maintenance
	 * for the same until the entire unmap walk is completed.
	 */
	if (kvm_pte_valid(ctx->old)) {
		kvm_clear_pte(ctx->ptep);

		if (!stage2_unmap_defer_tlb_flush(pgt))
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
					ctx->addr, ctx->level);
	}

	mm_ops->put_page(ctx->ptep);
}

/* True if the MemAttr field of @pte equals the NORMAL encoding for @pgt. */
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
	return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}

/* True if @pte does not have the stage-2 XN bit set. */
static bool stage2_pte_executable(kvm_pte_t pte)
{
	return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}

/*
 * Physical address to use for the entry being visited: data->phys advanced
 * by the distance walked so far, or data->phys unchanged if it is not a
 * valid PA.
 */
static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
				       const struct stage2_map_data *data)
{
	u64 phys = data->phys;

	/*
	 * Stage-2 walks to update ownership data are communicated to the map
	 * walker using an invalid PA. Avoid offsetting an already invalid PA,
	 * which could overflow and make the address valid again.
	 */
	if (!kvm_phys_is_valid(phys))
		return phys;

	/*
	 * Otherwise, work out the correct PA based on how far the walk has
	 * gotten.
	 */
	return phys + (ctx->addr - ctx->start);
}

/*
 * True if a leaf may be installed at the visited entry: never above the
 * last level when data->force_pte is set, otherwise whatever
 * kvm_block_mapping_supported() decides.
 */
static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
					struct stage2_map_data *data)
{
	u64 phys = stage2_map_walker_phys_addr(ctx, data);

	if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
		return false;

	return kvm_block_mapping_supported(ctx, phys);
}

/*
 * Try to install a leaf (or, for an invalid PA, an owner annotation) at the
 * visited entry using break-before-make.
 *
 * Returns 0 on success, -E2BIG if a leaf is not allowed at this level, or
 * -EAGAIN if no update is needed or the entry could not be broken.
 */
static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				      struct stage2_map_data *data)
{
	kvm_pte_t new;
	u64 phys = stage2_map_walker_phys_addr(ctx, data);
	u64 granule = kvm_granule_size(ctx->level);
	struct kvm_pgtable *pgt = data->mmu->pgt;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return -E2BIG;

	if (kvm_phys_is_valid(phys))
		new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	else
		new = kvm_init_invalid_leaf_owner(data->owner_id);

	/*
	 * Skip updating the PTE if we are trying to recreate the exact
	 * same mapping or only change the access permissions. Instead,
	 * the vCPU will exit one more time from guest if still needed
	 * and then go through the path of relaxing permissions.
	 */
	if (!stage2_pte_needs_update(ctx->old, new))
		return -EAGAIN;

	if (!stage2_try_break_pte(ctx, data->mmu))
		return -EAGAIN;

	/* Perform CMOs before installation of the guest stage-2 PTE */
	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
	    stage2_pte_cacheable(pgt, new))
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
					       granule);

	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
	    stage2_pte_executable(new))
		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);

	stage2_make_pte(ctx, new);

	return 0;
}

/*
 * TABLE_PRE handler: if a leaf is allowed at this table entry, replace the
 * table with the leaf and hand the detached child table to
 * mm_ops->free_unlinked_table().
 *
 * Returns 0 if nothing was done or the replacement succeeded, otherwise the
 * error from stage2_map_walker_try_leaf().
 */
static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
				     struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	/* Captured from the old entry before it is replaced below. */
	kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
	int ret;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return 0;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret)
		return ret;

	mm_ops->free_unlinked_table(childp, ctx->level);
	return 0;
}

/*
 * LEAF handler: install a leaf if possible; if the level is too coarse
 * (-E2BIG), allocate a table from data->memcache and install that instead.
 *
 * Returns 0 on success, -EINVAL (with a warning) if a leaf did not fit at
 * the last level, -ENOMEM if there is no memcache or the allocation failed,
 * -EAGAIN if the entry could not be broken, or any other result of
 * stage2_map_walker_try_leaf().
 */
static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp, new;
	int ret;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret != -E2BIG)
		return ret;

	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	if (!data->memcache)
		return -ENOMEM;

	childp = mm_ops->zalloc_page(data->memcache);
	if (!childp)
		return -ENOMEM;

	if (!stage2_try_break_pte(ctx, data->mmu)) {
		/* Give the unused table page back. */
		mm_ops->put_page(childp);
		return -EAGAIN;
	}

	/*
	 * If we've run into an existing block mapping then replace it with
	 * a table. Accesses beyond 'end' that fall within the new table
	 * will be mapped lazily.
	 */
	new = kvm_init_table_pte(childp, mm_ops);
	stage2_make_pte(ctx, new);

	return 0;
}

/*
 * The TABLE_PRE callback runs for table entries on the way down, looking
 * for table entries which we could conceivably replace with a block entry
 * for this mapping. If it finds one it replaces the entry and calls
 * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
 *
 * Otherwise, the LEAF callback performs the mapping at the existing leaves
 * instead.
 */
static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
			     enum kvm_pgtable_walk_flags visit)
{
	struct stage2_map_data *data = ctx->arg;

	switch (visit) {
	case KVM_PGTABLE_WALK_TABLE_PRE:
		return stage2_map_walk_table_pre(ctx, data);
	case KVM_PGTABLE_WALK_LEAF:
		return stage2_map_walk_leaf(ctx, data);
	default:
		return -EINVAL;
	}
}

/*
 * Map [@addr, @addr + @size) to @phys (rounded down to PAGE_SIZE) in the
 * stage-2 page-table @pgt with protection @prot. Table pages are allocated
 * from @mc and @flags are added to the walker flags.
 *
 * Returns -EINVAL (with a warning) for a non-identity mapping on an
 * identity-mapped page-table, the error from stage2_set_prot_attr() if
 * @prot is rejected, otherwise the result of the walk. dsb(ishst) is
 * issued after the walk whatever its result.
 */
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
			   u64 phys, enum kvm_pgtable_prot prot,
			   void *mc, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	struct stage2_map_data map_data = {
		.phys		= ALIGN_DOWN(phys, PAGE_SIZE),
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.force_pte	= pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= flags |
				  KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF,
		.arg		= &map_data,
	};

	if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
		return -EINVAL;

	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	return ret;
}

/*
 * Annotate [@addr, @addr + @size) in the stage-2 page-table @pgt with
 * @owner_id. This reuses the map walker with an invalid PA and page
 * granularity forced; table pages are allocated from @mc.
 *
 * Returns -EINVAL if @owner_id exceeds KVM_MAX_OWNER_ID, otherwise the
 * result of the walk.
 */
int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
				 void *mc, u8 owner_id)
{
	int ret;
	struct stage2_map_data map_data = {
		.phys		= KVM_PHYS_INVALID,
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.owner_id	= owner_id,
		.force_pte	= true,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF,
		.arg		= &map_data,
	};

	if (owner_id > KVM_MAX_OWNER_ID)
		return -EINVAL;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	return ret;
}

/*
 * LEAF / TABLE_POST visitor for stage-2 unmap. ctx->arg is the struct
 * kvm_pgtable being unmapped.
 *
 * An invalid but counted entry is cleared and its reference dropped. A
 * table entry is removed only when page_count() of the child table is 1. A
 * valid leaf is removed, and if it was cacheable and FWB is not in use its
 * range is cleaned and invalidated through
 * mm_ops->dcache_clean_inval_poc() when that callback is provided.
 *
 * Always returns 0.
 */
static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable *pgt = ctx->arg;
	struct kvm_s2_mmu *mmu = pgt->mmu;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp = NULL;
	bool need_flush = false;

	if (!kvm_pte_valid(ctx->old)) {
		if (stage2_pte_is_counted(ctx->old)) {
			kvm_clear_pte(ctx->ptep);
			mm_ops->put_page(ctx->ptep);
		}
		return 0;
	}

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		if (mm_ops->page_count(childp) != 1)
			return 0;
	} else if (stage2_pte_cacheable(pgt, ctx->old)) {
		need_flush = !stage2_has_fwb(pgt);
	}

	/*
	 * This is similar to the map() path in that we unmap the entire
	 * block entry and rely on the remaining portions being faulted
	 * back lazily.
	 */
	stage2_unmap_put_pte(ctx, mmu, mm_ops);

	if (need_flush && mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
					       kvm_granule_size(ctx->level));

	if (childp)
		mm_ops->put_page(childp);

	return 0;
}

/*
 * Unmap [@addr, @addr + @size) from the stage-2 page-table @pgt. When TLB
 * invalidation was deferred during the walk, a single range flush over the
 * whole region is issued afterwards, whatever the walk returned.
 *
 * Returns the result of the walk.
 */
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	int ret;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_unmap_walker,
		.arg	= pgt,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	if (stage2_unmap_defer_tlb_flush(pgt))
		/* Perform the deferred TLB invalidations */
		kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);

	return ret;
}

/*
 * Arguments and results for the stage-2 attribute walker: bits to set and
 * to clear, plus the entry and level recorded by the walker.
 */
struct stage2_attr_data {
	kvm_pte_t			attr_set;
	kvm_pte_t			attr_clr;
	kvm_pte_t			pte;
	u32				level;
};

static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
			      enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t pte = ctx->old;
	struct stage2_attr_data *data = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EAGAIN;

	data->level = ctx->level;
	data->pte = pte;
	pte &= ~data->attr_clr;
	pte |= data->attr_set;

	/*
	 * We may race with the CPU trying to set the access flag here,
	 * but worst-case the access flag update gets lost and will be
	 * set on the next access instead.
	 */
	if (data->pte != pte) {
		/*
		 * Invalidate instruction cache before updating the guest
		 * stage-2 PTE if we are going to add executable permission.
1181 */ 1182 if (mm_ops->icache_inval_pou && 1183 stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old)) 1184 mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), 1185 kvm_granule_size(ctx->level)); 1186 1187 if (!stage2_try_set_pte(ctx, pte)) 1188 return -EAGAIN; 1189 } 1190 1191 return 0; 1192 } 1193 1194 static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, 1195 u64 size, kvm_pte_t attr_set, 1196 kvm_pte_t attr_clr, kvm_pte_t *orig_pte, 1197 u32 *level, enum kvm_pgtable_walk_flags flags) 1198 { 1199 int ret; 1200 kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; 1201 struct stage2_attr_data data = { 1202 .attr_set = attr_set & attr_mask, 1203 .attr_clr = attr_clr & attr_mask, 1204 }; 1205 struct kvm_pgtable_walker walker = { 1206 .cb = stage2_attr_walker, 1207 .arg = &data, 1208 .flags = flags | KVM_PGTABLE_WALK_LEAF, 1209 }; 1210 1211 ret = kvm_pgtable_walk(pgt, addr, size, &walker); 1212 if (ret) 1213 return ret; 1214 1215 if (orig_pte) 1216 *orig_pte = data.pte; 1217 1218 if (level) 1219 *level = data.level; 1220 return 0; 1221 } 1222 1223 int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) 1224 { 1225 return stage2_update_leaf_attrs(pgt, addr, size, 0, 1226 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, 1227 NULL, NULL, 0); 1228 } 1229 1230 kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) 1231 { 1232 kvm_pte_t pte = 0; 1233 int ret; 1234 1235 ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, 1236 &pte, NULL, 1237 KVM_PGTABLE_WALK_HANDLE_FAULT | 1238 KVM_PGTABLE_WALK_SHARED); 1239 if (!ret) 1240 dsb(ishst); 1241 1242 return pte; 1243 } 1244 1245 struct stage2_age_data { 1246 bool mkold; 1247 bool young; 1248 }; 1249 1250 static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx, 1251 enum kvm_pgtable_walk_flags visit) 1252 { 1253 kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; 1254 struct stage2_age_data *data = ctx->arg; 1255 1256 if 
(!kvm_pte_valid(ctx->old) || new == ctx->old) 1257 return 0; 1258 1259 data->young = true; 1260 1261 /* 1262 * stage2_age_walker() is always called while holding the MMU lock for 1263 * write, so this will always succeed. Nonetheless, this deliberately 1264 * follows the race detection pattern of the other stage-2 walkers in 1265 * case the locking mechanics of the MMU notifiers is ever changed. 1266 */ 1267 if (data->mkold && !stage2_try_set_pte(ctx, new)) 1268 return -EAGAIN; 1269 1270 /* 1271 * "But where's the TLBI?!", you scream. 1272 * "Over in the core code", I sigh. 1273 * 1274 * See the '->clear_flush_young()' callback on the KVM mmu notifier. 1275 */ 1276 return 0; 1277 } 1278 1279 bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, 1280 u64 size, bool mkold) 1281 { 1282 struct stage2_age_data data = { 1283 .mkold = mkold, 1284 }; 1285 struct kvm_pgtable_walker walker = { 1286 .cb = stage2_age_walker, 1287 .arg = &data, 1288 .flags = KVM_PGTABLE_WALK_LEAF, 1289 }; 1290 1291 WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); 1292 return data.young; 1293 } 1294 1295 int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, 1296 enum kvm_pgtable_prot prot) 1297 { 1298 int ret; 1299 u32 level; 1300 kvm_pte_t set = 0, clr = 0; 1301 1302 if (prot & KVM_PTE_LEAF_ATTR_HI_SW) 1303 return -EINVAL; 1304 1305 if (prot & KVM_PGTABLE_PROT_R) 1306 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; 1307 1308 if (prot & KVM_PGTABLE_PROT_W) 1309 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; 1310 1311 if (prot & KVM_PGTABLE_PROT_X) 1312 clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; 1313 1314 ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, 1315 KVM_PGTABLE_WALK_HANDLE_FAULT | 1316 KVM_PGTABLE_WALK_SHARED); 1317 if (!ret) 1318 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); 1319 return ret; 1320 } 1321 1322 static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, 1323 enum kvm_pgtable_walk_flags visit) 1324 { 1325 
struct kvm_pgtable *pgt = ctx->arg; 1326 struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; 1327 1328 if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old)) 1329 return 0; 1330 1331 if (mm_ops->dcache_clean_inval_poc) 1332 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), 1333 kvm_granule_size(ctx->level)); 1334 return 0; 1335 } 1336 1337 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) 1338 { 1339 struct kvm_pgtable_walker walker = { 1340 .cb = stage2_flush_walker, 1341 .flags = KVM_PGTABLE_WALK_LEAF, 1342 .arg = pgt, 1343 }; 1344 1345 if (stage2_has_fwb(pgt)) 1346 return 0; 1347 1348 return kvm_pgtable_walk(pgt, addr, size, &walker); 1349 } 1350 1351 kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, 1352 u64 phys, u32 level, 1353 enum kvm_pgtable_prot prot, 1354 void *mc, bool force_pte) 1355 { 1356 struct stage2_map_data map_data = { 1357 .phys = phys, 1358 .mmu = pgt->mmu, 1359 .memcache = mc, 1360 .force_pte = force_pte, 1361 }; 1362 struct kvm_pgtable_walker walker = { 1363 .cb = stage2_map_walker, 1364 .flags = KVM_PGTABLE_WALK_LEAF | 1365 KVM_PGTABLE_WALK_SKIP_BBM_TLBI | 1366 KVM_PGTABLE_WALK_SKIP_CMO, 1367 .arg = &map_data, 1368 }; 1369 /* 1370 * The input address (.addr) is irrelevant for walking an 1371 * unlinked table. Construct an ambiguous IA range to map 1372 * kvm_granule_size(level) worth of memory. 
1373 */ 1374 struct kvm_pgtable_walk_data data = { 1375 .walker = &walker, 1376 .addr = 0, 1377 .end = kvm_granule_size(level), 1378 }; 1379 struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; 1380 kvm_pte_t *pgtable; 1381 int ret; 1382 1383 if (!IS_ALIGNED(phys, kvm_granule_size(level))) 1384 return ERR_PTR(-EINVAL); 1385 1386 ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); 1387 if (ret) 1388 return ERR_PTR(ret); 1389 1390 pgtable = mm_ops->zalloc_page(mc); 1391 if (!pgtable) 1392 return ERR_PTR(-ENOMEM); 1393 1394 ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, 1395 level + 1); 1396 if (ret) { 1397 kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); 1398 mm_ops->put_page(pgtable); 1399 return ERR_PTR(ret); 1400 } 1401 1402 return pgtable; 1403 } 1404 1405 /* 1406 * Get the number of page-tables needed to replace a block with a 1407 * fully populated tree up to the PTE entries. Note that @level is 1408 * interpreted as in "level @level entry". 1409 */ 1410 static int stage2_block_get_nr_page_tables(u32 level) 1411 { 1412 switch (level) { 1413 case 1: 1414 return PTRS_PER_PTE + 1; 1415 case 2: 1416 return 1; 1417 case 3: 1418 return 0; 1419 default: 1420 WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || 1421 level >= KVM_PGTABLE_MAX_LEVELS); 1422 return -EINVAL; 1423 }; 1424 } 1425 1426 static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, 1427 enum kvm_pgtable_walk_flags visit) 1428 { 1429 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; 1430 struct kvm_mmu_memory_cache *mc = ctx->arg; 1431 struct kvm_s2_mmu *mmu; 1432 kvm_pte_t pte = ctx->old, new, *childp; 1433 enum kvm_pgtable_prot prot; 1434 u32 level = ctx->level; 1435 bool force_pte; 1436 int nr_pages; 1437 u64 phys; 1438 1439 /* No huge-pages exist at the last level */ 1440 if (level == KVM_PGTABLE_MAX_LEVELS - 1) 1441 return 0; 1442 1443 /* We only split valid block mappings */ 1444 if (!kvm_pte_valid(pte)) 1445 return 0; 1446 1447 nr_pages = 
stage2_block_get_nr_page_tables(level); 1448 if (nr_pages < 0) 1449 return nr_pages; 1450 1451 if (mc->nobjs >= nr_pages) { 1452 /* Build a tree mapped down to the PTE granularity. */ 1453 force_pte = true; 1454 } else { 1455 /* 1456 * Don't force PTEs, so create_unlinked() below does 1457 * not populate the tree up to the PTE level. The 1458 * consequence is that the call will require a single 1459 * page of level 2 entries at level 1, or a single 1460 * page of PTEs at level 2. If we are at level 1, the 1461 * PTEs will be created recursively. 1462 */ 1463 force_pte = false; 1464 nr_pages = 1; 1465 } 1466 1467 if (mc->nobjs < nr_pages) 1468 return -ENOMEM; 1469 1470 mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); 1471 phys = kvm_pte_to_phys(pte); 1472 prot = kvm_pgtable_stage2_pte_prot(pte); 1473 1474 childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, 1475 level, prot, mc, force_pte); 1476 if (IS_ERR(childp)) 1477 return PTR_ERR(childp); 1478 1479 if (!stage2_try_break_pte(ctx, mmu)) { 1480 kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); 1481 mm_ops->put_page(childp); 1482 return -EAGAIN; 1483 } 1484 1485 /* 1486 * Note, the contents of the page table are guaranteed to be made 1487 * visible before the new PTE is assigned because stage2_make_pte() 1488 * writes the PTE using smp_store_release(). 
1489 */ 1490 new = kvm_init_table_pte(childp, mm_ops); 1491 stage2_make_pte(ctx, new); 1492 dsb(ishst); 1493 return 0; 1494 } 1495 1496 int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, 1497 struct kvm_mmu_memory_cache *mc) 1498 { 1499 struct kvm_pgtable_walker walker = { 1500 .cb = stage2_split_walker, 1501 .flags = KVM_PGTABLE_WALK_LEAF, 1502 .arg = mc, 1503 }; 1504 1505 return kvm_pgtable_walk(pgt, addr, size, &walker); 1506 } 1507 1508 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, 1509 struct kvm_pgtable_mm_ops *mm_ops, 1510 enum kvm_pgtable_stage2_flags flags, 1511 kvm_pgtable_force_pte_cb_t force_pte_cb) 1512 { 1513 size_t pgd_sz; 1514 u64 vtcr = mmu->arch->vtcr; 1515 u32 ia_bits = VTCR_EL2_IPA(vtcr); 1516 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); 1517 u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; 1518 1519 pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; 1520 pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); 1521 if (!pgt->pgd) 1522 return -ENOMEM; 1523 1524 pgt->ia_bits = ia_bits; 1525 pgt->start_level = start_level; 1526 pgt->mm_ops = mm_ops; 1527 pgt->mmu = mmu; 1528 pgt->flags = flags; 1529 pgt->force_pte_cb = force_pte_cb; 1530 1531 /* Ensure zeroed PGD pages are visible to the hardware walker */ 1532 dsb(ishst); 1533 return 0; 1534 } 1535 1536 size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) 1537 { 1538 u32 ia_bits = VTCR_EL2_IPA(vtcr); 1539 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); 1540 u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; 1541 1542 return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; 1543 } 1544 1545 static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, 1546 enum kvm_pgtable_walk_flags visit) 1547 { 1548 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; 1549 1550 if (!stage2_pte_is_counted(ctx->old)) 1551 return 0; 1552 1553 mm_ops->put_page(ctx->ptep); 1554 1555 if (kvm_pte_table(ctx->old, ctx->level)) 1556 
mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); 1557 1558 return 0; 1559 } 1560 1561 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) 1562 { 1563 size_t pgd_sz; 1564 struct kvm_pgtable_walker walker = { 1565 .cb = stage2_free_walker, 1566 .flags = KVM_PGTABLE_WALK_LEAF | 1567 KVM_PGTABLE_WALK_TABLE_POST, 1568 }; 1569 1570 WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); 1571 pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; 1572 pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); 1573 pgt->pgd = NULL; 1574 } 1575 1576 void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) 1577 { 1578 kvm_pteref_t ptep = (kvm_pteref_t)pgtable; 1579 struct kvm_pgtable_walker walker = { 1580 .cb = stage2_free_walker, 1581 .flags = KVM_PGTABLE_WALK_LEAF | 1582 KVM_PGTABLE_WALK_TABLE_POST, 1583 }; 1584 struct kvm_pgtable_walk_data data = { 1585 .walker = &walker, 1586 1587 /* 1588 * At this point the IPA really doesn't matter, as the page 1589 * table being traversed has already been removed from the stage 1590 * 2. Set an appropriate range to cover the entire page table. 1591 */ 1592 .addr = 0, 1593 .end = kvm_granule_size(level), 1594 }; 1595 1596 WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1)); 1597 1598 WARN_ON(mm_ops->page_count(pgtable) != 1); 1599 mm_ops->put_page(pgtable); 1600 } 1601