// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
 * No bombay mix was harmed in the writing of this file.
 *
 * Copyright (C) 2020 Google LLC
 * Author: Will Deacon <will@kernel.org>
 */

#include <linux/bitfield.h>
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>


/*
 * Descriptor type field and its encodings. PAGE and TABLE share the same
 * encoding; kvm_pte_table() and kvm_init_valid_leaf_pte() tell them apart
 * by the level at which the entry sits.
 */
#define KVM_PTE_TYPE			BIT(1)
#define KVM_PTE_TYPE_BLOCK		0
#define KVM_PTE_TYPE_PAGE		1
#define KVM_PTE_TYPE_TABLE		1

/* Lower attribute bits that kvm_init_valid_leaf_pte() copies into a leaf. */
#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)

/*
 * Stage-1 lower attribute fields. The AP_RO/AP_RW encodings are selected at
 * runtime depending on the ARM64_KVM_HVHE capability.
 */
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO		\
	({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW		\
	({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)

/* Stage-2 lower attribute fields. */
#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)

/* Upper attribute bits that kvm_init_valid_leaf_pte() copies into a leaf. */
#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 50)

/* Bits carried verbatim between a kvm_pgtable_prot value and the pte. */
#define KVM_PTE_LEAF_ATTR_HI_SW		GENMASK(58, 55)

#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S1_GP	BIT(50)

/*
 * Stage-2 permission bits. stage2_pte_needs_update() ignores differences
 * confined to these bits when both the old and new ptes are valid.
 */
#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
					 KVM_PTE_LEAF_ATTR_HI_S2_XN)

/*
 * Field of an invalid pte that holds an owner id (see
 * kvm_init_invalid_leaf_owner()), and the largest id accepted by
 * kvm_pgtable_stage2_set_owner().
 */
#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(9, 2)
#define KVM_MAX_OWNER_ID		1

/*
 * Used to indicate a pte for which a 'break-before-make' sequence is in
 * progress.
59 */ 60 #define KVM_INVALID_PTE_LOCKED BIT(10) 61 62 struct kvm_pgtable_walk_data { 63 struct kvm_pgtable_walker *walker; 64 65 const u64 start; 66 u64 addr; 67 const u64 end; 68 }; 69 70 static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) 71 { 72 return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); 73 } 74 75 static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) 76 { 77 return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); 78 } 79 80 static bool kvm_phys_is_valid(u64 phys) 81 { 82 return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); 83 } 84 85 static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) 86 { 87 u64 granule = kvm_granule_size(ctx->level); 88 89 if (!kvm_level_supports_block_mapping(ctx->level)) 90 return false; 91 92 if (granule > (ctx->end - ctx->addr)) 93 return false; 94 95 if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) 96 return false; 97 98 return IS_ALIGNED(ctx->addr, granule); 99 } 100 101 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) 102 { 103 u64 shift = kvm_granule_shift(level); 104 u64 mask = BIT(PAGE_SHIFT - 3) - 1; 105 106 return (data->addr >> shift) & mask; 107 } 108 109 static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) 110 { 111 u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ 112 u64 mask = BIT(pgt->ia_bits) - 1; 113 114 return (addr & mask) >> shift; 115 } 116 117 static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) 118 { 119 struct kvm_pgtable pgt = { 120 .ia_bits = ia_bits, 121 .start_level = start_level, 122 }; 123 124 return kvm_pgd_page_idx(&pgt, -1ULL) + 1; 125 } 126 127 static bool kvm_pte_table(kvm_pte_t pte, u32 level) 128 { 129 if (level == KVM_PGTABLE_MAX_LEVELS - 1) 130 return false; 131 132 if (!kvm_pte_valid(pte)) 133 return false; 134 135 return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; 136 } 137 
138 static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) 139 { 140 return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); 141 } 142 143 static void kvm_clear_pte(kvm_pte_t *ptep) 144 { 145 WRITE_ONCE(*ptep, 0); 146 } 147 148 static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops) 149 { 150 kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); 151 152 pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); 153 pte |= KVM_PTE_VALID; 154 return pte; 155 } 156 157 static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) 158 { 159 kvm_pte_t pte = kvm_phys_to_pte(pa); 160 u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : 161 KVM_PTE_TYPE_BLOCK; 162 163 pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); 164 pte |= FIELD_PREP(KVM_PTE_TYPE, type); 165 pte |= KVM_PTE_VALID; 166 167 return pte; 168 } 169 170 static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) 171 { 172 return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); 173 } 174 175 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, 176 const struct kvm_pgtable_visit_ctx *ctx, 177 enum kvm_pgtable_walk_flags visit) 178 { 179 struct kvm_pgtable_walker *walker = data->walker; 180 181 /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */ 182 WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held()); 183 return walker->cb(ctx, visit); 184 } 185 186 static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, 187 int r) 188 { 189 /* 190 * Visitor callbacks return EAGAIN when the conditions that led to a 191 * fault are no longer reflected in the page tables due to a race to 192 * update a PTE. In the context of a fault handler this is interpreted 193 * as a signal to retry guest execution. 194 * 195 * Ignore the return code altogether for walkers outside a fault handler 196 * (e.g. 
write protecting a range of memory) and chug along with the
	 * page table walk.
	 */
	if (r == -EAGAIN)
		return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);

	/* Any other value: keep walking only on success. */
	return !r;
}

/* Declared early because __kvm_pgtable_visit() recurses into it. */
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);

/*
 * Visit a single entry: run the TABLE_PRE or LEAF callback as requested by
 * the walker flags, then either step past a leaf or descend into the child
 * table and finally run the TABLE_POST callback. Returns 0 when
 * kvm_pgtable_walk_continue() says the walk may go on, otherwise the
 * failing return code.
 */
static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
				      struct kvm_pgtable_mm_ops *mm_ops,
				      kvm_pteref_t pteref, u32 level)
{
	enum kvm_pgtable_walk_flags flags = data->walker->flags;
	kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
	/* Snapshot of the entry and walk state handed to the callbacks. */
	struct kvm_pgtable_visit_ctx ctx = {
		.ptep	= ptep,
		.old	= READ_ONCE(*ptep),
		.arg	= data->walker->arg,
		.mm_ops	= mm_ops,
		.start	= data->start,
		.addr	= data->addr,
		.end	= data->end,
		.level	= level,
		.flags	= flags,
	};
	int ret = 0;
	bool reload = false;
	kvm_pteref_t childp;
	bool table = kvm_pte_table(ctx.old, level);

	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
		reload = true;
	}

	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
		reload = true;
	}

	/*
	 * Reload the page table after invoking the walker callback for leaf
	 * entries or after pre-order traversal, to allow the walker to descend
	 * into a newly installed or replaced table.
*/
	if (reload) {
		ctx.old = READ_ONCE(*ptep);
		table = kvm_pte_table(ctx.old, level);
	}

	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (!table) {
		/* Leaf (or invalid) entry: advance to the next granule boundary. */
		data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
		data->addr += kvm_granule_size(level);
		goto out;
	}

	childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
	ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);

out:
	/* Swallow return codes that kvm_pgtable_walk_continue() tolerates. */
	if (kvm_pgtable_walk_continue(data->walker, ret))
		return 0;

	return ret;
}

/*
 * Walk one table page at @level, starting from the entry that covers
 * data->addr and stopping at the end of the table, at data->end, or on the
 * first non-zero return from __kvm_pgtable_visit().
 */
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level)
{
	u32 idx;
	int ret = 0;

	if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
		return -EINVAL;

	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
		kvm_pteref_t pteref = &pgtable[idx];

		if (data->addr >= data->end)
			break;

		ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
		if (ret)
			break;
	}

	return ret;
}

/*
 * Walk the pgd pages covering [data->addr, data->end). Returns -ERANGE if
 * the range exceeds BIT(ia_bits) and -EINVAL if there is no pgd.
 */
static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
{
	u32 idx;
	int ret = 0;
	u64 limit = BIT(pgt->ia_bits);

	if (data->addr > limit || data->end > limit)
		return -ERANGE;

	if (!pgt->pgd)
		return -EINVAL;

	/* Each iteration handles one pgd page; data->addr advances inside. */
	for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
		kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];

		ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
		if (ret)
			break;
	}

	return ret;
}

/*
 * Run @walker over the page-aligned range covering [addr, addr + size),
 * bracketed by kvm_pgtable_walk_begin() and kvm_pgtable_walk_end().
 */
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
		     struct kvm_pgtable_walker *walker)
{
struct kvm_pgtable_walk_data walk_data = { 324 .start = ALIGN_DOWN(addr, PAGE_SIZE), 325 .addr = ALIGN_DOWN(addr, PAGE_SIZE), 326 .end = PAGE_ALIGN(walk_data.addr + size), 327 .walker = walker, 328 }; 329 int r; 330 331 r = kvm_pgtable_walk_begin(walker); 332 if (r) 333 return r; 334 335 r = _kvm_pgtable_walk(pgt, &walk_data); 336 kvm_pgtable_walk_end(walker); 337 338 return r; 339 } 340 341 struct leaf_walk_data { 342 kvm_pte_t pte; 343 u32 level; 344 }; 345 346 static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, 347 enum kvm_pgtable_walk_flags visit) 348 { 349 struct leaf_walk_data *data = ctx->arg; 350 351 data->pte = ctx->old; 352 data->level = ctx->level; 353 354 return 0; 355 } 356 357 int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, 358 kvm_pte_t *ptep, u32 *level) 359 { 360 struct leaf_walk_data data; 361 struct kvm_pgtable_walker walker = { 362 .cb = leaf_walker, 363 .flags = KVM_PGTABLE_WALK_LEAF, 364 .arg = &data, 365 }; 366 int ret; 367 368 ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE), 369 PAGE_SIZE, &walker); 370 if (!ret) { 371 if (ptep) 372 *ptep = data.pte; 373 if (level) 374 *level = data.level; 375 } 376 377 return ret; 378 } 379 380 struct hyp_map_data { 381 const u64 phys; 382 kvm_pte_t attr; 383 }; 384 385 static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) 386 { 387 bool device = prot & KVM_PGTABLE_PROT_DEVICE; 388 u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; 389 kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); 390 u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; 391 u32 ap = (prot & KVM_PGTABLE_PROT_W) ? 
KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
						 KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;

	/* Mappings without read permission are rejected. */
	if (!(prot & KVM_PGTABLE_PROT_R))
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_X) {
		/* Executable mappings may be neither writable nor device. */
		if (prot & KVM_PGTABLE_PROT_W)
			return -EINVAL;

		if (device)
			return -EINVAL;

		if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
			attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
	} else {
		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
	}

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
	/* Software bits of @prot are passed through unchanged. */
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
	*ptep = attr;

	return 0;
}

/*
 * Decode the protection encoded in a hyp stage-1 pte. For an invalid pte
 * only the software bits are returned.
 */
enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
	u32 ap;

	if (!kvm_pte_valid(pte))
		return prot;

	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
		prot |= KVM_PGTABLE_PROT_X;

	ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
	if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
		prot |= KVM_PGTABLE_PROT_R;
	else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
		prot |= KVM_PGTABLE_PROT_RW;

	return prot;
}

/*
 * Try to install a leaf mapping at the visited entry. Returns false when a
 * leaf cannot be used at this level, or when a valid entry would change in
 * anything other than its software bits (which also triggers a warning).
 */
static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				    struct hyp_map_data *data)
{
	/* PA for this entry: base PA plus the distance walked so far. */
	u64 phys = data->phys + (ctx->addr - ctx->start);
	kvm_pte_t new;

	if (!kvm_block_mapping_supported(ctx, phys))
		return false;

	new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	if (ctx->old == new)
		return true;
	/* A previously invalid entry gains a reference on its table page. */
	if (!kvm_pte_valid(ctx->old))
		ctx->mm_ops->get_page(ctx->ptep);
	else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
		return false;

	smp_store_release(ctx->ptep, new);
	return true;
}

/*
 * Leaf callback for kvm_pgtable_hyp_map(): install a leaf if possible,
 * otherwise allocate a child table and link it in so the walk can descend.
 */
static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
			  enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp, new;
	struct hyp_map_data *data = ctx->arg;
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (hyp_map_walker_try_leaf(ctx, data))
		return 0;

	/* No leaf was possible and there is no further level to descend to. */
	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
	if (!childp)
		return -ENOMEM;

	/* Take a reference on the parent table page, then publish the link. */
	new = kvm_init_table_pte(childp, mm_ops);
	mm_ops->get_page(ctx->ptep);
	smp_store_release(ctx->ptep, new);

	return 0;
}

/*
 * Map [addr, addr + size) to @phys (rounded down to a page boundary) in the
 * hyp page-table with protection @prot. Returns an error from
 * hyp_set_prot_attr() if @prot is rejected, otherwise the result of the
 * walk; barriers are issued after the walk regardless of its result.
 */
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
			enum kvm_pgtable_prot prot)
{
	int ret;
	struct hyp_map_data map_data = {
		.phys	= ALIGN_DOWN(phys, PAGE_SIZE),
	};
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_map_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &map_data,
	};

	ret = hyp_set_prot_attr(prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	isb();
	return ret;
}

/*
 * Callback for kvm_pgtable_hyp_unmap(), run on leaves and on tables in
 * post-order. Invalid entries abort the walk with -EINVAL. A table entry is
 * only removed once its child page's count has dropped to 1. A leaf is only
 * removed if the remaining range covers its whole granule, in which case
 * the granule size is added to the running total in ctx->arg.
 */
static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			    enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp = NULL;
	u64 granule = kvm_granule_size(ctx->level);
	u64 *unmapped = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EINVAL;

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		/* Child table still holds entries: leave it in place. */
		if (mm_ops->page_count(childp) != 1)
			return 0;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), 0);
	} else {
		if (ctx->end - ctx->addr < granule)
			return -EINVAL;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
		*unmapped += granule;
	}

	dsb(ish);
	isb();
	/* Drop the reference the cleared entry held on its table page. */
	mm_ops->put_page(ctx->ptep);

	if (childp)
		mm_ops->put_page(childp);

	return 0;
}

u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable
*pgt, u64 addr, u64 size)
{
	u64 unmapped = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_unmap_walker,
		.arg	= &unmapped,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	/* hyp_unmap_walker() relies on page_count(); without it, do nothing. */
	if (!pgt->mm_ops->page_count)
		return 0;

	/* The walk's return code is ignored; the byte count is the result. */
	kvm_pgtable_walk(pgt, addr, size, &walker);
	return unmapped;
}

/*
 * Initialise @pgt as a hyp page-table for @va_bits of address space,
 * allocating a single zeroed pgd page through @mm_ops. Returns -ENOMEM if
 * that allocation fails.
 */
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
			 struct kvm_pgtable_mm_ops *mm_ops)
{
	u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);

	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= va_bits;
	pgt->start_level	= KVM_PGTABLE_MAX_LEVELS - levels;
	pgt->mm_ops		= mm_ops;
	pgt->mmu		= NULL;
	pgt->force_pte_cb	= NULL;

	return 0;
}

/*
 * Callback for kvm_pgtable_hyp_destroy(): for every valid entry, drop the
 * reference it holds on its table page and, for table entries, the
 * reference on the child table as well.
 */
static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
			   enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return 0;

	mm_ops->put_page(ctx->ptep);

	if (kvm_pte_table(ctx->old, ctx->level))
		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));

	return 0;
}

/*
 * Tear down a hyp page-table: walk the whole input address range with
 * hyp_free_walker() (warning on failure), then release the pgd page and
 * clear pgt->pgd.
 */
void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
{
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
	pgt->pgd = NULL;
}

/* Arguments shared by the stage-2 map and set-owner walkers. */
struct stage2_map_data {
	/* Base PA of the mapping, or an invalid PA for ownership updates */
	const u64			phys;
	kvm_pte_t			attr;
	u8				owner_id;

	kvm_pte_t			*anchor;
	kvm_pte_t			*childp;

	struct kvm_s2_mmu		*mmu;
	/* Passed to mm_ops->zalloc_page() when a new table is needed */
	void				*memcache;

	/* Force mappings to page granularity */
	bool				force_pte;
};

u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
{
	u64 vtcr = VTCR_EL2_FLAGS;
	u8 lvls;

	vtcr |= kvm_get_parange(mmfr0) <<
VTCR_EL2_PS_SHIFT;
	/* T0SZ is derived from phys_shift. */
	vtcr |= VTCR_EL2_T0SZ(phys_shift);
	/*
	 * Use a minimum 2 level page table to prevent splitting
	 * host PMD huge pages at stage2.
	 */
	lvls = stage2_pgtable_levels(phys_shift);
	if (lvls < 2)
		lvls = 2;
	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);

#ifdef CONFIG_ARM64_HW_AFDBM
	/*
	 * Enable the Hardware Access Flag management, unconditionally
	 * on all CPUs. In systems that have asymmetric support for the feature
	 * this allows KVM to leverage hardware support on the subset of cores
	 * that implement the feature.
	 *
	 * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
	 * hardware) on implementations that do not advertise support for the
	 * feature. As such, setting HA unconditionally is safe, unless you
	 * happen to be running on a design that has unadvertised support for
	 * HAFDBS. Here be dragons.
	 */
	/* HA is left clear when the AMPERE_AC03_CPU_38 workaround is active. */
	if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
		vtcr |= VTCR_EL2_HA;
#endif /* CONFIG_ARM64_HW_AFDBM */

	/* Set the vmid bits */
	vtcr |= (get_vmid_bits(mmfr1) == 16) ?
659 VTCR_EL2_VS_16BIT : 660 VTCR_EL2_VS_8BIT; 661 662 return vtcr; 663 } 664 665 static bool stage2_has_fwb(struct kvm_pgtable *pgt) 666 { 667 if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) 668 return false; 669 670 return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); 671 } 672 673 void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, 674 phys_addr_t addr, size_t size) 675 { 676 unsigned long pages, inval_pages; 677 678 if (!system_supports_tlb_range()) { 679 kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); 680 return; 681 } 682 683 pages = size >> PAGE_SHIFT; 684 while (pages > 0) { 685 inval_pages = min(pages, MAX_TLBI_RANGE_PAGES); 686 kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages); 687 688 addr += inval_pages << PAGE_SHIFT; 689 pages -= inval_pages; 690 } 691 } 692 693 #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) 694 695 static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, 696 kvm_pte_t *ptep) 697 { 698 bool device = prot & KVM_PGTABLE_PROT_DEVICE; 699 kvm_pte_t attr = device ? 
KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : 700 KVM_S2_MEMATTR(pgt, NORMAL); 701 u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; 702 703 if (!(prot & KVM_PGTABLE_PROT_X)) 704 attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; 705 else if (device) 706 return -EINVAL; 707 708 if (prot & KVM_PGTABLE_PROT_R) 709 attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; 710 711 if (prot & KVM_PGTABLE_PROT_W) 712 attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; 713 714 attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); 715 attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; 716 attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; 717 *ptep = attr; 718 719 return 0; 720 } 721 722 enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) 723 { 724 enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; 725 726 if (!kvm_pte_valid(pte)) 727 return prot; 728 729 if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R) 730 prot |= KVM_PGTABLE_PROT_R; 731 if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) 732 prot |= KVM_PGTABLE_PROT_W; 733 if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) 734 prot |= KVM_PGTABLE_PROT_X; 735 736 return prot; 737 } 738 739 static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) 740 { 741 if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) 742 return true; 743 744 return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)); 745 } 746 747 static bool stage2_pte_is_counted(kvm_pte_t pte) 748 { 749 /* 750 * The refcount tracks valid entries as well as invalid entries if they 751 * encode ownership of a page to another entity than the page-table 752 * owner, whose id is 0. 
*/
	return !!pte;
}

/* A locked pte is an invalid pte with KVM_INVALID_PTE_LOCKED set. */
static bool stage2_pte_is_locked(kvm_pte_t pte)
{
	return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
}

/*
 * Write @new to the visited entry. A non-shared walk stores it
 * unconditionally; a shared walk uses cmpxchg() against ctx->old and
 * reports failure if the entry has changed since it was read.
 */
static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	if (!kvm_pgtable_walk_shared(ctx)) {
		WRITE_ONCE(*ctx->ptep, new);
		return true;
	}

	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
}

/**
 * stage2_try_break_pte() - Invalidates a pte according to the
 *			    'break-before-make' requirements of the
 *			    architecture.
 *
 * @ctx: context of the visited pte.
 * @mmu: stage-2 mmu
 *
 * Returns: true if the pte was successfully broken.
 *
 * If the removed pte was valid, performs the necessary serialization and TLB
 * invalidation for the old value. For counted ptes, drops the reference count
 * on the containing table page.
 */
static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
				 struct kvm_s2_mmu *mmu)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (stage2_pte_is_locked(ctx->old)) {
		/*
		 * Should never occur if this walker has exclusive access to the
		 * page tables.
		 */
		WARN_ON(!kvm_pgtable_walk_shared(ctx));
		return false;
	}

	/* Replace the entry with the locked marker; bail out if we lost a race. */
	if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
		return false;

	if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
		/*
		 * Perform the appropriate TLB invalidation based on the
		 * evicted pte value (if any).
*/
		if (kvm_pte_table(ctx->old, ctx->level)) {
			/* Table entry: flush the whole granule-aligned range it covered. */
			u64 size = kvm_granule_size(ctx->level);
			u64 addr = ALIGN_DOWN(ctx->addr, size);

			kvm_tlb_flush_vmid_range(mmu, addr, size);
		} else if (kvm_pte_valid(ctx->old)) {
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
				     ctx->addr, ctx->level);
		}
	}

	if (stage2_pte_is_counted(ctx->old))
		mm_ops->put_page(ctx->ptep);

	return true;
}

/*
 * Install @new over an entry that is expected to be locked (a warning is
 * raised otherwise). A counted @new takes a reference on the table page
 * before the entry is published with a release store.
 */
static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	WARN_ON(!stage2_pte_is_locked(*ctx->ptep));

	if (stage2_pte_is_counted(new))
		mm_ops->get_page(ctx->ptep);

	smp_store_release(ctx->ptep, new);
}

/* Whether the unmap path may batch TLB invalidation until the walk ends. */
static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
{
	/*
	 * If FEAT_TLBIRANGE is implemented, defer the individual
	 * TLB invalidations until the entire walk is finished, and
	 * then use the range-based TLBI instructions to do the
	 * invalidations. Condition deferred TLB invalidation on the
	 * system supporting FWB as the optimization is entirely
	 * pointless when the unmap walker needs to perform CMOs.
	 */
	return system_supports_tlb_range() && stage2_has_fwb(pgt);
}

/*
 * Clear the visited entry on behalf of the unmap walker and drop the
 * reference it held on its table page. ctx->arg is the kvm_pgtable.
 */
static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
				struct kvm_s2_mmu *mmu,
				struct kvm_pgtable_mm_ops *mm_ops)
{
	struct kvm_pgtable *pgt = ctx->arg;

	/*
	 * Clear the existing PTE, and perform break-before-make if it was
	 * valid. Depending on the system support, defer the TLB maintenance
	 * for the same until the entire unmap walk is completed.
*/
	if (kvm_pte_valid(ctx->old)) {
		kvm_clear_pte(ctx->ptep);

		/* Table entries are always invalidated immediately, never deferred. */
		if (kvm_pte_table(ctx->old, ctx->level)) {
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
				     0);
		} else if (!stage2_unmap_defer_tlb_flush(pgt)) {
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
				     ctx->level);
		}
	}

	mm_ops->put_page(ctx->ptep);
}

/* True if the pte's MEMATTR field equals the NORMAL encoding for @pgt. */
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
	return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}

/* True if the stage-2 XN bit is clear in @pte. */
static bool stage2_pte_executable(kvm_pte_t pte)
{
	return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}

/*
 * Physical address the map walker should use for the entry being visited:
 * data->phys offset by the distance walked, or data->phys unchanged when it
 * is not a valid PA.
 */
static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
				       const struct stage2_map_data *data)
{
	u64 phys = data->phys;

	/*
	 * Stage-2 walks to update ownership data are communicated to the map
	 * walker using an invalid PA. Avoid offsetting an already invalid PA,
	 * which could overflow and make the address valid again.
	 */
	if (!kvm_phys_is_valid(phys))
		return phys;

	/*
	 * Otherwise, work out the correct PA based on how far the walk has
	 * gotten.
*/
	return phys + (ctx->addr - ctx->start);
}

/*
 * Whether a leaf may be installed at the visited entry: never above the
 * last level when force_pte is set, otherwise whatever
 * kvm_block_mapping_supported() decides.
 */
static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
					struct stage2_map_data *data)
{
	u64 phys = stage2_map_walker_phys_addr(ctx, data);

	if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
		return false;

	return kvm_block_mapping_supported(ctx, phys);
}

/*
 * Try to install a leaf (or an owner annotation, when the PA is invalid) at
 * the visited entry. Returns -E2BIG if a leaf is not allowed here, -EAGAIN
 * if no update is needed or the entry could not be broken, and 0 once the
 * new pte has been installed.
 */
static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				      struct stage2_map_data *data)
{
	kvm_pte_t new;
	u64 phys = stage2_map_walker_phys_addr(ctx, data);
	u64 granule = kvm_granule_size(ctx->level);
	struct kvm_pgtable *pgt = data->mmu->pgt;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return -E2BIG;

	if (kvm_phys_is_valid(phys))
		new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	else
		new = kvm_init_invalid_leaf_owner(data->owner_id);

	/*
	 * Skip updating the PTE if we are trying to recreate the exact
	 * same mapping or only change the access permissions. Instead,
	 * the vCPU will exit one more time from guest if still needed
	 * and then go through the path of relaxing permissions.
*/
	if (!stage2_pte_needs_update(ctx->old, new))
		return -EAGAIN;

	if (!stage2_try_break_pte(ctx, data->mmu))
		return -EAGAIN;

	/* Perform CMOs before installation of the guest stage-2 PTE */
	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
	    stage2_pte_cacheable(pgt, new))
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
					       granule);

	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
	    stage2_pte_executable(new))
		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);

	stage2_make_pte(ctx, new);

	return 0;
}

/*
 * TABLE_PRE handler: if the range allows a leaf at this level, replace the
 * table entry with that leaf and free the now-unlinked child table.
 * Otherwise return 0 so the walk descends into the existing table.
 */
static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
				     struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	/* Remember the child before the entry pointing at it is replaced. */
	kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
	int ret;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return 0;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret)
		return ret;

	mm_ops->free_unlinked_table(childp, ctx->level);
	return 0;
}

/*
 * LEAF handler: install a leaf if possible; if the mapping is too big for
 * this level (-E2BIG), allocate a child table from the memcache and link it
 * in so the walk can descend.
 */
static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp, new;
	int ret;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret != -E2BIG)
		return ret;

	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	if (!data->memcache)
		return -ENOMEM;

	childp = mm_ops->zalloc_page(data->memcache);
	if (!childp)
		return -ENOMEM;

	/* Could not break the entry: release the unused child and retry later. */
	if (!stage2_try_break_pte(ctx, data->mmu)) {
		mm_ops->put_page(childp);
		return -EAGAIN;
	}

	/*
	 * If we've run into an existing block mapping then replace it with
	 * a table. Accesses beyond 'end' that fall within the new table
	 * will be mapped lazily.
1010 */ 1011 new = kvm_init_table_pte(childp, mm_ops); 1012 stage2_make_pte(ctx, new); 1013 1014 return 0; 1015 } 1016 1017 /* 1018 * The TABLE_PRE callback runs for table entries on the way down, looking 1019 * for table entries which we could conceivably replace with a block entry 1020 * for this mapping. If it finds one it replaces the entry and calls 1021 * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. 1022 * 1023 * Otherwise, the LEAF callback performs the mapping at the existing leaves 1024 * instead. 1025 */ 1026 static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, 1027 enum kvm_pgtable_walk_flags visit) 1028 { 1029 struct stage2_map_data *data = ctx->arg; 1030 1031 switch (visit) { 1032 case KVM_PGTABLE_WALK_TABLE_PRE: 1033 return stage2_map_walk_table_pre(ctx, data); 1034 case KVM_PGTABLE_WALK_LEAF: 1035 return stage2_map_walk_leaf(ctx, data); 1036 default: 1037 return -EINVAL; 1038 } 1039 } 1040 1041 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, 1042 u64 phys, enum kvm_pgtable_prot prot, 1043 void *mc, enum kvm_pgtable_walk_flags flags) 1044 { 1045 int ret; 1046 struct stage2_map_data map_data = { 1047 .phys = ALIGN_DOWN(phys, PAGE_SIZE), 1048 .mmu = pgt->mmu, 1049 .memcache = mc, 1050 .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), 1051 }; 1052 struct kvm_pgtable_walker walker = { 1053 .cb = stage2_map_walker, 1054 .flags = flags | 1055 KVM_PGTABLE_WALK_TABLE_PRE | 1056 KVM_PGTABLE_WALK_LEAF, 1057 .arg = &map_data, 1058 }; 1059 1060 if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys))) 1061 return -EINVAL; 1062 1063 ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); 1064 if (ret) 1065 return ret; 1066 1067 ret = kvm_pgtable_walk(pgt, addr, size, &walker); 1068 dsb(ishst); 1069 return ret; 1070 } 1071 1072 int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, 1073 void *mc, u8 owner_id) 1074 { 1075 int ret; 1076 
struct stage2_map_data map_data = { 1077 .phys = KVM_PHYS_INVALID, 1078 .mmu = pgt->mmu, 1079 .memcache = mc, 1080 .owner_id = owner_id, 1081 .force_pte = true, 1082 }; 1083 struct kvm_pgtable_walker walker = { 1084 .cb = stage2_map_walker, 1085 .flags = KVM_PGTABLE_WALK_TABLE_PRE | 1086 KVM_PGTABLE_WALK_LEAF, 1087 .arg = &map_data, 1088 }; 1089 1090 if (owner_id > KVM_MAX_OWNER_ID) 1091 return -EINVAL; 1092 1093 ret = kvm_pgtable_walk(pgt, addr, size, &walker); 1094 return ret; 1095 } 1096 1097 static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, 1098 enum kvm_pgtable_walk_flags visit) 1099 { 1100 struct kvm_pgtable *pgt = ctx->arg; 1101 struct kvm_s2_mmu *mmu = pgt->mmu; 1102 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; 1103 kvm_pte_t *childp = NULL; 1104 bool need_flush = false; 1105 1106 if (!kvm_pte_valid(ctx->old)) { 1107 if (stage2_pte_is_counted(ctx->old)) { 1108 kvm_clear_pte(ctx->ptep); 1109 mm_ops->put_page(ctx->ptep); 1110 } 1111 return 0; 1112 } 1113 1114 if (kvm_pte_table(ctx->old, ctx->level)) { 1115 childp = kvm_pte_follow(ctx->old, mm_ops); 1116 1117 if (mm_ops->page_count(childp) != 1) 1118 return 0; 1119 } else if (stage2_pte_cacheable(pgt, ctx->old)) { 1120 need_flush = !stage2_has_fwb(pgt); 1121 } 1122 1123 /* 1124 * This is similar to the map() path in that we unmap the entire 1125 * block entry and rely on the remaining portions being faulted 1126 * back lazily. 
1127 */ 1128 stage2_unmap_put_pte(ctx, mmu, mm_ops); 1129 1130 if (need_flush && mm_ops->dcache_clean_inval_poc) 1131 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), 1132 kvm_granule_size(ctx->level)); 1133 1134 if (childp) 1135 mm_ops->put_page(childp); 1136 1137 return 0; 1138 } 1139 1140 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) 1141 { 1142 int ret; 1143 struct kvm_pgtable_walker walker = { 1144 .cb = stage2_unmap_walker, 1145 .arg = pgt, 1146 .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, 1147 }; 1148 1149 ret = kvm_pgtable_walk(pgt, addr, size, &walker); 1150 if (stage2_unmap_defer_tlb_flush(pgt)) 1151 /* Perform the deferred TLB invalidations */ 1152 kvm_tlb_flush_vmid_range(pgt->mmu, addr, size); 1153 1154 return ret; 1155 } 1156 1157 struct stage2_attr_data { 1158 kvm_pte_t attr_set; 1159 kvm_pte_t attr_clr; 1160 kvm_pte_t pte; 1161 u32 level; 1162 }; 1163 1164 static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, 1165 enum kvm_pgtable_walk_flags visit) 1166 { 1167 kvm_pte_t pte = ctx->old; 1168 struct stage2_attr_data *data = ctx->arg; 1169 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; 1170 1171 if (!kvm_pte_valid(ctx->old)) 1172 return -EAGAIN; 1173 1174 data->level = ctx->level; 1175 data->pte = pte; 1176 pte &= ~data->attr_clr; 1177 pte |= data->attr_set; 1178 1179 /* 1180 * We may race with the CPU trying to set the access flag here, 1181 * but worst-case the access flag update gets lost and will be 1182 * set on the next access instead. 1183 */ 1184 if (data->pte != pte) { 1185 /* 1186 * Invalidate instruction cache before updating the guest 1187 * stage-2 PTE if we are going to add executable permission. 
1188 */ 1189 if (mm_ops->icache_inval_pou && 1190 stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old)) 1191 mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), 1192 kvm_granule_size(ctx->level)); 1193 1194 if (!stage2_try_set_pte(ctx, pte)) 1195 return -EAGAIN; 1196 } 1197 1198 return 0; 1199 } 1200 1201 static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, 1202 u64 size, kvm_pte_t attr_set, 1203 kvm_pte_t attr_clr, kvm_pte_t *orig_pte, 1204 u32 *level, enum kvm_pgtable_walk_flags flags) 1205 { 1206 int ret; 1207 kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; 1208 struct stage2_attr_data data = { 1209 .attr_set = attr_set & attr_mask, 1210 .attr_clr = attr_clr & attr_mask, 1211 }; 1212 struct kvm_pgtable_walker walker = { 1213 .cb = stage2_attr_walker, 1214 .arg = &data, 1215 .flags = flags | KVM_PGTABLE_WALK_LEAF, 1216 }; 1217 1218 ret = kvm_pgtable_walk(pgt, addr, size, &walker); 1219 if (ret) 1220 return ret; 1221 1222 if (orig_pte) 1223 *orig_pte = data.pte; 1224 1225 if (level) 1226 *level = data.level; 1227 return 0; 1228 } 1229 1230 int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) 1231 { 1232 return stage2_update_leaf_attrs(pgt, addr, size, 0, 1233 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, 1234 NULL, NULL, 0); 1235 } 1236 1237 kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) 1238 { 1239 kvm_pte_t pte = 0; 1240 int ret; 1241 1242 ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, 1243 &pte, NULL, 1244 KVM_PGTABLE_WALK_HANDLE_FAULT | 1245 KVM_PGTABLE_WALK_SHARED); 1246 if (!ret) 1247 dsb(ishst); 1248 1249 return pte; 1250 } 1251 1252 struct stage2_age_data { 1253 bool mkold; 1254 bool young; 1255 }; 1256 1257 static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx, 1258 enum kvm_pgtable_walk_flags visit) 1259 { 1260 kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; 1261 struct stage2_age_data *data = ctx->arg; 1262 1263 if 
(!kvm_pte_valid(ctx->old) || new == ctx->old) 1264 return 0; 1265 1266 data->young = true; 1267 1268 /* 1269 * stage2_age_walker() is always called while holding the MMU lock for 1270 * write, so this will always succeed. Nonetheless, this deliberately 1271 * follows the race detection pattern of the other stage-2 walkers in 1272 * case the locking mechanics of the MMU notifiers is ever changed. 1273 */ 1274 if (data->mkold && !stage2_try_set_pte(ctx, new)) 1275 return -EAGAIN; 1276 1277 /* 1278 * "But where's the TLBI?!", you scream. 1279 * "Over in the core code", I sigh. 1280 * 1281 * See the '->clear_flush_young()' callback on the KVM mmu notifier. 1282 */ 1283 return 0; 1284 } 1285 1286 bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, 1287 u64 size, bool mkold) 1288 { 1289 struct stage2_age_data data = { 1290 .mkold = mkold, 1291 }; 1292 struct kvm_pgtable_walker walker = { 1293 .cb = stage2_age_walker, 1294 .arg = &data, 1295 .flags = KVM_PGTABLE_WALK_LEAF, 1296 }; 1297 1298 WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); 1299 return data.young; 1300 } 1301 1302 int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, 1303 enum kvm_pgtable_prot prot) 1304 { 1305 int ret; 1306 u32 level; 1307 kvm_pte_t set = 0, clr = 0; 1308 1309 if (prot & KVM_PTE_LEAF_ATTR_HI_SW) 1310 return -EINVAL; 1311 1312 if (prot & KVM_PGTABLE_PROT_R) 1313 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; 1314 1315 if (prot & KVM_PGTABLE_PROT_W) 1316 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; 1317 1318 if (prot & KVM_PGTABLE_PROT_X) 1319 clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; 1320 1321 ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, 1322 KVM_PGTABLE_WALK_HANDLE_FAULT | 1323 KVM_PGTABLE_WALK_SHARED); 1324 if (!ret) 1325 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); 1326 return ret; 1327 } 1328 1329 static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, 1330 enum kvm_pgtable_walk_flags visit) 1331 { 1332 
struct kvm_pgtable *pgt = ctx->arg; 1333 struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; 1334 1335 if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old)) 1336 return 0; 1337 1338 if (mm_ops->dcache_clean_inval_poc) 1339 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), 1340 kvm_granule_size(ctx->level)); 1341 return 0; 1342 } 1343 1344 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) 1345 { 1346 struct kvm_pgtable_walker walker = { 1347 .cb = stage2_flush_walker, 1348 .flags = KVM_PGTABLE_WALK_LEAF, 1349 .arg = pgt, 1350 }; 1351 1352 if (stage2_has_fwb(pgt)) 1353 return 0; 1354 1355 return kvm_pgtable_walk(pgt, addr, size, &walker); 1356 } 1357 1358 kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, 1359 u64 phys, u32 level, 1360 enum kvm_pgtable_prot prot, 1361 void *mc, bool force_pte) 1362 { 1363 struct stage2_map_data map_data = { 1364 .phys = phys, 1365 .mmu = pgt->mmu, 1366 .memcache = mc, 1367 .force_pte = force_pte, 1368 }; 1369 struct kvm_pgtable_walker walker = { 1370 .cb = stage2_map_walker, 1371 .flags = KVM_PGTABLE_WALK_LEAF | 1372 KVM_PGTABLE_WALK_SKIP_BBM_TLBI | 1373 KVM_PGTABLE_WALK_SKIP_CMO, 1374 .arg = &map_data, 1375 }; 1376 /* 1377 * The input address (.addr) is irrelevant for walking an 1378 * unlinked table. Construct an ambiguous IA range to map 1379 * kvm_granule_size(level) worth of memory. 
1380 */ 1381 struct kvm_pgtable_walk_data data = { 1382 .walker = &walker, 1383 .addr = 0, 1384 .end = kvm_granule_size(level), 1385 }; 1386 struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; 1387 kvm_pte_t *pgtable; 1388 int ret; 1389 1390 if (!IS_ALIGNED(phys, kvm_granule_size(level))) 1391 return ERR_PTR(-EINVAL); 1392 1393 ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); 1394 if (ret) 1395 return ERR_PTR(ret); 1396 1397 pgtable = mm_ops->zalloc_page(mc); 1398 if (!pgtable) 1399 return ERR_PTR(-ENOMEM); 1400 1401 ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, 1402 level + 1); 1403 if (ret) { 1404 kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); 1405 mm_ops->put_page(pgtable); 1406 return ERR_PTR(ret); 1407 } 1408 1409 return pgtable; 1410 } 1411 1412 /* 1413 * Get the number of page-tables needed to replace a block with a 1414 * fully populated tree up to the PTE entries. Note that @level is 1415 * interpreted as in "level @level entry". 1416 */ 1417 static int stage2_block_get_nr_page_tables(u32 level) 1418 { 1419 switch (level) { 1420 case 1: 1421 return PTRS_PER_PTE + 1; 1422 case 2: 1423 return 1; 1424 case 3: 1425 return 0; 1426 default: 1427 WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || 1428 level >= KVM_PGTABLE_MAX_LEVELS); 1429 return -EINVAL; 1430 }; 1431 } 1432 1433 static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, 1434 enum kvm_pgtable_walk_flags visit) 1435 { 1436 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; 1437 struct kvm_mmu_memory_cache *mc = ctx->arg; 1438 struct kvm_s2_mmu *mmu; 1439 kvm_pte_t pte = ctx->old, new, *childp; 1440 enum kvm_pgtable_prot prot; 1441 u32 level = ctx->level; 1442 bool force_pte; 1443 int nr_pages; 1444 u64 phys; 1445 1446 /* No huge-pages exist at the last level */ 1447 if (level == KVM_PGTABLE_MAX_LEVELS - 1) 1448 return 0; 1449 1450 /* We only split valid block mappings */ 1451 if (!kvm_pte_valid(pte)) 1452 return 0; 1453 1454 nr_pages = 
stage2_block_get_nr_page_tables(level); 1455 if (nr_pages < 0) 1456 return nr_pages; 1457 1458 if (mc->nobjs >= nr_pages) { 1459 /* Build a tree mapped down to the PTE granularity. */ 1460 force_pte = true; 1461 } else { 1462 /* 1463 * Don't force PTEs, so create_unlinked() below does 1464 * not populate the tree up to the PTE level. The 1465 * consequence is that the call will require a single 1466 * page of level 2 entries at level 1, or a single 1467 * page of PTEs at level 2. If we are at level 1, the 1468 * PTEs will be created recursively. 1469 */ 1470 force_pte = false; 1471 nr_pages = 1; 1472 } 1473 1474 if (mc->nobjs < nr_pages) 1475 return -ENOMEM; 1476 1477 mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); 1478 phys = kvm_pte_to_phys(pte); 1479 prot = kvm_pgtable_stage2_pte_prot(pte); 1480 1481 childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, 1482 level, prot, mc, force_pte); 1483 if (IS_ERR(childp)) 1484 return PTR_ERR(childp); 1485 1486 if (!stage2_try_break_pte(ctx, mmu)) { 1487 kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); 1488 mm_ops->put_page(childp); 1489 return -EAGAIN; 1490 } 1491 1492 /* 1493 * Note, the contents of the page table are guaranteed to be made 1494 * visible before the new PTE is assigned because stage2_make_pte() 1495 * writes the PTE using smp_store_release(). 
1496 */ 1497 new = kvm_init_table_pte(childp, mm_ops); 1498 stage2_make_pte(ctx, new); 1499 dsb(ishst); 1500 return 0; 1501 } 1502 1503 int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, 1504 struct kvm_mmu_memory_cache *mc) 1505 { 1506 struct kvm_pgtable_walker walker = { 1507 .cb = stage2_split_walker, 1508 .flags = KVM_PGTABLE_WALK_LEAF, 1509 .arg = mc, 1510 }; 1511 1512 return kvm_pgtable_walk(pgt, addr, size, &walker); 1513 } 1514 1515 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, 1516 struct kvm_pgtable_mm_ops *mm_ops, 1517 enum kvm_pgtable_stage2_flags flags, 1518 kvm_pgtable_force_pte_cb_t force_pte_cb) 1519 { 1520 size_t pgd_sz; 1521 u64 vtcr = mmu->arch->vtcr; 1522 u32 ia_bits = VTCR_EL2_IPA(vtcr); 1523 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); 1524 u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; 1525 1526 pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; 1527 pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); 1528 if (!pgt->pgd) 1529 return -ENOMEM; 1530 1531 pgt->ia_bits = ia_bits; 1532 pgt->start_level = start_level; 1533 pgt->mm_ops = mm_ops; 1534 pgt->mmu = mmu; 1535 pgt->flags = flags; 1536 pgt->force_pte_cb = force_pte_cb; 1537 1538 /* Ensure zeroed PGD pages are visible to the hardware walker */ 1539 dsb(ishst); 1540 return 0; 1541 } 1542 1543 size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) 1544 { 1545 u32 ia_bits = VTCR_EL2_IPA(vtcr); 1546 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); 1547 u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; 1548 1549 return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; 1550 } 1551 1552 static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, 1553 enum kvm_pgtable_walk_flags visit) 1554 { 1555 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; 1556 1557 if (!stage2_pte_is_counted(ctx->old)) 1558 return 0; 1559 1560 mm_ops->put_page(ctx->ptep); 1561 1562 if (kvm_pte_table(ctx->old, ctx->level)) 1563 
mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); 1564 1565 return 0; 1566 } 1567 1568 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) 1569 { 1570 size_t pgd_sz; 1571 struct kvm_pgtable_walker walker = { 1572 .cb = stage2_free_walker, 1573 .flags = KVM_PGTABLE_WALK_LEAF | 1574 KVM_PGTABLE_WALK_TABLE_POST, 1575 }; 1576 1577 WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); 1578 pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; 1579 pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); 1580 pgt->pgd = NULL; 1581 } 1582 1583 void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) 1584 { 1585 kvm_pteref_t ptep = (kvm_pteref_t)pgtable; 1586 struct kvm_pgtable_walker walker = { 1587 .cb = stage2_free_walker, 1588 .flags = KVM_PGTABLE_WALK_LEAF | 1589 KVM_PGTABLE_WALK_TABLE_POST, 1590 }; 1591 struct kvm_pgtable_walk_data data = { 1592 .walker = &walker, 1593 1594 /* 1595 * At this point the IPA really doesn't matter, as the page 1596 * table being traversed has already been removed from the stage 1597 * 2. Set an appropriate range to cover the entire page table. 1598 */ 1599 .addr = 0, 1600 .end = kvm_granule_size(level), 1601 }; 1602 1603 WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1)); 1604 1605 WARN_ON(mm_ops->page_count(pgtable) != 1); 1606 mm_ops->put_page(pgtable); 1607 } 1608